SQL · Intermediate

SQL Data Deduplication Techniques

Remove duplicate records using ROW_NUMBER and DISTINCT ON, review duplicates with GROUP BY / HAVING before deleting, and merge duplicate rows with ordered aggregates.

sql
-- Method 1: ROW_NUMBER window function (most common)
-- Keeps exactly one row per email (the most recently updated one) and
-- deletes every other row that shares that email.
--   * NULLS LAST: PostgreSQL sorts NULLs first under DESC by default, which
--     would rank a row with no updated_at as the "latest" and keep it.
--   * id DESC: tie-breaker so the surviving row is deterministic when two
--     duplicates share the same updated_at.
--   * email IS NOT NULL: PARTITION BY places every NULL email in a single
--     partition; without this filter all but one of those rows would be
--     deleted even though a missing email is not a duplicate value.
-- Run inside a transaction and check the affected row count before COMMIT.
WITH ranked AS (
    SELECT
        id,
        ROW_NUMBER() OVER (
            PARTITION BY email
            ORDER BY updated_at DESC NULLS LAST, id DESC
        ) AS rn
    FROM users
    WHERE email IS NOT NULL
)
DELETE FROM users
WHERE id IN (
    SELECT id FROM ranked WHERE rn > 1
);

-- Method 2: DISTINCT ON (PostgreSQL)
-- Builds users_clean with one row per email: the first row of each email
-- group under the ORDER BY below, i.e. the most recently updated row.
--   * NULLS LAST keeps a row with a NULL updated_at from being treated as
--     the newest (PostgreSQL's default for DESC is NULLS FIRST).
--   * id DESC makes the chosen row deterministic when updated_at ties.
-- NOTE: DISTINCT ON treats NULL emails as equal, so all rows with a NULL
-- email collapse to a single row in users_clean.
-- NOTE: CREATE TABLE AS copies column names, types and data only; primary
-- keys, indexes, constraints and defaults must be recreated on users_clean.
CREATE TABLE users_clean AS
SELECT DISTINCT ON (email) *
FROM users
ORDER BY email, updated_at DESC NULLS LAST, id DESC;

-- Method 3: Keep latest, delete rest using CTE
-- Treats rows with the same (email, phone) pair as duplicates, keeps the
-- most recently created one and deletes the others.
--   * NULLS LAST: a row with a NULL created_at must not outrank dated rows
--     (PostgreSQL's default for DESC is NULLS FIRST).
--   * id DESC: deterministic survivor when created_at values tie.
-- NOTE: PARTITION BY groups NULLs together, so rows whose email and phone
-- are both NULL are ranked as duplicates of each other; add a WHERE filter
-- to the CTE if such rows should be left untouched.
WITH dupes AS (
    SELECT
        id,
        ROW_NUMBER() OVER (
            PARTITION BY email, phone
            ORDER BY created_at DESC NULLS LAST, id DESC
        ) AS rn
    FROM contacts
)
DELETE FROM contacts
WHERE id IN (
    SELECT id FROM dupes WHERE rn > 1
);

-- Method 4: Find and review duplicates before deleting
-- Read-only report: one row per email that occurs more than once, with the
-- number of occurrences and the ids of the rows involved.
--   * ARRAY_AGG(id ORDER BY id): without an ORDER BY inside the aggregate
--     the order of ids in the array is unspecified and can change per run.
--   * ORDER BY cnt DESC, email: worst offenders first, with email as a
--     tie-breaker so the report is stable between runs.
-- NOTE: GROUP BY puts all NULL emails in one group, so that group appears
-- here when more than one row has no email.
SELECT
    email,
    COUNT(*) AS cnt,
    ARRAY_AGG(id ORDER BY id) AS duplicate_ids
FROM users
GROUP BY email
HAVING COUNT(*) > 1
ORDER BY cnt DESC, email;

-- Method 5: Merge duplicates (keep combined data)
-- Writes one row per email into users_merged, combining the duplicates:
--   * name / phone: the most recently updated non-NULL value in the group.
--     FILTER drops NULLs before aggregation so an empty value on the newest
--     row cannot hide a real value on an older duplicate; if every value in
--     the group is NULL the aggregate is NULL and the subscript yields NULL.
--   * ORDER BY updated_at DESC NULLS LAST, id DESC: newest first, rows with
--     no updated_at last, id as tie-breaker so the pick is deterministic.
--   * created_at: the earliest creation time among the duplicates.
-- NOTE: GROUP BY puts all NULL emails in one group, so users without an
-- email are merged into a single row; handle them separately if they must
-- stay distinct.
INSERT INTO users_merged (email, name, phone, created_at)
SELECT
    email,
    (ARRAY_AGG(name ORDER BY updated_at DESC NULLS LAST, id DESC)
        FILTER (WHERE name IS NOT NULL))[1] AS name,
    (ARRAY_AGG(phone ORDER BY updated_at DESC NULLS LAST, id DESC)
        FILTER (WHERE phone IS NOT NULL))[1] AS phone,
    MIN(created_at) AS created_at
FROM users
GROUP BY email;

Use Cases

  • Cleaning duplicate records in production databases
  • Data warehouse deduplication during ETL
  • Merging records from multiple data sources

Tags

Related Snippets

Similar patterns you can reuse in the same workflow.