Python · Intermediate

Read Large CSV in Chunks with Pandas

Process CSV files larger than RAM by reading in chunks — memory-efficient ETL pattern for data pipelines.

Python (press ⌘/Ctrl + Shift + C to copy)

import pandas as pd
from typing import Iterator


def process_large_csv(
    filepath: str,
    chunk_size: int = 10_000,
) -> Iterator[pd.DataFrame]:
    """
    Stream a large CSV file as cleaned DataFrame chunks.

    Each chunk is read, cleaned and yielded on its own, so the caller can
    process and discard it before the next one is loaded.

    Cleaning applied to every chunk:
      * rows with a missing ``id`` or ``email`` are dropped;
      * ``email`` is lower-cased and stripped of surrounding whitespace.

    Args:
        filepath: Path of the CSV file. It is expected to contain ``id``
            and ``email`` columns.
        chunk_size: Maximum number of CSV rows read per chunk.

    Yields:
        One cleaned DataFrame per chunk read. A chunk whose rows were all
        dropped is yielded as an empty DataFrame.
    """
    # The chunked reader is a context manager; using it as one closes the
    # underlying file even if the consumer stops iterating early or an
    # exception interrupts the loop.
    with pd.read_csv(
        filepath,
        chunksize=chunk_size,
        dtype_backend="pyarrow",  # faster, less memory
        low_memory=False,
    ) as reader:
        for chunk in reader:
            cleaned = chunk.dropna(subset=["id", "email"])
            if not cleaned.empty:
                # assign() builds a new frame rather than writing into the
                # result of dropna(), avoiding chained-assignment warnings.
                # Empty chunks skip this step: when every email in a chunk
                # is missing the column is not inferred as a string dtype,
                # so the .str accessor may be unavailable.
                cleaned = cleaned.assign(
                    email=cleaned["email"].str.lower().str.strip()
                )
            yield cleaned


def main():
    """
    Stream ``users.csv`` through ``process_large_csv`` and report progress.

    Keeps a running count of cleaned rows, rewriting a single progress
    line after each chunk, then prints the final total on its own line.
    """
    row_count = 0
    batches = process_large_csv("users.csv")
    for batch in batches:
        row_count = row_count + len(batch)
        # Hook point for persisting the batch, e.g. load_to_db(batch)
        progress = f"Processed {row_count:,} rows"
        # The trailing carriage return makes the next update overwrite this line.
        print(progress, end="\r")
    summary = f"\nDone. Total: {row_count:,}"
    print(summary)


# Run the pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()