python · intermediate

Python Data Profiling Script

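Generate a per-column data quality profile report with null counts, cardinality, and summary statistics (min/max/mean/median/std for numeric columns; length range, empty-string count, and most frequent value for text columns) that help you spot anomalies.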

python
import pandas as pd
from typing import Any


def _percent(part: int, whole: int) -> float:
    """Return *part* as a percentage of *whole*, rounded to 2 decimals (0.0 if *whole* is 0)."""
    return round(part / whole * 100, 2) if whole else 0.0


def profile_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Build a per-column data quality profile of *df*.

    Every column gets: ``column``, ``dtype``, ``count`` (row count, nulls
    included), ``nulls``, ``null_pct``, ``unique`` (distinct non-null values)
    and ``unique_pct``.

    Numeric columns additionally get ``min``, ``max``, ``mean``, ``median``,
    ``std`` and ``zeros``. Text columns additionally get ``min_length``,
    ``max_length``, ``empty_strings`` and ``top_value`` (the first mode).

    Args:
        df: The frame to profile. It may have zero rows and may contain
            duplicated column labels.

    Returns:
        A DataFrame with one row per column of *df*. Statistics that do not
        apply to a column's type are left missing (NaN) in that row.
    """
    total = len(df)
    profile: list[dict[str, Any]] = []

    # items() yields one Series per column *position*, so duplicated column
    # labels are profiled individually instead of df[label] returning a frame.
    for col, series in df.items():
        # Compute the shared counts once; they feed both the raw and pct fields.
        null_count = int(series.isna().sum())
        unique_count = int(series.nunique())

        info: dict[str, Any] = {
            "column": col,
            "dtype": str(series.dtype),
            "count": total,
            "nulls": null_count,
            # _percent guards the zero-row case (no ZeroDivisionError / NaN).
            "null_pct": _percent(null_count, total),
            "unique": unique_count,
            "unique_pct": _percent(unique_count, total),
        }

        if pd.api.types.is_numeric_dtype(series):
            info.update({
                "min": series.min(),
                "max": series.max(),
                "mean": round(series.mean(), 2),
                "median": series.median(),
                "std": round(series.std(), 2),
                "zeros": (series == 0).sum(),
            })
        else:
            non_null = series.dropna()
            # Decide "is text" from the non-null values: depending on the
            # pandas version, an object column holding strings plus NaN may
            # not be classified as a string column when the nulls are included.
            if pd.api.types.is_string_dtype(non_null):
                if non_null.empty:
                    info.update({
                        "min_length": 0,
                        "max_length": 0,
                        "empty_strings": 0,
                        "top_value": None,
                    })
                else:
                    lengths = non_null.str.len()
                    info.update({
                        "min_length": lengths.min(),
                        "max_length": lengths.max(),
                        "empty_strings": (non_null == "").sum(),
                        # mode() is sorted, so ties resolve to the smallest value.
                        "top_value": non_null.mode().iloc[0],
                    })

        profile.append(info)

    return pd.DataFrame(profile)


# Run the profiling only when this file is executed as a script, so that
# importing profile_dataframe from another module does not read or write files.
if __name__ == "__main__":
    # Load the dataset to inspect from the working directory.
    df = pd.read_csv("data.csv")
    report = profile_dataframe(df)
    # Show the full profile on stdout, then persist it next to the input.
    print(report.to_string(index=False))
    report.to_csv("data_profile.csv", index=False)

Use Cases

  • Automated data quality reporting
  • Understanding new datasets before processing
  • Detecting anomalies and data drift

Tags

Related Snippets

Similar patterns you can reuse in the same workflow.