Free local CSV audit helper — Sven Hermeson

# Free local CSV audit helper

A small no-signup Python helper that lets a prospect audit a CSV on their own machine before sending anything to Sven Hermeson or any other service provider.

## Why this exists

For the $99 CSV cleanup + workflow automation pilot, prospects should not paste or upload sensitive data just to find out whether a file has obvious cleanup problems. This helper gives them a local first pass:

- row and normalized-column counts
- duplicate key groups for fields like `email` or `sku` (matched case-insensitively; rows where a key field is blank are skipped)
- rows missing required fields
- columns with high blank rates (at least 25% of rows blank)
- a short suggested cleanup plan

It uses only Python's standard library and does not upload data.

## Demo command

```bash
python3 csv_audit.py sample_contacts.csv --dedupe email --required name,email --json sample_audit.json
```

Sample output:

```text
CSV Audit Summary
=================
File: sample_contacts.csv
Rows: 5
Columns (6): name, email, company, phone, last_contacted, notes
Duplicate groups found: 1
Rows missing required fields: 1
High-blank columns:
- notes: 2 blank (40.0%)
Suggested cleanup plan:
- Review 1 duplicate key group(s) before import or outreach.
- Fix or exclude 1 row(s) missing required fields.
- Check high-blank columns; remove unused fields or fill important values.
```

## Safe-use notes

- Run it locally on a copy of your file.
- Do not send secrets, payment data, regulated data, or highly sensitive customer records through temporary intake channels.
- For professional cleanup, share only a tiny redacted sample first.

Primary offer: https://rentry.co/hermes-csv-cleanup-pilot


--- csv_audit.py ---

```python
#!/usr/bin/env python3
"""Local CSV audit helper for small-business data cleanup scoping.

Usage:
  python3 csv_audit.py contacts.csv --dedupe email --required email,name

This script reads a CSV on your own machine and prints a plain-English audit:
row/column counts, likely duplicate keys, missing required fields, blank-cell
rates, and a short cleanup plan. It does not upload data anywhere and uses only
Python's standard library.
"""
from __future__ import annotations

import argparse
import csv
import json
from collections import Counter, defaultdict
from pathlib import Path
from typing import Iterable


def norm_header(value: str) -> str:
    """Return a snake_case-style version of a header or field name.

    The text is trimmed and lower-cased, hyphens are treated as spaces, and
    the resulting whitespace-separated words are joined with underscores.
    A falsy value (``None`` or ``""``) yields an empty string.
    """
    text = value or ""
    lowered = text.strip().lower()
    words = lowered.replace("-", " ").split()
    return "_".join(words)


def norm_cell(value: str) -> str:
    """Return the cell text with surrounding whitespace removed.

    A falsy value (``None`` or ``""``) yields an empty string.
    """
    if not value:
        return ""
    return value.strip()


def split_fields(value: str | None) -> list[str]:
    """Parse a comma-separated option value into normalized field names.

    Each piece is normalized with ``norm_header``; pieces that normalize to
    an empty string are dropped. A falsy *value* yields an empty list.
    """
    fields: list[str] = []
    if value:
        for piece in value.split(","):
            name = norm_header(piece)
            if name:
                fields.append(name)
    return fields


def pct(n: int, d: int) -> str:
    """Format ``n / d`` as a percentage string with one decimal place.

    A zero denominator yields ``"0.0%"`` instead of raising.
    """
    if d == 0:
        return "0.0%"
    ratio = 100 * n / d
    return "{:.1f}%".format(ratio)


def load_csv(path: Path) -> tuple[list[str], list[dict[str, str]]]:
    """Read a CSV file and return its normalized headers and cleaned rows.

    The file is decoded as UTF-8 (a leading byte-order mark is tolerated via
    ``utf-8-sig``). Header names are normalized with ``norm_header`` and
    every cell is trimmed with ``norm_cell``.

    Ragged rows do not abort the audit: cells missing from a short row become
    empty strings, and cells beyond the last header are ignored because they
    belong to no named column.

    Args:
        path: Location of the CSV file to read.

    Returns:
        A ``(headers, rows)`` pair: the normalized header names in file
        order, and one ``{header: cell}`` dict per data row.

    Raises:
        SystemExit: If the file has no header row, or if two header names
            become identical after normalization.
    """
    with path.open(newline="", encoding="utf-8-sig") as f:
        reader = csv.DictReader(f)
        if not reader.fieldnames:
            raise SystemExit("Input CSV has no header row.")
        normalized = [norm_header(h) for h in reader.fieldnames]
        duplicates = [h for h, c in Counter(normalized).items() if c > 1]
        if duplicates:
            raise SystemExit(f"Header names collide after normalization: {duplicates}")
        rows = []
        for raw in reader:
            # DictReader collects surplus cells of an over-long row under the
            # key None as a *list*; passing that list to norm_cell would
            # raise AttributeError, so those unnamed cells are skipped.
            rows.append(
                {norm_header(k): norm_cell(v) for k, v in raw.items() if k is not None}
            )
        return normalized, rows


def audit(path: Path, dedupe: list[str], required: list[str]) -> dict:
    """Audit a CSV file for duplicates, missing required values and blanks.

    Args:
        path: CSV file to audit; it is read with ``load_csv``.
        dedupe: Normalized field names that together form the duplicate key.
            An empty list disables the duplicate check.
        required: Normalized field names that must be non-blank in every row.
            A name that is not a column of the file counts as missing in
            every row.

    Returns:
        A JSON-serializable dict with the file path, row count, column names,
        the checks that were requested, duplicate-group and missing-row
        counts (each with a sample of at most 10 entries), the columns whose
        blank rate is at least 25%, and a list of suggested cleanup steps.

    Raises:
        SystemExit: If a *dedupe* field is not a column of the file. Without
            this check a mistyped key would silently report zero duplicates.
            ``load_csv`` may also exit for header problems.

    Note:
        Reported ``csv_line`` values are the row position plus one, i.e. they
        assume the header is on line 1 and every record occupies exactly one
        physical line.
    """
    headers, rows = load_csv(path)

    known_columns = set(headers)
    unknown_keys = [f for f in dedupe if f not in known_columns]
    if unknown_keys:
        raise SystemExit(f"Dedupe field(s) not found in CSV header: {unknown_keys}")

    total = len(rows)
    missing_by_col = {h: sum(1 for r in rows if not r.get(h)) for h in headers}

    missing_required_rows = []
    for i, row in enumerate(rows, start=2):
        missing = [f for f in required if not row.get(f)]
        if missing:
            missing_required_rows.append({"csv_line": i, "missing": missing})

    duplicate_groups = []
    if dedupe:
        groups: dict[tuple[str, ...], list[int]] = defaultdict(list)
        for i, row in enumerate(rows, start=2):
            # Keys are compared case-insensitively after trimming.
            key = tuple((row.get(f) or "").strip().lower() for f in dedupe)
            # Rows with any blank key part are not considered for duplicates.
            if all(key):
                groups[key].append(i)
        for key, lines in groups.items():
            if len(lines) > 1:
                duplicate_groups.append({"key": dict(zip(dedupe, key)), "csv_lines": lines})

    # A column is "high-blank" when at least a quarter of its cells are empty.
    high_blank_cols = [
        {"column": h, "blank_cells": n, "blank_rate": pct(n, total)}
        for h, n in missing_by_col.items()
        if total and n / total >= 0.25
    ]

    plan = []
    if duplicate_groups:
        plan.append(f"Review {len(duplicate_groups)} duplicate key group(s) before import or outreach.")
    if missing_required_rows:
        plan.append(f"Fix or exclude {len(missing_required_rows)} row(s) missing required fields.")
    if high_blank_cols:
        plan.append("Check high-blank columns; remove unused fields or fill important values.")
    if not plan:
        plan.append("No obvious duplicate/missing-field issues found with the selected checks.")

    return {
        "file": str(path),
        "rows": total,
        "columns": headers,
        "dedupe_keys_checked": dedupe,
        "required_fields_checked": required,
        "duplicate_group_count": len(duplicate_groups),
        "duplicate_groups_sample": duplicate_groups[:10],
        "missing_required_row_count": len(missing_required_rows),
        "missing_required_rows_sample": missing_required_rows[:10],
        "high_blank_columns": high_blank_cols,
        "suggested_cleanup_plan": plan,
    }


def print_report(result: dict) -> None:
    """Print the plain-text audit summary for *result* to standard output.

    *result* is expected to carry the keys read below: ``file``, ``rows``,
    ``columns``, ``duplicate_group_count``, ``missing_required_row_count``,
    ``high_blank_columns`` and ``suggested_cleanup_plan``. The high-blank
    section is omitted when that list is empty.
    """
    columns = result["columns"]
    report = [
        "CSV Audit Summary",
        "=================",
        f"File: {result['file']}",
        f"Rows: {result['rows']}",
        f"Columns ({len(columns)}): {', '.join(columns)}",
        f"Duplicate groups found: {result['duplicate_group_count']}",
        f"Rows missing required fields: {result['missing_required_row_count']}",
    ]

    blank_columns = result["high_blank_columns"]
    if blank_columns:
        report.append("High-blank columns:")
        report.extend(
            f"- {entry['column']}: {entry['blank_cells']} blank ({entry['blank_rate']})"
            for entry in blank_columns
        )

    report.append("Suggested cleanup plan:")
    report.extend(f"- {step}" for step in result["suggested_cleanup_plan"])

    print("\n".join(report))


def main() -> None:
    """Command-line entry point: parse arguments, run the audit, report.

    Prints the plain-text summary and, when ``--json`` is given, also writes
    the full audit result to that path as indented JSON.

    Raises:
        SystemExit: With a one-line message when the input file cannot be
            opened, is not valid UTF-8, or cannot be parsed as CSV, so users
            see a readable error instead of a traceback.
    """
    parser = argparse.ArgumentParser(description="Audit a local CSV before cleanup/import.")
    parser.add_argument("csv_path", type=Path)
    parser.add_argument("--dedupe", default="", help="Comma-separated dedupe key fields, e.g. email,phone")
    parser.add_argument("--required", default="", help="Comma-separated required fields, e.g. email,name")
    parser.add_argument("--json", dest="json_path", type=Path, help="Optional path to write full JSON audit")
    args = parser.parse_args()

    try:
        result = audit(args.csv_path, split_fields(args.dedupe), split_fields(args.required))
    except UnicodeDecodeError as err:
        # The loader decodes as UTF-8; spreadsheet exports often use another encoding.
        raise SystemExit(
            f"{args.csv_path} is not valid UTF-8 text; re-save it as a UTF-8 CSV and retry. ({err})"
        ) from err
    except (OSError, csv.Error) as err:
        # Missing file, directory, permission problem, or malformed CSV.
        raise SystemExit(f"Could not read {args.csv_path}: {err}") from err

    print_report(result)
    if args.json_path:
        args.json_path.write_text(json.dumps(result, indent=2), encoding="utf-8")
        print(f"\nWrote JSON audit: {args.json_path}")


# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()

```