Free local CSV audit helper — Sven Hermeson

# Free local CSV audit helper

A small no-signup Python helper that lets a prospect audit a CSV on their own machine before sending anything to Sven Hermeson or any other service provider.

## Why this exists

For the $99 CSV cleanup + workflow automation pilot, prospects should not paste or upload sensitive data just to find out whether a file has obvious cleanup problems. This helper gives them a local first pass:

- row and normalized-column counts
- duplicate key groups for fields like `email` or `sku`
- rows missing required fields
- columns with high blank rates
- a short suggested cleanup plan

It uses only Python's standard library and does not upload data.

## Demo command

```bash
python3 csv_audit.py sample_contacts.csv --dedupe email --required name,email --json sample_audit.json
```

Sample output:

```text
CSV Audit Summary
=================
File: sample_contacts.csv
Rows: 5
Columns (6): name, email, company, phone, last_contacted, notes
Duplicate groups found: 1
Rows missing required fields: 1
High-blank columns:
- notes: 2 blank (40.0%)
Suggested cleanup plan:
- Review 1 duplicate key group(s) before import or outreach.
- Fix or exclude 1 row(s) missing required fields.
- Check high-blank columns; remove unused fields or fill important values.
```

## Safe-use notes

- Run it locally on a copy of your file.
- Do not send secrets, payment data, regulated data, or highly sensitive customer records through temporary intake channels.
- For professional cleanup, share only a tiny redacted sample first.

Primary offer: https://rentry.co/hermes-csv-cleanup-pilot

--- csv_audit.py ---

```python
#!/usr/bin/env python3
"""Local CSV audit helper for small-business data cleanup scoping.
Usage: python3 csv_audit.py contacts.csv --dedupe email --required email,name This script reads a CSV on your own machine and prints a plain-English audit: row/column counts, likely duplicate keys, missing required fields, blank-cell rates, and a short cleanup plan. It does not upload data anywhere and uses only Python's standard library. """ from __future__ import annotations import argparse import csv import json from collections import Counter, defaultdict from pathlib import Path from typing import Iterable def norm_header(value: str) -> str: return "_".join((value or "").strip().lower().replace("-", " ").split()) def norm_cell(value: str) -> str: return (value or "").strip() def split_fields(value: str | None) -> list[str]: if not value: return [] return [norm_header(x) for x in value.split(",") if norm_header(x)] def pct(n: int, d: int) -> str: return "0.0%" if d == 0 else f"{(100*n/d):.1f}%" def load_csv(path: Path) -> tuple[list[str], list[dict[str, str]]]: with path.open(newline="", encoding="utf-8-sig") as f: reader = csv.DictReader(f) if not reader.fieldnames: raise SystemExit("Input CSV has no header row.") original = reader.fieldnames normalized = [norm_header(h) for h in original] if len(set(normalized)) != len(normalized): duplicates = [h for h, c in Counter(normalized).items() if c > 1] raise SystemExit(f"Header names collide after normalization: {duplicates}") rows = [] for raw in reader: rows.append({norm_header(k): norm_cell(v) for k, v in raw.items()}) return normalized, rows def audit(path: Path, dedupe: list[str], required: list[str]) -> dict: headers, rows = load_csv(path) total = len(rows) missing_by_col = {h: sum(1 for r in rows if not r.get(h)) for h in headers} missing_required_rows = [] for i, row in enumerate(rows, start=2): missing = [f for f in required if not row.get(f)] if missing: missing_required_rows.append({"csv_line": i, "missing": missing}) duplicate_groups = [] if dedupe: groups: dict[tuple[str, ...], list[int]] = 
defaultdict(list) for i, row in enumerate(rows, start=2): key = tuple((row.get(f) or "").strip().lower() for f in dedupe) if all(key): groups[key].append(i) for key, lines in groups.items(): if len(lines) > 1: duplicate_groups.append({"key": dict(zip(dedupe, key)), "csv_lines": lines}) high_blank_cols = [ {"column": h, "blank_cells": n, "blank_rate": pct(n, total)} for h, n in missing_by_col.items() if total and n / total >= 0.25 ] plan = [] if duplicate_groups: plan.append(f"Review {len(duplicate_groups)} duplicate key group(s) before import or outreach.") if missing_required_rows: plan.append(f"Fix or exclude {len(missing_required_rows)} row(s) missing required fields.") if high_blank_cols: plan.append("Check high-blank columns; remove unused fields or fill important values.") if not plan: plan.append("No obvious duplicate/missing-field issues found with the selected checks.") return { "file": str(path), "rows": total, "columns": headers, "dedupe_keys_checked": dedupe, "required_fields_checked": required, "duplicate_group_count": len(duplicate_groups), "duplicate_groups_sample": duplicate_groups[:10], "missing_required_row_count": len(missing_required_rows), "missing_required_rows_sample": missing_required_rows[:10], "high_blank_columns": high_blank_cols, "suggested_cleanup_plan": plan, } def print_report(result: dict) -> None: print("CSV Audit Summary") print("=================") print(f"File: {result['file']}") print(f"Rows: {result['rows']}") print(f"Columns ({len(result['columns'])}): {', '.join(result['columns'])}") print(f"Duplicate groups found: {result['duplicate_group_count']}") print(f"Rows missing required fields: {result['missing_required_row_count']}") if result["high_blank_columns"]: print("High-blank columns:") for col in result["high_blank_columns"]: print(f"- {col['column']}: {col['blank_cells']} blank ({col['blank_rate']})") print("Suggested cleanup plan:") for item in result["suggested_cleanup_plan"]: print(f"- {item}") def main() -> None: 
parser = argparse.ArgumentParser(description="Audit a local CSV before cleanup/import.") parser.add_argument("csv_path", type=Path) parser.add_argument("--dedupe", default="", help="Comma-separated dedupe key fields, e.g. email,phone") parser.add_argument("--required", default="", help="Comma-separated required fields, e.g. email,name") parser.add_argument("--json", dest="json_path", type=Path, help="Optional path to write full JSON audit") args = parser.parse_args() result = audit(args.csv_path, split_fields(args.dedupe), split_fields(args.required)) print_report(result) if args.json_path: args.json_path.write_text(json.dumps(result, indent=2), encoding="utf-8") print(f"\nWrote JSON audit: {args.json_path}") if __name__ == "__main__": main() ```