#!/usr/bin/env python3
"""
safe-export.py — strip prompts/PII/API keys from LangSmith or Helicone exports.
Outputs clean schema: timestamps, model, token counts, tool calls, retries,
session IDs, error codes. Runs entirely locally. No network calls.

Usage:
    python safe-export.py input.json [output.json]
    python safe-export.py input.csv  [output.csv]
    cat runs.json | python safe-export.py -

Output is JSON, except when the output path ends in ".csv", in which case the
cleaned records are written as CSV (nested values are JSON-encoded per cell).
"""
import csv
import io
import json
import re
import sys
from pathlib import Path

# Fields whose values contain prompt/response text — strip entirely
STRIP_FIELDS = {
    "inputs", "outputs", "input", "output", "messages", "prompt", "response",
    "content", "text", "system", "user", "assistant", "request_body",
    "response_body", "kwargs", "serialized",
}

# Regex for secrets and PII
SECRET_RE = re.compile(
    r"(sk-[A-Za-z0-9]{20,}|Bearer\s+[A-Za-z0-9\-_.]{20,}|"
    r"[Aa][Pp][Ii][_-]?[Kk][Ee][Yy][\s:=]+\S+|"
    r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})",
    re.I
)

# Allow-list of top-level fields that survive cleaning (all lowercase; record
# keys are compared case-insensitively).
KEEP_FIELDS = {
    "id", "run_id", "trace_id", "session_id", "parent_run_id", "start_time",
    "end_time", "created_at", "timestamp", "run_type", "name", "model",
    "model_name", "prompt_tokens", "completion_tokens", "total_tokens",
    "input_tokens", "output_tokens", "latency", "latency_ms", "duration_ms",
    "status", "error", "error_type", "error_code", "retry_count", "attempt",
    "tool_calls", "tools", "tags", "metadata",
}

# Fields whose values are reduced to tool names only.
_TOOL_FIELDS = ("tool_calls", "tools")


def redact_secrets(v):
    """Return *v* with every SECRET_RE match in any string replaced.

    Dicts and lists are walked recursively (dict keys are left untouched);
    any other value is returned unchanged.
    """
    if isinstance(v, str):
        return SECRET_RE.sub("[REDACTED]", v)
    if isinstance(v, dict):
        return {k2: redact_secrets(v2) for k2, v2 in v.items()}
    if isinstance(v, list):
        return [redact_secrets(i) for i in v]
    return v


def _scrub_nested(v):
    """Redact secrets in *v* and drop nested keys listed in STRIP_FIELDS.

    Kept containers such as ``metadata`` can carry prompt text under keys like
    ``prompt`` or ``messages``; regex redaction alone would let that through.
    """
    if isinstance(v, dict):
        return {
            k: _scrub_nested(inner)
            for k, inner in v.items()
            if not (isinstance(k, str) and k.lower() in STRIP_FIELDS)
        }
    if isinstance(v, list):
        return [_scrub_nested(item) for item in v]
    return redact_secrets(v)


def _tool_name(entry):
    """Extract a tool name from one tool entry, or "?" when none is found."""
    if isinstance(entry, str):
        # Assumes a bare string in a tool list is the tool's name; it is still
        # passed through the secret filter in case it is not.
        return redact_secrets(entry)
    if not isinstance(entry, dict):
        return "?"
    name = entry.get("name")
    if not isinstance(name, str):
        # Fall back to the nested {"function": {"name": ...}} layout.
        fn = entry.get("function")
        name = fn.get("name") if isinstance(fn, dict) else None
    return name if isinstance(name, str) else "?"


def clean_tool_calls(tc):
    """Keep tool names only, drop all arguments.

    * list  -> list of ``{"name": ...}`` dicts, one per entry
    * dict  -> a single ``{"name": ...}`` dict
    * str   -> parsed as JSON when possible (CSV cells hold JSON text) and
               cleaned as above; blank or purely numeric cells are returned
               unchanged; any other text becomes "[REDACTED]" because names
               cannot be separated from arguments
    * other -> returned unchanged
    """
    if isinstance(tc, list):
        return [{"name": _tool_name(t)} for t in tc]
    if isinstance(tc, dict):
        return {"name": _tool_name(tc)}
    if isinstance(tc, str):
        if not tc.strip():
            return tc
        try:
            parsed = json.loads(tc)
        except ValueError:
            return "[REDACTED]"
        if isinstance(parsed, (list, dict)):
            return clean_tool_calls(parsed)
        if parsed is None or isinstance(parsed, (int, float)):
            return tc
        return "[REDACTED]"
    return tc


def clean_record(rec: dict) -> dict:
    """Return a copy of *rec* containing only allow-listed, scrubbed fields.

    Keys are matched case-insensitively against STRIP_FIELDS / KEEP_FIELDS and
    keep their original spelling in the result. Empty and non-string keys
    (e.g. the ``None`` key csv.DictReader uses for surplus cells) are dropped.
    """
    out = {}
    for k, v in rec.items():
        if not k or not isinstance(k, str):
            continue
        key = k.lower()
        if key in STRIP_FIELDS or key not in KEEP_FIELDS:
            continue
        # Compare the lowercased key so "Tool_Calls" / "TOOLS" columns also
        # have their arguments removed.
        if key in _TOOL_FIELDS:
            out[k] = clean_tool_calls(v)
        else:
            out[k] = _scrub_nested(v)
    return out


def process_json(data):
    """Clean parsed JSON: a list of records, a single record, or pass-through.

    Non-dict items inside a list are discarded.
    """
    if isinstance(data, list):
        return [clean_record(r) for r in data if isinstance(r, dict)]
    if isinstance(data, dict):
        return clean_record(data)
    return data


def process_csv(text: str) -> list:
    """Parse CSV *text* (first row is the header) and clean every row."""
    reader = csv.DictReader(io.StringIO(text))
    return [clean_record(dict(row)) for row in reader]


def records_to_csv(records: list) -> str:
    """Serialize cleaned records as CSV text.

    Columns are the union of all record keys in first-seen order. Dict and
    list values are JSON-encoded so each fits in a single cell.
    """
    fieldnames = list(dict.fromkeys(k for rec in records for k in rec))
    buf = io.StringIO()
    writer = csv.DictWriter(buf, fieldnames=fieldnames)
    writer.writeheader()
    for rec in records:
        writer.writerow({
            k: json.dumps(v) if isinstance(v, (dict, list)) else v
            for k, v in rec.items()
        })
    return buf.getvalue()


def main():
    """Command-line entry point; see the module docstring for usage."""
    args = sys.argv[1:]
    if not args or args[0] in ("-h", "--help"):
        print(__doc__)
        return

    src = args[0]
    dst = args[1] if len(args) > 1 else None

    if src == "-":
        raw = sys.stdin.read()
        ext = ""
    else:
        # utf-8-sig transparently drops a leading BOM (common in spreadsheet
        # exports) that would otherwise break format sniffing and json.loads.
        raw = Path(src).read_text(encoding="utf-8-sig")
        ext = Path(src).suffix.lower()
    if raw.startswith("\ufeff"):
        raw = raw[1:]

    head = raw[:200]
    is_json = raw.lstrip().startswith(("{", "["))
    if ext == ".csv" or (not is_json and "," in head and "\n" in head):
        result = process_csv(raw)
    else:
        try:
            result = process_json(json.loads(raw))
        except json.JSONDecodeError as exc:
            sys.exit(f"safe-export: input is not valid JSON: {exc}")

    as_csv = (
        dst is not None
        and Path(dst).suffix.lower() == ".csv"
        and isinstance(result, list)
    )
    out = records_to_csv(result) if as_csv else json.dumps(result, indent=2)

    if dst:
        # newline="" keeps the csv module's own line terminators intact.
        with Path(dst).open("w", encoding="utf-8", newline="") as fh:
            fh.write(out)
        print(f"Wrote {len(result) if isinstance(result, list) else 1} record(s) to {dst}")
    else:
        print(out)


if __name__ == "__main__":
    main()