#!/usr/bin/env python3
"""
safe-export.py — strip prompts/PII/API keys from LangSmith or Helicone exports.

Outputs a clean schema: timestamps, model, token counts, tool calls, retries,
session IDs, error codes. Runs entirely locally. No network calls.

The output is always JSON, whatever the input format or the output file's
extension.

Usage:
    python safe-export.py input.json [output.json]
    python safe-export.py input.csv [output.csv]
    cat runs.json | python safe-export.py -
"""
import sys, json, csv, re, io
from pathlib import Path
# Keys whose values carry prompt/response text; clean_record drops them
# outright. Lookups use key.lower(), so every entry here must be lowercase.
STRIP_FIELDS = {
    # raw model input/output payloads
    "input",
    "inputs",
    "output",
    "outputs",
    "prompt",
    "response",
    "request_body",
    "response_body",
    # chat-message structure and roles
    "messages",
    "content",
    "text",
    "system",
    "user",
    "assistant",
    # serialized call configuration
    "kwargs",
    "serialized",
}
# Regex for secrets and PII. One alternative per kind of value:
#   1. "sk-" style API keys. Hyphens and underscores are allowed after the
#      prefix so that segmented keys (e.g. "sk-proj-...", "sk-ant-...") are
#      matched as a whole instead of slipping through.
#   2. HTTP bearer tokens ("Bearer <token>").
#   3. "api_key" / "api-key" / "apikey" followed by a separator and a value.
#   4. E-mail addresses.
# re.I makes every alternative case-insensitive.
SECRET_RE = re.compile(
    r"(sk-[A-Za-z0-9_\-]{20,}|Bearer\s+[A-Za-z0-9\-_.]{20,}|"
    r"api[_-]?key[\s:=]+\S+|"
    r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})",
    re.I,
)
# Allow-list of record keys that clean_record copies to the output; any key
# not listed here is dropped. Matching also tries key.lower(), so entries
# should be lowercase.
KEEP_FIELDS = {
    # identifiers
    "id",
    "run_id",
    "trace_id",
    "session_id",
    "parent_run_id",
    # timestamps
    "start_time",
    "end_time",
    "created_at",
    "timestamp",
    # what ran
    "run_type",
    "name",
    "model",
    "model_name",
    # token accounting
    "prompt_tokens",
    "completion_tokens",
    "total_tokens",
    "input_tokens",
    "output_tokens",
    # timing
    "latency",
    "latency_ms",
    "duration_ms",
    # outcome
    "status",
    "error",
    "error_type",
    "error_code",
    # retries and tool usage
    "retry_count",
    "attempt",
    "tool_calls",
    "tools",
    # free-form annotations
    "tags",
    "metadata",
}
def redact_secrets(v):
    """Return a copy of *v* with every SECRET_RE match replaced by "[REDACTED]".

    Dicts and lists are rebuilt recursively (dict keys are left untouched,
    only values are scanned); strings are substituted; any other value is
    returned as-is.
    """
    if isinstance(v, dict):
        return {key: redact_secrets(item) for key, item in v.items()}
    if isinstance(v, list):
        return [redact_secrets(item) for item in v]
    if not isinstance(v, str):
        # Numbers, booleans, None, ... cannot hold secret text.
        return v
    return SECRET_RE.sub("[REDACTED]", v)
def _tool_name(entry):
    """Return the tool name carried by one tool-call entry, or "?" if none."""
    if not isinstance(entry, dict):
        # Free-form entries (strings, numbers, ...) could embed arguments,
        # so nothing from them is kept.
        return "?"
    if "name" in entry:
        return entry["name"]
    # Fall back to the nested {"function": {"name": ...}} shape; tolerate a
    # missing, null, or non-dict "function" value.
    function = entry.get("function")
    if isinstance(function, dict):
        return function.get("name", "?")
    return "?"


def clean_tool_calls(tc):
    """Keep tool names only, drop all arguments.

    Handled shapes:
      * list -> ``[{"name": ...}, ...]``, one item per entry; entries that
        are not dicts, or that carry no name, become ``{"name": "?"}``.
      * dict -> treated as a single tool call and reduced to
        ``{"name": ...}``.
      * str  -> if it is a JSON-encoded list or object (the form tool calls
        take in a CSV cell) it is parsed and reduced as above; any other
        string is returned unchanged.
      * anything else (``None``, numbers, ...) is returned unchanged.
    """
    if isinstance(tc, str):
        try:
            parsed = json.loads(tc)
        except ValueError:
            # Not JSON (json.JSONDecodeError is a ValueError): leave as-is.
            return tc
        if isinstance(parsed, (list, dict)):
            return clean_tool_calls(parsed)
        return tc
    if isinstance(tc, dict):
        return {"name": _tool_name(tc)}
    if isinstance(tc, list):
        return [{"name": _tool_name(t)} for t in tc]
    return tc
def _strip_nested(value):
    """Recursively drop STRIP_FIELDS keys from dicts nested inside *value*."""
    if isinstance(value, dict):
        return {
            key: _strip_nested(item)
            for key, item in value.items()
            if not (isinstance(key, str) and key.lower() in STRIP_FIELDS)
        }
    if isinstance(value, list):
        return [_strip_nested(item) for item in value]
    return value


def clean_record(rec: dict) -> dict:
    """Return a sanitized copy of one exported run record.

    For each key of *rec*:
      * empty/None keys are skipped;
      * keys listed in STRIP_FIELDS (case-insensitive) are dropped;
      * keys not listed in KEEP_FIELDS (exact or lowercased) are dropped;
      * "tool_calls" / "tools" (any letter case) go through
        clean_tool_calls, so only tool names survive;
      * every other kept value has STRIP_FIELDS keys removed from any nested
        dicts and is then passed through redact_secrets.

    Kept keys retain their original spelling in the result.
    """
    out = {}
    for k, v in rec.items():
        if not k:
            continue
        lowered = k.lower()
        if lowered in STRIP_FIELDS:
            continue
        if k not in KEEP_FIELDS and lowered not in KEEP_FIELDS:
            continue
        # Compare the lowercased key: the keep filter above is
        # case-insensitive, so "Tool_Calls" must not bypass argument
        # stripping.
        if lowered in ("tool_calls", "tools"):
            out[k] = clean_tool_calls(v)
        else:
            # Kept containers such as "metadata" can themselves hold
            # prompt-bearing keys; remove those before scanning for secrets.
            out[k] = redact_secrets(_strip_nested(v))
    return out
def process_json(data):
    """Sanitize parsed JSON.

    A dict is cleaned as a single record. A list yields the cleaned dict
    items in order; list items that are not dicts are omitted. Any other
    value is returned unchanged.
    """
    if isinstance(data, dict):
        return clean_record(data)
    if not isinstance(data, list):
        return data
    records = (item for item in data if isinstance(item, dict))
    return [clean_record(item) for item in records]
def process_csv(text: str) -> list:
    """Parse *text* as CSV with a header row and return the cleaned rows.

    Each data row is turned into a plain dict keyed by the header names and
    passed through clean_record.
    """
    rows = csv.DictReader(io.StringIO(text))
    cleaned = []
    for row in rows:
        cleaned.append(clean_record(dict(row)))
    return cleaned
def _read_source(src: str) -> str:
    """Return the text of *src* ("-" means stdin) without a leading BOM."""
    if src == "-":
        raw = sys.stdin.read()
    else:
        path = Path(src)
        try:
            # "utf-8-sig" decodes UTF-8 and swallows a byte-order mark.
            raw = path.read_text(encoding="utf-8-sig")
        except UnicodeDecodeError:
            # Not UTF-8: fall back to the platform default encoding.
            raw = path.read_text()
    # A BOM would make json.loads fail and would corrupt the first CSV
    # header, so drop one that arrived through stdin as well.
    if raw.startswith("\ufeff"):
        raw = raw[1:]
    return raw


def main():
    """Command-line entry point.

    Reads the export named by the first argument ("-" for stdin), cleans it
    as CSV or JSON, and writes the result as indented JSON to the file named
    by the optional second argument, or to stdout when none is given.
    With no arguments, or with -h/--help, prints the module docstring.

    Input is handled as CSV when the source has a ".csv" suffix, or when it
    does not start with "{" or "[" and its first 200 characters contain both
    a comma and a newline; otherwise it is parsed as JSON.
    """
    args = sys.argv[1:]
    if not args or args[0] in ("-h", "--help"):
        print(__doc__)
        return
    src = args[0]
    dst = args[1] if len(args) > 1 else None
    raw = _read_source(src)
    ext = "" if src == "-" else Path(src).suffix.lower()
    is_json = raw.lstrip().startswith(("{", "["))
    head = raw[:200]
    looks_like_csv = not is_json and "," in head and "\n" in head
    if ext == ".csv" or looks_like_csv:
        result = process_csv(raw)
    else:
        result = process_json(json.loads(raw))
    # The output is JSON for both input formats.
    out = json.dumps(result, indent=2)
    if dst:
        Path(dst).write_text(out, encoding="utf-8")
        count = len(result) if isinstance(result, list) else 1
        print(f"Wrote {count} record(s) to {dst}")
    else:
        print(out)


if __name__ == "__main__":
    main()