1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/env python3
"""
safe-export.py — strip prompts/PII/API keys from LangSmith or Helicone exports.
Outputs clean schema: timestamps, model, token counts, tool calls, retries,
session IDs, error codes. Runs entirely locally. No network calls.

Usage:
  python safe-export.py input.json [output.json]
  python safe-export.py input.csv  [output.csv]
  cat runs.json | python safe-export.py -
"""

import sys, json, csv, re, io
from pathlib import Path

# Deny-list of field names whose values contain prompt/response text.
# clean_record() lower-cases each key and drops the field entirely when the
# result is in this set.
STRIP_FIELDS = {
    "input", "inputs",
    "output", "outputs",
    "prompt", "response",
    "messages", "content", "text",
    "system", "user", "assistant",
    "request_body", "response_body",
    "kwargs", "serialized",
}

# Regex for secrets and PII. A single alternation with four branches; every
# match is replaced as a whole by redact_secrets():
#   1. "sk-" followed by 20 or more ASCII letters/digits
#   2. "Bearer", whitespace, then 20 or more characters from [A-Za-z0-9-_.]
#   3. "apikey" / "api_key" / "api-key", one or more whitespace/":"/"="
#      characters, then the entire following run of non-whitespace
#   4. text shaped like an email address (local@domain.tld)
# re.I makes all branches case-insensitive, so the explicit [Aa][Pp][Ii]...
# character classes in branch 3 are redundant but harmless.
SECRET_RE = re.compile(
    r"(sk-[A-Za-z0-9]{20,}|Bearer\s+[A-Za-z0-9\-_.]{20,}|"
    r"[Aa][Pp][Ii][_-]?[Kk][Ee][Yy][\s:=]+\S+|"
    r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})", re.I
)

# Allow-list of field names that survive cleaning. clean_record() keeps a
# field only when its key (as given, or lower-cased) is in this set; the
# kept value is still sanitized before it is written out.
KEEP_FIELDS = {
    # identifiers
    "id", "run_id", "trace_id", "session_id", "parent_run_id",
    # timestamps
    "start_time", "end_time", "created_at", "timestamp",
    # run / model description
    "run_type", "name", "model", "model_name",
    # token counts
    "prompt_tokens", "completion_tokens", "total_tokens",
    "input_tokens", "output_tokens",
    # timing
    "latency", "latency_ms", "duration_ms",
    # outcome and errors
    "status", "error", "error_type", "error_code",
    # retries
    "retry_count", "attempt",
    # tool usage (reduced to names only by clean_record)
    "tool_calls", "tools",
    # free-form annotations (regex-redacted only)
    "tags", "metadata",
}

def redact_secrets(v):
    """Return *v* with every SECRET_RE match replaced by "[REDACTED]".

    Lists and dicts are rebuilt recursively (dict keys are left as they
    are; only values are processed). Strings are substituted. Any other
    type is handed back unchanged.
    """
    if isinstance(v, list):
        scrubbed_items = []
        for item in v:
            scrubbed_items.append(redact_secrets(item))
        return scrubbed_items
    if isinstance(v, dict):
        scrubbed_map = {}
        for key, value in v.items():
            scrubbed_map[key] = redact_secrets(value)
        return scrubbed_map
    if isinstance(v, str):
        return SECRET_RE.sub("[REDACTED]", v)
    return v

def clean_tool_calls(tc):
    """Keep tool names only, drop all arguments.

    Accepted shapes:
      * list -> one ``{"name": ...}`` dict per entry.
      * dict -> treated as a single call and reduced to ``{"name": ...}``.
      * str  -> if it decodes as a JSON list or object (the form a tool-call
        column takes in a CSV cell) the decoded value is cleaned as above;
        any other string is returned unchanged.
      * anything else (None, numbers, ...) -> returned unchanged.

    For each entry the name is ``entry["name"]`` when that key exists,
    otherwise ``entry["function"]["name"]`` when "function" is a dict,
    otherwise "?". A bare string entry is taken to be the tool name itself.
    Malformed entries yield "?" instead of raising.
    """
    def _tool_name(entry):
        # Resolve one entry to its name without ever touching arguments.
        if isinstance(entry, str):
            return entry
        if not isinstance(entry, dict):
            return "?"
        if "name" in entry:
            return entry["name"]
        fn = entry.get("function")
        # "function" may be missing, None, or a non-dict; only a dict has a name.
        if isinstance(fn, dict):
            return fn.get("name", "?")
        return "?"

    if isinstance(tc, str):
        # CSV cells arrive as text; a JSON-encoded call list would otherwise
        # pass through with its arguments intact.
        try:
            decoded = json.loads(tc)
        except ValueError:
            return tc
        if not isinstance(decoded, (list, dict)):
            return tc
        tc = decoded
    if isinstance(tc, list):
        return [{"name": _tool_name(t)} for t in tc]
    if isinstance(tc, dict):
        return {"name": _tool_name(tc)}
    return tc

def clean_record(rec: dict) -> dict:
    """Return a new dict holding only the allow-listed, sanitized fields of *rec*.

    For each key/value pair:
      * empty or None keys are skipped;
      * keys whose lower-cased form is in STRIP_FIELDS are dropped;
      * keys not in KEEP_FIELDS (as given or lower-cased) are dropped;
      * "tool_calls" / "tools" values go through clean_tool_calls();
      * every other kept value goes through redact_secrets().

    All name comparisons are case-insensitive; the original key spelling is
    preserved in the output.

    NOTE(review): nested containers under kept keys (e.g. "metadata") are
    only regex-redacted; nested keys are not checked against STRIP_FIELDS.
    """
    out = {}
    for k, v in rec.items():
        # csv.DictReader stores surplus cells under the key None.
        if not k:
            continue
        key = k.lower()
        if key in STRIP_FIELDS:
            continue
        if k not in KEEP_FIELDS and key not in KEEP_FIELDS:
            continue
        # Match on the lower-cased key, like the allow-list check above, so
        # "Tool_Calls" / "TOOLS" cannot bypass argument stripping.
        if key in ("tool_calls", "tools"):
            out[k] = clean_tool_calls(v)
        else:
            out[k] = redact_secrets(v)
    return out

def process_json(data):
    """Clean a decoded JSON document.

    A dict is cleaned as a single record. A list is cleaned element by
    element, silently discarding elements that are not dicts. Any other
    value is returned untouched.
    """
    if isinstance(data, dict):
        return clean_record(data)
    if not isinstance(data, list):
        return data
    cleaned = []
    for entry in data:
        if isinstance(entry, dict):
            cleaned.append(clean_record(entry))
    return cleaned

def process_csv(text: str) -> list:
    """Parse *text* as CSV with a header row and clean every data row.

    Returns one cleaned dict per data row, in file order.
    """
    rows = csv.DictReader(io.StringIO(text))
    cleaned = []
    for row in rows:
        cleaned.append(clean_record(dict(row)))
    return cleaned

def main():
    """Command-line entry point: read an export, sanitize it, emit JSON.

    ``argv[1]`` is the input path, or "-" for stdin. ``argv[2]`` is an
    optional output path; without it the result is printed to stdout.
    With no arguments, or -h/--help, the module docstring is printed.

    Input is treated as CSV when the file extension is ".csv", or when the
    text does not start with "{" / "[" and its first 200 characters contain
    both a comma and a newline. Everything else is parsed as JSON.

    NOTE(review): the result is always serialized as JSON, even when the
    output path ends in ".csv" — confirm whether the usage text or this
    behavior is the intended one.

    Raises whatever the underlying read / ``json.loads`` / write raises
    (e.g. OSError, UnicodeDecodeError, json.JSONDecodeError).
    """
    args = sys.argv[1:]
    if not args or args[0] in ("-h", "--help"):
        print(__doc__)
        return
    src = args[0]
    dst = args[1] if len(args) > 1 else None

    if src == "-":
        raw = sys.stdin.read()
        ext = ""
    else:
        src_path = Path(src)
        # Explicit UTF-8 instead of the platform default; "utf-8-sig" also
        # swallows a leading byte-order mark.
        raw = src_path.read_text(encoding="utf-8-sig")
        ext = src_path.suffix.lower()
    # stdin is decoded by the interpreter, so a BOM may still be present.
    # Left in place it makes json.loads fail and corrupts the first CSV
    # header, so that column would not match the allow-list.
    if raw.startswith("\ufeff"):
        raw = raw[1:]

    looks_like_json = raw.lstrip().startswith(("{", "["))
    head = raw[:200]
    if ext == ".csv" or (not looks_like_json and "," in head and "\n" in head):
        result = process_csv(raw)
    else:
        result = process_json(json.loads(raw))
    out = json.dumps(result, indent=2)

    if dst:
        Path(dst).write_text(out, encoding="utf-8")
        count = len(result) if isinstance(result, list) else 1
        print(f"Wrote {count} record(s) to {dst}")
    else:
        print(out)

# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()