#!/usr/bin/env python3
"""
Checks URLs from a CSV file (e.g. .pdf files on justice.gov) by swapping the file
extension for common media extensions (video, image, audio, archive, document) and
downloads any candidate that responds with a non-HTML 200.

Usage:
    python run.py urls.csv
    python run.py  (prompts for CSV path)

CSV Format:
    Single column with URLs, one per line. Header row is optional.
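
    Example urls.csv (illustrative paths, not real documents):
        url
        https://www.justice.gov/some-case/document-01.pdf
        https://www.justice.gov/some-case/document-02.pdf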

Requirements (pip install):
    requests
"""

import sys
import os
import re
import csv
import requests
from urllib.parse import urlparse

MEDIA_EXTENSIONS = {
    "Video": [
        ".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", ".webm",
        ".mpeg", ".mpg", ".3gp", ".ogv", ".ts", ".m4v", ".vob",
        ".asf", ".divx", ".rm", ".rmvb", ".swf", ".m2ts", ".mts",
        ".f4v", ".gifv", ".dv", ".mxf", ".roq", ".nsv", ".amv",
        ".m2v", ".svi", ".3g2", ".mpe", ".yuv", ".wmv9",
    ],
    "Image": [
        ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif",
        ".webp", ".svg", ".ico", ".heic", ".heif", ".avif",
        ".raw", ".cr2", ".nef", ".arw", ".dng", ".psd", ".ai",
        ".eps", ".jfif", ".jp2", ".jpx",
    ],
    "Audio": [
        ".mp3", ".wav", ".flac", ".aac", ".ogg", ".wma", ".m4a",
        ".aiff", ".aif", ".opus", ".alac", ".ape", ".mid", ".midi",
        ".ac3", ".amr", ".au", ".ra", ".mka",
    ],
    "Archive": [
        ".zip", ".rar", ".7z", ".tar", ".gz", ".tar.gz", ".tgz",
        ".bz2", ".xz", ".cab", ".iso", ".dmg",
    ],
    "Document": [
        ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx",
        ".rtf", ".odt", ".ods", ".odp", ".txt", ".csv",
        ".epub", ".mobi", ".tex", ".pages", ".numbers", ".key",
    ],
}

OUTPUT_DIR = "files"


def pass_age_verification(session: requests.Session, base_url: str):
    """
    Handles the justice.gov age verification gate.
    Sets the required cookie, scoped to the URL's host, so age-gated
    content can be fetched directly.
    """
    # justice.gov uses a cookie named 'justiceGovAgeVerified' for age verification.
    # Setting it up front avoids being redirected to the age-gate page.
    domain = urlparse(base_url).netloc or "www.justice.gov"
    print("[*] Setting age verification cookie...")
    session.cookies.set('justiceGovAgeVerified', 'true', domain=domain, path='/')
    print("[+] Age verification cookie set.")


def strip_extension(url: str) -> str:
    """Remove the existing file extension from the URL."""
    return re.sub(r"\.[a-zA-Z0-9]+$", "", url)
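
# How the extension swap works (illustrative URL, not a real document):
#   original:   https://www.justice.gov/case/exhibit-1.pdf
#   stripped:   https://www.justice.gov/case/exhibit-1            (strip_extension)
#   candidates: https://www.justice.gov/case/exhibit-1.mp4, .jpg, .mp3, ...  (check_url)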


def check_url(session: requests.Session, url: str) -> requests.Response | None:
    """HEAD then GET a URL. Returns the streamed GET response if it looks like a real file."""
    try:
        head = session.head(url, timeout=15, allow_redirects=True)
        # Some servers reject HEAD outright (405); fall through to GET in that case.
        if head.status_code not in (200, 405):
            return None

        content_type = head.headers.get("Content-Type", "").lower()
        # Skip HTML pages (error pages, redirects to the age gate, etc.)
        if head.status_code == 200 and "text/html" in content_type:
            return None

        # Looks promising - do a full GET, streamed so large files aren't buffered in memory
        resp = session.get(url, timeout=60, stream=True)
        if resp.status_code == 200 and "text/html" not in resp.headers.get("Content-Type", "").lower():
            return resp
        resp.close()

    except requests.RequestException:
        pass
    return None


def read_urls_from_csv(csv_path: str) -> list[str]:
    """Read URLs from a CSV file. Expects URLs in the first column."""
    urls = []
    with open(csv_path, "r", encoding="utf-8") as f:
        reader = csv.reader(f)
        for row in reader:
            if not row:
                continue
            url = row[0].strip()
            # Skip empty rows and potential header rows
            if url and url.lower() not in ("url", "urls", "link", "links"):
                urls.append(url)
    return urls


def process_url(session: requests.Session, original_url: str, all_extensions: list[tuple[str, str]]) -> int:
    """Try each candidate extension for a single URL; return 1 if a file was saved, else 0."""
    base_url_no_ext = strip_extension(original_url)

    print(f"\n[*] Base URL: {base_url_no_ext}")
    print(f"[*] Testing up to {len(all_extensions)} file extensions across {len(MEDIA_EXTENSIONS)} categories...\n")

    found = 0
    current_category = None
    for category, ext in all_extensions:
        if category != current_category:
            current_category = category
            print(f"\n  -- {category} --")

        test_url = base_url_no_ext + ext
        label = ext.ljust(10)
        print(f"    {label} - checking...", end="", flush=True)

        resp = check_url(session, test_url)
        if resp:
            # Save into a category subfolder
            cat_dir = os.path.join(OUTPUT_DIR, category.lower())
            os.makedirs(cat_dir, exist_ok=True)

            parsed = urlparse(test_url)
            filename = os.path.basename(parsed.path)
            if not filename:
                filename = f"file_{found}{ext}"

            filepath = os.path.join(cat_dir, filename)

            # Stream the body to disk in 256 KiB chunks, then release the connection.
            with resp, open(filepath, "wb") as f:
                for chunk in resp.iter_content(chunk_size=1024 * 256):
                    f.write(chunk)

            size_mb = os.path.getsize(filepath) / (1024 * 1024)
            content_type = resp.headers.get("Content-Type", "unknown")
            print(f" FOUND! Saved -> {filepath} ({size_mb:.2f} MB, {content_type})")
            found += 1
            print("\n    [*] File found, skipping remaining checks for this URL.")
            break
        else:
            print(" --")

    return found


def main():
    # Get CSV path from command line or prompt
    if len(sys.argv) > 1:
        csv_path = sys.argv[1]
    else:
        csv_path = input("Enter CSV file path: ").strip()

    if not csv_path:
        print("No CSV path provided. Exiting.")
        sys.exit(1)

    if not os.path.exists(csv_path):
        print(f"File not found: {csv_path}")
        sys.exit(1)

    # Read URLs from CSV
    urls = read_urls_from_csv(csv_path)
    if not urls:
        print("No URLs found in CSV. Exiting.")
        sys.exit(1)

    print(f"[*] Loaded {len(urls)} URL(s) from {csv_path}")

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    session = requests.Session()
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/120.0.0.0 Safari/537.36",
    })

    # Handle age gate
    pass_age_verification(session, urls[0])

    # Build extension list once
    all_extensions = []
    for category, exts in MEDIA_EXTENSIONS.items():
        all_extensions.extend((category, ext) for ext in exts)

    # Process each URL, skipping duplicates
    total_found = 0
    processed_bases = set()
    skipped = 0

    for i, url in enumerate(urls, 1):
        base_url = strip_extension(url)

        if base_url in processed_bases:
            print(f"\n[*] Skipping URL {i}/{len(urls)} (duplicate base URL)")
            print(f"[*] {url}")
            skipped += 1
            continue

        processed_bases.add(base_url)

        print(f"\n{'='*60}")
        print(f"[*] Processing URL {i}/{len(urls)}")
        print(f"[*] {url}")
        print('='*60)

        found = process_url(session, url, all_extensions)
        total_found += found
        print(f"\n[{'+'if found else '!'}] URL complete. {found} file(s) found.")

    print(f"\n{'='*60}")
    print(f"[{'+'if total_found else '!'}] All done. {total_found} total file(s) saved to ./{OUTPUT_DIR}/")
    print(f"[*] Processed {len(urls) - skipped} URL(s), skipped {skipped} duplicate(s)")
    print('='*60)


if __name__ == "__main__":
    main()