Stirling-PDF/scripts/download_pdf_samples.py

#!/usr/bin/env python3
"""
Download large batches of PDF URLs into a local directory so they can be fed to
scripts/harvest_type3_fonts.py (or any other processing pipeline).

Usage examples:

    # Download every URL listed in pdf_urls.txt into tmp/type3-pdfs
    python scripts/download_pdf_samples.py \
        --urls-file pdf_urls.txt \
        --output-dir tmp/type3-pdfs

    # Mix inline URLs with a file and use 16 concurrent downloads
    python scripts/download_pdf_samples.py \
        --urls https://example.com/a.pdf https://example.com/b.pdf \
        --urls-file more_urls.txt \
        --output-dir tmp/type3-pdfs \
        --workers 16
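
A --urls-file is a plain-text file with one URL per line; blank lines and lines
starting with "#" are ignored, and duplicate URLs (across files and --urls) are
downloaded only once. For example, a pdf_urls.txt might look like:

    # Type 3 font samples
    https://example.com/a.pdf
    https://example.com/b.pdf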
"""

from __future__ import annotations

import argparse
import concurrent.futures
import hashlib
import os
import re
import sys
from pathlib import Path
from typing import List, Optional, Set, Tuple
from urllib.parse import unquote, urlparse

import requests


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Bulk download PDF URLs.")
    parser.add_argument(
        "--urls",
        nargs="*",
        default=[],
        help="Inline list of PDF URLs (can be combined with --urls-file).",
    )
    parser.add_argument(
        "--urls-file",
        action="append",
        help="Text file containing one URL per line (can be repeated).",
    )
    parser.add_argument(
        "--output-dir",
        default="tmp/harvest-pdfs",
        help="Directory to store downloaded PDFs (default: %(default)s).",
    )
    parser.add_argument(
        "--workers",
        type=int,
        # Default scales with CPU count but is capped at 8,
        # e.g. min(8, 2 * 2) = 4 workers on a dual-core host.
        default=min(8, (os.cpu_count() or 4) * 2),
        help="Number of concurrent downloads (default: %(default)s).",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=120,
        help="Per-request timeout in seconds (default: %(default)s).",
    )
    parser.add_argument(
        "--overwrite",
        action="store_true",
        help="Overwrite existing files (default: skip already downloaded PDFs).",
    )
    return parser.parse_args()


def load_urls(args: argparse.Namespace) -> List[str]:
    urls: List[str] = []
    seen: Set[str] = set()

    def add(url: str) -> None:
        clean = url.strip()
        if not clean or clean.startswith("#"):
            return
        if clean not in seen:
            seen.add(clean)
            urls.append(clean)

    for url in args.urls:
        add(url)
    if args.urls_file:
        for file in args.urls_file:
            path = Path(file)
            if not path.exists():
                print(f"[WARN] URL file not found: {file}", file=sys.stderr)
                continue
            with path.open("r", encoding="utf-8") as handle:
                for line in handle:
                    add(line)
    if not urls:
        raise SystemExit("No URLs supplied. Use --urls and/or --urls-file.")
    return urls


def sanitize_filename(name: str) -> str:
    return re.sub(r"[^A-Za-z0-9._-]+", "_", name).strip("_") or "download"
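
# For example, sanitize_filename("weird name (v2).pdf") returns "weird_name_v2_.pdf":
# each run of characters outside [A-Za-z0-9._-] collapses to a single underscore.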


def build_filename(url: str, output_dir: Path) -> Path:
    parsed = urlparse(url)
    candidate = Path(unquote(parsed.path)).name
    if not candidate:
        candidate = "download.pdf"
    candidate = sanitize_filename(candidate)
    if not candidate.lower().endswith(".pdf"):
        candidate += ".pdf"
    target = output_dir / candidate
    if not target.exists():
        return target
    stem = target.stem
    suffix = target.suffix
    digest = hashlib.sha1(url.encode("utf-8")).hexdigest()[:8]
    return output_dir / f"{stem}-{digest}{suffix}"
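
# Illustration: if two distinct URLs both end in "report.pdf", the second download
# is stored as "report-<first 8 hex chars of sha1(url)>.pdf" rather than
# overwriting the file that is already on disk.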


def download_pdf(
    url: str,
    output_dir: Path,
    timeout: int,
    overwrite: bool,
) -> Tuple[str, Optional[Path], Optional[str]]:
    """Download one URL. Returns (url, path, error); error is None on success
    and the sentinel "exists" when the file was already present and kept."""
    try:
        dest = build_filename(url, output_dir)
        if dest.exists() and not overwrite:
            return url, dest, "exists"
        response = requests.get(url, stream=True, timeout=timeout)
        response.raise_for_status()
        content_type = response.headers.get("Content-Type", "").lower()
        if "pdf" not in content_type and not url.lower().endswith(".pdf"):
            # Peek at the first bytes to be safe
            peek = response.raw.read(5, decode_content=True)
            if not peek.startswith(b"%PDF"):
                return url, None, f"Skipping non-PDF content-type ({content_type or 'unknown'})"
            # response.content only yields the bytes remaining after the peek,
            # so prepend the peeked header instead of slicing it off again.
            content = peek + response.content
        else:
            content = response.content
        output_dir.mkdir(parents=True, exist_ok=True)
        dest.write_bytes(content)
        return url, dest, None
    except Exception as exc:  # pylint: disable=broad-except
        return url, None, str(exc)


def main() -> None:
    args = parse_args()
    urls = load_urls(args)
    output_dir = Path(args.output_dir).resolve()
    output_dir.mkdir(parents=True, exist_ok=True)
    print(f"Downloading {len(urls)} PDFs to {output_dir} using {args.workers} workers...")

    successes = 0
    skipped = 0
    failures: List[Tuple[str, str]] = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=args.workers) as executor:
        future_to_url = {
            executor.submit(
                download_pdf, url, output_dir, args.timeout, args.overwrite
            ): url
            for url in urls
        }
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            result_url, path, error = future.result()
            if error == "exists":
                skipped += 1
                print(f"[SKIP] {url} (already downloaded)")
            elif error:
                failures.append((result_url, error))
                print(f"[FAIL] {url} -> {error}", file=sys.stderr)
            else:
                successes += 1
                print(f"[OK] {url} -> {path}")

    print()
    print(f"Completed. Success: {successes}, Skipped: {skipped}, Failures: {len(failures)}")
    if failures:
        print("Failures:")
        for url, error in failures:
            print(f"  {url} -> {error}")


if __name__ == "__main__":
    main()