#!/usr/bin/env python3
"""
Download large batches of PDF URLs into a local directory so they can be fed to
scripts/harvest_type3_fonts.py (or any other processing pipeline).

Usage examples:

    # Download every URL listed in pdf_urls.txt into tmp/type3-pdfs
    python scripts/download_pdf_samples.py \
        --urls-file pdf_urls.txt \
        --output-dir tmp/type3-pdfs

    # Mix inline URLs with a file and use 16 concurrent downloads
    python scripts/download_pdf_samples.py \
        --urls https://example.com/a.pdf https://example.com/b.pdf \
        --urls-file more_urls.txt \
        --output-dir tmp/type3-pdfs \
        --workers 16
"""

from __future__ import annotations

import argparse
import concurrent.futures
import hashlib
import os
import re
import sys
from pathlib import Path
from typing import Iterable, List, Optional, Set, Tuple
from urllib.parse import unquote, urlparse

import requests
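
# `requests` is the only third-party dependency in this script; everything else
# is standard library. Install it (e.g. `pip install requests`) if needed.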


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Bulk download PDF URLs.")
    parser.add_argument(
        "--urls",
        nargs="*",
        default=[],
        help="Inline list of PDF URLs (can be combined with --urls-file).",
    )
    parser.add_argument(
        "--urls-file",
        action="append",
        help="Text file containing one URL per line (can be repeated).",
    )
    parser.add_argument(
        "--output-dir",
        default="tmp/harvest-pdfs",
        help="Directory to store downloaded PDFs (default: %(default)s).",
    )
    parser.add_argument(
        "--workers",
        type=int,
        default=min(8, (os.cpu_count() or 4) * 2),
        help="Number of concurrent downloads (default: %(default)s).",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=120,
        help="Per-request timeout in seconds (default: %(default)s).",
    )
    parser.add_argument(
        "--overwrite",
        action="store_true",
        help="Overwrite existing files (default: skip already downloaded PDFs).",
    )
    return parser.parse_args()
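

# A --urls-file is plain text with one URL per line; blank lines and lines
# starting with "#" are ignored by load_urls() below. A file might look like
# this (hypothetical URLs for illustration):
#
#   # annual reports
#   https://example.com/reports/2023.pdf
#   https://example.com/datasheets/spec.pdf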


def load_urls(args: argparse.Namespace) -> List[str]:
    urls: List[str] = []
    seen: Set[str] = set()

    def add(url: str) -> None:
        clean = url.strip()
        if not clean or clean.startswith("#"):
            return
        if clean not in seen:
            seen.add(clean)
            urls.append(clean)

    for url in args.urls:
        add(url)
    if args.urls_file:
        for file in args.urls_file:
            path = Path(file)
            if not path.exists():
                print(f"[WARN] URL file not found: {file}", file=sys.stderr)
                continue
            with path.open("r", encoding="utf-8") as handle:
                for line in handle:
                    add(line)
    if not urls:
        raise SystemExit("No URLs supplied. Use --urls and/or --urls-file.")
    return urls


def sanitize_filename(name: str) -> str:
    return re.sub(r"[^A-Za-z0-9._-]+", "_", name).strip("_") or "download"
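

# For reference, sanitize_filename("report (final).pdf") returns
# "report_final_.pdf": each run of characters outside [A-Za-z0-9._-] collapses
# to a single underscore, and leading/trailing underscores are stripped.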


def build_filename(url: str, output_dir: Path) -> Path:
    parsed = urlparse(url)
    candidate = Path(unquote(parsed.path)).name
    if not candidate:
        candidate = "download.pdf"
    candidate = sanitize_filename(candidate)
    if not candidate.lower().endswith(".pdf"):
        candidate += ".pdf"
    target = output_dir / candidate
    if not target.exists():
        return target
    stem = target.stem
    suffix = target.suffix
    digest = hashlib.sha1(url.encode("utf-8")).hexdigest()[:8]
    return output_dir / f"{stem}-{digest}{suffix}"


def download_pdf(
    url: str,
    output_dir: Path,
    timeout: int,
    overwrite: bool,
) -> Tuple[str, Optional[Path], Optional[str]]:
    try:
        dest = build_filename(url, output_dir)
        if dest.exists() and not overwrite:
            return url, dest, "exists"

        response = requests.get(url, stream=True, timeout=timeout)
        response.raise_for_status()

        content_type = response.headers.get("Content-Type", "").lower()
        if "pdf" not in content_type and not url.lower().endswith(".pdf"):
            # Peek at the first few bytes to be safe
            peek = response.raw.read(5, decode_content=True)
            if not peek.startswith(b"%PDF"):
                return url, None, f"Skipping non-PDF content-type ({content_type or 'unknown'})"
            # The peeked bytes were already consumed from the stream, so
            # response.content now holds only the remainder of the body.
            content = peek + response.content
        else:
            content = response.content

        output_dir.mkdir(parents=True, exist_ok=True)
        dest.write_bytes(content)
        return url, dest, None
    except Exception as exc:  # pylint: disable=broad-except
        return url, None, str(exc)


def main() -> None:
    args = parse_args()
    urls = load_urls(args)
    output_dir = Path(args.output_dir).resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Downloading {len(urls)} PDFs to {output_dir} using {args.workers} workers...")

    successes = 0
    skipped = 0
    failures: List[Tuple[str, str]] = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=args.workers) as executor:
        future_to_url = {
            executor.submit(
                download_pdf, url, output_dir, args.timeout, args.overwrite
            ): url
            for url in urls
        }
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            result_url, path, error = future.result()
            if error == "exists":
                skipped += 1
                print(f"[SKIP] {url} (already downloaded)")
            elif error:
                failures.append((result_url, error))
                print(f"[FAIL] {url} -> {error}", file=sys.stderr)
            else:
                successes += 1
                print(f"[OK] {url} -> {path}")

    print()
    print(f"Completed. Success: {successes}, Skipped: {skipped}, Failures: {len(failures)}")
    if failures:
        print("Failures:")
        for url, error in failures:
            print(f"  {url} -> {error}")


if __name__ == "__main__":
    main()