Stirling-PDF/scripts/analyze_pdf_json.py
#!/usr/bin/env python3
"""
Quick inspection utility for PDF→JSON exports.

Usage:
    python scripts/analyze_pdf_json.py path/to/export.json

The script prints size and font statistics so we can confirm whether the
lightweight export (no COS dictionaries) is active and how large the font
payloads are.
"""

from __future__ import annotations

import argparse
import base64
import json
import math
from pathlib import Path
from dataclasses import dataclass, field
from typing import Any, Dict, Iterable, List, Tuple


def human_bytes(value: float) -> str:
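    """Format a byte count as a human-readable string (B, KB, MB, GB, TB)."""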
    if value <= 0:
        return "0 B"
    units = ["B", "KB", "MB", "GB", "TB"]
    order = min(int(math.log(value, 1024)), len(units) - 1)
    scaled = value / (1024**order)
    return f"{scaled:.1f} {units[order]}"


def base64_payload_size(encoded: str | None) -> int:
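    """Approximate the decoded size of a base64 payload (3 bytes per 4 characters, padding ignored)."""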
    if not encoded:
        return 0
    length = len(encoded.strip())
    if length == 0:
        return 0
    return int(length * 0.75)


@dataclass
class FontBreakdown:
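    """Aggregate counts and payload sizes for the export's font entries."""
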
    total: int = 0
    with_cos: int = 0
    with_program: int = 0
    with_web_program: int = 0
    with_pdf_program: int = 0
    program_bytes: int = 0
    web_program_bytes: int = 0
    pdf_program_bytes: int = 0
    metadata_bytes: int = 0
    sample_cos_ids: List[Tuple[str | None, str | None]] = field(default_factory=list)


@dataclass
class PageBreakdown:
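    """Aggregate element counts and structural sizes for the export's pages."""
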
    page_count: int = 0
    total_text_elements: int = 0
    total_image_elements: int = 0
    text_payload_chars: int = 0
    text_struct_bytes: int = 0
    image_struct_bytes: int = 0
    resources_bytes: int = 0
    content_stream_bytes: int = 0
    annotations_bytes: int = 0


@dataclass
class DocumentBreakdown:
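    """Top-level size breakdown for a single JSON export."""
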
    total_bytes: int
    fonts: FontBreakdown
    pages: PageBreakdown
    metadata_bytes: int
    xmp_bytes: int
    form_fields_bytes: int
    lazy_flag_bytes: int


def approx_struct_size(obj: Any) -> int:
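    """Estimate how many bytes a value occupies when serialised as compact JSON."""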
    if obj is None:
        return 0
    return len(json.dumps(obj, separators=(",", ":")))


def analyze_fonts(fonts: Iterable[Dict[str, Any]]) -> FontBreakdown:
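    """Collect counts and payload sizes for every font entry in the export."""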
    total = 0
    with_cos = 0
    with_prog = 0
    with_web_prog = 0
    with_pdf_prog = 0
    program_bytes = 0
    web_program_bytes = 0
    pdf_program_bytes = 0
    metadata_bytes = 0
    sample_cos_ids: List[Tuple[str | None, str | None]] = []
    for font in fonts:
        total += 1
        font_id = font.get("id")
        uid = font.get("uid")
        cos_value = font.get("cosDictionary")
        if cos_value:
            with_cos += 1
            if len(sample_cos_ids) < 5:
                sample_cos_ids.append((font_id, uid))
        metadata_bytes += approx_struct_size(
            {k: v for k, v in font.items() if k not in {"program", "webProgram", "pdfProgram"}}
        )
        program = font.get("program")
        web_program = font.get("webProgram")
        pdf_program = font.get("pdfProgram")
        if program:
            with_prog += 1
            program_bytes += base64_payload_size(program)
        if web_program:
            with_web_prog += 1
            web_program_bytes += base64_payload_size(web_program)
        if pdf_program:
            with_pdf_prog += 1
            pdf_program_bytes += base64_payload_size(pdf_program)
    return FontBreakdown(
        total=total,
        with_cos=with_cos,
        with_program=with_prog,
        with_web_program=with_web_prog,
        with_pdf_program=with_pdf_prog,
        program_bytes=program_bytes,
        web_program_bytes=web_program_bytes,
        pdf_program_bytes=pdf_program_bytes,
        metadata_bytes=metadata_bytes,
        sample_cos_ids=sample_cos_ids,
    )


def analyze_pages(pages: Iterable[Dict[str, Any]]) -> PageBreakdown:
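    """Collect element counts and approximate structural sizes for every page."""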
    page_count = 0
    total_text = 0
    total_images = 0
    text_chars = 0
    text_struct_bytes = 0
    image_struct_bytes = 0
    resources_bytes = 0
    stream_bytes = 0
    annotations_bytes = 0
    for page in pages:
        page_count += 1
        texts = page.get("textElements") or []
        images = page.get("imageElements") or []
        resources = page.get("resources")
        streams = page.get("contentStreams") or []
        annotations = page.get("annotations") or []
        total_text += len(texts)
        total_images += len(images)
        text_struct_bytes += approx_struct_size(texts)
        image_struct_bytes += approx_struct_size(images)
        resources_bytes += approx_struct_size(resources)
        stream_bytes += approx_struct_size(streams)
        annotations_bytes += approx_struct_size(annotations)
        for elem in texts:
            text = elem.get("text")
            if text:
                text_chars += len(text)
    return PageBreakdown(
        page_count=page_count,
        total_text_elements=total_text,
        total_image_elements=total_images,
        text_payload_chars=text_chars,
        text_struct_bytes=text_struct_bytes,
        image_struct_bytes=image_struct_bytes,
        resources_bytes=resources_bytes,
        content_stream_bytes=stream_bytes,
        annotations_bytes=annotations_bytes,
    )


def analyze_document(document: Dict[str, Any], total_size: int) -> DocumentBreakdown:
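    """Combine font, page, and top-level section statistics into one breakdown."""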
    fonts = document.get("fonts") or []
    pages = document.get("pages") or []
    metadata = document.get("metadata") or {}
    font_stats = analyze_fonts(fonts)
    page_stats = analyze_pages(pages)
    return DocumentBreakdown(
        total_bytes=total_size,
        fonts=font_stats,
        pages=page_stats,
        metadata_bytes=approx_struct_size(metadata),
        xmp_bytes=base64_payload_size(document.get("xmpMetadata")),
        form_fields_bytes=approx_struct_size(document.get("formFields")),
        lazy_flag_bytes=approx_struct_size(document.get("lazyImages")),
    )


def main() -> None:
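    """Parse arguments, load the JSON export, and print the size report."""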
    parser = argparse.ArgumentParser(description="Inspect a PDF JSON export.")
    parser.add_argument("json_path", type=Path, help="Path to the JSON export.")
    args = parser.parse_args()
    json_path = args.json_path
    if not json_path.exists():
        raise SystemExit(f"File not found: {json_path}")
    file_size = json_path.stat().st_size
    print(f"File: {json_path}")
    print(f"Size: {human_bytes(file_size)} ({file_size:,} bytes)")
    with json_path.open("r", encoding="utf-8") as handle:
        document = json.load(handle)
    if not isinstance(document, dict):
        raise SystemExit("Unexpected JSON structure (expected an object at root).")
    summary = analyze_document(document, file_size)
    page_stats = summary.pages
    print(f"Pages: {page_stats.page_count}")
    print(f"Total text elements: {page_stats.total_text_elements:,}")
    print(f"Total image elements: {page_stats.total_image_elements:,}")
    print(
        f"Page structural bytes (text arrays + images + streams + annotations): "
        f"{human_bytes(page_stats.text_struct_bytes + page_stats.image_struct_bytes + page_stats.content_stream_bytes + page_stats.annotations_bytes)}"
    )
    font_stats = summary.fonts
    print("\nFont summary:")
    print(f" Fonts total: {font_stats.total}")
    print(f" Fonts with cosDictionary: {font_stats.with_cos}")
    print(f" Fonts with program: {font_stats.with_program}")
    print(f" Fonts with webProgram: {font_stats.with_web_program}")
    print(f" Fonts with pdfProgram: {font_stats.with_pdf_program}")
    print(
        " Payload sizes:"
        f" program={human_bytes(font_stats.program_bytes)},"
        f" webProgram={human_bytes(font_stats.web_program_bytes)},"
        f" pdfProgram={human_bytes(font_stats.pdf_program_bytes)},"
        f" metadata={human_bytes(font_stats.metadata_bytes)}"
    )
    if font_stats.sample_cos_ids:
        print(" Sample fonts still carrying cosDictionary:")
        for idx, (font_id, uid) in enumerate(font_stats.sample_cos_ids, start=1):
            print(f" {idx}. id={font_id!r}, uid={uid!r}")
    else:
        print(" No fonts retain cosDictionary entries.")
    print("\nOther sections:")
    print(f" Metadata bytes: {human_bytes(summary.metadata_bytes)}")
    print(f" XMP metadata bytes: {human_bytes(summary.xmp_bytes)}")
    print(f" Form fields bytes: {human_bytes(summary.form_fields_bytes)}")
    print(f" Lazy flag bytes: {summary.lazy_flag_bytes}")
    print(
        f" Text payload characters (not counting JSON overhead): "
        f"{page_stats.text_payload_chars:,}"
    )
    print(
        f" Approx text structure bytes: {human_bytes(page_stats.text_struct_bytes)}"
    )
    print(
        f" Approx image structure bytes: {human_bytes(page_stats.image_struct_bytes)}"
    )
    print(
        f" Approx content stream bytes: {human_bytes(page_stats.content_stream_bytes)}"
    )
    print(
        f" Approx annotations bytes: {human_bytes(page_stats.annotations_bytes)}"
    )


if __name__ == "__main__":
    main()