mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2026-02-17 13:52:14 +01:00
json size cleanup 450 to 35mb
This commit is contained in:
277
scripts/analyze_pdf_json.py
Normal file
277
scripts/analyze_pdf_json.py
Normal file
@@ -0,0 +1,277 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Quick inspection utility for PDF→JSON exports.
|
||||
|
||||
Usage:
|
||||
python scripts/analyze_pdf_json.py path/to/export.json
|
||||
|
||||
The script prints size and font statistics so we can confirm whether the
|
||||
lightweight export (no COS dictionaries) is active and how large the font
|
||||
payloads are.
|
||||
"""
|
||||
from __future__ import annotations

import argparse
import base64
import json
import math
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, Iterable, List, Tuple
|
||||
|
||||
|
||||
def human_bytes(value: float) -> str:
|
||||
if value <= 0:
|
||||
return "0 B"
|
||||
units = ["B", "KB", "MB", "GB", "TB"]
|
||||
order = min(int(math.log(value, 1024)), len(units) - 1)
|
||||
scaled = value / (1024**order)
|
||||
return f"{scaled:.1f} {units[order]}"
|
||||
|
||||
|
||||
def base64_payload_size(encoded: str | None) -> int:
    """Return the decoded byte size of a base64 payload without decoding it.

    Every 4 base64 characters encode 3 bytes, but trailing ``=`` padding
    characters carry no data — subtracting them makes the result exact for
    standard (RFC 4648) base64, where the old ``len * 0.75`` estimate
    over-counted by up to 2 bytes.  Returns 0 for ``None``, empty, or
    whitespace-only input.
    """
    if not encoded:
        return 0
    stripped = encoded.strip()
    if not stripped:
        return 0
    padding = stripped.count("=")
    # max() guards against degenerate input that is nothing but padding.
    return max(len(stripped) * 3 // 4 - padding, 0)
|
||||
|
||||
|
||||
@dataclass
class FontBreakdown:
    """Aggregate statistics over the document's font entries.

    Counters cover how many fonts carry each payload kind and how many
    bytes those payloads (estimated from their base64 length) occupy.
    """

    total: int = 0  # number of font entries seen
    with_cos: int = 0  # fonts still carrying a cosDictionary
    with_program: int = 0
    with_web_program: int = 0
    with_pdf_program: int = 0
    program_bytes: int = 0
    web_program_bytes: int = 0
    pdf_program_bytes: int = 0
    metadata_bytes: int = 0  # JSON size of everything except the payload fields
    # Up to 5 (id, uid) examples of fonts that retain a cosDictionary.
    # Bug fix: the default was `None`, contradicting the annotation and
    # breaking zero-arg construction; default_factory gives each instance
    # its own empty list.
    sample_cos_ids: List[Tuple[str | None, str | None]] = field(default_factory=list)
|
||||
|
||||
|
||||
@dataclass
class PageBreakdown:
    """Aggregate size/count statistics across all pages of the export.

    All *_bytes fields are approximations: the length of the compact JSON
    serialization of the corresponding structures (see approx_struct_size).
    """

    page_count: int = 0  # number of page objects processed
    total_text_elements: int = 0  # sum of len(textElements) over all pages
    total_image_elements: int = 0  # sum of len(imageElements) over all pages
    text_payload_chars: int = 0  # characters of actual text, excluding JSON overhead
    text_struct_bytes: int = 0  # serialized size of the textElements arrays
    image_struct_bytes: int = 0  # serialized size of the imageElements arrays
    resources_bytes: int = 0  # serialized size of per-page "resources"
    content_stream_bytes: int = 0  # serialized size of "contentStreams"
    annotations_bytes: int = 0  # serialized size of "annotations"
|
||||
|
||||
|
||||
@dataclass
class DocumentBreakdown:
    """Top-level summary of a PDF JSON export: overall size plus per-section stats."""

    total_bytes: int  # on-disk size of the JSON file
    fonts: FontBreakdown  # per-font payload statistics
    pages: PageBreakdown  # per-page element/size statistics
    metadata_bytes: int  # approx serialized size of the "metadata" object
    xmp_bytes: int  # estimated decoded size of the base64 "xmpMetadata"
    form_fields_bytes: int  # approx serialized size of "formFields"
    lazy_flag_bytes: int  # approx serialized size of "lazyImages"
|
||||
|
||||
|
||||
def approx_struct_size(obj: Any) -> int:
    """Approximate the byte cost of *obj* as the length of its compact JSON form.

    ``None`` counts as zero (an absent section), not as the 4 bytes of
    serialized ``null``.
    """
    return 0 if obj is None else len(json.dumps(obj, separators=(",", ":")))
|
||||
|
||||
|
||||
def analyze_fonts(fonts: Iterable[Dict[str, Any]]) -> FontBreakdown:
    """Summarize font payload sizes and flag fonts still carrying COS data.

    Accumulates directly into a FontBreakdown: payload counts/bytes for the
    three base64 program fields, the JSON size of everything else, and up
    to five (id, uid) examples of fonts that retain a cosDictionary.
    """
    stats = FontBreakdown(sample_cos_ids=[])
    payload_keys = {"program", "webProgram", "pdfProgram"}

    for font in fonts:
        stats.total += 1

        # Record fonts that still ship a raw COS dictionary (keep 5 examples).
        if font.get("cosDictionary"):
            stats.with_cos += 1
            if len(stats.sample_cos_ids) < 5:
                stats.sample_cos_ids.append((font.get("id"), font.get("uid")))

        # Everything except the binary payload fields counts as metadata.
        stats.metadata_bytes += approx_struct_size(
            {k: v for k, v in font.items() if k not in payload_keys}
        )

        program = font.get("program")
        if program:
            stats.with_program += 1
            stats.program_bytes += base64_payload_size(program)
        web_program = font.get("webProgram")
        if web_program:
            stats.with_web_program += 1
            stats.web_program_bytes += base64_payload_size(web_program)
        pdf_program = font.get("pdfProgram")
        if pdf_program:
            stats.with_pdf_program += 1
            stats.pdf_program_bytes += base64_payload_size(pdf_program)

    return stats
|
||||
|
||||
|
||||
def analyze_pages(pages: Iterable[Dict[str, Any]]) -> PageBreakdown:
    """Accumulate element counts and structural byte estimates over all pages.

    Missing sections are treated as empty collections, so each page always
    contributes at least the serialized size of an empty array.
    """
    stats = PageBreakdown()

    for page in pages:
        stats.page_count += 1
        texts = page.get("textElements") or []
        images = page.get("imageElements") or []

        stats.total_text_elements += len(texts)
        stats.total_image_elements += len(images)
        stats.text_struct_bytes += approx_struct_size(texts)
        stats.image_struct_bytes += approx_struct_size(images)
        stats.resources_bytes += approx_struct_size(page.get("resources"))
        stats.content_stream_bytes += approx_struct_size(page.get("contentStreams") or [])
        stats.annotations_bytes += approx_struct_size(page.get("annotations") or [])

        # Raw character count of the text itself, without JSON overhead.
        stats.text_payload_chars += sum(len(e.get("text") or "") for e in texts)

    return stats
|
||||
|
||||
|
||||
def analyze_document(document: Dict[str, Any], total_size: int) -> DocumentBreakdown:
    """Build the full breakdown for a parsed export dict.

    ``total_size`` is the on-disk file size; the per-section figures are
    computed from the parsed structures, with absent sections treated as
    empty.
    """
    return DocumentBreakdown(
        total_bytes=total_size,
        fonts=analyze_fonts(document.get("fonts") or []),
        pages=analyze_pages(document.get("pages") or []),
        metadata_bytes=approx_struct_size(document.get("metadata") or {}),
        xmp_bytes=base64_payload_size(document.get("xmpMetadata")),
        form_fields_bytes=approx_struct_size(document.get("formFields")),
        lazy_flag_bytes=approx_struct_size(document.get("lazyImages")),
    )
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: parse the path argument and print the size report.

    Exits via SystemExit (non-zero) when the file is missing, is not valid
    JSON, or does not have an object at its root.
    """
    parser = argparse.ArgumentParser(description="Inspect a PDF JSON export.")
    parser.add_argument("json_path", type=Path, help="Path to the JSON export.")
    args = parser.parse_args()

    json_path = args.json_path
    if not json_path.exists():
        raise SystemExit(f"File not found: {json_path}")

    file_size = json_path.stat().st_size
    print(f"File: {json_path}")
    print(f"Size: {human_bytes(file_size)} ({file_size:,} bytes)")

    # Fail with a readable one-line message instead of a traceback when the
    # export is truncated or otherwise not valid JSON.
    try:
        with json_path.open("r", encoding="utf-8") as handle:
            document = json.load(handle)
    except json.JSONDecodeError as err:
        raise SystemExit(f"Invalid JSON in {json_path}: {err}") from err

    if not isinstance(document, dict):
        raise SystemExit("Unexpected JSON structure (expected an object at root).")

    summary = analyze_document(document, file_size)
    page_stats = summary.pages
    print(f"Pages: {page_stats.page_count}")
    print(f"Total text elements: {page_stats.total_text_elements:,}")
    print(f"Total image elements: {page_stats.total_image_elements:,}")
    print(
        f"Page structural bytes (text arrays + images + streams + annotations): "
        f"{human_bytes(page_stats.text_struct_bytes + page_stats.image_struct_bytes + page_stats.content_stream_bytes + page_stats.annotations_bytes)}"
    )

    font_stats = summary.fonts
    print("\nFont summary:")
    print(f" Fonts total: {font_stats.total}")
    print(f" Fonts with cosDictionary: {font_stats.with_cos}")
    print(f" Fonts with program: {font_stats.with_program}")
    print(f" Fonts with webProgram: {font_stats.with_web_program}")
    print(f" Fonts with pdfProgram: {font_stats.with_pdf_program}")
    print(
        " Payload sizes:"
        f" program={human_bytes(font_stats.program_bytes)},"
        f" webProgram={human_bytes(font_stats.web_program_bytes)},"
        f" pdfProgram={human_bytes(font_stats.pdf_program_bytes)},"
        f" metadata={human_bytes(font_stats.metadata_bytes)}"
    )
    # cosDictionary entries are what the lightweight export is supposed to
    # strip — listing survivors helps confirm whether that pass ran.
    if font_stats.sample_cos_ids:
        print(" Sample fonts still carrying cosDictionary:")
        for idx, (font_id, uid) in enumerate(font_stats.sample_cos_ids, start=1):
            print(f" {idx}. id={font_id!r}, uid={uid!r}")
    else:
        print(" No fonts retain cosDictionary entries.")

    print("\nOther sections:")
    print(f" Metadata bytes: {human_bytes(summary.metadata_bytes)}")
    print(f" XMP metadata bytes: {human_bytes(summary.xmp_bytes)}")
    print(f" Form fields bytes: {human_bytes(summary.form_fields_bytes)}")
    # NOTE(review): printed as a raw int, unlike the human_bytes lines above
    # — presumably intentional since the flag is tiny; confirm before changing.
    print(f" Lazy flag bytes: {summary.lazy_flag_bytes}")
    print(
        f" Text payload characters (not counting JSON overhead): "
        f"{page_stats.text_payload_chars:,}"
    )
    print(
        f" Approx text structure bytes: {human_bytes(page_stats.text_struct_bytes)}"
    )
    print(
        f" Approx image structure bytes: {human_bytes(page_stats.image_struct_bytes)}"
    )
    print(
        f" Approx content stream bytes: {human_bytes(page_stats.content_stream_bytes)}"
    )
    print(
        f" Approx annotations bytes: {human_bytes(page_stats.annotations_bytes)}"
    )
|
||||
|
||||
|
||||
# Script entry point: python scripts/analyze_pdf_json.py path/to/export.json
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user