Stirling-PDF/scripts/analyze_pdf_json.py
#!/usr/bin/env python3
"""
Quick inspection utility for PDF→JSON exports.

Usage:
    python scripts/analyze_pdf_json.py path/to/export.json

The script prints size and font statistics so we can confirm whether the
lightweight export (no COS dictionaries) is active and how large the font
payloads are.
"""

from __future__ import annotations

import argparse
import base64
import json
import math
from pathlib import Path
from dataclasses import dataclass, field
from typing import Any, Dict, Iterable, List, Tuple


def human_bytes(value: float) -> str:
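    """Format a byte count as a human-readable string (B, KB, MB, GB, TB)."""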
    if value <= 0:
        return "0 B"
    units = ["B", "KB", "MB", "GB", "TB"]
    order = min(int(math.log(value, 1024)), len(units) - 1)
    scaled = value / (1024**order)
    return f"{scaled:.1f} {units[order]}"


def base64_payload_size(encoded: str | None) -> int:
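    """Approximate the decoded size of a base64 payload (3 bytes per 4 characters, padding ignored)."""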
    if not encoded:
        return 0
    length = len(encoded.strip())
    if length == 0:
        return 0
    return int(length * 0.75)


@dataclass
class FontBreakdown:
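    """Aggregate counts and payload sizes for the export's font entries."""
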
    total: int = 0
    with_cos: int = 0
    with_program: int = 0
    with_web_program: int = 0
    with_pdf_program: int = 0
    program_bytes: int = 0
    web_program_bytes: int = 0
    pdf_program_bytes: int = 0
    metadata_bytes: int = 0
    sample_cos_ids: List[Tuple[str | None, str | None]] = field(default_factory=list)


@dataclass
class PageBreakdown:
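    """Aggregate element counts and structural sizes for the export's pages."""
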
    page_count: int = 0
    total_text_elements: int = 0
    total_image_elements: int = 0
    text_payload_chars: int = 0
    text_struct_bytes: int = 0
    image_struct_bytes: int = 0
    resources_bytes: int = 0
    content_stream_bytes: int = 0
    annotations_bytes: int = 0


@dataclass
class DocumentBreakdown:
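    """Top-level size breakdown for a single JSON export."""
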
    total_bytes: int
    fonts: FontBreakdown
    pages: PageBreakdown
    metadata_bytes: int
    xmp_bytes: int
    form_fields_bytes: int
    lazy_flag_bytes: int


def approx_struct_size(obj: Any) -> int:
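    """Estimate how many bytes a value occupies when serialised as compact JSON."""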
    if obj is None:
        return 0
    return len(json.dumps(obj, separators=(",", ":")))


def analyze_fonts(fonts: Iterable[Dict[str, Any]]) -> FontBreakdown:
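    """Collect counts and payload sizes for every font entry in the export."""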
    total = 0
    with_cos = 0
    with_prog = 0
    with_web_prog = 0
    with_pdf_prog = 0
    program_bytes = 0
    web_program_bytes = 0
    pdf_program_bytes = 0
    metadata_bytes = 0
    sample_cos_ids: List[Tuple[str | None, str | None]] = []
    for font in fonts:
        total += 1
        font_id = font.get("id")
        uid = font.get("uid")
        cos_value = font.get("cosDictionary")
        if cos_value:
            with_cos += 1
            if len(sample_cos_ids) < 5:
                sample_cos_ids.append((font_id, uid))
        metadata_bytes += approx_struct_size(
            {k: v for k, v in font.items() if k not in {"program", "webProgram", "pdfProgram"}}
        )
        program = font.get("program")
        web_program = font.get("webProgram")
        pdf_program = font.get("pdfProgram")
        if program:
            with_prog += 1
            program_bytes += base64_payload_size(program)
        if web_program:
            with_web_prog += 1
            web_program_bytes += base64_payload_size(web_program)
        if pdf_program:
            with_pdf_prog += 1
            pdf_program_bytes += base64_payload_size(pdf_program)
    return FontBreakdown(
        total=total,
        with_cos=with_cos,
        with_program=with_prog,
        with_web_program=with_web_prog,
        with_pdf_program=with_pdf_prog,
        program_bytes=program_bytes,
        web_program_bytes=web_program_bytes,
        pdf_program_bytes=pdf_program_bytes,
        metadata_bytes=metadata_bytes,
        sample_cos_ids=sample_cos_ids,
    )


def analyze_pages(pages: Iterable[Dict[str, Any]]) -> PageBreakdown:
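    """Collect element counts and approximate structural sizes for every page."""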
    page_count = 0
    total_text = 0
    total_images = 0
    text_chars = 0
    text_struct_bytes = 0
    image_struct_bytes = 0
    resources_bytes = 0
    stream_bytes = 0
    annotations_bytes = 0
    for page in pages:
        page_count += 1
        texts = page.get("textElements") or []
        images = page.get("imageElements") or []
        resources = page.get("resources")
        streams = page.get("contentStreams") or []
        annotations = page.get("annotations") or []
        total_text += len(texts)
        total_images += len(images)
        text_struct_bytes += approx_struct_size(texts)
        image_struct_bytes += approx_struct_size(images)
        resources_bytes += approx_struct_size(resources)
        stream_bytes += approx_struct_size(streams)
        annotations_bytes += approx_struct_size(annotations)
        for elem in texts:
            text = elem.get("text")
            if text:
                text_chars += len(text)
    return PageBreakdown(
        page_count=page_count,
        total_text_elements=total_text,
        total_image_elements=total_images,
        text_payload_chars=text_chars,
        text_struct_bytes=text_struct_bytes,
        image_struct_bytes=image_struct_bytes,
        resources_bytes=resources_bytes,
        content_stream_bytes=stream_bytes,
        annotations_bytes=annotations_bytes,
    )


def analyze_document(document: Dict[str, Any], total_size: int) -> DocumentBreakdown:
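    """Combine font, page, and top-level section statistics into one breakdown."""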
    fonts = document.get("fonts") or []
    pages = document.get("pages") or []
    metadata = document.get("metadata") or {}
    font_stats = analyze_fonts(fonts)
    page_stats = analyze_pages(pages)
    return DocumentBreakdown(
        total_bytes=total_size,
        fonts=font_stats,
        pages=page_stats,
        metadata_bytes=approx_struct_size(metadata),
        xmp_bytes=base64_payload_size(document.get("xmpMetadata")),
        form_fields_bytes=approx_struct_size(document.get("formFields")),
        lazy_flag_bytes=approx_struct_size(document.get("lazyImages")),
    )


def main() -> None:
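    """Parse arguments, load the JSON export, and print the size report."""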
    parser = argparse.ArgumentParser(description="Inspect a PDF JSON export.")
    parser.add_argument("json_path", type=Path, help="Path to the JSON export.")
    args = parser.parse_args()
    json_path = args.json_path
    if not json_path.exists():
        raise SystemExit(f"File not found: {json_path}")
    file_size = json_path.stat().st_size
    print(f"File: {json_path}")
    print(f"Size: {human_bytes(file_size)} ({file_size:,} bytes)")
    with json_path.open("r", encoding="utf-8") as handle:
        document = json.load(handle)
    if not isinstance(document, dict):
        raise SystemExit("Unexpected JSON structure (expected an object at root).")
    summary = analyze_document(document, file_size)
    page_stats = summary.pages
    print(f"Pages: {page_stats.page_count}")
    print(f"Total text elements: {page_stats.total_text_elements:,}")
    print(f"Total image elements: {page_stats.total_image_elements:,}")
    print(
        f"Page structural bytes (text arrays + images + streams + annotations): "
        f"{human_bytes(page_stats.text_struct_bytes + page_stats.image_struct_bytes + page_stats.content_stream_bytes + page_stats.annotations_bytes)}"
    )
    font_stats = summary.fonts
    print("\nFont summary:")
    print(f" Fonts total: {font_stats.total}")
    print(f" Fonts with cosDictionary: {font_stats.with_cos}")
    print(f" Fonts with program: {font_stats.with_program}")
    print(f" Fonts with webProgram: {font_stats.with_web_program}")
    print(f" Fonts with pdfProgram: {font_stats.with_pdf_program}")
    print(
        " Payload sizes:"
        f" program={human_bytes(font_stats.program_bytes)},"
        f" webProgram={human_bytes(font_stats.web_program_bytes)},"
        f" pdfProgram={human_bytes(font_stats.pdf_program_bytes)},"
        f" metadata={human_bytes(font_stats.metadata_bytes)}"
    )
    if font_stats.sample_cos_ids:
        print(" Sample fonts still carrying cosDictionary:")
        for idx, (font_id, uid) in enumerate(font_stats.sample_cos_ids, start=1):
            print(f" {idx}. id={font_id!r}, uid={uid!r}")
    else:
        print(" No fonts retain cosDictionary entries.")
    print("\nOther sections:")
    print(f" Metadata bytes: {human_bytes(summary.metadata_bytes)}")
    print(f" XMP metadata bytes: {human_bytes(summary.xmp_bytes)}")
    print(f" Form fields bytes: {human_bytes(summary.form_fields_bytes)}")
    print(f" Lazy flag bytes: {summary.lazy_flag_bytes}")
    print(
        f" Text payload characters (not counting JSON overhead): "
        f"{page_stats.text_payload_chars:,}"
    )
    print(
        f" Approx text structure bytes: {human_bytes(page_stats.text_struct_bytes)}"
    )
    print(
        f" Approx image structure bytes: {human_bytes(page_stats.image_struct_bytes)}"
    )
    print(
        f" Approx content stream bytes: {human_bytes(page_stats.content_stream_bytes)}"
    )
    print(
        f" Approx annotations bytes: {human_bytes(page_stats.annotations_bytes)}"
    )


if __name__ == "__main__":
    main()