mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-11-16 01:21:16 +01:00
278 lines
8.6 KiB
Python
278 lines
8.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Quick inspection utility for PDF→JSON exports.
|
|
|
|
Usage:
|
|
python scripts/analyze_pdf_json.py path/to/export.json
|
|
|
|
The script prints size and font statistics so we can confirm whether the
|
|
lightweight export (no COS dictionaries) is active and how large the font
|
|
payloads are.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import base64
|
|
import json
|
|
import math
|
|
from pathlib import Path
|
|
from dataclasses import dataclass
|
|
from typing import Any, Dict, Iterable, List, Tuple
|
|
|
|
|
|
def human_bytes(value: float) -> str:
    """Format a byte count as a short human-readable string.

    Binary multiples are used (1 KB = 1024 B) and the result carries one
    decimal place, e.g. ``1536`` -> ``"1.5 KB"``. Values at or below zero
    render as ``"0 B"``; values beyond the largest unit stay in TB.

    Args:
        value: Number of bytes; may be fractional.

    Returns:
        The scaled number followed by a space and its unit.
    """
    if value <= 0:
        return "0 B"
    units = ["B", "KB", "MB", "GB", "TB"]
    scaled = float(value)
    order = 0
    # Repeated division keeps the unit index inside [0, len(units) - 1].
    # The previous ``int(math.log(value, 1024))`` became negative for inputs
    # below 1/1024 (selecting "TB" through negative indexing) and depended on
    # a floating-point logarithm being exact at unit boundaries.
    while scaled >= 1024 and order < len(units) - 1:
        scaled /= 1024
        order += 1
    return f"{scaled:.1f} {units[order]}"
|
|
|
|
|
|
def base64_payload_size(encoded: str | None) -> int:
    """Estimate how many bytes a base64 string decodes to.

    The string is never decoded; the size is derived from its length alone.
    Leading and trailing whitespace is ignored. Whitespace inside the string
    is not removed, so line-wrapped input is slightly over-estimated.

    Args:
        encoded: Base64 text, or ``None``/empty when there is no payload.

    Returns:
        The estimated decoded size in bytes; ``0`` for missing or blank input.
    """
    if not encoded:
        return 0
    # Trailing "=" characters are padding and carry no data. Counting them
    # (as a plain ``length * 0.75`` does) over-reports padded payloads by one
    # or two bytes, e.g. "QQ==" is a single byte, not three.
    significant = len(encoded.strip().rstrip("="))
    # Every 4 base64 characters encode 3 bytes; integer math avoids floats.
    return significant * 3 // 4
|
|
|
|
|
|
@dataclass
class FontBreakdown:
    """Aggregate statistics about the font entries of an export.

    Instances are produced by ``analyze_fonts``; every counter defaults to
    zero so an empty breakdown can be created without arguments.
    """

    # Number of font entries seen.
    total: int = 0
    # Entries with a truthy "cosDictionary" value.
    with_cos: int = 0
    # Entries with a truthy "program" / "webProgram" / "pdfProgram" value.
    with_program: int = 0
    with_web_program: int = 0
    with_pdf_program: int = 0
    # Estimated decoded size of the corresponding base64 payloads.
    program_bytes: int = 0
    web_program_bytes: int = 0
    pdf_program_bytes: int = 0
    # Compact-JSON size of each entry without its three payload keys.
    metadata_bytes: int = 0
    # A few (id, uid) pairs of fonts that still carry a cosDictionary.
    # ``None`` is accepted for backward compatibility and is replaced with a
    # fresh list in ``__post_init__``.
    sample_cos_ids: List[Tuple[str | None, str | None]] | None = None

    def __post_init__(self) -> None:
        """Give every instance its own list when none was supplied."""
        # A literal ``[]`` default is rejected by dataclasses (shared mutable
        # state), and leaving ``None`` contradicts the declared list type.
        if self.sample_cos_ids is None:
            self.sample_cos_ids = []
|
|
|
|
|
|
@dataclass
class PageBreakdown:
    """Aggregate statistics about the page entries of an export.

    Populated by ``analyze_pages``; all counters default to zero.
    """

    # Number of page entries seen.
    page_count: int = 0
    # Summed lengths of the pages' "textElements" / "imageElements" lists.
    total_text_elements: int = 0
    total_image_elements: int = 0
    # Summed length of the truthy "text" values of all text elements.
    text_payload_chars: int = 0
    # Compact-JSON sizes of the per-page "textElements", "imageElements",
    # "resources", "contentStreams" and "annotations" values.
    text_struct_bytes: int = 0
    image_struct_bytes: int = 0
    resources_bytes: int = 0
    content_stream_bytes: int = 0
    annotations_bytes: int = 0
|
|
|
|
|
|
@dataclass
class DocumentBreakdown:
    """Top-level size summary of one export, as built by ``analyze_document``.

    All fields are required; there are no defaults.
    """

    # Size value supplied by the caller (``main`` passes the on-disk size).
    total_bytes: int
    # Per-section statistics for the "fonts" and "pages" arrays.
    fonts: FontBreakdown
    pages: PageBreakdown
    # Compact-JSON size of the "metadata" value.
    metadata_bytes: int
    # Estimated decoded size of the base64 "xmpMetadata" value.
    xmp_bytes: int
    # Compact-JSON size of the "formFields" value.
    form_fields_bytes: int
    # Compact-JSON size of the "lazyImages" value.
    lazy_flag_bytes: int
|
|
|
|
|
|
def approx_struct_size(obj: Any) -> int:
    """Return the length of *obj* serialised as compact JSON.

    ``None`` stands for an absent section and counts as zero. Everything
    else is dumped without whitespace after separators and measured in
    characters of the resulting string.
    """
    if obj is not None:
        compact = json.dumps(obj, separators=(",", ":"))
        return len(compact)
    return 0
|
|
|
|
|
|
def analyze_fonts(fonts: Iterable[Dict[str, Any]]) -> FontBreakdown:
    """Summarise the font entries of an export.

    For every entry this counts whether a ``cosDictionary`` and each of the
    three embedded payloads (``program``, ``webProgram``, ``pdfProgram``) is
    present, estimates the payload sizes, and measures the remaining keys as
    compact JSON. The ``(id, uid)`` pairs of the first five fonts that still
    carry a ``cosDictionary`` are kept as samples.

    Args:
        fonts: Font entries (mappings) from the export.

    Returns:
        A ``FontBreakdown`` holding the accumulated totals.
    """
    payload_keys = ("program", "webProgram", "pdfProgram")
    payload_hits = dict.fromkeys(payload_keys, 0)
    payload_sizes = dict.fromkeys(payload_keys, 0)
    font_count = 0
    cos_count = 0
    non_payload_bytes = 0
    cos_samples: List[Tuple[str | None, str | None]] = []

    for font in fonts:
        font_count += 1

        if font.get("cosDictionary"):
            cos_count += 1
            # Keep only a handful of examples for the report.
            if len(cos_samples) < 5:
                cos_samples.append((font.get("id"), font.get("uid")))

        # Everything except the embedded payloads counts as metadata.
        non_payload_bytes += approx_struct_size(
            {key: value for key, value in font.items() if key not in payload_keys}
        )

        for key in payload_keys:
            blob = font.get(key)
            if blob:
                payload_hits[key] += 1
                payload_sizes[key] += base64_payload_size(blob)

    return FontBreakdown(
        total=font_count,
        with_cos=cos_count,
        with_program=payload_hits["program"],
        with_web_program=payload_hits["webProgram"],
        with_pdf_program=payload_hits["pdfProgram"],
        program_bytes=payload_sizes["program"],
        web_program_bytes=payload_sizes["webProgram"],
        pdf_program_bytes=payload_sizes["pdfProgram"],
        metadata_bytes=non_payload_bytes,
        sample_cos_ids=cos_samples,
    )
|
|
|
|
|
|
def analyze_pages(pages: Iterable[Dict[str, Any]]) -> PageBreakdown:
    """Summarise the page entries of an export.

    Counts pages, text elements and image elements, adds up the compact-JSON
    size of each page's text, image, resource, content-stream and annotation
    data, and totals the length of the ``text`` values of all text elements.

    Args:
        pages: Page entries (mappings) from the export.

    Returns:
        A ``PageBreakdown`` holding the accumulated totals.
    """
    stats = PageBreakdown()

    for page in pages:
        # Missing or null lists are treated as empty; "resources" is passed
        # through as-is because the size helper already maps None to zero.
        text_items = page.get("textElements") or []
        image_items = page.get("imageElements") or []
        resource_data = page.get("resources")
        stream_items = page.get("contentStreams") or []
        annotation_items = page.get("annotations") or []

        stats.page_count += 1
        stats.total_text_elements += len(text_items)
        stats.total_image_elements += len(image_items)
        stats.text_struct_bytes += approx_struct_size(text_items)
        stats.image_struct_bytes += approx_struct_size(image_items)
        stats.resources_bytes += approx_struct_size(resource_data)
        stats.content_stream_bytes += approx_struct_size(stream_items)
        stats.annotations_bytes += approx_struct_size(annotation_items)

        # Elements without text (missing, null or empty) contribute nothing.
        stats.text_payload_chars += sum(
            len(item.get("text") or "") for item in text_items
        )

    return stats
|
|
|
|
|
|
def analyze_document(document: Dict[str, Any], total_size: int) -> DocumentBreakdown:
    """Build the size summary for a parsed export.

    Args:
        document: The root JSON object of the export.
        total_size: Size to record as ``total_bytes`` (stored unchanged).

    Returns:
        A ``DocumentBreakdown`` combining font and page statistics with the
        sizes of the ``metadata``, ``xmpMetadata``, ``formFields`` and
        ``lazyImages`` sections.
    """
    # Missing or null arrays are analysed as empty.
    font_stats = analyze_fonts(document.get("fonts") or [])
    page_stats = analyze_pages(document.get("pages") or [])

    return DocumentBreakdown(
        total_bytes=total_size,
        fonts=font_stats,
        pages=page_stats,
        # Passed through without an ``or {}`` fallback so that an absent
        # section counts as 0 bytes, like the form-field and lazy-image
        # sections below, instead of the 2 bytes of an empty "{}".
        metadata_bytes=approx_struct_size(document.get("metadata")),
        xmp_bytes=base64_payload_size(document.get("xmpMetadata")),
        form_fields_bytes=approx_struct_size(document.get("formFields")),
        lazy_flag_bytes=approx_struct_size(document.get("lazyImages")),
    )
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: analyse one export and print a report.

    Parses the ``json_path`` argument, prints the file size, then loads the
    JSON and prints page, font and per-section statistics.

    Raises:
        SystemExit: If the path does not exist, the file cannot be read or
            parsed as UTF-8 JSON, or the JSON root is not an object.
    """
    parser = argparse.ArgumentParser(description="Inspect a PDF JSON export.")
    parser.add_argument("json_path", type=Path, help="Path to the JSON export.")
    args = parser.parse_args()

    json_path = args.json_path
    if not json_path.exists():
        raise SystemExit(f"File not found: {json_path}")

    file_size = json_path.stat().st_size
    print(f"File: {json_path}")
    print(f"Size: {human_bytes(file_size)} ({file_size:,} bytes)")

    # Report unreadable paths (e.g. a directory), undecodable bytes and
    # malformed JSON as a one-line message rather than a traceback, matching
    # the other failure paths in this function.
    try:
        with json_path.open("r", encoding="utf-8") as handle:
            document = json.load(handle)
    except (OSError, UnicodeDecodeError, json.JSONDecodeError) as exc:
        raise SystemExit(f"Could not read JSON from {json_path}: {exc}") from exc

    if not isinstance(document, dict):
        raise SystemExit("Unexpected JSON structure (expected an object at root).")

    summary = analyze_document(document, file_size)

    page_stats = summary.pages
    # Resources are not part of this total, in line with the printed label.
    page_struct_total = (
        page_stats.text_struct_bytes
        + page_stats.image_struct_bytes
        + page_stats.content_stream_bytes
        + page_stats.annotations_bytes
    )
    print(f"Pages: {page_stats.page_count}")
    print(f"Total text elements: {page_stats.total_text_elements:,}")
    print(f"Total image elements: {page_stats.total_image_elements:,}")
    print(
        "Page structural bytes (text arrays + images + streams + annotations): "
        f"{human_bytes(page_struct_total)}"
    )

    font_stats = summary.fonts
    print("\nFont summary:")
    print(f" Fonts total: {font_stats.total}")
    print(f" Fonts with cosDictionary: {font_stats.with_cos}")
    print(f" Fonts with program: {font_stats.with_program}")
    print(f" Fonts with webProgram: {font_stats.with_web_program}")
    print(f" Fonts with pdfProgram: {font_stats.with_pdf_program}")
    print(
        " Payload sizes:"
        f" program={human_bytes(font_stats.program_bytes)},"
        f" webProgram={human_bytes(font_stats.web_program_bytes)},"
        f" pdfProgram={human_bytes(font_stats.pdf_program_bytes)},"
        f" metadata={human_bytes(font_stats.metadata_bytes)}"
    )
    if font_stats.sample_cos_ids:
        print(" Sample fonts still carrying cosDictionary:")
        for idx, (font_id, uid) in enumerate(font_stats.sample_cos_ids, start=1):
            print(f" {idx}. id={font_id!r}, uid={uid!r}")
    else:
        print(" No fonts retain cosDictionary entries.")

    print("\nOther sections:")
    print(f" Metadata bytes: {human_bytes(summary.metadata_bytes)}")
    print(f" XMP metadata bytes: {human_bytes(summary.xmp_bytes)}")
    print(f" Form fields bytes: {human_bytes(summary.form_fields_bytes)}")
    # Printed as a raw number, unlike the neighbouring human-readable sizes.
    print(f" Lazy flag bytes: {summary.lazy_flag_bytes}")
    print(
        " Text payload characters (not counting JSON overhead): "
        f"{page_stats.text_payload_chars:,}"
    )
    print(f" Approx text structure bytes: {human_bytes(page_stats.text_struct_bytes)}")
    print(f" Approx image structure bytes: {human_bytes(page_stats.image_struct_bytes)}")
    print(f" Approx content stream bytes: {human_bytes(page_stats.content_stream_bytes)}")
    print(f" Approx annotations bytes: {human_bytes(page_stats.annotations_bytes)}")
|
|
|
|
|
|
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|