mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2026-02-17 13:52:14 +01:00
json size cleanup 450 to 35mb
This commit is contained in:
277
scripts/analyze_pdf_json.py
Normal file
277
scripts/analyze_pdf_json.py
Normal file
@@ -0,0 +1,277 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Quick inspection utility for PDF→JSON exports.
|
||||
|
||||
Usage:
|
||||
python scripts/analyze_pdf_json.py path/to/export.json
|
||||
|
||||
The script prints size and font statistics so we can confirm whether the
|
||||
lightweight export (no COS dictionaries) is active and how large the font
|
||||
payloads are.
|
||||
"""
|
||||
from __future__ import annotations

import argparse
import base64
import json
import math
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, Iterable, List, Tuple
|
||||
|
||||
|
||||
def human_bytes(value: float) -> str:
|
||||
if value <= 0:
|
||||
return "0 B"
|
||||
units = ["B", "KB", "MB", "GB", "TB"]
|
||||
order = min(int(math.log(value, 1024)), len(units) - 1)
|
||||
scaled = value / (1024**order)
|
||||
return f"{scaled:.1f} {units[order]}"
|
||||
|
||||
|
||||
def base64_payload_size(encoded: str | None) -> int:
    """Return the decoded byte size of a base64 payload without decoding it.

    Every 4 base64 characters encode 3 bytes, but trailing ``=`` padding
    characters carry no data — subtracting them makes the result exact for
    standard (RFC 4648) base64, where the old ``len * 0.75`` estimate
    over-counted by up to 2 bytes.  Returns 0 for ``None``, empty, or
    whitespace-only input.
    """
    if not encoded:
        return 0
    stripped = encoded.strip()
    if not stripped:
        return 0
    padding = stripped.count("=")
    # max() guards against degenerate input that is nothing but padding.
    return max(len(stripped) * 3 // 4 - padding, 0)
|
||||
|
||||
|
||||
@dataclass
class FontBreakdown:
    """Aggregate statistics over the document's font entries.

    Counters cover how many fonts carry each payload kind and how many
    bytes those payloads (estimated from their base64 length) occupy.
    """

    total: int = 0  # number of font entries seen
    with_cos: int = 0  # fonts still carrying a cosDictionary
    with_program: int = 0
    with_web_program: int = 0
    with_pdf_program: int = 0
    program_bytes: int = 0
    web_program_bytes: int = 0
    pdf_program_bytes: int = 0
    metadata_bytes: int = 0  # JSON size of everything except the payload fields
    # Up to 5 (id, uid) examples of fonts that retain a cosDictionary.
    # Bug fix: the default was `None`, contradicting the annotation and
    # breaking zero-arg construction; default_factory gives each instance
    # its own empty list.
    sample_cos_ids: List[Tuple[str | None, str | None]] = field(default_factory=list)
|
||||
|
||||
|
||||
@dataclass
class PageBreakdown:
    """Aggregate size/count statistics across all pages of the export.

    All *_bytes fields are approximations: the length of the compact JSON
    serialization of the corresponding structures (see approx_struct_size).
    """

    page_count: int = 0  # number of page objects processed
    total_text_elements: int = 0  # sum of len(textElements) over all pages
    total_image_elements: int = 0  # sum of len(imageElements) over all pages
    text_payload_chars: int = 0  # characters of actual text, excluding JSON overhead
    text_struct_bytes: int = 0  # serialized size of the textElements arrays
    image_struct_bytes: int = 0  # serialized size of the imageElements arrays
    resources_bytes: int = 0  # serialized size of per-page "resources"
    content_stream_bytes: int = 0  # serialized size of "contentStreams"
    annotations_bytes: int = 0  # serialized size of "annotations"
|
||||
|
||||
|
||||
@dataclass
class DocumentBreakdown:
    """Top-level summary of a PDF JSON export: overall size plus per-section stats."""

    total_bytes: int  # on-disk size of the JSON file
    fonts: FontBreakdown  # per-font payload statistics
    pages: PageBreakdown  # per-page element/size statistics
    metadata_bytes: int  # approx serialized size of the "metadata" object
    xmp_bytes: int  # estimated decoded size of the base64 "xmpMetadata"
    form_fields_bytes: int  # approx serialized size of "formFields"
    lazy_flag_bytes: int  # approx serialized size of "lazyImages"
|
||||
|
||||
|
||||
def approx_struct_size(obj: Any) -> int:
    """Approximate the byte cost of *obj* as the length of its compact JSON form.

    ``None`` counts as zero (an absent section), not as the 4 bytes of
    serialized ``null``.
    """
    return 0 if obj is None else len(json.dumps(obj, separators=(",", ":")))
|
||||
|
||||
|
||||
def analyze_fonts(fonts: Iterable[Dict[str, Any]]) -> FontBreakdown:
    """Summarize font payload sizes and flag fonts still carrying COS data.

    Accumulates directly into a FontBreakdown: payload counts/bytes for the
    three base64 program fields, the JSON size of everything else, and up
    to five (id, uid) examples of fonts that retain a cosDictionary.
    """
    stats = FontBreakdown(sample_cos_ids=[])
    payload_keys = {"program", "webProgram", "pdfProgram"}

    for font in fonts:
        stats.total += 1

        # Record fonts that still ship a raw COS dictionary (keep 5 examples).
        if font.get("cosDictionary"):
            stats.with_cos += 1
            if len(stats.sample_cos_ids) < 5:
                stats.sample_cos_ids.append((font.get("id"), font.get("uid")))

        # Everything except the binary payload fields counts as metadata.
        stats.metadata_bytes += approx_struct_size(
            {k: v for k, v in font.items() if k not in payload_keys}
        )

        program = font.get("program")
        if program:
            stats.with_program += 1
            stats.program_bytes += base64_payload_size(program)
        web_program = font.get("webProgram")
        if web_program:
            stats.with_web_program += 1
            stats.web_program_bytes += base64_payload_size(web_program)
        pdf_program = font.get("pdfProgram")
        if pdf_program:
            stats.with_pdf_program += 1
            stats.pdf_program_bytes += base64_payload_size(pdf_program)

    return stats
|
||||
|
||||
|
||||
def analyze_pages(pages: Iterable[Dict[str, Any]]) -> PageBreakdown:
    """Accumulate element counts and structural byte estimates over all pages.

    Missing sections are treated as empty collections, so each page always
    contributes at least the serialized size of an empty array.
    """
    stats = PageBreakdown()

    for page in pages:
        stats.page_count += 1
        texts = page.get("textElements") or []
        images = page.get("imageElements") or []

        stats.total_text_elements += len(texts)
        stats.total_image_elements += len(images)
        stats.text_struct_bytes += approx_struct_size(texts)
        stats.image_struct_bytes += approx_struct_size(images)
        stats.resources_bytes += approx_struct_size(page.get("resources"))
        stats.content_stream_bytes += approx_struct_size(page.get("contentStreams") or [])
        stats.annotations_bytes += approx_struct_size(page.get("annotations") or [])

        # Raw character count of the text itself, without JSON overhead.
        stats.text_payload_chars += sum(len(e.get("text") or "") for e in texts)

    return stats
|
||||
|
||||
|
||||
def analyze_document(document: Dict[str, Any], total_size: int) -> DocumentBreakdown:
    """Build the full breakdown for a parsed export dict.

    ``total_size`` is the on-disk file size; the per-section figures are
    computed from the parsed structures, with absent sections treated as
    empty.
    """
    return DocumentBreakdown(
        total_bytes=total_size,
        fonts=analyze_fonts(document.get("fonts") or []),
        pages=analyze_pages(document.get("pages") or []),
        metadata_bytes=approx_struct_size(document.get("metadata") or {}),
        xmp_bytes=base64_payload_size(document.get("xmpMetadata")),
        form_fields_bytes=approx_struct_size(document.get("formFields")),
        lazy_flag_bytes=approx_struct_size(document.get("lazyImages")),
    )
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: parse the path argument and print the size report.

    Exits via SystemExit (non-zero) when the file is missing, is not valid
    JSON, or does not have an object at its root.
    """
    parser = argparse.ArgumentParser(description="Inspect a PDF JSON export.")
    parser.add_argument("json_path", type=Path, help="Path to the JSON export.")
    args = parser.parse_args()

    json_path = args.json_path
    if not json_path.exists():
        raise SystemExit(f"File not found: {json_path}")

    file_size = json_path.stat().st_size
    print(f"File: {json_path}")
    print(f"Size: {human_bytes(file_size)} ({file_size:,} bytes)")

    # Fail with a readable one-line message instead of a traceback when the
    # export is truncated or otherwise not valid JSON.
    try:
        with json_path.open("r", encoding="utf-8") as handle:
            document = json.load(handle)
    except json.JSONDecodeError as err:
        raise SystemExit(f"Invalid JSON in {json_path}: {err}") from err

    if not isinstance(document, dict):
        raise SystemExit("Unexpected JSON structure (expected an object at root).")

    summary = analyze_document(document, file_size)
    page_stats = summary.pages
    print(f"Pages: {page_stats.page_count}")
    print(f"Total text elements: {page_stats.total_text_elements:,}")
    print(f"Total image elements: {page_stats.total_image_elements:,}")
    print(
        f"Page structural bytes (text arrays + images + streams + annotations): "
        f"{human_bytes(page_stats.text_struct_bytes + page_stats.image_struct_bytes + page_stats.content_stream_bytes + page_stats.annotations_bytes)}"
    )

    font_stats = summary.fonts
    print("\nFont summary:")
    print(f" Fonts total: {font_stats.total}")
    print(f" Fonts with cosDictionary: {font_stats.with_cos}")
    print(f" Fonts with program: {font_stats.with_program}")
    print(f" Fonts with webProgram: {font_stats.with_web_program}")
    print(f" Fonts with pdfProgram: {font_stats.with_pdf_program}")
    print(
        " Payload sizes:"
        f" program={human_bytes(font_stats.program_bytes)},"
        f" webProgram={human_bytes(font_stats.web_program_bytes)},"
        f" pdfProgram={human_bytes(font_stats.pdf_program_bytes)},"
        f" metadata={human_bytes(font_stats.metadata_bytes)}"
    )
    # cosDictionary entries are what the lightweight export is supposed to
    # strip — listing survivors helps confirm whether that pass ran.
    if font_stats.sample_cos_ids:
        print(" Sample fonts still carrying cosDictionary:")
        for idx, (font_id, uid) in enumerate(font_stats.sample_cos_ids, start=1):
            print(f" {idx}. id={font_id!r}, uid={uid!r}")
    else:
        print(" No fonts retain cosDictionary entries.")

    print("\nOther sections:")
    print(f" Metadata bytes: {human_bytes(summary.metadata_bytes)}")
    print(f" XMP metadata bytes: {human_bytes(summary.xmp_bytes)}")
    print(f" Form fields bytes: {human_bytes(summary.form_fields_bytes)}")
    # NOTE(review): printed as a raw int, unlike the human_bytes lines above
    # — presumably intentional since the flag is tiny; confirm before changing.
    print(f" Lazy flag bytes: {summary.lazy_flag_bytes}")
    print(
        f" Text payload characters (not counting JSON overhead): "
        f"{page_stats.text_payload_chars:,}"
    )
    print(
        f" Approx text structure bytes: {human_bytes(page_stats.text_struct_bytes)}"
    )
    print(
        f" Approx image structure bytes: {human_bytes(page_stats.image_struct_bytes)}"
    )
    print(
        f" Approx content stream bytes: {human_bytes(page_stats.content_stream_bytes)}"
    )
    print(
        f" Approx annotations bytes: {human_bytes(page_stats.annotations_bytes)}"
    )
|
||||
|
||||
|
||||
# Script entry point: python scripts/analyze_pdf_json.py path/to/export.json
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user