JSON size cleanup: 450 MB to 35 MB

This commit is contained in:
Anthony Stirling 2025-11-05 23:35:08 +00:00
parent d4e95a6ed7
commit d4c702f96c
6 changed files with 687 additions and 40 deletions

View File

@ -42,14 +42,16 @@ public class ConvertPdfJsonController {
description =
"Extracts PDF text, fonts, and metadata into an editable JSON structure that can be"
+ " transformed back into a PDF. Input:PDF Output:JSON Type:SISO")
public ResponseEntity<byte[]> convertPdfToJson(@ModelAttribute PDFFile request)
public ResponseEntity<byte[]> convertPdfToJson(
@ModelAttribute PDFFile request,
@RequestParam(value = "lightweight", defaultValue = "false") boolean lightweight)
throws Exception {
MultipartFile inputFile = request.getFileInput();
if (inputFile == null) {
throw ExceptionUtils.createNullArgumentException("fileInput");
}
byte[] jsonBytes = pdfJsonConversionService.convertPdfToJson(inputFile);
byte[] jsonBytes = pdfJsonConversionService.convertPdfToJson(inputFile, lightweight);
String originalName = inputFile.getOriginalFilename();
String baseName =
(originalName != null && !originalName.isBlank())

View File

@ -1,6 +1,5 @@
package stirling.software.SPDF.model.json;
import java.util.ArrayList;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonInclude;
@ -33,7 +32,7 @@ public class PdfJsonTextElement {
private Float y;
private Float width;
private Float height;
@Builder.Default private List<Float> textMatrix = new ArrayList<>();
private List<Float> textMatrix;
private PdfJsonTextColor fillColor;
private PdfJsonTextColor strokeColor;
private Integer renderingMode;

View File

@ -18,6 +18,7 @@ import java.time.format.DateTimeParseException;
import java.util.ArrayList;
import java.util.Base64;
import java.util.Calendar;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
@ -141,6 +142,10 @@ public class PdfJsonConversionService {
private volatile boolean ghostscriptAvailable;
private static final float FLOAT_EPSILON = 0.0001f;
private static final float ORIENTATION_TOLERANCE = 0.0005f;
private static final float BASELINE_TOLERANCE = 0.5f;
@PostConstruct
private void initializeToolAvailability() {
initializeGhostscriptAvailability();
@ -185,12 +190,24 @@ public class PdfJsonConversionService {
}
public byte[] convertPdfToJson(MultipartFile file) throws IOException {
return convertPdfToJson(file, null);
return convertPdfToJson(file, null, false);
}
/**
 * Converts a PDF upload to JSON without progress reporting.
 *
 * <p>Delegates to the three-argument overload, passing {@code null} as the progress callback.
 *
 * @param file the uploaded PDF, forwarded unchanged
 * @param lightweight forwarded unchanged as the {@code lightweight} flag
 * @return the bytes returned by the three-argument overload
 * @throws IOException propagated from the three-argument overload
 */
public byte[] convertPdfToJson(MultipartFile file, boolean lightweight) throws IOException {
    return convertPdfToJson(file, null, lightweight);
}
/**
 * Converts a PDF upload to JSON with progress reporting and the lightweight mode switched off.
 *
 * <p>Delegates to the three-argument overload, passing {@code false} for {@code lightweight}.
 *
 * @param file the uploaded PDF, forwarded unchanged
 * @param progressCallback progress consumer, forwarded unchanged
 * @return the bytes returned by the three-argument overload
 * @throws IOException propagated from the three-argument overload
 */
public byte[] convertPdfToJson(
        MultipartFile file, Consumer<PdfJsonConversionProgress> progressCallback)
        throws IOException {
    return convertPdfToJson(file, progressCallback, false);
}
public byte[] convertPdfToJson(
MultipartFile file,
Consumer<PdfJsonConversionProgress> progressCallback,
boolean lightweight)
throws IOException {
if (file == null) {
throw ExceptionUtils.createNullArgumentException("fileInput");
}
@ -341,7 +358,7 @@ public class PdfJsonConversionService {
pdfJson.setMetadata(extractMetadata(document));
pdfJson.setXmpMetadata(extractXmpMetadata(document));
pdfJson.setLazyImages(useLazyImages);
List<PdfJsonFont> serializedFonts = new ArrayList<>(fonts.values());
List<PdfJsonFont> serializedFonts = cloneFontList(fonts.values());
serializedFonts.sort(
Comparator.comparing(
PdfJsonFont::getUid,
@ -385,6 +402,10 @@ public class PdfJsonConversionService {
scheduleDocumentCleanup(jobId);
}
if (lightweight) {
applyLightweightTransformations(pdfJson);
}
progress.accept(
PdfJsonConversionProgress.of(95, "serializing", "Generating JSON output"));
@ -394,8 +415,7 @@ public class PdfJsonConversionService {
pdfJson.getPages().size(),
useLazyImages);
byte[] result =
objectMapper.writerWithDefaultPrettyPrinter().writeValueAsBytes(pdfJson);
byte[] result = objectMapper.writeValueAsBytes(pdfJson);
progress.accept(PdfJsonConversionProgress.complete());
return result;
}
@ -668,6 +688,78 @@ public class PdfJsonConversionService {
return buildFontKey(page, fontId);
}
/**
 * Builds a fresh, mutable list containing a copy of every font in {@code source}.
 *
 * <p>Each entry is copied through {@link #cloneFont(PdfJsonFont)}; entries for which that
 * method yields {@code null} are left out.
 *
 * @param source fonts to copy; {@code null} produces an empty list
 * @return a new list of copies, never {@code null}
 */
private List<PdfJsonFont> cloneFontList(Collection<PdfJsonFont> source) {
    List<PdfJsonFont> copies = new ArrayList<>();
    if (source != null) {
        for (PdfJsonFont original : source) {
            PdfJsonFont duplicate = cloneFont(original);
            if (duplicate == null) {
                continue;
            }
            copies.add(duplicate);
        }
    }
    return copies;
}
/**
 * Creates a new {@link PdfJsonFont} carrying the same property values as {@code font}.
 *
 * <p>Every value is passed to the builder exactly as the getter returned it, so any object
 * valued property is shared with the source instance rather than duplicated.
 *
 * @param font the font to copy; may be {@code null}
 * @return the copy, or {@code null} when {@code font} is {@code null}
 */
private PdfJsonFont cloneFont(PdfJsonFont font) {
    if (font != null) {
        return PdfJsonFont.builder()
                // identity
                .id(font.getId())
                .pageNumber(font.getPageNumber())
                .uid(font.getUid())
                .baseName(font.getBaseName())
                .subtype(font.getSubtype())
                .encoding(font.getEncoding())
                .cidSystemInfo(font.getCidSystemInfo())
                .embedded(font.getEmbedded())
                // font program payloads and their formats
                .program(font.getProgram())
                .programFormat(font.getProgramFormat())
                .webProgram(font.getWebProgram())
                .webProgramFormat(font.getWebProgramFormat())
                .pdfProgram(font.getPdfProgram())
                .pdfProgramFormat(font.getPdfProgramFormat())
                .toUnicode(font.getToUnicode())
                .standard14Name(font.getStandard14Name())
                // descriptor metrics
                .fontDescriptorFlags(font.getFontDescriptorFlags())
                .ascent(font.getAscent())
                .descent(font.getDescent())
                .capHeight(font.getCapHeight())
                .xHeight(font.getXHeight())
                .italicAngle(font.getItalicAngle())
                .unitsPerEm(font.getUnitsPerEm())
                // raw dictionary
                .cosDictionary(font.getCosDictionary())
                .build();
    }
    return null;
}
/**
 * Strips the COS dictionary from every font in {@code document} that carries at least one
 * non-blank font program ({@code pdfProgram}, {@code webProgram} or {@code program}).
 *
 * <p>Fonts without any such payload keep their dictionary. A {@code null} document, a
 * {@code null} font list and {@code null} list entries are ignored.
 *
 * @param document the document whose fonts are modified in place; may be {@code null}
 */
private void applyLightweightTransformations(PdfJsonDocument document) {
    List<PdfJsonFont> fontEntries = document != null ? document.getFonts() : null;
    if (fontEntries == null) {
        return;
    }
    for (PdfJsonFont entry : fontEntries) {
        if (entry == null) {
            continue;
        }
        boolean lacksProgram =
                !hasPayload(entry.getPdfProgram())
                        && !hasPayload(entry.getWebProgram())
                        && !hasPayload(entry.getProgram());
        if (!lacksProgram) {
            entry.setCosDictionary(null);
        }
    }
}
/**
 * Tells whether {@code value} holds any non-whitespace content.
 *
 * @param value the string to inspect; may be {@code null}
 * @return {@code false} for {@code null} or blank input, {@code true} otherwise
 */
private boolean hasPayload(String value) {
    if (value == null) {
        return false;
    }
    return !value.isBlank();
}
private PdfJsonFont buildFontModel(
PDDocument document, PDFont font, String fontId, int pageNumber) throws IOException {
PDFontDescriptor descriptor = font.getFontDescriptor();
@ -3301,6 +3393,7 @@ public class PdfJsonConversionService {
private int currentPage = 1;
private Map<PDFont, String> currentFontResources = Collections.emptyMap();
private int currentZOrderCounter;
TextCollectingStripper(
PDDocument document,
@ -3320,6 +3413,7 @@ public class PdfJsonConversionService {
currentPage = getCurrentPageNo();
currentFontResources =
pageFontResources.getOrDefault(currentPage, Collections.emptyMap());
currentZOrderCounter = 0;
}
@Override
@ -3331,42 +3425,289 @@ public class PdfJsonConversionService {
List<PdfJsonTextElement> pageElements =
textByPage.computeIfAbsent(currentPage, key -> new ArrayList<>());
TextRunAccumulator accumulator = null;
for (TextPosition position : textPositions) {
PDFont font = position.getFont();
String fontId = registerFont(font);
PdfJsonTextElement element = new PdfJsonTextElement();
element.setText(position.getUnicode());
element.setFontId(fontId);
element.setFontSize(position.getFontSizeInPt());
element.setFontSizeInPt(position.getFontSizeInPt());
element.setX(position.getXDirAdj());
element.setY(position.getYDirAdj());
element.setWidth(position.getWidthDirAdj());
element.setHeight(position.getHeightDir());
element.setTextMatrix(extractMatrix(position));
element.setFontMatrixSize(computeFontMatrixSize(element.getTextMatrix()));
element.setSpaceWidth(position.getWidthOfSpace());
PDGraphicsState graphicsState = getGraphicsState();
if (graphicsState != null) {
PDTextState textState = graphicsState.getTextState();
if (textState != null) {
element.setCharacterSpacing(textState.getCharacterSpacing());
element.setWordSpacing(textState.getWordSpacing());
element.setHorizontalScaling(textState.getHorizontalScaling());
element.setLeading(textState.getLeading());
element.setRise(textState.getRise());
if (textState.getRenderingMode() != null) {
element.setRenderingMode(textState.getRenderingMode().intValue());
}
}
element.setFillColor(toTextColor(graphicsState.getNonStrokingColor()));
element.setStrokeColor(toTextColor(graphicsState.getStrokingColor()));
PdfJsonTextElement element = createTextElement(position, fontId);
if (accumulator == null) {
accumulator = new TextRunAccumulator(element, position);
} else if (!accumulator.canAppend(element, position)) {
PdfJsonTextElement built = accumulator.build();
built.setZOrder(1_000_000 + currentZOrderCounter++);
pageElements.add(built);
accumulator = new TextRunAccumulator(element, position);
} else {
accumulator.append(element, position);
}
element.setZOrder(1_000_000 + pageElements.size());
pageElements.add(element);
}
if (accumulator != null) {
PdfJsonTextElement built = accumulator.build();
built.setZOrder(1_000_000 + currentZOrderCounter++);
pageElements.add(built);
}
}
/**
 * Builds the JSON text element for a single {@link TextPosition}.
 *
 * <p>Text, font size, geometry, text matrix and space width are read from {@code position}.
 * Spacing, scaling, leading, rise, rendering mode and the fill/stroke colours are read from
 * the current graphics state when one is available.
 *
 * @param position the text position being converted
 * @param fontId identifier stored on the element as its font reference
 * @return the populated element
 * @throws IOException declared by this method; propagated from the helpers it calls
 */
private PdfJsonTextElement createTextElement(TextPosition position, String fontId)
        throws IOException {
    PdfJsonTextElement glyph = new PdfJsonTextElement();
    glyph.setText(position.getUnicode());
    glyph.setFontId(fontId);
    glyph.setFontSize(position.getFontSizeInPt());
    glyph.setX(position.getXDirAdj());
    glyph.setY(position.getYDirAdj());
    glyph.setWidth(position.getWidthDirAdj());
    glyph.setHeight(position.getHeightDir());
    glyph.setTextMatrix(extractMatrix(position));
    glyph.setFontMatrixSize(computeFontMatrixSize(glyph.getTextMatrix()));
    glyph.setSpaceWidth(position.getWidthOfSpace());

    PDGraphicsState state = getGraphicsState();
    if (state == null) {
        // Without a graphics state only the position-derived values can be filled in.
        return glyph;
    }

    PDTextState textSettings = state.getTextState();
    if (textSettings != null) {
        glyph.setCharacterSpacing(textSettings.getCharacterSpacing());
        glyph.setWordSpacing(textSettings.getWordSpacing());
        glyph.setHorizontalScaling(textSettings.getHorizontalScaling());
        glyph.setLeading(textSettings.getLeading());
        glyph.setRise(textSettings.getRise());
        if (textSettings.getRenderingMode() != null) {
            glyph.setRenderingMode(textSettings.getRenderingMode().intValue());
        }
    }
    glyph.setFillColor(toTextColor(state.getNonStrokingColor()));
    glyph.setStrokeColor(toTextColor(state.getStrokingColor()));
    return glyph;
}
/**
 * Nulls out properties of {@code element} that hold default or redundant values.
 *
 * <ul>
 *   <li>An empty text matrix is replaced by {@code null}; when the matrix has exactly six
 *       entries, {@code x} and {@code y} are cleared instead.
 *   <li>Character spacing, word spacing, leading and rise are cleared when they are within
 *       {@code FLOAT_EPSILON} of zero.
 *   <li>Horizontal scaling is cleared when it is within {@code FLOAT_EPSILON} of 100.
 *   <li>Rendering mode is cleared when it is 0.
 *   <li>Fill and stroke colours are cleared when {@link #isDefaultBlack} accepts them.
 * </ul>
 *
 * @param element the element to modify in place; {@code null} is ignored
 */
private void compactTextElement(PdfJsonTextElement element) {
    if (element == null) {
        return;
    }

    List<Float> matrix = element.getTextMatrix();
    if (matrix != null && matrix.isEmpty()) {
        element.setTextMatrix(null);
    } else if (matrix != null && matrix.size() == 6) {
        element.setX(null);
        element.setY(null);
    }

    if (isZero(element.getCharacterSpacing())) {
        element.setCharacterSpacing(null);
    }
    if (isZero(element.getWordSpacing())) {
        element.setWordSpacing(null);
    }
    if (isZero(element.getLeading())) {
        element.setLeading(null);
    }
    if (isZero(element.getRise())) {
        element.setRise(null);
    }

    Float scaling = element.getHorizontalScaling();
    if (scaling != null && Math.abs(scaling - 100f) < FLOAT_EPSILON) {
        element.setHorizontalScaling(null);
    }

    Integer mode = element.getRenderingMode();
    if (mode != null && mode.intValue() == 0) {
        element.setRenderingMode(null);
    }

    if (isDefaultBlack(element.getFillColor())) {
        element.setFillColor(null);
    }
    if (isDefaultBlack(element.getStrokeColor())) {
        element.setStrokeColor(null);
    }
}
/**
 * Tells whether {@code value} is present and within {@code FLOAT_EPSILON} of zero.
 *
 * @param value the number to test; may be {@code null}
 * @return {@code false} for {@code null}, otherwise whether the magnitude is below the epsilon
 */
private boolean isZero(Float value) {
    if (value == null) {
        return false;
    }
    return Math.abs(value) < FLOAT_EPSILON;
}
/**
 * Decides whether {@code color} can be treated as the default black.
 *
 * <p>A missing colour, missing components or an empty component list all count as default.
 * With no colour space or {@code DeviceRGB}, the first three components must each be within
 * {@code FLOAT_EPSILON} of zero (fewer than three components is not default). With
 * {@code DeviceGray}, only the first component is checked. Any other colour space is never
 * treated as default.
 *
 * @param color the colour to inspect; may be {@code null}
 * @return {@code true} when the colour may be omitted as default black
 */
private boolean isDefaultBlack(PdfJsonTextColor color) {
    if (color == null) {
        return true;
    }
    List<Float> parts = color.getComponents();
    if (parts == null || parts.isEmpty()) {
        return true;
    }

    // Number of leading components that must be (near) zero for the given colour space.
    String space = color.getColorSpace();
    int channels;
    if (space == null || "DeviceRGB".equals(space)) {
        channels = 3;
    } else if ("DeviceGray".equals(space)) {
        channels = 1;
    } else {
        return false;
    }

    if (parts.size() < channels) {
        return false;
    }
    for (int i = 0; i < channels; i++) {
        if (!(Math.abs(parts.get(i)) < FLOAT_EPSILON)) {
            return false;
        }
    }
    return true;
}
/**
 * Returns the value used as the baseline coordinate of {@code element}.
 *
 * @param element the element to read; must not be {@code null}
 * @return entry 5 of the text matrix when the matrix has at least six entries, otherwise the
 *     element's {@code y} value (which may be {@code null})
 */
private Float baselineFrom(PdfJsonTextElement element) {
    List<Float> matrix = element.getTextMatrix();
    boolean hasTranslation = matrix != null && matrix.size() >= 6;
    return hasTranslation ? matrix.get(5) : element.getY();
}
/**
 * Captures the style-related properties of {@code element} in a {@link TextStyleKey}.
 *
 * <p>The values are passed in the order of the record's components: font id, font size, font
 * matrix size, character spacing, word spacing, horizontal scaling, leading, rise, fill
 * colour, stroke colour, rendering mode and space width.
 *
 * @param element the element whose properties are read; must not be {@code null}
 * @return a key holding the element's current style values
 */
private TextStyleKey buildStyleKey(PdfJsonTextElement element) {
    return new TextStyleKey(
            element.getFontId(),
            element.getFontSize(),
            element.getFontMatrixSize(),
            element.getCharacterSpacing(),
            element.getWordSpacing(),
            element.getHorizontalScaling(),
            element.getLeading(),
            element.getRise(),
            element.getFillColor(),
            element.getStrokeColor(),
            element.getRenderingMode(),
            element.getSpaceWidth());
}
/**
 * Collects consecutive per-glyph text elements into a single run element.
 *
 * <p>The first element becomes the base of the run. Later elements may be appended while
 * {@link #canAppend} reports that they share the base element's style key, the linear part
 * of its text matrix (within {@code ORIENTATION_TOLERANCE}) and its baseline (within
 * {@code BASELINE_TOLERANCE}). {@link #build()} then writes the concatenated text and the
 * aggregated width/height back onto the base element.
 *
 * <p>This is an inner (non-static) class because it relies on {@code buildStyleKey},
 * {@code baselineFrom} and {@code compactTextElement} of the enclosing instance.
 */
private class TextRunAccumulator {
    private final PdfJsonTextElement baseElement;
    private final TextStyleKey styleKey;
    private final float orientationA;
    private final float orientationB;
    private final float orientationC;
    private final float orientationD;
    private final Float baseline;
    private final List<Float> baseMatrix;
    private final float startXCoord;
    private final float startYCoord;
    // Run start expressed with the same accessor (getXDirAdj) that feeds endXCoord, so the
    // two can be subtracted without mixing coordinate conventions.
    private final float startXDirAdj;
    private final StringBuilder textBuilder = new StringBuilder();
    private float totalWidth;
    private float maxHeight;
    private float endXCoord;

    /**
     * Starts a run with {@code element} as its base.
     *
     * @param element the first element of the run; it is the instance returned by
     *     {@link #build()}
     * @param position the text position {@code element} was created from
     */
    TextRunAccumulator(PdfJsonTextElement element, TextPosition position) {
        this.baseElement = element;
        this.styleKey = buildStyleKey(element);
        this.baseMatrix =
                element.getTextMatrix() != null
                        ? new ArrayList<>(element.getTextMatrix())
                        : null;
        if (baseMatrix != null && baseMatrix.size() >= 6) {
            orientationA = baseMatrix.get(0);
            orientationB = baseMatrix.get(1);
            orientationC = baseMatrix.get(2);
            orientationD = baseMatrix.get(3);
            startXCoord = baseMatrix.get(4);
            startYCoord = baseMatrix.get(5);
        } else {
            // No usable matrix: assume identity orientation and fall back to x/y.
            orientationA = 1f;
            orientationB = 0f;
            orientationC = 0f;
            orientationD = 1f;
            startXCoord = element.getX() != null ? element.getX() : 0f;
            startYCoord = element.getY() != null ? element.getY() : 0f;
        }
        this.baseline = baselineFrom(element);
        this.totalWidth = element.getWidth() != null ? element.getWidth() : 0f;
        this.maxHeight = element.getHeight() != null ? element.getHeight() : 0f;
        this.startXDirAdj = position.getXDirAdj();
        this.endXCoord = position.getXDirAdj() + position.getWidthDirAdj();
        appendText(element.getText());
    }

    /**
     * Checks whether {@code element} may join this run.
     *
     * @param element the candidate element
     * @param position the text position of the candidate (not consulted by the checks)
     * @return {@code true} when style key, matrix orientation and baseline all match the run
     */
    boolean canAppend(PdfJsonTextElement element, TextPosition position) {
        if (!styleKey.equals(buildStyleKey(element))) {
            return false;
        }

        List<Float> matrix = element.getTextMatrix();
        float a = 1f;
        float b = 0f;
        float c = 0f;
        float d = 1f;
        if (matrix != null && matrix.size() >= 4) {
            a = matrix.get(0);
            b = matrix.get(1);
            c = matrix.get(2);
            d = matrix.get(3);
        }
        if (Math.abs(a - orientationA) > ORIENTATION_TOLERANCE
                || Math.abs(b - orientationB) > ORIENTATION_TOLERANCE
                || Math.abs(c - orientationC) > ORIENTATION_TOLERANCE
                || Math.abs(d - orientationD) > ORIENTATION_TOLERANCE) {
            return false;
        }

        Float otherBaseline = baselineFrom(element);
        if (baseline != null && otherBaseline != null) {
            return Math.abs(otherBaseline - baseline) <= BASELINE_TOLERANCE;
        }
        // A baseline known on only one side is treated as a mismatch.
        return baseline == null && otherBaseline == null;
    }

    /**
     * Adds {@code element} to the run. Callers are expected to have checked
     * {@link #canAppend} first.
     *
     * @param element the element to merge
     * @param position the text position of {@code element}; supplies fallback width/height
     *     and the new run end
     */
    void append(PdfJsonTextElement element, TextPosition position) {
        appendText(element.getText());
        float width =
                element.getWidth() != null ? element.getWidth() : position.getWidthDirAdj();
        totalWidth += width;
        float height =
                element.getHeight() != null ? element.getHeight() : position.getHeightDir();
        if (height > maxHeight) {
            maxHeight = height;
        }
        endXCoord = position.getXDirAdj() + position.getWidthDirAdj();
    }

    /**
     * Writes the accumulated text and dimensions onto the base element and compacts it.
     *
     * <p>The width is the larger of the summed glyph widths and the distance from the run
     * start to the run end. Both ends of that distance come from
     * {@code TextPosition.getXDirAdj()}; the earlier code subtracted text-matrix entry 4
     * from a {@code getXDirAdj()}-based end, which can yield a wrong width whenever the two
     * values are not in the same coordinate convention (e.g. rotated text).
     *
     * <p>The accumulator's own state is not modified, so calling this repeatedly yields the
     * same dimensions.
     *
     * @return the base element, updated in place
     */
    PdfJsonTextElement build() {
        PdfJsonTextElement result = baseElement;
        result.setText(textBuilder.toString());

        float span = endXCoord - startXDirAdj;
        float width = span > totalWidth ? span : totalWidth;
        result.setWidth(width);
        result.setHeight(maxHeight);

        if (baseMatrix != null && baseMatrix.size() == 6) {
            // The run keeps the first glyph's matrix; x/y would duplicate entries 4 and 5.
            result.setTextMatrix(new ArrayList<>(baseMatrix));
            result.setX(null);
            result.setY(null);
        }
        compactTextElement(result);
        return result;
    }

    /** Appends {@code text} to the run, skipping {@code null} so no literal "null" is added. */
    private void appendText(String text) {
        if (text != null) {
            textBuilder.append(text);
        }
    }
}
/**
 * Value object bundling the style properties of a text element.
 *
 * <p>{@code TextRunAccumulator.canAppend} compares two keys with {@code equals}; as a record,
 * equality covers every component listed here. Comparison of the two colour components
 * relies on {@code PdfJsonTextColor.equals} — NOTE(review): confirm that class defines
 * value-based equality, otherwise differing instances never match.
 */
private record TextStyleKey(
        String fontId,
        Float fontSize,
        Float fontMatrixSize,
        Float characterSpacing,
        Float wordSpacing,
        Float horizontalScaling,
        Float leading,
        Float rise,
        PdfJsonTextColor fillColor,
        PdfJsonTextColor strokeColor,
        Integer renderingMode,
        Float spaceWidth) {}
private List<Float> extractMatrix(TextPosition position) {
float[] values = new float[6];
values[0] = position.getTextMatrix().getValue(0, 0);

View File

@ -0,0 +1,28 @@
# PDF JSON Editor Backlog
- **Type3 Font Support (Text Additions)**
- Parse Type3 charprocs to extract glyph outlines, build a synthetic TrueType/OpenType font (FontTools, Ghostscript `ps2ttf`, etc.), and store it in `webProgram` / `pdfProgram` for client use.
- Preserve the original Type3 resources for round-trip fidelity; use the synthesized font only for edited elements while reusing the original stream elsewhere.
- Extend conversion logic so fallback kicks in only when conversion fails, and track which elements rely on the synthetic font to avoid mixing source glyphs (`PdfJsonConversionService.java:998-1090`, `1840-2012`).
- Update the viewer/renderer to surface conversion errors and block editing when no faithful font can be produced.
- **Vector Artwork Preview**
- Reuse `contentStreams` already emitted by the backend to render vector paths alongside text/images in the React workspace (`frontend/src/proprietary/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx:1048-1285`).
- Either render via Canvas/SVG on the client or call back to a server-rendered bitmap for the background. Keep edited text/images layered on top.
- Maintain export fidelity by writing any untouched vector stream back during PDF regeneration (`PdfJsonConversionService.java:1714-1799`, `520-612`).
- **Lazy Fetch Endpoints**
- Provide separate endpoints to fetch:
1. Raw COS dictionaries/font programs when the user opens advanced panels.
2. Page-level raster/vector previews to avoid sending large `imageData` upfront.
- Reuse the existing job cache (`documentCache`) to serve these on demand and clean up after timeouts (`PdfJsonConversionService.java:3608-3687`).
- **Editor UX Safeguards**
- Respect `fallbackFontService` indicators; mark groups using fallback glyphs so the UI can warn about possible appearance shifts (`frontend/src/proprietary/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx:1260-1287`).
- Surface when Type3 conversion was downgraded (e.g., rasterized glyphs) and limit editing to operations that keep the PDF stable.
- **Canonical Font Sharing**
- Emit fonts once per unique embedded program. Add a `canonicalFonts` array containing the full payload (program, ToUnicode, metadata) and a compact `fontAliases` mapping `{pageNumber, fontId, canonicalUid}` so text elements can still reference per-page IDs.
- Store COS dictionaries only on canonical entries; aliases should keep light fields (e.g., size adjustments) if they differ.
- Update `buildFontMap` to resolve aliases when recreating PDFBox fonts, and adjust the front end to load programs via the canonical UID.
- Optional: expose a lazy endpoint for the original COS dictionary if the canonical record strips it, so export still reconstructs untouched fonts.

View File

@ -279,7 +279,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
console.log('Sending conversion request with async=true');
const response = await apiClient.post(
`${CONVERSION_ENDPOINTS['pdf-json']}?async=true`,
`${CONVERSION_ENDPOINTS['pdf-json']}?async=true&lightweight=true`,
formData,
{
responseType: 'json',
@ -632,7 +632,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
}
const { document, filename } = payload;
const serialized = JSON.stringify(document, null, 2);
const serialized = JSON.stringify(document);
downloadTextAsFile(serialized, filename, 'application/json');
if (onComplete) {
@ -760,7 +760,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
}
const { document, filename } = payload;
const serialized = JSON.stringify(document, null, 2);
const serialized = JSON.stringify(document);
const jsonFile = new File([serialized], filename, { type: 'application/json' });
const formData = new FormData();

277
scripts/analyze_pdf_json.py Normal file
View File

@ -0,0 +1,277 @@
#!/usr/bin/env python3
"""
Quick inspection utility for PDFJSON exports.
Usage:
python scripts/analyze_pdf_json.py path/to/export.json
The script prints size and font statistics so we can confirm whether the
lightweight export (no COS dictionaries) is active and how large the font
payloads are.
"""
from __future__ import annotations

import argparse
import base64
import json
import math
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, Iterable, List, Tuple
def human_bytes(value: float) -> str:
    """Format a byte count as a short human-readable string.

    Uses binary multiples (1 KB = 1024 B) and one decimal place, e.g. ``"1.5 KB"``.

    Args:
        value: Number of bytes. Zero and negative values are reported as ``"0 B"``.

    Returns:
        The scaled value followed by its unit, capped at ``TB``.
    """
    if value <= 0:
        return "0 B"
    units = ["B", "KB", "MB", "GB", "TB"]
    scaled = float(value)
    order = 0
    # Repeated division avoids math.log rounding errors and the negative index that a
    # logarithm yields for values below 1. Testing the rounded value promotes e.g.
    # 1023.99 KB to "1.0 MB" instead of printing "1024.0 KB".
    while order < len(units) - 1 and round(scaled, 1) >= 1024:
        scaled /= 1024
        order += 1
    return f"{scaled:.1f} {units[order]}"
def base64_payload_size(encoded: str | None) -> int:
    """Return the number of bytes a base64 string decodes to, without decoding it.

    Whitespace anywhere in the string is ignored and trailing ``=`` padding is
    subtracted, so the result is exact for well-formed input (padded or unpadded).

    Args:
        encoded: Base64 text, or ``None``/empty when there is no payload.

    Returns:
        The decoded size in bytes; ``0`` when there is no payload.
    """
    if not encoded:
        return 0
    compact = "".join(encoded.split())
    if not compact:
        return 0
    padding = len(compact) - len(compact.rstrip("="))
    # Every 4 base64 characters carry 3 bytes; each '=' stands for one missing byte.
    return max(0, (len(compact) * 3) // 4 - padding)
@dataclass
class FontBreakdown:
    """Aggregated statistics about the ``fonts`` array of an export."""

    # Number of font entries seen.
    total: int = 0
    # Entries with a truthy "cosDictionary" value.
    with_cos: int = 0
    # Entries with a truthy "program" / "webProgram" / "pdfProgram" value.
    with_program: int = 0
    with_web_program: int = 0
    with_pdf_program: int = 0
    # Estimated decoded size of the respective base64 payloads.
    program_bytes: int = 0
    web_program_bytes: int = 0
    pdf_program_bytes: int = 0
    # Serialized size of each entry with the three program payloads left out.
    metadata_bytes: int = 0
    # (id, uid) pairs of the first few fonts that still carry a cosDictionary.
    # A default_factory gives every instance its own list instead of a None default
    # that contradicts the annotation.
    sample_cos_ids: List[Tuple[str | None, str | None]] = field(default_factory=list)
@dataclass
class PageBreakdown:
    """Aggregated statistics about the ``pages`` array, as filled in by ``analyze_pages``."""

    # Number of page entries seen.
    page_count: int = 0
    # Total entries across all pages' "textElements" / "imageElements" lists.
    total_text_elements: int = 0
    total_image_elements: int = 0
    # Sum of the lengths of the "text" values of all text elements.
    text_payload_chars: int = 0
    # Approximate serialized sizes of the per-page sections.
    text_struct_bytes: int = 0
    image_struct_bytes: int = 0
    resources_bytes: int = 0
    content_stream_bytes: int = 0
    annotations_bytes: int = 0
@dataclass
class DocumentBreakdown:
    """Top-level summary of an export, as assembled by ``analyze_document``."""

    # Size passed in by the caller (``main`` supplies the on-disk file size).
    total_bytes: int
    # Statistics for the "fonts" and "pages" arrays.
    fonts: FontBreakdown
    pages: PageBreakdown
    # Approximate serialized size of the "metadata" object.
    metadata_bytes: int
    # Size estimate for "xmpMetadata", computed with the base64 size helper.
    xmp_bytes: int
    # Approximate serialized sizes of "formFields" and the "lazyImages" flag.
    form_fields_bytes: int
    lazy_flag_bytes: int
def approx_struct_size(obj: Any) -> int:
    """Estimate how many bytes ``obj`` occupies in a compact UTF-8 JSON file.

    The value is serialized without insignificant whitespace and without escaping
    non-ASCII characters, then measured as UTF-8. Measuring the ASCII-escaped form
    would count every non-ASCII character as a six-character ``\\uXXXX`` escape.

    Args:
        obj: Any JSON-serializable value, or ``None``.

    Returns:
        The encoded size in bytes; ``0`` for ``None``.
    """
    if obj is None:
        return 0
    compact = json.dumps(obj, separators=(",", ":"), ensure_ascii=False)
    # surrogatepass keeps lone surrogates (possible after json.load) from raising.
    return len(compact.encode("utf-8", errors="surrogatepass"))
def analyze_fonts(fonts: Iterable[Dict[str, Any]]) -> FontBreakdown:
    """Summarize the font entries of an export.

    For every entry this counts whether a ``cosDictionary`` and each of the three
    program payloads are present, estimates the payload sizes, and measures the
    serialized size of the remaining fields. The ``(id, uid)`` of up to five entries
    that still carry a ``cosDictionary`` are kept as samples.

    Args:
        fonts: The entries of the document's ``fonts`` array.

    Returns:
        The collected statistics.
    """
    payload_keys = ("program", "webProgram", "pdfProgram")
    present = dict.fromkeys(payload_keys, 0)
    sizes = dict.fromkeys(payload_keys, 0)
    font_count = 0
    cos_count = 0
    descriptor_bytes = 0
    cos_samples: List[Tuple[str | None, str | None]] = []

    for entry in fonts:
        font_count += 1
        if entry.get("cosDictionary"):
            cos_count += 1
            if len(cos_samples) < 5:
                cos_samples.append((entry.get("id"), entry.get("uid")))

        # Everything except the (large) program payloads counts as metadata.
        slim_entry = {
            key: value for key, value in entry.items() if key not in payload_keys
        }
        descriptor_bytes += approx_struct_size(slim_entry)

        for key in payload_keys:
            blob = entry.get(key)
            if blob:
                present[key] += 1
                sizes[key] += base64_payload_size(blob)

    return FontBreakdown(
        total=font_count,
        with_cos=cos_count,
        with_program=present["program"],
        with_web_program=present["webProgram"],
        with_pdf_program=present["pdfProgram"],
        program_bytes=sizes["program"],
        web_program_bytes=sizes["webProgram"],
        pdf_program_bytes=sizes["pdfProgram"],
        metadata_bytes=descriptor_bytes,
        sample_cos_ids=cos_samples,
    )
def analyze_pages(pages: Iterable[Dict[str, Any]]) -> PageBreakdown:
    """Summarize the page entries of an export.

    Counts pages, text elements and image elements, adds up the length of every
    non-empty ``text`` value, and accumulates the approximate serialized size of the
    text, image, resource, content-stream and annotation sections.

    Args:
        pages: The entries of the document's ``pages`` array.

    Returns:
        The collected statistics.
    """
    stats = PageBreakdown()
    for page in pages:
        stats.page_count += 1

        text_items = page.get("textElements") or []
        image_items = page.get("imageElements") or []
        page_resources = page.get("resources")
        page_streams = page.get("contentStreams") or []
        page_annotations = page.get("annotations") or []

        stats.total_text_elements += len(text_items)
        stats.total_image_elements += len(image_items)
        stats.text_struct_bytes += approx_struct_size(text_items)
        stats.image_struct_bytes += approx_struct_size(image_items)
        stats.resources_bytes += approx_struct_size(page_resources)
        stats.content_stream_bytes += approx_struct_size(page_streams)
        stats.annotations_bytes += approx_struct_size(page_annotations)

        # Only the raw text payload, without any JSON overhead.
        stats.text_payload_chars += sum(
            len(content)
            for content in (item.get("text") for item in text_items)
            if content
        )
    return stats
def analyze_document(document: Dict[str, Any], total_size: int) -> DocumentBreakdown:
    """Build the overall summary for a parsed export.

    Args:
        document: The root JSON object of the export.
        total_size: Size to report as the document's total (stored unchanged).

    Returns:
        Font and page statistics plus size estimates for the remaining sections.
        Missing ``fonts``/``pages``/``metadata`` are treated as empty.
    """
    font_section = document.get("fonts") or []
    page_section = document.get("pages") or []
    info_section = document.get("metadata") or {}

    return DocumentBreakdown(
        total_bytes=total_size,
        fonts=analyze_fonts(font_section),
        pages=analyze_pages(page_section),
        metadata_bytes=approx_struct_size(info_section),
        xmp_bytes=base64_payload_size(document.get("xmpMetadata")),
        form_fields_bytes=approx_struct_size(document.get("formFields")),
        lazy_flag_bytes=approx_struct_size(document.get("lazyImages")),
    )
def main() -> None:
    """Command-line entry point: load one JSON export and print its size breakdown.

    Exits with an error message (via ``SystemExit``) when the path does not exist, is
    not a regular file, cannot be parsed as UTF-8 JSON, or does not contain a JSON
    object at its root.
    """
    parser = argparse.ArgumentParser(description="Inspect a PDF JSON export.")
    parser.add_argument("json_path", type=Path, help="Path to the JSON export.")
    args = parser.parse_args()

    json_path = args.json_path
    if not json_path.exists():
        raise SystemExit(f"File not found: {json_path}")
    if not json_path.is_file():
        # A directory passes exists() but would fail later inside open().
        raise SystemExit(f"Not a file: {json_path}")

    file_size = json_path.stat().st_size
    print(f"File: {json_path}")
    print(f"Size: {human_bytes(file_size)} ({file_size:,} bytes)")

    try:
        with json_path.open("r", encoding="utf-8") as handle:
            document = json.load(handle)
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        # Report malformed input as a normal error instead of a traceback.
        raise SystemExit(f"Could not parse {json_path} as JSON: {exc}")

    if not isinstance(document, dict):
        raise SystemExit("Unexpected JSON structure (expected an object at root).")

    summary = analyze_document(document, file_size)

    page_stats = summary.pages
    print(f"Pages: {page_stats.page_count}")
    print(f"Total text elements: {page_stats.total_text_elements:,}")
    print(f"Total image elements: {page_stats.total_image_elements:,}")
    structural_total = (
        page_stats.text_struct_bytes
        + page_stats.image_struct_bytes
        + page_stats.content_stream_bytes
        + page_stats.annotations_bytes
    )
    print(
        f"Page structural bytes (text arrays + images + streams + annotations): "
        f"{human_bytes(structural_total)}"
    )

    font_stats = summary.fonts
    print("\nFont summary:")
    print(f" Fonts total: {font_stats.total}")
    print(f" Fonts with cosDictionary: {font_stats.with_cos}")
    print(f" Fonts with program: {font_stats.with_program}")
    print(f" Fonts with webProgram: {font_stats.with_web_program}")
    print(f" Fonts with pdfProgram: {font_stats.with_pdf_program}")
    print(
        " Payload sizes:"
        f" program={human_bytes(font_stats.program_bytes)},"
        f" webProgram={human_bytes(font_stats.web_program_bytes)},"
        f" pdfProgram={human_bytes(font_stats.pdf_program_bytes)},"
        f" metadata={human_bytes(font_stats.metadata_bytes)}"
    )
    if font_stats.sample_cos_ids:
        print(" Sample fonts still carrying cosDictionary:")
        for idx, (font_id, uid) in enumerate(font_stats.sample_cos_ids, start=1):
            print(f" {idx}. id={font_id!r}, uid={uid!r}")
    else:
        print(" No fonts retain cosDictionary entries.")

    print("\nOther sections:")
    print(f" Metadata bytes: {human_bytes(summary.metadata_bytes)}")
    print(f" XMP metadata bytes: {human_bytes(summary.xmp_bytes)}")
    print(f" Form fields bytes: {human_bytes(summary.form_fields_bytes)}")
    print(f" Lazy flag bytes: {summary.lazy_flag_bytes}")
    print(
        f" Text payload characters (not counting JSON overhead): "
        f"{page_stats.text_payload_chars:,}"
    )
    print(
        f" Approx text structure bytes: {human_bytes(page_stats.text_struct_bytes)}"
    )
    print(
        f" Approx image structure bytes: {human_bytes(page_stats.image_struct_bytes)}"
    )
    print(
        f" Approx content stream bytes: {human_bytes(page_stats.content_stream_bytes)}"
    )
    print(
        f" Approx annotations bytes: {human_bytes(page_stats.annotations_bytes)}"
    )


if __name__ == "__main__":
    main()