diff --git a/app/proprietary/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPdfJsonController.java b/app/proprietary/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPdfJsonController.java index 0cd22d3ff..1523d3c1b 100644 --- a/app/proprietary/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPdfJsonController.java +++ b/app/proprietary/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPdfJsonController.java @@ -42,14 +42,16 @@ public class ConvertPdfJsonController { description = "Extracts PDF text, fonts, and metadata into an editable JSON structure that can be" + " transformed back into a PDF. Input:PDF Output:JSON Type:SISO") - public ResponseEntity convertPdfToJson(@ModelAttribute PDFFile request) + public ResponseEntity convertPdfToJson( + @ModelAttribute PDFFile request, + @RequestParam(value = "lightweight", defaultValue = "false") boolean lightweight) throws Exception { MultipartFile inputFile = request.getFileInput(); if (inputFile == null) { throw ExceptionUtils.createNullArgumentException("fileInput"); } - byte[] jsonBytes = pdfJsonConversionService.convertPdfToJson(inputFile); + byte[] jsonBytes = pdfJsonConversionService.convertPdfToJson(inputFile, lightweight); String originalName = inputFile.getOriginalFilename(); String baseName = (originalName != null && !originalName.isBlank()) diff --git a/app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonTextElement.java b/app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonTextElement.java index 921971e53..ad80060f3 100644 --- a/app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonTextElement.java +++ b/app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonTextElement.java @@ -1,6 +1,5 @@ package stirling.software.SPDF.model.json; -import java.util.ArrayList; import java.util.List; import com.fasterxml.jackson.annotation.JsonInclude; @@ -33,7 +32,7 @@ public class PdfJsonTextElement { private Float y; private Float width; private Float height; - @Builder.Default private List textMatrix = new ArrayList<>(); + private List textMatrix; private PdfJsonTextColor fillColor; private PdfJsonTextColor strokeColor; private Integer renderingMode; diff --git a/app/proprietary/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java b/app/proprietary/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java index fa91cfee4..7de527d54 100644 --- a/app/proprietary/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java +++ b/app/proprietary/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java @@ -18,6 +18,7 @@ import java.time.format.DateTimeParseException; import java.util.ArrayList; import java.util.Base64; import java.util.Calendar; +import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; @@ -141,6 +142,10 @@ public class PdfJsonConversionService { private volatile boolean ghostscriptAvailable; + private static final float FLOAT_EPSILON = 0.0001f; + private static final float ORIENTATION_TOLERANCE = 0.0005f; + private static final float BASELINE_TOLERANCE = 0.5f; + @PostConstruct private void initializeToolAvailability() { initializeGhostscriptAvailability(); @@ -185,12 +190,24 @@ public class PdfJsonConversionService { } public byte[] convertPdfToJson(MultipartFile file) throws IOException { - return convertPdfToJson(file, null); + return convertPdfToJson(file, null, false); + } + + public byte[] convertPdfToJson(MultipartFile file, boolean lightweight) throws IOException { + return convertPdfToJson(file, null, lightweight); } public byte[] convertPdfToJson( MultipartFile file, Consumer progressCallback) throws IOException { + return convertPdfToJson(file, progressCallback, false); + } + + public byte[] convertPdfToJson( + MultipartFile file, + Consumer progressCallback, + boolean lightweight) + throws IOException { if (file == null) { throw ExceptionUtils.createNullArgumentException("fileInput"); } @@ -341,7 +358,7 @@ public class PdfJsonConversionService { pdfJson.setMetadata(extractMetadata(document)); pdfJson.setXmpMetadata(extractXmpMetadata(document)); pdfJson.setLazyImages(useLazyImages); - List serializedFonts = new ArrayList<>(fonts.values()); + List serializedFonts = cloneFontList(fonts.values()); serializedFonts.sort( Comparator.comparing( PdfJsonFont::getUid, @@ -385,6 +402,10 @@ public class PdfJsonConversionService { scheduleDocumentCleanup(jobId); } + if (lightweight) { + applyLightweightTransformations(pdfJson); + } + progress.accept( PdfJsonConversionProgress.of(95, "serializing", "Generating JSON output")); @@ -394,8 +415,7 @@ public class PdfJsonConversionService { pdfJson.getPages().size(), useLazyImages); - byte[] result = - objectMapper.writerWithDefaultPrettyPrinter().writeValueAsBytes(pdfJson); + byte[] result = objectMapper.writeValueAsBytes(pdfJson); progress.accept(PdfJsonConversionProgress.complete()); return result; } @@ -668,6 +688,78 @@ public class PdfJsonConversionService { return buildFontKey(page, fontId); } + private List cloneFontList(Collection source) { + List clones = new ArrayList<>(); + if (source == null) { + return clones; + } + for (PdfJsonFont font : source) { + PdfJsonFont copy = cloneFont(font); + if (copy != null) { + clones.add(copy); + } + } + return clones; + } + + private PdfJsonFont cloneFont(PdfJsonFont font) { + if (font == null) { + return null; + } + return PdfJsonFont.builder() + .id(font.getId()) + .pageNumber(font.getPageNumber()) + .uid(font.getUid()) + .baseName(font.getBaseName()) + .subtype(font.getSubtype()) + .encoding(font.getEncoding()) + .cidSystemInfo(font.getCidSystemInfo()) + .embedded(font.getEmbedded()) + .program(font.getProgram()) + .programFormat(font.getProgramFormat()) + .webProgram(font.getWebProgram()) + .webProgramFormat(font.getWebProgramFormat()) + .pdfProgram(font.getPdfProgram()) + .pdfProgramFormat(font.getPdfProgramFormat()) + .toUnicode(font.getToUnicode()) + .standard14Name(font.getStandard14Name()) + .fontDescriptorFlags(font.getFontDescriptorFlags()) + .ascent(font.getAscent()) + .descent(font.getDescent()) + .capHeight(font.getCapHeight()) + .xHeight(font.getXHeight()) + .italicAngle(font.getItalicAngle()) + .unitsPerEm(font.getUnitsPerEm()) + .cosDictionary(font.getCosDictionary()) + .build(); + } + + private void applyLightweightTransformations(PdfJsonDocument document) { + if (document == null) { + return; + } + List fonts = document.getFonts(); + if (fonts == null) { + return; + } + for (PdfJsonFont font : fonts) { + if (font == null) { + continue; + } + boolean hasUsableProgram = + hasPayload(font.getPdfProgram()) + || hasPayload(font.getWebProgram()) + || hasPayload(font.getProgram()); + if (hasUsableProgram) { + font.setCosDictionary(null); + } + } + } + + private boolean hasPayload(String value) { + return value != null && !value.isBlank(); + } + private PdfJsonFont buildFontModel( PDDocument document, PDFont font, String fontId, int pageNumber) throws IOException { PDFontDescriptor descriptor = font.getFontDescriptor(); @@ -3301,6 +3393,7 @@ public class PdfJsonConversionService { private int currentPage = 1; private Map currentFontResources = Collections.emptyMap(); + private int currentZOrderCounter; TextCollectingStripper( PDDocument document, @@ -3320,6 +3413,7 @@ public class PdfJsonConversionService { currentPage = getCurrentPageNo(); currentFontResources = pageFontResources.getOrDefault(currentPage, Collections.emptyMap()); + currentZOrderCounter = 0; } @Override @@ -3331,42 +3425,289 @@ public class PdfJsonConversionService { List pageElements = textByPage.computeIfAbsent(currentPage, key -> new ArrayList<>()); + TextRunAccumulator accumulator = null; for (TextPosition position : textPositions) { PDFont font = position.getFont(); String fontId = registerFont(font); - PdfJsonTextElement element = new PdfJsonTextElement(); - element.setText(position.getUnicode()); - element.setFontId(fontId); - element.setFontSize(position.getFontSizeInPt()); - element.setFontSizeInPt(position.getFontSizeInPt()); - element.setX(position.getXDirAdj()); - element.setY(position.getYDirAdj()); - element.setWidth(position.getWidthDirAdj()); - element.setHeight(position.getHeightDir()); - element.setTextMatrix(extractMatrix(position)); - element.setFontMatrixSize(computeFontMatrixSize(element.getTextMatrix())); - element.setSpaceWidth(position.getWidthOfSpace()); - PDGraphicsState graphicsState = getGraphicsState(); - if (graphicsState != null) { - PDTextState textState = graphicsState.getTextState(); - if (textState != null) { - element.setCharacterSpacing(textState.getCharacterSpacing()); - element.setWordSpacing(textState.getWordSpacing()); - element.setHorizontalScaling(textState.getHorizontalScaling()); - element.setLeading(textState.getLeading()); - element.setRise(textState.getRise()); - if (textState.getRenderingMode() != null) { - element.setRenderingMode(textState.getRenderingMode().intValue()); - } - } - element.setFillColor(toTextColor(graphicsState.getNonStrokingColor())); - element.setStrokeColor(toTextColor(graphicsState.getStrokingColor())); + PdfJsonTextElement element = createTextElement(position, fontId); + + if (accumulator == null) { + accumulator = new TextRunAccumulator(element, position); + } else if (!accumulator.canAppend(element, position)) { + PdfJsonTextElement built = accumulator.build(); + built.setZOrder(1_000_000 + currentZOrderCounter++); + pageElements.add(built); + accumulator = new TextRunAccumulator(element, position); + } else { + accumulator.append(element, position); } - element.setZOrder(1_000_000 + pageElements.size()); - pageElements.add(element); + } + + if (accumulator != null) { + PdfJsonTextElement built = accumulator.build(); + built.setZOrder(1_000_000 + currentZOrderCounter++); + pageElements.add(built); } } + private PdfJsonTextElement createTextElement(TextPosition position, String fontId) + throws IOException { + PdfJsonTextElement element = new PdfJsonTextElement(); + element.setText(position.getUnicode()); + element.setFontId(fontId); + element.setFontSize(position.getFontSizeInPt()); + element.setX(position.getXDirAdj()); + element.setY(position.getYDirAdj()); + element.setWidth(position.getWidthDirAdj()); + element.setHeight(position.getHeightDir()); + element.setTextMatrix(extractMatrix(position)); + element.setFontMatrixSize(computeFontMatrixSize(element.getTextMatrix())); + element.setSpaceWidth(position.getWidthOfSpace()); + + PDGraphicsState graphicsState = getGraphicsState(); + if (graphicsState != null) { + PDTextState textState = graphicsState.getTextState(); + if (textState != null) { + element.setCharacterSpacing(textState.getCharacterSpacing()); + element.setWordSpacing(textState.getWordSpacing()); + element.setHorizontalScaling(textState.getHorizontalScaling()); + element.setLeading(textState.getLeading()); + element.setRise(textState.getRise()); + if (textState.getRenderingMode() != null) { + element.setRenderingMode(textState.getRenderingMode().intValue()); + } + } + element.setFillColor(toTextColor(graphicsState.getNonStrokingColor())); + element.setStrokeColor(toTextColor(graphicsState.getStrokingColor())); + } + return element; + } + + private void compactTextElement(PdfJsonTextElement element) { + if (element == null) { + return; + } + + List matrix = element.getTextMatrix(); + if (matrix != null) { + if (matrix.isEmpty()) { + element.setTextMatrix(null); + } else if (matrix.size() == 6) { + element.setX(null); + element.setY(null); + } + } + + if (isZero(element.getCharacterSpacing())) { + element.setCharacterSpacing(null); + } + if (isZero(element.getWordSpacing())) { + element.setWordSpacing(null); + } + if (isZero(element.getLeading())) { + element.setLeading(null); + } + if (isZero(element.getRise())) { + element.setRise(null); + } + if (element.getHorizontalScaling() != null + && Math.abs(element.getHorizontalScaling() - 100f) < FLOAT_EPSILON) { + element.setHorizontalScaling(null); + } + if (element.getRenderingMode() != null && element.getRenderingMode() == 0) { + element.setRenderingMode(null); + } + if (isDefaultBlack(element.getFillColor())) { + element.setFillColor(null); + } + if (isDefaultBlack(element.getStrokeColor())) { + element.setStrokeColor(null); + } + } + + private boolean isZero(Float value) { + return value != null && Math.abs(value) < FLOAT_EPSILON; + } + + private boolean isDefaultBlack(PdfJsonTextColor color) { + if (color == null || color.getComponents() == null) { + return true; + } + List components = color.getComponents(); + if (components.isEmpty()) { + return true; + } + String space = color.getColorSpace(); + if (space == null || "DeviceRGB".equals(space)) { + if (components.size() < 3) { + return false; + } + return Math.abs(components.get(0)) < FLOAT_EPSILON + && Math.abs(components.get(1)) < FLOAT_EPSILON + && Math.abs(components.get(2)) < FLOAT_EPSILON; + } + if ("DeviceGray".equals(space)) { + return Math.abs(components.get(0)) < FLOAT_EPSILON; + } + return false; + } + + private Float baselineFrom(PdfJsonTextElement element) { + List matrix = element.getTextMatrix(); + if (matrix != null && matrix.size() >= 6) { + return matrix.get(5); + } + return element.getY(); + } + + private TextStyleKey buildStyleKey(PdfJsonTextElement element) { + return new TextStyleKey( + element.getFontId(), + element.getFontSize(), + element.getFontMatrixSize(), + element.getCharacterSpacing(), + element.getWordSpacing(), + element.getHorizontalScaling(), + element.getLeading(), + element.getRise(), + element.getFillColor(), + element.getStrokeColor(), + element.getRenderingMode(), + element.getSpaceWidth()); + } + + private class TextRunAccumulator { + private final PdfJsonTextElement baseElement; + private final TextStyleKey styleKey; + private final float orientationA; + private final float orientationB; + private final float orientationC; + private final float orientationD; + private final Float baseline; + private final List baseMatrix; + private final float startXCoord; + private final float startYCoord; + private final StringBuilder textBuilder = new StringBuilder(); + private float totalWidth; + private float maxHeight; + private float endXCoord; + + TextRunAccumulator(PdfJsonTextElement element, TextPosition position) { + this.baseElement = element; + this.styleKey = buildStyleKey(element); + this.baseMatrix = + element.getTextMatrix() != null + ? new ArrayList<>(element.getTextMatrix()) + : null; + if (baseMatrix != null && baseMatrix.size() >= 6) { + orientationA = baseMatrix.get(0); + orientationB = baseMatrix.get(1); + orientationC = baseMatrix.get(2); + orientationD = baseMatrix.get(3); + startXCoord = baseMatrix.get(4); + startYCoord = baseMatrix.get(5); + } else { + orientationA = 1f; + orientationB = 0f; + orientationC = 0f; + orientationD = 1f; + startXCoord = element.getX() != null ? element.getX() : 0f; + startYCoord = element.getY() != null ? element.getY() : 0f; + } + this.baseline = baselineFrom(element); + this.totalWidth = element.getWidth() != null ? element.getWidth() : 0f; + this.maxHeight = element.getHeight() != null ? element.getHeight() : 0f; + this.endXCoord = position.getXDirAdj() + position.getWidthDirAdj(); + this.textBuilder.append(element.getText()); + } + + boolean canAppend(PdfJsonTextElement element, TextPosition position) { + if (!styleKey.equals(buildStyleKey(element))) { + return false; + } + List matrix = element.getTextMatrix(); + float a = 1f; + float b = 0f; + float c = 0f; + float d = 1f; + if (matrix != null && matrix.size() >= 4) { + a = matrix.get(0); + b = matrix.get(1); + c = matrix.get(2); + d = matrix.get(3); + } + if (Math.abs(a - orientationA) > ORIENTATION_TOLERANCE + || Math.abs(b - orientationB) > ORIENTATION_TOLERANCE + || Math.abs(c - orientationC) > ORIENTATION_TOLERANCE + || Math.abs(d - orientationD) > ORIENTATION_TOLERANCE) { + return false; + } + + Float otherBaseline = baselineFrom(element); + if (baseline != null && otherBaseline != null) { + if (Math.abs(otherBaseline - baseline) > BASELINE_TOLERANCE) { + return false; + } + } else if (baseline != null || otherBaseline != null) { + return false; + } + + return true; + } + + void append(PdfJsonTextElement element, TextPosition position) { + textBuilder.append(element.getText()); + float width = + element.getWidth() != null ? element.getWidth() : position.getWidthDirAdj(); + totalWidth += width; + float height = + element.getHeight() != null ? element.getHeight() : position.getHeightDir(); + if (height > maxHeight) { + maxHeight = height; + } + endXCoord = position.getXDirAdj() + position.getWidthDirAdj(); + } + + PdfJsonTextElement build() { + PdfJsonTextElement result = baseElement; + result.setText(textBuilder.toString()); + float widthCandidate = endXCoord - startXCoord; + if (widthCandidate > totalWidth) { + totalWidth = widthCandidate; + } + result.setWidth(totalWidth); + result.setHeight(maxHeight); + if (baseMatrix != null && baseMatrix.size() == 6) { + List matrix = new ArrayList<>(baseMatrix); + matrix.set(0, orientationA); + matrix.set(1, orientationB); + matrix.set(2, orientationC); + matrix.set(3, orientationD); + matrix.set(4, startXCoord); + matrix.set(5, startYCoord); + result.setTextMatrix(matrix); + result.setX(null); + result.setY(null); + } + compactTextElement(result); + return result; + } + } + + private record TextStyleKey( + String fontId, + Float fontSize, + Float fontMatrixSize, + Float characterSpacing, + Float wordSpacing, + Float horizontalScaling, + Float leading, + Float rise, + PdfJsonTextColor fillColor, + PdfJsonTextColor strokeColor, + Integer renderingMode, + Float spaceWidth) {} + private List extractMatrix(TextPosition position) { float[] values = new float[6]; values[0] = position.getTextMatrix().getValue(0, 0); diff --git a/docs/pdf-json-editor-backlog.md b/docs/pdf-json-editor-backlog.md new file mode 100644 index 000000000..a87a596b1 --- /dev/null +++ b/docs/pdf-json-editor-backlog.md @@ -0,0 +1,28 @@ +# PDF JSON Editor Backlog + +- **Type3 Font Support (Text Additions)** + - Parse Type3 charprocs to extract glyph outlines, build a synthetic TrueType/OpenType font (FontTools, Ghostscript `ps2ttf`, etc.), and store it in `webProgram` / `pdfProgram` for client use. + - Preserve the original Type3 resources for round-trip fidelity; use the synthesized font only for edited elements while reusing the original stream elsewhere. + - Extend conversion logic so fallback kicks in only when conversion fails, and track which elements rely on the synthetic font to avoid mixing source glyphs (`PdfJsonConversionService.java:998-1090`, `1840-2012`). + - Update the viewer/renderer to surface conversion errors and block editing when no faithful font can be produced. + +- **Vector Artwork Preview** + - Reuse `contentStreams` already emitted by the backend to render vector paths alongside text/images in the React workspace (`frontend/src/proprietary/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx:1048-1285`). + - Either render via Canvas/SVG on the client or call back to a server-rendered bitmap for the background. Keep edited text/images layered on top. + - Maintain export fidelity by writing any untouched vector stream back during PDF regeneration (`PdfJsonConversionService.java:1714-1799`, `520-612`). + +- **Lazy Fetch Endpoints** + - Provide separate endpoints to fetch: + 1. Raw COS dictionaries/font programs when the user opens advanced panels. + 2. Page-level raster/vector previews to avoid sending large `imageData` upfront. + - Reuse the existing job cache (`documentCache`) to serve these on demand and clean up after timeouts (`PdfJsonConversionService.java:3608-3687`). + +- **Editor UX Safeguards** + - Respect `fallbackFontService` indicators; mark groups using fallback glyphs so the UI can warn about possible appearance shifts (`frontend/src/proprietary/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx:1260-1287`). + - Surface when Type3 conversion was downgraded (e.g., rasterized glyphs) and limit editing to operations that keep the PDF stable. + +- **Canonical Font Sharing** + - Emit fonts once per unique embedded program. Add a `canonicalFonts` array containing the full payload (program, ToUnicode, metadata) and a compact `fontAliases` mapping `{pageNumber, fontId, canonicalUid}` so text elements can still reference per-page IDs. + - Store COS dictionaries only on canonical entries; aliases should keep light fields (e.g., size adjustments) if they differ. + - Update `buildFontMap` to resolve aliases when recreating PDFBox fonts, and adjust the front end to load programs via the canonical UID. + - Optional: expose a lazy endpoint for the original COS dictionary if the canonical record strips it, so export still reconstructs untouched fonts. diff --git a/frontend/src/proprietary/tools/pdfJsonEditor/PdfJsonEditor.tsx b/frontend/src/proprietary/tools/pdfJsonEditor/PdfJsonEditor.tsx index b236de5a8..47b183102 100644 --- a/frontend/src/proprietary/tools/pdfJsonEditor/PdfJsonEditor.tsx +++ b/frontend/src/proprietary/tools/pdfJsonEditor/PdfJsonEditor.tsx @@ -279,7 +279,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => { console.log('Sending conversion request with async=true'); const response = await apiClient.post( - `${CONVERSION_ENDPOINTS['pdf-json']}?async=true`, + `${CONVERSION_ENDPOINTS['pdf-json']}?async=true&lightweight=true`, formData, { responseType: 'json', @@ -632,7 +632,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => { } const { document, filename } = payload; - const serialized = JSON.stringify(document, null, 2); + const serialized = JSON.stringify(document); downloadTextAsFile(serialized, filename, 'application/json'); if (onComplete) { @@ -760,7 +760,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => { } const { document, filename } = payload; - const serialized = JSON.stringify(document, null, 2); + const serialized = JSON.stringify(document); const jsonFile = new File([serialized], filename, { type: 'application/json' }); const formData = new FormData(); diff --git a/scripts/analyze_pdf_json.py b/scripts/analyze_pdf_json.py new file mode 100644 index 000000000..1a9ba9b21 --- /dev/null +++ b/scripts/analyze_pdf_json.py @@ -0,0 +1,277 @@ +#!/usr/bin/env python3 +""" +Quick inspection utility for PDF→JSON exports. + +Usage: + python scripts/analyze_pdf_json.py path/to/export.json + +The script prints size and font statistics so we can confirm whether the +lightweight export (no COS dictionaries) is active and how large the font +payloads are. +""" +from __future__ import annotations + +import argparse +import base64 +import json +import math +from pathlib import Path +from dataclasses import dataclass +from typing import Any, Dict, Iterable, List, Tuple + + +def human_bytes(value: float) -> str: + if value <= 0: + return "0 B" + units = ["B", "KB", "MB", "GB", "TB"] + order = min(int(math.log(value, 1024)), len(units) - 1) + scaled = value / (1024**order) + return f"{scaled:.1f} {units[order]}" + + +def base64_payload_size(encoded: str | None) -> int: + if not encoded: + return 0 + length = len(encoded.strip()) + if length == 0: + return 0 + return int(length * 0.75) + + +@dataclass +class FontBreakdown: + total: int = 0 + with_cos: int = 0 + with_program: int = 0 + with_web_program: int = 0 + with_pdf_program: int = 0 + program_bytes: int = 0 + web_program_bytes: int = 0 + pdf_program_bytes: int = 0 + metadata_bytes: int = 0 + sample_cos_ids: List[Tuple[str | None, str | None]] = None + + +@dataclass +class PageBreakdown: + page_count: int = 0 + total_text_elements: int = 0 + total_image_elements: int = 0 + text_payload_chars: int = 0 + text_struct_bytes: int = 0 + image_struct_bytes: int = 0 + resources_bytes: int = 0 + content_stream_bytes: int = 0 + annotations_bytes: int = 0 + + +@dataclass +class DocumentBreakdown: + total_bytes: int + fonts: FontBreakdown + pages: PageBreakdown + metadata_bytes: int + xmp_bytes: int + form_fields_bytes: int + lazy_flag_bytes: int + + +def approx_struct_size(obj: Any) -> int: + if obj is None: + return 0 + return len(json.dumps(obj, separators=(",", ":"))) + + +def analyze_fonts(fonts: Iterable[Dict[str, Any]]) -> FontBreakdown: + total = 0 + with_cos = 0 + with_prog = 0 + with_web_prog = 0 + with_pdf_prog = 0 + program_bytes = 0 + web_program_bytes = 0 + pdf_program_bytes = 0 + metadata_bytes = 0 + sample_cos_ids: List[Tuple[str | None, str | None]] = [] + + for font in fonts: + total += 1 + font_id = font.get("id") + uid = font.get("uid") + cos_value = font.get("cosDictionary") + if cos_value: + with_cos += 1 + if len(sample_cos_ids) < 5: + sample_cos_ids.append((font_id, uid)) + + metadata_bytes += approx_struct_size( + {k: v for k, v in font.items() if k not in {"program", "webProgram", "pdfProgram"}} + ) + + program = font.get("program") + web_program = font.get("webProgram") + pdf_program = font.get("pdfProgram") + + if program: + with_prog += 1 + program_bytes += base64_payload_size(program) + if web_program: + with_web_prog += 1 + web_program_bytes += base64_payload_size(web_program) + if pdf_program: + with_pdf_prog += 1 + pdf_program_bytes += base64_payload_size(pdf_program) + + return FontBreakdown( + total=total, + with_cos=with_cos, + with_program=with_prog, + with_web_program=with_web_prog, + with_pdf_program=with_pdf_prog, + program_bytes=program_bytes, + web_program_bytes=web_program_bytes, + pdf_program_bytes=pdf_program_bytes, + metadata_bytes=metadata_bytes, + sample_cos_ids=sample_cos_ids, + ) + + +def analyze_pages(pages: Iterable[Dict[str, Any]]) -> PageBreakdown: + page_count = 0 + total_text = 0 + total_images = 0 + text_chars = 0 + text_struct_bytes = 0 + image_struct_bytes = 0 + resources_bytes = 0 + stream_bytes = 0 + annotations_bytes = 0 + + for page in pages: + page_count += 1 + texts = page.get("textElements") or [] + images = page.get("imageElements") or [] + resources = page.get("resources") + streams = page.get("contentStreams") or [] + annotations = page.get("annotations") or [] + + total_text += len(texts) + total_images += len(images) + text_struct_bytes += approx_struct_size(texts) + image_struct_bytes += approx_struct_size(images) + resources_bytes += approx_struct_size(resources) + stream_bytes += approx_struct_size(streams) + annotations_bytes += approx_struct_size(annotations) + + for elem in texts: + text = elem.get("text") + if text: + text_chars += len(text) + + return PageBreakdown( + page_count=page_count, + total_text_elements=total_text, + total_image_elements=total_images, + text_payload_chars=text_chars, + text_struct_bytes=text_struct_bytes, + image_struct_bytes=image_struct_bytes, + resources_bytes=resources_bytes, + content_stream_bytes=stream_bytes, + annotations_bytes=annotations_bytes, + ) + + +def analyze_document(document: Dict[str, Any], total_size: int) -> DocumentBreakdown: + fonts = document.get("fonts") or [] + pages = document.get("pages") or [] + metadata = document.get("metadata") or {} + + font_stats = analyze_fonts(fonts) + page_stats = analyze_pages(pages) + + return DocumentBreakdown( + total_bytes=total_size, + fonts=font_stats, + pages=page_stats, + metadata_bytes=approx_struct_size(metadata), + xmp_bytes=base64_payload_size(document.get("xmpMetadata")), + form_fields_bytes=approx_struct_size(document.get("formFields")), + lazy_flag_bytes=approx_struct_size(document.get("lazyImages")), + ) + + +def main() -> None: + parser = argparse.ArgumentParser(description="Inspect a PDF JSON export.") + parser.add_argument("json_path", type=Path, help="Path to the JSON export.") + args = parser.parse_args() + + json_path = args.json_path + if not json_path.exists(): + raise SystemExit(f"File not found: {json_path}") + + file_size = json_path.stat().st_size + print(f"File: {json_path}") + print(f"Size: {human_bytes(file_size)} ({file_size:,} bytes)") + + with json_path.open("r", encoding="utf-8") as handle: + document = json.load(handle) + + if not isinstance(document, dict): + raise SystemExit("Unexpected JSON structure (expected an object at root).") + + summary = analyze_document(document, file_size) + page_stats = summary.pages + print(f"Pages: {page_stats.page_count}") + print(f"Total text elements: {page_stats.total_text_elements:,}") + print(f"Total image elements: {page_stats.total_image_elements:,}") + print( + f"Page structural bytes (text arrays + images + streams + annotations): " + f"{human_bytes(page_stats.text_struct_bytes + page_stats.image_struct_bytes + page_stats.content_stream_bytes + page_stats.annotations_bytes)}" + ) + + font_stats = summary.fonts + print("\nFont summary:") + print(f" Fonts total: {font_stats.total}") + print(f" Fonts with cosDictionary: {font_stats.with_cos}") + print(f" Fonts with program: {font_stats.with_program}") + print(f" Fonts with webProgram: {font_stats.with_web_program}") + print(f" Fonts with pdfProgram: {font_stats.with_pdf_program}") + print( + " Payload sizes:" + f" program={human_bytes(font_stats.program_bytes)}," + f" webProgram={human_bytes(font_stats.web_program_bytes)}," + f" pdfProgram={human_bytes(font_stats.pdf_program_bytes)}," + f" metadata={human_bytes(font_stats.metadata_bytes)}" + ) + if font_stats.sample_cos_ids: + print(" Sample fonts still carrying cosDictionary:") + for idx, (font_id, uid) in enumerate(font_stats.sample_cos_ids, start=1): + print(f" {idx}. id={font_id!r}, uid={uid!r}") + else: + print(" No fonts retain cosDictionary entries.") + + print("\nOther sections:") + print(f" Metadata bytes: {human_bytes(summary.metadata_bytes)}") + print(f" XMP metadata bytes: {human_bytes(summary.xmp_bytes)}") + print(f" Form fields bytes: {human_bytes(summary.form_fields_bytes)}") + print(f" Lazy flag bytes: {summary.lazy_flag_bytes}") + print( + f" Text payload characters (not counting JSON overhead): " + f"{page_stats.text_payload_chars:,}" + ) + print( + f" Approx text structure bytes: {human_bytes(page_stats.text_struct_bytes)}" + ) + print( + f" Approx image structure bytes: {human_bytes(page_stats.image_struct_bytes)}" + ) + print( + f" Approx content stream bytes: {human_bytes(page_stats.content_stream_bytes)}" + ) + print( + f" Approx annotations bytes: {human_bytes(page_stats.annotations_bytes)}" + ) + + +if __name__ == "__main__": + main()