JSON size cleanup: 450 MB to 35 MB

2025-11-16 01:21:16 +01:00 · 2025-11-05 23:35:08 +00:00 · 2025-11-05 23:35:08 +00:00 · d4c702f96c
commit d4c702f96c
parent d4e95a6ed7
6 changed files with 687 additions and 40 deletions
--- a/app/proprietary/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPdfJsonController.java
+++ b/app/proprietary/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPdfJsonController.java
@ -42,14 +42,16 @@ public class ConvertPdfJsonController {
            description =
                    "Extracts PDF text, fonts, and metadata into an editable JSON structure that can be"
                            + " transformed back into a PDF. Input:PDF Output:JSON Type:SISO")
-    public ResponseEntity<byte[]> convertPdfToJson(@ModelAttribute PDFFile request)
+    public ResponseEntity<byte[]> convertPdfToJson(
+            @ModelAttribute PDFFile request,
+            @RequestParam(value = "lightweight", defaultValue = "false") boolean lightweight)
            throws Exception {
        MultipartFile inputFile = request.getFileInput();
        if (inputFile == null) {
            throw ExceptionUtils.createNullArgumentException("fileInput");
        }

-        byte[] jsonBytes = pdfJsonConversionService.convertPdfToJson(inputFile);
+        byte[] jsonBytes = pdfJsonConversionService.convertPdfToJson(inputFile, lightweight);
        String originalName = inputFile.getOriginalFilename();
        String baseName =
                (originalName != null && !originalName.isBlank())
--- a/app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonTextElement.java
+++ b/app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonTextElement.java
@ -1,6 +1,5 @@
 package stirling.software.SPDF.model.json;

-import java.util.ArrayList;
 import java.util.List;

 import com.fasterxml.jackson.annotation.JsonInclude;
@ -33,7 +32,7 @@ public class PdfJsonTextElement {
    private Float y;
    private Float width;
    private Float height;
-    @Builder.Default private List<Float> textMatrix = new ArrayList<>();
+    private List<Float> textMatrix;
    private PdfJsonTextColor fillColor;
    private PdfJsonTextColor strokeColor;
    private Integer renderingMode;
--- a/app/proprietary/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java
+++ b/app/proprietary/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java
@ -18,6 +18,7 @@ import java.time.format.DateTimeParseException;
 import java.util.ArrayList;
 import java.util.Base64;
 import java.util.Calendar;
+import java.util.Collection;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashMap;
@ -141,6 +142,10 @@ public class PdfJsonConversionService {

    private volatile boolean ghostscriptAvailable;

+    private static final float FLOAT_EPSILON = 0.0001f;
+    private static final float ORIENTATION_TOLERANCE = 0.0005f;
+    private static final float BASELINE_TOLERANCE = 0.5f;
+
    @PostConstruct
    private void initializeToolAvailability() {
        initializeGhostscriptAvailability();
@ -185,12 +190,24 @@ public class PdfJsonConversionService {
    }

    public byte[] convertPdfToJson(MultipartFile file) throws IOException {
-        return convertPdfToJson(file, null);
+        return convertPdfToJson(file, null, false);
+    }
+
+    public byte[] convertPdfToJson(MultipartFile file, boolean lightweight) throws IOException {
+        return convertPdfToJson(file, null, lightweight);
    }

    public byte[] convertPdfToJson(
            MultipartFile file, Consumer<PdfJsonConversionProgress> progressCallback)
            throws IOException {
+        return convertPdfToJson(file, progressCallback, false);
+    }
+
+    public byte[] convertPdfToJson(
+            MultipartFile file,
+            Consumer<PdfJsonConversionProgress> progressCallback,
+            boolean lightweight)
+            throws IOException {
        if (file == null) {
            throw ExceptionUtils.createNullArgumentException("fileInput");
        }
@ -341,7 +358,7 @@ public class PdfJsonConversionService {
                pdfJson.setMetadata(extractMetadata(document));
                pdfJson.setXmpMetadata(extractXmpMetadata(document));
                pdfJson.setLazyImages(useLazyImages);
-                List<PdfJsonFont> serializedFonts = new ArrayList<>(fonts.values());
+                List<PdfJsonFont> serializedFonts = cloneFontList(fonts.values());
                serializedFonts.sort(
                        Comparator.comparing(
                                PdfJsonFont::getUid,
@ -385,6 +402,10 @@ public class PdfJsonConversionService {
                    scheduleDocumentCleanup(jobId);
                }

+                if (lightweight) {
+                    applyLightweightTransformations(pdfJson);
+                }
+
                progress.accept(
                        PdfJsonConversionProgress.of(95, "serializing", "Generating JSON output"));

@ -394,8 +415,7 @@ public class PdfJsonConversionService {
                        pdfJson.getPages().size(),
                        useLazyImages);

-                byte[] result =
-                        objectMapper.writerWithDefaultPrettyPrinter().writeValueAsBytes(pdfJson);
+                byte[] result = objectMapper.writeValueAsBytes(pdfJson);
                progress.accept(PdfJsonConversionProgress.complete());
                return result;
            }
@ -668,6 +688,78 @@ public class PdfJsonConversionService {
        return buildFontKey(page, fontId);
    }

+    private List<PdfJsonFont> cloneFontList(Collection<PdfJsonFont> source) {
+        List<PdfJsonFont> clones = new ArrayList<>();
+        if (source == null) {
+            return clones;
+        }
+        for (PdfJsonFont font : source) {
+            PdfJsonFont copy = cloneFont(font);
+            if (copy != null) {
+                clones.add(copy);
+            }
+        }
+        return clones;
+    }
+
+    private PdfJsonFont cloneFont(PdfJsonFont font) {
+        if (font == null) {
+            return null;
+        }
+        return PdfJsonFont.builder()
+                .id(font.getId())
+                .pageNumber(font.getPageNumber())
+                .uid(font.getUid())
+                .baseName(font.getBaseName())
+                .subtype(font.getSubtype())
+                .encoding(font.getEncoding())
+                .cidSystemInfo(font.getCidSystemInfo())
+                .embedded(font.getEmbedded())
+                .program(font.getProgram())
+                .programFormat(font.getProgramFormat())
+                .webProgram(font.getWebProgram())
+                .webProgramFormat(font.getWebProgramFormat())
+                .pdfProgram(font.getPdfProgram())
+                .pdfProgramFormat(font.getPdfProgramFormat())
+                .toUnicode(font.getToUnicode())
+                .standard14Name(font.getStandard14Name())
+                .fontDescriptorFlags(font.getFontDescriptorFlags())
+                .ascent(font.getAscent())
+                .descent(font.getDescent())
+                .capHeight(font.getCapHeight())
+                .xHeight(font.getXHeight())
+                .italicAngle(font.getItalicAngle())
+                .unitsPerEm(font.getUnitsPerEm())
+                .cosDictionary(font.getCosDictionary())
+                .build();
+    }
+
+    private void applyLightweightTransformations(PdfJsonDocument document) {
+        if (document == null) {
+            return;
+        }
+        List<PdfJsonFont> fonts = document.getFonts();
+        if (fonts == null) {
+            return;
+        }
+        for (PdfJsonFont font : fonts) {
+            if (font == null) {
+                continue;
+            }
+            boolean hasUsableProgram =
+                    hasPayload(font.getPdfProgram())
+                            || hasPayload(font.getWebProgram())
+                            || hasPayload(font.getProgram());
+            if (hasUsableProgram) {
+                font.setCosDictionary(null);
+            }
+        }
+    }
+
+    private boolean hasPayload(String value) {
+        return value != null && !value.isBlank();
+    }
+
    private PdfJsonFont buildFontModel(
            PDDocument document, PDFont font, String fontId, int pageNumber) throws IOException {
        PDFontDescriptor descriptor = font.getFontDescriptor();
@ -3301,6 +3393,7 @@ public class PdfJsonConversionService {

        private int currentPage = 1;
        private Map<PDFont, String> currentFontResources = Collections.emptyMap();
+        private int currentZOrderCounter;

        TextCollectingStripper(
                PDDocument document,
@ -3320,6 +3413,7 @@ public class PdfJsonConversionService {
            currentPage = getCurrentPageNo();
            currentFontResources =
                    pageFontResources.getOrDefault(currentPage, Collections.emptyMap());
+            currentZOrderCounter = 0;
        }

        @Override
@ -3331,42 +3425,289 @@ public class PdfJsonConversionService {
            List<PdfJsonTextElement> pageElements =
                    textByPage.computeIfAbsent(currentPage, key -> new ArrayList<>());

+            TextRunAccumulator accumulator = null;
            for (TextPosition position : textPositions) {
                PDFont font = position.getFont();
                String fontId = registerFont(font);
-                PdfJsonTextElement element = new PdfJsonTextElement();
-                element.setText(position.getUnicode());
-                element.setFontId(fontId);
-                element.setFontSize(position.getFontSizeInPt());
-                element.setFontSizeInPt(position.getFontSizeInPt());
-                element.setX(position.getXDirAdj());
-                element.setY(position.getYDirAdj());
-                element.setWidth(position.getWidthDirAdj());
-                element.setHeight(position.getHeightDir());
-                element.setTextMatrix(extractMatrix(position));
-                element.setFontMatrixSize(computeFontMatrixSize(element.getTextMatrix()));
-                element.setSpaceWidth(position.getWidthOfSpace());
-                PDGraphicsState graphicsState = getGraphicsState();
-                if (graphicsState != null) {
-                    PDTextState textState = graphicsState.getTextState();
-                    if (textState != null) {
-                        element.setCharacterSpacing(textState.getCharacterSpacing());
-                        element.setWordSpacing(textState.getWordSpacing());
-                        element.setHorizontalScaling(textState.getHorizontalScaling());
-                        element.setLeading(textState.getLeading());
-                        element.setRise(textState.getRise());
-                        if (textState.getRenderingMode() != null) {
-                            element.setRenderingMode(textState.getRenderingMode().intValue());
-                        }
-                    }
-                    element.setFillColor(toTextColor(graphicsState.getNonStrokingColor()));
-                    element.setStrokeColor(toTextColor(graphicsState.getStrokingColor()));
+                PdfJsonTextElement element = createTextElement(position, fontId);
+
+                if (accumulator == null) {
+                    accumulator = new TextRunAccumulator(element, position);
+                } else if (!accumulator.canAppend(element, position)) {
+                    PdfJsonTextElement built = accumulator.build();
+                    built.setZOrder(1_000_000 + currentZOrderCounter++);
+                    pageElements.add(built);
+                    accumulator = new TextRunAccumulator(element, position);
+                } else {
+                    accumulator.append(element, position);
                }
-                element.setZOrder(1_000_000 + pageElements.size());
-                pageElements.add(element);
+            }
+
+            if (accumulator != null) {
+                PdfJsonTextElement built = accumulator.build();
+                built.setZOrder(1_000_000 + currentZOrderCounter++);
+                pageElements.add(built);
            }
        }

+        private PdfJsonTextElement createTextElement(TextPosition position, String fontId)
+                throws IOException {
+            PdfJsonTextElement element = new PdfJsonTextElement();
+            element.setText(position.getUnicode());
+            element.setFontId(fontId);
+            element.setFontSize(position.getFontSizeInPt());
+            element.setX(position.getXDirAdj());
+            element.setY(position.getYDirAdj());
+            element.setWidth(position.getWidthDirAdj());
+            element.setHeight(position.getHeightDir());
+            element.setTextMatrix(extractMatrix(position));
+            element.setFontMatrixSize(computeFontMatrixSize(element.getTextMatrix()));
+            element.setSpaceWidth(position.getWidthOfSpace());
+
+            PDGraphicsState graphicsState = getGraphicsState();
+            if (graphicsState != null) {
+                PDTextState textState = graphicsState.getTextState();
+                if (textState != null) {
+                    element.setCharacterSpacing(textState.getCharacterSpacing());
+                    element.setWordSpacing(textState.getWordSpacing());
+                    element.setHorizontalScaling(textState.getHorizontalScaling());
+                    element.setLeading(textState.getLeading());
+                    element.setRise(textState.getRise());
+                    if (textState.getRenderingMode() != null) {
+                        element.setRenderingMode(textState.getRenderingMode().intValue());
+                    }
+                }
+                element.setFillColor(toTextColor(graphicsState.getNonStrokingColor()));
+                element.setStrokeColor(toTextColor(graphicsState.getStrokingColor()));
+            }
+            return element;
+        }
+
+        private void compactTextElement(PdfJsonTextElement element) {
+            if (element == null) {
+                return;
+            }
+
+            List<Float> matrix = element.getTextMatrix();
+            if (matrix != null) {
+                if (matrix.isEmpty()) {
+                    element.setTextMatrix(null);
+                } else if (matrix.size() == 6) {
+                    element.setX(null);
+                    element.setY(null);
+                }
+            }
+
+            if (isZero(element.getCharacterSpacing())) {
+                element.setCharacterSpacing(null);
+            }
+            if (isZero(element.getWordSpacing())) {
+                element.setWordSpacing(null);
+            }
+            if (isZero(element.getLeading())) {
+                element.setLeading(null);
+            }
+            if (isZero(element.getRise())) {
+                element.setRise(null);
+            }
+            if (element.getHorizontalScaling() != null
+                    && Math.abs(element.getHorizontalScaling() - 100f) < FLOAT_EPSILON) {
+                element.setHorizontalScaling(null);
+            }
+            if (element.getRenderingMode() != null && element.getRenderingMode() == 0) {
+                element.setRenderingMode(null);
+            }
+            if (isDefaultBlack(element.getFillColor())) {
+                element.setFillColor(null);
+            }
+            if (isDefaultBlack(element.getStrokeColor())) {
+                element.setStrokeColor(null);
+            }
+        }
+
+        private boolean isZero(Float value) {
+            return value != null && Math.abs(value) < FLOAT_EPSILON;
+        }
+
+        private boolean isDefaultBlack(PdfJsonTextColor color) {
+            if (color == null || color.getComponents() == null) {
+                return true;
+            }
+            List<Float> components = color.getComponents();
+            if (components.isEmpty()) {
+                return true;
+            }
+            String space = color.getColorSpace();
+            if (space == null || "DeviceRGB".equals(space)) {
+                if (components.size() < 3) {
+                    return false;
+                }
+                return Math.abs(components.get(0)) < FLOAT_EPSILON
+                        && Math.abs(components.get(1)) < FLOAT_EPSILON
+                        && Math.abs(components.get(2)) < FLOAT_EPSILON;
+            }
+            if ("DeviceGray".equals(space)) {
+                return Math.abs(components.get(0)) < FLOAT_EPSILON;
+            }
+            return false;
+        }
+
+        private Float baselineFrom(PdfJsonTextElement element) {
+            List<Float> matrix = element.getTextMatrix();
+            if (matrix != null && matrix.size() >= 6) {
+                return matrix.get(5);
+            }
+            return element.getY();
+        }
+
+        private TextStyleKey buildStyleKey(PdfJsonTextElement element) {
+            return new TextStyleKey(
+                    element.getFontId(),
+                    element.getFontSize(),
+                    element.getFontMatrixSize(),
+                    element.getCharacterSpacing(),
+                    element.getWordSpacing(),
+                    element.getHorizontalScaling(),
+                    element.getLeading(),
+                    element.getRise(),
+                    element.getFillColor(),
+                    element.getStrokeColor(),
+                    element.getRenderingMode(),
+                    element.getSpaceWidth());
+        }
+
+        private class TextRunAccumulator {
+            private final PdfJsonTextElement baseElement;
+            private final TextStyleKey styleKey;
+            private final float orientationA;
+            private final float orientationB;
+            private final float orientationC;
+            private final float orientationD;
+            private final Float baseline;
+            private final List<Float> baseMatrix;
+            private final float startXCoord;
+            private final float startYCoord;
+            private final StringBuilder textBuilder = new StringBuilder();
+            private float totalWidth;
+            private float maxHeight;
+            private float endXCoord;
+
+            TextRunAccumulator(PdfJsonTextElement element, TextPosition position) {
+                this.baseElement = element;
+                this.styleKey = buildStyleKey(element);
+                this.baseMatrix =
+                        element.getTextMatrix() != null
+                                ? new ArrayList<>(element.getTextMatrix())
+                                : null;
+                if (baseMatrix != null && baseMatrix.size() >= 6) {
+                    orientationA = baseMatrix.get(0);
+                    orientationB = baseMatrix.get(1);
+                    orientationC = baseMatrix.get(2);
+                    orientationD = baseMatrix.get(3);
+                    startXCoord = baseMatrix.get(4);
+                    startYCoord = baseMatrix.get(5);
+                } else {
+                    orientationA = 1f;
+                    orientationB = 0f;
+                    orientationC = 0f;
+                    orientationD = 1f;
+                    startXCoord = element.getX() != null ? element.getX() : 0f;
+                    startYCoord = element.getY() != null ? element.getY() : 0f;
+                }
+                this.baseline = baselineFrom(element);
+                this.totalWidth = element.getWidth() != null ? element.getWidth() : 0f;
+                this.maxHeight = element.getHeight() != null ? element.getHeight() : 0f;
+                this.endXCoord = position.getXDirAdj() + position.getWidthDirAdj();
+                this.textBuilder.append(element.getText());
+            }
+
+            boolean canAppend(PdfJsonTextElement element, TextPosition position) {
+                if (!styleKey.equals(buildStyleKey(element))) {
+                    return false;
+                }
+                List<Float> matrix = element.getTextMatrix();
+                float a = 1f;
+                float b = 0f;
+                float c = 0f;
+                float d = 1f;
+                if (matrix != null && matrix.size() >= 4) {
+                    a = matrix.get(0);
+                    b = matrix.get(1);
+                    c = matrix.get(2);
+                    d = matrix.get(3);
+                }
+                if (Math.abs(a - orientationA) > ORIENTATION_TOLERANCE
+                        || Math.abs(b - orientationB) > ORIENTATION_TOLERANCE
+                        || Math.abs(c - orientationC) > ORIENTATION_TOLERANCE
+                        || Math.abs(d - orientationD) > ORIENTATION_TOLERANCE) {
+                    return false;
+                }
+
+                Float otherBaseline = baselineFrom(element);
+                if (baseline != null && otherBaseline != null) {
+                    if (Math.abs(otherBaseline - baseline) > BASELINE_TOLERANCE) {
+                        return false;
+                    }
+                } else if (baseline != null || otherBaseline != null) {
+                    return false;
+                }
+
+                return true;
+            }
+
+            void append(PdfJsonTextElement element, TextPosition position) {
+                textBuilder.append(element.getText());
+                float width =
+                        element.getWidth() != null ? element.getWidth() : position.getWidthDirAdj();
+                totalWidth += width;
+                float height =
+                        element.getHeight() != null ? element.getHeight() : position.getHeightDir();
+                if (height > maxHeight) {
+                    maxHeight = height;
+                }
+                endXCoord = position.getXDirAdj() + position.getWidthDirAdj();
+            }
+
+            PdfJsonTextElement build() {
+                PdfJsonTextElement result = baseElement;
+                result.setText(textBuilder.toString());
+                float widthCandidate = endXCoord - startXCoord;
+                if (widthCandidate > totalWidth) {
+                    totalWidth = widthCandidate;
+                }
+                result.setWidth(totalWidth);
+                result.setHeight(maxHeight);
+                if (baseMatrix != null && baseMatrix.size() == 6) {
+                    List<Float> matrix = new ArrayList<>(baseMatrix);
+                    matrix.set(0, orientationA);
+                    matrix.set(1, orientationB);
+                    matrix.set(2, orientationC);
+                    matrix.set(3, orientationD);
+                    matrix.set(4, startXCoord);
+                    matrix.set(5, startYCoord);
+                    result.setTextMatrix(matrix);
+                    result.setX(null);
+                    result.setY(null);
+                }
+                compactTextElement(result);
+                return result;
+            }
+        }
+
+        private record TextStyleKey(
+                String fontId,
+                Float fontSize,
+                Float fontMatrixSize,
+                Float characterSpacing,
+                Float wordSpacing,
+                Float horizontalScaling,
+                Float leading,
+                Float rise,
+                PdfJsonTextColor fillColor,
+                PdfJsonTextColor strokeColor,
+                Integer renderingMode,
+                Float spaceWidth) {}
+
        private List<Float> extractMatrix(TextPosition position) {
            float[] values = new float[6];
            values[0] = position.getTextMatrix().getValue(0, 0);
--- a/docs/pdf-json-editor-backlog.md
+++ b/docs/pdf-json-editor-backlog.md
@ -0,0 +1,28 @@
+# PDF JSON Editor Backlog
+
+- **Type3 Font Support (Text Additions)**
+  - Parse Type3 charprocs to extract glyph outlines, build a synthetic TrueType/OpenType font (FontTools, Ghostscript `ps2ttf`, etc.), and store it in `webProgram` / `pdfProgram` for client use.
+  - Preserve the original Type3 resources for round-trip fidelity; use the synthesized font only for edited elements while reusing the original stream elsewhere.
+  - Extend conversion logic so fallback kicks in only when conversion fails, and track which elements rely on the synthetic font to avoid mixing source glyphs (`PdfJsonConversionService.java:998-1090`, `1840-2012`).
+  - Update the viewer/renderer to surface conversion errors and block editing when no faithful font can be produced.
+
+- **Vector Artwork Preview**
+  - Reuse `contentStreams` already emitted by the backend to render vector paths alongside text/images in the React workspace (`frontend/src/proprietary/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx:1048-1285`).
+  - Either render via Canvas/SVG on the client or call back to a server-rendered bitmap for the background. Keep edited text/images layered on top.
+  - Maintain export fidelity by writing any untouched vector stream back during PDF regeneration (`PdfJsonConversionService.java:1714-1799`, `520-612`).
+
+- **Lazy Fetch Endpoints**
+  - Provide separate endpoints to fetch:
+    1. Raw COS dictionaries/font programs when the user opens advanced panels.
+    2. Page-level raster/vector previews to avoid sending large `imageData` upfront.
+  - Reuse the existing job cache (`documentCache`) to serve these on demand and clean up after timeouts (`PdfJsonConversionService.java:3608-3687`).
+
+- **Editor UX Safeguards**
+  - Respect `fallbackFontService` indicators; mark groups using fallback glyphs so the UI can warn about possible appearance shifts (`frontend/src/proprietary/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx:1260-1287`).
+  - Surface when Type3 conversion was downgraded (e.g., rasterized glyphs) and limit editing to operations that keep the PDF stable.
+
+- **Canonical Font Sharing**
+  - Emit fonts once per unique embedded program. Add a `canonicalFonts` array containing the full payload (program, ToUnicode, metadata) and a compact `fontAliases` mapping `{pageNumber, fontId, canonicalUid}` so text elements can still reference per-page IDs.
+  - Store COS dictionaries only on canonical entries; aliases should keep light fields (e.g., size adjustments) if they differ.
+  - Update `buildFontMap` to resolve aliases when recreating PDFBox fonts, and adjust the front end to load programs via the canonical UID.
+  - Optional: expose a lazy endpoint for the original COS dictionary if the canonical record strips it, so export still reconstructs untouched fonts.
--- a/frontend/src/proprietary/tools/pdfJsonEditor/PdfJsonEditor.tsx
+++ b/frontend/src/proprietary/tools/pdfJsonEditor/PdfJsonEditor.tsx
@ -279,7 +279,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {

          console.log('Sending conversion request with async=true');
          const response = await apiClient.post(
-            `${CONVERSION_ENDPOINTS['pdf-json']}?async=true`,
+            `${CONVERSION_ENDPOINTS['pdf-json']}?async=true&lightweight=true`,
            formData,
            {
              responseType: 'json',
@ -632,7 +632,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
    }

    const { document, filename } = payload;
-    const serialized = JSON.stringify(document, null, 2);
+    const serialized = JSON.stringify(document);
    downloadTextAsFile(serialized, filename, 'application/json');

    if (onComplete) {
@ -760,7 +760,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
      }

      const { document, filename } = payload;
-      const serialized = JSON.stringify(document, null, 2);
+      const serialized = JSON.stringify(document);
      const jsonFile = new File([serialized], filename, { type: 'application/json' });

      const formData = new FormData();
--- a/scripts/analyze_pdf_json.py
+++ b/scripts/analyze_pdf_json.py
@ -0,0 +1,277 @@
+#!/usr/bin/env python3
+"""
+Quick inspection utility for PDF→JSON exports.
+
+Usage:
+    python scripts/analyze_pdf_json.py path/to/export.json
+
+The script prints size and font statistics so we can confirm whether the
+lightweight export (no COS dictionaries) is active and how large the font
+payloads are.
+"""
+from __future__ import annotations
+
+import argparse
+import base64
+import json
+import math
+from pathlib import Path
+from dataclasses import dataclass
+from typing import Any, Dict, Iterable, List, Tuple
+
+
+def human_bytes(value: float) -> str:
+    """Format a byte count as a short human-readable string.
+
+    Args:
+        value: Size in bytes. Zero and negative values are reported as "0 B".
+
+    Returns:
+        The size scaled to the largest fitting unit of B/KB/MB/GB/TB (powers
+        of 1024) with one decimal place, e.g. "1.5 KB". Values beyond the TB
+        range stay in TB.
+    """
+    if value <= 0:
+        return "0 B"
+    units = ["B", "KB", "MB", "GB", "TB"]
+    # Clamp on both sides: a fractional value below 1/1024 has a negative
+    # logarithm, and a negative index would silently wrap around to "TB".
+    order = max(0, min(int(math.log(value, 1024)), len(units) - 1))
+    scaled = value / (1024**order)
+    return f"{scaled:.1f} {units[order]}"
+
+
+def base64_payload_size(encoded: str | None) -> int:
+    """Estimate the decoded byte length of a base64 string without decoding it.
+
+    Args:
+        encoded: Base64 text, or None/empty when the payload is absent.
+
+    Returns:
+        The number of bytes the payload decodes to; 0 for missing or
+        whitespace-only input. Trailing "=" padding is accounted for, so
+        padded payloads are not overestimated.
+    """
+    if not encoded:
+        return 0
+    stripped = encoded.strip()
+    if not stripped:
+        return 0
+    # Every 4 base64 characters carry 3 bytes; each trailing "=" stands for
+    # one byte that is not actually present in the decoded output.
+    padding = len(stripped) - len(stripped.rstrip("="))
+    return max(0, (len(stripped) * 3) // 4 - padding)
+
+
+@dataclass
+class FontBreakdown:
+    # Aggregate counters over the entries of a document's "fonts" array,
+    # as filled in by analyze_fonts.
+
+    # Number of font entries seen.
+    total: int = 0
+    # Entries with a truthy "cosDictionary" value.
+    with_cos: int = 0
+    # Entries with a truthy "program" / "webProgram" / "pdfProgram" value.
+    with_program: int = 0
+    with_web_program: int = 0
+    with_pdf_program: int = 0
+    # Estimated decoded sizes of the base64 payloads, summed per field.
+    program_bytes: int = 0
+    web_program_bytes: int = 0
+    pdf_program_bytes: int = 0
+    # Compact-JSON size of each entry with the three program fields removed.
+    metadata_bytes: int = 0
+    # Up to five (id, uid) pairs of fonts that still carry a cosDictionary.
+    # NOTE: the default is None even though the annotation says List;
+    # analyze_fonts always passes a list explicitly.
+    sample_cos_ids: List[Tuple[str | None, str | None]] = None
+
+
+@dataclass
+class PageBreakdown:
+    # Aggregate counters over a document's "pages" array, as filled in by
+    # analyze_pages. All *_bytes fields are compact-JSON lengths summed
+    # across pages.
+
+    # Number of page entries seen.
+    page_count: int = 0
+    # Total lengths of the "textElements" and "imageElements" arrays.
+    total_text_elements: int = 0
+    total_image_elements: int = 0
+    # Sum of len(text) over text elements that have a non-empty "text" value.
+    text_payload_chars: int = 0
+    # Serialized size of the textElements / imageElements arrays.
+    text_struct_bytes: int = 0
+    image_struct_bytes: int = 0
+    # Serialized size of each page's "resources" value (0 when absent).
+    resources_bytes: int = 0
+    # Serialized size of the "contentStreams" and "annotations" arrays.
+    content_stream_bytes: int = 0
+    annotations_bytes: int = 0
+
+
+@dataclass
+class DocumentBreakdown:
+    # Top-level size report for one JSON export, as built by analyze_document.
+
+    # Size passed in by the caller (main supplies the on-disk file size).
+    total_bytes: int
+    # Per-section statistics for the "fonts" and "pages" arrays.
+    fonts: FontBreakdown
+    pages: PageBreakdown
+    # Compact-JSON size of the "metadata" object.
+    metadata_bytes: int
+    # Estimated decoded size of the base64 "xmpMetadata" string.
+    xmp_bytes: int
+    # Compact-JSON size of the "formFields" value (0 when absent).
+    form_fields_bytes: int
+    # Compact-JSON size of the "lazyImages" value (0 when absent).
+    lazy_flag_bytes: int
+
+
+def approx_struct_size(obj: Any) -> int:
+    """Return the length of ``obj`` serialized as compact JSON, or 0 for None.
+
+    The value is a character count of the separator-minimized dump, used as
+    an approximation of how much space the structure takes in the export.
+    """
+    return 0 if obj is None else len(json.dumps(obj, separators=(",", ":")))
+
+
+def analyze_fonts(fonts: Iterable[Dict[str, Any]]) -> FontBreakdown:
+    """Collect counts and payload sizes for every font entry.
+
+    Args:
+        fonts: Font dictionaries taken from the export's "fonts" array.
+
+    Returns:
+        A FontBreakdown with the number of fonts, how many carry a
+        cosDictionary or each kind of program payload, the estimated decoded
+        payload sizes, the size of the remaining per-font metadata, and up to
+        five (id, uid) samples of fonts that still have a cosDictionary.
+    """
+    payload_keys = ("program", "webProgram", "pdfProgram")
+    present = dict.fromkeys(payload_keys, 0)
+    payload_size = dict.fromkeys(payload_keys, 0)
+    font_total = 0
+    cos_total = 0
+    meta_size = 0
+    cos_samples: List[Tuple[str | None, str | None]] = []
+
+    for entry in fonts:
+        font_total += 1
+
+        if entry.get("cosDictionary"):
+            cos_total += 1
+            # Keep only a handful of examples for the report.
+            if len(cos_samples) < 5:
+                cos_samples.append((entry.get("id"), entry.get("uid")))
+
+        # Everything except the base64 program blobs counts as metadata.
+        meta_size += approx_struct_size(
+            {key: val for key, val in entry.items() if key not in payload_keys}
+        )
+
+        for key in payload_keys:
+            blob = entry.get(key)
+            if blob:
+                present[key] += 1
+                payload_size[key] += base64_payload_size(blob)
+
+    return FontBreakdown(
+        total=font_total,
+        with_cos=cos_total,
+        with_program=present["program"],
+        with_web_program=present["webProgram"],
+        with_pdf_program=present["pdfProgram"],
+        program_bytes=payload_size["program"],
+        web_program_bytes=payload_size["webProgram"],
+        pdf_program_bytes=payload_size["pdfProgram"],
+        metadata_bytes=meta_size,
+        sample_cos_ids=cos_samples,
+    )
+
+
+def analyze_pages(pages: Iterable[Dict[str, Any]]) -> PageBreakdown:
+    """Accumulate element counts and serialized sizes across all pages.
+
+    Args:
+        pages: Page dictionaries taken from the export's "pages" array.
+
+    Returns:
+        A PageBreakdown holding the page count, text/image element totals,
+        the number of characters in text payloads, and the compact-JSON size
+        of each page section (text, images, resources, streams, annotations).
+    """
+    stats = PageBreakdown()
+
+    for page in pages:
+        stats.page_count += 1
+
+        # Missing or null arrays are treated as empty.
+        text_items = page.get("textElements") or []
+        image_items = page.get("imageElements") or []
+        page_resources = page.get("resources")
+        stream_items = page.get("contentStreams") or []
+        annotation_items = page.get("annotations") or []
+
+        stats.total_text_elements += len(text_items)
+        stats.total_image_elements += len(image_items)
+        stats.text_struct_bytes += approx_struct_size(text_items)
+        stats.image_struct_bytes += approx_struct_size(image_items)
+        stats.resources_bytes += approx_struct_size(page_resources)
+        stats.content_stream_bytes += approx_struct_size(stream_items)
+        stats.annotations_bytes += approx_struct_size(annotation_items)
+
+        # Raw text length only, ignoring the surrounding JSON structure.
+        contents = (item.get("text") for item in text_items)
+        stats.text_payload_chars += sum(len(content) for content in contents if content)
+
+    return stats
+
+
+def analyze_document(document: Dict[str, Any], total_size: int) -> DocumentBreakdown:
+    """Build the full size report for a parsed JSON export.
+
+    Args:
+        document: The root JSON object of the export.
+        total_size: Overall size to record as ``total_bytes``.
+
+    Returns:
+        A DocumentBreakdown combining font and page statistics with the sizes
+        of the metadata, XMP metadata, form fields and lazy-image sections.
+    """
+    # A missing or null "metadata" is measured as an empty object.
+    metadata_size = approx_struct_size(document.get("metadata") or {})
+    font_report = analyze_fonts(document.get("fonts") or [])
+    page_report = analyze_pages(document.get("pages") or [])
+    xmp_size = base64_payload_size(document.get("xmpMetadata"))
+    form_size = approx_struct_size(document.get("formFields"))
+    lazy_size = approx_struct_size(document.get("lazyImages"))
+
+    return DocumentBreakdown(
+        total_bytes=total_size,
+        fonts=font_report,
+        pages=page_report,
+        metadata_bytes=metadata_size,
+        xmp_bytes=xmp_size,
+        form_fields_bytes=form_size,
+        lazy_flag_bytes=lazy_size,
+    )
+
+
+def main() -> None:
+    """Command-line entry point: load one JSON export and print its size report.
+
+    Exits with an error message when the path does not exist or when the
+    JSON root is not an object.
+    """
+    cli = argparse.ArgumentParser(description="Inspect a PDF JSON export.")
+    cli.add_argument("json_path", type=Path, help="Path to the JSON export.")
+    export_path = cli.parse_args().json_path
+
+    if not export_path.exists():
+        raise SystemExit(f"File not found: {export_path}")
+
+    # Report the on-disk size before parsing, so it is shown even if the
+    # JSON turns out to be unreadable.
+    size_on_disk = export_path.stat().st_size
+    print(f"File: {export_path}")
+    print(f"Size: {human_bytes(size_on_disk)} ({size_on_disk:,} bytes)")
+
+    with export_path.open("r", encoding="utf-8") as stream:
+        parsed = json.load(stream)
+
+    if not isinstance(parsed, dict):
+        raise SystemExit("Unexpected JSON structure (expected an object at root).")
+
+    report = analyze_document(parsed, size_on_disk)
+    pages = report.pages
+    fonts = report.fonts
+
+    structural_total = (
+        pages.text_struct_bytes
+        + pages.image_struct_bytes
+        + pages.content_stream_bytes
+        + pages.annotations_bytes
+    )
+    print(f"Pages: {pages.page_count}")
+    print(f"Total text elements: {pages.total_text_elements:,}")
+    print(f"Total image elements: {pages.total_image_elements:,}")
+    print(
+        f"Page structural bytes (text arrays + images + streams + annotations): "
+        f"{human_bytes(structural_total)}"
+    )
+
+    print("\nFont summary:")
+    print(f"  Fonts total: {fonts.total}")
+    print(f"  Fonts with cosDictionary: {fonts.with_cos}")
+    print(f"  Fonts with program: {fonts.with_program}")
+    print(f"  Fonts with webProgram: {fonts.with_web_program}")
+    print(f"  Fonts with pdfProgram: {fonts.with_pdf_program}")
+    print(
+        "  Payload sizes:"
+        f" program={human_bytes(fonts.program_bytes)},"
+        f" webProgram={human_bytes(fonts.web_program_bytes)},"
+        f" pdfProgram={human_bytes(fonts.pdf_program_bytes)},"
+        f" metadata={human_bytes(fonts.metadata_bytes)}"
+    )
+    if not fonts.sample_cos_ids:
+        print("  No fonts retain cosDictionary entries.")
+    else:
+        print("  Sample fonts still carrying cosDictionary:")
+        for position, (sample_id, sample_uid) in enumerate(fonts.sample_cos_ids, start=1):
+            print(f"    {position}. id={sample_id!r}, uid={sample_uid!r}")
+
+    print("\nOther sections:")
+    print(f"  Metadata bytes: {human_bytes(report.metadata_bytes)}")
+    print(f"  XMP metadata bytes: {human_bytes(report.xmp_bytes)}")
+    print(f"  Form fields bytes: {human_bytes(report.form_fields_bytes)}")
+    # The lazy-image flag is tiny, so it is printed as a raw number.
+    print(f"  Lazy flag bytes: {report.lazy_flag_bytes}")
+    print(
+        f"  Text payload characters (not counting JSON overhead): "
+        f"{pages.text_payload_chars:,}"
+    )
+    print(f"  Approx text structure bytes: {human_bytes(pages.text_struct_bytes)}")
+    print(f"  Approx image structure bytes: {human_bytes(pages.image_struct_bytes)}")
+    print(f"  Approx content stream bytes: {human_bytes(pages.content_stream_bytes)}")
+    print(f"  Approx annotations bytes: {human_bytes(pages.annotations_bytes)}")
+
+# Run the report only when the file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()