mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-11-16 01:21:16 +01:00
json size cleanup 450 to 35mb
This commit is contained in:
parent
d4e95a6ed7
commit
d4c702f96c
@ -42,14 +42,16 @@ public class ConvertPdfJsonController {
|
||||
description =
|
||||
"Extracts PDF text, fonts, and metadata into an editable JSON structure that can be"
|
||||
+ " transformed back into a PDF. Input:PDF Output:JSON Type:SISO")
|
||||
public ResponseEntity<byte[]> convertPdfToJson(@ModelAttribute PDFFile request)
|
||||
public ResponseEntity<byte[]> convertPdfToJson(
|
||||
@ModelAttribute PDFFile request,
|
||||
@RequestParam(value = "lightweight", defaultValue = "false") boolean lightweight)
|
||||
throws Exception {
|
||||
MultipartFile inputFile = request.getFileInput();
|
||||
if (inputFile == null) {
|
||||
throw ExceptionUtils.createNullArgumentException("fileInput");
|
||||
}
|
||||
|
||||
byte[] jsonBytes = pdfJsonConversionService.convertPdfToJson(inputFile);
|
||||
byte[] jsonBytes = pdfJsonConversionService.convertPdfToJson(inputFile, lightweight);
|
||||
String originalName = inputFile.getOriginalFilename();
|
||||
String baseName =
|
||||
(originalName != null && !originalName.isBlank())
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
package stirling.software.SPDF.model.json;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonInclude;
|
||||
@ -33,7 +32,7 @@ public class PdfJsonTextElement {
|
||||
private Float y;
|
||||
private Float width;
|
||||
private Float height;
|
||||
@Builder.Default private List<Float> textMatrix = new ArrayList<>();
|
||||
private List<Float> textMatrix;
|
||||
private PdfJsonTextColor fillColor;
|
||||
private PdfJsonTextColor strokeColor;
|
||||
private Integer renderingMode;
|
||||
|
||||
@ -18,6 +18,7 @@ import java.time.format.DateTimeParseException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Base64;
|
||||
import java.util.Calendar;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
@ -141,6 +142,10 @@ public class PdfJsonConversionService {
|
||||
|
||||
private volatile boolean ghostscriptAvailable;
|
||||
|
||||
private static final float FLOAT_EPSILON = 0.0001f;
|
||||
private static final float ORIENTATION_TOLERANCE = 0.0005f;
|
||||
private static final float BASELINE_TOLERANCE = 0.5f;
|
||||
|
||||
@PostConstruct
|
||||
private void initializeToolAvailability() {
|
||||
initializeGhostscriptAvailability();
|
||||
@ -185,12 +190,24 @@ public class PdfJsonConversionService {
|
||||
}
|
||||
|
||||
public byte[] convertPdfToJson(MultipartFile file) throws IOException {
|
||||
return convertPdfToJson(file, null);
|
||||
return convertPdfToJson(file, null, false);
|
||||
}
|
||||
|
||||
public byte[] convertPdfToJson(MultipartFile file, boolean lightweight) throws IOException {
|
||||
return convertPdfToJson(file, null, lightweight);
|
||||
}
|
||||
|
||||
public byte[] convertPdfToJson(
|
||||
MultipartFile file, Consumer<PdfJsonConversionProgress> progressCallback)
|
||||
throws IOException {
|
||||
return convertPdfToJson(file, progressCallback, false);
|
||||
}
|
||||
|
||||
public byte[] convertPdfToJson(
|
||||
MultipartFile file,
|
||||
Consumer<PdfJsonConversionProgress> progressCallback,
|
||||
boolean lightweight)
|
||||
throws IOException {
|
||||
if (file == null) {
|
||||
throw ExceptionUtils.createNullArgumentException("fileInput");
|
||||
}
|
||||
@ -341,7 +358,7 @@ public class PdfJsonConversionService {
|
||||
pdfJson.setMetadata(extractMetadata(document));
|
||||
pdfJson.setXmpMetadata(extractXmpMetadata(document));
|
||||
pdfJson.setLazyImages(useLazyImages);
|
||||
List<PdfJsonFont> serializedFonts = new ArrayList<>(fonts.values());
|
||||
List<PdfJsonFont> serializedFonts = cloneFontList(fonts.values());
|
||||
serializedFonts.sort(
|
||||
Comparator.comparing(
|
||||
PdfJsonFont::getUid,
|
||||
@ -385,6 +402,10 @@ public class PdfJsonConversionService {
|
||||
scheduleDocumentCleanup(jobId);
|
||||
}
|
||||
|
||||
if (lightweight) {
|
||||
applyLightweightTransformations(pdfJson);
|
||||
}
|
||||
|
||||
progress.accept(
|
||||
PdfJsonConversionProgress.of(95, "serializing", "Generating JSON output"));
|
||||
|
||||
@ -394,8 +415,7 @@ public class PdfJsonConversionService {
|
||||
pdfJson.getPages().size(),
|
||||
useLazyImages);
|
||||
|
||||
byte[] result =
|
||||
objectMapper.writerWithDefaultPrettyPrinter().writeValueAsBytes(pdfJson);
|
||||
byte[] result = objectMapper.writeValueAsBytes(pdfJson);
|
||||
progress.accept(PdfJsonConversionProgress.complete());
|
||||
return result;
|
||||
}
|
||||
@ -668,6 +688,78 @@ public class PdfJsonConversionService {
|
||||
return buildFontKey(page, fontId);
|
||||
}
|
||||
|
||||
private List<PdfJsonFont> cloneFontList(Collection<PdfJsonFont> source) {
|
||||
List<PdfJsonFont> clones = new ArrayList<>();
|
||||
if (source == null) {
|
||||
return clones;
|
||||
}
|
||||
for (PdfJsonFont font : source) {
|
||||
PdfJsonFont copy = cloneFont(font);
|
||||
if (copy != null) {
|
||||
clones.add(copy);
|
||||
}
|
||||
}
|
||||
return clones;
|
||||
}
|
||||
|
||||
private PdfJsonFont cloneFont(PdfJsonFont font) {
|
||||
if (font == null) {
|
||||
return null;
|
||||
}
|
||||
return PdfJsonFont.builder()
|
||||
.id(font.getId())
|
||||
.pageNumber(font.getPageNumber())
|
||||
.uid(font.getUid())
|
||||
.baseName(font.getBaseName())
|
||||
.subtype(font.getSubtype())
|
||||
.encoding(font.getEncoding())
|
||||
.cidSystemInfo(font.getCidSystemInfo())
|
||||
.embedded(font.getEmbedded())
|
||||
.program(font.getProgram())
|
||||
.programFormat(font.getProgramFormat())
|
||||
.webProgram(font.getWebProgram())
|
||||
.webProgramFormat(font.getWebProgramFormat())
|
||||
.pdfProgram(font.getPdfProgram())
|
||||
.pdfProgramFormat(font.getPdfProgramFormat())
|
||||
.toUnicode(font.getToUnicode())
|
||||
.standard14Name(font.getStandard14Name())
|
||||
.fontDescriptorFlags(font.getFontDescriptorFlags())
|
||||
.ascent(font.getAscent())
|
||||
.descent(font.getDescent())
|
||||
.capHeight(font.getCapHeight())
|
||||
.xHeight(font.getXHeight())
|
||||
.italicAngle(font.getItalicAngle())
|
||||
.unitsPerEm(font.getUnitsPerEm())
|
||||
.cosDictionary(font.getCosDictionary())
|
||||
.build();
|
||||
}
|
||||
|
||||
private void applyLightweightTransformations(PdfJsonDocument document) {
|
||||
if (document == null) {
|
||||
return;
|
||||
}
|
||||
List<PdfJsonFont> fonts = document.getFonts();
|
||||
if (fonts == null) {
|
||||
return;
|
||||
}
|
||||
for (PdfJsonFont font : fonts) {
|
||||
if (font == null) {
|
||||
continue;
|
||||
}
|
||||
boolean hasUsableProgram =
|
||||
hasPayload(font.getPdfProgram())
|
||||
|| hasPayload(font.getWebProgram())
|
||||
|| hasPayload(font.getProgram());
|
||||
if (hasUsableProgram) {
|
||||
font.setCosDictionary(null);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private boolean hasPayload(String value) {
|
||||
return value != null && !value.isBlank();
|
||||
}
|
||||
|
||||
private PdfJsonFont buildFontModel(
|
||||
PDDocument document, PDFont font, String fontId, int pageNumber) throws IOException {
|
||||
PDFontDescriptor descriptor = font.getFontDescriptor();
|
||||
@ -3301,6 +3393,7 @@ public class PdfJsonConversionService {
|
||||
|
||||
private int currentPage = 1;
|
||||
private Map<PDFont, String> currentFontResources = Collections.emptyMap();
|
||||
private int currentZOrderCounter;
|
||||
|
||||
TextCollectingStripper(
|
||||
PDDocument document,
|
||||
@ -3320,6 +3413,7 @@ public class PdfJsonConversionService {
|
||||
currentPage = getCurrentPageNo();
|
||||
currentFontResources =
|
||||
pageFontResources.getOrDefault(currentPage, Collections.emptyMap());
|
||||
currentZOrderCounter = 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -3331,42 +3425,289 @@ public class PdfJsonConversionService {
|
||||
List<PdfJsonTextElement> pageElements =
|
||||
textByPage.computeIfAbsent(currentPage, key -> new ArrayList<>());
|
||||
|
||||
TextRunAccumulator accumulator = null;
|
||||
for (TextPosition position : textPositions) {
|
||||
PDFont font = position.getFont();
|
||||
String fontId = registerFont(font);
|
||||
PdfJsonTextElement element = new PdfJsonTextElement();
|
||||
element.setText(position.getUnicode());
|
||||
element.setFontId(fontId);
|
||||
element.setFontSize(position.getFontSizeInPt());
|
||||
element.setFontSizeInPt(position.getFontSizeInPt());
|
||||
element.setX(position.getXDirAdj());
|
||||
element.setY(position.getYDirAdj());
|
||||
element.setWidth(position.getWidthDirAdj());
|
||||
element.setHeight(position.getHeightDir());
|
||||
element.setTextMatrix(extractMatrix(position));
|
||||
element.setFontMatrixSize(computeFontMatrixSize(element.getTextMatrix()));
|
||||
element.setSpaceWidth(position.getWidthOfSpace());
|
||||
PDGraphicsState graphicsState = getGraphicsState();
|
||||
if (graphicsState != null) {
|
||||
PDTextState textState = graphicsState.getTextState();
|
||||
if (textState != null) {
|
||||
element.setCharacterSpacing(textState.getCharacterSpacing());
|
||||
element.setWordSpacing(textState.getWordSpacing());
|
||||
element.setHorizontalScaling(textState.getHorizontalScaling());
|
||||
element.setLeading(textState.getLeading());
|
||||
element.setRise(textState.getRise());
|
||||
if (textState.getRenderingMode() != null) {
|
||||
element.setRenderingMode(textState.getRenderingMode().intValue());
|
||||
}
|
||||
}
|
||||
element.setFillColor(toTextColor(graphicsState.getNonStrokingColor()));
|
||||
element.setStrokeColor(toTextColor(graphicsState.getStrokingColor()));
|
||||
PdfJsonTextElement element = createTextElement(position, fontId);
|
||||
|
||||
if (accumulator == null) {
|
||||
accumulator = new TextRunAccumulator(element, position);
|
||||
} else if (!accumulator.canAppend(element, position)) {
|
||||
PdfJsonTextElement built = accumulator.build();
|
||||
built.setZOrder(1_000_000 + currentZOrderCounter++);
|
||||
pageElements.add(built);
|
||||
accumulator = new TextRunAccumulator(element, position);
|
||||
} else {
|
||||
accumulator.append(element, position);
|
||||
}
|
||||
element.setZOrder(1_000_000 + pageElements.size());
|
||||
pageElements.add(element);
|
||||
}
|
||||
|
||||
if (accumulator != null) {
|
||||
PdfJsonTextElement built = accumulator.build();
|
||||
built.setZOrder(1_000_000 + currentZOrderCounter++);
|
||||
pageElements.add(built);
|
||||
}
|
||||
}
|
||||
|
||||
private PdfJsonTextElement createTextElement(TextPosition position, String fontId)
|
||||
throws IOException {
|
||||
PdfJsonTextElement element = new PdfJsonTextElement();
|
||||
element.setText(position.getUnicode());
|
||||
element.setFontId(fontId);
|
||||
element.setFontSize(position.getFontSizeInPt());
|
||||
element.setX(position.getXDirAdj());
|
||||
element.setY(position.getYDirAdj());
|
||||
element.setWidth(position.getWidthDirAdj());
|
||||
element.setHeight(position.getHeightDir());
|
||||
element.setTextMatrix(extractMatrix(position));
|
||||
element.setFontMatrixSize(computeFontMatrixSize(element.getTextMatrix()));
|
||||
element.setSpaceWidth(position.getWidthOfSpace());
|
||||
|
||||
PDGraphicsState graphicsState = getGraphicsState();
|
||||
if (graphicsState != null) {
|
||||
PDTextState textState = graphicsState.getTextState();
|
||||
if (textState != null) {
|
||||
element.setCharacterSpacing(textState.getCharacterSpacing());
|
||||
element.setWordSpacing(textState.getWordSpacing());
|
||||
element.setHorizontalScaling(textState.getHorizontalScaling());
|
||||
element.setLeading(textState.getLeading());
|
||||
element.setRise(textState.getRise());
|
||||
if (textState.getRenderingMode() != null) {
|
||||
element.setRenderingMode(textState.getRenderingMode().intValue());
|
||||
}
|
||||
}
|
||||
element.setFillColor(toTextColor(graphicsState.getNonStrokingColor()));
|
||||
element.setStrokeColor(toTextColor(graphicsState.getStrokingColor()));
|
||||
}
|
||||
return element;
|
||||
}
|
||||
|
||||
private void compactTextElement(PdfJsonTextElement element) {
|
||||
if (element == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
List<Float> matrix = element.getTextMatrix();
|
||||
if (matrix != null) {
|
||||
if (matrix.isEmpty()) {
|
||||
element.setTextMatrix(null);
|
||||
} else if (matrix.size() == 6) {
|
||||
element.setX(null);
|
||||
element.setY(null);
|
||||
}
|
||||
}
|
||||
|
||||
if (isZero(element.getCharacterSpacing())) {
|
||||
element.setCharacterSpacing(null);
|
||||
}
|
||||
if (isZero(element.getWordSpacing())) {
|
||||
element.setWordSpacing(null);
|
||||
}
|
||||
if (isZero(element.getLeading())) {
|
||||
element.setLeading(null);
|
||||
}
|
||||
if (isZero(element.getRise())) {
|
||||
element.setRise(null);
|
||||
}
|
||||
if (element.getHorizontalScaling() != null
|
||||
&& Math.abs(element.getHorizontalScaling() - 100f) < FLOAT_EPSILON) {
|
||||
element.setHorizontalScaling(null);
|
||||
}
|
||||
if (element.getRenderingMode() != null && element.getRenderingMode() == 0) {
|
||||
element.setRenderingMode(null);
|
||||
}
|
||||
if (isDefaultBlack(element.getFillColor())) {
|
||||
element.setFillColor(null);
|
||||
}
|
||||
if (isDefaultBlack(element.getStrokeColor())) {
|
||||
element.setStrokeColor(null);
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isZero(Float value) {
|
||||
return value != null && Math.abs(value) < FLOAT_EPSILON;
|
||||
}
|
||||
|
||||
private boolean isDefaultBlack(PdfJsonTextColor color) {
|
||||
if (color == null || color.getComponents() == null) {
|
||||
return true;
|
||||
}
|
||||
List<Float> components = color.getComponents();
|
||||
if (components.isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
String space = color.getColorSpace();
|
||||
if (space == null || "DeviceRGB".equals(space)) {
|
||||
if (components.size() < 3) {
|
||||
return false;
|
||||
}
|
||||
return Math.abs(components.get(0)) < FLOAT_EPSILON
|
||||
&& Math.abs(components.get(1)) < FLOAT_EPSILON
|
||||
&& Math.abs(components.get(2)) < FLOAT_EPSILON;
|
||||
}
|
||||
if ("DeviceGray".equals(space)) {
|
||||
return Math.abs(components.get(0)) < FLOAT_EPSILON;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private Float baselineFrom(PdfJsonTextElement element) {
|
||||
List<Float> matrix = element.getTextMatrix();
|
||||
if (matrix != null && matrix.size() >= 6) {
|
||||
return matrix.get(5);
|
||||
}
|
||||
return element.getY();
|
||||
}
|
||||
|
||||
private TextStyleKey buildStyleKey(PdfJsonTextElement element) {
|
||||
return new TextStyleKey(
|
||||
element.getFontId(),
|
||||
element.getFontSize(),
|
||||
element.getFontMatrixSize(),
|
||||
element.getCharacterSpacing(),
|
||||
element.getWordSpacing(),
|
||||
element.getHorizontalScaling(),
|
||||
element.getLeading(),
|
||||
element.getRise(),
|
||||
element.getFillColor(),
|
||||
element.getStrokeColor(),
|
||||
element.getRenderingMode(),
|
||||
element.getSpaceWidth());
|
||||
}
|
||||
|
||||
private class TextRunAccumulator {
|
||||
private final PdfJsonTextElement baseElement;
|
||||
private final TextStyleKey styleKey;
|
||||
private final float orientationA;
|
||||
private final float orientationB;
|
||||
private final float orientationC;
|
||||
private final float orientationD;
|
||||
private final Float baseline;
|
||||
private final List<Float> baseMatrix;
|
||||
private final float startXCoord;
|
||||
private final float startYCoord;
|
||||
private final StringBuilder textBuilder = new StringBuilder();
|
||||
private float totalWidth;
|
||||
private float maxHeight;
|
||||
private float endXCoord;
|
||||
|
||||
TextRunAccumulator(PdfJsonTextElement element, TextPosition position) {
|
||||
this.baseElement = element;
|
||||
this.styleKey = buildStyleKey(element);
|
||||
this.baseMatrix =
|
||||
element.getTextMatrix() != null
|
||||
? new ArrayList<>(element.getTextMatrix())
|
||||
: null;
|
||||
if (baseMatrix != null && baseMatrix.size() >= 6) {
|
||||
orientationA = baseMatrix.get(0);
|
||||
orientationB = baseMatrix.get(1);
|
||||
orientationC = baseMatrix.get(2);
|
||||
orientationD = baseMatrix.get(3);
|
||||
startXCoord = baseMatrix.get(4);
|
||||
startYCoord = baseMatrix.get(5);
|
||||
} else {
|
||||
orientationA = 1f;
|
||||
orientationB = 0f;
|
||||
orientationC = 0f;
|
||||
orientationD = 1f;
|
||||
startXCoord = element.getX() != null ? element.getX() : 0f;
|
||||
startYCoord = element.getY() != null ? element.getY() : 0f;
|
||||
}
|
||||
this.baseline = baselineFrom(element);
|
||||
this.totalWidth = element.getWidth() != null ? element.getWidth() : 0f;
|
||||
this.maxHeight = element.getHeight() != null ? element.getHeight() : 0f;
|
||||
this.endXCoord = position.getXDirAdj() + position.getWidthDirAdj();
|
||||
this.textBuilder.append(element.getText());
|
||||
}
|
||||
|
||||
boolean canAppend(PdfJsonTextElement element, TextPosition position) {
|
||||
if (!styleKey.equals(buildStyleKey(element))) {
|
||||
return false;
|
||||
}
|
||||
List<Float> matrix = element.getTextMatrix();
|
||||
float a = 1f;
|
||||
float b = 0f;
|
||||
float c = 0f;
|
||||
float d = 1f;
|
||||
if (matrix != null && matrix.size() >= 4) {
|
||||
a = matrix.get(0);
|
||||
b = matrix.get(1);
|
||||
c = matrix.get(2);
|
||||
d = matrix.get(3);
|
||||
}
|
||||
if (Math.abs(a - orientationA) > ORIENTATION_TOLERANCE
|
||||
|| Math.abs(b - orientationB) > ORIENTATION_TOLERANCE
|
||||
|| Math.abs(c - orientationC) > ORIENTATION_TOLERANCE
|
||||
|| Math.abs(d - orientationD) > ORIENTATION_TOLERANCE) {
|
||||
return false;
|
||||
}
|
||||
|
||||
Float otherBaseline = baselineFrom(element);
|
||||
if (baseline != null && otherBaseline != null) {
|
||||
if (Math.abs(otherBaseline - baseline) > BASELINE_TOLERANCE) {
|
||||
return false;
|
||||
}
|
||||
} else if (baseline != null || otherBaseline != null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void append(PdfJsonTextElement element, TextPosition position) {
|
||||
textBuilder.append(element.getText());
|
||||
float width =
|
||||
element.getWidth() != null ? element.getWidth() : position.getWidthDirAdj();
|
||||
totalWidth += width;
|
||||
float height =
|
||||
element.getHeight() != null ? element.getHeight() : position.getHeightDir();
|
||||
if (height > maxHeight) {
|
||||
maxHeight = height;
|
||||
}
|
||||
endXCoord = position.getXDirAdj() + position.getWidthDirAdj();
|
||||
}
|
||||
|
||||
PdfJsonTextElement build() {
|
||||
PdfJsonTextElement result = baseElement;
|
||||
result.setText(textBuilder.toString());
|
||||
float widthCandidate = endXCoord - startXCoord;
|
||||
if (widthCandidate > totalWidth) {
|
||||
totalWidth = widthCandidate;
|
||||
}
|
||||
result.setWidth(totalWidth);
|
||||
result.setHeight(maxHeight);
|
||||
if (baseMatrix != null && baseMatrix.size() == 6) {
|
||||
List<Float> matrix = new ArrayList<>(baseMatrix);
|
||||
matrix.set(0, orientationA);
|
||||
matrix.set(1, orientationB);
|
||||
matrix.set(2, orientationC);
|
||||
matrix.set(3, orientationD);
|
||||
matrix.set(4, startXCoord);
|
||||
matrix.set(5, startYCoord);
|
||||
result.setTextMatrix(matrix);
|
||||
result.setX(null);
|
||||
result.setY(null);
|
||||
}
|
||||
compactTextElement(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
private record TextStyleKey(
|
||||
String fontId,
|
||||
Float fontSize,
|
||||
Float fontMatrixSize,
|
||||
Float characterSpacing,
|
||||
Float wordSpacing,
|
||||
Float horizontalScaling,
|
||||
Float leading,
|
||||
Float rise,
|
||||
PdfJsonTextColor fillColor,
|
||||
PdfJsonTextColor strokeColor,
|
||||
Integer renderingMode,
|
||||
Float spaceWidth) {}
|
||||
|
||||
private List<Float> extractMatrix(TextPosition position) {
|
||||
float[] values = new float[6];
|
||||
values[0] = position.getTextMatrix().getValue(0, 0);
|
||||
|
||||
28
docs/pdf-json-editor-backlog.md
Normal file
28
docs/pdf-json-editor-backlog.md
Normal file
@ -0,0 +1,28 @@
|
||||
# PDF JSON Editor Backlog
|
||||
|
||||
- **Type3 Font Support (Text Additions)**
|
||||
- Parse Type3 charprocs to extract glyph outlines, build a synthetic TrueType/OpenType font (FontTools, Ghostscript `ps2ttf`, etc.), and store it in `webProgram` / `pdfProgram` for client use.
|
||||
- Preserve the original Type3 resources for round-trip fidelity; use the synthesized font only for edited elements while reusing the original stream elsewhere.
|
||||
- Extend conversion logic so fallback kicks in only when conversion fails, and track which elements rely on the synthetic font to avoid mixing source glyphs (`PdfJsonConversionService.java:998-1090`, `1840-2012`).
|
||||
- Update the viewer/renderer to surface conversion errors and block editing when no faithful font can be produced.
|
||||
|
||||
- **Vector Artwork Preview**
|
||||
- Reuse `contentStreams` already emitted by the backend to render vector paths alongside text/images in the React workspace (`frontend/src/proprietary/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx:1048-1285`).
|
||||
- Either render via Canvas/SVG on the client or call back to a server-rendered bitmap for the background. Keep edited text/images layered on top.
|
||||
- Maintain export fidelity by writing any untouched vector stream back during PDF regeneration (`PdfJsonConversionService.java:1714-1799`, `520-612`).
|
||||
|
||||
- **Lazy Fetch Endpoints**
|
||||
- Provide separate endpoints to fetch:
|
||||
1. Raw COS dictionaries/font programs when the user opens advanced panels.
|
||||
2. Page-level raster/vector previews to avoid sending large `imageData` upfront.
|
||||
- Reuse the existing job cache (`documentCache`) to serve these on demand and clean up after timeouts (`PdfJsonConversionService.java:3608-3687`).
|
||||
|
||||
- **Editor UX Safeguards**
|
||||
- Respect `fallbackFontService` indicators; mark groups using fallback glyphs so the UI can warn about possible appearance shifts (`frontend/src/proprietary/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx:1260-1287`).
|
||||
- Surface when Type3 conversion was downgraded (e.g., rasterized glyphs) and limit editing to operations that keep the PDF stable.
|
||||
|
||||
- **Canonical Font Sharing**
|
||||
- Emit fonts once per unique embedded program. Add a `canonicalFonts` array containing the full payload (program, ToUnicode, metadata) and a compact `fontAliases` mapping `{pageNumber, fontId, canonicalUid}` so text elements can still reference per-page IDs.
|
||||
- Store COS dictionaries only on canonical entries; aliases should keep light fields (e.g., size adjustments) if they differ.
|
||||
- Update `buildFontMap` to resolve aliases when recreating PDFBox fonts, and adjust the front end to load programs via the canonical UID.
|
||||
- Optional: expose a lazy endpoint for the original COS dictionary if the canonical record strips it, so export still reconstructs untouched fonts.
|
||||
@ -279,7 +279,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
|
||||
|
||||
console.log('Sending conversion request with async=true');
|
||||
const response = await apiClient.post(
|
||||
`${CONVERSION_ENDPOINTS['pdf-json']}?async=true`,
|
||||
`${CONVERSION_ENDPOINTS['pdf-json']}?async=true&lightweight=true`,
|
||||
formData,
|
||||
{
|
||||
responseType: 'json',
|
||||
@ -632,7 +632,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
|
||||
}
|
||||
|
||||
const { document, filename } = payload;
|
||||
const serialized = JSON.stringify(document, null, 2);
|
||||
const serialized = JSON.stringify(document);
|
||||
downloadTextAsFile(serialized, filename, 'application/json');
|
||||
|
||||
if (onComplete) {
|
||||
@ -760,7 +760,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
|
||||
}
|
||||
|
||||
const { document, filename } = payload;
|
||||
const serialized = JSON.stringify(document, null, 2);
|
||||
const serialized = JSON.stringify(document);
|
||||
const jsonFile = new File([serialized], filename, { type: 'application/json' });
|
||||
|
||||
const formData = new FormData();
|
||||
|
||||
277
scripts/analyze_pdf_json.py
Normal file
277
scripts/analyze_pdf_json.py
Normal file
@ -0,0 +1,277 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Quick inspection utility for PDF→JSON exports.
|
||||
|
||||
Usage:
|
||||
python scripts/analyze_pdf_json.py path/to/export.json
|
||||
|
||||
The script prints size and font statistics so we can confirm whether the
|
||||
lightweight export (no COS dictionaries) is active and how large the font
|
||||
payloads are.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import base64
|
||||
import json
|
||||
import math
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, Iterable, List, Tuple
|
||||
|
||||
|
||||
def human_bytes(value: float) -> str:
    """Format a byte count as a short string using 1024-based units.

    Args:
        value: Number of bytes. Zero and negative values are reported as ``"0 B"``.

    Returns:
        The value scaled to the largest unit from B, KB, MB, GB, TB that keeps it
        at or above 1, with one decimal place (e.g. ``"1.5 KB"``). Values beyond
        the TB range stay in TB; values below one byte stay in B.
    """
    if value <= 0:
        return "0 B"
    units = ["B", "KB", "MB", "GB", "TB"]
    scaled = float(value)
    order = 0
    # Step up one unit at a time instead of using a logarithm: the index can
    # then never go negative (fractional inputs) or past the last unit.
    while scaled >= 1024 and order < len(units) - 1:
        scaled /= 1024
        order += 1
    return f"{scaled:.1f} {units[order]}"
|
||||
|
||||
|
||||
def base64_payload_size(encoded: str | None) -> int:
    """Estimate how many bytes a base64 string decodes to, without decoding it.

    Args:
        encoded: Base64 text, or ``None``. Leading/trailing whitespace is ignored.

    Returns:
        The decoded size in bytes; ``0`` for ``None``, empty or whitespace-only
        input. Trailing ``=`` padding is subtracted so short payloads are not
        over-counted. Whitespace inside the string is not removed, so
        line-wrapped input is slightly over-estimated.
    """
    if not encoded:
        return 0
    stripped = encoded.strip()
    if not stripped:
        return 0
    # Every 4 characters carry 3 bytes; each trailing '=' stands for one
    # byte that is not actually present.
    padding = len(stripped) - len(stripped.rstrip("="))
    return max(0, (len(stripped) * 3) // 4 - padding)
|
||||
|
||||
|
||||
@dataclass
class FontBreakdown:
    """Aggregate statistics for the ``fonts`` array of an export."""

    # Number of font entries seen.
    total: int = 0
    # How many entries carry each kind of payload.
    with_cos: int = 0
    with_program: int = 0
    with_web_program: int = 0
    with_pdf_program: int = 0
    # Estimated decoded size of each payload kind.
    program_bytes: int = 0
    web_program_bytes: int = 0
    pdf_program_bytes: int = 0
    # Serialized size of everything except the three program payloads.
    metadata_bytes: int = 0
    # (id, uid) pairs of fonts that still have a cosDictionary.
    sample_cos_ids: List[Tuple[str | None, str | None]] | None = None

    def __post_init__(self) -> None:
        # Give every instance its own list so the field is always iterable,
        # even when the caller did not pass one.
        if self.sample_cos_ids is None:
            self.sample_cos_ids = []
|
||||
|
||||
|
||||
@dataclass
class PageBreakdown:
    """Totals accumulated over every entry of the ``pages`` array."""

    # Counts
    page_count: int = 0
    total_text_elements: int = 0
    total_image_elements: int = 0
    # Sum of the lengths of all text element strings.
    text_payload_chars: int = 0
    # Approximate serialized sizes of each per-page section.
    text_struct_bytes: int = 0
    image_struct_bytes: int = 0
    resources_bytes: int = 0
    content_stream_bytes: int = 0
    annotations_bytes: int = 0
|
||||
|
||||
|
||||
@dataclass
class DocumentBreakdown:
    """Top-level summary of one export file, as produced by ``analyze_document``."""

    # Size of the JSON file on disk.
    total_bytes: int
    # Per-section statistics.
    fonts: FontBreakdown
    pages: PageBreakdown
    # Sizes of the remaining top-level sections.
    metadata_bytes: int
    xmp_bytes: int
    form_fields_bytes: int
    lazy_flag_bytes: int
|
||||
|
||||
|
||||
def approx_struct_size(obj: Any) -> int:
    """Return the length of ``obj`` serialized as compact JSON.

    ``None`` counts as 0. The result is the character count of
    ``json.dumps`` output with no whitespace after separators.
    """
    if obj is not None:
        compact = json.dumps(obj, separators=(",", ":"))
        return len(compact)
    return 0
|
||||
|
||||
|
||||
def analyze_fonts(fonts: Iterable[Dict[str, Any]]) -> FontBreakdown:
    """Tally payload presence and sizes across the given font entries.

    For each font this counts whether ``cosDictionary``, ``program``,
    ``webProgram`` and ``pdfProgram`` are present (truthy), estimates the
    decoded size of the three program payloads, and measures the serialized
    size of all remaining keys. The ``(id, uid)`` of the first five fonts that
    carry a ``cosDictionary`` are kept as samples.
    """
    payload_keys = {"program", "webProgram", "pdfProgram"}
    stats = FontBreakdown(sample_cos_ids=[])

    for font in fonts:
        stats.total += 1

        if font.get("cosDictionary"):
            stats.with_cos += 1
            if len(stats.sample_cos_ids) < 5:
                stats.sample_cos_ids.append((font.get("id"), font.get("uid")))

        # Everything except the base64 program payloads counts as metadata.
        non_payload = {key: val for key, val in font.items() if key not in payload_keys}
        stats.metadata_bytes += approx_struct_size(non_payload)

        program = font.get("program")
        if program:
            stats.with_program += 1
            stats.program_bytes += base64_payload_size(program)

        web_program = font.get("webProgram")
        if web_program:
            stats.with_web_program += 1
            stats.web_program_bytes += base64_payload_size(web_program)

        pdf_program = font.get("pdfProgram")
        if pdf_program:
            stats.with_pdf_program += 1
            stats.pdf_program_bytes += base64_payload_size(pdf_program)

    return stats
|
||||
|
||||
|
||||
def analyze_pages(pages: Iterable[Dict[str, Any]]) -> PageBreakdown:
    """Accumulate element counts and approximate section sizes over all pages.

    Reads ``textElements``, ``imageElements``, ``resources``,
    ``contentStreams`` and ``annotations`` from each page; missing or empty
    list sections are treated as empty lists.
    """
    stats = PageBreakdown()

    for page in pages:
        texts = page.get("textElements") or []
        images = page.get("imageElements") or []
        streams = page.get("contentStreams") or []
        annotations = page.get("annotations") or []

        stats.page_count += 1
        stats.total_text_elements += len(texts)
        stats.total_image_elements += len(images)

        stats.text_struct_bytes += approx_struct_size(texts)
        stats.image_struct_bytes += approx_struct_size(images)
        stats.resources_bytes += approx_struct_size(page.get("resources"))
        stats.content_stream_bytes += approx_struct_size(streams)
        stats.annotations_bytes += approx_struct_size(annotations)

        # Elements with no (or empty) text contribute nothing.
        stats.text_payload_chars += sum(len(elem.get("text") or "") for elem in texts)

    return stats
|
||||
|
||||
|
||||
def analyze_document(document: Dict[str, Any], total_size: int) -> DocumentBreakdown:
    """Summarise a parsed export.

    Args:
        document: The root JSON object of the export.
        total_size: Size of the export file, stored as ``total_bytes``.

    Returns:
        A ``DocumentBreakdown`` with font and page statistics plus the sizes
        of the ``metadata``, ``xmpMetadata``, ``formFields`` and ``lazyImages``
        sections.
    """
    font_stats = analyze_fonts(document.get("fonts") or [])
    page_stats = analyze_pages(document.get("pages") or [])
    metadata_size = approx_struct_size(document.get("metadata") or {})

    return DocumentBreakdown(
        total_bytes=total_size,
        fonts=font_stats,
        pages=page_stats,
        metadata_bytes=metadata_size,
        # xmpMetadata is measured as a base64 payload, the others as JSON.
        xmp_bytes=base64_payload_size(document.get("xmpMetadata")),
        form_fields_bytes=approx_struct_size(document.get("formFields")),
        lazy_flag_bytes=approx_struct_size(document.get("lazyImages")),
    )
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: load one export and print its size report.

    Exits with a message when the path does not exist or the JSON root is not
    an object.
    """
    parser = argparse.ArgumentParser(description="Inspect a PDF JSON export.")
    parser.add_argument("json_path", type=Path, help="Path to the JSON export.")
    json_path = parser.parse_args().json_path

    if not json_path.exists():
        raise SystemExit(f"File not found: {json_path}")

    file_size = json_path.stat().st_size
    print(f"File: {json_path}")
    print(f"Size: {human_bytes(file_size)} ({file_size:,} bytes)")

    document = json.loads(json_path.read_text(encoding="utf-8"))
    if not isinstance(document, dict):
        raise SystemExit("Unexpected JSON structure (expected an object at root).")

    summary = analyze_document(document, file_size)
    page_stats = summary.pages
    font_stats = summary.fonts

    # Page overview.
    structural_total = (
        page_stats.text_struct_bytes
        + page_stats.image_struct_bytes
        + page_stats.content_stream_bytes
        + page_stats.annotations_bytes
    )
    print(f"Pages: {page_stats.page_count}")
    print(f"Total text elements: {page_stats.total_text_elements:,}")
    print(f"Total image elements: {page_stats.total_image_elements:,}")
    print(
        "Page structural bytes (text arrays + images + streams + annotations): "
        f"{human_bytes(structural_total)}"
    )

    # Font overview.
    print("\nFont summary:")
    print(f" Fonts total: {font_stats.total}")
    print(f" Fonts with cosDictionary: {font_stats.with_cos}")
    print(f" Fonts with program: {font_stats.with_program}")
    print(f" Fonts with webProgram: {font_stats.with_web_program}")
    print(f" Fonts with pdfProgram: {font_stats.with_pdf_program}")
    payload_parts = [
        f" program={human_bytes(font_stats.program_bytes)},",
        f" webProgram={human_bytes(font_stats.web_program_bytes)},",
        f" pdfProgram={human_bytes(font_stats.pdf_program_bytes)},",
        f" metadata={human_bytes(font_stats.metadata_bytes)}",
    ]
    print(" Payload sizes:" + "".join(payload_parts))

    if not font_stats.sample_cos_ids:
        print(" No fonts retain cosDictionary entries.")
    else:
        print(" Sample fonts still carrying cosDictionary:")
        for idx, (font_id, uid) in enumerate(font_stats.sample_cos_ids, start=1):
            print(f" {idx}. id={font_id!r}, uid={uid!r}")

    # Remaining sections.
    print("\nOther sections:")
    print(f" Metadata bytes: {human_bytes(summary.metadata_bytes)}")
    print(f" XMP metadata bytes: {human_bytes(summary.xmp_bytes)}")
    print(f" Form fields bytes: {human_bytes(summary.form_fields_bytes)}")
    print(f" Lazy flag bytes: {summary.lazy_flag_bytes}")
    print(
        " Text payload characters (not counting JSON overhead): "
        f"{page_stats.text_payload_chars:,}"
    )
    print(f" Approx text structure bytes: {human_bytes(page_stats.text_struct_bytes)}")
    print(f" Approx image structure bytes: {human_bytes(page_stats.image_struct_bytes)}")
    print(f" Approx content stream bytes: {human_bytes(page_stats.content_stream_bytes)}")
    print(f" Approx annotations bytes: {human_bytes(page_stats.annotations_bytes)}")


if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in New Issue
Block a user