diff --git a/app/proprietary/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java b/app/proprietary/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java index 7f7b7a956..5e7123c93 100644 --- a/app/proprietary/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java +++ b/app/proprietary/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java @@ -401,17 +401,29 @@ public class PdfJsonConversionService { boolean rewriteSucceeded = true; if (hasText) { - if (preflightResult.usesFallback()) { - rewriteSucceeded = false; - } else if (!preservedStreams.isEmpty()) { - log.info("Attempting token rewrite for page {}", pageNumberValue); - rewriteSucceeded = rewriteTextOperators(document, page, elements); - if (!rewriteSucceeded) { + if (!preservedStreams.isEmpty()) { + if (preflightResult.usesFallback()) { log.info( - "Token rewrite failed for page {}, regenerating text stream", + "Fallback fonts required for page {}; clearing original text tokens", pageNumberValue); + rewriteSucceeded = + rewriteTextOperators(document, page, elements, true); + if (!rewriteSucceeded) { + log.info( + "Failed to clear original text tokens on page {}; forcing regeneration", + pageNumberValue); + } } else { - log.info("Token rewrite succeeded for page {}", pageNumberValue); + log.info("Attempting token rewrite for page {}", pageNumberValue); + rewriteSucceeded = + rewriteTextOperators(document, page, elements, false); + if (!rewriteSucceeded) { + log.info( + "Token rewrite failed for page {}, regenerating text stream", + pageNumberValue); + } else { + log.info("Token rewrite succeeded for page {}", pageNumberValue); + } } } else { rewriteSucceeded = false; @@ -419,7 +431,7 @@ public class PdfJsonConversionService { } boolean shouldRegenerate = preservedStreams.isEmpty(); - if (hasText && !rewriteSucceeded) { + if (hasText && (!rewriteSucceeded || preflightResult.usesFallback())) { shouldRegenerate = true; } if (hasImages && preservedStreams.isEmpty()) { @@ -433,6 +445,17 @@ public class PdfJsonConversionService { if (shouldRegenerate) { log.info("Regenerating page content for page {}", pageNumberValue); + AppendMode appendMode = AppendMode.OVERWRITE; + if (!preservedStreams.isEmpty()) { + PDStream vectorStream = + extractVectorGraphics(document, preservedStreams, imageElements); + if (vectorStream != null) { + page.setContents(Collections.singletonList(vectorStream)); + appendMode = AppendMode.APPEND; + } else { + page.setContents(new ArrayList<>()); + } + } regeneratePageContent( document, page, @@ -440,7 +463,8 @@ public class PdfJsonConversionService { imageElements, fontMap, fontModels, - pageNumberValue); + pageNumberValue, + appendMode); log.info("Page content regeneration complete for page {}", pageNumberValue); } @@ -2141,6 +2165,116 @@ public class PdfJsonConversionService { } } + private PDStream extractVectorGraphics( + PDDocument document, + List preservedStreams, + List imageElements) + throws IOException { + if (preservedStreams == null || preservedStreams.isEmpty()) { + return null; + } + + Set imageObjectNames = new HashSet<>(); + if (imageElements != null) { + for (PdfJsonImageElement element : imageElements) { + if (element == null) { + continue; + } + String objectName = element.getObjectName(); + if (objectName != null && !objectName.isBlank()) { + imageObjectNames.add(objectName); + } + } + } + + List filteredTokens = new ArrayList<>(); + for (PDStream stream : preservedStreams) { + if (stream == null) { + continue; + } + try { + PDFStreamParser parser = new PDFStreamParser(stream.toByteArray()); + List tokens = parser.parse(); + collectVectorTokens(tokens, filteredTokens, imageObjectNames); + } catch (IOException ex) { + log.debug( + "Failed to parse preserved content stream for vector extraction: {}", + ex.getMessage()); + } + } + + if (filteredTokens.isEmpty()) { + return null; + } + + PDStream vectorStream = new PDStream(document); + try (OutputStream outputStream = vectorStream.createOutputStream(COSName.FLATE_DECODE)) { + new ContentStreamWriter(outputStream).writeTokens(filteredTokens); + } + return vectorStream; + } + + private void collectVectorTokens( + List sourceTokens, + List targetTokens, + Set imageObjectNames) { + if (sourceTokens == null || sourceTokens.isEmpty()) { + return; + } + + boolean insideText = false; + boolean insideInlineImage = false; + + for (Object token : sourceTokens) { + if (token instanceof Operator operator) { + String name = operator.getName(); + if (OperatorName.BEGIN_TEXT.equals(name)) { + insideText = true; + continue; + } + if (OperatorName.END_TEXT.equals(name)) { + insideText = false; + continue; + } + if (OperatorName.BEGIN_INLINE_IMAGE.equals(name) + || OperatorName.BEGIN_INLINE_IMAGE_DATA.equals(name)) { + if (!insideText) { + targetTokens.add(operator); + } + insideInlineImage = true; + continue; + } + if (OperatorName.END_INLINE_IMAGE.equals(name)) { + if (!insideText) { + targetTokens.add(operator); + } + insideInlineImage = false; + continue; + } + if (insideText && !insideInlineImage) { + continue; + } + if (OperatorName.DRAW_OBJECT.equals(name) + && imageObjectNames != null + && !imageObjectNames.isEmpty() + && !targetTokens.isEmpty()) { + Object previous = targetTokens.get(targetTokens.size() - 1); + if (previous instanceof COSName cosName + && imageObjectNames.contains(cosName.getName())) { + targetTokens.remove(targetTokens.size() - 1); + continue; + } + } + targetTokens.add(operator); + } else { + if (insideText && !insideInlineImage) { + continue; + } + targetTokens.add(token); + } + } + } + private void regeneratePageContent( PDDocument document, PDPage page, @@ -2148,13 +2282,15 @@ public class PdfJsonConversionService { List imageElements, Map fontMap, List fontModels, - int pageNumber) + int pageNumber, + AppendMode appendMode) throws IOException { List drawables = mergeDrawables(textElements, imageElements); Map imageCache = new HashMap<>(); + AppendMode mode = appendMode != null ? appendMode : AppendMode.OVERWRITE; try (PDPageContentStream contentStream = - new PDPageContentStream(document, page, AppendMode.OVERWRITE, true, true)) { + new PDPageContentStream(document, page, mode, true, true)) { boolean textOpen = false; for (DrawableElement drawable : drawables) { switch (drawable.type()) { @@ -2618,7 +2754,10 @@ public class PdfJsonConversionService { } private boolean rewriteTextOperators( - PDDocument document, PDPage page, List elements) { + PDDocument document, + PDPage page, + List elements, + boolean removeOnly) { if (elements == null || elements.isEmpty()) { return true; } @@ -2663,7 +2802,8 @@ public class PdfJsonConversionService { return false; } log.trace("Rewriting Tj operator using font {}", currentFontName); - if (!rewriteShowText(cosString, currentFont, currentFontName, cursor)) { + if (!rewriteShowText( + cosString, currentFont, currentFontName, cursor, removeOnly)) { log.debug("Failed to rewrite Tj operator; aborting rewrite"); return false; } @@ -2674,7 +2814,8 @@ public class PdfJsonConversionService { return false; } log.trace("Rewriting TJ operator using font {}", currentFontName); - if (!rewriteShowTextArray(array, currentFont, currentFontName, cursor)) { + if (!rewriteShowTextArray( + array, currentFont, currentFontName, cursor, removeOnly)) { log.debug("Failed to rewrite TJ operator; aborting rewrite"); return false; } @@ -2703,7 +2844,11 @@ public class PdfJsonConversionService { } private boolean rewriteShowText( - COSString cosString, PDFont font, String expectedFontName, TextElementCursor cursor) + COSString cosString, + PDFont font, + String expectedFontName, + TextElementCursor cursor, + boolean removeOnly) throws IOException { if (font == null) { return false; @@ -2713,6 +2858,10 @@ public class PdfJsonConversionService { if (consumed == null) { return false; } + if (removeOnly) { + cosString.setValue(new byte[0]); + return true; + } String replacement = mergeText(consumed); try { byte[] encoded = font.encode(replacement); @@ -2725,7 +2874,11 @@ public class PdfJsonConversionService { } private boolean rewriteShowTextArray( - COSArray array, PDFont font, String expectedFontName, TextElementCursor cursor) + COSArray array, + PDFont font, + String expectedFontName, + TextElementCursor cursor, + boolean removeOnly) throws IOException { if (font == null) { return false; @@ -2738,6 +2891,10 @@ public class PdfJsonConversionService { if (consumed == null) { return false; } + if (removeOnly) { + array.set(i, new COSString(new byte[0])); + continue; + } String replacement = mergeText(consumed); try { byte[] encoded = font.encode(replacement); diff --git a/frontend/src/proprietary/tools/pdfJsonEditor/pdfJsonEditorUtils.ts b/frontend/src/proprietary/tools/pdfJsonEditor/pdfJsonEditorUtils.ts index 068c378af..685929e30 100644 --- a/frontend/src/proprietary/tools/pdfJsonEditor/pdfJsonEditorUtils.ts +++ b/frontend/src/proprietary/tools/pdfJsonEditor/pdfJsonEditorUtils.ts @@ -733,15 +733,11 @@ export const restoreGlyphElements = ( rebuiltElements.push(...group.originalElements.map(cloneTextElement)); }); - const textDirty = groups.some((group) => group.text !== group.originalText); - const imageDirty = areImageListsDifferent(images, baselineImages); - const nextStreams = textDirty || imageDirty ? [] : page.contentStreams ?? []; - return { ...page, textElements: rebuiltElements, imageElements: images.map(cloneImageElement), - contentStreams: nextStreams, + contentStreams: page.contentStreams ?? [], }; });