editor carries over vector

This commit is contained in:
Anthony Stirling 2025-11-01 12:14:54 +00:00
parent 29e8270eea
commit efaec14e08
2 changed files with 175 additions and 22 deletions

View File

@ -401,17 +401,29 @@ public class PdfJsonConversionService {
boolean rewriteSucceeded = true; boolean rewriteSucceeded = true;
if (hasText) { if (hasText) {
if (preflightResult.usesFallback()) { if (!preservedStreams.isEmpty()) {
rewriteSucceeded = false; if (preflightResult.usesFallback()) {
} else if (!preservedStreams.isEmpty()) {
log.info("Attempting token rewrite for page {}", pageNumberValue);
rewriteSucceeded = rewriteTextOperators(document, page, elements);
if (!rewriteSucceeded) {
log.info( log.info(
"Token rewrite failed for page {}, regenerating text stream", "Fallback fonts required for page {}; clearing original text tokens",
pageNumberValue); pageNumberValue);
rewriteSucceeded =
rewriteTextOperators(document, page, elements, true);
if (!rewriteSucceeded) {
log.info(
"Failed to clear original text tokens on page {}; forcing regeneration",
pageNumberValue);
}
} else { } else {
log.info("Token rewrite succeeded for page {}", pageNumberValue); log.info("Attempting token rewrite for page {}", pageNumberValue);
rewriteSucceeded =
rewriteTextOperators(document, page, elements, false);
if (!rewriteSucceeded) {
log.info(
"Token rewrite failed for page {}, regenerating text stream",
pageNumberValue);
} else {
log.info("Token rewrite succeeded for page {}", pageNumberValue);
}
} }
} else { } else {
rewriteSucceeded = false; rewriteSucceeded = false;
@ -419,7 +431,7 @@ public class PdfJsonConversionService {
} }
boolean shouldRegenerate = preservedStreams.isEmpty(); boolean shouldRegenerate = preservedStreams.isEmpty();
if (hasText && !rewriteSucceeded) { if (hasText && (!rewriteSucceeded || preflightResult.usesFallback())) {
shouldRegenerate = true; shouldRegenerate = true;
} }
if (hasImages && preservedStreams.isEmpty()) { if (hasImages && preservedStreams.isEmpty()) {
@ -433,6 +445,17 @@ public class PdfJsonConversionService {
if (shouldRegenerate) { if (shouldRegenerate) {
log.info("Regenerating page content for page {}", pageNumberValue); log.info("Regenerating page content for page {}", pageNumberValue);
AppendMode appendMode = AppendMode.OVERWRITE;
if (!preservedStreams.isEmpty()) {
PDStream vectorStream =
extractVectorGraphics(document, preservedStreams, imageElements);
if (vectorStream != null) {
page.setContents(Collections.singletonList(vectorStream));
appendMode = AppendMode.APPEND;
} else {
page.setContents(new ArrayList<>());
}
}
regeneratePageContent( regeneratePageContent(
document, document,
page, page,
@ -440,7 +463,8 @@ public class PdfJsonConversionService {
imageElements, imageElements,
fontMap, fontMap,
fontModels, fontModels,
pageNumberValue); pageNumberValue,
appendMode);
log.info("Page content regeneration complete for page {}", pageNumberValue); log.info("Page content regeneration complete for page {}", pageNumberValue);
} }
@ -2141,6 +2165,116 @@ public class PdfJsonConversionService {
} }
} }
/**
 * Builds a new content stream holding only the tokens that {@link #collectVectorTokens} keeps
 * from the preserved page streams.
 *
 * <p>Object names of the supplied image elements are gathered first and handed to
 * {@code collectVectorTokens}, which uses them to decide which draw-object invocations to
 * drop. A preserved stream that fails to parse is logged at debug level and skipped; the
 * remaining streams are still processed.
 *
 * @param document document that owns the newly created stream
 * @param preservedStreams original content streams of the page; may be {@code null} or empty
 * @param imageElements image elements whose object names are excluded; may be {@code null}
 * @return a Flate-compressed stream with the retained tokens, or {@code null} when there are
 *     no preserved streams or no token was retained
 * @throws IOException if writing the filtered tokens to the new stream fails
 */
private PDStream extractVectorGraphics(
        PDDocument document,
        List<PDStream> preservedStreams,
        List<PdfJsonImageElement> imageElements)
        throws IOException {
    if (preservedStreams == null || preservedStreams.isEmpty()) {
        return null;
    }

    // Names of image XObjects that must not be carried over into the filtered stream.
    Set<String> excludedImageNames = new HashSet<>();
    if (imageElements != null) {
        for (PdfJsonImageElement image : imageElements) {
            String candidate = image != null ? image.getObjectName() : null;
            if (candidate == null || candidate.isBlank()) {
                continue;
            }
            excludedImageNames.add(candidate);
        }
    }

    List<Object> keptTokens = new ArrayList<>();
    for (PDStream preserved : preservedStreams) {
        if (preserved != null) {
            try {
                List<Object> parsed = new PDFStreamParser(preserved.toByteArray()).parse();
                collectVectorTokens(parsed, keptTokens, excludedImageNames);
            } catch (IOException ex) {
                // Best effort: an unparsable stream is skipped rather than failing the page.
                log.debug(
                        "Failed to parse preserved content stream for vector extraction: {}",
                        ex.getMessage());
            }
        }
    }

    if (keptTokens.isEmpty()) {
        return null;
    }

    PDStream result = new PDStream(document);
    try (OutputStream out = result.createOutputStream(COSName.FLATE_DECODE)) {
        ContentStreamWriter writer = new ContentStreamWriter(out);
        writer.writeTokens(keptTokens);
    }
    return result;
}
/**
 * Appends to {@code targetTokens} every token of {@code sourceTokens} that lies outside a text
 * object and is not the invocation of one of the named image objects.
 *
 * <ul>
 *   <li>{@code BT} and {@code ET} operators, and everything between them, are dropped.
 *   <li>A {@code Do} operator whose operand (the last token already collected) is a name
 *       contained in {@code imageObjectNames} is dropped together with that operand.
 *   <li>Every other token, including inline-image operators, is copied as-is.
 * </ul>
 *
 * <p>No inline-image state is tracked across tokens. PDFBox's {@code PDFStreamParser} returns
 * an inline image as a single {@code BI} operator carrying its parameters and data, so an
 * {@code EI} token never follows it. Waiting for one would leave text filtering disabled for
 * the rest of the stream after the first inline image.
 *
 * <p>Text-object state is not carried over between calls; each call starts outside a text
 * object.
 *
 * @param sourceTokens parsed content-stream tokens; {@code null} or empty is a no-op
 * @param targetTokens list that receives the retained tokens, in order
 * @param imageObjectNames names of image objects whose {@code Do} calls are removed; may be
 *     {@code null} or empty, in which case every {@code Do} call is kept
 */
private void collectVectorTokens(
        List<Object> sourceTokens,
        List<Object> targetTokens,
        Set<String> imageObjectNames) {
    if (sourceTokens == null || sourceTokens.isEmpty()) {
        return;
    }

    boolean filterImages = imageObjectNames != null && !imageObjectNames.isEmpty();
    boolean insideText = false;

    for (Object token : sourceTokens) {
        if (token instanceof Operator operator) {
            String name = operator.getName();
            if (OperatorName.BEGIN_TEXT.equals(name)) {
                insideText = true;
                continue;
            }
            if (OperatorName.END_TEXT.equals(name)) {
                insideText = false;
                continue;
            }
            if (insideText) {
                // Text-showing and text-state operators are regenerated elsewhere.
                continue;
            }
            if (filterImages
                    && OperatorName.DRAW_OBJECT.equals(name)
                    && !targetTokens.isEmpty()) {
                // The operand of Do is the name token collected immediately before it.
                Object previous = targetTokens.get(targetTokens.size() - 1);
                if (previous instanceof COSName cosName
                        && imageObjectNames.contains(cosName.getName())) {
                    targetTokens.remove(targetTokens.size() - 1);
                    continue;
                }
            }
            targetTokens.add(operator);
        } else if (!insideText) {
            // Operand outside a text object: keep it for the operator that follows.
            targetTokens.add(token);
        }
    }
}
private void regeneratePageContent( private void regeneratePageContent(
PDDocument document, PDDocument document,
PDPage page, PDPage page,
@ -2148,13 +2282,15 @@ public class PdfJsonConversionService {
List<PdfJsonImageElement> imageElements, List<PdfJsonImageElement> imageElements,
Map<String, PDFont> fontMap, Map<String, PDFont> fontMap,
List<PdfJsonFont> fontModels, List<PdfJsonFont> fontModels,
int pageNumber) int pageNumber,
AppendMode appendMode)
throws IOException { throws IOException {
List<DrawableElement> drawables = mergeDrawables(textElements, imageElements); List<DrawableElement> drawables = mergeDrawables(textElements, imageElements);
Map<String, PDImageXObject> imageCache = new HashMap<>(); Map<String, PDImageXObject> imageCache = new HashMap<>();
AppendMode mode = appendMode != null ? appendMode : AppendMode.OVERWRITE;
try (PDPageContentStream contentStream = try (PDPageContentStream contentStream =
new PDPageContentStream(document, page, AppendMode.OVERWRITE, true, true)) { new PDPageContentStream(document, page, mode, true, true)) {
boolean textOpen = false; boolean textOpen = false;
for (DrawableElement drawable : drawables) { for (DrawableElement drawable : drawables) {
switch (drawable.type()) { switch (drawable.type()) {
@ -2618,7 +2754,10 @@ public class PdfJsonConversionService {
} }
private boolean rewriteTextOperators( private boolean rewriteTextOperators(
PDDocument document, PDPage page, List<PdfJsonTextElement> elements) { PDDocument document,
PDPage page,
List<PdfJsonTextElement> elements,
boolean removeOnly) {
if (elements == null || elements.isEmpty()) { if (elements == null || elements.isEmpty()) {
return true; return true;
} }
@ -2663,7 +2802,8 @@ public class PdfJsonConversionService {
return false; return false;
} }
log.trace("Rewriting Tj operator using font {}", currentFontName); log.trace("Rewriting Tj operator using font {}", currentFontName);
if (!rewriteShowText(cosString, currentFont, currentFontName, cursor)) { if (!rewriteShowText(
cosString, currentFont, currentFontName, cursor, removeOnly)) {
log.debug("Failed to rewrite Tj operator; aborting rewrite"); log.debug("Failed to rewrite Tj operator; aborting rewrite");
return false; return false;
} }
@ -2674,7 +2814,8 @@ public class PdfJsonConversionService {
return false; return false;
} }
log.trace("Rewriting TJ operator using font {}", currentFontName); log.trace("Rewriting TJ operator using font {}", currentFontName);
if (!rewriteShowTextArray(array, currentFont, currentFontName, cursor)) { if (!rewriteShowTextArray(
array, currentFont, currentFontName, cursor, removeOnly)) {
log.debug("Failed to rewrite TJ operator; aborting rewrite"); log.debug("Failed to rewrite TJ operator; aborting rewrite");
return false; return false;
} }
@ -2703,7 +2844,11 @@ public class PdfJsonConversionService {
} }
private boolean rewriteShowText( private boolean rewriteShowText(
COSString cosString, PDFont font, String expectedFontName, TextElementCursor cursor) COSString cosString,
PDFont font,
String expectedFontName,
TextElementCursor cursor,
boolean removeOnly)
throws IOException { throws IOException {
if (font == null) { if (font == null) {
return false; return false;
@ -2713,6 +2858,10 @@ public class PdfJsonConversionService {
if (consumed == null) { if (consumed == null) {
return false; return false;
} }
if (removeOnly) {
cosString.setValue(new byte[0]);
return true;
}
String replacement = mergeText(consumed); String replacement = mergeText(consumed);
try { try {
byte[] encoded = font.encode(replacement); byte[] encoded = font.encode(replacement);
@ -2725,7 +2874,11 @@ public class PdfJsonConversionService {
} }
private boolean rewriteShowTextArray( private boolean rewriteShowTextArray(
COSArray array, PDFont font, String expectedFontName, TextElementCursor cursor) COSArray array,
PDFont font,
String expectedFontName,
TextElementCursor cursor,
boolean removeOnly)
throws IOException { throws IOException {
if (font == null) { if (font == null) {
return false; return false;
@ -2738,6 +2891,10 @@ public class PdfJsonConversionService {
if (consumed == null) { if (consumed == null) {
return false; return false;
} }
if (removeOnly) {
array.set(i, new COSString(new byte[0]));
continue;
}
String replacement = mergeText(consumed); String replacement = mergeText(consumed);
try { try {
byte[] encoded = font.encode(replacement); byte[] encoded = font.encode(replacement);

View File

@ -733,15 +733,11 @@ export const restoreGlyphElements = (
rebuiltElements.push(...group.originalElements.map(cloneTextElement)); rebuiltElements.push(...group.originalElements.map(cloneTextElement));
}); });
const textDirty = groups.some((group) => group.text !== group.originalText);
const imageDirty = areImageListsDifferent(images, baselineImages);
const nextStreams = textDirty || imageDirty ? [] : page.contentStreams ?? [];
return { return {
...page, ...page,
textElements: rebuiltElements, textElements: rebuiltElements,
imageElements: images.map(cloneImageElement), imageElements: images.map(cloneImageElement),
contentStreams: nextStreams, contentStreams: page.contentStreams ?? [],
}; };
}); });