Add image extraction, page regeneration with images, and drag/resize image editing to the PDF JSON editor

This commit is contained in:
Anthony Stirling 2025-10-23 16:38:06 +01:00
parent 930c68c8c5
commit af19a5af23
10 changed files with 1207 additions and 75 deletions

View File

@ -0,0 +1,37 @@
package stirling.software.SPDF.model.json;
import java.util.ArrayList;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonInclude;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * JSON model for a single image painted on a PDF page.
 *
 * <p>All properties are nullable wrappers; with {@code JsonInclude.Include.NON_NULL} any unset
 * property is omitted from the serialized JSON.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@JsonInclude(JsonInclude.Include.NON_NULL)
public class PdfJsonImageElement {
/** Identifier of this element; the extractor assigns a random UUID string. */
private String id;
/** Name operand of the draw operator that painted the image, when one was captured. */
private String objectName;
/** {@code true} when the image was not an image XObject. */
private Boolean inlineImage;
/** Pixel width reported by the source image. */
private Integer nativeWidth;
/** Pixel height reported by the source image. */
private Integer nativeHeight;
/** Horizontal origin; the extractor writes the same value as {@link #left}. */
private Float x;
/** Vertical origin; the extractor writes the same value as {@link #bottom}. */
private Float y;
/** Width of the bounding box of the transformed unit square. */
private Float width;
/** Height of the bounding box of the transformed unit square. */
private Float height;
/** Minimum x of the bounding box. */
private Float left;
/** Maximum x of the bounding box. */
private Float right;
/** Maximum y of the bounding box. */
private Float top;
/** Minimum y of the bounding box. */
private Float bottom;
/** Six transformation-matrix entries; defaults to an empty list when built without a value. */
@Builder.Default private List<Float> transform = new ArrayList<>();
/** Paint-order key; lower values are drawn first when a page is regenerated. */
private Integer zOrder;
/** Base64-encoded bytes of the encoded image. */
private String imageData;
/** Format name the image bytes were written with (for example {@code png}). */
private String imageFormat;
}

View File

@ -23,6 +23,7 @@ public class PdfJsonPage {
private Integer rotation;
@Builder.Default private List<PdfJsonTextElement> textElements = new ArrayList<>();
@Builder.Default private List<PdfJsonImageElement> imageElements = new ArrayList<>();
/** Serialized representation of the page resources dictionary. */
private PdfJsonCosValue resources;

View File

@ -24,6 +24,8 @@ public class PdfJsonTextElement {
private Float fontSizeInPt;
private Float characterSpacing;
private Float wordSpacing;
private Float spaceWidth;
private Integer zOrder;
private Float horizontalScaling;
private Float leading;
private Float rise;

View File

@ -1,5 +1,8 @@
package stirling.software.SPDF.service;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
@ -22,8 +25,13 @@ import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.TimeZone;
import java.util.UUID;
import javax.imageio.ImageIO;
import org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.contentstream.operator.OperatorName;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSBoolean;
@ -53,6 +61,8 @@ import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
import org.apache.pdfbox.pdmodel.graphics.color.PDColorSpace;
import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.pdmodel.graphics.state.PDGraphicsState;
import org.apache.pdfbox.pdmodel.graphics.state.PDTextState;
import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
@ -74,6 +84,7 @@ import stirling.software.SPDF.model.json.PdfJsonCosValue;
import stirling.software.SPDF.model.json.PdfJsonDocument;
import stirling.software.SPDF.model.json.PdfJsonFont;
import stirling.software.SPDF.model.json.PdfJsonFontCidSystemInfo;
import stirling.software.SPDF.model.json.PdfJsonImageElement;
import stirling.software.SPDF.model.json.PdfJsonMetadata;
import stirling.software.SPDF.model.json.PdfJsonPage;
import stirling.software.SPDF.model.json.PdfJsonStream;
@ -128,6 +139,8 @@ public class PdfJsonConversionService {
stripper.setSortByPosition(true);
stripper.getText(document);
Map<Integer, List<PdfJsonImageElement>> imagesByPage = collectImages(document);
PdfJsonDocument pdfJson = new PdfJsonDocument();
pdfJson.setMetadata(extractMetadata(document));
pdfJson.setXmpMetadata(extractXmpMetadata(document));
@ -136,7 +149,7 @@ public class PdfJsonConversionService {
Comparator.comparing(
PdfJsonFont::getUid, Comparator.nullsLast(Comparator.naturalOrder())));
pdfJson.setFonts(serializedFonts);
pdfJson.setPages(extractPages(document, textByPage));
pdfJson.setPages(extractPages(document, textByPage, imagesByPage));
log.info(
"PDF→JSON conversion complete (fonts: {}, pages: {})",
@ -201,6 +214,10 @@ public class PdfJsonConversionService {
pageModel.getTextElements() != null
? pageModel.getTextElements()
: new ArrayList<>();
List<PdfJsonImageElement> imageElements =
pageModel.getImageElements() != null
? pageModel.getImageElements()
: new ArrayList<>();
boolean fallbackAssigned =
preflightTextElements(
@ -218,15 +235,13 @@ public class PdfJsonConversionService {
}
boolean hasText = !elements.isEmpty();
boolean rewriteSucceeded = false;
boolean hasImages = !imageElements.isEmpty();
boolean rewriteSucceeded = true;
if (!preservedStreams.isEmpty() && hasText) {
if (hasText) {
if (fallbackAssigned) {
log.info(
"Skipping token rewrite for page {} because fallback font was applied",
pageNumberValue);
rewriteSucceeded = false;
} else {
} else if (!preservedStreams.isEmpty()) {
log.info("Attempting token rewrite for page {}", pageNumberValue);
rewriteSucceeded = rewriteTextOperators(document, page, elements);
if (!rewriteSucceeded) {
@ -236,18 +251,29 @@ public class PdfJsonConversionService {
} else {
log.info("Token rewrite succeeded for page {}", pageNumberValue);
}
} else {
rewriteSucceeded = false;
}
}
if (!hasText) {
boolean shouldRegenerate = preservedStreams.isEmpty();
if (hasText && !rewriteSucceeded) {
shouldRegenerate = true;
}
if (hasImages && preservedStreams.isEmpty()) {
shouldRegenerate = true;
}
if (!(hasText || hasImages)) {
pageIndex++;
continue;
}
if (!rewriteSucceeded) {
log.info("Regenerating text content for page {}", pageNumberValue);
regenerateTextContent(document, page, elements, fontMap, pageNumberValue);
log.info("Text regeneration complete for page {}", pageNumberValue);
if (shouldRegenerate) {
log.info("Regenerating page content for page {}", pageNumberValue);
regeneratePageContent(
document, page, elements, imageElements, fontMap, pageNumberValue);
log.info("Page content regeneration complete for page {}", pageNumberValue);
}
pageIndex++;
}
@ -571,7 +597,9 @@ public class PdfJsonConversionService {
}
private List<PdfJsonPage> extractPages(
PDDocument document, Map<Integer, List<PdfJsonTextElement>> textByPage)
PDDocument document,
Map<Integer, List<PdfJsonTextElement>> textByPage,
Map<Integer, List<PdfJsonImageElement>> imagesByPage)
throws IOException {
List<PdfJsonPage> pages = new ArrayList<>();
int pageIndex = 0;
@ -583,6 +611,7 @@ public class PdfJsonConversionService {
pageModel.setHeight(mediaBox.getHeight());
pageModel.setRotation(page.getRotation());
pageModel.setTextElements(textByPage.getOrDefault(pageIndex + 1, new ArrayList<>()));
pageModel.setImageElements(imagesByPage.getOrDefault(pageIndex + 1, new ArrayList<>()));
pageModel.setResources(
serializeCosValue(page.getCOSObject().getDictionaryObject(COSName.RESOURCES)));
pageModel.setContentStreams(extractContentStreams(page));
@ -592,6 +621,19 @@ public class PdfJsonConversionService {
return pages;
}
/**
 * Runs an {@link ImageCollectingEngine} over every page of the document and gathers the images
 * each page paints.
 *
 * @param document document whose pages are scanned
 * @return insertion-ordered map from 1-based page number to the images found on that page;
 *     pages without images have no entry
 * @throws IOException if processing a page's content stream fails
 */
private Map<Integer, List<PdfJsonImageElement>> collectImages(PDDocument document)
        throws IOException {
    Map<Integer, List<PdfJsonImageElement>> collected = new LinkedHashMap<>();
    int oneBasedPage = 0;
    for (PDPage current : document.getPages()) {
        oneBasedPage++;
        new ImageCollectingEngine(current, oneBasedPage, collected).processPage(current);
    }
    return collected;
}
private PdfJsonMetadata extractMetadata(PDDocument document) {
PdfJsonMetadata metadata = new PdfJsonMetadata();
PDDocumentInformation info = document.getDocumentInformation();
@ -911,17 +953,27 @@ public class PdfJsonConversionService {
}
}
private void regenerateTextContent(
private void regeneratePageContent(
PDDocument document,
PDPage page,
List<PdfJsonTextElement> elements,
List<PdfJsonTextElement> textElements,
List<PdfJsonImageElement> imageElements,
Map<String, PDFont> fontMap,
int pageNumber)
throws IOException {
List<DrawableElement> drawables = mergeDrawables(textElements, imageElements);
Map<String, PDImageXObject> imageCache = new HashMap<>();
try (PDPageContentStream contentStream =
new PDPageContentStream(document, page, AppendMode.OVERWRITE, true, true)) {
boolean textOpen = false;
for (PdfJsonTextElement element : elements) {
for (DrawableElement drawable : drawables) {
switch (drawable.type()) {
case TEXT -> {
PdfJsonTextElement element = drawable.textElement();
if (element == null) {
continue;
}
PDFont font = fontMap.get(buildFontKey(pageNumber, element.getFontId()));
if (font == null && FALLBACK_FONT_ID.equals(element.getFontId())) {
font = fontMap.get(buildFontKey(-1, FALLBACK_FONT_ID));
@ -931,7 +983,7 @@ public class PdfJsonConversionService {
if (font != null) {
try {
encodeWithTest(font, text);
font.encode(text);
} catch (IOException | IllegalArgumentException ex) {
log.debug(
"Edited text contains glyphs missing from font {} ({}), switching to fallback",
@ -943,16 +995,15 @@ public class PdfJsonConversionService {
font = loadFallbackPdfFont(document);
fontMap.put(buildFontKey(-1, FALLBACK_FONT_ID), font);
}
encodeWithTest(font, text);
}
} else {
}
if (font == null) {
element.setFontId(FALLBACK_FONT_ID);
font = fontMap.get(buildFontKey(-1, FALLBACK_FONT_ID));
if (font == null) {
font = loadFallbackPdfFont(document);
fontMap.put(buildFontKey(-1, FALLBACK_FONT_ID), font);
}
encodeWithTest(font, text);
}
if (!textOpen) {
@ -964,7 +1015,23 @@ public class PdfJsonConversionService {
contentStream.setFont(font, fontScale);
applyRenderingMode(contentStream, element.getRenderingMode());
applyTextMatrix(contentStream, element);
contentStream.showText(text);
String sanitized = sanitizeForFont(font, text);
if (!sanitized.isEmpty()) {
contentStream.showText(sanitized);
}
}
case IMAGE -> {
if (textOpen) {
contentStream.endText();
textOpen = false;
}
PdfJsonImageElement element = drawable.imageElement();
if (element == null) {
continue;
}
drawImageElement(contentStream, document, element, imageCache);
}
}
}
if (textOpen) {
contentStream.endText();
@ -972,11 +1039,47 @@ public class PdfJsonConversionService {
}
}
private void encodeWithTest(PDFont font, String text) throws IOException {
private String sanitizeForFont(PDFont font, String text) {
if (text == null || text.isEmpty()) {
return;
return "";
}
font.encode(text);
StringBuilder builder = new StringBuilder(text.length());
text.codePoints()
.forEach(
codePoint -> {
String candidate = new String(Character.toChars(codePoint));
try {
font.encode(candidate);
builder.append(candidate);
return;
} catch (IOException | IllegalArgumentException ex) {
String mapped = mapUnsupportedGlyph(codePoint);
if (mapped != null) {
try {
font.encode(mapped);
builder.append(mapped);
return;
} catch (IOException | IllegalArgumentException ignore) {
// fall through to generic replacement
}
}
log.debug(
"Replacing unsupported glyph {} ({}) with '?' for font {}",
candidate,
String.format("U+%04X", codePoint),
font.getName());
builder.append('?');
}
});
return builder.toString();
}
/**
 * Suggests a plain replacement for a code point that a font could not encode.
 *
 * @param codePoint Unicode code point that failed to encode
 * @return {@code "<"} for U+276E, {@code ">"} for U+276F, otherwise {@code null} to signal
 *     that no specific substitute is known
 */
private String mapUnsupportedGlyph(int codePoint) {
    if (codePoint == 0x276E) {
        return "<";
    }
    if (codePoint == 0x276F) {
        return ">";
    }
    return null;
}
private void applyTextState(PDPageContentStream contentStream, PdfJsonTextElement element)
@ -1198,7 +1301,7 @@ public class PdfJsonConversionService {
byte[] encoded = font.encode(replacement);
cosString.setValue(encoded);
return true;
} catch (IOException | IllegalArgumentException ex) {
} catch (IOException | IllegalArgumentException | UnsupportedOperationException ex) {
log.debug("Failed to encode replacement text: {}", ex.getMessage());
return false;
}
@ -1222,7 +1325,9 @@ public class PdfJsonConversionService {
try {
byte[] encoded = font.encode(replacement);
array.set(i, new COSString(encoded));
} catch (IOException | IllegalArgumentException ex) {
} catch (IOException
| IllegalArgumentException
| UnsupportedOperationException ex) {
log.debug("Failed to encode replacement text in TJ array: {}", ex.getMessage());
return false;
}
@ -1542,6 +1647,377 @@ public class PdfJsonConversionService {
return calendar;
}
/**
 * Graphics stream engine that records every image painted while a page's content stream is
 * processed. Each image is encoded, described by the bounding box of its transformed unit
 * square, and appended to the shared per-page map.
 *
 * <p>All path-related callbacks are intentionally empty: only {@link #drawImage(PDImage)}
 * matters for image extraction.
 */
private class ImageCollectingEngine extends PDFGraphicsStreamEngine {

    private final int pageNumber;
    private final Map<Integer, List<PdfJsonImageElement>> imagesByPage;

    /** Name operand of the draw-object operator currently being processed, if any. */
    private COSName currentXObjectName;

    /** Number of images recorded so far on this page; used to derive the z-order. */
    private int imageCounter = 0;

    /**
     * @param page page whose content will be processed
     * @param pageNumber 1-based page number used as the key into {@code imagesByPage}
     * @param imagesByPage shared accumulator that receives the collected images
     */
    protected ImageCollectingEngine(
            PDPage page, int pageNumber, Map<Integer, List<PdfJsonImageElement>> imagesByPage)
            throws IOException {
        super(page);
        this.pageNumber = pageNumber;
        this.imagesByPage = imagesByPage;
    }

    /**
     * Encodes the painted image and records it together with its placement. Images that
     * cannot be encoded are skipped.
     */
    @Override
    public void drawImage(PDImage pdImage) throws IOException {
        EncodedImage encoded = encodeImage(pdImage);
        if (encoded == null) {
            return;
        }

        boolean inline = !(pdImage instanceof PDImageXObject);
        // The draw-object operator also invokes non-image XObjects whose own content may paint
        // inline images; in that case the remembered name belongs to the container, not to the
        // image, so it is only reported for real image XObjects.
        String objectName =
                !inline && currentXObjectName != null ? currentXObjectName.getName() : null;

        Matrix ctm = getGraphicsState().getCurrentTransformationMatrix();
        Bounds bounds = computeBounds(ctm);
        List<Float> matrixValues = toMatrixValues(ctm);

        PdfJsonImageElement element =
                PdfJsonImageElement.builder()
                        .id(UUID.randomUUID().toString())
                        .objectName(objectName)
                        .inlineImage(inline)
                        .nativeWidth(pdImage.getWidth())
                        .nativeHeight(pdImage.getHeight())
                        .x(bounds.left)
                        .y(bounds.bottom)
                        .width(bounds.width())
                        .height(bounds.height())
                        .left(bounds.left)
                        .right(bounds.right)
                        .top(bounds.top)
                        .bottom(bounds.bottom)
                        .transform(matrixValues)
                        // Large negative base keeps images below text elements by default.
                        .zOrder(-1_000_000 + imageCounter)
                        .imageData(encoded.base64())
                        .imageFormat(encoded.format())
                        .build();
        imageCounter++;
        imagesByPage.computeIfAbsent(pageNumber, key -> new ArrayList<>()).add(element);
    }

    @Override
    public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3)
            throws IOException {
        // Not needed for image extraction
    }

    @Override
    public void clip(int windingRule) throws IOException {
        // Not needed for image extraction
    }

    @Override
    public void moveTo(float x, float y) throws IOException {
        // Not needed for image extraction
    }

    @Override
    public void lineTo(float x, float y) throws IOException {
        // Not needed for image extraction
    }

    @Override
    public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3)
            throws IOException {
        // Not needed for image extraction
    }

    @Override
    public Point2D getCurrentPoint() throws IOException {
        return new Point2D.Float();
    }

    @Override
    public void closePath() throws IOException {
        // Not needed for image extraction
    }

    @Override
    public void endPath() throws IOException {
        // Not needed for image extraction
    }

    @Override
    public void shadingFill(COSName shadingName) throws IOException {
        // Not needed for image extraction
    }

    @Override
    public void fillAndStrokePath(int windingRule) throws IOException {
        // Not needed for image extraction
    }

    @Override
    public void fillPath(int windingRule) throws IOException {
        // Not needed for image extraction
    }

    @Override
    public void strokePath() throws IOException {
        // Not needed for image extraction
    }

    /**
     * Remembers the name operand of a draw-object operator for the duration of that
     * operator so {@link #drawImage(PDImage)} can report it.
     */
    @Override
    protected void processOperator(Operator operator, List<COSBase> operands)
            throws IOException {
        if (OperatorName.DRAW_OBJECT.equals(operator.getName())
                && !operands.isEmpty()
                && operands.get(0) instanceof COSName name) {
            currentXObjectName = name;
        }
        super.processOperator(operator, operands);
        currentXObjectName = null;
    }

    /**
     * Maps the unit square through {@code ctm} and returns the axis-aligned box that encloses
     * the four transformed corners. Returns an all-zero box when any extreme is not finite.
     */
    private Bounds computeBounds(Matrix ctm) {
        AffineTransform transform = ctm.createAffineTransform();
        Point2D.Float p0 = new Point2D.Float(0, 0);
        Point2D.Float p1 = new Point2D.Float(1, 0);
        Point2D.Float p2 = new Point2D.Float(0, 1);
        Point2D.Float p3 = new Point2D.Float(1, 1);
        transform.transform(p0, p0);
        transform.transform(p1, p1);
        transform.transform(p2, p2);
        transform.transform(p3, p3);

        float minX = Math.min(Math.min(p0.x, p1.x), Math.min(p2.x, p3.x));
        float maxX = Math.max(Math.max(p0.x, p1.x), Math.max(p2.x, p3.x));
        float minY = Math.min(Math.min(p0.y, p1.y), Math.min(p2.y, p3.y));
        float maxY = Math.max(Math.max(p0.y, p1.y), Math.max(p2.y, p3.y));

        // Every edge must be usable: a degenerate matrix can leave only the maxima infinite.
        boolean allFinite =
                Float.isFinite(minX)
                        && Float.isFinite(maxX)
                        && Float.isFinite(minY)
                        && Float.isFinite(maxY);
        if (!allFinite) {
            return new Bounds(0f, 0f, 0f, 0f);
        }
        return new Bounds(minX, maxX, minY, maxY);
    }
}
/**
 * Axis-aligned rectangle described by its four edges. Note the component order:
 * {@code left, right, bottom, top}.
 */
private record Bounds(float left, float right, float bottom, float top) {
/** Horizontal extent ({@code right - left}), never reported below zero. */
float width() {
return Math.max(0f, right - left);
}
/** Vertical extent ({@code top - bottom}), never reported below zero. */
float height() {
return Math.max(0f, top - bottom);
}
}
/** Discriminates the two kinds of content that can be painted when a page is regenerated. */
private enum DrawableType {
TEXT,
IMAGE
}
/**
 * One entry in the merged paint list of a page.
 *
 * @param type which of the two payload components is populated
 * @param textElement payload for {@code TEXT} entries; {@code null} for images
 * @param imageElement payload for {@code IMAGE} entries; {@code null} for text
 * @param zOrder primary sort key; lower values are painted first
 * @param sequence insertion counter used to break ties between equal {@code zOrder} values
 */
private record DrawableElement(
DrawableType type,
PdfJsonTextElement textElement,
PdfJsonImageElement imageElement,
int zOrder,
int sequence) {}
/** Base64 text of an encoded image together with the format name it was written in. */
private record EncodedImage(String base64, String format) {}
/**
 * Flattens the first two columns of the three matrix rows into a six-element list, row by
 * row: (0,0), (0,1), (1,0), (1,1), (2,0), (2,1).
 *
 * @param matrix matrix to read
 * @return new mutable list holding the six values in the order above
 */
private List<Float> toMatrixValues(Matrix matrix) {
    List<Float> flattened = new ArrayList<>(6);
    for (int row = 0; row < 3; row++) {
        for (int column = 0; column < 2; column++) {
            flattened.add(matrix.getValue(row, column));
        }
    }
    return flattened;
}
/**
 * Renders the PDF image and encodes it as Base64 text.
 *
 * <p>The format suggested by {@link #resolveImageFormat(PDImage)} is tried first. When no
 * writer accepts the image in that format, or the writer fails with an I/O error, the image
 * is re-encoded as PNG instead of being dropped.
 *
 * @param image image to encode
 * @return the encoded image and the format actually used, or {@code null} when the image
 *     could not be rendered or encoded at all
 */
private EncodedImage encodeImage(PDImage image) {
    BufferedImage bufferedImage;
    try {
        bufferedImage = image.getImage();
    } catch (IOException ex) {
        log.debug("Failed to encode image: {}", ex.getMessage());
        return null;
    }
    if (bufferedImage == null) {
        return null;
    }

    String format = resolveImageFormat(image);
    if (format == null || format.isBlank()) {
        format = "png";
    }

    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    boolean written = false;
    try {
        written = ImageIO.write(bufferedImage, format, baos);
    } catch (IOException ex) {
        // A writer exists but rejected this particular image; fall through to PNG below.
        log.debug("Failed to encode image as {}: {}", format, ex.getMessage());
    }

    if (!written) {
        if ("png".equalsIgnoreCase(format)) {
            return null;
        }
        // Discard any partial output from the failed attempt before retrying.
        baos.reset();
        try {
            if (!ImageIO.write(bufferedImage, "png", baos)) {
                return null;
            }
        } catch (IOException ex) {
            log.debug("Failed to encode image: {}", ex.getMessage());
            return null;
        }
        format = "png";
    }
    return new EncodedImage(Base64.getEncoder().encodeToString(baos.toByteArray()), format);
}
/**
 * Chooses the format name to try first when encoding an image.
 *
 * @param image image being encoded
 * @return the lower-cased suffix reported by an image XObject when it is non-blank, otherwise
 *     {@code "png"}
 */
private String resolveImageFormat(PDImage image) {
    if (!(image instanceof PDImageXObject xObject)) {
        return "png";
    }
    String reported = xObject.getSuffix();
    boolean usable = reported != null && !reported.isBlank();
    return usable ? reported.toLowerCase(Locale.ROOT) : "png";
}
/**
 * Combines image and text elements into a single list sorted by paint order.
 *
 * <p>Images are added before text. An element without an explicit z-order receives a default:
 * images start at {@code Integer.MIN_VALUE / 2}, text starts at {@code 1_000_000}, each
 * offset by the element's position among the non-null elements of its kind. Ties on z-order
 * keep insertion order.
 *
 * @param textElements text elements, may be {@code null}; {@code null} entries are skipped
 * @param imageElements image elements, may be {@code null}; {@code null} entries are skipped
 * @return new sorted list of drawables
 */
private List<DrawableElement> mergeDrawables(
        List<PdfJsonTextElement> textElements, List<PdfJsonImageElement> imageElements) {
    List<DrawableElement> merged = new ArrayList<>();
    int insertionOrder = 0;

    if (imageElements != null) {
        int position = 0;
        for (PdfJsonImageElement candidate : imageElements) {
            if (candidate != null) {
                Integer explicit = candidate.getZOrder();
                int z = explicit == null ? Integer.MIN_VALUE / 2 + position : explicit;
                merged.add(
                        new DrawableElement(
                                DrawableType.IMAGE, null, candidate, z, insertionOrder));
                insertionOrder++;
                position++;
            }
        }
    }

    if (textElements != null) {
        int position = 0;
        for (PdfJsonTextElement candidate : textElements) {
            if (candidate != null) {
                Integer explicit = candidate.getZOrder();
                int z = explicit == null ? 1_000_000 + position : explicit;
                merged.add(
                        new DrawableElement(
                                DrawableType.TEXT, candidate, null, z, insertionOrder));
                insertionOrder++;
                position++;
            }
        }
    }

    merged.sort(
            (first, second) -> {
                int byZ = Integer.compare(first.zOrder(), second.zOrder());
                return byZ != 0 ? byZ : Integer.compare(first.sequence(), second.sequence());
            });
    return merged;
}
/**
 * Paints one image element onto the content stream as an axis-aligned rectangle.
 *
 * <p>Elements without image data, or whose data cannot be turned into an image, are skipped.
 * Decoded images are reused through {@code cache} only when the element carries a non-blank
 * id: an identity hash is not guaranteed to be unique, so using it as a key could hand back
 * another element's image. The element's {@code transform} list is not consulted here;
 * placement comes from the width/height and edge coordinates only.
 *
 * @param contentStream stream to draw into
 * @param document document that owns the created image XObject
 * @param element image description; {@code null} is ignored
 * @param cache per-page cache of decoded images keyed by element id
 * @throws IOException if creating or drawing the image fails
 */
private void drawImageElement(
        PDPageContentStream contentStream,
        PDDocument document,
        PdfJsonImageElement element,
        Map<String, PDImageXObject> cache)
        throws IOException {
    if (element == null || element.getImageData() == null || element.getImageData().isBlank()) {
        return;
    }

    String id = element.getId();
    boolean cacheable = id != null && !id.isBlank();

    PDImageXObject image = cacheable ? cache.get(id) : null;
    if (image == null) {
        image = createImageXObject(document, element);
        if (image == null) {
            return;
        }
        if (cacheable) {
            cache.put(id, image);
        }
    }

    float width = safeFloat(element.getWidth(), fallbackWidth(element));
    float height = safeFloat(element.getHeight(), fallbackHeight(element));
    // A non-positive size would make the image invisible; fall back to at least one unit.
    if (width <= 0f) {
        width = Math.max(1f, fallbackWidth(element));
    }
    if (height <= 0f) {
        height = Math.max(1f, fallbackHeight(element));
    }

    float left = resolveLeft(element, width);
    float bottom = resolveBottom(element, height);

    contentStream.drawImage(image, left, bottom, width, height);
}
/**
 * Decodes the element's Base64 image data and wraps it in an image XObject.
 *
 * <p>Both kinds of bad input are treated the same way: malformed Base64 and bytes whose image
 * type is not supported are logged at debug level and reported as {@code null}, so a single
 * bad image does not abort regeneration of the page.
 *
 * @param document document that will own the image
 * @param element element carrying the Base64 data and optional id
 * @return the created image, or {@code null} when the data is missing or unusable
 * @throws IOException if reading the decoded image bytes fails
 */
private PDImageXObject createImageXObject(PDDocument document, PdfJsonImageElement element)
        throws IOException {
    String encoded = element.getImageData();
    if (encoded == null) {
        return null;
    }

    byte[] data;
    try {
        data = Base64.getDecoder().decode(encoded);
    } catch (IllegalArgumentException ex) {
        log.debug("Failed to decode image element: {}", ex.getMessage());
        return null;
    }

    String name = element.getId() != null ? element.getId() : UUID.randomUUID().toString();
    try {
        return PDImageXObject.createFromByteArray(document, data, name);
    } catch (IllegalArgumentException ex) {
        // Thrown when the byte content is not a recognised image type.
        log.debug("Unsupported image data for element {}: {}", name, ex.getMessage());
        return null;
    }
}
/**
 * Derives a width for an element that has no usable explicit width.
 *
 * @param element image element to inspect
 * @return {@code right - left} (not below zero) when both edges are present, else the native
 *     pixel width when present, else {@code 1}
 */
private float fallbackWidth(PdfJsonImageElement element) {
    Float rightEdge = element.getRight();
    Float leftEdge = element.getLeft();
    if (rightEdge != null && leftEdge != null) {
        return Math.max(0f, rightEdge - leftEdge);
    }
    Integer intrinsic = element.getNativeWidth();
    if (intrinsic == null) {
        return 1f;
    }
    return intrinsic;
}
/**
 * Determines the x coordinate of the image's left edge.
 *
 * @param element image element to inspect
 * @param width width the image will be drawn with
 * @return the first available of: {@code left}, {@code x}, {@code right - width}; {@code 0}
 *     when none is set
 */
private float resolveLeft(PdfJsonImageElement element, float width) {
    Float[] directCandidates = {element.getLeft(), element.getX()};
    for (Float candidate : directCandidates) {
        if (candidate != null) {
            return candidate;
        }
    }
    Float rightEdge = element.getRight();
    return rightEdge == null ? 0f : rightEdge - width;
}
/**
 * Determines the y coordinate of the image's bottom edge.
 *
 * @param element image element to inspect
 * @param height height the image will be drawn with
 * @return the first available of: {@code bottom}, {@code y}, {@code top - height}; {@code 0}
 *     when none is set
 */
private float resolveBottom(PdfJsonImageElement element, float height) {
    Float[] directCandidates = {element.getBottom(), element.getY()};
    for (Float candidate : directCandidates) {
        if (candidate != null) {
            return candidate;
        }
    }
    Float topEdge = element.getTop();
    return topEdge == null ? 0f : topEdge - height;
}
/**
 * Derives a height for an element that has no usable explicit height.
 *
 * @param element image element to inspect
 * @return {@code top - bottom} (not below zero) when both edges are present, else the native
 *     pixel height when present, else {@code 1}
 */
private float fallbackHeight(PdfJsonImageElement element) {
    Float topEdge = element.getTop();
    Float bottomEdge = element.getBottom();
    if (topEdge != null && bottomEdge != null) {
        return Math.max(0f, topEdge - bottomEdge);
    }
    Integer intrinsic = element.getNativeHeight();
    if (intrinsic == null) {
        return 1f;
    }
    return intrinsic;
}
private class TextCollectingStripper extends PDFTextStripper {
private final PDDocument document;
@ -1595,6 +2071,7 @@ public class PdfJsonConversionService {
element.setHeight(position.getHeightDir());
element.setTextMatrix(extractMatrix(position));
element.setFontMatrixSize(computeFontMatrixSize(element.getTextMatrix()));
element.setSpaceWidth(position.getWidthOfSpace());
PDGraphicsState graphicsState = getGraphicsState();
if (graphicsState != null) {
PDTextState textState = graphicsState.getTextState();
@ -1611,6 +2088,7 @@ public class PdfJsonConversionService {
element.setFillColor(toTextColor(graphicsState.getNonStrokingColor()));
element.setStrokeColor(toTextColor(graphicsState.getStrokingColor()));
}
element.setZOrder(1_000_000 + pageElements.size());
pageElements.add(element);
}
}

View File

@ -54,6 +54,7 @@
"react": "^19.1.1",
"react-dom": "^19.1.1",
"react-i18next": "^15.7.3",
"react-rnd": "^10.5.2",
"react-router-dom": "^7.9.1",
"signature_pad": "^5.0.4",
"tailwindcss": "^4.1.13",
@ -11036,6 +11037,16 @@
"node": ">=0.10.0"
}
},
"node_modules/re-resizable": {
"version": "6.11.2",
"resolved": "https://registry.npmjs.org/re-resizable/-/re-resizable-6.11.2.tgz",
"integrity": "sha512-2xI2P3OHs5qw7K0Ud1aLILK6MQxW50TcO+DetD9eIV58j84TqYeHoZcL9H4GXFXXIh7afhH8mv5iUCXII7OW7A==",
"license": "MIT",
"peerDependencies": {
"react": "^16.13.1 || ^17.0.0 || ^18.0.0 || ^19.0.0",
"react-dom": "^16.13.1 || ^17.0.0 || ^18.0.0 || ^19.0.0"
}
},
"node_modules/react": {
"version": "19.1.1",
"resolved": "https://registry.npmjs.org/react/-/react-19.1.1.tgz",
@ -11057,6 +11068,29 @@
"react": "^19.1.1"
}
},
"node_modules/react-draggable": {
"version": "4.4.6",
"resolved": "https://registry.npmjs.org/react-draggable/-/react-draggable-4.4.6.tgz",
"integrity": "sha512-LtY5Xw1zTPqHkVmtM3X8MUOxNDOUhv/khTgBgrUvwaS064bwVvxT+q5El0uUFNx5IEPKXuRejr7UqLwBIg5pdw==",
"license": "MIT",
"dependencies": {
"clsx": "^1.1.1",
"prop-types": "^15.8.1"
},
"peerDependencies": {
"react": ">= 16.3.0",
"react-dom": ">= 16.3.0"
}
},
"node_modules/react-draggable/node_modules/clsx": {
"version": "1.2.1",
"resolved": "https://registry.npmjs.org/clsx/-/clsx-1.2.1.tgz",
"integrity": "sha512-EcR6r5a8bj6pu3ycsa/E/cKVGuTgZJZdsyUYHOksG/UHIiKfjxzRxYJpyVBwYaQeOvghal9fcc4PidlgzugAQg==",
"license": "MIT",
"engines": {
"node": ">=6"
}
},
"node_modules/react-dropzone": {
"version": "14.3.8",
"resolved": "https://registry.npmjs.org/react-dropzone/-/react-dropzone-14.3.8.tgz",
@ -11175,6 +11209,27 @@
"integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==",
"license": "0BSD"
},
"node_modules/react-rnd": {
"version": "10.5.2",
"resolved": "https://registry.npmjs.org/react-rnd/-/react-rnd-10.5.2.tgz",
"integrity": "sha512-0Tm4x7k7pfHf2snewJA8x7Nwgt3LV+58MVEWOVsFjk51eYruFEa6Wy7BNdxt4/lH0wIRsu7Gm3KjSXY2w7YaNw==",
"license": "MIT",
"dependencies": {
"re-resizable": "6.11.2",
"react-draggable": "4.4.6",
"tslib": "2.6.2"
},
"peerDependencies": {
"react": ">=16.3.0",
"react-dom": ">=16.3.0"
}
},
"node_modules/react-rnd/node_modules/tslib": {
"version": "2.6.2",
"resolved": "https://registry.npmjs.org/tslib/-/tslib-2.6.2.tgz",
"integrity": "sha512-AEYxH93jGFPn/a2iVAwW87VuUIkR1FVUKB77NwMF7nBTDkDrrT/Hpt/IrCJ0QXhW27jTBDcf5ZY7w6RiqTMw2Q==",
"license": "0BSD"
},
"node_modules/react-router": {
"version": "7.9.1",
"resolved": "https://registry.npmjs.org/react-router/-/react-router-7.9.1.tgz",

View File

@ -49,6 +49,7 @@
"posthog-js": "^1.268.0",
"react": "^19.1.1",
"react-dom": "^19.1.1",
"react-rnd": "^10.5.2",
"react-i18next": "^15.7.3",
"react-router-dom": "^7.9.1",
"signature_pad": "^5.0.4",

View File

@ -1,4 +1,4 @@
import React, { useEffect, useMemo, useState } from 'react';
import React, { useCallback, useEffect, useLayoutEffect, useMemo, useRef, useState } from 'react';
import {
Alert,
Badge,
@ -21,16 +21,59 @@ import PictureAsPdfIcon from '@mui/icons-material/PictureAsPdfOutlined';
import AutorenewIcon from '@mui/icons-material/Autorenew';
import WarningAmberIcon from '@mui/icons-material/WarningAmber';
import UploadIcon from '@mui/icons-material/Upload';
import { Rnd } from 'react-rnd';
import {
PdfJsonEditorViewData,
PdfJsonPage,
} from '../../../tools/pdfJsonEditorTypes';
import { pageDimensions } from '../../../tools/pdfJsonEditorUtils';
import { getImageBounds, pageDimensions } from '../../../tools/pdfJsonEditorUtils';
const MAX_RENDER_WIDTH = 820;
const MIN_BOX_SIZE = 18;
/**
 * Returns the caret position inside `element` as a character count measured from the start of
 * its contents. When the window has no selection, or the selection focus is outside
 * `element`, the length of the element's `innerText` is returned instead.
 */
const getCaretOffset = (element: HTMLElement): number => {
  const current = window.getSelection();
  if (!current || current.rangeCount === 0 || !element.contains(current.focusNode)) {
    return element.innerText.length;
  }

  // Measure the text between the start of the element and the selection focus.
  const measured = current.getRangeAt(0).cloneRange();
  measured.selectNodeContents(element);
  measured.setEnd(current.focusNode as Node, current.focusOffset);

  const precedingText = measured.toString();
  return precedingText.length;
};
/**
 * Places a collapsed caret inside `element` at the given character offset.
 *
 * The offset is clamped to `[0, innerText.length]` and then consumed text node by text node.
 * If the text nodes run out before the offset is reached, the caret is placed at the end of
 * the element. Does nothing when the window has no selection object.
 */
const setCaretOffset = (element: HTMLElement, offset: number): void => {
  const selection = window.getSelection();
  if (!selection) {
    return;
  }

  let charsLeft = Math.max(0, Math.min(offset, element.innerText.length));
  const caret = document.createRange();
  const textNodes = document.createTreeWalker(element, NodeFilter.SHOW_TEXT);

  for (let visited = textNodes.nextNode(); visited; visited = textNodes.nextNode()) {
    const text = visited as Text;
    const available = text.length;
    if (charsLeft > available) {
      // Target lies beyond this node; consume it and keep walking.
      charsLeft -= available;
      continue;
    }
    caret.setStart(text, charsLeft);
    caret.collapse(true);
    selection.removeAllRanges();
    selection.addRange(caret);
    return;
  }

  // No text node could host the offset: park the caret at the very end.
  caret.selectNodeContents(element);
  caret.collapse(false);
  selection.removeAllRanges();
  selection.addRange(caret);
};
interface PdfJsonEditorViewProps {
data: PdfJsonEditorViewData;
}
@ -61,10 +104,15 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
const { t } = useTranslation();
const [activeGroupId, setActiveGroupId] = useState<string | null>(null);
const [editingGroupId, setEditingGroupId] = useState<string | null>(null);
const [activeImageId, setActiveImageId] = useState<string | null>(null);
const containerRef = useRef<HTMLDivElement | null>(null);
const editorRefs = useRef<Map<string, HTMLDivElement>>(new Map());
const caretOffsetsRef = useRef<Map<string, number>>(new Map());
const {
document: pdfDocument,
groupsByPage,
imagesByPage,
selectedPage,
dirtyPages,
hasDocument,
@ -76,6 +124,8 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
onLoadJson,
onSelectPage,
onGroupEdit,
onImageTransform,
onImageReset,
onReset,
onDownloadJson,
onGeneratePdf,
@ -114,6 +164,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
const pages = pdfDocument?.pages ?? [];
const currentPage = pages[selectedPage] ?? null;
const pageGroups = groupsByPage[selectedPage] ?? [];
const pageImages = imagesByPage[selectedPage] ?? [];
const visibleGroups = useMemo(
() =>
pageGroups.filter((group) => {
@ -123,6 +174,14 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
[editingGroupId, pageGroups]
);
// Copy of the current page's images sorted by ascending zOrder; entries without a zOrder are
// treated as -1_000_000. Recomputed only when `pageImages` changes.
const orderedImages = useMemo(() => {
  const sorted = [...pageImages];
  sorted.sort((first, second) => {
    const firstOrder = first?.zOrder ?? -1_000_000;
    const secondOrder = second?.zOrder ?? -1_000_000;
    return firstOrder - secondOrder;
  });
  return sorted;
}, [pageImages]);
const { width: pageWidth, height: pageHeight } = pageDimensions(currentPage);
const scale = useMemo(() => Math.min(MAX_RENDER_WIDTH / pageWidth, 1.5), [pageWidth]);
const scaledWidth = pageWidth * scale;
@ -131,8 +190,21 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
useEffect(() => {
setActiveGroupId(null);
setEditingGroupId(null);
setActiveImageId(null);
}, [selectedPage]);
useLayoutEffect(() => {
if (!editingGroupId) {
return;
}
const editor = editorRefs.current.get(editingGroupId);
if (!editor) {
return;
}
const offset = caretOffsetsRef.current.get(editingGroupId) ?? editor.innerText.length;
setCaretOffset(editor, offset);
}, [editingGroupId, groupsByPage, imagesByPage]);
useEffect(() => {
if (!editingGroupId) {
return;
@ -160,6 +232,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
const handleBackgroundClick = () => {
setEditingGroupId(null);
setActiveGroupId(null);
setActiveImageId(null);
};
const renderGroupContainer = (
@ -205,6 +278,28 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
</Box>
);
// Converts an on-screen rectangle (CSS pixels, top-left origin) into page units with a
// bottom-left origin, clamps it to the page, and reports it through `onImageTransform`.
// The reported `transform` is always an empty array.
const emitImageTransform = useCallback(
  (imageId: string, leftPx: number, topPx: number, widthPx: number, heightPx: number) => {
    const clamp = (value: number, lower: number, upper: number): number =>
      Math.min(Math.max(value, lower), upper);

    // Sizes never collapse below 0.01 page units.
    const width = Math.max(widthPx / scale, 0.01);
    const height = Math.max(heightPx / scale, 0.01);

    const left = clamp(leftPx / scale, 0, Math.max(pageWidth - width, 0));
    // Flip the vertical axis: screen y grows downwards, page y grows upwards.
    const top = clamp(pageHeight - topPx / scale, Math.min(height, pageHeight), pageHeight);
    const bottom = Math.max(top - height, 0);

    onImageTransform(selectedPage, imageId, { left, bottom, width, height, transform: [] });
  },
  [onImageTransform, pageHeight, pageWidth, scale, selectedPage],
);
return (
<Stack gap="xl" className="h-full" style={{ padding: '1.5rem', overflow: 'auto' }}>
<Card withBorder radius="md" shadow="xs" padding="lg">
@ -341,8 +436,121 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
borderRadius: '0.5rem',
overflow: 'hidden',
}}
ref={containerRef}
>
{visibleGroups.length === 0 ? (
{orderedImages.map((image, imageIndex) => {
if (!image?.imageData) {
return null;
}
const bounds = getImageBounds(image);
const width = Math.max(bounds.right - bounds.left, 1);
const height = Math.max(bounds.top - bounds.bottom, 1);
const cssWidth = Math.max(width * scale, 2);
const cssHeight = Math.max(height * scale, 2);
const cssLeft = bounds.left * scale;
const cssTop = (pageHeight - bounds.top) * scale;
const imageId = image.id ?? `page-${selectedPage}-image-${imageIndex}`;
const isActive = activeImageId === imageId;
const src = `data:image/${image.imageFormat ?? 'png'};base64,${image.imageData}`;
const baseZIndex = (image.zOrder ?? -1_000_000) + 1_050_000;
const zIndex = isActive ? baseZIndex + 1_000_000 : baseZIndex;
return (
<Rnd
key={`image-${imageId}`}
bounds="parent"
size={{ width: cssWidth, height: cssHeight }}
position={{ x: cssLeft, y: cssTop }}
onDragStart={() => {
setActiveGroupId(null);
setEditingGroupId(null);
setActiveImageId(imageId);
}}
onDrag={(event, data) => {
emitImageTransform(
imageId,
data.x,
data.y,
cssWidth,
cssHeight,
);
}}
onDragStop={(event, data) => {
emitImageTransform(
imageId,
data.x,
data.y,
cssWidth,
cssHeight,
);
}}
onResizeStart={() => {
setActiveImageId(imageId);
setActiveGroupId(null);
setEditingGroupId(null);
}}
onResize={(event, _direction, ref, _delta, position) => {
const nextWidth = parseFloat(ref.style.width);
const nextHeight = parseFloat(ref.style.height);
emitImageTransform(
imageId,
position.x,
position.y,
nextWidth,
nextHeight,
);
}}
onResizeStop={(event, _direction, ref, _delta, position) => {
const nextWidth = parseFloat(ref.style.width);
const nextHeight = parseFloat(ref.style.height);
emitImageTransform(
imageId,
position.x,
position.y,
nextWidth,
nextHeight,
);
}}
style={{ zIndex }}
>
<Box
onMouseEnter={() => setActiveImageId(imageId)}
onMouseLeave={() => {
setActiveImageId((current) => (current === imageId ? null : current));
}}
onDoubleClick={(event) => {
event.stopPropagation();
onImageReset(selectedPage, imageId);
}}
style={{
width: '100%',
height: '100%',
cursor: isActive ? 'grabbing' : 'grab',
outline: isActive
? '2px solid rgba(59, 130, 246, 0.9)'
: '1px solid rgba(148, 163, 184, 0.4)',
outlineOffset: '-1px',
borderRadius: 4,
backgroundColor: 'rgba(255,255,255,0.04)',
transition: 'outline 120ms ease',
}}
>
<img
src={src}
alt={t('pdfJsonEditor.imageLabel', 'Placed image')}
style={{
width: '100%',
height: '100%',
objectFit: 'contain',
pointerEvents: 'none',
userSelect: 'none',
}}
/>
</Box>
</Rnd>
);
})}
{visibleGroups.length === 0 && orderedImages.length === 0 ? (
<Group justify="center" align="center" style={{ height: '100%' }}>
<Stack gap={4} align="center">
<Text size="sm" c="dimmed">
@ -373,6 +581,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
justifyContent: 'flex-start',
pointerEvents: 'auto',
cursor: 'text',
zIndex: 2_000_000,
};
if (isEditing) {
@ -383,17 +592,38 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
true,
changed,
<div
ref={(node) => {
if (node) {
editorRefs.current.set(group.id, node);
} else {
editorRefs.current.delete(group.id);
}
}}
contentEditable
suppressContentEditableWarning
data-editor-group={group.id}
onBlur={(event) => {
const value = event.currentTarget.innerText.replace(/\u00A0/g, ' ');
caretOffsetsRef.current.delete(group.id);
editorRefs.current.delete(group.id);
setActiveGroupId(null);
onGroupEdit(group.pageIndex, group.id, value);
setEditingGroupId(null);
}}
onInput={(event) => {
const value = event.currentTarget.innerText.replace(/\u00A0/g, ' ');
const offset = getCaretOffset(event.currentTarget);
caretOffsetsRef.current.set(group.id, offset);
onGroupEdit(group.pageIndex, group.id, value);
requestAnimationFrame(() => {
if (editingGroupId !== group.id) {
return;
}
const editor = editorRefs.current.get(group.id);
if (editor) {
setCaretOffset(editor, caretOffsetsRef.current.get(group.id) ?? editor.innerText.length);
}
});
}}
style={{
width: '100%',

View File

@ -11,6 +11,7 @@ import { downloadBlob, downloadTextAsFile } from '../utils/downloadUtils';
import { getFilenameFromHeaders } from '../utils/fileResponseUtils';
import {
PdfJsonDocument,
PdfJsonImageElement,
TextGroup,
PdfJsonEditorViewData,
} from './pdfJsonEditorTypes';
@ -19,6 +20,9 @@ import {
getDirtyPages,
groupDocumentText,
restoreGlyphElements,
extractDocumentImages,
cloneImageElement,
valueOr,
} from './pdfJsonEditorUtils';
import PdfJsonEditorView from '../components/tools/pdfJsonEditor/PdfJsonEditorView';
@ -46,13 +50,19 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
const [loadedDocument, setLoadedDocument] = useState<PdfJsonDocument | null>(null);
const [groupsByPage, setGroupsByPage] = useState<TextGroup[][]>([]);
const [imagesByPage, setImagesByPage] = useState<PdfJsonImageElement[][]>([]);
const [selectedPage, setSelectedPage] = useState(0);
const [fileName, setFileName] = useState('');
const [errorMessage, setErrorMessage] = useState<string | null>(null);
const [isGeneratingPdf, setIsGeneratingPdf] = useState(false);
const [isConverting, setIsConverting] = useState(false);
const dirtyPages = useMemo(() => getDirtyPages(groupsByPage), [groupsByPage]);
const originalImagesRef = useRef<PdfJsonImageElement[][]>([]);
const dirtyPages = useMemo(
() => getDirtyPages(groupsByPage, imagesByPage, originalImagesRef.current),
[groupsByPage, imagesByPage],
);
const hasChanges = useMemo(() => dirtyPages.some(Boolean), [dirtyPages]);
const hasDocument = loadedDocument !== null;
const viewLabel = useMemo(() => t('pdfJsonEditor.viewLabel', 'PDF Editor'), [t]);
@ -60,12 +70,17 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
/**
 * Rebuilds all editable state (text groups, images, image baseline, selected page)
 * from the given document, or clears it when no document is supplied.
 */
const resetToDocument = useCallback((document: PdfJsonDocument | null) => {
  let nextGroups: TextGroup[][] = [];
  let nextImages: PdfJsonImageElement[][] = [];

  if (document) {
    // Derive editor state from a private copy so the caller's document is not shared.
    const workingCopy = deepCloneDocument(document);
    nextGroups = groupDocumentText(workingCopy);
    nextImages = extractDocumentImages(workingCopy);
  }

  // Snapshot of the untouched images; later compared against to detect edits and to reset.
  originalImagesRef.current = nextImages.map((pageImages) => pageImages.map(cloneImageElement));
  setGroupsByPage(nextGroups);
  setImagesByPage(nextImages);
  setSelectedPage(0);
}, []);
@ -108,6 +123,8 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
console.error('Failed to load file', error);
setLoadedDocument(null);
setGroupsByPage([]);
setImagesByPage([]);
originalImagesRef.current = [];
if (isPdf) {
setErrorMessage(
@ -142,6 +159,80 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
);
}, []);
/**
 * Applies a new rectangle to one image on one page.
 *
 * @param pageIndex index of the page whose image list is updated
 * @param imageId   id of the image to update (a missing id is matched as '')
 * @param next      target rectangle as left/bottom/width/height; `next.transform` is part of the
 *                  callback signature but is not read here, the matrix is rebuilt from the rectangle
 *
 * The page's image array is only replaced when a matching image moved or resized by at least
 * 1e-4, so no-op drag events do not produce a new state array.
 */
const handleImageTransform = useCallback(
  (
    pageIndex: number,
    imageId: string,
    next: { left: number; bottom: number; width: number; height: number; transform: number[] },
  ) => {
    setImagesByPage((previous) =>
      previous.map((images, idx) => {
        if (idx !== pageIndex) {
          return images;
        }
        let changed = false;
        const updated = images.map((image) => {
          if ((image.id ?? '') !== imageId) {
            return image;
          }
          // Prefer the image's current matrix; fall back to the loaded baseline because an
          // earlier edit may have replaced the matrix with null.
          const originalTransform =
            image.transform ??
            originalImagesRef.current[idx]?.find((base) => (base.id ?? '') === imageId)?.transform;
          // Only the sign of the scale terms is kept, so a mirrored image stays mirrored.
          const scaleXSign =
            originalTransform && originalTransform.length >= 6 ? Math.sign(originalTransform[0]) || 1 : 1;
          const scaleYSign =
            originalTransform && originalTransform.length >= 6 ? Math.sign(originalTransform[3]) || 1 : 1;
          const right = next.left + next.width;
          const top = next.bottom + next.height;
          const isMirrored = scaleXSign < 0 || scaleYSign < 0;
          const updatedImage: PdfJsonImageElement = {
            ...image,
            x: next.left,
            y: next.bottom,
            left: next.left,
            bottom: next.bottom,
            right,
            top,
            width: next.width,
            height: next.height,
            transform: isMirrored
              ? [
                  next.width * scaleXSign,
                  0,
                  0,
                  next.height * scaleYSign,
                  // With a negative scale the unit square extends backwards from the translation,
                  // so the translation must sit on the far edge to keep the box at
                  // [left, right] x [bottom, top]. Both axes are handled the same way.
                  scaleXSign >= 0 ? next.left : right,
                  scaleYSign >= 0 ? next.bottom : top,
                ]
              : null,
          };
          const isSame =
            Math.abs(valueOr(image.left, 0) - next.left) < 1e-4 &&
            Math.abs(valueOr(image.bottom, 0) - next.bottom) < 1e-4 &&
            Math.abs(valueOr(image.width, 0) - next.width) < 1e-4 &&
            Math.abs(valueOr(image.height, 0) - next.height) < 1e-4;
          if (!isSame) {
            changed = true;
          }
          return updatedImage;
        });
        return changed ? updated : images;
      }),
    );
  },
  [],
);
/**
 * Restores a single image on a page to the copy captured when the document was loaded.
 * Does nothing when no baseline with that id exists for the page.
 */
const handleImageReset = useCallback((pageIndex: number, imageId: string) => {
  const hasTargetId = (candidate: PdfJsonImageElement): boolean => (candidate.id ?? '') === imageId;

  const baseline = originalImagesRef.current[pageIndex]?.find(hasTargetId);
  if (!baseline) {
    return;
  }

  setImagesByPage((previous) =>
    previous.map((pageImages, idx) =>
      idx === pageIndex
        ? pageImages.map((image) => (hasTargetId(image) ? cloneImageElement(baseline) : image))
        : pageImages,
    ),
  );
}, []);
const handleResetEdits = useCallback(() => {
if (!loadedDocument) {
return;
@ -155,13 +246,18 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
return null;
}
const updatedDocument = restoreGlyphElements(loadedDocument, groupsByPage);
const updatedDocument = restoreGlyphElements(
loadedDocument,
groupsByPage,
imagesByPage,
originalImagesRef.current,
);
const baseName = sanitizeBaseName(fileName || loadedDocument.metadata?.title || undefined);
return {
document: updatedDocument,
filename: `${baseName}.json`,
};
}, [fileName, groupsByPage, loadedDocument]);
}, [fileName, groupsByPage, imagesByPage, loadedDocument]);
const handleDownloadJson = useCallback(() => {
const payload = buildPayload();
@ -229,6 +325,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
const viewData = useMemo<PdfJsonEditorViewData>(() => ({
document: loadedDocument,
groupsByPage,
imagesByPage,
selectedPage,
dirtyPages,
hasDocument,
@ -240,10 +337,14 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
onLoadJson: handleLoadFile,
onSelectPage: handleSelectPage,
onGroupEdit: handleGroupTextChange,
onImageTransform: handleImageTransform,
onImageReset: handleImageReset,
onReset: handleResetEdits,
onDownloadJson: handleDownloadJson,
onGeneratePdf: handleGeneratePdf,
}), [
handleImageTransform,
imagesByPage,
dirtyPages,
errorMessage,
fileName,
@ -251,6 +352,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
handleDownloadJson,
handleGeneratePdf,
handleGroupTextChange,
handleImageReset,
handleLoadFile,
handleResetEdits,
handleSelectPage,

View File

@ -33,6 +33,8 @@ export interface PdfJsonTextElement {
fontSizeInPt?: number | null;
characterSpacing?: number | null;
wordSpacing?: number | null;
spaceWidth?: number | null;
zOrder?: number | null;
horizontalScaling?: number | null;
leading?: number | null;
rise?: number | null;
@ -46,6 +48,26 @@ export interface PdfJsonTextElement {
strokeColor?: PdfJsonTextColor | null;
}
/**
 * One image placed on a page of the JSON representation of a PDF.
 * Every field is optional and may be null; consumers fall back to defaults.
 */
export interface PdfJsonImageElement {
/** Identifier used to match an image across edits; a `page-{p}-image-{i}` id is generated when blank. */
id?: string | null;
// NOTE(review): presumably the resource name of the image in the PDF; not read in the editor code shown here.
objectName?: string | null;
// NOTE(review): presumably marks inline images as opposed to XObjects; confirm against the backend.
inlineImage?: boolean | null;
// NOTE(review): presumably the pixel dimensions of the source image; confirm against the backend.
nativeWidth?: number | null;
nativeHeight?: number | null;
/** Position; used as fallback for `left` / `bottom` when those are missing. */
x?: number | null;
y?: number | null;
/** Size of the placed image; preferred over `right - left` / `top - bottom` when computing bounds. */
width?: number | null;
height?: number | null;
/** Bounding-box edges; `top` is expected to be the larger vertical value (top = bottom + height). */
left?: number | null;
right?: number | null;
top?: number | null;
bottom?: number | null;
/** Placement matrix; when it has at least 6 entries, entries 0 and 3 are read as the X and Y scale. */
transform?: number[] | null;
/** Stacking order; the editor derives the CSS z-index from it. */
zOrder?: number | null;
/** Base64-encoded image bytes, embedded into a `data:image/...;base64,` URL by the editor. */
imageData?: string | null;
/** Image subtype used in the data URL; the editor defaults to `png` when absent. */
imageFormat?: string | null;
}
export interface PdfJsonStream {
dictionary?: Record<string, unknown> | null;
rawData?: string | null;
@ -57,6 +79,7 @@ export interface PdfJsonPage {
height?: number | null;
rotation?: number | null;
textElements?: PdfJsonTextElement[] | null;
imageElements?: PdfJsonImageElement[] | null;
resources?: unknown;
contentStreams?: PdfJsonStream[] | null;
}
@ -107,6 +130,7 @@ export const DEFAULT_PAGE_HEIGHT = 792;
export interface PdfJsonEditorViewData {
document: PdfJsonDocument | null;
groupsByPage: TextGroup[][];
imagesByPage: PdfJsonImageElement[][];
selectedPage: number;
dirtyPages: boolean[];
hasDocument: boolean;
@ -118,6 +142,18 @@ export interface PdfJsonEditorViewData {
onLoadJson: (file: File | null) => Promise<void> | void;
onSelectPage: (pageIndex: number) => void;
onGroupEdit: (pageIndex: number, groupId: string, value: string) => void;
onImageTransform: (
pageIndex: number,
imageId: string,
next: {
left: number;
bottom: number;
width: number;
height: number;
transform: number[];
},
) => void;
onImageReset: (pageIndex: number, imageId: string) => void;
onReset: () => void;
onDownloadJson: () => void;
onGeneratePdf: () => void;

View File

@ -3,6 +3,7 @@ import {
PdfJsonDocument,
PdfJsonPage,
PdfJsonTextElement,
PdfJsonImageElement,
TextGroup,
DEFAULT_PAGE_HEIGHT,
DEFAULT_PAGE_WIDTH,
@ -11,6 +12,9 @@ import {
// NOTE(review): presumably the vertical tolerance for treating glyphs as one line; its use is outside this view.
const LINE_TOLERANCE = 2;
// Fraction of the average font size that contributes to the space-detection threshold.
const GAP_FACTOR = 0.6;
// Absolute lower bound of the space-detection threshold.
const SPACE_MIN_GAP = 1.5;
// Lower and upper clamp for an estimated glyph width, as fractions of the average font size.
const MIN_CHAR_WIDTH_FACTOR = 0.35;
const MAX_CHAR_WIDTH_FACTOR = 1.25;
// Fraction of the spacing threshold that an inferred (advance-based) gap must exceed.
const EXTRA_GAP_RATIO = 0.8;
export const valueOr = (value: number | null | undefined, fallback = 0): number => {
if (value === null || value === undefined || Number.isNaN(value)) {
@ -24,6 +28,11 @@ export const cloneTextElement = (element: PdfJsonTextElement): PdfJsonTextElemen
textMatrix: element.textMatrix ? [...element.textMatrix] : element.textMatrix ?? undefined,
});
/**
 * Returns a shallow copy of an image element whose `transform` array is also copied,
 * so the copy can be edited without touching the source. A null transform becomes undefined.
 */
export const cloneImageElement = (element: PdfJsonImageElement): PdfJsonImageElement => {
  const { transform } = element;
  const transformCopy = transform ? Array.from(transform) : transform ?? undefined;
  return {
    ...element,
    transform: transformCopy,
  };
};
const getBaseline = (element: PdfJsonTextElement): number => {
if (element.textMatrix && element.textMatrix.length === 6) {
return valueOr(element.textMatrix[5]);
@ -71,6 +80,41 @@ const getElementBounds = (element: PdfJsonTextElement): BoundingBox => {
};
};
/**
 * Computes the bounding box of an image element.
 *
 * `left` / `bottom` fall back to `x` / `y`; the far edges fall back to near edge plus size,
 * and the size itself falls back to the (non-negative) distance between the stored edges.
 */
export const getImageBounds = (element: PdfJsonImageElement): BoundingBox => {
  const left = valueOr(element.left ?? element.x, 0);
  const bottom = valueOr(element.bottom ?? element.y, 0);

  // Horizontal extent.
  const widthFromEdges = Math.max(valueOr(element.right, left) - left, 0);
  const resolvedWidth = valueOr(element.width, widthFromEdges);
  const fallbackRight = left + resolvedWidth;
  const right = valueOr(element.right ?? fallbackRight, fallbackRight);

  // Vertical extent.
  const heightFromEdges = Math.max(valueOr(element.top, bottom) - bottom, 0);
  const resolvedHeight = valueOr(element.height, heightFromEdges);
  const fallbackTop = bottom + resolvedHeight;
  const top = valueOr(element.top ?? fallbackTop, fallbackTop);

  return { left, right, bottom, top };
};
/**
 * Picks the best available spacing hint for a text element: the first positive value among
 * `spaceWidth` and `wordSpacing`, otherwise `characterSpacing` clamped to be non-negative.
 */
const getSpacingHint = (element: PdfJsonTextElement): number => {
  const preferredHints = [element.spaceWidth, element.wordSpacing];
  for (let index = 0; index < preferredHints.length; index += 1) {
    const hint = valueOr(preferredHints[index], 0);
    if (hint > 0) {
      return hint;
    }
  }
  return Math.max(valueOr(element.characterSpacing, 0), 0);
};
/**
 * Estimates a glyph width for the element by clamping its measured width to the range
 * [MIN_CHAR_WIDTH_FACTOR, MAX_CHAR_WIDTH_FACTOR] times the average font size.
 */
const estimateCharWidth = (element: PdfJsonTextElement, avgFontSize: number): number => {
  const lowerBound = avgFontSize * MIN_CHAR_WIDTH_FACTOR;
  const upperBound = avgFontSize * MAX_CHAR_WIDTH_FACTOR;
  const measured = getWidth(element);
  const raised = Math.max(measured, lowerBound);
  return Math.min(raised, upperBound);
};
const mergeBounds = (bounds: BoundingBox[]): BoundingBox => {
if (bounds.length === 0) {
return { left: 0, right: 0, top: 0, bottom: 0 };
@ -88,10 +132,32 @@ const mergeBounds = (bounds: BoundingBox[]): BoundingBox => {
/**
 * Decides whether a space belongs between two consecutive text elements on a line.
 *
 * A space is inserted when either
 *  - the visible gap between the end of `prev` and the start of `current` exceeds the spacing
 *    threshold, or
 *  - the gap inferred from the start-to-start advance minus an estimated glyph width exceeds
 *    EXTRA_GAP_RATIO of that threshold (covers elements whose reported width is unreliable).
 *
 * The threshold is the largest of SPACE_MIN_GAP, both elements' spacing hints and
 * GAP_FACTOR times the average font size.
 */
const shouldInsertSpace = (prev: PdfJsonTextElement, current: PdfJsonTextElement): boolean => {
  const prevRight = getX(prev) + getWidth(prev);
  const trailingGap = Math.max(0, getX(current) - prevRight);
  const avgFontSize = (getFontSize(prev) + getFontSize(current)) / 2;
  const baselineAdvance = Math.max(0, getX(current) - getX(prev));
  const charWidthEstimate = estimateCharWidth(prev, avgFontSize);
  const inferredGap = Math.max(0, baselineAdvance - charWidthEstimate);
  const spacingHint = Math.max(
    SPACE_MIN_GAP,
    getSpacingHint(prev),
    getSpacingHint(current),
    avgFontSize * GAP_FACTOR,
  );

  if (trailingGap > spacingHint) {
    return true;
  }
  if (inferredGap > spacingHint * EXTRA_GAP_RATIO) {
    return true;
  }

  // Neither gap measure crossed the threshold (hyphen-terminated fragments included):
  // the elements are treated as adjacent.
  return false;
};
const buildGroupText = (elements: PdfJsonTextElement[]): string => {
@ -212,6 +278,27 @@ export const groupDocumentText = (document: PdfJsonDocument | null | undefined):
return pages.map((page, index) => groupPageTextElements(page, index));
};
/**
 * Returns independent copies of a page's image elements, assigning a positional
 * `page-{pageIndex}-image-{imageIndex}` id to any image whose id is missing or blank.
 * A missing page or image list yields an empty array.
 */
export const extractPageImages = (
  page: PdfJsonPage | null | undefined,
  pageIndex: number,
): PdfJsonImageElement[] =>
  (page?.imageElements ?? []).map((image, imageIndex) => {
    const copy = cloneImageElement(image);
    const trimmedId = copy.id ? copy.id.trim() : '';
    if (trimmedId.length === 0) {
      copy.id = `page-${pageIndex}-image-${imageIndex}`;
    }
    return copy;
  });
/**
 * Collects the image elements of every page of the document, one array per page.
 * A missing document or page list yields an empty array.
 */
export const extractDocumentImages = (
  document: PdfJsonDocument | null | undefined,
): PdfJsonImageElement[][] => {
  const pageList = document?.pages ?? [];
  return pageList.map((currentPage, pageIndex) => extractPageImages(currentPage, pageIndex));
};
export const deepCloneDocument = (document: PdfJsonDocument): PdfJsonDocument => {
if (typeof structuredClone === 'function') {
return structuredClone(document);
@ -277,14 +364,19 @@ const distributeTextAcrossElements = (text: string | undefined, elements: PdfJso
export const buildUpdatedDocument = (
source: PdfJsonDocument,
groupsByPage: TextGroup[][],
imagesByPage: PdfJsonImageElement[][],
): PdfJsonDocument => {
const updated = deepCloneDocument(source);
const pages = updated.pages ?? [];
updated.pages = pages.map((page, pageIndex) => {
const groups = groupsByPage[pageIndex] ?? [];
const images = imagesByPage[pageIndex] ?? [];
if (!groups.length) {
return page;
return {
...page,
imageElements: images.map(cloneImageElement),
};
}
const updatedElements: PdfJsonTextElement[] = groups.flatMap((group) => {
@ -297,6 +389,7 @@ export const buildUpdatedDocument = (
return {
...page,
textElements: updatedElements,
imageElements: images.map(cloneImageElement),
contentStreams: page.contentStreams ?? [],
};
});
@ -307,14 +400,22 @@ export const buildUpdatedDocument = (
export const restoreGlyphElements = (
source: PdfJsonDocument,
groupsByPage: TextGroup[][],
imagesByPage: PdfJsonImageElement[][],
originalImagesByPage: PdfJsonImageElement[][],
): PdfJsonDocument => {
const updated = deepCloneDocument(source);
const pages = updated.pages ?? [];
updated.pages = pages.map((page, pageIndex) => {
const groups = groupsByPage[pageIndex] ?? [];
const images = imagesByPage[pageIndex] ?? [];
const baselineImages = originalImagesByPage[pageIndex] ?? [];
if (!groups.length) {
return page;
return {
...page,
imageElements: images.map(cloneImageElement),
};
}
const rebuiltElements: PdfJsonTextElement[] = [];
@ -327,16 +428,105 @@ export const restoreGlyphElements = (
rebuiltElements.push(...originals);
});
const textDirty = groups.some((group) => group.text !== group.originalText);
const imageDirty = areImageListsDifferent(images, baselineImages);
const nextStreams = textDirty || imageDirty ? [] : page.contentStreams ?? [];
return {
...page,
textElements: rebuiltElements,
contentStreams: page.contentStreams ?? [],
imageElements: images.map(cloneImageElement),
contentStreams: nextStreams,
};
});
return updated;
};
export const getDirtyPages = (groupsByPage: TextGroup[][]): boolean[] => {
return groupsByPage.map((groups) => groups.some((group) => group.text !== group.originalText));
/**
 * Compares two optional numbers within a tolerance (inclusive).
 * Anything that is not a finite number (null, undefined, NaN, +/-Infinity) counts as 0.
 */
const approxEqual = (a: number | null | undefined, b: number | null | undefined, tolerance = 0.25): boolean => {
  const normalise = (candidate: number | null | undefined): number =>
    typeof candidate === 'number' && Number.isFinite(candidate) ? candidate : 0;
  const difference = normalise(a) - normalise(b);
  return Math.abs(difference) <= tolerance;
};
/**
 * Element-wise approximate comparison of two optional number arrays.
 * Two missing arrays are equal; a missing and a present array are not; arrays of
 * different length are not.
 */
const arrayApproxEqual = (
  first: number[] | null | undefined,
  second: number[] | null | undefined,
  tolerance = 0.25,
): boolean => {
  if (!first || !second) {
    // Equal only when both sides are absent.
    return !first && !second;
  }
  if (first.length !== second.length) {
    return false;
  }
  // Advance while entries match; reaching the end means every entry matched.
  let cursor = 0;
  while (cursor < first.length && approxEqual(first[cursor], second[cursor], tolerance)) {
    cursor += 1;
  }
  return cursor === first.length;
};
/**
 * Tells whether two image elements describe the same image at (approximately) the same place:
 * identical data, format and zOrder, and geometry / transform equal within the default tolerance.
 */
const areImageElementsEqual = (
  current: PdfJsonImageElement,
  original: PdfJsonImageElement,
): boolean => {
  if (current === original) {
    return true;
  }
  if (!current || !original) {
    return false;
  }

  // Exact-match fields (missing and null are treated alike).
  if ((current.imageData ?? null) !== (original.imageData ?? null)) {
    return false;
  }
  if ((current.imageFormat ?? null) !== (original.imageFormat ?? null)) {
    return false;
  }
  if ((current.zOrder ?? null) !== (original.zOrder ?? null)) {
    return false;
  }

  // Geometry fields compared with tolerance.
  const geometryKeys = ['x', 'y', 'width', 'height', 'left', 'right', 'top', 'bottom'] as const;
  const sameGeometry = geometryKeys.every((key) => approxEqual(current[key], original[key]));
  if (!sameGeometry) {
    return false;
  }

  return arrayApproxEqual(current.transform, original.transform);
};
/**
 * Returns true when the two image lists differ in length or when any pair of images
 * at the same index is not (approximately) equal.
 */
export const areImageListsDifferent = (
  current: PdfJsonImageElement[],
  original: PdfJsonImageElement[],
): boolean => {
  if (current.length !== original.length) {
    return true;
  }
  const firstMismatch = current.findIndex(
    (image, position) => !areImageElementsEqual(image, original[position]),
  );
  return firstMismatch !== -1;
};
/**
 * Produces one flag per entry of `groupsByPage`: true when any text group on that page was
 * edited or when the page's images differ from their originals.
 */
export const getDirtyPages = (
  groupsByPage: TextGroup[][],
  imagesByPage: PdfJsonImageElement[][],
  originalImagesByPage: PdfJsonImageElement[][],
): boolean[] =>
  groupsByPage.map((pageGroups, pageIndex) => {
    const hasEditedText = pageGroups.some((group) => group.text !== group.originalText);
    if (hasEditedText) {
      return true;
    }
    const currentImages = imagesByPage[pageIndex] ?? [];
    const baselineImages = originalImagesByPage[pageIndex] ?? [];
    return areImageListsDifferent(currentImages, baselineImages);
  });