From 4d9cf450096017f851165ca18544382f051b4dee Mon Sep 17 00:00:00 2001 From: Anthony Stirling <77850077+Frooodle@users.noreply.github.com> Date: Thu, 23 Oct 2025 11:20:16 +0100 Subject: [PATCH] test --- .../converters/ConvertPdfJsonController.java | 9 +- .../SPDF/model/json/PdfJsonCosValue.java | 49 + .../SPDF/model/json/PdfJsonDocument.java | 3 + .../software/SPDF/model/json/PdfJsonFont.java | 34 +- .../model/json/PdfJsonFontCidSystemInfo.java | 20 + .../software/SPDF/model/json/PdfJsonPage.java | 6 + .../SPDF/model/json/PdfJsonStream.java | 27 + .../SPDF/model/json/PdfJsonTextElement.java | 2 + .../service/PdfJsonConversionService.java | 1362 +++++++++++++++-- .../src/main/resources/application.properties | 1 + compare_json.py | 43 + .../public/locales/en-GB/translation.json | 3 + .../tools/pdfJsonEditor/PdfJsonEditorView.tsx | 463 ++++++ frontend/src/constants/convertConstants.ts | 21 +- .../src/constants/convertSupportedFornats.ts | 2 +- .../src/data/useTranslatedToolRegistry.tsx | 14 + frontend/src/tools/PdfJsonEditor.tsx | 289 ++++ frontend/src/tools/pdfJsonEditorTypes.ts | 110 ++ frontend/src/tools/pdfJsonEditorUtils.ts | 344 +++++ frontend/src/types/toolId.ts | 2 +- 20 files changed, 2628 insertions(+), 176 deletions(-) create mode 100644 app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonCosValue.java create mode 100644 app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonFontCidSystemInfo.java create mode 100644 app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonStream.java create mode 100644 compare_json.py create mode 100644 frontend/src/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx create mode 100644 frontend/src/tools/PdfJsonEditor.tsx create mode 100644 frontend/src/tools/pdfJsonEditorTypes.ts create mode 100644 frontend/src/tools/pdfJsonEditorUtils.ts diff --git a/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPdfJsonController.java b/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPdfJsonController.java index 372d0e201..582679dfd 100644 --- a/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPdfJsonController.java +++ b/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPdfJsonController.java @@ -31,7 +31,8 @@ public class ConvertPdfJsonController { description = "Extracts PDF text, fonts, and metadata into an editable JSON structure that can be" + " transformed back into a PDF. Input:PDF Output:JSON Type:SISO") - public ResponseEntity convertPdfToJson(@ModelAttribute PDFFile request) throws Exception { + public ResponseEntity convertPdfToJson(@ModelAttribute PDFFile request) + throws Exception { MultipartFile inputFile = request.getFileInput(); if (inputFile == null) { throw ExceptionUtils.createNullArgumentException("fileInput"); @@ -44,8 +45,7 @@ public class ConvertPdfJsonController { ? Filenames.toSimpleFileName(originalName).replaceFirst("[.][^.]+$", "") : "document"; String docName = baseName + ".json"; - return WebResponseUtils.bytesToWebResponse( - jsonBytes, docName, MediaType.APPLICATION_JSON); + return WebResponseUtils.bytesToWebResponse(jsonBytes, docName, MediaType.APPLICATION_JSON); } @AutoJobPostMapping(consumes = "multipart/form-data", value = "/json/pdf") @@ -55,7 +55,8 @@ public class ConvertPdfJsonController { description = "Rebuilds a PDF from the editable JSON structure generated by the PDF to JSON" + " endpoint. Input:JSON Output:PDF Type:SISO") - public ResponseEntity convertJsonToPdf(@ModelAttribute GeneralFile request) throws Exception { + public ResponseEntity convertJsonToPdf(@ModelAttribute GeneralFile request) + throws Exception { MultipartFile jsonFile = request.getFileInput(); if (jsonFile == null) { throw ExceptionUtils.createNullArgumentException("fileInput"); diff --git a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonCosValue.java b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonCosValue.java new file mode 100644 index 000000000..043414c4b --- /dev/null +++ b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonCosValue.java @@ -0,0 +1,49 @@ +package stirling.software.SPDF.model.json; + +import java.util.List; +import java.util.Map; + +import com.fasterxml.jackson.annotation.JsonInclude; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + +@Data +@Builder +@NoArgsConstructor +@AllArgsConstructor +@JsonInclude(JsonInclude.Include.NON_NULL) +public class PdfJsonCosValue { + + public enum Type { + NULL, + BOOLEAN, + INTEGER, + FLOAT, + NAME, + STRING, + ARRAY, + DICTIONARY, + STREAM + } + + private Type type; + + /** + * Holds the decoded value for primitives (boolean, integer, float, name, string). For name + * values the stored value is the PDF name literal. For string values the content is Base64 + * encoded to safely transport arbitrary binaries. + */ + private Object value; + + /** Reference to nested values for arrays. */ + private List items; + + /** Reference to nested values for dictionaries. */ + private Map entries; + + /** Stream payload when {@code type == STREAM}. */ + private PdfJsonStream stream; +} diff --git a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonDocument.java b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonDocument.java index 805f664ce..3f5bd1f8b 100644 --- a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonDocument.java +++ b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonDocument.java @@ -19,6 +19,9 @@ public class PdfJsonDocument { private PdfJsonMetadata metadata; + /** Optional XMP metadata packet stored as Base64. */ + private String xmpMetadata; + @Builder.Default private List fonts = new ArrayList<>(); @Builder.Default private List pages = new ArrayList<>(); diff --git a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonFont.java b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonFont.java index a0eba01f0..98d251103 100644 --- a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonFont.java +++ b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonFont.java @@ -14,12 +14,42 @@ import lombok.NoArgsConstructor; @JsonInclude(JsonInclude.Include.NON_NULL) public class PdfJsonFont { + /** PDF resource name (e.g. F1) used as the primary identifier. */ private String id; - private String name; + + /** Logical page number that owns this font resource. */ + private Integer pageNumber; + + /** Stable UID combining page number and resource for diagnostics. */ + private String uid; + + /** Reported PostScript/Base font name. */ + private String baseName; + + /** Declared subtype in the COS dictionary. */ private String subtype; + + /** Encoding dictionary or name. */ private String encoding; + + /** CID system info for Type0 fonts. */ + private PdfJsonFontCidSystemInfo cidSystemInfo; + + /** True when the original PDF embedded the font program. */ private Boolean embedded; + + /** Font program bytes (TTF/OTF/CFF/PFB) encoded as Base64. */ + private String program; + + /** Hint describing the font program type (ttf, otf, cff, pfb, etc.). */ + private String programFormat; + + /** ToUnicode stream encoded as Base64 when present. */ + private String toUnicode; + + /** Mapped Standard 14 font name when available. */ private String standard14Name; + + /** Font descriptor flags copied from the source document. */ private Integer fontDescriptorFlags; - private String base64Data; } diff --git a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonFontCidSystemInfo.java b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonFontCidSystemInfo.java new file mode 100644 index 000000000..7ddd20f5f --- /dev/null +++ b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonFontCidSystemInfo.java @@ -0,0 +1,20 @@ +package stirling.software.SPDF.model.json; + +import com.fasterxml.jackson.annotation.JsonInclude; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + +@Data +@Builder +@NoArgsConstructor +@AllArgsConstructor +@JsonInclude(JsonInclude.Include.NON_NULL) +public class PdfJsonFontCidSystemInfo { + + private String registry; + private String ordering; + private Integer supplement; +} diff --git a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonPage.java b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonPage.java index 8a02cc33e..63b614d02 100644 --- a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonPage.java +++ b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonPage.java @@ -23,4 +23,10 @@ public class PdfJsonPage { private Integer rotation; @Builder.Default private List textElements = new ArrayList<>(); + + /** Serialized representation of the page resources dictionary. */ + private PdfJsonCosValue resources; + + /** Raw content streams associated with the page, preserved for lossless round-tripping. */ + @Builder.Default private List contentStreams = new ArrayList<>(); } diff --git a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonStream.java b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonStream.java new file mode 100644 index 000000000..eb8ca66a2 --- /dev/null +++ b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonStream.java @@ -0,0 +1,27 @@ +package stirling.software.SPDF.model.json; + +import java.util.Map; + +import com.fasterxml.jackson.annotation.JsonInclude; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + +@Data +@Builder +@NoArgsConstructor +@AllArgsConstructor +@JsonInclude(JsonInclude.Include.NON_NULL) +public class PdfJsonStream { + + /** + * A dictionary of entries that describe the stream metadata (Filter, DecodeParms, etc). Each + * entry is represented using {@link PdfJsonCosValue} so nested structures are supported. + */ + private Map dictionary; + + /** Raw stream bytes in Base64 form. Data is stored exactly as it appeared in the source PDF. */ + private String rawData; +} diff --git a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonTextElement.java b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonTextElement.java index 97be75234..5c72159aa 100644 --- a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonTextElement.java +++ b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonTextElement.java @@ -20,6 +20,8 @@ public class PdfJsonTextElement { private String text; private String fontId; private Float fontSize; + private Float fontMatrixSize; + private Float fontSizeInPt; private Float x; private Float y; private Float width; diff --git a/app/core/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java b/app/core/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java index c35022fb6..ada32e8b0 100644 --- a/app/core/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java +++ b/app/core/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java @@ -4,28 +4,48 @@ import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; +import java.io.OutputStream; +import java.nio.charset.StandardCharsets; import java.time.Instant; import java.time.format.DateTimeParseException; import java.util.ArrayList; import java.util.Base64; import java.util.Calendar; +import java.util.Collections; +import java.util.Comparator; import java.util.HashMap; +import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Objects; import java.util.Optional; import java.util.TimeZone; +import org.apache.pdfbox.contentstream.operator.Operator; +import org.apache.pdfbox.cos.COSArray; import org.apache.pdfbox.cos.COSBase; +import org.apache.pdfbox.cos.COSBoolean; import org.apache.pdfbox.cos.COSDictionary; +import org.apache.pdfbox.cos.COSFloat; +import org.apache.pdfbox.cos.COSInteger; import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.cos.COSNull; +import org.apache.pdfbox.cos.COSObject; +import org.apache.pdfbox.cos.COSStream; +import org.apache.pdfbox.cos.COSString; +import org.apache.pdfbox.pdfparser.PDFStreamParser; +import org.apache.pdfbox.pdfwriter.ContentStreamWriter; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocumentInformation; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPageContentStream; import org.apache.pdfbox.pdmodel.PDPageContentStream.AppendMode; +import org.apache.pdfbox.pdmodel.PDResources; +import org.apache.pdfbox.pdmodel.common.PDMetadata; import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.pdmodel.common.PDStream; import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.pdmodel.font.PDFontDescriptor; import org.apache.pdfbox.pdmodel.font.PDType0Font; @@ -35,6 +55,9 @@ import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode; import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.TextPosition; import org.apache.pdfbox.util.Matrix; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.core.io.Resource; +import org.springframework.core.io.ResourceLoader; import org.springframework.stereotype.Service; import org.springframework.web.multipart.MultipartFile; @@ -43,10 +66,13 @@ import com.fasterxml.jackson.databind.ObjectMapper; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; +import stirling.software.SPDF.model.json.PdfJsonCosValue; import stirling.software.SPDF.model.json.PdfJsonDocument; import stirling.software.SPDF.model.json.PdfJsonFont; +import stirling.software.SPDF.model.json.PdfJsonFontCidSystemInfo; import stirling.software.SPDF.model.json.PdfJsonMetadata; import stirling.software.SPDF.model.json.PdfJsonPage; +import stirling.software.SPDF.model.json.PdfJsonStream; import stirling.software.SPDF.model.json.PdfJsonTextElement; import stirling.software.common.service.CustomPDFDocumentFactory; import stirling.software.common.util.ExceptionUtils; @@ -58,27 +84,61 @@ public class PdfJsonConversionService { private final CustomPDFDocumentFactory pdfDocumentFactory; private final ObjectMapper objectMapper; + private final ResourceLoader resourceLoader; + + private static final String FALLBACK_FONT_ID = "fallback-noto-sans"; + private static final String DEFAULT_FALLBACK_FONT_LOCATION = + "classpath:/static/fonts/NotoSans-Regular.ttf"; + + @Value("${stirling.pdf.fallback-font:" + DEFAULT_FALLBACK_FONT_LOCATION + "}") + private String fallbackFontLocation; + + private byte[] fallbackFontBytes; public byte[] convertPdfToJson(MultipartFile file) throws IOException { if (file == null) { throw ExceptionUtils.createNullArgumentException("fileInput"); } try (PDDocument document = pdfDocumentFactory.load(file.getInputStream(), true)) { + int totalPages = document.getNumberOfPages(); + log.info("Converting PDF to JSON ({} pages)", totalPages); Map fonts = new LinkedHashMap<>(); Map> textByPage = new LinkedHashMap<>(); - TextCollectingStripper stripper = new TextCollectingStripper(fonts, textByPage); + Map> pageFontResources = new HashMap<>(); + int pageNumber = 1; + for (PDPage page : document.getPages()) { + Map resourceMap = + collectFontsForPage(document, page, pageNumber, fonts); + pageFontResources.put(pageNumber, resourceMap); + log.debug( + "PDF→JSON: collected {} font resources on page {}", + resourceMap.size(), + pageNumber); + pageNumber++; + } + + TextCollectingStripper stripper = + new TextCollectingStripper(document, fonts, textByPage, pageFontResources); stripper.setSortByPosition(true); stripper.getText(document); PdfJsonDocument pdfJson = new PdfJsonDocument(); pdfJson.setMetadata(extractMetadata(document)); - pdfJson.setFonts(new ArrayList<>(fonts.values())); + pdfJson.setXmpMetadata(extractXmpMetadata(document)); + List serializedFonts = new ArrayList<>(fonts.values()); + serializedFonts.sort( + Comparator.comparing( + PdfJsonFont::getUid, Comparator.nullsLast(Comparator.naturalOrder()))); + pdfJson.setFonts(serializedFonts); pdfJson.setPages(extractPages(document, textByPage)); - return objectMapper - .writerWithDefaultPrettyPrinter() - .writeValueAsBytes(pdfJson); + log.info( + "PDF→JSON conversion complete (fonts: {}, pages: {})", + serializedFonts.size(), + pdfJson.getPages().size()); + + return objectMapper.writerWithDefaultPrettyPrinter().writeValueAsBytes(pdfJson); } } @@ -87,21 +147,33 @@ public class PdfJsonConversionService { throw ExceptionUtils.createNullArgumentException("fileInput"); } byte[] jsonBytes = file.getBytes(); - PdfJsonDocument pdfJson = - objectMapper.readValue(jsonBytes, PdfJsonDocument.class); + PdfJsonDocument pdfJson = objectMapper.readValue(jsonBytes, PdfJsonDocument.class); + + List fontModels = pdfJson.getFonts(); + if (fontModels == null) { + fontModels = new ArrayList<>(); + pdfJson.setFonts(fontModels); + } try (PDDocument document = new PDDocument()) { applyMetadata(document, pdfJson.getMetadata()); + applyXmpMetadata(document, pdfJson.getXmpMetadata()); - Map fontMap = buildFontMap(document, pdfJson.getFonts()); - PDFont defaultFont = new PDType1Font(Standard14Fonts.FontName.HELVETICA); + Map fontMap = buildFontMap(document, fontModels); + log.info("Converting JSON to PDF ({} font resources)", fontMap.size()); List pages = pdfJson.getPages(); if (pages == null) { pages = new ArrayList<>(); } + int pageIndex = 0; for (PdfJsonPage pageModel : pages) { + int pageNumberValue = + pageModel.getPageNumber() != null + ? pageModel.getPageNumber() + : pageIndex + 1; + log.info("Reconstructing page {}", pageNumberValue); PDRectangle pageSize = new PDRectangle( safeFloat(pageModel.getWidth(), 612f), @@ -112,37 +184,67 @@ public class PdfJsonConversionService { } document.addPage(page); - List elements = pageModel.getTextElements(); - if (elements == null || elements.isEmpty()) { + applyPageResources(document, page, pageModel.getResources()); + + List preservedStreams = + buildContentStreams(document, pageModel.getContentStreams()); + if (!preservedStreams.isEmpty()) { + page.setContents(preservedStreams); + } + + List elements = + pageModel.getTextElements() != null + ? pageModel.getTextElements() + : new ArrayList<>(); + + boolean fallbackAssigned = + preflightTextElements( + document, fontMap, fontModels, elements, pageNumberValue); + + log.info( + "Page {} preflight complete (elements={}, fallbackApplied={})", + pageNumberValue, + elements.size(), + fallbackAssigned); + + if (elements.stream().anyMatch(el -> FALLBACK_FONT_ID.equals(el.getFontId()))) { + ensureFallbackResource(page, fontMap.get(buildFontKey(-1, FALLBACK_FONT_ID))); + log.info("Page {} uses fallback font for some elements", pageNumberValue); + } + + boolean hasText = !elements.isEmpty(); + boolean rewriteSucceeded = false; + + if (!preservedStreams.isEmpty() && hasText) { + if (fallbackAssigned) { + log.info( + "Skipping token rewrite for page {} because fallback font was applied", + pageNumberValue); + rewriteSucceeded = false; + } else { + log.info("Attempting token rewrite for page {}", pageNumberValue); + rewriteSucceeded = rewriteTextOperators(document, page, elements); + if (!rewriteSucceeded) { + log.info( + "Token rewrite failed for page {}, regenerating text stream", + pageNumberValue); + } else { + log.info("Token rewrite succeeded for page {}", pageNumberValue); + } + } + } + + if (!hasText) { + pageIndex++; continue; } - try (PDPageContentStream contentStream = - new PDPageContentStream( - document, - page, - AppendMode.APPEND, - true, - true)) { - contentStream.beginText(); - for (PdfJsonTextElement element : elements) { - PDFont font = fontMap.getOrDefault(element.getFontId(), defaultFont); - float fontSize = safeFloat(element.getFontSize(), 12f); - contentStream.setFont(font, fontSize); - applyRenderingMode(contentStream, element.getRenderingMode()); - applyTextMatrix(contentStream, element); - try { - contentStream.showText(Objects.toString(element.getText(), "")); - } catch (IllegalArgumentException ex) { - log.debug( - "Falling back to default font for text element due to encoding issue: {}", - ex.getMessage()); - contentStream.setFont(defaultFont, fontSize); - contentStream.showText(Objects.toString(element.getText(), "")); - } - } - contentStream.endText(); + if (!rewriteSucceeded) { + log.info("Regenerating text content for page {}", pageNumberValue); + regenerateTextContent(document, page, elements, fontMap, pageNumberValue); + log.info("Text regeneration complete for page {}", pageNumberValue); } + pageIndex++; } try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { @@ -152,8 +254,320 @@ public class PdfJsonConversionService { } } + private Map collectFontsForPage( + PDDocument document, PDPage page, int pageNumber, Map fonts) + throws IOException { + PDResources resources = page.getResources(); + if (resources == null) { + return Collections.emptyMap(); + } + + Map mapping = new HashMap<>(); + for (COSName resourceName : resources.getFontNames()) { + PDFont font = resources.getFont(resourceName); + if (font == null) { + continue; + } + String fontId = resourceName.getName(); + mapping.put(font, fontId); + + String key = buildFontKey(pageNumber, fontId); + if (!fonts.containsKey(key)) { + fonts.put(key, buildFontModel(document, font, fontId, pageNumber)); + } + } + return mapping; + } + + private String buildFontKey(int pageNumber, String fontId) { + return pageNumber + ":" + fontId; + } + + private String buildFontKey(Integer pageNumber, String fontId) { + int page = pageNumber != null ? pageNumber : -1; + return buildFontKey(page, fontId); + } + + private PdfJsonFont buildFontModel( + PDDocument document, PDFont font, String fontId, int pageNumber) throws IOException { + String subtype = font.getCOSObject().getNameAsString(COSName.SUBTYPE); + String encoding = resolveEncoding(font); + PdfJsonFontCidSystemInfo cidInfo = extractCidSystemInfo(font.getCOSObject()); + boolean embedded = font.isEmbedded(); + FontProgramData programData = embedded ? extractFontProgram(font) : null; + String toUnicode = extractToUnicode(font.getCOSObject()); + String standard14Name = resolveStandard14Name(font); + Integer flags = + font.getFontDescriptor() != null ? font.getFontDescriptor().getFlags() : null; + + return PdfJsonFont.builder() + .id(fontId) + .pageNumber(pageNumber) + .uid(buildFontKey(pageNumber, fontId)) + .baseName(font.getName()) + .subtype(subtype) + .encoding(encoding) + .cidSystemInfo(cidInfo) + .embedded(embedded) + .program(programData != null ? programData.getBase64() : null) + .programFormat(programData != null ? programData.getFormat() : null) + .toUnicode(toUnicode) + .standard14Name(standard14Name) + .fontDescriptorFlags(flags) + .build(); + } + + private boolean preflightTextElements( + PDDocument document, + Map fontMap, + List fontModels, + List elements, + int pageNumber) + throws IOException { + if (elements == null || elements.isEmpty()) { + return false; + } + + PDFont fallbackFont = fontMap.get(buildFontKey(-1, FALLBACK_FONT_ID)); + boolean fallbackApplied = false; + for (PdfJsonTextElement element : elements) { + String text = Objects.toString(element.getText(), ""); + if (text.isEmpty()) { + continue; + } + + PDFont font = fontMap.get(buildFontKey(pageNumber, element.getFontId())); + boolean encodable = false; + if (font != null) { + try { + font.encode(text); + encodable = true; + } catch (IOException | IllegalArgumentException ex) { + log.debug( + "Font {} missing glyphs for text '{}': {}", + element.getFontId(), + text, + ex.getMessage()); + } + } + + if (encodable) { + continue; + } + + element.setFontId(FALLBACK_FONT_ID); + log.info( + "Assigning fallback font to text element on page {} (text='{}')", + pageNumber, + abbreviate(text)); + if (fallbackFont == null) { + fallbackFont = loadFallbackPdfFont(document); + fontMap.put(buildFontKey(-1, FALLBACK_FONT_ID), fallbackFont); + if (fontModels.stream().noneMatch(f -> FALLBACK_FONT_ID.equals(f.getId()))) { + fontModels.add(buildFallbackFontModel()); + } + } + fallbackApplied = true; + } + return fallbackApplied; + } + + private PdfJsonFont buildFallbackFontModel() throws IOException { + byte[] bytes = loadFallbackFontBytes(); + String base64 = Base64.getEncoder().encodeToString(bytes); + return PdfJsonFont.builder() + .id(FALLBACK_FONT_ID) + .uid(FALLBACK_FONT_ID) + .baseName("NotoSans-Regular") + .subtype("TrueType") + .embedded(true) + .program(base64) + .programFormat("ttf") + .build(); + } + + private void ensureFallbackResource(PDPage page, PDFont fallbackFont) { + if (fallbackFont == null) { + return; + } + PDResources resources = page.getResources(); + if (resources == null) { + resources = new PDResources(); + page.setResources(resources); + } + COSName fallbackName = COSName.getPDFName(FALLBACK_FONT_ID); + boolean exists = false; + for (COSName name : resources.getFontNames()) { + if (fallbackName.equals(name)) { + exists = true; + break; + } + } + if (!exists) { + resources.put(fallbackName, fallbackFont); + } + } + + private PDFont loadFallbackPdfFont(PDDocument document) throws IOException { + byte[] bytes = loadFallbackFontBytes(); + try (InputStream stream = new ByteArrayInputStream(bytes)) { + return PDType0Font.load(document, stream, true); + } + } + + private byte[] loadFallbackFontBytes() throws IOException { + if (fallbackFontBytes == null) { + Resource resource = resourceLoader.getResource(fallbackFontLocation); + if (!resource.exists()) { + throw new IOException( + "Fallback font resource not found at " + fallbackFontLocation); + } + try (InputStream inputStream = resource.getInputStream(); + ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + inputStream.transferTo(baos); + fallbackFontBytes = baos.toByteArray(); + } + } + return fallbackFontBytes; + } + + private PdfJsonFontCidSystemInfo extractCidSystemInfo(COSDictionary fontDictionary) { + if (fontDictionary == null) { + return null; + } + COSBase base = fontDictionary.getDictionaryObject(COSName.CIDSYSTEMINFO); + if (!(base instanceof COSDictionary cidDictionary)) { + return null; + } + String registry = cidDictionary.getString(COSName.REGISTRY); + String ordering = cidDictionary.getString(COSName.ORDERING); + int supplementValue = cidDictionary.getInt(COSName.SUPPLEMENT, -1); + if (registry == null && ordering == null && supplementValue < 0) { + return null; + } + PdfJsonFontCidSystemInfo info = new PdfJsonFontCidSystemInfo(); + info.setRegistry(registry); + info.setOrdering(ordering); + if (supplementValue >= 0) { + info.setSupplement(supplementValue); + } + return info; + } + + private FontProgramData extractFontProgram(PDFont font) throws IOException { + PDFontDescriptor descriptor = font.getFontDescriptor(); + if (descriptor == null) { + return null; + } + + PDStream fontFile3 = descriptor.getFontFile3(); + if (fontFile3 != null) { + String subtype = fontFile3.getCOSObject().getNameAsString(COSName.SUBTYPE); + return readFontProgram(fontFile3, subtype != null ? subtype : "fontfile3", false); + } + + PDStream fontFile2 = descriptor.getFontFile2(); + if (fontFile2 != null) { + return readFontProgram(fontFile2, null, true); + } + + PDStream fontFile = descriptor.getFontFile(); + if (fontFile != null) { + return readFontProgram(fontFile, "type1", false); + } + + return null; + } + + private FontProgramData readFontProgram( + PDStream stream, String formatHint, boolean detectTrueType) throws IOException { + try (InputStream inputStream = stream.createInputStream(); + ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + inputStream.transferTo(baos); + byte[] data = baos.toByteArray(); + String format = formatHint; + if (detectTrueType) { + format = detectTrueTypeFormat(data); + } + String base64 = Base64.getEncoder().encodeToString(data); + return new FontProgramData(base64, format); + } + } + + private String detectTrueTypeFormat(byte[] data) { + if (data == null || data.length < 4) { + return "ttf"; + } + String tag = new String(data, 0, 4, StandardCharsets.US_ASCII); + if ("OTTO".equals(tag)) { + return "otf"; + } + if ("true".equals(tag) || "typ1".equals(tag)) { + return "ttf"; + } + int value = + ((data[0] & 0xFF) << 24) + | ((data[1] & 0xFF) << 16) + | ((data[2] & 0xFF) << 8) + | (data[3] & 0xFF); + if (value == 0x00010000) { + return "ttf"; + } + return "ttf"; + } + + private String extractToUnicode(COSDictionary fontDictionary) throws IOException { + if (fontDictionary == null) { + return null; + } + COSBase base = fontDictionary.getDictionaryObject(COSName.TO_UNICODE); + if (!(base instanceof COSStream stream)) { + return null; + } + try (InputStream inputStream = stream.createInputStream(); + ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + inputStream.transferTo(baos); + byte[] data = baos.toByteArray(); + if (data.length == 0) { + return null; + } + return Base64.getEncoder().encodeToString(data); + } + } + + private String resolveEncoding(PDFont font) { + if (font == null) { + return null; + } + COSDictionary dictionary = font.getCOSObject(); + if (dictionary == null) { + return null; + } + COSBase encoding = dictionary.getDictionaryObject(COSName.ENCODING); + if (encoding instanceof COSName name) { + return name.getName(); + } + if (encoding instanceof COSDictionary encodingDictionary) { + return encodingDictionary.getNameAsString(COSName.BASE_ENCODING); + } + return null; + } + + private String resolveStandard14Name(PDFont font) { + if (font == null) { + return null; + } + try { + Standard14Fonts.FontName mapped = Standard14Fonts.getMappedFontName(font.getName()); + return mapped != null ? mapped.getName() : null; + } catch (IllegalArgumentException ex) { + return null; + } + } + private List extractPages( - PDDocument document, Map> textByPage) { + PDDocument document, Map> textByPage) + throws IOException { List pages = new ArrayList<>(); int pageIndex = 0; for (PDPage page : document.getPages()) { @@ -164,6 +578,9 @@ public class PdfJsonConversionService { pageModel.setHeight(mediaBox.getHeight()); pageModel.setRotation(page.getRotation()); pageModel.setTextElements(textByPage.getOrDefault(pageIndex + 1, new ArrayList<>())); + pageModel.setResources( + serializeCosValue(page.getCOSObject().getDictionaryObject(COSName.RESOURCES))); + pageModel.setContentStreams(extractContentStreams(page)); pages.add(pageModel); pageIndex++; } @@ -188,6 +605,28 @@ public class PdfJsonConversionService { return metadata; } + private String extractXmpMetadata(PDDocument document) { + if (document.getDocumentCatalog() == null) { + return null; + } + PDMetadata metadata = document.getDocumentCatalog().getMetadata(); + if (metadata == null) { + return null; + } + try (InputStream inputStream = metadata.createInputStream(); + ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + inputStream.transferTo(baos); + byte[] data = baos.toByteArray(); + if (data.length == 0) { + return null; + } + return Base64.getEncoder().encodeToString(data); + } catch (IOException ex) { + log.debug("Failed to extract XMP metadata: {}", ex.getMessage()); + return null; + } + } + private void applyMetadata(PDDocument document, PdfJsonMetadata metadata) { if (metadata == null) { return; @@ -210,63 +649,738 @@ public class PdfJsonConversionService { info.setTrapped(metadata.getTrapped()); } + private void applyXmpMetadata(PDDocument document, String base64) { + if (base64 == null || base64.isBlank()) { + return; + } + try (InputStream inputStream = + new ByteArrayInputStream(Base64.getDecoder().decode(base64))) { + PDMetadata metadata = new PDMetadata(document, inputStream); + document.getDocumentCatalog().setMetadata(metadata); + } catch (IllegalArgumentException | IOException ex) { + log.debug("Failed to apply XMP metadata: {}", ex.getMessage()); + } + } + + private void applyPageResources( + PDDocument document, PDPage page, PdfJsonCosValue resourcesModel) throws IOException { + if (resourcesModel == null) { + return; + } + COSBase base = deserializeCosValue(resourcesModel, document); + if (base instanceof COSDictionary dictionary) { + page.setResources(new PDResources(dictionary)); + } + } + + private List buildContentStreams( + PDDocument document, List streamModels) throws IOException { + List streams = new ArrayList<>(); + if (streamModels == null) { + return streams; + } + for (PdfJsonStream streamModel : streamModels) { + if (streamModel == null) { + continue; + } + COSStream cosStream = buildStreamFromModel(streamModel, document); + if (cosStream != null) { + streams.add(new PDStream(cosStream)); + } + } + return streams; + } + + private List extractContentStreams(PDPage page) throws IOException { + List streams = new ArrayList<>(); + Iterator iterator = page.getContentStreams(); + if (iterator == null) { + return streams; + } + while (iterator.hasNext()) { + PDStream stream = iterator.next(); + PdfJsonStream model = serializeStream(stream); + if (model != null) { + streams.add(model); + } + } + return streams; + } + + private COSStream buildStreamFromModel(PdfJsonStream streamModel, PDDocument document) + throws IOException { + COSStream cosStream = document.getDocument().createCOSStream(); + if (streamModel.getDictionary() != null) { + for (Map.Entry entry : + streamModel.getDictionary().entrySet()) { + COSName key = COSName.getPDFName(entry.getKey()); + COSBase value = deserializeCosValue(entry.getValue(), document); + if (value != null) { + cosStream.setItem(key, value); + } + } + } + String rawData = streamModel.getRawData(); + if (rawData != null && !rawData.isBlank()) { + byte[] data; + try { + data = Base64.getDecoder().decode(rawData); + } catch (IllegalArgumentException ex) { + log.debug("Invalid base64 content stream data: {}", ex.getMessage()); + data = new byte[0]; + } + try (OutputStream outputStream = cosStream.createRawOutputStream()) { + outputStream.write(data); + } + cosStream.setItem(COSName.LENGTH, COSInteger.get(data.length)); + } else { + cosStream.setItem(COSName.LENGTH, COSInteger.get(0)); + } + return cosStream; + } + + private PdfJsonStream serializeStream(PDStream stream) throws IOException { + if (stream == null) { + return null; + } + return serializeStream(stream.getCOSObject()); + } + + private PdfJsonStream serializeStream(COSStream cosStream) throws IOException { + if (cosStream == null) { + return null; + } + Map dictionary = new LinkedHashMap<>(); + for (COSName key : cosStream.keySet()) { + COSBase value = cosStream.getDictionaryObject(key); + PdfJsonCosValue serialized = serializeCosValue(value); + if (serialized != null) { + dictionary.put(key.getName(), serialized); + } + } + String rawData = null; + try (InputStream inputStream = cosStream.createRawInputStream(); + ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + if (inputStream != null) { + inputStream.transferTo(baos); + } + byte[] data = baos.toByteArray(); + if (data.length > 0) { + rawData = Base64.getEncoder().encodeToString(data); + } + } + return PdfJsonStream.builder().dictionary(dictionary).rawData(rawData).build(); + } + + private PdfJsonCosValue serializeCosValue(COSBase base) throws IOException { + if (base == null) { + return null; + } + if (base instanceof COSObject cosObject) { + base = cosObject.getObject(); + if (base == null) { + return null; + } + } + PdfJsonCosValue.PdfJsonCosValueBuilder builder = PdfJsonCosValue.builder(); + if (base instanceof COSNull) { + builder.type(PdfJsonCosValue.Type.NULL); + return builder.build(); + } + if (base instanceof COSBoolean booleanValue) { + builder.type(PdfJsonCosValue.Type.BOOLEAN).value(booleanValue.getValue()); + return builder.build(); + } + if (base instanceof COSInteger integer) { + builder.type(PdfJsonCosValue.Type.INTEGER).value(integer.longValue()); + return builder.build(); + } + if (base instanceof COSFloat floatValue) { + builder.type(PdfJsonCosValue.Type.FLOAT).value(floatValue.floatValue()); + return builder.build(); + } + if (base instanceof COSName name) { + builder.type(PdfJsonCosValue.Type.NAME).value(name.getName()); + return builder.build(); + } + if (base instanceof COSString cosString) { + builder.type(PdfJsonCosValue.Type.STRING) + .value(Base64.getEncoder().encodeToString(cosString.getBytes())); + return builder.build(); + } + if (base instanceof COSArray array) { + List items = new ArrayList<>(array.size()); + for (COSBase item : array) { + PdfJsonCosValue serialized = serializeCosValue(item); + items.add(serialized); + } + builder.type(PdfJsonCosValue.Type.ARRAY).items(items); + return builder.build(); + } + if (base instanceof COSStream stream) { + builder.type(PdfJsonCosValue.Type.STREAM).stream(serializeStream(stream)); + return builder.build(); + } + if (base instanceof COSDictionary dictionary) { + Map entries = new LinkedHashMap<>(); + for (COSName key : dictionary.keySet()) { + PdfJsonCosValue serialized = serializeCosValue(dictionary.getDictionaryObject(key)); + entries.put(key.getName(), serialized); + } + builder.type(PdfJsonCosValue.Type.DICTIONARY).entries(entries); + return builder.build(); + } + return null; + } + + private COSBase deserializeCosValue(PdfJsonCosValue value, PDDocument document) + throws IOException { + if (value == null || value.getType() == null) { + return null; + } + switch (value.getType()) { + case NULL: + return COSNull.NULL; + case BOOLEAN: + if (value.getValue() instanceof Boolean bool) { + return COSBoolean.getBoolean(bool); + } + return null; + case INTEGER: + if (value.getValue() instanceof Number number) { + return COSInteger.get(number.longValue()); + } + return null; + case FLOAT: + if (value.getValue() instanceof Number number) { + return new COSFloat(number.floatValue()); + } + return null; + case NAME: + if (value.getValue() instanceof String name) { + return COSName.getPDFName(name); + } + return null; + case STRING: + if (value.getValue() instanceof String encoded) { + try { + byte[] bytes = Base64.getDecoder().decode(encoded); + return new COSString(bytes); + } catch (IllegalArgumentException ex) { + log.debug("Failed to decode COSString value: {}", ex.getMessage()); + } + } + return null; + case ARRAY: + COSArray array = new COSArray(); + if (value.getItems() != null) { + for (PdfJsonCosValue item : value.getItems()) { + COSBase entry = deserializeCosValue(item, document); + if (entry != null) { + array.add(entry); + } else { + array.add(COSNull.NULL); + } + } + } + return array; + case DICTIONARY: + COSDictionary dictionary = new COSDictionary(); + if (value.getEntries() != null) { + for (Map.Entry entry : value.getEntries().entrySet()) { + COSName key = COSName.getPDFName(entry.getKey()); + COSBase entryValue = deserializeCosValue(entry.getValue(), document); + if (entryValue != null) { + dictionary.setItem(key, entryValue); + } + } + } + return dictionary; + case STREAM: + if (value.getStream() != null) { + return buildStreamFromModel(value.getStream(), document); + } + return null; + default: + return null; + } + } + + private void regenerateTextContent( + PDDocument document, + PDPage page, + List elements, + Map fontMap, + int pageNumber) + throws IOException { + try (PDPageContentStream contentStream = + new PDPageContentStream(document, page, AppendMode.OVERWRITE, true, true)) { + boolean textOpen = false; + for (PdfJsonTextElement element : elements) { + PDFont font = fontMap.get(buildFontKey(pageNumber, element.getFontId())); + if (font == null && FALLBACK_FONT_ID.equals(element.getFontId())) { + font = fontMap.get(buildFontKey(-1, FALLBACK_FONT_ID)); + } + float fontScale = safeFloat(element.getFontMatrixSize(), 0f); + if (fontScale == 0f) { + fontScale = safeFloat(element.getFontSize(), 12f); + } + String text = Objects.toString(element.getText(), ""); + + if (font != null) { + try { + encodeWithTest(font, text); + } catch (IOException | IllegalArgumentException ex) { + log.debug( + "Edited text contains glyphs missing from font {} ({}), switching to fallback", + element.getFontId(), + ex.getMessage()); + font = fontMap.get(buildFontKey(-1, FALLBACK_FONT_ID)); + element.setFontId(FALLBACK_FONT_ID); + if (font == null) { + font = loadFallbackPdfFont(document); + fontMap.put(buildFontKey(-1, FALLBACK_FONT_ID), font); + } + encodeWithTest(font, text); + } + } else { + element.setFontId(FALLBACK_FONT_ID); + font = fontMap.get(buildFontKey(-1, FALLBACK_FONT_ID)); + if (font == null) { + font = loadFallbackPdfFont(document); + fontMap.put(buildFontKey(-1, FALLBACK_FONT_ID), font); + } + encodeWithTest(font, text); + } + + if (!textOpen) { + contentStream.beginText(); + textOpen = true; + } + + contentStream.setFont(font, fontScale); + applyRenderingMode(contentStream, element.getRenderingMode()); + applyTextMatrix(contentStream, element); + contentStream.showText(text); + } + if (textOpen) { + contentStream.endText(); + } + } + } + + private void encodeWithTest(PDFont font, String text) throws IOException { + if (text == null || text.isEmpty()) { + return; + } + font.encode(text); + } + + private String abbreviate(String value) { + if (value == null) { + return ""; + } + String trimmed = value.replaceAll("\s+", " ").trim(); + if (trimmed.length() <= 32) { + return trimmed; + } + return trimmed.substring(0, 29) + "..."; + } + + private static class FontProgramData { + private final String base64; + private final String format; + + private FontProgramData(String base64, String format) { + this.base64 = base64; + this.format = format; + } + + private String getBase64() { + return base64; + } + + private String getFormat() { + return format; + } + } + + private boolean rewriteTextOperators( + PDDocument document, PDPage page, List elements) { + if (elements == null || elements.isEmpty()) { + return true; + } + PDResources resources = page.getResources(); + if (resources == null) { + return false; + } + try { + log.debug("Attempting token-level rewrite for page"); + PDFStreamParser parser = new PDFStreamParser(page); + List tokens = parser.parse(); + log.debug("Parsed {} tokens for rewrite", tokens.size()); + TextElementCursor cursor = new TextElementCursor(elements); + PDFont currentFont = null; + String currentFontName = null; + + for (int i = 0; i < tokens.size(); i++) { + Object token = tokens.get(i); + if (!(token instanceof Operator operator)) { + continue; + } + String operatorName = operator.getName(); + switch (operatorName) { + case "Tf": + if (i >= 2 && tokens.get(i - 2) instanceof COSName fontResourceName) { + currentFont = resources.getFont(fontResourceName); + currentFontName = fontResourceName.getName(); + log.trace( + "Encountered Tf operator; switching to font resource {}", + currentFontName); + } else { + currentFont = null; + currentFontName = null; + log.debug( + "Tf operator missing resource operand; clearing current font"); + } + break; + case "Tj": + if (i == 0 || !(tokens.get(i - 1) instanceof COSString cosString)) { + log.debug( + "Encountered Tj without preceding string operand; aborting rewrite"); + return false; + } + log.trace("Rewriting Tj operator using font {}", currentFontName); + if (!rewriteShowText(cosString, currentFont, currentFontName, cursor)) { + log.debug("Failed to rewrite Tj operator; aborting rewrite"); + return false; + } + break; + case "TJ": + if (i == 0 || !(tokens.get(i - 1) instanceof COSArray array)) { + log.debug("Encountered TJ without array operand; aborting rewrite"); + return false; + } + log.trace("Rewriting TJ operator using font {}", currentFontName); + if (!rewriteShowTextArray(array, currentFont, currentFontName, cursor)) { + log.debug("Failed to rewrite TJ operator; aborting rewrite"); + return false; + } + break; + default: + break; + } + } + + if (cursor.hasRemaining()) { + log.debug("Rewrite cursor still has {} elements; falling back", cursor.remaining()); + return false; + } + + PDStream newStream = new PDStream(document); + try (OutputStream outputStream = newStream.createOutputStream(COSName.FLATE_DECODE)) { + new ContentStreamWriter(outputStream).writeTokens(tokens); + } + page.setContents(newStream); + log.debug("Token rewrite completed successfully"); + return true; + } catch (IOException ex) { + log.debug("Failed to rewrite content stream: {}", ex.getMessage()); + return false; + } + } + + private boolean rewriteShowText( + COSString cosString, PDFont font, String expectedFontName, TextElementCursor cursor) + throws IOException { + if (font == null) { + return false; + } + int glyphCount = countGlyphs(cosString, font); + List consumed = cursor.consume(expectedFontName, glyphCount); + if (consumed == null) { + return false; + } + String replacement = mergeText(consumed); + try { + byte[] encoded = font.encode(replacement); + cosString.setValue(encoded); + return true; + } catch (IOException | IllegalArgumentException ex) { + log.debug("Failed to encode replacement text: {}", ex.getMessage()); + return false; + } + } + + private boolean rewriteShowTextArray( + COSArray array, PDFont font, String expectedFontName, TextElementCursor cursor) + throws IOException { + if (font == null) { + return false; + } + for (int i = 0; i < array.size(); i++) { + COSBase element = array.get(i); + if (element instanceof COSString cosString) { + int glyphCount = countGlyphs(cosString, font); + List consumed = cursor.consume(expectedFontName, glyphCount); + if (consumed == null) { + return false; + } + String replacement = mergeText(consumed); + try { + byte[] encoded = font.encode(replacement); + array.set(i, new COSString(encoded)); + } catch (IOException | IllegalArgumentException ex) { + log.debug("Failed to encode replacement text in TJ array: {}", ex.getMessage()); + return false; + } + } + } + return true; + } + + private int countGlyphs(COSString value, PDFont font) { + if (value == null) { + return 0; + } + if (font != null) { + try (InputStream inputStream = new ByteArrayInputStream(value.getBytes())) { + int count = 0; + int code; + while ((code = font.readCode(inputStream)) != -1) { + count++; + } + if (count > 0) { + return count; + } + } catch (IOException ex) { + log.debug("Failed to decode glyphs: {}", ex.getMessage()); + } + } + byte[] bytes = value.getBytes(); + return Math.max(1, bytes.length); + } + + private String mergeText(List elements) { + StringBuilder builder = new StringBuilder(); + for (PdfJsonTextElement element : elements) { + builder.append(Objects.toString(element.getText(), "")); + } + return builder.toString(); + } + + private static class TextElementCursor { + private final List elements; + private int index = 0; + + TextElementCursor(List elements) { + this.elements = elements; + } + + boolean hasRemaining() { + return index < elements.size(); + } + + int remaining() { + return Math.max(0, elements.size() - index); + } + + List consume(String expectedFontName, int glyphCount) { + if (glyphCount <= 0) { + return Collections.emptyList(); + } + List consumed = new ArrayList<>(); + int remaining = glyphCount; + while (remaining > 0 && index < elements.size()) { + PdfJsonTextElement element = elements.get(index); + if (!fontMatches(expectedFontName, element.getFontId())) { + return null; + } + consumed.add(element); + remaining -= countGlyphs(element); + index++; + } + if (remaining > 0) { + return null; + } + return consumed; + } + + private boolean fontMatches(String expected, String actual) { + if (expected == null || expected.isEmpty()) { + return true; + } + if (actual == null) { + return false; + } + return Objects.equals(expected, actual); + } + + private int countGlyphs(PdfJsonTextElement element) { + String text = element.getText(); + if (text != null && !text.isEmpty()) { + return Math.max(1, text.codePointCount(0, text.length())); + } + return 1; + } + } + private Map buildFontMap(PDDocument document, List fonts) throws IOException { Map fontMap = new HashMap<>(); - if (fonts == null) { - return fontMap; - } - for (PdfJsonFont fontModel : fonts) { - PDFont font = createFontFromModel(document, fontModel); - if (font != null && fontModel.getId() != null) { - fontMap.put(fontModel.getId(), font); + if (fonts != null) { + for (PdfJsonFont fontModel : fonts) { + if (FALLBACK_FONT_ID.equals(fontModel.getId())) { + continue; + } + PDFont loadedFont = createFontFromModel(document, fontModel); + if (loadedFont != null && fontModel.getId() != null) { + fontMap.put( + buildFontKey(fontModel.getPageNumber(), fontModel.getId()), loadedFont); + } } } + + boolean fallbackPresent = + fonts != null && fonts.stream().anyMatch(f -> FALLBACK_FONT_ID.equals(f.getId())); + if (!fallbackPresent) { + PdfJsonFont fallbackModel = buildFallbackFontModel(); + if (fonts != null) { + fonts.add(fallbackModel); + log.info("Added fallback font definition to JSON font list"); + } + PDFont fallbackFont = createFontFromModel(document, fallbackModel); + fontMap.put(buildFontKey(-1, FALLBACK_FONT_ID), fallbackFont); + } else if (!fontMap.containsKey(buildFontKey(-1, FALLBACK_FONT_ID))) { + PdfJsonFont fallbackModel = + fonts.stream() + .filter(f -> FALLBACK_FONT_ID.equals(f.getId())) + .findFirst() + .orElse(buildFallbackFontModel()); + PDFont fallbackFont = createFontFromModel(document, fallbackModel); + fontMap.put(buildFontKey(-1, FALLBACK_FONT_ID), fallbackFont); + } + return fontMap; } private PDFont createFontFromModel(PDDocument document, PdfJsonFont fontModel) throws IOException { - if (fontModel == null) { + if (fontModel == null || fontModel.getId() == null) { return null; } - String base64 = fontModel.getBase64Data(); - if (base64 != null && !base64.isBlank()) { - byte[] fontBytes = Base64.getDecoder().decode(base64); - try (InputStream fontStream = new ByteArrayInputStream(fontBytes)) { - return PDType0Font.load(document, fontStream, true); + + if (FALLBACK_FONT_ID.equals(fontModel.getId())) { + return loadFallbackPdfFont(document); + } + + String program = fontModel.getProgram(); + if (program != null && !program.isBlank()) { + byte[] fontBytes = Base64.getDecoder().decode(program); + String format = + fontModel.getProgramFormat() != null + ? fontModel.getProgramFormat().toLowerCase(Locale.ROOT) + : ""; + try { + if (isType1Format(format)) { + try (InputStream stream = new ByteArrayInputStream(fontBytes)) { + PDFont font = new PDType1Font(document, stream); + applyAdditionalFontMetadata(document, font, fontModel); + return font; + } + } + try (InputStream stream = new ByteArrayInputStream(fontBytes)) { + PDFont font = PDType0Font.load(document, stream, true); + applyAdditionalFontMetadata(document, font, fontModel); + return font; + } } catch (IOException ex) { log.debug( - "Unable to load font as Type0 ({}): {}", - fontModel.getName(), + "Unable to load embedded font program for {}: {}", + fontModel.getId(), ex.getMessage()); } } + String standardName = fontModel.getStandard14Name(); if (standardName != null) { try { - Standard14Fonts.FontName fontName = - Standard14Fonts.getMappedFontName(standardName); - return new PDType1Font(fontName); + Standard14Fonts.FontName fontName = Standard14Fonts.getMappedFontName(standardName); + if (fontName != null) { + PDFont font = new PDType1Font(fontName); + applyAdditionalFontMetadata(document, font, fontModel); + return font; + } + log.warn( + "Standard 14 font mapping for {} returned null, using fallback", + standardName); } catch (IllegalArgumentException ex) { - log.warn("Unknown Standard 14 font {}, using Helvetica", standardName); + log.warn("Unknown Standard 14 font {}, using fallback", standardName); } } - return new PDType1Font(Standard14Fonts.FontName.HELVETICA); + + PDFont fallback = loadFallbackPdfFont(document); + applyAdditionalFontMetadata(document, fallback, fontModel); + return fallback; + } + + private boolean isType1Format(String format) { + if (format == null) { + return false; + } + return "type1".equals(format) || format.endsWith("pfb"); + } + + private void applyAdditionalFontMetadata( + PDDocument document, PDFont font, PdfJsonFont fontModel) throws IOException { + if (fontModel.getToUnicode() != null && !fontModel.getToUnicode().isBlank()) { + byte[] bytes = Base64.getDecoder().decode(fontModel.getToUnicode()); + PDStream toUnicodeStream = new PDStream(document); + try (OutputStream outputStream = toUnicodeStream.createOutputStream()) { + outputStream.write(bytes); + } + font.getCOSObject().setItem(COSName.TO_UNICODE, toUnicodeStream.getCOSObject()); + } + + PdfJsonFontCidSystemInfo cidInfo = fontModel.getCidSystemInfo(); + if (cidInfo != null) { + COSDictionary cidDictionary = new COSDictionary(); + if (cidInfo.getRegistry() != null) { + cidDictionary.setString(COSName.REGISTRY, cidInfo.getRegistry()); + } + if (cidInfo.getOrdering() != null) { + cidDictionary.setString(COSName.ORDERING, cidInfo.getOrdering()); + } + if (cidInfo.getSupplement() != null) { + cidDictionary.setInt(COSName.SUPPLEMENT, cidInfo.getSupplement()); + } + font.getCOSObject().setItem(COSName.CIDSYSTEMINFO, cidDictionary); + } } private void applyTextMatrix(PDPageContentStream contentStream, PdfJsonTextElement element) throws IOException { List matrix = element.getTextMatrix(); if (matrix != null && matrix.size() == 6) { - contentStream.setTextMatrix( - new Matrix( - matrix.get(0), - matrix.get(1), - matrix.get(2), - matrix.get(3), - matrix.get(4), - matrix.get(5))); + float fontScale = safeFloat(element.getFontMatrixSize(), 0f); + if (fontScale == 0f) { + fontScale = safeFloat(element.getFontSize(), 1f); + } + float a = matrix.get(0); + float b = matrix.get(1); + float c = matrix.get(2); + float d = matrix.get(3); + float e = matrix.get(4); + float f = matrix.get(5); + + if (fontScale != 0f) { + a /= fontScale; + b /= fontScale; + c /= fontScale; + d /= fontScale; + } + + contentStream.setTextMatrix(new Matrix(a, b, c, d, e, f)); return; } float x = safeFloat(element.getX(), 0f); @@ -287,8 +1401,7 @@ public class PdfJsonConversionService { try { contentStream.setRenderingMode(mode); } catch (IllegalArgumentException ex) { - log.debug( - "Failed to apply rendering mode {}: {}", renderingMode, ex.getMessage()); + log.debug("Failed to apply rendering mode {}: {}", renderingMode, ex.getMessage()); } } @@ -323,21 +1436,32 @@ public class PdfJsonConversionService { private class TextCollectingStripper extends PDFTextStripper { + private final PDDocument document; private final Map fonts; private final Map> textByPage; + private final Map> pageFontResources; + private int currentPage = 1; + private Map currentFontResources = Collections.emptyMap(); TextCollectingStripper( - Map fonts, Map> textByPage) + PDDocument document, + Map fonts, + Map> textByPage, + Map> pageFontResources) throws IOException { + this.document = document; this.fonts = fonts; this.textByPage = textByPage; + this.pageFontResources = pageFontResources; } @Override protected void startPage(PDPage page) throws IOException { super.startPage(page); currentPage = getCurrentPageNo(); + currentFontResources = + pageFontResources.getOrDefault(currentPage, Collections.emptyMap()); } @Override @@ -356,6 +1480,7 @@ public class PdfJsonConversionService { element.setText(position.getUnicode()); element.setFontId(fontId); element.setFontSize(position.getFontSizeInPt()); + element.setFontMatrixSize(position.getFontSize()); element.setX(position.getXDirAdj()); element.setY(position.getYDirAdj()); element.setWidth(position.getWidthDirAdj()); @@ -381,102 +1506,15 @@ public class PdfJsonConversionService { } private String registerFont(PDFont font) throws IOException { - String id = font.getName(); - if (!fonts.containsKey(id)) { - PdfJsonFont fontModel = new PdfJsonFont(); - fontModel.setId(id); - fontModel.setName(font.getName()); - fontModel.setSubtype(resolveSubtype(font)); - fontModel.setEncoding(resolveEncoding(font)); - fontModel.setEmbedded(!isStandard14Font(font)); - fontModel.setStandard14Name(resolveStandard14Name(font)); - fontModel.setFontDescriptorFlags( - font.getFontDescriptor() != null - ? font.getFontDescriptor().getFlags() - : null); - fontModel.setBase64Data(extractFontData(font)); - fonts.put(id, fontModel); + String fontId = currentFontResources.get(font); + if (fontId == null || fontId.isBlank()) { + fontId = font.getName(); } - return id; - } - - private String resolveStandard14Name(PDFont font) { - if (font == null) { - return null; - } - if (isStandard14Font(font)) { - return font.getName(); - } - try { - Standard14Fonts.FontName mapped = - Standard14Fonts.getMappedFontName(font.getName()); - return mapped.getName(); - } catch (IllegalArgumentException ex) { - return null; - } - } - - private String extractFontData(PDFont font) throws IOException { - if (font == null || isStandard14Font(font)) { - return null; - } - PDFontDescriptor descriptor = font.getFontDescriptor(); - if (descriptor == null) { - return null; - } - org.apache.pdfbox.pdmodel.common.PDStream fontStream = descriptor.getFontFile(); - if (fontStream == null) { - fontStream = descriptor.getFontFile2(); - } - if (fontStream == null) { - fontStream = descriptor.getFontFile3(); - } - if (fontStream == null) { - return null; - } - try (InputStream inputStream = fontStream.createInputStream(); - ByteArrayOutputStream baos = new ByteArrayOutputStream()) { - inputStream.transferTo(baos); - return Base64.getEncoder().encodeToString(baos.toByteArray()); - } - } - - private String resolveSubtype(PDFont font) { - if (font == null) { - return null; - } - COSDictionary dictionary = font.getCOSObject(); - return dictionary != null ? dictionary.getNameAsString(COSName.SUBTYPE) : null; - } - - private String resolveEncoding(PDFont font) { - if (font == null) { - return null; - } - COSDictionary dictionary = font.getCOSObject(); - if (dictionary == null) { - return null; - } - COSBase encoding = dictionary.getDictionaryObject(COSName.ENCODING); - if (encoding instanceof COSName) { - return ((COSName) encoding).getName(); - } - if (encoding instanceof COSDictionary) { - return ((COSDictionary) encoding).getNameAsString(COSName.BASE_ENCODING); - } - return null; - } - - private boolean isStandard14Font(PDFont font) { - if (font == null) { - return false; - } - try { - Standard14Fonts.getMappedFontName(font.getName()); - return true; - } catch (IllegalArgumentException ex) { - return false; + String key = buildFontKey(currentPage, fontId); + if (!fonts.containsKey(key)) { + fonts.put(key, buildFontModel(document, font, fontId, currentPage)); } + return fontId; } } diff --git a/app/core/src/main/resources/application.properties b/app/core/src/main/resources/application.properties index 1f4e831df..77b1e88dc 100644 --- a/app/core/src/main/resources/application.properties +++ b/app/core/src/main/resources/application.properties @@ -7,6 +7,7 @@ logging.level.org.eclipse.jetty=WARN #logging.level.org.opensaml=DEBUG #logging.level.stirling.software.proprietary.security=DEBUG logging.level.com.zaxxer.hikari=WARN +logging.level.stirling.software.SPDF.service.PdfJsonConversionService=TRACE spring.jpa.open-in-view=false server.forward-headers-strategy=NATIVE server.error.path=/error diff --git a/compare_json.py b/compare_json.py new file mode 100644 index 000000000..97e6b72e6 --- /dev/null +++ b/compare_json.py @@ -0,0 +1,43 @@ +import json +import sys +from pathlib import Path + +if len(sys.argv) != 3: + print('Usage: compare_json.py ') + sys.exit(1) + +path1, path2 = map(Path, sys.argv[1:]) + +def load(path): + with path.open('r', encoding='utf-8') as fh: + return json.load(fh) + +doc1 = load(path1) +doc2 = load(path2) + +if doc1 == doc2: + print('Documents identical') + sys.exit(0) + +pages1 = doc1.get('pages', []) +pages2 = doc2.get('pages', []) + +for page_index, (p1, p2) in enumerate(zip(pages1, pages2), start=1): + elems1 = p1.get('textElements') or [] + elems2 = p2.get('textElements') or [] + if len(elems1) != len(elems2): + print(f'Page {page_index}: element count {len(elems1)} vs {len(elems2)}') + diff_found = False + for elem_index, (e1, e2) in enumerate(zip(elems1, elems2)): + if e1 == e2: + continue + diff_found = True + print(f'Page {page_index} element {elem_index} differs') + common_keys = sorted(set(e1) | set(e2)) + for key in common_keys: + if e1.get(key) != e2.get(key): + print(f' {key}: {e1.get(key)!r} -> {e2.get(key)!r}') + break + if diff_found: + break + diff --git a/frontend/public/locales/en-GB/translation.json b/frontend/public/locales/en-GB/translation.json index 5cb16aacd..35a0c30d2 100644 --- a/frontend/public/locales/en-GB/translation.json +++ b/frontend/public/locales/en-GB/translation.json @@ -4006,5 +4006,8 @@ "finish": "Finish", "startTour": "Start Tour", "startTourDescription": "Take a guided tour of Stirling PDF's key features" + }, + "pdfJsonEditor": { + "viewLabel": "JSON Editor" } } diff --git a/frontend/src/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx b/frontend/src/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx new file mode 100644 index 000000000..2d7e97eb9 --- /dev/null +++ b/frontend/src/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx @@ -0,0 +1,463 @@ +import React, { useEffect, useMemo, useState } from 'react'; +import { + Alert, + Badge, + Box, + Button, + Card, + Divider, + FileButton, + Group, + Pagination, + ScrollArea, + Stack, + Text, + Title, +} from '@mantine/core'; +import { useTranslation } from 'react-i18next'; +import DescriptionIcon from '@mui/icons-material/DescriptionOutlined'; +import FileDownloadIcon from '@mui/icons-material/FileDownloadOutlined'; +import PictureAsPdfIcon from '@mui/icons-material/PictureAsPdfOutlined'; +import AutorenewIcon from '@mui/icons-material/Autorenew'; +import WarningAmberIcon from '@mui/icons-material/WarningAmber'; +import UploadIcon from '@mui/icons-material/Upload'; + +import { + PdfJsonEditorViewData, + PdfJsonPage, +} from '../../../tools/pdfJsonEditorTypes'; +import { pageDimensions } from '../../../tools/pdfJsonEditorUtils'; + +const MAX_RENDER_WIDTH = 820; +const MIN_BOX_SIZE = 18; + +interface PdfJsonEditorViewProps { + data: PdfJsonEditorViewData; +} + +const toCssBounds = ( + page: PdfJsonPage | null | undefined, + pageHeight: number, + scale: number, + bounds: { left: number; right: number; top: number; bottom: number }, +) => { + const width = Math.max(bounds.right - bounds.left, 1); + const height = Math.max(bounds.bottom - bounds.top, 1); + const scaledWidth = Math.max(width * scale, MIN_BOX_SIZE); + const scaledHeight = Math.max(height * scale, MIN_BOX_SIZE / 2); + const top = Math.max(pageHeight - bounds.bottom, 0) * scale; + + return { + left: bounds.left * scale, + top, + width: scaledWidth, + height: scaledHeight, + }; +}; + +const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { + const { t } = useTranslation(); + const [activeGroupId, setActiveGroupId] = useState(null); + const [editingGroupId, setEditingGroupId] = useState(null); + + const { + document: pdfDocument, + groupsByPage, + selectedPage, + dirtyPages, + hasDocument, + fileName, + errorMessage, + isGeneratingPdf, + hasChanges, + onLoadJson, + onSelectPage, + onGroupEdit, + onReset, + onDownloadJson, + onGeneratePdf, + } = data; + + const pages = pdfDocument?.pages ?? []; + const currentPage = pages[selectedPage] ?? null; + const pageGroups = groupsByPage[selectedPage] ?? []; + const visibleGroups = useMemo( + () => + pageGroups.filter((group) => { + const hasContent = ((group.text ?? '').trim().length > 0) || ((group.originalText ?? '').trim().length > 0); + return hasContent || editingGroupId === group.id; + }), + [editingGroupId, pageGroups] + ); + + const { width: pageWidth, height: pageHeight } = pageDimensions(currentPage); + const scale = useMemo(() => Math.min(MAX_RENDER_WIDTH / pageWidth, 1.5), [pageWidth]); + const scaledWidth = pageWidth * scale; + const scaledHeight = pageHeight * scale; + + useEffect(() => { + setActiveGroupId(null); + setEditingGroupId(null); + }, [selectedPage]); + + useEffect(() => { + if (!editingGroupId) { + return; + } + const editor = document.querySelector(`[data-editor-group="${editingGroupId}"]`); + if (editor) { + editor.focus(); + const selection = window.getSelection(); + if (selection) { + selection.removeAllRanges(); + const range = document.createRange(); + range.selectNodeContents(editor); + range.collapse(false); + selection.addRange(range); + } + } + }, [editingGroupId]); + + const handlePageChange = (pageNumber: number) => { + setActiveGroupId(null); + setEditingGroupId(null); + onSelectPage(pageNumber - 1); + }; + + const handleBackgroundClick = () => { + setEditingGroupId(null); + setActiveGroupId(null); + }; + + const renderGroupContainer = ( + groupId: string, + isActive: boolean, + isChanged: boolean, + content: React.ReactNode, + onActivate?: (event: React.MouseEvent) => void, + ) => ( + { + event.stopPropagation(); + onActivate?.(event); + }} + onMouseEnter={() => setActiveGroupId(groupId)} + onMouseLeave={() => { + if (editingGroupId !== groupId) { + setActiveGroupId((current) => (current === groupId ? null : current)); + } + }} + > + {content} + + ); + + return ( + + + + + + + {t('pdfJsonEditor.title', 'PDF JSON Editor')} + {hasChanges && {t('pdfJsonEditor.badges.unsaved', 'Edited')}} + + + + {(props) => ( + + )} + + + + + + + + {fileName && ( + + {t('pdfJsonEditor.currentFile', 'Current file: {{name}}', { name: fileName })} + + )} + + + + {errorMessage && ( + } color="red" radius="md"> + {errorMessage} + + )} + + {!hasDocument && ( + + + + + {t('pdfJsonEditor.empty.title', 'No JSON loaded yet')} + + + {t('pdfJsonEditor.empty.subtitle', 'Use the Load JSON button above to open a file generated by the PDF → JSON converter.')} + + + + )} + + {hasDocument && ( + + + + + {t('pdfJsonEditor.pageSummary', 'Page {{number}} of {{total}}', { + number: selectedPage + 1, + total: pages.length, + })} + + {dirtyPages[selectedPage] && ( + + {t('pdfJsonEditor.badges.modified', 'Edited')} + + )} + + {pages.length > 1 && ( + + )} + + + + + + + {visibleGroups.length === 0 ? ( + + + + {t('pdfJsonEditor.noTextOnPage', 'No editable text was detected on this page.')} + + + + ) : ( + visibleGroups.map((group) => { + const bounds = toCssBounds(currentPage, pageHeight, scale, group.bounds); + const changed = group.text !== group.originalText; + const isActive = activeGroupId === group.id || editingGroupId === group.id; + const isEditing = editingGroupId === group.id; + const fontSizePx = Math.max((group.fontSize ?? 12) * scale, 8); + + const visualHeight = Math.max(bounds.height, fontSizePx * 1.35); + + const containerStyle: React.CSSProperties = { + position: 'absolute', + left: `${bounds.left}px`, + top: `${bounds.top}px`, + width: `${bounds.width}px`, + height: `${visualHeight}px`, + display: 'flex', + alignItems: 'flex-start', + justifyContent: 'flex-start', + pointerEvents: 'auto', + cursor: 'text', + }; + + const commonProps = { + key: group.id, + style: containerStyle, + }; + + if (isEditing) { + return ( + + {renderGroupContainer( + group.id, + true, + changed, +
{ + const value = event.currentTarget.innerText.replace(/\u00A0/g, ' '); + onGroupEdit(group.pageIndex, group.id, value); + setEditingGroupId(null); + }} + onInput={(event) => { + const value = event.currentTarget.innerText.replace(/\u00A0/g, ' '); + onGroupEdit(group.pageIndex, group.id, value); + }} + style={{ + width: '100%', + height: '100%', + padding: '3px 4px', + backgroundColor: 'rgba(255,255,255,0.95)', + color: '#111827', + fontSize: `${fontSizePx}px`, + lineHeight: 1.25, + outline: 'none', + border: 'none', + display: 'block', + whiteSpace: 'pre-wrap', + overflowWrap: 'anywhere', + cursor: 'text', + }} + > + {group.text || '\u00A0'} +
, + )} +
+ ); + } + + return ( + + {renderGroupContainer( + group.id, + isActive, + changed, +
+ {group.text || '\u00A0'} +
, + () => { + setEditingGroupId(group.id); + setActiveGroupId(group.id); + }, + )} +
+ ); + }) + )} +
+
+
+
+ + + + {t('pdfJsonEditor.groupList', 'Detected Text Groups')} + + + + {visibleGroups.map((group) => { + const changed = group.text !== group.originalText; + return ( + setActiveGroupId(group.id)} + onMouseLeave={() => setActiveGroupId((current) => (current === group.id ? null : current))} + style={{ cursor: 'pointer' }} + onClick={() => { + setActiveGroupId(group.id); + setEditingGroupId(group.id); + }} + > + + + {changed && {t('pdfJsonEditor.badges.modified', 'Edited')}} + {group.fontId && ( + {group.fontId} + )} + {group.fontSize && ( + + {t('pdfJsonEditor.fontSizeValue', '{{size}}pt', { size: group.fontSize.toFixed(1) })} + + )} + + + {group.text || t('pdfJsonEditor.emptyGroup', '[Empty Group]')} + + + + ); + })} + + + + +
+ )} +
+ ); +}; + +export default PdfJsonEditorView; diff --git a/frontend/src/constants/convertConstants.ts b/frontend/src/constants/convertConstants.ts index 5978d523b..a7e12c266 100644 --- a/frontend/src/constants/convertConstants.ts +++ b/frontend/src/constants/convertConstants.ts @@ -31,7 +31,9 @@ export const CONVERSION_ENDPOINTS = { 'pdf-pdfa': '/api/v1/convert/pdf/pdfa', 'html-pdf': '/api/v1/convert/html/pdf', 'markdown-pdf': '/api/v1/convert/markdown/pdf', - 'eml-pdf': '/api/v1/convert/eml/pdf' + 'eml-pdf': '/api/v1/convert/eml/pdf', + 'pdf-json': '/api/v1/convert/pdf/json', + 'json-pdf': '/api/v1/convert/json/pdf' } as const; export const ENDPOINT_NAMES = { @@ -48,7 +50,9 @@ export const ENDPOINT_NAMES = { 'pdf-pdfa': 'pdf-to-pdfa', 'html-pdf': 'html-to-pdf', 'markdown-pdf': 'markdown-to-pdf', - 'eml-pdf': 'eml-to-pdf' + 'eml-pdf': 'eml-to-pdf', + 'pdf-json': 'pdf-to-json', + 'json-pdf': 'json-to-pdf' } as const; @@ -80,6 +84,7 @@ export const FROM_FORMAT_OPTIONS = [ { value: 'txt', label: 'TXT', group: 'Text' }, { value: 'rtf', label: 'RTF', group: 'Text' }, { value: 'eml', label: 'EML', group: 'Email' }, + { value: 'json', label: 'JSON', group: 'Data' }, ]; export const TO_FORMAT_OPTIONS = [ @@ -101,13 +106,14 @@ export const TO_FORMAT_OPTIONS = [ { value: 'webp', label: 'WEBP', group: 'Image' }, { value: 'html', label: 'HTML', group: 'Web' }, { value: 'xml', label: 'XML', group: 'Web' }, + { value: 'json', label: 'JSON', group: 'Data' }, ]; // Conversion matrix - what each source format can convert to export const CONVERSION_MATRIX: Record = { 'any': ['pdf'], // Mixed files always convert to PDF 'image': ['pdf'], // Multiple images always convert to PDF - 'pdf': ['png', 'jpg', 'gif', 'tiff', 'bmp', 'webp', 'docx', 'odt', 'pptx', 'odp', 'csv', 'txt', 'rtf', 'md', 'html', 'xml', 'pdfa'], + 'pdf': ['png', 'jpg', 'gif', 'tiff', 'bmp', 'webp', 'docx', 'odt', 'pptx', 'odp', 'csv', 'txt', 'rtf', 'md', 'html', 'xml', 'pdfa', 'json'], 'docx': ['pdf'], 'doc': ['pdf'], 'odt': ['pdf'], 'xlsx': ['pdf'], 'xls': ['pdf'], 'ods': ['pdf'], 'pptx': ['pdf'], 'ppt': ['pdf'], 'odp': ['pdf'], @@ -116,7 +122,8 @@ export const CONVERSION_MATRIX: Record = { 'zip': ['pdf'], 'md': ['pdf'], 'txt': ['pdf'], 'rtf': ['pdf'], - 'eml': ['pdf'] + 'eml': ['pdf'], + 'json': ['pdf'] }; // Map extensions to endpoint keys @@ -130,7 +137,8 @@ export const EXTENSION_TO_ENDPOINT: Record> = { 'csv': 'pdf-to-csv', 'txt': 'pdf-to-text', 'rtf': 'pdf-to-text', 'md': 'pdf-to-markdown', 'html': 'pdf-to-html', 'xml': 'pdf-to-xml', - 'pdfa': 'pdf-to-pdfa' + 'pdfa': 'pdf-to-pdfa', + 'json': 'pdf-to-json' }, 'docx': { 'pdf': 'file-to-pdf' }, 'doc': { 'pdf': 'file-to-pdf' }, 'odt': { 'pdf': 'file-to-pdf' }, 'xlsx': { 'pdf': 'file-to-pdf' }, 'xls': { 'pdf': 'file-to-pdf' }, 'ods': { 'pdf': 'file-to-pdf' }, @@ -141,7 +149,8 @@ export const EXTENSION_TO_ENDPOINT: Record> = { 'zip': { 'pdf': 'html-to-pdf' }, 'md': { 'pdf': 'markdown-to-pdf' }, 'txt': { 'pdf': 'file-to-pdf' }, 'rtf': { 'pdf': 'file-to-pdf' }, - 'eml': { 'pdf': 'eml-to-pdf' } + 'eml': { 'pdf': 'eml-to-pdf' }, + 'json': { 'pdf': 'json-to-pdf' } }; export type ColorType = typeof COLOR_TYPES[keyof typeof COLOR_TYPES]; diff --git a/frontend/src/constants/convertSupportedFornats.ts b/frontend/src/constants/convertSupportedFornats.ts index b9bea3227..5934cce3a 100644 --- a/frontend/src/constants/convertSupportedFornats.ts +++ b/frontend/src/constants/convertSupportedFornats.ts @@ -5,7 +5,7 @@ export const CONVERT_SUPPORTED_FORMATS = [ // OpenDocument 'odt', 'ott', 'ods', 'ots', 'odp', 'otp', 'odg', 'otg', // Text formats - 'txt', 'text', 'xml', 'rtf', 'html', 'lwp', 'md', + 'txt', 'text', 'xml', 'rtf', 'html', 'lwp', 'md', 'json', // Images 'bmp', 'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'pbm', 'pgm', 'ppm', 'ras', 'xbm', 'xpm', 'svg', 'svm', 'wmf', 'webp', // StarOffice diff --git a/frontend/src/data/useTranslatedToolRegistry.tsx b/frontend/src/data/useTranslatedToolRegistry.tsx index f50b5ed55..f6659718e 100644 --- a/frontend/src/data/useTranslatedToolRegistry.tsx +++ b/frontend/src/data/useTranslatedToolRegistry.tsx @@ -5,6 +5,7 @@ import SplitPdfPanel from "../tools/Split"; import CompressPdfPanel from "../tools/Compress"; import OCRPanel from "../tools/OCR"; import ConvertPanel from "../tools/Convert"; +import PdfJsonEditor from "../tools/PdfJsonEditor"; import Sanitize from "../tools/Sanitize"; import AddPassword from "../tools/AddPassword"; import ChangePermissions from "../tools/ChangePermissions"; @@ -710,6 +711,19 @@ export function useTranslatedToolCatalog(): TranslatedToolCatalog { supportsAutomate: false, automationSettings: null }, + pdfJsonEditor: { + icon: , + name: t("home.pdfJsonEditor.title", "PDF JSON Editor"), + component: PdfJsonEditor, + description: t("home.pdfJsonEditor.desc", "Review and edit Stirling PDF JSON exports with grouped text editing and PDF regeneration"), + categoryId: ToolCategoryId.ADVANCED_TOOLS, + subcategoryId: SubcategoryId.DEVELOPER_TOOLS, + workbench: 'custom:pdfJsonEditor', + endpoints: ["json-pdf"], + synonyms: getSynonyms(t, "pdfJsonEditor"), + supportsAutomate: false, + automationSettings: null + }, devApi: { icon: , name: t("home.devApi.title", "API"), diff --git a/frontend/src/tools/PdfJsonEditor.tsx b/frontend/src/tools/PdfJsonEditor.tsx new file mode 100644 index 000000000..7264ffaf5 --- /dev/null +++ b/frontend/src/tools/PdfJsonEditor.tsx @@ -0,0 +1,289 @@ +import { useCallback, useEffect, useMemo, useState, useRef } from 'react'; +import { useTranslation } from 'react-i18next'; +import DescriptionIcon from '@mui/icons-material/DescriptionOutlined'; + +import { useToolWorkflow } from '../contexts/ToolWorkflowContext'; +import { useNavigationActions, useNavigationState } from '../contexts/NavigationContext'; +import { BaseToolProps, ToolComponent } from '../types/tool'; +import { CONVERSION_ENDPOINTS } from '../constants/convertConstants'; +import apiClient from '../services/apiClient'; +import { downloadBlob, downloadTextAsFile } from '../utils/downloadUtils'; +import { getFilenameFromHeaders } from '../utils/fileResponseUtils'; +import { + PdfJsonDocument, + TextGroup, + PdfJsonEditorViewData, +} from './pdfJsonEditorTypes'; +import { + deepCloneDocument, + getDirtyPages, + groupDocumentText, + restoreGlyphElements, +} from './pdfJsonEditorUtils'; +import PdfJsonEditorView from '../components/tools/pdfJsonEditor/PdfJsonEditorView'; + +const VIEW_ID = 'pdfJsonEditorView'; +const WORKBENCH_ID = 'custom:pdfJsonEditor' as const; + +const sanitizeBaseName = (name?: string | null): string => { + if (!name || name.trim().length === 0) { + return 'document'; + } + return name.replace(/\.[^.]+$/u, ''); +}; + +const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => { + const { t } = useTranslation(); + const { + registerCustomWorkbenchView, + unregisterCustomWorkbenchView, + setCustomWorkbenchViewData, + clearCustomWorkbenchViewData, + setLeftPanelView, + } = useToolWorkflow(); + const { actions: navigationActions } = useNavigationActions(); + const navigationState = useNavigationState(); + + const [loadedDocument, setLoadedDocument] = useState(null); + const [groupsByPage, setGroupsByPage] = useState([]); + const [selectedPage, setSelectedPage] = useState(0); + const [fileName, setFileName] = useState(''); + const [errorMessage, setErrorMessage] = useState(null); + const [isGeneratingPdf, setIsGeneratingPdf] = useState(false); + + const dirtyPages = useMemo(() => getDirtyPages(groupsByPage), [groupsByPage]); + const hasChanges = useMemo(() => dirtyPages.some(Boolean), [dirtyPages]); + const hasDocument = loadedDocument !== null; + const viewLabel = useMemo(() => t('pdfJsonEditor.viewLabel', 'JSON Editor'), [t]); + + const resetToDocument = useCallback((document: PdfJsonDocument | null) => { + if (!document) { + setGroupsByPage([]); + setSelectedPage(0); + return; + } + const cloned = deepCloneDocument(document); + const groups = groupDocumentText(cloned); + setGroupsByPage(groups); + setSelectedPage(0); + }, []); + + const handleLoadFile = useCallback( + async (file: File | null) => { + if (!file) { + return; + } + try { + const content = await file.text(); + const parsed = JSON.parse(content) as PdfJsonDocument; + setLoadedDocument(parsed); + resetToDocument(parsed); + setFileName(file.name); + setErrorMessage(null); + } catch (error) { + console.error('Failed to parse JSON', error); + setLoadedDocument(null); + setGroupsByPage([]); + setErrorMessage( + t( + 'pdfJsonEditor.errors.invalidJson', + 'Unable to read the JSON file. Ensure it was generated by the PDF to JSON tool.' + ) + ); + } + }, + [resetToDocument, t] + ); + + const handleSelectPage = useCallback((pageIndex: number) => { + setSelectedPage(pageIndex); + }, []); + + const handleGroupTextChange = useCallback((pageIndex: number, groupId: string, value: string) => { + setGroupsByPage((previous) => + previous.map((groups, idx) => + idx !== pageIndex + ? groups + : groups.map((group) => (group.id === groupId ? { ...group, text: value } : group)) + ) + ); + }, []); + + const handleResetEdits = useCallback(() => { + if (!loadedDocument) { + return; + } + resetToDocument(loadedDocument); + setErrorMessage(null); + }, [loadedDocument, resetToDocument]); + + const buildPayload = useCallback(() => { + if (!loadedDocument) { + return null; + } + + const updatedDocument = restoreGlyphElements(loadedDocument, groupsByPage); + const baseName = sanitizeBaseName(fileName || loadedDocument.metadata?.title || undefined); + return { + document: updatedDocument, + filename: `${baseName}.json`, + }; + }, [fileName, groupsByPage, loadedDocument]); + + const handleDownloadJson = useCallback(() => { + const payload = buildPayload(); + if (!payload) { + return; + } + + const { document, filename } = payload; + const serialized = JSON.stringify(document, null, 2); + downloadTextAsFile(serialized, filename, 'application/json'); + + if (onComplete) { + const exportedFile = new File([serialized], filename, { type: 'application/json' }); + onComplete([exportedFile]); + } + }, [buildPayload, onComplete]); + + const handleGeneratePdf = useCallback(async () => { + const payload = buildPayload(); + if (!payload) { + return; + } + + const { document, filename } = payload; + const serialized = JSON.stringify(document, null, 2); + const jsonFile = new File([serialized], filename, { type: 'application/json' }); + + const formData = new FormData(); + formData.append('fileInput', jsonFile); + + try { + setIsGeneratingPdf(true); + const response = await apiClient.post(CONVERSION_ENDPOINTS['json-pdf'], formData, { + responseType: 'blob', + }); + + const contentDisposition = response.headers?.['content-disposition'] ?? ''; + const detectedName = getFilenameFromHeaders(contentDisposition); + const baseName = sanitizeBaseName(filename).replace(/-edited$/u, ''); + const downloadName = detectedName || `${baseName || 'document'}.pdf`; + + downloadBlob(response.data, downloadName); + + if (onComplete) { + const pdfFile = new File([response.data], downloadName, { type: 'application/pdf' }); + onComplete([pdfFile]); + } + setErrorMessage(null); + } catch (error: any) { + console.error('Failed to convert JSON back to PDF', error); + const message = + error?.response?.data || + error?.message || + t('pdfJsonEditor.errors.pdfConversion', 'Unable to convert the edited JSON back into a PDF.'); + const msgString = typeof message === 'string' ? message : String(message); + setErrorMessage(msgString); + if (onError) { + onError(msgString); + } + } finally { + setIsGeneratingPdf(false); + } + }, [buildPayload, onComplete, onError, t]); + + const viewData = useMemo(() => ({ + document: loadedDocument, + groupsByPage, + selectedPage, + dirtyPages, + hasDocument, + fileName, + errorMessage, + isGeneratingPdf, + hasChanges, + onLoadJson: handleLoadFile, + onSelectPage: handleSelectPage, + onGroupEdit: handleGroupTextChange, + onReset: handleResetEdits, + onDownloadJson: handleDownloadJson, + onGeneratePdf: handleGeneratePdf, + }), [ + dirtyPages, + errorMessage, + fileName, + groupsByPage, + handleDownloadJson, + handleGeneratePdf, + handleGroupTextChange, + handleLoadFile, + handleResetEdits, + handleSelectPage, + hasChanges, + hasDocument, + isGeneratingPdf, + loadedDocument, + selectedPage, + ]); + + const latestViewDataRef = useRef(viewData); + latestViewDataRef.current = viewData; + + useEffect(() => { + registerCustomWorkbenchView({ + id: VIEW_ID, + workbenchId: WORKBENCH_ID, + label: viewLabel, + icon: , + component: PdfJsonEditorView, + }); + setLeftPanelView('hidden'); + setCustomWorkbenchViewData(VIEW_ID, latestViewDataRef.current); + + return () => { + clearCustomWorkbenchViewData(VIEW_ID); + unregisterCustomWorkbenchView(VIEW_ID); + setLeftPanelView('toolPicker'); + }; + }, [ + clearCustomWorkbenchViewData, + registerCustomWorkbenchView, + setCustomWorkbenchViewData, + setLeftPanelView, + viewLabel, + unregisterCustomWorkbenchView, + ]); + + useEffect(() => { + if ( + navigationState.selectedTool === 'pdfJsonEditor' && + navigationState.workbench !== WORKBENCH_ID + ) { + navigationActions.setWorkbench(WORKBENCH_ID); + } + }, [navigationActions, navigationState.selectedTool, navigationState.workbench]); + + const lastSentViewDataRef = useRef(null); + + useEffect(() => { + if (lastSentViewDataRef.current === viewData) { + return; + } + lastSentViewDataRef.current = viewData; + setCustomWorkbenchViewData(VIEW_ID, viewData); + }, [setCustomWorkbenchViewData, viewData]); + + // All editing happens in the custom workbench view. + return null; +}; + +(PdfJsonEditor as ToolComponent).tool = () => { + throw new Error('PDF JSON Editor does not support automation operations.'); +}; + +(PdfJsonEditor as ToolComponent).getDefaultParameters = () => ({ + groups: [], +}); + +export default PdfJsonEditor as ToolComponent; diff --git a/frontend/src/tools/pdfJsonEditorTypes.ts b/frontend/src/tools/pdfJsonEditorTypes.ts new file mode 100644 index 000000000..207c88dd8 --- /dev/null +++ b/frontend/src/tools/pdfJsonEditorTypes.ts @@ -0,0 +1,110 @@ +export interface PdfJsonFontCidSystemInfo { + registry?: string | null; + ordering?: string | null; + supplement?: number | null; +} + +export interface PdfJsonFont { + id?: string; + pageNumber?: number | null; + uid?: string | null; + baseName?: string | null; + subtype?: string | null; + encoding?: string | null; + cidSystemInfo?: PdfJsonFontCidSystemInfo | null; + embedded?: boolean | null; + program?: string | null; + programFormat?: string | null; + toUnicode?: string | null; + standard14Name?: string | null; + fontDescriptorFlags?: number | null; +} + +export interface PdfJsonTextElement { + text?: string | null; + fontId?: string | null; + fontSize?: number | null; + fontMatrixSize?: number | null; + fontSizeInPt?: number | null; + renderingMode?: number | null; + x?: number | null; + y?: number | null; + width?: number | null; + height?: number | null; + textMatrix?: number[] | null; +} + +export interface PdfJsonStream { + dictionary?: Record | null; + rawData?: string | null; +} + +export interface PdfJsonPage { + pageNumber?: number | null; + width?: number | null; + height?: number | null; + rotation?: number | null; + textElements?: PdfJsonTextElement[] | null; + resources?: unknown; + contentStreams?: PdfJsonStream[] | null; +} + +export interface PdfJsonMetadata { + title?: string | null; + author?: string | null; + subject?: string | null; + keywords?: string | null; + creator?: string | null; + producer?: string | null; + creationDate?: string | null; + modificationDate?: string | null; + trapped?: string | null; + numberOfPages?: number | null; +} + +export interface PdfJsonDocument { + metadata?: PdfJsonMetadata | null; + xmpMetadata?: string | null; + fonts?: PdfJsonFont[] | null; + pages?: PdfJsonPage[] | null; +} + +export interface BoundingBox { + left: number; + right: number; + top: number; + bottom: number; +} + +export interface TextGroup { + id: string; + pageIndex: number; + fontId?: string | null; + fontSize?: number | null; + elements: PdfJsonTextElement[]; + originalElements: PdfJsonTextElement[]; + text: string; + originalText: string; + bounds: BoundingBox; +} + +export const DEFAULT_PAGE_WIDTH = 612; +export const DEFAULT_PAGE_HEIGHT = 792; + +export interface PdfJsonEditorViewData { + document: PdfJsonDocument | null; + groupsByPage: TextGroup[][]; + selectedPage: number; + dirtyPages: boolean[]; + hasDocument: boolean; + fileName: string; + errorMessage: string | null; + isGeneratingPdf: boolean; + hasChanges: boolean; + onLoadJson: (file: File | null) => Promise | void; + onSelectPage: (pageIndex: number) => void; + onGroupEdit: (pageIndex: number, groupId: string, value: string) => void; + onReset: () => void; + onDownloadJson: () => void; + onGeneratePdf: () => void; +} diff --git a/frontend/src/tools/pdfJsonEditorUtils.ts b/frontend/src/tools/pdfJsonEditorUtils.ts new file mode 100644 index 000000000..b636a8f68 --- /dev/null +++ b/frontend/src/tools/pdfJsonEditorUtils.ts @@ -0,0 +1,344 @@ +import { + BoundingBox, + PdfJsonDocument, + PdfJsonPage, + PdfJsonTextElement, + TextGroup, + DEFAULT_PAGE_HEIGHT, + DEFAULT_PAGE_WIDTH, +} from './pdfJsonEditorTypes'; + +const LINE_TOLERANCE = 2; +const GAP_FACTOR = 0.6; +const SPACE_MIN_GAP = 1.5; + +export const valueOr = (value: number | null | undefined, fallback = 0): number => { + if (value === null || value === undefined || Number.isNaN(value)) { + return fallback; + } + return value; +}; + +export const cloneTextElement = (element: PdfJsonTextElement): PdfJsonTextElement => ({ + ...element, + textMatrix: element.textMatrix ? [...element.textMatrix] : element.textMatrix ?? undefined, +}); + +const getBaseline = (element: PdfJsonTextElement): number => { + if (element.textMatrix && element.textMatrix.length === 6) { + return valueOr(element.textMatrix[5]); + } + return valueOr(element.y); +}; + +const getX = (element: PdfJsonTextElement): number => { + if (element.textMatrix && element.textMatrix.length === 6) { + return valueOr(element.textMatrix[4]); + } + return valueOr(element.x); +}; + +const getWidth = (element: PdfJsonTextElement): number => { + const width = valueOr(element.width, 0); + if (width === 0 && element.text) { + const fontSize = valueOr(element.fontSize, 12); + return fontSize * Math.max(element.text.length * 0.45, 0.5); + } + return width; +}; + +const getFontSize = (element: PdfJsonTextElement): number => valueOr(element.fontSize, 12); + +const getHeight = (element: PdfJsonTextElement): number => { + const height = valueOr(element.height); + if (height === 0) { + return getFontSize(element) * 1.05; + } + return height; +}; + +const getElementBounds = (element: PdfJsonTextElement): BoundingBox => { + const left = getX(element); + const width = getWidth(element); + const bottom = getBaseline(element); + const height = getHeight(element); + const top = bottom - height; + return { + left, + right: left + width, + top, + bottom, + }; +}; + +const mergeBounds = (bounds: BoundingBox[]): BoundingBox => { + if (bounds.length === 0) { + return { left: 0, right: 0, top: 0, bottom: 0 }; + } + return bounds.reduce( + (acc, current) => ({ + left: Math.min(acc.left, current.left), + right: Math.max(acc.right, current.right), + top: Math.min(acc.top, current.top), + bottom: Math.max(acc.bottom, current.bottom), + }), + { ...bounds[0] } + ); +}; + +const shouldInsertSpace = (prev: PdfJsonTextElement, current: PdfJsonTextElement): boolean => { + const prevRight = getX(prev) + getWidth(prev); + const gap = getX(current) - prevRight; + const avgFontSize = (getFontSize(prev) + getFontSize(current)) / 2; + const threshold = Math.max(SPACE_MIN_GAP, avgFontSize * GAP_FACTOR); + return gap > threshold; +}; + +const buildGroupText = (elements: PdfJsonTextElement[]): string => { + let result = ''; + elements.forEach((element, index) => { + const value = element.text ?? ''; + if (index === 0) { + result += value; + return; + } + + const previous = elements[index - 1]; + const needsSpace = shouldInsertSpace(previous, element); + const startsWithWhitespace = /^\s/u.test(value); + + if (needsSpace && !startsWithWhitespace) { + result += ' '; + } + result += value; + }); + return result; +}; + +const createGroup = ( + pageIndex: number, + idSuffix: number, + elements: PdfJsonTextElement[], +): TextGroup => { + const clones = elements.map(cloneTextElement); + const originalClones = clones.map(cloneTextElement); + const bounds = mergeBounds(elements.map(getElementBounds)); + + return { + id: `${pageIndex}-${idSuffix}`, + pageIndex, + fontId: elements[0]?.fontId, + fontSize: elements[0]?.fontSize, + elements: clones, + originalElements: originalClones, + text: buildGroupText(elements), + originalText: buildGroupText(elements), + bounds, + }; +}; + +export const groupPageTextElements = (page: PdfJsonPage | null | undefined, pageIndex: number): TextGroup[] => { + if (!page?.textElements || page.textElements.length === 0) { + return []; + } + + const elements = page.textElements + .map(cloneTextElement) + .filter((element) => element.text !== null && element.text !== undefined); + + elements.sort((a, b) => getBaseline(b) - getBaseline(a)); + + const lines: { baseline: number; elements: PdfJsonTextElement[] }[] = []; + + elements.forEach((element) => { + const baseline = getBaseline(element); + const fontSize = getFontSize(element); + const tolerance = Math.max(LINE_TOLERANCE, fontSize * 0.12); + + const existingLine = lines.find((line) => Math.abs(line.baseline - baseline) <= tolerance); + + if (existingLine) { + existingLine.elements.push(element); + } else { + lines.push({ baseline, elements: [element] }); + } + }); + + lines.forEach((line) => { + line.elements.sort((a, b) => getX(a) - getX(b)); + }); + + let groupCounter = 0; + const groups: TextGroup[] = []; + + lines.forEach((line) => { + let currentBucket: PdfJsonTextElement[] = []; + + line.elements.forEach((element) => { + if (currentBucket.length === 0) { + currentBucket.push(element); + return; + } + + const previous = currentBucket[currentBucket.length - 1]; + const gap = getX(element) - (getX(previous) + getWidth(previous)); + const avgFontSize = (getFontSize(previous) + getFontSize(element)) / 2; + const splitThreshold = Math.max(SPACE_MIN_GAP, avgFontSize * GAP_FACTOR); + + const sameFont = previous.fontId === element.fontId; + const shouldSplit = gap > splitThreshold * (sameFont ? 1.4 : 1.0); + + if (shouldSplit) { + groups.push(createGroup(pageIndex, groupCounter, currentBucket)); + groupCounter += 1; + currentBucket = [element]; + } else { + currentBucket.push(element); + } + }); + + if (currentBucket.length > 0) { + groups.push(createGroup(pageIndex, groupCounter, currentBucket)); + groupCounter += 1; + } + }); + + return groups; +}; + +export const groupDocumentText = (document: PdfJsonDocument | null | undefined): TextGroup[][] => { + const pages = document?.pages ?? []; + return pages.map((page, index) => groupPageTextElements(page, index)); +}; + +export const deepCloneDocument = (document: PdfJsonDocument): PdfJsonDocument => { + if (typeof structuredClone === 'function') { + return structuredClone(document); + } + return JSON.parse(JSON.stringify(document)); +}; + +export const pageDimensions = (page: PdfJsonPage | null | undefined): { width: number; height: number } => { + return { + width: valueOr(page?.width, DEFAULT_PAGE_WIDTH), + height: valueOr(page?.height, DEFAULT_PAGE_HEIGHT), + }; +}; + +export const createMergedElement = (group: TextGroup): PdfJsonTextElement => { + const reference = group.originalElements[0]; + const merged = cloneTextElement(reference); + merged.text = group.text; + if (reference.textMatrix && reference.textMatrix.length === 6) { + merged.textMatrix = [...reference.textMatrix]; + } + return merged; +}; + +const distributeTextAcrossElements = (text: string | undefined, elements: PdfJsonTextElement[]): void => { + if (elements.length === 0) { + return; + } + + const targetChars = Array.from(text ?? ''); + let cursor = 0; + + elements.forEach((element, index) => { + const originalText = element.text ?? ''; + let sliceLength = Array.from(originalText).length; + if (sliceLength <= 0) { + sliceLength = 1; + } + + if (index === elements.length - 1) { + element.text = targetChars.slice(cursor).join(''); + cursor = targetChars.length; + return; + } + + const slice = targetChars.slice(cursor, cursor + sliceLength).join(''); + element.text = slice; + cursor = Math.min(cursor + sliceLength, targetChars.length); + }); + + if (cursor < targetChars.length) { + const last = elements[elements.length - 1]; + last.text = (last.text ?? '') + targetChars.slice(cursor).join(''); + } + + elements.forEach((element) => { + if (element.text == null) { + element.text = ''; + } + }); +}; + +export const buildUpdatedDocument = ( + source: PdfJsonDocument, + groupsByPage: TextGroup[][], +): PdfJsonDocument => { + const updated = deepCloneDocument(source); + const pages = updated.pages ?? []; + + updated.pages = pages.map((page, pageIndex) => { + const groups = groupsByPage[pageIndex] ?? []; + if (!groups.length) { + return page; + } + + const hasPageChanges = groups.some((group) => group.text !== group.originalText); + const updatedElements: PdfJsonTextElement[] = groups.flatMap((group) => { + if (group.text === group.originalText) { + return group.originalElements.map(cloneTextElement); + } + return [createMergedElement(group)]; + }); + + return { + ...page, + textElements: updatedElements, + contentStreams: page.contentStreams ?? [], + }; + }); + + return updated; +}; + +export const restoreGlyphElements = ( + source: PdfJsonDocument, + groupsByPage: TextGroup[][], +): PdfJsonDocument => { + const updated = deepCloneDocument(source); + const pages = updated.pages ?? []; + + updated.pages = pages.map((page, pageIndex) => { + const groups = groupsByPage[pageIndex] ?? []; + if (!groups.length) { + return page; + } + + const rebuiltElements: PdfJsonTextElement[] = []; + let pageChanged = false; + + groups.forEach((group) => { + const originals = group.originalElements.map(cloneTextElement); + if (group.text !== group.originalText) { + pageChanged = true; + distributeTextAcrossElements(group.text, originals); + } + rebuiltElements.push(...originals); + }); + + return { + ...page, + textElements: rebuiltElements, + contentStreams: page.contentStreams ?? [], + }; + }); + + return updated; +}; + +export const getDirtyPages = (groupsByPage: TextGroup[][]): boolean[] => { + return groupsByPage.map((groups) => groups.some((group) => group.text !== group.originalText)); +}; diff --git a/frontend/src/types/toolId.ts b/frontend/src/types/toolId.ts index d485cd220..988d2a9f2 100644 --- a/frontend/src/types/toolId.ts +++ b/frontend/src/types/toolId.ts @@ -46,6 +46,7 @@ export const REGULAR_TOOL_IDS = [ 'validateSignature', 'replaceColor', 'showJS', + 'pdfJsonEditor', 'bookletImposition', ] as const; @@ -92,4 +93,3 @@ type Disjoint = [A & B] extends [never] ? true : false; type _Check1 = Assert>; type _Check2 = Assert>; type _Check3 = Assert>; -