diff --git a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonAnnotation.java b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonAnnotation.java
new file mode 100644
index 000000000..b994279fe
--- /dev/null
+++ b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonAnnotation.java
@@ -0,0 +1,61 @@
+package stirling.software.SPDF.model.json;
+
+import java.util.List;
+
+import com.fasterxml.jackson.annotation.JsonInclude;
+
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+
+/**
+ * Represents a PDF annotation (comments, highlights, stamps, etc.). Annotations often contain OCR
+ * text layers or other metadata not visible in content streams.
+ */
+@Data
+@Builder
+@NoArgsConstructor
+@AllArgsConstructor
+@JsonInclude(JsonInclude.Include.NON_NULL)
+public class PdfJsonAnnotation {
+
+    /** Annotation subtype (Text, Highlight, Link, Stamp, Widget, etc.) */
+    private String subtype;
+
+    /** Human-readable text content of the annotation */
+    private String contents;
+
+    /** Annotation rectangle [lowerLeftX, lowerLeftY, upperRightX, upperRightY] */
+    private List<Float> rect;
+
+    /** Name of the annotation's current appearance state */
+    private String appearanceState;
+
+    /** Color components (e.g., [r, g, b] for RGB) */
+    private List<Float> color;
+
+    /** Annotation flags (print, hidden, etc.) */
+    private Integer flags;
+
+    /** For link annotations: destination or action */
+    private String destination;
+
+    /** For text annotations: icon name */
+    private String iconName;
+
+    /** Subject of the annotation (dictionary entry Subj) */
+    private String subject;
+
+    /** Author of the annotation (dictionary entry T) */
+    private String author;
+
+    /** Creation date (ISO 8601 format) */
+    private String creationDate;
+
+    /** Modification date (ISO 8601 format) */
+    private String modificationDate;
+
+    /** Full annotation dictionary for lossless round-tripping */
+    private PdfJsonCosValue rawData;
+}
diff --git a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonDocument.java b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonDocument.java index 3f5bd1f8b..d590b34b9 100644 --- a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonDocument.java +++ b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonDocument.java @@ -25,4 +25,7 @@ public class PdfJsonDocument { @Builder.Default private List fonts = new ArrayList<>(); @Builder.Default private List pages = new ArrayList<>(); + + /** Form fields (AcroForm) at document level */ + @Builder.Default private List formFields = new ArrayList<>(); } diff --git a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonFont.java b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonFont.java index 98d251103..a3496677c 100644 --- a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonFont.java +++ b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonFont.java @@ -52,4 +52,22 @@ public class PdfJsonFont { /** Font descriptor flags copied from the source document. */ private Integer fontDescriptorFlags; + + /** Font ascent in glyph units (typically 1/1000). */ + private Float ascent; + + /** Font descent in glyph units (typically negative). */ + private Float descent; + + /** Capital height when available. */ + private Float capHeight; + + /** x-height when available. 
*/ + private Float xHeight; + + /** Italic angle reported by the font descriptor. */ + private Float italicAngle; + + /** Units per em extracted from the font matrix. */ + private Integer unitsPerEm; } diff --git a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonFormField.java b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonFormField.java new file mode 100644 index 000000000..2a7c220a8 --- /dev/null +++ b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonFormField.java @@ -0,0 +1,66 @@ +package stirling.software.SPDF.model.json; + +import java.util.List; + +import com.fasterxml.jackson.annotation.JsonInclude; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + +/** Represents a PDF form field (AcroForm). */ +@Data +@Builder +@NoArgsConstructor +@AllArgsConstructor +@JsonInclude(JsonInclude.Include.NON_NULL) +public class PdfJsonFormField { + + /** Fully qualified field name (e.g., "form1.textfield1") */ + private String name; + + /** Partial field name (last component) */ + private String partialName; + + /** Field type (Tx=text, Btn=button, Ch=choice, Sig=signature) */ + private String fieldType; + + /** Field value as string */ + private String value; + + /** Default value */ + private String defaultValue; + + /** Field flags (readonly, required, multiline, etc.) 
*/ + private Integer flags; + + /** Alternative field name (for accessibility) */ + private String alternateFieldName; + + /** Mapping name (for export) */ + private String mappingName; + + /** Page number where field appears (1-indexed) */ + private Integer pageNumber; + + /** Field rectangle [x1, y1, x2, y2] on the page */ + private List rect; + + /** For choice fields: list of options */ + private List options; + + /** For choice fields: selected indices */ + private List selectedIndices; + + /** For button fields: whether it's checked */ + private Boolean checked; + + /** Font information for text fields */ + private String fontName; + + private Float fontSize; + + /** Full field dictionary for lossless round-tripping */ + private PdfJsonCosValue rawData; +} diff --git a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonPage.java b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonPage.java index 8bc7c6d65..fa1417d5c 100644 --- a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonPage.java +++ b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonPage.java @@ -24,6 +24,7 @@ public class PdfJsonPage { @Builder.Default private List textElements = new ArrayList<>(); @Builder.Default private List imageElements = new ArrayList<>(); + @Builder.Default private List annotations = new ArrayList<>(); /** Serialized representation of the page resources dictionary. 
*/ private PdfJsonCosValue resources; diff --git a/app/core/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java b/app/core/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java index 4e6e03920..31387eeae 100644 --- a/app/core/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java +++ b/app/core/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java @@ -3,12 +3,16 @@ package stirling.software.SPDF.service; import java.awt.geom.AffineTransform; import java.awt.geom.Point2D; import java.awt.image.BufferedImage; +import java.io.BufferedReader; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; +import java.io.InputStreamReader; import java.io.OutputStream; import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; import java.time.Instant; import java.time.format.DateTimeParseException; import java.util.ArrayList; @@ -17,15 +21,19 @@ import java.util.Calendar; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; +import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; +import java.util.LinkedHashSet; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Objects; import java.util.Optional; +import java.util.Set; import java.util.TimeZone; import java.util.UUID; +import java.util.concurrent.ConcurrentHashMap; import javax.imageio.ImageIO; @@ -66,8 +74,13 @@ import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; import org.apache.pdfbox.pdmodel.graphics.state.PDGraphicsState; import org.apache.pdfbox.pdmodel.graphics.state.PDTextState; import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode; +import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; +import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationWidget; +import 
org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm; +import org.apache.pdfbox.pdmodel.interactive.form.PDField; import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.TextPosition; +import org.apache.pdfbox.util.DateConverter; import org.apache.pdfbox.util.Matrix; import org.springframework.beans.factory.annotation.Value; import org.springframework.core.io.Resource; @@ -80,10 +93,13 @@ import com.fasterxml.jackson.databind.ObjectMapper; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; +import stirling.software.SPDF.config.EndpointConfiguration; +import stirling.software.SPDF.model.json.PdfJsonAnnotation; import stirling.software.SPDF.model.json.PdfJsonCosValue; import stirling.software.SPDF.model.json.PdfJsonDocument; import stirling.software.SPDF.model.json.PdfJsonFont; import stirling.software.SPDF.model.json.PdfJsonFontCidSystemInfo; +import stirling.software.SPDF.model.json.PdfJsonFormField; import stirling.software.SPDF.model.json.PdfJsonImageElement; import stirling.software.SPDF.model.json.PdfJsonMetadata; import stirling.software.SPDF.model.json.PdfJsonPage; @@ -92,6 +108,10 @@ import stirling.software.SPDF.model.json.PdfJsonTextColor; import stirling.software.SPDF.model.json.PdfJsonTextElement; import stirling.software.common.service.CustomPDFDocumentFactory; import stirling.software.common.util.ExceptionUtils; +import stirling.software.common.util.ProcessExecutor; +import stirling.software.common.util.ProcessExecutor.ProcessExecutorResult; +import stirling.software.common.util.TempFile; +import stirling.software.common.util.TempFileManager; @Slf4j @Service @@ -101,62 +121,139 @@ public class PdfJsonConversionService { private final CustomPDFDocumentFactory pdfDocumentFactory; private final ObjectMapper objectMapper; private final ResourceLoader resourceLoader; + private final EndpointConfiguration endpointConfiguration; + private final TempFileManager tempFileManager; private static final String 
FALLBACK_FONT_ID = "fallback-noto-sans"; private static final String DEFAULT_FALLBACK_FONT_LOCATION = "classpath:/static/fonts/NotoSans-Regular.ttf"; + private static final String FALLBACK_FONT_CJK_ID = "fallback-noto-cjk"; + private static final String FALLBACK_FONT_JP_ID = "fallback-noto-jp"; + private static final String FALLBACK_FONT_KR_ID = "fallback-noto-korean"; + private static final String FALLBACK_FONT_AR_ID = "fallback-noto-arabic"; + private static final String FALLBACK_FONT_TH_ID = "fallback-noto-thai"; + + private static final Map BUILT_IN_FALLBACK_FONTS = + Map.ofEntries( + Map.entry( + FALLBACK_FONT_CJK_ID, + new FallbackFontSpec( + "classpath:/static/fonts/NotoSansSC-Regular.ttf", + "NotoSansSC-Regular", + "ttf")), + Map.entry( + FALLBACK_FONT_JP_ID, + new FallbackFontSpec( + "classpath:/static/fonts/NotoSansJP-Regular.ttf", + "NotoSansJP-Regular", + "ttf")), + Map.entry( + FALLBACK_FONT_KR_ID, + new FallbackFontSpec( + "classpath:/static/fonts/malgun.ttf", "MalgunGothic", "ttf")), + Map.entry( + FALLBACK_FONT_AR_ID, + new FallbackFontSpec( + "classpath:/static/fonts/NotoSansArabic-Regular.ttf", + "NotoSansArabic-Regular", + "ttf")), + Map.entry( + FALLBACK_FONT_TH_ID, + new FallbackFontSpec( + "classpath:/static/fonts/NotoSansThai-Regular.ttf", + "NotoSansThai-Regular", + "ttf"))); @Value("${stirling.pdf.fallback-font:" + DEFAULT_FALLBACK_FONT_LOCATION + "}") private String fallbackFontLocation; - private byte[] fallbackFontBytes; + @Value("${stirling.pdf.json.font-normalization.enabled:true}") + private boolean fontNormalizationEnabled; + + @Value("${stirling.pdf.json.cff-converter.enabled:true}") + private boolean cffConversionEnabled; + + @Value("${stirling.pdf.json.cff-converter.fontforge-command:fontforge}") + private String fontforgeCommand; + + private final Map fallbackFontCache = new ConcurrentHashMap<>(); public byte[] convertPdfToJson(MultipartFile file) throws IOException { if (file == null) { throw 
ExceptionUtils.createNullArgumentException("fileInput"); } - try (PDDocument document = pdfDocumentFactory.load(file.getInputStream(), true)) { - int totalPages = document.getNumberOfPages(); - log.info("Converting PDF to JSON ({} pages)", totalPages); - Map fonts = new LinkedHashMap<>(); - Map> textByPage = new LinkedHashMap<>(); - Map> pageFontResources = new HashMap<>(); - int pageNumber = 1; - for (PDPage page : document.getPages()) { - Map resourceMap = - collectFontsForPage(document, page, pageNumber, fonts); - pageFontResources.put(pageNumber, resourceMap); - log.debug( - "PDF→JSON: collected {} font resources on page {}", - resourceMap.size(), - pageNumber); - pageNumber++; + TempFile normalizedFile = null; + try (TempFile originalFile = new TempFile(tempFileManager, ".pdf")) { + file.transferTo(originalFile.getFile()); + Path workingPath = originalFile.getPath(); + + if (fontNormalizationEnabled && canRunGhostscript()) { + try { + normalizedFile = normalizePdfFonts(workingPath); + if (normalizedFile != null && normalizedFile.exists()) { + workingPath = normalizedFile.getPath(); + log.info("Using Ghostscript-normalized PDF for JSON export"); + } + } catch (IOException ex) { + log.warn( + "Ghostscript font normalization failed ({}); using original PDF", + ex.getMessage()); + closeQuietly(normalizedFile); + normalizedFile = null; + } } - TextCollectingStripper stripper = - new TextCollectingStripper(document, fonts, textByPage, pageFontResources); - stripper.setSortByPosition(true); - stripper.getText(document); + try (PDDocument document = pdfDocumentFactory.load(workingPath, true)) { + int totalPages = document.getNumberOfPages(); + log.info("Converting PDF to JSON ({} pages)", totalPages); + Map fonts = new LinkedHashMap<>(); + Map> textByPage = new LinkedHashMap<>(); - Map> imagesByPage = collectImages(document); + Map> pageFontResources = new HashMap<>(); + int pageNumber = 1; + for (PDPage page : document.getPages()) { + Map resourceMap = + 
collectFontsForPage(document, page, pageNumber, fonts); + pageFontResources.put(pageNumber, resourceMap); + log.debug( + "PDF→JSON: collected {} font resources on page {}", + resourceMap.size(), + pageNumber); + pageNumber++; + } - PdfJsonDocument pdfJson = new PdfJsonDocument(); - pdfJson.setMetadata(extractMetadata(document)); - pdfJson.setXmpMetadata(extractXmpMetadata(document)); - List serializedFonts = new ArrayList<>(fonts.values()); - serializedFonts.sort( - Comparator.comparing( - PdfJsonFont::getUid, Comparator.nullsLast(Comparator.naturalOrder()))); - pdfJson.setFonts(serializedFonts); - pdfJson.setPages(extractPages(document, textByPage, imagesByPage)); + TextCollectingStripper stripper = + new TextCollectingStripper(document, fonts, textByPage, pageFontResources); + stripper.setSortByPosition(true); + stripper.getText(document); - log.info( - "PDF→JSON conversion complete (fonts: {}, pages: {})", - serializedFonts.size(), - pdfJson.getPages().size()); + Map> imagesByPage = collectImages(document); + Map> annotationsByPage = + collectAnnotations(document); - return objectMapper.writerWithDefaultPrettyPrinter().writeValueAsBytes(pdfJson); + PdfJsonDocument pdfJson = new PdfJsonDocument(); + pdfJson.setMetadata(extractMetadata(document)); + pdfJson.setXmpMetadata(extractXmpMetadata(document)); + List serializedFonts = new ArrayList<>(fonts.values()); + serializedFonts.sort( + Comparator.comparing( + PdfJsonFont::getUid, + Comparator.nullsLast(Comparator.naturalOrder()))); + pdfJson.setFonts(serializedFonts); + pdfJson.setPages( + extractPages(document, textByPage, imagesByPage, annotationsByPage)); + pdfJson.setFormFields(collectFormFields(document)); + + log.info( + "PDF→JSON conversion complete (fonts: {}, pages: {})", + serializedFonts.size(), + pdfJson.getPages().size()); + + return objectMapper.writerWithDefaultPrettyPrinter().writeValueAsBytes(pdfJson); + } + } finally { + closeQuietly(normalizedFile); } } @@ -210,16 +307,23 @@ public class 
PdfJsonConversionService { page.setContents(preservedStreams); } - List elements = - pageModel.getTextElements() != null - ? pageModel.getTextElements() - : new ArrayList<>(); List imageElements = pageModel.getImageElements() != null ? pageModel.getImageElements() : new ArrayList<>(); - boolean fallbackAssigned = + // Reconstruct image XObjects if content streams are preserved + // (images were filtered out during serialization to avoid duplication) + if (!preservedStreams.isEmpty() && !imageElements.isEmpty()) { + reconstructImageXObjects(document, page, preservedStreams, imageElements); + } + + List elements = + pageModel.getTextElements() != null + ? pageModel.getTextElements() + : new ArrayList<>(); + + PreflightResult preflightResult = preflightTextElements( document, fontMap, fontModels, elements, pageNumberValue); @@ -227,11 +331,14 @@ public class PdfJsonConversionService { "Page {} preflight complete (elements={}, fallbackApplied={})", pageNumberValue, elements.size(), - fallbackAssigned); + preflightResult.usesFallback()); - if (elements.stream().anyMatch(el -> FALLBACK_FONT_ID.equals(el.getFontId()))) { - ensureFallbackResource(page, fontMap.get(buildFontKey(-1, FALLBACK_FONT_ID))); - log.info("Page {} uses fallback font for some elements", pageNumberValue); + if (!preflightResult.fallbackFontIds().isEmpty()) { + ensureFallbackResources(page, preflightResult.fallbackFontIds(), fontMap); + log.info( + "Page {} registered fallback fonts: {}", + pageNumberValue, + preflightResult.fallbackFontIds()); } boolean hasText = !elements.isEmpty(); @@ -239,7 +346,7 @@ public class PdfJsonConversionService { boolean rewriteSucceeded = true; if (hasText) { - if (fallbackAssigned) { + if (preflightResult.usesFallback()) { rewriteSucceeded = false; } else if (!preservedStreams.isEmpty()) { log.info("Attempting token rewrite for page {}", pageNumberValue); @@ -272,12 +379,31 @@ public class PdfJsonConversionService { if (shouldRegenerate) { log.info("Regenerating page 
content for page {}", pageNumberValue); regeneratePageContent( - document, page, elements, imageElements, fontMap, pageNumberValue); + document, + page, + elements, + imageElements, + fontMap, + fontModels, + pageNumberValue); log.info("Page content regeneration complete for page {}", pageNumberValue); } + + // Restore annotations for this page + List annotations = + pageModel.getAnnotations() != null + ? pageModel.getAnnotations() + : new ArrayList<>(); + restoreAnnotations(document, page, annotations); + pageIndex++; } + // Restore form fields + List formFields = + pdfJson.getFormFields() != null ? pdfJson.getFormFields() : new ArrayList<>(); + restoreFormFields(document, formFields); + try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { document.save(baos); return baos.toByteArray(); @@ -321,6 +447,7 @@ public class PdfJsonConversionService { private PdfJsonFont buildFontModel( PDDocument document, PDFont font, String fontId, int pageNumber) throws IOException { + PDFontDescriptor descriptor = font.getFontDescriptor(); String subtype = font.getCOSObject().getNameAsString(COSName.SUBTYPE); String encoding = resolveEncoding(font); PdfJsonFontCidSystemInfo cidInfo = extractCidSystemInfo(font.getCOSObject()); @@ -328,8 +455,7 @@ public class PdfJsonConversionService { FontProgramData programData = embedded ? extractFontProgram(font) : null; String toUnicode = extractToUnicode(font.getCOSObject()); String standard14Name = resolveStandard14Name(font); - Integer flags = - font.getFontDescriptor() != null ? font.getFontDescriptor().getFlags() : null; + Integer flags = descriptor != null ? descriptor.getFlags() : null; return PdfJsonFont.builder() .id(fontId) @@ -345,10 +471,16 @@ public class PdfJsonConversionService { .toUnicode(toUnicode) .standard14Name(standard14Name) .fontDescriptorFlags(flags) + .ascent(descriptor != null ? descriptor.getAscent() : null) + .descent(descriptor != null ? descriptor.getDescent() : null) + .capHeight(descriptor != null ? 
descriptor.getCapHeight() : null) + .xHeight(descriptor != null ? descriptor.getXHeight() : null) + .italicAngle(descriptor != null ? descriptor.getItalicAngle() : null) + .unitsPerEm(extractUnitsPerEm(font)) .build(); } - private boolean preflightTextElements( + private PreflightResult preflightTextElements( PDDocument document, Map fontMap, List fontModels, @@ -356,11 +488,12 @@ public class PdfJsonConversionService { int pageNumber) throws IOException { if (elements == null || elements.isEmpty()) { - return false; + return PreflightResult.empty(); } - PDFont fallbackFont = fontMap.get(buildFontKey(-1, FALLBACK_FONT_ID)); - boolean fallbackApplied = false; + Set fallbackIds = new LinkedHashSet<>(); + boolean fallbackNeeded = false; + for (PdfJsonTextElement element : elements) { String text = Objects.toString(element.getText(), ""); if (text.isEmpty()) { @@ -368,57 +501,99 @@ public class PdfJsonConversionService { } PDFont font = fontMap.get(buildFontKey(pageNumber, element.getFontId())); - boolean encodable = false; - if (font != null) { - try { - font.encode(text); - encodable = true; - } catch (IOException | IllegalArgumentException ex) { - log.debug( - "Font {} missing glyphs for text '{}': {}", - element.getFontId(), - text, - ex.getMessage()); - } + if (font == null && element.getFontId() != null) { + font = fontMap.get(buildFontKey(-1, element.getFontId())); } - if (encodable) { + if (font == null) { + fallbackNeeded = true; + fallbackIds.add(FALLBACK_FONT_ID); continue; } - element.setFontId(FALLBACK_FONT_ID); - log.info( - "Assigning fallback font to text element on page {} (text='{}')", - pageNumber, - abbreviate(text)); - if (fallbackFont == null) { - fallbackFont = loadFallbackPdfFont(document); - fontMap.put(buildFontKey(-1, FALLBACK_FONT_ID), fallbackFont); - if (fontModels.stream().noneMatch(f -> FALLBACK_FONT_ID.equals(f.getId()))) { - fontModels.add(buildFallbackFontModel()); + if (!canEncodeFully(font, text)) { + fallbackNeeded = true; + for 
(int offset = 0; offset < text.length(); ) { + int codePoint = text.codePointAt(offset); + offset += Character.charCount(codePoint); + if (!canEncode(font, codePoint)) { + String fallbackId = resolveFallbackFontId(codePoint); + fallbackIds.add(fallbackId != null ? fallbackId : FALLBACK_FONT_ID); + } } } - fallbackApplied = true; } - return fallbackApplied; + + for (String fallbackId : fallbackIds) { + ensureFallbackFont(document, fontMap, fontModels, fallbackId); + } + + if (fallbackNeeded && fallbackIds.isEmpty()) { + fallbackIds.add(FALLBACK_FONT_ID); + ensureFallbackFont(document, fontMap, fontModels, FALLBACK_FONT_ID); + } + + return new PreflightResult(fallbackNeeded, fallbackIds); } private PdfJsonFont buildFallbackFontModel() throws IOException { - byte[] bytes = loadFallbackFontBytes(); + return buildFallbackFontModel(FALLBACK_FONT_ID); + } + + private PdfJsonFont buildFallbackFontModel(String fallbackId) throws IOException { + FallbackFontSpec spec = getFallbackFontSpec(fallbackId); + if (spec == null) { + throw new IOException("Unknown fallback font id " + fallbackId); + } + byte[] bytes = loadFallbackFontBytes(fallbackId, spec); String base64 = Base64.getEncoder().encodeToString(bytes); return PdfJsonFont.builder() - .id(FALLBACK_FONT_ID) - .uid(FALLBACK_FONT_ID) - .baseName("NotoSans-Regular") + .id(fallbackId) + .uid(fallbackId) + .baseName(spec.baseName()) .subtype("TrueType") .embedded(true) .program(base64) - .programFormat("ttf") + .programFormat(spec.format()) .build(); } - private void ensureFallbackResource(PDPage page, PDFont fallbackFont) { - if (fallbackFont == null) { + private FallbackFontSpec getFallbackFontSpec(String fallbackId) { + if (FALLBACK_FONT_ID.equals(fallbackId)) { + String baseName = inferBaseName(fallbackFontLocation, "NotoSans-Regular"); + String format = inferFormat(fallbackFontLocation, "ttf"); + return new FallbackFontSpec(fallbackFontLocation, baseName, format); + } + return BUILT_IN_FALLBACK_FONTS.get(fallbackId); + } + 
+ private String inferBaseName(String location, String defaultName) { + if (location == null || location.isBlank()) { + return defaultName; + } + int slash = location.lastIndexOf('/'); + String fileName = slash >= 0 ? location.substring(slash + 1) : location; + int dot = fileName.lastIndexOf('.'); + if (dot > 0) { + fileName = fileName.substring(0, dot); + } + return fileName.isEmpty() ? defaultName : fileName; + } + + private String inferFormat(String location, String defaultFormat) { + if (location == null || location.isBlank()) { + return defaultFormat; + } + int dot = location.lastIndexOf('.'); + if (dot >= 0 && dot < location.length() - 1) { + return location.substring(dot + 1).toLowerCase(Locale.ROOT); + } + return defaultFormat; + } + + private void ensureFallbackResources( + PDPage page, Set fallbackFontIds, Map fontMap) { + if (fallbackFontIds == null || fallbackFontIds.isEmpty()) { return; } PDResources resources = page.getResources(); @@ -426,40 +601,204 @@ public class PdfJsonConversionService { resources = new PDResources(); page.setResources(resources); } - COSName fallbackName = COSName.getPDFName(FALLBACK_FONT_ID); - boolean exists = false; - for (COSName name : resources.getFontNames()) { - if (fallbackName.equals(name)) { - exists = true; - break; + for (String fallbackId : fallbackFontIds) { + if (fallbackId == null) { + continue; + } + PDFont fallbackFont = fontMap.get(buildFontKey(-1, fallbackId)); + if (fallbackFont == null) { + continue; + } + COSName fallbackName = COSName.getPDFName(fallbackId); + boolean exists = false; + for (COSName name : resources.getFontNames()) { + if (fallbackName.equals(name)) { + exists = true; + break; + } + } + if (!exists) { + resources.put(fallbackName, fallbackFont); } - } - if (!exists) { - resources.put(fallbackName, fallbackFont); } } private PDFont loadFallbackPdfFont(PDDocument document) throws IOException { - byte[] bytes = loadFallbackFontBytes(); + return loadFallbackPdfFont(document, 
FALLBACK_FONT_ID); + } + + private PDFont loadFallbackPdfFont(PDDocument document, String fallbackId) throws IOException { + FallbackFontSpec spec = getFallbackFontSpec(fallbackId); + if (spec == null) { + throw new IOException("Unknown fallback font id " + fallbackId); + } + byte[] bytes = loadFallbackFontBytes(fallbackId, spec); try (InputStream stream = new ByteArrayInputStream(bytes)) { return PDType0Font.load(document, stream, true); } } - private byte[] loadFallbackFontBytes() throws IOException { - if (fallbackFontBytes == null) { - Resource resource = resourceLoader.getResource(fallbackFontLocation); - if (!resource.exists()) { - throw new IOException( - "Fallback font resource not found at " + fallbackFontLocation); - } - try (InputStream inputStream = resource.getInputStream(); - ByteArrayOutputStream baos = new ByteArrayOutputStream()) { - inputStream.transferTo(baos); - fallbackFontBytes = baos.toByteArray(); - } + private PDFont ensureFallbackFont( + PDDocument document, + Map fontMap, + List fontModels, + String fallbackId) + throws IOException { + String effectiveId = fallbackId != null ? 
fallbackId : FALLBACK_FONT_ID; + String key = buildFontKey(-1, effectiveId); + PDFont font = fontMap.get(key); + if (font != null) { + return font; } - return fallbackFontBytes; + PDFont loaded = loadFallbackPdfFont(document, effectiveId); + fontMap.put(key, loaded); + if (fontModels != null + && fontModels.stream().noneMatch(f -> effectiveId.equals(f.getId()))) { + fontModels.add(buildFallbackFontModel(effectiveId)); + } + return loaded; + } + + private byte[] loadFallbackFontBytes(String fallbackId, FallbackFontSpec spec) + throws IOException { + if (spec == null) { + throw new IOException("No fallback font specification for " + fallbackId); + } + byte[] cached = fallbackFontCache.get(fallbackId); + if (cached != null) { + return cached; + } + Resource resource = resourceLoader.getResource(spec.resourceLocation()); + if (!resource.exists()) { + throw new IOException("Fallback font resource not found at " + spec.resourceLocation()); + } + try (InputStream inputStream = resource.getInputStream(); + ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + inputStream.transferTo(baos); + byte[] bytes = baos.toByteArray(); + fallbackFontCache.put(fallbackId, bytes); + return bytes; + } + } + + private boolean canRunGhostscript() { + try { + return endpointConfiguration != null + && endpointConfiguration.isGroupEnabled("Ghostscript"); + } catch (Exception ex) { + log.debug("Ghostscript availability check failed: {}", ex.getMessage()); + return false; + } + } + + private TempFile normalizePdfFonts(Path sourcePath) throws IOException { + if (sourcePath == null || !Files.exists(sourcePath)) { + return null; + } + TempFile outputFile = new TempFile(tempFileManager, ".pdf"); + List command = new ArrayList<>(); + command.add("gs"); + command.add("-sDEVICE=pdfwrite"); + command.add("-dCompatibilityLevel=1.7"); + command.add("-dPDFSETTINGS=/prepress"); + command.add("-dEmbedAllFonts=true"); + command.add("-dSubsetFonts=true"); + command.add("-dCompressFonts=true"); + 
command.add("-dNOPAUSE"); + command.add("-dBATCH"); + command.add("-dQUIET"); + command.add("-o"); + command.add(outputFile.getAbsolutePath()); + command.add("-c"); + command.add("<> setdistillerparams"); + command.add("-f"); + command.add(sourcePath.toString()); + try { + ProcessExecutorResult result = + ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT) + .runCommandWithOutputHandling(command); + if (result.getRc() == 0 + && Files.exists(outputFile.getPath()) + && Files.size(outputFile.getPath()) > 0) { + return outputFile; + } + log.warn("Ghostscript normalization exited with code {}", result.getRc()); + } catch (InterruptedException ex) { + Thread.currentThread().interrupt(); + closeQuietly(outputFile); + throw new IOException("Ghostscript normalization interrupted", ex); + } catch (IOException ex) { + closeQuietly(outputFile); + throw ex; + } + + closeQuietly(outputFile); + return null; + } + + private byte[] convertCffProgramToTrueType(byte[] fontBytes) { + if (!cffConversionEnabled + || fontforgeCommand == null + || fontforgeCommand.isBlank() + || fontBytes == null + || fontBytes.length == 0) { + return null; + } + + try (TempFile inputFile = new TempFile(tempFileManager, ".cff"); + TempFile outputFile = new TempFile(tempFileManager, ".ttf")) { + Files.write(inputFile.getPath(), fontBytes); + + List command = new ArrayList<>(); + command.add(fontforgeCommand); + command.add("-lang=ff"); + command.add("-c"); + command.add( + "Open($1); SelectWorthOutputting(); SetFontOrder(2); Reencode(\"unicode\"); " + + "Generate($2); Close(); Quit()"); + command.add(inputFile.getAbsolutePath()); + command.add(outputFile.getAbsolutePath()); + + ProcessBuilder builder = new ProcessBuilder(command); + builder.redirectErrorStream(true); + Process process = builder.start(); + + StringBuilder output = new StringBuilder(); + Thread reader = + new Thread( + () -> { + try (BufferedReader br = + new BufferedReader( + new InputStreamReader( + process.getInputStream(), 
+ StandardCharsets.UTF_8))) { + String line; + while ((line = br.readLine()) != null) { + output.append(line).append('\n'); + } + } catch (IOException ignored) { + } + }); + reader.start(); + int exitCode = process.waitFor(); + reader.join(); + + if (exitCode == 0 && Files.exists(outputFile.getPath())) { + return Files.readAllBytes(outputFile.getPath()); + } + + log.warn( + "FontForge conversion exited with code {}: {}", + exitCode, + output.toString().trim()); + } catch (InterruptedException ex) { + Thread.currentThread().interrupt(); + log.warn("FontForge conversion interrupted"); + } catch (IOException ex) { + log.warn("FontForge conversion failed: {}", ex.getMessage()); + } + + return null; } private PdfJsonFontCidSystemInfo extractCidSystemInfo(COSDictionary fontDictionary) { @@ -599,7 +938,8 @@ public class PdfJsonConversionService { private List extractPages( PDDocument document, Map> textByPage, - Map> imagesByPage) + Map> imagesByPage, + Map> annotationsByPage) throws IOException { List pages = new ArrayList<>(); int pageIndex = 0; @@ -612,8 +952,13 @@ public class PdfJsonConversionService { pageModel.setRotation(page.getRotation()); pageModel.setTextElements(textByPage.getOrDefault(pageIndex + 1, new ArrayList<>())); pageModel.setImageElements(imagesByPage.getOrDefault(pageIndex + 1, new ArrayList<>())); - pageModel.setResources( - serializeCosValue(page.getCOSObject().getDictionaryObject(COSName.RESOURCES))); + pageModel.setAnnotations( + annotationsByPage.getOrDefault(pageIndex + 1, new ArrayList<>())); + // Serialize resources but exclude image XObject streams to avoid duplication with + // imageElements + COSBase resourcesBase = page.getCOSObject().getDictionaryObject(COSName.RESOURCES); + COSBase filteredResources = filterImageXObjectsFromResources(resourcesBase); + pageModel.setResources(serializeCosValue(filteredResources)); pageModel.setContentStreams(extractContentStreams(page)); pages.add(pageModel); pageIndex++; @@ -634,6 +979,215 @@ public 
class PdfJsonConversionService { return imagesByPage; } + private Map> collectAnnotations(PDDocument document) + throws IOException { + Map> annotationsByPage = new LinkedHashMap<>(); + int pageNumber = 1; + for (PDPage page : document.getPages()) { + List annotations = new ArrayList<>(); + for (PDAnnotation annotation : page.getAnnotations()) { + try { + PdfJsonAnnotation ann = new PdfJsonAnnotation(); + ann.setSubtype(annotation.getSubtype()); + ann.setContents(annotation.getContents()); + + PDRectangle rect = annotation.getRectangle(); + if (rect != null) { + ann.setRect( + List.of( + rect.getLowerLeftX(), + rect.getLowerLeftY(), + rect.getUpperRightX(), + rect.getUpperRightY())); + } + + COSName appearanceState = annotation.getAppearanceState(); + if (appearanceState != null) { + ann.setAppearanceState(appearanceState.getName()); + } + + if (annotation.getColor() != null) { + float[] colorComponents = annotation.getColor().getComponents(); + List colorList = new ArrayList<>(colorComponents.length); + for (float c : colorComponents) { + colorList.add(c); + } + ann.setColor(colorList); + } + + COSDictionary annotDict = annotation.getCOSObject(); + COSString title = (COSString) annotDict.getDictionaryObject(COSName.T); + if (title != null) { + ann.setAuthor(title.getString()); + } + + COSString subj = (COSString) annotDict.getDictionaryObject(COSName.SUBJ); + if (subj != null) { + ann.setSubject(subj.getString()); + } + + COSString creationDateStr = + (COSString) annotDict.getDictionaryObject(COSName.CREATION_DATE); + if (creationDateStr != null) { + try { + Calendar creationDate = + DateConverter.toCalendar(creationDateStr.getString()); + ann.setCreationDate(formatCalendar(creationDate)); + } catch (Exception e) { + log.debug( + "Failed to parse annotation creation date: {}", e.getMessage()); + } + } + + COSString modDateStr = (COSString) annotDict.getDictionaryObject(COSName.M); + if (modDateStr != null) { + try { + Calendar modDate = 
DateConverter.toCalendar(modDateStr.getString()); + ann.setModificationDate(formatCalendar(modDate)); + } catch (Exception e) { + log.debug( + "Failed to parse annotation modification date: {}", + e.getMessage()); + } + } + + // Store raw dictionary for lossless round-trip + ann.setRawData(serializeCosValue(annotDict)); + + annotations.add(ann); + } catch (Exception e) { + log.warn( + "Failed to extract annotation on page {}: {}", + pageNumber, + e.getMessage()); + } + } + if (!annotations.isEmpty()) { + annotationsByPage.put(pageNumber, annotations); + } + pageNumber++; + } + return annotationsByPage; + } + + private List collectFormFields(PDDocument document) { + List formFields = new ArrayList<>(); + PDAcroForm acroForm = document.getDocumentCatalog().getAcroForm(); + if (acroForm == null) { + return formFields; + } + + try { + for (PDField field : acroForm.getFields()) { + try { + PdfJsonFormField formField = new PdfJsonFormField(); + formField.setName(field.getFullyQualifiedName()); + formField.setPartialName(field.getPartialName()); + formField.setFieldType(field.getFieldType()); + formField.setValue(field.getValueAsString()); + + // Get default value from COS dictionary + COSBase dv = field.getCOSObject().getDictionaryObject(COSName.DV); + if (dv != null) { + if (dv instanceof COSString) { + formField.setDefaultValue(((COSString) dv).getString()); + } else if (dv instanceof COSName) { + formField.setDefaultValue(((COSName) dv).getName()); + } + } + + formField.setFlags(field.getFieldFlags()); + formField.setAlternateFieldName(field.getAlternateFieldName()); + formField.setMappingName(field.getMappingName()); + + // Find which page the field is on + PDAnnotationWidget widget = + field.getWidgets().isEmpty() ? 
null : field.getWidgets().get(0); + if (widget != null) { + PDPage fieldPage = widget.getPage(); + if (fieldPage != null) { + int pageNum = document.getPages().indexOf(fieldPage) + 1; + formField.setPageNumber(pageNum); + + PDRectangle rect = widget.getRectangle(); + if (rect != null) { + formField.setRect( + List.of( + rect.getLowerLeftX(), + rect.getLowerLeftY(), + rect.getUpperRightX(), + rect.getUpperRightY())); + } + } + } + + // Store raw dictionary for lossless round-trip + formField.setRawData(serializeCosValue(field.getCOSObject())); + + formFields.add(formField); + } catch (Exception e) { + log.warn( + "Failed to extract form field {}: {}", + field.getFullyQualifiedName(), + e.getMessage()); + } + } + } catch (Exception e) { + log.warn("Failed to extract form fields: {}", e.getMessage()); + } + + return formFields; + } + + /** + * Filters out image XObject streams from resources to avoid duplication with imageElements. + * Images are already captured in imageElements[] with their base64 data, so we don't need them + * in the resources dictionary. 
+ */ + private COSBase filterImageXObjectsFromResources(COSBase resourcesBase) { + if (!(resourcesBase instanceof COSDictionary)) { + return resourcesBase; + } + + // Clone the resources dictionary + COSDictionary resources = new COSDictionary((COSDictionary) resourcesBase); + + // Get the XObject dictionary + COSBase xobjectBase = resources.getDictionaryObject(COSName.XOBJECT); + if (!(xobjectBase instanceof COSDictionary)) { + return resources; + } + + COSDictionary xobjects = (COSDictionary) xobjectBase; + COSDictionary filteredXObjects = new COSDictionary(); + + // Copy all XObjects except images + for (COSName key : xobjects.keySet()) { + COSBase value = xobjects.getDictionaryObject(key); + if (value instanceof COSStream) { + COSStream stream = (COSStream) value; + COSName type = (COSName) stream.getDictionaryObject(COSName.TYPE); + COSName subtype = (COSName) stream.getDictionaryObject(COSName.SUBTYPE); + + // Skip if this is an Image XObject + if (COSName.XOBJECT.equals(type) && COSName.IMAGE.equals(subtype)) { + continue; + } + } + // Keep non-image XObjects (Form XObjects, etc.) 
+ filteredXObjects.setItem(key, value); + } + + // If all XObjects were images, remove the XObject entry entirely + if (filteredXObjects.keySet().isEmpty()) { + resources.removeItem(COSName.XOBJECT); + } else { + resources.setItem(COSName.XOBJECT, filteredXObjects); + } + + return resources; + } + private PdfJsonMetadata extractMetadata(PDDocument document) { PdfJsonMetadata metadata = new PdfJsonMetadata(); PDDocumentInformation info = document.getDocumentInformation(); @@ -709,6 +1263,93 @@ public class PdfJsonConversionService { } } + private void restoreAnnotations( + PDDocument document, PDPage page, List annotations) { + if (annotations == null || annotations.isEmpty()) { + return; + } + + for (PdfJsonAnnotation annModel : annotations) { + try { + // Restore from raw COS data if available for lossless round-trip + if (annModel.getRawData() != null) { + COSBase rawAnnot = deserializeCosValue(annModel.getRawData(), document); + if (rawAnnot instanceof COSDictionary) { + PDAnnotation annotation = + PDAnnotation.createAnnotation((COSDictionary) rawAnnot); + page.getAnnotations().add(annotation); + log.debug("Restored annotation from raw data: {}", annModel.getSubtype()); + continue; + } + } + + // Fallback: reconstruct from structured fields + // Note: This is simplified - full annotation reconstruction is complex + // Most use cases should rely on rawData for lossless round-trip + log.debug( + "Warning: Annotation {} has no rawData, basic reconstruction may lose information", + annModel.getSubtype()); + + } catch (Exception e) { + log.warn( + "Failed to restore annotation {}: {}", + annModel.getSubtype(), + e.getMessage()); + } + } + } + + private void restoreFormFields(PDDocument document, List formFields) { + if (formFields == null || formFields.isEmpty()) { + return; + } + + try { + PDAcroForm acroForm = document.getDocumentCatalog().getAcroForm(); + if (acroForm == null) { + acroForm = new PDAcroForm(document); + 
document.getDocumentCatalog().setAcroForm(acroForm); + } + + COSArray fieldsArray = + (COSArray) acroForm.getCOSObject().getDictionaryObject(COSName.FIELDS); + if (fieldsArray == null) { + fieldsArray = new COSArray(); + acroForm.getCOSObject().setItem(COSName.FIELDS, fieldsArray); + } + + for (PdfJsonFormField fieldModel : formFields) { + try { + // Restore from raw COS data if available for lossless round-trip + if (fieldModel.getRawData() != null) { + COSBase rawField = deserializeCosValue(fieldModel.getRawData(), document); + if (rawField instanceof COSDictionary) { + // Add the field dictionary directly to the fields array + fieldsArray.add(rawField); + log.debug( + "Restored form field from raw data: {}", fieldModel.getName()); + continue; + } + } + + // Fallback: reconstruct from structured fields + // Note: This is simplified - full field reconstruction is complex + log.debug( + "Warning: Form field {} has no rawData, basic reconstruction may lose information", + fieldModel.getName()); + + } catch (Exception e) { + log.warn( + "Failed to restore form field {}: {}", + fieldModel.getName(), + e.getMessage()); + } + } + } catch (Exception e) { + log.warn("Failed to restore form fields: {}", e.getMessage()); + } + } + private void applyPageResources( PDDocument document, PDPage page, PdfJsonCosValue resourcesModel) throws IOException { if (resourcesModel == null) { @@ -720,6 +1361,81 @@ public class PdfJsonConversionService { } } + /** + * Reconstructs image XObjects from imageElements when content streams are preserved. During + * serialization, image streams are filtered out from resources to avoid duplication. This + * method adds them back by scanning content streams for XObject references and matching them + * with imageElements by objectName. 
+ */ + private void reconstructImageXObjects( + PDDocument document, + PDPage page, + List contentStreams, + List imageElements) + throws IOException { + + // Build map of objectName -> imageElement + Map imageMap = new HashMap<>(); + for (PdfJsonImageElement img : imageElements) { + if (img.getObjectName() != null && !img.getObjectName().isBlank()) { + imageMap.put(img.getObjectName(), img); + } + } + + if (imageMap.isEmpty()) { + return; + } + + // Scan content streams for image XObject references + Set referencedXObjects = new HashSet<>(); + for (PDStream stream : contentStreams) { + try { + byte[] contentBytes = stream.toByteArray(); + PDFStreamParser parser = new PDFStreamParser(contentBytes); + List tokens = parser.parse(); + + for (int i = 0; i < tokens.size(); i++) { + Object token = tokens.get(i); + if (token instanceof Operator op + && OperatorName.DRAW_OBJECT.equals(op.getName())) { + if (i > 0 && tokens.get(i - 1) instanceof COSName name) { + referencedXObjects.add(name.getName()); + } + } + } + } catch (Exception e) { + log.warn("Failed to parse content stream for image references: {}", e.getMessage()); + } + } + + // Reconstruct referenced image XObjects + PDResources resources = page.getResources(); + if (resources == null) { + resources = new PDResources(); + page.setResources(resources); + } + + for (String xobjName : referencedXObjects) { + PdfJsonImageElement imageElement = imageMap.get(xobjName); + if (imageElement == null) { + log.warn( + "Content stream references image XObject '{}' but no matching imageElement found", + xobjName); + continue; + } + + try { + PDImageXObject image = createImageXObject(document, imageElement); + if (image != null) { + resources.put(COSName.getPDFName(xobjName), image); + log.debug("Reconstructed image XObject: {}", xobjName); + } + } catch (Exception e) { + log.warn("Failed to reconstruct image XObject '{}': {}", xobjName, e.getMessage()); + } + } + } + private List buildContentStreams( PDDocument document, List 
streamModels) throws IOException { List streams = new ArrayList<>(); @@ -959,6 +1675,7 @@ public class PdfJsonConversionService { List textElements, List imageElements, Map fontMap, + List fontModels, int pageNumber) throws IOException { List drawables = mergeDrawables(textElements, imageElements); @@ -974,50 +1691,39 @@ public class PdfJsonConversionService { if (element == null) { continue; } - PDFont font = fontMap.get(buildFontKey(pageNumber, element.getFontId())); - if (font == null && FALLBACK_FONT_ID.equals(element.getFontId())) { - font = fontMap.get(buildFontKey(-1, FALLBACK_FONT_ID)); - } - float fontScale = resolveFontMatrixSize(element); String text = Objects.toString(element.getText(), ""); - if (font != null) { - try { - font.encode(text); - } catch (IOException | IllegalArgumentException ex) { - log.debug( - "Edited text contains glyphs missing from font {} ({}), switching to fallback", - element.getFontId(), - ex.getMessage()); - font = fontMap.get(buildFontKey(-1, FALLBACK_FONT_ID)); - element.setFontId(FALLBACK_FONT_ID); - if (font == null) { - font = loadFallbackPdfFont(document); - fontMap.put(buildFontKey(-1, FALLBACK_FONT_ID), font); - } - } - } - if (font == null) { - element.setFontId(FALLBACK_FONT_ID); - font = fontMap.get(buildFontKey(-1, FALLBACK_FONT_ID)); - if (font == null) { - font = loadFallbackPdfFont(document); - fontMap.put(buildFontKey(-1, FALLBACK_FONT_ID), font); - } - } - if (!textOpen) { contentStream.beginText(); textOpen = true; } + PDFont baseFont = + fontMap.get(buildFontKey(pageNumber, element.getFontId())); + if (baseFont == null && element.getFontId() != null) { + baseFont = fontMap.get(buildFontKey(-1, element.getFontId())); + } + + float fontScale = resolveFontMatrixSize(element); + applyTextState(contentStream, element); - contentStream.setFont(font, fontScale); applyRenderingMode(contentStream, element.getRenderingMode()); applyTextMatrix(contentStream, element); - String sanitized = sanitizeForFont(font, text); - 
if (!sanitized.isEmpty()) { - contentStream.showText(sanitized); + + List runs = + buildFontRuns( + document, fontMap, fontModels, baseFont, text, element); + + PDFont activeFont = null; + for (FontRun run : runs) { + if (run == null || run.text().isEmpty()) { + continue; + } + if (run.font() != activeFont) { + contentStream.setFont(run.font(), fontScale); + activeFont = run.font(); + } + contentStream.showText(run.text()); } } case IMAGE -> { @@ -1039,39 +1745,156 @@ public class PdfJsonConversionService { } } - private String sanitizeForFont(PDFont font, String text) { + private List buildFontRuns( + PDDocument document, + Map fontMap, + List fontModels, + PDFont primaryFont, + String text, + PdfJsonTextElement element) + throws IOException { + List runs = new ArrayList<>(); if (text == null || text.isEmpty()) { - return ""; + return runs; + } + + PDFont baseFont = primaryFont; + if (baseFont == null) { + baseFont = ensureFallbackFont(document, fontMap, fontModels, FALLBACK_FONT_ID); + } + if (baseFont == null) { + log.warn("Unable to resolve a base font for text element; skipping text content"); + return runs; + } + + StringBuilder buffer = new StringBuilder(); + PDFont currentFont = baseFont; + + for (int offset = 0; offset < text.length(); ) { + int codePoint = text.codePointAt(offset); + offset += Character.charCount(codePoint); + String glyph = new String(Character.toChars(codePoint)); + PDFont targetFont = currentFont; + + if (!canEncode(baseFont, codePoint)) { + String fallbackId = resolveFallbackFontId(codePoint); + targetFont = ensureFallbackFont(document, fontMap, fontModels, fallbackId); + if (targetFont == null || !canEncode(targetFont, glyph)) { + String mapped = mapUnsupportedGlyph(codePoint); + if (mapped != null) { + if (canEncode(baseFont, mapped)) { + glyph = mapped; + targetFont = baseFont; + } else if (targetFont != null && canEncode(targetFont, mapped)) { + glyph = mapped; + } + } + } + if (targetFont == null || !canEncode(targetFont, glyph)) 
{ + glyph = "?"; + targetFont = + ensureFallbackFont(document, fontMap, fontModels, FALLBACK_FONT_ID); + if (targetFont == null || !canEncode(targetFont, glyph)) { + log.debug( + "Dropping unsupported glyph U+{} for text element", + Integer.toHexString(codePoint)); + continue; + } + } + if (targetFont != baseFont) { + log.trace( + "Using fallback font '{}' for code point U+{}", + targetFont.getName(), + Integer.toHexString(codePoint)); + } + } + + if (targetFont != currentFont) { + if (buffer.length() > 0) { + runs.add(new FontRun(currentFont, buffer.toString())); + buffer.setLength(0); + } + currentFont = targetFont; + } + buffer.append(glyph); + } + + if (buffer.length() > 0) { + runs.add(new FontRun(currentFont, buffer.toString())); + } + + return runs; + } + + private Integer extractUnitsPerEm(PDFont font) { + if (font == null) { + return null; + } + Matrix matrix = font.getFontMatrix(); + if (matrix != null) { + float scaleX = matrix.getScaleX(); + if (scaleX != 0f) { + int units = Math.round(Math.abs(1f / scaleX)); + if (units > 0 && units < 10_000) { + return units; + } + } + } + return 1000; + } + + private boolean canEncodeFully(PDFont font, String text) { + return canEncode(font, text); + } + + private boolean canEncode(PDFont font, int codePoint) { + return canEncode(font, new String(Character.toChars(codePoint))); + } + + private boolean canEncode(PDFont font, String text) { + if (font == null || text == null || text.isEmpty()) { + return false; + } + try { + font.encode(text); + return true; + } catch (IOException | IllegalArgumentException ex) { + return false; + } + } + + private String resolveFallbackFontId(int codePoint) { + Character.UnicodeBlock block = Character.UnicodeBlock.of(codePoint); + if (block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS + || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A + || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B + || block == 
Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C + || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D + || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E + || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F + || block == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION + || block == Character.UnicodeBlock.BOPOMOFO + || block == Character.UnicodeBlock.BOPOMOFO_EXTENDED + || block == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) { + return FALLBACK_FONT_CJK_ID; + } + + Character.UnicodeScript script = Character.UnicodeScript.of(codePoint); + switch (script) { + case HAN: + return FALLBACK_FONT_CJK_ID; + case HIRAGANA: + case KATAKANA: + return FALLBACK_FONT_JP_ID; + case HANGUL: + return FALLBACK_FONT_KR_ID; + case ARABIC: + return FALLBACK_FONT_AR_ID; + case THAI: + return FALLBACK_FONT_TH_ID; + default: + return FALLBACK_FONT_ID; } - StringBuilder builder = new StringBuilder(text.length()); - text.codePoints() - .forEach( - codePoint -> { - String candidate = new String(Character.toChars(codePoint)); - try { - font.encode(candidate); - builder.append(candidate); - return; - } catch (IOException | IllegalArgumentException ex) { - String mapped = mapUnsupportedGlyph(codePoint); - if (mapped != null) { - try { - font.encode(mapped); - builder.append(mapped); - return; - } catch (IOException | IllegalArgumentException ignore) { - // fall through to generic replacement - } - } - log.debug( - "Replacing unsupported glyph {} ({}) with '?' 
for font {}", - candidate, - String.format("U+%04X", codePoint), - font.getName()); - builder.append('?'); - } - }); - return builder.toString(); } private String mapUnsupportedGlyph(int codePoint) { @@ -1082,6 +1905,17 @@ public class PdfJsonConversionService { }; } + private void closeQuietly(TempFile tempFile) { + if (tempFile == null) { + return; + } + try { + tempFile.close(); + } catch (Exception ex) { + log.debug("Failed to close temporary file: {}", ex.getMessage()); + } + } + private void applyTextState(PDPageContentStream contentStream, PdfJsonTextElement element) throws IOException { if (element.getCharacterSpacing() != null) { @@ -1200,6 +2034,72 @@ public class PdfJsonConversionService { } } + private static final class PreflightResult { + private static final PreflightResult EMPTY = new PreflightResult(false, Set.of()); + + private final boolean usesFallback; + private final Set fallbackFontIds; + + private PreflightResult(boolean usesFallback, Set fallbackFontIds) { + this.usesFallback = usesFallback; + this.fallbackFontIds = fallbackFontIds != null ? 
Set.copyOf(fallbackFontIds) : Set.of(); + } + + private static PreflightResult empty() { + return EMPTY; + } + + private boolean usesFallback() { + return usesFallback; + } + + private Set fallbackFontIds() { + return fallbackFontIds; + } + } + + private static final class FallbackFontSpec { + private final String resourceLocation; + private final String baseName; + private final String format; + + private FallbackFontSpec(String resourceLocation, String baseName, String format) { + this.resourceLocation = resourceLocation; + this.baseName = baseName; + this.format = format; + } + + private String resourceLocation() { + return resourceLocation; + } + + private String baseName() { + return baseName; + } + + private String format() { + return format; + } + } + + private static final class FontRun { + private final PDFont font; + private final String text; + + private FontRun(PDFont font, String text) { + this.font = font; + this.text = text; + } + + private PDFont font() { + return font; + } + + private String text() { + return text; + } + } + private boolean rewriteTextOperators( PDDocument document, PDPage page, List elements) { if (elements == null || elements.isEmpty()) { @@ -1479,6 +2379,20 @@ public class PdfJsonConversionService { ? 
fontModel.getProgramFormat().toLowerCase(Locale.ROOT) : ""; try { + if (isCffFormat(format)) { + byte[] converted = convertCffProgramToTrueType(fontBytes); + if (converted != null) { + fontBytes = converted; + format = "ttf"; + log.debug( + "Converted CFF font {} to TrueType outlines for embedding", + fontModel.getId()); + } else { + log.debug( + "Unable to convert CFF font {} to TrueType; attempting direct load", + fontModel.getId()); + } + } if (isType1Format(format)) { try (InputStream stream = new ByteArrayInputStream(fontBytes)) { PDFont font = new PDType1Font(document, stream); @@ -1528,6 +2442,16 @@ public class PdfJsonConversionService { return "type1".equals(format) || format.endsWith("pfb"); } + private boolean isCffFormat(String format) { + if (format == null) { + return false; + } + String normalized = format.toLowerCase(Locale.ROOT); + return normalized.contains("type1c") + || normalized.contains("cidfonttype0c") + || "cff".equals(normalized); + } + private void applyAdditionalFontMetadata( PDDocument document, PDFont font, PdfJsonFont fontModel) throws IOException { if (fontModel.getToUnicode() != null && !fontModel.getToUnicode().isBlank()) { @@ -1593,9 +2517,13 @@ public class PdfJsonConversionService { float b = matrix.get(1); float c = matrix.get(2); float d = matrix.get(3); - float scale = (float) Math.max(Math.hypot(a, c), Math.hypot(b, d)); - if (scale > 0f) { - return scale; + float verticalScale = (float) Math.hypot(b, d); + if (verticalScale > 0f) { + return verticalScale; + } + float horizontalScale = (float) Math.hypot(a, c); + if (horizontalScale > 0f) { + return horizontalScale; } } return safeFloat(element.getFontSize(), 12f); @@ -1945,6 +2873,20 @@ public class PdfJsonConversionService { cache.put(cacheKey, image); } + List transform = element.getTransform(); + if (transform != null && transform.size() == 6) { + Matrix matrix = + new Matrix( + safeFloat(transform.get(0), 1f), + safeFloat(transform.get(1), 0f), + 
safeFloat(transform.get(2), 0f), + safeFloat(transform.get(3), 1f), + safeFloat(transform.get(4), 0f), + safeFloat(transform.get(5), 0f)); + contentStream.drawImage(image, matrix); + return; + } + float width = safeFloat(element.getWidth(), fallbackWidth(element)); float height = safeFloat(element.getHeight(), fallbackHeight(element)); if (width <= 0f) { diff --git a/app/core/src/main/resources/settings.yml.template b/app/core/src/main/resources/settings.yml.template index 6bf882685..fd389337e 100644 --- a/app/core/src/main/resources/settings.yml.template +++ b/app/core/src/main/resources/settings.yml.template @@ -168,6 +168,16 @@ system: startupCleanup: true # Clean up old temp files on startup cleanupSystemTemp: false # Whether to clean broader system temp directory +stirling: + pdf: + fallback-font: classpath:/static/fonts/NotoSans-Regular.ttf # Override to point at a custom fallback font + json: + font-normalization: + enabled: true # Run Ghostscript preflight to normalize fonts before PDF→JSON + cff-converter: + enabled: true # Attempt to transcode CFF/Type1C programs to TrueType (TTF) using FontForge when available + fontforge-command: fontforge # Override if FontForge is installed under a different name/path + ui: appName: '' # application's visible name homeDescription: '' # short description or tagline shown on the homepage diff --git a/docker/backend/Dockerfile b/docker/backend/Dockerfile index 58655dfdb..b154ec782 100644 --- a/docker/backend/Dockerfile +++ b/docker/backend/Dockerfile @@ -83,6 +83,8 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a gcompat \ libc6-compat \ libreoffice \ + ghostscript \ + fontforge \ # pdftohtml poppler-utils \ # OCR MY PDF (unpaper for descew and other advanced features) @@ -119,4 +121,4 @@ EXPOSE 8080/tcp # Set user and run command ENTRYPOINT ["tini", "--", "/scripts/init.sh"] -CMD ["sh", "-c", "java -Dfile.encoding=UTF-8 -Djava.io.tmpdir=/tmp/stirling-pdf -jar /app.jar & /opt/venv/bin/unoserver
--port 2003 --interface 127.0.0.1"] \ No newline at end of file +CMD ["sh", "-c", "java -Dfile.encoding=UTF-8 -Djava.io.tmpdir=/tmp/stirling-pdf -jar /app.jar & /opt/venv/bin/unoserver --port 2003 --interface 127.0.0.1"] diff --git a/docker/backend/Dockerfile.fat b/docker/backend/Dockerfile.fat index bd12e3063..25fa2a0b8 100644 --- a/docker/backend/Dockerfile.fat +++ b/docker/backend/Dockerfile.fat @@ -73,6 +73,8 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a gcompat \ libc6-compat \ libreoffice \ + ghostscript \ + fontforge \ # pdftohtml poppler-utils \ # OCR MY PDF (unpaper for descew and other advanced featues) @@ -109,4 +111,4 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a EXPOSE 8080/tcp # Set user and run command ENTRYPOINT ["tini", "--", "/scripts/init.sh"] -CMD ["sh", "-c", "java -Dfile.encoding=UTF-8 -Djava.io.tmpdir=/tmp/stirling-pdf -jar /app.jar & /opt/venv/bin/unoserver --port 2003 --interface 127.0.0.1"] \ No newline at end of file +CMD ["sh", "-c", "java -Dfile.encoding=UTF-8 -Djava.io.tmpdir=/tmp/stirling-pdf -jar /app.jar & /opt/venv/bin/unoserver --port 2003 --interface 127.0.0.1"] diff --git a/docker/backend/Dockerfile.ultra-lite b/docker/backend/Dockerfile.ultra-lite index 0b74e3b0a..e18e4a0b4 100644 --- a/docker/backend/Dockerfile.ultra-lite +++ b/docker/backend/Dockerfile.ultra-lite @@ -59,7 +59,9 @@ RUN echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /et curl \ shadow \ su-exec \ - openjdk21-jre && \ + openjdk21-jre \ + ghostscript \ + fontforge && \ # User permissions mkdir -p /configs /logs /customFiles /usr/share/fonts/opentype/noto /tmp/stirling-pdf /pipeline/watchedFolders /pipeline/finishedFolders && \ chmod +x /scripts/*.sh && \ diff --git a/docker/frontend/nginx.conf b/docker/frontend/nginx.conf index ffe913738..3be5ec900 100644 --- a/docker/frontend/nginx.conf +++ b/docker/frontend/nginx.conf @@ -24,7 +24,7 @@ http { index index.html 
index.htm; # Global settings for file uploads - client_max_body_size 100m; + client_max_body_size 0; # Handle client-side routing - support subpaths location / { @@ -48,12 +48,12 @@ http { proxy_cache off; # Timeout settings for large file uploads - proxy_connect_timeout 60s; - proxy_send_timeout 60s; - proxy_read_timeout 60s; - + proxy_connect_timeout 600s; + proxy_send_timeout 600s; + proxy_read_timeout 600s; + # Request size limits for file uploads - client_max_body_size 100m; + client_max_body_size 0; proxy_request_buffering off; } diff --git a/frontend/public/locales/en-GB/translation.json b/frontend/public/locales/en-GB/translation.json index d58014625..4ff3a2f4d 100644 --- a/frontend/public/locales/en-GB/translation.json +++ b/frontend/public/locales/en-GB/translation.json @@ -4031,6 +4031,7 @@ "fontSizeValue": "{{size}}pt", "noTextOnPage": "No editable text was detected on this page.", "emptyGroup": "[Empty Group]", + "imageLabel": "Placed image", "empty": { "title": "No document loaded", "subtitle": "Load a PDF or JSON file to begin editing text content." 
diff --git a/frontend/src/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx b/frontend/src/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx index c54e38bb3..f317042c5 100644 --- a/frontend/src/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx +++ b/frontend/src/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx @@ -1,10 +1,12 @@ import React, { useCallback, useEffect, useLayoutEffect, useMemo, useRef, useState } from 'react'; import { + ActionIcon, Alert, Badge, Box, Button, Card, + Collapse, Divider, FileButton, Group, @@ -21,10 +23,13 @@ import PictureAsPdfIcon from '@mui/icons-material/PictureAsPdfOutlined'; import AutorenewIcon from '@mui/icons-material/Autorenew'; import WarningAmberIcon from '@mui/icons-material/WarningAmber'; import UploadIcon from '@mui/icons-material/Upload'; +import ExpandMoreIcon from '@mui/icons-material/ExpandMore'; +import ExpandLessIcon from '@mui/icons-material/ExpandLess'; import { Rnd } from 'react-rnd'; import { PdfJsonEditorViewData, + PdfJsonFont, PdfJsonPage, } from '../../../tools/pdfJsonEditorTypes'; import { getImageBounds, pageDimensions } from '../../../tools/pdfJsonEditorUtils'; @@ -32,6 +37,68 @@ import { getImageBounds, pageDimensions } from '../../../tools/pdfJsonEditorUtil const MAX_RENDER_WIDTH = 820; const MIN_BOX_SIZE = 18; +const normalizeFontFormat = (format?: string | null): string => { + if (!format) { + return 'ttf'; + } + const lower = format.toLowerCase(); + if (lower.includes('woff2')) { + return 'woff2'; + } + if (lower.includes('woff')) { + return 'woff'; + } + if (lower.includes('otf')) { + return 'otf'; + } + if (lower.includes('cff')) { + return 'otf'; + } + return 'ttf'; +}; + +const getFontMimeType = (format: string): string => { + switch (format) { + case 'woff2': + return 'font/woff2'; + case 'woff': + return 'font/woff'; + case 'otf': + return 'font/otf'; + default: + return 'font/ttf'; + } +}; + +const getFontFormatHint = (format: string): string | null => { + switch (format) { +
case 'woff2': + return 'woff2'; + case 'woff': + return 'woff'; + case 'otf': + return 'opentype'; + case 'ttf': + return 'truetype'; + default: + return null; + } +}; + +const decodeBase64ToUint8Array = (value: string): Uint8Array => { + const binary = window.atob(value); + const bytes = new Uint8Array(binary.length); + for (let index = 0; index < binary.length; index += 1) { + bytes[index] = binary.charCodeAt(index); + } + return bytes; +}; + +const buildFontFamilyName = (font: PdfJsonFont): string => { + const base = (font.uid ?? font.id ?? 'font').toString(); + return `pdf-font-${base.replace(/[^a-zA-Z0-9_-]/g, '')}`; +}; + const getCaretOffset = (element: HTMLElement): number => { const selection = window.getSelection(); if (!selection || selection.rangeCount === 0 || !element.contains(selection.focusNode)) { @@ -85,11 +152,13 @@ const toCssBounds = ( bounds: { left: number; right: number; top: number; bottom: number }, ) => { const width = Math.max(bounds.right - bounds.left, 1); + // Note: This codebase uses inverted naming where bounds.bottom > bounds.top + // bounds.bottom = visually upper edge (larger Y in PDF coords) + // bounds.top = visually lower edge (smaller Y in PDF coords) const height = Math.max(bounds.bottom - bounds.top, 1); - // Add 20% buffer to width to account for padding and font rendering variations - const bufferedWidth = width * 1.2; - const scaledWidth = Math.max(bufferedWidth * scale, MIN_BOX_SIZE); + const scaledWidth = Math.max(width * scale, MIN_BOX_SIZE); const scaledHeight = Math.max(height * scale, MIN_BOX_SIZE / 2); + // Convert PDF's visually upper edge (bounds.bottom) to CSS top const top = Math.max(pageHeight - bounds.bottom, 0) * scale; return { @@ -105,6 +174,8 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { const [activeGroupId, setActiveGroupId] = useState(null); const [editingGroupId, setEditingGroupId] = useState(null); const [activeImageId, setActiveImageId] = useState(null); + const
[fontFamilies, setFontFamilies] = useState>(new Map()); + const [textGroupsExpanded, setTextGroupsExpanded] = useState(false); const containerRef = useRef(null); const editorRefs = useRef>(new Map()); const caretOffsetsRef = useRef>(new Map()); @@ -135,6 +206,10 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { if (!fontId || !pdfDocument?.fonts) { return 'sans-serif'; } + const loadedFamily = fontFamilies.get(fontId); + if (loadedFamily) { + return `'${loadedFamily}', sans-serif`; + } const font = pdfDocument.fonts.find((f) => f.id === fontId); if (!font) { return 'sans-serif'; @@ -161,10 +236,134 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { return 'Arial, Helvetica, sans-serif'; }; + const getLineHeightPx = (fontId: string | null | undefined, fontSizePx: number): number => { + if (fontSizePx <= 0) { + return fontSizePx; + } + const metrics = fontId ? fontMetrics.get(fontId) : undefined; + if (!metrics || metrics.unitsPerEm <= 0) { + return fontSizePx * 1.2; + } + const totalUnits = metrics.ascent - metrics.descent; + if (totalUnits <= 0) { + return fontSizePx * 1.2; + } + const lineHeight = (totalUnits / metrics.unitsPerEm) * fontSizePx; + return Math.max(lineHeight, fontSizePx * 1.05); + }; + + const getFontWeight = (fontId: string | null | undefined): number | 'normal' | 'bold' => { + if (!fontId || !pdfDocument?.fonts) { + return 'normal'; + } + const font = pdfDocument.fonts.find((f) => f.id === fontId); + if (!font) { + return 'normal'; + } + + // ForceBold is bit 19 in the PDF spec's 1-based numbering (value 262144 = 0x40000). Missing flags are treated as 0 so the name-based check below still runs for fonts without a descriptor. + const FORCE_BOLD_FLAG = 262144; + if (((font.fontDescriptorFlags ?? 0) & FORCE_BOLD_FLAG) !== 0) { + return 'bold'; + } + + // Also check if font name contains "Bold" + const fontName = font.standard14Name || font.baseName || ''; + if (fontName.toLowerCase().includes('bold')) { + return 'bold'; + } + + return 'normal'; + }; + const pages = pdfDocument?.pages ??
[]; const currentPage = pages[selectedPage] ?? null; const pageGroups = groupsByPage[selectedPage] ?? []; const pageImages = imagesByPage[selectedPage] ?? []; + + const fontMetrics = useMemo(() => { + const metrics = new Map(); + pdfDocument?.fonts?.forEach((font) => { + if (!font?.id) { + return; + } + const unitsPerEm = font.unitsPerEm && font.unitsPerEm > 0 ? font.unitsPerEm : 1000; + const ascent = font.ascent ?? unitsPerEm; + const descent = font.descent ?? -(unitsPerEm * 0.2); + metrics.set(font.id, { unitsPerEm, ascent, descent }); + }); + return metrics; + }, [pdfDocument?.fonts]); + + useEffect(() => { + if (typeof FontFace === 'undefined') { + setFontFamilies(new Map()); + return undefined; + } + + let disposed = false; + const active: { fontFace: FontFace; url?: string }[] = []; + + const registerFonts = async () => { + const fonts = pdfDocument?.fonts ?? []; + if (fonts.length === 0) { + setFontFamilies(new Map()); + return; + } + + const next = new Map(); + for (const font of fonts) { + if (!font?.id || !font.program) { + continue; + } + try { + const format = normalizeFontFormat(font.programFormat); + const data = decodeBase64ToUint8Array(font.program); + const blob = new Blob([data as BlobPart], { type: getFontMimeType(format) }); + const url = URL.createObjectURL(blob); + const formatHint = getFontFormatHint(format); + const familyName = buildFontFamilyName(font); + const source = formatHint ?
`url(${url}) format('${formatHint}')` : `url(${url})`; + const fontFace = new FontFace(familyName, source); + await fontFace.load().catch((loadError) => { URL.revokeObjectURL(url); throw loadError; }); /* release the blob URL when the browser rejects the font; the outer catch still swallows the failure */ + if (disposed) { + document.fonts.delete(fontFace); + URL.revokeObjectURL(url); + continue; + } + document.fonts.add(fontFace); + active.push({ fontFace, url }); + next.set(font.id, familyName); + } catch (error) { + // Silently ignore font loading failures - embedded PDF fonts often lack web font tables + // Fallback to web-safe fonts is already implemented via getFontFamily() + } + } + + if (!disposed) { + setFontFamilies(next); + } else { + active.forEach(({ fontFace, url }) => { + document.fonts.delete(fontFace); + if (url) { + URL.revokeObjectURL(url); + } + }); + } + }; + + registerFonts(); + + return () => { + disposed = true; + active.forEach(({ fontFace, url }) => { + document.fonts.delete(fontFace); + if (url) { + URL.revokeObjectURL(url); + } + }); + }; + }, [pdfDocument?.fonts]); const visibleGroups = useMemo( () => pageGroups.filter((group) => { @@ -419,25 +618,33 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { + {orderedImages.map((image, imageIndex) => { if (!image?.imageData) { return null; @@ -466,7 +673,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { setEditingGroupId(null); setActiveImageId(imageId); }} - onDrag={(event, data) => { + onDrag={(_event, data) => { emitImageTransform( imageId, data.x, @@ -475,7 +682,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { cssHeight, ); }} - onDragStop={(event, data) => { + onDragStop={(_event, data) => { emitImageTransform( imageId, data.x, @@ -489,7 +696,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { setActiveGroupId(null); setEditingGroupId(null); }} - onResize={(event, _direction, ref, _delta, position) => { + onResize={(_event, _direction, ref, _delta, position) => { const nextWidth = parseFloat(ref.style.width); const nextHeight = parseFloat(ref.style.height); emitImageTransform( @@
-500,7 +707,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { nextHeight, ); }} - onResizeStop={(event, _direction, ref, _delta, position) => { + onResizeStop={(_event, _direction, ref, _delta, position) => { const nextWidth = parseFloat(ref.style.width); const nextHeight = parseFloat(ref.style.height); emitImageTransform( @@ -567,21 +774,48 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { const baseFontSize = group.fontMatrixSize ?? group.fontSize ?? 12; const fontSizePx = Math.max(baseFontSize * scale, 6); const fontFamily = getFontFamily(group.fontId); + const lineHeightPx = getLineHeightPx(group.fontId, fontSizePx); + const lineHeightRatio = fontSizePx > 0 ? Math.max(lineHeightPx / fontSizePx, 1.05) : 1.2; + const hasRotation = group.rotation != null && Math.abs(group.rotation) > 0.5; + const baselineLength = group.baselineLength ?? Math.max(group.bounds.right - group.bounds.left, 0); - const visualHeight = Math.max(bounds.height, fontSizePx * 1.2); + let containerLeft = bounds.left; + let containerTop = bounds.top; + let containerWidth = Math.max(bounds.width, fontSizePx); + let containerHeight = Math.max(bounds.height, lineHeightPx); + let transform: string | undefined; + let transformOrigin: React.CSSProperties['transformOrigin']; + + if (hasRotation) { + const anchorX = group.anchor?.x ?? group.bounds.left; + const anchorY = group.anchor?.y ??
group.bounds.bottom; + containerLeft = anchorX * scale; + containerTop = Math.max(pageHeight - anchorY, 0) * scale; + containerWidth = Math.max(baselineLength * scale, MIN_BOX_SIZE); + containerHeight = Math.max(lineHeightPx, fontSizePx * lineHeightRatio); + transformOrigin = 'left bottom'; + // Negate rotation because Y-axis is flipped from PDF to web coordinates + transform = `rotate(${-group.rotation}deg)`; + } + + // Extract styling from group + const textColor = group.color || '#111827'; + const fontWeight = group.fontWeight || getFontWeight(group.fontId); const containerStyle: React.CSSProperties = { position: 'absolute', - left: `${bounds.left}px`, - top: `${bounds.top}px`, - width: `${bounds.width}px`, - height: `${visualHeight}px`, + left: `${containerLeft}px`, + top: `${containerTop}px`, + width: `${containerWidth}px`, + height: `${containerHeight}px`, display: 'flex', alignItems: 'flex-start', justifyContent: 'flex-start', pointerEvents: 'auto', cursor: 'text', zIndex: 2_000_000, + transform, + transformOrigin, }; if (isEditing) { @@ -628,17 +862,17 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { style={{ width: '100%', height: '100%', - padding: '3px 4px', + padding: 0, backgroundColor: 'rgba(255,255,255,0.95)', - color: '#111827', + color: textColor, fontSize: `${fontSizePx}px`, fontFamily, - lineHeight: 1.25, + fontWeight, + lineHeight: lineHeightRatio, outline: 'none', border: 'none', display: 'block', - whiteSpace: 'pre-wrap', - overflowWrap: 'anywhere', + whiteSpace: 'nowrap', cursor: 'text', overflow: 'visible', }} @@ -660,12 +894,13 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { style={{ width: '100%', minHeight: '100%', - padding: '2px 4px', - whiteSpace: 'pre-wrap', + padding: 0, + whiteSpace: 'nowrap', fontSize: `${fontSizePx}px`, fontFamily, - lineHeight: 1.25, - color: '#111827', + fontWeight, + lineHeight: lineHeightRatio, + color: textColor, display: 'block', cursor: 'text', overflow: 'visible',
@@ -682,6 +917,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { ); }) )} + @@ -689,48 +925,61 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { - {t('pdfJsonEditor.groupList', 'Detected Text Groups')} - - - - {visibleGroups.map((group) => { - const changed = group.text !== group.originalText; - return ( - setActiveGroupId(group.id)} - onMouseLeave={() => setActiveGroupId((current) => (current === group.id ? null : current))} - style={{ cursor: 'pointer' }} - onClick={() => { - setActiveGroupId(group.id); - setEditingGroupId(group.id); - }} - > - - - {changed && {t('pdfJsonEditor.badges.modified', 'Edited')}} - {group.fontId && ( - {group.fontId} - )} - {group.fontSize && ( - - {t('pdfJsonEditor.fontSizeValue', '{{size}}pt', { size: group.fontSize.toFixed(1) })} - - )} - - - {group.text || t('pdfJsonEditor.emptyGroup', '[Empty Group]')} - - - - ); - })} + + {t('pdfJsonEditor.groupList', 'Detected Text Groups')} + setTextGroupsExpanded(!textGroupsExpanded)} + aria-label={textGroupsExpanded ? 'Collapse' : 'Expand'} + > + {textGroupsExpanded ? : } + + + + + + + + {visibleGroups.map((group) => { + const changed = group.text !== group.originalText; + return ( + setActiveGroupId(group.id)} + onMouseLeave={() => setActiveGroupId((current) => (current === group.id ?
null : current))} + style={{ cursor: 'pointer' }} + onClick={() => { + setActiveGroupId(group.id); + setEditingGroupId(group.id); + }} + > + + + {changed && {t('pdfJsonEditor.badges.modified', 'Edited')}} + {group.fontId && ( + {group.fontId} + )} + {group.fontSize && ( + + {t('pdfJsonEditor.fontSizeValue', '{{size}}pt', { size: group.fontSize.toFixed(1) })} + + )} + + + {group.text || t('pdfJsonEditor.emptyGroup', '[Empty Group]')} + + + + ); + })} + + - + diff --git a/frontend/src/tools/pdfJsonEditorTypes.ts b/frontend/src/tools/pdfJsonEditorTypes.ts index ff991b916..c1da39656 100644 --- a/frontend/src/tools/pdfJsonEditorTypes.ts +++ b/frontend/src/tools/pdfJsonEditorTypes.ts @@ -23,6 +23,12 @@ export interface PdfJsonFont { toUnicode?: string | null; standard14Name?: string | null; fontDescriptorFlags?: number | null; + ascent?: number | null; + descent?: number | null; + capHeight?: number | null; + xHeight?: number | null; + italicAngle?: number | null; + unitsPerEm?: number | null; } export interface PdfJsonTextElement { @@ -117,6 +123,11 @@ export interface TextGroup { fontId?: string | null; fontSize?: number | null; fontMatrixSize?: number | null; + color?: string | null; + fontWeight?: number | 'normal' | 'bold' | null; + rotation?: number | null; + anchor?: { x: number; y: number } | null; + baselineLength?: number | null; elements: PdfJsonTextElement[]; originalElements: PdfJsonTextElement[]; text: string; diff --git a/frontend/src/tools/pdfJsonEditorUtils.ts b/frontend/src/tools/pdfJsonEditorUtils.ts index 425c6ba44..8085fe759 100644 --- a/frontend/src/tools/pdfJsonEditorUtils.ts +++ b/frontend/src/tools/pdfJsonEditorUtils.ts @@ -69,9 +69,15 @@ const getHeight = (element: PdfJsonTextElement): number => { const getElementBounds = (element: PdfJsonTextElement): BoundingBox => { const left = getX(element); const width = getWidth(element); - const bottom = getBaseline(element); + const baseline = getBaseline(element); const height = getHeight(element); -
const top = bottom - height; + // In PDF coordinates, baseline is where text sits + // Typical typography: ~80% of height above baseline (ascenders), ~20% below (descenders) + // Using codebase's inverted naming: bottom (visual top) > top (visual bottom) + const ascent = height * 0.8; + const descent = height * 0.2; + const bottom = baseline + ascent; // Visual top of text + const top = baseline - descent; // Visual bottom (includes descenders) return { left, right: left + width, @@ -181,6 +187,136 @@ const buildGroupText = (elements: PdfJsonTextElement[]): string => { return result; }; +const rgbToCss = (components: number[]): string => { + if (components.length >= 3) { + const r = Math.round(Math.max(0, Math.min(1, components[0])) * 255); + const g = Math.round(Math.max(0, Math.min(1, components[1])) * 255); + const b = Math.round(Math.max(0, Math.min(1, components[2])) * 255); + return `rgb(${r}, ${g}, ${b})`; + } + return 'rgb(0, 0, 0)'; +}; + +const cmykToCss = (components: number[]): string => { + if (components.length >= 4) { + const c = Math.max(0, Math.min(1, components[0])); + const m = Math.max(0, Math.min(1, components[1])); + const y = Math.max(0, Math.min(1, components[2])); + const k = Math.max(0, Math.min(1, components[3])); + const r = Math.round(255 * (1 - c) * (1 - k)); + const g = Math.round(255 * (1 - m) * (1 - k)); + const b = Math.round(255 * (1 - y) * (1 - k)); + return `rgb(${r}, ${g}, ${b})`; + } + return 'rgb(0, 0, 0)'; +}; + +const grayToCss = (components: number[]): string => { + if (components.length >= 1) { + const gray = Math.round(Math.max(0, Math.min(1, components[0])) * 255); + return `rgb(${gray}, ${gray}, ${gray})`; + } + return 'rgb(0, 0, 0)'; +}; + +const extractColor = (element: PdfJsonTextElement): string | null => { + const fillColor = element.fillColor; + if (!fillColor || !fillColor.components || fillColor.components.length === 0) { + return null; + } + + const colorSpace = (fillColor.colorSpace ??
'').toLowerCase(); + + if (colorSpace.includes('rgb') || colorSpace.includes('srgb')) { + return rgbToCss(fillColor.components); + } + if (colorSpace.includes('cmyk')) { + return cmykToCss(fillColor.components); + } + if (colorSpace.includes('gray') || colorSpace.includes('grey')) { + return grayToCss(fillColor.components); + } + + // Default to RGB interpretation + if (fillColor.components.length >= 3) { + return rgbToCss(fillColor.components); + } + if (fillColor.components.length === 1) { + return grayToCss(fillColor.components); + } + + return null; +}; + +const RAD_TO_DEG = 180 / Math.PI; + +const normalizeAngle = (angle: number): number => { + let normalized = angle % 360; + if (normalized > 180) { + normalized -= 360; + } else if (normalized <= -180) { + normalized += 360; + } + return normalized; +}; + +const extractElementRotation = (element: PdfJsonTextElement): number | null => { + const matrix = element.textMatrix; + if (!matrix || matrix.length !== 6) { + return null; + } + const a = matrix[0]; + const b = matrix[1]; + if (Math.abs(a) < 1e-6 && Math.abs(b) < 1e-6) { + return null; + } + const angle = Math.atan2(b, a) * RAD_TO_DEG; + if (Math.abs(angle) < 0.5) { + return null; + } + return normalizeAngle(angle); +}; + +const computeGroupRotation = (elements: PdfJsonTextElement[]): number | null => { + const angles = elements + .map(extractElementRotation) + .filter((angle): angle is number => angle !== null); + if (angles.length === 0) { + return null; + } + const vector = angles.reduce( + (acc, angle) => { + const radians = (angle * Math.PI) / 180; + acc.x += Math.cos(radians); + acc.y += Math.sin(radians); + return acc; + }, + { x: 0, y: 0 }, + ); + if (Math.abs(vector.x) < 1e-6 && Math.abs(vector.y) < 1e-6) { + return null; + } + const average = Math.atan2(vector.y, vector.x) * RAD_TO_DEG; + const normalized = normalizeAngle(average); + return Math.abs(normalized) < 0.5 ?
null : normalized; +}; + +const getAnchorPoint = (element: PdfJsonTextElement): { x: number; y: number } => { + if (element.textMatrix && element.textMatrix.length === 6) { + return { + x: valueOr(element.textMatrix[4]), + y: valueOr(element.textMatrix[5]), + }; + } + return { + x: valueOr(element.x), + y: valueOr(element.y), + }; +}; + +const computeBaselineLength = (elements: PdfJsonTextElement[]): number => + elements.reduce((acc, current) => acc + getWidth(current), 0); + const createGroup = ( pageIndex: number, idSuffix: number, @@ -189,13 +325,22 @@ const createGroup = ( const clones = elements.map(cloneTextElement); const originalClones = clones.map(cloneTextElement); const bounds = mergeBounds(elements.map(getElementBounds)); + const firstElement = elements[0]; + const rotation = computeGroupRotation(elements); + const anchor = rotation !== null ? getAnchorPoint(firstElement) : null; + const baselineLength = computeBaselineLength(elements); return { id: `${pageIndex}-${idSuffix}`, pageIndex, - fontId: elements[0]?.fontId, - fontSize: elements[0]?.fontSize, - fontMatrixSize: elements[0]?.fontMatrixSize, + fontId: firstElement?.fontId, + fontSize: firstElement?.fontSize, + fontMatrixSize: firstElement?.fontMatrixSize, + color: firstElement ? extractColor(firstElement) : null, + fontWeight: null, // Will be determined from font descriptor + rotation, + anchor, + baselineLength, elements: clones, originalElements: originalClones, text: buildGroupText(elements), @@ -253,7 +398,18 @@ export const groupPageTextElements = (page: PdfJsonPage | null | undefined, page const splitThreshold = Math.max(SPACE_MIN_GAP, avgFontSize * GAP_FACTOR); const sameFont = previous.fontId === element.fontId; - const shouldSplit = gap > splitThreshold * (sameFont ? 1.4 : 1.0); + let shouldSplit = gap > splitThreshold * (sameFont ?
1.4 : 1.0); + + const previousRotation = extractElementRotation(previous); + const currentRotation = extractElementRotation(element); + if ( + shouldSplit && + previousRotation !== null && + currentRotation !== null && + Math.abs(normalizeAngle(previousRotation - currentRotation)) < 1 + ) { + shouldSplit = false; + } if (shouldSplit) { groups.push(createGroup(pageIndex, groupCounter, currentBucket));