diff --git a/app/common/build.gradle b/app/common/build.gradle index f79ec6982..eeacdda71 100644 --- a/app/common/build.gradle +++ b/app/common/build.gradle @@ -37,6 +37,8 @@ dependencies { api 'com.drewnoakes:metadata-extractor:2.19.0' // Image metadata extractor api 'com.vladsch.flexmark:flexmark-html2md-converter:0.64.8' api "org.apache.pdfbox:pdfbox:$pdfboxVersion" + api "org.apache.pdfbox:xmpbox:$pdfboxVersion" + api "org.apache.pdfbox:preflight:$pdfboxVersion" api 'jakarta.servlet:jakarta.servlet-api:6.1.0' api 'org.snakeyaml:snakeyaml-engine:2.10' api "org.springdoc:springdoc-openapi-starter-webmvc-ui:2.8.9" diff --git a/app/core/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java b/app/common/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java similarity index 100% rename from app/core/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java rename to app/common/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java diff --git a/app/core/src/main/java/stirling/software/SPDF/config/swagger/ErrorResponse.java b/app/common/src/main/java/stirling/software/SPDF/config/swagger/ErrorResponse.java similarity index 100% rename from app/core/src/main/java/stirling/software/SPDF/config/swagger/ErrorResponse.java rename to app/common/src/main/java/stirling/software/SPDF/config/swagger/ErrorResponse.java diff --git a/app/core/src/main/java/stirling/software/SPDF/config/swagger/StandardPdfResponse.java b/app/common/src/main/java/stirling/software/SPDF/config/swagger/StandardPdfResponse.java similarity index 100% rename from app/core/src/main/java/stirling/software/SPDF/config/swagger/StandardPdfResponse.java rename to app/common/src/main/java/stirling/software/SPDF/config/swagger/StandardPdfResponse.java diff --git a/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPdfJsonController.java b/app/proprietary/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPdfJsonController.java similarity index 100% rename from app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPdfJsonController.java rename to app/proprietary/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPdfJsonController.java diff --git a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonAnnotation.java b/app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonAnnotation.java similarity index 100% rename from app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonAnnotation.java rename to app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonAnnotation.java diff --git a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonCosValue.java b/app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonCosValue.java similarity index 100% rename from app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonCosValue.java rename to app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonCosValue.java diff --git a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonDocument.java b/app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonDocument.java similarity index 100% rename from app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonDocument.java rename to app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonDocument.java diff --git a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonFont.java b/app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonFont.java similarity index 92% rename from app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonFont.java rename to app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonFont.java index 72a532d92..d8cae80b9 100644 --- a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonFont.java +++ b/app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonFont.java @@ -50,6 +50,12 @@ public class PdfJsonFont { /** Format hint for the webProgram payload. */ private String webProgramFormat; + /** PDF-friendly font program (e.g. converted TrueType) encoded as Base64. */ + private String pdfProgram; + + /** Format hint for the pdfProgram payload. */ + private String pdfProgramFormat; + /** ToUnicode stream encoded as Base64 when present. */ private String toUnicode; diff --git a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonFontCidSystemInfo.java b/app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonFontCidSystemInfo.java similarity index 100% rename from app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonFontCidSystemInfo.java rename to app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonFontCidSystemInfo.java diff --git a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonFormField.java b/app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonFormField.java similarity index 100% rename from app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonFormField.java rename to app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonFormField.java diff --git a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonImageElement.java b/app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonImageElement.java similarity index 100% rename from app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonImageElement.java rename to app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonImageElement.java diff --git a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonMetadata.java b/app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonMetadata.java similarity index 100% rename from app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonMetadata.java rename to app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonMetadata.java diff --git a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonPage.java b/app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonPage.java similarity index 100% rename from app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonPage.java rename to app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonPage.java diff --git a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonStream.java b/app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonStream.java similarity index 100% rename from app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonStream.java rename to app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonStream.java diff --git a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonTextColor.java b/app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonTextColor.java similarity index 100% rename from app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonTextColor.java rename to app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonTextColor.java diff --git a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonTextElement.java b/app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonTextElement.java similarity index 100% rename from app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonTextElement.java rename to app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonTextElement.java diff --git a/app/core/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java b/app/proprietary/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java similarity index 91% rename from app/core/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java rename to app/proprietary/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java index 7eeca8176..7f7b7a956 100644 --- a/app/core/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java +++ b/app/proprietary/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java @@ -16,6 +16,7 @@ import java.nio.file.Path; import java.time.Instant; import java.time.format.DateTimeParseException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Base64; import java.util.Calendar; import java.util.Collections; @@ -536,6 +537,8 @@ public class PdfJsonConversionService { .programFormat(programData != null ? programData.getFormat() : null) .webProgram(programData != null ? programData.getWebBase64() : null) .webProgramFormat(programData != null ? programData.getWebFormat() : null) + .pdfProgram(programData != null ? programData.getPdfBase64() : null) + .pdfProgramFormat(programData != null ? programData.getPdfFormat() : null) .toUnicode(toUnicode) .standard14Name(standard14Name) .fontDescriptorFlags(flags) @@ -832,7 +835,8 @@ public class PdfJsonConversionService { } else if ("fontforge".equalsIgnoreCase(cffConverterMethod)) { return convertCffUsingFontForge(fontBytes); } else { - log.warn("Unknown CFF converter method: {}, falling back to Python", cffConverterMethod); + log.warn( + "Unknown CFF converter method: {}, falling back to Python", cffConverterMethod); return convertCffUsingPython(fontBytes, toUnicode); } } @@ -848,7 +852,8 @@ public class PdfJsonConversionService { try (TempFile inputFile = new TempFile(tempFileManager, ".cff"); TempFile outputFile = new TempFile(tempFileManager, ".otf"); - TempFile toUnicodeFile = toUnicode != null ? new TempFile(tempFileManager, ".tounicode") : null) { + TempFile toUnicodeFile = + toUnicode != null ? new TempFile(tempFileManager, ".tounicode") : null) { Files.write(inputFile.getPath(), fontBytes); // Write ToUnicode CMap data if available @@ -926,7 +931,10 @@ public class PdfJsonConversionService { } else { String outputStr = output.toString().trim(); if (!outputStr.isEmpty()) { - log.warn("Python CFF→OTF wrapping failed with exit code {}: {}", exitCode, outputStr); + log.warn( + "Python CFF→OTF wrapping failed with exit code {}: {}", + exitCode, + outputStr); } else { log.warn("Python CFF→OTF wrapping failed with exit code {}", exitCode); } @@ -957,7 +965,7 @@ public class PdfJsonConversionService { command.add("-c"); command.add( "Open($1); " - + "ScaleToEm(1000); " // Force 1000 units per em (standard for Type1) + + "ScaleToEm(1000); " // Force 1000 units per em (standard for Type1) + "SelectWorthOutputting(); " + "SetFontOrder(2); " + "Reencode(\"unicode\"); " @@ -998,7 +1006,8 @@ public class PdfJsonConversionService { if (!finished) { process.destroyForcibly(); reader.interrupt(); - log.warn("FontForge conversion timed out after 30 seconds - font may be too complex or causing FontForge to hang"); + log.warn( + "FontForge conversion timed out after 30 seconds - font may be too complex or causing FontForge to hang"); return null; } @@ -1023,9 +1032,7 @@ public class PdfJsonConversionService { // Additional validation: check unitsPerEm in head table String validationError = validateFontTables(convertedBytes); if (validationError != null) { - log.warn( - "FontForge produced invalid font: {}", - validationError); + log.warn("FontForge produced invalid font: {}", validationError); return null; } @@ -1063,6 +1070,7 @@ public class PdfJsonConversionService { /** * Validates critical OpenType/TrueType font tables to ensure browser compatibility. + * * @return Error message if invalid, null if valid */ private String validateFontTables(byte[] fontBytes) { @@ -1081,22 +1089,25 @@ public class PdfJsonConversionService { int offset = 12; // Skip sfnt header for (int i = 0; i < numTables && offset + 16 <= fontBytes.length; i++) { String tag = new String(fontBytes, offset, 4, StandardCharsets.US_ASCII); - int tableOffset = ((fontBytes[offset + 8] & 0xFF) << 24) - | ((fontBytes[offset + 9] & 0xFF) << 16) - | ((fontBytes[offset + 10] & 0xFF) << 8) - | (fontBytes[offset + 11] & 0xFF); - int tableLength = ((fontBytes[offset + 12] & 0xFF) << 24) - | ((fontBytes[offset + 13] & 0xFF) << 16) - | ((fontBytes[offset + 14] & 0xFF) << 8) - | (fontBytes[offset + 15] & 0xFF); + int tableOffset = + ((fontBytes[offset + 8] & 0xFF) << 24) + | ((fontBytes[offset + 9] & 0xFF) << 16) + | ((fontBytes[offset + 10] & 0xFF) << 8) + | (fontBytes[offset + 11] & 0xFF); + int tableLength = + ((fontBytes[offset + 12] & 0xFF) << 24) + | ((fontBytes[offset + 13] & 0xFF) << 16) + | ((fontBytes[offset + 14] & 0xFF) << 8) + | (fontBytes[offset + 15] & 0xFF); if ("head".equals(tag)) { if (tableOffset + 18 > fontBytes.length) { return "head table truncated"; } // Check unitsPerEm at offset 18 in head table - int unitsPerEm = ((fontBytes[tableOffset + 18] & 0xFF) << 8) - | (fontBytes[tableOffset + 19] & 0xFF); + int unitsPerEm = + ((fontBytes[tableOffset + 18] & 0xFF) << 8) + | (fontBytes[tableOffset + 19] & 0xFF); if (unitsPerEm < 16 || unitsPerEm > 16384) { return "Invalid unitsPerEm: " + unitsPerEm + " (must be 16-16384)"; } @@ -1111,8 +1122,11 @@ public class PdfJsonConversionService { } private String buildUnicodeMapping(PDFont font, String toUnicodeBase64) throws IOException { - log.debug("buildUnicodeMapping called for font: {}, hasToUnicode: {}, isCID: {}", - font.getName(), toUnicodeBase64 != null, font instanceof PDType0Font); + log.debug( + "buildUnicodeMapping called for font: {}, hasToUnicode: {}, isCID: {}", + font.getName(), + toUnicodeBase64 != null, + font instanceof PDType0Font); if (toUnicodeBase64 == null || toUnicodeBase64.isBlank()) { log.debug("No ToUnicode data for font: {}", font.getName()); @@ -1135,7 +1149,8 @@ public class PdfJsonConversionService { String toUnicodeStr = new String(toUnicodeBytes, StandardCharsets.UTF_8); // Parse ToUnicode CMap for bfchar and bfrange - java.util.regex.Pattern bfcharPattern = java.util.regex.Pattern.compile("<([0-9A-Fa-f]+)>\\s*<([0-9A-Fa-f]+)>"); + java.util.regex.Pattern bfcharPattern = + java.util.regex.Pattern.compile("<([0-9A-Fa-f]+)>\\s*<([0-9A-Fa-f]+)>"); java.util.regex.Matcher matcher = bfcharPattern.matcher(toUnicodeStr); while (matcher.find()) { int charCode = Integer.parseInt(matcher.group(1), 16); @@ -1162,22 +1177,33 @@ public class PdfJsonConversionService { json.append(","); } first = false; - json.append(String.format("{\"code\":%d,\"cid\":%d,\"gid\":%d,\"unicode\":%d}", - charCode, cid, gid, unicode)); + json.append( + String.format( + "{\"code\":%d,\"cid\":%d,\"gid\":%d,\"unicode\":%d}", + charCode, cid, gid, unicode)); } catch (Exception e) { // Skip entries that fail to map - log.debug("Failed to map charCode {} in font {}: {}", charCode, font.getName(), e.getMessage()); + log.debug( + "Failed to map charCode {} in font {}: {}", + charCode, + font.getName(), + e.getMessage()); } } json.append("]}"); String jsonStr = json.toString(); - log.debug("Built Unicode mapping for CID font {} with {} entries", - font.getName(), charCodeToUnicode.size()); + log.debug( + "Built Unicode mapping for CID font {} with {} entries", + font.getName(), + charCodeToUnicode.size()); return Base64.getEncoder().encodeToString(jsonStr.getBytes(StandardCharsets.UTF_8)); } catch (Exception e) { - log.warn("Failed to build Unicode mapping for font {}: {}", font.getName(), e.getMessage()); + log.warn( + "Failed to build Unicode mapping for font {}: {}", + font.getName(), + e.getMessage()); return toUnicodeBase64; // Fall back to raw ToUnicode } } @@ -1214,7 +1240,8 @@ public class PdfJsonConversionService { PDStream fontFile3 = descriptor.getFontFile3(); if (fontFile3 != null) { String subtype = fontFile3.getCOSObject().getNameAsString(COSName.SUBTYPE); - return readFontProgram(fontFile3, subtype != null ? subtype : "fontfile3", false, toUnicode); + return readFontProgram( + fontFile3, subtype != null ? subtype : "fontfile3", false, toUnicode); } PDStream fontFile2 = descriptor.getFontFile2(); @@ -1231,7 +1258,8 @@ public class PdfJsonConversionService { } private FontProgramData readFontProgram( - PDStream stream, String formatHint, boolean detectTrueType, String toUnicode) throws IOException { + PDStream stream, String formatHint, boolean detectTrueType, String toUnicode) + throws IOException { try (InputStream inputStream = stream.createInputStream(); ByteArrayOutputStream baos = new ByteArrayOutputStream()) { inputStream.transferTo(baos); @@ -1242,22 +1270,72 @@ public class PdfJsonConversionService { } String webBase64 = null; String webFormat = null; + String pdfBase64 = null; + String pdfFormat = null; if (format != null && isCffFormat(format)) { - log.debug("Detected CFF font format: {}, wrapping as OpenType-CFF for web preview", format); + log.debug( + "Detected CFF font format: {}, wrapping as OpenType-CFF for web preview", + format); byte[] converted = convertCffProgramToTrueType(data, toUnicode); if (converted != null && converted.length > 0) { + String detectedFormat = detectFontFlavor(converted); webBase64 = Base64.getEncoder().encodeToString(converted); - webFormat = "otf"; - log.debug("CFF→OTF wrapping successful: {} bytes → {} bytes", data.length, converted.length); + webFormat = detectedFormat; + if ("ttf".equals(detectedFormat)) { + pdfBase64 = webBase64; + pdfFormat = detectedFormat; + } + log.debug( + "Primary CFF conversion successful: {} bytes → {} bytes (format: {})", + data.length, + converted.length, + detectedFormat); } else { log.debug("CFF→OTF wrapping returned null or empty result"); } + + if (pdfBase64 == null && cffConversionEnabled) { + byte[] ttfConverted = convertCffUsingFontForge(data); + if (ttfConverted != null && ttfConverted.length > 0) { + String detectedFormat = detectFontFlavor(ttfConverted); + if (detectedFormat != null) { + pdfBase64 = Base64.getEncoder().encodeToString(ttfConverted); + pdfFormat = detectedFormat; + log.debug( + "FontForge conversion produced {} bytes (format: {})", + ttfConverted.length, + detectedFormat); + if (webBase64 == null) { + webBase64 = pdfBase64; + webFormat = detectedFormat; + } + } + } + } } String base64 = Base64.getEncoder().encodeToString(data); - return new FontProgramData(base64, format, webBase64, webFormat); + return new FontProgramData(base64, format, webBase64, webFormat, pdfBase64, pdfFormat); } } + private String detectFontFlavor(byte[] fontBytes) { + if (fontBytes == null || fontBytes.length < 4) { + return null; + } + int magic = + ((fontBytes[0] & 0xFF) << 24) + | ((fontBytes[1] & 0xFF) << 16) + | ((fontBytes[2] & 0xFF) << 8) + | (fontBytes[3] & 0xFF); + if (magic == 0x4F54544F) { // 'OTTO' + return "otf"; + } + if (magic == 0x00010000 || magic == 0x74727565) { // 1.0 or 'true' + return "ttf"; + } + return null; + } + private String detectTrueTypeFormat(byte[] data) { if (data == null || data.length < 4) { return "ttf"; @@ -2344,6 +2422,7 @@ public class PdfJsonConversionService { PDPageContentStream contentStream, PdfJsonTextColor color, boolean nonStroking) throws IOException { if (color == null || color.getComponents() == null) { + log.trace("[ColorApply] Skipping null color for nonStroking={}", nonStroking); return; } float[] components = new float[color.getComponents().size()]; @@ -2351,6 +2430,11 @@ public class PdfJsonConversionService { components[i] = color.getComponents().get(i); } String space = color.getColorSpace(); + log.trace( + "[ColorApply] Requested color space={} components={} nonStroking={}", + space, + Arrays.toString(components), + nonStroking); if (space == null) { // Infer color space from component count PDColorSpace colorSpace; @@ -2404,7 +2488,7 @@ public class PdfJsonConversionService { } break; default: - log.debug("Skipping unsupported color space {}", space); + log.debug("[ColorApply] Skipping unsupported color space {}", space); } } @@ -2424,12 +2508,22 @@ public class PdfJsonConversionService { private final String format; private final String webBase64; private final String webFormat; + private final String pdfBase64; + private final String pdfFormat; - private FontProgramData(String base64, String format, String webBase64, String webFormat) { + private FontProgramData( + String base64, + String format, + String webBase64, + String webFormat, + String pdfBase64, + String pdfFormat) { this.base64 = base64; this.format = format; this.webBase64 = webBase64; this.webFormat = webFormat; + this.pdfBase64 = pdfBase64; + this.pdfFormat = pdfFormat; } private String getBase64() { @@ -2447,6 +2541,14 @@ public class PdfJsonConversionService { private String getWebFormat() { return webFormat; } + + private String getPdfBase64() { + return pdfBase64; + } + + private String getPdfFormat() { + return pdfFormat; + } } private static final class PreflightResult { @@ -2803,7 +2905,9 @@ public class PdfJsonConversionService { try { font.encode("A"); applyAdditionalFontMetadata(document, font, fontModel); - log.debug("Successfully restored embedded font {} from dictionary", fontModel.getId()); + log.debug( + "Successfully restored embedded font {} from dictionary", + fontModel.getId()); return font; } catch (IOException | IllegalArgumentException encodingEx) { log.warn( @@ -2821,73 +2925,157 @@ public class PdfJsonConversionService { } } - byte[] fontBytes = null; - String format = null; - - // For CFF/Type1C fonts, prefer the webProgram (converted TrueType) because: - // 1. PDFBox's PDType0Font.load() expects TrueType/OpenType format - // 2. Raw CFF program bytes lack the descriptor context needed for reconstruction - // 3. FontForge-converted TrueType is reliable for both web preview and PDF export String originalFormat = fontModel.getProgramFormat() != null ? fontModel.getProgramFormat().toLowerCase(Locale.ROOT) : null; - // For JSON→PDF conversion, always use original font bytes - // (PDFBox doesn't support OpenType-CFF; webProgram is only for frontend web preview) + String program = fontModel.getProgram(); - if (program != null && !program.isBlank()) { - fontBytes = Base64.getDecoder().decode(program); - format = originalFormat; - log.debug("Using original font program for {} (format: {})", fontModel.getId(), originalFormat); - } else if (fontModel.getWebProgram() != null && !fontModel.getWebProgram().isBlank()) { - // Fallback to webProgram if original program is unavailable - fontBytes = Base64.getDecoder().decode(fontModel.getWebProgram()); - format = - fontModel.getWebProgramFormat() != null - ? fontModel.getWebProgramFormat().toLowerCase(Locale.ROOT) - : null; - log.debug("Using web-optimized font program for {} (original program unavailable)", fontModel.getId()); + String webProgram = fontModel.getWebProgram(); + String pdfProgram = fontModel.getPdfProgram(); + String webFormat = + fontModel.getWebProgramFormat() != null + ? fontModel.getWebProgramFormat().toLowerCase(Locale.ROOT) + : null; + String pdfFormat = + fontModel.getPdfProgramFormat() != null + ? fontModel.getPdfProgramFormat().toLowerCase(Locale.ROOT) + : null; + + record FontByteSource(byte[] bytes, String format, String originLabel) {} + + List candidates = new ArrayList<>(); + List deferredWebCandidates = new ArrayList<>(); + + boolean hasPdfProgram = pdfProgram != null && !pdfProgram.isBlank(); + boolean hasWebProgram = webProgram != null && !webProgram.isBlank(); + + if (hasPdfProgram) { + try { + byte[] bytes = Base64.getDecoder().decode(pdfProgram); + if (bytes.length > 0) { + candidates.add(new FontByteSource(bytes, pdfFormat, "pdfProgram")); + } + } catch (IllegalArgumentException ex) { + log.warn( + "Failed to decode pdfProgram for {}: {}", + fontModel.getId(), + ex.getMessage()); + } } - if (fontBytes != null && fontBytes.length > 0) { + if (hasWebProgram) { + try { + byte[] bytes = Base64.getDecoder().decode(webProgram); + if (bytes.length > 0) { + // Prefer the converted blob when the original program is CFF/Type1C, because + // PDFBox expects TrueType/OpenType data during reconstruction. + boolean preferWeb = + originalFormat == null + || isCffFormat(originalFormat) + || "cidfonttype0c".equals(originalFormat); + FontByteSource source = new FontByteSource(bytes, webFormat, "webProgram"); + if (preferWeb) { + candidates.add(source); + } else { + // Keep the converted blob as a secondary option in case loading the + // original program fails: some PDFs mix Type1 metadata with actual CFF + // payloads that PDFBox cannot parse. + deferredWebCandidates.add(source); + } + } + } catch (IllegalArgumentException ex) { + log.warn( + "Failed to decode webProgram for {}: {}", + fontModel.getId(), + ex.getMessage()); + } + } + + if (program != null && !program.isBlank()) { + try { + byte[] bytes = Base64.getDecoder().decode(program); + if (bytes.length > 0) { + // Original bytes should still be attempted. When we already preferred the + // converted blob, these will be appended as fallback. + candidates.add(new FontByteSource(bytes, originalFormat, "program")); + } + } catch (IllegalArgumentException ex) { + log.warn( + "Failed to decode font program for {}: {}", + fontModel.getId(), + ex.getMessage()); + } + } + + // If no candidates were added (e.g. both payloads missing/invalid) attempt to fall back to + // the converted program when it exists but we skipped it earlier. + if (candidates.isEmpty() && hasWebProgram) { + try { + byte[] bytes = Base64.getDecoder().decode(webProgram); + if (bytes.length > 0) { + candidates.add(new FontByteSource(bytes, webFormat, "webProgram")); + } + } catch (IllegalArgumentException ignored) { + // Already logged above when decoding failed the first time. + } + } + + candidates.addAll(deferredWebCandidates); + + for (FontByteSource source : candidates) { + byte[] fontBytes = source.bytes(); + String format = source.format(); + String originLabel = source.originLabel(); + + if (fontBytes == null || fontBytes.length == 0) { + continue; + } + try { if (isType1Format(format)) { try (InputStream stream = new ByteArrayInputStream(fontBytes)) { PDFont font = new PDType1Font(document, stream); applyAdditionalFontMetadata(document, font, fontModel); log.debug( - "Successfully loaded Type1 font {} from program bytes (format: {}, originalFormat: {})", + "Successfully loaded Type1 font {} from {} bytes (format: {}, originalFormat: {})", fontModel.getId(), + originLabel, format, originalFormat); return font; } } + try (InputStream stream = new ByteArrayInputStream(fontBytes)) { PDFont font = PDType0Font.load(document, stream, true); applyAdditionalFontMetadata(document, font, fontModel); log.debug( - "Successfully loaded Type0 font {} from program bytes (format: {}, originalFormat: {})", + "Successfully loaded Type0 font {} from {} bytes (format: {}, originalFormat: {})", fontModel.getId(), + originLabel, format, originalFormat); return font; } } catch (IOException ex) { log.warn( - "Unable to load embedded font program for {} (format: {}, originalFormat: {}): {}; falling back to Standard 14 or default", + "Unable to load embedded font program for {} from {} (format: {}, originalFormat: {}): {}", fontModel.getId(), + originLabel, format, originalFormat, ex.getMessage()); } - } else { - log.warn( - "Font {} has no program bytes available (originalFormat: {})", - fontModel.getId(), - originalFormat); } + log.warn( + "Font {} has no usable program bytes (originalFormat: {}, hasWebProgram: {}, hasPdfProgram: {})", + fontModel.getId(), + originalFormat, + hasWebProgram, + hasPdfProgram); + String standardName = fontModel.getStandard14Name(); if (standardName != null) { try { @@ -3506,6 +3694,15 @@ public class PdfJsonConversionService { element.setStrokeColor(toTextColor(graphicsState.getStrokingColor())); } element.setZOrder(1_000_000 + pageElements.size()); + if (log.isTraceEnabled()) { + log.trace( + "[TextCapture] text='{}' font={} size={} fill={} stroke={}", + sanitizeForLog(element.getText()), + fontId, + element.getFontSizeInPt(), + describeColor(element.getFillColor()), + describeColor(element.getStrokeColor())); + } pageElements.add(element); } } @@ -3557,17 +3754,48 @@ public class PdfJsonConversionService { } PDColorSpace colorSpace = color.getColorSpace(); if (colorSpace == null) { + log.debug("[ColorCapture] No color space for PDColor {}", color); return null; } float[] components = color.getComponents(); - List values = new ArrayList<>(components.length); - for (float component : components) { + String colorSpaceName = colorSpace.getName(); + log.trace( + "[ColorCapture] Raw color space={} components={}", + colorSpaceName, + Arrays.toString(components)); + float[] effective = components; + try { + float[] rgb = colorSpace.toRGB(components); + if (rgb != null && rgb.length >= 3) { + effective = rgb; + colorSpaceName = COSName.DEVICERGB.getName(); + } + } catch (IOException ex) { + log.debug( + "[ColorCapture] Failed to convert color space {} to RGB: {}", + colorSpaceName, + ex.getMessage()); + } + List values = new ArrayList<>(effective.length); + for (float component : effective) { values.add(component); } - return PdfJsonTextColor.builder() - .colorSpace(colorSpace.getName()) - .components(values) - .build(); + log.trace("[ColorCapture] Stored color space={} components={}", colorSpaceName, values); + return PdfJsonTextColor.builder().colorSpace(colorSpaceName).components(values).build(); + } + + private String sanitizeForLog(String value) { + if (value == null) { + return "null"; + } + return value.replace("\n", "\\n").replace("\r", "\\r"); + } + + private String describeColor(PdfJsonTextColor color) { + if (color == null || color.getComponents() == null) { + return "null"; + } + return color.getColorSpace() + "=" + color.getComponents(); } } diff --git a/docker/backend/Dockerfile b/docker/backend/Dockerfile index b154ec782..44e0288f9 100644 --- a/docker/backend/Dockerfile +++ b/docker/backend/Dockerfile @@ -28,8 +28,7 @@ FROM alpine:3.22.1@sha256:4bcff63911fcb4448bd4fdacec207030997caf25e9bea4045fa6c8 # Copy necessary files COPY scripts /scripts COPY app/core/src/main/resources/static/fonts/*.ttf /usr/share/fonts/opentype/noto/ -# first /app directory is for the build stage, second is for the final image -COPY --from=build /app/app/core/build/libs/*.jar app.jar + ARG VERSION_TAG @@ -114,9 +113,12 @@ RUN echo "@main https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/a chmod +x /scripts/init.sh && \ # User permissions addgroup -S stirlingpdfgroup && adduser -S stirlingpdfuser -G stirlingpdfgroup && \ - chown -R stirlingpdfuser:stirlingpdfgroup $HOME /scripts /pipeline /usr/share/fonts/opentype/noto /configs /customFiles /pipeline /tmp/stirling-pdf && \ - chown stirlingpdfuser:stirlingpdfgroup /app.jar + chown -R stirlingpdfuser:stirlingpdfgroup $HOME /scripts /pipeline /usr/share/fonts/opentype/noto /configs /customFiles /pipeline /tmp/stirling-pdf + +COPY --from=build /app/app/core/build/libs/*.jar app.jar + +RUN chown stirlingpdfuser:stirlingpdfgroup /app.jar EXPOSE 8080/tcp # Set user and run command diff --git a/docker/compose/docker-compose.yml b/docker/compose/docker-compose.yml index 713bfec79..69ebc7996 100644 --- a/docker/compose/docker-compose.yml +++ b/docker/compose/docker-compose.yml @@ -29,6 +29,12 @@ services: METRICS_ENABLED: "true" SYSTEM_GOOGLEVISIBILITY: "true" SHOW_SURVEY: "true" + STIRLING_PDF_JSON_FONT_NORMALIZATION_ENABLED: "false" + STIRLING_PDF_JSON_CFF_CONVERTER_ENABLED: "true" + STIRLING_PDF_JSON_CFF_CONVERTER_METHOD: python + STIRLING_PDF_JSON_CFF_CONVERTER_PYTHON_COMMAND: /opt/venv/bin/python3 + STIRLING_PDF_JSON_CFF_CONVERTER_PYTHON_SCRIPT: /scripts/convert_cff_to_ttf.py + LOGGING_LEVEL_stirling.software.SPDF.service.PdfJsonConversionService: TRACE networks: - stirling-network diff --git a/frontend/src/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx b/frontend/src/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx index 5fb712b63..5569e5966 100644 --- a/frontend/src/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx +++ b/frontend/src/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx @@ -170,6 +170,33 @@ const toCssBounds = ( }; }; +const normalizePageNumber = (pageIndex: number | null | undefined): number | null => { + if (pageIndex === null || pageIndex === undefined || Number.isNaN(pageIndex)) { + return null; + } + return pageIndex + 1; +}; + +const buildFontLookupKeys = ( + fontId: string, + font: PdfJsonFont | null | undefined, + pageIndex: number | null | undefined, +): string[] => { + const keys: string[] = []; + const pageNumber = normalizePageNumber(pageIndex); + if (pageNumber !== null) { + keys.push(`${pageNumber}:${fontId}`); + } + if (font?.uid) { + keys.push(font.uid); + } + if (font?.pageNumber !== null && font?.pageNumber !== undefined && font?.id) { + keys.push(`${font.pageNumber}:${font.id}`); + } + keys.push(fontId); + return Array.from(new Set(keys.filter((value) => value && value.length > 0))); +}; + const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { const { t } = useTranslation(); const [activeGroupId, setActiveGroupId] = useState(null); @@ -203,22 +230,45 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { onGeneratePdf, } = data; - const getFontFamily = (fontId: string | null | undefined): string => { + const resolveFont = (fontId: string | null | undefined, pageIndex: number | null | undefined): PdfJsonFont | null => { if (!fontId || !pdfDocument?.fonts) { - return 'sans-serif'; + return null; } - const loadedFamily = fontFamilies.get(fontId); - if (loadedFamily) { - return `'${loadedFamily}', sans-serif`; + const fonts = pdfDocument.fonts; + const pageNumber = normalizePageNumber(pageIndex); + if (pageNumber !== null) { + const pageMatch = fonts.find((font) => font?.id === fontId && font?.pageNumber === pageNumber); + if (pageMatch) { + return pageMatch; + } + const uidKey = `${pageNumber}:${fontId}`; + const uidMatch = fonts.find((font) => font?.uid === uidKey); + if (uidMatch) { + return uidMatch; + } } - const font = pdfDocument.fonts.find((f) => f.id === fontId); - if (!font) { + const directUid = fonts.find((font) => font?.uid === fontId); + if (directUid) { + return directUid; + } + return fonts.find((font) => font?.id === fontId) ?? null; + }; + + const getFontFamily = (fontId: string | null | undefined, pageIndex: number | null | undefined): string => { + if (!fontId) { return 'sans-serif'; } - // Map PDF fonts to web-safe fonts based on name - // Note: Embedded font data from PDFs often lacks tables required for web rendering (OS/2 table) - const fontName = font.standard14Name || font.baseName || ''; + const font = resolveFont(fontId, pageIndex); + const lookupKeys = buildFontLookupKeys(fontId, font ?? undefined, pageIndex); + for (const key of lookupKeys) { + const loadedFamily = fontFamilies.get(key); + if (loadedFamily) { + return `'${loadedFamily}', sans-serif`; + } + } + + const fontName = font?.standard14Name || font?.baseName || ''; const lowerName = fontName.toLowerCase(); if (lowerName.includes('times')) { @@ -237,27 +287,89 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { return 'Arial, Helvetica, sans-serif'; }; - const getLineHeightPx = (fontId: string | null | undefined, fontSizePx: number): number => { + const getFontMetricsFor = ( + fontId: string | null | undefined, + pageIndex: number | null | undefined, + ): { unitsPerEm: number; ascent: number; descent: number } | undefined => { + if (!fontId) { + return undefined; + } + const font = resolveFont(fontId, pageIndex); + const lookupKeys = buildFontLookupKeys(fontId, font ?? undefined, pageIndex); + for (const key of lookupKeys) { + const metrics = fontMetrics.get(key); + if (metrics) { + return metrics; + } + } + return undefined; + }; + + const getLineHeightPx = ( + fontId: string | null | undefined, + pageIndex: number | null | undefined, + fontSizePx: number, + ): number => { if (fontSizePx <= 0) { return fontSizePx; } - const metrics = fontId ? fontMetrics.get(fontId) : undefined; + const metrics = getFontMetricsFor(fontId, pageIndex); if (!metrics || metrics.unitsPerEm <= 0) { return fontSizePx * 1.2; } - const totalUnits = metrics.ascent - metrics.descent; + const unitsPerEm = metrics.unitsPerEm > 0 ? metrics.unitsPerEm : 1000; + const ascentUnits = metrics.ascent ?? unitsPerEm; + const descentUnits = Math.abs(metrics.descent ?? -(unitsPerEm * 0.2)); + const totalUnits = Math.max(unitsPerEm, ascentUnits + descentUnits); if (totalUnits <= 0) { return fontSizePx * 1.2; } - const lineHeight = (totalUnits / metrics.unitsPerEm) * fontSizePx; + const lineHeight = (totalUnits / unitsPerEm) * fontSizePx; return Math.max(lineHeight, fontSizePx * 1.05); }; - const getFontWeight = (fontId: string | null | undefined): number | 'normal' | 'bold' => { - if (!fontId || !pdfDocument?.fonts) { + const getFontGeometry = ( + fontId: string | null | undefined, + pageIndex: number | null | undefined, + ): { + unitsPerEm: number; + ascentUnits: number; + descentUnits: number; + totalUnits: number; + ascentRatio: number; + descentRatio: number; + } | undefined => { + const metrics = getFontMetricsFor(fontId, pageIndex); + if (!metrics) { + return undefined; + } + const unitsPerEm = metrics.unitsPerEm > 0 ? metrics.unitsPerEm : 1000; + const rawAscent = metrics.ascent ?? unitsPerEm; + const rawDescent = metrics.descent ?? -(unitsPerEm * 0.2); + const ascentUnits = Number.isFinite(rawAscent) ? rawAscent : unitsPerEm; + const descentUnits = Number.isFinite(rawDescent) ? Math.abs(rawDescent) : unitsPerEm * 0.2; + const totalUnits = Math.max(unitsPerEm, ascentUnits + descentUnits); + if (totalUnits <= 0 || !Number.isFinite(totalUnits)) { + return undefined; + } + return { + unitsPerEm, + ascentUnits, + descentUnits, + totalUnits, + ascentRatio: ascentUnits / totalUnits, + descentRatio: descentUnits / totalUnits, + }; + }; + + const getFontWeight = ( + fontId: string | null | undefined, + pageIndex: number | null | undefined, + ): number | 'normal' | 'bold' => { + if (!fontId) { return 'normal'; } - const font = pdfDocument.fonts.find((f) => f.id === fontId); + const font = resolveFont(fontId, pageIndex); if (!font || !font.fontDescriptorFlags) { return 'normal'; } @@ -291,7 +403,14 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { const unitsPerEm = font.unitsPerEm && font.unitsPerEm > 0 ? font.unitsPerEm : 1000; const ascent = font.ascent ?? unitsPerEm; const descent = font.descent ?? -(unitsPerEm * 0.2); - metrics.set(font.id, { unitsPerEm, ascent, descent }); + const metric = { unitsPerEm, ascent, descent }; + metrics.set(font.id, metric); + if (font.uid) { + metrics.set(font.uid, metric); + } + if (font.pageNumber !== null && font.pageNumber !== undefined) { + metrics.set(`${font.pageNumber}:${font.id}`, metric); + } }); return metrics; }, [pdfDocument?.fonts]); @@ -313,18 +432,45 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { } const next = new Map(); + const pickFontSource = ( + font: PdfJsonFont + ): { data: string; format?: string | null; source: 'pdfProgram' | 'webProgram' | 'program' } | null => { + if (font.pdfProgram && font.pdfProgram.length > 0) { + return { data: font.pdfProgram, format: font.pdfProgramFormat, source: 'pdfProgram' }; + } + if (font.webProgram && font.webProgram.length > 0) { + return { data: font.webProgram, format: font.webProgramFormat, source: 'webProgram' }; + } + if (font.program && font.program.length > 0) { + return { data: font.program, format: font.programFormat, source: 'program' }; + } + return null; + }; + + const registerLoadedFontKeys = (font: PdfJsonFont, familyName: string) => { + if (font.id) { + next.set(font.id, familyName); + } + if (font.uid) { + next.set(font.uid, familyName); + } + if (font.pageNumber !== null && font.pageNumber !== undefined && font.id) { + next.set(`${font.pageNumber}:${font.id}`, familyName); + } + }; + for (const font of fonts) { - if (!font?.id) { + if (!font || !font.id) { continue; } - const programSource = font.webProgram && font.webProgram.length > 0 ? font.webProgram : font.program; - if (!programSource) { + const selection = pickFontSource(font); + if (!selection) { continue; } try { - const formatSource = font.webProgram && font.webProgram.length > 0 ? font.webProgramFormat : font.programFormat; + const formatSource = selection.format; const format = normalizeFontFormat(formatSource); - const data = decodeBase64ToUint8Array(programSource); + const data = decodeBase64ToUint8Array(selection.data); const blob = new Blob([data as BlobPart], { type: getFontMimeType(format) }); const url = URL.createObjectURL(blob); const formatHint = getFontFormatHint(format); @@ -332,12 +478,13 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { const source = formatHint ? `url(${url}) format('${formatHint}')` : `url(${url})`; const fontFace = new FontFace(familyName, source); - console.debug(`[FontLoader] Loading font ${font.id} (${font.baseName}):`, { + console.debug(`[FontLoader] Loading font ${font.id} (${font.baseName}) using ${selection.source}:`, { formatSource, format, formatHint, familyName, dataLength: data.length, + hasPdfProgram: !!font.pdfProgram, hasWebProgram: !!font.webProgram, hasProgram: !!font.program }); @@ -350,12 +497,13 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { } document.fonts.add(fontFace); active.push({ fontFace, url }); - next.set(font.id, familyName); + registerLoadedFontKeys(font, familyName); console.debug(`[FontLoader] Successfully loaded font ${font.id}`); } catch (error) { - console.warn(`[FontLoader] Failed to load font ${font.id} (${font.baseName}):`, { + console.warn(`[FontLoader] Failed to load font ${font.id} (${font.baseName}) using ${selection.source}:`, { error: error instanceof Error ? error.message : String(error), - formatSource: font.webProgram && font.webProgram.length > 0 ? font.webProgramFormat : font.programFormat, + formatSource: selection.format, + hasPdfProgram: !!font.pdfProgram, hasWebProgram: !!font.webProgram, hasProgram: !!font.program }); @@ -796,12 +944,19 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { const isEditing = editingGroupId === group.id; const baseFontSize = group.fontMatrixSize ?? group.fontSize ?? 12; const fontSizePx = Math.max(baseFontSize * scale, 6); - const fontFamily = getFontFamily(group.fontId); - const lineHeightPx = getLineHeightPx(group.fontId, fontSizePx); - const lineHeightRatio = fontSizePx > 0 ? Math.max(lineHeightPx / fontSizePx, 1.05) : 1.2; + const fontFamily = getFontFamily(group.fontId, group.pageIndex); + let lineHeightPx = getLineHeightPx(group.fontId, group.pageIndex, fontSizePx); + let lineHeightRatio = fontSizePx > 0 ? Math.max(lineHeightPx / fontSizePx, 1.05) : 1.2; const rotation = group.rotation ?? 0; const hasRotation = Math.abs(rotation) > 0.5; const baselineLength = group.baselineLength ?? Math.max(group.bounds.right - group.bounds.left, 0); + const geometry = getFontGeometry(group.fontId, group.pageIndex); + const ascentPx = geometry ? Math.max(fontSizePx * geometry.ascentRatio, fontSizePx * 0.7) : fontSizePx * 0.82; + const descentPx = geometry ? Math.max(fontSizePx * geometry.descentRatio, fontSizePx * 0.2) : fontSizePx * 0.22; + lineHeightPx = Math.max(lineHeightPx, ascentPx + descentPx); + if (fontSizePx > 0) { + lineHeightRatio = Math.max(lineHeightRatio, lineHeightPx / fontSizePx); + } let containerLeft = bounds.left; let containerTop = bounds.top; @@ -814,17 +969,27 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { const anchorX = group.anchor?.x ?? group.bounds.left; const anchorY = group.anchor?.y ?? group.bounds.bottom; containerLeft = anchorX * scale; - containerTop = Math.max(pageHeight - anchorY, 0) * scale; + const anchorTop = Math.max(pageHeight - anchorY, 0) * scale; containerWidth = Math.max(baselineLength * scale, MIN_BOX_SIZE); containerHeight = Math.max(lineHeightPx, fontSizePx * lineHeightRatio); transformOrigin = 'left bottom'; // Negate rotation because Y-axis is flipped from PDF to web coordinates transform = `rotate(${-rotation}deg)`; + // Align the baseline (PDF anchor) with the bottom edge used as the + // transform origin. Without this adjustment rotated text appears shifted + // downward by roughly one line height. + containerTop = anchorTop - containerHeight; + } + + if (!hasRotation && group.baseline !== null && group.baseline !== undefined && geometry) { + const cssBaselineTop = (pageHeight - group.baseline) * scale; + containerTop = Math.max(cssBaselineTop - ascentPx, 0); + containerHeight = Math.max(containerHeight, ascentPx + descentPx); } // Extract styling from group const textColor = group.color || '#111827'; - const fontWeight = group.fontWeight || getFontWeight(group.fontId); + const fontWeight = group.fontWeight || getFontWeight(group.fontId, group.pageIndex); const containerStyle: React.CSSProperties = { position: 'absolute', diff --git a/frontend/src/tools/pdfJsonEditorTypes.ts b/frontend/src/tools/pdfJsonEditorTypes.ts index 226d1812c..6f4ee3ae6 100644 --- a/frontend/src/tools/pdfJsonEditorTypes.ts +++ b/frontend/src/tools/pdfJsonEditorTypes.ts @@ -30,6 +30,8 @@ export interface PdfJsonFont { programFormat?: string | null; webProgram?: string | null; webProgramFormat?: string | null; + pdfProgram?: string | null; + pdfProgramFormat?: string | null; toUnicode?: string | null; standard14Name?: string | null; fontDescriptorFlags?: number | null; @@ -140,6 +142,7 @@ export interface TextGroup { rotation?: number | null; anchor?: { x: number; y: number } | null; baselineLength?: number | null; + baseline?: number | null; elements: PdfJsonTextElement[]; originalElements: PdfJsonTextElement[]; text: string; diff --git a/frontend/src/tools/pdfJsonEditorUtils.ts b/frontend/src/tools/pdfJsonEditorUtils.ts index 49378088f..068c378af 100644 --- a/frontend/src/tools/pdfJsonEditorUtils.ts +++ b/frontend/src/tools/pdfJsonEditorUtils.ts @@ -419,6 +419,17 @@ const computeBaselineLength = ( metrics?: FontMetricsMap, ): number => elements.reduce((acc, current) => acc + getWidth(current, metrics), 0); +const computeAverageBaseline = (elements: PdfJsonTextElement[]): number | null => { + if (elements.length === 0) { + return null; + } + let sum = 0; + elements.forEach((element) => { + sum += getBaseline(element); + }); + return sum / elements.length; +}; + const createGroup = ( pageIndex: number, idSuffix: number, @@ -432,6 +443,7 @@ const createGroup = ( const rotation = computeGroupRotation(elements); const anchor = rotation !== null ? getAnchorPoint(firstElement) : null; const baselineLength = computeBaselineLength(elements, metrics); + const baseline = computeAverageBaseline(elements); return { id: `${pageIndex}-${idSuffix}`, @@ -444,6 +456,7 @@ const createGroup = ( rotation, anchor, baselineLength, + baseline, elements: clones, originalElements: originalClones, text: buildGroupText(elements, metrics), @@ -587,42 +600,52 @@ export const createMergedElement = (group: TextGroup): PdfJsonTextElement => { return merged; }; -const distributeTextAcrossElements = (text: string | undefined, elements: PdfJsonTextElement[]): void => { +const distributeTextAcrossElements = (text: string | undefined, elements: PdfJsonTextElement[]): boolean => { if (elements.length === 0) { - return; + return true; } const targetChars = Array.from(text ?? ''); - let cursor = 0; - - elements.forEach((element, index) => { - const originalText = element.text ?? ''; - let sliceLength = Array.from(originalText).length; - if (sliceLength <= 0) { - sliceLength = 1; - } - - if (index === elements.length - 1) { - element.text = targetChars.slice(cursor).join(''); - cursor = targetChars.length; - return; - } - - const slice = targetChars.slice(cursor, cursor + sliceLength).join(''); - element.text = slice; - cursor = Math.min(cursor + sliceLength, targetChars.length); - }); - - if (cursor < targetChars.length) { - const last = elements[elements.length - 1]; - last.text = (last.text ?? '') + targetChars.slice(cursor).join(''); + if (targetChars.length === 0) { + elements.forEach((element) => { + element.text = ''; + }); + return true; } + const capacities = elements.map((element) => { + const originalText = element.text ?? ''; + const graphemeCount = Array.from(originalText).length; + return graphemeCount > 0 ? graphemeCount : 1; + }); + const totalCapacity = capacities.reduce((sum, value) => sum + value, 0); + if (targetChars.length > totalCapacity) { + return false; + } + + let cursor = 0; + elements.forEach((element, index) => { + const remaining = targetChars.length - cursor; + let sliceLength = 0; + if (remaining > 0) { + if (index === elements.length - 1) { + sliceLength = remaining; + } else { + sliceLength = Math.min(capacities[index], remaining); + } + } + + element.text = sliceLength > 0 ? targetChars.slice(cursor, cursor + sliceLength).join('') : ''; + cursor += sliceLength; + }); + elements.forEach((element) => { if (element.text == null) { element.text = ''; } }); + + return true; }; export const buildUpdatedDocument = ( @@ -685,11 +708,29 @@ export const restoreGlyphElements = ( const rebuiltElements: PdfJsonTextElement[] = []; groups.forEach((group) => { - const originals = group.originalElements.map(cloneTextElement); if (group.text !== group.originalText) { - distributeTextAcrossElements(group.text, originals); + const originalGlyphCount = group.originalElements.reduce( + (sum, element) => sum + countGraphemes(element.text ?? ''), + 0, + ); + const targetGlyphCount = countGraphemes(group.text); + + if (targetGlyphCount !== originalGlyphCount) { + rebuiltElements.push(createMergedElement(group)); + return; + } + + const originals = group.originalElements.map(cloneTextElement); + const distributed = distributeTextAcrossElements(group.text, originals); + if (distributed) { + rebuiltElements.push(...originals); + } else { + rebuiltElements.push(createMergedElement(group)); + } + return; } - rebuiltElements.push(...originals); + + rebuiltElements.push(...group.originalElements.map(cloneTextElement)); }); const textDirty = groups.some((group) => group.text !== group.originalText);