diff --git a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonFont.java b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonFont.java index a3496677c..72a532d92 100644 --- a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonFont.java +++ b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonFont.java @@ -44,6 +44,12 @@ public class PdfJsonFont { /** Hint describing the font program type (ttf, otf, cff, pfb, etc.). */ private String programFormat; + /** Web-optimized font program (e.g. converted TrueType) encoded as Base64. */ + private String webProgram; + + /** Format hint for the webProgram payload. */ + private String webProgramFormat; + /** ToUnicode stream encoded as Base64 when present. */ private String toUnicode; @@ -70,4 +76,7 @@ public class PdfJsonFont { /** Units per em extracted from the font matrix. */ private Integer unitsPerEm; + + /** Serialized COS dictionary describing the original font resource. */ + private PdfJsonCosValue cosDictionary; } diff --git a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonTextElement.java b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonTextElement.java index 37e0b8074..921971e53 100644 --- a/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonTextElement.java +++ b/app/core/src/main/java/stirling/software/SPDF/model/json/PdfJsonTextElement.java @@ -37,4 +37,5 @@ public class PdfJsonTextElement { private PdfJsonTextColor fillColor; private PdfJsonTextColor strokeColor; private Integer renderingMode; + private Boolean fallbackUsed; } diff --git a/app/core/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java b/app/core/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java index 31387eeae..7eeca8176 100644 --- a/app/core/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java +++ b/app/core/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java @@ -34,6 +34,7 @@ import java.util.Set; import java.util.TimeZone; import java.util.UUID; import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.TimeUnit; import javax.imageio.ImageIO; @@ -64,6 +65,7 @@ import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.common.PDStream; import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.pdmodel.font.PDFontDescriptor; +import org.apache.pdfbox.pdmodel.font.PDFontFactory; import org.apache.pdfbox.pdmodel.font.PDType0Font; import org.apache.pdfbox.pdmodel.font.PDType1Font; import org.apache.pdfbox.pdmodel.font.Standard14Fonts; @@ -90,6 +92,8 @@ import org.springframework.web.multipart.MultipartFile; import com.fasterxml.jackson.databind.ObjectMapper; +import jakarta.annotation.PostConstruct; + import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; @@ -173,11 +177,61 @@ public class PdfJsonConversionService { @Value("${stirling.pdf.json.cff-converter.enabled:true}") private boolean cffConversionEnabled; + @Value("${stirling.pdf.json.cff-converter.method:python}") + private String cffConverterMethod; + + @Value("${stirling.pdf.json.cff-converter.python-command:/opt/venv/bin/python3}") + private String pythonCommand; + + @Value("${stirling.pdf.json.cff-converter.python-script:/scripts/convert_cff_to_ttf.py}") + private String pythonScript; + @Value("${stirling.pdf.json.cff-converter.fontforge-command:fontforge}") private String fontforgeCommand; private final Map fallbackFontCache = new 
ConcurrentHashMap<>(); + private volatile boolean ghostscriptAvailable; + + @PostConstruct + private void initializeGhostscriptAvailability() { + if (!fontNormalizationEnabled) { + ghostscriptAvailable = false; + return; + } + + if (!isGhostscriptGroupEnabled()) { + ghostscriptAvailable = false; + log.warn( + "Ghostscript font normalization disabled: Ghostscript group is not enabled in configuration"); + return; + } + + List command = List.of("gs", "-version"); + try { + ProcessExecutorResult result = + ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT) + .runCommandWithOutputHandling(command); + ghostscriptAvailable = result.getRc() == 0; + if (!ghostscriptAvailable) { + log.warn( + "Ghostscript executable not available (exit code {}); font normalization will be skipped", + result.getRc()); + } + } catch (InterruptedException ex) { + Thread.currentThread().interrupt(); + ghostscriptAvailable = false; + log.warn( + "Ghostscript availability check interrupted; font normalization will be skipped: {}", + ex.getMessage()); + } catch (IOException ex) { + ghostscriptAvailable = false; + log.warn( + "Ghostscript executable not found or failed to start; font normalization will be skipped: {}", + ex.getMessage()); + } + } + public byte[] convertPdfToJson(MultipartFile file) throws IOException { if (file == null) { throw ExceptionUtils.createNullArgumentException("fileInput"); @@ -452,10 +506,22 @@ public class PdfJsonConversionService { String encoding = resolveEncoding(font); PdfJsonFontCidSystemInfo cidInfo = extractCidSystemInfo(font.getCOSObject()); boolean embedded = font.isEmbedded(); - FontProgramData programData = embedded ? extractFontProgram(font) : null; String toUnicode = extractToUnicode(font.getCOSObject()); + // Build complete CharCode→CID→GID→Unicode mapping for CID fonts + String unicodeMapping = buildUnicodeMapping(font, toUnicode); + FontProgramData programData = embedded ? extractFontProgram(font, unicodeMapping) : null; String standard14Name = resolveStandard14Name(font); Integer flags = descriptor != null ? descriptor.getFlags() : null; + PdfJsonCosValue cosDictionary = serializeCosValue(font.getCOSObject()); + + log.debug( + "Building font model: id={}, baseName={}, subtype={}, embedded={}, hasProgram={}, hasWebProgram={}", + fontId, + font.getName(), + subtype, + embedded, + programData != null && programData.getBase64() != null, + programData != null && programData.getWebBase64() != null); return PdfJsonFont.builder() .id(fontId) @@ -468,6 +534,8 @@ public class PdfJsonConversionService { .embedded(embedded) .program(programData != null ? programData.getBase64() : null) .programFormat(programData != null ? programData.getFormat() : null) + .webProgram(programData != null ? programData.getWebBase64() : null) + .webProgramFormat(programData != null ? programData.getWebFormat() : null) .toUnicode(toUnicode) .standard14Name(standard14Name) .fontDescriptorFlags(flags) @@ -477,6 +545,7 @@ public class PdfJsonConversionService { .xHeight(descriptor != null ? descriptor.getXHeight() : null) .italicAngle(descriptor != null ? 
descriptor.getItalicAngle() : null) .unitsPerEm(extractUnitsPerEm(font)) + .cosDictionary(cosDictionary) .build(); } @@ -508,11 +577,13 @@ public class PdfJsonConversionService { if (font == null) { fallbackNeeded = true; fallbackIds.add(FALLBACK_FONT_ID); + element.setFallbackUsed(Boolean.TRUE); continue; } if (!canEncodeFully(font, text)) { fallbackNeeded = true; + element.setFallbackUsed(Boolean.TRUE); for (int offset = 0; offset < text.length(); ) { int codePoint = text.codePointAt(offset); offset += Character.charCount(codePoint); @@ -682,11 +753,25 @@ public class PdfJsonConversionService { } private boolean canRunGhostscript() { + if (!fontNormalizationEnabled) { + return false; + } + if (!isGhostscriptGroupEnabled()) { + return false; + } + if (!ghostscriptAvailable) { + log.debug("Skipping Ghostscript normalization; executable not available"); + return false; + } + return true; + } + + private boolean isGhostscriptGroupEnabled() { try { return endpointConfiguration != null && endpointConfiguration.isGroupEnabled("Ghostscript"); } catch (Exception ex) { - log.debug("Ghostscript availability check failed: {}", ex.getMessage()); + log.debug("Ghostscript group check failed: {}", ex.getMessage()); return false; } } @@ -736,12 +821,129 @@ public class PdfJsonConversionService { return null; } - private byte[] convertCffProgramToTrueType(byte[] fontBytes) { - if (!cffConversionEnabled - || fontforgeCommand == null - || fontforgeCommand.isBlank() - || fontBytes == null - || fontBytes.length == 0) { + private byte[] convertCffProgramToTrueType(byte[] fontBytes, String toUnicode) { + if (!cffConversionEnabled || fontBytes == null || fontBytes.length == 0) { + return null; + } + + // Determine which converter to use + if ("python".equalsIgnoreCase(cffConverterMethod)) { + return convertCffUsingPython(fontBytes, toUnicode); + } else if ("fontforge".equalsIgnoreCase(cffConverterMethod)) { + return convertCffUsingFontForge(fontBytes); + } else { + log.warn("Unknown CFF converter method: {}, falling back to Python", cffConverterMethod); + return convertCffUsingPython(fontBytes, toUnicode); + } + } + + private byte[] convertCffUsingPython(byte[] fontBytes, String toUnicode) { + if (pythonCommand == null + || pythonCommand.isBlank() + || pythonScript == null + || pythonScript.isBlank()) { + log.debug("Python converter not configured"); + return null; + } + + try (TempFile inputFile = new TempFile(tempFileManager, ".cff"); + TempFile outputFile = new TempFile(tempFileManager, ".otf"); + TempFile toUnicodeFile = toUnicode != null ? 
new TempFile(tempFileManager, ".tounicode") : null) { + Files.write(inputFile.getPath(), fontBytes); + + // Write ToUnicode CMap data if available + if (toUnicode != null && toUnicodeFile != null) { + byte[] toUnicodeBytes = Base64.getDecoder().decode(toUnicode); + Files.write(toUnicodeFile.getPath(), toUnicodeBytes); + } + + List command = new ArrayList<>(); + command.add(pythonCommand); + command.add(pythonScript); + command.add(inputFile.getAbsolutePath()); + command.add(outputFile.getAbsolutePath()); + // Add optional ToUnicode file path + if (toUnicodeFile != null) { + command.add(toUnicodeFile.getAbsolutePath()); + } + + ProcessBuilder builder = new ProcessBuilder(command); + builder.redirectErrorStream(true); + Process process = builder.start(); + + StringBuilder output = new StringBuilder(); + Thread reader = + new Thread( + () -> { + try (BufferedReader br = + new BufferedReader( + new InputStreamReader( + process.getInputStream(), + StandardCharsets.UTF_8))) { + String line; + while ((line = br.readLine()) != null) { + output.append(line).append('\n'); + } + } catch (IOException ignored) { + } + }); + reader.start(); + + // Wait with timeout (Python fontTools is usually fast, but provide safety margin) + boolean finished = process.waitFor(30, TimeUnit.SECONDS); + if (!finished) { + process.destroyForcibly(); + reader.interrupt(); + log.warn( + "Python CFF→OTF wrapping timed out after 30 seconds - font may be corrupted"); + return null; + } + + int exitCode = process.exitValue(); + reader.join(5000); + + if (exitCode == 0 && Files.exists(outputFile.getPath())) { + byte[] convertedBytes = Files.readAllBytes(outputFile.getPath()); + if (convertedBytes.length > 0) { + String validationError = validateFontTables(convertedBytes); + if (validationError != null) { + log.warn("Python converter produced invalid font: {}", validationError); + return null; + } + + // Log Python script output for debugging + String outputStr = output.toString().trim(); + if (!outputStr.isEmpty()) { + log.debug("Python script output: {}", outputStr); + } + + log.debug( + "Python CFF→OTF wrapping successful: {} bytes → {} bytes", + fontBytes.length, + convertedBytes.length); + return convertedBytes; + } + } else { + String outputStr = output.toString().trim(); + if (!outputStr.isEmpty()) { + log.warn("Python CFF→OTF wrapping failed with exit code {}: {}", exitCode, outputStr); + } else { + log.warn("Python CFF→OTF wrapping failed with exit code {}", exitCode); + } + } + } catch (InterruptedException ex) { + Thread.currentThread().interrupt(); + log.debug("Python CFF conversion interrupted", ex); + } catch (IOException ex) { + log.debug("Python CFF conversion I/O error", ex); + } + + return null; + } + + private byte[] convertCffUsingFontForge(byte[] fontBytes) { + if (fontforgeCommand == null || fontforgeCommand.isBlank()) { + log.debug("FontForge converter not configured"); return null; } @@ -754,8 +956,18 @@ public class PdfJsonConversionService { command.add("-lang=ff"); command.add("-c"); command.add( - "Open($1); SelectWorthOutputting(); SetFontOrder(2); Reencode(\"unicode\"); " - + "Generate($2); Close(); Quit()"); + "Open($1); " + + "ScaleToEm(1000); " // Force 1000 units per em (standard for Type1) + + "SelectWorthOutputting(); " + + "SetFontOrder(2); " + + "Reencode(\"unicode\"); " + + "RoundToInt(); " + + "RemoveOverlap(); " + + "Simplify(); " + + "CorrectDirection(); " + + "Generate($2, \"\", 4+16+32); " + + "Close(); " + + "Quit()"); command.add(inputFile.getAbsolutePath()); 
command.add(outputFile.getAbsolutePath()); @@ -780,11 +992,59 @@ public class PdfJsonConversionService { } }); reader.start(); - int exitCode = process.waitFor(); - reader.join(); + + // Wait with timeout to prevent hanging on problematic fonts + boolean finished = process.waitFor(30, TimeUnit.SECONDS); + if (!finished) { + process.destroyForcibly(); + reader.interrupt(); + log.warn("FontForge conversion timed out after 30 seconds - font may be too complex or causing FontForge to hang"); + return null; + } + + int exitCode = process.exitValue(); + reader.join(5000); // Wait max 5 seconds for reader thread if (exitCode == 0 && Files.exists(outputFile.getPath())) { - return Files.readAllBytes(outputFile.getPath()); + byte[] convertedBytes = Files.readAllBytes(outputFile.getPath()); + if (convertedBytes.length > 0) { + // Basic validation: check for TrueType magic number and critical tables + if (convertedBytes.length >= 4) { + int magic = + ((convertedBytes[0] & 0xFF) << 24) + | ((convertedBytes[1] & 0xFF) << 16) + | ((convertedBytes[2] & 0xFF) << 8) + | (convertedBytes[3] & 0xFF); + boolean validTrueType = + magic == 0x00010000 || magic == 0x74727565; // 1.0 or 'true' + boolean validOpenType = magic == 0x4F54544F; // 'OTTO' + + if (validTrueType || validOpenType) { + // Additional validation: check unitsPerEm in head table + String validationError = validateFontTables(convertedBytes); + if (validationError != null) { + log.warn( + "FontForge produced invalid font: {}", + validationError); + return null; + } + + log.debug( + "FontForge CFF→TrueType conversion successful: {} bytes, magic: 0x{}, type: {}", + convertedBytes.length, + Integer.toHexString(magic), + validOpenType ? "OpenType" : "TrueType"); + return convertedBytes; + } else { + log.warn( + "FontForge produced invalid font: magic number 0x{} (expected TrueType or OpenType)", + Integer.toHexString(magic)); + return null; + } + } + } + log.warn("FontForge produced empty output file"); + return null; } log.warn( @@ -801,6 +1061,127 @@ public class PdfJsonConversionService { return null; } + /** + * Validates critical OpenType/TrueType font tables to ensure browser compatibility. 
+ * @return Error message if invalid, null if valid + */ + private String validateFontTables(byte[] fontBytes) { + try { + if (fontBytes.length < 12) { + return "Font file too small"; + } + + // Read table directory + int numTables = ((fontBytes[4] & 0xFF) << 8) | (fontBytes[5] & 0xFF); + if (numTables == 0 || numTables > 100) { + return "Invalid table count: " + numTables; + } + + // Find head table + int offset = 12; // Skip sfnt header + for (int i = 0; i < numTables && offset + 16 <= fontBytes.length; i++) { + String tag = new String(fontBytes, offset, 4, StandardCharsets.US_ASCII); + int tableOffset = ((fontBytes[offset + 8] & 0xFF) << 24) + | ((fontBytes[offset + 9] & 0xFF) << 16) + | ((fontBytes[offset + 10] & 0xFF) << 8) + | (fontBytes[offset + 11] & 0xFF); + int tableLength = ((fontBytes[offset + 12] & 0xFF) << 24) + | ((fontBytes[offset + 13] & 0xFF) << 16) + | ((fontBytes[offset + 14] & 0xFF) << 8) + | (fontBytes[offset + 15] & 0xFF); + + if ("head".equals(tag)) { + if (tableOffset + 18 > fontBytes.length) { + return "head table truncated"; + } + // Check unitsPerEm at offset 18 in head table + int unitsPerEm = ((fontBytes[tableOffset + 18] & 0xFF) << 8) + | (fontBytes[tableOffset + 19] & 0xFF); + if (unitsPerEm < 16 || unitsPerEm > 16384) { + return "Invalid unitsPerEm: " + unitsPerEm + " (must be 16-16384)"; + } + return null; // Valid + } + offset += 16; + } + return "head table not found"; + } catch (Exception ex) { + return "Validation error: " + ex.getMessage(); + } + } + + private String buildUnicodeMapping(PDFont font, String toUnicodeBase64) throws IOException { + log.debug("buildUnicodeMapping called for font: {}, hasToUnicode: {}, isCID: {}", + font.getName(), toUnicodeBase64 != null, font instanceof PDType0Font); + + if (toUnicodeBase64 == null || toUnicodeBase64.isBlank()) { + log.debug("No ToUnicode data for font: {}", font.getName()); + return null; + } + + // For CID fonts (Type0), build complete CharCode→CID→GID→Unicode mapping + if (!(font instanceof PDType0Font type0Font)) { + // For non-CID fonts, just return ToUnicode as-is + log.debug("Non-CID font {}, returning raw ToUnicode", font.getName()); + return toUnicodeBase64; + } + + log.debug("Building JSON mapping for CID font: {}", font.getName()); + + try { + // Build a map of CharCode → Unicode from ToUnicode + Map charCodeToUnicode = new HashMap<>(); + byte[] toUnicodeBytes = Base64.getDecoder().decode(toUnicodeBase64); + String toUnicodeStr = new String(toUnicodeBytes, StandardCharsets.UTF_8); + + // Parse ToUnicode CMap for bfchar and bfrange + java.util.regex.Pattern bfcharPattern = java.util.regex.Pattern.compile("<([0-9A-Fa-f]+)>\\s*<([0-9A-Fa-f]+)>"); + java.util.regex.Matcher matcher = bfcharPattern.matcher(toUnicodeStr); + while (matcher.find()) { + int charCode = Integer.parseInt(matcher.group(1), 16); + int unicode = Integer.parseInt(matcher.group(2), 16); + charCodeToUnicode.put(charCode, unicode); + } + + // Build JSON mapping: CharCode → CID → GID → Unicode + StringBuilder json = new StringBuilder(); + json.append("{\"isCID\":true,\"cidToGidIdentity\":true,\"entries\":["); + + boolean first = true; + for (Map.Entry entry : charCodeToUnicode.entrySet()) { + int charCode = entry.getKey(); + int unicode = entry.getValue(); + + try { + // Get CID from char code + int cid = type0Font.codeToCID(charCode); + // For Identity-H/V encoding, GID == CID + int gid = cid; + + if (!first) { + json.append(","); + } + first = false; + 
json.append(String.format("{\"code\":%d,\"cid\":%d,\"gid\":%d,\"unicode\":%d}", + charCode, cid, gid, unicode)); + } catch (Exception e) { + // Skip entries that fail to map + log.debug("Failed to map charCode {} in font {}: {}", charCode, font.getName(), e.getMessage()); + } + } + + json.append("]}"); + String jsonStr = json.toString(); + log.debug("Built Unicode mapping for CID font {} with {} entries", + font.getName(), charCodeToUnicode.size()); + return Base64.getEncoder().encodeToString(jsonStr.getBytes(StandardCharsets.UTF_8)); + + } catch (Exception e) { + log.warn("Failed to build Unicode mapping for font {}: {}", font.getName(), e.getMessage()); + return toUnicodeBase64; // Fall back to raw ToUnicode + } + } + private PdfJsonFontCidSystemInfo extractCidSystemInfo(COSDictionary fontDictionary) { if (fontDictionary == null) { return null; @@ -824,7 +1205,7 @@ public class PdfJsonConversionService { return info; } - private FontProgramData extractFontProgram(PDFont font) throws IOException { + private FontProgramData extractFontProgram(PDFont font, String toUnicode) throws IOException { PDFontDescriptor descriptor = font.getFontDescriptor(); if (descriptor == null) { return null; @@ -833,24 +1214,24 @@ public class PdfJsonConversionService { PDStream fontFile3 = descriptor.getFontFile3(); if (fontFile3 != null) { String subtype = fontFile3.getCOSObject().getNameAsString(COSName.SUBTYPE); - return readFontProgram(fontFile3, subtype != null ? subtype : "fontfile3", false); + return readFontProgram(fontFile3, subtype != null ? subtype : "fontfile3", false, toUnicode); } PDStream fontFile2 = descriptor.getFontFile2(); if (fontFile2 != null) { - return readFontProgram(fontFile2, null, true); + return readFontProgram(fontFile2, null, true, toUnicode); } PDStream fontFile = descriptor.getFontFile(); if (fontFile != null) { - return readFontProgram(fontFile, "type1", false); + return readFontProgram(fontFile, "type1", false, toUnicode); } return null; } private FontProgramData readFontProgram( - PDStream stream, String formatHint, boolean detectTrueType) throws IOException { + PDStream stream, String formatHint, boolean detectTrueType, String toUnicode) throws IOException { try (InputStream inputStream = stream.createInputStream(); ByteArrayOutputStream baos = new ByteArrayOutputStream()) { inputStream.transferTo(baos); @@ -859,8 +1240,21 @@ public class PdfJsonConversionService { if (detectTrueType) { format = detectTrueTypeFormat(data); } + String webBase64 = null; + String webFormat = null; + if (format != null && isCffFormat(format)) { + log.debug("Detected CFF font format: {}, wrapping as OpenType-CFF for web preview", format); + byte[] converted = convertCffProgramToTrueType(data, toUnicode); + if (converted != null && converted.length > 0) { + webBase64 = Base64.getEncoder().encodeToString(converted); + webFormat = "otf"; + log.debug("CFF→OTF wrapping successful: {} bytes → {} bytes", data.length, converted.length); + } else { + log.debug("CFF→OTF wrapping returned null or empty result"); + } + } String base64 = Base64.getEncoder().encodeToString(data); - return new FontProgramData(base64, format); + return new FontProgramData(base64, format, webBase64, webFormat); } } @@ -1759,8 +2153,12 @@ public class PdfJsonConversionService { } PDFont baseFont = primaryFont; + boolean fallbackApplied = primaryFont == null; if (baseFont == null) { baseFont = ensureFallbackFont(document, fontMap, fontModels, FALLBACK_FONT_ID); + if (baseFont != null) { + fallbackApplied = true; + } } if (baseFont 
== null) { log.warn("Unable to resolve a base font for text element; skipping text content"); @@ -1777,6 +2175,7 @@ public class PdfJsonConversionService { PDFont targetFont = currentFont; if (!canEncode(baseFont, codePoint)) { + fallbackApplied = true; String fallbackId = resolveFallbackFontId(codePoint); targetFont = ensureFallbackFont(document, fontMap, fontModels, fallbackId); if (targetFont == null || !canEncode(targetFont, glyph)) { @@ -1823,6 +2222,10 @@ public class PdfJsonConversionService { runs.add(new FontRun(currentFont, buffer.toString())); } + if (fallbackApplied) { + element.setFallbackUsed(Boolean.TRUE); + } + return runs; } @@ -2019,10 +2422,14 @@ public class PdfJsonConversionService { private static class FontProgramData { private final String base64; private final String format; + private final String webBase64; + private final String webFormat; - private FontProgramData(String base64, String format) { + private FontProgramData(String base64, String format, String webBase64, String webFormat) { this.base64 = base64; this.format = format; + this.webBase64 = webBase64; + this.webFormat = webFormat; } private String getBase64() { @@ -2032,6 +2439,14 @@ public class PdfJsonConversionService { private String getFormat() { return format; } + + private String getWebBase64() { + return webBase64; + } + + private String getWebFormat() { + return webFormat; + } } private static final class PreflightResult { @@ -2371,46 +2786,106 @@ public class PdfJsonConversionService { return loadFallbackPdfFont(document); } + // IMPORTANT: Dictionary restoration is disabled because deserialized dictionaries + // don't properly include the font stream references (FontFile/FontFile2/FontFile3). + // This results in fonts that structurally exist but can't encode glyphs, causing + // fallback to NotoSans. Instead, we ALWAYS use program bytes for reliable encoding. + // The cosDictionary field is preserved in the JSON for potential future use, but + // for now we rely on direct font program loading. + if (false && fontModel.getCosDictionary() != null) { + // Dictionary restoration code kept for reference but disabled + COSBase restored = deserializeCosValue(fontModel.getCosDictionary(), document); + if (restored instanceof COSDictionary cosDictionary) { + try { + PDFont font = PDFontFactory.createFont(cosDictionary); + if (font != null && font.isEmbedded()) { + // Verify font can actually encode a basic character + try { + font.encode("A"); + applyAdditionalFontMetadata(document, font, fontModel); + log.debug("Successfully restored embedded font {} from dictionary", fontModel.getId()); + return font; + } catch (IOException | IllegalArgumentException encodingEx) { + log.warn( + "Font {} restored from dictionary but failed encoding test: {}; falling back to program bytes", + fontModel.getId(), + encodingEx.getMessage()); + } + } + } catch (IOException ex) { + log.warn( + "Failed to restore font {} from stored dictionary: {}; falling back to program bytes", + fontModel.getId(), + ex.getMessage()); + } + } + } + + byte[] fontBytes = null; + String format = null; + + // For CFF/Type1C fonts, prefer the webProgram (converted TrueType) because: + // 1. PDFBox's PDType0Font.load() expects TrueType/OpenType format + // 2. Raw CFF program bytes lack the descriptor context needed for reconstruction + // 3. FontForge-converted TrueType is reliable for both web preview and PDF export + String originalFormat = + fontModel.getProgramFormat() != null + ? 
fontModel.getProgramFormat().toLowerCase(Locale.ROOT) + : null; + // For JSON→PDF conversion, always use original font bytes + // (PDFBox doesn't support OpenType-CFF; webProgram is only for frontend web preview) String program = fontModel.getProgram(); if (program != null && !program.isBlank()) { - byte[] fontBytes = Base64.getDecoder().decode(program); - String format = - fontModel.getProgramFormat() != null - ? fontModel.getProgramFormat().toLowerCase(Locale.ROOT) - : ""; + fontBytes = Base64.getDecoder().decode(program); + format = originalFormat; + log.debug("Using original font program for {} (format: {})", fontModel.getId(), originalFormat); + } else if (fontModel.getWebProgram() != null && !fontModel.getWebProgram().isBlank()) { + // Fallback to webProgram if original program is unavailable + fontBytes = Base64.getDecoder().decode(fontModel.getWebProgram()); + format = + fontModel.getWebProgramFormat() != null + ? fontModel.getWebProgramFormat().toLowerCase(Locale.ROOT) + : null; + log.debug("Using web-optimized font program for {} (original program unavailable)", fontModel.getId()); + } + + if (fontBytes != null && fontBytes.length > 0) { try { - if (isCffFormat(format)) { - byte[] converted = convertCffProgramToTrueType(fontBytes); - if (converted != null) { - fontBytes = converted; - format = "ttf"; - log.debug( - "Converted CFF font {} to TrueType outlines for embedding", - fontModel.getId()); - } else { - log.debug( - "Unable to convert CFF font {} to TrueType; attempting direct load", - fontModel.getId()); - } - } if (isType1Format(format)) { try (InputStream stream = new ByteArrayInputStream(fontBytes)) { PDFont font = new PDType1Font(document, stream); applyAdditionalFontMetadata(document, font, fontModel); + log.debug( + "Successfully loaded Type1 font {} from program bytes (format: {}, originalFormat: {})", + fontModel.getId(), + format, + originalFormat); return font; } } try (InputStream stream = new ByteArrayInputStream(fontBytes)) { PDFont font = PDType0Font.load(document, stream, true); applyAdditionalFontMetadata(document, font, fontModel); + log.debug( + "Successfully loaded Type0 font {} from program bytes (format: {}, originalFormat: {})", + fontModel.getId(), + format, + originalFormat); return font; } } catch (IOException ex) { - log.debug( - "Unable to load embedded font program for {}: {}", + log.warn( + "Unable to load embedded font program for {} (format: {}, originalFormat: {}): {}; falling back to Standard 14 or default", fontModel.getId(), + format, + originalFormat, ex.getMessage()); } + } else { + log.warn( + "Font {} has no program bytes available (originalFormat: {})", + fontModel.getId(), + originalFormat); } String standardName = fontModel.getStandard14Name(); diff --git a/app/core/src/main/resources/settings.yml.template b/app/core/src/main/resources/settings.yml.template index fd389337e..849eae60e 100644 --- a/app/core/src/main/resources/settings.yml.template +++ b/app/core/src/main/resources/settings.yml.template @@ -173,9 +173,12 @@ stirling: fallback-font: classpath:/static/fonts/NotoSans-Regular.ttf # Override to point at a custom fallback font json: font-normalization: - enabled: true # Run Ghostscript preflight to normalize fonts before PDF→JSON + enabled: false # IMPORTANT: Disable to preserve ToUnicode CMaps for correct font rendering. Ghostscript strips Unicode mappings from CID fonts. 
cff-converter: - enabled: true # Attempt to transcode CFF/Type1C programs to OTF using FontForge when available + enabled: true # Wrap CFF/Type1C fonts as OpenType-CFF for browser compatibility + method: python # Converter method: 'python' (fontTools, recommended - wraps as OTF), 'fontforge' (legacy - converts to TTF, may hang on CID fonts) + python-command: /opt/venv/bin/python3 # Python interpreter path + python-script: /scripts/convert_cff_to_ttf.py # Path to font wrapping script fontforge-command: fontforge # Override if FontForge is installed under a different name/path ui: diff --git a/docker/compose/docker-compose.yml b/docker/compose/docker-compose.yml index 6f8b1ace8..713bfec79 100644 --- a/docker/compose/docker-compose.yml +++ b/docker/compose/docker-compose.yml @@ -5,10 +5,6 @@ services: dockerfile: docker/backend/Dockerfile container_name: stirling-pdf-backend restart: on-failure:5 - deploy: - resources: - limits: - memory: 4G healthcheck: test: ["CMD-SHELL", "curl -f http://localhost:8080/api/v1/info/status | grep -q 'UP'"] interval: 5s diff --git a/frontend/src/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx b/frontend/src/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx index f317042c5..5fb712b63 100644 --- a/frontend/src/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx +++ b/frontend/src/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx @@ -95,8 +95,9 @@ const decodeBase64ToUint8Array = (value: string): Uint8Array => { }; const buildFontFamilyName = (font: PdfJsonFont): string => { - const base = (font.uid ?? font.id ?? 'font').toString(); - return `pdf-font-${base.replace(/[^a-zA-Z0-9_-]/g, '')}`; + const preferred = (font.baseName ?? '').trim(); + const identifier = preferred.length > 0 ? preferred : (font.uid ?? font.id ?? 'font').toString(); + return `pdf-font-${identifier.replace(/[^a-zA-Z0-9_-]/g, '')}`; }; const getCaretOffset = (element: HTMLElement): number => { @@ -313,18 +314,34 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { const next = new Map(); for (const font of fonts) { - if (!font?.id || !font.program) { + if (!font?.id) { + continue; + } + const programSource = font.webProgram && font.webProgram.length > 0 ? font.webProgram : font.program; + if (!programSource) { continue; } try { - const format = normalizeFontFormat(font.programFormat); - const data = decodeBase64ToUint8Array(font.program); + const formatSource = font.webProgram && font.webProgram.length > 0 ? font.webProgramFormat : font.programFormat; + const format = normalizeFontFormat(formatSource); + const data = decodeBase64ToUint8Array(programSource); const blob = new Blob([data as BlobPart], { type: getFontMimeType(format) }); const url = URL.createObjectURL(blob); const formatHint = getFontFormatHint(format); const familyName = buildFontFamilyName(font); const source = formatHint ? 
`url(${url}) format('${formatHint}')` : `url(${url})`; const fontFace = new FontFace(familyName, source); + + console.debug(`[FontLoader] Loading font ${font.id} (${font.baseName}):`, { + formatSource, + format, + formatHint, + familyName, + dataLength: data.length, + hasWebProgram: !!font.webProgram, + hasProgram: !!font.program + }); + await fontFace.load(); if (disposed) { document.fonts.delete(fontFace); @@ -334,8 +351,14 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { document.fonts.add(fontFace); active.push({ fontFace, url }); next.set(font.id, familyName); + console.debug(`[FontLoader] Successfully loaded font ${font.id}`); } catch (error) { - // Silently ignore font loading failures - embedded PDF fonts often lack web font tables + console.warn(`[FontLoader] Failed to load font ${font.id} (${font.baseName}):`, { + error: error instanceof Error ? error.message : String(error), + formatSource: font.webProgram && font.webProgram.length > 0 ? font.webProgramFormat : font.programFormat, + hasWebProgram: !!font.webProgram, + hasProgram: !!font.program + }); // Fallback to web-safe fonts is already implemented via getFontFamily() } } @@ -776,7 +799,8 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { const fontFamily = getFontFamily(group.fontId); const lineHeightPx = getLineHeightPx(group.fontId, fontSizePx); const lineHeightRatio = fontSizePx > 0 ? Math.max(lineHeightPx / fontSizePx, 1.05) : 1.2; - const hasRotation = group.rotation != null && Math.abs(group.rotation) > 0.5; + const rotation = group.rotation ?? 0; + const hasRotation = Math.abs(rotation) > 0.5; const baselineLength = group.baselineLength ?? Math.max(group.bounds.right - group.bounds.left, 0); let containerLeft = bounds.left; @@ -795,7 +819,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { containerHeight = Math.max(lineHeightPx, fontSizePx * lineHeightRatio); transformOrigin = 'left bottom'; // Negate rotation because Y-axis is flipped from PDF to web coordinates - transform = `rotate(${-group.rotation}deg)`; + transform = `rotate(${-rotation}deg)`; } // Extract styling from group diff --git a/frontend/src/tools/pdfJsonEditorTypes.ts b/frontend/src/tools/pdfJsonEditorTypes.ts index c1da39656..226d1812c 100644 --- a/frontend/src/tools/pdfJsonEditorTypes.ts +++ b/frontend/src/tools/pdfJsonEditorTypes.ts @@ -9,6 +9,14 @@ export interface PdfJsonTextColor { components?: number[] | null; } +export interface PdfJsonCosValue { + type?: string | null; + value?: unknown; + items?: PdfJsonCosValue[] | null; + entries?: Record | null; + stream?: PdfJsonStream | null; +} + export interface PdfJsonFont { id?: string; pageNumber?: number | null; @@ -20,6 +28,8 @@ export interface PdfJsonFont { embedded?: boolean | null; program?: string | null; programFormat?: string | null; + webProgram?: string | null; + webProgramFormat?: string | null; toUnicode?: string | null; standard14Name?: string | null; fontDescriptorFlags?: number | null; @@ -29,6 +39,7 @@ export interface PdfJsonFont { xHeight?: number | null; italicAngle?: number | null; unitsPerEm?: number | null; + cosDictionary?: PdfJsonCosValue | null; } export interface PdfJsonTextElement { @@ -52,6 +63,7 @@ export interface PdfJsonTextElement { textMatrix?: number[] | null; fillColor?: PdfJsonTextColor | null; strokeColor?: PdfJsonTextColor | null; + fallbackUsed?: boolean | null; } export interface PdfJsonImageElement { diff --git a/frontend/src/tools/pdfJsonEditorUtils.ts b/frontend/src/tools/pdfJsonEditorUtils.ts 
index 8085fe759..49378088f 100644 --- a/frontend/src/tools/pdfJsonEditorUtils.ts +++ b/frontend/src/tools/pdfJsonEditorUtils.ts @@ -16,6 +16,48 @@ const MIN_CHAR_WIDTH_FACTOR = 0.35; const MAX_CHAR_WIDTH_FACTOR = 1.25; const EXTRA_GAP_RATIO = 0.8; +type FontMetrics = { + unitsPerEm: number; + ascent: number; + descent: number; +}; + +type FontMetricsMap = Map; + +const countGraphemes = (text: string): number => { + if (!text) { + return 0; + } + return Array.from(text).length; +}; + +const metricsFor = (metrics: FontMetricsMap | undefined, fontId?: string | null): FontMetrics | undefined => { + if (!metrics || !fontId) { + return undefined; + } + return metrics.get(fontId) ?? undefined; +}; + +const buildFontMetrics = (document: PdfJsonDocument | null | undefined): FontMetricsMap => { + const metrics: FontMetricsMap = new Map(); + document?.fonts?.forEach((font) => { + if (!font) { + return; + } + const unitsPerEm = font.unitsPerEm && font.unitsPerEm > 0 ? font.unitsPerEm : 1000; + const ascent = font.ascent ?? unitsPerEm * 0.8; + const descent = font.descent ?? -(unitsPerEm * 0.2); + const metric: FontMetrics = { unitsPerEm, ascent, descent }; + if (font.id) { + metrics.set(font.id, metric); + } + if (font.uid) { + metrics.set(font.uid, metric); + } + }); + return metrics; +}; + export const valueOr = (value: number | null | undefined, fallback = 0): number => { if (value === null || value === undefined || Number.isNaN(value)) { return fallback; @@ -47,37 +89,87 @@ const getX = (element: PdfJsonTextElement): number => { return valueOr(element.x); }; -const getWidth = (element: PdfJsonTextElement): number => { +const getWidth = (element: PdfJsonTextElement, metrics?: FontMetricsMap): number => { const width = valueOr(element.width, 0); - if (width === 0 && element.text) { - const fontSize = valueOr(element.fontSize, 12); - return fontSize * Math.max(element.text.length * 0.45, 0.5); + if (width > 0) { + return width; } - return width; + + const text = element.text ?? ''; + const glyphCount = Math.max(1, countGraphemes(text)); + const spacingFallback = Math.max( + valueOr(element.spaceWidth, 0), + valueOr(element.wordSpacing, 0), + valueOr(element.characterSpacing, 0), + ); + + if (spacingFallback > 0 && text.trim().length === 0) { + return spacingFallback; + } + + const fontSize = getFontSize(element); + const fontMetrics = metricsFor(metrics, element.fontId); + if (fontMetrics) { + const unitsPerEm = fontMetrics.unitsPerEm > 0 ? fontMetrics.unitsPerEm : 1000; + const ascentUnits = fontMetrics.ascent ?? unitsPerEm * 0.8; + const descentUnits = Math.abs(fontMetrics.descent ?? -(unitsPerEm * 0.2)); + const combinedUnits = Math.max(unitsPerEm * 0.8, ascentUnits + descentUnits); + const averageAdvanceUnits = Math.max(unitsPerEm * 0.5, combinedUnits / Math.max(1, glyphCount)); + const fallbackWidth = (averageAdvanceUnits / unitsPerEm) * glyphCount * fontSize; + if (fallbackWidth > 0) { + return fallbackWidth; + } + } + + return fontSize * glyphCount * 0.5; }; const getFontSize = (element: PdfJsonTextElement): number => valueOr(element.fontMatrixSize ?? 
element.fontSize, 12); -const getHeight = (element: PdfJsonTextElement): number => { - const height = valueOr(element.height); - if (height === 0) { - return getFontSize(element) * 1.05; +const getHeight = (element: PdfJsonTextElement, metrics?: FontMetricsMap): number => { + const height = valueOr(element.height, 0); + if (height > 0) { + return height; } - return height; + const fontSize = getFontSize(element); + const fontMetrics = metricsFor(metrics, element.fontId); + if (fontMetrics) { + const unitsPerEm = fontMetrics.unitsPerEm > 0 ? fontMetrics.unitsPerEm : 1000; + const ascentUnits = fontMetrics.ascent ?? unitsPerEm * 0.8; + const descentUnits = Math.abs(fontMetrics.descent ?? -(unitsPerEm * 0.2)); + const totalUnits = Math.max(unitsPerEm, ascentUnits + descentUnits); + if (totalUnits > 0) { + return (totalUnits / unitsPerEm) * fontSize; + } + } + return fontSize; }; -const getElementBounds = (element: PdfJsonTextElement): BoundingBox => { +const getElementBounds = ( + element: PdfJsonTextElement, + metrics?: FontMetricsMap, +): BoundingBox => { const left = getX(element); - const width = getWidth(element); + const width = getWidth(element, metrics); const baseline = getBaseline(element); - const height = getHeight(element); - // In PDF coordinates, baseline is where text sits - // Typical typography: ~80% of height above baseline (ascenders), ~20% below (descenders) - // Using codebase's inverted naming: bottom (visual top) > top (visual bottom) - const ascent = height * 0.8; - const descent = height * 0.2; - const bottom = baseline + ascent; // Visual top of text - const top = baseline - descent; // Visual bottom (includes descenders) + const height = getHeight(element, metrics); + + let ascentRatio = 0.8; + let descentRatio = 0.2; + const fontMetrics = metricsFor(metrics, element.fontId); + if (fontMetrics) { + const unitsPerEm = fontMetrics.unitsPerEm > 0 ? fontMetrics.unitsPerEm : 1000; + const ascentUnits = fontMetrics.ascent ?? unitsPerEm * 0.8; + const descentUnits = Math.abs(fontMetrics.descent ?? 
-(unitsPerEm * 0.2)); + const totalUnits = Math.max(unitsPerEm, ascentUnits + descentUnits); + if (totalUnits > 0) { + ascentRatio = ascentUnits / totalUnits; + descentRatio = descentUnits / totalUnits; + } + } + + const bottom = baseline + height * ascentRatio; + const top = baseline - height * descentRatio; return { left, right: left + width, @@ -114,8 +206,12 @@ const getSpacingHint = (element: PdfJsonTextElement): number => { return Math.max(characterSpacing, 0); }; -const estimateCharWidth = (element: PdfJsonTextElement, avgFontSize: number): number => { - const rawWidth = getWidth(element); +const estimateCharWidth = ( + element: PdfJsonTextElement, + avgFontSize: number, + metrics?: FontMetricsMap, +): number => { + const rawWidth = getWidth(element, metrics); const minWidth = avgFontSize * MIN_CHAR_WIDTH_FACTOR; const maxWidth = avgFontSize * MAX_CHAR_WIDTH_FACTOR; return Math.min(Math.max(rawWidth, minWidth), maxWidth); @@ -136,12 +232,16 @@ const mergeBounds = (bounds: BoundingBox[]): BoundingBox => { ); }; -const shouldInsertSpace = (prev: PdfJsonTextElement, current: PdfJsonTextElement): boolean => { - const prevRight = getX(prev) + getWidth(prev); +const shouldInsertSpace = ( + prev: PdfJsonTextElement, + current: PdfJsonTextElement, + metrics?: FontMetricsMap, +): boolean => { + const prevRight = getX(prev) + getWidth(prev, metrics); const trailingGap = Math.max(0, getX(current) - prevRight); const avgFontSize = (getFontSize(prev) + getFontSize(current)) / 2; const baselineAdvance = Math.max(0, getX(current) - getX(prev)); - const charWidthEstimate = estimateCharWidth(prev, avgFontSize); + const charWidthEstimate = estimateCharWidth(prev, avgFontSize, metrics); const inferredGap = Math.max(0, baselineAdvance - charWidthEstimate); const spacingHint = Math.max( SPACE_MIN_GAP, @@ -166,7 +266,7 @@ const shouldInsertSpace = (prev: PdfJsonTextElement, current: PdfJsonTextElement return false; }; -const buildGroupText = (elements: PdfJsonTextElement[]): string => { +const buildGroupText = (elements: PdfJsonTextElement[], metrics?: FontMetricsMap): string => { let result = ''; elements.forEach((element, index) => { const value = element.text ?? ''; @@ -176,7 +276,7 @@ const buildGroupText = (elements: PdfJsonTextElement[]): string => { } const previous = elements[index - 1]; - const needsSpace = shouldInsertSpace(previous, element); + const needsSpace = shouldInsertSpace(previous, element, metrics); const startsWithWhitespace = /^\s/u.test(value); if (needsSpace && !startsWithWhitespace) { @@ -314,21 +414,24 @@ const getAnchorPoint = (element: PdfJsonTextElement): { x: number; y: number } = }; }; -const computeBaselineLength = (elements: PdfJsonTextElement[]): number => - elements.reduce((acc, current) => acc + getWidth(current), 0); +const computeBaselineLength = ( + elements: PdfJsonTextElement[], + metrics?: FontMetricsMap, +): number => elements.reduce((acc, current) => acc + getWidth(current, metrics), 0); const createGroup = ( pageIndex: number, idSuffix: number, elements: PdfJsonTextElement[], + metrics?: FontMetricsMap, ): TextGroup => { const clones = elements.map(cloneTextElement); const originalClones = clones.map(cloneTextElement); - const bounds = mergeBounds(elements.map(getElementBounds)); + const bounds = mergeBounds(elements.map((element) => getElementBounds(element, metrics))); const firstElement = elements[0]; const rotation = computeGroupRotation(elements); const anchor = rotation !== null ? 
getAnchorPoint(firstElement) : null; - const baselineLength = computeBaselineLength(elements); + const baselineLength = computeBaselineLength(elements, metrics); return { id: `${pageIndex}-${idSuffix}`, @@ -343,13 +446,17 @@ const createGroup = ( baselineLength, elements: clones, originalElements: originalClones, - text: buildGroupText(elements), - originalText: buildGroupText(elements), + text: buildGroupText(elements, metrics), + originalText: buildGroupText(elements, metrics), bounds, }; }; -export const groupPageTextElements = (page: PdfJsonPage | null | undefined, pageIndex: number): TextGroup[] => { +export const groupPageTextElements = ( + page: PdfJsonPage | null | undefined, + pageIndex: number, + metrics?: FontMetricsMap, +): TextGroup[] => { if (!page?.textElements || page.textElements.length === 0) { return []; } @@ -393,7 +500,7 @@ export const groupPageTextElements = (page: PdfJsonPage | null | undefined, page } const previous = currentBucket[currentBucket.length - 1]; - const gap = getX(element) - (getX(previous) + getWidth(previous)); + const gap = getX(element) - (getX(previous) + getWidth(previous, metrics)); const avgFontSize = (getFontSize(previous) + getFontSize(element)) / 2; const splitThreshold = Math.max(SPACE_MIN_GAP, avgFontSize * GAP_FACTOR); @@ -412,7 +519,7 @@ export const groupPageTextElements = (page: PdfJsonPage | null | undefined, page } if (shouldSplit) { - groups.push(createGroup(pageIndex, groupCounter, currentBucket)); + groups.push(createGroup(pageIndex, groupCounter, currentBucket, metrics)); groupCounter += 1; currentBucket = [element]; } else { @@ -421,7 +528,7 @@ export const groupPageTextElements = (page: PdfJsonPage | null | undefined, page }); if (currentBucket.length > 0) { - groups.push(createGroup(pageIndex, groupCounter, currentBucket)); + groups.push(createGroup(pageIndex, groupCounter, currentBucket, metrics)); groupCounter += 1; } }); @@ -431,7 +538,8 @@ export const groupPageTextElements = (page: PdfJsonPage | null | undefined, page export const groupDocumentText = (document: PdfJsonDocument | null | undefined): TextGroup[][] => { const pages = document?.pages ?? []; - return pages.map((page, index) => groupPageTextElements(page, index)); + const metrics = buildFontMetrics(document); + return pages.map((page, index) => groupPageTextElements(page, index, metrics)); }; export const extractPageImages = ( diff --git a/scripts/convert_cff_to_ttf.py b/scripts/convert_cff_to_ttf.py new file mode 100644 index 000000000..7a7f99270 --- /dev/null +++ b/scripts/convert_cff_to_ttf.py @@ -0,0 +1,492 @@ +#!/usr/bin/env python3 +""" +Wrap raw CFF/Type1C data (extracted from PDFs) as OpenType-CFF for web compatibility. +Builds proper Unicode cmap from PDF ToUnicode data. +""" +import sys +import re +from pathlib import Path +from io import BytesIO +from fontTools.ttLib import TTFont, newTable +from fontTools.cffLib import CFFFontSet +from fontTools.ttLib.tables._c_m_a_p import cmap_format_4, cmap_format_12 +from fontTools.ttLib.tables._n_a_m_e import NameRecord +from fontTools.ttLib.tables.O_S_2f_2 import Panose + +def parse_unicode_mapping(mapping_path): + """ + Parse Unicode mapping (either JSON with CharCode→CID→GID→Unicode or raw ToUnicode CMap). 
+ + Returns: + dict[int, int]: GID → Unicode codepoint + """ + try: + with open(mapping_path, 'rb') as f: + data = f.read().decode('utf-8', errors='ignore') + + # Try parsing as JSON first (CID font with complete mapping) + if data.strip().startswith('{'): + import json + try: + mapping_data = json.loads(data) + if mapping_data.get('isCID'): + # Build GID → Unicode mapping from entries + gid_to_unicode = {} + for entry in mapping_data.get('entries', []): + gid = entry['gid'] + unicode_val = entry['unicode'] + if unicode_val > 0: + gid_to_unicode[gid] = unicode_val + print(f"Parsed JSON mapping: {len(gid_to_unicode)} GID→Unicode entries", file=sys.stderr) + return gid_to_unicode + except json.JSONDecodeError: + pass + + # Fall back to parsing raw ToUnicode CMap (non-CID fonts) + # For non-CID fonts, CID/GID is the same as array index + gid_to_unicode = {} + + # Pattern for bfchar entries + bfchar_pattern = r'<([0-9A-Fa-f]+)>\s*<([0-9A-Fa-f]+)>' + for match in re.finditer(bfchar_pattern, data): + gid = int(match.group(1), 16) # For non-CID, char code == GID + unicode_val = int(match.group(2), 16) + if unicode_val > 0: + gid_to_unicode[gid] = unicode_val + + # Pattern for bfrange entries + bfrange_pattern = r'<([0-9A-Fa-f]+)>\s*<([0-9A-Fa-f]+)>\s*<([0-9A-Fa-f]+)>' + for match in re.finditer(bfrange_pattern, data): + start_gid = int(match.group(1), 16) + end_gid = int(match.group(2), 16) + start_unicode = int(match.group(3), 16) + for i, gid in enumerate(range(start_gid, end_gid + 1)): + unicode_val = start_unicode + i + if unicode_val > 0: + gid_to_unicode[gid] = unicode_val + + print(f"Parsed ToUnicode CMap: {len(gid_to_unicode)} mappings", file=sys.stderr) + return gid_to_unicode + + except Exception as e: + print(f"Warning: Failed to parse Unicode mapping: {e}", file=sys.stderr) + return {} + +def wrap_cff_as_otf(input_path, output_path, tounicode_path=None): + """ + Wrap raw CFF data (from PDF font stream) as OpenType-CFF. 
+ + Args: + input_path: Path to input CFF data file + output_path: Path to output OTF font + tounicode_path: Optional path to ToUnicode CMap file + + Returns: + True if successful, False otherwise + """ + try: + # Read raw CFF data + with open(input_path, 'rb') as f: + cff_data = f.read() + + # Parse raw CFF data + cff_fontset = CFFFontSet() + cff_fontset.decompile(BytesIO(cff_data), None) + + # Get the first (and usually only) font in the CFF set + if len(cff_fontset.fontNames) == 0: + print("ERROR: No fonts found in CFF data", file=sys.stderr) + return False + + cff_font = cff_fontset[cff_fontset.fontNames[0]] + + # Parse Unicode mapping (JSON or raw ToUnicode CMap) if provided + gid_to_unicode = {} + if tounicode_path: + gid_to_unicode = parse_unicode_mapping(tounicode_path) + + # Create a new OTF font + otf = TTFont(sfntVersion='OTTO') # 'OTTO' = CFF-flavored OpenType + + # Get glyph names + if hasattr(cff_font, 'charset') and cff_font.charset is not None: + glyph_order = ['.notdef'] + [name for name in cff_font.charset if name != '.notdef'] + else: + # Fallback to CharStrings keys + charstrings = cff_font.CharStrings + glyph_order = ['.notdef'] + [name for name in charstrings.keys() if name != '.notdef'] + + otf.setGlyphOrder(glyph_order) + + # === Add CFF table (the actual font outlines) === + cff_table = newTable('CFF ') + cff_table.cff = cff_fontset + otf['CFF '] = cff_table + + # === Calculate metrics from CFF === + charstrings = cff_font.CharStrings + + # Get defaults from CFF Private dict + private_dict = getattr(cff_font, 'Private', None) + default_width = getattr(private_dict, 'defaultWidthX', 500) if private_dict else 500 + + # Calculate bounding box, widths, and LSBs + x_min = 0 + y_min = -200 + x_max = 1000 + y_max = 800 + max_advance = 0 + min_lsb = 0 + min_rsb = 0 + max_extent = 0 + + widths = {} + lsbs = {} + + for glyph_name in glyph_order: + lsb = 0 + width = int(default_width) + + if glyph_name in charstrings: + try: + cs = charstrings[glyph_name] + + # Get width from charstring + if hasattr(cs, 'width'): + width = int(cs.width) + + # Calculate bounds for LSB and bbox + try: + bounds = cs.calcBounds(None) + if bounds: + glyph_xmin = int(bounds[0]) + glyph_ymin = int(bounds[1]) + glyph_xmax = int(bounds[2]) + glyph_ymax = int(bounds[3]) + + lsb = glyph_xmin + rsb = width - glyph_xmax + extent = lsb + glyph_xmax + + # Update global bounds + x_min = min(x_min, glyph_xmin) + y_min = min(y_min, glyph_ymin) + x_max = max(x_max, glyph_xmax) + y_max = max(y_max, glyph_ymax) + + # Update hhea metrics + min_lsb = min(min_lsb, lsb) + min_rsb = min(min_rsb, rsb) + max_extent = max(max_extent, extent) + except: + pass # Some glyphs may not have outlines + + except Exception as e: + pass # Use defaults + + widths[glyph_name] = width + lsbs[glyph_name] = lsb + max_advance = max(max_advance, width) + + if max_advance == 0: + max_advance = 1000 + if max_extent == 0: + max_extent = x_max + + units_per_em = 1000 # Standard for Type1/CFF + + # === Create head table === + head = newTable('head') + head.tableVersion = 1.0 + head.fontRevision = 1.0 + head.checkSumAdjustment = 0 + head.magicNumber = 0x5F0F3CF5 + head.flags = 0x000B # Baseline at y=0, LSB at x=0, integer PPEM + head.unitsPerEm = units_per_em + head.created = 3600000000 + head.modified = 3600000000 + head.xMin = x_min + head.yMin = y_min + head.xMax = x_max + head.yMax = y_max + head.macStyle = 0 + head.fontDirectionHint = 2 + head.indexToLocFormat = 0 + head.glyphDataFormat = 0 + head.lowestRecPPEM = 8 + otf['head'] = head + 
+ # === Create hhea table with correct metrics === + hhea = newTable('hhea') + hhea.tableVersion = 0x00010000 + hhea.ascent = max(y_max, 800) + hhea.descent = min(y_min, -200) + hhea.lineGap = 0 + hhea.advanceWidthMax = max_advance + hhea.minLeftSideBearing = min_lsb + hhea.minRightSideBearing = min_rsb + hhea.xMaxExtent = max_extent + hhea.caretSlopeRise = 1 + hhea.caretSlopeRun = 0 + hhea.caretOffset = 0 + hhea.reserved0 = 0 + hhea.reserved1 = 0 + hhea.reserved2 = 0 + hhea.reserved3 = 0 + hhea.metricDataFormat = 0 + hhea.numberOfHMetrics = len(glyph_order) + otf['hhea'] = hhea + + # === Create hmtx table with correct LSBs === + hmtx = newTable('hmtx') + hmtx.metrics = {} + for glyph_name in glyph_order: + hmtx.metrics[glyph_name] = (widths.get(glyph_name, default_width), lsbs.get(glyph_name, 0)) + otf['hmtx'] = hmtx + + # === Create maxp table (simpler for CFF) === + maxp = newTable('maxp') + maxp.tableVersion = 0x00005000 # CFF version (0.5) + maxp.numGlyphs = len(glyph_order) + otf['maxp'] = maxp + + # === Build Unicode cmap from GID→Unicode mapping === + unicode_to_glyph = {} + + if gid_to_unicode: + # Debug: Show first few glyph names to understand naming convention + sample_glyphs = glyph_order[:min(10, len(glyph_order))] + print(f"Sample glyph names: {sample_glyphs}", file=sys.stderr) + + # Debug: Show which GIDs we have mappings for + sample_gids = sorted(gid_to_unicode.keys())[:10] + print(f"Sample GIDs from mapping: {sample_gids}", file=sys.stderr) + + # For CID fonts: glyph names are "cid00123" (5-digit zero-padded) + # For non-CID fonts: glyph names vary but GID == array index + is_cid_font = any(gn.startswith('cid') for gn in glyph_order[1:6]) # Check first few non-.notdef glyphs + + for gid, unicode_val in gid_to_unicode.items(): + if unicode_val > 0: + if is_cid_font: + # Build glyph name as cidNNNNN (5 digits, zero-padded) + glyph_name = f"cid{gid:05d}" + # Verify this glyph exists in glyph_order + if glyph_name in glyph_order: + unicode_to_glyph[unicode_val] = glyph_name + else: + # Try without padding (some fonts use "cid123" not "cid00123") + glyph_name_alt = f"cid{gid}" + if glyph_name_alt in glyph_order: + unicode_to_glyph[unicode_val] = glyph_name_alt + else: + # Non-CID font: GID is array index + if 0 <= gid < len(glyph_order): + glyph_name = glyph_order[gid] + unicode_to_glyph[unicode_val] = glyph_name + + print(f"Mapped {len(unicode_to_glyph)} Unicode codepoints (isCID={is_cid_font if gid_to_unicode else 'unknown'})", file=sys.stderr) + + # Also try to map from glyph names (uni0041 → U+0041) + for glyph_name in glyph_order: + if glyph_name.startswith('uni') and len(glyph_name) == 7: + try: + unicode_val = int(glyph_name[3:], 16) + if unicode_val not in unicode_to_glyph: + unicode_to_glyph[unicode_val] = glyph_name + except: + pass + elif glyph_name.startswith('u') and len(glyph_name) >= 5: + try: + unicode_val = int(glyph_name[1:], 16) + if unicode_val not in unicode_to_glyph: + unicode_to_glyph[unicode_val] = glyph_name + except: + pass + + # === Create cmap table === + cmap = newTable('cmap') + cmap.tableVersion = 0 + cmap_tables = [] + + # Windows Unicode BMP (format 4) - required + cmap4_win = cmap_format_4(4) + cmap4_win.platformID = 3 # Windows + cmap4_win.platEncID = 1 # Unicode BMP + cmap4_win.language = 0 + cmap4_win.cmap = {cp: gn for cp, gn in unicode_to_glyph.items() if cp <= 0xFFFF} + cmap_tables.append(cmap4_win) + + # Windows Unicode UCS-4 (format 12) - for >BMP + if any(cp > 0xFFFF for cp in unicode_to_glyph): + cmap12_win = cmap_format_12(12) + 
cmap12_win.platformID = 3 # Windows + cmap12_win.platEncID = 10 # Unicode UCS-4 + cmap12_win.language = 0 + cmap12_win.cmap = dict(unicode_to_glyph) + cmap_tables.append(cmap12_win) + + # Mac Unicode (format 4) - for compatibility + cmap4_mac = cmap_format_4(4) + cmap4_mac.platformID = 1 # Mac + cmap4_mac.platEncID = 0 # Roman + cmap4_mac.language = 0 + cmap4_mac.cmap = {cp: gn for cp, gn in unicode_to_glyph.items() if cp <= 0xFFFF} + cmap_tables.append(cmap4_mac) + + cmap.tables = [t for t in cmap_tables if t.cmap] or [cmap4_win] # Ensure at least one + otf['cmap'] = cmap + + print(f"Built cmap with {len(unicode_to_glyph)} Unicode mappings", file=sys.stderr) + + # === Create OS/2 table with correct metrics === + os2 = newTable('OS/2') + os2.version = 4 + os2.xAvgCharWidth = int(sum(widths.values()) / len(widths)) if widths else 500 + os2.usWeightClass = 400 # Normal + os2.usWidthClass = 5 # Medium + os2.fsType = 0 # Installable embedding + os2.ySubscriptXSize = 650 + os2.ySubscriptYSize = 600 + os2.ySubscriptXOffset = 0 + os2.ySubscriptYOffset = 75 + os2.ySuperscriptXSize = 650 + os2.ySuperscriptYSize = 600 + os2.ySuperscriptXOffset = 0 + os2.ySuperscriptYOffset = 350 + os2.yStrikeoutSize = 50 + os2.yStrikeoutPosition = 300 + os2.sFamilyClass = 0 + + # PANOSE - use proper object structure + os2.panose = Panose() + os2.panose.bFamilyType = 0 + os2.panose.bSerifStyle = 0 + os2.panose.bWeight = 0 + os2.panose.bProportion = 0 + os2.panose.bContrast = 0 + os2.panose.bStrokeVariation = 0 + os2.panose.bArmStyle = 0 + os2.panose.bLetterForm = 0 + os2.panose.bMidline = 0 + os2.panose.bXHeight = 0 + + os2.ulUnicodeRange1 = 0 + os2.ulUnicodeRange2 = 0 + os2.ulUnicodeRange3 = 0 + os2.ulUnicodeRange4 = 0 + os2.achVendID = 'SPDF' + os2.fsSelection = 0x0040 # REGULAR bit + + # Set character index range from actual cmap + if unicode_to_glyph: + codepoints = sorted(unicode_to_glyph.keys()) + os2.usFirstCharIndex = codepoints[0] + os2.usLastCharIndex = codepoints[-1] + else: + os2.usFirstCharIndex = 0x20 # space + os2.usLastCharIndex = 0x7E # tilde + + # Typo metrics match hhea + os2.sTypoAscender = hhea.ascent + os2.sTypoDescender = hhea.descent + os2.sTypoLineGap = hhea.lineGap + + # Windows metrics (positive values, cover bbox) + os2.usWinAscent = max(0, y_max) + os2.usWinDescent = max(0, -y_min) + + os2.ulCodePageRange1 = 0x00000001 # Latin 1 + os2.ulCodePageRange2 = 0 + os2.sxHeight = 500 + os2.sCapHeight = 700 + os2.usDefaultChar = 0 + os2.usBreakChar = 32 + os2.usMaxContext = 0 + otf['OS/2'] = os2 + + # === Create name table with Windows and Mac records === + name = newTable('name') + name.names = [] + + # Get font name from CFF if available + font_name = cff_fontset.fontNames[0] if cff_fontset.fontNames else "Converted" + + name_strings = { + 1: font_name, # Font Family + 2: "Regular", # Subfamily + 3: f"Stirling-PDF: {font_name}", # Unique ID + 4: font_name, # Full Name + 5: "Version 1.0", # Version + 6: font_name.replace(' ', '-'), # PostScript Name + } + + # Add both Windows and Mac name records + for name_id, value in name_strings.items(): + # Windows (platform 3, encoding 1, language 0x0409 = en-US) + rec_win = NameRecord() + rec_win.nameID = name_id + rec_win.platformID = 3 + rec_win.platEncID = 1 + rec_win.langID = 0x0409 + rec_win.string = value + name.names.append(rec_win) + + # Mac (platform 1, encoding 0, language 0) + rec_mac = NameRecord() + rec_mac.nameID = name_id + rec_mac.platformID = 1 + rec_mac.platEncID = 0 + rec_mac.langID = 0 + rec_mac.string = value + 
+            name.names.append(rec_mac)
+
+        otf['name'] = name
+
+        # === Create post table (format 3.0 for smaller web fonts) ===
+        post = newTable('post')
+        post.formatType = 3.0  # No glyph names (smaller, web-optimized)
+        post.italicAngle = 0
+        post.underlinePosition = -100
+        post.underlineThickness = 50
+        post.isFixedPitch = 0
+        post.minMemType42 = 0
+        post.maxMemType42 = 0
+        post.minMemType1 = 0
+        post.maxMemType1 = 0
+        otf['post'] = post
+
+        # Save the OTF font
+        otf.save(output_path)
+        otf.close()
+
+        return True
+
+    except Exception as e:
+        print(f"ERROR: Conversion failed: {str(e)}", file=sys.stderr)
+        import traceback
+        traceback.print_exc(file=sys.stderr)
+        return False
+
+def main():
+    if len(sys.argv) < 3:
+        print("Usage: convert_cff_to_ttf.py <input.cff> <output.otf> [tounicode.cmap]", file=sys.stderr)
+        sys.exit(1)
+
+    input_path = Path(sys.argv[1])
+    output_path = Path(sys.argv[2])
+    tounicode_path = Path(sys.argv[3]) if len(sys.argv) > 3 else None
+
+    if not input_path.exists():
+        print(f"ERROR: Input file not found: {input_path}", file=sys.stderr)
+        sys.exit(1)
+
+    if tounicode_path and not tounicode_path.exists():
+        print(f"Warning: ToUnicode file not found: {tounicode_path}", file=sys.stderr)
+        tounicode_path = None
+
+    success = wrap_cff_as_otf(str(input_path), str(output_path), str(tounicode_path) if tounicode_path else None)
+    sys.exit(0 if success else 1)
+
+if __name__ == '__main__':
+    main()
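For manual testing, the new convert_cff_to_ttf.py script accepts the same argument order that PdfJsonConversionService assembles in convertCffUsingPython(): the input CFF program, the output OTF, and an optional ToUnicode/JSON mapping file. Below is a minimal sketch of a standalone invocation, assuming the default python-command and python-script paths from settings.yml.template; the sample font file names are illustrative placeholders, not files shipped with the project.

    import subprocess

    # Mirrors the command built in convertCffUsingPython(); interpreter and script
    # paths are the settings.yml.template defaults, the font file names are placeholders.
    cmd = [
        "/opt/venv/bin/python3",
        "/scripts/convert_cff_to_ttf.py",
        "sample-font.cff",        # raw CFF/Type1C program extracted from the PDF
        "sample-font.otf",        # OpenType-CFF output used for the web preview
        "sample-font.tounicode",  # optional: ToUnicode CMap or JSON CharCode/CID/GID/Unicode mapping
    ]
    # The 30-second timeout matches the safety margin used on the Java side.
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
    print(result.returncode)
    print(result.stdout or result.stderr)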