garbage commit font remappings

This commit is contained in:
Anthony Stirling 2025-10-24 15:34:35 +01:00
parent c7c5613c13
commit 0d9321e6a1
9 changed files with 1213 additions and 93 deletions

View File

@ -44,6 +44,12 @@ public class PdfJsonFont {
/** Hint describing the font program type (ttf, otf, cff, pfb, etc.). */
private String programFormat;
/** Web-optimized font program (e.g. converted TrueType) encoded as Base64. */
private String webProgram;
/** Format hint for the webProgram payload. */
private String webProgramFormat;
/** ToUnicode stream encoded as Base64 when present. */
private String toUnicode;
@ -70,4 +76,7 @@ public class PdfJsonFont {
/** Units per em extracted from the font matrix. */
private Integer unitsPerEm;
/** Serialized COS dictionary describing the original font resource. */
private PdfJsonCosValue cosDictionary;
}

View File

@ -37,4 +37,5 @@ public class PdfJsonTextElement {
private PdfJsonTextColor fillColor;
private PdfJsonTextColor strokeColor;
private Integer renderingMode;
private Boolean fallbackUsed;
}

View File

@ -34,6 +34,7 @@ import java.util.Set;
import java.util.TimeZone;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import javax.imageio.ImageIO;
@ -64,6 +65,7 @@ import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
import org.apache.pdfbox.pdmodel.font.PDFontFactory;
import org.apache.pdfbox.pdmodel.font.PDType0Font;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
@ -90,6 +92,8 @@ import org.springframework.web.multipart.MultipartFile;
import com.fasterxml.jackson.databind.ObjectMapper;
import jakarta.annotation.PostConstruct;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@ -173,11 +177,61 @@ public class PdfJsonConversionService {
@Value("${stirling.pdf.json.cff-converter.enabled:true}")
private boolean cffConversionEnabled;
@Value("${stirling.pdf.json.cff-converter.method:python}")
private String cffConverterMethod;
@Value("${stirling.pdf.json.cff-converter.python-command:/opt/venv/bin/python3}")
private String pythonCommand;
@Value("${stirling.pdf.json.cff-converter.python-script:/scripts/convert_cff_to_ttf.py}")
private String pythonScript;
@Value("${stirling.pdf.json.cff-converter.fontforge-command:fontforge}")
private String fontforgeCommand;
private final Map<String, byte[]> fallbackFontCache = new ConcurrentHashMap<>();
private volatile boolean ghostscriptAvailable;
@PostConstruct
private void initializeGhostscriptAvailability() {
    // One-time startup probe for a working Ghostscript binary; the outcome is
    // cached in the volatile flag that canRunGhostscript() consults later.
    ghostscriptAvailable = false;
    if (!fontNormalizationEnabled) {
        return;
    }
    if (!isGhostscriptGroupEnabled()) {
        log.warn(
                "Ghostscript font normalization disabled: Ghostscript group is not enabled in configuration");
        return;
    }
    try {
        // "gs -version" is a cheap way to verify the executable exists and runs.
        ProcessExecutorResult probe =
                ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
                        .runCommandWithOutputHandling(List.of("gs", "-version"));
        ghostscriptAvailable = probe.getRc() == 0;
        if (!ghostscriptAvailable) {
            log.warn(
                    "Ghostscript executable not available (exit code {}); font normalization will be skipped",
                    probe.getRc());
        }
    } catch (InterruptedException ex) {
        // Restore the interrupt flag so callers higher up can observe it.
        Thread.currentThread().interrupt();
        log.warn(
                "Ghostscript availability check interrupted; font normalization will be skipped: {}",
                ex.getMessage());
    } catch (IOException ex) {
        log.warn(
                "Ghostscript executable not found or failed to start; font normalization will be skipped: {}",
                ex.getMessage());
    }
}
public byte[] convertPdfToJson(MultipartFile file) throws IOException {
if (file == null) {
throw ExceptionUtils.createNullArgumentException("fileInput");
@ -452,10 +506,22 @@ public class PdfJsonConversionService {
String encoding = resolveEncoding(font);
PdfJsonFontCidSystemInfo cidInfo = extractCidSystemInfo(font.getCOSObject());
boolean embedded = font.isEmbedded();
FontProgramData programData = embedded ? extractFontProgram(font) : null;
String toUnicode = extractToUnicode(font.getCOSObject());
// Build complete CharCodeCIDGIDUnicode mapping for CID fonts
String unicodeMapping = buildUnicodeMapping(font, toUnicode);
FontProgramData programData = embedded ? extractFontProgram(font, unicodeMapping) : null;
String standard14Name = resolveStandard14Name(font);
Integer flags = descriptor != null ? descriptor.getFlags() : null;
PdfJsonCosValue cosDictionary = serializeCosValue(font.getCOSObject());
log.debug(
"Building font model: id={}, baseName={}, subtype={}, embedded={}, hasProgram={}, hasWebProgram={}",
fontId,
font.getName(),
subtype,
embedded,
programData != null && programData.getBase64() != null,
programData != null && programData.getWebBase64() != null);
return PdfJsonFont.builder()
.id(fontId)
@ -468,6 +534,8 @@ public class PdfJsonConversionService {
.embedded(embedded)
.program(programData != null ? programData.getBase64() : null)
.programFormat(programData != null ? programData.getFormat() : null)
.webProgram(programData != null ? programData.getWebBase64() : null)
.webProgramFormat(programData != null ? programData.getWebFormat() : null)
.toUnicode(toUnicode)
.standard14Name(standard14Name)
.fontDescriptorFlags(flags)
@ -477,6 +545,7 @@ public class PdfJsonConversionService {
.xHeight(descriptor != null ? descriptor.getXHeight() : null)
.italicAngle(descriptor != null ? descriptor.getItalicAngle() : null)
.unitsPerEm(extractUnitsPerEm(font))
.cosDictionary(cosDictionary)
.build();
}
@ -508,11 +577,13 @@ public class PdfJsonConversionService {
if (font == null) {
fallbackNeeded = true;
fallbackIds.add(FALLBACK_FONT_ID);
element.setFallbackUsed(Boolean.TRUE);
continue;
}
if (!canEncodeFully(font, text)) {
fallbackNeeded = true;
element.setFallbackUsed(Boolean.TRUE);
for (int offset = 0; offset < text.length(); ) {
int codePoint = text.codePointAt(offset);
offset += Character.charCount(codePoint);
@ -682,11 +753,25 @@ public class PdfJsonConversionService {
}
/**
 * Gate for Ghostscript-based font normalization: the feature flag, the
 * endpoint group, and the startup availability probe must all allow it.
 */
private boolean canRunGhostscript() {
    if (!fontNormalizationEnabled || !isGhostscriptGroupEnabled()) {
        return false;
    }
    if (ghostscriptAvailable) {
        return true;
    }
    log.debug("Skipping Ghostscript normalization; executable not available");
    return false;
}
/**
 * Reports whether the "Ghostscript" endpoint group is enabled.
 *
 * <p>Any lookup failure is treated as "disabled" so callers simply skip the
 * Ghostscript path instead of failing the conversion.
 *
 * @return {@code true} only when the configuration is present and the group is enabled
 */
private boolean isGhostscriptGroupEnabled() {
    try {
        return endpointConfiguration != null
                && endpointConfiguration.isGroupEnabled("Ghostscript");
    } catch (Exception ex) {
        // The flattened diff carried both the old and new debug lines; keep only
        // the current one so the failure is not logged twice.
        log.debug("Ghostscript group check failed: {}", ex.getMessage());
        return false;
    }
}
@ -736,12 +821,129 @@ public class PdfJsonConversionService {
return null;
}
private byte[] convertCffProgramToTrueType(byte[] fontBytes) {
if (!cffConversionEnabled
|| fontforgeCommand == null
|| fontforgeCommand.isBlank()
|| fontBytes == null
|| fontBytes.length == 0) {
private byte[] convertCffProgramToTrueType(byte[] fontBytes, String toUnicode) {
if (!cffConversionEnabled || fontBytes == null || fontBytes.length == 0) {
return null;
}
// Determine which converter to use
if ("python".equalsIgnoreCase(cffConverterMethod)) {
return convertCffUsingPython(fontBytes, toUnicode);
} else if ("fontforge".equalsIgnoreCase(cffConverterMethod)) {
return convertCffUsingFontForge(fontBytes);
} else {
log.warn("Unknown CFF converter method: {}, falling back to Python", cffConverterMethod);
return convertCffUsingPython(fontBytes, toUnicode);
}
}
/**
 * Wraps a bare CFF font program as OpenType by shelling out to the configured
 * Python fontTools script.
 *
 * <p>The font bytes (and, when present, the decoded ToUnicode CMap) are written to
 * temp files, the script is run with a 30-second timeout, and the produced file is
 * validated via {@code validateFontTables} before being returned.
 *
 * @param fontBytes raw CFF program bytes to wrap
 * @param toUnicode Base64-encoded ToUnicode CMap, or {@code null} when unavailable
 * @return wrapped font bytes, or {@code null} on any failure (unconfigured converter,
 *     timeout, non-zero exit, empty or invalid output)
 */
private byte[] convertCffUsingPython(byte[] fontBytes, String toUnicode) {
    if (pythonCommand == null
            || pythonCommand.isBlank()
            || pythonScript == null
            || pythonScript.isBlank()) {
        log.debug("Python converter not configured");
        return null;
    }
    // A null resource in try-with-resources is legal: it is simply skipped on close,
    // so toUnicodeFile only exists when ToUnicode data was supplied.
    try (TempFile inputFile = new TempFile(tempFileManager, ".cff");
            TempFile outputFile = new TempFile(tempFileManager, ".otf");
            TempFile toUnicodeFile = toUnicode != null ? new TempFile(tempFileManager, ".tounicode") : null) {
        Files.write(inputFile.getPath(), fontBytes);
        // Write ToUnicode CMap data if available
        if (toUnicode != null && toUnicodeFile != null) {
            byte[] toUnicodeBytes = Base64.getDecoder().decode(toUnicode);
            Files.write(toUnicodeFile.getPath(), toUnicodeBytes);
        }
        // Invocation: <python> <script> <input> <output> [<tounicode>]
        List<String> command = new ArrayList<>();
        command.add(pythonCommand);
        command.add(pythonScript);
        command.add(inputFile.getAbsolutePath());
        command.add(outputFile.getAbsolutePath());
        // Add optional ToUnicode file path
        if (toUnicodeFile != null) {
            command.add(toUnicodeFile.getAbsolutePath());
        }
        ProcessBuilder builder = new ProcessBuilder(command);
        builder.redirectErrorStream(true);
        Process process = builder.start();
        // Drain stdout/stderr on a separate thread so the child cannot block on a
        // full pipe while we wait for it to exit.
        StringBuilder output = new StringBuilder();
        Thread reader =
                new Thread(
                        () -> {
                            try (BufferedReader br =
                                    new BufferedReader(
                                            new InputStreamReader(
                                                    process.getInputStream(),
                                                    StandardCharsets.UTF_8))) {
                                String line;
                                while ((line = br.readLine()) != null) {
                                    output.append(line).append('\n');
                                }
                            } catch (IOException ignored) {
                                // Stream closes when the process dies; nothing useful to do here.
                            }
                        });
        reader.start();
        // Wait with timeout (Python fontTools is usually fast, but provide safety margin)
        boolean finished = process.waitFor(30, TimeUnit.SECONDS);
        if (!finished) {
            process.destroyForcibly();
            reader.interrupt();
            log.warn(
                    "Python CFF→OTF wrapping timed out after 30 seconds - font may be corrupted");
            return null;
        }
        int exitCode = process.exitValue();
        // Bounded join: don't hang forever if the reader is stuck on a half-closed pipe.
        reader.join(5000);
        if (exitCode == 0 && Files.exists(outputFile.getPath())) {
            byte[] convertedBytes = Files.readAllBytes(outputFile.getPath());
            if (convertedBytes.length > 0) {
                // Reject structurally broken fonts before handing them to browsers.
                String validationError = validateFontTables(convertedBytes);
                if (validationError != null) {
                    log.warn("Python converter produced invalid font: {}", validationError);
                    return null;
                }
                // Log Python script output for debugging
                String outputStr = output.toString().trim();
                if (!outputStr.isEmpty()) {
                    log.debug("Python script output: {}", outputStr);
                }
                log.debug(
                        "Python CFF→OTF wrapping successful: {} bytes → {} bytes",
                        fontBytes.length,
                        convertedBytes.length);
                return convertedBytes;
            }
            // NOTE(review): a zero-byte output with exit code 0 falls through and
            // returns null silently — confirm the script always errors on failure.
        } else {
            String outputStr = output.toString().trim();
            if (!outputStr.isEmpty()) {
                log.warn("Python CFF→OTF wrapping failed with exit code {}: {}", exitCode, outputStr);
            } else {
                log.warn("Python CFF→OTF wrapping failed with exit code {}", exitCode);
            }
        }
    } catch (InterruptedException ex) {
        // Preserve interrupt status for callers.
        Thread.currentThread().interrupt();
        log.debug("Python CFF conversion interrupted", ex);
    } catch (IOException ex) {
        log.debug("Python CFF conversion I/O error", ex);
    }
    return null;
}
private byte[] convertCffUsingFontForge(byte[] fontBytes) {
if (fontforgeCommand == null || fontforgeCommand.isBlank()) {
log.debug("FontForge converter not configured");
return null;
}
@ -754,8 +956,18 @@ public class PdfJsonConversionService {
command.add("-lang=ff");
command.add("-c");
command.add(
"Open($1); SelectWorthOutputting(); SetFontOrder(2); Reencode(\"unicode\"); "
+ "Generate($2); Close(); Quit()");
"Open($1); "
+ "ScaleToEm(1000); " // Force 1000 units per em (standard for Type1)
+ "SelectWorthOutputting(); "
+ "SetFontOrder(2); "
+ "Reencode(\"unicode\"); "
+ "RoundToInt(); "
+ "RemoveOverlap(); "
+ "Simplify(); "
+ "CorrectDirection(); "
+ "Generate($2, \"\", 4+16+32); "
+ "Close(); "
+ "Quit()");
command.add(inputFile.getAbsolutePath());
command.add(outputFile.getAbsolutePath());
@ -780,11 +992,59 @@ public class PdfJsonConversionService {
}
});
reader.start();
int exitCode = process.waitFor();
reader.join();
// Wait with timeout to prevent hanging on problematic fonts
boolean finished = process.waitFor(30, TimeUnit.SECONDS);
if (!finished) {
process.destroyForcibly();
reader.interrupt();
log.warn("FontForge conversion timed out after 30 seconds - font may be too complex or causing FontForge to hang");
return null;
}
int exitCode = process.exitValue();
reader.join(5000); // Wait max 5 seconds for reader thread
if (exitCode == 0 && Files.exists(outputFile.getPath())) {
return Files.readAllBytes(outputFile.getPath());
byte[] convertedBytes = Files.readAllBytes(outputFile.getPath());
if (convertedBytes.length > 0) {
// Basic validation: check for TrueType magic number and critical tables
if (convertedBytes.length >= 4) {
int magic =
((convertedBytes[0] & 0xFF) << 24)
| ((convertedBytes[1] & 0xFF) << 16)
| ((convertedBytes[2] & 0xFF) << 8)
| (convertedBytes[3] & 0xFF);
boolean validTrueType =
magic == 0x00010000 || magic == 0x74727565; // 1.0 or 'true'
boolean validOpenType = magic == 0x4F54544F; // 'OTTO'
if (validTrueType || validOpenType) {
// Additional validation: check unitsPerEm in head table
String validationError = validateFontTables(convertedBytes);
if (validationError != null) {
log.warn(
"FontForge produced invalid font: {}",
validationError);
return null;
}
log.debug(
"FontForge CFF→TrueType conversion successful: {} bytes, magic: 0x{}, type: {}",
convertedBytes.length,
Integer.toHexString(magic),
validOpenType ? "OpenType" : "TrueType");
return convertedBytes;
} else {
log.warn(
"FontForge produced invalid font: magic number 0x{} (expected TrueType or OpenType)",
Integer.toHexString(magic));
return null;
}
}
}
log.warn("FontForge produced empty output file");
return null;
}
log.warn(
@ -801,6 +1061,127 @@ public class PdfJsonConversionService {
return null;
}
/**
 * Validates critical OpenType/TrueType font tables to ensure browser compatibility.
 *
 * <p>Walks the sfnt table directory looking for the {@code head} table and
 * sanity-checks its {@code unitsPerEm} field, which browsers reject when it is
 * outside the spec range of 16–16384.
 *
 * @param fontBytes candidate font file bytes (sfnt-framed TrueType or OpenType)
 * @return Error message if invalid, null if valid
 */
private String validateFontTables(byte[] fontBytes) {
    try {
        if (fontBytes.length < 12) {
            return "Font file too small";
        }
        // numTables is a big-endian uint16 at offset 4 of the sfnt header.
        int numTables = ((fontBytes[4] & 0xFF) << 8) | (fontBytes[5] & 0xFF);
        if (numTables == 0 || numTables > 100) {
            return "Invalid table count: " + numTables;
        }
        // Scan the 16-byte directory records that follow the 12-byte sfnt header.
        int offset = 12;
        for (int i = 0; i < numTables && offset + 16 <= fontBytes.length; i++) {
            String tag = new String(fontBytes, offset, 4, StandardCharsets.US_ASCII);
            if ("head".equals(tag)) {
                int tableOffset =
                        ((fontBytes[offset + 8] & 0xFF) << 24)
                                | ((fontBytes[offset + 9] & 0xFF) << 16)
                                | ((fontBytes[offset + 10] & 0xFF) << 8)
                                | (fontBytes[offset + 11] & 0xFF);
                // unitsPerEm is a uint16 at offset 18 within head; we read the two
                // bytes at +18 and +19, so the array must extend to tableOffset + 20.
                // (The previous "+ 18" check was off by two and could index past the
                // end; a corrupt offset with the high bit set could also go negative.)
                if (tableOffset < 0 || tableOffset + 20 > fontBytes.length) {
                    return "head table truncated";
                }
                int unitsPerEm =
                        ((fontBytes[tableOffset + 18] & 0xFF) << 8)
                                | (fontBytes[tableOffset + 19] & 0xFF);
                if (unitsPerEm < 16 || unitsPerEm > 16384) {
                    return "Invalid unitsPerEm: " + unitsPerEm + " (must be 16-16384)";
                }
                return null; // Valid
            }
            offset += 16;
        }
        return "head table not found";
    } catch (Exception ex) {
        return "Validation error: " + ex.getMessage();
    }
}
/**
 * Builds a complete CharCode→CID→GID→Unicode mapping for CID (Type0) fonts,
 * serialized as Base64-encoded JSON; non-CID fonts get their raw ToUnicode back.
 *
 * @param font the source font
 * @param toUnicodeBase64 Base64-encoded ToUnicode CMap stream, or {@code null}
 * @return Base64 JSON mapping for CID fonts, the raw input for non-CID fonts,
 *     {@code null} when no ToUnicode data exists; falls back to the raw input on parse failure
 * @throws IOException declared for API compatibility with callers
 */
private String buildUnicodeMapping(PDFont font, String toUnicodeBase64) throws IOException {
    log.debug("buildUnicodeMapping called for font: {}, hasToUnicode: {}, isCID: {}",
            font.getName(), toUnicodeBase64 != null, font instanceof PDType0Font);
    if (toUnicodeBase64 == null || toUnicodeBase64.isBlank()) {
        log.debug("No ToUnicode data for font: {}", font.getName());
        return null;
    }
    // For non-CID fonts, just return ToUnicode as-is
    if (!(font instanceof PDType0Font type0Font)) {
        log.debug("Non-CID font {}, returning raw ToUnicode", font.getName());
        return toUnicodeBase64;
    }
    log.debug("Building JSON mapping for CID font: {}", font.getName());
    try {
        byte[] toUnicodeBytes = Base64.getDecoder().decode(toUnicodeBase64);
        String toUnicodeStr = new String(toUnicodeBytes, StandardCharsets.UTF_8);
        // Parse only bfchar/bfrange sections. The previous blanket <..> <..> regex
        // also matched codespacerange pairs and mis-read bfrange triples (treating
        // the range END as the Unicode value), producing bogus mappings.
        Map<Integer, Integer> charCodeToUnicode = parseToUnicodeCmap(toUnicodeStr);
        // Build JSON mapping: CharCode -> CID -> GID -> Unicode
        StringBuilder json = new StringBuilder();
        json.append("{\"isCID\":true,\"cidToGidIdentity\":true,\"entries\":[");
        boolean first = true;
        for (Map.Entry<Integer, Integer> entry : charCodeToUnicode.entrySet()) {
            int charCode = entry.getKey();
            int unicode = entry.getValue();
            try {
                // Get CID from char code
                int cid = type0Font.codeToCID(charCode);
                // For Identity-H/V encoding, GID == CID
                int gid = cid;
                if (!first) {
                    json.append(",");
                }
                first = false;
                json.append(String.format("{\"code\":%d,\"cid\":%d,\"gid\":%d,\"unicode\":%d}",
                        charCode, cid, gid, unicode));
            } catch (Exception e) {
                // Skip entries that fail to map
                log.debug("Failed to map charCode {} in font {}: {}", charCode, font.getName(), e.getMessage());
            }
        }
        json.append("]}");
        String jsonStr = json.toString();
        log.debug("Built Unicode mapping for CID font {} with {} entries",
                font.getName(), charCodeToUnicode.size());
        return Base64.getEncoder().encodeToString(jsonStr.getBytes(StandardCharsets.UTF_8));
    } catch (Exception e) {
        log.warn("Failed to build Unicode mapping for font {}: {}", font.getName(), e.getMessage());
        return toUnicodeBase64; // Fall back to raw ToUnicode
    }
}

/**
 * Extracts charCode→Unicode pairs from the bfchar and bfrange sections of a
 * ToUnicode CMap. Parse errors propagate to the caller's fallback handling.
 */
private static Map<Integer, Integer> parseToUnicodeCmap(String cmap) {
    Map<Integer, Integer> result = new HashMap<>();
    // bfchar: <srcCode> <dstUnicode> pairs inside beginbfchar ... endbfchar.
    java.util.regex.Pattern pairPattern =
            java.util.regex.Pattern.compile("<([0-9A-Fa-f]+)>\\s*<([0-9A-Fa-f]+)>");
    java.util.regex.Pattern bfcharSection =
            java.util.regex.Pattern.compile("beginbfchar(.*?)endbfchar", java.util.regex.Pattern.DOTALL);
    java.util.regex.Matcher section = bfcharSection.matcher(cmap);
    while (section.find()) {
        java.util.regex.Matcher pair = pairPattern.matcher(section.group(1));
        while (pair.find()) {
            // NOTE(review): multi-code-unit destinations (ligatures, surrogate pairs)
            // are parsed as a single integer, matching the previous behavior — confirm
            // downstream consumers expect that.
            result.put(Integer.parseInt(pair.group(1), 16), Integer.parseInt(pair.group(2), 16));
        }
    }
    // bfrange: <start> <end> <dstStart> triples inside beginbfrange ... endbfrange.
    java.util.regex.Pattern triplePattern =
            java.util.regex.Pattern.compile("<([0-9A-Fa-f]+)>\\s*<([0-9A-Fa-f]+)>\\s*<([0-9A-Fa-f]+)>");
    java.util.regex.Pattern bfrangeSection =
            java.util.regex.Pattern.compile("beginbfrange(.*?)endbfrange", java.util.regex.Pattern.DOTALL);
    section = bfrangeSection.matcher(cmap);
    while (section.find()) {
        java.util.regex.Matcher triple = triplePattern.matcher(section.group(1));
        while (triple.find()) {
            int start = Integer.parseInt(triple.group(1), 16);
            int end = Integer.parseInt(triple.group(2), 16);
            int dstStart = Integer.parseInt(triple.group(3), 16);
            // Cap pathological ranges so a malformed CMap cannot blow up memory.
            int limit = Math.min(end, start + 0xFFFF);
            for (int code = start; code <= limit; code++) {
                result.put(code, dstStart + (code - start));
            }
        }
    }
    // NOTE(review): the array form "<start> <end> [<u1> <u2> ...]" is not handled here.
    return result;
}
private PdfJsonFontCidSystemInfo extractCidSystemInfo(COSDictionary fontDictionary) {
if (fontDictionary == null) {
return null;
@ -824,7 +1205,7 @@ public class PdfJsonConversionService {
return info;
}
private FontProgramData extractFontProgram(PDFont font) throws IOException {
private FontProgramData extractFontProgram(PDFont font, String toUnicode) throws IOException {
PDFontDescriptor descriptor = font.getFontDescriptor();
if (descriptor == null) {
return null;
@ -833,24 +1214,24 @@ public class PdfJsonConversionService {
PDStream fontFile3 = descriptor.getFontFile3();
if (fontFile3 != null) {
String subtype = fontFile3.getCOSObject().getNameAsString(COSName.SUBTYPE);
return readFontProgram(fontFile3, subtype != null ? subtype : "fontfile3", false);
return readFontProgram(fontFile3, subtype != null ? subtype : "fontfile3", false, toUnicode);
}
PDStream fontFile2 = descriptor.getFontFile2();
if (fontFile2 != null) {
return readFontProgram(fontFile2, null, true);
return readFontProgram(fontFile2, null, true, toUnicode);
}
PDStream fontFile = descriptor.getFontFile();
if (fontFile != null) {
return readFontProgram(fontFile, "type1", false);
return readFontProgram(fontFile, "type1", false, toUnicode);
}
return null;
}
private FontProgramData readFontProgram(
PDStream stream, String formatHint, boolean detectTrueType) throws IOException {
PDStream stream, String formatHint, boolean detectTrueType, String toUnicode) throws IOException {
try (InputStream inputStream = stream.createInputStream();
ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
inputStream.transferTo(baos);
@ -859,8 +1240,21 @@ public class PdfJsonConversionService {
if (detectTrueType) {
format = detectTrueTypeFormat(data);
}
String webBase64 = null;
String webFormat = null;
if (format != null && isCffFormat(format)) {
log.debug("Detected CFF font format: {}, wrapping as OpenType-CFF for web preview", format);
byte[] converted = convertCffProgramToTrueType(data, toUnicode);
if (converted != null && converted.length > 0) {
webBase64 = Base64.getEncoder().encodeToString(converted);
webFormat = "otf";
log.debug("CFF→OTF wrapping successful: {} bytes → {} bytes", data.length, converted.length);
} else {
log.debug("CFF→OTF wrapping returned null or empty result");
}
}
String base64 = Base64.getEncoder().encodeToString(data);
return new FontProgramData(base64, format);
return new FontProgramData(base64, format, webBase64, webFormat);
}
}
@ -1759,8 +2153,12 @@ public class PdfJsonConversionService {
}
PDFont baseFont = primaryFont;
boolean fallbackApplied = primaryFont == null;
if (baseFont == null) {
baseFont = ensureFallbackFont(document, fontMap, fontModels, FALLBACK_FONT_ID);
if (baseFont != null) {
fallbackApplied = true;
}
}
if (baseFont == null) {
log.warn("Unable to resolve a base font for text element; skipping text content");
@ -1777,6 +2175,7 @@ public class PdfJsonConversionService {
PDFont targetFont = currentFont;
if (!canEncode(baseFont, codePoint)) {
fallbackApplied = true;
String fallbackId = resolveFallbackFontId(codePoint);
targetFont = ensureFallbackFont(document, fontMap, fontModels, fallbackId);
if (targetFont == null || !canEncode(targetFont, glyph)) {
@ -1823,6 +2222,10 @@ public class PdfJsonConversionService {
runs.add(new FontRun(currentFont, buffer.toString()));
}
if (fallbackApplied) {
element.setFallbackUsed(Boolean.TRUE);
}
return runs;
}
@ -2019,10 +2422,14 @@ public class PdfJsonConversionService {
private static class FontProgramData {
private final String base64;
private final String format;
private final String webBase64;
private final String webFormat;
private FontProgramData(String base64, String format) {
private FontProgramData(String base64, String format, String webBase64, String webFormat) {
this.base64 = base64;
this.format = format;
this.webBase64 = webBase64;
this.webFormat = webFormat;
}
private String getBase64() {
@ -2032,6 +2439,14 @@ public class PdfJsonConversionService {
private String getFormat() {
return format;
}
private String getWebBase64() {
return webBase64;
}
private String getWebFormat() {
return webFormat;
}
}
private static final class PreflightResult {
@ -2371,46 +2786,106 @@ public class PdfJsonConversionService {
return loadFallbackPdfFont(document);
}
// IMPORTANT: Dictionary restoration is disabled because deserialized dictionaries
// don't properly include the font stream references (FontFile/FontFile2/FontFile3).
// This results in fonts that structurally exist but can't encode glyphs, causing
// fallback to NotoSans. Instead, we ALWAYS use program bytes for reliable encoding.
// The cosDictionary field is preserved in the JSON for potential future use, but
// for now we rely on direct font program loading.
if (false && fontModel.getCosDictionary() != null) {
// Dictionary restoration code kept for reference but disabled
COSBase restored = deserializeCosValue(fontModel.getCosDictionary(), document);
if (restored instanceof COSDictionary cosDictionary) {
try {
PDFont font = PDFontFactory.createFont(cosDictionary);
if (font != null && font.isEmbedded()) {
// Verify font can actually encode a basic character
try {
font.encode("A");
applyAdditionalFontMetadata(document, font, fontModel);
log.debug("Successfully restored embedded font {} from dictionary", fontModel.getId());
return font;
} catch (IOException | IllegalArgumentException encodingEx) {
log.warn(
"Font {} restored from dictionary but failed encoding test: {}; falling back to program bytes",
fontModel.getId(),
encodingEx.getMessage());
}
}
} catch (IOException ex) {
log.warn(
"Failed to restore font {} from stored dictionary: {}; falling back to program bytes",
fontModel.getId(),
ex.getMessage());
}
}
}
byte[] fontBytes = null;
String format = null;
// For CFF/Type1C fonts, prefer the webProgram (converted TrueType) because:
// 1. PDFBox's PDType0Font.load() expects TrueType/OpenType format
// 2. Raw CFF program bytes lack the descriptor context needed for reconstruction
// 3. FontForge-converted TrueType is reliable for both web preview and PDF export
String originalFormat =
fontModel.getProgramFormat() != null
? fontModel.getProgramFormat().toLowerCase(Locale.ROOT)
: null;
// For JSONPDF conversion, always use original font bytes
// (PDFBox doesn't support OpenType-CFF; webProgram is only for frontend web preview)
String program = fontModel.getProgram();
if (program != null && !program.isBlank()) {
byte[] fontBytes = Base64.getDecoder().decode(program);
String format =
fontModel.getProgramFormat() != null
? fontModel.getProgramFormat().toLowerCase(Locale.ROOT)
: "";
fontBytes = Base64.getDecoder().decode(program);
format = originalFormat;
log.debug("Using original font program for {} (format: {})", fontModel.getId(), originalFormat);
} else if (fontModel.getWebProgram() != null && !fontModel.getWebProgram().isBlank()) {
// Fallback to webProgram if original program is unavailable
fontBytes = Base64.getDecoder().decode(fontModel.getWebProgram());
format =
fontModel.getWebProgramFormat() != null
? fontModel.getWebProgramFormat().toLowerCase(Locale.ROOT)
: null;
log.debug("Using web-optimized font program for {} (original program unavailable)", fontModel.getId());
}
if (fontBytes != null && fontBytes.length > 0) {
try {
if (isCffFormat(format)) {
byte[] converted = convertCffProgramToTrueType(fontBytes);
if (converted != null) {
fontBytes = converted;
format = "ttf";
log.debug(
"Converted CFF font {} to TrueType outlines for embedding",
fontModel.getId());
} else {
log.debug(
"Unable to convert CFF font {} to TrueType; attempting direct load",
fontModel.getId());
}
}
if (isType1Format(format)) {
try (InputStream stream = new ByteArrayInputStream(fontBytes)) {
PDFont font = new PDType1Font(document, stream);
applyAdditionalFontMetadata(document, font, fontModel);
log.debug(
"Successfully loaded Type1 font {} from program bytes (format: {}, originalFormat: {})",
fontModel.getId(),
format,
originalFormat);
return font;
}
}
try (InputStream stream = new ByteArrayInputStream(fontBytes)) {
PDFont font = PDType0Font.load(document, stream, true);
applyAdditionalFontMetadata(document, font, fontModel);
log.debug(
"Successfully loaded Type0 font {} from program bytes (format: {}, originalFormat: {})",
fontModel.getId(),
format,
originalFormat);
return font;
}
} catch (IOException ex) {
log.debug(
"Unable to load embedded font program for {}: {}",
log.warn(
"Unable to load embedded font program for {} (format: {}, originalFormat: {}): {}; falling back to Standard 14 or default",
fontModel.getId(),
format,
originalFormat,
ex.getMessage());
}
} else {
log.warn(
"Font {} has no program bytes available (originalFormat: {})",
fontModel.getId(),
originalFormat);
}
String standardName = fontModel.getStandard14Name();

View File

@ -173,9 +173,12 @@ stirling:
fallback-font: classpath:/static/fonts/NotoSans-Regular.ttf # Override to point at a custom fallback font
json:
  font-normalization:
    # Flattened diff left both the old "enabled: true" and the new value as duplicate
    # keys (invalid YAML); this is the current (post-change) state.
    enabled: false # IMPORTANT: Disable to preserve ToUnicode CMaps for correct font rendering. Ghostscript strips Unicode mappings from CID fonts.
  cff-converter:
    enabled: true # Wrap CFF/Type1C fonts as OpenType-CFF for browser compatibility
    method: python # Converter method: 'python' (fontTools, recommended - wraps as OTF), 'fontforge' (legacy - converts to TTF, may hang on CID fonts)
    python-command: /opt/venv/bin/python3 # Python interpreter path
    python-script: /scripts/convert_cff_to_ttf.py # Path to font wrapping script
    fontforge-command: fontforge # Override if FontForge is installed under a different name/path
ui:

View File

@ -5,10 +5,6 @@ services:
dockerfile: docker/backend/Dockerfile
container_name: stirling-pdf-backend
restart: on-failure:5
deploy:
resources:
limits:
memory: 4G
healthcheck:
test: ["CMD-SHELL", "curl -f http://localhost:8080/api/v1/info/status | grep -q 'UP'"]
interval: 5s

View File

@ -95,8 +95,9 @@ const decodeBase64ToUint8Array = (value: string): Uint8Array => {
};
/**
 * Derives a CSS font-family name for an embedded PDF font. Prefers the font's
 * baseName, falling back to uid/id, and strips any character outside
 * [a-zA-Z0-9_-] so the result is a safe CSS identifier. (The flattened diff
 * left the old return statement alongside the new lines; this is the current form.)
 */
const buildFontFamilyName = (font: PdfJsonFont): string => {
  const preferred = (font.baseName ?? '').trim();
  const identifier = preferred.length > 0 ? preferred : (font.uid ?? font.id ?? 'font').toString();
  return `pdf-font-${identifier.replace(/[^a-zA-Z0-9_-]/g, '')}`;
};
const getCaretOffset = (element: HTMLElement): number => {
@ -313,18 +314,34 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
const next = new Map<string, string>();
for (const font of fonts) {
if (!font?.id || !font.program) {
if (!font?.id) {
continue;
}
const programSource = font.webProgram && font.webProgram.length > 0 ? font.webProgram : font.program;
if (!programSource) {
continue;
}
try {
const format = normalizeFontFormat(font.programFormat);
const data = decodeBase64ToUint8Array(font.program);
const formatSource = font.webProgram && font.webProgram.length > 0 ? font.webProgramFormat : font.programFormat;
const format = normalizeFontFormat(formatSource);
const data = decodeBase64ToUint8Array(programSource);
const blob = new Blob([data as BlobPart], { type: getFontMimeType(format) });
const url = URL.createObjectURL(blob);
const formatHint = getFontFormatHint(format);
const familyName = buildFontFamilyName(font);
const source = formatHint ? `url(${url}) format('${formatHint}')` : `url(${url})`;
const fontFace = new FontFace(familyName, source);
console.debug(`[FontLoader] Loading font ${font.id} (${font.baseName}):`, {
formatSource,
format,
formatHint,
familyName,
dataLength: data.length,
hasWebProgram: !!font.webProgram,
hasProgram: !!font.program
});
await fontFace.load();
if (disposed) {
document.fonts.delete(fontFace);
@ -334,8 +351,14 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
document.fonts.add(fontFace);
active.push({ fontFace, url });
next.set(font.id, familyName);
console.debug(`[FontLoader] Successfully loaded font ${font.id}`);
} catch (error) {
// Silently ignore font loading failures - embedded PDF fonts often lack web font tables
console.warn(`[FontLoader] Failed to load font ${font.id} (${font.baseName}):`, {
error: error instanceof Error ? error.message : String(error),
formatSource: font.webProgram && font.webProgram.length > 0 ? font.webProgramFormat : font.programFormat,
hasWebProgram: !!font.webProgram,
hasProgram: !!font.program
});
// Fallback to web-safe fonts is already implemented via getFontFamily()
}
}
@ -776,7 +799,8 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
const fontFamily = getFontFamily(group.fontId);
const lineHeightPx = getLineHeightPx(group.fontId, fontSizePx);
const lineHeightRatio = fontSizePx > 0 ? Math.max(lineHeightPx / fontSizePx, 1.05) : 1.2;
const hasRotation = group.rotation != null && Math.abs(group.rotation) > 0.5;
const rotation = group.rotation ?? 0;
const hasRotation = Math.abs(rotation) > 0.5;
const baselineLength = group.baselineLength ?? Math.max(group.bounds.right - group.bounds.left, 0);
let containerLeft = bounds.left;
@ -795,7 +819,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
containerHeight = Math.max(lineHeightPx, fontSizePx * lineHeightRatio);
transformOrigin = 'left bottom';
// Negate rotation because Y-axis is flipped from PDF to web coordinates
transform = `rotate(${-group.rotation}deg)`;
transform = `rotate(${-rotation}deg)`;
}
// Extract styling from group

View File

@ -9,6 +9,14 @@ export interface PdfJsonTextColor {
components?: number[] | null;
}
/** Serialized COS (PDF object model) value mirrored from the backend JSON. */
export interface PdfJsonCosValue {
  /** COS type discriminator — presumably names like dictionary/array/name; verify against backend serializer. */
  type?: string | null;
  /** Primitive payload for scalar values. */
  value?: unknown;
  /** Child values when this node is a COS array. */
  items?: PdfJsonCosValue[] | null;
  /** Named children when this node is a COS dictionary. */
  entries?: Record<string, PdfJsonCosValue | null> | null;
  /** Stream payload when this node is a COS stream. */
  stream?: PdfJsonStream | null;
}
export interface PdfJsonFont {
id?: string;
pageNumber?: number | null;
@ -20,6 +28,8 @@ export interface PdfJsonFont {
embedded?: boolean | null;
program?: string | null;
programFormat?: string | null;
webProgram?: string | null;
webProgramFormat?: string | null;
toUnicode?: string | null;
standard14Name?: string | null;
fontDescriptorFlags?: number | null;
@ -29,6 +39,7 @@ export interface PdfJsonFont {
xHeight?: number | null;
italicAngle?: number | null;
unitsPerEm?: number | null;
cosDictionary?: PdfJsonCosValue | null;
}
export interface PdfJsonTextElement {
@ -52,6 +63,7 @@ export interface PdfJsonTextElement {
textMatrix?: number[] | null;
fillColor?: PdfJsonTextColor | null;
strokeColor?: PdfJsonTextColor | null;
fallbackUsed?: boolean | null;
}
export interface PdfJsonImageElement {

View File

@ -16,6 +16,48 @@ const MIN_CHAR_WIDTH_FACTOR = 0.35;
const MAX_CHAR_WIDTH_FACTOR = 1.25;
const EXTRA_GAP_RATIO = 0.8;
// Vertical metrics for a single font, expressed in font design units
// (i.e. relative to unitsPerEm, not CSS pixels).
type FontMetrics = {
  unitsPerEm: number;
  ascent: number;
  descent: number; // typically negative (extends below the baseline)
};

// Lookup from a font's id (and uid, when present) to its metrics.
type FontMetricsMap = Map<string, FontMetrics>;
// Count user-perceived characters by iterating Unicode code points, so
// surrogate pairs (emoji, rare CJK) count as one. NOTE(review): despite the
// name this counts code points, not full grapheme clusters (no ZWJ handling).
const countGraphemes = (text: string): number => (text ? [...text].length : 0);
// Resolve the metrics entry for a font id; yields undefined when the map is
// absent, the id is missing, or the id has no entry.
const metricsFor = (
  metrics: FontMetricsMap | undefined,
  fontId?: string | null,
): FontMetrics | undefined => (metrics && fontId ? metrics.get(fontId) ?? undefined : undefined);
// Build a per-font metrics lookup from the document's font list. Each entry is
// registered under both font.id and font.uid (when present) so callers can
// resolve by either key. Missing values fall back to conventional defaults:
// 1000 units/em, ascent = 80% of the em, descent = -20% of the em.
const buildFontMetrics = (document: PdfJsonDocument | null | undefined): FontMetricsMap => {
  const table: FontMetricsMap = new Map();
  for (const font of document?.fonts ?? []) {
    if (!font) {
      continue;
    }
    const unitsPerEm = font.unitsPerEm && font.unitsPerEm > 0 ? font.unitsPerEm : 1000;
    const entry: FontMetrics = {
      unitsPerEm,
      ascent: font.ascent ?? unitsPerEm * 0.8,
      descent: font.descent ?? -(unitsPerEm * 0.2),
    };
    if (font.id) {
      table.set(font.id, entry);
    }
    if (font.uid) {
      table.set(font.uid, entry);
    }
  }
  return table;
};
export const valueOr = (value: number | null | undefined, fallback = 0): number => {
if (value === null || value === undefined || Number.isNaN(value)) {
return fallback;
@ -47,37 +89,87 @@ const getX = (element: PdfJsonTextElement): number => {
return valueOr(element.x);
};
const getWidth = (element: PdfJsonTextElement): number => {
const getWidth = (element: PdfJsonTextElement, metrics?: FontMetricsMap): number => {
const width = valueOr(element.width, 0);
if (width === 0 && element.text) {
const fontSize = valueOr(element.fontSize, 12);
return fontSize * Math.max(element.text.length * 0.45, 0.5);
if (width > 0) {
return width;
}
return width;
const text = element.text ?? '';
const glyphCount = Math.max(1, countGraphemes(text));
const spacingFallback = Math.max(
valueOr(element.spaceWidth, 0),
valueOr(element.wordSpacing, 0),
valueOr(element.characterSpacing, 0),
);
if (spacingFallback > 0 && text.trim().length === 0) {
return spacingFallback;
}
const fontSize = getFontSize(element);
const fontMetrics = metricsFor(metrics, element.fontId);
if (fontMetrics) {
const unitsPerEm = fontMetrics.unitsPerEm > 0 ? fontMetrics.unitsPerEm : 1000;
const ascentUnits = fontMetrics.ascent ?? unitsPerEm * 0.8;
const descentUnits = Math.abs(fontMetrics.descent ?? -(unitsPerEm * 0.2));
const combinedUnits = Math.max(unitsPerEm * 0.8, ascentUnits + descentUnits);
const averageAdvanceUnits = Math.max(unitsPerEm * 0.5, combinedUnits / Math.max(1, glyphCount));
const fallbackWidth = (averageAdvanceUnits / unitsPerEm) * glyphCount * fontSize;
if (fallbackWidth > 0) {
return fallbackWidth;
}
}
return fontSize * glyphCount * 0.5;
};
const getFontSize = (element: PdfJsonTextElement): number => valueOr(element.fontMatrixSize ?? element.fontSize, 12);
const getHeight = (element: PdfJsonTextElement): number => {
const height = valueOr(element.height);
if (height === 0) {
return getFontSize(element) * 1.05;
const getHeight = (element: PdfJsonTextElement, metrics?: FontMetricsMap): number => {
const height = valueOr(element.height, 0);
if (height > 0) {
return height;
}
return height;
const fontSize = getFontSize(element);
const fontMetrics = metricsFor(metrics, element.fontId);
if (fontMetrics) {
const unitsPerEm = fontMetrics.unitsPerEm > 0 ? fontMetrics.unitsPerEm : 1000;
const ascentUnits = fontMetrics.ascent ?? unitsPerEm * 0.8;
const descentUnits = Math.abs(fontMetrics.descent ?? -(unitsPerEm * 0.2));
const totalUnits = Math.max(unitsPerEm, ascentUnits + descentUnits);
if (totalUnits > 0) {
return (totalUnits / unitsPerEm) * fontSize;
}
}
return fontSize;
};
const getElementBounds = (element: PdfJsonTextElement): BoundingBox => {
const getElementBounds = (
element: PdfJsonTextElement,
metrics?: FontMetricsMap,
): BoundingBox => {
const left = getX(element);
const width = getWidth(element);
const width = getWidth(element, metrics);
const baseline = getBaseline(element);
const height = getHeight(element);
// In PDF coordinates, baseline is where text sits
// Typical typography: ~80% of height above baseline (ascenders), ~20% below (descenders)
// Using codebase's inverted naming: bottom (visual top) > top (visual bottom)
const ascent = height * 0.8;
const descent = height * 0.2;
const bottom = baseline + ascent; // Visual top of text
const top = baseline - descent; // Visual bottom (includes descenders)
const height = getHeight(element, metrics);
let ascentRatio = 0.8;
let descentRatio = 0.2;
const fontMetrics = metricsFor(metrics, element.fontId);
if (fontMetrics) {
const unitsPerEm = fontMetrics.unitsPerEm > 0 ? fontMetrics.unitsPerEm : 1000;
const ascentUnits = fontMetrics.ascent ?? unitsPerEm * 0.8;
const descentUnits = Math.abs(fontMetrics.descent ?? -(unitsPerEm * 0.2));
const totalUnits = Math.max(unitsPerEm, ascentUnits + descentUnits);
if (totalUnits > 0) {
ascentRatio = ascentUnits / totalUnits;
descentRatio = descentUnits / totalUnits;
}
}
const bottom = baseline + height * ascentRatio;
const top = baseline - height * descentRatio;
return {
left,
right: left + width,
@ -114,8 +206,12 @@ const getSpacingHint = (element: PdfJsonTextElement): number => {
return Math.max(characterSpacing, 0);
};
const estimateCharWidth = (element: PdfJsonTextElement, avgFontSize: number): number => {
const rawWidth = getWidth(element);
const estimateCharWidth = (
element: PdfJsonTextElement,
avgFontSize: number,
metrics?: FontMetricsMap,
): number => {
const rawWidth = getWidth(element, metrics);
const minWidth = avgFontSize * MIN_CHAR_WIDTH_FACTOR;
const maxWidth = avgFontSize * MAX_CHAR_WIDTH_FACTOR;
return Math.min(Math.max(rawWidth, minWidth), maxWidth);
@ -136,12 +232,16 @@ const mergeBounds = (bounds: BoundingBox[]): BoundingBox => {
);
};
const shouldInsertSpace = (prev: PdfJsonTextElement, current: PdfJsonTextElement): boolean => {
const prevRight = getX(prev) + getWidth(prev);
const shouldInsertSpace = (
prev: PdfJsonTextElement,
current: PdfJsonTextElement,
metrics?: FontMetricsMap,
): boolean => {
const prevRight = getX(prev) + getWidth(prev, metrics);
const trailingGap = Math.max(0, getX(current) - prevRight);
const avgFontSize = (getFontSize(prev) + getFontSize(current)) / 2;
const baselineAdvance = Math.max(0, getX(current) - getX(prev));
const charWidthEstimate = estimateCharWidth(prev, avgFontSize);
const charWidthEstimate = estimateCharWidth(prev, avgFontSize, metrics);
const inferredGap = Math.max(0, baselineAdvance - charWidthEstimate);
const spacingHint = Math.max(
SPACE_MIN_GAP,
@ -166,7 +266,7 @@ const shouldInsertSpace = (prev: PdfJsonTextElement, current: PdfJsonTextElement
return false;
};
const buildGroupText = (elements: PdfJsonTextElement[]): string => {
const buildGroupText = (elements: PdfJsonTextElement[], metrics?: FontMetricsMap): string => {
let result = '';
elements.forEach((element, index) => {
const value = element.text ?? '';
@ -176,7 +276,7 @@ const buildGroupText = (elements: PdfJsonTextElement[]): string => {
}
const previous = elements[index - 1];
const needsSpace = shouldInsertSpace(previous, element);
const needsSpace = shouldInsertSpace(previous, element, metrics);
const startsWithWhitespace = /^\s/u.test(value);
if (needsSpace && !startsWithWhitespace) {
@ -314,21 +414,24 @@ const getAnchorPoint = (element: PdfJsonTextElement): { x: number; y: number } =
};
};
const computeBaselineLength = (elements: PdfJsonTextElement[]): number =>
elements.reduce((acc, current) => acc + getWidth(current), 0);
const computeBaselineLength = (
elements: PdfJsonTextElement[],
metrics?: FontMetricsMap,
): number => elements.reduce((acc, current) => acc + getWidth(current, metrics), 0);
const createGroup = (
pageIndex: number,
idSuffix: number,
elements: PdfJsonTextElement[],
metrics?: FontMetricsMap,
): TextGroup => {
const clones = elements.map(cloneTextElement);
const originalClones = clones.map(cloneTextElement);
const bounds = mergeBounds(elements.map(getElementBounds));
const bounds = mergeBounds(elements.map((element) => getElementBounds(element, metrics)));
const firstElement = elements[0];
const rotation = computeGroupRotation(elements);
const anchor = rotation !== null ? getAnchorPoint(firstElement) : null;
const baselineLength = computeBaselineLength(elements);
const baselineLength = computeBaselineLength(elements, metrics);
return {
id: `${pageIndex}-${idSuffix}`,
@ -343,13 +446,17 @@ const createGroup = (
baselineLength,
elements: clones,
originalElements: originalClones,
text: buildGroupText(elements),
originalText: buildGroupText(elements),
text: buildGroupText(elements, metrics),
originalText: buildGroupText(elements, metrics),
bounds,
};
};
export const groupPageTextElements = (page: PdfJsonPage | null | undefined, pageIndex: number): TextGroup[] => {
export const groupPageTextElements = (
page: PdfJsonPage | null | undefined,
pageIndex: number,
metrics?: FontMetricsMap,
): TextGroup[] => {
if (!page?.textElements || page.textElements.length === 0) {
return [];
}
@ -393,7 +500,7 @@ export const groupPageTextElements = (page: PdfJsonPage | null | undefined, page
}
const previous = currentBucket[currentBucket.length - 1];
const gap = getX(element) - (getX(previous) + getWidth(previous));
const gap = getX(element) - (getX(previous) + getWidth(previous, metrics));
const avgFontSize = (getFontSize(previous) + getFontSize(element)) / 2;
const splitThreshold = Math.max(SPACE_MIN_GAP, avgFontSize * GAP_FACTOR);
@ -412,7 +519,7 @@ export const groupPageTextElements = (page: PdfJsonPage | null | undefined, page
}
if (shouldSplit) {
groups.push(createGroup(pageIndex, groupCounter, currentBucket));
groups.push(createGroup(pageIndex, groupCounter, currentBucket, metrics));
groupCounter += 1;
currentBucket = [element];
} else {
@ -421,7 +528,7 @@ export const groupPageTextElements = (page: PdfJsonPage | null | undefined, page
});
if (currentBucket.length > 0) {
groups.push(createGroup(pageIndex, groupCounter, currentBucket));
groups.push(createGroup(pageIndex, groupCounter, currentBucket, metrics));
groupCounter += 1;
}
});
@ -431,7 +538,8 @@ export const groupPageTextElements = (page: PdfJsonPage | null | undefined, page
export const groupDocumentText = (document: PdfJsonDocument | null | undefined): TextGroup[][] => {
const pages = document?.pages ?? [];
return pages.map((page, index) => groupPageTextElements(page, index));
const metrics = buildFontMetrics(document);
return pages.map((page, index) => groupPageTextElements(page, index, metrics));
};
export const extractPageImages = (

View File

@ -0,0 +1,492 @@
#!/usr/bin/env python3
"""
Wrap raw CFF/Type1C data (extracted from PDFs) as OpenType-CFF for web compatibility.
Builds proper Unicode cmap from PDF ToUnicode data.
"""
import sys
import re
from pathlib import Path
from io import BytesIO
from fontTools.ttLib import TTFont, newTable
from fontTools.cffLib import CFFFontSet
from fontTools.ttLib.tables._c_m_a_p import cmap_format_4, cmap_format_12
from fontTools.ttLib.tables._n_a_m_e import NameRecord
from fontTools.ttLib.tables.O_S_2f_2 import Panose
def parse_unicode_mapping(mapping_path):
    """
    Parse a Unicode mapping file into a GID -> Unicode codepoint dict.

    The file is either:
      * JSON emitted for CID fonts: {"isCID": true, "entries": [{"gid": ..,
        "unicode": ..}, ...]}, or
      * a raw PDF ToUnicode CMap (bfchar/bfrange entries). For non-CID fonts
        the character code is treated as the GID directly.

    Args:
        mapping_path: Path to the mapping file (JSON or ToUnicode CMap).

    Returns:
        dict[int, int]: GID -> Unicode codepoint. Empty dict on any parse
        failure (best-effort: a warning is printed, never raised).
    """
    hex_pair = re.compile(r'<([0-9A-Fa-f]+)>\s*<([0-9A-Fa-f]+)>')
    hex_triple = re.compile(r'<([0-9A-Fa-f]+)>\s*<([0-9A-Fa-f]+)>\s*<([0-9A-Fa-f]+)>')
    try:
        with open(mapping_path, 'rb') as f:
            data = f.read().decode('utf-8', errors='ignore')

        # Try parsing as JSON first (CID font with complete mapping)
        if data.strip().startswith('{'):
            import json
            try:
                mapping_data = json.loads(data)
                if mapping_data.get('isCID'):
                    # Build GID -> Unicode mapping from entries
                    gid_to_unicode = {}
                    for entry in mapping_data.get('entries', []):
                        gid = entry['gid']
                        unicode_val = entry['unicode']
                        if unicode_val > 0:
                            gid_to_unicode[gid] = unicode_val
                    print(f"Parsed JSON mapping: {len(gid_to_unicode)} GID→Unicode entries", file=sys.stderr)
                    return gid_to_unicode
            except json.JSONDecodeError:
                pass

        # Fall back to parsing a raw ToUnicode CMap (non-CID fonts).
        # BUGFIX: each pattern must be restricted to its own CMap section.
        # Applied to the whole file, the bare <a> <b> pair regex also matches
        # the first two operands of a bfrange entry, and three consecutive
        # bfchar operands look like a bfrange — both corrupt the mapping.
        gid_to_unicode = {}
        for section in re.findall(r'beginbfchar(.*?)endbfchar', data, re.DOTALL):
            for match in hex_pair.finditer(section):
                gid = int(match.group(1), 16)  # For non-CID, char code == GID
                unicode_val = int(match.group(2), 16)
                if unicode_val > 0:
                    gid_to_unicode[gid] = unicode_val
        for section in re.findall(r'beginbfrange(.*?)endbfrange', data, re.DOTALL):
            for match in hex_triple.finditer(section):
                start_gid = int(match.group(1), 16)
                end_gid = int(match.group(2), 16)
                start_unicode = int(match.group(3), 16)
                for i, gid in enumerate(range(start_gid, end_gid + 1)):
                    unicode_val = start_unicode + i
                    if unicode_val > 0:
                        gid_to_unicode[gid] = unicode_val
        print(f"Parsed ToUnicode CMap: {len(gid_to_unicode)} mappings", file=sys.stderr)
        return gid_to_unicode
    except Exception as e:
        print(f"Warning: Failed to parse Unicode mapping: {e}", file=sys.stderr)
        return {}
def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
    """
    Wrap raw CFF data (from PDF font stream) as OpenType-CFF.

    Builds the minimal sfnt table set required for a valid 'OTTO'-flavored
    font (head, hhea, hmtx, maxp, cmap, OS/2, name, post) around the CFF
    outlines, deriving metrics from the charstrings and the Unicode cmap
    from a PDF ToUnicode mapping and/or glyph-name conventions.

    Args:
        input_path: Path to input CFF data file
        output_path: Path to output OTF font
        tounicode_path: Optional path to ToUnicode CMap file

    Returns:
        True if successful, False otherwise (errors are printed to stderr,
        never raised).
    """
    try:
        # Read raw CFF data
        with open(input_path, 'rb') as f:
            cff_data = f.read()
        # Parse raw CFF data
        cff_fontset = CFFFontSet()
        cff_fontset.decompile(BytesIO(cff_data), None)
        # Get the first (and usually only) font in the CFF set
        if len(cff_fontset.fontNames) == 0:
            print("ERROR: No fonts found in CFF data", file=sys.stderr)
            return False
        cff_font = cff_fontset[cff_fontset.fontNames[0]]
        # Parse Unicode mapping (JSON or raw ToUnicode CMap) if provided
        gid_to_unicode = {}
        if tounicode_path:
            gid_to_unicode = parse_unicode_mapping(tounicode_path)
        # Create a new OTF font
        otf = TTFont(sfntVersion='OTTO')  # 'OTTO' = CFF-flavored OpenType
        # Get glyph names; .notdef is forced to GID 0 as required by the spec
        if hasattr(cff_font, 'charset') and cff_font.charset is not None:
            glyph_order = ['.notdef'] + [name for name in cff_font.charset if name != '.notdef']
        else:
            # Fallback to CharStrings keys
            charstrings = cff_font.CharStrings
            glyph_order = ['.notdef'] + [name for name in charstrings.keys() if name != '.notdef']
        otf.setGlyphOrder(glyph_order)
        # === Add CFF table (the actual font outlines) ===
        cff_table = newTable('CFF ')
        cff_table.cff = cff_fontset
        otf['CFF '] = cff_table
        # === Calculate metrics from CFF ===
        charstrings = cff_font.CharStrings
        # Get defaults from CFF Private dict
        private_dict = getattr(cff_font, 'Private', None)
        default_width = getattr(private_dict, 'defaultWidthX', 500) if private_dict else 500
        # Calculate bounding box, widths, and LSBs.
        # Seeds below are conservative defaults for a 1000-unit em; real
        # glyph bounds widen them via min/max as glyphs are scanned.
        x_min = 0
        y_min = -200
        x_max = 1000
        y_max = 800
        max_advance = 0
        min_lsb = 0
        min_rsb = 0
        max_extent = 0
        widths = {}
        lsbs = {}
        for glyph_name in glyph_order:
            lsb = 0
            width = int(default_width)
            if glyph_name in charstrings:
                try:
                    cs = charstrings[glyph_name]
                    # Get width from charstring
                    if hasattr(cs, 'width'):
                        width = int(cs.width)
                    # Calculate bounds for LSB and bbox
                    try:
                        bounds = cs.calcBounds(None)
                        if bounds:
                            glyph_xmin = int(bounds[0])
                            glyph_ymin = int(bounds[1])
                            glyph_xmax = int(bounds[2])
                            glyph_ymax = int(bounds[3])
                            lsb = glyph_xmin
                            rsb = width - glyph_xmax
                            extent = lsb + glyph_xmax
                            # Update global bounds
                            x_min = min(x_min, glyph_xmin)
                            y_min = min(y_min, glyph_ymin)
                            x_max = max(x_max, glyph_xmax)
                            y_max = max(y_max, glyph_ymax)
                            # Update hhea metrics
                            min_lsb = min(min_lsb, lsb)
                            min_rsb = min(min_rsb, rsb)
                            max_extent = max(max_extent, extent)
                    except:
                        pass  # Some glyphs may not have outlines
                except Exception as e:
                    pass  # Use defaults
            widths[glyph_name] = width
            lsbs[glyph_name] = lsb
            max_advance = max(max_advance, width)
        if max_advance == 0:
            max_advance = 1000
        if max_extent == 0:
            max_extent = x_max
        units_per_em = 1000  # Standard for Type1/CFF
        # === Create head table ===
        head = newTable('head')
        head.tableVersion = 1.0
        head.fontRevision = 1.0
        head.checkSumAdjustment = 0
        head.magicNumber = 0x5F0F3CF5
        head.flags = 0x000B  # Baseline at y=0, LSB at x=0, integer PPEM
        head.unitsPerEm = units_per_em
        # NOTE(review): fixed arbitrary timestamp (fontTools counts seconds
        # since 1904-01-01); deterministic output, not the real build time.
        head.created = 3600000000
        head.modified = 3600000000
        head.xMin = x_min
        head.yMin = y_min
        head.xMax = x_max
        head.yMax = y_max
        head.macStyle = 0
        head.fontDirectionHint = 2
        head.indexToLocFormat = 0
        head.glyphDataFormat = 0
        head.lowestRecPPEM = 8
        otf['head'] = head
        # === Create hhea table with correct metrics ===
        hhea = newTable('hhea')
        hhea.tableVersion = 0x00010000
        hhea.ascent = max(y_max, 800)
        hhea.descent = min(y_min, -200)
        hhea.lineGap = 0
        hhea.advanceWidthMax = max_advance
        hhea.minLeftSideBearing = min_lsb
        hhea.minRightSideBearing = min_rsb
        hhea.xMaxExtent = max_extent
        hhea.caretSlopeRise = 1
        hhea.caretSlopeRun = 0
        hhea.caretOffset = 0
        hhea.reserved0 = 0
        hhea.reserved1 = 0
        hhea.reserved2 = 0
        hhea.reserved3 = 0
        hhea.metricDataFormat = 0
        # Every glyph gets its own hmtx entry (no trailing-run compression)
        hhea.numberOfHMetrics = len(glyph_order)
        otf['hhea'] = hhea
        # === Create hmtx table with correct LSBs ===
        hmtx = newTable('hmtx')
        hmtx.metrics = {}
        for glyph_name in glyph_order:
            hmtx.metrics[glyph_name] = (widths.get(glyph_name, default_width), lsbs.get(glyph_name, 0))
        otf['hmtx'] = hmtx
        # === Create maxp table (simpler for CFF) ===
        maxp = newTable('maxp')
        maxp.tableVersion = 0x00005000  # CFF version (0.5)
        maxp.numGlyphs = len(glyph_order)
        otf['maxp'] = maxp
        # === Build Unicode cmap from GID→Unicode mapping ===
        unicode_to_glyph = {}
        if gid_to_unicode:
            # Debug: Show first few glyph names to understand naming convention
            sample_glyphs = glyph_order[:min(10, len(glyph_order))]
            print(f"Sample glyph names: {sample_glyphs}", file=sys.stderr)
            # Debug: Show which GIDs we have mappings for
            sample_gids = sorted(gid_to_unicode.keys())[:10]
            print(f"Sample GIDs from mapping: {sample_gids}", file=sys.stderr)
            # For CID fonts: glyph names are "cid00123" (5-digit zero-padded)
            # For non-CID fonts: glyph names vary but GID == array index
            is_cid_font = any(gn.startswith('cid') for gn in glyph_order[1:6])  # Check first few non-.notdef glyphs
            for gid, unicode_val in gid_to_unicode.items():
                if unicode_val > 0:
                    if is_cid_font:
                        # Build glyph name as cidNNNNN (5 digits, zero-padded)
                        glyph_name = f"cid{gid:05d}"
                        # Verify this glyph exists in glyph_order
                        if glyph_name in glyph_order:
                            unicode_to_glyph[unicode_val] = glyph_name
                        else:
                            # Try without padding (some fonts use "cid123" not "cid00123")
                            glyph_name_alt = f"cid{gid}"
                            if glyph_name_alt in glyph_order:
                                unicode_to_glyph[unicode_val] = glyph_name_alt
                    else:
                        # Non-CID font: GID is array index
                        if 0 <= gid < len(glyph_order):
                            glyph_name = glyph_order[gid]
                            unicode_to_glyph[unicode_val] = glyph_name
        print(f"Mapped {len(unicode_to_glyph)} Unicode codepoints (isCID={is_cid_font if gid_to_unicode else 'unknown'})", file=sys.stderr)
        # Also try to map from glyph names (uni0041 → U+0041); ToUnicode
        # mappings take precedence — glyph-name guesses never overwrite them.
        for glyph_name in glyph_order:
            if glyph_name.startswith('uni') and len(glyph_name) == 7:
                try:
                    unicode_val = int(glyph_name[3:], 16)
                    if unicode_val not in unicode_to_glyph:
                        unicode_to_glyph[unicode_val] = glyph_name
                except:
                    pass
            elif glyph_name.startswith('u') and len(glyph_name) >= 5:
                try:
                    unicode_val = int(glyph_name[1:], 16)
                    if unicode_val not in unicode_to_glyph:
                        unicode_to_glyph[unicode_val] = glyph_name
                except:
                    pass
        # === Create cmap table ===
        cmap = newTable('cmap')
        cmap.tableVersion = 0
        cmap_tables = []
        # Windows Unicode BMP (format 4) - required
        cmap4_win = cmap_format_4(4)
        cmap4_win.platformID = 3  # Windows
        cmap4_win.platEncID = 1  # Unicode BMP
        cmap4_win.language = 0
        cmap4_win.cmap = {cp: gn for cp, gn in unicode_to_glyph.items() if cp <= 0xFFFF}
        cmap_tables.append(cmap4_win)
        # Windows Unicode UCS-4 (format 12) - for >BMP
        if any(cp > 0xFFFF for cp in unicode_to_glyph):
            cmap12_win = cmap_format_12(12)
            cmap12_win.platformID = 3  # Windows
            cmap12_win.platEncID = 10  # Unicode UCS-4
            cmap12_win.language = 0
            cmap12_win.cmap = dict(unicode_to_glyph)
            cmap_tables.append(cmap12_win)
        # Mac Unicode (format 4) - for compatibility
        cmap4_mac = cmap_format_4(4)
        cmap4_mac.platformID = 1  # Mac
        cmap4_mac.platEncID = 0  # Roman
        cmap4_mac.language = 0
        cmap4_mac.cmap = {cp: gn for cp, gn in unicode_to_glyph.items() if cp <= 0xFFFF}
        cmap_tables.append(cmap4_mac)
        cmap.tables = [t for t in cmap_tables if t.cmap] or [cmap4_win]  # Ensure at least one
        otf['cmap'] = cmap
        print(f"Built cmap with {len(unicode_to_glyph)} Unicode mappings", file=sys.stderr)
        # === Create OS/2 table with correct metrics ===
        os2 = newTable('OS/2')
        os2.version = 4
        os2.xAvgCharWidth = int(sum(widths.values()) / len(widths)) if widths else 500
        os2.usWeightClass = 400  # Normal
        os2.usWidthClass = 5  # Medium
        os2.fsType = 0  # Installable embedding
        os2.ySubscriptXSize = 650
        os2.ySubscriptYSize = 600
        os2.ySubscriptXOffset = 0
        os2.ySubscriptYOffset = 75
        os2.ySuperscriptXSize = 650
        os2.ySuperscriptYSize = 600
        os2.ySuperscriptXOffset = 0
        os2.ySuperscriptYOffset = 350
        os2.yStrikeoutSize = 50
        os2.yStrikeoutPosition = 300
        os2.sFamilyClass = 0
        # PANOSE - use proper object structure (all zeros = "any")
        os2.panose = Panose()
        os2.panose.bFamilyType = 0
        os2.panose.bSerifStyle = 0
        os2.panose.bWeight = 0
        os2.panose.bProportion = 0
        os2.panose.bContrast = 0
        os2.panose.bStrokeVariation = 0
        os2.panose.bArmStyle = 0
        os2.panose.bLetterForm = 0
        os2.panose.bMidline = 0
        os2.panose.bXHeight = 0
        os2.ulUnicodeRange1 = 0
        os2.ulUnicodeRange2 = 0
        os2.ulUnicodeRange3 = 0
        os2.ulUnicodeRange4 = 0
        os2.achVendID = 'SPDF'
        os2.fsSelection = 0x0040  # REGULAR bit
        # Set character index range from actual cmap
        if unicode_to_glyph:
            codepoints = sorted(unicode_to_glyph.keys())
            os2.usFirstCharIndex = codepoints[0]
            os2.usLastCharIndex = codepoints[-1]
        else:
            os2.usFirstCharIndex = 0x20  # space
            os2.usLastCharIndex = 0x7E  # tilde
        # Typo metrics match hhea
        os2.sTypoAscender = hhea.ascent
        os2.sTypoDescender = hhea.descent
        os2.sTypoLineGap = hhea.lineGap
        # Windows metrics (positive values, cover bbox)
        os2.usWinAscent = max(0, y_max)
        os2.usWinDescent = max(0, -y_min)
        os2.ulCodePageRange1 = 0x00000001  # Latin 1
        os2.ulCodePageRange2 = 0
        # NOTE(review): sxHeight/sCapHeight are fixed guesses for a 1000-unit
        # em, not derived from the actual 'x'/'H' glyph bounds.
        os2.sxHeight = 500
        os2.sCapHeight = 700
        os2.usDefaultChar = 0
        os2.usBreakChar = 32
        os2.usMaxContext = 0
        otf['OS/2'] = os2
        # === Create name table with Windows and Mac records ===
        name = newTable('name')
        name.names = []
        # Get font name from CFF if available
        font_name = cff_fontset.fontNames[0] if cff_fontset.fontNames else "Converted"
        name_strings = {
            1: font_name,  # Font Family
            2: "Regular",  # Subfamily
            3: f"Stirling-PDF: {font_name}",  # Unique ID
            4: font_name,  # Full Name
            5: "Version 1.0",  # Version
            6: font_name.replace(' ', '-'),  # PostScript Name
        }
        # Add both Windows and Mac name records
        for name_id, value in name_strings.items():
            # Windows (platform 3, encoding 1, language 0x0409 = en-US)
            rec_win = NameRecord()
            rec_win.nameID = name_id
            rec_win.platformID = 3
            rec_win.platEncID = 1
            rec_win.langID = 0x0409
            rec_win.string = value
            name.names.append(rec_win)
            # Mac (platform 1, encoding 0, language 0)
            rec_mac = NameRecord()
            rec_mac.nameID = name_id
            rec_mac.platformID = 1
            rec_mac.platEncID = 0
            rec_mac.langID = 0
            rec_mac.string = value
            name.names.append(rec_mac)
        otf['name'] = name
        # === Create post table (format 3.0 for smaller web fonts) ===
        post = newTable('post')
        post.formatType = 3.0  # No glyph names (smaller, web-optimized)
        post.italicAngle = 0
        post.underlinePosition = -100
        post.underlineThickness = 50
        post.isFixedPitch = 0
        post.minMemType42 = 0
        post.maxMemType42 = 0
        post.minMemType1 = 0
        post.maxMemType1 = 0
        otf['post'] = post
        # Save the OTF font
        otf.save(output_path)
        otf.close()
        return True
    except Exception as e:
        print(f"ERROR: Conversion failed: {str(e)}", file=sys.stderr)
        import traceback
        traceback.print_exc(file=sys.stderr)
        return False
def main():
    """
    CLI entry point: wrap a raw CFF file as OpenType-CFF.

    Usage: convert_cff_to_ttf.py <input.cff> <output.otf> [tounicode.cmap]
    Exits 0 on success, 1 on bad arguments or conversion failure. A missing
    ToUnicode file only produces a warning and is treated as absent.
    """
    argv = sys.argv
    if len(argv) < 3:
        print("Usage: convert_cff_to_ttf.py <input.cff> <output.otf> [tounicode.cmap]", file=sys.stderr)
        sys.exit(1)
    input_path, output_path = Path(argv[1]), Path(argv[2])
    tounicode_path = Path(argv[3]) if len(argv) > 3 else None
    if not input_path.exists():
        print(f"ERROR: Input file not found: {input_path}", file=sys.stderr)
        sys.exit(1)
    if tounicode_path is not None and not tounicode_path.exists():
        print(f"Warning: ToUnicode file not found: {tounicode_path}", file=sys.stderr)
        tounicode_path = None
    tounicode_arg = str(tounicode_path) if tounicode_path else None
    ok = wrap_cff_as_otf(str(input_path), str(output_path), tounicode_arg)
    sys.exit(0 if ok else 1)
if __name__ == '__main__':
main()