mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-11-16 01:21:16 +01:00
garbage commit font remappings
This commit is contained in:
parent
c7c5613c13
commit
0d9321e6a1
@ -44,6 +44,12 @@ public class PdfJsonFont {
|
||||
/** Hint describing the font program type (ttf, otf, cff, pfb, etc.). */
|
||||
private String programFormat;
|
||||
|
||||
/** Web-optimized font program (e.g. converted TrueType) encoded as Base64. */
|
||||
private String webProgram;
|
||||
|
||||
/** Format hint for the webProgram payload. */
|
||||
private String webProgramFormat;
|
||||
|
||||
/** ToUnicode stream encoded as Base64 when present. */
|
||||
private String toUnicode;
|
||||
|
||||
@ -70,4 +76,7 @@ public class PdfJsonFont {
|
||||
|
||||
/** Units per em extracted from the font matrix. */
|
||||
private Integer unitsPerEm;
|
||||
|
||||
/** Serialized COS dictionary describing the original font resource. */
|
||||
private PdfJsonCosValue cosDictionary;
|
||||
}
|
||||
|
||||
@ -37,4 +37,5 @@ public class PdfJsonTextElement {
|
||||
private PdfJsonTextColor fillColor;
|
||||
private PdfJsonTextColor strokeColor;
|
||||
private Integer renderingMode;
|
||||
private Boolean fallbackUsed;
|
||||
}
|
||||
|
||||
@ -34,6 +34,7 @@ import java.util.Set;
|
||||
import java.util.TimeZone;
|
||||
import java.util.UUID;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
|
||||
@ -64,6 +65,7 @@ import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.apache.pdfbox.pdmodel.common.PDStream;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFontFactory;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType0Font;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType1Font;
|
||||
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
|
||||
@ -90,6 +92,8 @@ import org.springframework.web.multipart.MultipartFile;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import jakarta.annotation.PostConstruct;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@ -173,11 +177,61 @@ public class PdfJsonConversionService {
|
||||
@Value("${stirling.pdf.json.cff-converter.enabled:true}")
|
||||
private boolean cffConversionEnabled;
|
||||
|
||||
@Value("${stirling.pdf.json.cff-converter.method:python}")
|
||||
private String cffConverterMethod;
|
||||
|
||||
@Value("${stirling.pdf.json.cff-converter.python-command:/opt/venv/bin/python3}")
|
||||
private String pythonCommand;
|
||||
|
||||
@Value("${stirling.pdf.json.cff-converter.python-script:/scripts/convert_cff_to_ttf.py}")
|
||||
private String pythonScript;
|
||||
|
||||
@Value("${stirling.pdf.json.cff-converter.fontforge-command:fontforge}")
|
||||
private String fontforgeCommand;
|
||||
|
||||
private final Map<String, byte[]> fallbackFontCache = new ConcurrentHashMap<>();
|
||||
|
||||
private volatile boolean ghostscriptAvailable;
|
||||
|
||||
@PostConstruct
|
||||
private void initializeGhostscriptAvailability() {
|
||||
if (!fontNormalizationEnabled) {
|
||||
ghostscriptAvailable = false;
|
||||
return;
|
||||
}
|
||||
|
||||
if (!isGhostscriptGroupEnabled()) {
|
||||
ghostscriptAvailable = false;
|
||||
log.warn(
|
||||
"Ghostscript font normalization disabled: Ghostscript group is not enabled in configuration");
|
||||
return;
|
||||
}
|
||||
|
||||
List<String> command = List.of("gs", "-version");
|
||||
try {
|
||||
ProcessExecutorResult result =
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
|
||||
.runCommandWithOutputHandling(command);
|
||||
ghostscriptAvailable = result.getRc() == 0;
|
||||
if (!ghostscriptAvailable) {
|
||||
log.warn(
|
||||
"Ghostscript executable not available (exit code {}); font normalization will be skipped",
|
||||
result.getRc());
|
||||
}
|
||||
} catch (InterruptedException ex) {
|
||||
Thread.currentThread().interrupt();
|
||||
ghostscriptAvailable = false;
|
||||
log.warn(
|
||||
"Ghostscript availability check interrupted; font normalization will be skipped: {}",
|
||||
ex.getMessage());
|
||||
} catch (IOException ex) {
|
||||
ghostscriptAvailable = false;
|
||||
log.warn(
|
||||
"Ghostscript executable not found or failed to start; font normalization will be skipped: {}",
|
||||
ex.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
public byte[] convertPdfToJson(MultipartFile file) throws IOException {
|
||||
if (file == null) {
|
||||
throw ExceptionUtils.createNullArgumentException("fileInput");
|
||||
@ -452,10 +506,22 @@ public class PdfJsonConversionService {
|
||||
String encoding = resolveEncoding(font);
|
||||
PdfJsonFontCidSystemInfo cidInfo = extractCidSystemInfo(font.getCOSObject());
|
||||
boolean embedded = font.isEmbedded();
|
||||
FontProgramData programData = embedded ? extractFontProgram(font) : null;
|
||||
String toUnicode = extractToUnicode(font.getCOSObject());
|
||||
// Build complete CharCode→CID→GID→Unicode mapping for CID fonts
|
||||
String unicodeMapping = buildUnicodeMapping(font, toUnicode);
|
||||
FontProgramData programData = embedded ? extractFontProgram(font, unicodeMapping) : null;
|
||||
String standard14Name = resolveStandard14Name(font);
|
||||
Integer flags = descriptor != null ? descriptor.getFlags() : null;
|
||||
PdfJsonCosValue cosDictionary = serializeCosValue(font.getCOSObject());
|
||||
|
||||
log.debug(
|
||||
"Building font model: id={}, baseName={}, subtype={}, embedded={}, hasProgram={}, hasWebProgram={}",
|
||||
fontId,
|
||||
font.getName(),
|
||||
subtype,
|
||||
embedded,
|
||||
programData != null && programData.getBase64() != null,
|
||||
programData != null && programData.getWebBase64() != null);
|
||||
|
||||
return PdfJsonFont.builder()
|
||||
.id(fontId)
|
||||
@ -468,6 +534,8 @@ public class PdfJsonConversionService {
|
||||
.embedded(embedded)
|
||||
.program(programData != null ? programData.getBase64() : null)
|
||||
.programFormat(programData != null ? programData.getFormat() : null)
|
||||
.webProgram(programData != null ? programData.getWebBase64() : null)
|
||||
.webProgramFormat(programData != null ? programData.getWebFormat() : null)
|
||||
.toUnicode(toUnicode)
|
||||
.standard14Name(standard14Name)
|
||||
.fontDescriptorFlags(flags)
|
||||
@ -477,6 +545,7 @@ public class PdfJsonConversionService {
|
||||
.xHeight(descriptor != null ? descriptor.getXHeight() : null)
|
||||
.italicAngle(descriptor != null ? descriptor.getItalicAngle() : null)
|
||||
.unitsPerEm(extractUnitsPerEm(font))
|
||||
.cosDictionary(cosDictionary)
|
||||
.build();
|
||||
}
|
||||
|
||||
@ -508,11 +577,13 @@ public class PdfJsonConversionService {
|
||||
if (font == null) {
|
||||
fallbackNeeded = true;
|
||||
fallbackIds.add(FALLBACK_FONT_ID);
|
||||
element.setFallbackUsed(Boolean.TRUE);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!canEncodeFully(font, text)) {
|
||||
fallbackNeeded = true;
|
||||
element.setFallbackUsed(Boolean.TRUE);
|
||||
for (int offset = 0; offset < text.length(); ) {
|
||||
int codePoint = text.codePointAt(offset);
|
||||
offset += Character.charCount(codePoint);
|
||||
@ -682,11 +753,25 @@ public class PdfJsonConversionService {
|
||||
}
|
||||
|
||||
private boolean canRunGhostscript() {
|
||||
if (!fontNormalizationEnabled) {
|
||||
return false;
|
||||
}
|
||||
if (!isGhostscriptGroupEnabled()) {
|
||||
return false;
|
||||
}
|
||||
if (!ghostscriptAvailable) {
|
||||
log.debug("Skipping Ghostscript normalization; executable not available");
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean isGhostscriptGroupEnabled() {
|
||||
try {
|
||||
return endpointConfiguration != null
|
||||
&& endpointConfiguration.isGroupEnabled("Ghostscript");
|
||||
} catch (Exception ex) {
|
||||
log.debug("Ghostscript availability check failed: {}", ex.getMessage());
|
||||
log.debug("Ghostscript group check failed: {}", ex.getMessage());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@ -736,12 +821,129 @@ public class PdfJsonConversionService {
|
||||
return null;
|
||||
}
|
||||
|
||||
private byte[] convertCffProgramToTrueType(byte[] fontBytes) {
|
||||
if (!cffConversionEnabled
|
||||
|| fontforgeCommand == null
|
||||
|| fontforgeCommand.isBlank()
|
||||
|| fontBytes == null
|
||||
|| fontBytes.length == 0) {
|
||||
private byte[] convertCffProgramToTrueType(byte[] fontBytes, String toUnicode) {
|
||||
if (!cffConversionEnabled || fontBytes == null || fontBytes.length == 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Determine which converter to use
|
||||
if ("python".equalsIgnoreCase(cffConverterMethod)) {
|
||||
return convertCffUsingPython(fontBytes, toUnicode);
|
||||
} else if ("fontforge".equalsIgnoreCase(cffConverterMethod)) {
|
||||
return convertCffUsingFontForge(fontBytes);
|
||||
} else {
|
||||
log.warn("Unknown CFF converter method: {}, falling back to Python", cffConverterMethod);
|
||||
return convertCffUsingPython(fontBytes, toUnicode);
|
||||
}
|
||||
}
|
||||
|
||||
private byte[] convertCffUsingPython(byte[] fontBytes, String toUnicode) {
|
||||
if (pythonCommand == null
|
||||
|| pythonCommand.isBlank()
|
||||
|| pythonScript == null
|
||||
|| pythonScript.isBlank()) {
|
||||
log.debug("Python converter not configured");
|
||||
return null;
|
||||
}
|
||||
|
||||
try (TempFile inputFile = new TempFile(tempFileManager, ".cff");
|
||||
TempFile outputFile = new TempFile(tempFileManager, ".otf");
|
||||
TempFile toUnicodeFile = toUnicode != null ? new TempFile(tempFileManager, ".tounicode") : null) {
|
||||
Files.write(inputFile.getPath(), fontBytes);
|
||||
|
||||
// Write ToUnicode CMap data if available
|
||||
if (toUnicode != null && toUnicodeFile != null) {
|
||||
byte[] toUnicodeBytes = Base64.getDecoder().decode(toUnicode);
|
||||
Files.write(toUnicodeFile.getPath(), toUnicodeBytes);
|
||||
}
|
||||
|
||||
List<String> command = new ArrayList<>();
|
||||
command.add(pythonCommand);
|
||||
command.add(pythonScript);
|
||||
command.add(inputFile.getAbsolutePath());
|
||||
command.add(outputFile.getAbsolutePath());
|
||||
// Add optional ToUnicode file path
|
||||
if (toUnicodeFile != null) {
|
||||
command.add(toUnicodeFile.getAbsolutePath());
|
||||
}
|
||||
|
||||
ProcessBuilder builder = new ProcessBuilder(command);
|
||||
builder.redirectErrorStream(true);
|
||||
Process process = builder.start();
|
||||
|
||||
StringBuilder output = new StringBuilder();
|
||||
Thread reader =
|
||||
new Thread(
|
||||
() -> {
|
||||
try (BufferedReader br =
|
||||
new BufferedReader(
|
||||
new InputStreamReader(
|
||||
process.getInputStream(),
|
||||
StandardCharsets.UTF_8))) {
|
||||
String line;
|
||||
while ((line = br.readLine()) != null) {
|
||||
output.append(line).append('\n');
|
||||
}
|
||||
} catch (IOException ignored) {
|
||||
}
|
||||
});
|
||||
reader.start();
|
||||
|
||||
// Wait with timeout (Python fontTools is usually fast, but provide safety margin)
|
||||
boolean finished = process.waitFor(30, TimeUnit.SECONDS);
|
||||
if (!finished) {
|
||||
process.destroyForcibly();
|
||||
reader.interrupt();
|
||||
log.warn(
|
||||
"Python CFF→OTF wrapping timed out after 30 seconds - font may be corrupted");
|
||||
return null;
|
||||
}
|
||||
|
||||
int exitCode = process.exitValue();
|
||||
reader.join(5000);
|
||||
|
||||
if (exitCode == 0 && Files.exists(outputFile.getPath())) {
|
||||
byte[] convertedBytes = Files.readAllBytes(outputFile.getPath());
|
||||
if (convertedBytes.length > 0) {
|
||||
String validationError = validateFontTables(convertedBytes);
|
||||
if (validationError != null) {
|
||||
log.warn("Python converter produced invalid font: {}", validationError);
|
||||
return null;
|
||||
}
|
||||
|
||||
// Log Python script output for debugging
|
||||
String outputStr = output.toString().trim();
|
||||
if (!outputStr.isEmpty()) {
|
||||
log.debug("Python script output: {}", outputStr);
|
||||
}
|
||||
|
||||
log.debug(
|
||||
"Python CFF→OTF wrapping successful: {} bytes → {} bytes",
|
||||
fontBytes.length,
|
||||
convertedBytes.length);
|
||||
return convertedBytes;
|
||||
}
|
||||
} else {
|
||||
String outputStr = output.toString().trim();
|
||||
if (!outputStr.isEmpty()) {
|
||||
log.warn("Python CFF→OTF wrapping failed with exit code {}: {}", exitCode, outputStr);
|
||||
} else {
|
||||
log.warn("Python CFF→OTF wrapping failed with exit code {}", exitCode);
|
||||
}
|
||||
}
|
||||
} catch (InterruptedException ex) {
|
||||
Thread.currentThread().interrupt();
|
||||
log.debug("Python CFF conversion interrupted", ex);
|
||||
} catch (IOException ex) {
|
||||
log.debug("Python CFF conversion I/O error", ex);
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
private byte[] convertCffUsingFontForge(byte[] fontBytes) {
|
||||
if (fontforgeCommand == null || fontforgeCommand.isBlank()) {
|
||||
log.debug("FontForge converter not configured");
|
||||
return null;
|
||||
}
|
||||
|
||||
@ -754,8 +956,18 @@ public class PdfJsonConversionService {
|
||||
command.add("-lang=ff");
|
||||
command.add("-c");
|
||||
command.add(
|
||||
"Open($1); SelectWorthOutputting(); SetFontOrder(2); Reencode(\"unicode\"); "
|
||||
+ "Generate($2); Close(); Quit()");
|
||||
"Open($1); "
|
||||
+ "ScaleToEm(1000); " // Force 1000 units per em (standard for Type1)
|
||||
+ "SelectWorthOutputting(); "
|
||||
+ "SetFontOrder(2); "
|
||||
+ "Reencode(\"unicode\"); "
|
||||
+ "RoundToInt(); "
|
||||
+ "RemoveOverlap(); "
|
||||
+ "Simplify(); "
|
||||
+ "CorrectDirection(); "
|
||||
+ "Generate($2, \"\", 4+16+32); "
|
||||
+ "Close(); "
|
||||
+ "Quit()");
|
||||
command.add(inputFile.getAbsolutePath());
|
||||
command.add(outputFile.getAbsolutePath());
|
||||
|
||||
@ -780,11 +992,59 @@ public class PdfJsonConversionService {
|
||||
}
|
||||
});
|
||||
reader.start();
|
||||
int exitCode = process.waitFor();
|
||||
reader.join();
|
||||
|
||||
// Wait with timeout to prevent hanging on problematic fonts
|
||||
boolean finished = process.waitFor(30, TimeUnit.SECONDS);
|
||||
if (!finished) {
|
||||
process.destroyForcibly();
|
||||
reader.interrupt();
|
||||
log.warn("FontForge conversion timed out after 30 seconds - font may be too complex or causing FontForge to hang");
|
||||
return null;
|
||||
}
|
||||
|
||||
int exitCode = process.exitValue();
|
||||
reader.join(5000); // Wait max 5 seconds for reader thread
|
||||
|
||||
if (exitCode == 0 && Files.exists(outputFile.getPath())) {
|
||||
return Files.readAllBytes(outputFile.getPath());
|
||||
byte[] convertedBytes = Files.readAllBytes(outputFile.getPath());
|
||||
if (convertedBytes.length > 0) {
|
||||
// Basic validation: check for TrueType magic number and critical tables
|
||||
if (convertedBytes.length >= 4) {
|
||||
int magic =
|
||||
((convertedBytes[0] & 0xFF) << 24)
|
||||
| ((convertedBytes[1] & 0xFF) << 16)
|
||||
| ((convertedBytes[2] & 0xFF) << 8)
|
||||
| (convertedBytes[3] & 0xFF);
|
||||
boolean validTrueType =
|
||||
magic == 0x00010000 || magic == 0x74727565; // 1.0 or 'true'
|
||||
boolean validOpenType = magic == 0x4F54544F; // 'OTTO'
|
||||
|
||||
if (validTrueType || validOpenType) {
|
||||
// Additional validation: check unitsPerEm in head table
|
||||
String validationError = validateFontTables(convertedBytes);
|
||||
if (validationError != null) {
|
||||
log.warn(
|
||||
"FontForge produced invalid font: {}",
|
||||
validationError);
|
||||
return null;
|
||||
}
|
||||
|
||||
log.debug(
|
||||
"FontForge CFF→TrueType conversion successful: {} bytes, magic: 0x{}, type: {}",
|
||||
convertedBytes.length,
|
||||
Integer.toHexString(magic),
|
||||
validOpenType ? "OpenType" : "TrueType");
|
||||
return convertedBytes;
|
||||
} else {
|
||||
log.warn(
|
||||
"FontForge produced invalid font: magic number 0x{} (expected TrueType or OpenType)",
|
||||
Integer.toHexString(magic));
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
log.warn("FontForge produced empty output file");
|
||||
return null;
|
||||
}
|
||||
|
||||
log.warn(
|
||||
@ -801,6 +1061,127 @@ public class PdfJsonConversionService {
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Validates critical OpenType/TrueType font tables to ensure browser compatibility.
|
||||
* @return Error message if invalid, null if valid
|
||||
*/
|
||||
private String validateFontTables(byte[] fontBytes) {
|
||||
try {
|
||||
if (fontBytes.length < 12) {
|
||||
return "Font file too small";
|
||||
}
|
||||
|
||||
// Read table directory
|
||||
int numTables = ((fontBytes[4] & 0xFF) << 8) | (fontBytes[5] & 0xFF);
|
||||
if (numTables == 0 || numTables > 100) {
|
||||
return "Invalid table count: " + numTables;
|
||||
}
|
||||
|
||||
// Find head table
|
||||
int offset = 12; // Skip sfnt header
|
||||
for (int i = 0; i < numTables && offset + 16 <= fontBytes.length; i++) {
|
||||
String tag = new String(fontBytes, offset, 4, StandardCharsets.US_ASCII);
|
||||
int tableOffset = ((fontBytes[offset + 8] & 0xFF) << 24)
|
||||
| ((fontBytes[offset + 9] & 0xFF) << 16)
|
||||
| ((fontBytes[offset + 10] & 0xFF) << 8)
|
||||
| (fontBytes[offset + 11] & 0xFF);
|
||||
int tableLength = ((fontBytes[offset + 12] & 0xFF) << 24)
|
||||
| ((fontBytes[offset + 13] & 0xFF) << 16)
|
||||
| ((fontBytes[offset + 14] & 0xFF) << 8)
|
||||
| (fontBytes[offset + 15] & 0xFF);
|
||||
|
||||
if ("head".equals(tag)) {
|
||||
if (tableOffset + 18 > fontBytes.length) {
|
||||
return "head table truncated";
|
||||
}
|
||||
// Check unitsPerEm at offset 18 in head table
|
||||
int unitsPerEm = ((fontBytes[tableOffset + 18] & 0xFF) << 8)
|
||||
| (fontBytes[tableOffset + 19] & 0xFF);
|
||||
if (unitsPerEm < 16 || unitsPerEm > 16384) {
|
||||
return "Invalid unitsPerEm: " + unitsPerEm + " (must be 16-16384)";
|
||||
}
|
||||
return null; // Valid
|
||||
}
|
||||
offset += 16;
|
||||
}
|
||||
return "head table not found";
|
||||
} catch (Exception ex) {
|
||||
return "Validation error: " + ex.getMessage();
|
||||
}
|
||||
}
|
||||
|
||||
private String buildUnicodeMapping(PDFont font, String toUnicodeBase64) throws IOException {
|
||||
log.debug("buildUnicodeMapping called for font: {}, hasToUnicode: {}, isCID: {}",
|
||||
font.getName(), toUnicodeBase64 != null, font instanceof PDType0Font);
|
||||
|
||||
if (toUnicodeBase64 == null || toUnicodeBase64.isBlank()) {
|
||||
log.debug("No ToUnicode data for font: {}", font.getName());
|
||||
return null;
|
||||
}
|
||||
|
||||
// For CID fonts (Type0), build complete CharCode→CID→GID→Unicode mapping
|
||||
if (!(font instanceof PDType0Font type0Font)) {
|
||||
// For non-CID fonts, just return ToUnicode as-is
|
||||
log.debug("Non-CID font {}, returning raw ToUnicode", font.getName());
|
||||
return toUnicodeBase64;
|
||||
}
|
||||
|
||||
log.debug("Building JSON mapping for CID font: {}", font.getName());
|
||||
|
||||
try {
|
||||
// Build a map of CharCode → Unicode from ToUnicode
|
||||
Map<Integer, Integer> charCodeToUnicode = new HashMap<>();
|
||||
byte[] toUnicodeBytes = Base64.getDecoder().decode(toUnicodeBase64);
|
||||
String toUnicodeStr = new String(toUnicodeBytes, StandardCharsets.UTF_8);
|
||||
|
||||
// Parse ToUnicode CMap for bfchar and bfrange
|
||||
java.util.regex.Pattern bfcharPattern = java.util.regex.Pattern.compile("<([0-9A-Fa-f]+)>\\s*<([0-9A-Fa-f]+)>");
|
||||
java.util.regex.Matcher matcher = bfcharPattern.matcher(toUnicodeStr);
|
||||
while (matcher.find()) {
|
||||
int charCode = Integer.parseInt(matcher.group(1), 16);
|
||||
int unicode = Integer.parseInt(matcher.group(2), 16);
|
||||
charCodeToUnicode.put(charCode, unicode);
|
||||
}
|
||||
|
||||
// Build JSON mapping: CharCode → CID → GID → Unicode
|
||||
StringBuilder json = new StringBuilder();
|
||||
json.append("{\"isCID\":true,\"cidToGidIdentity\":true,\"entries\":[");
|
||||
|
||||
boolean first = true;
|
||||
for (Map.Entry<Integer, Integer> entry : charCodeToUnicode.entrySet()) {
|
||||
int charCode = entry.getKey();
|
||||
int unicode = entry.getValue();
|
||||
|
||||
try {
|
||||
// Get CID from char code
|
||||
int cid = type0Font.codeToCID(charCode);
|
||||
// For Identity-H/V encoding, GID == CID
|
||||
int gid = cid;
|
||||
|
||||
if (!first) {
|
||||
json.append(",");
|
||||
}
|
||||
first = false;
|
||||
json.append(String.format("{\"code\":%d,\"cid\":%d,\"gid\":%d,\"unicode\":%d}",
|
||||
charCode, cid, gid, unicode));
|
||||
} catch (Exception e) {
|
||||
// Skip entries that fail to map
|
||||
log.debug("Failed to map charCode {} in font {}: {}", charCode, font.getName(), e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
json.append("]}");
|
||||
String jsonStr = json.toString();
|
||||
log.debug("Built Unicode mapping for CID font {} with {} entries",
|
||||
font.getName(), charCodeToUnicode.size());
|
||||
return Base64.getEncoder().encodeToString(jsonStr.getBytes(StandardCharsets.UTF_8));
|
||||
|
||||
} catch (Exception e) {
|
||||
log.warn("Failed to build Unicode mapping for font {}: {}", font.getName(), e.getMessage());
|
||||
return toUnicodeBase64; // Fall back to raw ToUnicode
|
||||
}
|
||||
}
|
||||
|
||||
private PdfJsonFontCidSystemInfo extractCidSystemInfo(COSDictionary fontDictionary) {
|
||||
if (fontDictionary == null) {
|
||||
return null;
|
||||
@ -824,7 +1205,7 @@ public class PdfJsonConversionService {
|
||||
return info;
|
||||
}
|
||||
|
||||
private FontProgramData extractFontProgram(PDFont font) throws IOException {
|
||||
private FontProgramData extractFontProgram(PDFont font, String toUnicode) throws IOException {
|
||||
PDFontDescriptor descriptor = font.getFontDescriptor();
|
||||
if (descriptor == null) {
|
||||
return null;
|
||||
@ -833,24 +1214,24 @@ public class PdfJsonConversionService {
|
||||
PDStream fontFile3 = descriptor.getFontFile3();
|
||||
if (fontFile3 != null) {
|
||||
String subtype = fontFile3.getCOSObject().getNameAsString(COSName.SUBTYPE);
|
||||
return readFontProgram(fontFile3, subtype != null ? subtype : "fontfile3", false);
|
||||
return readFontProgram(fontFile3, subtype != null ? subtype : "fontfile3", false, toUnicode);
|
||||
}
|
||||
|
||||
PDStream fontFile2 = descriptor.getFontFile2();
|
||||
if (fontFile2 != null) {
|
||||
return readFontProgram(fontFile2, null, true);
|
||||
return readFontProgram(fontFile2, null, true, toUnicode);
|
||||
}
|
||||
|
||||
PDStream fontFile = descriptor.getFontFile();
|
||||
if (fontFile != null) {
|
||||
return readFontProgram(fontFile, "type1", false);
|
||||
return readFontProgram(fontFile, "type1", false, toUnicode);
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
private FontProgramData readFontProgram(
|
||||
PDStream stream, String formatHint, boolean detectTrueType) throws IOException {
|
||||
PDStream stream, String formatHint, boolean detectTrueType, String toUnicode) throws IOException {
|
||||
try (InputStream inputStream = stream.createInputStream();
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
|
||||
inputStream.transferTo(baos);
|
||||
@ -859,8 +1240,21 @@ public class PdfJsonConversionService {
|
||||
if (detectTrueType) {
|
||||
format = detectTrueTypeFormat(data);
|
||||
}
|
||||
String webBase64 = null;
|
||||
String webFormat = null;
|
||||
if (format != null && isCffFormat(format)) {
|
||||
log.debug("Detected CFF font format: {}, wrapping as OpenType-CFF for web preview", format);
|
||||
byte[] converted = convertCffProgramToTrueType(data, toUnicode);
|
||||
if (converted != null && converted.length > 0) {
|
||||
webBase64 = Base64.getEncoder().encodeToString(converted);
|
||||
webFormat = "otf";
|
||||
log.debug("CFF→OTF wrapping successful: {} bytes → {} bytes", data.length, converted.length);
|
||||
} else {
|
||||
log.debug("CFF→OTF wrapping returned null or empty result");
|
||||
}
|
||||
}
|
||||
String base64 = Base64.getEncoder().encodeToString(data);
|
||||
return new FontProgramData(base64, format);
|
||||
return new FontProgramData(base64, format, webBase64, webFormat);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1759,8 +2153,12 @@ public class PdfJsonConversionService {
|
||||
}
|
||||
|
||||
PDFont baseFont = primaryFont;
|
||||
boolean fallbackApplied = primaryFont == null;
|
||||
if (baseFont == null) {
|
||||
baseFont = ensureFallbackFont(document, fontMap, fontModels, FALLBACK_FONT_ID);
|
||||
if (baseFont != null) {
|
||||
fallbackApplied = true;
|
||||
}
|
||||
}
|
||||
if (baseFont == null) {
|
||||
log.warn("Unable to resolve a base font for text element; skipping text content");
|
||||
@ -1777,6 +2175,7 @@ public class PdfJsonConversionService {
|
||||
PDFont targetFont = currentFont;
|
||||
|
||||
if (!canEncode(baseFont, codePoint)) {
|
||||
fallbackApplied = true;
|
||||
String fallbackId = resolveFallbackFontId(codePoint);
|
||||
targetFont = ensureFallbackFont(document, fontMap, fontModels, fallbackId);
|
||||
if (targetFont == null || !canEncode(targetFont, glyph)) {
|
||||
@ -1823,6 +2222,10 @@ public class PdfJsonConversionService {
|
||||
runs.add(new FontRun(currentFont, buffer.toString()));
|
||||
}
|
||||
|
||||
if (fallbackApplied) {
|
||||
element.setFallbackUsed(Boolean.TRUE);
|
||||
}
|
||||
|
||||
return runs;
|
||||
}
|
||||
|
||||
@ -2019,10 +2422,14 @@ public class PdfJsonConversionService {
|
||||
private static class FontProgramData {
|
||||
private final String base64;
|
||||
private final String format;
|
||||
private final String webBase64;
|
||||
private final String webFormat;
|
||||
|
||||
private FontProgramData(String base64, String format) {
|
||||
private FontProgramData(String base64, String format, String webBase64, String webFormat) {
|
||||
this.base64 = base64;
|
||||
this.format = format;
|
||||
this.webBase64 = webBase64;
|
||||
this.webFormat = webFormat;
|
||||
}
|
||||
|
||||
private String getBase64() {
|
||||
@ -2032,6 +2439,14 @@ public class PdfJsonConversionService {
|
||||
private String getFormat() {
|
||||
return format;
|
||||
}
|
||||
|
||||
private String getWebBase64() {
|
||||
return webBase64;
|
||||
}
|
||||
|
||||
private String getWebFormat() {
|
||||
return webFormat;
|
||||
}
|
||||
}
|
||||
|
||||
private static final class PreflightResult {
|
||||
@ -2371,46 +2786,106 @@ public class PdfJsonConversionService {
|
||||
return loadFallbackPdfFont(document);
|
||||
}
|
||||
|
||||
// IMPORTANT: Dictionary restoration is disabled because deserialized dictionaries
|
||||
// don't properly include the font stream references (FontFile/FontFile2/FontFile3).
|
||||
// This results in fonts that structurally exist but can't encode glyphs, causing
|
||||
// fallback to NotoSans. Instead, we ALWAYS use program bytes for reliable encoding.
|
||||
// The cosDictionary field is preserved in the JSON for potential future use, but
|
||||
// for now we rely on direct font program loading.
|
||||
if (false && fontModel.getCosDictionary() != null) {
|
||||
// Dictionary restoration code kept for reference but disabled
|
||||
COSBase restored = deserializeCosValue(fontModel.getCosDictionary(), document);
|
||||
if (restored instanceof COSDictionary cosDictionary) {
|
||||
try {
|
||||
PDFont font = PDFontFactory.createFont(cosDictionary);
|
||||
if (font != null && font.isEmbedded()) {
|
||||
// Verify font can actually encode a basic character
|
||||
try {
|
||||
font.encode("A");
|
||||
applyAdditionalFontMetadata(document, font, fontModel);
|
||||
log.debug("Successfully restored embedded font {} from dictionary", fontModel.getId());
|
||||
return font;
|
||||
} catch (IOException | IllegalArgumentException encodingEx) {
|
||||
log.warn(
|
||||
"Font {} restored from dictionary but failed encoding test: {}; falling back to program bytes",
|
||||
fontModel.getId(),
|
||||
encodingEx.getMessage());
|
||||
}
|
||||
}
|
||||
} catch (IOException ex) {
|
||||
log.warn(
|
||||
"Failed to restore font {} from stored dictionary: {}; falling back to program bytes",
|
||||
fontModel.getId(),
|
||||
ex.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
byte[] fontBytes = null;
|
||||
String format = null;
|
||||
|
||||
// For CFF/Type1C fonts, prefer the webProgram (converted TrueType) because:
|
||||
// 1. PDFBox's PDType0Font.load() expects TrueType/OpenType format
|
||||
// 2. Raw CFF program bytes lack the descriptor context needed for reconstruction
|
||||
// 3. FontForge-converted TrueType is reliable for both web preview and PDF export
|
||||
String originalFormat =
|
||||
fontModel.getProgramFormat() != null
|
||||
? fontModel.getProgramFormat().toLowerCase(Locale.ROOT)
|
||||
: null;
|
||||
// For JSON→PDF conversion, always use original font bytes
|
||||
// (PDFBox doesn't support OpenType-CFF; webProgram is only for frontend web preview)
|
||||
String program = fontModel.getProgram();
|
||||
if (program != null && !program.isBlank()) {
|
||||
byte[] fontBytes = Base64.getDecoder().decode(program);
|
||||
String format =
|
||||
fontModel.getProgramFormat() != null
|
||||
? fontModel.getProgramFormat().toLowerCase(Locale.ROOT)
|
||||
: "";
|
||||
fontBytes = Base64.getDecoder().decode(program);
|
||||
format = originalFormat;
|
||||
log.debug("Using original font program for {} (format: {})", fontModel.getId(), originalFormat);
|
||||
} else if (fontModel.getWebProgram() != null && !fontModel.getWebProgram().isBlank()) {
|
||||
// Fallback to webProgram if original program is unavailable
|
||||
fontBytes = Base64.getDecoder().decode(fontModel.getWebProgram());
|
||||
format =
|
||||
fontModel.getWebProgramFormat() != null
|
||||
? fontModel.getWebProgramFormat().toLowerCase(Locale.ROOT)
|
||||
: null;
|
||||
log.debug("Using web-optimized font program for {} (original program unavailable)", fontModel.getId());
|
||||
}
|
||||
|
||||
if (fontBytes != null && fontBytes.length > 0) {
|
||||
try {
|
||||
if (isCffFormat(format)) {
|
||||
byte[] converted = convertCffProgramToTrueType(fontBytes);
|
||||
if (converted != null) {
|
||||
fontBytes = converted;
|
||||
format = "ttf";
|
||||
log.debug(
|
||||
"Converted CFF font {} to TrueType outlines for embedding",
|
||||
fontModel.getId());
|
||||
} else {
|
||||
log.debug(
|
||||
"Unable to convert CFF font {} to TrueType; attempting direct load",
|
||||
fontModel.getId());
|
||||
}
|
||||
}
|
||||
if (isType1Format(format)) {
|
||||
try (InputStream stream = new ByteArrayInputStream(fontBytes)) {
|
||||
PDFont font = new PDType1Font(document, stream);
|
||||
applyAdditionalFontMetadata(document, font, fontModel);
|
||||
log.debug(
|
||||
"Successfully loaded Type1 font {} from program bytes (format: {}, originalFormat: {})",
|
||||
fontModel.getId(),
|
||||
format,
|
||||
originalFormat);
|
||||
return font;
|
||||
}
|
||||
}
|
||||
try (InputStream stream = new ByteArrayInputStream(fontBytes)) {
|
||||
PDFont font = PDType0Font.load(document, stream, true);
|
||||
applyAdditionalFontMetadata(document, font, fontModel);
|
||||
log.debug(
|
||||
"Successfully loaded Type0 font {} from program bytes (format: {}, originalFormat: {})",
|
||||
fontModel.getId(),
|
||||
format,
|
||||
originalFormat);
|
||||
return font;
|
||||
}
|
||||
} catch (IOException ex) {
|
||||
log.debug(
|
||||
"Unable to load embedded font program for {}: {}",
|
||||
log.warn(
|
||||
"Unable to load embedded font program for {} (format: {}, originalFormat: {}): {}; falling back to Standard 14 or default",
|
||||
fontModel.getId(),
|
||||
format,
|
||||
originalFormat,
|
||||
ex.getMessage());
|
||||
}
|
||||
} else {
|
||||
log.warn(
|
||||
"Font {} has no program bytes available (originalFormat: {})",
|
||||
fontModel.getId(),
|
||||
originalFormat);
|
||||
}
|
||||
|
||||
String standardName = fontModel.getStandard14Name();
|
||||
|
||||
@ -173,9 +173,12 @@ stirling:
|
||||
fallback-font: classpath:/static/fonts/NotoSans-Regular.ttf # Override to point at a custom fallback font
|
||||
json:
|
||||
font-normalization:
|
||||
enabled: true # Run Ghostscript preflight to normalize fonts before PDF→JSON
|
||||
enabled: false # IMPORTANT: Disable to preserve ToUnicode CMaps for correct font rendering. Ghostscript strips Unicode mappings from CID fonts.
|
||||
cff-converter:
|
||||
enabled: true # Attempt to transcode CFF/Type1C programs to OTF using FontForge when available
|
||||
enabled: true # Wrap CFF/Type1C fonts as OpenType-CFF for browser compatibility
|
||||
method: python # Converter method: 'python' (fontTools, recommended - wraps as OTF), 'fontforge' (legacy - converts to TTF, may hang on CID fonts)
|
||||
python-command: /opt/venv/bin/python3 # Python interpreter path
|
||||
python-script: /scripts/convert_cff_to_ttf.py # Path to font wrapping script
|
||||
fontforge-command: fontforge # Override if FontForge is installed under a different name/path
|
||||
|
||||
ui:
|
||||
|
||||
@ -5,10 +5,6 @@ services:
|
||||
dockerfile: docker/backend/Dockerfile
|
||||
container_name: stirling-pdf-backend
|
||||
restart: on-failure:5
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
memory: 4G
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "curl -f http://localhost:8080/api/v1/info/status | grep -q 'UP'"]
|
||||
interval: 5s
|
||||
|
||||
@ -95,8 +95,9 @@ const decodeBase64ToUint8Array = (value: string): Uint8Array => {
|
||||
};
|
||||
|
||||
const buildFontFamilyName = (font: PdfJsonFont): string => {
|
||||
const base = (font.uid ?? font.id ?? 'font').toString();
|
||||
return `pdf-font-${base.replace(/[^a-zA-Z0-9_-]/g, '')}`;
|
||||
const preferred = (font.baseName ?? '').trim();
|
||||
const identifier = preferred.length > 0 ? preferred : (font.uid ?? font.id ?? 'font').toString();
|
||||
return `pdf-font-${identifier.replace(/[^a-zA-Z0-9_-]/g, '')}`;
|
||||
};
|
||||
|
||||
const getCaretOffset = (element: HTMLElement): number => {
|
||||
@ -313,18 +314,34 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
|
||||
|
||||
const next = new Map<string, string>();
|
||||
for (const font of fonts) {
|
||||
if (!font?.id || !font.program) {
|
||||
if (!font?.id) {
|
||||
continue;
|
||||
}
|
||||
const programSource = font.webProgram && font.webProgram.length > 0 ? font.webProgram : font.program;
|
||||
if (!programSource) {
|
||||
continue;
|
||||
}
|
||||
try {
|
||||
const format = normalizeFontFormat(font.programFormat);
|
||||
const data = decodeBase64ToUint8Array(font.program);
|
||||
const formatSource = font.webProgram && font.webProgram.length > 0 ? font.webProgramFormat : font.programFormat;
|
||||
const format = normalizeFontFormat(formatSource);
|
||||
const data = decodeBase64ToUint8Array(programSource);
|
||||
const blob = new Blob([data as BlobPart], { type: getFontMimeType(format) });
|
||||
const url = URL.createObjectURL(blob);
|
||||
const formatHint = getFontFormatHint(format);
|
||||
const familyName = buildFontFamilyName(font);
|
||||
const source = formatHint ? `url(${url}) format('${formatHint}')` : `url(${url})`;
|
||||
const fontFace = new FontFace(familyName, source);
|
||||
|
||||
console.debug(`[FontLoader] Loading font ${font.id} (${font.baseName}):`, {
|
||||
formatSource,
|
||||
format,
|
||||
formatHint,
|
||||
familyName,
|
||||
dataLength: data.length,
|
||||
hasWebProgram: !!font.webProgram,
|
||||
hasProgram: !!font.program
|
||||
});
|
||||
|
||||
await fontFace.load();
|
||||
if (disposed) {
|
||||
document.fonts.delete(fontFace);
|
||||
@ -334,8 +351,14 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
|
||||
document.fonts.add(fontFace);
|
||||
active.push({ fontFace, url });
|
||||
next.set(font.id, familyName);
|
||||
console.debug(`[FontLoader] Successfully loaded font ${font.id}`);
|
||||
} catch (error) {
|
||||
// Silently ignore font loading failures - embedded PDF fonts often lack web font tables
|
||||
console.warn(`[FontLoader] Failed to load font ${font.id} (${font.baseName}):`, {
|
||||
error: error instanceof Error ? error.message : String(error),
|
||||
formatSource: font.webProgram && font.webProgram.length > 0 ? font.webProgramFormat : font.programFormat,
|
||||
hasWebProgram: !!font.webProgram,
|
||||
hasProgram: !!font.program
|
||||
});
|
||||
// Fallback to web-safe fonts is already implemented via getFontFamily()
|
||||
}
|
||||
}
|
||||
@ -776,7 +799,8 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
|
||||
const fontFamily = getFontFamily(group.fontId);
|
||||
const lineHeightPx = getLineHeightPx(group.fontId, fontSizePx);
|
||||
const lineHeightRatio = fontSizePx > 0 ? Math.max(lineHeightPx / fontSizePx, 1.05) : 1.2;
|
||||
const hasRotation = group.rotation != null && Math.abs(group.rotation) > 0.5;
|
||||
const rotation = group.rotation ?? 0;
|
||||
const hasRotation = Math.abs(rotation) > 0.5;
|
||||
const baselineLength = group.baselineLength ?? Math.max(group.bounds.right - group.bounds.left, 0);
|
||||
|
||||
let containerLeft = bounds.left;
|
||||
@ -795,7 +819,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
|
||||
containerHeight = Math.max(lineHeightPx, fontSizePx * lineHeightRatio);
|
||||
transformOrigin = 'left bottom';
|
||||
// Negate rotation because Y-axis is flipped from PDF to web coordinates
|
||||
transform = `rotate(${-group.rotation}deg)`;
|
||||
transform = `rotate(${-rotation}deg)`;
|
||||
}
|
||||
|
||||
// Extract styling from group
|
||||
|
||||
@ -9,6 +9,14 @@ export interface PdfJsonTextColor {
|
||||
components?: number[] | null;
|
||||
}
|
||||
|
||||
export interface PdfJsonCosValue {
|
||||
type?: string | null;
|
||||
value?: unknown;
|
||||
items?: PdfJsonCosValue[] | null;
|
||||
entries?: Record<string, PdfJsonCosValue | null> | null;
|
||||
stream?: PdfJsonStream | null;
|
||||
}
|
||||
|
||||
export interface PdfJsonFont {
|
||||
id?: string;
|
||||
pageNumber?: number | null;
|
||||
@ -20,6 +28,8 @@ export interface PdfJsonFont {
|
||||
embedded?: boolean | null;
|
||||
program?: string | null;
|
||||
programFormat?: string | null;
|
||||
webProgram?: string | null;
|
||||
webProgramFormat?: string | null;
|
||||
toUnicode?: string | null;
|
||||
standard14Name?: string | null;
|
||||
fontDescriptorFlags?: number | null;
|
||||
@ -29,6 +39,7 @@ export interface PdfJsonFont {
|
||||
xHeight?: number | null;
|
||||
italicAngle?: number | null;
|
||||
unitsPerEm?: number | null;
|
||||
cosDictionary?: PdfJsonCosValue | null;
|
||||
}
|
||||
|
||||
export interface PdfJsonTextElement {
|
||||
@ -52,6 +63,7 @@ export interface PdfJsonTextElement {
|
||||
textMatrix?: number[] | null;
|
||||
fillColor?: PdfJsonTextColor | null;
|
||||
strokeColor?: PdfJsonTextColor | null;
|
||||
fallbackUsed?: boolean | null;
|
||||
}
|
||||
|
||||
export interface PdfJsonImageElement {
|
||||
|
||||
@ -16,6 +16,48 @@ const MIN_CHAR_WIDTH_FACTOR = 0.35;
|
||||
const MAX_CHAR_WIDTH_FACTOR = 1.25;
|
||||
const EXTRA_GAP_RATIO = 0.8;
|
||||
|
||||
type FontMetrics = {
|
||||
unitsPerEm: number;
|
||||
ascent: number;
|
||||
descent: number;
|
||||
};
|
||||
|
||||
type FontMetricsMap = Map<string, FontMetrics>;
|
||||
|
||||
const countGraphemes = (text: string): number => {
|
||||
if (!text) {
|
||||
return 0;
|
||||
}
|
||||
return Array.from(text).length;
|
||||
};
|
||||
|
||||
const metricsFor = (metrics: FontMetricsMap | undefined, fontId?: string | null): FontMetrics | undefined => {
|
||||
if (!metrics || !fontId) {
|
||||
return undefined;
|
||||
}
|
||||
return metrics.get(fontId) ?? undefined;
|
||||
};
|
||||
|
||||
const buildFontMetrics = (document: PdfJsonDocument | null | undefined): FontMetricsMap => {
|
||||
const metrics: FontMetricsMap = new Map();
|
||||
document?.fonts?.forEach((font) => {
|
||||
if (!font) {
|
||||
return;
|
||||
}
|
||||
const unitsPerEm = font.unitsPerEm && font.unitsPerEm > 0 ? font.unitsPerEm : 1000;
|
||||
const ascent = font.ascent ?? unitsPerEm * 0.8;
|
||||
const descent = font.descent ?? -(unitsPerEm * 0.2);
|
||||
const metric: FontMetrics = { unitsPerEm, ascent, descent };
|
||||
if (font.id) {
|
||||
metrics.set(font.id, metric);
|
||||
}
|
||||
if (font.uid) {
|
||||
metrics.set(font.uid, metric);
|
||||
}
|
||||
});
|
||||
return metrics;
|
||||
};
|
||||
|
||||
export const valueOr = (value: number | null | undefined, fallback = 0): number => {
|
||||
if (value === null || value === undefined || Number.isNaN(value)) {
|
||||
return fallback;
|
||||
@ -47,37 +89,87 @@ const getX = (element: PdfJsonTextElement): number => {
|
||||
return valueOr(element.x);
|
||||
};
|
||||
|
||||
const getWidth = (element: PdfJsonTextElement): number => {
|
||||
const getWidth = (element: PdfJsonTextElement, metrics?: FontMetricsMap): number => {
|
||||
const width = valueOr(element.width, 0);
|
||||
if (width === 0 && element.text) {
|
||||
const fontSize = valueOr(element.fontSize, 12);
|
||||
return fontSize * Math.max(element.text.length * 0.45, 0.5);
|
||||
if (width > 0) {
|
||||
return width;
|
||||
}
|
||||
return width;
|
||||
|
||||
const text = element.text ?? '';
|
||||
const glyphCount = Math.max(1, countGraphemes(text));
|
||||
const spacingFallback = Math.max(
|
||||
valueOr(element.spaceWidth, 0),
|
||||
valueOr(element.wordSpacing, 0),
|
||||
valueOr(element.characterSpacing, 0),
|
||||
);
|
||||
|
||||
if (spacingFallback > 0 && text.trim().length === 0) {
|
||||
return spacingFallback;
|
||||
}
|
||||
|
||||
const fontSize = getFontSize(element);
|
||||
const fontMetrics = metricsFor(metrics, element.fontId);
|
||||
if (fontMetrics) {
|
||||
const unitsPerEm = fontMetrics.unitsPerEm > 0 ? fontMetrics.unitsPerEm : 1000;
|
||||
const ascentUnits = fontMetrics.ascent ?? unitsPerEm * 0.8;
|
||||
const descentUnits = Math.abs(fontMetrics.descent ?? -(unitsPerEm * 0.2));
|
||||
const combinedUnits = Math.max(unitsPerEm * 0.8, ascentUnits + descentUnits);
|
||||
const averageAdvanceUnits = Math.max(unitsPerEm * 0.5, combinedUnits / Math.max(1, glyphCount));
|
||||
const fallbackWidth = (averageAdvanceUnits / unitsPerEm) * glyphCount * fontSize;
|
||||
if (fallbackWidth > 0) {
|
||||
return fallbackWidth;
|
||||
}
|
||||
}
|
||||
|
||||
return fontSize * glyphCount * 0.5;
|
||||
};
|
||||
|
||||
const getFontSize = (element: PdfJsonTextElement): number => valueOr(element.fontMatrixSize ?? element.fontSize, 12);
|
||||
|
||||
const getHeight = (element: PdfJsonTextElement): number => {
|
||||
const height = valueOr(element.height);
|
||||
if (height === 0) {
|
||||
return getFontSize(element) * 1.05;
|
||||
const getHeight = (element: PdfJsonTextElement, metrics?: FontMetricsMap): number => {
|
||||
const height = valueOr(element.height, 0);
|
||||
if (height > 0) {
|
||||
return height;
|
||||
}
|
||||
return height;
|
||||
const fontSize = getFontSize(element);
|
||||
const fontMetrics = metricsFor(metrics, element.fontId);
|
||||
if (fontMetrics) {
|
||||
const unitsPerEm = fontMetrics.unitsPerEm > 0 ? fontMetrics.unitsPerEm : 1000;
|
||||
const ascentUnits = fontMetrics.ascent ?? unitsPerEm * 0.8;
|
||||
const descentUnits = Math.abs(fontMetrics.descent ?? -(unitsPerEm * 0.2));
|
||||
const totalUnits = Math.max(unitsPerEm, ascentUnits + descentUnits);
|
||||
if (totalUnits > 0) {
|
||||
return (totalUnits / unitsPerEm) * fontSize;
|
||||
}
|
||||
}
|
||||
return fontSize;
|
||||
};
|
||||
|
||||
const getElementBounds = (element: PdfJsonTextElement): BoundingBox => {
|
||||
const getElementBounds = (
|
||||
element: PdfJsonTextElement,
|
||||
metrics?: FontMetricsMap,
|
||||
): BoundingBox => {
|
||||
const left = getX(element);
|
||||
const width = getWidth(element);
|
||||
const width = getWidth(element, metrics);
|
||||
const baseline = getBaseline(element);
|
||||
const height = getHeight(element);
|
||||
// In PDF coordinates, baseline is where text sits
|
||||
// Typical typography: ~80% of height above baseline (ascenders), ~20% below (descenders)
|
||||
// Using codebase's inverted naming: bottom (visual top) > top (visual bottom)
|
||||
const ascent = height * 0.8;
|
||||
const descent = height * 0.2;
|
||||
const bottom = baseline + ascent; // Visual top of text
|
||||
const top = baseline - descent; // Visual bottom (includes descenders)
|
||||
const height = getHeight(element, metrics);
|
||||
|
||||
let ascentRatio = 0.8;
|
||||
let descentRatio = 0.2;
|
||||
const fontMetrics = metricsFor(metrics, element.fontId);
|
||||
if (fontMetrics) {
|
||||
const unitsPerEm = fontMetrics.unitsPerEm > 0 ? fontMetrics.unitsPerEm : 1000;
|
||||
const ascentUnits = fontMetrics.ascent ?? unitsPerEm * 0.8;
|
||||
const descentUnits = Math.abs(fontMetrics.descent ?? -(unitsPerEm * 0.2));
|
||||
const totalUnits = Math.max(unitsPerEm, ascentUnits + descentUnits);
|
||||
if (totalUnits > 0) {
|
||||
ascentRatio = ascentUnits / totalUnits;
|
||||
descentRatio = descentUnits / totalUnits;
|
||||
}
|
||||
}
|
||||
|
||||
const bottom = baseline + height * ascentRatio;
|
||||
const top = baseline - height * descentRatio;
|
||||
return {
|
||||
left,
|
||||
right: left + width,
|
||||
@ -114,8 +206,12 @@ const getSpacingHint = (element: PdfJsonTextElement): number => {
|
||||
return Math.max(characterSpacing, 0);
|
||||
};
|
||||
|
||||
const estimateCharWidth = (element: PdfJsonTextElement, avgFontSize: number): number => {
|
||||
const rawWidth = getWidth(element);
|
||||
const estimateCharWidth = (
|
||||
element: PdfJsonTextElement,
|
||||
avgFontSize: number,
|
||||
metrics?: FontMetricsMap,
|
||||
): number => {
|
||||
const rawWidth = getWidth(element, metrics);
|
||||
const minWidth = avgFontSize * MIN_CHAR_WIDTH_FACTOR;
|
||||
const maxWidth = avgFontSize * MAX_CHAR_WIDTH_FACTOR;
|
||||
return Math.min(Math.max(rawWidth, minWidth), maxWidth);
|
||||
@ -136,12 +232,16 @@ const mergeBounds = (bounds: BoundingBox[]): BoundingBox => {
|
||||
);
|
||||
};
|
||||
|
||||
const shouldInsertSpace = (prev: PdfJsonTextElement, current: PdfJsonTextElement): boolean => {
|
||||
const prevRight = getX(prev) + getWidth(prev);
|
||||
const shouldInsertSpace = (
|
||||
prev: PdfJsonTextElement,
|
||||
current: PdfJsonTextElement,
|
||||
metrics?: FontMetricsMap,
|
||||
): boolean => {
|
||||
const prevRight = getX(prev) + getWidth(prev, metrics);
|
||||
const trailingGap = Math.max(0, getX(current) - prevRight);
|
||||
const avgFontSize = (getFontSize(prev) + getFontSize(current)) / 2;
|
||||
const baselineAdvance = Math.max(0, getX(current) - getX(prev));
|
||||
const charWidthEstimate = estimateCharWidth(prev, avgFontSize);
|
||||
const charWidthEstimate = estimateCharWidth(prev, avgFontSize, metrics);
|
||||
const inferredGap = Math.max(0, baselineAdvance - charWidthEstimate);
|
||||
const spacingHint = Math.max(
|
||||
SPACE_MIN_GAP,
|
||||
@ -166,7 +266,7 @@ const shouldInsertSpace = (prev: PdfJsonTextElement, current: PdfJsonTextElement
|
||||
return false;
|
||||
};
|
||||
|
||||
const buildGroupText = (elements: PdfJsonTextElement[]): string => {
|
||||
const buildGroupText = (elements: PdfJsonTextElement[], metrics?: FontMetricsMap): string => {
|
||||
let result = '';
|
||||
elements.forEach((element, index) => {
|
||||
const value = element.text ?? '';
|
||||
@ -176,7 +276,7 @@ const buildGroupText = (elements: PdfJsonTextElement[]): string => {
|
||||
}
|
||||
|
||||
const previous = elements[index - 1];
|
||||
const needsSpace = shouldInsertSpace(previous, element);
|
||||
const needsSpace = shouldInsertSpace(previous, element, metrics);
|
||||
const startsWithWhitespace = /^\s/u.test(value);
|
||||
|
||||
if (needsSpace && !startsWithWhitespace) {
|
||||
@ -314,21 +414,24 @@ const getAnchorPoint = (element: PdfJsonTextElement): { x: number; y: number } =
|
||||
};
|
||||
};
|
||||
|
||||
const computeBaselineLength = (elements: PdfJsonTextElement[]): number =>
|
||||
elements.reduce((acc, current) => acc + getWidth(current), 0);
|
||||
const computeBaselineLength = (
|
||||
elements: PdfJsonTextElement[],
|
||||
metrics?: FontMetricsMap,
|
||||
): number => elements.reduce((acc, current) => acc + getWidth(current, metrics), 0);
|
||||
|
||||
const createGroup = (
|
||||
pageIndex: number,
|
||||
idSuffix: number,
|
||||
elements: PdfJsonTextElement[],
|
||||
metrics?: FontMetricsMap,
|
||||
): TextGroup => {
|
||||
const clones = elements.map(cloneTextElement);
|
||||
const originalClones = clones.map(cloneTextElement);
|
||||
const bounds = mergeBounds(elements.map(getElementBounds));
|
||||
const bounds = mergeBounds(elements.map((element) => getElementBounds(element, metrics)));
|
||||
const firstElement = elements[0];
|
||||
const rotation = computeGroupRotation(elements);
|
||||
const anchor = rotation !== null ? getAnchorPoint(firstElement) : null;
|
||||
const baselineLength = computeBaselineLength(elements);
|
||||
const baselineLength = computeBaselineLength(elements, metrics);
|
||||
|
||||
return {
|
||||
id: `${pageIndex}-${idSuffix}`,
|
||||
@ -343,13 +446,17 @@ const createGroup = (
|
||||
baselineLength,
|
||||
elements: clones,
|
||||
originalElements: originalClones,
|
||||
text: buildGroupText(elements),
|
||||
originalText: buildGroupText(elements),
|
||||
text: buildGroupText(elements, metrics),
|
||||
originalText: buildGroupText(elements, metrics),
|
||||
bounds,
|
||||
};
|
||||
};
|
||||
|
||||
export const groupPageTextElements = (page: PdfJsonPage | null | undefined, pageIndex: number): TextGroup[] => {
|
||||
export const groupPageTextElements = (
|
||||
page: PdfJsonPage | null | undefined,
|
||||
pageIndex: number,
|
||||
metrics?: FontMetricsMap,
|
||||
): TextGroup[] => {
|
||||
if (!page?.textElements || page.textElements.length === 0) {
|
||||
return [];
|
||||
}
|
||||
@ -393,7 +500,7 @@ export const groupPageTextElements = (page: PdfJsonPage | null | undefined, page
|
||||
}
|
||||
|
||||
const previous = currentBucket[currentBucket.length - 1];
|
||||
const gap = getX(element) - (getX(previous) + getWidth(previous));
|
||||
const gap = getX(element) - (getX(previous) + getWidth(previous, metrics));
|
||||
const avgFontSize = (getFontSize(previous) + getFontSize(element)) / 2;
|
||||
const splitThreshold = Math.max(SPACE_MIN_GAP, avgFontSize * GAP_FACTOR);
|
||||
|
||||
@ -412,7 +519,7 @@ export const groupPageTextElements = (page: PdfJsonPage | null | undefined, page
|
||||
}
|
||||
|
||||
if (shouldSplit) {
|
||||
groups.push(createGroup(pageIndex, groupCounter, currentBucket));
|
||||
groups.push(createGroup(pageIndex, groupCounter, currentBucket, metrics));
|
||||
groupCounter += 1;
|
||||
currentBucket = [element];
|
||||
} else {
|
||||
@ -421,7 +528,7 @@ export const groupPageTextElements = (page: PdfJsonPage | null | undefined, page
|
||||
});
|
||||
|
||||
if (currentBucket.length > 0) {
|
||||
groups.push(createGroup(pageIndex, groupCounter, currentBucket));
|
||||
groups.push(createGroup(pageIndex, groupCounter, currentBucket, metrics));
|
||||
groupCounter += 1;
|
||||
}
|
||||
});
|
||||
@ -431,7 +538,8 @@ export const groupPageTextElements = (page: PdfJsonPage | null | undefined, page
|
||||
|
||||
export const groupDocumentText = (document: PdfJsonDocument | null | undefined): TextGroup[][] => {
|
||||
const pages = document?.pages ?? [];
|
||||
return pages.map((page, index) => groupPageTextElements(page, index));
|
||||
const metrics = buildFontMetrics(document);
|
||||
return pages.map((page, index) => groupPageTextElements(page, index, metrics));
|
||||
};
|
||||
|
||||
export const extractPageImages = (
|
||||
|
||||
492
scripts/convert_cff_to_ttf.py
Normal file
492
scripts/convert_cff_to_ttf.py
Normal file
@ -0,0 +1,492 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Wrap raw CFF/Type1C data (extracted from PDFs) as OpenType-CFF for web compatibility.
|
||||
Builds proper Unicode cmap from PDF ToUnicode data.
|
||||
"""
|
||||
import sys
|
||||
import re
|
||||
from pathlib import Path
|
||||
from io import BytesIO
|
||||
from fontTools.ttLib import TTFont, newTable
|
||||
from fontTools.cffLib import CFFFontSet
|
||||
from fontTools.ttLib.tables._c_m_a_p import cmap_format_4, cmap_format_12
|
||||
from fontTools.ttLib.tables._n_a_m_e import NameRecord
|
||||
from fontTools.ttLib.tables.O_S_2f_2 import Panose
|
||||
|
||||
def parse_unicode_mapping(mapping_path):
    """
    Parse a Unicode mapping file into a GID -> Unicode codepoint dict.

    The file is one of:
      * JSON of the form ``{"isCID": true, "entries": [{"gid": N, "unicode": N}, ...]}``
        (CID fonts with a complete CharCode->CID->GID->Unicode mapping), or
      * a raw PDF ToUnicode CMap (non-CID fonts, where char code == GID).

    Returns:
        dict[int, int]: GID -> Unicode codepoint; empty dict on any parse failure.
    """
    try:
        with open(mapping_path, 'rb') as f:
            data = f.read().decode('utf-8', errors='ignore')

        # Try parsing as JSON first (CID font with complete mapping)
        if data.strip().startswith('{'):
            import json
            try:
                mapping_data = json.loads(data)
                if mapping_data.get('isCID'):
                    # Build GID -> Unicode mapping from entries; unicode == 0
                    # means "no mapping" and is skipped.
                    gid_to_unicode = {}
                    for entry in mapping_data.get('entries', []):
                        gid = entry['gid']
                        unicode_val = entry['unicode']
                        if unicode_val > 0:
                            gid_to_unicode[gid] = unicode_val
                    print(f"Parsed JSON mapping: {len(gid_to_unicode)} GID→Unicode entries", file=sys.stderr)
                    return gid_to_unicode
            except json.JSONDecodeError:
                pass  # Not JSON after all; fall through to CMap parsing.

        # Fall back to parsing a raw ToUnicode CMap (non-CID fonts).
        # For non-CID fonts the CID/GID is the same as the char code.
        gid_to_unicode = {}

        pair_pattern = r'<([0-9A-Fa-f]+)>\s*<([0-9A-Fa-f]+)>'
        triple_pattern = r'<([0-9A-Fa-f]+)>\s*<([0-9A-Fa-f]+)>\s*<([0-9A-Fa-f]+)>'

        # Scope the patterns to their CMap sections.  Scanning the whole file
        # for hex pairs mis-reads bfrange triples: <start> <end> of a range,
        # or the <unicode> of one range followed by the <start> of the next,
        # would be picked up as bogus bfchar pairs and pollute the mapping.
        bfchar_sections = re.findall(r'beginbfchar(.*?)endbfchar', data, re.DOTALL)
        bfrange_sections = re.findall(r'beginbfrange(.*?)endbfrange', data, re.DOTALL)

        if bfchar_sections or bfrange_sections:
            # bfchar entries: <srcCode> <dstUnicode>
            for section in bfchar_sections:
                for match in re.finditer(pair_pattern, section):
                    gid = int(match.group(1), 16)  # For non-CID, char code == GID
                    unicode_val = int(match.group(2), 16)
                    if unicode_val > 0:
                        gid_to_unicode[gid] = unicode_val
            # bfrange entries: <srcLo> <srcHi> <dstUnicodeStart>
            # NOTE(review): the array form "<lo> <hi> [<u1> <u2> ...]" is not
            # handled here — confirm whether producers emit it.
            for section in bfrange_sections:
                for match in re.finditer(triple_pattern, section):
                    start_gid = int(match.group(1), 16)
                    end_gid = int(match.group(2), 16)
                    start_unicode = int(match.group(3), 16)
                    for i, gid in enumerate(range(start_gid, end_gid + 1)):
                        unicode_val = start_unicode + i
                        if unicode_val > 0:
                            gid_to_unicode[gid] = unicode_val
        else:
            # No begin/end markers present: degrade to the whole-text scan so
            # malformed-but-salvageable inputs still yield something.  Triples
            # are applied after pairs so range data wins on conflicts.
            for match in re.finditer(pair_pattern, data):
                gid = int(match.group(1), 16)
                unicode_val = int(match.group(2), 16)
                if unicode_val > 0:
                    gid_to_unicode[gid] = unicode_val
            for match in re.finditer(triple_pattern, data):
                start_gid = int(match.group(1), 16)
                end_gid = int(match.group(2), 16)
                start_unicode = int(match.group(3), 16)
                for i, gid in enumerate(range(start_gid, end_gid + 1)):
                    unicode_val = start_unicode + i
                    if unicode_val > 0:
                        gid_to_unicode[gid] = unicode_val

        print(f"Parsed ToUnicode CMap: {len(gid_to_unicode)} mappings", file=sys.stderr)
        return gid_to_unicode

    except Exception as e:
        print(f"Warning: Failed to parse Unicode mapping: {e}", file=sys.stderr)
        return {}
|
||||
|
||||
def wrap_cff_as_otf(input_path, output_path, tounicode_path=None):
    """
    Wrap raw CFF data (from a PDF font stream) as an OpenType-CFF ('OTTO') font.

    Builds the minimal required SFNT tables (head, hhea, hmtx, maxp, cmap,
    OS/2, name, post) around the untouched CFF outlines, deriving glyph
    metrics from the charstrings and the Unicode cmap from an optional
    PDF ToUnicode mapping (see parse_unicode_mapping).

    Args:
        input_path: Path to input CFF data file.
        output_path: Path to output OTF font.
        tounicode_path: Optional path to ToUnicode CMap / JSON mapping file.

    Returns:
        True if successful, False otherwise (errors are logged to stderr).
    """
    try:
        # Read raw CFF data
        with open(input_path, 'rb') as f:
            cff_data = f.read()

        # Parse raw CFF data
        cff_fontset = CFFFontSet()
        cff_fontset.decompile(BytesIO(cff_data), None)

        # Get the first (and usually only) font in the CFF set
        if len(cff_fontset.fontNames) == 0:
            print("ERROR: No fonts found in CFF data", file=sys.stderr)
            return False

        cff_font = cff_fontset[cff_fontset.fontNames[0]]

        # Parse Unicode mapping (JSON or raw ToUnicode CMap) if provided
        gid_to_unicode = {}
        if tounicode_path:
            gid_to_unicode = parse_unicode_mapping(tounicode_path)

        # Create a new OTF font
        otf = TTFont(sfntVersion='OTTO')  # 'OTTO' = CFF-flavored OpenType

        # Get glyph names: prefer the CFF charset, with .notdef forced first.
        if hasattr(cff_font, 'charset') and cff_font.charset is not None:
            glyph_order = ['.notdef'] + [name for name in cff_font.charset if name != '.notdef']
        else:
            # Fallback to CharStrings keys
            charstrings = cff_font.CharStrings
            glyph_order = ['.notdef'] + [name for name in charstrings.keys() if name != '.notdef']

        otf.setGlyphOrder(glyph_order)

        # === Add CFF table (the actual font outlines) ===
        cff_table = newTable('CFF ')
        cff_table.cff = cff_fontset
        otf['CFF '] = cff_table

        # === Calculate metrics from CFF ===
        charstrings = cff_font.CharStrings

        # Get defaults from CFF Private dict
        private_dict = getattr(cff_font, 'Private', None)
        default_width = getattr(private_dict, 'defaultWidthX', 500) if private_dict else 500

        # Calculate bounding box, widths, and LSBs.  Seed the bbox with a
        # conservative default so a font with no outline data still gets a
        # sane, non-degenerate box.
        x_min = 0
        y_min = -200
        x_max = 1000
        y_max = 800
        max_advance = 0
        min_lsb = 0
        min_rsb = 0
        max_extent = 0

        widths = {}
        lsbs = {}

        for glyph_name in glyph_order:
            lsb = 0
            width = int(default_width)

            if glyph_name in charstrings:
                try:
                    cs = charstrings[glyph_name]

                    # Get width from charstring
                    if hasattr(cs, 'width'):
                        width = int(cs.width)

                    # Calculate bounds for LSB and bbox
                    try:
                        bounds = cs.calcBounds(None)
                        if bounds:
                            glyph_xmin = int(bounds[0])
                            glyph_ymin = int(bounds[1])
                            glyph_xmax = int(bounds[2])
                            glyph_ymax = int(bounds[3])

                            lsb = glyph_xmin
                            rsb = width - glyph_xmax
                            extent = lsb + glyph_xmax

                            # Update global bounds
                            x_min = min(x_min, glyph_xmin)
                            y_min = min(y_min, glyph_ymin)
                            x_max = max(x_max, glyph_xmax)
                            y_max = max(y_max, glyph_ymax)

                            # Update hhea metrics
                            min_lsb = min(min_lsb, lsb)
                            min_rsb = min(min_rsb, rsb)
                            max_extent = max(max_extent, extent)
                    except Exception:
                        pass  # Some glyphs may not have outlines
                except Exception:
                    pass  # Broken charstring: use defaults

            widths[glyph_name] = width
            lsbs[glyph_name] = lsb
            max_advance = max(max_advance, width)

        if max_advance == 0:
            max_advance = 1000
        if max_extent == 0:
            max_extent = x_max

        units_per_em = 1000  # Standard for Type1/CFF

        # === Create head table ===
        head = newTable('head')
        head.tableVersion = 1.0
        head.fontRevision = 1.0
        head.checkSumAdjustment = 0
        head.magicNumber = 0x5F0F3CF5
        head.flags = 0x000B  # Baseline at y=0, LSB at x=0, integer PPEM
        head.unitsPerEm = units_per_em
        head.created = 3600000000
        head.modified = 3600000000
        head.xMin = x_min
        head.yMin = y_min
        head.xMax = x_max
        head.yMax = y_max
        head.macStyle = 0
        head.fontDirectionHint = 2
        head.indexToLocFormat = 0
        head.glyphDataFormat = 0
        head.lowestRecPPEM = 8
        otf['head'] = head

        # === Create hhea table with correct metrics ===
        hhea = newTable('hhea')
        hhea.tableVersion = 0x00010000
        hhea.ascent = max(y_max, 800)
        hhea.descent = min(y_min, -200)
        hhea.lineGap = 0
        hhea.advanceWidthMax = max_advance
        hhea.minLeftSideBearing = min_lsb
        hhea.minRightSideBearing = min_rsb
        hhea.xMaxExtent = max_extent
        hhea.caretSlopeRise = 1
        hhea.caretSlopeRun = 0
        hhea.caretOffset = 0
        hhea.reserved0 = 0
        hhea.reserved1 = 0
        hhea.reserved2 = 0
        hhea.reserved3 = 0
        hhea.metricDataFormat = 0
        hhea.numberOfHMetrics = len(glyph_order)
        otf['hhea'] = hhea

        # === Create hmtx table with correct LSBs ===
        hmtx = newTable('hmtx')
        hmtx.metrics = {}
        for glyph_name in glyph_order:
            hmtx.metrics[glyph_name] = (widths.get(glyph_name, default_width), lsbs.get(glyph_name, 0))
        otf['hmtx'] = hmtx

        # === Create maxp table (simpler for CFF) ===
        maxp = newTable('maxp')
        maxp.tableVersion = 0x00005000  # CFF version (0.5)
        maxp.numGlyphs = len(glyph_order)
        otf['maxp'] = maxp

        # === Build Unicode cmap from GID→Unicode mapping ===
        unicode_to_glyph = {}

        if gid_to_unicode:
            # Debug: Show first few glyph names to understand naming convention
            sample_glyphs = glyph_order[:min(10, len(glyph_order))]
            print(f"Sample glyph names: {sample_glyphs}", file=sys.stderr)

            # Debug: Show which GIDs we have mappings for
            sample_gids = sorted(gid_to_unicode.keys())[:10]
            print(f"Sample GIDs from mapping: {sample_gids}", file=sys.stderr)

            # For CID fonts: glyph names are "cid00123" (5-digit zero-padded)
            # For non-CID fonts: glyph names vary but GID == array index
            is_cid_font = any(gn.startswith('cid') for gn in glyph_order[1:6])  # Check first few non-.notdef glyphs

            for gid, unicode_val in gid_to_unicode.items():
                if unicode_val > 0:
                    if is_cid_font:
                        # Build glyph name as cidNNNNN (5 digits, zero-padded)
                        glyph_name = f"cid{gid:05d}"
                        # Verify this glyph exists in glyph_order
                        if glyph_name in glyph_order:
                            unicode_to_glyph[unicode_val] = glyph_name
                        else:
                            # Try without padding (some fonts use "cid123" not "cid00123")
                            glyph_name_alt = f"cid{gid}"
                            if glyph_name_alt in glyph_order:
                                unicode_to_glyph[unicode_val] = glyph_name_alt
                    else:
                        # Non-CID font: GID is array index
                        if 0 <= gid < len(glyph_order):
                            glyph_name = glyph_order[gid]
                            unicode_to_glyph[unicode_val] = glyph_name

        print(f"Mapped {len(unicode_to_glyph)} Unicode codepoints (isCID={is_cid_font if gid_to_unicode else 'unknown'})", file=sys.stderr)

        # Also try to map from glyph names (uni0041 → U+0041); explicit
        # ToUnicode data above takes precedence over name-derived codepoints.
        for glyph_name in glyph_order:
            if glyph_name.startswith('uni') and len(glyph_name) == 7:
                try:
                    unicode_val = int(glyph_name[3:], 16)
                    if unicode_val not in unicode_to_glyph:
                        unicode_to_glyph[unicode_val] = glyph_name
                except Exception:
                    pass  # Not actually a uniXXXX name
            elif glyph_name.startswith('u') and len(glyph_name) >= 5:
                try:
                    unicode_val = int(glyph_name[1:], 16)
                    if unicode_val not in unicode_to_glyph:
                        unicode_to_glyph[unicode_val] = glyph_name
                except Exception:
                    pass  # Not actually a uXXXX... name

        # === Create cmap table ===
        cmap = newTable('cmap')
        cmap.tableVersion = 0
        cmap_tables = []

        # Windows Unicode BMP (format 4) - required
        cmap4_win = cmap_format_4(4)
        cmap4_win.platformID = 3  # Windows
        cmap4_win.platEncID = 1  # Unicode BMP
        cmap4_win.language = 0
        cmap4_win.cmap = {cp: gn for cp, gn in unicode_to_glyph.items() if cp <= 0xFFFF}
        cmap_tables.append(cmap4_win)

        # Windows Unicode UCS-4 (format 12) - for >BMP
        if any(cp > 0xFFFF for cp in unicode_to_glyph):
            cmap12_win = cmap_format_12(12)
            cmap12_win.platformID = 3  # Windows
            cmap12_win.platEncID = 10  # Unicode UCS-4
            cmap12_win.language = 0
            cmap12_win.cmap = dict(unicode_to_glyph)
            cmap_tables.append(cmap12_win)

        # Mac Unicode (format 4) - for compatibility
        cmap4_mac = cmap_format_4(4)
        cmap4_mac.platformID = 1  # Mac
        cmap4_mac.platEncID = 0  # Roman
        cmap4_mac.language = 0
        cmap4_mac.cmap = {cp: gn for cp, gn in unicode_to_glyph.items() if cp <= 0xFFFF}
        cmap_tables.append(cmap4_mac)

        cmap.tables = [t for t in cmap_tables if t.cmap] or [cmap4_win]  # Ensure at least one
        otf['cmap'] = cmap

        print(f"Built cmap with {len(unicode_to_glyph)} Unicode mappings", file=sys.stderr)

        # === Create OS/2 table with correct metrics ===
        os2 = newTable('OS/2')
        os2.version = 4
        os2.xAvgCharWidth = int(sum(widths.values()) / len(widths)) if widths else 500
        os2.usWeightClass = 400  # Normal
        os2.usWidthClass = 5  # Medium
        os2.fsType = 0  # Installable embedding
        os2.ySubscriptXSize = 650
        os2.ySubscriptYSize = 600
        os2.ySubscriptXOffset = 0
        os2.ySubscriptYOffset = 75
        os2.ySuperscriptXSize = 650
        os2.ySuperscriptYSize = 600
        os2.ySuperscriptXOffset = 0
        os2.ySuperscriptYOffset = 350
        os2.yStrikeoutSize = 50
        os2.yStrikeoutPosition = 300
        os2.sFamilyClass = 0

        # PANOSE - use proper object structure; all zeros = "any"
        os2.panose = Panose()
        os2.panose.bFamilyType = 0
        os2.panose.bSerifStyle = 0
        os2.panose.bWeight = 0
        os2.panose.bProportion = 0
        os2.panose.bContrast = 0
        os2.panose.bStrokeVariation = 0
        os2.panose.bArmStyle = 0
        os2.panose.bLetterForm = 0
        os2.panose.bMidline = 0
        os2.panose.bXHeight = 0

        os2.ulUnicodeRange1 = 0
        os2.ulUnicodeRange2 = 0
        os2.ulUnicodeRange3 = 0
        os2.ulUnicodeRange4 = 0
        os2.achVendID = 'SPDF'
        os2.fsSelection = 0x0040  # REGULAR bit

        # Set character index range from actual cmap
        if unicode_to_glyph:
            codepoints = sorted(unicode_to_glyph.keys())
            os2.usFirstCharIndex = codepoints[0]
            os2.usLastCharIndex = codepoints[-1]
        else:
            os2.usFirstCharIndex = 0x20  # space
            os2.usLastCharIndex = 0x7E  # tilde

        # Typo metrics match hhea
        os2.sTypoAscender = hhea.ascent
        os2.sTypoDescender = hhea.descent
        os2.sTypoLineGap = hhea.lineGap

        # Windows metrics (positive values, cover bbox)
        os2.usWinAscent = max(0, y_max)
        os2.usWinDescent = max(0, -y_min)

        os2.ulCodePageRange1 = 0x00000001  # Latin 1
        os2.ulCodePageRange2 = 0
        os2.sxHeight = 500
        os2.sCapHeight = 700
        os2.usDefaultChar = 0
        os2.usBreakChar = 32
        os2.usMaxContext = 0
        otf['OS/2'] = os2

        # === Create name table with Windows and Mac records ===
        name = newTable('name')
        name.names = []

        # Get font name from CFF if available
        font_name = cff_fontset.fontNames[0] if cff_fontset.fontNames else "Converted"

        name_strings = {
            1: font_name,  # Font Family
            2: "Regular",  # Subfamily
            3: f"Stirling-PDF: {font_name}",  # Unique ID
            4: font_name,  # Full Name
            5: "Version 1.0",  # Version
            6: font_name.replace(' ', '-'),  # PostScript Name
        }

        # Add both Windows and Mac name records
        for name_id, value in name_strings.items():
            # Windows (platform 3, encoding 1, language 0x0409 = en-US)
            rec_win = NameRecord()
            rec_win.nameID = name_id
            rec_win.platformID = 3
            rec_win.platEncID = 1
            rec_win.langID = 0x0409
            rec_win.string = value
            name.names.append(rec_win)

            # Mac (platform 1, encoding 0, language 0)
            rec_mac = NameRecord()
            rec_mac.nameID = name_id
            rec_mac.platformID = 1
            rec_mac.platEncID = 0
            rec_mac.langID = 0
            rec_mac.string = value
            name.names.append(rec_mac)

        otf['name'] = name

        # === Create post table (format 3.0 for smaller web fonts) ===
        post = newTable('post')
        post.formatType = 3.0  # No glyph names (smaller, web-optimized)
        post.italicAngle = 0
        post.underlinePosition = -100
        post.underlineThickness = 50
        post.isFixedPitch = 0
        post.minMemType42 = 0
        post.maxMemType42 = 0
        post.minMemType1 = 0
        post.maxMemType1 = 0
        otf['post'] = post

        # Save the OTF font
        otf.save(output_path)
        otf.close()

        return True

    except Exception as e:
        print(f"ERROR: Conversion failed: {str(e)}", file=sys.stderr)
        import traceback
        traceback.print_exc(file=sys.stderr)
        return False
|
||||
|
||||
def main():
    """CLI entry point: validate arguments, then run the CFF->OTF wrapper.

    Usage: convert_cff_to_ttf.py <input.cff> <output.otf> [tounicode.cmap]
    Exits 0 on success, 1 on bad arguments or conversion failure.
    """
    argv = sys.argv
    if len(argv) < 3:
        print("Usage: convert_cff_to_ttf.py <input.cff> <output.otf> [tounicode.cmap]", file=sys.stderr)
        sys.exit(1)

    input_path = Path(argv[1])
    output_path = Path(argv[2])
    tounicode_path = Path(argv[3]) if len(argv) > 3 else None

    # A missing input file is fatal.
    if not input_path.exists():
        print(f"ERROR: Input file not found: {input_path}", file=sys.stderr)
        sys.exit(1)

    # A missing ToUnicode file is not: warn and proceed without a mapping.
    if tounicode_path and not tounicode_path.exists():
        print(f"Warning: ToUnicode file not found: {tounicode_path}", file=sys.stderr)
        tounicode_path = None

    tounicode_arg = str(tounicode_path) if tounicode_path else None
    success = wrap_cff_as_otf(str(input_path), str(output_path), tounicode_arg)
    sys.exit(0 if success else 1)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Loading…
Reference in New Issue
Block a user