mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-11-16 01:21:16 +01:00
fix issues with type0
This commit is contained in:
parent
6db66f1f1a
commit
f645eaff18
BIN
app/core/src/main/resources/static/pdfjs-legacy/standard_fonts/LiberationMono-Regular.ttf
vendored
Normal file
BIN
app/core/src/main/resources/static/pdfjs-legacy/standard_fonts/LiberationMono-Regular.ttf
vendored
Normal file
Binary file not shown.
BIN
app/core/src/main/resources/static/pdfjs-legacy/standard_fonts/LiberationSerif-Regular.ttf
vendored
Normal file
BIN
app/core/src/main/resources/static/pdfjs-legacy/standard_fonts/LiberationSerif-Regular.ttf
vendored
Normal file
Binary file not shown.
@ -311,13 +311,17 @@ public class PdfJsonConversionService {
|
|||||||
|
|
||||||
try (PDDocument document = pdfDocumentFactory.load(workingPath, true)) {
|
try (PDDocument document = pdfDocumentFactory.load(workingPath, true)) {
|
||||||
int totalPages = document.getNumberOfPages();
|
int totalPages = document.getNumberOfPages();
|
||||||
boolean useLazyImages = totalPages > 5 && jobId != null;
|
// Only use lazy images for real async jobs where client can access the cache
|
||||||
|
// Synchronous calls with synthetic jobId should do full extraction
|
||||||
|
boolean useLazyImages = totalPages > 5 && isRealJobId;
|
||||||
Map<COSBase, FontModelCacheEntry> fontCache = new IdentityHashMap<>();
|
Map<COSBase, FontModelCacheEntry> fontCache = new IdentityHashMap<>();
|
||||||
Map<COSBase, EncodedImage> imageCache = new IdentityHashMap<>();
|
Map<COSBase, EncodedImage> imageCache = new IdentityHashMap<>();
|
||||||
log.debug(
|
log.debug(
|
||||||
"Converting PDF to JSON ({} pages) - {} mode",
|
"Converting PDF to JSON ({} pages) - {} mode (jobId: {}, isRealJobId: {})",
|
||||||
totalPages,
|
totalPages,
|
||||||
useLazyImages ? "lazy image" : "standard");
|
useLazyImages ? "lazy image" : "standard",
|
||||||
|
jobId,
|
||||||
|
isRealJobId);
|
||||||
Map<String, PdfJsonFont> fonts = new LinkedHashMap<>();
|
Map<String, PdfJsonFont> fonts = new LinkedHashMap<>();
|
||||||
Map<Integer, List<PdfJsonTextElement>> textByPage = new LinkedHashMap<>();
|
Map<Integer, List<PdfJsonTextElement>> textByPage = new LinkedHashMap<>();
|
||||||
Map<Integer, Map<PDFont, String>> pageFontResources = new HashMap<>();
|
Map<Integer, Map<PDFont, String>> pageFontResources = new HashMap<>();
|
||||||
@ -327,7 +331,8 @@ public class PdfJsonConversionService {
|
|||||||
int pageNumber = 1;
|
int pageNumber = 1;
|
||||||
for (PDPage page : document.getPages()) {
|
for (PDPage page : document.getPages()) {
|
||||||
Map<PDFont, String> resourceMap =
|
Map<PDFont, String> resourceMap =
|
||||||
collectFontsForPage(document, page, pageNumber, fonts, fontCache, jobId);
|
collectFontsForPage(
|
||||||
|
document, page, pageNumber, fonts, fontCache, jobId);
|
||||||
pageFontResources.put(pageNumber, resourceMap);
|
pageFontResources.put(pageNumber, resourceMap);
|
||||||
log.debug(
|
log.debug(
|
||||||
"PDF→JSON: collected {} font resources on page {}",
|
"PDF→JSON: collected {} font resources on page {}",
|
||||||
@ -444,8 +449,9 @@ public class PdfJsonConversionService {
|
|||||||
byte[] result = objectMapper.writeValueAsBytes(pdfJson);
|
byte[] result = objectMapper.writeValueAsBytes(pdfJson);
|
||||||
progress.accept(PdfJsonConversionProgress.complete());
|
progress.accept(PdfJsonConversionProgress.complete());
|
||||||
|
|
||||||
// If document wasn't cached, clear Type3 cache entries immediately
|
// Clear Type3 cache entries immediately for non-cached conversions
|
||||||
// (jobId is always set now, either from request context or synthetic)
|
// Cached conversions (useLazyImages=true) are cleaned when cache expires
|
||||||
|
// Synchronous conversions always clear immediately since they don't use lazy mode
|
||||||
if (!useLazyImages) {
|
if (!useLazyImages) {
|
||||||
clearType3CacheEntriesForJob(jobId);
|
clearType3CacheEntriesForJob(jobId);
|
||||||
}
|
}
|
||||||
@ -718,7 +724,8 @@ public class PdfJsonConversionService {
|
|||||||
mapping.put(font, fontId);
|
mapping.put(font, fontId);
|
||||||
String key = buildFontKey(jobId, pageNumber, fontId);
|
String key = buildFontKey(jobId, pageNumber, fontId);
|
||||||
if (!fonts.containsKey(key)) {
|
if (!fonts.containsKey(key)) {
|
||||||
fonts.put(key, buildFontModel(document, font, fontId, pageNumber, fontCache, jobId));
|
fonts.put(
|
||||||
|
key, buildFontModel(document, font, fontId, pageNumber, fontCache, jobId));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -873,15 +880,13 @@ public class PdfJsonConversionService {
|
|||||||
|| hasPayload(font.getWebProgram())
|
|| hasPayload(font.getWebProgram())
|
||||||
|| hasPayload(font.getProgram());
|
|| hasPayload(font.getProgram());
|
||||||
|
|
||||||
// Keep cosDictionary for TrueType and Type0 fonts even with usable program
|
// Only clear cosDictionary for Type3 fonts (which have inline content streams)
|
||||||
// Subsetted fonts need the ToUnicode CMap from the original dictionary
|
// All other font types may need ToUnicode CMap or encoding from the dictionary
|
||||||
|
// Conservative approach: better to keep extra data than lose encoding info
|
||||||
String subtype = font.getSubtype();
|
String subtype = font.getSubtype();
|
||||||
boolean needsCosDictionary =
|
boolean isType3 = subtype != null && subtype.equalsIgnoreCase("Type3");
|
||||||
subtype != null
|
|
||||||
&& (subtype.equalsIgnoreCase("TrueType")
|
|
||||||
|| subtype.equalsIgnoreCase("Type0"));
|
|
||||||
|
|
||||||
if (hasUsableProgram && !needsCosDictionary) {
|
if (hasUsableProgram && isType3) {
|
||||||
font.setCosDictionary(null);
|
font.setCosDictionary(null);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1380,9 +1385,20 @@ public class PdfJsonConversionService {
|
|||||||
String key = buildFontKey(null, -1, effectiveId);
|
String key = buildFontKey(null, -1, effectiveId);
|
||||||
PDFont font = fontMap.get(key);
|
PDFont font = fontMap.get(key);
|
||||||
if (font != null) {
|
if (font != null) {
|
||||||
|
log.debug(
|
||||||
|
"[FALLBACK-DEBUG] Reusing cached fallback font {} (key: {})", effectiveId, key);
|
||||||
return font;
|
return font;
|
||||||
}
|
}
|
||||||
|
log.info(
|
||||||
|
"[FALLBACK-DEBUG] Loading fallback font {} (key: {}) via fallbackFontService",
|
||||||
|
effectiveId,
|
||||||
|
key);
|
||||||
PDFont loaded = fallbackFontService.loadFallbackPdfFont(document, effectiveId);
|
PDFont loaded = fallbackFontService.loadFallbackPdfFont(document, effectiveId);
|
||||||
|
log.info(
|
||||||
|
"[FALLBACK-DEBUG] Loaded fallback font {} - PDFont class: {}, name: {}",
|
||||||
|
effectiveId,
|
||||||
|
loaded.getClass().getSimpleName(),
|
||||||
|
loaded.getName());
|
||||||
fontMap.put(key, loaded);
|
fontMap.put(key, loaded);
|
||||||
if (fontModels != null
|
if (fontModels != null
|
||||||
&& fontModels.stream().noneMatch(f -> effectiveId.equals(f.getId()))) {
|
&& fontModels.stream().noneMatch(f -> effectiveId.equals(f.getId()))) {
|
||||||
@ -2561,10 +2577,16 @@ public class PdfJsonConversionService {
|
|||||||
&& runFontModel.getType3Glyphs() != null
|
&& runFontModel.getType3Glyphs() != null
|
||||||
&& !runFontModel.getType3Glyphs().isEmpty();
|
&& !runFontModel.getType3Glyphs().isEmpty();
|
||||||
|
|
||||||
if (isNormalizedType3) {
|
// For fonts with proper Unicode mappings, let PDFBox handle encoding
|
||||||
// For normalized Type3 fonts, use original text directly
|
// This includes: normalized Type3 fonts, PDType0Font (composite fonts)
|
||||||
// The font has proper Unicode mappings, so PDFBox can encode it
|
boolean useDirectText =
|
||||||
// correctly
|
isNormalizedType3
|
||||||
|
|| run.font()
|
||||||
|
instanceof
|
||||||
|
org.apache.pdfbox.pdmodel.font.PDType0Font;
|
||||||
|
|
||||||
|
if (useDirectText) {
|
||||||
|
// Pass text directly - PDFBox handles encoding internally
|
||||||
contentStream.showText(run.text());
|
contentStream.showText(run.text());
|
||||||
} else {
|
} else {
|
||||||
// For actual Type3 fonts and other fonts, encode manually
|
// For actual Type3 fonts and other fonts, encode manually
|
||||||
@ -2582,6 +2604,14 @@ public class PdfJsonConversionService {
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
try {
|
try {
|
||||||
|
log.debug(
|
||||||
|
"[ENCODE-DEBUG] Encoding text '{}' with font {} (fontId={}, runFontModel={})",
|
||||||
|
run.text(),
|
||||||
|
run.font().getName(),
|
||||||
|
run.fontId(),
|
||||||
|
runFontModel != null
|
||||||
|
? runFontModel.getId()
|
||||||
|
: "null");
|
||||||
encoded =
|
encoded =
|
||||||
encodeTextWithFont(
|
encodeTextWithFont(
|
||||||
run.font(),
|
run.font(),
|
||||||
@ -2590,9 +2620,13 @@ public class PdfJsonConversionService {
|
|||||||
run.charCodes());
|
run.charCodes());
|
||||||
} catch (IOException ex) {
|
} catch (IOException ex) {
|
||||||
log.warn(
|
log.warn(
|
||||||
"Failed to encode text '{}' with font {} on page {}: {}",
|
"Failed to encode text '{}' with font {} (fontId={}, runFontModel={}) on page {}: {}",
|
||||||
run.text(),
|
run.text(),
|
||||||
run.font().getName(),
|
run.font().getName(),
|
||||||
|
run.fontId(),
|
||||||
|
runFontModel != null
|
||||||
|
? runFontModel.getId()
|
||||||
|
: "null",
|
||||||
pageNumber,
|
pageNumber,
|
||||||
ex.getMessage());
|
ex.getMessage());
|
||||||
continue;
|
continue;
|
||||||
@ -2725,7 +2759,11 @@ public class PdfJsonConversionService {
|
|||||||
}
|
}
|
||||||
if (targetFont == null || !fallbackFontService.canEncode(targetFont, glyph)) {
|
if (targetFont == null || !fallbackFontService.canEncode(targetFont, glyph)) {
|
||||||
fallbackApplied = true;
|
fallbackApplied = true;
|
||||||
String fallbackId = fallbackFontService.resolveFallbackFontId(codePoint);
|
// Try to match fallback font to original font family for visual consistency
|
||||||
|
String originalFontName =
|
||||||
|
baseFontModel != null ? baseFontModel.getBaseName() : null;
|
||||||
|
String fallbackId =
|
||||||
|
fallbackFontService.resolveFallbackFontId(originalFontName, codePoint);
|
||||||
targetFont = ensureFallbackFont(document, fontMap, fontModels, fallbackId);
|
targetFont = ensureFallbackFont(document, fontMap, fontModels, fallbackId);
|
||||||
targetFontId = fallbackId != null ? fallbackId : FALLBACK_FONT_ID;
|
targetFontId = fallbackId != null ? fallbackId : FALLBACK_FONT_ID;
|
||||||
if (targetFont == null || !fallbackFontService.canEncode(targetFont, glyph)) {
|
if (targetFont == null || !fallbackFontService.canEncode(targetFont, glyph)) {
|
||||||
@ -3335,7 +3373,8 @@ public class PdfJsonConversionService {
|
|||||||
// or return null to trigger fallback font
|
// or return null to trigger fallback font
|
||||||
} else if (!isType3Font || fontModel == null) {
|
} else if (!isType3Font || fontModel == null) {
|
||||||
// For non-Type3 fonts without Type3 metadata, use standard encoding
|
// For non-Type3 fonts without Type3 metadata, use standard encoding
|
||||||
return sanitizeEncoded(font.encode(text));
|
byte[] encoded = font.encode(text);
|
||||||
|
return sanitizeEncoded(encoded);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Type3 glyph mapping logic (for actual Type3 fonts AND normalized Type3 fonts)
|
// Type3 glyph mapping logic (for actual Type3 fonts AND normalized Type3 fonts)
|
||||||
@ -3750,20 +3789,43 @@ public class PdfJsonConversionService {
|
|||||||
}
|
}
|
||||||
// Fall through to Standard14 fallback below if nothing else succeeded.
|
// Fall through to Standard14 fallback below if nothing else succeeded.
|
||||||
} else {
|
} else {
|
||||||
|
// For TrueType and Type0 fonts, prioritize cosDictionary restoration
|
||||||
|
// These fonts often use ToUnicode CMap which is preserved in the dictionary
|
||||||
|
String subtype = fontModel.getSubtype();
|
||||||
|
boolean preferDictionary =
|
||||||
|
subtype != null
|
||||||
|
&& (subtype.equalsIgnoreCase("TrueType")
|
||||||
|
|| subtype.equalsIgnoreCase("Type0"));
|
||||||
|
|
||||||
|
if (preferDictionary) {
|
||||||
|
PDFont restored = restoreFontFromDictionary(document, fontModel);
|
||||||
|
if (restored != null) {
|
||||||
|
log.debug(
|
||||||
|
"Font {} restored from cosDictionary (preferred for subsetted {})",
|
||||||
|
fontModel.getId(),
|
||||||
|
subtype);
|
||||||
|
return restored;
|
||||||
|
}
|
||||||
|
// If dictionary restoration fails, fall back to font program bytes
|
||||||
|
log.debug(
|
||||||
|
"Font {} cosDictionary restoration failed, trying font program bytes",
|
||||||
|
fontModel.getId());
|
||||||
|
}
|
||||||
|
|
||||||
PDFont loaded =
|
PDFont loaded =
|
||||||
loadFirstAvailableFont(document, fontModel, orderedCandidates, originalFormat);
|
loadFirstAvailableFont(document, fontModel, orderedCandidates, originalFormat);
|
||||||
if (loaded != null) {
|
if (loaded != null) {
|
||||||
return loaded;
|
return loaded;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
// Try to restore from COS dictionary if font programs failed
|
// Try to restore from COS dictionary if font programs failed and we haven't tried yet
|
||||||
if (!isType3Font) {
|
if (!preferDictionary) {
|
||||||
PDFont restored = restoreFontFromDictionary(document, fontModel);
|
PDFont restored = restoreFontFromDictionary(document, fontModel);
|
||||||
if (restored != null) {
|
if (restored != null) {
|
||||||
return restored;
|
return restored;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
for (FontByteSource source : orderedCandidates) {
|
for (FontByteSource source : orderedCandidates) {
|
||||||
byte[] fontBytes = source.bytes();
|
byte[] fontBytes = source.bytes();
|
||||||
@ -3972,34 +4034,74 @@ public class PdfJsonConversionService {
|
|||||||
log.debug("[FONT-RESTORE] Font {} has no cosDictionary", fontModel.getId());
|
log.debug("[FONT-RESTORE] Font {} has no cosDictionary", fontModel.getId());
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
COSBase restored = cosMapper.deserializeCosValue(fontModel.getCosDictionary(), document);
|
|
||||||
|
// Deserialize the cosDictionary - cosMapper handles validation internally
|
||||||
|
COSBase restored;
|
||||||
|
try {
|
||||||
|
restored = cosMapper.deserializeCosValue(fontModel.getCosDictionary(), document);
|
||||||
|
} catch (Exception ex) {
|
||||||
|
log.warn(
|
||||||
|
"[FONT-RESTORE] Font {} cosDictionary deserialization failed: {}",
|
||||||
|
fontModel.getId(),
|
||||||
|
ex.getMessage());
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
if (!(restored instanceof COSDictionary cosDictionary)) {
|
if (!(restored instanceof COSDictionary cosDictionary)) {
|
||||||
log.debug(
|
log.warn(
|
||||||
"[FONT-RESTORE] Font {} cosDictionary deserialized to {} instead of COSDictionary",
|
"[FONT-RESTORE] Font {} cosDictionary deserialized to {} instead of COSDictionary",
|
||||||
fontModel.getId(),
|
fontModel.getId(),
|
||||||
restored != null ? restored.getClass().getSimpleName() : "null");
|
restored != null ? restored.getClass().getSimpleName() : "null");
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Validate that dictionary contains required font keys
|
||||||
|
if (!cosDictionary.containsKey(org.apache.pdfbox.cos.COSName.TYPE)
|
||||||
|
|| !cosDictionary.containsKey(org.apache.pdfbox.cos.COSName.SUBTYPE)) {
|
||||||
|
log.warn(
|
||||||
|
"[FONT-RESTORE] Font {} cosDictionary missing required Type or Subtype keys",
|
||||||
|
fontModel.getId());
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
PDFont font = PDFontFactory.createFont(cosDictionary);
|
PDFont font = PDFontFactory.createFont(cosDictionary);
|
||||||
if (font != null && font.isEmbedded()) {
|
if (font == null) {
|
||||||
|
log.warn(
|
||||||
|
"[FONT-RESTORE] Font {} PDFontFactory returned null for valid dictionary",
|
||||||
|
fontModel.getId());
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!font.isEmbedded()) {
|
||||||
|
log.warn(
|
||||||
|
"[FONT-RESTORE] Font {} restored from dictionary but is not embedded; rejecting to avoid system font substitution",
|
||||||
|
fontModel.getId());
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
applyAdditionalFontMetadata(document, font, fontModel);
|
applyAdditionalFontMetadata(document, font, fontModel);
|
||||||
log.info(
|
log.info(
|
||||||
"[FONT-RESTORE] Successfully restored embedded font {} from original dictionary",
|
"[FONT-RESTORE] Successfully restored embedded font {} (subtype={}) from original dictionary",
|
||||||
fontModel.getId());
|
|
||||||
return font;
|
|
||||||
}
|
|
||||||
log.warn(
|
|
||||||
"[FONT-RESTORE] Restored font {} from dictionary but font was {}embedded; rejecting",
|
|
||||||
fontModel.getId(),
|
fontModel.getId(),
|
||||||
font != null && font.isEmbedded() ? "" : "not ");
|
font.getSubType());
|
||||||
|
return font;
|
||||||
|
|
||||||
} catch (IOException ex) {
|
} catch (IOException ex) {
|
||||||
log.warn(
|
log.warn(
|
||||||
"[FONT-RESTORE] Failed to restore font {} from stored dictionary: {}",
|
"[FONT-RESTORE] Failed to restore font {} from dictionary ({}): {}",
|
||||||
fontModel.getId(),
|
fontModel.getId(),
|
||||||
|
fontModel.getSubtype(),
|
||||||
ex.getMessage());
|
ex.getMessage());
|
||||||
}
|
|
||||||
return null;
|
return null;
|
||||||
|
} catch (Exception ex) {
|
||||||
|
log.error(
|
||||||
|
"[FONT-RESTORE] Unexpected error restoring font {} from dictionary: {}",
|
||||||
|
fontModel.getId(),
|
||||||
|
ex.getMessage(),
|
||||||
|
ex);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean isType1Format(String format) {
|
private boolean isType1Format(String format) {
|
||||||
@ -4948,7 +5050,8 @@ public class PdfJsonConversionService {
|
|||||||
}
|
}
|
||||||
String key = buildFontKey(jobId, currentPage, fontId);
|
String key = buildFontKey(jobId, currentPage, fontId);
|
||||||
if (!fonts.containsKey(key)) {
|
if (!fonts.containsKey(key)) {
|
||||||
fonts.put(key, buildFontModel(document, font, fontId, currentPage, fontCache, jobId));
|
fonts.put(
|
||||||
|
key, buildFontModel(document, font, fontId, currentPage, fontCache, jobId));
|
||||||
}
|
}
|
||||||
return fontId;
|
return fontId;
|
||||||
}
|
}
|
||||||
@ -5514,8 +5617,8 @@ public class PdfJsonConversionService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Clear job-specific entries from Type3 font caches. Font UIDs include jobId prefix, so we
|
* Clear job-specific entries from Type3 font caches. Font UIDs include jobId prefix, so we can
|
||||||
* can identify and remove them.
|
* identify and remove them.
|
||||||
*/
|
*/
|
||||||
private void clearType3CacheEntriesForJob(String jobId) {
|
private void clearType3CacheEntriesForJob(String jobId) {
|
||||||
if (jobId == null || jobId.isEmpty()) {
|
if (jobId == null || jobId.isEmpty()) {
|
||||||
|
|||||||
@ -36,6 +36,21 @@ public class PdfJsonFallbackFontService {
|
|||||||
public static final String FALLBACK_FONT_AR_ID = "fallback-noto-arabic";
|
public static final String FALLBACK_FONT_AR_ID = "fallback-noto-arabic";
|
||||||
public static final String FALLBACK_FONT_TH_ID = "fallback-noto-thai";
|
public static final String FALLBACK_FONT_TH_ID = "fallback-noto-thai";
|
||||||
|
|
||||||
|
// Font name aliases map PDF font names to available fallback fonts
|
||||||
|
// This provides better visual consistency when editing PDFs
|
||||||
|
private static final Map<String, String> FONT_NAME_ALIASES =
|
||||||
|
Map.ofEntries(
|
||||||
|
// Liberation fonts are metric-compatible with Microsoft core fonts
|
||||||
|
Map.entry("arial", "fallback-liberation-sans"),
|
||||||
|
Map.entry("helvetica", "fallback-liberation-sans"),
|
||||||
|
Map.entry("arimo", "fallback-liberation-sans"),
|
||||||
|
Map.entry("times", "fallback-liberation-serif"),
|
||||||
|
Map.entry("timesnewroman", "fallback-liberation-serif"),
|
||||||
|
Map.entry("tinos", "fallback-liberation-serif"),
|
||||||
|
Map.entry("courier", "fallback-liberation-mono"),
|
||||||
|
Map.entry("couriernew", "fallback-liberation-mono"),
|
||||||
|
Map.entry("cousine", "fallback-liberation-mono"));
|
||||||
|
|
||||||
private static final Map<String, FallbackFontSpec> BUILT_IN_FALLBACK_FONTS =
|
private static final Map<String, FallbackFontSpec> BUILT_IN_FALLBACK_FONTS =
|
||||||
Map.ofEntries(
|
Map.ofEntries(
|
||||||
Map.entry(
|
Map.entry(
|
||||||
@ -65,6 +80,45 @@ public class PdfJsonFallbackFontService {
|
|||||||
new FallbackFontSpec(
|
new FallbackFontSpec(
|
||||||
"classpath:/static/fonts/NotoSansThai-Regular.ttf",
|
"classpath:/static/fonts/NotoSansThai-Regular.ttf",
|
||||||
"NotoSansThai-Regular",
|
"NotoSansThai-Regular",
|
||||||
|
"ttf")),
|
||||||
|
// Liberation Sans family
|
||||||
|
Map.entry(
|
||||||
|
"fallback-liberation-sans",
|
||||||
|
new FallbackFontSpec(
|
||||||
|
"classpath:/static/pdfjs-legacy/standard_fonts/LiberationSans-Regular.ttf",
|
||||||
|
"LiberationSans-Regular",
|
||||||
|
"ttf")),
|
||||||
|
Map.entry(
|
||||||
|
"fallback-liberation-sans-bold",
|
||||||
|
new FallbackFontSpec(
|
||||||
|
"classpath:/static/pdfjs-legacy/standard_fonts/LiberationSans-Bold.ttf",
|
||||||
|
"LiberationSans-Bold",
|
||||||
|
"ttf")),
|
||||||
|
Map.entry(
|
||||||
|
"fallback-liberation-sans-italic",
|
||||||
|
new FallbackFontSpec(
|
||||||
|
"classpath:/static/pdfjs-legacy/standard_fonts/LiberationSans-Italic.ttf",
|
||||||
|
"LiberationSans-Italic",
|
||||||
|
"ttf")),
|
||||||
|
Map.entry(
|
||||||
|
"fallback-liberation-sans-bolditalic",
|
||||||
|
new FallbackFontSpec(
|
||||||
|
"classpath:/static/pdfjs-legacy/standard_fonts/LiberationSans-BoldItalic.ttf",
|
||||||
|
"LiberationSans-BoldItalic",
|
||||||
|
"ttf")),
|
||||||
|
// Liberation Serif family
|
||||||
|
Map.entry(
|
||||||
|
"fallback-liberation-serif",
|
||||||
|
new FallbackFontSpec(
|
||||||
|
"classpath:/static/pdfjs-legacy/standard_fonts/LiberationSerif-Regular.ttf",
|
||||||
|
"LiberationSerif-Regular",
|
||||||
|
"ttf")),
|
||||||
|
// Liberation Mono family
|
||||||
|
Map.entry(
|
||||||
|
"fallback-liberation-mono",
|
||||||
|
new FallbackFontSpec(
|
||||||
|
"classpath:/static/pdfjs-legacy/standard_fonts/LiberationMono-Regular.ttf",
|
||||||
|
"LiberationMono-Regular",
|
||||||
"ttf")));
|
"ttf")));
|
||||||
|
|
||||||
private final ResourceLoader resourceLoader;
|
private final ResourceLoader resourceLoader;
|
||||||
@ -107,7 +161,9 @@ public class PdfJsonFallbackFontService {
|
|||||||
}
|
}
|
||||||
byte[] bytes = loadFallbackFontBytes(fallbackId, spec);
|
byte[] bytes = loadFallbackFontBytes(fallbackId, spec);
|
||||||
try (InputStream stream = new ByteArrayInputStream(bytes)) {
|
try (InputStream stream = new ByteArrayInputStream(bytes)) {
|
||||||
return PDType0Font.load(document, stream, true);
|
// Load with embedSubset=false to ensure full glyph coverage
|
||||||
|
// Fallback fonts need all glyphs available for substituting missing characters
|
||||||
|
return PDType0Font.load(document, stream, false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -140,6 +196,53 @@ public class PdfJsonFallbackFontService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Resolve fallback font ID based on the original font name and code point. Attempts to match
|
||||||
|
* font family for visual consistency.
|
||||||
|
*
|
||||||
|
* @param originalFontName the name of the original font (may be null)
|
||||||
|
* @param codePoint the Unicode code point that needs to be rendered
|
||||||
|
* @return fallback font ID
|
||||||
|
*/
|
||||||
|
public String resolveFallbackFontId(String originalFontName, int codePoint) {
|
||||||
|
// First try to match based on original font name for visual consistency
|
||||||
|
if (originalFontName != null && !originalFontName.isEmpty()) {
|
||||||
|
// Normalize font name: remove subset prefix (e.g. "PXAAAC+"), convert to lowercase,
|
||||||
|
// remove spaces
|
||||||
|
String normalized =
|
||||||
|
originalFontName
|
||||||
|
.replaceAll("^[A-Z]{6}\\+", "") // Remove subset prefix
|
||||||
|
.toLowerCase()
|
||||||
|
.replaceAll("\\s+", ""); // Remove spaces (e.g. "Times New Roman" ->
|
||||||
|
// "timesnewroman")
|
||||||
|
|
||||||
|
// Extract base name without weight/style suffixes
|
||||||
|
// Split on common delimiters: hyphen, underscore, comma, plus
|
||||||
|
// Handles: "Arimo_700wght" -> "arimo", "Arial-Bold" -> "arial", "Arial,Bold" -> "arial"
|
||||||
|
String baseName = normalized.split("[-_,+]")[0];
|
||||||
|
|
||||||
|
String aliasedFontId = FONT_NAME_ALIASES.get(baseName);
|
||||||
|
if (aliasedFontId != null) {
|
||||||
|
log.debug(
|
||||||
|
"Matched font '{}' (normalized: '{}', base: '{}') to fallback '{}'",
|
||||||
|
originalFontName,
|
||||||
|
normalized,
|
||||||
|
baseName,
|
||||||
|
aliasedFontId);
|
||||||
|
return aliasedFontId;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fall back to Unicode-based selection
|
||||||
|
return resolveFallbackFontId(codePoint);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Resolve fallback font ID based on Unicode code point properties.
|
||||||
|
*
|
||||||
|
* @param codePoint the Unicode code point
|
||||||
|
* @return fallback font ID
|
||||||
|
*/
|
||||||
public String resolveFallbackFontId(int codePoint) {
|
public String resolveFallbackFontId(int codePoint) {
|
||||||
Character.UnicodeBlock block = Character.UnicodeBlock.of(codePoint);
|
Character.UnicodeBlock block = Character.UnicodeBlock.of(codePoint);
|
||||||
if (block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
|
if (block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
|
||||||
|
|||||||
@ -13,11 +13,25 @@
|
|||||||
- Reuse the existing job cache (`documentCache`) to serve these on demand and clean up after timeouts (`PdfJsonConversionService.java:3608-3687`).
|
- Reuse the existing job cache (`documentCache`) to serve these on demand and clean up after timeouts (`PdfJsonConversionService.java:3608-3687`).
|
||||||
|
|
||||||
- **Editor UX Safeguards**
|
- **Editor UX Safeguards**
|
||||||
- Respect `fallbackFontService` indicators; mark groups using fallback glyphs so the UI can warn about possible appearance shifts (`frontend/src/proprietary/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx:1260-1287`).
|
- Mark groups using fallback glyphs so the UI can warn about possible appearance shifts. Font family matching is now implemented (Liberation fonts), but weight matching is still a TODO, so bold/italic text rendered with a fallback font may appear lighter than the original.
|
||||||
- Surface when Type3 conversion was downgraded (e.g., rasterized glyphs) and limit editing to operations that keep the PDF stable.
|
- Surface when Type3 conversion was downgraded (e.g., rasterized glyphs) and limit editing to operations that keep the PDF stable.
|
||||||
|
- Reference: `frontend/src/proprietary/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx:1260-1287`
|
||||||
|
|
||||||
- **Canonical Font Sharing**
|
- **Canonical Font Sharing**
|
||||||
- Emit fonts once per unique embedded program. Add a `canonicalFonts` array containing the full payload (program, ToUnicode, metadata) and a compact `fontAliases` mapping `{pageNumber, fontId, canonicalUid}` so text elements can still reference per-page IDs.
|
- Emit fonts once per unique embedded program. Add a `canonicalFonts` array containing the full payload (program, ToUnicode, metadata) and a compact `fontAliases` mapping `{pageNumber, fontId, canonicalUid}` so text elements can still reference per-page IDs.
|
||||||
- Store COS dictionaries only on canonical entries; aliases should keep light fields (e.g., size adjustments) if they differ.
|
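- Note: COS dictionaries are currently preserved for every font type except Type3 (they are cleared only for Type3 fonts that have a usable program), because other fonts may need the ToUnicode CMap or encoding information from the dictionary. The canonical approach should maintain this preservation while deduplicating font programs.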
|
||||||
- Update `buildFontMap` to resolve aliases when recreating PDFBox fonts, and adjust the front end to load programs via the canonical UID.
|
- Update `buildFontMap` to resolve aliases when recreating PDFBox fonts, and adjust the front end to load programs via the canonical UID.
|
||||||
- Optional: expose a lazy endpoint for the original COS dictionary if the canonical record strips it, so export still reconstructs untouched fonts.
|
- Optional: expose a lazy endpoint for the original COS dictionary if the canonical record strips it, so export still reconstructs untouched fonts.
|
||||||
|
|
||||||
|
- **Font Weight Matching for Fallback Fonts**
|
||||||
|
- Font family matching is now implemented (Arial→LiberationSans, Times→LiberationSerif, Courier→LiberationMono).
|
||||||
|
- However, fallback fonts still use Regular weight for all missing glyphs, regardless of the original font weight (e.g., bold text falls back to regular weight).
|
||||||
|
- TODO: Parse weight from font names (e.g., `Arimo_700wght`, `Arial-Bold`, `TimesNewRoman,SemiBold`) and map to corresponding Liberation font variants:
|
||||||
|
- Regular/Normal → LiberationSans-Regular, LiberationSerif-Regular, LiberationMono-Regular
|
||||||
|
- Bold/700 → LiberationSans-Bold, LiberationSerif-Bold, LiberationMono-Bold
|
||||||
|
- Italic/Oblique → LiberationSans-Italic, LiberationSerif-Italic, LiberationMono-Italic
|
||||||
|
- BoldItalic → LiberationSans-BoldItalic, LiberationSerif-BoldItalic, LiberationMono-BoldItalic
|
||||||
|
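- Add the remaining Liberation font variants to the `BUILT_IN_FALLBACK_FONTS` map with appropriate IDs. The LiberationSans Bold/Italic/BoldItalic entries (e.g., `fallback-liberation-sans-bold`) are already registered; the LiberationSerif and LiberationMono Bold/Italic/BoldItalic variants are not.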
|
||||||
|
- Update `resolveFallbackFontId(String originalFontName, int codePoint)` in `PdfJsonFallbackFontService.java` to detect weight/style and return the matching variant ID.
|
||||||
|
- Benefits: Better visual consistency when editing text in bold/italic fonts, as missing characters will match the original weight.
|
||||||
|
- Implementation reference: `app/proprietary/src/main/java/stirling/software/SPDF/service/PdfJsonFallbackFontService.java:186-213`
|
||||||
|
|||||||
1431
docs/pdf_json_threading_analysis.md
Normal file
1431
docs/pdf_json_threading_analysis.md
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user