fix issues with type0

This commit is contained in:
Anthony Stirling 2025-11-10 00:26:55 +00:00
parent 6db66f1f1a
commit f645eaff18
6 changed files with 1696 additions and 45 deletions

View File

@ -311,13 +311,17 @@ public class PdfJsonConversionService {
try (PDDocument document = pdfDocumentFactory.load(workingPath, true)) {
int totalPages = document.getNumberOfPages();
boolean useLazyImages = totalPages > 5 && jobId != null;
// Only use lazy images for real async jobs where client can access the cache
// Synchronous calls with synthetic jobId should do full extraction
boolean useLazyImages = totalPages > 5 && isRealJobId;
Map<COSBase, FontModelCacheEntry> fontCache = new IdentityHashMap<>();
Map<COSBase, EncodedImage> imageCache = new IdentityHashMap<>();
log.debug(
"Converting PDF to JSON ({} pages) - {} mode",
"Converting PDF to JSON ({} pages) - {} mode (jobId: {}, isRealJobId: {})",
totalPages,
useLazyImages ? "lazy image" : "standard");
useLazyImages ? "lazy image" : "standard",
jobId,
isRealJobId);
Map<String, PdfJsonFont> fonts = new LinkedHashMap<>();
Map<Integer, List<PdfJsonTextElement>> textByPage = new LinkedHashMap<>();
Map<Integer, Map<PDFont, String>> pageFontResources = new HashMap<>();
@ -327,7 +331,8 @@ public class PdfJsonConversionService {
int pageNumber = 1;
for (PDPage page : document.getPages()) {
Map<PDFont, String> resourceMap =
collectFontsForPage(document, page, pageNumber, fonts, fontCache, jobId);
collectFontsForPage(
document, page, pageNumber, fonts, fontCache, jobId);
pageFontResources.put(pageNumber, resourceMap);
log.debug(
"PDF→JSON: collected {} font resources on page {}",
@ -444,8 +449,9 @@ public class PdfJsonConversionService {
byte[] result = objectMapper.writeValueAsBytes(pdfJson);
progress.accept(PdfJsonConversionProgress.complete());
// If document wasn't cached, clear Type3 cache entries immediately
// (jobId is always set now, either from request context or synthetic)
// Clear Type3 cache entries immediately for non-cached conversions
// Cached conversions (useLazyImages=true) are cleaned when cache expires
// Synchronous conversions always clear immediately since they don't use lazy mode
if (!useLazyImages) {
clearType3CacheEntriesForJob(jobId);
}
@ -718,7 +724,8 @@ public class PdfJsonConversionService {
mapping.put(font, fontId);
String key = buildFontKey(jobId, pageNumber, fontId);
if (!fonts.containsKey(key)) {
fonts.put(key, buildFontModel(document, font, fontId, pageNumber, fontCache, jobId));
fonts.put(
key, buildFontModel(document, font, fontId, pageNumber, fontCache, jobId));
}
}
@ -873,15 +880,13 @@ public class PdfJsonConversionService {
|| hasPayload(font.getWebProgram())
|| hasPayload(font.getProgram());
// Keep cosDictionary for TrueType and Type0 fonts even with usable program
// Subsetted fonts need the ToUnicode CMap from the original dictionary
// Only clear cosDictionary for Type3 fonts (which have inline content streams)
// All other font types may need ToUnicode CMap or encoding from the dictionary
// Conservative approach: better to keep extra data than lose encoding info
String subtype = font.getSubtype();
boolean needsCosDictionary =
subtype != null
&& (subtype.equalsIgnoreCase("TrueType")
|| subtype.equalsIgnoreCase("Type0"));
boolean isType3 = subtype != null && subtype.equalsIgnoreCase("Type3");
if (hasUsableProgram && !needsCosDictionary) {
if (hasUsableProgram && isType3) {
font.setCosDictionary(null);
}
}
@ -1380,9 +1385,20 @@ public class PdfJsonConversionService {
String key = buildFontKey(null, -1, effectiveId);
PDFont font = fontMap.get(key);
if (font != null) {
log.debug(
"[FALLBACK-DEBUG] Reusing cached fallback font {} (key: {})", effectiveId, key);
return font;
}
log.info(
"[FALLBACK-DEBUG] Loading fallback font {} (key: {}) via fallbackFontService",
effectiveId,
key);
PDFont loaded = fallbackFontService.loadFallbackPdfFont(document, effectiveId);
log.info(
"[FALLBACK-DEBUG] Loaded fallback font {} - PDFont class: {}, name: {}",
effectiveId,
loaded.getClass().getSimpleName(),
loaded.getName());
fontMap.put(key, loaded);
if (fontModels != null
&& fontModels.stream().noneMatch(f -> effectiveId.equals(f.getId()))) {
@ -2561,10 +2577,16 @@ public class PdfJsonConversionService {
&& runFontModel.getType3Glyphs() != null
&& !runFontModel.getType3Glyphs().isEmpty();
if (isNormalizedType3) {
// For normalized Type3 fonts, use original text directly
// The font has proper Unicode mappings, so PDFBox can encode it
// correctly
// For fonts with proper Unicode mappings, let PDFBox handle encoding
// This includes: normalized Type3 fonts, PDType0Font (composite fonts)
boolean useDirectText =
isNormalizedType3
|| run.font()
instanceof
org.apache.pdfbox.pdmodel.font.PDType0Font;
if (useDirectText) {
// Pass text directly - PDFBox handles encoding internally
contentStream.showText(run.text());
} else {
// For actual Type3 fonts and other fonts, encode manually
@ -2582,6 +2604,14 @@ public class PdfJsonConversionService {
}
} else {
try {
log.debug(
"[ENCODE-DEBUG] Encoding text '{}' with font {} (fontId={}, runFontModel={})",
run.text(),
run.font().getName(),
run.fontId(),
runFontModel != null
? runFontModel.getId()
: "null");
encoded =
encodeTextWithFont(
run.font(),
@ -2590,9 +2620,13 @@ public class PdfJsonConversionService {
run.charCodes());
} catch (IOException ex) {
log.warn(
"Failed to encode text '{}' with font {} on page {}: {}",
"Failed to encode text '{}' with font {} (fontId={}, runFontModel={}) on page {}: {}",
run.text(),
run.font().getName(),
run.fontId(),
runFontModel != null
? runFontModel.getId()
: "null",
pageNumber,
ex.getMessage());
continue;
@ -2725,7 +2759,11 @@ public class PdfJsonConversionService {
}
if (targetFont == null || !fallbackFontService.canEncode(targetFont, glyph)) {
fallbackApplied = true;
String fallbackId = fallbackFontService.resolveFallbackFontId(codePoint);
// Try to match fallback font to original font family for visual consistency
String originalFontName =
baseFontModel != null ? baseFontModel.getBaseName() : null;
String fallbackId =
fallbackFontService.resolveFallbackFontId(originalFontName, codePoint);
targetFont = ensureFallbackFont(document, fontMap, fontModels, fallbackId);
targetFontId = fallbackId != null ? fallbackId : FALLBACK_FONT_ID;
if (targetFont == null || !fallbackFontService.canEncode(targetFont, glyph)) {
@ -3335,7 +3373,8 @@ public class PdfJsonConversionService {
// or return null to trigger fallback font
} else if (!isType3Font || fontModel == null) {
// For non-Type3 fonts without Type3 metadata, use standard encoding
return sanitizeEncoded(font.encode(text));
byte[] encoded = font.encode(text);
return sanitizeEncoded(encoded);
}
// Type3 glyph mapping logic (for actual Type3 fonts AND normalized Type3 fonts)
@ -3750,18 +3789,41 @@ public class PdfJsonConversionService {
}
// Fall through to Standard14 fallback below if nothing else succeeded.
} else {
// For TrueType and Type0 fonts, prioritize cosDictionary restoration
// These fonts often use ToUnicode CMap which is preserved in the dictionary
String subtype = fontModel.getSubtype();
boolean preferDictionary =
subtype != null
&& (subtype.equalsIgnoreCase("TrueType")
|| subtype.equalsIgnoreCase("Type0"));
if (preferDictionary) {
PDFont restored = restoreFontFromDictionary(document, fontModel);
if (restored != null) {
log.debug(
"Font {} restored from cosDictionary (preferred for subsetted {})",
fontModel.getId(),
subtype);
return restored;
}
// If dictionary restoration fails, fall back to font program bytes
log.debug(
"Font {} cosDictionary restoration failed, trying font program bytes",
fontModel.getId());
}
PDFont loaded =
loadFirstAvailableFont(document, fontModel, orderedCandidates, originalFormat);
if (loaded != null) {
return loaded;
}
}
// Try to restore from COS dictionary if font programs failed
if (!isType3Font) {
PDFont restored = restoreFontFromDictionary(document, fontModel);
if (restored != null) {
return restored;
// Try to restore from COS dictionary if font programs failed and we haven't tried yet
if (!preferDictionary) {
PDFont restored = restoreFontFromDictionary(document, fontModel);
if (restored != null) {
return restored;
}
}
}
@ -3972,34 +4034,74 @@ public class PdfJsonConversionService {
log.debug("[FONT-RESTORE] Font {} has no cosDictionary", fontModel.getId());
return null;
}
COSBase restored = cosMapper.deserializeCosValue(fontModel.getCosDictionary(), document);
// Deserialize the cosDictionary - cosMapper handles validation internally
COSBase restored;
try {
restored = cosMapper.deserializeCosValue(fontModel.getCosDictionary(), document);
} catch (Exception ex) {
log.warn(
"[FONT-RESTORE] Font {} cosDictionary deserialization failed: {}",
fontModel.getId(),
ex.getMessage());
return null;
}
if (!(restored instanceof COSDictionary cosDictionary)) {
log.debug(
log.warn(
"[FONT-RESTORE] Font {} cosDictionary deserialized to {} instead of COSDictionary",
fontModel.getId(),
restored != null ? restored.getClass().getSimpleName() : "null");
return null;
}
// Validate that dictionary contains required font keys
if (!cosDictionary.containsKey(org.apache.pdfbox.cos.COSName.TYPE)
|| !cosDictionary.containsKey(org.apache.pdfbox.cos.COSName.SUBTYPE)) {
log.warn(
"[FONT-RESTORE] Font {} cosDictionary missing required Type or Subtype keys",
fontModel.getId());
return null;
}
try {
PDFont font = PDFontFactory.createFont(cosDictionary);
if (font != null && font.isEmbedded()) {
applyAdditionalFontMetadata(document, font, fontModel);
log.info(
"[FONT-RESTORE] Successfully restored embedded font {} from original dictionary",
if (font == null) {
log.warn(
"[FONT-RESTORE] Font {} PDFontFactory returned null for valid dictionary",
fontModel.getId());
return font;
return null;
}
log.warn(
"[FONT-RESTORE] Restored font {} from dictionary but font was {}embedded; rejecting",
if (!font.isEmbedded()) {
log.warn(
"[FONT-RESTORE] Font {} restored from dictionary but is not embedded; rejecting to avoid system font substitution",
fontModel.getId());
return null;
}
applyAdditionalFontMetadata(document, font, fontModel);
log.info(
"[FONT-RESTORE] Successfully restored embedded font {} (subtype={}) from original dictionary",
fontModel.getId(),
font != null && font.isEmbedded() ? "" : "not ");
font.getSubType());
return font;
} catch (IOException ex) {
log.warn(
"[FONT-RESTORE] Failed to restore font {} from stored dictionary: {}",
"[FONT-RESTORE] Failed to restore font {} from dictionary ({}): {}",
fontModel.getId(),
fontModel.getSubtype(),
ex.getMessage());
return null;
} catch (Exception ex) {
log.error(
"[FONT-RESTORE] Unexpected error restoring font {} from dictionary: {}",
fontModel.getId(),
ex.getMessage(),
ex);
return null;
}
return null;
}
private boolean isType1Format(String format) {
@ -4948,7 +5050,8 @@ public class PdfJsonConversionService {
}
String key = buildFontKey(jobId, currentPage, fontId);
if (!fonts.containsKey(key)) {
fonts.put(key, buildFontModel(document, font, fontId, currentPage, fontCache, jobId));
fonts.put(
key, buildFontModel(document, font, fontId, currentPage, fontCache, jobId));
}
return fontId;
}
@ -5514,8 +5617,8 @@ public class PdfJsonConversionService {
}
/**
* Clear job-specific entries from Type3 font caches. Font UIDs include jobId prefix, so we
* can identify and remove them.
* Clear job-specific entries from Type3 font caches. Font UIDs include jobId prefix, so we can
* identify and remove them.
*/
private void clearType3CacheEntriesForJob(String jobId) {
if (jobId == null || jobId.isEmpty()) {

View File

@ -36,6 +36,21 @@ public class PdfJsonFallbackFontService {
public static final String FALLBACK_FONT_AR_ID = "fallback-noto-arabic";
public static final String FALLBACK_FONT_TH_ID = "fallback-noto-thai";
// Font name aliases map PDF font family names to the IDs of available fallback fonts.
// This provides better visual consistency when editing PDFs.
//
// Keys are the lower-case, whitespace-free base family name that
// resolveFallbackFontId(String, int) derives from a PDF font name (subset prefix removed,
// everything from the first '-', '_', ',' or '+' onward dropped).
// Values are fallback font IDs registered in BUILT_IN_FALLBACK_FONTS; every alias currently
// targets the Regular face of its family.
private static final Map<String, String> FONT_NAME_ALIASES =
Map.ofEntries(
// Liberation fonts are metric-compatible with Microsoft core fonts
// Sans-serif families -> Liberation Sans
Map.entry("arial", "fallback-liberation-sans"),
Map.entry("helvetica", "fallback-liberation-sans"),
Map.entry("arimo", "fallback-liberation-sans"),
// Serif families -> Liberation Serif
Map.entry("times", "fallback-liberation-serif"),
Map.entry("timesnewroman", "fallback-liberation-serif"),
Map.entry("tinos", "fallback-liberation-serif"),
// Monospace families -> Liberation Mono
Map.entry("courier", "fallback-liberation-mono"),
Map.entry("couriernew", "fallback-liberation-mono"),
Map.entry("cousine", "fallback-liberation-mono"));
private static final Map<String, FallbackFontSpec> BUILT_IN_FALLBACK_FONTS =
Map.ofEntries(
Map.entry(
@ -65,6 +80,45 @@ public class PdfJsonFallbackFontService {
new FallbackFontSpec(
"classpath:/static/fonts/NotoSansThai-Regular.ttf",
"NotoSansThai-Regular",
"ttf")),
// Liberation Sans family
Map.entry(
"fallback-liberation-sans",
new FallbackFontSpec(
"classpath:/static/pdfjs-legacy/standard_fonts/LiberationSans-Regular.ttf",
"LiberationSans-Regular",
"ttf")),
Map.entry(
"fallback-liberation-sans-bold",
new FallbackFontSpec(
"classpath:/static/pdfjs-legacy/standard_fonts/LiberationSans-Bold.ttf",
"LiberationSans-Bold",
"ttf")),
Map.entry(
"fallback-liberation-sans-italic",
new FallbackFontSpec(
"classpath:/static/pdfjs-legacy/standard_fonts/LiberationSans-Italic.ttf",
"LiberationSans-Italic",
"ttf")),
Map.entry(
"fallback-liberation-sans-bolditalic",
new FallbackFontSpec(
"classpath:/static/pdfjs-legacy/standard_fonts/LiberationSans-BoldItalic.ttf",
"LiberationSans-BoldItalic",
"ttf")),
// Liberation Serif family
Map.entry(
"fallback-liberation-serif",
new FallbackFontSpec(
"classpath:/static/pdfjs-legacy/standard_fonts/LiberationSerif-Regular.ttf",
"LiberationSerif-Regular",
"ttf")),
// Liberation Mono family
Map.entry(
"fallback-liberation-mono",
new FallbackFontSpec(
"classpath:/static/pdfjs-legacy/standard_fonts/LiberationMono-Regular.ttf",
"LiberationMono-Regular",
"ttf")));
private final ResourceLoader resourceLoader;
@ -107,7 +161,9 @@ public class PdfJsonFallbackFontService {
}
byte[] bytes = loadFallbackFontBytes(fallbackId, spec);
try (InputStream stream = new ByteArrayInputStream(bytes)) {
return PDType0Font.load(document, stream, true);
// Load with embedSubset=false to ensure full glyph coverage
// Fallback fonts need all glyphs available for substituting missing characters
return PDType0Font.load(document, stream, false);
}
}
@ -140,6 +196,53 @@ public class PdfJsonFallbackFontService {
}
}
/**
 * Resolve fallback font ID based on the original font name and code point. Attempts to match
 * font family for visual consistency.
 *
 * <p>The font name is normalized (subset prefix stripped, lower-cased locale-independently,
 * whitespace removed) and cut at the first weight/style delimiter; the resulting base name is
 * looked up in {@code FONT_NAME_ALIASES}. When no alias matches, or the name is null/empty, the
 * selection is delegated to {@link #resolveFallbackFontId(int)}.
 *
 * @param originalFontName the name of the original font (may be null)
 * @param codePoint the Unicode code point that needs to be rendered
 * @return fallback font ID
 */
public String resolveFallbackFontId(String originalFontName, int codePoint) {
    // First try to match based on original font name for visual consistency
    if (originalFontName != null && !originalFontName.isEmpty()) {
        // Normalize font name: remove subset prefix (e.g. "PXAAAC+"), convert to lowercase,
        // remove spaces (e.g. "Times New Roman" -> "timesnewroman").
        // Locale.ROOT keeps the result stable regardless of the JVM default locale; with a
        // Turkish default locale "ARIAL".toLowerCase() yields "arıal" and the alias lookup
        // would silently miss.
        String normalized =
                originalFontName
                        .replaceAll("^[A-Z]{6}\\+", "") // Remove subset prefix
                        .toLowerCase(java.util.Locale.ROOT)
                        .replaceAll("\\s+", ""); // Remove spaces

        // Extract base name without weight/style suffixes
        // Split on common delimiters: hyphen, underscore, comma, plus
        // Handles: "Arimo_700wght" -> "arimo", "Arial-Bold" -> "arial", "Arial,Bold" -> "arial"
        // A positive limit guarantees at least one element: without it, a name made up only
        // of delimiters (e.g. "-") splits to an empty array and [0] would throw.
        String baseName = normalized.split("[-_,+]", 2)[0];

        String aliasedFontId = FONT_NAME_ALIASES.get(baseName);
        if (aliasedFontId != null) {
            log.debug(
                    "Matched font '{}' (normalized: '{}', base: '{}') to fallback '{}'",
                    originalFontName,
                    normalized,
                    baseName,
                    aliasedFontId);
            return aliasedFontId;
        }
    }

    // Fall back to Unicode-based selection
    return resolveFallbackFontId(codePoint);
}
/**
* Resolve fallback font ID based on Unicode code point properties.
*
* @param codePoint the Unicode code point
* @return fallback font ID
*/
public String resolveFallbackFontId(int codePoint) {
Character.UnicodeBlock block = Character.UnicodeBlock.of(codePoint);
if (block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS

View File

@ -13,11 +13,25 @@
- Reuse the existing job cache (`documentCache`) to serve these on demand and clean up after timeouts (`PdfJsonConversionService.java:3608-3687`).
- **Editor UX Safeguards**
- Respect `fallbackFontService` indicators; mark groups using fallback glyphs so the UI can warn about possible appearance shifts (`frontend/src/proprietary/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx:1260-1287`).
- Mark groups using fallback glyphs so the UI can warn about possible appearance shifts. Font family matching is now implemented (Liberation fonts), but weight matching is still TODO, so bold/italic text using fallbacks may appear lighter than original.
- Surface when Type3 conversion was downgraded (e.g., rasterized glyphs) and limit editing to operations that keep the PDF stable.
- Reference: `frontend/src/proprietary/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx:1260-1287`
- **Canonical Font Sharing**
- Emit fonts once per unique embedded program. Add a `canonicalFonts` array containing the full payload (program, ToUnicode, metadata) and a compact `fontAliases` mapping `{pageNumber, fontId, canonicalUid}` so text elements can still reference per-page IDs.
- Store COS dictionaries only on canonical entries; aliases should keep light fields (e.g., size adjustments) if they differ.
- Note: COS dictionaries are currently preserved for every font type except Type3 (non-Type3 fonts may need the ToUnicode CMap or encoding data from the original dictionary). The canonical approach should maintain this preservation while deduplicating font programs.
- Update `buildFontMap` to resolve aliases when recreating PDFBox fonts, and adjust the front end to load programs via the canonical UID.
- Optional: expose a lazy endpoint for the original COS dictionary if the canonical record strips it, so export still reconstructs untouched fonts.
- **Font Weight Matching for Fallback Fonts**
- Font family matching is now implemented (Arial→LiberationSans, Times→LiberationSerif, Courier→LiberationMono).
- However, fallback fonts still use Regular weight for all missing glyphs, regardless of the original font weight (e.g., bold text falls back to regular weight).
- TODO: Parse weight from font names (e.g., `Arimo_700wght`, `Arial-Bold`, `TimesNewRoman,SemiBold`) and map to corresponding Liberation font variants:
- Regular/Normal → LiberationSans-Regular, LiberationSerif-Regular, LiberationMono-Regular
- Bold/700 → LiberationSans-Bold, LiberationSerif-Bold, LiberationMono-Bold
- Italic/Oblique → LiberationSans-Italic, LiberationSerif-Italic, LiberationMono-Italic
- BoldItalic → LiberationSans-BoldItalic, LiberationSerif-BoldItalic, LiberationMono-BoldItalic
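- Add the remaining Liberation font variants to the `BUILT_IN_FALLBACK_FONTS` map with appropriate IDs (e.g., `fallback-liberation-serif-bold`, `fallback-liberation-mono-bold`). The Sans variants (`fallback-liberation-sans-bold`, `fallback-liberation-sans-italic`, `fallback-liberation-sans-bolditalic`) are already registered but are not yet selected by any alias.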
- Update `resolveFallbackFontId(String originalFontName, int codePoint)` in `PdfJsonFallbackFontService.java` to detect weight/style and return the matching variant ID.
- Benefits: Better visual consistency when editing text in bold/italic fonts, as missing characters will match the original weight.
- Implementation reference: `app/proprietary/src/main/java/stirling/software/SPDF/service/PdfJsonFallbackFontService.java:186-213`

File diff suppressed because it is too large Load Diff