Cache fix issues

This commit is contained in:
Anthony Stirling 2025-12-10 23:26:35 +00:00
parent d6a83fe6a1
commit a73636a597
10 changed files with 501 additions and 104 deletions

View File

@ -68,6 +68,7 @@ public class ApplicationProperties {
private AutoPipeline autoPipeline = new AutoPipeline();
private ProcessExecutor processExecutor = new ProcessExecutor();
private PdfEditor pdfEditor = new PdfEditor();
@Bean
public PropertySource<?> dynamicYamlPropertySource(ConfigurableEnvironment environment)
@ -100,6 +101,46 @@ public class ApplicationProperties {
private String outputFolder;
}
/**
 * Settings for the PDF text editor / PDF-to-JSON conversion pipeline. The settings template
 * exposes these values under its {@code pdfEditor} section.
 */
@Data
public static class PdfEditor {
private Cache cache = new Cache();
private FontNormalization fontNormalization = new FontNormalization();
private CffConverter cffConverter = new CffConverter();
private Type3 type3 = new Type3();
// Resource location of the fallback font; override to point at a custom font.
private String fallbackFont = "classpath:/static/fonts/NotoSans-Regular.ttf";
/** Limits for the in-memory cache of converted PDF documents. */
@Data
public static class Cache {
// Absolute cap in bytes; a value <= 0 disables the byte cap and defers to maxPercent.
private long maxBytes = -1;
// Cap as a percentage of the JVM's maximum memory; only consulted when maxBytes <= 0.
private int maxPercent = 20;
}
/** Toggle for font normalization during conversion. */
@Data
public static class FontNormalization {
// Off by default: the settings template notes that normalization via Ghostscript strips
// Unicode mappings from CID fonts, so it is disabled to preserve ToUnicode CMaps.
private boolean enabled = false;
}
/** Settings for wrapping/converting CFF fonts so browsers can render them. */
@Data
public static class CffConverter {
private boolean enabled = true;
// Converter method: "python" (fontTools) or "fontforge" (legacy).
private String method = "python";
// Python interpreter used by the "python" method.
private String pythonCommand = "/opt/venv/bin/python3";
// Script invoked by the "python" method.
private String pythonScript = "/scripts/convert_cff_to_ttf.py";
// FontForge executable name/path used by the "fontforge" method.
private String fontforgeCommand = "fontforge";
}
/** Settings for Type3 font handling. */
@Data
public static class Type3 {
private Library library = new Library();
/** Settings for the library of pre-converted Type3 fonts. */
@Data
public static class Library {
// Whether Type3 fonts are matched against the library.
private boolean enabled = true;
// Resource location of the library index; may be overridden with a custom index.json.
private String index = "classpath:/type3/library/index.json";
}
}
}
@Data
public static class Legal {
private String termsAndConditions;

View File

@ -0,0 +1,44 @@
package stirling.software.SPDF.controller.api.converters;
import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.ControllerAdvice;
import org.springframework.web.bind.annotation.ExceptionHandler;
import org.springframework.web.bind.annotation.ResponseBody;
import com.fasterxml.jackson.databind.ObjectMapper;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.exception.CacheUnavailableException;
@ControllerAdvice(assignableTypes = ConvertPdfJsonController.class)
@Slf4j
@RequiredArgsConstructor
public class ConvertPdfJsonExceptionHandler {
private final ObjectMapper objectMapper;
@ExceptionHandler(CacheUnavailableException.class)
@ResponseBody
public ResponseEntity<byte[]> handleCacheUnavailable(CacheUnavailableException ex) {
try {
byte[] body =
objectMapper.writeValueAsBytes(
java.util.Map.of(
"error", "cache_unavailable",
"action", "reupload",
"message", ex.getMessage()));
return ResponseEntity.status(HttpStatus.GONE).body(body);
} catch (Exception e) {
log.warn("Failed to serialize cache_unavailable response: {}", e.getMessage());
return ResponseEntity.status(HttpStatus.GONE)
.body(
("{\"error\":\"cache_unavailable\",\"action\":\"reupload\",\"message\":\""
+ ex.getMessage()
+ "\"}")
.getBytes());
}
}
}

View File

@ -0,0 +1,8 @@
package stirling.software.SPDF.exception;
/**
 * Thrown when a server-side cached document required to serve a request is not available, for
 * example when no cached entry exists for the requested job id.
 */
public class CacheUnavailableException extends RuntimeException {

    private static final long serialVersionUID = 1L;

    /**
     * @param message description of which cached document was unavailable
     */
    public CacheUnavailableException(String message) {
        super(message);
    }

    /**
     * @param message description of which cached document was unavailable
     * @param cause the underlying failure, preserved for diagnostics
     */
    public CacheUnavailableException(String message, Throwable cause) {
        super(message, cause);
    }
}

View File

@ -144,14 +144,21 @@ public class PdfJsonConversionService {
private final PdfJsonFontService fontService;
private final Type3FontConversionService type3FontConversionService;
private final Type3GlyphExtractor type3GlyphExtractor;
private final stirling.software.common.model.ApplicationProperties applicationProperties;
private final Map<String, PDFont> type3NormalizedFontCache = new ConcurrentHashMap<>();
private final Map<String, Set<Integer>> type3GlyphCoverageCache = new ConcurrentHashMap<>();
@Value("${stirling.pdf.json.font-normalization.enabled:true}")
private boolean fontNormalizationEnabled;
private long cacheMaxBytes;
private int cacheMaxPercent;
/** Cache for storing PDDocuments for lazy page loading. Key is jobId. */
private final Map<String, CachedPdfDocument> documentCache = new ConcurrentHashMap<>();
private final java.util.LinkedHashMap<String, CachedPdfDocument> lruCache =
new java.util.LinkedHashMap<>(16, 0.75f, true);
private final Object cacheLock = new Object();
private volatile long currentCacheBytes = 0L;
private volatile long cacheBudgetBytes = -1L;
private volatile boolean ghostscriptAvailable;
@ -161,7 +168,23 @@ public class PdfJsonConversionService {
/**
 * Post-construction initialisation: loads settings, probes Ghostscript availability and computes
 * the cache budget.
 */
@PostConstruct
private void initializeToolAvailability() {
// Must run before initializeCacheBudget(), which reads cacheMaxBytes/cacheMaxPercent set here.
loadConfigurationFromProperties();
initializeGhostscriptAvailability();
initializeCacheBudget();
}
/**
 * Copies the editor settings from {@code applicationProperties} into this service's fields. When
 * no editor section is available, font normalization is disabled, the byte cap is disabled (-1)
 * and the percentage cap is set to 20.
 */
private void loadConfigurationFromProperties() {
    stirling.software.common.model.ApplicationProperties.PdfEditor editorSettings =
            applicationProperties.getPdfEditor();
    if (editorSettings == null) {
        // No editor section: fall back to built-in defaults.
        fontNormalizationEnabled = false;
        cacheMaxBytes = -1;
        cacheMaxPercent = 20;
        return;
    }
    fontNormalizationEnabled = editorSettings.getFontNormalization().isEnabled();
    stirling.software.common.model.ApplicationProperties.PdfEditor.Cache cacheSettings =
            editorSettings.getCache();
    cacheMaxBytes = cacheSettings.getMaxBytes();
    cacheMaxPercent = cacheSettings.getMaxPercent();
}
private void initializeGhostscriptAvailability() {
@ -202,6 +225,25 @@ public class PdfJsonConversionService {
}
}
/**
 * Derives the in-memory cache budget from configuration and stores it in {@code
 * cacheBudgetBytes}.
 *
 * <p>A positive {@code cacheMaxBytes} wins. Otherwise a positive {@code cacheMaxPercent} is
 * applied to {@link Runtime#maxMemory()}. If neither yields a positive value the budget is
 * non-positive, which the rest of the cache treats as "unlimited".
 */
private void initializeCacheBudget() {
    long effective = -1L;
    if (cacheMaxBytes > 0) {
        effective = cacheMaxBytes;
    } else if (cacheMaxPercent > 0) {
        long maxMem = Runtime.getRuntime().maxMemory();
        // maxMemory() reports Long.MAX_VALUE when the JVM has no inherent limit; a percentage
        // of "unbounded" is meaningless, so leave the budget unlimited in that case.
        if (maxMem != Long.MAX_VALUE) {
            // More than 100% of the heap cannot be honoured; clamping also bounds the math.
            int percent = Math.min(cacheMaxPercent, 100);
            // Equivalent to floor(maxMem * percent / 100) but cannot overflow a long.
            effective = (maxMem / 100) * percent + ((maxMem % 100) * percent) / 100;
        }
    }
    cacheBudgetBytes = effective;
    if (cacheBudgetBytes > 0) {
        log.info(
                "PDF JSON cache budget configured: {} bytes (source: {})",
                cacheBudgetBytes,
                cacheMaxBytes > 0 ? "max-bytes" : "max-percent");
    } else {
        log.info("PDF JSON cache budget: unlimited");
    }
}
/**
 * Converts the uploaded PDF to JSON by delegating to the three-argument overload, passing {@code
 * null} and {@code false} for the additional arguments.
 *
 * @param file the uploaded PDF
 * @return the bytes produced by the three-argument overload
 * @throws IOException if the delegate fails with an I/O error
 */
public byte[] convertPdfToJson(MultipartFile file) throws IOException {
return convertPdfToJson(file, null, false);
}
@ -318,9 +360,9 @@ public class PdfJsonConversionService {
try (PDDocument document = pdfDocumentFactory.load(workingPath, true)) {
int totalPages = document.getNumberOfPages();
// Only use lazy images for real async jobs where client can access the cache
// Synchronous calls with synthetic jobId should do full extraction
boolean useLazyImages = totalPages > 5 && isRealJobId;
// Always enable lazy mode for real async jobs so cache is available regardless of
// page count. Synchronous calls with synthetic jobId still do full extraction.
boolean useLazyImages = isRealJobId;
Map<COSBase, FontModelCacheEntry> fontCache = new IdentityHashMap<>();
Map<COSBase, EncodedImage> imageCache = new IdentityHashMap<>();
log.debug(
@ -435,15 +477,16 @@ public class PdfJsonConversionService {
cachedPdfBytes = Files.readAllBytes(workingPath);
}
CachedPdfDocument cached =
new CachedPdfDocument(
cachedPdfBytes, docMetadata, fonts, pageFontResources);
documentCache.put(jobId, cached);
buildCachedDocument(
jobId, cachedPdfBytes, docMetadata, fonts, pageFontResources);
putCachedDocument(jobId, cached);
log.debug(
"Cached PDF bytes ({} bytes, {} pages, {} fonts) for lazy images, jobId: {}",
cachedPdfBytes.length,
"Cached PDF bytes ({} bytes, {} pages, {} fonts) for lazy images, jobId: {} (diskBacked={})",
cached.getPdfSize(),
totalPages,
fonts.size(),
jobId);
jobId,
cached.isDiskBacked());
scheduleDocumentCleanup(jobId);
}
@ -2973,6 +3016,130 @@ public class PdfJsonConversionService {
}
}
// Cache helpers
/**
 * Wraps converted PDF bytes in a {@link CachedPdfDocument}, keeping them in memory unless the
 * document alone exceeds the configured cache budget, in which case the bytes are written to a
 * temp file and the entry is disk-backed.
 *
 * @param jobId job the document belongs to (used for logging only)
 * @param pdfBytes the PDF content; must not be {@code null}
 * @param metadata document metadata to cache alongside the bytes
 * @param fonts fonts to cache alongside the bytes
 * @param pageFontResources per-page font resources to cache alongside the bytes
 * @return the in-memory or disk-backed cache entry
 * @throws IllegalArgumentException if {@code pdfBytes} is {@code null}
 * @throws IOException if the temp file cannot be created or written
 */
private CachedPdfDocument buildCachedDocument(
        String jobId,
        byte[] pdfBytes,
        PdfJsonDocumentMetadata metadata,
        Map<String, PdfJsonFont> fonts,
        Map<Integer, Map<PDFont, String>> pageFontResources)
        throws IOException {
    if (pdfBytes == null) {
        throw new IllegalArgumentException("pdfBytes must not be null");
    }
    // Read the volatile budget once so the decision and the log line agree.
    long budget = cacheBudgetBytes;
    if (budget <= 0 || pdfBytes.length <= budget) {
        return new CachedPdfDocument(
                pdfBytes, null, pdfBytes.length, metadata, fonts, pageFontResources);
    }

    // A single document larger than the whole budget goes straight to disk.
    TempFile tempFile = new TempFile(tempFileManager, ".pdfjsoncache");
    try {
        Files.write(tempFile.getPath(), pdfBytes);
        CachedPdfDocument spilled =
                new CachedPdfDocument(
                        null, tempFile, pdfBytes.length, metadata, fonts, pageFontResources);
        log.debug(
                "Cached PDF spilled to disk ({} bytes exceeds budget {}) for jobId {}",
                pdfBytes.length,
                budget,
                jobId);
        return spilled;
    } catch (IOException | RuntimeException ex) {
        // Nobody owns the temp file yet, so release it here rather than leak it.
        tempFile.close();
        throw ex;
    }
}
/**
 * Stores {@code cached} under {@code jobId}, replacing any previous entry, updates the in-memory
 * byte accounting and then enforces the cache budget. All of this happens under {@code
 * cacheLock}.
 */
private void putCachedDocument(String jobId, CachedPdfDocument cached) {
    synchronized (cacheLock) {
        CachedPdfDocument previous = documentCache.put(jobId, cached);
        if (previous != null) {
            // Retire the replaced entry: drop it from the LRU view, give back its bytes and
            // release its temp file.
            lruCache.remove(jobId);
            long remaining = currentCacheBytes - previous.getInMemorySize();
            currentCacheBytes = remaining < 0L ? 0L : remaining;
            closeQuietly(previous.pdfTempFile);
        }
        lruCache.put(jobId, cached);
        currentCacheBytes = currentCacheBytes + cached.getInMemorySize();
        enforceCacheBudget();
    }
}
/**
 * Looks up the cached document for {@code jobId} and, on a hit, re-inserts it into the LRU map so
 * it becomes the most recently used entry.
 *
 * @return the cached document, or {@code null} if none is stored for {@code jobId}
 */
private CachedPdfDocument getCachedDocument(String jobId) {
    synchronized (cacheLock) {
        CachedPdfDocument hit = documentCache.get(jobId);
        if (hit == null) {
            return null;
        }
        lruCache.remove(jobId);
        lruCache.put(jobId, hit);
        return hit;
    }
}
/**
 * Evicts in-memory cache entries, least recently used first, until {@code currentCacheBytes} is
 * within {@code cacheBudgetBytes}. A non-positive budget means "unlimited" and makes this a
 * no-op.
 *
 * <p>Disk-backed entries are skipped: they contribute nothing to {@code currentCacheBytes}, so
 * evicting them would discard a usable cache entry without freeing any memory.
 */
private void enforceCacheBudget() {
    if (cacheBudgetBytes <= 0) {
        return;
    }
    synchronized (cacheLock) {
        java.util.Iterator<java.util.Map.Entry<String, CachedPdfDocument>> it =
                lruCache.entrySet().iterator();
        while (currentCacheBytes > cacheBudgetBytes && it.hasNext()) {
            java.util.Map.Entry<String, CachedPdfDocument> entry = it.next();
            String key = entry.getKey();
            CachedPdfDocument candidate = entry.getValue();
            long freed = candidate.getInMemorySize();
            if (freed <= 0) {
                // Disk-backed (or empty) entry: removing it cannot reduce memory usage.
                continue;
            }
            it.remove();
            // Two-argument remove: only drop the mapping if it still points at this entry.
            documentCache.remove(key, candidate);
            currentCacheBytes = Math.max(0L, currentCacheBytes - freed);
            candidate.close();
            log.debug("Evicted cached PDF for jobId {} to enforce cache budget", key);
        }
        // No spill-to-disk step is needed here: the loop only ends over budget once every
        // in-memory entry has been evicted, so nothing in-memory is left to spill.
    }
}
/**
 * Removes the cache entry for {@code jobId}, if any, adjusts the in-memory byte accounting and
 * closes the removed entry. The close happens after {@code cacheLock} has been released.
 */
private void removeCachedDocument(String jobId) {
    final CachedPdfDocument evicted;
    synchronized (cacheLock) {
        evicted = documentCache.remove(jobId);
        if (evicted == null) {
            return;
        }
        lruCache.remove(jobId);
        long remaining = currentCacheBytes - evicted.getInMemorySize();
        currentCacheBytes = Math.max(0L, remaining);
    }
    evicted.close();
}
private void applyTextState(PDPageContentStream contentStream, PdfJsonTextElement element)
throws IOException {
if (element.getCharacterSpacing() != null) {
@ -5311,6 +5478,8 @@ public class PdfJsonConversionService {
*/
private static class CachedPdfDocument {
private final byte[] pdfBytes;
private final TempFile pdfTempFile;
private final long pdfSize;
private final PdfJsonDocumentMetadata metadata;
private final Map<String, PdfJsonFont> fonts; // Font map with UIDs for consistency
private final Map<Integer, Map<PDFont, String>> pageFontResources; // Page font resources
@ -5318,10 +5487,14 @@ public class PdfJsonConversionService {
public CachedPdfDocument(
byte[] pdfBytes,
TempFile pdfTempFile,
long pdfSize,
PdfJsonDocumentMetadata metadata,
Map<String, PdfJsonFont> fonts,
Map<Integer, Map<PDFont, String>> pageFontResources) {
this.pdfBytes = pdfBytes;
this.pdfTempFile = pdfTempFile;
this.pdfSize = pdfSize;
this.metadata = metadata;
// Create defensive copies to prevent mutation of shared maps
this.fonts =
@ -5336,8 +5509,14 @@ public class PdfJsonConversionService {
}
// Getters return defensive copies to prevent external mutation
public byte[] getPdfBytes() {
return pdfBytes;
/**
 * Returns the cached PDF content.
 *
 * <p>For an in-memory entry this is the stored array itself (not a copy). For a disk-backed entry
 * the temp file is read in full on every call.
 *
 * @throws IOException if the temp file cannot be read, or if the entry has neither in-memory
 *     bytes nor a temp file
 */
public byte[] getPdfBytes() throws IOException {
if (pdfBytes != null) {
return pdfBytes;
}
if (pdfTempFile != null) {
return Files.readAllBytes(pdfTempFile.getPath());
}
throw new IOException("Cached PDF backing missing");
}
public PdfJsonDocumentMetadata getMetadata() {
@ -5352,6 +5531,18 @@ public class PdfJsonConversionService {
return new java.util.concurrent.ConcurrentHashMap<>(pageFontResources);
}
/** Returns the PDF size recorded at construction, regardless of where the bytes are held. */
public long getPdfSize() {
return pdfSize;
}
/** Returns the length of the in-memory byte array, or 0 when no bytes are held in memory. */
public long getInMemorySize() {
return pdfBytes != null ? pdfBytes.length : 0L;
}
/** Returns {@code true} when the entry holds no in-memory bytes but does have a temp file. */
public boolean isDiskBacked() {
return pdfBytes == null && pdfTempFile != null;
}
/** Returns this entry's {@code timestamp} field. */
public long getTimestamp() {
return timestamp;
}
@ -5363,7 +5554,14 @@ public class PdfJsonConversionService {
public CachedPdfDocument withUpdatedFonts(
byte[] nextBytes, Map<String, PdfJsonFont> nextFonts) {
Map<String, PdfJsonFont> fontsToUse = nextFonts != null ? nextFonts : this.fonts;
return new CachedPdfDocument(nextBytes, metadata, fontsToUse, pageFontResources);
return new CachedPdfDocument(
nextBytes, null, nextBytes != null ? nextBytes.length : 0, metadata, fontsToUse, pageFontResources);
}
/** Closes the backing temp file, if there is one; does nothing for in-memory entries. */
public void close() {
if (pdfTempFile != null) {
pdfTempFile.close();
}
}
}
@ -5444,14 +5642,15 @@ public class PdfJsonConversionService {
// Cache PDF bytes, metadata, and fonts for lazy page loading
if (jobId != null) {
CachedPdfDocument cached =
new CachedPdfDocument(pdfBytes, docMetadata, fonts, pageFontResources);
documentCache.put(jobId, cached);
buildCachedDocument(jobId, pdfBytes, docMetadata, fonts, pageFontResources);
putCachedDocument(jobId, cached);
log.debug(
"Cached PDF bytes ({} bytes, {} pages, {} fonts) for lazy loading, jobId: {}",
pdfBytes.length,
"Cached PDF bytes ({} bytes, {} pages, {} fonts) for lazy loading, jobId: {} (diskBacked={})",
cached.getPdfSize(),
totalPages,
fonts.size(),
jobId);
jobId,
cached.isDiskBacked());
// Schedule cleanup after 30 minutes
scheduleDocumentCleanup(jobId);
@ -5466,9 +5665,10 @@ public class PdfJsonConversionService {
/** Extracts a single page from cached PDF bytes. Re-loads the PDF for each request. */
public byte[] extractSinglePage(String jobId, int pageNumber) throws IOException {
CachedPdfDocument cached = documentCache.get(jobId);
CachedPdfDocument cached = getCachedDocument(jobId);
if (cached == null) {
throw new IllegalArgumentException("No cached document found for jobId: " + jobId);
throw new stirling.software.SPDF.exception.CacheUnavailableException(
"No cached document found for jobId: " + jobId);
}
int pageIndex = pageNumber - 1;
@ -5480,8 +5680,8 @@ public class PdfJsonConversionService {
}
log.debug(
"Loading PDF from bytes ({} bytes) to extract page {} (jobId: {})",
cached.getPdfBytes().length,
"Loading PDF from {} to extract page {} (jobId: {})",
cached.isDiskBacked() ? "disk cache" : "memory cache",
pageNumber,
jobId);
@ -5627,9 +5827,10 @@ public class PdfJsonConversionService {
if (jobId == null || jobId.isBlank()) {
throw new IllegalArgumentException("jobId is required for incremental export");
}
CachedPdfDocument cached = documentCache.get(jobId);
CachedPdfDocument cached = getCachedDocument(jobId);
if (cached == null) {
throw new IllegalArgumentException("No cached document available for jobId: " + jobId);
throw new stirling.software.SPDF.exception.CacheUnavailableException(
"No cached document available for jobId: " + jobId);
}
if (updates == null || updates.getPages() == null || updates.getPages().isEmpty()) {
log.debug(
@ -5709,7 +5910,14 @@ public class PdfJsonConversionService {
document.save(baos);
byte[] updatedBytes = baos.toByteArray();
documentCache.put(jobId, cached.withUpdatedFonts(updatedBytes, mergedFonts));
CachedPdfDocument updated =
buildCachedDocument(
jobId,
updatedBytes,
cached.getMetadata(),
mergedFonts,
cached.getPageFontResources());
putCachedDocument(jobId, updated);
// Clear Type3 cache entries for this incremental update
clearType3CacheEntriesForJob(updateJobId);
@ -5724,11 +5932,13 @@ public class PdfJsonConversionService {
/** Clears a cached document. */
public void clearCachedDocument(String jobId) {
CachedPdfDocument cached = documentCache.remove(jobId);
CachedPdfDocument cached = getCachedDocument(jobId);
removeCachedDocument(jobId);
if (cached != null) {
log.debug(
"Removed cached PDF bytes ({} bytes) for jobId: {}",
cached.getPdfBytes().length,
"Removed cached PDF ({} bytes, diskBacked={}) for jobId: {}",
cached.getPdfSize(),
cached.isDiskBacked(),
jobId);
}

View File

@ -312,12 +312,26 @@ public class PdfJsonFallbackFontService {
"ttf")));
private final ResourceLoader resourceLoader;
private final stirling.software.common.model.ApplicationProperties applicationProperties;
@Value("${stirling.pdf.fallback-font:" + DEFAULT_FALLBACK_FONT_LOCATION + "}")
private String legacyFallbackFontLocation;
private String fallbackFontLocation;
private final Map<String, byte[]> fallbackFontCache = new ConcurrentHashMap<>();
/**
 * Resolves the fallback font location after injection. A non-blank value from the editor settings
 * wins; otherwise the value of {@code legacyFallbackFontLocation} is used.
 */
@jakarta.annotation.PostConstruct
private void loadConfig() {
    // Tolerate a missing editor section instead of failing bean initialisation with an NPE.
    stirling.software.common.model.ApplicationProperties.PdfEditor editor =
            applicationProperties.getPdfEditor();
    String configured = editor != null ? editor.getFallbackFont() : null;
    // NOTE(review): the editor setting has a non-blank built-in default, so the legacy value is
    // presumably only reached when the editor setting is explicitly blank - confirm intended.
    if (configured != null && !configured.isBlank()) {
        fallbackFontLocation = configured;
    } else {
        fallbackFontLocation = legacyFallbackFontLocation;
    }
    log.info("Using fallback font location: {}", fallbackFontLocation);
}
/**
 * Builds the fallback font model by delegating to the overload that takes a font id, passing
 * {@code FALLBACK_FONT_ID}.
 *
 * @throws IOException if the delegate fails with an I/O error
 */
public PdfJsonFont buildFallbackFontModel() throws IOException {
return buildFallbackFontModel(FALLBACK_FONT_ID);
}

View File

@ -25,22 +25,18 @@ import stirling.software.common.util.TempFileManager;
public class PdfJsonFontService {
private final TempFileManager tempFileManager;
private final stirling.software.common.model.ApplicationProperties applicationProperties;
@Getter
@Value("${stirling.pdf.json.cff-converter.enabled:true}")
private boolean cffConversionEnabled;
@Getter
@Value("${stirling.pdf.json.cff-converter.method:python}")
private String cffConverterMethod;
@Value("${stirling.pdf.json.cff-converter.python-command:/opt/venv/bin/python3}")
private String pythonCommand;
@Value("${stirling.pdf.json.cff-converter.python-script:/scripts/convert_cff_to_ttf.py}")
private String pythonScript;
@Value("${stirling.pdf.json.cff-converter.fontforge-command:fontforge}")
private String fontforgeCommand;
private volatile boolean pythonCffConverterAvailable;
@ -48,6 +44,7 @@ public class PdfJsonFontService {
@PostConstruct
private void initialiseCffConverterAvailability() {
loadConfiguration();
if (!cffConversionEnabled) {
log.warn("[FONT-DEBUG] CFF conversion is DISABLED in configuration");
pythonCffConverterAvailable = false;
@ -77,6 +74,15 @@ public class PdfJsonFontService {
log.info("[FONT-DEBUG] Selected CFF converter method: {}", cffConverterMethod);
}
/**
 * Copies the CFF converter settings (enabled flag, method, Python command and script, FontForge
 * command) from the application properties into this service's fields.
 */
private void loadConfiguration() {
    stirling.software.common.model.ApplicationProperties.PdfEditor.CffConverter converter =
            applicationProperties.getPdfEditor().getCffConverter();
    cffConversionEnabled = converter.isEnabled();
    cffConverterMethod = converter.getMethod();
    pythonCommand = converter.getPythonCommand();
    pythonScript = converter.getPythonScript();
    fontforgeCommand = converter.getFontforgeCommand();
}
public byte[] convertCffProgramToTrueType(byte[] fontBytes, String toUnicode) {
if (!cffConversionEnabled || fontBytes == null || fontBytes.length == 0) {
log.warn(

View File

@ -23,8 +23,8 @@ import stirling.software.SPDF.service.pdfjson.type3.library.Type3FontLibraryPayl
public class Type3LibraryStrategy implements Type3ConversionStrategy {
private final Type3FontLibrary fontLibrary;
private final stirling.software.common.model.ApplicationProperties applicationProperties;
@Value("${stirling.pdf.json.type3.library.enabled:true}")
private boolean enabled;
@Override
@ -42,6 +42,12 @@ public class Type3LibraryStrategy implements Type3ConversionStrategy {
return enabled && fontLibrary != null && fontLibrary.isLoaded();
}
/** Reads the Type3 library enabled flag from the application properties after injection. */
@jakarta.annotation.PostConstruct
private void loadConfiguration() {
    enabled = applicationProperties.getPdfEditor().getType3().getLibrary().isEnabled();
}
@Override
public PdfJsonFontConversionCandidate convert(
Type3ConversionRequest request, Type3GlyphContext context) throws IOException {

View File

@ -34,8 +34,8 @@ public class Type3FontLibrary {
private final ObjectMapper objectMapper;
private final ResourceLoader resourceLoader;
private final stirling.software.common.model.ApplicationProperties applicationProperties;
@Value("${stirling.pdf.json.type3.library.index:classpath:/type3/library/index.json}")
private String indexLocation;
private final Map<String, Type3FontLibraryEntry> signatureIndex = new ConcurrentHashMap<>();
@ -44,6 +44,8 @@ public class Type3FontLibrary {
@jakarta.annotation.PostConstruct
void initialise() {
this.indexLocation =
applicationProperties.getPdfEditor().getType3().getLibrary().getIndex();
Resource resource = resourceLoader.getResource(indexLocation);
if (!resource.exists()) {
log.info("[TYPE3] Library index {} not found; Type3 library disabled", indexLocation);

View File

@ -178,23 +178,6 @@ system:
databaseBackup:
cron: '0 0 0 * * ?' # Cron expression for automatic database backups "0 0 0 * * ?" daily at midnight
stirling:
pdf:
fallback-font: classpath:/static/fonts/NotoSans-Regular.ttf # Override to point at a custom fallback font
json:
font-normalization:
enabled: false # IMPORTANT: Disable to preserve ToUnicode CMaps for correct font rendering. Ghostscript strips Unicode mappings from CID fonts.
cff-converter:
enabled: true # Wrap CFF/Type1C fonts as OpenType-CFF for browser compatibility
method: python # Converter method: 'python' (fontTools, recommended - wraps as OTF), 'fontforge' (legacy - converts to TTF, may hang on CID fonts)
python-command: /opt/venv/bin/python3 # Python interpreter path
python-script: /scripts/convert_cff_to_ttf.py # Path to font wrapping script
fontforge-command: fontforge # Override if FontForge is installed under a different name/path
type3:
library:
enabled: true # Match common Type3 fonts against the built-in library of converted programs
index: classpath:/type3/library/index.json # Override to point at a custom index.json (supports http:, file:, classpath:)
ui:
appNameNavbar: '' # name displayed on the navigation bar
logoStyle: classic # Options: 'classic' (default - classic S icon) or 'modern' (minimalist logo)
@ -236,3 +219,21 @@ processExecutor:
qpdfTimeoutMinutes: 30
ghostscriptTimeoutMinutes: 30
ocrMyPdfTimeoutMinutes: 30
pdfEditor:
fallback-font: classpath:/static/fonts/NotoSans-Regular.ttf # Override to point at a custom fallback font
cache:
max-bytes: -1 # Max in-memory cache size in bytes; any value <= 0 disables the byte cap
max-percent: 20 # Max in-memory cache as % of JVM max memory; used only when max-bytes <= 0 (if both are <= 0 the cache is unlimited)
font-normalization:
enabled: false # IMPORTANT: Disable to preserve ToUnicode CMaps for correct font rendering. Ghostscript strips Unicode mappings from CID fonts.
cff-converter:
enabled: true # Wrap CFF/Type1C fonts as OpenType-CFF for browser compatibility
method: python # Converter method: 'python' (fontTools, recommended - wraps as OTF), 'fontforge' (legacy - converts to TTF, may hang on CID fonts)
python-command: /opt/venv/bin/python3 # Python interpreter path
python-script: /scripts/convert_cff_to_ttf.py # Path to font wrapping script
fontforge-command: fontforge # Override if FontForge is installed under a different name/path
type3:
library:
enabled: true # Match common Type3 fonts against the built-in library of converted programs
index: classpath:/type3/library/index.json # Override to point at a custom index.json (supports http:, file:, classpath:)

View File

@ -238,6 +238,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
const originalImagesRef = useRef<PdfJsonImageElement[][]>([]);
const originalGroupsRef = useRef<TextGroup[][]>([]);
const imagesByPageRef = useRef<PdfJsonImageElement[][]>([]);
const lastLoadedFileRef = useRef<File | null>(null);
const autoLoadKeyRef = useRef<string | null>(null);
const sourceFileIdRef = useRef<string | null>(null);
const loadRequestIdRef = useRef(0);
@ -251,6 +252,8 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
const pagePreviewsRef = useRef<Map<number, string>>(pagePreviews);
const previewScaleRef = useRef<Map<number, number>>(new Map());
const cachedJobIdRef = useRef<string | null>(null);
const cacheRecoveryInProgressRef = useRef(false);
const recoverCacheAndReloadRef = useRef<() => Promise<boolean>>(async () => false);
// Keep ref in sync with state for access in async callbacks
useEffect(() => {
@ -279,6 +282,13 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
};
}, []);
// Returns true when a request failed because the server-side document cache entry is gone:
// HTTP 410 with error code 'cache_unavailable'.
const isCacheUnavailableError = useCallback((error: any): boolean => {
  const response = error?.response;
  if (response?.status !== 410) {
    return false;
  }
  let data = response.data;
  // Requests made with responseType 'blob'/'arraybuffer' deliver the error body as binary,
  // which cannot be inspected synchronously; without this check such a 410 is never recognised.
  // NOTE(review): assumes the editor endpoints use 410 only for cache misses - confirm.
  const isBinaryBody =
    (typeof Blob !== 'undefined' && data instanceof Blob) ||
    (typeof ArrayBuffer !== 'undefined' && data instanceof ArrayBuffer);
  if (isBinaryBody) {
    return true;
  }
  if (typeof data === 'string') {
    // The body may arrive unparsed (e.g. when no JSON content type was declared).
    try {
      data = JSON.parse(data);
    } catch {
      data = undefined;
    }
  }
  const code = data && typeof data === 'object' ? data.error || data.code : undefined;
  return code === 'cache_unavailable';
}, []);
const dirtyPages = useMemo(
() => getDirtyPages(groupsByPage, imagesByPage, originalGroupsRef.current, originalImagesRef.current),
[groupsByPage, imagesByPage],
@ -316,6 +326,8 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
loadedImagePagesRef.current = new Set();
loadingImagePagesRef.current = new Set();
setSelectedPage(0);
setIsLazyMode(false);
setCachedJobId(null);
return;
}
const cloned = deepCloneDocument(document);
@ -404,7 +416,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
// Load images for a page in lazy mode
const loadImagesForPage = useCallback(
async (pageIndex: number) => {
async (pageIndex: number, fromRecovery = false) => {
if (!isLazyMode) {
return;
}
@ -489,6 +501,12 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
);
} catch (error) {
console.error(`[loadImagesForPage] Failed to load images for page ${pageNumber}:`, error);
if (!fromRecovery && isCacheUnavailableError(error)) {
const recovered = await recoverCacheAndReloadRef.current();
if (recovered) {
return loadImagesForPage(pageIndex, true);
}
}
} finally {
loadingImagePagesRef.current.delete(pageIndex);
setLoadingImagePages((prev) => {
@ -498,7 +516,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
});
}
},
[isLazyMode, cachedJobId],
[isLazyMode, cachedJobId, isCacheUnavailableError],
);
const handleLoadFile = useCallback(
@ -507,6 +525,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
return;
}
lastLoadedFileRef.current = file;
const requestId = loadRequestIdRef.current + 1;
loadRequestIdRef.current = requestId;
@ -555,59 +574,35 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
message: 'Starting conversion...',
});
let jobComplete = false;
let attempts = 0;
const maxAttempts = 600;
let jobComplete = false;
let attempts = 0;
const maxAttempts = 600;
let pollDelay = 500;
while (!jobComplete && attempts < maxAttempts) {
await new Promise((resolve) => setTimeout(resolve, 1000));
attempts += 1;
while (!jobComplete && attempts < maxAttempts) {
await new Promise((resolve) => setTimeout(resolve, pollDelay));
attempts += 1;
if (pollDelay < 10000) {
pollDelay = Math.min(10000, Math.floor(pollDelay * 1.5));
}
try {
const statusResponse = await apiClient.get(`/api/v1/general/job/${jobId}`);
const jobStatus = statusResponse.data;
console.log(`Job status (attempt ${attempts}):`, jobStatus);
if (jobStatus.notes && jobStatus.notes.length > 0) {
const lastNote = jobStatus.notes[jobStatus.notes.length - 1];
console.log('Latest note:', lastNote);
const matchWithCount = lastNote.match(
/\[(\d+)%\]\s+(\w+):\s+(.+?)\s+\((\d+)\/(\d+)\)/,
);
if (matchWithCount) {
const percent = parseInt(matchWithCount[1], 10);
const stage = matchWithCount[2];
const message = matchWithCount[3];
const current = parseInt(matchWithCount[4], 10);
const total = parseInt(matchWithCount[5], 10);
setConversionProgress({
percent,
stage,
message,
current,
total,
});
} else {
const match = lastNote.match(/\[(\d+)%\]\s+(\w+):\s+(.+)/);
if (match) {
const percent = parseInt(match[1], 10);
const stage = match[2];
const message = match[3];
setConversionProgress({
percent,
stage,
message,
});
}
}
} else if (jobStatus.progress !== undefined) {
const percent = Math.min(Math.max(jobStatus.progress, 0), 100);
setConversionProgress({
percent,
stage: jobStatus.stage || 'processing',
message: jobStatus.note || 'Converting PDF to JSON...',
});
}
const percent = Math.min(Math.max(jobStatus.progress ?? 0, 0), 100);
const stage = jobStatus.stage || 'processing';
const message = jobStatus.note || 'Converting PDF to JSON...';
const current = jobStatus.current ?? undefined;
const total = jobStatus.total ?? undefined;
setConversionProgress({
percent,
stage,
message,
current,
total,
});
if (jobStatus.complete) {
if (jobStatus.error) {
@ -719,6 +714,8 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
setLoadedDocument(null);
resetToDocument(null, groupingMode);
clearPdfPreview();
setIsLazyMode(false);
setCachedJobId(null);
if (isPdf) {
const errorMsg =
@ -743,6 +740,55 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
[groupingMode, resetToDocument, t],
);
// Attempts to recover from a server-side cache miss by re-running the load of the last file
// passed to handleLoadFile. Resolves to true when the reload call completed without throwing,
// false when recovery was skipped or failed (an error message is set in the latter cases).
const recoverCacheAndReload = useCallback(async () => {
// Only one recovery at a time; concurrent callers are told recovery did not happen.
if (cacheRecoveryInProgressRef.current) {
return false;
}
// The attempt counter lives as an ad-hoc property on the ref object itself (not on .current).
// NOTE(review): nothing in this block resets it, so the limit of 2 appears to apply for the
// component's whole lifetime - confirm that is intended.
if ((recoverCacheAndReloadRef as any).attempts === undefined) {
(recoverCacheAndReloadRef as any).attempts = 0;
}
if ((recoverCacheAndReloadRef as any).attempts >= 2) {
setErrorMessage(
t(
'pdfTextEditor.errors.cacheRecoveryLimit',
'Cache was unavailable after multiple attempts. Please reload the file manually.',
),
);
return false;
}
(recoverCacheAndReloadRef as any).attempts += 1;
const file = lastLoadedFileRef.current;
// Without the original File there is nothing to re-upload.
if (!file) {
setErrorMessage(
t(
'pdfTextEditor.errors.cacheMissingFile',
'Session expired. Please reload the PDF file to continue.',
),
);
return false;
}
cacheRecoveryInProgressRef.current = true;
try {
// NOTE(review): if handleLoadFile handles its own failures instead of rethrowing, this
// returns true even when the reload did not succeed - verify against handleLoadFile.
await handleLoadFile(file);
return true;
} catch (error) {
console.error('[PdfTextEditor] Cache recovery failed', error);
setErrorMessage(
t(
'pdfTextEditor.errors.cacheReloadFailed',
'Cache expired and reload failed. Please reselect the file.',
),
);
return false;
} finally {
cacheRecoveryInProgressRef.current = false;
}
}, [handleLoadFile, t]);
// Keep the ref pointing at the latest recoverCacheAndReload so callbacks that read the ref can
// call it without listing it in their dependency arrays.
useEffect(() => {
recoverCacheAndReloadRef.current = recoverCacheAndReload;
}, [recoverCacheAndReload]);
// Wrapper for loading files from the dropzone - adds to workbench first
const handleLoadFileFromDropzone = useCallback(
async (file: File) => {
@ -1054,10 +1100,11 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
if (canUseIncremental) {
await ensureImagesForPages(dirtyPageIndices);
try {
let incrementalRetried = false;
const attemptIncrementalExport = async () => {
const payload = buildPayload();
if (!payload) {
return;
return false;
}
const { document, filename } = payload;
@ -1076,7 +1123,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
const baseName = sanitizeBaseName(filename).replace(/-edited$/u, '');
const expectedName = `${baseName || 'document'}.pdf`;
const response = await apiClient.post(
`/api/v1/convert/pdf/text-editor/partial/${cachedJobId}?filename=${encodeURIComponent(expectedName)}`,
`/api/v1/convert/pdf/text-editor/partial/${cachedJobIdRef.current}?filename=${encodeURIComponent(expectedName)}`,
partialDocument,
{
responseType: 'blob',
@ -1094,8 +1141,26 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
onComplete([pdfFile]);
}
setErrorMessage(null);
return;
return true;
};
try {
const success = await attemptIncrementalExport();
if (success) {
return;
}
} catch (incrementalError) {
if (!incrementalRetried && isCacheUnavailableError(incrementalError)) {
const recovered = await recoverCacheAndReloadRef.current();
incrementalRetried = true;
if (recovered) {
await ensureImagesForPages(dirtyPageIndices);
const success = await attemptIncrementalExport();
if (success) {
return;
}
}
}
console.warn(
'[handleGeneratePdf] Incremental export failed, falling back to full export',
incrementalError,