diff --git a/app/common/src/main/java/stirling/software/common/model/ApplicationProperties.java b/app/common/src/main/java/stirling/software/common/model/ApplicationProperties.java
index 35ddddeaa..56a9b05af 100644
--- a/app/common/src/main/java/stirling/software/common/model/ApplicationProperties.java
+++ b/app/common/src/main/java/stirling/software/common/model/ApplicationProperties.java
@@ -68,6 +68,7 @@ public class ApplicationProperties {
 
     private AutoPipeline autoPipeline = new AutoPipeline();
     private ProcessExecutor processExecutor = new ProcessExecutor();
+    private PdfEditor pdfEditor = new PdfEditor(); // mirrors the `pdfEditor:` section of settings.yml.template
 
     @Bean
     public PropertySource dynamicYamlPropertySource(ConfigurableEnvironment environment)
@@ -100,6 +101,46 @@ public class ApplicationProperties {
         private String outputFolder;
     }
 
+    @Data
+    public static class PdfEditor { // settings read by the PDF-to-JSON conversion and font services
+        private Cache cache = new Cache(); // in-memory budget for cached PDF bytes
+        private FontNormalization fontNormalization = new FontNormalization(); // read by PdfJsonConversionService
+        private CffConverter cffConverter = new CffConverter(); // read by PdfJsonFontService
+        private Type3 type3 = new Type3(); // Type3 font library settings
+        private String fallbackFont = "classpath:/static/fonts/NotoSans-Regular.ttf"; // read by PdfJsonFallbackFontService
+
+        @Data
+        public static class Cache {
+            private long maxBytes = -1; // byte cap for the in-memory cache; values <= 0 defer to maxPercent
+            private int maxPercent = 20; // percent of Runtime.maxMemory() used when maxBytes <= 0
+        }
+
+        @Data
+        public static class FontNormalization {
+            private boolean enabled = false; // off by default; the template warns Ghostscript strips Unicode mappings
+        }
+
+        @Data
+        public static class CffConverter {
+            private boolean enabled = true; // when false, PdfJsonFontService reports CFF conversion as disabled
+            private String method = "python"; // 'python' or 'fontforge' per settings.yml.template
+            private String pythonCommand = "/opt/venv/bin/python3"; // Python interpreter path
+            private String pythonScript = "/scripts/convert_cff_to_ttf.py"; // path to the font wrapping script
+            private String fontforgeCommand = "fontforge"; // FontForge executable name or path
+        }
+
+        @Data
+        public static class Type3 {
+            private Library library = new Library(); // built-in library of converted Type3 programs
+
+            @Data
+            public static class Library {
+                private boolean enabled = true; // gates Type3LibraryStrategy
+                private String index = "classpath:/type3/library/index.json"; // resource location loaded by Type3FontLibrary
+            }
+        }
+    }
+
     @Data
     public static class Legal {
         private String termsAndConditions;
diff --git 
a/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPdfJsonExceptionHandler.java b/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPdfJsonExceptionHandler.java
new file mode 100644
index 000000000..cb753b2e3
--- /dev/null
+++ b/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPdfJsonExceptionHandler.java
@@ -0,0 +1,64 @@
+package stirling.software.SPDF.controller.api.converters;
+
+import java.nio.charset.StandardCharsets;
+import java.util.Map;
+
+import org.springframework.http.HttpStatus;
+import org.springframework.http.MediaType;
+import org.springframework.http.ResponseEntity;
+import org.springframework.web.bind.annotation.ControllerAdvice;
+import org.springframework.web.bind.annotation.ExceptionHandler;
+import org.springframework.web.bind.annotation.ResponseBody;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+
+import stirling.software.SPDF.exception.CacheUnavailableException;
+
+/**
+ * Maps {@link CacheUnavailableException} raised by {@link ConvertPdfJsonController} to an HTTP 410
+ * (Gone) response carrying a small JSON body ({@code error}, {@code action}, {@code message}).
+ */
+@ControllerAdvice(assignableTypes = ConvertPdfJsonController.class)
+@Slf4j
+@RequiredArgsConstructor
+public class ConvertPdfJsonExceptionHandler {
+
+    /** Constant last-resort body; it embeds no dynamic text, so it is always well-formed JSON. */
+    private static final String FALLBACK_JSON =
+            "{\"error\":\"cache_unavailable\",\"action\":\"reupload\",\"message\":\"\"}";
+
+    private final ObjectMapper objectMapper;
+
+    /**
+     * Builds the 410 response returned when a job's cached document is no longer available.
+     *
+     * @param ex the exception describing the missing cache entry
+     * @return a 410 response with an {@code application/json} body encoded as UTF-8
+     */
+    @ExceptionHandler(CacheUnavailableException.class)
+    @ResponseBody
+    public ResponseEntity<byte[]> handleCacheUnavailable(CacheUnavailableException ex) {
+        // Map.of rejects null values, so normalise a missing message before serialising.
+        String message = ex.getMessage() != null ? ex.getMessage() : "";
+        byte[] body;
+        try {
+            body =
+                    objectMapper.writeValueAsBytes(
+                            Map.of(
+                                    "error", "cache_unavailable",
+                                    "action", "reupload",
+                                    "message", message));
+        } catch (Exception e) {
+            // Do not splice the message into hand-written JSON: quotes, backslashes or control
+            // characters in it would yield an invalid document.
+            log.warn("Failed to serialize cache_unavailable response: {}", e.getMessage(), e);
+            body = FALLBACK_JSON.getBytes(StandardCharsets.UTF_8);
+        }
+        return ResponseEntity.status(HttpStatus.GONE)
+                .contentType(MediaType.APPLICATION_JSON)
+                .body(body);
+    }
+}
diff --git a/app/core/src/main/java/stirling/software/SPDF/exception/CacheUnavailableException.java 
b/app/core/src/main/java/stirling/software/SPDF/exception/CacheUnavailableException.java new file mode 100644 index 000000000..fd5d77677 --- /dev/null +++ b/app/core/src/main/java/stirling/software/SPDF/exception/CacheUnavailableException.java @@ -0,0 +1,8 @@ +package stirling.software.SPDF.exception; + +public class CacheUnavailableException extends RuntimeException { + + public CacheUnavailableException(String message) { + super(message); + } +} diff --git a/app/core/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java b/app/core/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java index 623b99260..53451298c 100644 --- a/app/core/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java +++ b/app/core/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java @@ -144,14 +144,21 @@ public class PdfJsonConversionService { private final PdfJsonFontService fontService; private final Type3FontConversionService type3FontConversionService; private final Type3GlyphExtractor type3GlyphExtractor; + private final stirling.software.common.model.ApplicationProperties applicationProperties; private final Map type3NormalizedFontCache = new ConcurrentHashMap<>(); private final Map> type3GlyphCoverageCache = new ConcurrentHashMap<>(); - @Value("${stirling.pdf.json.font-normalization.enabled:true}") private boolean fontNormalizationEnabled; + private long cacheMaxBytes; + private int cacheMaxPercent; /** Cache for storing PDDocuments for lazy page loading. Key is jobId. 
*/ private final Map documentCache = new ConcurrentHashMap<>(); + private final java.util.LinkedHashMap lruCache = + new java.util.LinkedHashMap<>(16, 0.75f, true); + private final Object cacheLock = new Object(); + private volatile long currentCacheBytes = 0L; + private volatile long cacheBudgetBytes = -1L; private volatile boolean ghostscriptAvailable; @@ -161,7 +168,23 @@ public class PdfJsonConversionService { @PostConstruct private void initializeToolAvailability() { + loadConfigurationFromProperties(); initializeGhostscriptAvailability(); + initializeCacheBudget(); + } + + private void loadConfigurationFromProperties() { + stirling.software.common.model.ApplicationProperties.PdfEditor cfg = + applicationProperties.getPdfEditor(); + if (cfg != null) { + fontNormalizationEnabled = cfg.getFontNormalization().isEnabled(); + cacheMaxBytes = cfg.getCache().getMaxBytes(); + cacheMaxPercent = cfg.getCache().getMaxPercent(); + } else { + fontNormalizationEnabled = false; + cacheMaxBytes = -1; + cacheMaxPercent = 20; + } } private void initializeGhostscriptAvailability() { @@ -202,6 +225,25 @@ public class PdfJsonConversionService { } } + private void initializeCacheBudget() { + long effective = -1L; + if (cacheMaxBytes > 0) { + effective = cacheMaxBytes; + } else if (cacheMaxPercent > 0) { + long maxMem = Runtime.getRuntime().maxMemory(); + effective = Math.max(0L, (maxMem * cacheMaxPercent) / 100); + } + cacheBudgetBytes = effective; + if (cacheBudgetBytes > 0) { + log.info( + "PDF JSON cache budget configured: {} bytes (source: {})", + cacheBudgetBytes, + cacheMaxBytes > 0 ? 
"max-bytes" : "max-percent"); + } else { + log.info("PDF JSON cache budget: unlimited"); + } + } + public byte[] convertPdfToJson(MultipartFile file) throws IOException { return convertPdfToJson(file, null, false); } @@ -318,9 +360,9 @@ public class PdfJsonConversionService { try (PDDocument document = pdfDocumentFactory.load(workingPath, true)) { int totalPages = document.getNumberOfPages(); - // Only use lazy images for real async jobs where client can access the cache - // Synchronous calls with synthetic jobId should do full extraction - boolean useLazyImages = totalPages > 5 && isRealJobId; + // Always enable lazy mode for real async jobs so cache is available regardless of + // page count. Synchronous calls with synthetic jobId still do full extraction. + boolean useLazyImages = isRealJobId; Map fontCache = new IdentityHashMap<>(); Map imageCache = new IdentityHashMap<>(); log.debug( @@ -435,15 +477,16 @@ public class PdfJsonConversionService { cachedPdfBytes = Files.readAllBytes(workingPath); } CachedPdfDocument cached = - new CachedPdfDocument( - cachedPdfBytes, docMetadata, fonts, pageFontResources); - documentCache.put(jobId, cached); + buildCachedDocument( + jobId, cachedPdfBytes, docMetadata, fonts, pageFontResources); + putCachedDocument(jobId, cached); log.debug( - "Cached PDF bytes ({} bytes, {} pages, {} fonts) for lazy images, jobId: {}", - cachedPdfBytes.length, + "Cached PDF bytes ({} bytes, {} pages, {} fonts) for lazy images, jobId: {} (diskBacked={})", + cached.getPdfSize(), totalPages, fonts.size(), - jobId); + jobId, + cached.isDiskBacked()); scheduleDocumentCleanup(jobId); } @@ -2973,6 +3016,130 @@ public class PdfJsonConversionService { } } + // Cache helpers + private CachedPdfDocument buildCachedDocument( + String jobId, + byte[] pdfBytes, + PdfJsonDocumentMetadata metadata, + Map fonts, + Map> pageFontResources) + throws IOException { + if (pdfBytes == null) { + throw new IllegalArgumentException("pdfBytes must not be null"); + } + long 
budget = cacheBudgetBytes;
+        // If single document is larger than budget, spill straight to disk
+        if (budget > 0 && pdfBytes.length > budget) {
+            TempFile tempFile = new TempFile(tempFileManager, ".pdfjsoncache");
+            Files.write(tempFile.getPath(), pdfBytes);
+            log.debug(
+                    "Cached PDF spilled to disk ({} bytes exceeds budget {}) for jobId {}",
+                    pdfBytes.length,
+                    budget,
+                    jobId);
+            return new CachedPdfDocument(
+                    null, tempFile, pdfBytes.length, metadata, fonts, pageFontResources);
+        }
+        return new CachedPdfDocument(
+                pdfBytes, null, pdfBytes.length, metadata, fonts, pageFontResources);
+    }
+
+    /**
+     * Stores {@code cached} under {@code jobId} in both the lookup map and the LRU order map.
+     * If an entry already existed for the job, its in-memory bytes are subtracted from the
+     * running total and its temp file (if any) is released before the new entry is counted.
+     *
+     * @param jobId key the document is cached under
+     * @param cached document to store; only its in-memory size counts against the budget
+     */
+    private void putCachedDocument(String jobId, CachedPdfDocument cached) {
+        synchronized (cacheLock) {
+            CachedPdfDocument existing = documentCache.put(jobId, cached);
+            if (existing != null) {
+                lruCache.remove(jobId);
+                currentCacheBytes = Math.max(0L, currentCacheBytes - existing.getInMemorySize());
+                closeQuietly(existing.pdfTempFile);
+            }
+            lruCache.put(jobId, cached);
+            currentCacheBytes += cached.getInMemorySize();
+            enforceCacheBudget();
+        }
+    }
+
+    /**
+     * Looks up the cached document for a job and marks it as most recently used by
+     * re-inserting it at the tail of the LRU order map.
+     *
+     * @param jobId key the document was cached under
+     * @return the cached document, or {@code null} when no entry exists
+     */
+    private CachedPdfDocument getCachedDocument(String jobId) {
+        synchronized (cacheLock) {
+            CachedPdfDocument cached = documentCache.get(jobId);
+            if (cached != null) {
+                lruCache.remove(jobId);
+                lruCache.put(jobId, cached);
+            }
+            return cached;
+        }
+    }
+
+    /**
+     * Evicts least-recently-used entries until the in-memory total fits the cache budget.
+     * Entries that hold no in-memory bytes (disk-backed) are skipped: removing them cannot
+     * lower the total and would only discard a document that is still usable.
+     *
+     * <p>Invoked from {@code putCachedDocument} with {@code cacheLock} already held; the
+     * nested {@code synchronized} block is reentrant. A non-positive budget means unlimited.
+     * The loop alone restores the budget, because every counted byte belongs to an entry here.
+     */
+    private void enforceCacheBudget() {
+        if (cacheBudgetBytes <= 0) {
+            return;
+        }
+        synchronized (cacheLock) {
+            java.util.Iterator<java.util.Map.Entry<String, CachedPdfDocument>> it =
+                    lruCache.entrySet().iterator();
+            while (currentCacheBytes > cacheBudgetBytes && it.hasNext()) {
+                java.util.Map.Entry<String, CachedPdfDocument> entry = it.next();
+                // Capture key and value first; an entry must not be read after it.remove().
+                String evictedJobId = entry.getKey();
+                CachedPdfDocument removed = entry.getValue();
+                if (removed.getInMemorySize() <= 0) {
+                    // Disk-backed entry: evicting it would free no memory.
+                    continue;
+                }
+                it.remove();
+                documentCache.remove(evictedJobId, removed);
+                currentCacheBytes =
+                        Math.max(0L, currentCacheBytes - removed.getInMemorySize());
+                removed.close();
+                log.debug(
+                        "Evicted cached PDF for jobId {} to enforce cache budget", evictedJobId);
+            }
+        }
+    }
+
+    /**
+     * Removes the cache entry for a job, if present, and releases its temp-file backing.
+     * The backing is closed after {@code cacheLock} is released.
+     *
+     * @param jobId key of the entry to remove
+     */
+    private void removeCachedDocument(String jobId) {
+        CachedPdfDocument removed = null;
+        synchronized (cacheLock) {
+            removed = documentCache.remove(jobId);
+            if (removed != null) {
+                lruCache.remove(jobId);
+                currentCacheBytes = Math.max(0L, currentCacheBytes - removed.getInMemorySize());
+            }
+        }
+        if (removed != null) {
+            removed.close();
+        }
+    }
+
     private void applyTextState(PDPageContentStream contentStream, PdfJsonTextElement element)
             throws IOException {
         if (element.getCharacterSpacing() != null) {
@@ -5311,6 +5478,8 @@ public class PdfJsonConversionService {
      */
     private static class CachedPdfDocument {
         private final byte[] pdfBytes;
+        private final TempFile pdfTempFile;
+        private final long pdfSize;
         private final PdfJsonDocumentMetadata metadata;
         private final Map fonts; // Font map with UIDs for consistency
         private final Map> pageFontResources; // Page font resources
@@ -5318,10 +5487,14 @@ public class PdfJsonConversionService {
 
         public CachedPdfDocument(
                 byte[] pdfBytes,
+                TempFile pdfTempFile,
+                long pdfSize,
                 PdfJsonDocumentMetadata metadata,
                 Map fonts,
                 Map> pageFontResources) {
             this.pdfBytes = 
pdfBytes; + this.pdfTempFile = pdfTempFile; + this.pdfSize = pdfSize; this.metadata = metadata; // Create defensive copies to prevent mutation of shared maps this.fonts = @@ -5336,8 +5509,14 @@ public class PdfJsonConversionService { } // Getters return defensive copies to prevent external mutation - public byte[] getPdfBytes() { - return pdfBytes; + public byte[] getPdfBytes() throws IOException { + if (pdfBytes != null) { + return pdfBytes; + } + if (pdfTempFile != null) { + return Files.readAllBytes(pdfTempFile.getPath()); + } + throw new IOException("Cached PDF backing missing"); } public PdfJsonDocumentMetadata getMetadata() { @@ -5352,6 +5531,18 @@ public class PdfJsonConversionService { return new java.util.concurrent.ConcurrentHashMap<>(pageFontResources); } + public long getPdfSize() { + return pdfSize; + } + + public long getInMemorySize() { + return pdfBytes != null ? pdfBytes.length : 0L; + } + + public boolean isDiskBacked() { + return pdfBytes == null && pdfTempFile != null; + } + public long getTimestamp() { return timestamp; } @@ -5363,7 +5554,14 @@ public class PdfJsonConversionService { public CachedPdfDocument withUpdatedFonts( byte[] nextBytes, Map nextFonts) { Map fontsToUse = nextFonts != null ? nextFonts : this.fonts; - return new CachedPdfDocument(nextBytes, metadata, fontsToUse, pageFontResources); + return new CachedPdfDocument( + nextBytes, null, nextBytes != null ? 
nextBytes.length : 0, metadata, fontsToUse, pageFontResources); + } + + public void close() { + if (pdfTempFile != null) { + pdfTempFile.close(); + } } } @@ -5444,14 +5642,15 @@ public class PdfJsonConversionService { // Cache PDF bytes, metadata, and fonts for lazy page loading if (jobId != null) { CachedPdfDocument cached = - new CachedPdfDocument(pdfBytes, docMetadata, fonts, pageFontResources); - documentCache.put(jobId, cached); + buildCachedDocument(jobId, pdfBytes, docMetadata, fonts, pageFontResources); + putCachedDocument(jobId, cached); log.debug( - "Cached PDF bytes ({} bytes, {} pages, {} fonts) for lazy loading, jobId: {}", - pdfBytes.length, + "Cached PDF bytes ({} bytes, {} pages, {} fonts) for lazy loading, jobId: {} (diskBacked={})", + cached.getPdfSize(), totalPages, fonts.size(), - jobId); + jobId, + cached.isDiskBacked()); // Schedule cleanup after 30 minutes scheduleDocumentCleanup(jobId); @@ -5466,9 +5665,10 @@ public class PdfJsonConversionService { /** Extracts a single page from cached PDF bytes. Re-loads the PDF for each request. */ public byte[] extractSinglePage(String jobId, int pageNumber) throws IOException { - CachedPdfDocument cached = documentCache.get(jobId); + CachedPdfDocument cached = getCachedDocument(jobId); if (cached == null) { - throw new IllegalArgumentException("No cached document found for jobId: " + jobId); + throw new stirling.software.SPDF.exception.CacheUnavailableException( + "No cached document found for jobId: " + jobId); } int pageIndex = pageNumber - 1; @@ -5480,8 +5680,8 @@ public class PdfJsonConversionService { } log.debug( - "Loading PDF from bytes ({} bytes) to extract page {} (jobId: {})", - cached.getPdfBytes().length, + "Loading PDF from {} to extract page {} (jobId: {})", + cached.isDiskBacked() ? 
"disk cache" : "memory cache", pageNumber, jobId); @@ -5627,9 +5827,10 @@ public class PdfJsonConversionService { if (jobId == null || jobId.isBlank()) { throw new IllegalArgumentException("jobId is required for incremental export"); } - CachedPdfDocument cached = documentCache.get(jobId); + CachedPdfDocument cached = getCachedDocument(jobId); if (cached == null) { - throw new IllegalArgumentException("No cached document available for jobId: " + jobId); + throw new stirling.software.SPDF.exception.CacheUnavailableException( + "No cached document available for jobId: " + jobId); } if (updates == null || updates.getPages() == null || updates.getPages().isEmpty()) { log.debug( @@ -5709,7 +5910,14 @@ public class PdfJsonConversionService { document.save(baos); byte[] updatedBytes = baos.toByteArray(); - documentCache.put(jobId, cached.withUpdatedFonts(updatedBytes, mergedFonts)); + CachedPdfDocument updated = + buildCachedDocument( + jobId, + updatedBytes, + cached.getMetadata(), + mergedFonts, + cached.getPageFontResources()); + putCachedDocument(jobId, updated); // Clear Type3 cache entries for this incremental update clearType3CacheEntriesForJob(updateJobId); @@ -5724,11 +5932,13 @@ public class PdfJsonConversionService { /** Clears a cached document. 
*/
     public void clearCachedDocument(String jobId) {
-        CachedPdfDocument cached = documentCache.remove(jobId);
+        CachedPdfDocument cached = getCachedDocument(jobId);
+        removeCachedDocument(jobId);
         if (cached != null) {
             log.debug(
-                    "Removed cached PDF bytes ({} bytes) for jobId: {}",
-                    cached.getPdfBytes().length,
+                    "Removed cached PDF ({} bytes, diskBacked={}) for jobId: {}",
+                    cached.getPdfSize(),
+                    cached.isDiskBacked(),
                     jobId);
         }
 
diff --git a/app/core/src/main/java/stirling/software/SPDF/service/PdfJsonFallbackFontService.java b/app/core/src/main/java/stirling/software/SPDF/service/PdfJsonFallbackFontService.java
index 107abbe2b..1dd25fa0c 100644
--- a/app/core/src/main/java/stirling/software/SPDF/service/PdfJsonFallbackFontService.java
+++ b/app/core/src/main/java/stirling/software/SPDF/service/PdfJsonFallbackFontService.java
@@ -312,12 +312,42 @@ public class PdfJsonFallbackFontService {
                             "ttf")));
 
     private final ResourceLoader resourceLoader;
+    private final stirling.software.common.model.ApplicationProperties applicationProperties;
 
     @Value("${stirling.pdf.fallback-font:" + DEFAULT_FALLBACK_FONT_LOCATION + "}")
+    private String legacyFallbackFontLocation;
+
     private String fallbackFontLocation;
 
     private final Map fallbackFontCache = new ConcurrentHashMap<>();
 
+    /**
+     * Resolves the fallback font location once at startup. Precedence: a {@code pdfEditor}
+     * value that differs from the built-in default, then a legacy
+     * {@code stirling.pdf.fallback-font} value, then the built-in default. Because the
+     * {@code pdfEditor} setting always carries a default, it must not unconditionally shadow a
+     * legacy override from an existing installation.
+     */
+    @jakarta.annotation.PostConstruct
+    private void loadConfig() {
+        var editorConfig = applicationProperties.getPdfEditor();
+        String configured = editorConfig != null ? editorConfig.getFallbackFont() : null;
+        boolean configuredIsCustom =
+                configured != null
+                        && !configured.isBlank()
+                        && !DEFAULT_FALLBACK_FONT_LOCATION.equals(configured);
+        boolean legacyIsUsable =
+                legacyFallbackFontLocation != null && !legacyFallbackFontLocation.isBlank();
+        if (configuredIsCustom) {
+            fallbackFontLocation = configured;
+        } else if (legacyIsUsable) {
+            fallbackFontLocation = legacyFallbackFontLocation;
+        } else {
+            fallbackFontLocation = DEFAULT_FALLBACK_FONT_LOCATION;
+        }
+        log.info("Using fallback font location: {}", fallbackFontLocation);
+    }
+
     public PdfJsonFont buildFallbackFontModel() throws IOException {
         return buildFallbackFontModel(FALLBACK_FONT_ID);
     }
diff --git a/app/core/src/main/java/stirling/software/SPDF/service/pdfjson/PdfJsonFontService.java b/app/core/src/main/java/stirling/software/SPDF/service/pdfjson/PdfJsonFontService.java
index 1a9f7f698..a75e3681d 100644
--- 
a/app/core/src/main/java/stirling/software/SPDF/service/pdfjson/PdfJsonFontService.java +++ b/app/core/src/main/java/stirling/software/SPDF/service/pdfjson/PdfJsonFontService.java @@ -25,22 +25,18 @@ import stirling.software.common.util.TempFileManager; public class PdfJsonFontService { private final TempFileManager tempFileManager; + private final stirling.software.common.model.ApplicationProperties applicationProperties; @Getter - @Value("${stirling.pdf.json.cff-converter.enabled:true}") private boolean cffConversionEnabled; @Getter - @Value("${stirling.pdf.json.cff-converter.method:python}") private String cffConverterMethod; - @Value("${stirling.pdf.json.cff-converter.python-command:/opt/venv/bin/python3}") private String pythonCommand; - @Value("${stirling.pdf.json.cff-converter.python-script:/scripts/convert_cff_to_ttf.py}") private String pythonScript; - @Value("${stirling.pdf.json.cff-converter.fontforge-command:fontforge}") private String fontforgeCommand; private volatile boolean pythonCffConverterAvailable; @@ -48,6 +44,7 @@ public class PdfJsonFontService { @PostConstruct private void initialiseCffConverterAvailability() { + loadConfiguration(); if (!cffConversionEnabled) { log.warn("[FONT-DEBUG] CFF conversion is DISABLED in configuration"); pythonCffConverterAvailable = false; @@ -77,6 +74,15 @@ public class PdfJsonFontService { log.info("[FONT-DEBUG] Selected CFF converter method: {}", cffConverterMethod); } + private void loadConfiguration() { + var cfg = applicationProperties.getPdfEditor().getCffConverter(); + this.cffConversionEnabled = cfg.isEnabled(); + this.cffConverterMethod = cfg.getMethod(); + this.pythonCommand = cfg.getPythonCommand(); + this.pythonScript = cfg.getPythonScript(); + this.fontforgeCommand = cfg.getFontforgeCommand(); + } + public byte[] convertCffProgramToTrueType(byte[] fontBytes, String toUnicode) { if (!cffConversionEnabled || fontBytes == null || fontBytes.length == 0) { log.warn( diff --git 
a/app/core/src/main/java/stirling/software/SPDF/service/pdfjson/type3/Type3LibraryStrategy.java b/app/core/src/main/java/stirling/software/SPDF/service/pdfjson/type3/Type3LibraryStrategy.java index 4385e5725..51a282a48 100644 --- a/app/core/src/main/java/stirling/software/SPDF/service/pdfjson/type3/Type3LibraryStrategy.java +++ b/app/core/src/main/java/stirling/software/SPDF/service/pdfjson/type3/Type3LibraryStrategy.java @@ -23,8 +23,8 @@ import stirling.software.SPDF.service.pdfjson.type3.library.Type3FontLibraryPayl public class Type3LibraryStrategy implements Type3ConversionStrategy { private final Type3FontLibrary fontLibrary; + private final stirling.software.common.model.ApplicationProperties applicationProperties; - @Value("${stirling.pdf.json.type3.library.enabled:true}") private boolean enabled; @Override @@ -42,6 +42,12 @@ public class Type3LibraryStrategy implements Type3ConversionStrategy { return enabled && fontLibrary != null && fontLibrary.isLoaded(); } + @jakarta.annotation.PostConstruct + private void loadConfiguration() { + var cfg = applicationProperties.getPdfEditor().getType3().getLibrary(); + this.enabled = cfg.isEnabled(); + } + @Override public PdfJsonFontConversionCandidate convert( Type3ConversionRequest request, Type3GlyphContext context) throws IOException { diff --git a/app/core/src/main/java/stirling/software/SPDF/service/pdfjson/type3/library/Type3FontLibrary.java b/app/core/src/main/java/stirling/software/SPDF/service/pdfjson/type3/library/Type3FontLibrary.java index 32a6abec2..78ec84653 100644 --- a/app/core/src/main/java/stirling/software/SPDF/service/pdfjson/type3/library/Type3FontLibrary.java +++ b/app/core/src/main/java/stirling/software/SPDF/service/pdfjson/type3/library/Type3FontLibrary.java @@ -34,8 +34,8 @@ public class Type3FontLibrary { private final ObjectMapper objectMapper; private final ResourceLoader resourceLoader; + private final stirling.software.common.model.ApplicationProperties applicationProperties; - 
@Value("${stirling.pdf.json.type3.library.index:classpath:/type3/library/index.json}") private String indexLocation; private final Map signatureIndex = new ConcurrentHashMap<>(); @@ -44,6 +44,8 @@ public class Type3FontLibrary { @jakarta.annotation.PostConstruct void initialise() { + this.indexLocation = + applicationProperties.getPdfEditor().getType3().getLibrary().getIndex(); Resource resource = resourceLoader.getResource(indexLocation); if (!resource.exists()) { log.info("[TYPE3] Library index {} not found; Type3 library disabled", indexLocation); diff --git a/app/core/src/main/resources/settings.yml.template b/app/core/src/main/resources/settings.yml.template index 5a50ef903..b5d10126b 100644 --- a/app/core/src/main/resources/settings.yml.template +++ b/app/core/src/main/resources/settings.yml.template @@ -178,23 +178,6 @@ system: databaseBackup: cron: '0 0 0 * * ?' # Cron expression for automatic database backups "0 0 0 * * ?" daily at midnight -stirling: - pdf: - fallback-font: classpath:/static/fonts/NotoSans-Regular.ttf # Override to point at a custom fallback font - json: - font-normalization: - enabled: false # IMPORTANT: Disable to preserve ToUnicode CMaps for correct font rendering. Ghostscript strips Unicode mappings from CID fonts. 
- cff-converter: - enabled: true # Wrap CFF/Type1C fonts as OpenType-CFF for browser compatibility - method: python # Converter method: 'python' (fontTools, recommended - wraps as OTF), 'fontforge' (legacy - converts to TTF, may hang on CID fonts) - python-command: /opt/venv/bin/python3 # Python interpreter path - python-script: /scripts/convert_cff_to_ttf.py # Path to font wrapping script - fontforge-command: fontforge # Override if FontForge is installed under a different name/path - type3: - library: - enabled: true # Match common Type3 fonts against the built-in library of converted programs - index: classpath:/type3/library/index.json # Override to point at a custom index.json (supports http:, file:, classpath:) - ui: appNameNavbar: '' # name displayed on the navigation bar logoStyle: classic # Options: 'classic' (default - classic S icon) or 'modern' (minimalist logo) @@ -236,3 +219,21 @@ processExecutor: qpdfTimeoutMinutes: 30 ghostscriptTimeoutMinutes: 30 ocrMyPdfTimeoutMinutes: 30 + +pdfEditor: + fallback-font: classpath:/static/fonts/NotoSans-Regular.ttf # Override to point at a custom fallback font + cache: + max-bytes: -1 # Max in-memory cache size in bytes; -1 disables byte cap + max-percent: 20 # Max in-memory cache as % of JVM max; used when max-bytes <= 0 + font-normalization: + enabled: false # IMPORTANT: Disable to preserve ToUnicode CMaps for correct font rendering. Ghostscript strips Unicode mappings from CID fonts. 
+ cff-converter: + enabled: true # Wrap CFF/Type1C fonts as OpenType-CFF for browser compatibility + method: python # Converter method: 'python' (fontTools, recommended - wraps as OTF), 'fontforge' (legacy - converts to TTF, may hang on CID fonts) + python-command: /opt/venv/bin/python3 # Python interpreter path + python-script: /scripts/convert_cff_to_ttf.py # Path to font wrapping script + fontforge-command: fontforge # Override if FontForge is installed under a different name/path + type3: + library: + enabled: true # Match common Type3 fonts against the built-in library of converted programs + index: classpath:/type3/library/index.json # Override to point at a custom index.json (supports http:, file:, classpath:) diff --git a/frontend/src/core/tools/pdfTextEditor/PdfTextEditor.tsx b/frontend/src/core/tools/pdfTextEditor/PdfTextEditor.tsx index 533dc644b..88c50870c 100644 --- a/frontend/src/core/tools/pdfTextEditor/PdfTextEditor.tsx +++ b/frontend/src/core/tools/pdfTextEditor/PdfTextEditor.tsx @@ -238,6 +238,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => { const originalImagesRef = useRef([]); const originalGroupsRef = useRef([]); const imagesByPageRef = useRef([]); + const lastLoadedFileRef = useRef(null); const autoLoadKeyRef = useRef(null); const sourceFileIdRef = useRef(null); const loadRequestIdRef = useRef(0); @@ -251,6 +252,8 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => { const pagePreviewsRef = useRef>(pagePreviews); const previewScaleRef = useRef>(new Map()); const cachedJobIdRef = useRef(null); + const cacheRecoveryInProgressRef = useRef(false); + const recoverCacheAndReloadRef = useRef<() => Promise>(async () => false); // Keep ref in sync with state for access in async callbacks useEffect(() => { @@ -279,6 +282,13 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => { }; }, []); + const isCacheUnavailableError = useCallback((error: any): boolean => { + const status = 
error?.response?.status; + const data = error?.response?.data; + const code = (data && (data.error || data.code)) ?? undefined; + return status === 410 && code === 'cache_unavailable'; + }, []); + const dirtyPages = useMemo( () => getDirtyPages(groupsByPage, imagesByPage, originalGroupsRef.current, originalImagesRef.current), [groupsByPage, imagesByPage], @@ -316,6 +326,8 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => { loadedImagePagesRef.current = new Set(); loadingImagePagesRef.current = new Set(); setSelectedPage(0); + setIsLazyMode(false); + setCachedJobId(null); return; } const cloned = deepCloneDocument(document); @@ -404,7 +416,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => { // Load images for a page in lazy mode const loadImagesForPage = useCallback( - async (pageIndex: number) => { + async (pageIndex: number, fromRecovery = false) => { if (!isLazyMode) { return; } @@ -489,6 +501,12 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => { ); } catch (error) { console.error(`[loadImagesForPage] Failed to load images for page ${pageNumber}:`, error); + if (!fromRecovery && isCacheUnavailableError(error)) { + const recovered = await recoverCacheAndReloadRef.current(); + if (recovered) { + return loadImagesForPage(pageIndex, true); + } + } } finally { loadingImagePagesRef.current.delete(pageIndex); setLoadingImagePages((prev) => { @@ -498,7 +516,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => { }); } }, - [isLazyMode, cachedJobId], + [isLazyMode, cachedJobId, isCacheUnavailableError], ); const handleLoadFile = useCallback( @@ -507,6 +525,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => { return; } + lastLoadedFileRef.current = file; const requestId = loadRequestIdRef.current + 1; loadRequestIdRef.current = requestId; @@ -555,59 +574,35 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => { message: 'Starting conversion...', }); - let 
jobComplete = false; - let attempts = 0; - const maxAttempts = 600; + let jobComplete = false; + let attempts = 0; + const maxAttempts = 600; + let pollDelay = 500; - while (!jobComplete && attempts < maxAttempts) { - await new Promise((resolve) => setTimeout(resolve, 1000)); - attempts += 1; + while (!jobComplete && attempts < maxAttempts) { + await new Promise((resolve) => setTimeout(resolve, pollDelay)); + attempts += 1; + if (pollDelay < 10000) { + pollDelay = Math.min(10000, Math.floor(pollDelay * 1.5)); + } try { const statusResponse = await apiClient.get(`/api/v1/general/job/${jobId}`); const jobStatus = statusResponse.data; console.log(`Job status (attempt ${attempts}):`, jobStatus); - if (jobStatus.notes && jobStatus.notes.length > 0) { - const lastNote = jobStatus.notes[jobStatus.notes.length - 1]; - console.log('Latest note:', lastNote); - const matchWithCount = lastNote.match( - /\[(\d+)%\]\s+(\w+):\s+(.+?)\s+\((\d+)\/(\d+)\)/, - ); - if (matchWithCount) { - const percent = parseInt(matchWithCount[1], 10); - const stage = matchWithCount[2]; - const message = matchWithCount[3]; - const current = parseInt(matchWithCount[4], 10); - const total = parseInt(matchWithCount[5], 10); - setConversionProgress({ - percent, - stage, - message, - current, - total, - }); - } else { - const match = lastNote.match(/\[(\d+)%\]\s+(\w+):\s+(.+)/); - if (match) { - const percent = parseInt(match[1], 10); - const stage = match[2]; - const message = match[3]; - setConversionProgress({ - percent, - stage, - message, - }); - } - } - } else if (jobStatus.progress !== undefined) { - const percent = Math.min(Math.max(jobStatus.progress, 0), 100); - setConversionProgress({ - percent, - stage: jobStatus.stage || 'processing', - message: jobStatus.note || 'Converting PDF to JSON...', - }); - } + const percent = Math.min(Math.max(jobStatus.progress ?? 
0, 0), 100); + const stage = jobStatus.stage || 'processing'; + const message = jobStatus.note || 'Converting PDF to JSON...'; + const current = jobStatus.current ?? undefined; + const total = jobStatus.total ?? undefined; + setConversionProgress({ + percent, + stage, + message, + current, + total, + }); if (jobStatus.complete) { if (jobStatus.error) { @@ -719,6 +714,8 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => { setLoadedDocument(null); resetToDocument(null, groupingMode); clearPdfPreview(); + setIsLazyMode(false); + setCachedJobId(null); if (isPdf) { const errorMsg = @@ -743,6 +740,55 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => { [groupingMode, resetToDocument, t], ); + const recoverCacheAndReload = useCallback(async () => { + if (cacheRecoveryInProgressRef.current) { + return false; + } + if ((recoverCacheAndReloadRef as any).attempts === undefined) { + (recoverCacheAndReloadRef as any).attempts = 0; + } + if ((recoverCacheAndReloadRef as any).attempts >= 2) { + setErrorMessage( + t( + 'pdfTextEditor.errors.cacheRecoveryLimit', + 'Cache was unavailable after multiple attempts. Please reload the file manually.', + ), + ); + return false; + } + (recoverCacheAndReloadRef as any).attempts += 1; + const file = lastLoadedFileRef.current; + if (!file) { + setErrorMessage( + t( + 'pdfTextEditor.errors.cacheMissingFile', + 'Session expired. Please reload the PDF file to continue.', + ), + ); + return false; + } + cacheRecoveryInProgressRef.current = true; + try { + await handleLoadFile(file); + return true; + } catch (error) { + console.error('[PdfTextEditor] Cache recovery failed', error); + setErrorMessage( + t( + 'pdfTextEditor.errors.cacheReloadFailed', + 'Cache expired and reload failed. 
Please reselect the file.', + ), + ); + return false; + } finally { + cacheRecoveryInProgressRef.current = false; + } + }, [handleLoadFile, t]); + + useEffect(() => { + recoverCacheAndReloadRef.current = recoverCacheAndReload; + }, [recoverCacheAndReload]); + // Wrapper for loading files from the dropzone - adds to workbench first const handleLoadFileFromDropzone = useCallback( async (file: File) => { @@ -1054,10 +1100,11 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => { if (canUseIncremental) { await ensureImagesForPages(dirtyPageIndices); - try { + let incrementalRetried = false; + const attemptIncrementalExport = async () => { const payload = buildPayload(); if (!payload) { - return; + return false; } const { document, filename } = payload; @@ -1076,7 +1123,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => { const baseName = sanitizeBaseName(filename).replace(/-edited$/u, ''); const expectedName = `${baseName || 'document'}.pdf`; const response = await apiClient.post( - `/api/v1/convert/pdf/text-editor/partial/${cachedJobId}?filename=${encodeURIComponent(expectedName)}`, + `/api/v1/convert/pdf/text-editor/partial/${cachedJobIdRef.current}?filename=${encodeURIComponent(expectedName)}`, partialDocument, { responseType: 'blob', @@ -1094,8 +1141,26 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => { onComplete([pdfFile]); } setErrorMessage(null); - return; + return true; + }; + + try { + const success = await attemptIncrementalExport(); + if (success) { + return; + } } catch (incrementalError) { + if (!incrementalRetried && isCacheUnavailableError(incrementalError)) { + const recovered = await recoverCacheAndReloadRef.current(); + incrementalRetried = true; + if (recovered) { + await ensureImagesForPages(dirtyPageIndices); + const success = await attemptIncrementalExport(); + if (success) { + return; + } + } + } console.warn( '[handleGeneratePdf] Incremental export failed, falling back to full export', 
incrementalError,