diff --git a/.gradle-home/wrapper/dists/gradle-8.14-all/c2qonpi39x1mddn7hk5gh9iqj/gradle-8.14-all.zip.lck b/.gradle-home/wrapper/dists/gradle-8.14-all/c2qonpi39x1mddn7hk5gh9iqj/gradle-8.14-all.zip.lck new file mode 100644 index 000000000..e69de29bb diff --git a/.gradle-home/wrapper/dists/gradle-8.14-all/c2qonpi39x1mddn7hk5gh9iqj/gradle-8.14-all.zip.part b/.gradle-home/wrapper/dists/gradle-8.14-all/c2qonpi39x1mddn7hk5gh9iqj/gradle-8.14-all.zip.part new file mode 100644 index 000000000..e69de29bb diff --git a/app/common/src/main/java/stirling/software/common/service/JobExecutorService.java b/app/common/src/main/java/stirling/software/common/service/JobExecutorService.java index 73afa22a0..740067d3d 100644 --- a/app/common/src/main/java/stirling/software/common/service/JobExecutorService.java +++ b/app/common/src/main/java/stirling/software/common/service/JobExecutorService.java @@ -148,17 +148,31 @@ public class JobExecutorService { taskManager.createTask(jobId); // Create a specialized wrapper that updates the TaskManager + final String capturedJobIdForQueue = jobId; Supplier wrappedWork = () -> { try { + // Set jobId in ThreadLocal context for the queued job + stirling.software.common.util.JobContext.setJobId( + capturedJobIdForQueue); + log.debug( + "Set jobId {} in JobContext for queued job execution", + capturedJobIdForQueue); + Object result = work.get(); - processJobResult(jobId, result); + processJobResult(capturedJobIdForQueue, result); return result; } catch (Exception e) { log.error( - "Error executing queued job {}: {}", jobId, e.getMessage(), e); - taskManager.setError(jobId, e.getMessage()); + "Error executing queued job {}: {}", + capturedJobIdForQueue, + e.getMessage(), + e); + taskManager.setError(capturedJobIdForQueue, e.getMessage()); throw e; + } finally { + // Clean up ThreadLocal to avoid memory leaks + stirling.software.common.util.JobContext.clear(); } }; @@ -170,21 +184,36 @@ public class JobExecutorService { return ResponseEntity.ok().body(new JobResponse<>(true, jobId, null)); } else if (async) { taskManager.createTask(jobId); + + // Capture the jobId for the async thread + final String capturedJobId = jobId; + executor.execute( () -> { try { log.debug( - "Running async job {} with timeout {} ms", jobId, timeoutToUse); + "Running async job {} with timeout {} ms", + capturedJobId, + timeoutToUse); + + // Set jobId in ThreadLocal context for the async thread + stirling.software.common.util.JobContext.setJobId(capturedJobId); + log.debug( + "Set jobId {} in JobContext for async execution", + capturedJobId); // Execute with timeout Object result = executeWithTimeout(() -> work.get(), timeoutToUse); - processJobResult(jobId, result); + processJobResult(capturedJobId, result); } catch (TimeoutException te) { log.error("Job {} timed out after {} ms", jobId, timeoutToUse); taskManager.setError(jobId, "Job timed out"); } catch (Exception e) { log.error("Error executing job {}: {}", jobId, e.getMessage(), e); taskManager.setError(jobId, e.getMessage()); + } finally { + // Clean up ThreadLocal to avoid memory leaks + stirling.software.common.util.JobContext.clear(); } }); @@ -193,6 +222,10 @@ public class JobExecutorService { try { log.debug("Running sync job with timeout {} ms", timeoutToUse); + // Make jobId available to downstream components on the worker thread + stirling.software.common.util.JobContext.setJobId(jobId); + log.debug("Set jobId {} in JobContext for sync execution", jobId); + // Execute with timeout Object result = executeWithTimeout(() -> work.get(), timeoutToUse); @@ -212,6 +245,8 @@ public class JobExecutorService { // Construct a JSON error response return ResponseEntity.internalServerError() .body(Map.of("error", "Job failed: " + e.getMessage())); + } finally { + stirling.software.common.util.JobContext.clear(); } } } @@ -456,8 +491,23 @@ public class JobExecutorService { throws TimeoutException, Exception { // Use the same executor as other async jobs for consistency // This ensures all operations run on the same thread pool + String currentJobId = stirling.software.common.util.JobContext.getJobId(); + java.util.concurrent.CompletableFuture future = - java.util.concurrent.CompletableFuture.supplyAsync(supplier, executor); + java.util.concurrent.CompletableFuture.supplyAsync( + () -> { + if (currentJobId != null) { + stirling.software.common.util.JobContext.setJobId(currentJobId); + } + try { + return supplier.get(); + } finally { + if (currentJobId != null) { + stirling.software.common.util.JobContext.clear(); + } + } + }, + executor); try { return future.get(timeoutMs, TimeUnit.MILLISECONDS); diff --git a/app/common/src/main/java/stirling/software/common/util/JobContext.java b/app/common/src/main/java/stirling/software/common/util/JobContext.java new file mode 100644 index 000000000..a41394914 --- /dev/null +++ b/app/common/src/main/java/stirling/software/common/util/JobContext.java @@ -0,0 +1,18 @@ +package stirling.software.common.util; + +/** Thread-local context for passing job ID across async boundaries */ +public class JobContext { + private static final ThreadLocal CURRENT_JOB_ID = new ThreadLocal<>(); + + public static void setJobId(String jobId) { + CURRENT_JOB_ID.set(jobId); + } + + public static String getJobId() { + return CURRENT_JOB_ID.get(); + } + + public static void clear() { + CURRENT_JOB_ID.remove(); + } +} diff --git a/app/common/src/main/java/stirling/software/common/util/ProcessExecutor.java b/app/common/src/main/java/stirling/software/common/util/ProcessExecutor.java index ee7297153..82342bcde 100644 --- a/app/common/src/main/java/stirling/software/common/util/ProcessExecutor.java +++ b/app/common/src/main/java/stirling/software/common/util/ProcessExecutor.java @@ -94,6 +94,7 @@ public class ProcessExecutor { .getProcessExecutor() .getSessionLimit() .getOcrMyPdfSessionLimit(); + case CFF_CONVERTER -> 1; }; long timeoutMinutes = @@ -148,6 +149,7 @@ public class ProcessExecutor { .getProcessExecutor() .getTimeoutMinutes() .getOcrMyPdfTimeoutMinutes(); + case CFF_CONVERTER -> 5L; }; return new ProcessExecutor(semaphoreLimit, liveUpdates, timeoutMinutes); }); @@ -300,7 +302,8 @@ public class ProcessExecutor { TESSERACT, QPDF, GHOSTSCRIPT, - OCR_MY_PDF + OCR_MY_PDF, + CFF_CONVERTER } public class ProcessExecutorResult { diff --git a/app/common/src/test/java/stirling/software/common/service/JobExecutorServiceTest.java b/app/common/src/test/java/stirling/software/common/service/JobExecutorServiceTest.java index 630ac80bf..b1f96f3e9 100644 --- a/app/common/src/test/java/stirling/software/common/service/JobExecutorServiceTest.java +++ b/app/common/src/test/java/stirling/software/common/service/JobExecutorServiceTest.java @@ -78,6 +78,23 @@ class JobExecutorServiceTest { verify(request).setAttribute(eq("jobId"), anyString()); } + @Test + void shouldExposeJobIdInJobContextDuringSyncExecution() throws Exception { + // Given + Supplier work = stirling.software.common.util.JobContext::getJobId; + + // When + ResponseEntity response = jobExecutorService.runJobGeneric(false, work); + + // Then + assertEquals(HttpStatus.OK, response.getStatusCode()); + assertNotNull(response.getBody()); + + var requestJobIdCaptor = ArgumentCaptor.forClass(String.class); + verify(request).setAttribute(eq("jobId"), requestJobIdCaptor.capture()); + assertEquals(requestJobIdCaptor.getValue(), response.getBody()); + } + @Test void shouldRunAsyncJobSuccessfully() throws Exception { // Given diff --git a/app/core/src/main/resources/application.properties b/app/core/src/main/resources/application.properties index 1208da90e..cb96934c3 100644 --- a/app/core/src/main/resources/application.properties +++ b/app/core/src/main/resources/application.properties @@ -8,6 +8,8 @@ logging.level.org.eclipse.jetty=WARN #logging.level.stirling.software.proprietary.security=DEBUG logging.level.com.zaxxer.hikari=WARN logging.level.stirling.software.SPDF.service.PdfJsonConversionService=TRACE +logging.level.stirling.software.common.service.JobExecutorService=DEBUG +logging.level.stirling.software.common.service.TaskManager=DEBUG spring.jpa.open-in-view=false server.forward-headers-strategy=NATIVE server.error.path=/error diff --git a/app/proprietary/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPdfJsonController.java b/app/proprietary/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPdfJsonController.java index 582679dfd..0cd22d3ff 100644 --- a/app/proprietary/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPdfJsonController.java +++ b/app/proprietary/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPdfJsonController.java @@ -1,16 +1,26 @@ package stirling.software.SPDF.controller.api.converters; +import java.util.Optional; + import org.springframework.http.MediaType; import org.springframework.http.ResponseEntity; +import org.springframework.web.bind.annotation.GetMapping; import org.springframework.web.bind.annotation.ModelAttribute; +import org.springframework.web.bind.annotation.PathVariable; +import org.springframework.web.bind.annotation.PostMapping; +import org.springframework.web.bind.annotation.RequestBody; +import org.springframework.web.bind.annotation.RequestParam; import org.springframework.web.multipart.MultipartFile; import io.github.pixee.security.Filenames; import io.swagger.v3.oas.annotations.Operation; import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; import stirling.software.SPDF.config.swagger.StandardPdfResponse; +import stirling.software.SPDF.model.json.PdfJsonDocument; +import stirling.software.SPDF.model.json.PdfJsonMetadata; import stirling.software.SPDF.service.PdfJsonConversionService; import stirling.software.common.annotations.AutoJobPostMapping; import stirling.software.common.annotations.api.ConvertApi; @@ -19,6 +29,7 @@ import stirling.software.common.model.api.PDFFile; import stirling.software.common.util.ExceptionUtils; import stirling.software.common.util.WebResponseUtils; +@Slf4j @ConvertApi @RequiredArgsConstructor public class ConvertPdfJsonController { @@ -71,4 +82,81 @@ public class ConvertPdfJsonController { String docName = baseName.endsWith(".pdf") ? baseName : baseName + ".pdf"; return WebResponseUtils.bytesToWebResponse(pdfBytes, docName); } + + @PostMapping(consumes = "multipart/form-data", value = "/pdf/json/metadata") + @Operation( + summary = "Extract PDF metadata for lazy loading", + description = + "Extracts document metadata, fonts, and page dimensions. Caches the document for" + + " subsequent page requests. Input:PDF Output:JSON Type:SISO") + public ResponseEntity extractPdfMetadata( + @ModelAttribute PDFFile request, @RequestParam(required = true) String jobId) + throws Exception { + MultipartFile inputFile = request.getFileInput(); + if (inputFile == null) { + throw ExceptionUtils.createNullArgumentException("fileInput"); + } + + byte[] jsonBytes = pdfJsonConversionService.extractDocumentMetadata(inputFile, jobId); + String originalName = inputFile.getOriginalFilename(); + String baseName = + (originalName != null && !originalName.isBlank()) + ? Filenames.toSimpleFileName(originalName).replaceFirst("[.][^.]+$", "") + : "document"; + String docName = baseName + "_metadata.json"; + return WebResponseUtils.bytesToWebResponse(jsonBytes, docName, MediaType.APPLICATION_JSON); + } + + @PostMapping(value = "/pdf/json/partial/{jobId}", consumes = MediaType.APPLICATION_JSON_VALUE) + @StandardPdfResponse + @Operation( + summary = "Apply incremental edits to a cached PDF", + description = + "Applies edits for the specified pages of a cached PDF and returns an updated PDF." + + " Requires the PDF to have been previously cached via the PDF to JSON endpoint.") + public ResponseEntity exportPartialPdf( + @PathVariable String jobId, + @RequestBody PdfJsonDocument document, + @RequestParam(value = "filename", required = false) String filename) + throws Exception { + if (document == null) { + throw ExceptionUtils.createNullArgumentException("document"); + } + + byte[] pdfBytes = pdfJsonConversionService.exportUpdatedPages(jobId, document); + + String baseName = + (filename != null && !filename.isBlank()) + ? Filenames.toSimpleFileName(filename).replaceFirst("[.][^.]+$", "") + : Optional.ofNullable(document.getMetadata()) + .map(PdfJsonMetadata::getTitle) + .filter(title -> title != null && !title.isBlank()) + .orElse("document"); + String docName = baseName.endsWith(".pdf") ? baseName : baseName + ".pdf"; + return WebResponseUtils.bytesToWebResponse(pdfBytes, docName); + } + + @GetMapping(value = "/pdf/json/page/{jobId}/{pageNumber}") + @Operation( + summary = "Extract single page from cached PDF", + description = + "Retrieves a single page's content from a previously cached PDF document." + + " Requires prior call to /pdf/json/metadata. Output:JSON") + public ResponseEntity extractSinglePage( + @PathVariable String jobId, @PathVariable int pageNumber) throws Exception { + byte[] jsonBytes = pdfJsonConversionService.extractSinglePage(jobId, pageNumber); + String docName = "page_" + pageNumber + ".json"; + return WebResponseUtils.bytesToWebResponse(jsonBytes, docName, MediaType.APPLICATION_JSON); + } + + @PostMapping(value = "/pdf/json/clear-cache/{jobId}") + @Operation( + summary = "Clear cached PDF document", + description = + "Manually clears a cached PDF document to free up server resources." + + " Called automatically after 30 minutes.") + public ResponseEntity clearCache(@PathVariable String jobId) { + pdfJsonConversionService.clearCachedDocument(jobId); + return ResponseEntity.ok().build(); + } } diff --git a/app/proprietary/src/main/java/stirling/software/SPDF/model/api/PdfJsonConversionProgress.java b/app/proprietary/src/main/java/stirling/software/SPDF/model/api/PdfJsonConversionProgress.java new file mode 100644 index 000000000..75e41541a --- /dev/null +++ b/app/proprietary/src/main/java/stirling/software/SPDF/model/api/PdfJsonConversionProgress.java @@ -0,0 +1,49 @@ +package stirling.software.SPDF.model.api; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + +@Data +@Builder +@NoArgsConstructor +@AllArgsConstructor +public class PdfJsonConversionProgress { + private int percent; + private String stage; + private String message; + private boolean complete; + private Integer current; // Current item being processed (e.g., page number) + private Integer total; // Total items to process (e.g., total pages) + + public static PdfJsonConversionProgress of(int percent, String stage, String message) { + return PdfJsonConversionProgress.builder() + .percent(percent) + .stage(stage) + .message(message) + .complete(false) + .build(); + } + + public static PdfJsonConversionProgress of( + int percent, String stage, String message, int current, int total) { + return PdfJsonConversionProgress.builder() + .percent(percent) + .stage(stage) + .message(message) + .current(current) + .total(total) + .complete(false) + .build(); + } + + public static PdfJsonConversionProgress complete() { + return PdfJsonConversionProgress.builder() + .percent(100) + .stage("complete") + .message("Conversion complete") + .complete(true) + .build(); + } +} diff --git a/app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonDocument.java b/app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonDocument.java index d590b34b9..b1559a874 100644 --- a/app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonDocument.java +++ b/app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonDocument.java @@ -22,6 +22,9 @@ public class PdfJsonDocument { /** Optional XMP metadata packet stored as Base64. */ private String xmpMetadata; + /** Indicates that images should be loaded lazily via API rather than embedded in the JSON. */ + private Boolean lazyImages; + @Builder.Default private List fonts = new ArrayList<>(); @Builder.Default private List pages = new ArrayList<>(); diff --git a/app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonDocumentMetadata.java b/app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonDocumentMetadata.java new file mode 100644 index 000000000..15819973e --- /dev/null +++ b/app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonDocumentMetadata.java @@ -0,0 +1,34 @@ +package stirling.software.SPDF.model.json; + +import java.util.ArrayList; +import java.util.List; + +import com.fasterxml.jackson.annotation.JsonInclude; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + +@Data +@Builder +@NoArgsConstructor +@AllArgsConstructor +@JsonInclude(JsonInclude.Include.NON_NULL) +public class PdfJsonDocumentMetadata { + + private PdfJsonMetadata metadata; + + /** Optional XMP metadata packet stored as Base64. */ + private String xmpMetadata; + + /** Indicates that images should be requested lazily via the page endpoint. */ + private Boolean lazyImages; + + @Builder.Default private List fonts = new ArrayList<>(); + + @Builder.Default private List pageDimensions = new ArrayList<>(); + + /** Form fields (AcroForm) at document level */ + @Builder.Default private List formFields = new ArrayList<>(); +} diff --git a/app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonPageDimension.java b/app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonPageDimension.java new file mode 100644 index 000000000..283f59747 --- /dev/null +++ b/app/proprietary/src/main/java/stirling/software/SPDF/model/json/PdfJsonPageDimension.java @@ -0,0 +1,20 @@ +package stirling.software.SPDF.model.json; + +import com.fasterxml.jackson.annotation.JsonInclude; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + +@Data +@Builder +@NoArgsConstructor +@AllArgsConstructor +@JsonInclude(JsonInclude.Include.NON_NULL) +public class PdfJsonPageDimension { + private Integer pageNumber; + private Float width; + private Float height; + private Integer rotation; +} diff --git a/app/proprietary/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java b/app/proprietary/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java index 050879402..b86c118dc 100644 --- a/app/proprietary/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java +++ b/app/proprietary/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java @@ -1,14 +1,14 @@ package stirling.software.SPDF.service; +import static stirling.software.SPDF.service.PdfJsonFallbackFontService.FALLBACK_FONT_ID; + import java.awt.geom.AffineTransform; import java.awt.geom.Point2D; import java.awt.image.BufferedImage; -import java.io.BufferedReader; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; -import java.io.InputStreamReader; import java.io.OutputStream; import java.nio.charset.StandardCharsets; import java.nio.file.Files; @@ -16,7 +16,6 @@ import java.nio.file.Path; import java.time.Instant; import java.time.format.DateTimeParseException; import java.util.ArrayList; -import java.util.Arrays; import java.util.Base64; import java.util.Calendar; import java.util.Collections; @@ -37,6 +36,7 @@ import java.util.TimeZone; import java.util.UUID; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.TimeUnit; +import java.util.function.Consumer; import javax.imageio.ImageIO; @@ -45,13 +45,8 @@ import org.apache.pdfbox.contentstream.operator.Operator; import org.apache.pdfbox.contentstream.operator.OperatorName; import org.apache.pdfbox.cos.COSArray; import org.apache.pdfbox.cos.COSBase; -import org.apache.pdfbox.cos.COSBoolean; import org.apache.pdfbox.cos.COSDictionary; -import org.apache.pdfbox.cos.COSFloat; -import org.apache.pdfbox.cos.COSInteger; import org.apache.pdfbox.cos.COSName; -import org.apache.pdfbox.cos.COSNull; -import org.apache.pdfbox.cos.COSObject; import org.apache.pdfbox.cos.COSStream; import org.apache.pdfbox.cos.COSString; import org.apache.pdfbox.pdfparser.PDFStreamParser; @@ -71,8 +66,10 @@ import org.apache.pdfbox.pdmodel.font.PDFontFactory; import org.apache.pdfbox.pdmodel.font.PDType0Font; import org.apache.pdfbox.pdmodel.font.PDType1Font; import org.apache.pdfbox.pdmodel.font.Standard14Fonts; +import org.apache.pdfbox.pdmodel.graphics.PDXObject; import org.apache.pdfbox.pdmodel.graphics.color.PDColor; import org.apache.pdfbox.pdmodel.graphics.color.PDColorSpace; +import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject; import org.apache.pdfbox.pdmodel.graphics.image.PDImage; import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; import org.apache.pdfbox.pdmodel.graphics.state.PDGraphicsState; @@ -87,8 +84,6 @@ import org.apache.pdfbox.text.TextPosition; import org.apache.pdfbox.util.DateConverter; import org.apache.pdfbox.util.Matrix; import org.springframework.beans.factory.annotation.Value; -import org.springframework.core.io.Resource; -import org.springframework.core.io.ResourceLoader; import org.springframework.stereotype.Service; import org.springframework.web.multipart.MultipartFile; @@ -100,19 +95,24 @@ import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import stirling.software.SPDF.config.EndpointConfiguration; +import stirling.software.SPDF.model.api.PdfJsonConversionProgress; import stirling.software.SPDF.model.json.PdfJsonAnnotation; import stirling.software.SPDF.model.json.PdfJsonCosValue; import stirling.software.SPDF.model.json.PdfJsonDocument; +import stirling.software.SPDF.model.json.PdfJsonDocumentMetadata; import stirling.software.SPDF.model.json.PdfJsonFont; import stirling.software.SPDF.model.json.PdfJsonFontCidSystemInfo; import stirling.software.SPDF.model.json.PdfJsonFormField; import stirling.software.SPDF.model.json.PdfJsonImageElement; import stirling.software.SPDF.model.json.PdfJsonMetadata; import stirling.software.SPDF.model.json.PdfJsonPage; +import stirling.software.SPDF.model.json.PdfJsonPageDimension; import stirling.software.SPDF.model.json.PdfJsonStream; import stirling.software.SPDF.model.json.PdfJsonTextColor; import stirling.software.SPDF.model.json.PdfJsonTextElement; +import stirling.software.SPDF.service.pdfjson.PdfJsonFontService; import stirling.software.common.service.CustomPDFDocumentFactory; +import stirling.software.common.service.TaskManager; import stirling.software.common.util.ExceptionUtils; import stirling.software.common.util.ProcessExecutor; import stirling.software.common.util.ProcessExecutor.ProcessExecutorResult; @@ -126,76 +126,26 @@ public class PdfJsonConversionService { private final CustomPDFDocumentFactory pdfDocumentFactory; private final ObjectMapper objectMapper; - private final ResourceLoader resourceLoader; private final EndpointConfiguration endpointConfiguration; private final TempFileManager tempFileManager; - - private static final String FALLBACK_FONT_ID = "fallback-noto-sans"; - private static final String DEFAULT_FALLBACK_FONT_LOCATION = - "classpath:/static/fonts/NotoSans-Regular.ttf"; - private static final String FALLBACK_FONT_CJK_ID = "fallback-noto-cjk"; - private static final String FALLBACK_FONT_JP_ID = "fallback-noto-jp"; - private static final String FALLBACK_FONT_KR_ID = "fallback-noto-korean"; - private static final String FALLBACK_FONT_AR_ID = "fallback-noto-arabic"; - private static final String FALLBACK_FONT_TH_ID = "fallback-noto-thai"; - - private static final Map BUILT_IN_FALLBACK_FONTS = - Map.ofEntries( - Map.entry( - FALLBACK_FONT_CJK_ID, - new FallbackFontSpec( - "classpath:/static/fonts/NotoSansSC-Regular.ttf", - "NotoSansSC-Regular", - "ttf")), - Map.entry( - FALLBACK_FONT_JP_ID, - new FallbackFontSpec( - "classpath:/static/fonts/NotoSansJP-Regular.ttf", - "NotoSansJP-Regular", - "ttf")), - Map.entry( - FALLBACK_FONT_KR_ID, - new FallbackFontSpec( - "classpath:/static/fonts/malgun.ttf", "MalgunGothic", "ttf")), - Map.entry( - FALLBACK_FONT_AR_ID, - new FallbackFontSpec( - "classpath:/static/fonts/NotoSansArabic-Regular.ttf", - "NotoSansArabic-Regular", - "ttf")), - Map.entry( - FALLBACK_FONT_TH_ID, - new FallbackFontSpec( - "classpath:/static/fonts/NotoSansThai-Regular.ttf", - "NotoSansThai-Regular", - "ttf"))); - - @Value("${stirling.pdf.fallback-font:" + DEFAULT_FALLBACK_FONT_LOCATION + "}") - private String fallbackFontLocation; + private final TaskManager taskManager; + private final PdfJsonCosMapper cosMapper; + private final PdfJsonFallbackFontService fallbackFontService; + private final PdfJsonFontService fontService; @Value("${stirling.pdf.json.font-normalization.enabled:true}") private boolean fontNormalizationEnabled; - @Value("${stirling.pdf.json.cff-converter.enabled:true}") - private boolean cffConversionEnabled; - - @Value("${stirling.pdf.json.cff-converter.method:python}") - private String cffConverterMethod; - - @Value("${stirling.pdf.json.cff-converter.python-command:/opt/venv/bin/python3}") - private String pythonCommand; - - @Value("${stirling.pdf.json.cff-converter.python-script:/scripts/convert_cff_to_ttf.py}") - private String pythonScript; - - @Value("${stirling.pdf.json.cff-converter.fontforge-command:fontforge}") - private String fontforgeCommand; - - private final Map fallbackFontCache = new ConcurrentHashMap<>(); + /** Cache for storing PDDocuments for lazy page loading. Key is jobId. */ + private final Map documentCache = new ConcurrentHashMap<>(); private volatile boolean ghostscriptAvailable; @PostConstruct + private void initializeToolAvailability() { + initializeGhostscriptAvailability(); + } + private void initializeGhostscriptAvailability() { if (!fontNormalizationEnabled) { ghostscriptAvailable = false; @@ -235,17 +185,72 @@ public class PdfJsonConversionService { } public byte[] convertPdfToJson(MultipartFile file) throws IOException { + return convertPdfToJson(file, null); + } + + public byte[] convertPdfToJson( + MultipartFile file, Consumer progressCallback) + throws IOException { if (file == null) { throw ExceptionUtils.createNullArgumentException("fileInput"); } + // Get job ID from request context if running in async mode + String jobId = getJobIdFromRequest(); + log.info("Starting PDF to JSON conversion, jobId from context: {}", jobId); + + Consumer progress = + progressCallback != null + ? (p) -> { + log.info( + "Progress: [{}%] {} - {}{}", + p.getPercent(), + p.getStage(), + p.getMessage(), + (p.getCurrent() != null && p.getTotal() != null) + ? String.format( + " (%d/%d)", p.getCurrent(), p.getTotal()) + : ""); + progressCallback.accept(p); + } + : jobId != null + ? (p) -> { + log.info( + "Progress: [{}%] {} - {}{}", + p.getPercent(), + p.getStage(), + p.getMessage(), + (p.getCurrent() != null && p.getTotal() != null) + ? String.format( + " (%d/%d)", + p.getCurrent(), p.getTotal()) + : ""); + reportProgressToTaskManager(jobId, p); + } + : (p) -> { + log.info( + "Progress (no job): [{}%] {} - {}{}", + p.getPercent(), + p.getStage(), + p.getMessage(), + (p.getCurrent() != null && p.getTotal() != null) + ? String.format( + " (%d/%d)", + p.getCurrent(), p.getTotal()) + : ""); + }; + TempFile normalizedFile = null; try (TempFile originalFile = new TempFile(tempFileManager, ".pdf")) { + progress.accept(PdfJsonConversionProgress.of(5, "loading", "Loading PDF document")); file.transferTo(originalFile.getFile()); Path workingPath = originalFile.getPath(); if (fontNormalizationEnabled && canRunGhostscript()) { try { + progress.accept( + PdfJsonConversionProgress.of( + 10, "normalizing", "Normalizing fonts with Ghostscript")); normalizedFile = normalizePdfFonts(workingPath); if (normalizedFile != null && normalizedFile.exists()) { workingPath = normalizedFile.getPath(); @@ -260,13 +265,28 @@ public class PdfJsonConversionService { } } - try (PDDocument document = pdfDocumentFactory.load(workingPath, true)) { - int totalPages = document.getNumberOfPages(); - log.info("Converting PDF to JSON ({} pages)", totalPages); + progress.accept(PdfJsonConversionProgress.of(20, "parsing", "Parsing PDF structure")); + + // First, check page count to decide on lazy loading + byte[] pdfBytes = Files.readAllBytes(workingPath); + int totalPages; + try (PDDocument tempDoc = pdfDocumentFactory.load(pdfBytes, true)) { + totalPages = tempDoc.getNumberOfPages(); + } + + boolean useLazyImages = totalPages > 5 && jobId != null; + + try (PDDocument document = pdfDocumentFactory.load(pdfBytes, true)) { + log.info( + "Converting PDF to JSON ({} pages) - {} mode", + totalPages, + useLazyImages ? "lazy image" : "standard"); Map fonts = new LinkedHashMap<>(); Map> textByPage = new LinkedHashMap<>(); - Map> pageFontResources = new HashMap<>(); + + progress.accept( + PdfJsonConversionProgress.of(30, "fonts", "Collecting font information")); int pageNumber = 1; for (PDPage page : document.getPages()) { Map resourceMap = @@ -276,21 +296,51 @@ public class PdfJsonConversionService { "PDF→JSON: collected {} font resources on page {}", resourceMap.size(), pageNumber); + + // Update progress for font collection (30-50%) + int fontProgress = 30 + (int) ((pageNumber / (double) totalPages) * 20); + progress.accept( + PdfJsonConversionProgress.of( + fontProgress, + "fonts", + "Collecting fonts", + pageNumber, + totalPages)); pageNumber++; } + progress.accept( + PdfJsonConversionProgress.of(50, "text", "Extracting text content")); TextCollectingStripper stripper = new TextCollectingStripper(document, fonts, textByPage, pageFontResources); stripper.setSortByPosition(true); stripper.getText(document); - Map> imagesByPage = collectImages(document); - Map> annotationsByPage = - collectAnnotations(document); + Map> imagesByPage; + if (useLazyImages) { + progress.accept( + PdfJsonConversionProgress.of( + 70, "images", "Skipping upfront image extraction")); + imagesByPage = new LinkedHashMap<>(); + } else { + progress.accept( + PdfJsonConversionProgress.of( + 70, "images", "Extracting embedded images")); + imagesByPage = collectImages(document, totalPages, progress); + } + progress.accept( + PdfJsonConversionProgress.of( + 80, "annotations", "Collecting annotations and form fields")); + Map> annotationsByPage = + collectAnnotations(document, totalPages, progress); + + progress.accept( + PdfJsonConversionProgress.of(90, "metadata", "Extracting metadata")); PdfJsonDocument pdfJson = new PdfJsonDocument(); pdfJson.setMetadata(extractMetadata(document)); pdfJson.setXmpMetadata(extractXmpMetadata(document)); + pdfJson.setLazyImages(useLazyImages); List serializedFonts = new ArrayList<>(fonts.values()); serializedFonts.sort( Comparator.comparing( @@ -301,12 +351,53 @@ public class PdfJsonConversionService { extractPages(document, textByPage, imagesByPage, annotationsByPage)); pdfJson.setFormFields(collectFormFields(document)); - log.info( - "PDF→JSON conversion complete (fonts: {}, pages: {})", - serializedFonts.size(), - pdfJson.getPages().size()); + if (useLazyImages && jobId != null) { + PdfJsonDocumentMetadata docMetadata = new PdfJsonDocumentMetadata(); + docMetadata.setMetadata(pdfJson.getMetadata()); + docMetadata.setXmpMetadata(pdfJson.getXmpMetadata()); + docMetadata.setFonts(serializedFonts); + docMetadata.setFormFields(pdfJson.getFormFields()); + docMetadata.setLazyImages(Boolean.TRUE); - return objectMapper.writerWithDefaultPrettyPrinter().writeValueAsBytes(pdfJson); + List pageDimensions = new ArrayList<>(); + int pageIndex = 0; + for (PDPage page : document.getPages()) { + PdfJsonPageDimension dim = new PdfJsonPageDimension(); + dim.setPageNumber(pageIndex + 1); + PDRectangle mediaBox = page.getMediaBox(); + dim.setWidth(mediaBox.getWidth()); + dim.setHeight(mediaBox.getHeight()); + dim.setRotation(page.getRotation()); + pageDimensions.add(dim); + pageIndex++; + } + docMetadata.setPageDimensions(pageDimensions); + + CachedPdfDocument cached = + new CachedPdfDocument(pdfBytes, docMetadata, fonts, pageFontResources); + documentCache.put(jobId, cached); + log.info( + "Cached PDF bytes ({} bytes, {} pages, {} fonts) for lazy images, jobId: {}", + pdfBytes.length, + totalPages, + fonts.size(), + jobId); + scheduleDocumentCleanup(jobId); + } + + progress.accept( + PdfJsonConversionProgress.of(95, "serializing", "Generating JSON output")); + + log.info( + "PDF→JSON conversion complete (fonts: {}, pages: {}, lazyImages: {})", + serializedFonts.size(), + pdfJson.getPages().size(), + useLazyImages); + + byte[] result = + objectMapper.writerWithDefaultPrettyPrinter().writeValueAsBytes(pdfJson); + progress.accept(PdfJsonConversionProgress.complete()); + return result; } } finally { closeQuietly(normalizedFile); @@ -402,29 +493,20 @@ public class PdfJsonConversionService { boolean rewriteSucceeded = true; if (hasText) { - if (!preservedStreams.isEmpty()) { - if (preflightResult.usesFallback()) { + if (preflightResult.usesFallback()) { + log.debug( + "Skipping token rewrite for page {} because fallback fonts are required", + pageNumberValue); + rewriteSucceeded = false; + } else if (!preservedStreams.isEmpty()) { + log.info("Attempting token rewrite for page {}", pageNumberValue); + rewriteSucceeded = rewriteTextOperators(document, page, elements, false); + if (!rewriteSucceeded) { log.info( - "Fallback fonts required for page {}; clearing original text tokens", + "Token rewrite failed for page {}, regenerating text stream", pageNumberValue); - rewriteSucceeded = - rewriteTextOperators(document, page, elements, true); - if (!rewriteSucceeded) { - log.info( - "Failed to clear original text tokens on page {}; forcing regeneration", - pageNumberValue); - } } else { - log.info("Attempting token rewrite for page {}", pageNumberValue); - rewriteSucceeded = - rewriteTextOperators(document, page, elements, false); - if (!rewriteSucceeded) { - log.info( - "Token rewrite failed for page {}, regenerating text stream", - pageNumberValue); - } else { - log.info("Token rewrite succeeded for page {}", pageNumberValue); - } + log.info("Token rewrite succeeded for page {}", pageNumberValue); } } else { rewriteSucceeded = false; @@ -494,26 +576,86 @@ public class PdfJsonConversionService { private Map collectFontsForPage( PDDocument document, PDPage page, int pageNumber, Map fonts) throws IOException { - PDResources resources = page.getResources(); + Map mapping = new HashMap<>(); + Set visited = Collections.newSetFromMap(new IdentityHashMap<>()); + collectFontsFromResources( + document, page.getResources(), pageNumber, fonts, mapping, visited, ""); + log.debug( + "Page {} font scan complete (unique fonts discovered: {})", + pageNumber, + mapping.size()); + return mapping; + } + + /** + * Recursively collect fonts from a resource dictionary, including Form XObjects. + * + * @param document The PDF document + * @param resources The resources to scan + * @param pageNumber The page number (for font UID generation) + * @param fonts The global font map to populate + * @param mapping The page-level PDFont -> fontId mapping + * @param visited Set of visited XObject names to prevent infinite recursion + */ + private void collectFontsFromResources( + PDDocument document, + PDResources resources, + int pageNumber, + Map fonts, + Map mapping, + Set visited, + String prefix) + throws IOException { if (resources == null) { - return Collections.emptyMap(); + log.debug( + "Page {} resource scan skipped{} (resources null)", + pageNumber, + prefix.isEmpty() ? "" : " under " + prefix); + return; + } + if (!visited.add(resources.getCOSObject())) { + return; } - Map mapping = new HashMap<>(); for (COSName resourceName : resources.getFontNames()) { PDFont font = resources.getFont(resourceName); if (font == null) { continue; } - String fontId = resourceName.getName(); + String fontId = + prefix.isEmpty() + ? resourceName.getName() + : prefix + "/" + resourceName.getName(); mapping.put(font, fontId); - String key = buildFontKey(pageNumber, fontId); if (!fonts.containsKey(key)) { fonts.put(key, buildFontModel(document, font, fontId, pageNumber)); } } - return mapping; + + for (COSName xobjectName : resources.getXObjectNames()) { + try { + PDXObject xobject = resources.getXObject(xobjectName); + if (xobject instanceof PDFormXObject form) { + collectFontsFromResources( + document, + form.getResources(), + pageNumber, + fonts, + mapping, + visited, + prefix.isEmpty() + ? xobjectName.getName() + : prefix + "/" + xobjectName.getName()); + } + } catch (Exception ex) { + log.debug( + "Failed to inspect XObject {} for fonts on page {}: {}", + xobjectName.getName(), + pageNumber, + ex.getMessage()); + } + } } private String buildFontKey(int pageNumber, String fontId) { @@ -538,16 +680,7 @@ public class PdfJsonConversionService { FontProgramData programData = embedded ? extractFontProgram(font, unicodeMapping) : null; String standard14Name = resolveStandard14Name(font); Integer flags = descriptor != null ? descriptor.getFlags() : null; - PdfJsonCosValue cosDictionary = serializeCosValue(font.getCOSObject()); - - log.debug( - "Building font model: id={}, baseName={}, subtype={}, embedded={}, hasProgram={}, hasWebProgram={}", - fontId, - font.getName(), - subtype, - embedded, - programData != null && programData.getBase64() != null, - programData != null && programData.getWebBase64() != null); + PdfJsonCosValue cosDictionary = cosMapper.serializeCosValue(font.getCOSObject()); return PdfJsonFont.builder() .id(fontId) @@ -609,14 +742,14 @@ public class PdfJsonConversionService { continue; } - if (!canEncodeFully(font, text)) { + if (!fallbackFontService.canEncodeFully(font, text)) { fallbackNeeded = true; element.setFallbackUsed(Boolean.TRUE); for (int offset = 0; offset < text.length(); ) { int codePoint = text.codePointAt(offset); offset += Character.charCount(codePoint); - if (!canEncode(font, codePoint)) { - String fallbackId = resolveFallbackFontId(codePoint); + if (!fallbackFontService.canEncode(font, codePoint)) { + String fallbackId = fallbackFontService.resolveFallbackFontId(codePoint); fallbackIds.add(fallbackId != null ? fallbackId : FALLBACK_FONT_ID); } } @@ -635,61 +768,6 @@ public class PdfJsonConversionService { return new PreflightResult(fallbackNeeded, fallbackIds); } - private PdfJsonFont buildFallbackFontModel() throws IOException { - return buildFallbackFontModel(FALLBACK_FONT_ID); - } - - private PdfJsonFont buildFallbackFontModel(String fallbackId) throws IOException { - FallbackFontSpec spec = getFallbackFontSpec(fallbackId); - if (spec == null) { - throw new IOException("Unknown fallback font id " + fallbackId); - } - byte[] bytes = loadFallbackFontBytes(fallbackId, spec); - String base64 = Base64.getEncoder().encodeToString(bytes); - return PdfJsonFont.builder() - .id(fallbackId) - .uid(fallbackId) - .baseName(spec.baseName()) - .subtype("TrueType") - .embedded(true) - .program(base64) - .programFormat(spec.format()) - .build(); - } - - private FallbackFontSpec getFallbackFontSpec(String fallbackId) { - if (FALLBACK_FONT_ID.equals(fallbackId)) { - String baseName = inferBaseName(fallbackFontLocation, "NotoSans-Regular"); - String format = inferFormat(fallbackFontLocation, "ttf"); - return new FallbackFontSpec(fallbackFontLocation, baseName, format); - } - return BUILT_IN_FALLBACK_FONTS.get(fallbackId); - } - - private String inferBaseName(String location, String defaultName) { - if (location == null || location.isBlank()) { - return defaultName; - } - int slash = location.lastIndexOf('/'); - String fileName = slash >= 0 ? location.substring(slash + 1) : location; - int dot = fileName.lastIndexOf('.'); - if (dot > 0) { - fileName = fileName.substring(0, dot); - } - return fileName.isEmpty() ? defaultName : fileName; - } - - private String inferFormat(String location, String defaultFormat) { - if (location == null || location.isBlank()) { - return defaultFormat; - } - int dot = location.lastIndexOf('.'); - if (dot >= 0 && dot < location.length() - 1) { - return location.substring(dot + 1).toLowerCase(Locale.ROOT); - } - return defaultFormat; - } - private void ensureFallbackResources( PDPage page, Set fallbackFontIds, Map fontMap) { if (fallbackFontIds == null || fallbackFontIds.isEmpty()) { @@ -722,21 +800,6 @@ public class PdfJsonConversionService { } } - private PDFont loadFallbackPdfFont(PDDocument document) throws IOException { - return loadFallbackPdfFont(document, FALLBACK_FONT_ID); - } - - private PDFont loadFallbackPdfFont(PDDocument document, String fallbackId) throws IOException { - FallbackFontSpec spec = getFallbackFontSpec(fallbackId); - if (spec == null) { - throw new IOException("Unknown fallback font id " + fallbackId); - } - byte[] bytes = loadFallbackFontBytes(fallbackId, spec); - try (InputStream stream = new ByteArrayInputStream(bytes)) { - return PDType0Font.load(document, stream, true); - } - } - private PDFont ensureFallbackFont( PDDocument document, Map fontMap, @@ -749,37 +812,15 @@ public class PdfJsonConversionService { if (font != null) { return font; } - PDFont loaded = loadFallbackPdfFont(document, effectiveId); + PDFont loaded = fallbackFontService.loadFallbackPdfFont(document, effectiveId); fontMap.put(key, loaded); if (fontModels != null && fontModels.stream().noneMatch(f -> effectiveId.equals(f.getId()))) { - fontModels.add(buildFallbackFontModel(effectiveId)); + fontModels.add(fallbackFontService.buildFallbackFontModel(effectiveId)); } return loaded; } - private byte[] loadFallbackFontBytes(String fallbackId, FallbackFontSpec spec) - throws IOException { - if (spec == null) { - throw new IOException("No fallback font specification for " + fallbackId); - } - byte[] cached = fallbackFontCache.get(fallbackId); - if (cached != null) { - return cached; - } - Resource resource = resourceLoader.getResource(spec.resourceLocation()); - if (!resource.exists()) { - throw new IOException("Fallback font resource not found at " + spec.resourceLocation()); - } - try (InputStream inputStream = resource.getInputStream(); - ByteArrayOutputStream baos = new ByteArrayOutputStream()) { - inputStream.transferTo(baos); - byte[] bytes = baos.toByteArray(); - fallbackFontCache.put(fallbackId, bytes); - return bytes; - } - } - private boolean canRunGhostscript() { if (!fontNormalizationEnabled) { return false; @@ -850,323 +891,20 @@ public class PdfJsonConversionService { } private byte[] convertCffProgramToTrueType(byte[] fontBytes, String toUnicode) { - if (!cffConversionEnabled || fontBytes == null || fontBytes.length == 0) { - return null; - } - - // Determine which converter to use - if ("python".equalsIgnoreCase(cffConverterMethod)) { - return convertCffUsingPython(fontBytes, toUnicode); - } else if ("fontforge".equalsIgnoreCase(cffConverterMethod)) { - return convertCffUsingFontForge(fontBytes); - } else { - log.warn( - "Unknown CFF converter method: {}, falling back to Python", cffConverterMethod); - return convertCffUsingPython(fontBytes, toUnicode); - } - } - - private byte[] convertCffUsingPython(byte[] fontBytes, String toUnicode) { - if (pythonCommand == null - || pythonCommand.isBlank() - || pythonScript == null - || pythonScript.isBlank()) { - log.debug("Python converter not configured"); - return null; - } - - try (TempFile inputFile = new TempFile(tempFileManager, ".cff"); - TempFile outputFile = new TempFile(tempFileManager, ".otf"); - TempFile toUnicodeFile = - toUnicode != null ? new TempFile(tempFileManager, ".tounicode") : null) { - Files.write(inputFile.getPath(), fontBytes); - - // Write ToUnicode CMap data if available - if (toUnicode != null && toUnicodeFile != null) { - byte[] toUnicodeBytes = Base64.getDecoder().decode(toUnicode); - Files.write(toUnicodeFile.getPath(), toUnicodeBytes); - } - - List command = new ArrayList<>(); - command.add(pythonCommand); - command.add(pythonScript); - command.add(inputFile.getAbsolutePath()); - command.add(outputFile.getAbsolutePath()); - // Add optional ToUnicode file path - if (toUnicodeFile != null) { - command.add(toUnicodeFile.getAbsolutePath()); - } - - ProcessBuilder builder = new ProcessBuilder(command); - builder.redirectErrorStream(true); - Process process = builder.start(); - - StringBuilder output = new StringBuilder(); - Thread reader = - new Thread( - () -> { - try (BufferedReader br = - new BufferedReader( - new InputStreamReader( - process.getInputStream(), - StandardCharsets.UTF_8))) { - String line; - while ((line = br.readLine()) != null) { - output.append(line).append('\n'); - } - } catch (IOException ignored) { - } - }); - reader.start(); - - // Wait with timeout (Python fontTools is usually fast, but provide safety margin) - boolean finished = process.waitFor(30, TimeUnit.SECONDS); - if (!finished) { - process.destroyForcibly(); - reader.interrupt(); - log.warn( - "Python CFF→OTF wrapping timed out after 30 seconds - font may be corrupted"); - return null; - } - - int exitCode = process.exitValue(); - reader.join(5000); - - if (exitCode == 0 && Files.exists(outputFile.getPath())) { - byte[] convertedBytes = Files.readAllBytes(outputFile.getPath()); - if (convertedBytes.length > 0) { - String validationError = validateFontTables(convertedBytes); - if (validationError != null) { - log.warn("Python converter produced invalid font: {}", validationError); - return null; - } - - // Log Python script output for debugging - String outputStr = output.toString().trim(); - if (!outputStr.isEmpty()) { - log.debug("Python script output: {}", outputStr); - } - - log.debug( - "Python CFF→OTF wrapping successful: {} bytes → {} bytes", - fontBytes.length, - convertedBytes.length); - return convertedBytes; - } - } else { - String outputStr = output.toString().trim(); - if (!outputStr.isEmpty()) { - log.warn( - "Python CFF→OTF wrapping failed with exit code {}: {}", - exitCode, - outputStr); - } else { - log.warn("Python CFF→OTF wrapping failed with exit code {}", exitCode); - } - } - } catch (InterruptedException ex) { - Thread.currentThread().interrupt(); - log.debug("Python CFF conversion interrupted", ex); - } catch (IOException ex) { - log.debug("Python CFF conversion I/O error", ex); - } - - return null; - } - - private byte[] convertCffUsingFontForge(byte[] fontBytes) { - if (fontforgeCommand == null || fontforgeCommand.isBlank()) { - log.debug("FontForge converter not configured"); - return null; - } - - try (TempFile inputFile = new TempFile(tempFileManager, ".cff"); - TempFile outputFile = new TempFile(tempFileManager, ".ttf")) { - Files.write(inputFile.getPath(), fontBytes); - - List command = new ArrayList<>(); - command.add(fontforgeCommand); - command.add("-lang=ff"); - command.add("-c"); - command.add( - "Open($1); " - + "ScaleToEm(1000); " // Force 1000 units per em (standard for Type1) - + "SelectWorthOutputting(); " - + "SetFontOrder(2); " - + "Reencode(\"unicode\"); " - + "RoundToInt(); " - + "RemoveOverlap(); " - + "Simplify(); " - + "CorrectDirection(); " - + "Generate($2, \"\", 4+16+32); " - + "Close(); " - + "Quit()"); - command.add(inputFile.getAbsolutePath()); - command.add(outputFile.getAbsolutePath()); - - ProcessBuilder builder = new ProcessBuilder(command); - builder.redirectErrorStream(true); - Process process = builder.start(); - - StringBuilder output = new StringBuilder(); - Thread reader = - new Thread( - () -> { - try (BufferedReader br = - new BufferedReader( - new InputStreamReader( - process.getInputStream(), - StandardCharsets.UTF_8))) { - String line; - while ((line = br.readLine()) != null) { - output.append(line).append('\n'); - } - } catch (IOException ignored) { - } - }); - reader.start(); - - // Wait with timeout to prevent hanging on problematic fonts - boolean finished = process.waitFor(30, TimeUnit.SECONDS); - if (!finished) { - process.destroyForcibly(); - reader.interrupt(); - log.warn( - "FontForge conversion timed out after 30 seconds - font may be too complex or causing FontForge to hang"); - return null; - } - - int exitCode = process.exitValue(); - reader.join(5000); // Wait max 5 seconds for reader thread - - if (exitCode == 0 && Files.exists(outputFile.getPath())) { - byte[] convertedBytes = Files.readAllBytes(outputFile.getPath()); - if (convertedBytes.length > 0) { - // Basic validation: check for TrueType magic number and critical tables - if (convertedBytes.length >= 4) { - int magic = - ((convertedBytes[0] & 0xFF) << 24) - | ((convertedBytes[1] & 0xFF) << 16) - | ((convertedBytes[2] & 0xFF) << 8) - | (convertedBytes[3] & 0xFF); - boolean validTrueType = - magic == 0x00010000 || magic == 0x74727565; // 1.0 or 'true' - boolean validOpenType = magic == 0x4F54544F; // 'OTTO' - - if (validTrueType || validOpenType) { - // Additional validation: check unitsPerEm in head table - String validationError = validateFontTables(convertedBytes); - if (validationError != null) { - log.warn("FontForge produced invalid font: {}", validationError); - return null; - } - - log.debug( - "FontForge CFF→TrueType conversion successful: {} bytes, magic: 0x{}, type: {}", - convertedBytes.length, - Integer.toHexString(magic), - validOpenType ? "OpenType" : "TrueType"); - return convertedBytes; - } else { - log.warn( - "FontForge produced invalid font: magic number 0x{} (expected TrueType or OpenType)", - Integer.toHexString(magic)); - return null; - } - } - } - log.warn("FontForge produced empty output file"); - return null; - } - - log.warn( - "FontForge conversion exited with code {}: {}", - exitCode, - output.toString().trim()); - } catch (InterruptedException ex) { - Thread.currentThread().interrupt(); - log.warn("FontForge conversion interrupted"); - } catch (IOException ex) { - log.warn("FontForge conversion failed: {}", ex.getMessage()); - } - - return null; - } - - /** - * Validates critical OpenType/TrueType font tables to ensure browser compatibility. - * - * @return Error message if invalid, null if valid - */ - private String validateFontTables(byte[] fontBytes) { - try { - if (fontBytes.length < 12) { - return "Font file too small"; - } - - // Read table directory - int numTables = ((fontBytes[4] & 0xFF) << 8) | (fontBytes[5] & 0xFF); - if (numTables == 0 || numTables > 100) { - return "Invalid table count: " + numTables; - } - - // Find head table - int offset = 12; // Skip sfnt header - for (int i = 0; i < numTables && offset + 16 <= fontBytes.length; i++) { - String tag = new String(fontBytes, offset, 4, StandardCharsets.US_ASCII); - int tableOffset = - ((fontBytes[offset + 8] & 0xFF) << 24) - | ((fontBytes[offset + 9] & 0xFF) << 16) - | ((fontBytes[offset + 10] & 0xFF) << 8) - | (fontBytes[offset + 11] & 0xFF); - int tableLength = - ((fontBytes[offset + 12] & 0xFF) << 24) - | ((fontBytes[offset + 13] & 0xFF) << 16) - | ((fontBytes[offset + 14] & 0xFF) << 8) - | (fontBytes[offset + 15] & 0xFF); - - if ("head".equals(tag)) { - if (tableOffset + 18 > fontBytes.length) { - return "head table truncated"; - } - // Check unitsPerEm at offset 18 in head table - int unitsPerEm = - ((fontBytes[tableOffset + 18] & 0xFF) << 8) - | (fontBytes[tableOffset + 19] & 0xFF); - if (unitsPerEm < 16 || unitsPerEm > 16384) { - return "Invalid unitsPerEm: " + unitsPerEm + " (must be 16-16384)"; - } - return null; // Valid - } - offset += 16; - } - return "head table not found"; - } catch (Exception ex) { - return "Validation error: " + ex.getMessage(); - } + return fontService.convertCffProgramToTrueType(fontBytes, toUnicode); } private String buildUnicodeMapping(PDFont font, String toUnicodeBase64) throws IOException { - log.debug( - "buildUnicodeMapping called for font: {}, hasToUnicode: {}, isCID: {}", - font.getName(), - toUnicodeBase64 != null, - font instanceof PDType0Font); - if (toUnicodeBase64 == null || toUnicodeBase64.isBlank()) { - log.debug("No ToUnicode data for font: {}", font.getName()); return null; } // For CID fonts (Type0), build complete CharCode→CID→GID→Unicode mapping if (!(font instanceof PDType0Font type0Font)) { // For non-CID fonts, just return ToUnicode as-is - log.debug("Non-CID font {}, returning raw ToUnicode", font.getName()); return toUnicodeBase64; } - log.debug("Building JSON mapping for CID font: {}", font.getName()); - try { // Build a map of CharCode → Unicode from ToUnicode Map charCodeToUnicode = new HashMap<>(); @@ -1265,20 +1003,27 @@ public class PdfJsonConversionService { PDStream fontFile3 = descriptor.getFontFile3(); if (fontFile3 != null) { String subtype = fontFile3.getCOSObject().getNameAsString(COSName.SUBTYPE); + log.info( + "[FONT-DEBUG] Font {}: Found FontFile3 with subtype {}", + font.getName(), + subtype); return readFontProgram( fontFile3, subtype != null ? subtype : "fontfile3", false, toUnicode); } PDStream fontFile2 = descriptor.getFontFile2(); if (fontFile2 != null) { + log.info("[FONT-DEBUG] Font {}: Found FontFile2 (TrueType)", font.getName()); return readFontProgram(fontFile2, null, true, toUnicode); } PDStream fontFile = descriptor.getFontFile(); if (fontFile != null) { + log.info("[FONT-DEBUG] Font {}: Found FontFile (Type1)", font.getName()); return readFontProgram(fontFile, "type1", false, toUnicode); } + log.warn("[FONT-DEBUG] Font {}: No font program found", font.getName()); return null; } @@ -1291,98 +1036,76 @@ public class PdfJsonConversionService { byte[] data = baos.toByteArray(); String format = formatHint; if (detectTrueType) { - format = detectTrueTypeFormat(data); + format = fontService.detectTrueTypeFormat(data); } + log.info( + "[FONT-DEBUG] Font program: size={} bytes, formatHint={}, detectedFormat={}", + data.length, + formatHint, + format); + String webBase64 = null; String webFormat = null; String pdfBase64 = null; String pdfFormat = null; if (format != null && isCffFormat(format)) { - log.debug( - "Detected CFF font format: {}, wrapping as OpenType-CFF for web preview", - format); + log.info( + "[FONT-DEBUG] Font is CFF format, attempting conversion. CFF conversion enabled: {}, method: {}", + fontService.isCffConversionEnabled(), + fontService.getCffConverterMethod()); + byte[] converted = convertCffProgramToTrueType(data, toUnicode); if (converted != null && converted.length > 0) { - String detectedFormat = detectFontFlavor(converted); + String detectedFormat = fontService.detectFontFlavor(converted); webBase64 = Base64.getEncoder().encodeToString(converted); webFormat = detectedFormat; + log.info( + "[FONT-DEBUG] Primary CFF conversion succeeded: {} bytes -> {}", + data.length, + detectedFormat); if ("ttf".equals(detectedFormat)) { pdfBase64 = webBase64; pdfFormat = detectedFormat; } - log.debug( - "Primary CFF conversion successful: {} bytes → {} bytes (format: {})", - data.length, - converted.length, - detectedFormat); } else { - log.debug("CFF→OTF wrapping returned null or empty result"); + log.warn("[FONT-DEBUG] Primary CFF conversion returned null/empty"); } - if (pdfBase64 == null && cffConversionEnabled) { - byte[] ttfConverted = convertCffUsingFontForge(data); + if (pdfBase64 == null && fontService.isCffConversionEnabled()) { + log.info("[FONT-DEBUG] Attempting fallback FontForge conversion"); + byte[] ttfConverted = fontService.convertCffUsingFontForge(data); if (ttfConverted != null && ttfConverted.length > 0) { - String detectedFormat = detectFontFlavor(ttfConverted); + String detectedFormat = fontService.detectFontFlavor(ttfConverted); if (detectedFormat != null) { pdfBase64 = Base64.getEncoder().encodeToString(ttfConverted); pdfFormat = detectedFormat; - log.debug( - "FontForge conversion produced {} bytes (format: {})", - ttfConverted.length, - detectedFormat); if (webBase64 == null) { webBase64 = pdfBase64; webFormat = detectedFormat; } + log.info( + "[FONT-DEBUG] FontForge conversion succeeded: {} bytes -> {}", + data.length, + detectedFormat); } + } else { + log.warn("[FONT-DEBUG] FontForge conversion also returned null/empty"); } } + + if (webBase64 == null && pdfBase64 == null) { + log.error( + "[FONT-DEBUG] ALL CFF conversions failed - font will not be usable in browser!"); + } + } else if (format != null) { + log.info("[FONT-DEBUG] Font is non-CFF format ({}), using as-is", format); } + String base64 = Base64.getEncoder().encodeToString(data); return new FontProgramData(base64, format, webBase64, webFormat, pdfBase64, pdfFormat); } } - private String detectFontFlavor(byte[] fontBytes) { - if (fontBytes == null || fontBytes.length < 4) { - return null; - } - int magic = - ((fontBytes[0] & 0xFF) << 24) - | ((fontBytes[1] & 0xFF) << 16) - | ((fontBytes[2] & 0xFF) << 8) - | (fontBytes[3] & 0xFF); - if (magic == 0x4F54544F) { // 'OTTO' - return "otf"; - } - if (magic == 0x00010000 || magic == 0x74727565) { // 1.0 or 'true' - return "ttf"; - } - return null; - } - - private String detectTrueTypeFormat(byte[] data) { - if (data == null || data.length < 4) { - return "ttf"; - } - String tag = new String(data, 0, 4, StandardCharsets.US_ASCII); - if ("OTTO".equals(tag)) { - return "otf"; - } - if ("true".equals(tag) || "typ1".equals(tag)) { - return "ttf"; - } - int value = - ((data[0] & 0xFF) << 24) - | ((data[1] & 0xFF) << 16) - | ((data[2] & 0xFF) << 8) - | (data[3] & 0xFF); - if (value == 0x00010000) { - return "ttf"; - } - return "ttf"; - } - private String extractToUnicode(COSDictionary fontDictionary) throws IOException { if (fontDictionary == null) { return null; @@ -1455,7 +1178,7 @@ public class PdfJsonConversionService { // imageElements COSBase resourcesBase = page.getCOSObject().getDictionaryObject(COSName.RESOURCES); COSBase filteredResources = filterImageXObjectsFromResources(resourcesBase); - pageModel.setResources(serializeCosValue(filteredResources)); + pageModel.setResources(cosMapper.serializeCosValue(filteredResources)); pageModel.setContentStreams(extractContentStreams(page)); pages.add(pageModel); pageIndex++; @@ -1463,7 +1186,8 @@ public class PdfJsonConversionService { return pages; } - private Map> collectImages(PDDocument document) + private Map> collectImages( + PDDocument document, int totalPages, Consumer progress) throws IOException { Map> imagesByPage = new LinkedHashMap<>(); int pageNumber = 1; @@ -1471,12 +1195,19 @@ public class PdfJsonConversionService { ImageCollectingEngine engine = new ImageCollectingEngine(page, pageNumber, imagesByPage); engine.processPage(page); + + // Update progress for image extraction (70-80%) + int imageProgress = 70 + (int) ((pageNumber / (double) totalPages) * 10); + progress.accept( + PdfJsonConversionProgress.of( + imageProgress, "images", "Extracting images", pageNumber, totalPages)); pageNumber++; } return imagesByPage; } - private Map> collectAnnotations(PDDocument document) + private Map> collectAnnotations( + PDDocument document, int totalPages, Consumer progress) throws IOException { Map> annotationsByPage = new LinkedHashMap<>(); int pageNumber = 1; @@ -1549,7 +1280,7 @@ public class PdfJsonConversionService { } // Store raw dictionary for lossless round-trip - ann.setRawData(serializeCosValue(annotDict)); + ann.setRawData(cosMapper.serializeCosValue(annotDict)); annotations.add(ann); } catch (Exception e) { @@ -1562,6 +1293,16 @@ public class PdfJsonConversionService { if (!annotations.isEmpty()) { annotationsByPage.put(pageNumber, annotations); } + + // Update progress for annotation collection (80-90%) + int annotationProgress = 80 + (int) ((pageNumber / (double) totalPages) * 10); + progress.accept( + PdfJsonConversionProgress.of( + annotationProgress, + "annotations", + "Collecting annotations", + pageNumber, + totalPages)); pageNumber++; } return annotationsByPage; @@ -1619,7 +1360,7 @@ public class PdfJsonConversionService { } // Store raw dictionary for lossless round-trip - formField.setRawData(serializeCosValue(field.getCOSObject())); + formField.setRawData(cosMapper.serializeCosValue(field.getCOSObject())); formFields.add(formField); } catch (Exception e) { @@ -1770,7 +1511,8 @@ public class PdfJsonConversionService { try { // Restore from raw COS data if available for lossless round-trip if (annModel.getRawData() != null) { - COSBase rawAnnot = deserializeCosValue(annModel.getRawData(), document); + COSBase rawAnnot = + cosMapper.deserializeCosValue(annModel.getRawData(), document); if (rawAnnot instanceof COSDictionary) { PDAnnotation annotation = PDAnnotation.createAnnotation((COSDictionary) rawAnnot); @@ -1819,7 +1561,8 @@ public class PdfJsonConversionService { try { // Restore from raw COS data if available for lossless round-trip if (fieldModel.getRawData() != null) { - COSBase rawField = deserializeCosValue(fieldModel.getRawData(), document); + COSBase rawField = + cosMapper.deserializeCosValue(fieldModel.getRawData(), document); if (rawField instanceof COSDictionary) { // Add the field dictionary directly to the fields array fieldsArray.add(rawField); @@ -1852,7 +1595,7 @@ public class PdfJsonConversionService { if (resourcesModel == null) { return; } - COSBase base = deserializeCosValue(resourcesModel, document); + COSBase base = cosMapper.deserializeCosValue(resourcesModel, document); if (base instanceof COSDictionary dictionary) { page.setResources(new PDResources(dictionary)); } @@ -1943,7 +1686,7 @@ public class PdfJsonConversionService { if (streamModel == null) { continue; } - COSStream cosStream = buildStreamFromModel(streamModel, document); + COSStream cosStream = cosMapper.buildStreamFromModel(streamModel, document); if (cosStream != null) { streams.add(new PDStream(cosStream)); } @@ -1959,7 +1702,7 @@ public class PdfJsonConversionService { } while (iterator.hasNext()) { PDStream stream = iterator.next(); - PdfJsonStream model = serializeStream(stream); + PdfJsonStream model = cosMapper.serializeStream(stream); if (model != null) { streams.add(model); } @@ -1967,241 +1710,6 @@ public class PdfJsonConversionService { return streams; } - private COSStream buildStreamFromModel(PdfJsonStream streamModel, PDDocument document) - throws IOException { - COSStream cosStream = document.getDocument().createCOSStream(); - if (streamModel.getDictionary() != null) { - for (Map.Entry entry : - streamModel.getDictionary().entrySet()) { - COSName key = COSName.getPDFName(entry.getKey()); - COSBase value = deserializeCosValue(entry.getValue(), document); - if (value != null) { - cosStream.setItem(key, value); - } - } - } - String rawData = streamModel.getRawData(); - if (rawData != null && !rawData.isBlank()) { - byte[] data; - try { - data = Base64.getDecoder().decode(rawData); - } catch (IllegalArgumentException ex) { - log.debug("Invalid base64 content stream data: {}", ex.getMessage()); - data = new byte[0]; - } - try (OutputStream outputStream = cosStream.createRawOutputStream()) { - outputStream.write(data); - } - cosStream.setItem(COSName.LENGTH, COSInteger.get(data.length)); - } else { - cosStream.setItem(COSName.LENGTH, COSInteger.get(0)); - } - return cosStream; - } - - private PdfJsonStream serializeStream(PDStream stream) throws IOException { - if (stream == null) { - return null; - } - return serializeStream( - stream.getCOSObject(), Collections.newSetFromMap(new IdentityHashMap<>())); - } - - private PdfJsonStream serializeStream(COSStream cosStream) throws IOException { - if (cosStream == null) { - return null; - } - return serializeStream( - cosStream, Collections.newSetFromMap(new IdentityHashMap<>())); - } - - private PdfJsonStream serializeStream(COSStream cosStream, Set visited) - throws IOException { - if (cosStream == null) { - return null; - } - Map dictionary = new LinkedHashMap<>(); - for (COSName key : cosStream.keySet()) { - COSBase value = cosStream.getDictionaryObject(key); - PdfJsonCosValue serialized = serializeCosValue(value, visited); - if (serialized != null) { - dictionary.put(key.getName(), serialized); - } - } - String rawData = null; - try (InputStream inputStream = cosStream.createRawInputStream(); - ByteArrayOutputStream baos = new ByteArrayOutputStream()) { - if (inputStream != null) { - inputStream.transferTo(baos); - } - byte[] data = baos.toByteArray(); - if (data.length > 0) { - rawData = Base64.getEncoder().encodeToString(data); - } - } - return PdfJsonStream.builder().dictionary(dictionary).rawData(rawData).build(); - } - - private PdfJsonCosValue serializeCosValue(COSBase base) throws IOException { - return serializeCosValue( - base, Collections.newSetFromMap(new IdentityHashMap<>())); - } - - private PdfJsonCosValue serializeCosValue(COSBase base, Set visited) throws IOException { - if (base == null) { - return null; - } - if (base instanceof COSObject cosObject) { - base = cosObject.getObject(); - if (base == null) { - return null; - } - } - - boolean complex = - base instanceof COSDictionary - || base instanceof COSArray - || base instanceof COSStream; - if (complex) { - if (!visited.add(base)) { - return PdfJsonCosValue.builder() - .type(PdfJsonCosValue.Type.NAME) - .value("__circular__") - .build(); - } - } - - try { - PdfJsonCosValue.PdfJsonCosValueBuilder builder = PdfJsonCosValue.builder(); - if (base instanceof COSNull) { - builder.type(PdfJsonCosValue.Type.NULL); - return builder.build(); - } - if (base instanceof COSBoolean booleanValue) { - builder.type(PdfJsonCosValue.Type.BOOLEAN).value(booleanValue.getValue()); - return builder.build(); - } - if (base instanceof COSInteger integer) { - builder.type(PdfJsonCosValue.Type.INTEGER).value(integer.longValue()); - return builder.build(); - } - if (base instanceof COSFloat floatValue) { - builder.type(PdfJsonCosValue.Type.FLOAT).value(floatValue.floatValue()); - return builder.build(); - } - if (base instanceof COSName name) { - builder.type(PdfJsonCosValue.Type.NAME).value(name.getName()); - return builder.build(); - } - if (base instanceof COSString cosString) { - builder.type(PdfJsonCosValue.Type.STRING) - .value(Base64.getEncoder().encodeToString(cosString.getBytes())); - return builder.build(); - } - if (base instanceof COSArray array) { - List items = new ArrayList<>(array.size()); - for (COSBase item : array) { - PdfJsonCosValue serialized = serializeCosValue(item, visited); - items.add(serialized); - } - builder.type(PdfJsonCosValue.Type.ARRAY).items(items); - return builder.build(); - } - if (base instanceof COSStream stream) { - builder.type(PdfJsonCosValue.Type.STREAM).stream(serializeStream(stream, visited)); - return builder.build(); - } - if (base instanceof COSDictionary dictionary) { - Map entries = new LinkedHashMap<>(); - for (COSName key : dictionary.keySet()) { - PdfJsonCosValue serialized = - serializeCosValue(dictionary.getDictionaryObject(key), visited); - entries.put(key.getName(), serialized); - } - builder.type(PdfJsonCosValue.Type.DICTIONARY).entries(entries); - return builder.build(); - } - return null; - } finally { - if (complex) { - visited.remove(base); - } - } - } - - private COSBase deserializeCosValue(PdfJsonCosValue value, PDDocument document) - throws IOException { - if (value == null || value.getType() == null) { - return null; - } - switch (value.getType()) { - case NULL: - return COSNull.NULL; - case BOOLEAN: - if (value.getValue() instanceof Boolean bool) { - return COSBoolean.getBoolean(bool); - } - return null; - case INTEGER: - if (value.getValue() instanceof Number number) { - return COSInteger.get(number.longValue()); - } - return null; - case FLOAT: - if (value.getValue() instanceof Number number) { - return new COSFloat(number.floatValue()); - } - return null; - case NAME: - if (value.getValue() instanceof String name) { - return COSName.getPDFName(name); - } - return null; - case STRING: - if (value.getValue() instanceof String encoded) { - try { - byte[] bytes = Base64.getDecoder().decode(encoded); - return new COSString(bytes); - } catch (IllegalArgumentException ex) { - log.debug("Failed to decode COSString value: {}", ex.getMessage()); - } - } - return null; - case ARRAY: - COSArray array = new COSArray(); - if (value.getItems() != null) { - for (PdfJsonCosValue item : value.getItems()) { - COSBase entry = deserializeCosValue(item, document); - if (entry != null) { - array.add(entry); - } else { - array.add(COSNull.NULL); - } - } - } - return array; - case DICTIONARY: - COSDictionary dictionary = new COSDictionary(); - if (value.getEntries() != null) { - for (Map.Entry entry : value.getEntries().entrySet()) { - COSName key = COSName.getPDFName(entry.getKey()); - COSBase entryValue = deserializeCosValue(entry.getValue(), document); - if (entryValue != null) { - dictionary.setItem(key, entryValue); - } - } - } - return dictionary; - case STREAM: - if (value.getStream() != null) { - return buildStreamFromModel(value.getStream(), document); - } - return null; - default: - return null; - } - } - private PDStream extractVectorGraphics( PDDocument document, List preservedStreams, @@ -2252,9 +1760,7 @@ public class PdfJsonConversionService { } private void collectVectorTokens( - List sourceTokens, - List targetTokens, - Set imageObjectNames) { + List sourceTokens, List targetTokens, Set imageObjectNames) { if (sourceTokens == null || sourceTokens.isEmpty()) { return; } @@ -2425,26 +1931,27 @@ public class PdfJsonConversionService { String glyph = new String(Character.toChars(codePoint)); PDFont targetFont = currentFont; - if (!canEncode(baseFont, codePoint)) { + if (!fallbackFontService.canEncode(baseFont, codePoint)) { fallbackApplied = true; - String fallbackId = resolveFallbackFontId(codePoint); + String fallbackId = fallbackFontService.resolveFallbackFontId(codePoint); targetFont = ensureFallbackFont(document, fontMap, fontModels, fallbackId); - if (targetFont == null || !canEncode(targetFont, glyph)) { - String mapped = mapUnsupportedGlyph(codePoint); + if (targetFont == null || !fallbackFontService.canEncode(targetFont, glyph)) { + String mapped = fallbackFontService.mapUnsupportedGlyph(codePoint); if (mapped != null) { - if (canEncode(baseFont, mapped)) { + if (fallbackFontService.canEncode(baseFont, mapped)) { glyph = mapped; targetFont = baseFont; - } else if (targetFont != null && canEncode(targetFont, mapped)) { + } else if (targetFont != null + && fallbackFontService.canEncode(targetFont, mapped)) { glyph = mapped; } } } - if (targetFont == null || !canEncode(targetFont, glyph)) { + if (targetFont == null || !fallbackFontService.canEncode(targetFont, glyph)) { glyph = "?"; targetFont = ensureFallbackFont(document, fontMap, fontModels, FALLBACK_FONT_ID); - if (targetFont == null || !canEncode(targetFont, glyph)) { + if (targetFont == null || !fallbackFontService.canEncode(targetFont, glyph)) { log.debug( "Dropping unsupported glyph U+{} for text element", Integer.toHexString(codePoint)); @@ -2497,68 +2004,6 @@ public class PdfJsonConversionService { return 1000; } - private boolean canEncodeFully(PDFont font, String text) { - return canEncode(font, text); - } - - private boolean canEncode(PDFont font, int codePoint) { - return canEncode(font, new String(Character.toChars(codePoint))); - } - - private boolean canEncode(PDFont font, String text) { - if (font == null || text == null || text.isEmpty()) { - return false; - } - try { - font.encode(text); - return true; - } catch (IOException | IllegalArgumentException ex) { - return false; - } - } - - private String resolveFallbackFontId(int codePoint) { - Character.UnicodeBlock block = Character.UnicodeBlock.of(codePoint); - if (block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS - || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A - || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B - || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C - || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D - || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E - || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F - || block == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION - || block == Character.UnicodeBlock.BOPOMOFO - || block == Character.UnicodeBlock.BOPOMOFO_EXTENDED - || block == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) { - return FALLBACK_FONT_CJK_ID; - } - - Character.UnicodeScript script = Character.UnicodeScript.of(codePoint); - switch (script) { - case HAN: - return FALLBACK_FONT_CJK_ID; - case HIRAGANA: - case KATAKANA: - return FALLBACK_FONT_JP_ID; - case HANGUL: - return FALLBACK_FONT_KR_ID; - case ARABIC: - return FALLBACK_FONT_AR_ID; - case THAI: - return FALLBACK_FONT_TH_ID; - default: - return FALLBACK_FONT_ID; - } - } - - private String mapUnsupportedGlyph(int codePoint) { - return switch (codePoint) { - case 0x276E -> "<"; - case 0x276F -> ">"; - default -> null; - }; - } - private void closeQuietly(TempFile tempFile) { if (tempFile == null) { return; @@ -2742,30 +2187,6 @@ public class PdfJsonConversionService { } } - private static final class FallbackFontSpec { - private final String resourceLocation; - private final String baseName; - private final String format; - - private FallbackFontSpec(String resourceLocation, String baseName, String format) { - this.resourceLocation = resourceLocation; - this.baseName = baseName; - this.format = format; - } - - private String resourceLocation() { - return resourceLocation; - } - - private String baseName() { - return baseName; - } - - private String format() { - return format; - } - } - private static final class FontRun { private final PDFont font; private final String text; @@ -3046,7 +2467,7 @@ public class PdfJsonConversionService { boolean fallbackPresent = fonts != null && fonts.stream().anyMatch(f -> FALLBACK_FONT_ID.equals(f.getId())); if (!fallbackPresent) { - PdfJsonFont fallbackModel = buildFallbackFontModel(); + PdfJsonFont fallbackModel = fallbackFontService.buildFallbackFontModel(); if (fonts != null) { fonts.add(fallbackModel); log.info("Added fallback font definition to JSON font list"); @@ -3058,7 +2479,11 @@ public class PdfJsonConversionService { fonts.stream() .filter(f -> FALLBACK_FONT_ID.equals(f.getId())) .findFirst() - .orElse(buildFallbackFontModel()); + .orElse(null); + if (fallbackModel == null) { + fallbackModel = fallbackFontService.buildFallbackFontModel(); + fonts.add(fallbackModel); + } PDFont fallbackFont = createFontFromModel(document, fallbackModel); fontMap.put(buildFontKey(-1, FALLBACK_FONT_ID), fallbackFont); } @@ -3073,7 +2498,7 @@ public class PdfJsonConversionService { } if (FALLBACK_FONT_ID.equals(fontModel.getId())) { - return loadFallbackPdfFont(document); + return fallbackFontService.loadFallbackPdfFont(document); } String originalFormat = @@ -3224,7 +2649,8 @@ public class PdfJsonConversionService { // embedded program streams captured during extraction. This handles subset fonts whose // raw program bytes cannot be reloaded directly (e.g., missing Unicode cmap tables). if (fontModel.getCosDictionary() != null) { - COSBase restored = deserializeCosValue(fontModel.getCosDictionary(), document); + COSBase restored = + cosMapper.deserializeCosValue(fontModel.getCosDictionary(), document); if (restored instanceof COSDictionary cosDictionary) { try { PDFont font = PDFontFactory.createFont(cosDictionary); @@ -3272,7 +2698,7 @@ public class PdfJsonConversionService { } } - PDFont fallback = loadFallbackPdfFont(document); + PDFont fallback = fallbackFontService.loadFallbackPdfFont(document); applyAdditionalFontMetadata(document, fallback, fontModel); return fallback; } @@ -3989,4 +3415,614 @@ public class PdfJsonConversionService { return null; } } + + /** + * Get the job ID from the current request context + * + * @return The job ID, or null if not in an async job context + */ + private String getJobIdFromRequest() { + // First check ThreadLocal (for async jobs) + String jobId = stirling.software.common.util.JobContext.getJobId(); + if (jobId != null) { + log.debug("Retrieved jobId from JobContext: {}", jobId); + return jobId; + } + + // Fallback to request attribute (for sync jobs) + try { + org.springframework.web.context.request.RequestAttributes attrs = + org.springframework.web.context.request.RequestContextHolder + .getRequestAttributes(); + if (attrs instanceof org.springframework.web.context.request.ServletRequestAttributes) { + jakarta.servlet.http.HttpServletRequest request = + ((org.springframework.web.context.request.ServletRequestAttributes) attrs) + .getRequest(); + jobId = (String) request.getAttribute("jobId"); + if (jobId != null) { + log.debug("Retrieved jobId from request attribute: {}", jobId); + return jobId; + } + } + } catch (Exception e) { + log.debug("Could not retrieve job ID from request context: {}", e.getMessage()); + } + return null; + } + + /** + * Report progress to TaskManager for async jobs + * + * @param jobId The job ID + * @param progress The progress update + */ + private void reportProgressToTaskManager(String jobId, PdfJsonConversionProgress progress) { + try { + log.info( + "Reporting progress for job {}: {}% - {}", + jobId, progress.getPercent(), progress.getStage()); + // Add progress note to job + String note; + if (progress.getCurrent() != null && progress.getTotal() != null) { + note = + String.format( + "[%d%%] %s: %s (%d/%d)", + progress.getPercent(), + progress.getStage(), + progress.getMessage(), + progress.getCurrent(), + progress.getTotal()); + } else { + note = + String.format( + "[%d%%] %s: %s", + progress.getPercent(), progress.getStage(), progress.getMessage()); + } + boolean added = taskManager.addNote(jobId, note); + if (!added) { + log.warn("Failed to add note - job {} not found in TaskManager", jobId); + } else { + log.info("Successfully added progress note for job {}: {}", jobId, note); + } + } catch (Exception e) { + log.error("Exception reporting progress for job {}: {}", jobId, e.getMessage(), e); + } + } + + // ======================================================================== + // Lazy Page Loading Support + // ======================================================================== + + /** + * Stores PDF bytes for lazy page loading. Each page is extracted on-demand by re-loading the + * PDF from bytes. + */ + @lombok.Data + private static class CachedPdfDocument { + private final byte[] pdfBytes; + private final PdfJsonDocumentMetadata metadata; + private final Map fonts; // Font map with UIDs for consistency + private final Map> pageFontResources; // Page font resources + private final long timestamp; + + public CachedPdfDocument( + byte[] pdfBytes, + PdfJsonDocumentMetadata metadata, + Map fonts, + Map> pageFontResources) { + this.pdfBytes = pdfBytes; + this.metadata = metadata; + this.fonts = fonts; + this.pageFontResources = pageFontResources; + this.timestamp = System.currentTimeMillis(); + } + + public CachedPdfDocument withUpdatedPdfBytes(byte[] nextBytes) { + return new CachedPdfDocument(nextBytes, metadata, fonts, pageFontResources); + } + } + + /** + * Extracts document metadata, fonts, and page dimensions without page content. Caches the PDF + * bytes for subsequent page requests. + */ + public byte[] extractDocumentMetadata(MultipartFile file, String jobId) throws IOException { + if (file == null) { + throw ExceptionUtils.createNullArgumentException("fileInput"); + } + + Consumer progress = + jobId != null + ? (p) -> { + log.info( + "Progress: [{}%] {} - {}{}", + p.getPercent(), + p.getStage(), + p.getMessage(), + (p.getCurrent() != null && p.getTotal() != null) + ? String.format( + " (%d/%d)", p.getCurrent(), p.getTotal()) + : ""); + reportProgressToTaskManager(jobId, p); + } + : (p) -> {}; + + // Read PDF bytes once for processing and caching + byte[] pdfBytes = file.getBytes(); + + try (PDDocument document = pdfDocumentFactory.load(pdfBytes, true)) { + int totalPages = document.getNumberOfPages(); + + // Extract fonts + progress.accept( + PdfJsonConversionProgress.of(30, "fonts", "Collecting font information")); + Map fonts = new LinkedHashMap<>(); + Map> pageFontResources = new HashMap<>(); + int pageNumber = 1; + for (PDPage page : document.getPages()) { + Map resourceMap = + collectFontsForPage(document, page, pageNumber, fonts); + pageFontResources.put(pageNumber, resourceMap); + pageNumber++; + } + + // Build metadata response + progress.accept(PdfJsonConversionProgress.of(90, "metadata", "Extracting metadata")); + PdfJsonDocumentMetadata docMetadata = new PdfJsonDocumentMetadata(); + docMetadata.setMetadata(extractMetadata(document)); + docMetadata.setXmpMetadata(extractXmpMetadata(document)); + + List serializedFonts = new ArrayList<>(fonts.values()); + serializedFonts.sort( + Comparator.comparing( + PdfJsonFont::getUid, Comparator.nullsLast(Comparator.naturalOrder()))); + docMetadata.setFonts(serializedFonts); + + // Extract page dimensions + List pageDimensions = new ArrayList<>(); + int pageIndex = 0; + for (PDPage page : document.getPages()) { + PdfJsonPageDimension dim = new PdfJsonPageDimension(); + dim.setPageNumber(pageIndex + 1); + PDRectangle mediaBox = page.getMediaBox(); + dim.setWidth(mediaBox.getWidth()); + dim.setHeight(mediaBox.getHeight()); + dim.setRotation(page.getRotation()); + pageDimensions.add(dim); + pageIndex++; + } + docMetadata.setPageDimensions(pageDimensions); + docMetadata.setFormFields(collectFormFields(document)); + docMetadata.setLazyImages(Boolean.TRUE); + + // Cache PDF bytes, metadata, and fonts for lazy page loading + if (jobId != null) { + CachedPdfDocument cached = + new CachedPdfDocument(pdfBytes, docMetadata, fonts, pageFontResources); + documentCache.put(jobId, cached); + log.info( + "Cached PDF bytes ({} bytes, {} pages, {} fonts) for lazy loading, jobId: {}", + pdfBytes.length, + totalPages, + fonts.size(), + jobId); + + // Schedule cleanup after 30 minutes + scheduleDocumentCleanup(jobId); + } + + progress.accept( + PdfJsonConversionProgress.of(100, "complete", "Metadata extraction complete")); + + return objectMapper.writeValueAsBytes(docMetadata); + } + } + + /** Extracts a single page from cached PDF bytes. Re-loads the PDF for each request. */ + public byte[] extractSinglePage(String jobId, int pageNumber) throws IOException { + CachedPdfDocument cached = documentCache.get(jobId); + if (cached == null) { + throw new IllegalArgumentException("No cached document found for jobId: " + jobId); + } + + int pageIndex = pageNumber - 1; + int totalPages = cached.getMetadata().getPageDimensions().size(); + + if (pageIndex < 0 || pageIndex >= totalPages) { + throw new IllegalArgumentException( + "Page number " + pageNumber + " out of range (1-" + totalPages + ")"); + } + + log.debug( + "Loading PDF from bytes ({} bytes) to extract page {} (jobId: {})", + cached.getPdfBytes().length, + pageNumber, + jobId); + + // Re-load PDF from cached bytes and extract the single page + try (PDDocument document = pdfDocumentFactory.load(cached.getPdfBytes(), true)) { + PDPage page = document.getPage(pageIndex); + PdfJsonPage pageModel = new PdfJsonPage(); + pageModel.setPageNumber(pageNumber); + PDRectangle mediaBox = page.getMediaBox(); + pageModel.setWidth(mediaBox.getWidth()); + pageModel.setHeight(mediaBox.getHeight()); + pageModel.setRotation(page.getRotation()); + + // Extract text on-demand using cached fonts (ensures consistent font UIDs) + Map> textByPage = new LinkedHashMap<>(); + TextCollectingStripper stripper = + new TextCollectingStripper( + document, cached.getFonts(), textByPage, cached.getPageFontResources()); + stripper.setStartPage(pageNumber); + stripper.setEndPage(pageNumber); + stripper.setSortByPosition(true); + stripper.getText(document); + pageModel.setTextElements(textByPage.getOrDefault(pageNumber, List.of())); + + // Extract annotations on-demand + List annotations = new ArrayList<>(); + for (PDAnnotation annotation : page.getAnnotations()) { + try { + PdfJsonAnnotation ann = new PdfJsonAnnotation(); + ann.setSubtype(annotation.getSubtype()); + ann.setContents(annotation.getContents()); + + PDRectangle rect = annotation.getRectangle(); + if (rect != null) { + ann.setRect( + List.of( + rect.getLowerLeftX(), + rect.getLowerLeftY(), + rect.getUpperRightX(), + rect.getUpperRightY())); + } + + COSName appearanceState = annotation.getAppearanceState(); + if (appearanceState != null) { + ann.setAppearanceState(appearanceState.getName()); + } + + if (annotation.getColor() != null) { + float[] colorComponents = annotation.getColor().getComponents(); + List colorList = new ArrayList<>(colorComponents.length); + for (float c : colorComponents) { + colorList.add(c); + } + ann.setColor(colorList); + } + + COSDictionary annotDict = annotation.getCOSObject(); + COSString title = (COSString) annotDict.getDictionaryObject(COSName.T); + if (title != null) { + ann.setAuthor(title.getString()); + } + + COSString subj = (COSString) annotDict.getDictionaryObject(COSName.SUBJ); + if (subj != null) { + ann.setSubject(subj.getString()); + } + + COSString creationDateStr = + (COSString) annotDict.getDictionaryObject(COSName.CREATION_DATE); + if (creationDateStr != null) { + try { + Calendar creationDate = + DateConverter.toCalendar(creationDateStr.getString()); + ann.setCreationDate(formatCalendar(creationDate)); + } catch (Exception e) { + log.debug( + "Failed to parse annotation creation date: {}", e.getMessage()); + } + } + + COSString modDateStr = (COSString) annotDict.getDictionaryObject(COSName.M); + if (modDateStr != null) { + try { + Calendar modDate = DateConverter.toCalendar(modDateStr.getString()); + ann.setModificationDate(formatCalendar(modDate)); + } catch (Exception e) { + log.debug( + "Failed to parse annotation modification date: {}", + e.getMessage()); + } + } + + ann.setRawData(cosMapper.serializeCosValue(annotDict)); + annotations.add(ann); + } catch (Exception e) { + log.warn( + "Failed to extract annotation on page {}: {}", + pageNumber, + e.getMessage()); + } + } + pageModel.setAnnotations(annotations); + + // Extract images on-demand + Map> singlePageImages = new LinkedHashMap<>(); + ImageCollectingEngine engine = + new ImageCollectingEngine(page, pageNumber, singlePageImages); + engine.processPage(page); + List images = singlePageImages.getOrDefault(pageNumber, List.of()); + pageModel.setImageElements(images); + + // Extract resources and content streams + COSBase resourcesBase = page.getCOSObject().getDictionaryObject(COSName.RESOURCES); + COSBase filteredResources = filterImageXObjectsFromResources(resourcesBase); + pageModel.setResources(cosMapper.serializeCosValue(filteredResources)); + pageModel.setContentStreams(extractContentStreams(page)); + + log.debug( + "Extracted page {} (text: {}, images: {}, annotations: {}) for jobId: {}", + pageNumber, + pageModel.getTextElements().size(), + images.size(), + pageModel.getAnnotations().size(), + jobId); + + return objectMapper.writeValueAsBytes(pageModel); + } + } + + public byte[] exportUpdatedPages(String jobId, PdfJsonDocument updates) throws IOException { + if (jobId == null || jobId.isBlank()) { + throw new IllegalArgumentException("jobId is required for incremental export"); + } + CachedPdfDocument cached = documentCache.get(jobId); + if (cached == null) { + throw new IllegalArgumentException("No cached document available for jobId: " + jobId); + } + if (updates == null || updates.getPages() == null || updates.getPages().isEmpty()) { + log.info( + "Incremental export requested with no page updates; returning cached PDF for jobId {}", + jobId); + return cached.getPdfBytes(); + } + + try (PDDocument document = pdfDocumentFactory.load(cached.getPdfBytes(), true)) { + List fontModels = new ArrayList<>(cached.getFonts().values()); + if (updates.getFonts() != null) { + for (PdfJsonFont font : updates.getFonts()) { + if (font == null || font.getId() == null) { + continue; + } + boolean exists = + fontModels.stream() + .anyMatch( + existing -> + Objects.equals(existing.getId(), font.getId()) + && Objects.equals( + existing.getUid(), + font.getUid())); + if (!exists) { + fontModels.add(font); + } + } + } + + List fontModelsCopy = new ArrayList<>(fontModels); + Map fontMap = buildFontMap(document, fontModelsCopy); + + Set updatedPages = new HashSet<>(); + for (PdfJsonPage pageModel : updates.getPages()) { + if (pageModel == null) { + continue; + } + Integer pageNumber = pageModel.getPageNumber(); + if (pageNumber == null) { + log.warn( + "Skipping incremental page update without pageNumber for jobId {}", + jobId); + continue; + } + int pageIndex = pageNumber - 1; + if (pageIndex < 0 || pageIndex >= document.getNumberOfPages()) { + log.warn( + "Skipping incremental update for out-of-range page {} (jobId {})", + pageNumber, + jobId); + continue; + } + PDPage page = document.getPage(pageIndex); + replacePageContentFromModel( + document, page, pageModel, fontMap, fontModelsCopy, pageNumber); + updatedPages.add(pageIndex); + } + + if (updatedPages.isEmpty()) { + log.info( + "Incremental export for jobId {} resulted in no page updates; returning cached PDF", + jobId); + return cached.getPdfBytes(); + } + + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + document.save(baos); + byte[] updatedBytes = baos.toByteArray(); + + documentCache.put(jobId, cached.withUpdatedPdfBytes(updatedBytes)); + + log.info( + "Incremental export complete for jobId {} (pages updated: {})", + jobId, + updatedPages.stream().map(i -> i + 1).sorted().toList()); + return updatedBytes; + } + } + + /** Clears a cached document. */ + public void clearCachedDocument(String jobId) { + CachedPdfDocument cached = documentCache.remove(jobId); + if (cached != null) { + log.info( + "Removed cached PDF bytes ({} bytes) for jobId: {}", + cached.getPdfBytes().length, + jobId); + } + } + + private void replacePageContentFromModel( + PDDocument document, + PDPage page, + PdfJsonPage pageModel, + Map fontMap, + List fontModels, + int pageNumberValue) + throws IOException { + PDRectangle currentBox = page.getMediaBox(); + float fallbackWidth = currentBox != null ? currentBox.getWidth() : 612f; + float fallbackHeight = currentBox != null ? currentBox.getHeight() : 792f; + + float width = safeFloat(pageModel.getWidth(), fallbackWidth); + float height = safeFloat(pageModel.getHeight(), fallbackHeight); + PDRectangle newBox = new PDRectangle(width, height); + page.setMediaBox(newBox); + page.setCropBox(newBox); + + if (pageModel.getRotation() != null) { + page.setRotation(pageModel.getRotation()); + } + + applyPageResources(document, page, pageModel.getResources()); + + List preservedStreams = + buildContentStreams(document, pageModel.getContentStreams()); + if (preservedStreams.isEmpty()) { + page.setContents(new ArrayList<>()); + } else { + page.setContents(preservedStreams); + } + + List imageElements = + pageModel.getImageElements() != null + ? new ArrayList<>(pageModel.getImageElements()) + : new ArrayList<>(); + + if (!preservedStreams.isEmpty() && !imageElements.isEmpty()) { + reconstructImageXObjects(document, page, preservedStreams, imageElements); + } + + List textElements = + pageModel.getTextElements() != null + ? new ArrayList<>(pageModel.getTextElements()) + : new ArrayList<>(); + + PreflightResult preflightResult = + preflightTextElements(document, fontMap, fontModels, textElements, pageNumberValue); + if (!preflightResult.fallbackFontIds().isEmpty()) { + ensureFallbackResources(page, preflightResult.fallbackFontIds(), fontMap); + } + + AppendMode appendMode = + preservedStreams.isEmpty() ? AppendMode.OVERWRITE : AppendMode.APPEND; + + RegenerateMode regenerateMode = + determineRegenerateMode( + document, + page, + preservedStreams, + textElements, + imageElements, + preflightResult, + pageNumberValue); + + if (regenerateMode == RegenerateMode.REUSE_EXISTING) { + page.getAnnotations().clear(); + List annotations = + pageModel.getAnnotations() != null + ? new ArrayList<>(pageModel.getAnnotations()) + : new ArrayList<>(); + restoreAnnotations(document, page, annotations); + return; + } + + if (regenerateMode == RegenerateMode.REGENERATE_WITH_VECTOR_OVERLAY) { + PDStream vectorStream = + extractVectorGraphics(document, preservedStreams, imageElements); + if (vectorStream != null) { + page.setContents(Collections.singletonList(vectorStream)); + appendMode = AppendMode.APPEND; + } else { + page.setContents(new ArrayList<>()); + appendMode = AppendMode.OVERWRITE; + } + } else if (regenerateMode == RegenerateMode.REGENERATE_CLEAR) { + page.setContents(new ArrayList<>()); + appendMode = AppendMode.OVERWRITE; + } + + regeneratePageContent( + document, + page, + textElements, + imageElements, + fontMap, + fontModels, + pageNumberValue, + appendMode); + + page.getAnnotations().clear(); + List annotations = + pageModel.getAnnotations() != null + ? new ArrayList<>(pageModel.getAnnotations()) + : new ArrayList<>(); + restoreAnnotations(document, page, annotations); + } + + private RegenerateMode determineRegenerateMode( + PDDocument document, + PDPage page, + List preservedStreams, + List textElements, + List imageElements, + PreflightResult preflightResult, + int pageNumberValue) + throws IOException { + boolean hasText = textElements != null && !textElements.isEmpty(); + boolean hasImages = imageElements != null && !imageElements.isEmpty(); + + if (!hasText && !hasImages) { + return RegenerateMode.REGENERATE_CLEAR; + } + + if (preservedStreams.isEmpty()) { + return RegenerateMode.REGENERATE_CLEAR; + } + + if (hasImages) { + return RegenerateMode.REGENERATE_WITH_VECTOR_OVERLAY; + } + + if (hasText && !preflightResult.usesFallback()) { + boolean rewriteSucceeded = rewriteTextOperators(document, page, textElements, false); + if (rewriteSucceeded) { + return RegenerateMode.REUSE_EXISTING; + } + return RegenerateMode.REGENERATE_WITH_VECTOR_OVERLAY; + } + + return RegenerateMode.REGENERATE_WITH_VECTOR_OVERLAY; + } + + private enum RegenerateMode { + REUSE_EXISTING, + REGENERATE_WITH_VECTOR_OVERLAY, + REGENERATE_CLEAR + } + + /** Schedules automatic cleanup of cached documents after 30 minutes. */ + private void scheduleDocumentCleanup(String jobId) { + new Thread( + () -> { + try { + Thread.sleep(TimeUnit.MINUTES.toMillis(30)); + clearCachedDocument(jobId); + log.info("Auto-cleaned cached document for jobId: {}", jobId); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + }) + .start(); + } } diff --git a/app/proprietary/src/main/java/stirling/software/SPDF/service/PdfJsonCosMapper.java b/app/proprietary/src/main/java/stirling/software/SPDF/service/PdfJsonCosMapper.java new file mode 100644 index 000000000..c990c568b --- /dev/null +++ b/app/proprietary/src/main/java/stirling/software/SPDF/service/PdfJsonCosMapper.java @@ -0,0 +1,274 @@ +package stirling.software.SPDF.service; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.ArrayList; +import java.util.Base64; +import java.util.Collections; +import java.util.IdentityHashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.pdfbox.cos.COSArray; +import org.apache.pdfbox.cos.COSBase; +import org.apache.pdfbox.cos.COSBoolean; +import org.apache.pdfbox.cos.COSDictionary; +import org.apache.pdfbox.cos.COSFloat; +import org.apache.pdfbox.cos.COSInteger; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.cos.COSNull; +import org.apache.pdfbox.cos.COSObject; +import org.apache.pdfbox.cos.COSStream; +import org.apache.pdfbox.cos.COSString; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.common.PDStream; +import org.springframework.stereotype.Component; + +import lombok.extern.slf4j.Slf4j; + +import stirling.software.SPDF.model.json.PdfJsonCosValue; +import stirling.software.SPDF.model.json.PdfJsonStream; + +@Slf4j +@Component +public class PdfJsonCosMapper { + + public PdfJsonStream serializeStream(PDStream stream) throws IOException { + if (stream == null) { + return null; + } + return serializeStream( + stream.getCOSObject(), Collections.newSetFromMap(new IdentityHashMap<>())); + } + + public PdfJsonStream serializeStream(COSStream cosStream) throws IOException { + if (cosStream == null) { + return null; + } + return serializeStream(cosStream, Collections.newSetFromMap(new IdentityHashMap<>())); + } + + public PdfJsonCosValue serializeCosValue(COSBase base) throws IOException { + return serializeCosValue(base, Collections.newSetFromMap(new IdentityHashMap<>())); + } + + public COSBase deserializeCosValue(PdfJsonCosValue value, PDDocument document) + throws IOException { + if (value == null || value.getType() == null) { + return null; + } + switch (value.getType()) { + case NULL: + return COSNull.NULL; + case BOOLEAN: + if (value.getValue() instanceof Boolean bool) { + return COSBoolean.getBoolean(bool); + } + return null; + case INTEGER: + if (value.getValue() instanceof Number number) { + return COSInteger.get(number.longValue()); + } + return null; + case FLOAT: + if (value.getValue() instanceof Number number) { + return new COSFloat(number.floatValue()); + } + return null; + case NAME: + if (value.getValue() instanceof String name) { + return COSName.getPDFName(name); + } + return null; + case STRING: + if (value.getValue() instanceof String encoded) { + try { + byte[] bytes = Base64.getDecoder().decode(encoded); + return new COSString(bytes); + } catch (IllegalArgumentException ex) { + log.debug("Failed to decode COSString value: {}", ex.getMessage()); + } + } + return null; + case ARRAY: + COSArray array = new COSArray(); + if (value.getItems() != null) { + for (PdfJsonCosValue item : value.getItems()) { + COSBase entry = deserializeCosValue(item, document); + if (entry != null) { + array.add(entry); + } else { + array.add(COSNull.NULL); + } + } + } + return array; + case DICTIONARY: + COSDictionary dictionary = new COSDictionary(); + if (value.getEntries() != null) { + for (Map.Entry entry : value.getEntries().entrySet()) { + COSName key = COSName.getPDFName(entry.getKey()); + COSBase entryValue = deserializeCosValue(entry.getValue(), document); + if (entryValue != null) { + dictionary.setItem(key, entryValue); + } + } + } + return dictionary; + case STREAM: + if (value.getStream() != null) { + return buildStreamFromModel(value.getStream(), document); + } + return null; + default: + return null; + } + } + + public COSStream buildStreamFromModel(PdfJsonStream streamModel, PDDocument document) + throws IOException { + if (streamModel == null) { + return null; + } + COSStream cosStream = document.getDocument().createCOSStream(); + if (streamModel.getDictionary() != null) { + for (Map.Entry entry : + streamModel.getDictionary().entrySet()) { + COSName key = COSName.getPDFName(entry.getKey()); + COSBase value = deserializeCosValue(entry.getValue(), document); + if (value != null) { + cosStream.setItem(key, value); + } + } + } + + String rawData = streamModel.getRawData(); + if (rawData != null && !rawData.isBlank()) { + byte[] data; + try { + data = Base64.getDecoder().decode(rawData); + } catch (IllegalArgumentException ex) { + log.debug("Invalid base64 content stream data: {}", ex.getMessage()); + data = new byte[0]; + } + try (OutputStream outputStream = cosStream.createRawOutputStream()) { + outputStream.write(data); + } + cosStream.setItem(COSName.LENGTH, COSInteger.get(data.length)); + } else { + cosStream.setItem(COSName.LENGTH, COSInteger.get(0)); + } + return cosStream; + } + + private PdfJsonCosValue serializeCosValue(COSBase base, Set visited) + throws IOException { + if (base == null) { + return null; + } + if (base instanceof COSObject cosObject) { + base = cosObject.getObject(); + if (base == null) { + return null; + } + } + + boolean complex = + base instanceof COSDictionary + || base instanceof COSArray + || base instanceof COSStream; + if (complex) { + if (!visited.add(base)) { + return PdfJsonCosValue.builder() + .type(PdfJsonCosValue.Type.NAME) + .value("__circular__") + .build(); + } + } + + try { + PdfJsonCosValue.PdfJsonCosValueBuilder builder = PdfJsonCosValue.builder(); + if (base instanceof COSNull) { + builder.type(PdfJsonCosValue.Type.NULL); + return builder.build(); + } + if (base instanceof COSBoolean booleanValue) { + builder.type(PdfJsonCosValue.Type.BOOLEAN).value(booleanValue.getValue()); + return builder.build(); + } + if (base instanceof COSInteger integer) { + builder.type(PdfJsonCosValue.Type.INTEGER).value(integer.longValue()); + return builder.build(); + } + if (base instanceof COSFloat floatValue) { + builder.type(PdfJsonCosValue.Type.FLOAT).value(floatValue.floatValue()); + return builder.build(); + } + if (base instanceof COSName name) { + builder.type(PdfJsonCosValue.Type.NAME).value(name.getName()); + return builder.build(); + } + if (base instanceof COSString cosString) { + builder.type(PdfJsonCosValue.Type.STRING) + .value(Base64.getEncoder().encodeToString(cosString.getBytes())); + return builder.build(); + } + if (base instanceof COSArray array) { + List items = new ArrayList<>(array.size()); + for (COSBase item : array) { + PdfJsonCosValue serialized = serializeCosValue(item, visited); + items.add(serialized); + } + builder.type(PdfJsonCosValue.Type.ARRAY).items(items); + return builder.build(); + } + if (base instanceof COSStream stream) { + builder.type(PdfJsonCosValue.Type.STREAM).stream(serializeStream(stream, visited)); + return builder.build(); + } + if (base instanceof COSDictionary dictionary) { + Map entries = new LinkedHashMap<>(); + for (COSName key : dictionary.keySet()) { + PdfJsonCosValue serialized = + serializeCosValue(dictionary.getDictionaryObject(key), visited); + entries.put(key.getName(), serialized); + } + builder.type(PdfJsonCosValue.Type.DICTIONARY).entries(entries); + return builder.build(); + } + return null; + } finally { + if (complex) { + visited.remove(base); + } + } + } + + private PdfJsonStream serializeStream(COSStream cosStream, Set visited) + throws IOException { + Map dictionary = new LinkedHashMap<>(); + for (COSName key : cosStream.keySet()) { + COSBase value = cosStream.getDictionaryObject(key); + PdfJsonCosValue serialized = serializeCosValue(value, visited); + if (serialized != null) { + dictionary.put(key.getName(), serialized); + } + } + String rawData = null; + try (InputStream inputStream = cosStream.createRawInputStream(); + ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + if (inputStream != null) { + inputStream.transferTo(baos); + } + byte[] data = baos.toByteArray(); + if (data.length > 0) { + rawData = Base64.getEncoder().encodeToString(data); + } + } + return PdfJsonStream.builder().dictionary(dictionary).rawData(rawData).build(); + } +} diff --git a/app/proprietary/src/main/java/stirling/software/SPDF/service/PdfJsonFallbackFontService.java b/app/proprietary/src/main/java/stirling/software/SPDF/service/PdfJsonFallbackFontService.java new file mode 100644 index 000000000..ee9a4ee75 --- /dev/null +++ b/app/proprietary/src/main/java/stirling/software/SPDF/service/PdfJsonFallbackFontService.java @@ -0,0 +1,224 @@ +package stirling.software.SPDF.service; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.Locale; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.font.PDFont; +import org.apache.pdfbox.pdmodel.font.PDType0Font; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.core.io.Resource; +import org.springframework.core.io.ResourceLoader; +import org.springframework.stereotype.Component; + +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; + +import stirling.software.SPDF.model.json.PdfJsonFont; + +@Slf4j +@Component +@RequiredArgsConstructor +public class PdfJsonFallbackFontService { + + public static final String FALLBACK_FONT_ID = "fallback-noto-sans"; + public static final String DEFAULT_FALLBACK_FONT_LOCATION = + "classpath:/static/fonts/NotoSans-Regular.ttf"; + public static final String FALLBACK_FONT_CJK_ID = "fallback-noto-cjk"; + public static final String FALLBACK_FONT_JP_ID = "fallback-noto-jp"; + public static final String FALLBACK_FONT_KR_ID = "fallback-noto-korean"; + public static final String FALLBACK_FONT_AR_ID = "fallback-noto-arabic"; + public static final String FALLBACK_FONT_TH_ID = "fallback-noto-thai"; + + private static final Map BUILT_IN_FALLBACK_FONTS = + Map.ofEntries( + Map.entry( + FALLBACK_FONT_CJK_ID, + new FallbackFontSpec( + "classpath:/static/fonts/NotoSansSC-Regular.ttf", + "NotoSansSC-Regular", + "ttf")), + Map.entry( + FALLBACK_FONT_JP_ID, + new FallbackFontSpec( + "classpath:/static/fonts/NotoSansJP-Regular.ttf", + "NotoSansJP-Regular", + "ttf")), + Map.entry( + FALLBACK_FONT_KR_ID, + new FallbackFontSpec( + "classpath:/static/fonts/malgun.ttf", "MalgunGothic", "ttf")), + Map.entry( + FALLBACK_FONT_AR_ID, + new FallbackFontSpec( + "classpath:/static/fonts/NotoSansArabic-Regular.ttf", + "NotoSansArabic-Regular", + "ttf")), + Map.entry( + FALLBACK_FONT_TH_ID, + new FallbackFontSpec( + "classpath:/static/fonts/NotoSansThai-Regular.ttf", + "NotoSansThai-Regular", + "ttf"))); + + private final ResourceLoader resourceLoader; + + @Value("${stirling.pdf.fallback-font:" + DEFAULT_FALLBACK_FONT_LOCATION + "}") + private String fallbackFontLocation; + + private final Map fallbackFontCache = new ConcurrentHashMap<>(); + + public PdfJsonFont buildFallbackFontModel() throws IOException { + return buildFallbackFontModel(FALLBACK_FONT_ID); + } + + public PdfJsonFont buildFallbackFontModel(String fallbackId) throws IOException { + FallbackFontSpec spec = getFallbackFontSpec(fallbackId); + if (spec == null) { + throw new IOException("Unknown fallback font id " + fallbackId); + } + byte[] bytes = loadFallbackFontBytes(fallbackId, spec); + String base64 = java.util.Base64.getEncoder().encodeToString(bytes); + return PdfJsonFont.builder() + .id(fallbackId) + .uid(fallbackId) + .baseName(spec.baseName()) + .subtype("TrueType") + .embedded(true) + .program(base64) + .programFormat(spec.format()) + .build(); + } + + public PDFont loadFallbackPdfFont(PDDocument document) throws IOException { + return loadFallbackPdfFont(document, FALLBACK_FONT_ID); + } + + public PDFont loadFallbackPdfFont(PDDocument document, String fallbackId) throws IOException { + FallbackFontSpec spec = getFallbackFontSpec(fallbackId); + if (spec == null) { + throw new IOException("Unknown fallback font id " + fallbackId); + } + byte[] bytes = loadFallbackFontBytes(fallbackId, spec); + try (InputStream stream = new ByteArrayInputStream(bytes)) { + return PDType0Font.load(document, stream, true); + } + } + + public boolean canEncodeFully(PDFont font, String text) { + return canEncode(font, text); + } + + public boolean canEncode(PDFont font, int codePoint) { + return canEncode(font, new String(Character.toChars(codePoint))); + } + + public boolean canEncode(PDFont font, String text) { + if (font == null || text == null || text.isEmpty()) { + return false; + } + try { + font.encode(text); + return true; + } catch (IOException | IllegalArgumentException ex) { + return false; + } + } + + public String resolveFallbackFontId(int codePoint) { + Character.UnicodeBlock block = Character.UnicodeBlock.of(codePoint); + if (block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS + || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A + || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B + || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C + || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D + || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E + || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F + || block == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION + || block == Character.UnicodeBlock.BOPOMOFO + || block == Character.UnicodeBlock.BOPOMOFO_EXTENDED + || block == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) { + return FALLBACK_FONT_CJK_ID; + } + + Character.UnicodeScript script = Character.UnicodeScript.of(codePoint); + return switch (script) { + case HAN -> FALLBACK_FONT_CJK_ID; + case HIRAGANA, KATAKANA -> FALLBACK_FONT_JP_ID; + case HANGUL -> FALLBACK_FONT_KR_ID; + case ARABIC -> FALLBACK_FONT_AR_ID; + case THAI -> FALLBACK_FONT_TH_ID; + default -> FALLBACK_FONT_ID; + }; + } + + public String mapUnsupportedGlyph(int codePoint) { + return switch (codePoint) { + case 0x276E -> "<"; + case 0x276F -> ">"; + default -> null; + }; + } + + private FallbackFontSpec getFallbackFontSpec(String fallbackId) { + if (FALLBACK_FONT_ID.equals(fallbackId)) { + String baseName = inferBaseName(fallbackFontLocation, "NotoSans-Regular"); + String format = inferFormat(fallbackFontLocation, "ttf"); + return new FallbackFontSpec(fallbackFontLocation, baseName, format); + } + return BUILT_IN_FALLBACK_FONTS.get(fallbackId); + } + + private byte[] loadFallbackFontBytes(String fallbackId, FallbackFontSpec spec) + throws IOException { + if (spec == null) { + throw new IOException("No fallback font specification for " + fallbackId); + } + byte[] cached = fallbackFontCache.get(fallbackId); + if (cached != null) { + return cached; + } + Resource resource = resourceLoader.getResource(spec.resourceLocation()); + if (!resource.exists()) { + throw new IOException("Fallback font resource not found at " + spec.resourceLocation()); + } + try (InputStream inputStream = resource.getInputStream(); + ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + inputStream.transferTo(baos); + byte[] bytes = baos.toByteArray(); + fallbackFontCache.put(fallbackId, bytes); + return bytes; + } + } + + private String inferBaseName(String location, String defaultName) { + if (location == null || location.isBlank()) { + return defaultName; + } + int slash = location.lastIndexOf('/'); + String fileName = slash >= 0 ? location.substring(slash + 1) : location; + int dot = fileName.lastIndexOf('.'); + if (dot > 0) { + fileName = fileName.substring(0, dot); + } + return fileName.isEmpty() ? defaultName : fileName; + } + + private String inferFormat(String location, String defaultFormat) { + if (location == null || location.isBlank()) { + return defaultFormat; + } + int dot = location.lastIndexOf('.'); + if (dot >= 0 && dot < location.length() - 1) { + return location.substring(dot + 1).toLowerCase(Locale.ROOT); + } + return defaultFormat; + } + + private record FallbackFontSpec(String resourceLocation, String baseName, String format) {} +} diff --git a/app/proprietary/src/main/java/stirling/software/SPDF/service/pdfjson/PdfJsonFontService.java b/app/proprietary/src/main/java/stirling/software/SPDF/service/pdfjson/PdfJsonFontService.java new file mode 100644 index 000000000..8b638882a --- /dev/null +++ b/app/proprietary/src/main/java/stirling/software/SPDF/service/pdfjson/PdfJsonFontService.java @@ -0,0 +1,349 @@ +package stirling.software.SPDF.service.pdfjson; + +import java.io.IOException; +import java.nio.file.Files; +import java.util.Base64; +import java.util.Locale; + +import org.springframework.beans.factory.annotation.Value; +import org.springframework.stereotype.Service; + +import jakarta.annotation.PostConstruct; + +import lombok.Getter; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; + +import stirling.software.common.util.ProcessExecutor; +import stirling.software.common.util.ProcessExecutor.ProcessExecutorResult; +import stirling.software.common.util.TempFile; +import stirling.software.common.util.TempFileManager; + +@Slf4j +@Service +@RequiredArgsConstructor +public class PdfJsonFontService { + + private final TempFileManager tempFileManager; + + @Getter + @Value("${stirling.pdf.json.cff-converter.enabled:true}") + private boolean cffConversionEnabled; + + @Getter + @Value("${stirling.pdf.json.cff-converter.method:python}") + private String cffConverterMethod; + + @Value("${stirling.pdf.json.cff-converter.python-command:/opt/venv/bin/python3}") + private String pythonCommand; + + @Value("${stirling.pdf.json.cff-converter.python-script:/scripts/convert_cff_to_ttf.py}") + private String pythonScript; + + @Value("${stirling.pdf.json.cff-converter.fontforge-command:fontforge}") + private String fontforgeCommand; + + private volatile boolean pythonCffConverterAvailable; + private volatile boolean fontForgeCffConverterAvailable; + + @PostConstruct + private void initialiseCffConverterAvailability() { + if (!cffConversionEnabled) { + log.warn("[FONT-DEBUG] CFF conversion is DISABLED in configuration"); + pythonCffConverterAvailable = false; + fontForgeCffConverterAvailable = false; + return; + } + + log.info("[FONT-DEBUG] CFF conversion enabled, checking tool availability..."); + pythonCffConverterAvailable = isCommandAvailable(pythonCommand); + if (!pythonCffConverterAvailable) { + log.warn( + "[FONT-DEBUG] Python command '{}' not found; Python CFF conversion disabled", + pythonCommand); + } else { + log.info("[FONT-DEBUG] Python command '{}' is available", pythonCommand); + } + + fontForgeCffConverterAvailable = isCommandAvailable(fontforgeCommand); + if (!fontForgeCffConverterAvailable) { + log.warn( + "[FONT-DEBUG] FontForge command '{}' not found; FontForge CFF conversion disabled", + fontforgeCommand); + } else { + log.info("[FONT-DEBUG] FontForge command '{}' is available", fontforgeCommand); + } + + log.info("[FONT-DEBUG] Selected CFF converter method: {}", cffConverterMethod); + } + + public byte[] convertCffProgramToTrueType(byte[] fontBytes, String toUnicode) { + if (!cffConversionEnabled || fontBytes == null || fontBytes.length == 0) { + log.warn( + "[FONT-DEBUG] CFF conversion skipped: enabled={}, bytes={}", + cffConversionEnabled, + fontBytes == null ? "null" : fontBytes.length); + return null; + } + + log.info( + "[FONT-DEBUG] Converting CFF font: {} bytes, method: {}", + fontBytes.length, + cffConverterMethod); + + if ("python".equalsIgnoreCase(cffConverterMethod)) { + if (!pythonCffConverterAvailable) { + log.warn("[FONT-DEBUG] Python CFF converter not available, skipping conversion"); + return null; + } + byte[] result = convertCffUsingPython(fontBytes, toUnicode); + log.info( + "[FONT-DEBUG] Python conversion result: {}", + result == null ? "null" : result.length + " bytes"); + return result; + } else if ("fontforge".equalsIgnoreCase(cffConverterMethod)) { + if (!fontForgeCffConverterAvailable) { + log.warn("[FONT-DEBUG] FontForge CFF converter not available, skipping conversion"); + return null; + } + byte[] result = convertCffUsingFontForge(fontBytes); + log.info( + "[FONT-DEBUG] FontForge conversion result: {}", + result == null ? "null" : result.length + " bytes"); + return result; + } else { + log.warn( + "[FONT-DEBUG] Unknown CFF converter method: {}, falling back to Python", + cffConverterMethod); + if (!pythonCffConverterAvailable) { + log.warn("[FONT-DEBUG] Python CFF converter not available, skipping conversion"); + return null; + } + byte[] result = convertCffUsingPython(fontBytes, toUnicode); + log.info( + "[FONT-DEBUG] Python conversion result: {}", + result == null ? "null" : result.length + " bytes"); + return result; + } + } + + public String detectFontFlavor(byte[] fontBytes) { + if (fontBytes == null || fontBytes.length < 4) { + return null; + } + int signature = + ((fontBytes[0] & 0xFF) << 24) + | ((fontBytes[1] & 0xFF) << 16) + | ((fontBytes[2] & 0xFF) << 8) + | (fontBytes[3] & 0xFF); + if (signature == 0x00010000 || signature == 0x74727565) { + return "ttf"; + } + if (signature == 0x4F54544F) { + return "otf"; + } + if (signature == 0x74746366) { + return "cff"; + } + return null; + } + + public String detectTrueTypeFormat(byte[] data) { + if (data == null || data.length < 4) { + return null; + } + int signature = + ((data[0] & 0xFF) << 24) + | ((data[1] & 0xFF) << 16) + | ((data[2] & 0xFF) << 8) + | (data[3] & 0xFF); + if (signature == 0x00010000) { + return "ttf"; + } + if (signature == 0x4F54544F) { + return "otf"; + } + if (signature == 0x74746366) { + return "cff"; + } + return null; + } + + public String validateFontTables(byte[] fontBytes) { + if (fontBytes == null || fontBytes.length < 12) { + return "Font program too small"; + } + int numTables = ((fontBytes[4] & 0xFF) << 8) | (fontBytes[5] & 0xFF); + if (numTables <= 0 || numTables > 512) { + return "Invalid numTables: " + numTables; + } + return null; + } + + private byte[] convertCffUsingPython(byte[] fontBytes, String toUnicode) { + if (!pythonCffConverterAvailable) { + log.warn("[FONT-DEBUG] Python CFF converter not available"); + return null; + } + if (pythonCommand == null + || pythonCommand.isBlank() + || pythonScript == null + || pythonScript.isBlank()) { + log.warn("[FONT-DEBUG] Python converter not configured"); + return null; + } + + log.info( + "[FONT-DEBUG] Running Python CFF converter: command={}, script={}", + pythonCommand, + pythonScript); + + try (TempFile inputFile = new TempFile(tempFileManager, ".cff"); + TempFile outputFile = new TempFile(tempFileManager, ".otf"); + TempFile toUnicodeFile = + toUnicode != null ? new TempFile(tempFileManager, ".tounicode") : null) { + Files.write(inputFile.getPath(), fontBytes); + if (toUnicodeFile != null) { + try { + byte[] toUnicodeBytes = Base64.getDecoder().decode(toUnicode); + Files.write(toUnicodeFile.getPath(), toUnicodeBytes); + } catch (IllegalArgumentException ex) { + log.warn( + "[FONT-DEBUG] Failed to decode ToUnicode data for CFF conversion: {}", + ex.getMessage()); + return null; + } + } + + String[] command = + buildPythonCommand( + inputFile.getAbsolutePath(), + outputFile.getAbsolutePath(), + toUnicodeFile != null ? toUnicodeFile.getAbsolutePath() : null); + log.info("[FONT-DEBUG] Executing: {}", String.join(" ", command)); + + ProcessExecutorResult result = + ProcessExecutor.getInstance(ProcessExecutor.Processes.CFF_CONVERTER) + .runCommandWithOutputHandling(java.util.Arrays.asList(command)); + + if (result.getRc() != 0) { + log.error( + "[FONT-DEBUG] Python CFF conversion failed with exit code: {}", + result.getRc()); + log.error("[FONT-DEBUG] Stdout: {}", result.getMessages()); + return null; + } + if (!Files.exists(outputFile.getPath())) { + log.error("[FONT-DEBUG] Python CFF conversion produced no output file"); + return null; + } + byte[] data = Files.readAllBytes(outputFile.getPath()); + if (data.length == 0) { + log.error("[FONT-DEBUG] Python CFF conversion returned empty output"); + return null; + } + log.info( + "[FONT-DEBUG] Python CFF conversion succeeded: {} bytes -> {} bytes", + fontBytes.length, + data.length); + return data; + } catch (IOException | InterruptedException ex) { + if (ex instanceof InterruptedException) { + Thread.currentThread().interrupt(); + } + log.error("[FONT-DEBUG] Python CFF conversion exception: {}", ex.getMessage(), ex); + return null; + } + } + + public byte[] convertCffUsingFontForge(byte[] fontBytes) { + if (!fontForgeCffConverterAvailable) { + log.debug("FontForge CFF converter not available"); + return null; + } + + try (TempFile inputFile = new TempFile(tempFileManager, ".cff"); + TempFile outputFile = new TempFile(tempFileManager, ".ttf")) { + Files.write(inputFile.getPath(), fontBytes); + + ProcessExecutorResult result = + ProcessExecutor.getInstance(ProcessExecutor.Processes.CFF_CONVERTER) + .runCommandWithOutputHandling( + java.util.Arrays.asList( + fontforgeCommand, + "-lang=ff", + "-c", + "Open($1); " + + "ScaleToEm(1000); " + + "SelectWorthOutputting(); " + + "SetFontOrder(2); " + + "Reencode(\"unicode\"); " + + "RoundToInt(); " + + "RemoveOverlap(); " + + "Simplify(); " + + "CorrectDirection(); " + + "Generate($2, \"\", 4+16+32); " + + "Close(); " + + "Quit()", + inputFile.getAbsolutePath(), + outputFile.getAbsolutePath())); + + if (result.getRc() != 0) { + log.warn("FontForge CFF conversion failed: {}", result.getRc()); + return null; + } + if (!Files.exists(outputFile.getPath())) { + log.warn("FontForge CFF conversion produced no output"); + return null; + } + byte[] data = Files.readAllBytes(outputFile.getPath()); + if (data.length == 0) { + log.warn("FontForge CFF conversion returned empty output"); + return null; + } + return data; + } catch (IOException | InterruptedException ex) { + if (ex instanceof InterruptedException) { + Thread.currentThread().interrupt(); + } + log.warn("FontForge CFF conversion failed: {}", ex.getMessage()); + return null; + } + } + + private boolean isCommandAvailable(String command) { + if (command == null || command.isBlank()) { + return false; + } + try { + ProcessBuilder processBuilder = new ProcessBuilder(); + if (System.getProperty("os.name").toLowerCase(Locale.ROOT).contains("windows")) { + processBuilder.command("where", command); + } else { + processBuilder.command("which", command); + } + Process process = processBuilder.start(); + int exitCode = process.waitFor(); + return exitCode == 0; + } catch (Exception e) { + log.debug("Error checking for command {}: {}", command, e.getMessage()); + return false; + } + } + + private String[] buildPythonCommand(String input, String output, String toUnicode) { + if (toUnicode != null) { + return new String[] { + pythonCommand, + pythonScript, + "--input", + input, + "--output", + output, + "--to-unicode", + toUnicode + }; + } + return new String[] {pythonCommand, pythonScript, "--input", input, "--output", output}; + } +} diff --git a/app/proprietary/src/main/java/stirling/software/SPDF/service/pdfjson/PdfJsonImageService.java b/app/proprietary/src/main/java/stirling/software/SPDF/service/pdfjson/PdfJsonImageService.java new file mode 100644 index 000000000..58b56b22a --- /dev/null +++ b/app/proprietary/src/main/java/stirling/software/SPDF/service/pdfjson/PdfJsonImageService.java @@ -0,0 +1,444 @@ +package stirling.software.SPDF.service.pdfjson; + +import java.awt.geom.AffineTransform; +import java.awt.geom.Point2D; +import java.awt.image.BufferedImage; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Base64; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.UUID; +import java.util.function.Consumer; + +import javax.imageio.ImageIO; + +import org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine; +import org.apache.pdfbox.contentstream.operator.Operator; +import org.apache.pdfbox.contentstream.operator.OperatorName; +import org.apache.pdfbox.cos.COSBase; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDPageContentStream; +import org.apache.pdfbox.pdmodel.graphics.image.PDImage; +import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; +import org.apache.pdfbox.util.Matrix; +import org.springframework.stereotype.Service; + +import lombok.extern.slf4j.Slf4j; + +import stirling.software.SPDF.model.api.PdfJsonConversionProgress; +import stirling.software.SPDF.model.json.PdfJsonImageElement; + +/** + * Service for handling PDF image operations for JSON conversion (extraction, encoding, rendering). + */ +@Service +@Slf4j +public class PdfJsonImageService { + + private record EncodedImage(String base64, String format) {} + + private record Bounds(float left, float right, float bottom, float top) { + float width() { + return Math.max(0f, right - left); + } + + float height() { + return Math.max(0f, top - bottom); + } + } + + /** + * Collects images from all pages in a PDF document. + * + * @param document The PDF document + * @param totalPages Total number of pages + * @param progress Progress callback + * @return Map of page number to list of image elements + * @throws IOException If image extraction fails + */ + public Map> collectImages( + PDDocument document, int totalPages, Consumer progress) + throws IOException { + Map> imagesByPage = new LinkedHashMap<>(); + int pageNumber = 1; + for (PDPage page : document.getPages()) { + ImageCollectingEngine engine = + new ImageCollectingEngine(page, pageNumber, imagesByPage); + engine.processPage(page); + + // Update progress for image extraction (70-80%) + int imageProgress = 70 + (int) ((pageNumber / (double) totalPages) * 10); + progress.accept( + PdfJsonConversionProgress.of( + imageProgress, "images", "Extracting images", pageNumber, totalPages)); + pageNumber++; + } + return imagesByPage; + } + + /** + * Extracts images from a single PDF page (for on-demand lazy loading). + * + * @param document The PDF document + * @param page The specific page to extract images from + * @param pageNumber The page number (1-indexed) + * @return List of image elements for this page + * @throws IOException If image extraction fails + */ + public List extractImagesForPage( + PDDocument document, PDPage page, int pageNumber) throws IOException { + Map> imagesByPage = new LinkedHashMap<>(); + ImageCollectingEngine engine = new ImageCollectingEngine(page, pageNumber, imagesByPage); + engine.processPage(page); + return imagesByPage.getOrDefault(pageNumber, new ArrayList<>()); + } + + /** + * Draws an image element on a PDF page content stream. + * + * @param contentStream The content stream to draw on + * @param document The PDF document + * @param element The image element to draw + * @param cache Cache of previously created image XObjects + * @throws IOException If drawing fails + */ + public void drawImageElement( + PDPageContentStream contentStream, + PDDocument document, + PdfJsonImageElement element, + Map cache) + throws IOException { + if (element == null || element.getImageData() == null || element.getImageData().isBlank()) { + return; + } + + String cacheKey = + element.getId() != null && !element.getId().isBlank() + ? element.getId() + : Integer.toHexString(System.identityHashCode(element)); + PDImageXObject image = cache.get(cacheKey); + if (image == null) { + image = createImageXObject(document, element); + if (image == null) { + return; + } + cache.put(cacheKey, image); + } + + List transform = element.getTransform(); + if (transform != null && transform.size() == 6) { + Matrix matrix = + new Matrix( + safeFloat(transform.get(0), 1f), + safeFloat(transform.get(1), 0f), + safeFloat(transform.get(2), 0f), + safeFloat(transform.get(3), 1f), + safeFloat(transform.get(4), 0f), + safeFloat(transform.get(5), 0f)); + contentStream.drawImage(image, matrix); + return; + } + + float width = safeFloat(element.getWidth(), fallbackWidth(element)); + float height = safeFloat(element.getHeight(), fallbackHeight(element)); + if (width <= 0f) { + width = Math.max(1f, fallbackWidth(element)); + } + if (height <= 0f) { + height = Math.max(1f, fallbackHeight(element)); + } + float left = resolveLeft(element, width); + float bottom = resolveBottom(element, height); + + contentStream.drawImage(image, left, bottom, width, height); + } + + /** + * Creates a PDImageXObject from a PdfJsonImageElement. + * + * @param document The PDF document + * @param element The image element with base64 data + * @return The created image XObject + * @throws IOException If image creation fails + */ + public PDImageXObject createImageXObject(PDDocument document, PdfJsonImageElement element) + throws IOException { + byte[] data; + try { + data = Base64.getDecoder().decode(element.getImageData()); + } catch (IllegalArgumentException ex) { + log.debug("Failed to decode image element: {}", ex.getMessage()); + return null; + } + String name = element.getId() != null ? element.getId() : UUID.randomUUID().toString(); + return PDImageXObject.createFromByteArray(document, data, name); + } + + private EncodedImage encodeImage(PDImage image) { + try { + BufferedImage bufferedImage = image.getImage(); + if (bufferedImage == null) { + return null; + } + String format = resolveImageFormat(image); + if (format == null || format.isBlank()) { + format = "png"; + } + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + boolean written = ImageIO.write(bufferedImage, format, baos); + if (!written) { + if (!"png".equalsIgnoreCase(format)) { + baos.reset(); + if (!ImageIO.write(bufferedImage, "png", baos)) { + return null; + } + format = "png"; + } else { + return null; + } + } + return new EncodedImage(Base64.getEncoder().encodeToString(baos.toByteArray()), format); + } catch (IOException ex) { + log.debug("Failed to encode image: {}", ex.getMessage()); + return null; + } + } + + private String resolveImageFormat(PDImage image) { + if (image instanceof PDImageXObject xObject) { + String suffix = xObject.getSuffix(); + if (suffix != null && !suffix.isBlank()) { + return suffix.toLowerCase(Locale.ROOT); + } + } + return "png"; + } + + private float fallbackWidth(PdfJsonImageElement element) { + if (element.getRight() != null && element.getLeft() != null) { + return Math.max(0f, element.getRight() - element.getLeft()); + } + if (element.getNativeWidth() != null) { + return element.getNativeWidth(); + } + return 1f; + } + + private float fallbackHeight(PdfJsonImageElement element) { + if (element.getTop() != null && element.getBottom() != null) { + return Math.max(0f, element.getTop() - element.getBottom()); + } + if (element.getNativeHeight() != null) { + return element.getNativeHeight(); + } + return 1f; + } + + private float resolveLeft(PdfJsonImageElement element, float width) { + if (element.getLeft() != null) { + return element.getLeft(); + } + if (element.getX() != null) { + return element.getX(); + } + if (element.getRight() != null) { + return element.getRight() - width; + } + return 0f; + } + + private float resolveBottom(PdfJsonImageElement element, float height) { + if (element.getBottom() != null) { + return element.getBottom(); + } + if (element.getY() != null) { + return element.getY(); + } + if (element.getTop() != null) { + return element.getTop() - height; + } + return 0f; + } + + private List toMatrixValues(Matrix matrix) { + List values = new ArrayList<>(6); + values.add(matrix.getValue(0, 0)); + values.add(matrix.getValue(0, 1)); + values.add(matrix.getValue(1, 0)); + values.add(matrix.getValue(1, 1)); + values.add(matrix.getValue(2, 0)); + values.add(matrix.getValue(2, 1)); + return values; + } + + private float safeFloat(Float value, float defaultValue) { + if (value == null || Float.isNaN(value) || Float.isInfinite(value)) { + return defaultValue; + } + return value; + } + + /** + * Inner engine that extends PDFGraphicsStreamEngine to collect images from PDF content streams. + */ + private class ImageCollectingEngine extends PDFGraphicsStreamEngine { + + private final int pageNumber; + private final Map> imagesByPage; + + private COSName currentXObjectName; + private int imageCounter = 0; + + protected ImageCollectingEngine( + PDPage page, int pageNumber, Map> imagesByPage) + throws IOException { + super(page); + this.pageNumber = pageNumber; + this.imagesByPage = imagesByPage; + } + + @Override + public void processPage(PDPage page) throws IOException { + super.processPage(page); + } + + @Override + public void drawImage(PDImage pdImage) throws IOException { + EncodedImage encoded = encodeImage(pdImage); + if (encoded == null) { + return; + } + Matrix ctm = getGraphicsState().getCurrentTransformationMatrix(); + Bounds bounds = computeBounds(ctm); + List matrixValues = toMatrixValues(ctm); + + PdfJsonImageElement element = + PdfJsonImageElement.builder() + .id(UUID.randomUUID().toString()) + .objectName( + currentXObjectName != null + ? currentXObjectName.getName() + : null) + .inlineImage(!(pdImage instanceof PDImageXObject)) + .nativeWidth(pdImage.getWidth()) + .nativeHeight(pdImage.getHeight()) + .x(bounds.left) + .y(bounds.bottom) + .width(bounds.width()) + .height(bounds.height()) + .left(bounds.left) + .right(bounds.right) + .top(bounds.top) + .bottom(bounds.bottom) + .transform(matrixValues) + .zOrder(-1_000_000 + imageCounter) + .imageData(encoded.base64()) + .imageFormat(encoded.format()) + .build(); + imageCounter++; + imagesByPage.computeIfAbsent(pageNumber, key -> new ArrayList<>()).add(element); + } + + @Override + public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3) + throws IOException { + // Not needed for image extraction + } + + @Override + public void clip(int windingRule) throws IOException { + // Not needed for image extraction + } + + @Override + public void moveTo(float x, float y) throws IOException { + // Not needed for image extraction + } + + @Override + public void lineTo(float x, float y) throws IOException { + // Not needed for image extraction + } + + @Override + public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3) + throws IOException { + // Not needed for image extraction + } + + @Override + public Point2D getCurrentPoint() throws IOException { + return new Point2D.Float(); + } + + @Override + public void closePath() throws IOException { + // Not needed for image extraction + } + + @Override + public void endPath() throws IOException { + // Not needed for image extraction + } + + @Override + public void shadingFill(COSName shadingName) throws IOException { + // Not needed for image extraction + } + + @Override + public void fillAndStrokePath(int windingRule) throws IOException { + // Not needed for image extraction + } + + @Override + public void fillPath(int windingRule) throws IOException { + // Not needed for image extraction + } + + @Override + public void strokePath() throws IOException { + // Not needed for image extraction + } + + @Override + protected void processOperator(Operator operator, List operands) + throws IOException { + if (OperatorName.DRAW_OBJECT.equals(operator.getName()) + && !operands.isEmpty() + && operands.get(0) instanceof COSName name) { + currentXObjectName = name; + } + super.processOperator(operator, operands); + currentXObjectName = null; + } + + private Bounds computeBounds(Matrix ctm) { + AffineTransform transform = ctm.createAffineTransform(); + Point2D.Float p0 = new Point2D.Float(0, 0); + Point2D.Float p1 = new Point2D.Float(1, 0); + Point2D.Float p2 = new Point2D.Float(0, 1); + Point2D.Float p3 = new Point2D.Float(1, 1); + transform.transform(p0, p0); + transform.transform(p1, p1); + transform.transform(p2, p2); + transform.transform(p3, p3); + + float minX = Math.min(Math.min(p0.x, p1.x), Math.min(p2.x, p3.x)); + float maxX = Math.max(Math.max(p0.x, p1.x), Math.max(p2.x, p3.x)); + float minY = Math.min(Math.min(p0.y, p1.y), Math.min(p2.y, p3.y)); + float maxY = Math.max(Math.max(p0.y, p1.y), Math.max(p2.y, p3.y)); + + if (!Float.isFinite(minX) || !Float.isFinite(minY)) { + return new Bounds(0f, 0f, 0f, 0f); + } + return new Bounds(minX, maxX, minY, maxY); + } + } +} diff --git a/app/proprietary/src/main/java/stirling/software/SPDF/service/pdfjson/PdfJsonMetadataService.java b/app/proprietary/src/main/java/stirling/software/SPDF/service/pdfjson/PdfJsonMetadataService.java new file mode 100644 index 000000000..8cbffd538 --- /dev/null +++ b/app/proprietary/src/main/java/stirling/software/SPDF/service/pdfjson/PdfJsonMetadataService.java @@ -0,0 +1,148 @@ +package stirling.software.SPDF.service.pdfjson; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.time.Instant; +import java.time.format.DateTimeParseException; +import java.util.Base64; +import java.util.Calendar; +import java.util.Optional; +import java.util.TimeZone; + +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDDocumentInformation; +import org.apache.pdfbox.pdmodel.common.PDMetadata; +import org.springframework.stereotype.Service; + +import lombok.extern.slf4j.Slf4j; + +import stirling.software.SPDF.model.json.PdfJsonMetadata; + +/** Service for extracting and applying PDF metadata (document info and XMP) for JSON conversion. */ +@Service +@Slf4j +public class PdfJsonMetadataService { + + /** + * Extracts document information metadata from a PDF. + * + * @param document The PDF document + * @return Metadata model with document info + */ + public PdfJsonMetadata extractMetadata(PDDocument document) { + PdfJsonMetadata metadata = new PdfJsonMetadata(); + PDDocumentInformation info = document.getDocumentInformation(); + if (info != null) { + metadata.setTitle(info.getTitle()); + metadata.setAuthor(info.getAuthor()); + metadata.setSubject(info.getSubject()); + metadata.setKeywords(info.getKeywords()); + metadata.setCreator(info.getCreator()); + metadata.setProducer(info.getProducer()); + metadata.setCreationDate(formatCalendar(info.getCreationDate())); + metadata.setModificationDate(formatCalendar(info.getModificationDate())); + metadata.setTrapped(info.getTrapped()); + } + metadata.setNumberOfPages(document.getNumberOfPages()); + return metadata; + } + + /** + * Extracts XMP metadata from a PDF as base64-encoded string. + * + * @param document The PDF document + * @return Base64-encoded XMP metadata, or null if not present + */ + public String extractXmpMetadata(PDDocument document) { + if (document.getDocumentCatalog() == null) { + return null; + } + PDMetadata metadata = document.getDocumentCatalog().getMetadata(); + if (metadata == null) { + return null; + } + try (InputStream inputStream = metadata.createInputStream(); + ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + inputStream.transferTo(baos); + byte[] data = baos.toByteArray(); + if (data.length == 0) { + return null; + } + return Base64.getEncoder().encodeToString(data); + } catch (IOException ex) { + log.debug("Failed to extract XMP metadata: {}", ex.getMessage()); + return null; + } + } + + /** + * Applies metadata to a PDF document. + * + * @param document The PDF document + * @param metadata The metadata to apply + */ + public void applyMetadata(PDDocument document, PdfJsonMetadata metadata) { + if (metadata == null) { + return; + } + PDDocumentInformation info = document.getDocumentInformation(); + info.setTitle(metadata.getTitle()); + info.setAuthor(metadata.getAuthor()); + info.setSubject(metadata.getSubject()); + info.setKeywords(metadata.getKeywords()); + info.setCreator(metadata.getCreator()); + info.setProducer(metadata.getProducer()); + if (metadata.getCreationDate() != null) { + parseInstant(metadata.getCreationDate()) + .ifPresent(instant -> info.setCreationDate(toCalendar(instant))); + } + if (metadata.getModificationDate() != null) { + parseInstant(metadata.getModificationDate()) + .ifPresent(instant -> info.setModificationDate(toCalendar(instant))); + } + info.setTrapped(metadata.getTrapped()); + } + + /** + * Applies XMP metadata to a PDF document from base64-encoded string. + * + * @param document The PDF document + * @param base64 Base64-encoded XMP metadata + */ + public void applyXmpMetadata(PDDocument document, String base64) { + if (base64 == null || base64.isBlank()) { + return; + } + try (InputStream inputStream = + new ByteArrayInputStream(Base64.getDecoder().decode(base64))) { + PDMetadata metadata = new PDMetadata(document, inputStream); + document.getDocumentCatalog().setMetadata(metadata); + } catch (IllegalArgumentException | IOException ex) { + log.debug("Failed to apply XMP metadata: {}", ex.getMessage()); + } + } + + private String formatCalendar(Calendar calendar) { + if (calendar == null) { + return null; + } + return calendar.toInstant().toString(); + } + + private Optional parseInstant(String value) { + try { + return Optional.of(Instant.parse(value)); + } catch (DateTimeParseException ex) { + log.warn("Failed to parse instant '{}': {}", value, ex.getMessage()); + return Optional.empty(); + } + } + + private Calendar toCalendar(Instant instant) { + Calendar calendar = Calendar.getInstance(TimeZone.getTimeZone("UTC")); + calendar.setTimeInMillis(instant.toEpochMilli()); + return calendar; + } +} diff --git a/app/proprietary/src/main/java/stirling/software/SPDF/service/pdfjson/PdfLazyLoadingService.java b/app/proprietary/src/main/java/stirling/software/SPDF/service/pdfjson/PdfLazyLoadingService.java new file mode 100644 index 000000000..cd843ee07 --- /dev/null +++ b/app/proprietary/src/main/java/stirling/software/SPDF/service/pdfjson/PdfLazyLoadingService.java @@ -0,0 +1,308 @@ +package stirling.software.SPDF.service.pdfjson; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.TimeUnit; +import java.util.function.Consumer; + +import org.apache.pdfbox.cos.COSBase; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.pdmodel.font.PDFont; +import org.springframework.stereotype.Service; +import org.springframework.web.multipart.MultipartFile; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import lombok.Data; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; + +import stirling.software.SPDF.model.api.PdfJsonConversionProgress; +import stirling.software.SPDF.model.json.PdfJsonAnnotation; +import stirling.software.SPDF.model.json.PdfJsonCosValue; +import stirling.software.SPDF.model.json.PdfJsonDocumentMetadata; +import stirling.software.SPDF.model.json.PdfJsonFont; +import stirling.software.SPDF.model.json.PdfJsonImageElement; +import stirling.software.SPDF.model.json.PdfJsonPage; +import stirling.software.SPDF.model.json.PdfJsonPageDimension; +import stirling.software.SPDF.model.json.PdfJsonStream; +import stirling.software.SPDF.model.json.PdfJsonTextElement; +import stirling.software.common.service.CustomPDFDocumentFactory; +import stirling.software.common.service.TaskManager; +import stirling.software.common.util.ExceptionUtils; + +/** + * Service for lazy loading PDF pages. Caches PDF documents and extracts pages on-demand to reduce + * memory usage for large PDFs. + */ +@Service +@Slf4j +@RequiredArgsConstructor +public class PdfLazyLoadingService { + + private final CustomPDFDocumentFactory pdfDocumentFactory; + private final ObjectMapper objectMapper; + private final TaskManager taskManager; + private final PdfJsonMetadataService metadataService; + private final PdfJsonImageService imageService; + + /** Cache for storing PDDocuments for lazy page loading. Key is jobId. */ + private final Map documentCache = new ConcurrentHashMap<>(); + + /** + * Stores PDF file bytes for lazy page loading. Each page is extracted on-demand by re-loading + * the PDF from bytes. + */ + @Data + private static class CachedPdfDocument { + private final byte[] pdfBytes; + private final PdfJsonDocumentMetadata metadata; + private final long timestamp; + + public CachedPdfDocument(byte[] pdfBytes, PdfJsonDocumentMetadata metadata) { + this.pdfBytes = pdfBytes; + this.metadata = metadata; + this.timestamp = System.currentTimeMillis(); + } + } + + /** + * Extracts document metadata, fonts, and page dimensions without page content. Caches the PDF + * bytes for subsequent page requests. + * + * @param file The uploaded PDF file + * @param jobId The job ID for caching + * @param fonts Font map (will be populated) + * @param pageFontResources Page font resources map (will be populated) + * @return Serialized metadata JSON + * @throws IOException If extraction fails + */ + public byte[] extractDocumentMetadata( + MultipartFile file, + String jobId, + Map fonts, + Map> pageFontResources) + throws IOException { + if (file == null) { + throw ExceptionUtils.createNullArgumentException("fileInput"); + } + + Consumer progress = + jobId != null + ? (p) -> { + log.info( + "Progress: [{}%] {} - {}{}", + p.getPercent(), + p.getStage(), + p.getMessage(), + (p.getCurrent() != null && p.getTotal() != null) + ? String.format( + " (%d/%d)", p.getCurrent(), p.getTotal()) + : ""); + reportProgressToTaskManager(jobId, p); + } + : (p) -> {}; + + // Read PDF bytes once for processing and caching + byte[] pdfBytes = file.getBytes(); + + try (PDDocument document = pdfDocumentFactory.load(pdfBytes, true)) { + int totalPages = document.getNumberOfPages(); + + // Build metadata response + progress.accept(PdfJsonConversionProgress.of(90, "metadata", "Extracting metadata")); + PdfJsonDocumentMetadata docMetadata = new PdfJsonDocumentMetadata(); + docMetadata.setMetadata(metadataService.extractMetadata(document)); + docMetadata.setXmpMetadata(metadataService.extractXmpMetadata(document)); + docMetadata.setLazyImages(Boolean.TRUE); + + List serializedFonts = new ArrayList<>(fonts.values()); + serializedFonts.sort( + Comparator.comparing( + PdfJsonFont::getUid, Comparator.nullsLast(Comparator.naturalOrder()))); + docMetadata.setFonts(serializedFonts); + + // Extract page dimensions + List pageDimensions = new ArrayList<>(); + int pageIndex = 0; + for (PDPage page : document.getPages()) { + PdfJsonPageDimension dim = new PdfJsonPageDimension(); + dim.setPageNumber(pageIndex + 1); + PDRectangle mediaBox = page.getMediaBox(); + dim.setWidth(mediaBox.getWidth()); + dim.setHeight(mediaBox.getHeight()); + dim.setRotation(page.getRotation()); + pageDimensions.add(dim); + pageIndex++; + } + docMetadata.setPageDimensions(pageDimensions); + + // Cache PDF bytes and metadata for lazy page loading + if (jobId != null) { + CachedPdfDocument cached = new CachedPdfDocument(pdfBytes, docMetadata); + documentCache.put(jobId, cached); + log.info( + "Cached PDF bytes ({} bytes) for lazy loading, jobId: {}", + pdfBytes.length, + jobId); + + // Schedule cleanup after 30 minutes + scheduleDocumentCleanup(jobId); + } + + progress.accept( + PdfJsonConversionProgress.of(100, "complete", "Metadata extraction complete")); + + return objectMapper.writeValueAsBytes(docMetadata); + } + } + + /** + * Extracts a single page from cached PDF bytes. Re-loads the PDF for each request. + * + * @param jobId The job ID + * @param pageNumber The page number (1-indexed) + * @param serializeCosValue Function to serialize COS values + * @param extractContentStreams Function to extract content streams + * @param filterImageXObjectsFromResources Function to filter image XObjects + * @param extractText Function to extract text elements for the page + * @param extractAnnotations Function to extract annotations for the page + * @return Serialized page JSON + * @throws IOException If extraction fails + */ + public byte[] extractSinglePage( + String jobId, + int pageNumber, + java.util.function.Function serializeCosValue, + java.util.function.Function> extractContentStreams, + java.util.function.Function filterImageXObjectsFromResources, + java.util.function.BiFunction> + extractText, + java.util.function.BiFunction> + extractAnnotations) + throws IOException { + CachedPdfDocument cached = documentCache.get(jobId); + if (cached == null) { + throw new IllegalArgumentException("No cached document found for jobId: " + jobId); + } + + int pageIndex = pageNumber - 1; + int totalPages = cached.getMetadata().getPageDimensions().size(); + + if (pageIndex < 0 || pageIndex >= totalPages) { + throw new IllegalArgumentException( + "Page number " + pageNumber + " out of range (1-" + totalPages + ")"); + } + + log.debug("Loading PDF from bytes to extract page {} (jobId: {})", pageNumber, jobId); + + // Re-load PDF from cached bytes and extract the single page + try (PDDocument document = pdfDocumentFactory.load(cached.getPdfBytes(), true)) { + PDPage page = document.getPage(pageIndex); + PdfJsonPage pageModel = new PdfJsonPage(); + pageModel.setPageNumber(pageNumber); + PDRectangle mediaBox = page.getMediaBox(); + pageModel.setWidth(mediaBox.getWidth()); + pageModel.setHeight(mediaBox.getHeight()); + pageModel.setRotation(page.getRotation()); + + // Extract text on-demand + pageModel.setTextElements(extractText.apply(document, pageNumber)); + + // Extract annotations on-demand + pageModel.setAnnotations(extractAnnotations.apply(document, pageNumber)); + + // Extract images on-demand + List images = + imageService.extractImagesForPage(document, page, pageNumber); + pageModel.setImageElements(images); + + // Extract resources and content streams + COSBase resourcesBase = page.getCOSObject().getDictionaryObject(COSName.RESOURCES); + COSBase filteredResources = filterImageXObjectsFromResources.apply(resourcesBase); + pageModel.setResources(serializeCosValue.apply(filteredResources)); + pageModel.setContentStreams(extractContentStreams.apply(page)); + + log.debug( + "Extracted page {} (text: {}, images: {}, annotations: {}) for jobId: {}", + pageNumber, + pageModel.getTextElements().size(), + images.size(), + pageModel.getAnnotations().size(), + jobId); + + return objectMapper.writeValueAsBytes(pageModel); + } + } + + /** Clears a cached document. */ + public void clearCachedDocument(String jobId) { + CachedPdfDocument cached = documentCache.remove(jobId); + if (cached != null) { + log.info( + "Removed cached PDF bytes ({} bytes) for jobId: {}", + cached.getPdfBytes().length, + jobId); + } + } + + /** Schedules automatic cleanup of cached documents after 30 minutes. */ + private void scheduleDocumentCleanup(String jobId) { + new Thread( + () -> { + try { + Thread.sleep(TimeUnit.MINUTES.toMillis(30)); + clearCachedDocument(jobId); + log.info("Auto-cleaned cached document for jobId: {}", jobId); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + }) + .start(); + } + + /** + * Report progress to TaskManager for async jobs + * + * @param jobId The job ID + * @param progress The progress update + */ + private void reportProgressToTaskManager(String jobId, PdfJsonConversionProgress progress) { + try { + log.info( + "Reporting progress for job {}: {}% - {}", + jobId, progress.getPercent(), progress.getStage()); + String note; + if (progress.getCurrent() != null && progress.getTotal() != null) { + note = + String.format( + "[%d%%] %s: %s (%d/%d)", + progress.getPercent(), + progress.getStage(), + progress.getMessage(), + progress.getCurrent(), + progress.getTotal()); + } else { + note = + String.format( + "[%d%%] %s: %s", + progress.getPercent(), progress.getStage(), progress.getMessage()); + } + boolean added = taskManager.addNote(jobId, note); + if (!added) { + log.warn("Failed to add note - job {} not found in TaskManager", jobId); + } else { + log.info("Successfully added progress note for job {}: {}", jobId, note); + } + } catch (Exception e) { + log.error("Exception reporting progress for job {}: {}", jobId, e.getMessage(), e); + } + } +} diff --git a/frontend/public/locales/en-GB/translation.json b/frontend/public/locales/en-GB/translation.json index 43e4c542a..63677052b 100644 --- a/frontend/public/locales/en-GB/translation.json +++ b/frontend/public/locales/en-GB/translation.json @@ -4437,6 +4437,32 @@ "errors": { "invalidJson": "Unable to read the JSON file. Ensure it was generated by the PDF to JSON tool.", "pdfConversion": "Unable to convert the edited JSON back into a PDF." + }, + "options": { + "autoScaleText": { + "title": "Auto-scale text to fit boxes", + "description": "Automatically scales text horizontally to fit within its original bounding box when font rendering differs from PDF." + } + }, + "disclaimer": { + "heading": "Preview limitations", + "textFocus": "This workspace focuses on editing text and repositioning embedded images. Complex page artwork, form widgets, and layered graphics are preserved for export but are not fully editable here.", + "previewVariance": "Some visuals (such as table borders, shapes, or annotation appearances) may not display exactly in the preview. The exported PDF keeps the original drawing commands whenever possible.", + "alpha": "This alpha viewer is still evolving—certain fonts, colours, transparency effects, and layout details may shift slightly. Please double-check the generated PDF before sharing." + }, + "stages": { + "uploading": "Uploading", + "initializing": "Initializing", + "loading": "Loading", + "normalizing": "Normalizing", + "parsing": "Parsing", + "fonts": "Fonts", + "text": "Text Extraction", + "images": "Images", + "annotations": "Annotations", + "metadata": "Metadata", + "serializing": "Finalizing", + "complete": "Complete" } }, "workspace": { diff --git a/frontend/src/proprietary/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx b/frontend/src/proprietary/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx index 6cac64a0c..9aadb26a7 100644 --- a/frontend/src/proprietary/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx +++ b/frontend/src/proprietary/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx @@ -11,8 +11,10 @@ import { FileButton, Group, Pagination, + Progress, ScrollArea, Stack, + Switch, Text, Title, } from '@mantine/core'; @@ -32,6 +34,7 @@ import { PdfJsonEditorViewData, PdfJsonFont, PdfJsonPage, + ConversionProgress, } from '@app/tools/pdfJsonEditor/pdfJsonEditorTypes'; import { getImageBounds, pageDimensions } from '@app/tools/pdfJsonEditor/pdfJsonEditorUtils'; @@ -205,6 +208,9 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { const [activeImageId, setActiveImageId] = useState(null); const [fontFamilies, setFontFamilies] = useState>(new Map()); const [textGroupsExpanded, setTextGroupsExpanded] = useState(false); + const [autoScaleText, setAutoScaleText] = useState(true); + const [textScales, setTextScales] = useState>(new Map()); + const measurementKeyRef = useRef(''); const containerRef = useRef(null); const editorRefs = useRef>(new Map()); const caretOffsetsRef = useRef>(new Map()); @@ -220,6 +226,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { errorMessage, isGeneratingPdf, isConverting, + conversionProgress, hasChanges, onLoadJson, onSelectPage, @@ -562,8 +569,73 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { setActiveGroupId(null); setEditingGroupId(null); setActiveImageId(null); + setTextScales(new Map()); + measurementKeyRef.current = ''; }, [selectedPage]); + // Measure text widths once per page/configuration and apply static scaling + useLayoutEffect(() => { + if (!autoScaleText || visibleGroups.length === 0) { + return; + } + + // Create a stable key for this measurement configuration + const currentKey = `${selectedPage}-${fontFamilies.size}-${autoScaleText}`; + + // Skip if we've already measured for this configuration + if (measurementKeyRef.current === currentKey) { + return; + } + + const measureTextScales = () => { + const newScales = new Map(); + + visibleGroups.forEach((group) => { + // Skip groups that are being edited + if (editingGroupId === group.id) { + return; + } + + const element = document.querySelector(`[data-text-group="${group.id}"]`); + if (!element) { + return; + } + + const textSpan = element.querySelector('span[data-text-content]'); + if (!textSpan) { + return; + } + + // Temporarily remove any existing transform to get natural width + const originalTransform = textSpan.style.transform; + textSpan.style.transform = 'none'; + + const bounds = toCssBounds(currentPage, pageHeight, scale, group.bounds); + const containerWidth = bounds.width; + const textWidth = textSpan.getBoundingClientRect().width; + + // Restore original transform + textSpan.style.transform = originalTransform; + + // Only scale if text overflows by more than 2% + if (textWidth > 0 && textWidth > containerWidth * 1.02) { + const scaleX = Math.max(containerWidth / textWidth, 0.5); // Min 50% scale + newScales.set(group.id, scaleX); + } else { + newScales.set(group.id, 1); + } + }); + + // Mark this configuration as measured + measurementKeyRef.current = currentKey; + setTextScales(newScales); + }; + + // Delay measurement to ensure fonts and layout are ready + const timer = setTimeout(measureTextScales, 150); + return () => clearTimeout(timer); + }, [autoScaleText, visibleGroups, editingGroupId, currentPage, pageHeight, scale, fontFamilies.size, selectedPage]); + useLayoutEffect(() => { if (!editingGroupId) { return; @@ -726,6 +798,27 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { {t('pdfJsonEditor.currentFile', 'Current file: {{name}}', { name: fileName })} )} + + + + +
+ + {t('pdfJsonEditor.options.autoScaleText.title', 'Auto-scale text to fit boxes')} + + + {t( + 'pdfJsonEditor.options.autoScaleText.description', + 'Automatically scales text horizontally to fit within its original bounding box when font rendering differs from PDF.' + )} + +
+ setAutoScaleText(event.currentTarget.checked)} + /> +
@@ -782,10 +875,39 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { {isConverting && ( - - - - {t('pdfJsonEditor.converting', 'Converting PDF to editable format...')} + + +
+ + {conversionProgress + ? conversionProgress.message + : t('pdfJsonEditor.converting', 'Converting PDF to editable format...')} + + {conversionProgress && ( + + + {t(`pdfJsonEditor.stages.${conversionProgress.stage}`, conversionProgress.stage)} + + {conversionProgress.current !== undefined && + conversionProgress.total !== undefined && ( + + • Page {conversionProgress.current} of {conversionProgress.total} + + )} + + )} +
+ +
+ + + {conversionProgress?.percent || 0}% complete
@@ -1105,6 +1227,9 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { ); } + const textScale = textScales.get(group.id) ?? 1; + const shouldScale = autoScaleText && textScale < 0.98; + return ( {renderGroupContainer( @@ -1112,6 +1237,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { isActive, changed,
{ overflow: 'visible', }} > - {group.text || '\u00A0'} + + {group.text || '\u00A0'} +
, () => { setEditingGroupId(group.id); diff --git a/frontend/src/proprietary/data/useProprietaryToolRegistry.tsx b/frontend/src/proprietary/data/useProprietaryToolRegistry.tsx index 34ad6675e..57da85cbb 100644 --- a/frontend/src/proprietary/data/useProprietaryToolRegistry.tsx +++ b/frontend/src/proprietary/data/useProprietaryToolRegistry.tsx @@ -27,8 +27,8 @@ export function useProprietaryToolRegistry(): ProprietaryToolRegistry { "home.pdfJsonEditor.desc", "Review and edit Stirling PDF JSON exports with grouped text editing and PDF regeneration" ), - categoryId: ToolCategoryId.ADVANCED_TOOLS, - subcategoryId: SubcategoryId.DEVELOPER_TOOLS, + categoryId: ToolCategoryId.RECOMMENDED_TOOLS, + subcategoryId: SubcategoryId.GENERAL, workbench: "custom:pdfJsonEditor", endpoints: ["json-pdf"], synonyms: getSynonyms(t, "pdfJsonEditor"), diff --git a/frontend/src/proprietary/tools/pdfJsonEditor/PdfJsonEditor.tsx b/frontend/src/proprietary/tools/pdfJsonEditor/PdfJsonEditor.tsx index 419cab1c9..366db7d45 100644 --- a/frontend/src/proprietary/tools/pdfJsonEditor/PdfJsonEditor.tsx +++ b/frontend/src/proprietary/tools/pdfJsonEditor/PdfJsonEditor.tsx @@ -13,6 +13,7 @@ import { getFilenameFromHeaders } from '@app/utils/fileResponseUtils'; import { PdfJsonDocument, PdfJsonImageElement, + PdfJsonPage, TextGroup, PdfJsonEditorViewData, } from './pdfJsonEditorTypes'; @@ -68,11 +69,39 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => { const [errorMessage, setErrorMessage] = useState(null); const [isGeneratingPdf, setIsGeneratingPdf] = useState(false); const [isConverting, setIsConverting] = useState(false); + const [conversionProgress, setConversionProgress] = useState<{ + percent: number; + stage: string; + message: string; + } | null>(null); + + // Lazy loading state + const [isLazyMode, setIsLazyMode] = useState(false); + const [cachedJobId, setCachedJobId] = useState(null); + const [loadedImagePages, setLoadedImagePages] = useState>(new Set()); + const [loadingImagePages, setLoadingImagePages] = useState>(new Set()); const originalImagesRef = useRef([]); + const imagesByPageRef = useRef([]); const autoLoadKeyRef = useRef(null); const loadRequestIdRef = useRef(0); const latestPdfRequestIdRef = useRef(null); + const loadedDocumentRef = useRef(null); + const loadedImagePagesRef = useRef>(new Set()); + const loadingImagePagesRef = useRef>(new Set()); + + // Keep ref in sync with state for access in async callbacks + useEffect(() => { + loadedDocumentRef.current = loadedDocument; + }, [loadedDocument]); + + useEffect(() => { + loadedImagePagesRef.current = new Set(loadedImagePages); + }, [loadedImagePages]); + + useEffect(() => { + loadingImagePagesRef.current = new Set(loadingImagePages); + }, [loadingImagePages]); const dirtyPages = useMemo( () => getDirtyPages(groupsByPage, imagesByPage, originalImagesRef.current), @@ -88,18 +117,134 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => { setGroupsByPage([]); setImagesByPage([]); originalImagesRef.current = []; + imagesByPageRef.current = []; + setLoadedImagePages(new Set()); + setLoadingImagePages(new Set()); + loadedImagePagesRef.current = new Set(); + loadingImagePagesRef.current = new Set(); setSelectedPage(0); return; } const cloned = deepCloneDocument(document); const groups = groupDocumentText(cloned); const images = extractDocumentImages(cloned); - originalImagesRef.current = images.map((page) => page.map(cloneImageElement)); + const originalImages = images.map((page) => page.map(cloneImageElement)); + originalImagesRef.current = originalImages; + imagesByPageRef.current = images.map((page) => page.map(cloneImageElement)); + const initialLoaded = new Set(); + originalImages.forEach((pageImages, index) => { + if (pageImages.length > 0) { + initialLoaded.add(index); + } + }); setGroupsByPage(groups); setImagesByPage(images); + setLoadedImagePages(initialLoaded); + setLoadingImagePages(new Set()); + loadedImagePagesRef.current = new Set(initialLoaded); + loadingImagePagesRef.current = new Set(); setSelectedPage(0); }, []); + // Load images for a page in lazy mode + const loadImagesForPage = useCallback( + async (pageIndex: number) => { + if (!isLazyMode) { + return; + } + if (!cachedJobId) { + console.log('[loadImagesForPage] No cached jobId, skipping'); + return; + } + if ( + loadedImagePagesRef.current.has(pageIndex) || + loadingImagePagesRef.current.has(pageIndex) + ) { + return; + } + + loadingImagePagesRef.current.add(pageIndex); + setLoadingImagePages((prev) => { + const next = new Set(prev); + next.add(pageIndex); + return next; + }); + + const pageNumber = pageIndex + 1; + const start = performance.now(); + + try { + const response = await apiClient.get( + `/api/v1/convert/pdf/json/page/${cachedJobId}/${pageNumber}`, + { + responseType: 'json', + }, + ); + + const pageData = response.data as PdfJsonPage; + const normalizedImages = (pageData.imageElements ?? []).map(cloneImageElement); + + if (imagesByPageRef.current.length <= pageIndex) { + imagesByPageRef.current.length = pageIndex + 1; + } + imagesByPageRef.current[pageIndex] = normalizedImages.map(cloneImageElement); + + setLoadedDocument((prevDoc) => { + if (!prevDoc || !prevDoc.pages) { + return prevDoc; + } + const nextPages = [...prevDoc.pages]; + const existingPage = nextPages[pageIndex] ?? {}; + nextPages[pageIndex] = { + ...existingPage, + imageElements: normalizedImages.map(cloneImageElement), + }; + return { + ...prevDoc, + pages: nextPages, + }; + }); + + setImagesByPage((prev) => { + const next = [...prev]; + while (next.length <= pageIndex) { + next.push([]); + } + next[pageIndex] = normalizedImages.map(cloneImageElement); + return next; + }); + + if (originalImagesRef.current.length <= pageIndex) { + originalImagesRef.current.length = pageIndex + 1; + } + originalImagesRef.current[pageIndex] = normalizedImages.map(cloneImageElement); + + setLoadedImagePages((prev) => { + const next = new Set(prev); + next.add(pageIndex); + return next; + }); + loadedImagePagesRef.current.add(pageIndex); + + console.log( + `[loadImagesForPage] Loaded ${normalizedImages.length} images for page ${pageNumber} in ${( + performance.now() - start + ).toFixed(2)}ms`, + ); + } catch (error) { + console.error(`[loadImagesForPage] Failed to load images for page ${pageNumber}:`, error); + } finally { + loadingImagePagesRef.current.delete(pageIndex); + setLoadingImagePages((prev) => { + const next = new Set(prev); + next.delete(pageIndex); + return next; + }); + } + }, + [isLazyMode, cachedJobId], + ); + const handleLoadFile = useCallback( async (file: File | null) => { if (!file) { @@ -113,39 +258,200 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => { const isPdf = file.type === 'application/pdf' || file.name.toLowerCase().endsWith('.pdf'); try { - let parsed: PdfJsonDocument; + let parsed: PdfJsonDocument | null = null; + let shouldUseLazyMode = false; + let pendingJobId: string | null = null; setErrorMessage(null); if (isPdf) { latestPdfRequestIdRef.current = requestId; setIsConverting(true); + setConversionProgress({ + percent: 0, + stage: 'uploading', + message: 'Uploading PDF file to server...', + }); const formData = new FormData(); formData.append('fileInput', file); - const response = await apiClient.post(CONVERSION_ENDPOINTS['pdf-json'], formData, { - responseType: 'blob', + console.log('Sending conversion request with async=true'); + const response = await apiClient.post( + `${CONVERSION_ENDPOINTS['pdf-json']}?async=true`, + formData, + { + responseType: 'json', + }, + ); + + console.log('Conversion response:', response.data); + const jobId = response.data.jobId; + + if (!jobId) { + console.error('No job ID in response:', response.data); + throw new Error('No job ID received from server'); + } + + pendingJobId = jobId; + console.log('Got job ID:', jobId); + setConversionProgress({ + percent: 3, + stage: 'processing', + message: 'Starting conversion...', }); - const jsonText = await response.data.text(); - parsed = JSON.parse(jsonText) as PdfJsonDocument; + let jobComplete = false; + let attempts = 0; + const maxAttempts = 600; + + while (!jobComplete && attempts < maxAttempts) { + await new Promise((resolve) => setTimeout(resolve, 1000)); + attempts += 1; + + try { + const statusResponse = await apiClient.get(`/api/v1/general/job/${jobId}`); + const jobStatus = statusResponse.data; + console.log(`Job status (attempt ${attempts}):`, jobStatus); + + if (jobStatus.notes && jobStatus.notes.length > 0) { + const lastNote = jobStatus.notes[jobStatus.notes.length - 1]; + console.log('Latest note:', lastNote); + const matchWithCount = lastNote.match( + /\[(\d+)%\]\s+(\w+):\s+(.+?)\s+\((\d+)\/(\d+)\)/, + ); + if (matchWithCount) { + const percent = parseInt(matchWithCount[1], 10); + const stage = matchWithCount[2]; + const message = matchWithCount[3]; + const current = parseInt(matchWithCount[4], 10); + const total = parseInt(matchWithCount[5], 10); + setConversionProgress({ + percent, + stage, + message, + current, + total, + }); + } else { + const match = lastNote.match(/\[(\d+)%\]\s+(\w+):\s+(.+)/); + if (match) { + const percent = parseInt(match[1], 10); + const stage = match[2]; + const message = match[3]; + setConversionProgress({ + percent, + stage, + message, + }); + } + } + } else if (jobStatus.progress !== undefined) { + const percent = Math.min(Math.max(jobStatus.progress, 0), 100); + setConversionProgress({ + percent, + stage: jobStatus.stage || 'processing', + message: jobStatus.note || 'Converting PDF to JSON...', + }); + } + + if (jobStatus.complete) { + if (jobStatus.error) { + console.error('Job failed:', jobStatus.error); + throw new Error(jobStatus.error); + } + + console.log('Job completed, retrieving JSON result...'); + jobComplete = true; + + const resultResponse = await apiClient.get( + `/api/v1/general/job/${jobId}/result`, + { + responseType: 'blob', + }, + ); + + const jsonText = await resultResponse.data.text(); + const result = JSON.parse(jsonText); + + if (!Array.isArray(result.pages)) { + console.error('Conversion result missing page array:', result); + throw new Error( + 'PDF conversion result did not include page data. Please update the server.', + ); + } + + const docResult = result as PdfJsonDocument; + parsed = { + ...docResult, + pages: docResult.pages ?? [], + }; + shouldUseLazyMode = Boolean(docResult.lazyImages); + pendingJobId = shouldUseLazyMode ? jobId : null; + setConversionProgress(null); + } else { + console.log('Job not complete yet, continuing to poll...'); + } + } catch (pollError: any) { + console.error('Error polling job status:', pollError); + console.error('Poll error details:', { + status: pollError?.response?.status, + data: pollError?.response?.data, + message: pollError?.message, + }); + if (pollError?.response?.status === 404) { + throw new Error('Job not found on server'); + } + } + } + + if (!jobComplete) { + throw new Error('Conversion timed out'); + } + if (!parsed) { + throw new Error('Conversion did not return JSON content'); + } } else { const content = await file.text(); - parsed = JSON.parse(content) as PdfJsonDocument; + const docResult = JSON.parse(content) as PdfJsonDocument; + parsed = { + ...docResult, + pages: docResult.pages ?? [], + }; + shouldUseLazyMode = false; + pendingJobId = null; } + setConversionProgress(null); + if (loadRequestIdRef.current !== requestId) { return; } + if (!parsed) { + throw new Error('Failed to parse PDF JSON document'); + } + + console.log( + `[PdfJsonEditor] Document loaded. Lazy image mode: ${shouldUseLazyMode}, Pages: ${ + parsed.pages?.length || 0 + }`, + ); + setLoadedDocument(parsed); resetToDocument(parsed); + setIsLazyMode(shouldUseLazyMode); + setCachedJobId(shouldUseLazyMode ? pendingJobId : null); setFileName(file.name); setErrorMessage(null); autoLoadKeyRef.current = fileKey; - } catch (error) { + } catch (error: any) { console.error('Failed to load file', error); + console.error('Error details:', { + message: error?.message, + response: error?.response?.data, + stack: error?.stack, + }); if (loadRequestIdRef.current !== requestId) { return; @@ -155,15 +461,17 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => { resetToDocument(null); if (isPdf) { - setErrorMessage( - t('pdfJsonEditor.conversionFailed', 'Failed to convert PDF. Please try again.') - ); + const errorMsg = + error?.message || + t('pdfJsonEditor.conversionFailed', 'Failed to convert PDF. Please try again.'); + setErrorMessage(errorMsg); + console.error('Setting error message:', errorMsg); } else { setErrorMessage( t( 'pdfJsonEditor.errors.invalidJson', - 'Unable to read the JSON file. Ensure it was generated by the PDF to JSON tool.' - ) + 'Unable to read the JSON file. Ensure it was generated by the PDF to JSON tool.', + ), ); } } finally { @@ -172,12 +480,16 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => { } } }, - [resetToDocument, t] + [resetToDocument, t], ); const handleSelectPage = useCallback((pageIndex: number) => { setSelectedPage(pageIndex); - }, []); + // Trigger lazy loading for images on the selected page + if (isLazyMode) { + void loadImagesForPage(pageIndex); + } + }, [isLazyMode, loadImagesForPage]); const handleGroupTextChange = useCallback((pageIndex: number, groupId: string, value: string) => { setGroupsByPage((previous) => @@ -195,55 +507,63 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => { imageId: string, next: { left: number; bottom: number; width: number; height: number; transform: number[] }, ) => { - setImagesByPage((previous) => - previous.map((images, idx) => { - if (idx !== pageIndex) { - return images; + setImagesByPage((previous) => { + const current = previous[pageIndex] ?? []; + let changed = false; + const updatedPage = current.map((image) => { + if ((image.id ?? '') !== imageId) { + return image; } - let changed = false; - const updated = images.map((image) => { - if ((image.id ?? '') !== imageId) { - return image; - } - const originalTransform = image.transform ?? originalImagesRef.current[idx]?.find((base) => (base.id ?? '') === imageId)?.transform; - const scaleXSign = originalTransform && originalTransform.length >= 6 ? Math.sign(originalTransform[0]) || 1 : 1; - const scaleYSign = originalTransform && originalTransform.length >= 6 ? Math.sign(originalTransform[3]) || 1 : 1; - const right = next.left + next.width; - const top = next.bottom + next.height; - const updatedImage: PdfJsonImageElement = { - ...image, - x: next.left, - y: next.bottom, - left: next.left, - bottom: next.bottom, - right, - top, - width: next.width, - height: next.height, - transform: scaleXSign < 0 || scaleYSign < 0 ? [ - next.width * scaleXSign, - 0, - 0, - next.height * scaleYSign, - next.left, - scaleYSign >= 0 ? next.bottom : next.bottom + next.height, - ] : null, - }; + const originalTransform = image.transform ?? originalImagesRef.current[pageIndex]?.find((base) => (base.id ?? '') === imageId)?.transform; + const scaleXSign = originalTransform && originalTransform.length >= 6 ? Math.sign(originalTransform[0]) || 1 : 1; + const scaleYSign = originalTransform && originalTransform.length >= 6 ? Math.sign(originalTransform[3]) || 1 : 1; + const right = next.left + next.width; + const top = next.bottom + next.height; + const updatedImage: PdfJsonImageElement = { + ...image, + x: next.left, + y: next.bottom, + left: next.left, + bottom: next.bottom, + right, + top, + width: next.width, + height: next.height, + transform: scaleXSign < 0 || scaleYSign < 0 + ? [ + next.width * scaleXSign, + 0, + 0, + next.height * scaleYSign, + next.left, + scaleYSign >= 0 ? next.bottom : next.bottom + next.height, + ] + : null, + }; - const isSame = - Math.abs(valueOr(image.left, 0) - next.left) < 1e-4 && - Math.abs(valueOr(image.bottom, 0) - next.bottom) < 1e-4 && - Math.abs(valueOr(image.width, 0) - next.width) < 1e-4 && - Math.abs(valueOr(image.height, 0) - next.height) < 1e-4; + const isSame = + Math.abs(valueOr(image.left, 0) - next.left) < 1e-4 && + Math.abs(valueOr(image.bottom, 0) - next.bottom) < 1e-4 && + Math.abs(valueOr(image.width, 0) - next.width) < 1e-4 && + Math.abs(valueOr(image.height, 0) - next.height) < 1e-4; - if (!isSame) { - changed = true; - } - return updatedImage; - }); - return changed ? updated : images; - }), - ); + if (!isSame) { + changed = true; + } + return updatedImage; + }); + + if (!changed) { + return previous; + } + + const nextImages = previous.map((images, idx) => (idx === pageIndex ? updatedPage : images)); + if (imagesByPageRef.current.length <= pageIndex) { + imagesByPageRef.current.length = pageIndex + 1; + } + imagesByPageRef.current[pageIndex] = updatedPage.map(cloneImageElement); + return nextImages; + }); }, [], ); @@ -253,14 +573,28 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => { if (!baseline) { return; } - setImagesByPage((previous) => - previous.map((images, idx) => { - if (idx !== pageIndex) { - return images; + setImagesByPage((previous) => { + const current = previous[pageIndex] ?? []; + let changed = false; + const updatedPage = current.map((image) => { + if ((image.id ?? '') !== imageId) { + return image; } - return images.map((image) => ((image.id ?? '') === imageId ? cloneImageElement(baseline) : image)); - }), - ); + changed = true; + return cloneImageElement(baseline); + }); + + if (!changed) { + return previous; + } + + const nextImages = previous.map((images, idx) => (idx === pageIndex ? updatedPage : images)); + if (imagesByPageRef.current.length <= pageIndex) { + imagesByPageRef.current.length = pageIndex + 1; + } + imagesByPageRef.current[pageIndex] = updatedPage.map(cloneImageElement); + return nextImages; + }); }, []); const handleResetEdits = useCallback(() => { @@ -279,7 +613,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => { const updatedDocument = restoreGlyphElements( loadedDocument, groupsByPage, - imagesByPage, + imagesByPageRef.current, originalImagesRef.current, ); const baseName = sanitizeBaseName(fileName || loadedDocument.metadata?.title || undefined); @@ -287,7 +621,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => { document: updatedDocument, filename: `${baseName}.json`, }; - }, [fileName, groupsByPage, imagesByPage, loadedDocument]); + }, [fileName, groupsByPage, loadedDocument]); const handleDownloadJson = useCallback(() => { const payload = buildPayload(); @@ -306,20 +640,129 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => { }, [buildPayload, onComplete]); const handleGeneratePdf = useCallback(async () => { - const payload = buildPayload(); - if (!payload) { - return; - } - - const { document, filename } = payload; - const serialized = JSON.stringify(document, null, 2); - const jsonFile = new File([serialized], filename, { type: 'application/json' }); - - const formData = new FormData(); - formData.append('fileInput', jsonFile); - try { setIsGeneratingPdf(true); + + const ensureImagesForPages = async (pageIndices: number[]) => { + const uniqueIndices = Array.from(new Set(pageIndices)).filter((index) => index >= 0); + if (uniqueIndices.length === 0) { + return; + } + + for (const index of uniqueIndices) { + if (!loadedImagePagesRef.current.has(index)) { + await loadImagesForPage(index); + } + } + + const maxWaitTime = 15000; + const pollInterval = 150; + const startWait = Date.now(); + while (Date.now() - startWait < maxWaitTime) { + const allLoaded = uniqueIndices.every( + (index) => + loadedImagePagesRef.current.has(index) && + imagesByPageRef.current[index] !== undefined, + ); + const anyLoading = uniqueIndices.some((index) => + loadingImagePagesRef.current.has(index), + ); + if (allLoaded && !anyLoading) { + return; + } + await new Promise((resolve) => setTimeout(resolve, pollInterval)); + } + + const missing = uniqueIndices.filter( + (index) => !loadedImagePagesRef.current.has(index), + ); + if (missing.length > 0) { + throw new Error( + `Failed to load images for pages ${missing.map((i) => i + 1).join(', ')}`, + ); + } + }; + + const currentDoc = loadedDocumentRef.current; + const totalPages = currentDoc?.pages?.length ?? 0; + const dirtyPageIndices = dirtyPages + .map((isDirty, index) => (isDirty ? index : -1)) + .filter((index) => index >= 0); + + const canUseIncremental = + isLazyMode && + cachedJobId && + dirtyPageIndices.length > 0 && + dirtyPageIndices.length < totalPages; + + if (canUseIncremental) { + await ensureImagesForPages(dirtyPageIndices); + + try { + const payload = buildPayload(); + if (!payload) { + return; + } + + const { document, filename } = payload; + const dirtyPageSet = new Set(dirtyPageIndices); + const partialPages = + document.pages?.filter((_, index) => dirtyPageSet.has(index)) ?? []; + + const partialDocument: PdfJsonDocument = { + metadata: document.metadata, + xmpMetadata: document.xmpMetadata, + fonts: document.fonts, + lazyImages: true, + pages: partialPages, + }; + + const baseName = sanitizeBaseName(filename).replace(/-edited$/u, ''); + const expectedName = `${baseName || 'document'}.pdf`; + const response = await apiClient.post( + `/api/v1/convert/pdf/json/partial/${cachedJobId}?filename=${encodeURIComponent(expectedName)}`, + partialDocument, + { + responseType: 'blob', + }, + ); + + const contentDisposition = response.headers?.['content-disposition'] ?? ''; + const detectedName = getFilenameFromHeaders(contentDisposition); + const downloadName = detectedName || expectedName; + + downloadBlob(response.data, downloadName); + + if (onComplete) { + const pdfFile = new File([response.data], downloadName, { type: 'application/pdf' }); + onComplete([pdfFile]); + } + setErrorMessage(null); + return; + } catch (incrementalError) { + console.warn( + '[handleGeneratePdf] Incremental export failed, falling back to full export', + incrementalError, + ); + } + } + + if (isLazyMode && totalPages > 0) { + const allPageIndices = Array.from({ length: totalPages }, (_, index) => index); + await ensureImagesForPages(allPageIndices); + } + + const payload = buildPayload(); + if (!payload) { + return; + } + + const { document, filename } = payload; + const serialized = JSON.stringify(document, null, 2); + const jsonFile = new File([serialized], filename, { type: 'application/json' }); + + const formData = new FormData(); + formData.append('fileInput', jsonFile); const response = await apiClient.post(CONVERSION_ENDPOINTS['json-pdf'], formData, { responseType: 'blob', }); @@ -350,7 +793,16 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => { } finally { setIsGeneratingPdf(false); } - }, [buildPayload, onComplete, onError, t]); + }, [ + buildPayload, + cachedJobId, + dirtyPages, + isLazyMode, + loadImagesForPage, + onComplete, + onError, + t, + ]); const viewData = useMemo(() => ({ document: loadedDocument, @@ -363,6 +815,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => { errorMessage, isGeneratingPdf, isConverting, + conversionProgress, hasChanges, onLoadJson: handleLoadFile, onSelectPage: handleSelectPage, @@ -390,6 +843,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => { hasDocument, isGeneratingPdf, isConverting, + conversionProgress, loadedDocument, selectedPage, ]); @@ -397,6 +851,13 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => { const latestViewDataRef = useRef(viewData); latestViewDataRef.current = viewData; + // Trigger initial image loading in lazy mode + useEffect(() => { + if (isLazyMode && loadedDocument) { + void loadImagesForPage(selectedPage); + } + }, [isLazyMode, loadedDocument, selectedPage, loadImagesForPage]); + useEffect(() => { if (selectedFiles.length === 0) { autoLoadKeyRef.current = null; @@ -433,11 +894,20 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => { setCustomWorkbenchViewData(VIEW_ID, latestViewDataRef.current); return () => { + // Clear backend cache if we were using lazy loading + if (cachedJobId) { + console.log(`[PdfJsonEditor] Cleaning up cached document for jobId: ${cachedJobId}`); + apiClient.post(`/api/v1/convert/pdf/json/clear-cache/${cachedJobId}`).catch((error) => { + console.warn('[PdfJsonEditor] Failed to clear cache:', error); + }); + } + clearCustomWorkbenchViewData(VIEW_ID); unregisterCustomWorkbenchView(VIEW_ID); setLeftPanelView('toolPicker'); }; }, [ + cachedJobId, clearCustomWorkbenchViewData, registerCustomWorkbenchView, setCustomWorkbenchViewData, diff --git a/frontend/src/proprietary/tools/pdfJsonEditor/pdfJsonEditorTypes.ts b/frontend/src/proprietary/tools/pdfJsonEditor/pdfJsonEditorTypes.ts index 6f4ee3ae6..98323f161 100644 --- a/frontend/src/proprietary/tools/pdfJsonEditor/pdfJsonEditorTypes.ts +++ b/frontend/src/proprietary/tools/pdfJsonEditor/pdfJsonEditorTypes.ts @@ -122,6 +122,23 @@ export interface PdfJsonDocument { xmpMetadata?: string | null; fonts?: PdfJsonFont[] | null; pages?: PdfJsonPage[] | null; + lazyImages?: boolean | null; +} + +export interface PdfJsonPageDimension { + pageNumber?: number | null; + width?: number | null; + height?: number | null; + rotation?: number | null; +} + +export interface PdfJsonDocumentMetadata { + metadata?: PdfJsonMetadata | null; + xmpMetadata?: string | null; + fonts?: PdfJsonFont[] | null; + pageDimensions?: PdfJsonPageDimension[] | null; + formFields?: unknown[] | null; + lazyImages?: boolean | null; } export interface BoundingBox { @@ -153,6 +170,14 @@ export interface TextGroup { export const DEFAULT_PAGE_WIDTH = 612; export const DEFAULT_PAGE_HEIGHT = 792; +export interface ConversionProgress { + percent: number; + stage: string; + message: string; + current?: number; + total?: number; +} + export interface PdfJsonEditorViewData { document: PdfJsonDocument | null; groupsByPage: TextGroup[][]; @@ -164,6 +189,7 @@ export interface PdfJsonEditorViewData { errorMessage: string | null; isGeneratingPdf: boolean; isConverting: boolean; + conversionProgress: ConversionProgress | null; hasChanges: boolean; onLoadJson: (file: File | null) => Promise | void; onSelectPage: (pageIndex: number) => void; diff --git a/frontend/vite.config.ts b/frontend/vite.config.ts index e5f4ddc8a..408cf20b7 100644 --- a/frontend/vite.config.ts +++ b/frontend/vite.config.ts @@ -15,6 +15,7 @@ export default defineConfig({ }), ], server: { + host: true, proxy: { '/api': { target: 'http://localhost:8080',