Mirror of https://github.com/Frooodle/Stirling-PDF.git (synced 2025-11-16 01:21:16 +01:00)

Commit bbcb23ca11 (parent ec0ae36a82): editor revamp, complete change
JobExecutorService.java

@@ -148,17 +148,31 @@ public class JobExecutorService {
             taskManager.createTask(jobId);

             // Create a specialized wrapper that updates the TaskManager
             final String capturedJobIdForQueue = jobId;
             Supplier<Object> wrappedWork =
                     () -> {
                         try {
                             // Set jobId in ThreadLocal context for the queued job
                             stirling.software.common.util.JobContext.setJobId(
                                     capturedJobIdForQueue);
                             log.debug(
                                     "Set jobId {} in JobContext for queued job execution",
                                     capturedJobIdForQueue);

                             Object result = work.get();
-                            processJobResult(jobId, result);
+                            processJobResult(capturedJobIdForQueue, result);
                             return result;
                         } catch (Exception e) {
                             log.error(
-                                    "Error executing queued job {}: {}", jobId, e.getMessage(), e);
-                            taskManager.setError(jobId, e.getMessage());
+                                    "Error executing queued job {}: {}",
+                                    capturedJobIdForQueue,
+                                    e.getMessage(),
+                                    e);
+                            taskManager.setError(capturedJobIdForQueue, e.getMessage());
                             throw e;
                         } finally {
                             // Clean up ThreadLocal to avoid memory leaks
                             stirling.software.common.util.JobContext.clear();
                         }
                     };

@@ -170,21 +184,36 @@ public class JobExecutorService {
             return ResponseEntity.ok().body(new JobResponse<>(true, jobId, null));
         } else if (async) {
             taskManager.createTask(jobId);

             // Capture the jobId for the async thread
             final String capturedJobId = jobId;

             executor.execute(
                     () -> {
                         try {
                             log.debug(
-                                    "Running async job {} with timeout {} ms", jobId, timeoutToUse);
+                                    "Running async job {} with timeout {} ms",
+                                    capturedJobId,
+                                    timeoutToUse);

                             // Set jobId in ThreadLocal context for the async thread
                             stirling.software.common.util.JobContext.setJobId(capturedJobId);
                             log.debug(
                                     "Set jobId {} in JobContext for async execution",
                                     capturedJobId);

                             // Execute with timeout
                             Object result = executeWithTimeout(() -> work.get(), timeoutToUse);
-                            processJobResult(jobId, result);
+                            processJobResult(capturedJobId, result);
                         } catch (TimeoutException te) {
                             log.error("Job {} timed out after {} ms", jobId, timeoutToUse);
                             taskManager.setError(jobId, "Job timed out");
                         } catch (Exception e) {
                             log.error("Error executing job {}: {}", jobId, e.getMessage(), e);
                             taskManager.setError(jobId, e.getMessage());
                         } finally {
                             // Clean up ThreadLocal to avoid memory leaks
                             stirling.software.common.util.JobContext.clear();
                         }
                     });

@@ -193,6 +222,10 @@ public class JobExecutorService {
         try {
             log.debug("Running sync job with timeout {} ms", timeoutToUse);

             // Make jobId available to downstream components on the worker thread
             stirling.software.common.util.JobContext.setJobId(jobId);
             log.debug("Set jobId {} in JobContext for sync execution", jobId);

             // Execute with timeout
             Object result = executeWithTimeout(() -> work.get(), timeoutToUse);

@@ -212,6 +245,8 @@ public class JobExecutorService {
             // Construct a JSON error response
             return ResponseEntity.internalServerError()
                     .body(Map.of("error", "Job failed: " + e.getMessage()));
         } finally {
             stirling.software.common.util.JobContext.clear();
         }
     }
 }

@@ -456,8 +491,23 @@ public class JobExecutorService {
             throws TimeoutException, Exception {
         // Use the same executor as other async jobs for consistency
         // This ensures all operations run on the same thread pool
         String currentJobId = stirling.software.common.util.JobContext.getJobId();

         java.util.concurrent.CompletableFuture<T> future =
-                java.util.concurrent.CompletableFuture.supplyAsync(supplier, executor);
+                java.util.concurrent.CompletableFuture.supplyAsync(
+                        () -> {
+                            if (currentJobId != null) {
+                                stirling.software.common.util.JobContext.setJobId(currentJobId);
+                            }
+                            try {
+                                return supplier.get();
+                            } finally {
+                                if (currentJobId != null) {
+                                    stirling.software.common.util.JobContext.clear();
+                                }
+                            }
+                        },
+                        executor);

         try {
             return future.get(timeoutMs, TimeUnit.MILLISECONDS);
JobContext.java (new file)

@@ -0,0 +1,18 @@
package stirling.software.common.util;

/** Thread-local context for passing job ID across async boundaries */
public class JobContext {
    private static final ThreadLocal<String> CURRENT_JOB_ID = new ThreadLocal<>();

    public static void setJobId(String jobId) {
        CURRENT_JOB_ID.set(jobId);
    }

    public static String getJobId() {
        return CURRENT_JOB_ID.get();
    }

    public static void clear() {
        CURRENT_JOB_ID.remove();
    }
}
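ThreadLocal values do not cross thread boundaries on their own, which is why the hunks above capture the jobId on the submitting thread and re-establish it inside each task. A standalone sketch of that hand-off pattern (not part of the commit; the class name and job id are made up):

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import stirling.software.common.util.JobContext;

class JobContextHandOffDemo {
    public static void main(String[] args) {
        ExecutorService pool = Executors.newSingleThreadExecutor();
        JobContext.setJobId("job-42");
        final String captured = JobContext.getJobId(); // capture before the hand-off
        pool.execute(
                () -> {
                    JobContext.setJobId(captured); // re-establish on the worker thread
                    try {
                        System.out.println("worker sees " + JobContext.getJobId()); // job-42
                    } finally {
                        JobContext.clear(); // pool threads are reused; always clean up
                    }
                });
        JobContext.clear();
        pool.shutdown();
    }
}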
ProcessExecutor.java

@@ -94,6 +94,7 @@ public class ProcessExecutor {
                                            .getProcessExecutor()
                                            .getSessionLimit()
                                            .getOcrMyPdfSessionLimit();
+                            case CFF_CONVERTER -> 1;
                        };

                long timeoutMinutes =
@@ -148,6 +149,7 @@ public class ProcessExecutor {
                                            .getProcessExecutor()
                                            .getTimeoutMinutes()
                                            .getOcrMyPdfTimeoutMinutes();
+                            case CFF_CONVERTER -> 5L;
                        };
                return new ProcessExecutor(semaphoreLimit, liveUpdates, timeoutMinutes);
            });
@@ -300,7 +302,8 @@ public class ProcessExecutor {
        TESSERACT,
        QPDF,
        GHOSTSCRIPT,
-        OCR_MY_PDF
+        OCR_MY_PDF,
+        CFF_CONVERTER
    }

    public class ProcessExecutorResult {
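Callers consume the new enum value the same way as the existing ones: a shared, semaphore-limited executor keyed by process type, here with a concurrency limit of 1 and the 5-minute timeout configured above. A minimal sketch of a hypothetical call site (the fontforge invocation is illustrative only):

import java.io.IOException;
import java.util.List;

import stirling.software.common.util.ProcessExecutor;

class CffConverterCallDemo {
    static int run() throws IOException, InterruptedException {
        // Blocks on the CFF_CONVERTER semaphore, then runs the external command.
        ProcessExecutor.ProcessExecutorResult result =
                ProcessExecutor.getInstance(ProcessExecutor.Processes.CFF_CONVERTER)
                        .runCommandWithOutputHandling(List.of("fontforge", "-version"));
        return result.getRc();
    }
}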
JobExecutorServiceTest.java

@@ -78,6 +78,23 @@ class JobExecutorServiceTest {
         verify(request).setAttribute(eq("jobId"), anyString());
     }

+    @Test
+    void shouldExposeJobIdInJobContextDuringSyncExecution() throws Exception {
+        // Given
+        Supplier<Object> work = stirling.software.common.util.JobContext::getJobId;
+
+        // When
+        ResponseEntity<?> response = jobExecutorService.runJobGeneric(false, work);
+
+        // Then
+        assertEquals(HttpStatus.OK, response.getStatusCode());
+        assertNotNull(response.getBody());
+
+        var requestJobIdCaptor = ArgumentCaptor.forClass(String.class);
+        verify(request).setAttribute(eq("jobId"), requestJobIdCaptor.capture());
+        assertEquals(requestJobIdCaptor.getValue(), response.getBody());
+    }
+
     @Test
     void shouldRunAsyncJobSuccessfully() throws Exception {
         // Given
Logging configuration

@@ -8,6 +8,8 @@ logging.level.org.eclipse.jetty=WARN
#logging.level.stirling.software.proprietary.security=DEBUG
logging.level.com.zaxxer.hikari=WARN
logging.level.stirling.software.SPDF.service.PdfJsonConversionService=TRACE
logging.level.stirling.software.common.service.JobExecutorService=DEBUG
logging.level.stirling.software.common.service.TaskManager=DEBUG
spring.jpa.open-in-view=false
server.forward-headers-strategy=NATIVE
server.error.path=/error
ConvertPdfJsonController.java

@@ -1,16 +1,26 @@
package stirling.software.SPDF.controller.api.converters;

import java.util.Optional;

import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.ModelAttribute;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.multipart.MultipartFile;

import io.github.pixee.security.Filenames;
import io.swagger.v3.oas.annotations.Operation;

import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;

import stirling.software.SPDF.config.swagger.StandardPdfResponse;
import stirling.software.SPDF.model.json.PdfJsonDocument;
import stirling.software.SPDF.model.json.PdfJsonMetadata;
import stirling.software.SPDF.service.PdfJsonConversionService;
import stirling.software.common.annotations.AutoJobPostMapping;
import stirling.software.common.annotations.api.ConvertApi;
@@ -19,6 +29,7 @@ import stirling.software.common.model.api.PDFFile;
import stirling.software.common.util.ExceptionUtils;
import stirling.software.common.util.WebResponseUtils;

@Slf4j
@ConvertApi
@RequiredArgsConstructor
public class ConvertPdfJsonController {
@@ -71,4 +82,81 @@ public class ConvertPdfJsonController {
        String docName = baseName.endsWith(".pdf") ? baseName : baseName + ".pdf";
        return WebResponseUtils.bytesToWebResponse(pdfBytes, docName);
    }

    @PostMapping(consumes = "multipart/form-data", value = "/pdf/json/metadata")
    @Operation(
            summary = "Extract PDF metadata for lazy loading",
            description =
                    "Extracts document metadata, fonts, and page dimensions. Caches the document for"
                            + " subsequent page requests. Input:PDF Output:JSON Type:SISO")
    public ResponseEntity<byte[]> extractPdfMetadata(
            @ModelAttribute PDFFile request, @RequestParam(required = true) String jobId)
            throws Exception {
        MultipartFile inputFile = request.getFileInput();
        if (inputFile == null) {
            throw ExceptionUtils.createNullArgumentException("fileInput");
        }

        byte[] jsonBytes = pdfJsonConversionService.extractDocumentMetadata(inputFile, jobId);
        String originalName = inputFile.getOriginalFilename();
        String baseName =
                (originalName != null && !originalName.isBlank())
                        ? Filenames.toSimpleFileName(originalName).replaceFirst("[.][^.]+$", "")
                        : "document";
        String docName = baseName + "_metadata.json";
        return WebResponseUtils.bytesToWebResponse(jsonBytes, docName, MediaType.APPLICATION_JSON);
    }

    @PostMapping(value = "/pdf/json/partial/{jobId}", consumes = MediaType.APPLICATION_JSON_VALUE)
    @StandardPdfResponse
    @Operation(
            summary = "Apply incremental edits to a cached PDF",
            description =
                    "Applies edits for the specified pages of a cached PDF and returns an updated PDF."
                            + " Requires the PDF to have been previously cached via the PDF to JSON endpoint.")
    public ResponseEntity<byte[]> exportPartialPdf(
            @PathVariable String jobId,
            @RequestBody PdfJsonDocument document,
            @RequestParam(value = "filename", required = false) String filename)
            throws Exception {
        if (document == null) {
            throw ExceptionUtils.createNullArgumentException("document");
        }

        byte[] pdfBytes = pdfJsonConversionService.exportUpdatedPages(jobId, document);

        String baseName =
                (filename != null && !filename.isBlank())
                        ? Filenames.toSimpleFileName(filename).replaceFirst("[.][^.]+$", "")
                        : Optional.ofNullable(document.getMetadata())
                                .map(PdfJsonMetadata::getTitle)
                                .filter(title -> title != null && !title.isBlank())
                                .orElse("document");
        String docName = baseName.endsWith(".pdf") ? baseName : baseName + ".pdf";
        return WebResponseUtils.bytesToWebResponse(pdfBytes, docName);
    }

    @GetMapping(value = "/pdf/json/page/{jobId}/{pageNumber}")
    @Operation(
            summary = "Extract single page from cached PDF",
            description =
                    "Retrieves a single page's content from a previously cached PDF document."
                            + " Requires prior call to /pdf/json/metadata. Output:JSON")
    public ResponseEntity<byte[]> extractSinglePage(
            @PathVariable String jobId, @PathVariable int pageNumber) throws Exception {
        byte[] jsonBytes = pdfJsonConversionService.extractSinglePage(jobId, pageNumber);
        String docName = "page_" + pageNumber + ".json";
        return WebResponseUtils.bytesToWebResponse(jsonBytes, docName, MediaType.APPLICATION_JSON);
    }

    @PostMapping(value = "/pdf/json/clear-cache/{jobId}")
    @Operation(
            summary = "Clear cached PDF document",
            description =
                    "Manually clears a cached PDF document to free up server resources."
                            + " Called automatically after 30 minutes.")
    public ResponseEntity<Void> clearCache(@PathVariable String jobId) {
        pdfJsonConversionService.clearCachedDocument(jobId);
        return ResponseEntity.ok().build();
    }
}
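Seen from a client, the lazy-loading flow is: cache the document via the multipart /pdf/json/metadata call, pull individual pages as needed, then clear the cache. A sketch of the page-fetch and cleanup steps; the base URL and the /api/v1/convert prefix are assumptions (the @ConvertApi mapping is not part of this diff), and the jobId placeholder stands in for the id used with the metadata call:

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

class LazyPdfJsonClientDemo {
    public static void main(String[] args) throws Exception {
        String base = "http://localhost:8080/api/v1/convert"; // assumed prefix
        String jobId = "example-job-id"; // placeholder; used in the earlier metadata call
        HttpClient http = HttpClient.newHttpClient();

        // Fetch a single page of the cached document (requires the prior metadata call).
        HttpRequest page =
                HttpRequest.newBuilder(URI.create(base + "/pdf/json/page/" + jobId + "/1"))
                        .GET()
                        .build();
        HttpResponse<String> pageJson = http.send(page, HttpResponse.BodyHandlers.ofString());
        System.out.println("page 1 -> HTTP " + pageJson.statusCode());

        // Free the server-side cache early (it is also cleared automatically after 30 minutes).
        HttpRequest clear =
                HttpRequest.newBuilder(URI.create(base + "/pdf/json/clear-cache/" + jobId))
                        .POST(HttpRequest.BodyPublishers.noBody())
                        .build();
        http.send(clear, HttpResponse.BodyHandlers.discarding());
    }
}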
PdfJsonConversionProgress.java (new file)

@@ -0,0 +1,49 @@
package stirling.software.SPDF.model.api;

import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;

@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class PdfJsonConversionProgress {
    private int percent;
    private String stage;
    private String message;
    private boolean complete;
    private Integer current; // Current item being processed (e.g., page number)
    private Integer total; // Total items to process (e.g., total pages)

    public static PdfJsonConversionProgress of(int percent, String stage, String message) {
        return PdfJsonConversionProgress.builder()
                .percent(percent)
                .stage(stage)
                .message(message)
                .complete(false)
                .build();
    }

    public static PdfJsonConversionProgress of(
            int percent, String stage, String message, int current, int total) {
        return PdfJsonConversionProgress.builder()
                .percent(percent)
                .stage(stage)
                .message(message)
                .current(current)
                .total(total)
                .complete(false)
                .build();
    }

    public static PdfJsonConversionProgress complete() {
        return PdfJsonConversionProgress.builder()
                .percent(100)
                .stage("complete")
                .message("Conversion complete")
                .complete(true)
                .build();
    }
}
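A small sketch (not from the commit) of how a Consumer<PdfJsonConversionProgress> callback like the one the conversion service accepts could be driven, using the factory methods above:

import java.util.function.Consumer;

import stirling.software.SPDF.model.api.PdfJsonConversionProgress;

class ProgressDemo {
    public static void main(String[] args) {
        // Lombok's @Data generates the getters used here.
        Consumer<PdfJsonConversionProgress> progress =
                p -> System.out.printf(
                        "%d%% %s: %s%n", p.getPercent(), p.getStage(), p.getMessage());
        progress.accept(PdfJsonConversionProgress.of(70, "images", "Extracting images", 3, 12));
        progress.accept(PdfJsonConversionProgress.complete());
    }
}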
PdfJsonDocument.java

@@ -22,6 +22,9 @@ public class PdfJsonDocument {
     /** Optional XMP metadata packet stored as Base64. */
     private String xmpMetadata;

+    /** Indicates that images should be loaded lazily via API rather than embedded in the JSON. */
+    private Boolean lazyImages;
+
     @Builder.Default private List<PdfJsonFont> fonts = new ArrayList<>();

     @Builder.Default private List<PdfJsonPage> pages = new ArrayList<>();
PdfJsonDocumentMetadata.java (new file)

@@ -0,0 +1,34 @@
package stirling.software.SPDF.model.json;

import java.util.ArrayList;
import java.util.List;

import com.fasterxml.jackson.annotation.JsonInclude;

import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;

@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@JsonInclude(JsonInclude.Include.NON_NULL)
public class PdfJsonDocumentMetadata {

    private PdfJsonMetadata metadata;

    /** Optional XMP metadata packet stored as Base64. */
    private String xmpMetadata;

    /** Indicates that images should be requested lazily via the page endpoint. */
    private Boolean lazyImages;

    @Builder.Default private List<PdfJsonFont> fonts = new ArrayList<>();

    @Builder.Default private List<PdfJsonPageDimension> pageDimensions = new ArrayList<>();

    /** Form fields (AcroForm) at document level */
    @Builder.Default private List<PdfJsonFormField> formFields = new ArrayList<>();
}
PdfJsonPageDimension.java (new file)

@@ -0,0 +1,20 @@
package stirling.software.SPDF.model.json;

import com.fasterxml.jackson.annotation.JsonInclude;

import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;

@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@JsonInclude(JsonInclude.Include.NON_NULL)
public class PdfJsonPageDimension {
    private Integer pageNumber;
    private Float width;
    private Float height;
    private Integer rotation;
}
File diff suppressed because it is too large
PdfJsonCosMapper.java (new file)

@@ -0,0 +1,274 @@
package stirling.software.SPDF.service;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Base64;
import java.util.Collections;
import java.util.IdentityHashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSBoolean;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSFloat;
import org.apache.pdfbox.cos.COSInteger;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSNull;
import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.springframework.stereotype.Component;

import lombok.extern.slf4j.Slf4j;

import stirling.software.SPDF.model.json.PdfJsonCosValue;
import stirling.software.SPDF.model.json.PdfJsonStream;

@Slf4j
@Component
public class PdfJsonCosMapper {

    public PdfJsonStream serializeStream(PDStream stream) throws IOException {
        if (stream == null) {
            return null;
        }
        return serializeStream(
                stream.getCOSObject(), Collections.newSetFromMap(new IdentityHashMap<>()));
    }

    public PdfJsonStream serializeStream(COSStream cosStream) throws IOException {
        if (cosStream == null) {
            return null;
        }
        return serializeStream(cosStream, Collections.newSetFromMap(new IdentityHashMap<>()));
    }

    public PdfJsonCosValue serializeCosValue(COSBase base) throws IOException {
        return serializeCosValue(base, Collections.newSetFromMap(new IdentityHashMap<>()));
    }

    public COSBase deserializeCosValue(PdfJsonCosValue value, PDDocument document)
            throws IOException {
        if (value == null || value.getType() == null) {
            return null;
        }
        switch (value.getType()) {
            case NULL:
                return COSNull.NULL;
            case BOOLEAN:
                if (value.getValue() instanceof Boolean bool) {
                    return COSBoolean.getBoolean(bool);
                }
                return null;
            case INTEGER:
                if (value.getValue() instanceof Number number) {
                    return COSInteger.get(number.longValue());
                }
                return null;
            case FLOAT:
                if (value.getValue() instanceof Number number) {
                    return new COSFloat(number.floatValue());
                }
                return null;
            case NAME:
                if (value.getValue() instanceof String name) {
                    return COSName.getPDFName(name);
                }
                return null;
            case STRING:
                if (value.getValue() instanceof String encoded) {
                    try {
                        byte[] bytes = Base64.getDecoder().decode(encoded);
                        return new COSString(bytes);
                    } catch (IllegalArgumentException ex) {
                        log.debug("Failed to decode COSString value: {}", ex.getMessage());
                    }
                }
                return null;
            case ARRAY:
                COSArray array = new COSArray();
                if (value.getItems() != null) {
                    for (PdfJsonCosValue item : value.getItems()) {
                        COSBase entry = deserializeCosValue(item, document);
                        if (entry != null) {
                            array.add(entry);
                        } else {
                            array.add(COSNull.NULL);
                        }
                    }
                }
                return array;
            case DICTIONARY:
                COSDictionary dictionary = new COSDictionary();
                if (value.getEntries() != null) {
                    for (Map.Entry<String, PdfJsonCosValue> entry : value.getEntries().entrySet()) {
                        COSName key = COSName.getPDFName(entry.getKey());
                        COSBase entryValue = deserializeCosValue(entry.getValue(), document);
                        if (entryValue != null) {
                            dictionary.setItem(key, entryValue);
                        }
                    }
                }
                return dictionary;
            case STREAM:
                if (value.getStream() != null) {
                    return buildStreamFromModel(value.getStream(), document);
                }
                return null;
            default:
                return null;
        }
    }

    public COSStream buildStreamFromModel(PdfJsonStream streamModel, PDDocument document)
            throws IOException {
        if (streamModel == null) {
            return null;
        }
        COSStream cosStream = document.getDocument().createCOSStream();
        if (streamModel.getDictionary() != null) {
            for (Map.Entry<String, PdfJsonCosValue> entry :
                    streamModel.getDictionary().entrySet()) {
                COSName key = COSName.getPDFName(entry.getKey());
                COSBase value = deserializeCosValue(entry.getValue(), document);
                if (value != null) {
                    cosStream.setItem(key, value);
                }
            }
        }

        String rawData = streamModel.getRawData();
        if (rawData != null && !rawData.isBlank()) {
            byte[] data;
            try {
                data = Base64.getDecoder().decode(rawData);
            } catch (IllegalArgumentException ex) {
                log.debug("Invalid base64 content stream data: {}", ex.getMessage());
                data = new byte[0];
            }
            try (OutputStream outputStream = cosStream.createRawOutputStream()) {
                outputStream.write(data);
            }
            cosStream.setItem(COSName.LENGTH, COSInteger.get(data.length));
        } else {
            cosStream.setItem(COSName.LENGTH, COSInteger.get(0));
        }
        return cosStream;
    }

    private PdfJsonCosValue serializeCosValue(COSBase base, Set<COSBase> visited)
            throws IOException {
        if (base == null) {
            return null;
        }
        if (base instanceof COSObject cosObject) {
            base = cosObject.getObject();
            if (base == null) {
                return null;
            }
        }

        boolean complex =
                base instanceof COSDictionary
                        || base instanceof COSArray
                        || base instanceof COSStream;
        if (complex) {
            if (!visited.add(base)) {
                return PdfJsonCosValue.builder()
                        .type(PdfJsonCosValue.Type.NAME)
                        .value("__circular__")
                        .build();
            }
        }

        try {
            PdfJsonCosValue.PdfJsonCosValueBuilder builder = PdfJsonCosValue.builder();
            if (base instanceof COSNull) {
                builder.type(PdfJsonCosValue.Type.NULL);
                return builder.build();
            }
            if (base instanceof COSBoolean booleanValue) {
                builder.type(PdfJsonCosValue.Type.BOOLEAN).value(booleanValue.getValue());
                return builder.build();
            }
            if (base instanceof COSInteger integer) {
                builder.type(PdfJsonCosValue.Type.INTEGER).value(integer.longValue());
                return builder.build();
            }
            if (base instanceof COSFloat floatValue) {
                builder.type(PdfJsonCosValue.Type.FLOAT).value(floatValue.floatValue());
                return builder.build();
            }
            if (base instanceof COSName name) {
                builder.type(PdfJsonCosValue.Type.NAME).value(name.getName());
                return builder.build();
            }
            if (base instanceof COSString cosString) {
                builder.type(PdfJsonCosValue.Type.STRING)
                        .value(Base64.getEncoder().encodeToString(cosString.getBytes()));
                return builder.build();
            }
            if (base instanceof COSArray array) {
                List<PdfJsonCosValue> items = new ArrayList<>(array.size());
                for (COSBase item : array) {
                    PdfJsonCosValue serialized = serializeCosValue(item, visited);
                    items.add(serialized);
                }
                builder.type(PdfJsonCosValue.Type.ARRAY).items(items);
                return builder.build();
            }
            if (base instanceof COSStream stream) {
                builder.type(PdfJsonCosValue.Type.STREAM).stream(serializeStream(stream, visited));
                return builder.build();
            }
            if (base instanceof COSDictionary dictionary) {
                Map<String, PdfJsonCosValue> entries = new LinkedHashMap<>();
                for (COSName key : dictionary.keySet()) {
                    PdfJsonCosValue serialized =
                            serializeCosValue(dictionary.getDictionaryObject(key), visited);
                    entries.put(key.getName(), serialized);
                }
                builder.type(PdfJsonCosValue.Type.DICTIONARY).entries(entries);
                return builder.build();
            }
            return null;
        } finally {
            if (complex) {
                visited.remove(base);
            }
        }
    }

    private PdfJsonStream serializeStream(COSStream cosStream, Set<COSBase> visited)
            throws IOException {
        Map<String, PdfJsonCosValue> dictionary = new LinkedHashMap<>();
        for (COSName key : cosStream.keySet()) {
            COSBase value = cosStream.getDictionaryObject(key);
            PdfJsonCosValue serialized = serializeCosValue(value, visited);
            if (serialized != null) {
                dictionary.put(key.getName(), serialized);
            }
        }
        String rawData = null;
        try (InputStream inputStream = cosStream.createRawInputStream();
                ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
            if (inputStream != null) {
                inputStream.transferTo(baos);
            }
            byte[] data = baos.toByteArray();
            if (data.length > 0) {
                rawData = Base64.getEncoder().encodeToString(data);
            }
        }
        return PdfJsonStream.builder().dictionary(dictionary).rawData(rawData).build();
    }
}
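A sketch of the mapper's round trip, assuming a Spring-injected PdfJsonCosMapper and an open PDDocument (the dictionary contents are made up). Circular references inside a single serialization come back as the "__circular__" name marker rather than recursing forever:

import java.io.IOException;

import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSInteger;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;

import stirling.software.SPDF.model.json.PdfJsonCosValue;
import stirling.software.SPDF.service.PdfJsonCosMapper;

class CosRoundTripDemo {
    static COSBase roundTrip(PdfJsonCosMapper cosMapper, PDDocument document) throws IOException {
        COSDictionary dict = new COSDictionary();
        dict.setItem(COSName.TYPE, COSName.getPDFName("Example"));
        dict.setItem(COSName.getPDFName("Count"), COSInteger.get(3));

        PdfJsonCosValue json = cosMapper.serializeCosValue(dict); // Type.DICTIONARY
        return cosMapper.deserializeCosValue(json, document); // content-equal COSDictionary
    }
}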
PdfJsonFallbackFontService.java (new file)

@@ -0,0 +1,224 @@
package stirling.software.SPDF.service;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType0Font;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.core.io.Resource;
import org.springframework.core.io.ResourceLoader;
import org.springframework.stereotype.Component;

import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;

import stirling.software.SPDF.model.json.PdfJsonFont;

@Slf4j
@Component
@RequiredArgsConstructor
public class PdfJsonFallbackFontService {

    public static final String FALLBACK_FONT_ID = "fallback-noto-sans";
    public static final String DEFAULT_FALLBACK_FONT_LOCATION =
            "classpath:/static/fonts/NotoSans-Regular.ttf";
    public static final String FALLBACK_FONT_CJK_ID = "fallback-noto-cjk";
    public static final String FALLBACK_FONT_JP_ID = "fallback-noto-jp";
    public static final String FALLBACK_FONT_KR_ID = "fallback-noto-korean";
    public static final String FALLBACK_FONT_AR_ID = "fallback-noto-arabic";
    public static final String FALLBACK_FONT_TH_ID = "fallback-noto-thai";

    private static final Map<String, FallbackFontSpec> BUILT_IN_FALLBACK_FONTS =
            Map.ofEntries(
                    Map.entry(
                            FALLBACK_FONT_CJK_ID,
                            new FallbackFontSpec(
                                    "classpath:/static/fonts/NotoSansSC-Regular.ttf",
                                    "NotoSansSC-Regular",
                                    "ttf")),
                    Map.entry(
                            FALLBACK_FONT_JP_ID,
                            new FallbackFontSpec(
                                    "classpath:/static/fonts/NotoSansJP-Regular.ttf",
                                    "NotoSansJP-Regular",
                                    "ttf")),
                    Map.entry(
                            FALLBACK_FONT_KR_ID,
                            new FallbackFontSpec(
                                    "classpath:/static/fonts/malgun.ttf", "MalgunGothic", "ttf")),
                    Map.entry(
                            FALLBACK_FONT_AR_ID,
                            new FallbackFontSpec(
                                    "classpath:/static/fonts/NotoSansArabic-Regular.ttf",
                                    "NotoSansArabic-Regular",
                                    "ttf")),
                    Map.entry(
                            FALLBACK_FONT_TH_ID,
                            new FallbackFontSpec(
                                    "classpath:/static/fonts/NotoSansThai-Regular.ttf",
                                    "NotoSansThai-Regular",
                                    "ttf")));

    private final ResourceLoader resourceLoader;

    @Value("${stirling.pdf.fallback-font:" + DEFAULT_FALLBACK_FONT_LOCATION + "}")
    private String fallbackFontLocation;

    private final Map<String, byte[]> fallbackFontCache = new ConcurrentHashMap<>();

    public PdfJsonFont buildFallbackFontModel() throws IOException {
        return buildFallbackFontModel(FALLBACK_FONT_ID);
    }

    public PdfJsonFont buildFallbackFontModel(String fallbackId) throws IOException {
        FallbackFontSpec spec = getFallbackFontSpec(fallbackId);
        if (spec == null) {
            throw new IOException("Unknown fallback font id " + fallbackId);
        }
        byte[] bytes = loadFallbackFontBytes(fallbackId, spec);
        String base64 = java.util.Base64.getEncoder().encodeToString(bytes);
        return PdfJsonFont.builder()
                .id(fallbackId)
                .uid(fallbackId)
                .baseName(spec.baseName())
                .subtype("TrueType")
                .embedded(true)
                .program(base64)
                .programFormat(spec.format())
                .build();
    }

    public PDFont loadFallbackPdfFont(PDDocument document) throws IOException {
        return loadFallbackPdfFont(document, FALLBACK_FONT_ID);
    }

    public PDFont loadFallbackPdfFont(PDDocument document, String fallbackId) throws IOException {
        FallbackFontSpec spec = getFallbackFontSpec(fallbackId);
        if (spec == null) {
            throw new IOException("Unknown fallback font id " + fallbackId);
        }
        byte[] bytes = loadFallbackFontBytes(fallbackId, spec);
        try (InputStream stream = new ByteArrayInputStream(bytes)) {
            return PDType0Font.load(document, stream, true);
        }
    }

    public boolean canEncodeFully(PDFont font, String text) {
        return canEncode(font, text);
    }

    public boolean canEncode(PDFont font, int codePoint) {
        return canEncode(font, new String(Character.toChars(codePoint)));
    }

    public boolean canEncode(PDFont font, String text) {
        if (font == null || text == null || text.isEmpty()) {
            return false;
        }
        try {
            font.encode(text);
            return true;
        } catch (IOException | IllegalArgumentException ex) {
            return false;
        }
    }

    public String resolveFallbackFontId(int codePoint) {
        Character.UnicodeBlock block = Character.UnicodeBlock.of(codePoint);
        if (block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
                || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
                || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C
                || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D
                || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E
                || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F
                || block == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
                || block == Character.UnicodeBlock.BOPOMOFO
                || block == Character.UnicodeBlock.BOPOMOFO_EXTENDED
                || block == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
            return FALLBACK_FONT_CJK_ID;
        }

        Character.UnicodeScript script = Character.UnicodeScript.of(codePoint);
        return switch (script) {
            case HAN -> FALLBACK_FONT_CJK_ID;
            case HIRAGANA, KATAKANA -> FALLBACK_FONT_JP_ID;
            case HANGUL -> FALLBACK_FONT_KR_ID;
            case ARABIC -> FALLBACK_FONT_AR_ID;
            case THAI -> FALLBACK_FONT_TH_ID;
            default -> FALLBACK_FONT_ID;
        };
    }

    public String mapUnsupportedGlyph(int codePoint) {
        return switch (codePoint) {
            case 0x276E -> "<";
            case 0x276F -> ">";
            default -> null;
        };
    }

    private FallbackFontSpec getFallbackFontSpec(String fallbackId) {
        if (FALLBACK_FONT_ID.equals(fallbackId)) {
            String baseName = inferBaseName(fallbackFontLocation, "NotoSans-Regular");
            String format = inferFormat(fallbackFontLocation, "ttf");
            return new FallbackFontSpec(fallbackFontLocation, baseName, format);
        }
        return BUILT_IN_FALLBACK_FONTS.get(fallbackId);
    }

    private byte[] loadFallbackFontBytes(String fallbackId, FallbackFontSpec spec)
            throws IOException {
        if (spec == null) {
            throw new IOException("No fallback font specification for " + fallbackId);
        }
        byte[] cached = fallbackFontCache.get(fallbackId);
        if (cached != null) {
            return cached;
        }
        Resource resource = resourceLoader.getResource(spec.resourceLocation());
        if (!resource.exists()) {
            throw new IOException("Fallback font resource not found at " + spec.resourceLocation());
        }
        try (InputStream inputStream = resource.getInputStream();
                ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
            inputStream.transferTo(baos);
            byte[] bytes = baos.toByteArray();
            fallbackFontCache.put(fallbackId, bytes);
            return bytes;
        }
    }

    private String inferBaseName(String location, String defaultName) {
        if (location == null || location.isBlank()) {
            return defaultName;
        }
        int slash = location.lastIndexOf('/');
        String fileName = slash >= 0 ? location.substring(slash + 1) : location;
        int dot = fileName.lastIndexOf('.');
        if (dot > 0) {
            fileName = fileName.substring(0, dot);
        }
        return fileName.isEmpty() ? defaultName : fileName;
    }

    private String inferFormat(String location, String defaultFormat) {
        if (location == null || location.isBlank()) {
            return defaultFormat;
        }
        int dot = location.lastIndexOf('.');
        if (dot >= 0 && dot < location.length() - 1) {
            return location.substring(dot + 1).toLowerCase(Locale.ROOT);
        }
        return defaultFormat;
    }

    private record FallbackFontSpec(String resourceLocation, String baseName, String format) {}
}
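A sketch of the intended call pattern, assuming an injected PdfJsonFallbackFontService plus an open PDDocument, a current PDFont, and a code point supplied by the caller: try the page's own font first and fall back to a script-appropriate substitute only when encoding fails.

import java.io.IOException;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.font.PDFont;

import stirling.software.SPDF.service.PdfJsonFallbackFontService;

class FallbackFontDemo {
    static PDFont pickFont(
            PdfJsonFallbackFontService fonts, PDDocument doc, PDFont current, int codePoint)
            throws IOException {
        if (fonts.canEncode(current, codePoint)) {
            return current; // the page's own font can render the glyph
        }
        // e.g. a Han ideograph resolves to "fallback-noto-cjk"
        String fallbackId = fonts.resolveFallbackFontId(codePoint);
        return fonts.loadFallbackPdfFont(doc, fallbackId);
    }
}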
PdfJsonFontService.java (new file)

@@ -0,0 +1,349 @@
package stirling.software.SPDF.service.pdfjson;

import java.io.IOException;
import java.nio.file.Files;
import java.util.Base64;
import java.util.Locale;

import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;

import jakarta.annotation.PostConstruct;

import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;

import stirling.software.common.util.ProcessExecutor;
import stirling.software.common.util.ProcessExecutor.ProcessExecutorResult;
import stirling.software.common.util.TempFile;
import stirling.software.common.util.TempFileManager;

@Slf4j
@Service
@RequiredArgsConstructor
public class PdfJsonFontService {

    private final TempFileManager tempFileManager;

    @Getter
    @Value("${stirling.pdf.json.cff-converter.enabled:true}")
    private boolean cffConversionEnabled;

    @Getter
    @Value("${stirling.pdf.json.cff-converter.method:python}")
    private String cffConverterMethod;

    @Value("${stirling.pdf.json.cff-converter.python-command:/opt/venv/bin/python3}")
    private String pythonCommand;

    @Value("${stirling.pdf.json.cff-converter.python-script:/scripts/convert_cff_to_ttf.py}")
    private String pythonScript;

    @Value("${stirling.pdf.json.cff-converter.fontforge-command:fontforge}")
    private String fontforgeCommand;

    private volatile boolean pythonCffConverterAvailable;
    private volatile boolean fontForgeCffConverterAvailable;

    @PostConstruct
    private void initialiseCffConverterAvailability() {
        if (!cffConversionEnabled) {
            log.warn("[FONT-DEBUG] CFF conversion is DISABLED in configuration");
            pythonCffConverterAvailable = false;
            fontForgeCffConverterAvailable = false;
            return;
        }

        log.info("[FONT-DEBUG] CFF conversion enabled, checking tool availability...");
        pythonCffConverterAvailable = isCommandAvailable(pythonCommand);
        if (!pythonCffConverterAvailable) {
            log.warn(
                    "[FONT-DEBUG] Python command '{}' not found; Python CFF conversion disabled",
                    pythonCommand);
        } else {
            log.info("[FONT-DEBUG] Python command '{}' is available", pythonCommand);
        }

        fontForgeCffConverterAvailable = isCommandAvailable(fontforgeCommand);
        if (!fontForgeCffConverterAvailable) {
            log.warn(
                    "[FONT-DEBUG] FontForge command '{}' not found; FontForge CFF conversion disabled",
                    fontforgeCommand);
        } else {
            log.info("[FONT-DEBUG] FontForge command '{}' is available", fontforgeCommand);
        }

        log.info("[FONT-DEBUG] Selected CFF converter method: {}", cffConverterMethod);
    }

    public byte[] convertCffProgramToTrueType(byte[] fontBytes, String toUnicode) {
        if (!cffConversionEnabled || fontBytes == null || fontBytes.length == 0) {
            log.warn(
                    "[FONT-DEBUG] CFF conversion skipped: enabled={}, bytes={}",
                    cffConversionEnabled,
                    fontBytes == null ? "null" : fontBytes.length);
            return null;
        }

        log.info(
                "[FONT-DEBUG] Converting CFF font: {} bytes, method: {}",
                fontBytes.length,
                cffConverterMethod);

        if ("python".equalsIgnoreCase(cffConverterMethod)) {
            if (!pythonCffConverterAvailable) {
                log.warn("[FONT-DEBUG] Python CFF converter not available, skipping conversion");
                return null;
            }
            byte[] result = convertCffUsingPython(fontBytes, toUnicode);
            log.info(
                    "[FONT-DEBUG] Python conversion result: {}",
                    result == null ? "null" : result.length + " bytes");
            return result;
        } else if ("fontforge".equalsIgnoreCase(cffConverterMethod)) {
            if (!fontForgeCffConverterAvailable) {
                log.warn("[FONT-DEBUG] FontForge CFF converter not available, skipping conversion");
                return null;
            }
            byte[] result = convertCffUsingFontForge(fontBytes);
            log.info(
                    "[FONT-DEBUG] FontForge conversion result: {}",
                    result == null ? "null" : result.length + " bytes");
            return result;
        } else {
            log.warn(
                    "[FONT-DEBUG] Unknown CFF converter method: {}, falling back to Python",
                    cffConverterMethod);
            if (!pythonCffConverterAvailable) {
                log.warn("[FONT-DEBUG] Python CFF converter not available, skipping conversion");
                return null;
            }
            byte[] result = convertCffUsingPython(fontBytes, toUnicode);
            log.info(
                    "[FONT-DEBUG] Python conversion result: {}",
                    result == null ? "null" : result.length + " bytes");
            return result;
        }
    }

    public String detectFontFlavor(byte[] fontBytes) {
        if (fontBytes == null || fontBytes.length < 4) {
            return null;
        }
        int signature =
                ((fontBytes[0] & 0xFF) << 24)
                        | ((fontBytes[1] & 0xFF) << 16)
                        | ((fontBytes[2] & 0xFF) << 8)
                        | (fontBytes[3] & 0xFF);
        if (signature == 0x00010000 || signature == 0x74727565) {
            return "ttf";
        }
        if (signature == 0x4F54544F) {
            return "otf";
        }
        if (signature == 0x74746366) {
            return "cff";
        }
        return null;
    }

    public String detectTrueTypeFormat(byte[] data) {
        if (data == null || data.length < 4) {
            return null;
        }
        int signature =
                ((data[0] & 0xFF) << 24)
                        | ((data[1] & 0xFF) << 16)
                        | ((data[2] & 0xFF) << 8)
                        | (data[3] & 0xFF);
        if (signature == 0x00010000) {
            return "ttf";
        }
        if (signature == 0x4F54544F) {
            return "otf";
        }
        if (signature == 0x74746366) {
            return "cff";
        }
        return null;
    }

    public String validateFontTables(byte[] fontBytes) {
        if (fontBytes == null || fontBytes.length < 12) {
            return "Font program too small";
        }
        int numTables = ((fontBytes[4] & 0xFF) << 8) | (fontBytes[5] & 0xFF);
        if (numTables <= 0 || numTables > 512) {
            return "Invalid numTables: " + numTables;
        }
        return null;
    }

    private byte[] convertCffUsingPython(byte[] fontBytes, String toUnicode) {
        if (!pythonCffConverterAvailable) {
            log.warn("[FONT-DEBUG] Python CFF converter not available");
            return null;
        }
        if (pythonCommand == null
                || pythonCommand.isBlank()
                || pythonScript == null
                || pythonScript.isBlank()) {
            log.warn("[FONT-DEBUG] Python converter not configured");
            return null;
        }

        log.info(
                "[FONT-DEBUG] Running Python CFF converter: command={}, script={}",
                pythonCommand,
                pythonScript);

        try (TempFile inputFile = new TempFile(tempFileManager, ".cff");
                TempFile outputFile = new TempFile(tempFileManager, ".otf");
                TempFile toUnicodeFile =
                        toUnicode != null ? new TempFile(tempFileManager, ".tounicode") : null) {
            Files.write(inputFile.getPath(), fontBytes);
            if (toUnicodeFile != null) {
                try {
                    byte[] toUnicodeBytes = Base64.getDecoder().decode(toUnicode);
                    Files.write(toUnicodeFile.getPath(), toUnicodeBytes);
                } catch (IllegalArgumentException ex) {
                    log.warn(
                            "[FONT-DEBUG] Failed to decode ToUnicode data for CFF conversion: {}",
                            ex.getMessage());
                    return null;
                }
            }

            String[] command =
                    buildPythonCommand(
                            inputFile.getAbsolutePath(),
                            outputFile.getAbsolutePath(),
                            toUnicodeFile != null ? toUnicodeFile.getAbsolutePath() : null);
            log.info("[FONT-DEBUG] Executing: {}", String.join(" ", command));

            ProcessExecutorResult result =
                    ProcessExecutor.getInstance(ProcessExecutor.Processes.CFF_CONVERTER)
                            .runCommandWithOutputHandling(java.util.Arrays.asList(command));

            if (result.getRc() != 0) {
                log.error(
                        "[FONT-DEBUG] Python CFF conversion failed with exit code: {}",
                        result.getRc());
                log.error("[FONT-DEBUG] Stdout: {}", result.getMessages());
                return null;
            }
            if (!Files.exists(outputFile.getPath())) {
                log.error("[FONT-DEBUG] Python CFF conversion produced no output file");
                return null;
            }
            byte[] data = Files.readAllBytes(outputFile.getPath());
            if (data.length == 0) {
                log.error("[FONT-DEBUG] Python CFF conversion returned empty output");
                return null;
            }
            log.info(
                    "[FONT-DEBUG] Python CFF conversion succeeded: {} bytes -> {} bytes",
                    fontBytes.length,
                    data.length);
            return data;
        } catch (IOException | InterruptedException ex) {
            if (ex instanceof InterruptedException) {
                Thread.currentThread().interrupt();
            }
            log.error("[FONT-DEBUG] Python CFF conversion exception: {}", ex.getMessage(), ex);
            return null;
        }
    }

    public byte[] convertCffUsingFontForge(byte[] fontBytes) {
        if (!fontForgeCffConverterAvailable) {
            log.debug("FontForge CFF converter not available");
            return null;
        }

        try (TempFile inputFile = new TempFile(tempFileManager, ".cff");
                TempFile outputFile = new TempFile(tempFileManager, ".ttf")) {
            Files.write(inputFile.getPath(), fontBytes);

            ProcessExecutorResult result =
                    ProcessExecutor.getInstance(ProcessExecutor.Processes.CFF_CONVERTER)
                            .runCommandWithOutputHandling(
                                    java.util.Arrays.asList(
                                            fontforgeCommand,
                                            "-lang=ff",
                                            "-c",
                                            "Open($1); "
                                                    + "ScaleToEm(1000); "
                                                    + "SelectWorthOutputting(); "
                                                    + "SetFontOrder(2); "
                                                    + "Reencode(\"unicode\"); "
                                                    + "RoundToInt(); "
                                                    + "RemoveOverlap(); "
                                                    + "Simplify(); "
                                                    + "CorrectDirection(); "
                                                    + "Generate($2, \"\", 4+16+32); "
                                                    + "Close(); "
                                                    + "Quit()",
                                            inputFile.getAbsolutePath(),
                                            outputFile.getAbsolutePath()));

            if (result.getRc() != 0) {
                log.warn("FontForge CFF conversion failed: {}", result.getRc());
                return null;
            }
            if (!Files.exists(outputFile.getPath())) {
                log.warn("FontForge CFF conversion produced no output");
                return null;
            }
            byte[] data = Files.readAllBytes(outputFile.getPath());
            if (data.length == 0) {
                log.warn("FontForge CFF conversion returned empty output");
                return null;
            }
            return data;
        } catch (IOException | InterruptedException ex) {
            if (ex instanceof InterruptedException) {
                Thread.currentThread().interrupt();
            }
            log.warn("FontForge CFF conversion failed: {}", ex.getMessage());
            return null;
        }
    }

    private boolean isCommandAvailable(String command) {
        if (command == null || command.isBlank()) {
            return false;
        }
        try {
            ProcessBuilder processBuilder = new ProcessBuilder();
            if (System.getProperty("os.name").toLowerCase(Locale.ROOT).contains("windows")) {
                processBuilder.command("where", command);
            } else {
                processBuilder.command("which", command);
            }
            Process process = processBuilder.start();
            int exitCode = process.waitFor();
            return exitCode == 0;
        } catch (Exception e) {
            log.debug("Error checking for command {}: {}", command, e.getMessage());
            return false;
        }
    }

    private String[] buildPythonCommand(String input, String output, String toUnicode) {
        if (toUnicode != null) {
            return new String[] {
                pythonCommand,
                pythonScript,
                "--input",
                input,
                "--output",
                output,
                "--to-unicode",
                toUnicode
            };
        }
        return new String[] {pythonCommand, pythonScript, "--input", input, "--output", output};
    }
}
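A sketch of how a caller might combine the detection and conversion helpers above (assumption: an injected PdfJsonFontService; the wrapper method is hypothetical): TTF/OTF programs are kept as-is, a bare CFF program goes through the external converter, and a failed or invalid conversion falls back to the original bytes.

import stirling.software.SPDF.service.pdfjson.PdfJsonFontService;

class CffConversionDemo {
    static byte[] toEmbeddableProgram(PdfJsonFontService fontService, byte[] fontBytes) {
        String flavor = fontService.detectFontFlavor(fontBytes); // "ttf", "otf", "cff" or null
        if ("ttf".equals(flavor) || "otf".equals(flavor)) {
            return fontBytes; // already usable as-is
        }
        byte[] converted = fontService.convertCffProgramToTrueType(fontBytes, null); // no ToUnicode hint
        if (converted != null && fontService.validateFontTables(converted) == null) {
            return converted;
        }
        return fontBytes; // conversion unavailable or invalid; keep the original
    }
}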
@ -0,0 +1,444 @@
|
||||
package stirling.software.SPDF.service.pdfjson;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Base64;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.UUID;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
|
||||
import org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine;
|
||||
import org.apache.pdfbox.contentstream.operator.Operator;
|
||||
import org.apache.pdfbox.contentstream.operator.OperatorName;
|
||||
import org.apache.pdfbox.cos.COSBase;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||
import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
|
||||
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import stirling.software.SPDF.model.api.PdfJsonConversionProgress;
|
||||
import stirling.software.SPDF.model.json.PdfJsonImageElement;
|
||||
|
||||
/**
|
||||
* Service for handling PDF image operations for JSON conversion (extraction, encoding, rendering).
|
||||
*/
|
||||
@Service
|
||||
@Slf4j
|
||||
public class PdfJsonImageService {
|
||||
|
||||
private record EncodedImage(String base64, String format) {}
|
||||
|
||||
private record Bounds(float left, float right, float bottom, float top) {
|
||||
float width() {
|
||||
return Math.max(0f, right - left);
|
||||
}
|
||||
|
||||
float height() {
|
||||
return Math.max(0f, top - bottom);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Collects images from all pages in a PDF document.
|
||||
*
|
||||
* @param document The PDF document
|
||||
* @param totalPages Total number of pages
|
||||
* @param progress Progress callback
|
||||
* @return Map of page number to list of image elements
|
||||
* @throws IOException If image extraction fails
|
||||
*/
|
||||
public Map<Integer, List<PdfJsonImageElement>> collectImages(
|
||||
PDDocument document, int totalPages, Consumer<PdfJsonConversionProgress> progress)
|
||||
throws IOException {
|
||||
Map<Integer, List<PdfJsonImageElement>> imagesByPage = new LinkedHashMap<>();
|
||||
int pageNumber = 1;
|
||||
for (PDPage page : document.getPages()) {
|
||||
ImageCollectingEngine engine =
|
||||
new ImageCollectingEngine(page, pageNumber, imagesByPage);
|
||||
engine.processPage(page);
|
||||
|
||||
// Update progress for image extraction (70-80%)
|
||||
int imageProgress = 70 + (int) ((pageNumber / (double) totalPages) * 10);
|
||||
progress.accept(
|
||||
PdfJsonConversionProgress.of(
|
||||
imageProgress, "images", "Extracting images", pageNumber, totalPages));
|
||||
pageNumber++;
|
||||
}
|
||||
return imagesByPage;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extracts images from a single PDF page (for on-demand lazy loading).
|
||||
*
|
||||
* @param document The PDF document
|
||||
* @param page The specific page to extract images from
|
||||
* @param pageNumber The page number (1-indexed)
|
||||
* @return List of image elements for this page
|
||||
* @throws IOException If image extraction fails
|
||||
*/
|
||||
public List<PdfJsonImageElement> extractImagesForPage(
|
||||
PDDocument document, PDPage page, int pageNumber) throws IOException {
|
||||
Map<Integer, List<PdfJsonImageElement>> imagesByPage = new LinkedHashMap<>();
|
||||
ImageCollectingEngine engine = new ImageCollectingEngine(page, pageNumber, imagesByPage);
|
||||
engine.processPage(page);
|
||||
return imagesByPage.getOrDefault(pageNumber, new ArrayList<>());
|
||||
}
|
||||
|
||||
/**
|
||||
* Draws an image element on a PDF page content stream.
|
||||
*
|
||||
* @param contentStream The content stream to draw on
|
||||
* @param document The PDF document
|
||||
* @param element The image element to draw
|
||||
* @param cache Cache of previously created image XObjects
|
||||
* @throws IOException If drawing fails
|
||||
*/
|
||||
public void drawImageElement(
|
||||
PDPageContentStream contentStream,
|
||||
PDDocument document,
|
||||
PdfJsonImageElement element,
|
||||
Map<String, PDImageXObject> cache)
|
||||
throws IOException {
|
||||
if (element == null || element.getImageData() == null || element.getImageData().isBlank()) {
|
||||
return;
|
||||
}
|
||||
|
||||
String cacheKey =
|
||||
element.getId() != null && !element.getId().isBlank()
|
||||
? element.getId()
|
||||
: Integer.toHexString(System.identityHashCode(element));
|
||||
PDImageXObject image = cache.get(cacheKey);
|
||||
if (image == null) {
|
||||
image = createImageXObject(document, element);
|
||||
if (image == null) {
|
||||
return;
|
||||
}
|
||||
cache.put(cacheKey, image);
|
||||
}
|
||||
|
||||
List<Float> transform = element.getTransform();
|
||||
if (transform != null && transform.size() == 6) {
|
||||
Matrix matrix =
|
||||
new Matrix(
|
||||
safeFloat(transform.get(0), 1f),
|
||||
safeFloat(transform.get(1), 0f),
|
||||
safeFloat(transform.get(2), 0f),
|
||||
safeFloat(transform.get(3), 1f),
|
||||
safeFloat(transform.get(4), 0f),
|
||||
safeFloat(transform.get(5), 0f));
|
||||
contentStream.drawImage(image, matrix);
|
||||
return;
|
||||
}
|
||||
|
||||
float width = safeFloat(element.getWidth(), fallbackWidth(element));
|
||||
float height = safeFloat(element.getHeight(), fallbackHeight(element));
|
||||
if (width <= 0f) {
|
||||
width = Math.max(1f, fallbackWidth(element));
|
||||
}
|
||||
if (height <= 0f) {
|
||||
height = Math.max(1f, fallbackHeight(element));
|
||||
}
|
||||
float left = resolveLeft(element, width);
|
||||
float bottom = resolveBottom(element, height);
|
||||
|
||||
contentStream.drawImage(image, left, bottom, width, height);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a PDImageXObject from a PdfJsonImageElement.
|
||||
*
|
||||
* @param document The PDF document
|
||||
* @param element The image element with base64 data
|
||||
* @return The created image XObject
|
||||
* @throws IOException If image creation fails
|
||||
*/
|
||||
public PDImageXObject createImageXObject(PDDocument document, PdfJsonImageElement element)
|
||||
throws IOException {
|
||||
byte[] data;
|
||||
try {
|
||||
data = Base64.getDecoder().decode(element.getImageData());
|
||||
} catch (IllegalArgumentException ex) {
|
||||
log.debug("Failed to decode image element: {}", ex.getMessage());
|
||||
return null;
|
||||
}
|
||||
String name = element.getId() != null ? element.getId() : UUID.randomUUID().toString();
|
||||
return PDImageXObject.createFromByteArray(document, data, name);
|
||||
}
|
||||
|
||||
private EncodedImage encodeImage(PDImage image) {
|
||||
try {
|
||||
BufferedImage bufferedImage = image.getImage();
|
||||
if (bufferedImage == null) {
|
||||
return null;
|
||||
}
|
||||
String format = resolveImageFormat(image);
|
||||
if (format == null || format.isBlank()) {
|
||||
format = "png";
|
||||
}
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
boolean written = ImageIO.write(bufferedImage, format, baos);
|
||||
if (!written) {
|
||||
if (!"png".equalsIgnoreCase(format)) {
|
||||
baos.reset();
|
||||
if (!ImageIO.write(bufferedImage, "png", baos)) {
|
||||
return null;
|
||||
}
|
||||
format = "png";
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
return new EncodedImage(Base64.getEncoder().encodeToString(baos.toByteArray()), format);
|
||||
} catch (IOException ex) {
|
||||
log.debug("Failed to encode image: {}", ex.getMessage());
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private String resolveImageFormat(PDImage image) {
|
||||
if (image instanceof PDImageXObject xObject) {
|
||||
String suffix = xObject.getSuffix();
|
||||
if (suffix != null && !suffix.isBlank()) {
|
||||
return suffix.toLowerCase(Locale.ROOT);
|
||||
}
|
||||
}
|
||||
return "png";
|
||||
}
|
||||
|
||||
private float fallbackWidth(PdfJsonImageElement element) {
|
||||
if (element.getRight() != null && element.getLeft() != null) {
|
||||
return Math.max(0f, element.getRight() - element.getLeft());
|
||||
}
|
||||
if (element.getNativeWidth() != null) {
|
||||
return element.getNativeWidth();
|
||||
}
|
||||
return 1f;
|
||||
}
|
||||
|
||||
private float fallbackHeight(PdfJsonImageElement element) {
|
||||
if (element.getTop() != null && element.getBottom() != null) {
|
||||
return Math.max(0f, element.getTop() - element.getBottom());
|
||||
}
|
||||
if (element.getNativeHeight() != null) {
|
||||
return element.getNativeHeight();
|
||||
}
|
||||
return 1f;
|
||||
}
|
||||
|
||||
    private float resolveLeft(PdfJsonImageElement element, float width) {
        if (element.getLeft() != null) {
            return element.getLeft();
        }
        if (element.getX() != null) {
            return element.getX();
        }
        if (element.getRight() != null) {
            return element.getRight() - width;
        }
        return 0f;
    }

    private float resolveBottom(PdfJsonImageElement element, float height) {
        if (element.getBottom() != null) {
            return element.getBottom();
        }
        if (element.getY() != null) {
            return element.getY();
        }
        if (element.getTop() != null) {
            return element.getTop() - height;
        }
        return 0f;
    }

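    // Flattens PDFBox's 3x3 Matrix into the six-value PDF form [a b c d e f]:
    // scale/shear terms from the first two rows, translation from the third.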
    private List<Float> toMatrixValues(Matrix matrix) {
        List<Float> values = new ArrayList<>(6);
        values.add(matrix.getValue(0, 0));
        values.add(matrix.getValue(0, 1));
        values.add(matrix.getValue(1, 0));
        values.add(matrix.getValue(1, 1));
        values.add(matrix.getValue(2, 0));
        values.add(matrix.getValue(2, 1));
        return values;
    }

    private float safeFloat(Float value, float defaultValue) {
        if (value == null || Float.isNaN(value) || Float.isInfinite(value)) {
            return defaultValue;
        }
        return value;
    }

    /**
     * Inner engine that extends PDFGraphicsStreamEngine to collect images from PDF content streams.
     */
    private class ImageCollectingEngine extends PDFGraphicsStreamEngine {

        private final int pageNumber;
        private final Map<Integer, List<PdfJsonImageElement>> imagesByPage;

        private COSName currentXObjectName;
        private int imageCounter = 0;

        protected ImageCollectingEngine(
                PDPage page, int pageNumber, Map<Integer, List<PdfJsonImageElement>> imagesByPage)
                throws IOException {
            super(page);
            this.pageNumber = pageNumber;
            this.imagesByPage = imagesByPage;
        }

        @Override
        public void processPage(PDPage page) throws IOException {
            super.processPage(page);
        }

        @Override
        public void drawImage(PDImage pdImage) throws IOException {
            EncodedImage encoded = encodeImage(pdImage);
            if (encoded == null) {
                return;
            }
            Matrix ctm = getGraphicsState().getCurrentTransformationMatrix();
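            // PDF draws an image as the unit square mapped through the CTM, so the
            // transformed corners of that square give the on-page bounds.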
            Bounds bounds = computeBounds(ctm);
            List<Float> matrixValues = toMatrixValues(ctm);

            PdfJsonImageElement element =
                    PdfJsonImageElement.builder()
                            .id(UUID.randomUUID().toString())
                            .objectName(
                                    currentXObjectName != null
                                            ? currentXObjectName.getName()
                                            : null)
                            .inlineImage(!(pdImage instanceof PDImageXObject))
                            .nativeWidth(pdImage.getWidth())
                            .nativeHeight(pdImage.getHeight())
                            .x(bounds.left)
                            .y(bounds.bottom)
                            .width(bounds.width())
                            .height(bounds.height())
                            .left(bounds.left)
                            .right(bounds.right)
                            .top(bounds.top)
                            .bottom(bounds.bottom)
                            .transform(matrixValues)
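                            // Large negative base keeps images behind text when elements are
                            // sorted by zOrder, while imageCounter preserves their relative
                            // paint order (assumes text elements use non-negative zOrder)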
                            .zOrder(-1_000_000 + imageCounter)
                            .imageData(encoded.base64())
                            .imageFormat(encoded.format())
                            .build();
            imageCounter++;
            imagesByPage.computeIfAbsent(pageNumber, key -> new ArrayList<>()).add(element);
        }

        @Override
        public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3)
                throws IOException {
            // Not needed for image extraction
        }

        @Override
        public void clip(int windingRule) throws IOException {
            // Not needed for image extraction
        }

        @Override
        public void moveTo(float x, float y) throws IOException {
            // Not needed for image extraction
        }

        @Override
        public void lineTo(float x, float y) throws IOException {
            // Not needed for image extraction
        }

        @Override
        public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3)
                throws IOException {
            // Not needed for image extraction
        }

        @Override
        public Point2D getCurrentPoint() throws IOException {
            return new Point2D.Float();
        }

        @Override
        public void closePath() throws IOException {
            // Not needed for image extraction
        }

        @Override
        public void endPath() throws IOException {
            // Not needed for image extraction
        }

        @Override
        public void shadingFill(COSName shadingName) throws IOException {
            // Not needed for image extraction
        }

        @Override
        public void fillAndStrokePath(int windingRule) throws IOException {
            // Not needed for image extraction
        }

        @Override
        public void fillPath(int windingRule) throws IOException {
            // Not needed for image extraction
        }

        @Override
        public void strokePath() throws IOException {
            // Not needed for image extraction
        }

        @Override
        protected void processOperator(Operator operator, List<COSBase> operands)
                throws IOException {
            if (OperatorName.DRAW_OBJECT.equals(operator.getName())
                    && !operands.isEmpty()
                    && operands.get(0) instanceof COSName name) {
                currentXObjectName = name;
            }
            super.processOperator(operator, operands);
            currentXObjectName = null;
        }

        private Bounds computeBounds(Matrix ctm) {
            AffineTransform transform = ctm.createAffineTransform();
            Point2D.Float p0 = new Point2D.Float(0, 0);
            Point2D.Float p1 = new Point2D.Float(1, 0);
            Point2D.Float p2 = new Point2D.Float(0, 1);
            Point2D.Float p3 = new Point2D.Float(1, 1);
            transform.transform(p0, p0);
            transform.transform(p1, p1);
            transform.transform(p2, p2);
            transform.transform(p3, p3);

            float minX = Math.min(Math.min(p0.x, p1.x), Math.min(p2.x, p3.x));
            float maxX = Math.max(Math.max(p0.x, p1.x), Math.max(p2.x, p3.x));
            float minY = Math.min(Math.min(p0.y, p1.y), Math.min(p2.y, p3.y));
            float maxY = Math.max(Math.max(p0.y, p1.y), Math.max(p2.y, p3.y));

            if (!Float.isFinite(minX)
                    || !Float.isFinite(minY)
                    || !Float.isFinite(maxX)
                    || !Float.isFinite(maxY)) {
                return new Bounds(0f, 0f, 0f, 0f);
            }
            return new Bounds(minX, maxX, minY, maxY);
        }
    }
}
|
||||
@ -0,0 +1,148 @@
package stirling.software.SPDF.service.pdfjson;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.time.Instant;
import java.time.format.DateTimeParseException;
import java.util.Base64;
import java.util.Calendar;
import java.util.Optional;
import java.util.TimeZone;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.common.PDMetadata;
import org.springframework.stereotype.Service;

import lombok.extern.slf4j.Slf4j;

import stirling.software.SPDF.model.json.PdfJsonMetadata;

/** Service for extracting and applying PDF metadata (document info and XMP) for JSON conversion. */
@Service
@Slf4j
public class PdfJsonMetadataService {

    /**
     * Extracts document information metadata from a PDF.
     *
     * @param document The PDF document
     * @return Metadata model with document info
     */
    public PdfJsonMetadata extractMetadata(PDDocument document) {
        PdfJsonMetadata metadata = new PdfJsonMetadata();
        PDDocumentInformation info = document.getDocumentInformation();
        if (info != null) {
            metadata.setTitle(info.getTitle());
            metadata.setAuthor(info.getAuthor());
            metadata.setSubject(info.getSubject());
            metadata.setKeywords(info.getKeywords());
            metadata.setCreator(info.getCreator());
            metadata.setProducer(info.getProducer());
            metadata.setCreationDate(formatCalendar(info.getCreationDate()));
            metadata.setModificationDate(formatCalendar(info.getModificationDate()));
            metadata.setTrapped(info.getTrapped());
        }
        metadata.setNumberOfPages(document.getNumberOfPages());
        return metadata;
    }

    /**
     * Extracts XMP metadata from a PDF as base64-encoded string.
     *
     * @param document The PDF document
     * @return Base64-encoded XMP metadata, or null if not present
     */
    public String extractXmpMetadata(PDDocument document) {
        if (document.getDocumentCatalog() == null) {
            return null;
        }
        PDMetadata metadata = document.getDocumentCatalog().getMetadata();
        if (metadata == null) {
            return null;
        }
        try (InputStream inputStream = metadata.createInputStream();
                ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
            inputStream.transferTo(baos);
            byte[] data = baos.toByteArray();
            if (data.length == 0) {
                return null;
            }
            return Base64.getEncoder().encodeToString(data);
        } catch (IOException ex) {
            log.debug("Failed to extract XMP metadata: {}", ex.getMessage());
            return null;
        }
    }

    /**
     * Applies metadata to a PDF document.
     *
     * @param document The PDF document
     * @param metadata The metadata to apply
     */
    public void applyMetadata(PDDocument document, PdfJsonMetadata metadata) {
        if (metadata == null) {
            return;
        }
        PDDocumentInformation info = document.getDocumentInformation();
        info.setTitle(metadata.getTitle());
        info.setAuthor(metadata.getAuthor());
        info.setSubject(metadata.getSubject());
        info.setKeywords(metadata.getKeywords());
        info.setCreator(metadata.getCreator());
        info.setProducer(metadata.getProducer());
        if (metadata.getCreationDate() != null) {
            parseInstant(metadata.getCreationDate())
                    .ifPresent(instant -> info.setCreationDate(toCalendar(instant)));
        }
        if (metadata.getModificationDate() != null) {
            parseInstant(metadata.getModificationDate())
                    .ifPresent(instant -> info.setModificationDate(toCalendar(instant)));
        }
        info.setTrapped(metadata.getTrapped());
    }

    /**
     * Applies XMP metadata to a PDF document from base64-encoded string.
     *
     * @param document The PDF document
     * @param base64 Base64-encoded XMP metadata
     */
    public void applyXmpMetadata(PDDocument document, String base64) {
        if (base64 == null || base64.isBlank()) {
            return;
        }
        try (InputStream inputStream =
                new ByteArrayInputStream(Base64.getDecoder().decode(base64))) {
            PDMetadata metadata = new PDMetadata(document, inputStream);
            document.getDocumentCatalog().setMetadata(metadata);
        } catch (IllegalArgumentException | IOException ex) {
            log.debug("Failed to apply XMP metadata: {}", ex.getMessage());
        }
    }

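    // Dates round-trip as ISO-8601 instants: formatCalendar() emits Instant.toString()
    // and parseInstant()/toCalendar() reverse the conversion on import.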
    private String formatCalendar(Calendar calendar) {
        if (calendar == null) {
            return null;
        }
        return calendar.toInstant().toString();
    }

    private Optional<Instant> parseInstant(String value) {
        try {
            return Optional.of(Instant.parse(value));
        } catch (DateTimeParseException ex) {
            log.warn("Failed to parse instant '{}': {}", value, ex.getMessage());
            return Optional.empty();
        }
    }

    private Calendar toCalendar(Instant instant) {
        Calendar calendar = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
        calendar.setTimeInMillis(instant.toEpochMilli());
        return calendar;
    }
}
@ -0,0 +1,308 @@
package stirling.software.SPDF.service.pdfjson;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.function.Consumer;

import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile;

import com.fasterxml.jackson.databind.ObjectMapper;

import lombok.Data;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;

import stirling.software.SPDF.model.api.PdfJsonConversionProgress;
import stirling.software.SPDF.model.json.PdfJsonAnnotation;
import stirling.software.SPDF.model.json.PdfJsonCosValue;
import stirling.software.SPDF.model.json.PdfJsonDocumentMetadata;
import stirling.software.SPDF.model.json.PdfJsonFont;
import stirling.software.SPDF.model.json.PdfJsonImageElement;
import stirling.software.SPDF.model.json.PdfJsonPage;
import stirling.software.SPDF.model.json.PdfJsonPageDimension;
import stirling.software.SPDF.model.json.PdfJsonStream;
import stirling.software.SPDF.model.json.PdfJsonTextElement;
import stirling.software.common.service.CustomPDFDocumentFactory;
import stirling.software.common.service.TaskManager;
import stirling.software.common.util.ExceptionUtils;

/**
 * Service for lazy loading PDF pages. Caches the source PDF bytes and extracts pages on demand to
 * reduce memory usage for large PDFs.
 */
@Service
@Slf4j
@RequiredArgsConstructor
public class PdfLazyLoadingService {

    private final CustomPDFDocumentFactory pdfDocumentFactory;
    private final ObjectMapper objectMapper;
    private final TaskManager taskManager;
    private final PdfJsonMetadataService metadataService;
    private final PdfJsonImageService imageService;

    /** Cache of raw PDF bytes and metadata for lazy page loading. Key is jobId. */
    private final Map<String, CachedPdfDocument> documentCache = new ConcurrentHashMap<>();

    /**
     * Stores PDF file bytes for lazy page loading. Each page is extracted on-demand by re-loading
     * the PDF from bytes.
     */
    @Data
    private static class CachedPdfDocument {
        private final byte[] pdfBytes;
        private final PdfJsonDocumentMetadata metadata;
        private final long timestamp;

        public CachedPdfDocument(byte[] pdfBytes, PdfJsonDocumentMetadata metadata) {
            this.pdfBytes = pdfBytes;
            this.metadata = metadata;
            this.timestamp = System.currentTimeMillis();
        }
    }

    /**
     * Extracts document metadata, fonts, and page dimensions without page content. Caches the PDF
     * bytes for subsequent page requests.
     *
     * @param file The uploaded PDF file
     * @param jobId The job ID for caching
     * @param fonts Font map (will be populated)
     * @param pageFontResources Page font resources map (will be populated)
     * @return Serialized metadata JSON
     * @throws IOException If extraction fails
     */
    public byte[] extractDocumentMetadata(
            MultipartFile file,
            String jobId,
            Map<String, PdfJsonFont> fonts,
            Map<Integer, Map<PDFont, String>> pageFontResources)
            throws IOException {
        if (file == null) {
            throw ExceptionUtils.createNullArgumentException("fileInput");
        }

        Consumer<PdfJsonConversionProgress> progress =
                jobId != null
                        ? (p) -> {
                            log.info(
                                    "Progress: [{}%] {} - {}{}",
                                    p.getPercent(),
                                    p.getStage(),
                                    p.getMessage(),
                                    (p.getCurrent() != null && p.getTotal() != null)
                                            ? String.format(
                                                    " (%d/%d)", p.getCurrent(), p.getTotal())
                                            : "");
                            reportProgressToTaskManager(jobId, p);
                        }
                        : (p) -> {};

        // Read PDF bytes once for processing and caching
        byte[] pdfBytes = file.getBytes();

        try (PDDocument document = pdfDocumentFactory.load(pdfBytes, true)) {
            int totalPages = document.getNumberOfPages();

            // Build metadata response
            progress.accept(PdfJsonConversionProgress.of(90, "metadata", "Extracting metadata"));
            PdfJsonDocumentMetadata docMetadata = new PdfJsonDocumentMetadata();
            docMetadata.setMetadata(metadataService.extractMetadata(document));
            docMetadata.setXmpMetadata(metadataService.extractXmpMetadata(document));
            docMetadata.setLazyImages(Boolean.TRUE);

            List<PdfJsonFont> serializedFonts = new ArrayList<>(fonts.values());
            serializedFonts.sort(
                    Comparator.comparing(
                            PdfJsonFont::getUid, Comparator.nullsLast(Comparator.naturalOrder())));
            docMetadata.setFonts(serializedFonts);

            // Extract page dimensions
            List<PdfJsonPageDimension> pageDimensions = new ArrayList<>();
            int pageIndex = 0;
            for (PDPage page : document.getPages()) {
                PdfJsonPageDimension dim = new PdfJsonPageDimension();
                dim.setPageNumber(pageIndex + 1);
                PDRectangle mediaBox = page.getMediaBox();
                dim.setWidth(mediaBox.getWidth());
                dim.setHeight(mediaBox.getHeight());
                dim.setRotation(page.getRotation());
                pageDimensions.add(dim);
                pageIndex++;
            }
            docMetadata.setPageDimensions(pageDimensions);

            // Cache PDF bytes and metadata for lazy page loading
            if (jobId != null) {
                CachedPdfDocument cached = new CachedPdfDocument(pdfBytes, docMetadata);
                documentCache.put(jobId, cached);
                log.info(
                        "Cached PDF bytes ({} bytes) for lazy loading, jobId: {}",
                        pdfBytes.length,
                        jobId);

                // Schedule cleanup after 30 minutes
                scheduleDocumentCleanup(jobId);
            }

            progress.accept(
                    PdfJsonConversionProgress.of(100, "complete", "Metadata extraction complete"));

            return objectMapper.writeValueAsBytes(docMetadata);
        }
    }

    /**
     * Extracts a single page from cached PDF bytes. Re-loads the PDF for each request.
     *
     * @param jobId The job ID
     * @param pageNumber The page number (1-indexed)
     * @param serializeCosValue Function to serialize COS values
     * @param extractContentStreams Function to extract content streams
     * @param filterImageXObjectsFromResources Function to filter image XObjects
     * @param extractText Function to extract text elements for the page
     * @param extractAnnotations Function to extract annotations for the page
     * @return Serialized page JSON
     * @throws IOException If extraction fails
     */
    public byte[] extractSinglePage(
            String jobId,
            int pageNumber,
            java.util.function.Function<COSBase, PdfJsonCosValue> serializeCosValue,
            java.util.function.Function<PDPage, List<PdfJsonStream>> extractContentStreams,
            java.util.function.Function<COSBase, COSBase> filterImageXObjectsFromResources,
            java.util.function.BiFunction<PDDocument, Integer, List<PdfJsonTextElement>>
                    extractText,
            java.util.function.BiFunction<PDDocument, Integer, List<PdfJsonAnnotation>>
                    extractAnnotations)
            throws IOException {
        CachedPdfDocument cached = documentCache.get(jobId);
        if (cached == null) {
            throw new IllegalArgumentException("No cached document found for jobId: " + jobId);
        }

        int pageIndex = pageNumber - 1;
        int totalPages = cached.getMetadata().getPageDimensions().size();

        if (pageIndex < 0 || pageIndex >= totalPages) {
            throw new IllegalArgumentException(
                    "Page number " + pageNumber + " out of range (1-" + totalPages + ")");
        }

        log.debug("Loading PDF from bytes to extract page {} (jobId: {})", pageNumber, jobId);

        // Re-load PDF from cached bytes and extract the single page
        try (PDDocument document = pdfDocumentFactory.load(cached.getPdfBytes(), true)) {
            PDPage page = document.getPage(pageIndex);
            PdfJsonPage pageModel = new PdfJsonPage();
            pageModel.setPageNumber(pageNumber);
            PDRectangle mediaBox = page.getMediaBox();
            pageModel.setWidth(mediaBox.getWidth());
            pageModel.setHeight(mediaBox.getHeight());
            pageModel.setRotation(page.getRotation());

            // Extract text on-demand
            pageModel.setTextElements(extractText.apply(document, pageNumber));

            // Extract annotations on-demand
            pageModel.setAnnotations(extractAnnotations.apply(document, pageNumber));

            // Extract images on-demand
            List<PdfJsonImageElement> images =
                    imageService.extractImagesForPage(document, page, pageNumber);
            pageModel.setImageElements(images);

            // Extract resources and content streams
            COSBase resourcesBase = page.getCOSObject().getDictionaryObject(COSName.RESOURCES);
            COSBase filteredResources = filterImageXObjectsFromResources.apply(resourcesBase);
            pageModel.setResources(serializeCosValue.apply(filteredResources));
            pageModel.setContentStreams(extractContentStreams.apply(page));

            log.debug(
                    "Extracted page {} (text: {}, images: {}, annotations: {}) for jobId: {}",
                    pageNumber,
                    pageModel.getTextElements().size(),
                    images.size(),
                    pageModel.getAnnotations().size(),
                    jobId);

            return objectMapper.writeValueAsBytes(pageModel);
        }
    }

    /** Clears a cached document. */
    public void clearCachedDocument(String jobId) {
        CachedPdfDocument cached = documentCache.remove(jobId);
        if (cached != null) {
            log.info(
                    "Removed cached PDF bytes ({} bytes) for jobId: {}",
                    cached.getPdfBytes().length,
                    jobId);
        }
    }

    /** Schedules automatic cleanup of cached documents after 30 minutes. */
    private void scheduleDocumentCleanup(String jobId) {
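        // One short-lived sleeper thread per cached job; if job volume grows, a shared
        // ScheduledExecutorService would likely be the lighter-weight choice.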
        new Thread(
                        () -> {
                            try {
                                Thread.sleep(TimeUnit.MINUTES.toMillis(30));
                                clearCachedDocument(jobId);
                                log.info("Auto-cleaned cached document for jobId: {}", jobId);
                            } catch (InterruptedException e) {
                                Thread.currentThread().interrupt();
                            }
                        })
                .start();
    }

    /**
     * Report progress to TaskManager for async jobs.
     *
     * @param jobId The job ID
     * @param progress The progress update
     */
    private void reportProgressToTaskManager(String jobId, PdfJsonConversionProgress progress) {
        try {
            log.info(
                    "Reporting progress for job {}: {}% - {}",
                    jobId, progress.getPercent(), progress.getStage());
            String note;
            if (progress.getCurrent() != null && progress.getTotal() != null) {
                note =
                        String.format(
                                "[%d%%] %s: %s (%d/%d)",
                                progress.getPercent(),
                                progress.getStage(),
                                progress.getMessage(),
                                progress.getCurrent(),
                                progress.getTotal());
            } else {
                note =
                        String.format(
                                "[%d%%] %s: %s",
                                progress.getPercent(), progress.getStage(), progress.getMessage());
            }
            boolean added = taskManager.addNote(jobId, note);
            if (!added) {
                log.warn("Failed to add note - job {} not found in TaskManager", jobId);
            } else {
                log.info("Successfully added progress note for job {}: {}", jobId, note);
            }
        } catch (Exception e) {
            log.error("Exception reporting progress for job {}: {}", jobId, e.getMessage(), e);
        }
    }
}
@ -4437,6 +4437,32 @@
    "errors": {
      "invalidJson": "Unable to read the JSON file. Ensure it was generated by the PDF to JSON tool.",
      "pdfConversion": "Unable to convert the edited JSON back into a PDF."
    },
    "options": {
      "autoScaleText": {
        "title": "Auto-scale text to fit boxes",
        "description": "Automatically scales text horizontally to fit within its original bounding box when font rendering differs from the PDF."
      }
    },
    "disclaimer": {
      "heading": "Preview limitations",
      "textFocus": "This workspace focuses on editing text and repositioning embedded images. Complex page artwork, form widgets, and layered graphics are preserved for export but are not fully editable here.",
      "previewVariance": "Some visuals (such as table borders, shapes, or annotation appearances) may not display exactly in the preview. The exported PDF keeps the original drawing commands whenever possible.",
      "alpha": "This alpha viewer is still evolving—certain fonts, colours, transparency effects, and layout details may shift slightly. Please double-check the generated PDF before sharing."
    },
    "stages": {
      "uploading": "Uploading",
      "initializing": "Initializing",
      "loading": "Loading",
      "normalizing": "Normalizing",
      "parsing": "Parsing",
      "fonts": "Fonts",
      "text": "Text Extraction",
      "images": "Images",
      "annotations": "Annotations",
      "metadata": "Metadata",
      "serializing": "Finalizing",
      "complete": "Complete"
    }
  },
  "workspace": {

@ -11,8 +11,10 @@ import {
  FileButton,
  Group,
  Pagination,
  Progress,
  ScrollArea,
  Stack,
  Switch,
  Text,
  Title,
} from '@mantine/core';
@ -32,6 +34,7 @@ import {
  PdfJsonEditorViewData,
  PdfJsonFont,
  PdfJsonPage,
  ConversionProgress,
} from '@app/tools/pdfJsonEditor/pdfJsonEditorTypes';
import { getImageBounds, pageDimensions } from '@app/tools/pdfJsonEditor/pdfJsonEditorUtils';

@ -205,6 +208,9 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
  const [activeImageId, setActiveImageId] = useState<string | null>(null);
  const [fontFamilies, setFontFamilies] = useState<Map<string, string>>(new Map());
  const [textGroupsExpanded, setTextGroupsExpanded] = useState(false);
  const [autoScaleText, setAutoScaleText] = useState(true);
  const [textScales, setTextScales] = useState<Map<string, number>>(new Map());
  const measurementKeyRef = useRef<string>('');
  const containerRef = useRef<HTMLDivElement | null>(null);
  const editorRefs = useRef<Map<string, HTMLDivElement>>(new Map());
  const caretOffsetsRef = useRef<Map<string, number>>(new Map());
@ -220,6 +226,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
    errorMessage,
    isGeneratingPdf,
    isConverting,
    conversionProgress,
    hasChanges,
    onLoadJson,
    onSelectPage,
@ -562,8 +569,73 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
    setActiveGroupId(null);
    setEditingGroupId(null);
    setActiveImageId(null);
    setTextScales(new Map());
    measurementKeyRef.current = '';
  }, [selectedPage]);

  // Measure text widths once per page/configuration and apply static scaling
  useLayoutEffect(() => {
    if (!autoScaleText || visibleGroups.length === 0) {
      return;
    }

    // Create a stable key for this measurement configuration
    const currentKey = `${selectedPage}-${fontFamilies.size}-${autoScaleText}`;

    // Skip if we've already measured for this configuration
    if (measurementKeyRef.current === currentKey) {
      return;
    }

    const measureTextScales = () => {
      const newScales = new Map<string, number>();

      visibleGroups.forEach((group) => {
        // Skip groups that are being edited
        if (editingGroupId === group.id) {
          return;
        }

        const element = document.querySelector<HTMLElement>(`[data-text-group="${group.id}"]`);
        if (!element) {
          return;
        }

        const textSpan = element.querySelector<HTMLSpanElement>('span[data-text-content]');
        if (!textSpan) {
          return;
        }

        // Temporarily remove any existing transform to get natural width
        const originalTransform = textSpan.style.transform;
        textSpan.style.transform = 'none';

        const bounds = toCssBounds(currentPage, pageHeight, scale, group.bounds);
        const containerWidth = bounds.width;
        const textWidth = textSpan.getBoundingClientRect().width;

        // Restore original transform
        textSpan.style.transform = originalTransform;

        // Only scale if text overflows by more than 2%
        if (textWidth > 0 && textWidth > containerWidth * 1.02) {
          const scaleX = Math.max(containerWidth / textWidth, 0.5); // Min 50% scale
          newScales.set(group.id, scaleX);
        } else {
          newScales.set(group.id, 1);
        }
      });

      // Mark this configuration as measured
      measurementKeyRef.current = currentKey;
      setTextScales(newScales);
    };

    // Delay measurement to ensure fonts and layout are ready
    const timer = setTimeout(measureTextScales, 150);
    return () => clearTimeout(timer);
  }, [autoScaleText, visibleGroups, editingGroupId, currentPage, pageHeight, scale, fontFamilies.size, selectedPage]);

  useLayoutEffect(() => {
    if (!editingGroupId) {
      return;
@ -726,6 +798,27 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
            {t('pdfJsonEditor.currentFile', 'Current file: {{name}}', { name: fileName })}
          </Text>
        )}

        <Divider my="sm" />

        <Group justify="space-between" align="center">
          <div>
            <Text fw={500} size="sm">
              {t('pdfJsonEditor.options.autoScaleText.title', 'Auto-scale text to fit boxes')}
            </Text>
            <Text size="xs" c="dimmed" mt={4}>
              {t(
                'pdfJsonEditor.options.autoScaleText.description',
                'Automatically scales text horizontally to fit within its original bounding box when font rendering differs from the PDF.'
              )}
            </Text>
          </div>
          <Switch
            size="md"
            checked={autoScaleText}
            onChange={(event) => setAutoScaleText(event.currentTarget.checked)}
          />
        </Group>
      </Stack>
    </Card>

@ -782,10 +875,39 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {

    {isConverting && (
      <Card withBorder radius="md" padding="xl">
        <Stack align="center" gap="md">
          <AutorenewIcon sx={{ fontSize: 48 }} className="animate-spin" />
          <Text size="lg" fw={600}>
            {t('pdfJsonEditor.converting', 'Converting PDF to editable format...')}
        <Stack gap="md">
          <Group justify="space-between" align="flex-start">
            <div style={{ flex: 1 }}>
              <Text size="lg" fw={600} mb="xs">
                {conversionProgress
                  ? conversionProgress.message
                  : t('pdfJsonEditor.converting', 'Converting PDF to editable format...')}
              </Text>
              {conversionProgress && (
                <Group gap="xs">
                  <Text size="sm" c="dimmed" tt="capitalize">
                    {t(`pdfJsonEditor.stages.${conversionProgress.stage}`, conversionProgress.stage)}
                  </Text>
                  {conversionProgress.current !== undefined &&
                    conversionProgress.total !== undefined && (
                      <Text size="sm" c="dimmed">
                        • Page {conversionProgress.current} of {conversionProgress.total}
                      </Text>
                    )}
                </Group>
              )}
            </div>
            <AutorenewIcon sx={{ fontSize: 36 }} className="animate-spin" />
          </Group>
          <Progress
            value={conversionProgress?.percent || 0}
            size="lg"
            radius="md"
            animated
            striped
          />
          <Text size="sm" c="dimmed" ta="right">
            {conversionProgress?.percent || 0}% complete
          </Text>
        </Stack>
      </Card>
@ -1105,6 +1227,9 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
      );
    }

    const textScale = textScales.get(group.id) ?? 1;
    const shouldScale = autoScaleText && textScale < 0.98;

    return (
      <Box key={group.id} style={containerStyle}>
        {renderGroupContainer(
@ -1112,6 +1237,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
          isActive,
          changed,
          <div
            data-text-group={group.id}
            style={{
              width: '100%',
              minHeight: '100%',
@ -1127,7 +1253,17 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
              overflow: 'visible',
            }}
          >
            <span style={{ pointerEvents: 'none' }}>{group.text || '\u00A0'}</span>
            <span
              data-text-content
              style={{
                pointerEvents: 'none',
                display: 'inline-block',
                transform: shouldScale ? `scaleX(${textScale})` : undefined,
                transformOrigin: 'left center',
              }}
            >
              {group.text || '\u00A0'}
            </span>
          </div>,
          () => {
            setEditingGroupId(group.id);

@ -27,8 +27,8 @@ export function useProprietaryToolRegistry(): ProprietaryToolRegistry {
        "home.pdfJsonEditor.desc",
        "Review and edit Stirling PDF JSON exports with grouped text editing and PDF regeneration"
      ),
      categoryId: ToolCategoryId.ADVANCED_TOOLS,
      subcategoryId: SubcategoryId.DEVELOPER_TOOLS,
      categoryId: ToolCategoryId.RECOMMENDED_TOOLS,
      subcategoryId: SubcategoryId.GENERAL,
      workbench: "custom:pdfJsonEditor",
      endpoints: ["json-pdf"],
      synonyms: getSynonyms(t, "pdfJsonEditor"),

@ -13,6 +13,7 @@ import { getFilenameFromHeaders } from '@app/utils/fileResponseUtils';
import {
  PdfJsonDocument,
  PdfJsonImageElement,
  PdfJsonPage,
  TextGroup,
  PdfJsonEditorViewData,
} from './pdfJsonEditorTypes';
@ -68,11 +69,39 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
  const [errorMessage, setErrorMessage] = useState<string | null>(null);
  const [isGeneratingPdf, setIsGeneratingPdf] = useState(false);
  const [isConverting, setIsConverting] = useState(false);
  const [conversionProgress, setConversionProgress] = useState<{
    percent: number;
    stage: string;
    message: string;
  } | null>(null);

  // Lazy loading state
  const [isLazyMode, setIsLazyMode] = useState(false);
  const [cachedJobId, setCachedJobId] = useState<string | null>(null);
  const [loadedImagePages, setLoadedImagePages] = useState<Set<number>>(new Set());
  const [loadingImagePages, setLoadingImagePages] = useState<Set<number>>(new Set());

  const originalImagesRef = useRef<PdfJsonImageElement[][]>([]);
  const imagesByPageRef = useRef<PdfJsonImageElement[][]>([]);
  const autoLoadKeyRef = useRef<string | null>(null);
  const loadRequestIdRef = useRef(0);
  const latestPdfRequestIdRef = useRef<number | null>(null);
  const loadedDocumentRef = useRef<PdfJsonDocument | null>(null);
  const loadedImagePagesRef = useRef<Set<number>>(new Set());
  const loadingImagePagesRef = useRef<Set<number>>(new Set());

  // Keep ref in sync with state for access in async callbacks
  useEffect(() => {
    loadedDocumentRef.current = loadedDocument;
  }, [loadedDocument]);

  useEffect(() => {
    loadedImagePagesRef.current = new Set(loadedImagePages);
  }, [loadedImagePages]);

  useEffect(() => {
    loadingImagePagesRef.current = new Set(loadingImagePages);
  }, [loadingImagePages]);

  const dirtyPages = useMemo(
    () => getDirtyPages(groupsByPage, imagesByPage, originalImagesRef.current),
@ -88,18 +117,134 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
      setGroupsByPage([]);
      setImagesByPage([]);
      originalImagesRef.current = [];
      imagesByPageRef.current = [];
      setLoadedImagePages(new Set());
      setLoadingImagePages(new Set());
      loadedImagePagesRef.current = new Set();
      loadingImagePagesRef.current = new Set();
      setSelectedPage(0);
      return;
    }
    const cloned = deepCloneDocument(document);
    const groups = groupDocumentText(cloned);
    const images = extractDocumentImages(cloned);
    originalImagesRef.current = images.map((page) => page.map(cloneImageElement));
    const originalImages = images.map((page) => page.map(cloneImageElement));
    originalImagesRef.current = originalImages;
    imagesByPageRef.current = images.map((page) => page.map(cloneImageElement));
    const initialLoaded = new Set<number>();
    originalImages.forEach((pageImages, index) => {
      if (pageImages.length > 0) {
        initialLoaded.add(index);
      }
    });
    setGroupsByPage(groups);
    setImagesByPage(images);
    setLoadedImagePages(initialLoaded);
    setLoadingImagePages(new Set());
    loadedImagePagesRef.current = new Set(initialLoaded);
    loadingImagePagesRef.current = new Set();
    setSelectedPage(0);
  }, []);

  // Load images for a page in lazy mode
  const loadImagesForPage = useCallback(
    async (pageIndex: number) => {
      if (!isLazyMode) {
        return;
      }
      if (!cachedJobId) {
        console.log('[loadImagesForPage] No cached jobId, skipping');
        return;
      }
      if (
        loadedImagePagesRef.current.has(pageIndex) ||
        loadingImagePagesRef.current.has(pageIndex)
      ) {
        return;
      }

      loadingImagePagesRef.current.add(pageIndex);
      setLoadingImagePages((prev) => {
        const next = new Set(prev);
        next.add(pageIndex);
        return next;
      });

      const pageNumber = pageIndex + 1;
      const start = performance.now();

      try {
        const response = await apiClient.get(
          `/api/v1/convert/pdf/json/page/${cachedJobId}/${pageNumber}`,
          {
            responseType: 'json',
          },
        );

        const pageData = response.data as PdfJsonPage;
        const normalizedImages = (pageData.imageElements ?? []).map(cloneImageElement);

        if (imagesByPageRef.current.length <= pageIndex) {
          imagesByPageRef.current.length = pageIndex + 1;
        }
        imagesByPageRef.current[pageIndex] = normalizedImages.map(cloneImageElement);

        setLoadedDocument((prevDoc) => {
          if (!prevDoc || !prevDoc.pages) {
            return prevDoc;
          }
          const nextPages = [...prevDoc.pages];
          const existingPage = nextPages[pageIndex] ?? {};
          nextPages[pageIndex] = {
            ...existingPage,
            imageElements: normalizedImages.map(cloneImageElement),
          };
          return {
            ...prevDoc,
            pages: nextPages,
          };
        });

        setImagesByPage((prev) => {
          const next = [...prev];
          while (next.length <= pageIndex) {
            next.push([]);
          }
          next[pageIndex] = normalizedImages.map(cloneImageElement);
          return next;
        });

        if (originalImagesRef.current.length <= pageIndex) {
          originalImagesRef.current.length = pageIndex + 1;
        }
        originalImagesRef.current[pageIndex] = normalizedImages.map(cloneImageElement);

        setLoadedImagePages((prev) => {
          const next = new Set(prev);
          next.add(pageIndex);
          return next;
        });
        loadedImagePagesRef.current.add(pageIndex);

        console.log(
          `[loadImagesForPage] Loaded ${normalizedImages.length} images for page ${pageNumber} in ${(
            performance.now() - start
          ).toFixed(2)}ms`,
        );
      } catch (error) {
        console.error(`[loadImagesForPage] Failed to load images for page ${pageNumber}:`, error);
      } finally {
        loadingImagePagesRef.current.delete(pageIndex);
        setLoadingImagePages((prev) => {
          const next = new Set(prev);
          next.delete(pageIndex);
          return next;
        });
      }
    },
    [isLazyMode, cachedJobId],
  );

  const handleLoadFile = useCallback(
    async (file: File | null) => {
      if (!file) {
@ -113,39 +258,200 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
      const isPdf = file.type === 'application/pdf' || file.name.toLowerCase().endsWith('.pdf');

      try {
        let parsed: PdfJsonDocument;
        let parsed: PdfJsonDocument | null = null;
        let shouldUseLazyMode = false;
        let pendingJobId: string | null = null;

        setErrorMessage(null);

        if (isPdf) {
          latestPdfRequestIdRef.current = requestId;
          setIsConverting(true);
          setConversionProgress({
            percent: 0,
            stage: 'uploading',
            message: 'Uploading PDF file to server...',
          });

          const formData = new FormData();
          formData.append('fileInput', file);

          const response = await apiClient.post(CONVERSION_ENDPOINTS['pdf-json'], formData, {
            responseType: 'blob',
          console.log('Sending conversion request with async=true');
          const response = await apiClient.post(
            `${CONVERSION_ENDPOINTS['pdf-json']}?async=true`,
            formData,
            {
              responseType: 'json',
            },
          );

          console.log('Conversion response:', response.data);
          const jobId = response.data.jobId;

          if (!jobId) {
            console.error('No job ID in response:', response.data);
            throw new Error('No job ID received from server');
          }

          pendingJobId = jobId;
          console.log('Got job ID:', jobId);
          setConversionProgress({
            percent: 3,
            stage: 'processing',
            message: 'Starting conversion...',
          });

          const jsonText = await response.data.text();
          parsed = JSON.parse(jsonText) as PdfJsonDocument;
          let jobComplete = false;
          let attempts = 0;
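          // Poll once per second, for up to ten minutes, before giving up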
          const maxAttempts = 600;

          while (!jobComplete && attempts < maxAttempts) {
            await new Promise((resolve) => setTimeout(resolve, 1000));
            attempts += 1;

            try {
              const statusResponse = await apiClient.get(`/api/v1/general/job/${jobId}`);
              const jobStatus = statusResponse.data;
              console.log(`Job status (attempt ${attempts}):`, jobStatus);

              if (jobStatus.notes && jobStatus.notes.length > 0) {
                const lastNote = jobStatus.notes[jobStatus.notes.length - 1];
                console.log('Latest note:', lastNote);
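                // Progress notes follow the server's "[NN%] stage: message (current/total)"
                // format; the trailing "(current/total)" group is optional.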
                const matchWithCount = lastNote.match(
                  /\[(\d+)%\]\s+(\w+):\s+(.+?)\s+\((\d+)\/(\d+)\)/,
                );
                if (matchWithCount) {
                  const percent = parseInt(matchWithCount[1], 10);
                  const stage = matchWithCount[2];
                  const message = matchWithCount[3];
                  const current = parseInt(matchWithCount[4], 10);
                  const total = parseInt(matchWithCount[5], 10);
                  setConversionProgress({
                    percent,
                    stage,
                    message,
                    current,
                    total,
                  });
                } else {
                  const match = lastNote.match(/\[(\d+)%\]\s+(\w+):\s+(.+)/);
                  if (match) {
                    const percent = parseInt(match[1], 10);
                    const stage = match[2];
                    const message = match[3];
                    setConversionProgress({
                      percent,
                      stage,
                      message,
                    });
                  }
                }
              } else if (jobStatus.progress !== undefined) {
                const percent = Math.min(Math.max(jobStatus.progress, 0), 100);
                setConversionProgress({
                  percent,
                  stage: jobStatus.stage || 'processing',
                  message: jobStatus.note || 'Converting PDF to JSON...',
                });
              }

              if (jobStatus.complete) {
                if (jobStatus.error) {
                  console.error('Job failed:', jobStatus.error);
                  throw new Error(jobStatus.error);
                }

                console.log('Job completed, retrieving JSON result...');
                jobComplete = true;

                const resultResponse = await apiClient.get(
                  `/api/v1/general/job/${jobId}/result`,
                  {
                    responseType: 'blob',
                  },
                );

                const jsonText = await resultResponse.data.text();
                const result = JSON.parse(jsonText);

                if (!Array.isArray(result.pages)) {
                  console.error('Conversion result missing page array:', result);
                  throw new Error(
                    'PDF conversion result did not include page data. Please update the server.',
                  );
                }

                const docResult = result as PdfJsonDocument;
                parsed = {
                  ...docResult,
                  pages: docResult.pages ?? [],
                };
                shouldUseLazyMode = Boolean(docResult.lazyImages);
                pendingJobId = shouldUseLazyMode ? jobId : null;
                setConversionProgress(null);
              } else {
                console.log('Job not complete yet, continuing to poll...');
              }
            } catch (pollError: any) {
              console.error('Error polling job status:', pollError);
              console.error('Poll error details:', {
                status: pollError?.response?.status,
                data: pollError?.response?.data,
                message: pollError?.message,
              });
              if (pollError?.response?.status === 404) {
                throw new Error('Job not found on server');
              }
            }
          }

          if (!jobComplete) {
            throw new Error('Conversion timed out');
          }
          if (!parsed) {
            throw new Error('Conversion did not return JSON content');
          }
        } else {
          const content = await file.text();
          parsed = JSON.parse(content) as PdfJsonDocument;
          const docResult = JSON.parse(content) as PdfJsonDocument;
          parsed = {
            ...docResult,
            pages: docResult.pages ?? [],
          };
          shouldUseLazyMode = false;
          pendingJobId = null;
        }

        setConversionProgress(null);

        if (loadRequestIdRef.current !== requestId) {
          return;
        }

        if (!parsed) {
          throw new Error('Failed to parse PDF JSON document');
        }

        console.log(
          `[PdfJsonEditor] Document loaded. Lazy image mode: ${shouldUseLazyMode}, Pages: ${
            parsed.pages?.length || 0
          }`,
        );

        setLoadedDocument(parsed);
        resetToDocument(parsed);
        setIsLazyMode(shouldUseLazyMode);
        setCachedJobId(shouldUseLazyMode ? pendingJobId : null);
        setFileName(file.name);
        setErrorMessage(null);
        autoLoadKeyRef.current = fileKey;
      } catch (error) {
      } catch (error: any) {
        console.error('Failed to load file', error);
        console.error('Error details:', {
          message: error?.message,
          response: error?.response?.data,
          stack: error?.stack,
        });

        if (loadRequestIdRef.current !== requestId) {
          return;
@ -155,15 +461,17 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
        resetToDocument(null);

        if (isPdf) {
          setErrorMessage(
            t('pdfJsonEditor.conversionFailed', 'Failed to convert PDF. Please try again.')
          );
          const errorMsg =
            error?.message ||
            t('pdfJsonEditor.conversionFailed', 'Failed to convert PDF. Please try again.');
          setErrorMessage(errorMsg);
          console.error('Setting error message:', errorMsg);
        } else {
          setErrorMessage(
            t(
              'pdfJsonEditor.errors.invalidJson',
              'Unable to read the JSON file. Ensure it was generated by the PDF to JSON tool.'
            )
              'Unable to read the JSON file. Ensure it was generated by the PDF to JSON tool.',
            ),
          );
        }
      } finally {
@ -172,12 +480,16 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
        }
      }
    },
    [resetToDocument, t]
    [resetToDocument, t],
  );

  const handleSelectPage = useCallback((pageIndex: number) => {
    setSelectedPage(pageIndex);
  }, []);
    // Trigger lazy loading for images on the selected page
    if (isLazyMode) {
      void loadImagesForPage(pageIndex);
    }
  }, [isLazyMode, loadImagesForPage]);

  const handleGroupTextChange = useCallback((pageIndex: number, groupId: string, value: string) => {
    setGroupsByPage((previous) =>
@ -195,55 +507,63 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
      imageId: string,
      next: { left: number; bottom: number; width: number; height: number; transform: number[] },
    ) => {
      setImagesByPage((previous) =>
        previous.map((images, idx) => {
          if (idx !== pageIndex) {
            return images;
      setImagesByPage((previous) => {
        const current = previous[pageIndex] ?? [];
        let changed = false;
        const updatedPage = current.map((image) => {
          if ((image.id ?? '') !== imageId) {
            return image;
          }
          let changed = false;
          const updated = images.map((image) => {
            if ((image.id ?? '') !== imageId) {
              return image;
            }
            const originalTransform = image.transform ?? originalImagesRef.current[idx]?.find((base) => (base.id ?? '') === imageId)?.transform;
            const scaleXSign = originalTransform && originalTransform.length >= 6 ? Math.sign(originalTransform[0]) || 1 : 1;
            const scaleYSign = originalTransform && originalTransform.length >= 6 ? Math.sign(originalTransform[3]) || 1 : 1;
            const right = next.left + next.width;
            const top = next.bottom + next.height;
            const updatedImage: PdfJsonImageElement = {
              ...image,
              x: next.left,
              y: next.bottom,
              left: next.left,
              bottom: next.bottom,
              right,
              top,
              width: next.width,
              height: next.height,
              transform: scaleXSign < 0 || scaleYSign < 0 ? [
                next.width * scaleXSign,
                0,
                0,
                next.height * scaleYSign,
                next.left,
                scaleYSign >= 0 ? next.bottom : next.bottom + next.height,
              ] : null,
            };
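          // Preserve any mirroring from the original placement: in the six-value PDF
          // matrix [a b c d e f], negative a or d flips the image horizontally or vertically.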
          const originalTransform = image.transform ?? originalImagesRef.current[pageIndex]?.find((base) => (base.id ?? '') === imageId)?.transform;
          const scaleXSign = originalTransform && originalTransform.length >= 6 ? Math.sign(originalTransform[0]) || 1 : 1;
          const scaleYSign = originalTransform && originalTransform.length >= 6 ? Math.sign(originalTransform[3]) || 1 : 1;
          const right = next.left + next.width;
          const top = next.bottom + next.height;
          const updatedImage: PdfJsonImageElement = {
            ...image,
            x: next.left,
            y: next.bottom,
            left: next.left,
            bottom: next.bottom,
            right,
            top,
            width: next.width,
            height: next.height,
            transform: scaleXSign < 0 || scaleYSign < 0
              ? [
                  next.width * scaleXSign,
                  0,
                  0,
                  next.height * scaleYSign,
                  next.left,
                  scaleYSign >= 0 ? next.bottom : next.bottom + next.height,
                ]
              : null,
          };

          const isSame =
            Math.abs(valueOr(image.left, 0) - next.left) < 1e-4 &&
            Math.abs(valueOr(image.bottom, 0) - next.bottom) < 1e-4 &&
            Math.abs(valueOr(image.width, 0) - next.width) < 1e-4 &&
            Math.abs(valueOr(image.height, 0) - next.height) < 1e-4;
          const isSame =
            Math.abs(valueOr(image.left, 0) - next.left) < 1e-4 &&
            Math.abs(valueOr(image.bottom, 0) - next.bottom) < 1e-4 &&
            Math.abs(valueOr(image.width, 0) - next.width) < 1e-4 &&
            Math.abs(valueOr(image.height, 0) - next.height) < 1e-4;

          if (!isSame) {
            changed = true;
          }
          return updatedImage;
        });
        return changed ? updated : images;
      }),
    );
          if (!isSame) {
            changed = true;
          }
          return updatedImage;
        });

        if (!changed) {
          return previous;
        }

        const nextImages = previous.map((images, idx) => (idx === pageIndex ? updatedPage : images));
        if (imagesByPageRef.current.length <= pageIndex) {
          imagesByPageRef.current.length = pageIndex + 1;
        }
        imagesByPageRef.current[pageIndex] = updatedPage.map(cloneImageElement);
        return nextImages;
      });
    },
    [],
  );
@ -253,14 +573,28 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
      if (!baseline) {
        return;
      }
      setImagesByPage((previous) =>
        previous.map((images, idx) => {
          if (idx !== pageIndex) {
            return images;
      setImagesByPage((previous) => {
        const current = previous[pageIndex] ?? [];
        let changed = false;
        const updatedPage = current.map((image) => {
          if ((image.id ?? '') !== imageId) {
            return image;
          }
          return images.map((image) => ((image.id ?? '') === imageId ? cloneImageElement(baseline) : image));
        }),
      );
          changed = true;
          return cloneImageElement(baseline);
        });

        if (!changed) {
          return previous;
        }

        const nextImages = previous.map((images, idx) => (idx === pageIndex ? updatedPage : images));
        if (imagesByPageRef.current.length <= pageIndex) {
          imagesByPageRef.current.length = pageIndex + 1;
        }
        imagesByPageRef.current[pageIndex] = updatedPage.map(cloneImageElement);
        return nextImages;
      });
    }, []);

    const handleResetEdits = useCallback(() => {
@ -279,7 +613,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
    const updatedDocument = restoreGlyphElements(
      loadedDocument,
      groupsByPage,
      imagesByPage,
      imagesByPageRef.current,
      originalImagesRef.current,
    );
    const baseName = sanitizeBaseName(fileName || loadedDocument.metadata?.title || undefined);
@ -287,7 +621,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
      document: updatedDocument,
      filename: `${baseName}.json`,
    };
  }, [fileName, groupsByPage, imagesByPage, loadedDocument]);
  }, [fileName, groupsByPage, loadedDocument]);

  const handleDownloadJson = useCallback(() => {
    const payload = buildPayload();
@ -306,20 +640,129 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
  }, [buildPayload, onComplete]);

  const handleGeneratePdf = useCallback(async () => {
    const payload = buildPayload();
    if (!payload) {
      return;
    }

    const { document, filename } = payload;
    const serialized = JSON.stringify(document, null, 2);
    const jsonFile = new File([serialized], filename, { type: 'application/json' });

    const formData = new FormData();
    formData.append('fileInput', jsonFile);

    try {
      setIsGeneratingPdf(true);

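      // In lazy mode image data arrives per page on demand, so every page that will be
      // serialized into the export payload must have its images loaded first.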
      const ensureImagesForPages = async (pageIndices: number[]) => {
        const uniqueIndices = Array.from(new Set(pageIndices)).filter((index) => index >= 0);
        if (uniqueIndices.length === 0) {
          return;
        }

        for (const index of uniqueIndices) {
          if (!loadedImagePagesRef.current.has(index)) {
            await loadImagesForPage(index);
          }
        }

        const maxWaitTime = 15000;
        const pollInterval = 150;
        const startWait = Date.now();
        while (Date.now() - startWait < maxWaitTime) {
          const allLoaded = uniqueIndices.every(
            (index) =>
              loadedImagePagesRef.current.has(index) &&
              imagesByPageRef.current[index] !== undefined,
          );
          const anyLoading = uniqueIndices.some((index) =>
            loadingImagePagesRef.current.has(index),
          );
          if (allLoaded && !anyLoading) {
            return;
          }
          await new Promise((resolve) => setTimeout(resolve, pollInterval));
        }

        const missing = uniqueIndices.filter(
          (index) => !loadedImagePagesRef.current.has(index),
        );
        if (missing.length > 0) {
          throw new Error(
            `Failed to load images for pages ${missing.map((i) => i + 1).join(', ')}`,
          );
        }
      };

      const currentDoc = loadedDocumentRef.current;
      const totalPages = currentDoc?.pages?.length ?? 0;
      const dirtyPageIndices = dirtyPages
        .map((isDirty, index) => (isDirty ? index : -1))
        .filter((index) => index >= 0);

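      // Incremental export only pays off when some, but not all, pages are dirty;
      // otherwise fall through to the full-document export below.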
const canUseIncremental =
|
||||
isLazyMode &&
|
||||
cachedJobId &&
|
||||
dirtyPageIndices.length > 0 &&
|
||||
dirtyPageIndices.length < totalPages;
|
||||
|
||||
      if (canUseIncremental) {
        await ensureImagesForPages(dirtyPageIndices);

        try {
          const payload = buildPayload();
          if (!payload) {
            return;
          }

          const { document, filename } = payload;
          const dirtyPageSet = new Set(dirtyPageIndices);
          const partialPages =
            document.pages?.filter((_, index) => dirtyPageSet.has(index)) ?? [];

          const partialDocument: PdfJsonDocument = {
            metadata: document.metadata,
            xmpMetadata: document.xmpMetadata,
            fonts: document.fonts,
            lazyImages: true,
            pages: partialPages,
          };

          const baseName = sanitizeBaseName(filename).replace(/-edited$/u, '');
          const expectedName = `${baseName || 'document'}.pdf`;
          const response = await apiClient.post(
            `/api/v1/convert/pdf/json/partial/${cachedJobId}?filename=${encodeURIComponent(expectedName)}`,
            partialDocument,
            {
              responseType: 'blob',
            },
          );

          const contentDisposition = response.headers?.['content-disposition'] ?? '';
          const detectedName = getFilenameFromHeaders(contentDisposition);
          const downloadName = detectedName || expectedName;

          downloadBlob(response.data, downloadName);

          if (onComplete) {
            const pdfFile = new File([response.data], downloadName, { type: 'application/pdf' });
            onComplete([pdfFile]);
          }
          setErrorMessage(null);
          return;
        } catch (incrementalError) {
          console.warn(
            '[handleGeneratePdf] Incremental export failed, falling back to full export',
            incrementalError,
          );
        }
      }

      if (isLazyMode && totalPages > 0) {
        const allPageIndices = Array.from({ length: totalPages }, (_, index) => index);
        await ensureImagesForPages(allPageIndices);
      }

      const payload = buildPayload();
      if (!payload) {
        return;
      }

      const { document, filename } = payload;
      const serialized = JSON.stringify(document, null, 2);
      const jsonFile = new File([serialized], filename, { type: 'application/json' });

      const formData = new FormData();
      formData.append('fileInput', jsonFile);
      const response = await apiClient.post(CONVERSION_ENDPOINTS['json-pdf'], formData, {
        responseType: 'blob',
      });
@@ -350,7 +793,16 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
    } finally {
      setIsGeneratingPdf(false);
    }
  }, [buildPayload, onComplete, onError, t]);
  }, [
    buildPayload,
    cachedJobId,
    dirtyPages,
    isLazyMode,
    loadImagesForPage,
    onComplete,
    onError,
    t,
  ]);
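
For reference, the incremental round trip can be exercised outside the component. A minimal sketch, assuming a cachedJobId obtained from the initial lazy conversion and the partial-export endpoint shown above (the plain fetch wrapper and error handling here are illustrative, not the app's apiClient):

// Sketch: re-export only edited pages against a server-side cached document.
// Assumes `partialDocument` carries just the dirty pages, as built above.
async function exportDirtyPages(
  cachedJobId: string,
  partialDocument: unknown,
  expectedName: string,
): Promise<Blob> {
  const url =
    `/api/v1/convert/pdf/json/partial/${cachedJobId}` +
    `?filename=${encodeURIComponent(expectedName)}`;
  const response = await fetch(url, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(partialDocument),
  });
  if (!response.ok) {
    // Callers fall back to a full export, mirroring the component's catch block.
    throw new Error(`Partial export failed: ${response.status}`);
  }
  return response.blob(); // the merged, fully rendered PDF
}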
  const viewData = useMemo<PdfJsonEditorViewData>(() => ({
    document: loadedDocument,
@@ -363,6 +815,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
    errorMessage,
    isGeneratingPdf,
    isConverting,
    conversionProgress,
    hasChanges,
    onLoadJson: handleLoadFile,
    onSelectPage: handleSelectPage,
@@ -390,6 +843,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
    hasDocument,
    isGeneratingPdf,
    isConverting,
    conversionProgress,
    loadedDocument,
    selectedPage,
  ]);
@@ -397,6 +851,13 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
  const latestViewDataRef = useRef<PdfJsonEditorViewData>(viewData);
  latestViewDataRef.current = viewData;

  // Trigger initial image loading in lazy mode
  useEffect(() => {
    if (isLazyMode && loadedDocument) {
      void loadImagesForPage(selectedPage);
    }
  }, [isLazyMode, loadedDocument, selectedPage, loadImagesForPage]);

  useEffect(() => {
    if (selectedFiles.length === 0) {
      autoLoadKeyRef.current = null;
@@ -433,11 +894,20 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
    setCustomWorkbenchViewData(VIEW_ID, latestViewDataRef.current);

    return () => {
      // Clear backend cache if we were using lazy loading
      if (cachedJobId) {
        console.log(`[PdfJsonEditor] Cleaning up cached document for jobId: ${cachedJobId}`);
        apiClient.post(`/api/v1/convert/pdf/json/clear-cache/${cachedJobId}`).catch((error) => {
          console.warn('[PdfJsonEditor] Failed to clear cache:', error);
        });
      }

      clearCustomWorkbenchViewData(VIEW_ID);
      unregisterCustomWorkbenchView(VIEW_ID);
      setLeftPanelView('toolPicker');
    };
  }, [
    cachedJobId,
    clearCustomWorkbenchViewData,
    registerCustomWorkbenchView,
    setCustomWorkbenchViewData,

@@ -122,6 +122,23 @@ export interface PdfJsonDocument {
  xmpMetadata?: string | null;
  fonts?: PdfJsonFont[] | null;
  pages?: PdfJsonPage[] | null;
  lazyImages?: boolean | null;
}

export interface PdfJsonPageDimension {
  pageNumber?: number | null;
  width?: number | null;
  height?: number | null;
  rotation?: number | null;
}

export interface PdfJsonDocumentMetadata {
  metadata?: PdfJsonMetadata | null;
  xmpMetadata?: string | null;
  fonts?: PdfJsonFont[] | null;
  pageDimensions?: PdfJsonPageDimension[] | null;
  formFields?: unknown[] | null;
  lazyImages?: boolean | null;
}

export interface BoundingBox {
@@ -153,6 +170,14 @@ export interface TextGroup {
export const DEFAULT_PAGE_WIDTH = 612;
export const DEFAULT_PAGE_HEIGHT = 792;

export interface ConversionProgress {
  percent: number;
  stage: string;
  message: string;
  current?: number;
  total?: number;
}
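
A ConversionProgress value is straightforward to produce on the emitting side. For example (an illustrative helper, not part of the editor's code):

// Sketch: derive a ConversionProgress update from a counted work loop.
function progressFor(stage: string, current: number, total: number): ConversionProgress {
  const percent = total > 0 ? Math.round((current / total) * 100) : 0;
  return {
    percent,
    stage,
    message: `${stage}: ${current}/${total}`,
    current,
    total,
  };
}

// e.g. progressFor('images', 3, 12) -> { percent: 25, stage: 'images', ... }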

export interface PdfJsonEditorViewData {
  document: PdfJsonDocument | null;
  groupsByPage: TextGroup[][];
@@ -164,6 +189,7 @@ export interface PdfJsonEditorViewData {
  errorMessage: string | null;
  isGeneratingPdf: boolean;
  isConverting: boolean;
  conversionProgress: ConversionProgress | null;
  hasChanges: boolean;
  onLoadJson: (file: File | null) => Promise<void> | void;
  onSelectPage: (pageIndex: number) => void;

@@ -15,6 +15,7 @@ export default defineConfig({
    }),
  ],
  server: {
    host: true,
    proxy: {
      '/api': {
        target: 'http://localhost:8080',
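
With host: true the Vite dev server also listens on non-localhost interfaces, and the /api proxy forwards API requests to the Spring backend on port 8080, so frontend code can keep using relative URLs in development. A small illustration (the path is a placeholder, not a real Stirling-PDF endpoint):

async function pingBackend(): Promise<number> {
  // Any /api/* route is forwarded to http://localhost:8080 by the dev proxy.
  const response = await fetch('/api/v1/some-endpoint');
  return response.status;
}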