Editor revamp: complete change set (PDF/JSON editor, lazy page loading, job-context propagation)

This commit is contained in:
Anthony Stirling 2025-11-02 21:00:03 +00:00
parent ec0ae36a82
commit bbcb23ca11
25 changed files with 3747 additions and 1021 deletions

View File

@ -148,17 +148,31 @@ public class JobExecutorService {
taskManager.createTask(jobId); taskManager.createTask(jobId);
// Create a specialized wrapper that updates the TaskManager // Create a specialized wrapper that updates the TaskManager
final String capturedJobIdForQueue = jobId;
Supplier<Object> wrappedWork = Supplier<Object> wrappedWork =
() -> { () -> {
try { try {
// Set jobId in ThreadLocal context for the queued job
stirling.software.common.util.JobContext.setJobId(
capturedJobIdForQueue);
log.debug(
"Set jobId {} in JobContext for queued job execution",
capturedJobIdForQueue);
Object result = work.get(); Object result = work.get();
processJobResult(jobId, result); processJobResult(capturedJobIdForQueue, result);
return result; return result;
} catch (Exception e) { } catch (Exception e) {
log.error( log.error(
"Error executing queued job {}: {}", jobId, e.getMessage(), e); "Error executing queued job {}: {}",
taskManager.setError(jobId, e.getMessage()); capturedJobIdForQueue,
e.getMessage(),
e);
taskManager.setError(capturedJobIdForQueue, e.getMessage());
throw e; throw e;
} finally {
// Clean up ThreadLocal to avoid memory leaks
stirling.software.common.util.JobContext.clear();
} }
}; };
@ -170,21 +184,36 @@ public class JobExecutorService {
return ResponseEntity.ok().body(new JobResponse<>(true, jobId, null)); return ResponseEntity.ok().body(new JobResponse<>(true, jobId, null));
} else if (async) { } else if (async) {
taskManager.createTask(jobId); taskManager.createTask(jobId);
// Capture the jobId for the async thread
final String capturedJobId = jobId;
executor.execute( executor.execute(
() -> { () -> {
try { try {
log.debug( log.debug(
"Running async job {} with timeout {} ms", jobId, timeoutToUse); "Running async job {} with timeout {} ms",
capturedJobId,
timeoutToUse);
// Set jobId in ThreadLocal context for the async thread
stirling.software.common.util.JobContext.setJobId(capturedJobId);
log.debug(
"Set jobId {} in JobContext for async execution",
capturedJobId);
// Execute with timeout // Execute with timeout
Object result = executeWithTimeout(() -> work.get(), timeoutToUse); Object result = executeWithTimeout(() -> work.get(), timeoutToUse);
processJobResult(jobId, result); processJobResult(capturedJobId, result);
} catch (TimeoutException te) { } catch (TimeoutException te) {
log.error("Job {} timed out after {} ms", jobId, timeoutToUse); log.error("Job {} timed out after {} ms", jobId, timeoutToUse);
taskManager.setError(jobId, "Job timed out"); taskManager.setError(jobId, "Job timed out");
} catch (Exception e) { } catch (Exception e) {
log.error("Error executing job {}: {}", jobId, e.getMessage(), e); log.error("Error executing job {}: {}", jobId, e.getMessage(), e);
taskManager.setError(jobId, e.getMessage()); taskManager.setError(jobId, e.getMessage());
} finally {
// Clean up ThreadLocal to avoid memory leaks
stirling.software.common.util.JobContext.clear();
} }
}); });
@ -193,6 +222,10 @@ public class JobExecutorService {
try { try {
log.debug("Running sync job with timeout {} ms", timeoutToUse); log.debug("Running sync job with timeout {} ms", timeoutToUse);
// Make jobId available to downstream components on the worker thread
stirling.software.common.util.JobContext.setJobId(jobId);
log.debug("Set jobId {} in JobContext for sync execution", jobId);
// Execute with timeout // Execute with timeout
Object result = executeWithTimeout(() -> work.get(), timeoutToUse); Object result = executeWithTimeout(() -> work.get(), timeoutToUse);
@ -212,6 +245,8 @@ public class JobExecutorService {
// Construct a JSON error response // Construct a JSON error response
return ResponseEntity.internalServerError() return ResponseEntity.internalServerError()
.body(Map.of("error", "Job failed: " + e.getMessage())); .body(Map.of("error", "Job failed: " + e.getMessage()));
} finally {
stirling.software.common.util.JobContext.clear();
} }
} }
} }
@ -456,8 +491,23 @@ public class JobExecutorService {
throws TimeoutException, Exception { throws TimeoutException, Exception {
// Use the same executor as other async jobs for consistency // Use the same executor as other async jobs for consistency
// This ensures all operations run on the same thread pool // This ensures all operations run on the same thread pool
String currentJobId = stirling.software.common.util.JobContext.getJobId();
java.util.concurrent.CompletableFuture<T> future = java.util.concurrent.CompletableFuture<T> future =
java.util.concurrent.CompletableFuture.supplyAsync(supplier, executor); java.util.concurrent.CompletableFuture.supplyAsync(
() -> {
if (currentJobId != null) {
stirling.software.common.util.JobContext.setJobId(currentJobId);
}
try {
return supplier.get();
} finally {
if (currentJobId != null) {
stirling.software.common.util.JobContext.clear();
}
}
},
executor);
try { try {
return future.get(timeoutMs, TimeUnit.MILLISECONDS); return future.get(timeoutMs, TimeUnit.MILLISECONDS);

View File

@ -0,0 +1,18 @@
package stirling.software.common.util;
/**
 * Thread-local context for passing the current job ID across async boundaries.
 *
 * <p>Callers bind a job ID to the current worker thread with {@link #setJobId(String)} before
 * running job work, and MUST call {@link #clear()} in a {@code finally} block afterwards —
 * worker threads are pooled, so a stale ThreadLocal would leak the previous job's ID into the
 * next task scheduled on the same thread.
 */
public final class JobContext {

    /** Per-thread storage for the active job ID; {@code null} when no job is bound. */
    private static final ThreadLocal<String> CURRENT_JOB_ID = new ThreadLocal<>();

    /** Static utility holder — not instantiable. */
    private JobContext() {}

    /**
     * Binds {@code jobId} to the current thread.
     *
     * @param jobId the job identifier to expose to downstream components on this thread
     */
    public static void setJobId(String jobId) {
        CURRENT_JOB_ID.set(jobId);
    }

    /**
     * @return the job ID bound to the current thread, or {@code null} if none is set
     */
    public static String getJobId() {
        return CURRENT_JOB_ID.get();
    }

    /** Removes the current thread's binding; always call from a {@code finally} block. */
    public static void clear() {
        CURRENT_JOB_ID.remove();
    }
}

View File

@ -94,6 +94,7 @@ public class ProcessExecutor {
.getProcessExecutor() .getProcessExecutor()
.getSessionLimit() .getSessionLimit()
.getOcrMyPdfSessionLimit(); .getOcrMyPdfSessionLimit();
case CFF_CONVERTER -> 1;
}; };
long timeoutMinutes = long timeoutMinutes =
@ -148,6 +149,7 @@ public class ProcessExecutor {
.getProcessExecutor() .getProcessExecutor()
.getTimeoutMinutes() .getTimeoutMinutes()
.getOcrMyPdfTimeoutMinutes(); .getOcrMyPdfTimeoutMinutes();
case CFF_CONVERTER -> 5L;
}; };
return new ProcessExecutor(semaphoreLimit, liveUpdates, timeoutMinutes); return new ProcessExecutor(semaphoreLimit, liveUpdates, timeoutMinutes);
}); });
@ -300,7 +302,8 @@ public class ProcessExecutor {
TESSERACT, TESSERACT,
QPDF, QPDF,
GHOSTSCRIPT, GHOSTSCRIPT,
OCR_MY_PDF OCR_MY_PDF,
CFF_CONVERTER
} }
public class ProcessExecutorResult { public class ProcessExecutorResult {

View File

@ -78,6 +78,23 @@ class JobExecutorServiceTest {
verify(request).setAttribute(eq("jobId"), anyString()); verify(request).setAttribute(eq("jobId"), anyString());
} }
@Test
void shouldExposeJobIdInJobContextDuringSyncExecution() throws Exception {
    // Given: the work supplier just returns whatever jobId is visible via JobContext,
    // so the response body directly reveals what the worker thread saw.
    Supplier<Object> work = stirling.software.common.util.JobContext::getJobId;
    // When: run synchronously (async=false)
    ResponseEntity<?> response = jobExecutorService.runJobGeneric(false, work);
    // Then: the jobId stored on the request must equal the jobId the work observed,
    // proving JobContext was populated on the sync execution path.
    assertEquals(HttpStatus.OK, response.getStatusCode());
    assertNotNull(response.getBody());
    var requestJobIdCaptor = ArgumentCaptor.forClass(String.class);
    verify(request).setAttribute(eq("jobId"), requestJobIdCaptor.capture());
    assertEquals(requestJobIdCaptor.getValue(), response.getBody());
}
@Test @Test
void shouldRunAsyncJobSuccessfully() throws Exception { void shouldRunAsyncJobSuccessfully() throws Exception {
// Given // Given

View File

@ -8,6 +8,8 @@ logging.level.org.eclipse.jetty=WARN
#logging.level.stirling.software.proprietary.security=DEBUG #logging.level.stirling.software.proprietary.security=DEBUG
logging.level.com.zaxxer.hikari=WARN logging.level.com.zaxxer.hikari=WARN
logging.level.stirling.software.SPDF.service.PdfJsonConversionService=TRACE logging.level.stirling.software.SPDF.service.PdfJsonConversionService=TRACE
logging.level.stirling.software.common.service.JobExecutorService=DEBUG
logging.level.stirling.software.common.service.TaskManager=DEBUG
spring.jpa.open-in-view=false spring.jpa.open-in-view=false
server.forward-headers-strategy=NATIVE server.forward-headers-strategy=NATIVE
server.error.path=/error server.error.path=/error

View File

@ -1,16 +1,26 @@
package stirling.software.SPDF.controller.api.converters; package stirling.software.SPDF.controller.api.converters;
import java.util.Optional;
import org.springframework.http.MediaType; import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity; import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.ModelAttribute; import org.springframework.web.bind.annotation.ModelAttribute;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.multipart.MultipartFile; import org.springframework.web.multipart.MultipartFile;
import io.github.pixee.security.Filenames; import io.github.pixee.security.Filenames;
import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.Operation;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.config.swagger.StandardPdfResponse; import stirling.software.SPDF.config.swagger.StandardPdfResponse;
import stirling.software.SPDF.model.json.PdfJsonDocument;
import stirling.software.SPDF.model.json.PdfJsonMetadata;
import stirling.software.SPDF.service.PdfJsonConversionService; import stirling.software.SPDF.service.PdfJsonConversionService;
import stirling.software.common.annotations.AutoJobPostMapping; import stirling.software.common.annotations.AutoJobPostMapping;
import stirling.software.common.annotations.api.ConvertApi; import stirling.software.common.annotations.api.ConvertApi;
@ -19,6 +29,7 @@ import stirling.software.common.model.api.PDFFile;
import stirling.software.common.util.ExceptionUtils; import stirling.software.common.util.ExceptionUtils;
import stirling.software.common.util.WebResponseUtils; import stirling.software.common.util.WebResponseUtils;
@Slf4j
@ConvertApi @ConvertApi
@RequiredArgsConstructor @RequiredArgsConstructor
public class ConvertPdfJsonController { public class ConvertPdfJsonController {
@ -71,4 +82,81 @@ public class ConvertPdfJsonController {
String docName = baseName.endsWith(".pdf") ? baseName : baseName + ".pdf"; String docName = baseName.endsWith(".pdf") ? baseName : baseName + ".pdf";
return WebResponseUtils.bytesToWebResponse(pdfBytes, docName); return WebResponseUtils.bytesToWebResponse(pdfBytes, docName);
} }
/**
 * Extracts document-level metadata (fonts, page dimensions, form fields) from an uploaded PDF
 * and returns it as JSON, keyed by the caller-supplied {@code jobId} so that subsequent
 * per-page requests can reuse the cached document.
 *
 * @param request multipart wrapper carrying the input PDF
 * @param jobId client-chosen identifier used as the cache key for later page lookups
 * @return JSON bytes named {@code <basename>_metadata.json}
 * @throws Exception if extraction fails; a null-argument exception if no file was uploaded
 */
@PostMapping(consumes = "multipart/form-data", value = "/pdf/json/metadata")
@Operation(
        summary = "Extract PDF metadata for lazy loading",
        description =
                "Extracts document metadata, fonts, and page dimensions. Caches the document for"
                        + " subsequent page requests. Input:PDF Output:JSON Type:SISO")
public ResponseEntity<byte[]> extractPdfMetadata(
        @ModelAttribute PDFFile request, @RequestParam(required = true) String jobId)
        throws Exception {
    MultipartFile inputFile = request.getFileInput();
    if (inputFile == null) {
        throw ExceptionUtils.createNullArgumentException("fileInput");
    }
    byte[] jsonBytes = pdfJsonConversionService.extractDocumentMetadata(inputFile, jobId);
    String originalName = inputFile.getOriginalFilename();
    // Strip the extension from the uploaded name (sanitized) or fall back to "document".
    String baseName =
            (originalName != null && !originalName.isBlank())
                    ? Filenames.toSimpleFileName(originalName).replaceFirst("[.][^.]+$", "")
                    : "document";
    String docName = baseName + "_metadata.json";
    return WebResponseUtils.bytesToWebResponse(jsonBytes, docName, MediaType.APPLICATION_JSON);
}
/**
 * Applies page-level edits from the supplied JSON document to a previously cached PDF and
 * returns the updated PDF bytes.
 *
 * <p>Requires the PDF to have been cached earlier under {@code jobId} (via the PDF-to-JSON /
 * metadata endpoints); only the pages present in {@code document} are re-exported.
 *
 * @param jobId cache key identifying the previously uploaded PDF
 * @param document edited page content to merge into the cached PDF
 * @param filename optional output name; falls back to the document metadata title, then
 *     "document"
 * @return the updated PDF with a {@code .pdf} file name
 * @throws Exception if the cache entry is missing or export fails
 */
@PostMapping(value = "/pdf/json/partial/{jobId}", consumes = MediaType.APPLICATION_JSON_VALUE)
@StandardPdfResponse
@Operation(
        summary = "Apply incremental edits to a cached PDF",
        description =
                "Applies edits for the specified pages of a cached PDF and returns an updated PDF."
                        + " Requires the PDF to have been previously cached via the PDF to JSON endpoint.")
public ResponseEntity<byte[]> exportPartialPdf(
        @PathVariable String jobId,
        @RequestBody PdfJsonDocument document,
        @RequestParam(value = "filename", required = false) String filename)
        throws Exception {
    if (document == null) {
        throw ExceptionUtils.createNullArgumentException("document");
    }
    byte[] pdfBytes = pdfJsonConversionService.exportUpdatedPages(jobId, document);
    // Name precedence: explicit filename param > metadata title > "document".
    String baseName =
            (filename != null && !filename.isBlank())
                    ? Filenames.toSimpleFileName(filename).replaceFirst("[.][^.]+$", "")
                    : Optional.ofNullable(document.getMetadata())
                            .map(PdfJsonMetadata::getTitle)
                            .filter(title -> title != null && !title.isBlank())
                            .orElse("document");
    String docName = baseName.endsWith(".pdf") ? baseName : baseName + ".pdf";
    return WebResponseUtils.bytesToWebResponse(pdfBytes, docName);
}
/**
 * Returns a single page's content (as JSON) from a PDF previously cached under {@code jobId}.
 *
 * <p>Must be preceded by a call to {@code /pdf/json/metadata} for the same job ID.
 *
 * @param jobId cache key for the previously uploaded PDF
 * @param pageNumber page to extract (numbering convention defined by the conversion service)
 * @return JSON bytes named {@code page_<n>.json}
 * @throws Exception if the cached document or page is unavailable
 */
@GetMapping(value = "/pdf/json/page/{jobId}/{pageNumber}")
@Operation(
        summary = "Extract single page from cached PDF",
        description =
                "Retrieves a single page's content from a previously cached PDF document."
                        + " Requires prior call to /pdf/json/metadata. Output:JSON")
public ResponseEntity<byte[]> extractSinglePage(
        @PathVariable String jobId, @PathVariable int pageNumber) throws Exception {
    byte[] jsonBytes = pdfJsonConversionService.extractSinglePage(jobId, pageNumber);
    String docName = "page_" + pageNumber + ".json";
    return WebResponseUtils.bytesToWebResponse(jsonBytes, docName, MediaType.APPLICATION_JSON);
}
/**
 * Evicts the cached PDF for {@code jobId}, releasing server-side resources early.
 *
 * <p>Eviction also happens automatically after 30 minutes (per the endpoint description);
 * this endpoint lets clients free the cache as soon as editing finishes.
 *
 * @param jobId cache key of the document to drop
 * @return 200 OK with empty body (idempotent — clearing an absent entry is not an error here)
 */
@PostMapping(value = "/pdf/json/clear-cache/{jobId}")
@Operation(
        summary = "Clear cached PDF document",
        description =
                "Manually clears a cached PDF document to free up server resources."
                        + " Called automatically after 30 minutes.")
public ResponseEntity<Void> clearCache(@PathVariable String jobId) {
    pdfJsonConversionService.clearCachedDocument(jobId);
    return ResponseEntity.ok().build();
}
} }

View File

@ -0,0 +1,49 @@
package stirling.software.SPDF.model.api;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Immutable-ish progress snapshot emitted while converting between PDF and JSON.
 *
 * <p>Lombok generates getters/setters, equals/hashCode, a builder, and both constructors.
 * Instances are normally produced through the static factories below.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class PdfJsonConversionProgress {
    // Overall completion in the range 0-100.
    private int percent;
    // Machine-readable phase identifier (e.g. "complete").
    private String stage;
    // Human-readable description of the current phase.
    private String message;
    // True only for the terminal snapshot produced by complete().
    private boolean complete;
    private Integer current; // Current item being processed (e.g., page number); null if N/A
    private Integer total; // Total items to process (e.g., total pages); null if N/A

    /** Builds an in-progress snapshot without per-item counters (current/total stay null). */
    public static PdfJsonConversionProgress of(int percent, String stage, String message) {
        return PdfJsonConversionProgress.builder()
                .percent(percent)
                .stage(stage)
                .message(message)
                .complete(false)
                .build();
    }

    /** Builds an in-progress snapshot including item counters (e.g. page 3 of 10). */
    public static PdfJsonConversionProgress of(
            int percent, String stage, String message, int current, int total) {
        return PdfJsonConversionProgress.builder()
                .percent(percent)
                .stage(stage)
                .message(message)
                .current(current)
                .total(total)
                .complete(false)
                .build();
    }

    /** Builds the terminal 100% snapshot with stage "complete". */
    public static PdfJsonConversionProgress complete() {
        return PdfJsonConversionProgress.builder()
                .percent(100)
                .stage("complete")
                .message("Conversion complete")
                .complete(true)
                .build();
    }
}

View File

@ -22,6 +22,9 @@ public class PdfJsonDocument {
/** Optional XMP metadata packet stored as Base64. */ /** Optional XMP metadata packet stored as Base64. */
private String xmpMetadata; private String xmpMetadata;
/** Indicates that images should be loaded lazily via API rather than embedded in the JSON. */
private Boolean lazyImages;
@Builder.Default private List<PdfJsonFont> fonts = new ArrayList<>(); @Builder.Default private List<PdfJsonFont> fonts = new ArrayList<>();
@Builder.Default private List<PdfJsonPage> pages = new ArrayList<>(); @Builder.Default private List<PdfJsonPage> pages = new ArrayList<>();

View File

@ -0,0 +1,34 @@
package stirling.software.SPDF.model.json;
import java.util.ArrayList;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonInclude;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Document-level metadata payload returned by the lazy-loading metadata endpoint: core
 * metadata, fonts, page dimensions, and form fields — everything a client needs before
 * requesting individual pages.
 *
 * <p>Null fields are omitted from the serialized JSON ({@code @JsonInclude(NON_NULL)}).
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@JsonInclude(JsonInclude.Include.NON_NULL)
public class PdfJsonDocumentMetadata {
    // Core document metadata (title, author, etc. — see PdfJsonMetadata).
    private PdfJsonMetadata metadata;

    /** Optional XMP metadata packet stored as Base64. */
    private String xmpMetadata;

    /** Indicates that images should be requested lazily via the page endpoint. */
    private Boolean lazyImages;

    // Fonts referenced by the document; defaults to an empty mutable list.
    @Builder.Default private List<PdfJsonFont> fonts = new ArrayList<>();

    // Per-page width/height/rotation so clients can lay out pages before fetching content.
    @Builder.Default private List<PdfJsonPageDimension> pageDimensions = new ArrayList<>();

    /** Form fields (AcroForm) at document level */
    @Builder.Default private List<PdfJsonFormField> formFields = new ArrayList<>();
}

View File

@ -0,0 +1,20 @@
package stirling.software.SPDF.model.json;
import com.fasterxml.jackson.annotation.JsonInclude;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Size and orientation of a single PDF page, used by clients to reserve layout space before
 * the page content is lazily fetched.
 *
 * <p>Null fields are omitted from the serialized JSON ({@code @JsonInclude(NON_NULL)}).
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@JsonInclude(JsonInclude.Include.NON_NULL)
public class PdfJsonPageDimension {
    // 1-based page index — presumably; TODO confirm numbering convention against producer.
    private Integer pageNumber;
    // Page width in PDF points.
    private Float width;
    // Page height in PDF points.
    private Float height;
    // Page rotation in degrees (0/90/180/270 per the PDF spec's /Rotate values).
    private Integer rotation;
}

View File

@ -0,0 +1,274 @@
package stirling.software.SPDF.service;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Base64;
import java.util.Collections;
import java.util.IdentityHashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSBoolean;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSFloat;
import org.apache.pdfbox.cos.COSInteger;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSNull;
import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.springframework.stereotype.Component;
import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.json.PdfJsonCosValue;
import stirling.software.SPDF.model.json.PdfJsonStream;
/**
 * Bidirectional mapper between PDFBox COS objects (the raw PDF object model) and the
 * JSON-friendly {@link PdfJsonCosValue}/{@link PdfJsonStream} representations.
 *
 * <p>Serialization walks the COS graph recursively, using an identity-based visited set to
 * break reference cycles (a cycle is emitted as the NAME sentinel {@code "__circular__"}).
 * Stream payloads are transported as Base64 of the RAW (still-encoded) bytes, so filters such
 * as FlateDecode are preserved verbatim in both directions.
 */
@Slf4j
@Component
public class PdfJsonCosMapper {

    /**
     * Serializes a PDStream wrapper; returns null for null input. Starts a fresh
     * identity-based visited set for cycle detection.
     */
    public PdfJsonStream serializeStream(PDStream stream) throws IOException {
        if (stream == null) {
            return null;
        }
        return serializeStream(
                stream.getCOSObject(), Collections.newSetFromMap(new IdentityHashMap<>()));
    }

    /** Serializes a raw COSStream; returns null for null input. */
    public PdfJsonStream serializeStream(COSStream cosStream) throws IOException {
        if (cosStream == null) {
            return null;
        }
        return serializeStream(cosStream, Collections.newSetFromMap(new IdentityHashMap<>()));
    }

    /** Serializes any COS value into its JSON model, starting a fresh cycle-detection set. */
    public PdfJsonCosValue serializeCosValue(COSBase base) throws IOException {
        return serializeCosValue(base, Collections.newSetFromMap(new IdentityHashMap<>()));
    }

    /**
     * Rebuilds a COS object from its JSON model.
     *
     * <p>Malformed leaves (wrong value type, bad Base64) deserialize to {@code null} rather
     * than throwing; inside arrays a null entry is replaced by COSNull to keep indices stable,
     * while inside dictionaries a null value means the key is silently dropped.
     *
     * @param value JSON model to convert; null or untyped values yield null
     * @param document owning document, needed to create COSStreams
     * @return the reconstructed COS object, or null if the model could not be interpreted
     */
    public COSBase deserializeCosValue(PdfJsonCosValue value, PDDocument document)
            throws IOException {
        if (value == null || value.getType() == null) {
            return null;
        }
        switch (value.getType()) {
            case NULL:
                return COSNull.NULL;
            case BOOLEAN:
                if (value.getValue() instanceof Boolean bool) {
                    return COSBoolean.getBoolean(bool);
                }
                return null;
            case INTEGER:
                if (value.getValue() instanceof Number number) {
                    return COSInteger.get(number.longValue());
                }
                return null;
            case FLOAT:
                if (value.getValue() instanceof Number number) {
                    return new COSFloat(number.floatValue());
                }
                return null;
            case NAME:
                if (value.getValue() instanceof String name) {
                    return COSName.getPDFName(name);
                }
                return null;
            case STRING:
                // COSString payloads travel as Base64 to survive arbitrary byte content.
                if (value.getValue() instanceof String encoded) {
                    try {
                        byte[] bytes = Base64.getDecoder().decode(encoded);
                        return new COSString(bytes);
                    } catch (IllegalArgumentException ex) {
                        log.debug("Failed to decode COSString value: {}", ex.getMessage());
                    }
                }
                return null;
            case ARRAY:
                COSArray array = new COSArray();
                if (value.getItems() != null) {
                    for (PdfJsonCosValue item : value.getItems()) {
                        COSBase entry = deserializeCosValue(item, document);
                        if (entry != null) {
                            array.add(entry);
                        } else {
                            // Preserve array length/positions even for undecodable items.
                            array.add(COSNull.NULL);
                        }
                    }
                }
                return array;
            case DICTIONARY:
                COSDictionary dictionary = new COSDictionary();
                if (value.getEntries() != null) {
                    for (Map.Entry<String, PdfJsonCosValue> entry : value.getEntries().entrySet()) {
                        COSName key = COSName.getPDFName(entry.getKey());
                        COSBase entryValue = deserializeCosValue(entry.getValue(), document);
                        if (entryValue != null) {
                            dictionary.setItem(key, entryValue);
                        }
                    }
                }
                return dictionary;
            case STREAM:
                if (value.getStream() != null) {
                    return buildStreamFromModel(value.getStream(), document);
                }
                return null;
            default:
                return null;
        }
    }

    /**
     * Materializes a COSStream inside {@code document} from its JSON model: dictionary entries
     * first, then the raw (already-encoded) payload written via the raw output stream.
     *
     * <p>NOTE(review): /Length is set explicitly from the decoded byte count; PDFBox usually
     * maintains /Length itself on write — confirm the explicit set is intended and consistent.
     */
    public COSStream buildStreamFromModel(PdfJsonStream streamModel, PDDocument document)
            throws IOException {
        if (streamModel == null) {
            return null;
        }
        COSStream cosStream = document.getDocument().createCOSStream();
        if (streamModel.getDictionary() != null) {
            for (Map.Entry<String, PdfJsonCosValue> entry :
                    streamModel.getDictionary().entrySet()) {
                COSName key = COSName.getPDFName(entry.getKey());
                COSBase value = deserializeCosValue(entry.getValue(), document);
                if (value != null) {
                    cosStream.setItem(key, value);
                }
            }
        }
        String rawData = streamModel.getRawData();
        if (rawData != null && !rawData.isBlank()) {
            byte[] data;
            try {
                data = Base64.getDecoder().decode(rawData);
            } catch (IllegalArgumentException ex) {
                // Best-effort: invalid payload degrades to an empty stream instead of failing.
                log.debug("Invalid base64 content stream data: {}", ex.getMessage());
                data = new byte[0];
            }
            try (OutputStream outputStream = cosStream.createRawOutputStream()) {
                outputStream.write(data);
            }
            cosStream.setItem(COSName.LENGTH, COSInteger.get(data.length));
        } else {
            cosStream.setItem(COSName.LENGTH, COSInteger.get(0));
        }
        return cosStream;
    }

    /**
     * Recursive serializer. {@code visited} holds the container objects (dictionary, array,
     * stream) currently on the recursion stack; re-encountering one means a cycle, which is
     * encoded as the NAME sentinel "__circular__". Leaves are never added to the set.
     */
    private PdfJsonCosValue serializeCosValue(COSBase base, Set<COSBase> visited)
            throws IOException {
        if (base == null) {
            return null;
        }
        // Unwrap indirect references before type dispatch.
        if (base instanceof COSObject cosObject) {
            base = cosObject.getObject();
            if (base == null) {
                return null;
            }
        }
        boolean complex =
                base instanceof COSDictionary
                        || base instanceof COSArray
                        || base instanceof COSStream;
        if (complex) {
            if (!visited.add(base)) {
                return PdfJsonCosValue.builder()
                        .type(PdfJsonCosValue.Type.NAME)
                        .value("__circular__")
                        .build();
            }
        }
        try {
            PdfJsonCosValue.PdfJsonCosValueBuilder builder = PdfJsonCosValue.builder();
            if (base instanceof COSNull) {
                builder.type(PdfJsonCosValue.Type.NULL);
                return builder.build();
            }
            if (base instanceof COSBoolean booleanValue) {
                builder.type(PdfJsonCosValue.Type.BOOLEAN).value(booleanValue.getValue());
                return builder.build();
            }
            if (base instanceof COSInteger integer) {
                builder.type(PdfJsonCosValue.Type.INTEGER).value(integer.longValue());
                return builder.build();
            }
            if (base instanceof COSFloat floatValue) {
                builder.type(PdfJsonCosValue.Type.FLOAT).value(floatValue.floatValue());
                return builder.build();
            }
            if (base instanceof COSName name) {
                builder.type(PdfJsonCosValue.Type.NAME).value(name.getName());
                return builder.build();
            }
            if (base instanceof COSString cosString) {
                // Base64-encode string bytes: COSStrings may contain arbitrary binary data.
                builder.type(PdfJsonCosValue.Type.STRING)
                        .value(Base64.getEncoder().encodeToString(cosString.getBytes()));
                return builder.build();
            }
            if (base instanceof COSArray array) {
                List<PdfJsonCosValue> items = new ArrayList<>(array.size());
                for (COSBase item : array) {
                    PdfJsonCosValue serialized = serializeCosValue(item, visited);
                    items.add(serialized);
                }
                builder.type(PdfJsonCosValue.Type.ARRAY).items(items);
                return builder.build();
            }
            // Order matters: COSStream extends COSDictionary, so check stream first.
            if (base instanceof COSStream stream) {
                builder.type(PdfJsonCosValue.Type.STREAM).stream(serializeStream(stream, visited));
                return builder.build();
            }
            if (base instanceof COSDictionary dictionary) {
                Map<String, PdfJsonCosValue> entries = new LinkedHashMap<>();
                for (COSName key : dictionary.keySet()) {
                    PdfJsonCosValue serialized =
                            serializeCosValue(dictionary.getDictionaryObject(key), visited);
                    entries.put(key.getName(), serialized);
                }
                builder.type(PdfJsonCosValue.Type.DICTIONARY).entries(entries);
                return builder.build();
            }
            // Unknown COS subtype: drop it rather than guess.
            return null;
        } finally {
            // Remove from the stack-set so sibling branches may legitimately revisit the node.
            if (complex) {
                visited.remove(base);
            }
        }
    }

    /**
     * Serializes a stream: its dictionary entries (nulls skipped) plus the raw, still-encoded
     * payload as Base64 ({@code null} rawData when the stream is empty).
     */
    private PdfJsonStream serializeStream(COSStream cosStream, Set<COSBase> visited)
            throws IOException {
        Map<String, PdfJsonCosValue> dictionary = new LinkedHashMap<>();
        for (COSName key : cosStream.keySet()) {
            COSBase value = cosStream.getDictionaryObject(key);
            PdfJsonCosValue serialized = serializeCosValue(value, visited);
            if (serialized != null) {
                dictionary.put(key.getName(), serialized);
            }
        }
        String rawData = null;
        try (InputStream inputStream = cosStream.createRawInputStream();
                ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
            if (inputStream != null) {
                inputStream.transferTo(baos);
            }
            byte[] data = baos.toByteArray();
            if (data.length > 0) {
                rawData = Base64.getEncoder().encodeToString(data);
            }
        }
        return PdfJsonStream.builder().dictionary(dictionary).rawData(rawData).build();
    }
}

View File

@ -0,0 +1,224 @@
package stirling.software.SPDF.service;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType0Font;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.core.io.Resource;
import org.springframework.core.io.ResourceLoader;
import org.springframework.stereotype.Component;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.json.PdfJsonFont;
/**
 * Supplies fallback fonts (Noto family plus a Korean font) for text that the document's own
 * fonts cannot encode during PDF/JSON conversion.
 *
 * <p>Selection is script-based: {@link #resolveFallbackFontId(int)} maps a code point to a
 * fallback font ID, and the font bytes are loaded from classpath resources (cached in memory
 * after first load). The default Latin fallback location is overridable via the
 * {@code stirling.pdf.fallback-font} property.
 */
@Slf4j
@Component
@RequiredArgsConstructor
public class PdfJsonFallbackFontService {

    /** Default (Latin) fallback font ID; resource location is configurable. */
    public static final String FALLBACK_FONT_ID = "fallback-noto-sans";

    public static final String DEFAULT_FALLBACK_FONT_LOCATION =
            "classpath:/static/fonts/NotoSans-Regular.ttf";

    // Script-specific fallback IDs, each backed by a fixed classpath resource below.
    public static final String FALLBACK_FONT_CJK_ID = "fallback-noto-cjk";
    public static final String FALLBACK_FONT_JP_ID = "fallback-noto-jp";
    public static final String FALLBACK_FONT_KR_ID = "fallback-noto-korean";
    public static final String FALLBACK_FONT_AR_ID = "fallback-noto-arabic";
    public static final String FALLBACK_FONT_TH_ID = "fallback-noto-thai";

    // Static registry of the script-specific fonts (the default Latin one is resolved
    // separately because its location comes from configuration).
    private static final Map<String, FallbackFontSpec> BUILT_IN_FALLBACK_FONTS =
            Map.ofEntries(
                    Map.entry(
                            FALLBACK_FONT_CJK_ID,
                            new FallbackFontSpec(
                                    "classpath:/static/fonts/NotoSansSC-Regular.ttf",
                                    "NotoSansSC-Regular",
                                    "ttf")),
                    Map.entry(
                            FALLBACK_FONT_JP_ID,
                            new FallbackFontSpec(
                                    "classpath:/static/fonts/NotoSansJP-Regular.ttf",
                                    "NotoSansJP-Regular",
                                    "ttf")),
                    Map.entry(
                            FALLBACK_FONT_KR_ID,
                            new FallbackFontSpec(
                                    "classpath:/static/fonts/malgun.ttf", "MalgunGothic", "ttf")),
                    Map.entry(
                            FALLBACK_FONT_AR_ID,
                            new FallbackFontSpec(
                                    "classpath:/static/fonts/NotoSansArabic-Regular.ttf",
                                    "NotoSansArabic-Regular",
                                    "ttf")),
                    Map.entry(
                            FALLBACK_FONT_TH_ID,
                            new FallbackFontSpec(
                                    "classpath:/static/fonts/NotoSansThai-Regular.ttf",
                                    "NotoSansThai-Regular",
                                    "ttf")));

    private final ResourceLoader resourceLoader;

    // Configurable location for the default Latin fallback font.
    @Value("${stirling.pdf.fallback-font:" + DEFAULT_FALLBACK_FONT_LOCATION + "}")
    private String fallbackFontLocation;

    // In-memory cache of raw font bytes, keyed by fallback ID (fonts never change at runtime).
    private final Map<String, byte[]> fallbackFontCache = new ConcurrentHashMap<>();

    /** Builds the JSON font model for the default Latin fallback font. */
    public PdfJsonFont buildFallbackFontModel() throws IOException {
        return buildFallbackFontModel(FALLBACK_FONT_ID);
    }

    /**
     * Builds a JSON font model (with the Base64 font program embedded) for the given fallback.
     *
     * @param fallbackId one of the FALLBACK_FONT_* constants
     * @throws IOException if the ID is unknown or the font resource cannot be read
     */
    public PdfJsonFont buildFallbackFontModel(String fallbackId) throws IOException {
        FallbackFontSpec spec = getFallbackFontSpec(fallbackId);
        if (spec == null) {
            throw new IOException("Unknown fallback font id " + fallbackId);
        }
        byte[] bytes = loadFallbackFontBytes(fallbackId, spec);
        String base64 = java.util.Base64.getEncoder().encodeToString(bytes);
        return PdfJsonFont.builder()
                .id(fallbackId)
                .uid(fallbackId)
                .baseName(spec.baseName())
                .subtype("TrueType")
                .embedded(true)
                .program(base64)
                .programFormat(spec.format())
                .build();
    }

    /** Loads the default Latin fallback as an embeddable PDFont into {@code document}. */
    public PDFont loadFallbackPdfFont(PDDocument document) throws IOException {
        return loadFallbackPdfFont(document, FALLBACK_FONT_ID);
    }

    /**
     * Loads the requested fallback as a PDType0Font embedded into {@code document}
     * (third argument {@code true} requests subsetting on save).
     *
     * @throws IOException if the ID is unknown or the resource cannot be read
     */
    public PDFont loadFallbackPdfFont(PDDocument document, String fallbackId) throws IOException {
        FallbackFontSpec spec = getFallbackFontSpec(fallbackId);
        if (spec == null) {
            throw new IOException("Unknown fallback font id " + fallbackId);
        }
        byte[] bytes = loadFallbackFontBytes(fallbackId, spec);
        try (InputStream stream = new ByteArrayInputStream(bytes)) {
            return PDType0Font.load(document, stream, true);
        }
    }

    /** Alias for {@link #canEncode(PDFont, String)} applied to a whole string. */
    public boolean canEncodeFully(PDFont font, String text) {
        return canEncode(font, text);
    }

    /** True if {@code font} can encode the single code point. */
    public boolean canEncode(PDFont font, int codePoint) {
        return canEncode(font, new String(Character.toChars(codePoint)));
    }

    /**
     * True if {@code font} can encode {@code text}; probing is done by attempting the actual
     * encode and treating failure (IOException/IllegalArgumentException) as "cannot encode".
     * Null/empty inputs return false.
     */
    public boolean canEncode(PDFont font, String text) {
        if (font == null || text == null || text.isEmpty()) {
            return false;
        }
        try {
            font.encode(text);
            return true;
        } catch (IOException | IllegalArgumentException ex) {
            return false;
        }
    }

    /**
     * Maps a code point to the fallback font ID that should cover it.
     *
     * <p>CJK-related Unicode blocks are checked first (some, like Halfwidth/Fullwidth forms,
     * are not classified as HAN by UnicodeScript); remaining code points are routed by script,
     * defaulting to the Latin fallback.
     */
    public String resolveFallbackFontId(int codePoint) {
        Character.UnicodeBlock block = Character.UnicodeBlock.of(codePoint);
        if (block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
                || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
                || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C
                || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D
                || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E
                || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F
                || block == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
                || block == Character.UnicodeBlock.BOPOMOFO
                || block == Character.UnicodeBlock.BOPOMOFO_EXTENDED
                || block == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
            return FALLBACK_FONT_CJK_ID;
        }
        Character.UnicodeScript script = Character.UnicodeScript.of(codePoint);
        return switch (script) {
            case HAN -> FALLBACK_FONT_CJK_ID;
            case HIRAGANA, KATAKANA -> FALLBACK_FONT_JP_ID;
            case HANGUL -> FALLBACK_FONT_KR_ID;
            case ARABIC -> FALLBACK_FONT_AR_ID;
            case THAI -> FALLBACK_FONT_TH_ID;
            default -> FALLBACK_FONT_ID;
        };
    }

    /**
     * ASCII substitution for glyphs no fallback covers (currently the heavy angle-bracket
     * ornaments U+276E/U+276F); returns null when no substitution is defined.
     */
    public String mapUnsupportedGlyph(int codePoint) {
        return switch (codePoint) {
            case 0x276E -> "<";
            case 0x276F -> ">";
            default -> null;
        };
    }

    /** Resolves the spec for an ID; the default ID is built from configuration, others from the registry. */
    private FallbackFontSpec getFallbackFontSpec(String fallbackId) {
        if (FALLBACK_FONT_ID.equals(fallbackId)) {
            String baseName = inferBaseName(fallbackFontLocation, "NotoSans-Regular");
            String format = inferFormat(fallbackFontLocation, "ttf");
            return new FallbackFontSpec(fallbackFontLocation, baseName, format);
        }
        return BUILT_IN_FALLBACK_FONTS.get(fallbackId);
    }

    /**
     * Reads (and memoizes) the raw font bytes for a fallback ID. The check-then-put on the
     * ConcurrentHashMap is benignly racy: a concurrent first load just reads the resource twice.
     */
    private byte[] loadFallbackFontBytes(String fallbackId, FallbackFontSpec spec)
            throws IOException {
        if (spec == null) {
            throw new IOException("No fallback font specification for " + fallbackId);
        }
        byte[] cached = fallbackFontCache.get(fallbackId);
        if (cached != null) {
            return cached;
        }
        Resource resource = resourceLoader.getResource(spec.resourceLocation());
        if (!resource.exists()) {
            throw new IOException("Fallback font resource not found at " + spec.resourceLocation());
        }
        try (InputStream inputStream = resource.getInputStream();
                ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
            inputStream.transferTo(baos);
            byte[] bytes = baos.toByteArray();
            fallbackFontCache.put(fallbackId, bytes);
            return bytes;
        }
    }

    /** Derives a font base name from a resource path (file name sans extension), or the default. */
    private String inferBaseName(String location, String defaultName) {
        if (location == null || location.isBlank()) {
            return defaultName;
        }
        int slash = location.lastIndexOf('/');
        String fileName = slash >= 0 ? location.substring(slash + 1) : location;
        int dot = fileName.lastIndexOf('.');
        if (dot > 0) {
            fileName = fileName.substring(0, dot);
        }
        return fileName.isEmpty() ? defaultName : fileName;
    }

    /** Derives the font format from the path's extension (lower-cased), or the default. */
    private String inferFormat(String location, String defaultFormat) {
        if (location == null || location.isBlank()) {
            return defaultFormat;
        }
        int dot = location.lastIndexOf('.');
        if (dot >= 0 && dot < location.length() - 1) {
            return location.substring(dot + 1).toLowerCase(Locale.ROOT);
        }
        return defaultFormat;
    }

    /** Value holder: where a fallback font lives, its base name, and its program format. */
    private record FallbackFontSpec(String resourceLocation, String baseName, String format) {}
}

View File

@ -0,0 +1,349 @@
package stirling.software.SPDF.service.pdfjson;
import java.io.IOException;
import java.nio.file.Files;
import java.util.Base64;
import java.util.Locale;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import jakarta.annotation.PostConstruct;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import stirling.software.common.util.ProcessExecutor;
import stirling.software.common.util.ProcessExecutor.ProcessExecutorResult;
import stirling.software.common.util.TempFile;
import stirling.software.common.util.TempFileManager;
@Slf4j
@Service
@RequiredArgsConstructor
public class PdfJsonFontService {

    private final TempFileManager tempFileManager;

    /** Master switch for CFF -> TrueType conversion. */
    @Getter
    @Value("${stirling.pdf.json.cff-converter.enabled:true}")
    private boolean cffConversionEnabled;

    /** Conversion backend: "python" or "fontforge"; unknown values fall back to Python. */
    @Getter
    @Value("${stirling.pdf.json.cff-converter.method:python}")
    private String cffConverterMethod;

    @Value("${stirling.pdf.json.cff-converter.python-command:/opt/venv/bin/python3}")
    private String pythonCommand;

    @Value("${stirling.pdf.json.cff-converter.python-script:/scripts/convert_cff_to_ttf.py}")
    private String pythonScript;

    @Value("${stirling.pdf.json.cff-converter.fontforge-command:fontforge}")
    private String fontforgeCommand;

    /** Probed once at startup; true when the configured Python interpreter is on the PATH. */
    private volatile boolean pythonCffConverterAvailable;

    /** Probed once at startup; true when the configured FontForge binary is on the PATH. */
    private volatile boolean fontForgeCffConverterAvailable;

    /**
     * Probes the configured converter commands once at startup so conversion calls can fail fast
     * instead of repeatedly spawning processes that do not exist.
     */
    @PostConstruct
    private void initialiseCffConverterAvailability() {
        if (!cffConversionEnabled) {
            log.warn("[FONT-DEBUG] CFF conversion is DISABLED in configuration");
            pythonCffConverterAvailable = false;
            fontForgeCffConverterAvailable = false;
            return;
        }
        log.info("[FONT-DEBUG] CFF conversion enabled, checking tool availability...");
        pythonCffConverterAvailable = isCommandAvailable(pythonCommand);
        if (!pythonCffConverterAvailable) {
            log.warn(
                    "[FONT-DEBUG] Python command '{}' not found; Python CFF conversion disabled",
                    pythonCommand);
        } else {
            log.info("[FONT-DEBUG] Python command '{}' is available", pythonCommand);
        }
        fontForgeCffConverterAvailable = isCommandAvailable(fontforgeCommand);
        if (!fontForgeCffConverterAvailable) {
            log.warn(
                    "[FONT-DEBUG] FontForge command '{}' not found; FontForge CFF conversion disabled",
                    fontforgeCommand);
        } else {
            log.info("[FONT-DEBUG] FontForge command '{}' is available", fontforgeCommand);
        }
        log.info("[FONT-DEBUG] Selected CFF converter method: {}", cffConverterMethod);
    }

    /**
     * Converts a bare CFF font program into a TrueType/OpenType font using the configured
     * external tool.
     *
     * @param fontBytes raw CFF font program
     * @param toUnicode optional base64-encoded ToUnicode CMap forwarded to the Python converter
     * @return converted font bytes, or {@code null} when conversion is disabled, the selected
     *     tool is unavailable, or the conversion fails
     */
    public byte[] convertCffProgramToTrueType(byte[] fontBytes, String toUnicode) {
        if (!cffConversionEnabled || fontBytes == null || fontBytes.length == 0) {
            log.warn(
                    "[FONT-DEBUG] CFF conversion skipped: enabled={}, bytes={}",
                    cffConversionEnabled,
                    fontBytes == null ? "null" : fontBytes.length);
            return null;
        }
        log.info(
                "[FONT-DEBUG] Converting CFF font: {} bytes, method: {}",
                fontBytes.length,
                cffConverterMethod);
        if ("fontforge".equalsIgnoreCase(cffConverterMethod)) {
            if (!fontForgeCffConverterAvailable) {
                log.warn("[FONT-DEBUG] FontForge CFF converter not available, skipping conversion");
                return null;
            }
            byte[] result = convertCffUsingFontForge(fontBytes);
            log.info(
                    "[FONT-DEBUG] FontForge conversion result: {}",
                    result == null ? "null" : result.length + " bytes");
            return result;
        }
        if (!"python".equalsIgnoreCase(cffConverterMethod)) {
            log.warn(
                    "[FONT-DEBUG] Unknown CFF converter method: {}, falling back to Python",
                    cffConverterMethod);
        }
        // Both the explicit "python" method and the unknown-method fallback share this path.
        return attemptPythonConversion(fontBytes, toUnicode);
    }

    /** Runs the Python converter when available; extracted to remove three duplicated branches. */
    private byte[] attemptPythonConversion(byte[] fontBytes, String toUnicode) {
        if (!pythonCffConverterAvailable) {
            log.warn("[FONT-DEBUG] Python CFF converter not available, skipping conversion");
            return null;
        }
        byte[] result = convertCffUsingPython(fontBytes, toUnicode);
        log.info(
                "[FONT-DEBUG] Python conversion result: {}",
                result == null ? "null" : result.length + " bytes");
        return result;
    }

    /**
     * Detects the flavour of a font program from its 4-byte sfnt signature.
     *
     * @param fontBytes font program bytes
     * @return "ttf" (0x00010000 or 'true'), "otf" ('OTTO'), "cff" ('ttcf'), or {@code null}
     */
    public String detectFontFlavor(byte[] fontBytes) {
        if (fontBytes == null || fontBytes.length < 4) {
            return null;
        }
        int signature = readSfntSignature(fontBytes);
        if (signature == 0x00010000 || signature == 0x74727565) {
            return "ttf";
        }
        if (signature == 0x4F54544F) {
            return "otf";
        }
        if (signature == 0x74746366) {
            // NOTE(review): 0x74746366 is 'ttcf' (TrueType Collection); the label "cff" is kept
            // for compatibility with existing callers — confirm before renaming.
            return "cff";
        }
        return null;
    }

    /**
     * Detects the format of TrueType-family data from its 4-byte sfnt signature.
     *
     * <p>Unlike {@link #detectFontFlavor(byte[])}, the legacy 'true' signature is not accepted.
     *
     * @param data font program bytes
     * @return "ttf", "otf", "cff" (see note in {@code detectFontFlavor}), or {@code null}
     */
    public String detectTrueTypeFormat(byte[] data) {
        if (data == null || data.length < 4) {
            return null;
        }
        int signature = readSfntSignature(data);
        if (signature == 0x00010000) {
            return "ttf";
        }
        if (signature == 0x4F54544F) {
            return "otf";
        }
        if (signature == 0x74746366) {
            return "cff";
        }
        return null;
    }

    /**
     * Performs a minimal sanity check on an sfnt font header.
     *
     * @param fontBytes font program bytes
     * @return a human-readable error, or {@code null} when the header looks plausible
     */
    public String validateFontTables(byte[] fontBytes) {
        if (fontBytes == null || fontBytes.length < 12) {
            return "Font program too small";
        }
        // numTables is the big-endian uint16 at offset 4 of the sfnt header.
        int numTables = ((fontBytes[4] & 0xFF) << 8) | (fontBytes[5] & 0xFF);
        if (numTables <= 0 || numTables > 512) {
            return "Invalid numTables: " + numTables;
        }
        return null;
    }

    /** Reads the big-endian 32-bit sfnt signature from the first four bytes (caller checks length). */
    private static int readSfntSignature(byte[] data) {
        return ((data[0] & 0xFF) << 24)
                | ((data[1] & 0xFF) << 16)
                | ((data[2] & 0xFF) << 8)
                | (data[3] & 0xFF);
    }

    /**
     * Converts CFF bytes via the external Python script, optionally passing a ToUnicode CMap.
     *
     * @param fontBytes raw CFF font program
     * @param toUnicode optional base64-encoded ToUnicode CMap, or {@code null}
     * @return converted bytes, or {@code null} on any failure (logged)
     */
    private byte[] convertCffUsingPython(byte[] fontBytes, String toUnicode) {
        if (!pythonCffConverterAvailable) {
            log.warn("[FONT-DEBUG] Python CFF converter not available");
            return null;
        }
        if (pythonCommand == null
                || pythonCommand.isBlank()
                || pythonScript == null
                || pythonScript.isBlank()) {
            log.warn("[FONT-DEBUG] Python converter not configured");
            return null;
        }
        log.info(
                "[FONT-DEBUG] Running Python CFF converter: command={}, script={}",
                pythonCommand,
                pythonScript);
        // try-with-resources tolerates a null TempFile (close is simply skipped).
        try (TempFile inputFile = new TempFile(tempFileManager, ".cff");
                TempFile outputFile = new TempFile(tempFileManager, ".otf");
                TempFile toUnicodeFile =
                        toUnicode != null ? new TempFile(tempFileManager, ".tounicode") : null) {
            Files.write(inputFile.getPath(), fontBytes);
            if (toUnicodeFile != null) {
                try {
                    byte[] toUnicodeBytes = Base64.getDecoder().decode(toUnicode);
                    Files.write(toUnicodeFile.getPath(), toUnicodeBytes);
                } catch (IllegalArgumentException ex) {
                    log.warn(
                            "[FONT-DEBUG] Failed to decode ToUnicode data for CFF conversion: {}",
                            ex.getMessage());
                    return null;
                }
            }
            String[] command =
                    buildPythonCommand(
                            inputFile.getAbsolutePath(),
                            outputFile.getAbsolutePath(),
                            toUnicodeFile != null ? toUnicodeFile.getAbsolutePath() : null);
            log.info("[FONT-DEBUG] Executing: {}", String.join(" ", command));
            ProcessExecutorResult result =
                    ProcessExecutor.getInstance(ProcessExecutor.Processes.CFF_CONVERTER)
                            .runCommandWithOutputHandling(java.util.Arrays.asList(command));
            if (result.getRc() != 0) {
                log.error(
                        "[FONT-DEBUG] Python CFF conversion failed with exit code: {}",
                        result.getRc());
                log.error("[FONT-DEBUG] Stdout: {}", result.getMessages());
                return null;
            }
            if (!Files.exists(outputFile.getPath())) {
                log.error("[FONT-DEBUG] Python CFF conversion produced no output file");
                return null;
            }
            byte[] data = Files.readAllBytes(outputFile.getPath());
            if (data.length == 0) {
                log.error("[FONT-DEBUG] Python CFF conversion returned empty output");
                return null;
            }
            log.info(
                    "[FONT-DEBUG] Python CFF conversion succeeded: {} bytes -> {} bytes",
                    fontBytes.length,
                    data.length);
            return data;
        } catch (IOException | InterruptedException ex) {
            if (ex instanceof InterruptedException) {
                Thread.currentThread().interrupt(); // preserve interrupt status
            }
            log.error("[FONT-DEBUG] Python CFF conversion exception: {}", ex.getMessage(), ex);
            return null;
        }
    }

    /**
     * Converts CFF bytes to TrueType using a FontForge batch script.
     *
     * @param fontBytes raw CFF font program
     * @return converted bytes, or {@code null} on any failure (logged)
     */
    public byte[] convertCffUsingFontForge(byte[] fontBytes) {
        if (!fontForgeCffConverterAvailable) {
            log.debug("FontForge CFF converter not available");
            return null;
        }
        try (TempFile inputFile = new TempFile(tempFileManager, ".cff");
                TempFile outputFile = new TempFile(tempFileManager, ".ttf")) {
            Files.write(inputFile.getPath(), fontBytes);
            ProcessExecutorResult result =
                    ProcessExecutor.getInstance(ProcessExecutor.Processes.CFF_CONVERTER)
                            .runCommandWithOutputHandling(
                                    java.util.Arrays.asList(
                                            fontforgeCommand,
                                            "-lang=ff",
                                            "-c",
                                            "Open($1); "
                                                    + "ScaleToEm(1000); "
                                                    + "SelectWorthOutputting(); "
                                                    + "SetFontOrder(2); "
                                                    + "Reencode(\"unicode\"); "
                                                    + "RoundToInt(); "
                                                    + "RemoveOverlap(); "
                                                    + "Simplify(); "
                                                    + "CorrectDirection(); "
                                                    + "Generate($2, \"\", 4+16+32); "
                                                    + "Close(); "
                                                    + "Quit()",
                                            inputFile.getAbsolutePath(),
                                            outputFile.getAbsolutePath()));
            if (result.getRc() != 0) {
                log.warn("FontForge CFF conversion failed: {}", result.getRc());
                return null;
            }
            if (!Files.exists(outputFile.getPath())) {
                log.warn("FontForge CFF conversion produced no output");
                return null;
            }
            byte[] data = Files.readAllBytes(outputFile.getPath());
            if (data.length == 0) {
                log.warn("FontForge CFF conversion returned empty output");
                return null;
            }
            return data;
        } catch (IOException | InterruptedException ex) {
            if (ex instanceof InterruptedException) {
                Thread.currentThread().interrupt(); // preserve interrupt status
            }
            log.warn("FontForge CFF conversion failed: {}", ex.getMessage());
            return null;
        }
    }

    /**
     * Checks whether a command resolves on the PATH via {@code which}/{@code where}.
     *
     * @param command command name or path to probe
     * @return true when the lookup process exits with status 0
     */
    private boolean isCommandAvailable(String command) {
        if (command == null || command.isBlank()) {
            return false;
        }
        boolean windows =
                System.getProperty("os.name").toLowerCase(Locale.ROOT).contains("windows");
        ProcessBuilder processBuilder = new ProcessBuilder(windows ? "where" : "which", command);
        try {
            Process process = processBuilder.start();
            int exitCode = process.waitFor();
            return exitCode == 0;
        } catch (InterruptedException e) {
            // Do not swallow interruption inside a broad catch: restore the flag for callers.
            Thread.currentThread().interrupt();
            log.debug("Interrupted while checking for command {}", command);
            return false;
        } catch (Exception e) {
            log.debug("Error checking for command {}: {}", command, e.getMessage());
            return false;
        }
    }

    /**
     * Builds the Python converter argv, appending the optional ToUnicode file argument.
     *
     * @param input path of the CFF input file
     * @param output path where the converted font is written
     * @param toUnicode path of the ToUnicode file, or {@code null} to omit it
     * @return the full command line
     */
    private String[] buildPythonCommand(String input, String output, String toUnicode) {
        if (toUnicode != null) {
            return new String[] {
                pythonCommand,
                pythonScript,
                "--input",
                input,
                "--output",
                output,
                "--to-unicode",
                toUnicode
            };
        }
        return new String[] {pythonCommand, pythonScript, "--input", input, "--output", output};
    }
}

View File

@ -0,0 +1,444 @@
package stirling.software.SPDF.service.pdfjson;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.awt.image.BufferedImage;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Base64;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.UUID;
import java.util.function.Consumer;
import javax.imageio.ImageIO;
import org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.contentstream.operator.OperatorName;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.util.Matrix;
import org.springframework.stereotype.Service;
import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.api.PdfJsonConversionProgress;
import stirling.software.SPDF.model.json.PdfJsonImageElement;
/**
 * Service for handling PDF image operations for JSON conversion (extraction, encoding, rendering).
 */
@Service
@Slf4j
public class PdfJsonImageService {
    /** Base64 payload plus the image format it was encoded with (e.g. "png", "jpg"). */
    private record EncodedImage(String base64, String format) {}
    /** Axis-aligned bounding box in PDF user space; width/height are clamped to be non-negative. */
    private record Bounds(float left, float right, float bottom, float top) {
        float width() {
            return Math.max(0f, right - left);
        }
        float height() {
            return Math.max(0f, top - bottom);
        }
    }
    /**
     * Collects images from all pages in a PDF document.
     *
     * @param document The PDF document
     * @param totalPages Total number of pages
     * @param progress Progress callback
     * @return Map of page number to list of image elements
     * @throws IOException If image extraction fails
     */
    public Map<Integer, List<PdfJsonImageElement>> collectImages(
            PDDocument document, int totalPages, Consumer<PdfJsonConversionProgress> progress)
            throws IOException {
        Map<Integer, List<PdfJsonImageElement>> imagesByPage = new LinkedHashMap<>();
        int pageNumber = 1;
        for (PDPage page : document.getPages()) {
            // A fresh engine per page: it walks the page content stream and fills imagesByPage.
            ImageCollectingEngine engine =
                    new ImageCollectingEngine(page, pageNumber, imagesByPage);
            engine.processPage(page);
            // Update progress for image extraction (70-80%)
            int imageProgress = 70 + (int) ((pageNumber / (double) totalPages) * 10);
            progress.accept(
                    PdfJsonConversionProgress.of(
                            imageProgress, "images", "Extracting images", pageNumber, totalPages));
            pageNumber++;
        }
        return imagesByPage;
    }
    /**
     * Extracts images from a single PDF page (for on-demand lazy loading).
     *
     * @param document The PDF document
     * @param page The specific page to extract images from
     * @param pageNumber The page number (1-indexed)
     * @return List of image elements for this page
     * @throws IOException If image extraction fails
     */
    public List<PdfJsonImageElement> extractImagesForPage(
            PDDocument document, PDPage page, int pageNumber) throws IOException {
        Map<Integer, List<PdfJsonImageElement>> imagesByPage = new LinkedHashMap<>();
        ImageCollectingEngine engine = new ImageCollectingEngine(page, pageNumber, imagesByPage);
        engine.processPage(page);
        return imagesByPage.getOrDefault(pageNumber, new ArrayList<>());
    }
    /**
     * Draws an image element on a PDF page content stream.
     *
     * <p>Prefers the element's full 6-value transform when present; otherwise falls back to
     * axis-aligned placement derived from left/bottom/width/height (with further fallbacks).
     *
     * @param contentStream The content stream to draw on
     * @param document The PDF document
     * @param element The image element to draw
     * @param cache Cache of previously created image XObjects
     * @throws IOException If drawing fails
     */
    public void drawImageElement(
            PDPageContentStream contentStream,
            PDDocument document,
            PdfJsonImageElement element,
            Map<String, PDImageXObject> cache)
            throws IOException {
        if (element == null || element.getImageData() == null || element.getImageData().isBlank()) {
            return;
        }
        // Cache by element id when available; identity hash is a per-call fallback key.
        String cacheKey =
                element.getId() != null && !element.getId().isBlank()
                        ? element.getId()
                        : Integer.toHexString(System.identityHashCode(element));
        PDImageXObject image = cache.get(cacheKey);
        if (image == null) {
            image = createImageXObject(document, element);
            if (image == null) {
                return;
            }
            cache.put(cacheKey, image);
        }
        List<Float> transform = element.getTransform();
        if (transform != null && transform.size() == 6) {
            // Full CTM round-trip: [a b c d e f], NaN/Infinity replaced by identity components.
            Matrix matrix =
                    new Matrix(
                            safeFloat(transform.get(0), 1f),
                            safeFloat(transform.get(1), 0f),
                            safeFloat(transform.get(2), 0f),
                            safeFloat(transform.get(3), 1f),
                            safeFloat(transform.get(4), 0f),
                            safeFloat(transform.get(5), 0f));
            contentStream.drawImage(image, matrix);
            return;
        }
        float width = safeFloat(element.getWidth(), fallbackWidth(element));
        float height = safeFloat(element.getHeight(), fallbackHeight(element));
        if (width <= 0f) {
            width = Math.max(1f, fallbackWidth(element));
        }
        if (height <= 0f) {
            height = Math.max(1f, fallbackHeight(element));
        }
        float left = resolveLeft(element, width);
        float bottom = resolveBottom(element, height);
        contentStream.drawImage(image, left, bottom, width, height);
    }
    /**
     * Creates a PDImageXObject from a PdfJsonImageElement.
     *
     * @param document The PDF document
     * @param element The image element with base64 data
     * @return The created image XObject, or {@code null} when the base64 payload is invalid
     * @throws IOException If image creation fails
     */
    public PDImageXObject createImageXObject(PDDocument document, PdfJsonImageElement element)
            throws IOException {
        byte[] data;
        try {
            data = Base64.getDecoder().decode(element.getImageData());
        } catch (IllegalArgumentException ex) {
            log.debug("Failed to decode image element: {}", ex.getMessage());
            return null;
        }
        String name = element.getId() != null ? element.getId() : UUID.randomUUID().toString();
        return PDImageXObject.createFromByteArray(document, data, name);
    }
    // Encodes a decoded PDImage to base64; falls back to PNG when the native format cannot be
    // written by ImageIO (e.g. alpha channels unsupported by the original format's writer).
    private EncodedImage encodeImage(PDImage image) {
        try {
            BufferedImage bufferedImage = image.getImage();
            if (bufferedImage == null) {
                return null;
            }
            String format = resolveImageFormat(image);
            if (format == null || format.isBlank()) {
                format = "png";
            }
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            boolean written = ImageIO.write(bufferedImage, format, baos);
            if (!written) {
                if (!"png".equalsIgnoreCase(format)) {
                    baos.reset();
                    if (!ImageIO.write(bufferedImage, "png", baos)) {
                        return null;
                    }
                    format = "png";
                } else {
                    return null;
                }
            }
            return new EncodedImage(Base64.getEncoder().encodeToString(baos.toByteArray()), format);
        } catch (IOException ex) {
            log.debug("Failed to encode image: {}", ex.getMessage());
            return null;
        }
    }
    // Uses the XObject's suffix when present (jpg, png, ...); inline images default to PNG.
    private String resolveImageFormat(PDImage image) {
        if (image instanceof PDImageXObject xObject) {
            String suffix = xObject.getSuffix();
            if (suffix != null && !suffix.isBlank()) {
                return suffix.toLowerCase(Locale.ROOT);
            }
        }
        return "png";
    }
    // Width fallback order: right-left span, then native pixel width, then 1.
    private float fallbackWidth(PdfJsonImageElement element) {
        if (element.getRight() != null && element.getLeft() != null) {
            return Math.max(0f, element.getRight() - element.getLeft());
        }
        if (element.getNativeWidth() != null) {
            return element.getNativeWidth();
        }
        return 1f;
    }
    // Height fallback order: top-bottom span, then native pixel height, then 1.
    private float fallbackHeight(PdfJsonImageElement element) {
        if (element.getTop() != null && element.getBottom() != null) {
            return Math.max(0f, element.getTop() - element.getBottom());
        }
        if (element.getNativeHeight() != null) {
            return element.getNativeHeight();
        }
        return 1f;
    }
    // Left-edge fallback order: explicit left, then x, then right minus width, then 0.
    private float resolveLeft(PdfJsonImageElement element, float width) {
        if (element.getLeft() != null) {
            return element.getLeft();
        }
        if (element.getX() != null) {
            return element.getX();
        }
        if (element.getRight() != null) {
            return element.getRight() - width;
        }
        return 0f;
    }
    // Bottom-edge fallback order: explicit bottom, then y, then top minus height, then 0.
    private float resolveBottom(PdfJsonImageElement element, float height) {
        if (element.getBottom() != null) {
            return element.getBottom();
        }
        if (element.getY() != null) {
            return element.getY();
        }
        if (element.getTop() != null) {
            return element.getTop() - height;
        }
        return 0f;
    }
    // Flattens a PDFBox Matrix into [a b c d e f] (row-major: (0,0) (0,1) (1,0) (1,1) (2,0) (2,1)),
    // matching the order expected by drawImageElement's transform path.
    private List<Float> toMatrixValues(Matrix matrix) {
        List<Float> values = new ArrayList<>(6);
        values.add(matrix.getValue(0, 0));
        values.add(matrix.getValue(0, 1));
        values.add(matrix.getValue(1, 0));
        values.add(matrix.getValue(1, 1));
        values.add(matrix.getValue(2, 0));
        values.add(matrix.getValue(2, 1));
        return values;
    }
    // Replaces null/NaN/Infinity with a safe default so bad JSON cannot corrupt the content stream.
    private float safeFloat(Float value, float defaultValue) {
        if (value == null || Float.isNaN(value) || Float.isInfinite(value)) {
            return defaultValue;
        }
        return value;
    }
    /**
     * Inner engine that extends PDFGraphicsStreamEngine to collect images from PDF content streams.
     *
     * <p>Deliberately a non-static inner class: it calls the outer service's encodeImage and
     * toMatrixValues helpers.
     */
    private class ImageCollectingEngine extends PDFGraphicsStreamEngine {
        private final int pageNumber;
        private final Map<Integer, List<PdfJsonImageElement>> imagesByPage;
        // Name of the XObject currently being drawn (set in processOperator for "Do" operators).
        private COSName currentXObjectName;
        // Monotonic counter used to keep extracted images in stream order via zOrder.
        private int imageCounter = 0;
        protected ImageCollectingEngine(
                PDPage page, int pageNumber, Map<Integer, List<PdfJsonImageElement>> imagesByPage)
                throws IOException {
            super(page);
            this.pageNumber = pageNumber;
            this.imagesByPage = imagesByPage;
        }
        @Override
        public void processPage(PDPage page) throws IOException {
            super.processPage(page);
        }
        @Override
        public void drawImage(PDImage pdImage) throws IOException {
            EncodedImage encoded = encodeImage(pdImage);
            if (encoded == null) {
                return;
            }
            // The CTM at draw time maps the unit square to the image's placement on the page.
            Matrix ctm = getGraphicsState().getCurrentTransformationMatrix();
            Bounds bounds = computeBounds(ctm);
            List<Float> matrixValues = toMatrixValues(ctm);
            PdfJsonImageElement element =
                    PdfJsonImageElement.builder()
                            .id(UUID.randomUUID().toString())
                            .objectName(
                                    currentXObjectName != null
                                            ? currentXObjectName.getName()
                                            : null)
                            .inlineImage(!(pdImage instanceof PDImageXObject))
                            .nativeWidth(pdImage.getWidth())
                            .nativeHeight(pdImage.getHeight())
                            .x(bounds.left)
                            .y(bounds.bottom)
                            .width(bounds.width())
                            .height(bounds.height())
                            .left(bounds.left)
                            .right(bounds.right)
                            .top(bounds.top)
                            .bottom(bounds.bottom)
                            .transform(matrixValues)
                            // Large negative base keeps images behind text elements when sorted
                            // by zOrder, while preserving relative image order.
                            .zOrder(-1_000_000 + imageCounter)
                            .imageData(encoded.base64())
                            .imageFormat(encoded.format())
                            .build();
            imageCounter++;
            imagesByPage.computeIfAbsent(pageNumber, key -> new ArrayList<>()).add(element);
        }
        @Override
        public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3)
                throws IOException {
            // Not needed for image extraction
        }
        @Override
        public void clip(int windingRule) throws IOException {
            // Not needed for image extraction
        }
        @Override
        public void moveTo(float x, float y) throws IOException {
            // Not needed for image extraction
        }
        @Override
        public void lineTo(float x, float y) throws IOException {
            // Not needed for image extraction
        }
        @Override
        public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3)
                throws IOException {
            // Not needed for image extraction
        }
        @Override
        public Point2D getCurrentPoint() throws IOException {
            return new Point2D.Float();
        }
        @Override
        public void closePath() throws IOException {
            // Not needed for image extraction
        }
        @Override
        public void endPath() throws IOException {
            // Not needed for image extraction
        }
        @Override
        public void shadingFill(COSName shadingName) throws IOException {
            // Not needed for image extraction
        }
        @Override
        public void fillAndStrokePath(int windingRule) throws IOException {
            // Not needed for image extraction
        }
        @Override
        public void fillPath(int windingRule) throws IOException {
            // Not needed for image extraction
        }
        @Override
        public void strokePath() throws IOException {
            // Not needed for image extraction
        }
        @Override
        protected void processOperator(Operator operator, List<COSBase> operands)
                throws IOException {
            // Remember the XObject name for "Do" so drawImage (invoked inside super) can record it;
            // cleared unconditionally afterwards so it never leaks to a later operator.
            if (OperatorName.DRAW_OBJECT.equals(operator.getName())
                    && !operands.isEmpty()
                    && operands.get(0) instanceof COSName name) {
                currentXObjectName = name;
            }
            super.processOperator(operator, operands);
            currentXObjectName = null;
        }
        // Maps the unit square through the CTM and takes the axis-aligned envelope of the corners.
        private Bounds computeBounds(Matrix ctm) {
            AffineTransform transform = ctm.createAffineTransform();
            Point2D.Float p0 = new Point2D.Float(0, 0);
            Point2D.Float p1 = new Point2D.Float(1, 0);
            Point2D.Float p2 = new Point2D.Float(0, 1);
            Point2D.Float p3 = new Point2D.Float(1, 1);
            transform.transform(p0, p0);
            transform.transform(p1, p1);
            transform.transform(p2, p2);
            transform.transform(p3, p3);
            float minX = Math.min(Math.min(p0.x, p1.x), Math.min(p2.x, p3.x));
            float maxX = Math.max(Math.max(p0.x, p1.x), Math.max(p2.x, p3.x));
            float minY = Math.min(Math.min(p0.y, p1.y), Math.min(p2.y, p3.y));
            float maxY = Math.max(Math.max(p0.y, p1.y), Math.max(p2.y, p3.y));
            if (!Float.isFinite(minX) || !Float.isFinite(minY)) {
                return new Bounds(0f, 0f, 0f, 0f);
            }
            return new Bounds(minX, maxX, minY, maxY);
        }
    }
}

View File

@ -0,0 +1,148 @@
package stirling.software.SPDF.service.pdfjson;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.time.Instant;
import java.time.format.DateTimeParseException;
import java.util.Base64;
import java.util.Calendar;
import java.util.Optional;
import java.util.TimeZone;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.common.PDMetadata;
import org.springframework.stereotype.Service;
import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.json.PdfJsonMetadata;
/** Service for extracting and applying PDF metadata (document info and XMP) for JSON conversion. */
@Service
@Slf4j
public class PdfJsonMetadataService {
    /**
     * Reads the document information dictionary of a PDF into a {@link PdfJsonMetadata} model.
     *
     * @param document The PDF document
     * @return Metadata model with document info and page count
     */
    public PdfJsonMetadata extractMetadata(PDDocument document) {
        PdfJsonMetadata result = new PdfJsonMetadata();
        PDDocumentInformation docInfo = document.getDocumentInformation();
        if (docInfo != null) {
            result.setTitle(docInfo.getTitle());
            result.setAuthor(docInfo.getAuthor());
            result.setSubject(docInfo.getSubject());
            result.setKeywords(docInfo.getKeywords());
            result.setCreator(docInfo.getCreator());
            result.setProducer(docInfo.getProducer());
            result.setCreationDate(formatCalendar(docInfo.getCreationDate()));
            result.setModificationDate(formatCalendar(docInfo.getModificationDate()));
            result.setTrapped(docInfo.getTrapped());
        }
        result.setNumberOfPages(document.getNumberOfPages());
        return result;
    }
    /**
     * Extracts XMP metadata from a PDF as a base64-encoded string.
     *
     * @param document The PDF document
     * @return Base64-encoded XMP metadata, or null if absent or unreadable
     */
    public String extractXmpMetadata(PDDocument document) {
        if (document.getDocumentCatalog() == null) {
            return null;
        }
        PDMetadata xmp = document.getDocumentCatalog().getMetadata();
        if (xmp == null) {
            return null;
        }
        try (InputStream inputStream = xmp.createInputStream()) {
            byte[] data = inputStream.readAllBytes();
            return data.length == 0 ? null : Base64.getEncoder().encodeToString(data);
        } catch (IOException ex) {
            log.debug("Failed to extract XMP metadata: {}", ex.getMessage());
            return null;
        }
    }
    /**
     * Applies document-information metadata to a PDF document. Null fields clear the
     * corresponding entries, mirroring the serialized model exactly.
     *
     * @param document The PDF document
     * @param metadata The metadata to apply; no-op when null
     */
    public void applyMetadata(PDDocument document, PdfJsonMetadata metadata) {
        if (metadata == null) {
            return;
        }
        PDDocumentInformation target = document.getDocumentInformation();
        target.setTitle(metadata.getTitle());
        target.setAuthor(metadata.getAuthor());
        target.setSubject(metadata.getSubject());
        target.setKeywords(metadata.getKeywords());
        target.setCreator(metadata.getCreator());
        target.setProducer(metadata.getProducer());
        String creationDate = metadata.getCreationDate();
        if (creationDate != null) {
            parseInstant(creationDate).map(this::toCalendar).ifPresent(target::setCreationDate);
        }
        String modificationDate = metadata.getModificationDate();
        if (modificationDate != null) {
            parseInstant(modificationDate)
                    .map(this::toCalendar)
                    .ifPresent(target::setModificationDate);
        }
        target.setTrapped(metadata.getTrapped());
    }
    /**
     * Applies XMP metadata to a PDF document from a base64-encoded string. Invalid base64 or
     * stream failures are logged at debug level and otherwise ignored (best effort).
     *
     * @param document The PDF document
     * @param base64 Base64-encoded XMP metadata; blank/null is a no-op
     */
    public void applyXmpMetadata(PDDocument document, String base64) {
        if (base64 == null || base64.isBlank()) {
            return;
        }
        try (InputStream inputStream =
                new ByteArrayInputStream(Base64.getDecoder().decode(base64))) {
            document.getDocumentCatalog().setMetadata(new PDMetadata(document, inputStream));
        } catch (IllegalArgumentException | IOException ex) {
            log.debug("Failed to apply XMP metadata: {}", ex.getMessage());
        }
    }
    // Renders a PDF calendar date as an ISO-8601 instant string (null-safe).
    private String formatCalendar(Calendar calendar) {
        return calendar == null ? null : calendar.toInstant().toString();
    }
    // Parses an ISO-8601 instant; unparseable input yields empty and a warning.
    private Optional<Instant> parseInstant(String value) {
        try {
            return Optional.of(Instant.parse(value));
        } catch (DateTimeParseException ex) {
            log.warn("Failed to parse instant '{}': {}", value, ex.getMessage());
            return Optional.empty();
        }
    }
    // Converts an Instant to a UTC Calendar as required by PDDocumentInformation setters.
    private Calendar toCalendar(Instant instant) {
        Calendar utcCalendar = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
        utcCalendar.setTimeInMillis(instant.toEpochMilli());
        return utcCalendar;
    }
}

View File

@ -0,0 +1,308 @@
package stirling.software.SPDF.service.pdfjson;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.function.Consumer;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile;
import com.fasterxml.jackson.databind.ObjectMapper;
import lombok.Data;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.api.PdfJsonConversionProgress;
import stirling.software.SPDF.model.json.PdfJsonAnnotation;
import stirling.software.SPDF.model.json.PdfJsonCosValue;
import stirling.software.SPDF.model.json.PdfJsonDocumentMetadata;
import stirling.software.SPDF.model.json.PdfJsonFont;
import stirling.software.SPDF.model.json.PdfJsonImageElement;
import stirling.software.SPDF.model.json.PdfJsonPage;
import stirling.software.SPDF.model.json.PdfJsonPageDimension;
import stirling.software.SPDF.model.json.PdfJsonStream;
import stirling.software.SPDF.model.json.PdfJsonTextElement;
import stirling.software.common.service.CustomPDFDocumentFactory;
import stirling.software.common.service.TaskManager;
import stirling.software.common.util.ExceptionUtils;
/**
* Service for lazy loading PDF pages. Caches PDF documents and extracts pages on-demand to reduce
* memory usage for large PDFs.
*/
@Service
@Slf4j
@RequiredArgsConstructor
public class PdfLazyLoadingService {
private final CustomPDFDocumentFactory pdfDocumentFactory;
private final ObjectMapper objectMapper;
private final TaskManager taskManager;
private final PdfJsonMetadataService metadataService;
private final PdfJsonImageService imageService;
/** Cache for storing PDDocuments for lazy page loading. Key is jobId. */
private final Map<String, CachedPdfDocument> documentCache = new ConcurrentHashMap<>();
    /**
     * Stores PDF file bytes for lazy page loading. Each page is extracted on-demand by re-loading
     * the PDF from bytes.
     */
    @Data
    private static class CachedPdfDocument {
        // Raw PDF bytes; every lazy page request re-parses the document from this array.
        private final byte[] pdfBytes;
        // Document-level metadata captured at extraction time (fonts, page dimensions, flags).
        private final PdfJsonDocumentMetadata metadata;
        // Creation time in epoch millis; presumably consulted by the cleanup scheduling logic
        // (scheduleDocumentCleanup) — TODO confirm, that code is outside this view.
        private final long timestamp;
        public CachedPdfDocument(byte[] pdfBytes, PdfJsonDocumentMetadata metadata) {
            this.pdfBytes = pdfBytes;
            this.metadata = metadata;
            this.timestamp = System.currentTimeMillis();
        }
    }
    /**
     * Extracts document metadata, fonts, and page dimensions without page content. Caches the PDF
     * bytes for subsequent page requests.
     *
     * @param file The uploaded PDF file
     * @param jobId The job ID for caching; when null, no caching or progress reporting occurs
     * @param fonts Font map; this method only reads it to serialize font entries — it is not
     *     populated here (NOTE(review): javadoc previously claimed it would be; confirm the
     *     caller populates it beforehand)
     * @param pageFontResources Page font resources map; not referenced in this method
     *     (NOTE(review): confirm whether the parameter is still needed)
     * @return Serialized metadata JSON
     * @throws IOException If extraction fails
     */
    public byte[] extractDocumentMetadata(
            MultipartFile file,
            String jobId,
            Map<String, PdfJsonFont> fonts,
            Map<Integer, Map<PDFont, String>> pageFontResources)
            throws IOException {
        if (file == null) {
            throw ExceptionUtils.createNullArgumentException("fileInput");
        }
        // With a jobId, progress is both logged and forwarded to the TaskManager; otherwise no-op.
        Consumer<PdfJsonConversionProgress> progress =
                jobId != null
                        ? (p) -> {
                            log.info(
                                    "Progress: [{}%] {} - {}{}",
                                    p.getPercent(),
                                    p.getStage(),
                                    p.getMessage(),
                                    (p.getCurrent() != null && p.getTotal() != null)
                                            ? String.format(
                                                    " (%d/%d)", p.getCurrent(), p.getTotal())
                                            : "");
                            reportProgressToTaskManager(jobId, p);
                        }
                        : (p) -> {};
        // Read PDF bytes once for processing and caching
        byte[] pdfBytes = file.getBytes();
        try (PDDocument document = pdfDocumentFactory.load(pdfBytes, true)) {
            // NOTE(review): totalPages is currently unused in this method.
            int totalPages = document.getNumberOfPages();
            // Build metadata response
            progress.accept(PdfJsonConversionProgress.of(90, "metadata", "Extracting metadata"));
            PdfJsonDocumentMetadata docMetadata = new PdfJsonDocumentMetadata();
            docMetadata.setMetadata(metadataService.extractMetadata(document));
            docMetadata.setXmpMetadata(metadataService.extractXmpMetadata(document));
            // Signals to the client that images must be fetched per page, not from this payload.
            docMetadata.setLazyImages(Boolean.TRUE);
            // Deterministic font order: sort by uid, nulls last.
            List<PdfJsonFont> serializedFonts = new ArrayList<>(fonts.values());
            serializedFonts.sort(
                    Comparator.comparing(
                            PdfJsonFont::getUid, Comparator.nullsLast(Comparator.naturalOrder())));
            docMetadata.setFonts(serializedFonts);
            // Extract page dimensions
            List<PdfJsonPageDimension> pageDimensions = new ArrayList<>();
            int pageIndex = 0;
            for (PDPage page : document.getPages()) {
                PdfJsonPageDimension dim = new PdfJsonPageDimension();
                dim.setPageNumber(pageIndex + 1);
                PDRectangle mediaBox = page.getMediaBox();
                dim.setWidth(mediaBox.getWidth());
                dim.setHeight(mediaBox.getHeight());
                dim.setRotation(page.getRotation());
                pageDimensions.add(dim);
                pageIndex++;
            }
            docMetadata.setPageDimensions(pageDimensions);
            // Cache PDF bytes and metadata for lazy page loading
            if (jobId != null) {
                CachedPdfDocument cached = new CachedPdfDocument(pdfBytes, docMetadata);
                documentCache.put(jobId, cached);
                log.info(
                        "Cached PDF bytes ({} bytes) for lazy loading, jobId: {}",
                        pdfBytes.length,
                        jobId);
                // Schedule cleanup after 30 minutes
                scheduleDocumentCleanup(jobId);
            }
            progress.accept(
                    PdfJsonConversionProgress.of(100, "complete", "Metadata extraction complete"));
            return objectMapper.writeValueAsBytes(docMetadata);
        }
    }
/**
* Extracts a single page from cached PDF bytes. Re-loads the PDF for each request.
*
* @param jobId The job ID
* @param pageNumber The page number (1-indexed)
* @param serializeCosValue Function to serialize COS values
* @param extractContentStreams Function to extract content streams
* @param filterImageXObjectsFromResources Function to filter image XObjects
* @param extractText Function to extract text elements for the page
* @param extractAnnotations Function to extract annotations for the page
* @return Serialized page JSON
* @throws IOException If extraction fails
*/
public byte[] extractSinglePage(
String jobId,
int pageNumber,
java.util.function.Function<COSBase, PdfJsonCosValue> serializeCosValue,
java.util.function.Function<PDPage, List<PdfJsonStream>> extractContentStreams,
java.util.function.Function<COSBase, COSBase> filterImageXObjectsFromResources,
java.util.function.BiFunction<PDDocument, Integer, List<PdfJsonTextElement>>
extractText,
java.util.function.BiFunction<PDDocument, Integer, List<PdfJsonAnnotation>>
extractAnnotations)
throws IOException {
CachedPdfDocument cached = documentCache.get(jobId);
if (cached == null) {
throw new IllegalArgumentException("No cached document found for jobId: " + jobId);
}
int pageIndex = pageNumber - 1;
int totalPages = cached.getMetadata().getPageDimensions().size();
if (pageIndex < 0 || pageIndex >= totalPages) {
throw new IllegalArgumentException(
"Page number " + pageNumber + " out of range (1-" + totalPages + ")");
}
log.debug("Loading PDF from bytes to extract page {} (jobId: {})", pageNumber, jobId);
// Re-load PDF from cached bytes and extract the single page
try (PDDocument document = pdfDocumentFactory.load(cached.getPdfBytes(), true)) {
PDPage page = document.getPage(pageIndex);
PdfJsonPage pageModel = new PdfJsonPage();
pageModel.setPageNumber(pageNumber);
PDRectangle mediaBox = page.getMediaBox();
pageModel.setWidth(mediaBox.getWidth());
pageModel.setHeight(mediaBox.getHeight());
pageModel.setRotation(page.getRotation());
// Extract text on-demand
pageModel.setTextElements(extractText.apply(document, pageNumber));
// Extract annotations on-demand
pageModel.setAnnotations(extractAnnotations.apply(document, pageNumber));
// Extract images on-demand
List<PdfJsonImageElement> images =
imageService.extractImagesForPage(document, page, pageNumber);
pageModel.setImageElements(images);
// Extract resources and content streams
COSBase resourcesBase = page.getCOSObject().getDictionaryObject(COSName.RESOURCES);
COSBase filteredResources = filterImageXObjectsFromResources.apply(resourcesBase);
pageModel.setResources(serializeCosValue.apply(filteredResources));
pageModel.setContentStreams(extractContentStreams.apply(page));
log.debug(
"Extracted page {} (text: {}, images: {}, annotations: {}) for jobId: {}",
pageNumber,
pageModel.getTextElements().size(),
images.size(),
pageModel.getAnnotations().size(),
jobId);
return objectMapper.writeValueAsBytes(pageModel);
}
}
/** Clears a cached document. */
public void clearCachedDocument(String jobId) {
CachedPdfDocument cached = documentCache.remove(jobId);
if (cached != null) {
log.info(
"Removed cached PDF bytes ({} bytes) for jobId: {}",
cached.getPdfBytes().length,
jobId);
}
}
/** Schedules automatic cleanup of cached documents after 30 minutes. */
private void scheduleDocumentCleanup(String jobId) {
new Thread(
() -> {
try {
Thread.sleep(TimeUnit.MINUTES.toMillis(30));
clearCachedDocument(jobId);
log.info("Auto-cleaned cached document for jobId: {}", jobId);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
})
.start();
}
/**
* Report progress to TaskManager for async jobs
*
* @param jobId The job ID
* @param progress The progress update
*/
private void reportProgressToTaskManager(String jobId, PdfJsonConversionProgress progress) {
try {
log.info(
"Reporting progress for job {}: {}% - {}",
jobId, progress.getPercent(), progress.getStage());
String note;
if (progress.getCurrent() != null && progress.getTotal() != null) {
note =
String.format(
"[%d%%] %s: %s (%d/%d)",
progress.getPercent(),
progress.getStage(),
progress.getMessage(),
progress.getCurrent(),
progress.getTotal());
} else {
note =
String.format(
"[%d%%] %s: %s",
progress.getPercent(), progress.getStage(), progress.getMessage());
}
boolean added = taskManager.addNote(jobId, note);
if (!added) {
log.warn("Failed to add note - job {} not found in TaskManager", jobId);
} else {
log.info("Successfully added progress note for job {}: {}", jobId, note);
}
} catch (Exception e) {
log.error("Exception reporting progress for job {}: {}", jobId, e.getMessage(), e);
}
}
}

View File

@ -4437,6 +4437,32 @@
"errors": { "errors": {
"invalidJson": "Unable to read the JSON file. Ensure it was generated by the PDF to JSON tool.", "invalidJson": "Unable to read the JSON file. Ensure it was generated by the PDF to JSON tool.",
"pdfConversion": "Unable to convert the edited JSON back into a PDF." "pdfConversion": "Unable to convert the edited JSON back into a PDF."
},
"options": {
"autoScaleText": {
"title": "Auto-scale text to fit boxes",
"description": "Automatically scales text horizontally to fit within its original bounding box when font rendering differs from PDF."
}
},
"disclaimer": {
"heading": "Preview limitations",
"textFocus": "This workspace focuses on editing text and repositioning embedded images. Complex page artwork, form widgets, and layered graphics are preserved for export but are not fully editable here.",
"previewVariance": "Some visuals (such as table borders, shapes, or annotation appearances) may not display exactly in the preview. The exported PDF keeps the original drawing commands whenever possible.",
"alpha": "This alpha viewer is still evolving—certain fonts, colours, transparency effects, and layout details may shift slightly. Please double-check the generated PDF before sharing."
},
"stages": {
"uploading": "Uploading",
"initializing": "Initializing",
"loading": "Loading",
"normalizing": "Normalizing",
"parsing": "Parsing",
"fonts": "Fonts",
"text": "Text Extraction",
"images": "Images",
"annotations": "Annotations",
"metadata": "Metadata",
"serializing": "Finalizing",
"complete": "Complete"
} }
}, },
"workspace": { "workspace": {

View File

@ -11,8 +11,10 @@ import {
FileButton, FileButton,
Group, Group,
Pagination, Pagination,
Progress,
ScrollArea, ScrollArea,
Stack, Stack,
Switch,
Text, Text,
Title, Title,
} from '@mantine/core'; } from '@mantine/core';
@ -32,6 +34,7 @@ import {
PdfJsonEditorViewData, PdfJsonEditorViewData,
PdfJsonFont, PdfJsonFont,
PdfJsonPage, PdfJsonPage,
ConversionProgress,
} from '@app/tools/pdfJsonEditor/pdfJsonEditorTypes'; } from '@app/tools/pdfJsonEditor/pdfJsonEditorTypes';
import { getImageBounds, pageDimensions } from '@app/tools/pdfJsonEditor/pdfJsonEditorUtils'; import { getImageBounds, pageDimensions } from '@app/tools/pdfJsonEditor/pdfJsonEditorUtils';
@ -205,6 +208,9 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
const [activeImageId, setActiveImageId] = useState<string | null>(null); const [activeImageId, setActiveImageId] = useState<string | null>(null);
const [fontFamilies, setFontFamilies] = useState<Map<string, string>>(new Map()); const [fontFamilies, setFontFamilies] = useState<Map<string, string>>(new Map());
const [textGroupsExpanded, setTextGroupsExpanded] = useState(false); const [textGroupsExpanded, setTextGroupsExpanded] = useState(false);
const [autoScaleText, setAutoScaleText] = useState(true);
const [textScales, setTextScales] = useState<Map<string, number>>(new Map());
const measurementKeyRef = useRef<string>('');
const containerRef = useRef<HTMLDivElement | null>(null); const containerRef = useRef<HTMLDivElement | null>(null);
const editorRefs = useRef<Map<string, HTMLDivElement>>(new Map()); const editorRefs = useRef<Map<string, HTMLDivElement>>(new Map());
const caretOffsetsRef = useRef<Map<string, number>>(new Map()); const caretOffsetsRef = useRef<Map<string, number>>(new Map());
@ -220,6 +226,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
errorMessage, errorMessage,
isGeneratingPdf, isGeneratingPdf,
isConverting, isConverting,
conversionProgress,
hasChanges, hasChanges,
onLoadJson, onLoadJson,
onSelectPage, onSelectPage,
@ -562,8 +569,73 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
setActiveGroupId(null); setActiveGroupId(null);
setEditingGroupId(null); setEditingGroupId(null);
setActiveImageId(null); setActiveImageId(null);
setTextScales(new Map());
measurementKeyRef.current = '';
}, [selectedPage]); }, [selectedPage]);
// Measure text widths once per page/configuration and apply static scaling
useLayoutEffect(() => {
if (!autoScaleText || visibleGroups.length === 0) {
return;
}
// Create a stable key for this measurement configuration
const currentKey = `${selectedPage}-${fontFamilies.size}-${autoScaleText}`;
// Skip if we've already measured for this configuration
if (measurementKeyRef.current === currentKey) {
return;
}
const measureTextScales = () => {
const newScales = new Map<string, number>();
visibleGroups.forEach((group) => {
// Skip groups that are being edited
if (editingGroupId === group.id) {
return;
}
const element = document.querySelector<HTMLElement>(`[data-text-group="${group.id}"]`);
if (!element) {
return;
}
const textSpan = element.querySelector<HTMLSpanElement>('span[data-text-content]');
if (!textSpan) {
return;
}
// Temporarily remove any existing transform to get natural width
const originalTransform = textSpan.style.transform;
textSpan.style.transform = 'none';
const bounds = toCssBounds(currentPage, pageHeight, scale, group.bounds);
const containerWidth = bounds.width;
const textWidth = textSpan.getBoundingClientRect().width;
// Restore original transform
textSpan.style.transform = originalTransform;
// Only scale if text overflows by more than 2%
if (textWidth > 0 && textWidth > containerWidth * 1.02) {
const scaleX = Math.max(containerWidth / textWidth, 0.5); // Min 50% scale
newScales.set(group.id, scaleX);
} else {
newScales.set(group.id, 1);
}
});
// Mark this configuration as measured
measurementKeyRef.current = currentKey;
setTextScales(newScales);
};
// Delay measurement to ensure fonts and layout are ready
const timer = setTimeout(measureTextScales, 150);
return () => clearTimeout(timer);
}, [autoScaleText, visibleGroups, editingGroupId, currentPage, pageHeight, scale, fontFamilies.size, selectedPage]);
useLayoutEffect(() => { useLayoutEffect(() => {
if (!editingGroupId) { if (!editingGroupId) {
return; return;
@ -726,6 +798,27 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
{t('pdfJsonEditor.currentFile', 'Current file: {{name}}', { name: fileName })} {t('pdfJsonEditor.currentFile', 'Current file: {{name}}', { name: fileName })}
</Text> </Text>
)} )}
<Divider my="sm" />
<Group justify="space-between" align="center">
<div>
<Text fw={500} size="sm">
{t('pdfJsonEditor.options.autoScaleText.title', 'Auto-scale text to fit boxes')}
</Text>
<Text size="xs" c="dimmed" mt={4}>
{t(
'pdfJsonEditor.options.autoScaleText.description',
'Automatically scales text horizontally to fit within its original bounding box when font rendering differs from PDF.'
)}
</Text>
</div>
<Switch
size="md"
checked={autoScaleText}
onChange={(event) => setAutoScaleText(event.currentTarget.checked)}
/>
</Group>
</Stack> </Stack>
</Card> </Card>
@ -782,10 +875,39 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
{isConverting && ( {isConverting && (
<Card withBorder radius="md" padding="xl"> <Card withBorder radius="md" padding="xl">
<Stack align="center" gap="md"> <Stack gap="md">
<AutorenewIcon sx={{ fontSize: 48 }} className="animate-spin" /> <Group justify="space-between" align="flex-start">
<Text size="lg" fw={600}> <div style={{ flex: 1 }}>
{t('pdfJsonEditor.converting', 'Converting PDF to editable format...')} <Text size="lg" fw={600} mb="xs">
{conversionProgress
? conversionProgress.message
: t('pdfJsonEditor.converting', 'Converting PDF to editable format...')}
</Text>
{conversionProgress && (
<Group gap="xs">
<Text size="sm" c="dimmed" tt="capitalize">
{t(`pdfJsonEditor.stages.${conversionProgress.stage}`, conversionProgress.stage)}
</Text>
{conversionProgress.current !== undefined &&
conversionProgress.total !== undefined && (
<Text size="sm" c="dimmed">
Page {conversionProgress.current} of {conversionProgress.total}
</Text>
)}
</Group>
)}
</div>
<AutorenewIcon sx={{ fontSize: 36 }} className="animate-spin" />
</Group>
<Progress
value={conversionProgress?.percent || 0}
size="lg"
radius="md"
animated
striped
/>
<Text size="sm" c="dimmed" ta="right">
{conversionProgress?.percent || 0}% complete
</Text> </Text>
</Stack> </Stack>
</Card> </Card>
@ -1105,6 +1227,9 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
); );
} }
const textScale = textScales.get(group.id) ?? 1;
const shouldScale = autoScaleText && textScale < 0.98;
return ( return (
<Box key={group.id} style={containerStyle}> <Box key={group.id} style={containerStyle}>
{renderGroupContainer( {renderGroupContainer(
@ -1112,6 +1237,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
isActive, isActive,
changed, changed,
<div <div
data-text-group={group.id}
style={{ style={{
width: '100%', width: '100%',
minHeight: '100%', minHeight: '100%',
@ -1127,7 +1253,17 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
overflow: 'visible', overflow: 'visible',
}} }}
> >
<span style={{ pointerEvents: 'none' }}>{group.text || '\u00A0'}</span> <span
data-text-content
style={{
pointerEvents: 'none',
display: 'inline-block',
transform: shouldScale ? `scaleX(${textScale})` : undefined,
transformOrigin: 'left center',
}}
>
{group.text || '\u00A0'}
</span>
</div>, </div>,
() => { () => {
setEditingGroupId(group.id); setEditingGroupId(group.id);

View File

@ -27,8 +27,8 @@ export function useProprietaryToolRegistry(): ProprietaryToolRegistry {
"home.pdfJsonEditor.desc", "home.pdfJsonEditor.desc",
"Review and edit Stirling PDF JSON exports with grouped text editing and PDF regeneration" "Review and edit Stirling PDF JSON exports with grouped text editing and PDF regeneration"
), ),
categoryId: ToolCategoryId.ADVANCED_TOOLS, categoryId: ToolCategoryId.RECOMMENDED_TOOLS,
subcategoryId: SubcategoryId.DEVELOPER_TOOLS, subcategoryId: SubcategoryId.GENERAL,
workbench: "custom:pdfJsonEditor", workbench: "custom:pdfJsonEditor",
endpoints: ["json-pdf"], endpoints: ["json-pdf"],
synonyms: getSynonyms(t, "pdfJsonEditor"), synonyms: getSynonyms(t, "pdfJsonEditor"),

View File

@ -13,6 +13,7 @@ import { getFilenameFromHeaders } from '@app/utils/fileResponseUtils';
import { import {
PdfJsonDocument, PdfJsonDocument,
PdfJsonImageElement, PdfJsonImageElement,
PdfJsonPage,
TextGroup, TextGroup,
PdfJsonEditorViewData, PdfJsonEditorViewData,
} from './pdfJsonEditorTypes'; } from './pdfJsonEditorTypes';
@ -68,11 +69,39 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
const [errorMessage, setErrorMessage] = useState<string | null>(null); const [errorMessage, setErrorMessage] = useState<string | null>(null);
const [isGeneratingPdf, setIsGeneratingPdf] = useState(false); const [isGeneratingPdf, setIsGeneratingPdf] = useState(false);
const [isConverting, setIsConverting] = useState(false); const [isConverting, setIsConverting] = useState(false);
const [conversionProgress, setConversionProgress] = useState<{
percent: number;
stage: string;
message: string;
} | null>(null);
// Lazy loading state
const [isLazyMode, setIsLazyMode] = useState(false);
const [cachedJobId, setCachedJobId] = useState<string | null>(null);
const [loadedImagePages, setLoadedImagePages] = useState<Set<number>>(new Set());
const [loadingImagePages, setLoadingImagePages] = useState<Set<number>>(new Set());
const originalImagesRef = useRef<PdfJsonImageElement[][]>([]); const originalImagesRef = useRef<PdfJsonImageElement[][]>([]);
const imagesByPageRef = useRef<PdfJsonImageElement[][]>([]);
const autoLoadKeyRef = useRef<string | null>(null); const autoLoadKeyRef = useRef<string | null>(null);
const loadRequestIdRef = useRef(0); const loadRequestIdRef = useRef(0);
const latestPdfRequestIdRef = useRef<number | null>(null); const latestPdfRequestIdRef = useRef<number | null>(null);
const loadedDocumentRef = useRef<PdfJsonDocument | null>(null);
const loadedImagePagesRef = useRef<Set<number>>(new Set());
const loadingImagePagesRef = useRef<Set<number>>(new Set());
// Keep ref in sync with state for access in async callbacks
useEffect(() => {
loadedDocumentRef.current = loadedDocument;
}, [loadedDocument]);
useEffect(() => {
loadedImagePagesRef.current = new Set(loadedImagePages);
}, [loadedImagePages]);
useEffect(() => {
loadingImagePagesRef.current = new Set(loadingImagePages);
}, [loadingImagePages]);
const dirtyPages = useMemo( const dirtyPages = useMemo(
() => getDirtyPages(groupsByPage, imagesByPage, originalImagesRef.current), () => getDirtyPages(groupsByPage, imagesByPage, originalImagesRef.current),
@ -88,18 +117,134 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
setGroupsByPage([]); setGroupsByPage([]);
setImagesByPage([]); setImagesByPage([]);
originalImagesRef.current = []; originalImagesRef.current = [];
imagesByPageRef.current = [];
setLoadedImagePages(new Set());
setLoadingImagePages(new Set());
loadedImagePagesRef.current = new Set();
loadingImagePagesRef.current = new Set();
setSelectedPage(0); setSelectedPage(0);
return; return;
} }
const cloned = deepCloneDocument(document); const cloned = deepCloneDocument(document);
const groups = groupDocumentText(cloned); const groups = groupDocumentText(cloned);
const images = extractDocumentImages(cloned); const images = extractDocumentImages(cloned);
originalImagesRef.current = images.map((page) => page.map(cloneImageElement)); const originalImages = images.map((page) => page.map(cloneImageElement));
originalImagesRef.current = originalImages;
imagesByPageRef.current = images.map((page) => page.map(cloneImageElement));
const initialLoaded = new Set<number>();
originalImages.forEach((pageImages, index) => {
if (pageImages.length > 0) {
initialLoaded.add(index);
}
});
setGroupsByPage(groups); setGroupsByPage(groups);
setImagesByPage(images); setImagesByPage(images);
setLoadedImagePages(initialLoaded);
setLoadingImagePages(new Set());
loadedImagePagesRef.current = new Set(initialLoaded);
loadingImagePagesRef.current = new Set();
setSelectedPage(0); setSelectedPage(0);
}, []); }, []);
// Load images for a page in lazy mode
const loadImagesForPage = useCallback(
async (pageIndex: number) => {
if (!isLazyMode) {
return;
}
if (!cachedJobId) {
console.log('[loadImagesForPage] No cached jobId, skipping');
return;
}
if (
loadedImagePagesRef.current.has(pageIndex) ||
loadingImagePagesRef.current.has(pageIndex)
) {
return;
}
loadingImagePagesRef.current.add(pageIndex);
setLoadingImagePages((prev) => {
const next = new Set(prev);
next.add(pageIndex);
return next;
});
const pageNumber = pageIndex + 1;
const start = performance.now();
try {
const response = await apiClient.get(
`/api/v1/convert/pdf/json/page/${cachedJobId}/${pageNumber}`,
{
responseType: 'json',
},
);
const pageData = response.data as PdfJsonPage;
const normalizedImages = (pageData.imageElements ?? []).map(cloneImageElement);
if (imagesByPageRef.current.length <= pageIndex) {
imagesByPageRef.current.length = pageIndex + 1;
}
imagesByPageRef.current[pageIndex] = normalizedImages.map(cloneImageElement);
setLoadedDocument((prevDoc) => {
if (!prevDoc || !prevDoc.pages) {
return prevDoc;
}
const nextPages = [...prevDoc.pages];
const existingPage = nextPages[pageIndex] ?? {};
nextPages[pageIndex] = {
...existingPage,
imageElements: normalizedImages.map(cloneImageElement),
};
return {
...prevDoc,
pages: nextPages,
};
});
setImagesByPage((prev) => {
const next = [...prev];
while (next.length <= pageIndex) {
next.push([]);
}
next[pageIndex] = normalizedImages.map(cloneImageElement);
return next;
});
if (originalImagesRef.current.length <= pageIndex) {
originalImagesRef.current.length = pageIndex + 1;
}
originalImagesRef.current[pageIndex] = normalizedImages.map(cloneImageElement);
setLoadedImagePages((prev) => {
const next = new Set(prev);
next.add(pageIndex);
return next;
});
loadedImagePagesRef.current.add(pageIndex);
console.log(
`[loadImagesForPage] Loaded ${normalizedImages.length} images for page ${pageNumber} in ${(
performance.now() - start
).toFixed(2)}ms`,
);
} catch (error) {
console.error(`[loadImagesForPage] Failed to load images for page ${pageNumber}:`, error);
} finally {
loadingImagePagesRef.current.delete(pageIndex);
setLoadingImagePages((prev) => {
const next = new Set(prev);
next.delete(pageIndex);
return next;
});
}
},
[isLazyMode, cachedJobId],
);
const handleLoadFile = useCallback( const handleLoadFile = useCallback(
async (file: File | null) => { async (file: File | null) => {
if (!file) { if (!file) {
@ -113,39 +258,200 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
const isPdf = file.type === 'application/pdf' || file.name.toLowerCase().endsWith('.pdf'); const isPdf = file.type === 'application/pdf' || file.name.toLowerCase().endsWith('.pdf');
try { try {
let parsed: PdfJsonDocument; let parsed: PdfJsonDocument | null = null;
let shouldUseLazyMode = false;
let pendingJobId: string | null = null;
setErrorMessage(null); setErrorMessage(null);
if (isPdf) { if (isPdf) {
latestPdfRequestIdRef.current = requestId; latestPdfRequestIdRef.current = requestId;
setIsConverting(true); setIsConverting(true);
setConversionProgress({
percent: 0,
stage: 'uploading',
message: 'Uploading PDF file to server...',
});
const formData = new FormData(); const formData = new FormData();
formData.append('fileInput', file); formData.append('fileInput', file);
const response = await apiClient.post(CONVERSION_ENDPOINTS['pdf-json'], formData, { console.log('Sending conversion request with async=true');
responseType: 'blob', const response = await apiClient.post(
`${CONVERSION_ENDPOINTS['pdf-json']}?async=true`,
formData,
{
responseType: 'json',
},
);
console.log('Conversion response:', response.data);
const jobId = response.data.jobId;
if (!jobId) {
console.error('No job ID in response:', response.data);
throw new Error('No job ID received from server');
}
pendingJobId = jobId;
console.log('Got job ID:', jobId);
setConversionProgress({
percent: 3,
stage: 'processing',
message: 'Starting conversion...',
}); });
const jsonText = await response.data.text(); let jobComplete = false;
parsed = JSON.parse(jsonText) as PdfJsonDocument; let attempts = 0;
const maxAttempts = 600;
while (!jobComplete && attempts < maxAttempts) {
await new Promise((resolve) => setTimeout(resolve, 1000));
attempts += 1;
try {
const statusResponse = await apiClient.get(`/api/v1/general/job/${jobId}`);
const jobStatus = statusResponse.data;
console.log(`Job status (attempt ${attempts}):`, jobStatus);
if (jobStatus.notes && jobStatus.notes.length > 0) {
const lastNote = jobStatus.notes[jobStatus.notes.length - 1];
console.log('Latest note:', lastNote);
const matchWithCount = lastNote.match(
/\[(\d+)%\]\s+(\w+):\s+(.+?)\s+\((\d+)\/(\d+)\)/,
);
if (matchWithCount) {
const percent = parseInt(matchWithCount[1], 10);
const stage = matchWithCount[2];
const message = matchWithCount[3];
const current = parseInt(matchWithCount[4], 10);
const total = parseInt(matchWithCount[5], 10);
setConversionProgress({
percent,
stage,
message,
current,
total,
});
} else {
const match = lastNote.match(/\[(\d+)%\]\s+(\w+):\s+(.+)/);
if (match) {
const percent = parseInt(match[1], 10);
const stage = match[2];
const message = match[3];
setConversionProgress({
percent,
stage,
message,
});
}
}
} else if (jobStatus.progress !== undefined) {
const percent = Math.min(Math.max(jobStatus.progress, 0), 100);
setConversionProgress({
percent,
stage: jobStatus.stage || 'processing',
message: jobStatus.note || 'Converting PDF to JSON...',
});
}
if (jobStatus.complete) {
if (jobStatus.error) {
console.error('Job failed:', jobStatus.error);
throw new Error(jobStatus.error);
}
console.log('Job completed, retrieving JSON result...');
jobComplete = true;
const resultResponse = await apiClient.get(
`/api/v1/general/job/${jobId}/result`,
{
responseType: 'blob',
},
);
const jsonText = await resultResponse.data.text();
const result = JSON.parse(jsonText);
if (!Array.isArray(result.pages)) {
console.error('Conversion result missing page array:', result);
throw new Error(
'PDF conversion result did not include page data. Please update the server.',
);
}
const docResult = result as PdfJsonDocument;
parsed = {
...docResult,
pages: docResult.pages ?? [],
};
shouldUseLazyMode = Boolean(docResult.lazyImages);
pendingJobId = shouldUseLazyMode ? jobId : null;
setConversionProgress(null);
} else {
console.log('Job not complete yet, continuing to poll...');
}
} catch (pollError: any) {
console.error('Error polling job status:', pollError);
console.error('Poll error details:', {
status: pollError?.response?.status,
data: pollError?.response?.data,
message: pollError?.message,
});
if (pollError?.response?.status === 404) {
throw new Error('Job not found on server');
}
}
}
if (!jobComplete) {
throw new Error('Conversion timed out');
}
if (!parsed) {
throw new Error('Conversion did not return JSON content');
}
} else { } else {
const content = await file.text(); const content = await file.text();
parsed = JSON.parse(content) as PdfJsonDocument; const docResult = JSON.parse(content) as PdfJsonDocument;
parsed = {
...docResult,
pages: docResult.pages ?? [],
};
shouldUseLazyMode = false;
pendingJobId = null;
} }
setConversionProgress(null);
if (loadRequestIdRef.current !== requestId) { if (loadRequestIdRef.current !== requestId) {
return; return;
} }
if (!parsed) {
throw new Error('Failed to parse PDF JSON document');
}
console.log(
`[PdfJsonEditor] Document loaded. Lazy image mode: ${shouldUseLazyMode}, Pages: ${
parsed.pages?.length || 0
}`,
);
setLoadedDocument(parsed); setLoadedDocument(parsed);
resetToDocument(parsed); resetToDocument(parsed);
setIsLazyMode(shouldUseLazyMode);
setCachedJobId(shouldUseLazyMode ? pendingJobId : null);
setFileName(file.name); setFileName(file.name);
setErrorMessage(null); setErrorMessage(null);
autoLoadKeyRef.current = fileKey; autoLoadKeyRef.current = fileKey;
} catch (error) { } catch (error: any) {
console.error('Failed to load file', error); console.error('Failed to load file', error);
console.error('Error details:', {
message: error?.message,
response: error?.response?.data,
stack: error?.stack,
});
if (loadRequestIdRef.current !== requestId) { if (loadRequestIdRef.current !== requestId) {
return; return;
@ -155,15 +461,17 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
resetToDocument(null); resetToDocument(null);
if (isPdf) { if (isPdf) {
setErrorMessage( const errorMsg =
t('pdfJsonEditor.conversionFailed', 'Failed to convert PDF. Please try again.') error?.message ||
); t('pdfJsonEditor.conversionFailed', 'Failed to convert PDF. Please try again.');
setErrorMessage(errorMsg);
console.error('Setting error message:', errorMsg);
} else { } else {
setErrorMessage( setErrorMessage(
t( t(
'pdfJsonEditor.errors.invalidJson', 'pdfJsonEditor.errors.invalidJson',
'Unable to read the JSON file. Ensure it was generated by the PDF to JSON tool.' 'Unable to read the JSON file. Ensure it was generated by the PDF to JSON tool.',
) ),
); );
} }
} finally { } finally {
@ -172,12 +480,16 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
} }
} }
}, },
[resetToDocument, t] [resetToDocument, t],
); );
const handleSelectPage = useCallback((pageIndex: number) => { const handleSelectPage = useCallback((pageIndex: number) => {
setSelectedPage(pageIndex); setSelectedPage(pageIndex);
}, []); // Trigger lazy loading for images on the selected page
if (isLazyMode) {
void loadImagesForPage(pageIndex);
}
}, [isLazyMode, loadImagesForPage]);
const handleGroupTextChange = useCallback((pageIndex: number, groupId: string, value: string) => { const handleGroupTextChange = useCallback((pageIndex: number, groupId: string, value: string) => {
setGroupsByPage((previous) => setGroupsByPage((previous) =>
@ -195,55 +507,63 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
imageId: string, imageId: string,
next: { left: number; bottom: number; width: number; height: number; transform: number[] }, next: { left: number; bottom: number; width: number; height: number; transform: number[] },
) => { ) => {
setImagesByPage((previous) => setImagesByPage((previous) => {
previous.map((images, idx) => { const current = previous[pageIndex] ?? [];
if (idx !== pageIndex) { let changed = false;
return images; const updatedPage = current.map((image) => {
if ((image.id ?? '') !== imageId) {
return image;
} }
let changed = false; const originalTransform = image.transform ?? originalImagesRef.current[pageIndex]?.find((base) => (base.id ?? '') === imageId)?.transform;
const updated = images.map((image) => { const scaleXSign = originalTransform && originalTransform.length >= 6 ? Math.sign(originalTransform[0]) || 1 : 1;
if ((image.id ?? '') !== imageId) { const scaleYSign = originalTransform && originalTransform.length >= 6 ? Math.sign(originalTransform[3]) || 1 : 1;
return image; const right = next.left + next.width;
} const top = next.bottom + next.height;
const originalTransform = image.transform ?? originalImagesRef.current[idx]?.find((base) => (base.id ?? '') === imageId)?.transform; const updatedImage: PdfJsonImageElement = {
const scaleXSign = originalTransform && originalTransform.length >= 6 ? Math.sign(originalTransform[0]) || 1 : 1; ...image,
const scaleYSign = originalTransform && originalTransform.length >= 6 ? Math.sign(originalTransform[3]) || 1 : 1; x: next.left,
const right = next.left + next.width; y: next.bottom,
const top = next.bottom + next.height; left: next.left,
const updatedImage: PdfJsonImageElement = { bottom: next.bottom,
...image, right,
x: next.left, top,
y: next.bottom, width: next.width,
left: next.left, height: next.height,
bottom: next.bottom, transform: scaleXSign < 0 || scaleYSign < 0
right, ? [
top, next.width * scaleXSign,
width: next.width, 0,
height: next.height, 0,
transform: scaleXSign < 0 || scaleYSign < 0 ? [ next.height * scaleYSign,
next.width * scaleXSign, next.left,
0, scaleYSign >= 0 ? next.bottom : next.bottom + next.height,
0, ]
next.height * scaleYSign, : null,
next.left, };
scaleYSign >= 0 ? next.bottom : next.bottom + next.height,
] : null,
};
const isSame = const isSame =
Math.abs(valueOr(image.left, 0) - next.left) < 1e-4 && Math.abs(valueOr(image.left, 0) - next.left) < 1e-4 &&
Math.abs(valueOr(image.bottom, 0) - next.bottom) < 1e-4 && Math.abs(valueOr(image.bottom, 0) - next.bottom) < 1e-4 &&
Math.abs(valueOr(image.width, 0) - next.width) < 1e-4 && Math.abs(valueOr(image.width, 0) - next.width) < 1e-4 &&
Math.abs(valueOr(image.height, 0) - next.height) < 1e-4; Math.abs(valueOr(image.height, 0) - next.height) < 1e-4;
if (!isSame) { if (!isSame) {
changed = true; changed = true;
} }
return updatedImage; return updatedImage;
}); });
return changed ? updated : images;
}), if (!changed) {
); return previous;
}
const nextImages = previous.map((images, idx) => (idx === pageIndex ? updatedPage : images));
if (imagesByPageRef.current.length <= pageIndex) {
imagesByPageRef.current.length = pageIndex + 1;
}
imagesByPageRef.current[pageIndex] = updatedPage.map(cloneImageElement);
return nextImages;
});
}, },
[], [],
); );
@ -253,14 +573,28 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
if (!baseline) { if (!baseline) {
return; return;
} }
setImagesByPage((previous) => setImagesByPage((previous) => {
previous.map((images, idx) => { const current = previous[pageIndex] ?? [];
if (idx !== pageIndex) { let changed = false;
return images; const updatedPage = current.map((image) => {
if ((image.id ?? '') !== imageId) {
return image;
} }
return images.map((image) => ((image.id ?? '') === imageId ? cloneImageElement(baseline) : image)); changed = true;
}), return cloneImageElement(baseline);
); });
if (!changed) {
return previous;
}
const nextImages = previous.map((images, idx) => (idx === pageIndex ? updatedPage : images));
if (imagesByPageRef.current.length <= pageIndex) {
imagesByPageRef.current.length = pageIndex + 1;
}
imagesByPageRef.current[pageIndex] = updatedPage.map(cloneImageElement);
return nextImages;
});
}, []); }, []);
const handleResetEdits = useCallback(() => { const handleResetEdits = useCallback(() => {
@ -279,7 +613,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
const updatedDocument = restoreGlyphElements( const updatedDocument = restoreGlyphElements(
loadedDocument, loadedDocument,
groupsByPage, groupsByPage,
imagesByPage, imagesByPageRef.current,
originalImagesRef.current, originalImagesRef.current,
); );
const baseName = sanitizeBaseName(fileName || loadedDocument.metadata?.title || undefined); const baseName = sanitizeBaseName(fileName || loadedDocument.metadata?.title || undefined);
@ -287,7 +621,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
document: updatedDocument, document: updatedDocument,
filename: `${baseName}.json`, filename: `${baseName}.json`,
}; };
}, [fileName, groupsByPage, imagesByPage, loadedDocument]); }, [fileName, groupsByPage, loadedDocument]);
const handleDownloadJson = useCallback(() => { const handleDownloadJson = useCallback(() => {
const payload = buildPayload(); const payload = buildPayload();
@ -306,20 +640,129 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
}, [buildPayload, onComplete]); }, [buildPayload, onComplete]);
const handleGeneratePdf = useCallback(async () => { const handleGeneratePdf = useCallback(async () => {
const payload = buildPayload();
if (!payload) {
return;
}
const { document, filename } = payload;
const serialized = JSON.stringify(document, null, 2);
const jsonFile = new File([serialized], filename, { type: 'application/json' });
const formData = new FormData();
formData.append('fileInput', jsonFile);
try { try {
setIsGeneratingPdf(true); setIsGeneratingPdf(true);
const ensureImagesForPages = async (pageIndices: number[]) => {
const uniqueIndices = Array.from(new Set(pageIndices)).filter((index) => index >= 0);
if (uniqueIndices.length === 0) {
return;
}
for (const index of uniqueIndices) {
if (!loadedImagePagesRef.current.has(index)) {
await loadImagesForPage(index);
}
}
const maxWaitTime = 15000;
const pollInterval = 150;
const startWait = Date.now();
while (Date.now() - startWait < maxWaitTime) {
const allLoaded = uniqueIndices.every(
(index) =>
loadedImagePagesRef.current.has(index) &&
imagesByPageRef.current[index] !== undefined,
);
const anyLoading = uniqueIndices.some((index) =>
loadingImagePagesRef.current.has(index),
);
if (allLoaded && !anyLoading) {
return;
}
await new Promise((resolve) => setTimeout(resolve, pollInterval));
}
const missing = uniqueIndices.filter(
(index) => !loadedImagePagesRef.current.has(index),
);
if (missing.length > 0) {
throw new Error(
`Failed to load images for pages ${missing.map((i) => i + 1).join(', ')}`,
);
}
};
const currentDoc = loadedDocumentRef.current;
const totalPages = currentDoc?.pages?.length ?? 0;
const dirtyPageIndices = dirtyPages
.map((isDirty, index) => (isDirty ? index : -1))
.filter((index) => index >= 0);
const canUseIncremental =
isLazyMode &&
cachedJobId &&
dirtyPageIndices.length > 0 &&
dirtyPageIndices.length < totalPages;
if (canUseIncremental) {
await ensureImagesForPages(dirtyPageIndices);
try {
const payload = buildPayload();
if (!payload) {
return;
}
const { document, filename } = payload;
const dirtyPageSet = new Set(dirtyPageIndices);
const partialPages =
document.pages?.filter((_, index) => dirtyPageSet.has(index)) ?? [];
const partialDocument: PdfJsonDocument = {
metadata: document.metadata,
xmpMetadata: document.xmpMetadata,
fonts: document.fonts,
lazyImages: true,
pages: partialPages,
};
const baseName = sanitizeBaseName(filename).replace(/-edited$/u, '');
const expectedName = `${baseName || 'document'}.pdf`;
const response = await apiClient.post(
`/api/v1/convert/pdf/json/partial/${cachedJobId}?filename=${encodeURIComponent(expectedName)}`,
partialDocument,
{
responseType: 'blob',
},
);
const contentDisposition = response.headers?.['content-disposition'] ?? '';
const detectedName = getFilenameFromHeaders(contentDisposition);
const downloadName = detectedName || expectedName;
downloadBlob(response.data, downloadName);
if (onComplete) {
const pdfFile = new File([response.data], downloadName, { type: 'application/pdf' });
onComplete([pdfFile]);
}
setErrorMessage(null);
return;
} catch (incrementalError) {
console.warn(
'[handleGeneratePdf] Incremental export failed, falling back to full export',
incrementalError,
);
}
}
if (isLazyMode && totalPages > 0) {
const allPageIndices = Array.from({ length: totalPages }, (_, index) => index);
await ensureImagesForPages(allPageIndices);
}
const payload = buildPayload();
if (!payload) {
return;
}
const { document, filename } = payload;
const serialized = JSON.stringify(document, null, 2);
const jsonFile = new File([serialized], filename, { type: 'application/json' });
const formData = new FormData();
formData.append('fileInput', jsonFile);
const response = await apiClient.post(CONVERSION_ENDPOINTS['json-pdf'], formData, { const response = await apiClient.post(CONVERSION_ENDPOINTS['json-pdf'], formData, {
responseType: 'blob', responseType: 'blob',
}); });
@ -350,7 +793,16 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
} finally { } finally {
setIsGeneratingPdf(false); setIsGeneratingPdf(false);
} }
}, [buildPayload, onComplete, onError, t]); }, [
buildPayload,
cachedJobId,
dirtyPages,
isLazyMode,
loadImagesForPage,
onComplete,
onError,
t,
]);
const viewData = useMemo<PdfJsonEditorViewData>(() => ({ const viewData = useMemo<PdfJsonEditorViewData>(() => ({
document: loadedDocument, document: loadedDocument,
@ -363,6 +815,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
errorMessage, errorMessage,
isGeneratingPdf, isGeneratingPdf,
isConverting, isConverting,
conversionProgress,
hasChanges, hasChanges,
onLoadJson: handleLoadFile, onLoadJson: handleLoadFile,
onSelectPage: handleSelectPage, onSelectPage: handleSelectPage,
@ -390,6 +843,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
hasDocument, hasDocument,
isGeneratingPdf, isGeneratingPdf,
isConverting, isConverting,
conversionProgress,
loadedDocument, loadedDocument,
selectedPage, selectedPage,
]); ]);
@ -397,6 +851,13 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
const latestViewDataRef = useRef<PdfJsonEditorViewData>(viewData); const latestViewDataRef = useRef<PdfJsonEditorViewData>(viewData);
latestViewDataRef.current = viewData; latestViewDataRef.current = viewData;
// Trigger initial image loading in lazy mode
useEffect(() => {
if (isLazyMode && loadedDocument) {
void loadImagesForPage(selectedPage);
}
}, [isLazyMode, loadedDocument, selectedPage, loadImagesForPage]);
useEffect(() => { useEffect(() => {
if (selectedFiles.length === 0) { if (selectedFiles.length === 0) {
autoLoadKeyRef.current = null; autoLoadKeyRef.current = null;
@ -433,11 +894,20 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
setCustomWorkbenchViewData(VIEW_ID, latestViewDataRef.current); setCustomWorkbenchViewData(VIEW_ID, latestViewDataRef.current);
return () => { return () => {
// Clear backend cache if we were using lazy loading
if (cachedJobId) {
console.log(`[PdfJsonEditor] Cleaning up cached document for jobId: ${cachedJobId}`);
apiClient.post(`/api/v1/convert/pdf/json/clear-cache/${cachedJobId}`).catch((error) => {
console.warn('[PdfJsonEditor] Failed to clear cache:', error);
});
}
clearCustomWorkbenchViewData(VIEW_ID); clearCustomWorkbenchViewData(VIEW_ID);
unregisterCustomWorkbenchView(VIEW_ID); unregisterCustomWorkbenchView(VIEW_ID);
setLeftPanelView('toolPicker'); setLeftPanelView('toolPicker');
}; };
}, [ }, [
cachedJobId,
clearCustomWorkbenchViewData, clearCustomWorkbenchViewData,
registerCustomWorkbenchView, registerCustomWorkbenchView,
setCustomWorkbenchViewData, setCustomWorkbenchViewData,

View File

@ -122,6 +122,23 @@ export interface PdfJsonDocument {
xmpMetadata?: string | null; xmpMetadata?: string | null;
fonts?: PdfJsonFont[] | null; fonts?: PdfJsonFont[] | null;
pages?: PdfJsonPage[] | null; pages?: PdfJsonPage[] | null;
lazyImages?: boolean | null;
}
export interface PdfJsonPageDimension {
pageNumber?: number | null;
width?: number | null;
height?: number | null;
rotation?: number | null;
}
export interface PdfJsonDocumentMetadata {
metadata?: PdfJsonMetadata | null;
xmpMetadata?: string | null;
fonts?: PdfJsonFont[] | null;
pageDimensions?: PdfJsonPageDimension[] | null;
formFields?: unknown[] | null;
lazyImages?: boolean | null;
} }
export interface BoundingBox { export interface BoundingBox {
@ -153,6 +170,14 @@ export interface TextGroup {
export const DEFAULT_PAGE_WIDTH = 612; export const DEFAULT_PAGE_WIDTH = 612;
export const DEFAULT_PAGE_HEIGHT = 792; export const DEFAULT_PAGE_HEIGHT = 792;
export interface ConversionProgress {
percent: number;
stage: string;
message: string;
current?: number;
total?: number;
}
export interface PdfJsonEditorViewData { export interface PdfJsonEditorViewData {
document: PdfJsonDocument | null; document: PdfJsonDocument | null;
groupsByPage: TextGroup[][]; groupsByPage: TextGroup[][];
@ -164,6 +189,7 @@ export interface PdfJsonEditorViewData {
errorMessage: string | null; errorMessage: string | null;
isGeneratingPdf: boolean; isGeneratingPdf: boolean;
isConverting: boolean; isConverting: boolean;
conversionProgress: ConversionProgress | null;
hasChanges: boolean; hasChanges: boolean;
onLoadJson: (file: File | null) => Promise<void> | void; onLoadJson: (file: File | null) => Promise<void> | void;
onSelectPage: (pageIndex: number) => void; onSelectPage: (pageIndex: number) => void;

View File

@ -15,6 +15,7 @@ export default defineConfig({
}), }),
], ],
server: { server: {
host: true,
proxy: { proxy: {
'/api': { '/api': {
target: 'http://localhost:8080', target: 'http://localhost:8080',