editor revamp, complete change

Anthony Stirling 2025-11-02 21:00:03 +00:00
parent ec0ae36a82
commit bbcb23ca11
25 changed files with 3747 additions and 1021 deletions

View File

@@ -148,17 +148,31 @@ public class JobExecutorService {
taskManager.createTask(jobId);
// Create a specialized wrapper that updates the TaskManager
final String capturedJobIdForQueue = jobId;
Supplier<Object> wrappedWork =
() -> {
try {
// Set jobId in ThreadLocal context for the queued job
stirling.software.common.util.JobContext.setJobId(
capturedJobIdForQueue);
log.debug(
"Set jobId {} in JobContext for queued job execution",
capturedJobIdForQueue);
Object result = work.get();
processJobResult(jobId, result);
processJobResult(capturedJobIdForQueue, result);
return result;
} catch (Exception e) {
log.error(
"Error executing queued job {}: {}", jobId, e.getMessage(), e);
taskManager.setError(jobId, e.getMessage());
"Error executing queued job {}: {}",
capturedJobIdForQueue,
e.getMessage(),
e);
taskManager.setError(capturedJobIdForQueue, e.getMessage());
throw e;
} finally {
// Clean up ThreadLocal to avoid memory leaks
stirling.software.common.util.JobContext.clear();
}
};
@@ -170,21 +184,36 @@ public class JobExecutorService {
return ResponseEntity.ok().body(new JobResponse<>(true, jobId, null));
} else if (async) {
taskManager.createTask(jobId);
// Capture the jobId for the async thread
final String capturedJobId = jobId;
executor.execute(
() -> {
try {
log.debug(
"Running async job {} with timeout {} ms", jobId, timeoutToUse);
"Running async job {} with timeout {} ms",
capturedJobId,
timeoutToUse);
// Set jobId in ThreadLocal context for the async thread
stirling.software.common.util.JobContext.setJobId(capturedJobId);
log.debug(
"Set jobId {} in JobContext for async execution",
capturedJobId);
// Execute with timeout
Object result = executeWithTimeout(() -> work.get(), timeoutToUse);
processJobResult(jobId, result);
processJobResult(capturedJobId, result);
} catch (TimeoutException te) {
log.error("Job {} timed out after {} ms", jobId, timeoutToUse);
taskManager.setError(jobId, "Job timed out");
} catch (Exception e) {
log.error("Error executing job {}: {}", jobId, e.getMessage(), e);
taskManager.setError(jobId, e.getMessage());
} finally {
// Clean up ThreadLocal to avoid memory leaks
stirling.software.common.util.JobContext.clear();
}
});
@@ -193,6 +222,10 @@ public class JobExecutorService {
try {
log.debug("Running sync job with timeout {} ms", timeoutToUse);
// Make jobId available to downstream components on the worker thread
stirling.software.common.util.JobContext.setJobId(jobId);
log.debug("Set jobId {} in JobContext for sync execution", jobId);
// Execute with timeout
Object result = executeWithTimeout(() -> work.get(), timeoutToUse);
@@ -212,6 +245,8 @@ public class JobExecutorService {
// Construct a JSON error response
return ResponseEntity.internalServerError()
.body(Map.of("error", "Job failed: " + e.getMessage()));
} finally {
stirling.software.common.util.JobContext.clear();
}
}
}
@@ -456,8 +491,23 @@ public class JobExecutorService {
throws TimeoutException, Exception {
// Use the same executor as other async jobs for consistency
// This ensures all operations run on the same thread pool
String currentJobId = stirling.software.common.util.JobContext.getJobId();
java.util.concurrent.CompletableFuture<T> future =
java.util.concurrent.CompletableFuture.supplyAsync(supplier, executor);
java.util.concurrent.CompletableFuture.supplyAsync(
() -> {
if (currentJobId != null) {
stirling.software.common.util.JobContext.setJobId(currentJobId);
}
try {
return supplier.get();
} finally {
if (currentJobId != null) {
stirling.software.common.util.JobContext.clear();
}
}
},
executor);
try {
return future.get(timeoutMs, TimeUnit.MILLISECONDS);

View File

@@ -0,0 +1,18 @@
package stirling.software.common.util;
/** Thread-local context for passing job ID across async boundaries */
public class JobContext {
private static final ThreadLocal<String> CURRENT_JOB_ID = new ThreadLocal<>();
public static void setJobId(String jobId) {
CURRENT_JOB_ID.set(jobId);
}
public static String getJobId() {
return CURRENT_JOB_ID.get();
}
public static void clear() {
CURRENT_JOB_ID.remove();
}
}
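
A minimal usage sketch of the pattern the executor changes above rely on (the job id value is hypothetical): set the id at the start of the pooled task, read it downstream, and always remove it in a finally block, because executor threads are reused and a stale ThreadLocal would leak the previous job's id.

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

public class JobContextDemo {
    public static void main(String[] args) throws Exception {
        ExecutorService executor = Executors.newSingleThreadExecutor();
        String jobId = "job-123"; // hypothetical id; normally assigned by JobExecutorService
        executor
                .submit(
                        () -> {
                            // Propagate the id onto the worker thread
                            stirling.software.common.util.JobContext.setJobId(jobId);
                            try {
                                // Downstream code reads it without an explicit parameter
                                return stirling.software.common.util.JobContext.getJobId();
                            } finally {
                                // Clear so the reused pool thread cannot leak the id
                                stirling.software.common.util.JobContext.clear();
                            }
                        })
                .get();
        executor.shutdown();
    }
}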

View File

@@ -94,6 +94,7 @@ public class ProcessExecutor {
.getProcessExecutor()
.getSessionLimit()
.getOcrMyPdfSessionLimit();
case CFF_CONVERTER -> 1;
};
long timeoutMinutes =
@@ -148,6 +149,7 @@ public class ProcessExecutor {
.getProcessExecutor()
.getTimeoutMinutes()
.getOcrMyPdfTimeoutMinutes();
case CFF_CONVERTER -> 5L;
};
return new ProcessExecutor(semaphoreLimit, liveUpdates, timeoutMinutes);
});
@@ -300,7 +302,8 @@ public class ProcessExecutor {
TESSERACT,
QPDF,
GHOSTSCRIPT,
OCR_MY_PDF
OCR_MY_PDF,
CFF_CONVERTER
}
public class ProcessExecutorResult {

View File

@@ -78,6 +78,23 @@ class JobExecutorServiceTest {
verify(request).setAttribute(eq("jobId"), anyString());
}
@Test
void shouldExposeJobIdInJobContextDuringSyncExecution() throws Exception {
// Given
Supplier<Object> work = stirling.software.common.util.JobContext::getJobId;
// When
ResponseEntity<?> response = jobExecutorService.runJobGeneric(false, work);
// Then
assertEquals(HttpStatus.OK, response.getStatusCode());
assertNotNull(response.getBody());
var requestJobIdCaptor = ArgumentCaptor.forClass(String.class);
verify(request).setAttribute(eq("jobId"), requestJobIdCaptor.capture());
assertEquals(requestJobIdCaptor.getValue(), response.getBody());
}
@Test
void shouldRunAsyncJobSuccessfully() throws Exception {
// Given

View File

@@ -8,6 +8,8 @@ logging.level.org.eclipse.jetty=WARN
#logging.level.stirling.software.proprietary.security=DEBUG
logging.level.com.zaxxer.hikari=WARN
logging.level.stirling.software.SPDF.service.PdfJsonConversionService=TRACE
logging.level.stirling.software.common.service.JobExecutorService=DEBUG
logging.level.stirling.software.common.service.TaskManager=DEBUG
spring.jpa.open-in-view=false
server.forward-headers-strategy=NATIVE
server.error.path=/error

View File

@@ -1,16 +1,26 @@
package stirling.software.SPDF.controller.api.converters;
import java.util.Optional;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.ModelAttribute;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.multipart.MultipartFile;
import io.github.pixee.security.Filenames;
import io.swagger.v3.oas.annotations.Operation;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.config.swagger.StandardPdfResponse;
import stirling.software.SPDF.model.json.PdfJsonDocument;
import stirling.software.SPDF.model.json.PdfJsonMetadata;
import stirling.software.SPDF.service.PdfJsonConversionService;
import stirling.software.common.annotations.AutoJobPostMapping;
import stirling.software.common.annotations.api.ConvertApi;
@@ -19,6 +29,7 @@ import stirling.software.common.model.api.PDFFile;
import stirling.software.common.util.ExceptionUtils;
import stirling.software.common.util.WebResponseUtils;
@Slf4j
@ConvertApi
@RequiredArgsConstructor
public class ConvertPdfJsonController {
@@ -71,4 +82,81 @@ public class ConvertPdfJsonController {
String docName = baseName.endsWith(".pdf") ? baseName : baseName + ".pdf";
return WebResponseUtils.bytesToWebResponse(pdfBytes, docName);
}
@PostMapping(consumes = "multipart/form-data", value = "/pdf/json/metadata")
@Operation(
summary = "Extract PDF metadata for lazy loading",
description =
"Extracts document metadata, fonts, and page dimensions. Caches the document for"
+ " subsequent page requests. Input:PDF Output:JSON Type:SISO")
public ResponseEntity<byte[]> extractPdfMetadata(
@ModelAttribute PDFFile request, @RequestParam(required = true) String jobId)
throws Exception {
MultipartFile inputFile = request.getFileInput();
if (inputFile == null) {
throw ExceptionUtils.createNullArgumentException("fileInput");
}
byte[] jsonBytes = pdfJsonConversionService.extractDocumentMetadata(inputFile, jobId);
String originalName = inputFile.getOriginalFilename();
String baseName =
(originalName != null && !originalName.isBlank())
? Filenames.toSimpleFileName(originalName).replaceFirst("[.][^.]+$", "")
: "document";
String docName = baseName + "_metadata.json";
return WebResponseUtils.bytesToWebResponse(jsonBytes, docName, MediaType.APPLICATION_JSON);
}
@PostMapping(value = "/pdf/json/partial/{jobId}", consumes = MediaType.APPLICATION_JSON_VALUE)
@StandardPdfResponse
@Operation(
summary = "Apply incremental edits to a cached PDF",
description =
"Applies edits for the specified pages of a cached PDF and returns an updated PDF."
+ " Requires the PDF to have been previously cached via the PDF to JSON endpoint.")
public ResponseEntity<byte[]> exportPartialPdf(
@PathVariable String jobId,
@RequestBody PdfJsonDocument document,
@RequestParam(value = "filename", required = false) String filename)
throws Exception {
if (document == null) {
throw ExceptionUtils.createNullArgumentException("document");
}
byte[] pdfBytes = pdfJsonConversionService.exportUpdatedPages(jobId, document);
String baseName =
(filename != null && !filename.isBlank())
? Filenames.toSimpleFileName(filename).replaceFirst("[.][^.]+$", "")
: Optional.ofNullable(document.getMetadata())
.map(PdfJsonMetadata::getTitle)
.filter(title -> title != null && !title.isBlank())
.orElse("document");
String docName = baseName.endsWith(".pdf") ? baseName : baseName + ".pdf";
return WebResponseUtils.bytesToWebResponse(pdfBytes, docName);
}
@GetMapping(value = "/pdf/json/page/{jobId}/{pageNumber}")
@Operation(
summary = "Extract single page from cached PDF",
description =
"Retrieves a single page's content from a previously cached PDF document."
+ " Requires prior call to /pdf/json/metadata. Output:JSON")
public ResponseEntity<byte[]> extractSinglePage(
@PathVariable String jobId, @PathVariable int pageNumber) throws Exception {
byte[] jsonBytes = pdfJsonConversionService.extractSinglePage(jobId, pageNumber);
String docName = "page_" + pageNumber + ".json";
return WebResponseUtils.bytesToWebResponse(jsonBytes, docName, MediaType.APPLICATION_JSON);
}
@PostMapping(value = "/pdf/json/clear-cache/{jobId}")
@Operation(
summary = "Clear cached PDF document",
description =
"Manually clears a cached PDF document to free up server resources."
+ " Called automatically after 30 minutes.")
public ResponseEntity<Void> clearCache(@PathVariable String jobId) {
pdfJsonConversionService.clearCachedDocument(jobId);
return ResponseEntity.ok().build();
}
}
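
Taken together, these endpoints form the lazy-loading round trip: upload once to /pdf/json/metadata, pull pages on demand, then clear the cache. A client sketch using java.net.http; the /api/v1/convert prefix and localhost base URL are assumptions (the concrete route comes from @ConvertApi), the jobId value is hypothetical, and the multipart upload is elided.

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class LazyPdfJsonClient {
    // Assumed base path; the real prefix is supplied by @ConvertApi
    private static final String BASE = "http://localhost:8080/api/v1/convert";

    public static void main(String[] args) throws Exception {
        HttpClient client = HttpClient.newHttpClient();
        String jobId = "demo-job"; // hypothetical; in practice issued by the job framework

        // Step 1 (elided): POST the PDF as multipart/form-data to BASE + "/pdf/json/metadata"

        // Step 2: fetch page 1 on demand
        HttpRequest pageRequest =
                HttpRequest.newBuilder(URI.create(BASE + "/pdf/json/page/" + jobId + "/1"))
                        .GET()
                        .build();
        HttpResponse<String> page =
                client.send(pageRequest, HttpResponse.BodyHandlers.ofString());
        System.out.println(page.body());

        // Step 3: free the server-side cache (it also expires automatically after 30 minutes)
        HttpRequest clearRequest =
                HttpRequest.newBuilder(URI.create(BASE + "/pdf/json/clear-cache/" + jobId))
                        .POST(HttpRequest.BodyPublishers.noBody())
                        .build();
        client.send(clearRequest, HttpResponse.BodyHandlers.discarding());
    }
}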

View File

@@ -0,0 +1,49 @@
package stirling.software.SPDF.model.api;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class PdfJsonConversionProgress {
private int percent;
private String stage;
private String message;
private boolean complete;
private Integer current; // Current item being processed (e.g., page number)
private Integer total; // Total items to process (e.g., total pages)
public static PdfJsonConversionProgress of(int percent, String stage, String message) {
return PdfJsonConversionProgress.builder()
.percent(percent)
.stage(stage)
.message(message)
.complete(false)
.build();
}
public static PdfJsonConversionProgress of(
int percent, String stage, String message, int current, int total) {
return PdfJsonConversionProgress.builder()
.percent(percent)
.stage(stage)
.message(message)
.current(current)
.total(total)
.complete(false)
.build();
}
public static PdfJsonConversionProgress complete() {
return PdfJsonConversionProgress.builder()
.percent(100)
.stage("complete")
.message("Conversion complete")
.complete(true)
.build();
}
}
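
A short sketch of the two factory shapes with illustrative values; consumers can key off the complete flag rather than percent for the terminal update.

import stirling.software.SPDF.model.api.PdfJsonConversionProgress;

public class ProgressDemo {
    public static void main(String[] args) {
        // Per-item update: 75% overall, page 3 of 10 in the "images" stage
        PdfJsonConversionProgress step =
                PdfJsonConversionProgress.of(75, "images", "Extracting images", 3, 10);
        System.out.println(step.getPercent() + "% " + step.getStage());

        // Terminal update: complete=true regardless of how percent was reported
        PdfJsonConversionProgress done = PdfJsonConversionProgress.complete();
        System.out.println(done.isComplete());
    }
}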

View File

@@ -22,6 +22,9 @@ public class PdfJsonDocument {
/** Optional XMP metadata packet stored as Base64. */
private String xmpMetadata;
/** Indicates that images should be loaded lazily via API rather than embedded in the JSON. */
private Boolean lazyImages;
@Builder.Default private List<PdfJsonFont> fonts = new ArrayList<>();
@Builder.Default private List<PdfJsonPage> pages = new ArrayList<>();

View File

@@ -0,0 +1,34 @@
package stirling.software.SPDF.model.json;
import java.util.ArrayList;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonInclude;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@JsonInclude(JsonInclude.Include.NON_NULL)
public class PdfJsonDocumentMetadata {
private PdfJsonMetadata metadata;
/** Optional XMP metadata packet stored as Base64. */
private String xmpMetadata;
/** Indicates that images should be requested lazily via the page endpoint. */
private Boolean lazyImages;
@Builder.Default private List<PdfJsonFont> fonts = new ArrayList<>();
@Builder.Default private List<PdfJsonPageDimension> pageDimensions = new ArrayList<>();
/** Form fields (AcroForm) at document level */
@Builder.Default private List<PdfJsonFormField> formFields = new ArrayList<>();
}

View File

@@ -0,0 +1,20 @@
package stirling.software.SPDF.model.json;
import com.fasterxml.jackson.annotation.JsonInclude;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@JsonInclude(JsonInclude.Include.NON_NULL)
public class PdfJsonPageDimension {
private Integer pageNumber;
private Float width;
private Float height;
private Integer rotation;
}

View File

@@ -0,0 +1,274 @@
package stirling.software.SPDF.service;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Base64;
import java.util.Collections;
import java.util.IdentityHashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSBoolean;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSFloat;
import org.apache.pdfbox.cos.COSInteger;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSNull;
import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.springframework.stereotype.Component;
import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.json.PdfJsonCosValue;
import stirling.software.SPDF.model.json.PdfJsonStream;
@Slf4j
@Component
public class PdfJsonCosMapper {
public PdfJsonStream serializeStream(PDStream stream) throws IOException {
if (stream == null) {
return null;
}
return serializeStream(
stream.getCOSObject(), Collections.newSetFromMap(new IdentityHashMap<>()));
}
public PdfJsonStream serializeStream(COSStream cosStream) throws IOException {
if (cosStream == null) {
return null;
}
return serializeStream(cosStream, Collections.newSetFromMap(new IdentityHashMap<>()));
}
public PdfJsonCosValue serializeCosValue(COSBase base) throws IOException {
return serializeCosValue(base, Collections.newSetFromMap(new IdentityHashMap<>()));
}
public COSBase deserializeCosValue(PdfJsonCosValue value, PDDocument document)
throws IOException {
if (value == null || value.getType() == null) {
return null;
}
switch (value.getType()) {
case NULL:
return COSNull.NULL;
case BOOLEAN:
if (value.getValue() instanceof Boolean bool) {
return COSBoolean.getBoolean(bool);
}
return null;
case INTEGER:
if (value.getValue() instanceof Number number) {
return COSInteger.get(number.longValue());
}
return null;
case FLOAT:
if (value.getValue() instanceof Number number) {
return new COSFloat(number.floatValue());
}
return null;
case NAME:
if (value.getValue() instanceof String name) {
return COSName.getPDFName(name);
}
return null;
case STRING:
if (value.getValue() instanceof String encoded) {
try {
byte[] bytes = Base64.getDecoder().decode(encoded);
return new COSString(bytes);
} catch (IllegalArgumentException ex) {
log.debug("Failed to decode COSString value: {}", ex.getMessage());
}
}
return null;
case ARRAY:
COSArray array = new COSArray();
if (value.getItems() != null) {
for (PdfJsonCosValue item : value.getItems()) {
COSBase entry = deserializeCosValue(item, document);
if (entry != null) {
array.add(entry);
} else {
array.add(COSNull.NULL);
}
}
}
return array;
case DICTIONARY:
COSDictionary dictionary = new COSDictionary();
if (value.getEntries() != null) {
for (Map.Entry<String, PdfJsonCosValue> entry : value.getEntries().entrySet()) {
COSName key = COSName.getPDFName(entry.getKey());
COSBase entryValue = deserializeCosValue(entry.getValue(), document);
if (entryValue != null) {
dictionary.setItem(key, entryValue);
}
}
}
return dictionary;
case STREAM:
if (value.getStream() != null) {
return buildStreamFromModel(value.getStream(), document);
}
return null;
default:
return null;
}
}
public COSStream buildStreamFromModel(PdfJsonStream streamModel, PDDocument document)
throws IOException {
if (streamModel == null) {
return null;
}
COSStream cosStream = document.getDocument().createCOSStream();
if (streamModel.getDictionary() != null) {
for (Map.Entry<String, PdfJsonCosValue> entry :
streamModel.getDictionary().entrySet()) {
COSName key = COSName.getPDFName(entry.getKey());
COSBase value = deserializeCosValue(entry.getValue(), document);
if (value != null) {
cosStream.setItem(key, value);
}
}
}
String rawData = streamModel.getRawData();
if (rawData != null && !rawData.isBlank()) {
byte[] data;
try {
data = Base64.getDecoder().decode(rawData);
} catch (IllegalArgumentException ex) {
log.debug("Invalid base64 content stream data: {}", ex.getMessage());
data = new byte[0];
}
try (OutputStream outputStream = cosStream.createRawOutputStream()) {
outputStream.write(data);
}
cosStream.setItem(COSName.LENGTH, COSInteger.get(data.length));
} else {
cosStream.setItem(COSName.LENGTH, COSInteger.get(0));
}
return cosStream;
}
private PdfJsonCosValue serializeCosValue(COSBase base, Set<COSBase> visited)
throws IOException {
if (base == null) {
return null;
}
if (base instanceof COSObject cosObject) {
base = cosObject.getObject();
if (base == null) {
return null;
}
}
boolean complex =
base instanceof COSDictionary
|| base instanceof COSArray
|| base instanceof COSStream;
if (complex) {
if (!visited.add(base)) {
return PdfJsonCosValue.builder()
.type(PdfJsonCosValue.Type.NAME)
.value("__circular__")
.build();
}
}
try {
PdfJsonCosValue.PdfJsonCosValueBuilder builder = PdfJsonCosValue.builder();
if (base instanceof COSNull) {
builder.type(PdfJsonCosValue.Type.NULL);
return builder.build();
}
if (base instanceof COSBoolean booleanValue) {
builder.type(PdfJsonCosValue.Type.BOOLEAN).value(booleanValue.getValue());
return builder.build();
}
if (base instanceof COSInteger integer) {
builder.type(PdfJsonCosValue.Type.INTEGER).value(integer.longValue());
return builder.build();
}
if (base instanceof COSFloat floatValue) {
builder.type(PdfJsonCosValue.Type.FLOAT).value(floatValue.floatValue());
return builder.build();
}
if (base instanceof COSName name) {
builder.type(PdfJsonCosValue.Type.NAME).value(name.getName());
return builder.build();
}
if (base instanceof COSString cosString) {
builder.type(PdfJsonCosValue.Type.STRING)
.value(Base64.getEncoder().encodeToString(cosString.getBytes()));
return builder.build();
}
if (base instanceof COSArray array) {
List<PdfJsonCosValue> items = new ArrayList<>(array.size());
for (COSBase item : array) {
PdfJsonCosValue serialized = serializeCosValue(item, visited);
items.add(serialized);
}
builder.type(PdfJsonCosValue.Type.ARRAY).items(items);
return builder.build();
}
if (base instanceof COSStream stream) {
builder.type(PdfJsonCosValue.Type.STREAM).stream(serializeStream(stream, visited));
return builder.build();
}
if (base instanceof COSDictionary dictionary) {
Map<String, PdfJsonCosValue> entries = new LinkedHashMap<>();
for (COSName key : dictionary.keySet()) {
PdfJsonCosValue serialized =
serializeCosValue(dictionary.getDictionaryObject(key), visited);
entries.put(key.getName(), serialized);
}
builder.type(PdfJsonCosValue.Type.DICTIONARY).entries(entries);
return builder.build();
}
return null;
} finally {
if (complex) {
visited.remove(base);
}
}
}
private PdfJsonStream serializeStream(COSStream cosStream, Set<COSBase> visited)
throws IOException {
Map<String, PdfJsonCosValue> dictionary = new LinkedHashMap<>();
for (COSName key : cosStream.keySet()) {
COSBase value = cosStream.getDictionaryObject(key);
PdfJsonCosValue serialized = serializeCosValue(value, visited);
if (serialized != null) {
dictionary.put(key.getName(), serialized);
}
}
String rawData = null;
try (InputStream inputStream = cosStream.createRawInputStream();
ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
if (inputStream != null) {
inputStream.transferTo(baos);
}
byte[] data = baos.toByteArray();
if (data.length > 0) {
rawData = Base64.getEncoder().encodeToString(data);
}
}
return PdfJsonStream.builder().dictionary(dictionary).rawData(rawData).build();
}
}
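
A round-trip sketch for the mapper, assuming PDFBox is on the classpath: a dictionary is serialized to the JSON-friendly model (strings become Base64, revisited nodes become a "__circular__" name) and deserialized back against a live document, which is needed so streams can be recreated.

import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSInteger;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;

import stirling.software.SPDF.model.json.PdfJsonCosValue;
import stirling.software.SPDF.service.PdfJsonCosMapper;

public class CosRoundTripDemo {
    public static void main(String[] args) throws Exception {
        PdfJsonCosMapper mapper = new PdfJsonCosMapper();

        COSDictionary dict = new COSDictionary();
        dict.setItem(COSName.TYPE, COSName.getPDFName("Example"));
        dict.setItem(COSName.COUNT, COSInteger.get(3));

        // COS -> serializable model
        PdfJsonCosValue value = mapper.serializeCosValue(dict);

        // Model -> COS; the document provides the stream factory
        try (PDDocument document = new PDDocument()) {
            COSDictionary restored =
                    (COSDictionary) mapper.deserializeCosValue(value, document);
            System.out.println(restored.getInt(COSName.COUNT)); // 3
        }
    }
}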

View File

@@ -0,0 +1,224 @@
package stirling.software.SPDF.service;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType0Font;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.core.io.Resource;
import org.springframework.core.io.ResourceLoader;
import org.springframework.stereotype.Component;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.json.PdfJsonFont;
@Slf4j
@Component
@RequiredArgsConstructor
public class PdfJsonFallbackFontService {
public static final String FALLBACK_FONT_ID = "fallback-noto-sans";
public static final String DEFAULT_FALLBACK_FONT_LOCATION =
"classpath:/static/fonts/NotoSans-Regular.ttf";
public static final String FALLBACK_FONT_CJK_ID = "fallback-noto-cjk";
public static final String FALLBACK_FONT_JP_ID = "fallback-noto-jp";
public static final String FALLBACK_FONT_KR_ID = "fallback-noto-korean";
public static final String FALLBACK_FONT_AR_ID = "fallback-noto-arabic";
public static final String FALLBACK_FONT_TH_ID = "fallback-noto-thai";
private static final Map<String, FallbackFontSpec> BUILT_IN_FALLBACK_FONTS =
Map.ofEntries(
Map.entry(
FALLBACK_FONT_CJK_ID,
new FallbackFontSpec(
"classpath:/static/fonts/NotoSansSC-Regular.ttf",
"NotoSansSC-Regular",
"ttf")),
Map.entry(
FALLBACK_FONT_JP_ID,
new FallbackFontSpec(
"classpath:/static/fonts/NotoSansJP-Regular.ttf",
"NotoSansJP-Regular",
"ttf")),
Map.entry(
FALLBACK_FONT_KR_ID,
new FallbackFontSpec(
"classpath:/static/fonts/malgun.ttf", "MalgunGothic", "ttf")),
Map.entry(
FALLBACK_FONT_AR_ID,
new FallbackFontSpec(
"classpath:/static/fonts/NotoSansArabic-Regular.ttf",
"NotoSansArabic-Regular",
"ttf")),
Map.entry(
FALLBACK_FONT_TH_ID,
new FallbackFontSpec(
"classpath:/static/fonts/NotoSansThai-Regular.ttf",
"NotoSansThai-Regular",
"ttf")));
private final ResourceLoader resourceLoader;
@Value("${stirling.pdf.fallback-font:" + DEFAULT_FALLBACK_FONT_LOCATION + "}")
private String fallbackFontLocation;
private final Map<String, byte[]> fallbackFontCache = new ConcurrentHashMap<>();
public PdfJsonFont buildFallbackFontModel() throws IOException {
return buildFallbackFontModel(FALLBACK_FONT_ID);
}
public PdfJsonFont buildFallbackFontModel(String fallbackId) throws IOException {
FallbackFontSpec spec = getFallbackFontSpec(fallbackId);
if (spec == null) {
throw new IOException("Unknown fallback font id " + fallbackId);
}
byte[] bytes = loadFallbackFontBytes(fallbackId, spec);
String base64 = java.util.Base64.getEncoder().encodeToString(bytes);
return PdfJsonFont.builder()
.id(fallbackId)
.uid(fallbackId)
.baseName(spec.baseName())
.subtype("TrueType")
.embedded(true)
.program(base64)
.programFormat(spec.format())
.build();
}
public PDFont loadFallbackPdfFont(PDDocument document) throws IOException {
return loadFallbackPdfFont(document, FALLBACK_FONT_ID);
}
public PDFont loadFallbackPdfFont(PDDocument document, String fallbackId) throws IOException {
FallbackFontSpec spec = getFallbackFontSpec(fallbackId);
if (spec == null) {
throw new IOException("Unknown fallback font id " + fallbackId);
}
byte[] bytes = loadFallbackFontBytes(fallbackId, spec);
try (InputStream stream = new ByteArrayInputStream(bytes)) {
return PDType0Font.load(document, stream, true);
}
}
public boolean canEncodeFully(PDFont font, String text) {
return canEncode(font, text);
}
public boolean canEncode(PDFont font, int codePoint) {
return canEncode(font, new String(Character.toChars(codePoint)));
}
public boolean canEncode(PDFont font, String text) {
if (font == null || text == null || text.isEmpty()) {
return false;
}
try {
font.encode(text);
return true;
} catch (IOException | IllegalArgumentException ex) {
return false;
}
}
public String resolveFallbackFontId(int codePoint) {
Character.UnicodeBlock block = Character.UnicodeBlock.of(codePoint);
if (block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
|| block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
|| block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
|| block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C
|| block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D
|| block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E
|| block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F
|| block == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
|| block == Character.UnicodeBlock.BOPOMOFO
|| block == Character.UnicodeBlock.BOPOMOFO_EXTENDED
|| block == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
return FALLBACK_FONT_CJK_ID;
}
Character.UnicodeScript script = Character.UnicodeScript.of(codePoint);
return switch (script) {
case HAN -> FALLBACK_FONT_CJK_ID;
case HIRAGANA, KATAKANA -> FALLBACK_FONT_JP_ID;
case HANGUL -> FALLBACK_FONT_KR_ID;
case ARABIC -> FALLBACK_FONT_AR_ID;
case THAI -> FALLBACK_FONT_TH_ID;
default -> FALLBACK_FONT_ID;
};
}
public String mapUnsupportedGlyph(int codePoint) {
return switch (codePoint) {
case 0x276E -> "<";
case 0x276F -> ">";
default -> null;
};
}
private FallbackFontSpec getFallbackFontSpec(String fallbackId) {
if (FALLBACK_FONT_ID.equals(fallbackId)) {
String baseName = inferBaseName(fallbackFontLocation, "NotoSans-Regular");
String format = inferFormat(fallbackFontLocation, "ttf");
return new FallbackFontSpec(fallbackFontLocation, baseName, format);
}
return BUILT_IN_FALLBACK_FONTS.get(fallbackId);
}
private byte[] loadFallbackFontBytes(String fallbackId, FallbackFontSpec spec)
throws IOException {
if (spec == null) {
throw new IOException("No fallback font specification for " + fallbackId);
}
byte[] cached = fallbackFontCache.get(fallbackId);
if (cached != null) {
return cached;
}
Resource resource = resourceLoader.getResource(spec.resourceLocation());
if (!resource.exists()) {
throw new IOException("Fallback font resource not found at " + spec.resourceLocation());
}
try (InputStream inputStream = resource.getInputStream();
ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
inputStream.transferTo(baos);
byte[] bytes = baos.toByteArray();
fallbackFontCache.put(fallbackId, bytes);
return bytes;
}
}
private String inferBaseName(String location, String defaultName) {
if (location == null || location.isBlank()) {
return defaultName;
}
int slash = location.lastIndexOf('/');
String fileName = slash >= 0 ? location.substring(slash + 1) : location;
int dot = fileName.lastIndexOf('.');
if (dot > 0) {
fileName = fileName.substring(0, dot);
}
return fileName.isEmpty() ? defaultName : fileName;
}
private String inferFormat(String location, String defaultFormat) {
if (location == null || location.isBlank()) {
return defaultFormat;
}
int dot = location.lastIndexOf('.');
if (dot >= 0 && dot < location.length() - 1) {
return location.substring(dot + 1).toLowerCase(Locale.ROOT);
}
return defaultFormat;
}
private record FallbackFontSpec(String resourceLocation, String baseName, String format) {}
}
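
resolveFallbackFontId only inspects Unicode block and script data, so the routing can be shown without loading any font bytes. The service is constructed directly here purely for illustration; in the application Spring injects the ResourceLoader.

import org.springframework.core.io.DefaultResourceLoader;

import stirling.software.SPDF.service.PdfJsonFallbackFontService;

public class FallbackRoutingDemo {
    public static void main(String[] args) {
        PdfJsonFallbackFontService fonts =
                new PdfJsonFallbackFontService(new DefaultResourceLoader());

        System.out.println(fonts.resolveFallbackFontId("漢".codePointAt(0))); // fallback-noto-cjk
        System.out.println(fonts.resolveFallbackFontId("あ".codePointAt(0))); // fallback-noto-jp
        System.out.println(fonts.resolveFallbackFontId("한".codePointAt(0))); // fallback-noto-korean
        System.out.println(fonts.resolveFallbackFontId("A".codePointAt(0))); // fallback-noto-sans
    }
}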

View File

@@ -0,0 +1,349 @@
package stirling.software.SPDF.service.pdfjson;
import java.io.IOException;
import java.nio.file.Files;
import java.util.Base64;
import java.util.Locale;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import jakarta.annotation.PostConstruct;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import stirling.software.common.util.ProcessExecutor;
import stirling.software.common.util.ProcessExecutor.ProcessExecutorResult;
import stirling.software.common.util.TempFile;
import stirling.software.common.util.TempFileManager;
@Slf4j
@Service
@RequiredArgsConstructor
public class PdfJsonFontService {
private final TempFileManager tempFileManager;
@Getter
@Value("${stirling.pdf.json.cff-converter.enabled:true}")
private boolean cffConversionEnabled;
@Getter
@Value("${stirling.pdf.json.cff-converter.method:python}")
private String cffConverterMethod;
@Value("${stirling.pdf.json.cff-converter.python-command:/opt/venv/bin/python3}")
private String pythonCommand;
@Value("${stirling.pdf.json.cff-converter.python-script:/scripts/convert_cff_to_ttf.py}")
private String pythonScript;
@Value("${stirling.pdf.json.cff-converter.fontforge-command:fontforge}")
private String fontforgeCommand;
private volatile boolean pythonCffConverterAvailable;
private volatile boolean fontForgeCffConverterAvailable;
@PostConstruct
private void initialiseCffConverterAvailability() {
if (!cffConversionEnabled) {
log.warn("[FONT-DEBUG] CFF conversion is DISABLED in configuration");
pythonCffConverterAvailable = false;
fontForgeCffConverterAvailable = false;
return;
}
log.info("[FONT-DEBUG] CFF conversion enabled, checking tool availability...");
pythonCffConverterAvailable = isCommandAvailable(pythonCommand);
if (!pythonCffConverterAvailable) {
log.warn(
"[FONT-DEBUG] Python command '{}' not found; Python CFF conversion disabled",
pythonCommand);
} else {
log.info("[FONT-DEBUG] Python command '{}' is available", pythonCommand);
}
fontForgeCffConverterAvailable = isCommandAvailable(fontforgeCommand);
if (!fontForgeCffConverterAvailable) {
log.warn(
"[FONT-DEBUG] FontForge command '{}' not found; FontForge CFF conversion disabled",
fontforgeCommand);
} else {
log.info("[FONT-DEBUG] FontForge command '{}' is available", fontforgeCommand);
}
log.info("[FONT-DEBUG] Selected CFF converter method: {}", cffConverterMethod);
}
public byte[] convertCffProgramToTrueType(byte[] fontBytes, String toUnicode) {
if (!cffConversionEnabled || fontBytes == null || fontBytes.length == 0) {
log.warn(
"[FONT-DEBUG] CFF conversion skipped: enabled={}, bytes={}",
cffConversionEnabled,
fontBytes == null ? "null" : fontBytes.length);
return null;
}
log.info(
"[FONT-DEBUG] Converting CFF font: {} bytes, method: {}",
fontBytes.length,
cffConverterMethod);
if ("python".equalsIgnoreCase(cffConverterMethod)) {
if (!pythonCffConverterAvailable) {
log.warn("[FONT-DEBUG] Python CFF converter not available, skipping conversion");
return null;
}
byte[] result = convertCffUsingPython(fontBytes, toUnicode);
log.info(
"[FONT-DEBUG] Python conversion result: {}",
result == null ? "null" : result.length + " bytes");
return result;
} else if ("fontforge".equalsIgnoreCase(cffConverterMethod)) {
if (!fontForgeCffConverterAvailable) {
log.warn("[FONT-DEBUG] FontForge CFF converter not available, skipping conversion");
return null;
}
byte[] result = convertCffUsingFontForge(fontBytes);
log.info(
"[FONT-DEBUG] FontForge conversion result: {}",
result == null ? "null" : result.length + " bytes");
return result;
} else {
log.warn(
"[FONT-DEBUG] Unknown CFF converter method: {}, falling back to Python",
cffConverterMethod);
if (!pythonCffConverterAvailable) {
log.warn("[FONT-DEBUG] Python CFF converter not available, skipping conversion");
return null;
}
byte[] result = convertCffUsingPython(fontBytes, toUnicode);
log.info(
"[FONT-DEBUG] Python conversion result: {}",
result == null ? "null" : result.length + " bytes");
return result;
}
}
public String detectFontFlavor(byte[] fontBytes) {
if (fontBytes == null || fontBytes.length < 4) {
return null;
}
int signature =
((fontBytes[0] & 0xFF) << 24)
| ((fontBytes[1] & 0xFF) << 16)
| ((fontBytes[2] & 0xFF) << 8)
| (fontBytes[3] & 0xFF);
if (signature == 0x00010000 || signature == 0x74727565) {
return "ttf";
}
if (signature == 0x4F54544F) {
return "otf";
}
if (signature == 0x74746366) {
return "cff";
}
return null;
}
public String detectTrueTypeFormat(byte[] data) {
if (data == null || data.length < 4) {
return null;
}
int signature =
((data[0] & 0xFF) << 24)
| ((data[1] & 0xFF) << 16)
| ((data[2] & 0xFF) << 8)
| (data[3] & 0xFF);
if (signature == 0x00010000) {
return "ttf";
}
if (signature == 0x4F54544F) {
return "otf";
}
if (signature == 0x74746366) {
return "cff";
}
return null;
}
public String validateFontTables(byte[] fontBytes) {
if (fontBytes == null || fontBytes.length < 12) {
return "Font program too small";
}
int numTables = ((fontBytes[4] & 0xFF) << 8) | (fontBytes[5] & 0xFF);
if (numTables <= 0 || numTables > 512) {
return "Invalid numTables: " + numTables;
}
return null;
}
private byte[] convertCffUsingPython(byte[] fontBytes, String toUnicode) {
if (!pythonCffConverterAvailable) {
log.warn("[FONT-DEBUG] Python CFF converter not available");
return null;
}
if (pythonCommand == null
|| pythonCommand.isBlank()
|| pythonScript == null
|| pythonScript.isBlank()) {
log.warn("[FONT-DEBUG] Python converter not configured");
return null;
}
log.info(
"[FONT-DEBUG] Running Python CFF converter: command={}, script={}",
pythonCommand,
pythonScript);
try (TempFile inputFile = new TempFile(tempFileManager, ".cff");
TempFile outputFile = new TempFile(tempFileManager, ".otf");
TempFile toUnicodeFile =
toUnicode != null ? new TempFile(tempFileManager, ".tounicode") : null) {
Files.write(inputFile.getPath(), fontBytes);
if (toUnicodeFile != null) {
try {
byte[] toUnicodeBytes = Base64.getDecoder().decode(toUnicode);
Files.write(toUnicodeFile.getPath(), toUnicodeBytes);
} catch (IllegalArgumentException ex) {
log.warn(
"[FONT-DEBUG] Failed to decode ToUnicode data for CFF conversion: {}",
ex.getMessage());
return null;
}
}
String[] command =
buildPythonCommand(
inputFile.getAbsolutePath(),
outputFile.getAbsolutePath(),
toUnicodeFile != null ? toUnicodeFile.getAbsolutePath() : null);
log.info("[FONT-DEBUG] Executing: {}", String.join(" ", command));
ProcessExecutorResult result =
ProcessExecutor.getInstance(ProcessExecutor.Processes.CFF_CONVERTER)
.runCommandWithOutputHandling(java.util.Arrays.asList(command));
if (result.getRc() != 0) {
log.error(
"[FONT-DEBUG] Python CFF conversion failed with exit code: {}",
result.getRc());
log.error("[FONT-DEBUG] Stdout: {}", result.getMessages());
return null;
}
if (!Files.exists(outputFile.getPath())) {
log.error("[FONT-DEBUG] Python CFF conversion produced no output file");
return null;
}
byte[] data = Files.readAllBytes(outputFile.getPath());
if (data.length == 0) {
log.error("[FONT-DEBUG] Python CFF conversion returned empty output");
return null;
}
log.info(
"[FONT-DEBUG] Python CFF conversion succeeded: {} bytes -> {} bytes",
fontBytes.length,
data.length);
return data;
} catch (IOException | InterruptedException ex) {
if (ex instanceof InterruptedException) {
Thread.currentThread().interrupt();
}
log.error("[FONT-DEBUG] Python CFF conversion exception: {}", ex.getMessage(), ex);
return null;
}
}
public byte[] convertCffUsingFontForge(byte[] fontBytes) {
if (!fontForgeCffConverterAvailable) {
log.debug("FontForge CFF converter not available");
return null;
}
try (TempFile inputFile = new TempFile(tempFileManager, ".cff");
TempFile outputFile = new TempFile(tempFileManager, ".ttf")) {
Files.write(inputFile.getPath(), fontBytes);
ProcessExecutorResult result =
ProcessExecutor.getInstance(ProcessExecutor.Processes.CFF_CONVERTER)
.runCommandWithOutputHandling(
java.util.Arrays.asList(
fontforgeCommand,
"-lang=ff",
"-c",
"Open($1); "
+ "ScaleToEm(1000); "
+ "SelectWorthOutputting(); "
+ "SetFontOrder(2); "
+ "Reencode(\"unicode\"); "
+ "RoundToInt(); "
+ "RemoveOverlap(); "
+ "Simplify(); "
+ "CorrectDirection(); "
+ "Generate($2, \"\", 4+16+32); "
+ "Close(); "
+ "Quit()",
inputFile.getAbsolutePath(),
outputFile.getAbsolutePath()));
if (result.getRc() != 0) {
log.warn("FontForge CFF conversion failed: {}", result.getRc());
return null;
}
if (!Files.exists(outputFile.getPath())) {
log.warn("FontForge CFF conversion produced no output");
return null;
}
byte[] data = Files.readAllBytes(outputFile.getPath());
if (data.length == 0) {
log.warn("FontForge CFF conversion returned empty output");
return null;
}
return data;
} catch (IOException | InterruptedException ex) {
if (ex instanceof InterruptedException) {
Thread.currentThread().interrupt();
}
log.warn("FontForge CFF conversion failed: {}", ex.getMessage());
return null;
}
}
private boolean isCommandAvailable(String command) {
if (command == null || command.isBlank()) {
return false;
}
try {
ProcessBuilder processBuilder = new ProcessBuilder();
if (System.getProperty("os.name").toLowerCase(Locale.ROOT).contains("windows")) {
processBuilder.command("where", command);
} else {
processBuilder.command("which", command);
}
Process process = processBuilder.start();
int exitCode = process.waitFor();
return exitCode == 0;
} catch (Exception e) {
log.debug("Error checking for command {}: {}", command, e.getMessage());
return false;
}
}
private String[] buildPythonCommand(String input, String output, String toUnicode) {
if (toUnicode != null) {
return new String[] {
pythonCommand,
pythonScript,
"--input",
input,
"--output",
output,
"--to-unicode",
toUnicode
};
}
return new String[] {pythonCommand, pythonScript, "--input", input, "--output", output};
}
}
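
detectFontFlavor only reads the four-byte container signature, so it can be exercised without temp files or external converters; passing a null TempFileManager is safe for this method and done here only for illustration.

import stirling.software.SPDF.service.pdfjson.PdfJsonFontService;

public class FontFlavorDemo {
    public static void main(String[] args) {
        // Null TempFileManager is fine here: detectFontFlavor never touches temp files
        PdfJsonFontService fonts = new PdfJsonFontService(null);

        byte[] trueType = {0x00, 0x01, 0x00, 0x00}; // signature 0x00010000
        byte[] openType = {'O', 'T', 'T', 'O'}; // signature 0x4F54544F

        System.out.println(fonts.detectFontFlavor(trueType)); // "ttf"
        System.out.println(fonts.detectFontFlavor(openType)); // "otf"
        System.out.println(fonts.detectFontFlavor(new byte[] {1, 2})); // null: too short
    }
}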

View File

@@ -0,0 +1,444 @@
package stirling.software.SPDF.service.pdfjson;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.awt.image.BufferedImage;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Base64;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.UUID;
import java.util.function.Consumer;
import javax.imageio.ImageIO;
import org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.contentstream.operator.OperatorName;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.util.Matrix;
import org.springframework.stereotype.Service;
import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.api.PdfJsonConversionProgress;
import stirling.software.SPDF.model.json.PdfJsonImageElement;
/**
* Service for handling PDF image operations for JSON conversion (extraction, encoding, rendering).
*/
@Service
@Slf4j
public class PdfJsonImageService {
private record EncodedImage(String base64, String format) {}
private record Bounds(float left, float right, float bottom, float top) {
float width() {
return Math.max(0f, right - left);
}
float height() {
return Math.max(0f, top - bottom);
}
}
/**
* Collects images from all pages in a PDF document.
*
* @param document The PDF document
* @param totalPages Total number of pages
* @param progress Progress callback
* @return Map of page number to list of image elements
* @throws IOException If image extraction fails
*/
public Map<Integer, List<PdfJsonImageElement>> collectImages(
PDDocument document, int totalPages, Consumer<PdfJsonConversionProgress> progress)
throws IOException {
Map<Integer, List<PdfJsonImageElement>> imagesByPage = new LinkedHashMap<>();
int pageNumber = 1;
for (PDPage page : document.getPages()) {
ImageCollectingEngine engine =
new ImageCollectingEngine(page, pageNumber, imagesByPage);
engine.processPage(page);
// Update progress for image extraction (70-80%)
int imageProgress = 70 + (int) ((pageNumber / (double) totalPages) * 10);
progress.accept(
PdfJsonConversionProgress.of(
imageProgress, "images", "Extracting images", pageNumber, totalPages));
pageNumber++;
}
return imagesByPage;
}
/**
* Extracts images from a single PDF page (for on-demand lazy loading).
*
* @param document The PDF document
* @param page The specific page to extract images from
* @param pageNumber The page number (1-indexed)
* @return List of image elements for this page
* @throws IOException If image extraction fails
*/
public List<PdfJsonImageElement> extractImagesForPage(
PDDocument document, PDPage page, int pageNumber) throws IOException {
Map<Integer, List<PdfJsonImageElement>> imagesByPage = new LinkedHashMap<>();
ImageCollectingEngine engine = new ImageCollectingEngine(page, pageNumber, imagesByPage);
engine.processPage(page);
return imagesByPage.getOrDefault(pageNumber, new ArrayList<>());
}
/**
* Draws an image element on a PDF page content stream.
*
* @param contentStream The content stream to draw on
* @param document The PDF document
* @param element The image element to draw
* @param cache Cache of previously created image XObjects
* @throws IOException If drawing fails
*/
public void drawImageElement(
PDPageContentStream contentStream,
PDDocument document,
PdfJsonImageElement element,
Map<String, PDImageXObject> cache)
throws IOException {
if (element == null || element.getImageData() == null || element.getImageData().isBlank()) {
return;
}
String cacheKey =
element.getId() != null && !element.getId().isBlank()
? element.getId()
: Integer.toHexString(System.identityHashCode(element));
PDImageXObject image = cache.get(cacheKey);
if (image == null) {
image = createImageXObject(document, element);
if (image == null) {
return;
}
cache.put(cacheKey, image);
}
List<Float> transform = element.getTransform();
if (transform != null && transform.size() == 6) {
Matrix matrix =
new Matrix(
safeFloat(transform.get(0), 1f),
safeFloat(transform.get(1), 0f),
safeFloat(transform.get(2), 0f),
safeFloat(transform.get(3), 1f),
safeFloat(transform.get(4), 0f),
safeFloat(transform.get(5), 0f));
contentStream.drawImage(image, matrix);
return;
}
float width = safeFloat(element.getWidth(), fallbackWidth(element));
float height = safeFloat(element.getHeight(), fallbackHeight(element));
if (width <= 0f) {
width = Math.max(1f, fallbackWidth(element));
}
if (height <= 0f) {
height = Math.max(1f, fallbackHeight(element));
}
float left = resolveLeft(element, width);
float bottom = resolveBottom(element, height);
contentStream.drawImage(image, left, bottom, width, height);
}
/**
* Creates a PDImageXObject from a PdfJsonImageElement.
*
* @param document The PDF document
* @param element The image element with base64 data
* @return The created image XObject
* @throws IOException If image creation fails
*/
public PDImageXObject createImageXObject(PDDocument document, PdfJsonImageElement element)
throws IOException {
byte[] data;
try {
data = Base64.getDecoder().decode(element.getImageData());
} catch (IllegalArgumentException ex) {
log.debug("Failed to decode image element: {}", ex.getMessage());
return null;
}
String name = element.getId() != null ? element.getId() : UUID.randomUUID().toString();
return PDImageXObject.createFromByteArray(document, data, name);
}
private EncodedImage encodeImage(PDImage image) {
try {
BufferedImage bufferedImage = image.getImage();
if (bufferedImage == null) {
return null;
}
String format = resolveImageFormat(image);
if (format == null || format.isBlank()) {
format = "png";
}
ByteArrayOutputStream baos = new ByteArrayOutputStream();
boolean written = ImageIO.write(bufferedImage, format, baos);
if (!written) {
if (!"png".equalsIgnoreCase(format)) {
baos.reset();
if (!ImageIO.write(bufferedImage, "png", baos)) {
return null;
}
format = "png";
} else {
return null;
}
}
return new EncodedImage(Base64.getEncoder().encodeToString(baos.toByteArray()), format);
} catch (IOException ex) {
log.debug("Failed to encode image: {}", ex.getMessage());
return null;
}
}
private String resolveImageFormat(PDImage image) {
if (image instanceof PDImageXObject xObject) {
String suffix = xObject.getSuffix();
if (suffix != null && !suffix.isBlank()) {
return suffix.toLowerCase(Locale.ROOT);
}
}
return "png";
}
private float fallbackWidth(PdfJsonImageElement element) {
if (element.getRight() != null && element.getLeft() != null) {
return Math.max(0f, element.getRight() - element.getLeft());
}
if (element.getNativeWidth() != null) {
return element.getNativeWidth();
}
return 1f;
}
private float fallbackHeight(PdfJsonImageElement element) {
if (element.getTop() != null && element.getBottom() != null) {
return Math.max(0f, element.getTop() - element.getBottom());
}
if (element.getNativeHeight() != null) {
return element.getNativeHeight();
}
return 1f;
}
private float resolveLeft(PdfJsonImageElement element, float width) {
if (element.getLeft() != null) {
return element.getLeft();
}
if (element.getX() != null) {
return element.getX();
}
if (element.getRight() != null) {
return element.getRight() - width;
}
return 0f;
}
private float resolveBottom(PdfJsonImageElement element, float height) {
if (element.getBottom() != null) {
return element.getBottom();
}
if (element.getY() != null) {
return element.getY();
}
if (element.getTop() != null) {
return element.getTop() - height;
}
return 0f;
}
private List<Float> toMatrixValues(Matrix matrix) {
List<Float> values = new ArrayList<>(6);
values.add(matrix.getValue(0, 0));
values.add(matrix.getValue(0, 1));
values.add(matrix.getValue(1, 0));
values.add(matrix.getValue(1, 1));
values.add(matrix.getValue(2, 0));
values.add(matrix.getValue(2, 1));
return values;
}
private float safeFloat(Float value, float defaultValue) {
if (value == null || Float.isNaN(value) || Float.isInfinite(value)) {
return defaultValue;
}
return value;
}
/**
* Inner engine that extends PDFGraphicsStreamEngine to collect images from PDF content streams.
*/
private class ImageCollectingEngine extends PDFGraphicsStreamEngine {
private final int pageNumber;
private final Map<Integer, List<PdfJsonImageElement>> imagesByPage;
private COSName currentXObjectName;
private int imageCounter = 0;
protected ImageCollectingEngine(
PDPage page, int pageNumber, Map<Integer, List<PdfJsonImageElement>> imagesByPage)
throws IOException {
super(page);
this.pageNumber = pageNumber;
this.imagesByPage = imagesByPage;
}
@Override
public void processPage(PDPage page) throws IOException {
super.processPage(page);
}
@Override
public void drawImage(PDImage pdImage) throws IOException {
EncodedImage encoded = encodeImage(pdImage);
if (encoded == null) {
return;
}
Matrix ctm = getGraphicsState().getCurrentTransformationMatrix();
Bounds bounds = computeBounds(ctm);
List<Float> matrixValues = toMatrixValues(ctm);
PdfJsonImageElement element =
PdfJsonImageElement.builder()
.id(UUID.randomUUID().toString())
.objectName(
currentXObjectName != null
? currentXObjectName.getName()
: null)
.inlineImage(!(pdImage instanceof PDImageXObject))
.nativeWidth(pdImage.getWidth())
.nativeHeight(pdImage.getHeight())
.x(bounds.left)
.y(bounds.bottom)
.width(bounds.width())
.height(bounds.height())
.left(bounds.left)
.right(bounds.right)
.top(bounds.top)
.bottom(bounds.bottom)
.transform(matrixValues)
.zOrder(-1_000_000 + imageCounter)
.imageData(encoded.base64())
.imageFormat(encoded.format())
.build();
imageCounter++;
imagesByPage.computeIfAbsent(pageNumber, key -> new ArrayList<>()).add(element);
}
@Override
public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3)
throws IOException {
// Not needed for image extraction
}
@Override
public void clip(int windingRule) throws IOException {
// Not needed for image extraction
}
@Override
public void moveTo(float x, float y) throws IOException {
// Not needed for image extraction
}
@Override
public void lineTo(float x, float y) throws IOException {
// Not needed for image extraction
}
@Override
public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3)
throws IOException {
// Not needed for image extraction
}
@Override
public Point2D getCurrentPoint() throws IOException {
return new Point2D.Float();
}
@Override
public void closePath() throws IOException {
// Not needed for image extraction
}
@Override
public void endPath() throws IOException {
// Not needed for image extraction
}
@Override
public void shadingFill(COSName shadingName) throws IOException {
// Not needed for image extraction
}
@Override
public void fillAndStrokePath(int windingRule) throws IOException {
// Not needed for image extraction
}
@Override
public void fillPath(int windingRule) throws IOException {
// Not needed for image extraction
}
@Override
public void strokePath() throws IOException {
// Not needed for image extraction
}
@Override
protected void processOperator(Operator operator, List<COSBase> operands)
throws IOException {
if (OperatorName.DRAW_OBJECT.equals(operator.getName())
&& !operands.isEmpty()
&& operands.get(0) instanceof COSName name) {
currentXObjectName = name;
}
super.processOperator(operator, operands);
currentXObjectName = null;
}
private Bounds computeBounds(Matrix ctm) {
AffineTransform transform = ctm.createAffineTransform();
Point2D.Float p0 = new Point2D.Float(0, 0);
Point2D.Float p1 = new Point2D.Float(1, 0);
Point2D.Float p2 = new Point2D.Float(0, 1);
Point2D.Float p3 = new Point2D.Float(1, 1);
transform.transform(p0, p0);
transform.transform(p1, p1);
transform.transform(p2, p2);
transform.transform(p3, p3);
float minX = Math.min(Math.min(p0.x, p1.x), Math.min(p2.x, p3.x));
float maxX = Math.max(Math.max(p0.x, p1.x), Math.max(p2.x, p3.x));
float minY = Math.min(Math.min(p0.y, p1.y), Math.min(p2.y, p3.y));
float maxY = Math.max(Math.max(p0.y, p1.y), Math.max(p2.y, p3.y));
if (!Float.isFinite(minX) || !Float.isFinite(minY)) {
return new Bounds(0f, 0f, 0f, 0f);
}
return new Bounds(minX, maxX, minY, maxY);
}
}
}
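
The engine's computeBounds maps the unit image square through the current transformation matrix: PDF image space is always 1x1, so the CTM carries all scale and placement. A standalone sketch with illustrative values (a 200x100 pt image placed at (50, 700)):

import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;

public class ImageBoundsDemo {
    public static void main(String[] args) {
        // CTM for a 200x100 image at (50, 700): (m00, m10, m01, m11, m02, m12)
        AffineTransform ctm = new AffineTransform(200, 0, 0, 100, 50, 700);

        Point2D origin = ctm.transform(new Point2D.Float(0, 0), null); // left/bottom corner
        Point2D corner = ctm.transform(new Point2D.Float(1, 1), null); // right/top corner

        System.out.println(origin); // (50.0, 700.0)
        System.out.println(corner); // (250.0, 800.0) -> width 200, height 100
    }
}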

View File

@@ -0,0 +1,148 @@
package stirling.software.SPDF.service.pdfjson;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.time.Instant;
import java.time.format.DateTimeParseException;
import java.util.Base64;
import java.util.Calendar;
import java.util.Optional;
import java.util.TimeZone;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.common.PDMetadata;
import org.springframework.stereotype.Service;
import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.json.PdfJsonMetadata;
/** Service for extracting and applying PDF metadata (document info and XMP) for JSON conversion. */
@Service
@Slf4j
public class PdfJsonMetadataService {
/**
* Extracts document information metadata from a PDF.
*
* @param document The PDF document
* @return Metadata model with document info
*/
public PdfJsonMetadata extractMetadata(PDDocument document) {
PdfJsonMetadata metadata = new PdfJsonMetadata();
PDDocumentInformation info = document.getDocumentInformation();
if (info != null) {
metadata.setTitle(info.getTitle());
metadata.setAuthor(info.getAuthor());
metadata.setSubject(info.getSubject());
metadata.setKeywords(info.getKeywords());
metadata.setCreator(info.getCreator());
metadata.setProducer(info.getProducer());
metadata.setCreationDate(formatCalendar(info.getCreationDate()));
metadata.setModificationDate(formatCalendar(info.getModificationDate()));
metadata.setTrapped(info.getTrapped());
}
metadata.setNumberOfPages(document.getNumberOfPages());
return metadata;
}
/**
* Extracts XMP metadata from a PDF as base64-encoded string.
*
* @param document The PDF document
* @return Base64-encoded XMP metadata, or null if not present
*/
public String extractXmpMetadata(PDDocument document) {
if (document.getDocumentCatalog() == null) {
return null;
}
PDMetadata metadata = document.getDocumentCatalog().getMetadata();
if (metadata == null) {
return null;
}
try (InputStream inputStream = metadata.createInputStream();
ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
inputStream.transferTo(baos);
byte[] data = baos.toByteArray();
if (data.length == 0) {
return null;
}
return Base64.getEncoder().encodeToString(data);
} catch (IOException ex) {
log.debug("Failed to extract XMP metadata: {}", ex.getMessage());
return null;
}
}
/**
* Applies metadata to a PDF document.
*
* @param document The PDF document
* @param metadata The metadata to apply
*/
public void applyMetadata(PDDocument document, PdfJsonMetadata metadata) {
if (metadata == null) {
return;
}
PDDocumentInformation info = document.getDocumentInformation();
info.setTitle(metadata.getTitle());
info.setAuthor(metadata.getAuthor());
info.setSubject(metadata.getSubject());
info.setKeywords(metadata.getKeywords());
info.setCreator(metadata.getCreator());
info.setProducer(metadata.getProducer());
if (metadata.getCreationDate() != null) {
parseInstant(metadata.getCreationDate())
.ifPresent(instant -> info.setCreationDate(toCalendar(instant)));
}
if (metadata.getModificationDate() != null) {
parseInstant(metadata.getModificationDate())
.ifPresent(instant -> info.setModificationDate(toCalendar(instant)));
}
info.setTrapped(metadata.getTrapped());
}
/**
* Applies XMP metadata to a PDF document from base64-encoded string.
*
* @param document The PDF document
* @param base64 Base64-encoded XMP metadata
*/
public void applyXmpMetadata(PDDocument document, String base64) {
if (base64 == null || base64.isBlank()) {
return;
}
try (InputStream inputStream =
new ByteArrayInputStream(Base64.getDecoder().decode(base64))) {
PDMetadata metadata = new PDMetadata(document, inputStream);
document.getDocumentCatalog().setMetadata(metadata);
} catch (IllegalArgumentException | IOException ex) {
log.debug("Failed to apply XMP metadata: {}", ex.getMessage());
}
}
private String formatCalendar(Calendar calendar) {
if (calendar == null) {
return null;
}
return calendar.toInstant().toString();
}
private Optional<Instant> parseInstant(String value) {
try {
return Optional.of(Instant.parse(value));
} catch (DateTimeParseException ex) {
log.warn("Failed to parse instant '{}': {}", value, ex.getMessage());
return Optional.empty();
}
}
private Calendar toCalendar(Instant instant) {
Calendar calendar = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
calendar.setTimeInMillis(instant.toEpochMilli());
return calendar;
}
}
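
A round-trip sketch, assuming PDFBox on the classpath: dates are exported as ISO-8601 instants and parsed back into UTC calendars on apply, so a JSON-side edit of the metadata survives the cycle.

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;

import stirling.software.SPDF.model.json.PdfJsonMetadata;
import stirling.software.SPDF.service.pdfjson.PdfJsonMetadataService;

public class MetadataRoundTripDemo {
    public static void main(String[] args) throws Exception {
        PdfJsonMetadataService service = new PdfJsonMetadataService();
        try (PDDocument document = new PDDocument()) {
            document.addPage(new PDPage());
            document.getDocumentInformation().setTitle("Example");

            PdfJsonMetadata metadata = service.extractMetadata(document);
            metadata.setAuthor("Edited offline"); // simulate a JSON-side edit

            service.applyMetadata(document, metadata);
            System.out.println(document.getDocumentInformation().getAuthor());
        }
    }
}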

View File

@@ -0,0 +1,308 @@
package stirling.software.SPDF.service.pdfjson;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.function.Consumer;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile;
import com.fasterxml.jackson.databind.ObjectMapper;
import lombok.Data;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.api.PdfJsonConversionProgress;
import stirling.software.SPDF.model.json.PdfJsonAnnotation;
import stirling.software.SPDF.model.json.PdfJsonCosValue;
import stirling.software.SPDF.model.json.PdfJsonDocumentMetadata;
import stirling.software.SPDF.model.json.PdfJsonFont;
import stirling.software.SPDF.model.json.PdfJsonImageElement;
import stirling.software.SPDF.model.json.PdfJsonPage;
import stirling.software.SPDF.model.json.PdfJsonPageDimension;
import stirling.software.SPDF.model.json.PdfJsonStream;
import stirling.software.SPDF.model.json.PdfJsonTextElement;
import stirling.software.common.service.CustomPDFDocumentFactory;
import stirling.software.common.service.TaskManager;
import stirling.software.common.util.ExceptionUtils;

/**
 * Service for lazy loading PDF pages. Caches the original PDF bytes per job and re-opens the
 * document on demand to reduce memory usage for large PDFs.
 */
@Service
@Slf4j
@RequiredArgsConstructor
public class PdfLazyLoadingService {

private final CustomPDFDocumentFactory pdfDocumentFactory;
private final ObjectMapper objectMapper;
private final TaskManager taskManager;
private final PdfJsonMetadataService metadataService;
    private final PdfJsonImageService imageService;

    /** Cache of PDF bytes and document metadata for lazy page loading. Key is jobId. */
    private final Map<String, CachedPdfDocument> documentCache = new ConcurrentHashMap<>();

/**
* Stores PDF file bytes for lazy page loading. Each page is extracted on-demand by re-loading
* the PDF from bytes.
*/
@Data
private static class CachedPdfDocument {
private final byte[] pdfBytes;
private final PdfJsonDocumentMetadata metadata;
        private final long timestamp;

public CachedPdfDocument(byte[] pdfBytes, PdfJsonDocumentMetadata metadata) {
this.pdfBytes = pdfBytes;
this.metadata = metadata;
this.timestamp = System.currentTimeMillis();
}
    }

/**
* Extracts document metadata, fonts, and page dimensions without page content. Caches the PDF
* bytes for subsequent page requests.
*
* @param file The uploaded PDF file
* @param jobId The job ID for caching
* @param fonts Font map (will be populated)
* @param pageFontResources Page font resources map (will be populated)
* @return Serialized metadata JSON
* @throws IOException If extraction fails
*/
public byte[] extractDocumentMetadata(
MultipartFile file,
String jobId,
Map<String, PdfJsonFont> fonts,
Map<Integer, Map<PDFont, String>> pageFontResources)
throws IOException {
if (file == null) {
throw ExceptionUtils.createNullArgumentException("fileInput");
}
Consumer<PdfJsonConversionProgress> progress =
jobId != null
? (p) -> {
log.info(
"Progress: [{}%] {} - {}{}",
p.getPercent(),
p.getStage(),
p.getMessage(),
(p.getCurrent() != null && p.getTotal() != null)
? String.format(
" (%d/%d)", p.getCurrent(), p.getTotal())
: "");
reportProgressToTaskManager(jobId, p);
}
: (p) -> {};
// Read PDF bytes once for processing and caching
byte[] pdfBytes = file.getBytes();
try (PDDocument document = pdfDocumentFactory.load(pdfBytes, true)) {
// Build metadata response
progress.accept(PdfJsonConversionProgress.of(90, "metadata", "Extracting metadata"));
PdfJsonDocumentMetadata docMetadata = new PdfJsonDocumentMetadata();
docMetadata.setMetadata(metadataService.extractMetadata(document));
docMetadata.setXmpMetadata(metadataService.extractXmpMetadata(document));
docMetadata.setLazyImages(Boolean.TRUE);
List<PdfJsonFont> serializedFonts = new ArrayList<>(fonts.values());
serializedFonts.sort(
Comparator.comparing(
PdfJsonFont::getUid, Comparator.nullsLast(Comparator.naturalOrder())));
docMetadata.setFonts(serializedFonts);
// Extract page dimensions
List<PdfJsonPageDimension> pageDimensions = new ArrayList<>();
int pageIndex = 0;
for (PDPage page : document.getPages()) {
PdfJsonPageDimension dim = new PdfJsonPageDimension();
dim.setPageNumber(pageIndex + 1);
PDRectangle mediaBox = page.getMediaBox();
dim.setWidth(mediaBox.getWidth());
dim.setHeight(mediaBox.getHeight());
dim.setRotation(page.getRotation());
pageDimensions.add(dim);
pageIndex++;
}
docMetadata.setPageDimensions(pageDimensions);
// Cache PDF bytes and metadata for lazy page loading
if (jobId != null) {
CachedPdfDocument cached = new CachedPdfDocument(pdfBytes, docMetadata);
documentCache.put(jobId, cached);
log.info(
"Cached PDF bytes ({} bytes) for lazy loading, jobId: {}",
pdfBytes.length,
jobId);
// Schedule cleanup after 30 minutes
scheduleDocumentCleanup(jobId);
}
progress.accept(
PdfJsonConversionProgress.of(100, "complete", "Metadata extraction complete"));
return objectMapper.writeValueAsBytes(docMetadata);
}
    }

/**
* Extracts a single page from cached PDF bytes. Re-loads the PDF for each request.
*
* @param jobId The job ID
* @param pageNumber The page number (1-indexed)
* @param serializeCosValue Function to serialize COS values
* @param extractContentStreams Function to extract content streams
* @param filterImageXObjectsFromResources Function to filter image XObjects
* @param extractText Function to extract text elements for the page
* @param extractAnnotations Function to extract annotations for the page
* @return Serialized page JSON
* @throws IOException If extraction fails
*/
public byte[] extractSinglePage(
String jobId,
int pageNumber,
java.util.function.Function<COSBase, PdfJsonCosValue> serializeCosValue,
java.util.function.Function<PDPage, List<PdfJsonStream>> extractContentStreams,
java.util.function.Function<COSBase, COSBase> filterImageXObjectsFromResources,
java.util.function.BiFunction<PDDocument, Integer, List<PdfJsonTextElement>>
extractText,
java.util.function.BiFunction<PDDocument, Integer, List<PdfJsonAnnotation>>
extractAnnotations)
throws IOException {
CachedPdfDocument cached = documentCache.get(jobId);
if (cached == null) {
throw new IllegalArgumentException("No cached document found for jobId: " + jobId);
}
int pageIndex = pageNumber - 1;
int totalPages = cached.getMetadata().getPageDimensions().size();
if (pageIndex < 0 || pageIndex >= totalPages) {
throw new IllegalArgumentException(
"Page number " + pageNumber + " out of range (1-" + totalPages + ")");
}
log.debug("Loading PDF from bytes to extract page {} (jobId: {})", pageNumber, jobId);
// Re-load PDF from cached bytes and extract the single page
try (PDDocument document = pdfDocumentFactory.load(cached.getPdfBytes(), true)) {
PDPage page = document.getPage(pageIndex);
PdfJsonPage pageModel = new PdfJsonPage();
pageModel.setPageNumber(pageNumber);
PDRectangle mediaBox = page.getMediaBox();
pageModel.setWidth(mediaBox.getWidth());
pageModel.setHeight(mediaBox.getHeight());
pageModel.setRotation(page.getRotation());
// Extract text on-demand
pageModel.setTextElements(extractText.apply(document, pageNumber));
// Extract annotations on-demand
pageModel.setAnnotations(extractAnnotations.apply(document, pageNumber));
// Extract images on-demand
List<PdfJsonImageElement> images =
imageService.extractImagesForPage(document, page, pageNumber);
pageModel.setImageElements(images);
// Extract resources and content streams
COSBase resourcesBase = page.getCOSObject().getDictionaryObject(COSName.RESOURCES);
COSBase filteredResources = filterImageXObjectsFromResources.apply(resourcesBase);
pageModel.setResources(serializeCosValue.apply(filteredResources));
pageModel.setContentStreams(extractContentStreams.apply(page));
log.debug(
"Extracted page {} (text: {}, images: {}, annotations: {}) for jobId: {}",
pageNumber,
pageModel.getTextElements().size(),
images.size(),
pageModel.getAnnotations().size(),
jobId);
return objectMapper.writeValueAsBytes(pageModel);
}
    }

/** Clears a cached document. */
public void clearCachedDocument(String jobId) {
CachedPdfDocument cached = documentCache.remove(jobId);
if (cached != null) {
log.info(
"Removed cached PDF bytes ({} bytes) for jobId: {}",
cached.getPdfBytes().length,
jobId);
}
    }

/** Schedules automatic cleanup of cached documents after 30 minutes. */
private void scheduleDocumentCleanup(String jobId) {
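        // Note: this creates one non-daemon thread per cached document, which can delay
        // JVM shutdown; see the shared-scheduler sketch after this class.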
new Thread(
() -> {
try {
Thread.sleep(TimeUnit.MINUTES.toMillis(30));
clearCachedDocument(jobId);
log.info("Auto-cleaned cached document for jobId: {}", jobId);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
})
.start();
    }

/**
 * Reports progress to the TaskManager for async jobs.
*
* @param jobId The job ID
* @param progress The progress update
*/
private void reportProgressToTaskManager(String jobId, PdfJsonConversionProgress progress) {
try {
log.info(
"Reporting progress for job {}: {}% - {}",
jobId, progress.getPercent(), progress.getStage());
String note;
if (progress.getCurrent() != null && progress.getTotal() != null) {
note =
String.format(
"[%d%%] %s: %s (%d/%d)",
progress.getPercent(),
progress.getStage(),
progress.getMessage(),
progress.getCurrent(),
progress.getTotal());
} else {
note =
String.format(
"[%d%%] %s: %s",
progress.getPercent(), progress.getStage(), progress.getMessage());
}
boolean added = taskManager.addNote(jobId, note);
if (!added) {
log.warn("Failed to add note - job {} not found in TaskManager", jobId);
} else {
log.info("Successfully added progress note for job {}: {}", jobId, note);
}
} catch (Exception e) {
log.error("Exception reporting progress for job {}: {}", jobId, e.getMessage(), e);
}
}
}
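A single shared scheduler would avoid spawning one sleeper thread per cached document. A minimal alternative sketch, assuming ScheduledExecutorService and Executors from java.util.concurrent; not part of this commit:

    // One daemon scheduler shared by all cached documents; daemon threads do not
    // block JVM shutdown the way per-document, non-daemon threads can.
    private final ScheduledExecutorService cleanupScheduler =
            Executors.newSingleThreadScheduledExecutor(
                    runnable -> {
                        Thread thread = new Thread(runnable, "pdf-json-cache-cleanup");
                        thread.setDaemon(true);
                        return thread;
                    });

    private void scheduleDocumentCleanup(String jobId) {
        cleanupScheduler.schedule(() -> clearCachedDocument(jobId), 30, TimeUnit.MINUTES);
    }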

View File

@ -4437,6 +4437,32 @@
"errors": {
"invalidJson": "Unable to read the JSON file. Ensure it was generated by the PDF to JSON tool.",
"pdfConversion": "Unable to convert the edited JSON back into a PDF."
},
"options": {
"autoScaleText": {
"title": "Auto-scale text to fit boxes",
"description": "Automatically scales text horizontally to fit within its original bounding box when font rendering differs from PDF."
}
},
"disclaimer": {
"heading": "Preview limitations",
"textFocus": "This workspace focuses on editing text and repositioning embedded images. Complex page artwork, form widgets, and layered graphics are preserved for export but are not fully editable here.",
"previewVariance": "Some visuals (such as table borders, shapes, or annotation appearances) may not display exactly in the preview. The exported PDF keeps the original drawing commands whenever possible.",
"alpha": "This alpha viewer is still evolving—certain fonts, colours, transparency effects, and layout details may shift slightly. Please double-check the generated PDF before sharing."
},
"stages": {
"uploading": "Uploading",
"initializing": "Initializing",
"loading": "Loading",
"normalizing": "Normalizing",
"parsing": "Parsing",
"fonts": "Fonts",
"text": "Text Extraction",
"images": "Images",
"annotations": "Annotations",
"metadata": "Metadata",
"serializing": "Finalizing",
"complete": "Complete"
}
},
"workspace": {

View File

@ -11,8 +11,10 @@ import {
FileButton,
Group,
Pagination,
Progress,
ScrollArea,
Stack,
Switch,
Text,
Title,
} from '@mantine/core';
@ -32,6 +34,7 @@ import {
PdfJsonEditorViewData,
PdfJsonFont,
PdfJsonPage,
ConversionProgress,
} from '@app/tools/pdfJsonEditor/pdfJsonEditorTypes';
import { getImageBounds, pageDimensions } from '@app/tools/pdfJsonEditor/pdfJsonEditorUtils';
@ -205,6 +208,9 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
const [activeImageId, setActiveImageId] = useState<string | null>(null);
const [fontFamilies, setFontFamilies] = useState<Map<string, string>>(new Map());
const [textGroupsExpanded, setTextGroupsExpanded] = useState(false);
const [autoScaleText, setAutoScaleText] = useState(true);
const [textScales, setTextScales] = useState<Map<string, number>>(new Map());
const measurementKeyRef = useRef<string>('');
const containerRef = useRef<HTMLDivElement | null>(null);
const editorRefs = useRef<Map<string, HTMLDivElement>>(new Map());
const caretOffsetsRef = useRef<Map<string, number>>(new Map());
@ -220,6 +226,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
errorMessage,
isGeneratingPdf,
isConverting,
conversionProgress,
hasChanges,
onLoadJson,
onSelectPage,
@ -562,8 +569,73 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
setActiveGroupId(null);
setEditingGroupId(null);
setActiveImageId(null);
setTextScales(new Map());
measurementKeyRef.current = '';
}, [selectedPage]);
// Measure text widths once per page/configuration and apply static scaling
useLayoutEffect(() => {
if (!autoScaleText || visibleGroups.length === 0) {
return;
}
// Create a stable key for this measurement configuration
const currentKey = `${selectedPage}-${fontFamilies.size}-${autoScaleText}`;
// Skip if we've already measured for this configuration
if (measurementKeyRef.current === currentKey) {
return;
}
const measureTextScales = () => {
const newScales = new Map<string, number>();
visibleGroups.forEach((group) => {
// Skip groups that are being edited
if (editingGroupId === group.id) {
return;
}
const element = document.querySelector<HTMLElement>(`[data-text-group="${group.id}"]`);
if (!element) {
return;
}
const textSpan = element.querySelector<HTMLSpanElement>('span[data-text-content]');
if (!textSpan) {
return;
}
// Temporarily remove any existing transform to get natural width
const originalTransform = textSpan.style.transform;
textSpan.style.transform = 'none';
const bounds = toCssBounds(currentPage, pageHeight, scale, group.bounds);
const containerWidth = bounds.width;
const textWidth = textSpan.getBoundingClientRect().width;
// Restore original transform
textSpan.style.transform = originalTransform;
// Only scale if text overflows by more than 2%
if (textWidth > 0 && textWidth > containerWidth * 1.02) {
const scaleX = Math.max(containerWidth / textWidth, 0.5); // Min 50% scale
newScales.set(group.id, scaleX);
} else {
newScales.set(group.id, 1);
}
});
// Mark this configuration as measured
measurementKeyRef.current = currentKey;
setTextScales(newScales);
};
// Delay measurement to ensure fonts and layout are ready
const timer = setTimeout(measureTextScales, 150);
return () => clearTimeout(timer);
}, [autoScaleText, visibleGroups, editingGroupId, currentPage, pageHeight, scale, fontFamilies.size, selectedPage]);
useLayoutEffect(() => {
if (!editingGroupId) {
return;
@ -726,6 +798,27 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
{t('pdfJsonEditor.currentFile', 'Current file: {{name}}', { name: fileName })}
</Text>
)}
<Divider my="sm" />
<Group justify="space-between" align="center">
<div>
<Text fw={500} size="sm">
{t('pdfJsonEditor.options.autoScaleText.title', 'Auto-scale text to fit boxes')}
</Text>
<Text size="xs" c="dimmed" mt={4}>
{t(
'pdfJsonEditor.options.autoScaleText.description',
                'Automatically scales text horizontally to fit within its original bounding box when font rendering differs from the PDF.'
)}
</Text>
</div>
<Switch
size="md"
checked={autoScaleText}
onChange={(event) => setAutoScaleText(event.currentTarget.checked)}
/>
</Group>
</Stack>
</Card>
@ -782,10 +875,39 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
{isConverting && (
<Card withBorder radius="md" padding="xl">
<Stack align="center" gap="md">
<AutorenewIcon sx={{ fontSize: 48 }} className="animate-spin" />
<Text size="lg" fw={600}>
{t('pdfJsonEditor.converting', 'Converting PDF to editable format...')}
<Stack gap="md">
<Group justify="space-between" align="flex-start">
<div style={{ flex: 1 }}>
<Text size="lg" fw={600} mb="xs">
{conversionProgress
? conversionProgress.message
: t('pdfJsonEditor.converting', 'Converting PDF to editable format...')}
</Text>
{conversionProgress && (
<Group gap="xs">
<Text size="sm" c="dimmed" tt="capitalize">
{t(`pdfJsonEditor.stages.${conversionProgress.stage}`, conversionProgress.stage)}
</Text>
{conversionProgress.current !== undefined &&
conversionProgress.total !== undefined && (
<Text size="sm" c="dimmed">
Page {conversionProgress.current} of {conversionProgress.total}
</Text>
)}
</Group>
)}
</div>
<AutorenewIcon sx={{ fontSize: 36 }} className="animate-spin" />
</Group>
<Progress
value={conversionProgress?.percent || 0}
size="lg"
radius="md"
animated
striped
/>
<Text size="sm" c="dimmed" ta="right">
{conversionProgress?.percent || 0}% complete
</Text>
</Stack>
</Card>
@ -1105,6 +1227,9 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
);
}
const textScale = textScales.get(group.id) ?? 1;
const shouldScale = autoScaleText && textScale < 0.98;
return (
<Box key={group.id} style={containerStyle}>
{renderGroupContainer(
@ -1112,6 +1237,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
isActive,
changed,
<div
data-text-group={group.id}
style={{
width: '100%',
minHeight: '100%',
@ -1127,7 +1253,17 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
overflow: 'visible',
}}
>
<span style={{ pointerEvents: 'none' }}>{group.text || '\u00A0'}</span>
<span
data-text-content
style={{
pointerEvents: 'none',
display: 'inline-block',
transform: shouldScale ? `scaleX(${textScale})` : undefined,
transformOrigin: 'left center',
}}
>
{group.text || '\u00A0'}
</span>
</div>,
() => {
setEditingGroupId(group.id);

View File

@ -27,8 +27,8 @@ export function useProprietaryToolRegistry(): ProprietaryToolRegistry {
"home.pdfJsonEditor.desc",
"Review and edit Stirling PDF JSON exports with grouped text editing and PDF regeneration"
),
categoryId: ToolCategoryId.ADVANCED_TOOLS,
subcategoryId: SubcategoryId.DEVELOPER_TOOLS,
categoryId: ToolCategoryId.RECOMMENDED_TOOLS,
subcategoryId: SubcategoryId.GENERAL,
workbench: "custom:pdfJsonEditor",
endpoints: ["json-pdf"],
synonyms: getSynonyms(t, "pdfJsonEditor"),

View File

@ -13,6 +13,7 @@ import { getFilenameFromHeaders } from '@app/utils/fileResponseUtils';
import {
PdfJsonDocument,
PdfJsonImageElement,
PdfJsonPage,
TextGroup,
  PdfJsonEditorViewData,
  ConversionProgress,
} from './pdfJsonEditorTypes';
@ -68,11 +69,39 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
const [errorMessage, setErrorMessage] = useState<string | null>(null);
const [isGeneratingPdf, setIsGeneratingPdf] = useState(false);
const [isConverting, setIsConverting] = useState(false);
  const [conversionProgress, setConversionProgress] = useState<ConversionProgress | null>(null);
// Lazy loading state
const [isLazyMode, setIsLazyMode] = useState(false);
const [cachedJobId, setCachedJobId] = useState<string | null>(null);
const [loadedImagePages, setLoadedImagePages] = useState<Set<number>>(new Set());
const [loadingImagePages, setLoadingImagePages] = useState<Set<number>>(new Set());
const originalImagesRef = useRef<PdfJsonImageElement[][]>([]);
const imagesByPageRef = useRef<PdfJsonImageElement[][]>([]);
const autoLoadKeyRef = useRef<string | null>(null);
const loadRequestIdRef = useRef(0);
const latestPdfRequestIdRef = useRef<number | null>(null);
const loadedDocumentRef = useRef<PdfJsonDocument | null>(null);
const loadedImagePagesRef = useRef<Set<number>>(new Set());
const loadingImagePagesRef = useRef<Set<number>>(new Set());
// Keep ref in sync with state for access in async callbacks
useEffect(() => {
loadedDocumentRef.current = loadedDocument;
}, [loadedDocument]);
useEffect(() => {
loadedImagePagesRef.current = new Set(loadedImagePages);
}, [loadedImagePages]);
useEffect(() => {
loadingImagePagesRef.current = new Set(loadingImagePages);
}, [loadingImagePages]);
const dirtyPages = useMemo(
() => getDirtyPages(groupsByPage, imagesByPage, originalImagesRef.current),
@ -88,18 +117,134 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
setGroupsByPage([]);
setImagesByPage([]);
originalImagesRef.current = [];
imagesByPageRef.current = [];
setLoadedImagePages(new Set());
setLoadingImagePages(new Set());
loadedImagePagesRef.current = new Set();
loadingImagePagesRef.current = new Set();
setSelectedPage(0);
return;
}
const cloned = deepCloneDocument(document);
const groups = groupDocumentText(cloned);
const images = extractDocumentImages(cloned);
originalImagesRef.current = images.map((page) => page.map(cloneImageElement));
const originalImages = images.map((page) => page.map(cloneImageElement));
originalImagesRef.current = originalImages;
imagesByPageRef.current = images.map((page) => page.map(cloneImageElement));
const initialLoaded = new Set<number>();
originalImages.forEach((pageImages, index) => {
if (pageImages.length > 0) {
initialLoaded.add(index);
}
});
setGroupsByPage(groups);
setImagesByPage(images);
setLoadedImagePages(initialLoaded);
setLoadingImagePages(new Set());
loadedImagePagesRef.current = new Set(initialLoaded);
loadingImagePagesRef.current = new Set();
setSelectedPage(0);
}, []);
// Load images for a page in lazy mode
const loadImagesForPage = useCallback(
async (pageIndex: number) => {
if (!isLazyMode) {
return;
}
if (!cachedJobId) {
console.log('[loadImagesForPage] No cached jobId, skipping');
return;
}
if (
loadedImagePagesRef.current.has(pageIndex) ||
loadingImagePagesRef.current.has(pageIndex)
) {
return;
}
loadingImagePagesRef.current.add(pageIndex);
setLoadingImagePages((prev) => {
const next = new Set(prev);
next.add(pageIndex);
return next;
});
const pageNumber = pageIndex + 1;
const start = performance.now();
try {
const response = await apiClient.get(
`/api/v1/convert/pdf/json/page/${cachedJobId}/${pageNumber}`,
{
responseType: 'json',
},
);
const pageData = response.data as PdfJsonPage;
const normalizedImages = (pageData.imageElements ?? []).map(cloneImageElement);
if (imagesByPageRef.current.length <= pageIndex) {
imagesByPageRef.current.length = pageIndex + 1;
}
imagesByPageRef.current[pageIndex] = normalizedImages.map(cloneImageElement);
setLoadedDocument((prevDoc) => {
if (!prevDoc || !prevDoc.pages) {
return prevDoc;
}
const nextPages = [...prevDoc.pages];
const existingPage = nextPages[pageIndex] ?? {};
nextPages[pageIndex] = {
...existingPage,
imageElements: normalizedImages.map(cloneImageElement),
};
return {
...prevDoc,
pages: nextPages,
};
});
setImagesByPage((prev) => {
const next = [...prev];
while (next.length <= pageIndex) {
next.push([]);
}
next[pageIndex] = normalizedImages.map(cloneImageElement);
return next;
});
if (originalImagesRef.current.length <= pageIndex) {
originalImagesRef.current.length = pageIndex + 1;
}
originalImagesRef.current[pageIndex] = normalizedImages.map(cloneImageElement);
setLoadedImagePages((prev) => {
const next = new Set(prev);
next.add(pageIndex);
return next;
});
loadedImagePagesRef.current.add(pageIndex);
console.log(
`[loadImagesForPage] Loaded ${normalizedImages.length} images for page ${pageNumber} in ${(
performance.now() - start
).toFixed(2)}ms`,
);
} catch (error) {
console.error(`[loadImagesForPage] Failed to load images for page ${pageNumber}:`, error);
} finally {
loadingImagePagesRef.current.delete(pageIndex);
setLoadingImagePages((prev) => {
const next = new Set(prev);
next.delete(pageIndex);
return next;
});
}
},
[isLazyMode, cachedJobId],
);
const handleLoadFile = useCallback(
async (file: File | null) => {
if (!file) {
@ -113,39 +258,200 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
const isPdf = file.type === 'application/pdf' || file.name.toLowerCase().endsWith('.pdf');
try {
let parsed: PdfJsonDocument;
let parsed: PdfJsonDocument | null = null;
let shouldUseLazyMode = false;
let pendingJobId: string | null = null;
setErrorMessage(null);
if (isPdf) {
latestPdfRequestIdRef.current = requestId;
setIsConverting(true);
setConversionProgress({
percent: 0,
stage: 'uploading',
message: 'Uploading PDF file to server...',
});
const formData = new FormData();
formData.append('fileInput', file);
const response = await apiClient.post(CONVERSION_ENDPOINTS['pdf-json'], formData, {
responseType: 'blob',
console.log('Sending conversion request with async=true');
const response = await apiClient.post(
`${CONVERSION_ENDPOINTS['pdf-json']}?async=true`,
formData,
{
responseType: 'json',
},
);
console.log('Conversion response:', response.data);
const jobId = response.data.jobId;
if (!jobId) {
console.error('No job ID in response:', response.data);
throw new Error('No job ID received from server');
}
pendingJobId = jobId;
console.log('Got job ID:', jobId);
setConversionProgress({
percent: 3,
stage: 'processing',
message: 'Starting conversion...',
});
const jsonText = await response.data.text();
parsed = JSON.parse(jsonText) as PdfJsonDocument;
let jobComplete = false;
let attempts = 0;
const maxAttempts = 600;
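            // Poll once per second; 600 attempts caps the wait at 10 minutes.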
while (!jobComplete && attempts < maxAttempts) {
await new Promise((resolve) => setTimeout(resolve, 1000));
attempts += 1;
try {
const statusResponse = await apiClient.get(`/api/v1/general/job/${jobId}`);
const jobStatus = statusResponse.data;
console.log(`Job status (attempt ${attempts}):`, jobStatus);
if (jobStatus.notes && jobStatus.notes.length > 0) {
const lastNote = jobStatus.notes[jobStatus.notes.length - 1];
console.log('Latest note:', lastNote);
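                    // Server-formatted notes look like "[45%] stage: message (3/10)"
                    // or "[45%] stage: message"; try the paged variant first.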
const matchWithCount = lastNote.match(
/\[(\d+)%\]\s+(\w+):\s+(.+?)\s+\((\d+)\/(\d+)\)/,
);
if (matchWithCount) {
const percent = parseInt(matchWithCount[1], 10);
const stage = matchWithCount[2];
const message = matchWithCount[3];
const current = parseInt(matchWithCount[4], 10);
const total = parseInt(matchWithCount[5], 10);
setConversionProgress({
percent,
stage,
message,
current,
total,
});
} else {
const match = lastNote.match(/\[(\d+)%\]\s+(\w+):\s+(.+)/);
if (match) {
const percent = parseInt(match[1], 10);
const stage = match[2];
const message = match[3];
setConversionProgress({
percent,
stage,
message,
});
}
}
} else if (jobStatus.progress !== undefined) {
const percent = Math.min(Math.max(jobStatus.progress, 0), 100);
setConversionProgress({
percent,
stage: jobStatus.stage || 'processing',
message: jobStatus.note || 'Converting PDF to JSON...',
});
}
if (jobStatus.complete) {
if (jobStatus.error) {
console.error('Job failed:', jobStatus.error);
throw new Error(jobStatus.error);
}
console.log('Job completed, retrieving JSON result...');
jobComplete = true;
const resultResponse = await apiClient.get(
`/api/v1/general/job/${jobId}/result`,
{
responseType: 'blob',
},
);
const jsonText = await resultResponse.data.text();
const result = JSON.parse(jsonText);
if (!Array.isArray(result.pages)) {
console.error('Conversion result missing page array:', result);
throw new Error(
'PDF conversion result did not include page data. Please update the server.',
);
}
const docResult = result as PdfJsonDocument;
parsed = {
...docResult,
pages: docResult.pages ?? [],
};
shouldUseLazyMode = Boolean(docResult.lazyImages);
pendingJobId = shouldUseLazyMode ? jobId : null;
setConversionProgress(null);
} else {
console.log('Job not complete yet, continuing to poll...');
}
} catch (pollError: any) {
console.error('Error polling job status:', pollError);
console.error('Poll error details:', {
status: pollError?.response?.status,
data: pollError?.response?.data,
message: pollError?.message,
});
if (pollError?.response?.status === 404) {
throw new Error('Job not found on server');
}
}
}
if (!jobComplete) {
throw new Error('Conversion timed out');
}
if (!parsed) {
throw new Error('Conversion did not return JSON content');
}
} else {
const content = await file.text();
parsed = JSON.parse(content) as PdfJsonDocument;
const docResult = JSON.parse(content) as PdfJsonDocument;
parsed = {
...docResult,
pages: docResult.pages ?? [],
};
shouldUseLazyMode = false;
pendingJobId = null;
}
setConversionProgress(null);
if (loadRequestIdRef.current !== requestId) {
return;
}
if (!parsed) {
throw new Error('Failed to parse PDF JSON document');
}
console.log(
`[PdfJsonEditor] Document loaded. Lazy image mode: ${shouldUseLazyMode}, Pages: ${
parsed.pages?.length || 0
}`,
);
setLoadedDocument(parsed);
resetToDocument(parsed);
setIsLazyMode(shouldUseLazyMode);
setCachedJobId(shouldUseLazyMode ? pendingJobId : null);
setFileName(file.name);
setErrorMessage(null);
autoLoadKeyRef.current = fileKey;
} catch (error) {
} catch (error: any) {
console.error('Failed to load file', error);
console.error('Error details:', {
message: error?.message,
response: error?.response?.data,
stack: error?.stack,
});
if (loadRequestIdRef.current !== requestId) {
return;
@ -155,15 +461,17 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
resetToDocument(null);
if (isPdf) {
setErrorMessage(
t('pdfJsonEditor.conversionFailed', 'Failed to convert PDF. Please try again.')
);
const errorMsg =
error?.message ||
t('pdfJsonEditor.conversionFailed', 'Failed to convert PDF. Please try again.');
setErrorMessage(errorMsg);
console.error('Setting error message:', errorMsg);
} else {
setErrorMessage(
t(
'pdfJsonEditor.errors.invalidJson',
'Unable to read the JSON file. Ensure it was generated by the PDF to JSON tool.'
)
'Unable to read the JSON file. Ensure it was generated by the PDF to JSON tool.',
),
);
}
} finally {
@ -172,12 +480,16 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
}
}
},
[resetToDocument, t]
[resetToDocument, t],
);
const handleSelectPage = useCallback((pageIndex: number) => {
setSelectedPage(pageIndex);
}, []);
// Trigger lazy loading for images on the selected page
if (isLazyMode) {
void loadImagesForPage(pageIndex);
}
}, [isLazyMode, loadImagesForPage]);
const handleGroupTextChange = useCallback((pageIndex: number, groupId: string, value: string) => {
setGroupsByPage((previous) =>
@ -195,17 +507,14 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
imageId: string,
next: { left: number; bottom: number; width: number; height: number; transform: number[] },
) => {
setImagesByPage((previous) =>
previous.map((images, idx) => {
if (idx !== pageIndex) {
return images;
}
setImagesByPage((previous) => {
const current = previous[pageIndex] ?? [];
let changed = false;
const updated = images.map((image) => {
const updatedPage = current.map((image) => {
if ((image.id ?? '') !== imageId) {
return image;
}
const originalTransform = image.transform ?? originalImagesRef.current[idx]?.find((base) => (base.id ?? '') === imageId)?.transform;
const originalTransform = image.transform ?? originalImagesRef.current[pageIndex]?.find((base) => (base.id ?? '') === imageId)?.transform;
const scaleXSign = originalTransform && originalTransform.length >= 6 ? Math.sign(originalTransform[0]) || 1 : 1;
const scaleYSign = originalTransform && originalTransform.length >= 6 ? Math.sign(originalTransform[3]) || 1 : 1;
const right = next.left + next.width;
@ -220,14 +529,16 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
top,
width: next.width,
height: next.height,
transform: scaleXSign < 0 || scaleYSign < 0 ? [
transform: scaleXSign < 0 || scaleYSign < 0
? [
next.width * scaleXSign,
0,
0,
next.height * scaleYSign,
next.left,
scaleYSign >= 0 ? next.bottom : next.bottom + next.height,
] : null,
]
: null,
};
const isSame =
@ -241,9 +552,18 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
}
return updatedImage;
});
return changed ? updated : images;
}),
);
if (!changed) {
return previous;
}
const nextImages = previous.map((images, idx) => (idx === pageIndex ? updatedPage : images));
if (imagesByPageRef.current.length <= pageIndex) {
imagesByPageRef.current.length = pageIndex + 1;
}
imagesByPageRef.current[pageIndex] = updatedPage.map(cloneImageElement);
return nextImages;
});
},
[],
);
@ -253,14 +573,28 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
if (!baseline) {
return;
}
setImagesByPage((previous) =>
previous.map((images, idx) => {
if (idx !== pageIndex) {
return images;
setImagesByPage((previous) => {
const current = previous[pageIndex] ?? [];
let changed = false;
const updatedPage = current.map((image) => {
if ((image.id ?? '') !== imageId) {
return image;
}
return images.map((image) => ((image.id ?? '') === imageId ? cloneImageElement(baseline) : image));
}),
);
changed = true;
return cloneImageElement(baseline);
});
if (!changed) {
return previous;
}
const nextImages = previous.map((images, idx) => (idx === pageIndex ? updatedPage : images));
if (imagesByPageRef.current.length <= pageIndex) {
imagesByPageRef.current.length = pageIndex + 1;
}
imagesByPageRef.current[pageIndex] = updatedPage.map(cloneImageElement);
return nextImages;
});
}, []);
const handleResetEdits = useCallback(() => {
@ -279,7 +613,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
const updatedDocument = restoreGlyphElements(
loadedDocument,
groupsByPage,
imagesByPage,
imagesByPageRef.current,
originalImagesRef.current,
);
const baseName = sanitizeBaseName(fileName || loadedDocument.metadata?.title || undefined);
@ -287,7 +621,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
document: updatedDocument,
filename: `${baseName}.json`,
};
}, [fileName, groupsByPage, imagesByPage, loadedDocument]);
}, [fileName, groupsByPage, loadedDocument]);
const handleDownloadJson = useCallback(() => {
const payload = buildPayload();
@ -306,6 +640,118 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
}, [buildPayload, onComplete]);
const handleGeneratePdf = useCallback(async () => {
try {
setIsGeneratingPdf(true);
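      // Make sure every required page has its lazily-loaded images before export;
      // ensureImagesForPages polls until loads settle or the 15-second cap is hit.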
const ensureImagesForPages = async (pageIndices: number[]) => {
const uniqueIndices = Array.from(new Set(pageIndices)).filter((index) => index >= 0);
if (uniqueIndices.length === 0) {
return;
}
for (const index of uniqueIndices) {
if (!loadedImagePagesRef.current.has(index)) {
await loadImagesForPage(index);
}
}
const maxWaitTime = 15000;
const pollInterval = 150;
const startWait = Date.now();
while (Date.now() - startWait < maxWaitTime) {
const allLoaded = uniqueIndices.every(
(index) =>
loadedImagePagesRef.current.has(index) &&
imagesByPageRef.current[index] !== undefined,
);
const anyLoading = uniqueIndices.some((index) =>
loadingImagePagesRef.current.has(index),
);
if (allLoaded && !anyLoading) {
return;
}
await new Promise((resolve) => setTimeout(resolve, pollInterval));
}
const missing = uniqueIndices.filter(
(index) => !loadedImagePagesRef.current.has(index),
);
if (missing.length > 0) {
throw new Error(
`Failed to load images for pages ${missing.map((i) => i + 1).join(', ')}`,
);
}
};
const currentDoc = loadedDocumentRef.current;
const totalPages = currentDoc?.pages?.length ?? 0;
const dirtyPageIndices = dirtyPages
.map((isDirty, index) => (isDirty ? index : -1))
.filter((index) => index >= 0);
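      // Incremental export only pays off when some, but not all, pages changed and
      // the server still holds the cached original PDF for this job.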
const canUseIncremental =
isLazyMode &&
cachedJobId &&
dirtyPageIndices.length > 0 &&
dirtyPageIndices.length < totalPages;
if (canUseIncremental) {
await ensureImagesForPages(dirtyPageIndices);
try {
const payload = buildPayload();
if (!payload) {
return;
}
const { document, filename } = payload;
const dirtyPageSet = new Set(dirtyPageIndices);
const partialPages =
document.pages?.filter((_, index) => dirtyPageSet.has(index)) ?? [];
const partialDocument: PdfJsonDocument = {
metadata: document.metadata,
xmpMetadata: document.xmpMetadata,
fonts: document.fonts,
lazyImages: true,
pages: partialPages,
};
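          // Strip the "-edited" suffix (presumably appended elsewhere to edited JSON
          // exports) so the generated PDF keeps the original base name.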
const baseName = sanitizeBaseName(filename).replace(/-edited$/u, '');
const expectedName = `${baseName || 'document'}.pdf`;
const response = await apiClient.post(
`/api/v1/convert/pdf/json/partial/${cachedJobId}?filename=${encodeURIComponent(expectedName)}`,
partialDocument,
{
responseType: 'blob',
},
);
const contentDisposition = response.headers?.['content-disposition'] ?? '';
const detectedName = getFilenameFromHeaders(contentDisposition);
const downloadName = detectedName || expectedName;
downloadBlob(response.data, downloadName);
if (onComplete) {
const pdfFile = new File([response.data], downloadName, { type: 'application/pdf' });
onComplete([pdfFile]);
}
setErrorMessage(null);
return;
} catch (incrementalError) {
console.warn(
'[handleGeneratePdf] Incremental export failed, falling back to full export',
incrementalError,
);
}
}
if (isLazyMode && totalPages > 0) {
const allPageIndices = Array.from({ length: totalPages }, (_, index) => index);
await ensureImagesForPages(allPageIndices);
}
const payload = buildPayload();
if (!payload) {
return;
@ -317,9 +763,6 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
const formData = new FormData();
formData.append('fileInput', jsonFile);
try {
setIsGeneratingPdf(true);
const response = await apiClient.post(CONVERSION_ENDPOINTS['json-pdf'], formData, {
responseType: 'blob',
});
@ -350,7 +793,16 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
} finally {
setIsGeneratingPdf(false);
}
}, [buildPayload, onComplete, onError, t]);
}, [
buildPayload,
cachedJobId,
dirtyPages,
isLazyMode,
loadImagesForPage,
onComplete,
onError,
t,
]);
const viewData = useMemo<PdfJsonEditorViewData>(() => ({
document: loadedDocument,
@ -363,6 +815,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
errorMessage,
isGeneratingPdf,
isConverting,
conversionProgress,
hasChanges,
onLoadJson: handleLoadFile,
onSelectPage: handleSelectPage,
@ -390,6 +843,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
hasDocument,
isGeneratingPdf,
isConverting,
conversionProgress,
loadedDocument,
selectedPage,
]);
@ -397,6 +851,13 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
const latestViewDataRef = useRef<PdfJsonEditorViewData>(viewData);
latestViewDataRef.current = viewData;
// Trigger initial image loading in lazy mode
useEffect(() => {
if (isLazyMode && loadedDocument) {
void loadImagesForPage(selectedPage);
}
}, [isLazyMode, loadedDocument, selectedPage, loadImagesForPage]);
useEffect(() => {
if (selectedFiles.length === 0) {
autoLoadKeyRef.current = null;
@ -433,11 +894,20 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
setCustomWorkbenchViewData(VIEW_ID, latestViewDataRef.current);
return () => {
// Clear backend cache if we were using lazy loading
if (cachedJobId) {
console.log(`[PdfJsonEditor] Cleaning up cached document for jobId: ${cachedJobId}`);
apiClient.post(`/api/v1/convert/pdf/json/clear-cache/${cachedJobId}`).catch((error) => {
console.warn('[PdfJsonEditor] Failed to clear cache:', error);
});
}
clearCustomWorkbenchViewData(VIEW_ID);
unregisterCustomWorkbenchView(VIEW_ID);
setLeftPanelView('toolPicker');
};
}, [
cachedJobId,
clearCustomWorkbenchViewData,
registerCustomWorkbenchView,
setCustomWorkbenchViewData,

View File

@ -122,6 +122,23 @@ export interface PdfJsonDocument {
xmpMetadata?: string | null;
fonts?: PdfJsonFont[] | null;
pages?: PdfJsonPage[] | null;
lazyImages?: boolean | null;
}
export interface PdfJsonPageDimension {
pageNumber?: number | null;
width?: number | null;
height?: number | null;
rotation?: number | null;
}
export interface PdfJsonDocumentMetadata {
metadata?: PdfJsonMetadata | null;
xmpMetadata?: string | null;
fonts?: PdfJsonFont[] | null;
pageDimensions?: PdfJsonPageDimension[] | null;
formFields?: unknown[] | null;
lazyImages?: boolean | null;
}
export interface BoundingBox {
@ -153,6 +170,14 @@ export interface TextGroup {
export const DEFAULT_PAGE_WIDTH = 612;
export const DEFAULT_PAGE_HEIGHT = 792;
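// 612 x 792 points is US Letter (8.5in x 11in) at 72 points per inch.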
export interface ConversionProgress {
percent: number;
stage: string;
message: string;
current?: number;
total?: number;
}
export interface PdfJsonEditorViewData {
document: PdfJsonDocument | null;
groupsByPage: TextGroup[][];
@ -164,6 +189,7 @@ export interface PdfJsonEditorViewData {
errorMessage: string | null;
isGeneratingPdf: boolean;
isConverting: boolean;
conversionProgress: ConversionProgress | null;
hasChanges: boolean;
onLoadJson: (file: File | null) => Promise<void> | void;
onSelectPage: (pageIndex: number) => void;

View File

@ -15,6 +15,7 @@ export default defineConfig({
}),
],
server: {
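    // host: true binds the dev server to all interfaces (0.0.0.0), making it
    // reachable from other devices on the network, not just localhost.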
host: true,
proxy: {
'/api': {
target: 'http://localhost:8080',