diff --git a/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPdfJsonController.java b/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPdfJsonController.java index e686778b1..fe89c0ead 100644 --- a/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPdfJsonController.java +++ b/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPdfJsonController.java @@ -1,5 +1,6 @@ package stirling.software.SPDF.controller.api.converters; +import java.nio.charset.StandardCharsets; import java.util.Optional; import java.util.UUID; import java.util.regex.Pattern; @@ -58,6 +59,7 @@ public class ConvertPdfJsonController { } byte[] jsonBytes = pdfJsonConversionService.convertPdfToJson(inputFile, lightweight); + logJsonResponse("pdf/text-editor", jsonBytes); String originalName = inputFile.getOriginalFilename(); String baseName = (originalName != null && !originalName.isBlank()) @@ -114,10 +116,11 @@ public class ConvertPdfJsonController { // Scope job to authenticated user if security is enabled String scopedJobKey = getScopedJobKey(baseJobId); - log.info("Extracting metadata for PDF, assigned jobId: {}", scopedJobKey); + log.debug("Extracting metadata for PDF, assigned jobId: {}", scopedJobKey); byte[] jsonBytes = pdfJsonConversionService.extractDocumentMetadata(inputFile, scopedJobKey); + logJsonResponse("pdf/text-editor/metadata", jsonBytes); String originalName = inputFile.getOriginalFilename(); String baseName = (originalName != null && !originalName.isBlank()) @@ -185,11 +188,33 @@ public class ConvertPdfJsonController { validateJobAccess(jobId); byte[] jsonBytes = pdfJsonConversionService.extractSinglePage(jobId, pageNumber); + logJsonResponse("pdf/text-editor/page", jsonBytes); String docName = "page_" + pageNumber + ".json"; return WebResponseUtils.bytesToWebResponse(jsonBytes, docName, MediaType.APPLICATION_JSON); } - @AutoJobPostMapping(value = "/pdf/text-editor/clear-cache/{jobId}") + @GetMapping(value = "/pdf/text-editor/fonts/{jobId}/{pageNumber}") + @Operation( + summary = "Extract fonts used by a single cached page for text editor", + description = + "Retrieves the font payloads used by a single page from a previously cached PDF document." + + " Requires prior call to /pdf/text-editor/metadata. The jobId must belong to the" + + " authenticated user. Output:JSON") + public ResponseEntity extractPageFonts( + @PathVariable String jobId, @PathVariable int pageNumber) throws Exception { + + // Validate job ownership + validateJobAccess(jobId); + + byte[] jsonBytes = pdfJsonConversionService.extractPageFonts(jobId, pageNumber); + logJsonResponse("pdf/text-editor/fonts/page", jsonBytes); + String docName = "page_fonts_" + pageNumber + ".json"; + return WebResponseUtils.bytesToWebResponse(jsonBytes, docName, MediaType.APPLICATION_JSON); + } + + @AutoJobPostMapping( + value = "/pdf/text-editor/clear-cache/{jobId}", + consumes = MediaType.ALL_VALUE) @Operation( summary = "Clear cached PDF document for text editor", description = @@ -219,6 +244,188 @@ public class ConvertPdfJsonController { return baseJobId; } + private void logJsonResponse(String label, byte[] jsonBytes) { + if (jsonBytes == null) { + log.warn("Returning {} JSON response: null bytes", label); + return; + } + + // Only perform expensive tail extraction if debug logging is enabled + if (log.isDebugEnabled()) { + int length = jsonBytes.length; + boolean endsWithJson = + length > 0 && (jsonBytes[length - 1] == '}' || jsonBytes[length - 1] == ']'); + String tail = ""; + if (length > 0) { + int start = Math.max(0, length - 64); + tail = new String(jsonBytes, start, length - start, StandardCharsets.UTF_8); + tail = tail.replaceAll("[\\r\\n\\t]+", " ").replaceAll("[^\\x20-\\x7E]", "?"); + } + log.debug( + "Returning {} JSON response ({} bytes, endsWithJson={}, tail='{}')", + label, + length, + endsWithJson, + tail); + } + + if (isPdfJsonDebugDumpEnabled()) { + try { + String tmpDir = System.getProperty("java.io.tmpdir"); + String customDir = System.getenv("SPDF_PDFJSON_DUMP_DIR"); + java.nio.file.Path dumpDir = + customDir != null && !customDir.isBlank() + ? java.nio.file.Path.of(customDir) + : java.nio.file.Path.of(tmpDir); + java.nio.file.Path dumpPath = + java.nio.file.Files.createTempFile(dumpDir, "pdfjson_", ".json"); + java.nio.file.Files.write(dumpPath, jsonBytes); + log.debug("PDF JSON debug dump ({}): {}", label, dumpPath); + } catch (Exception ex) { + log.warn("Failed to write PDF JSON debug dump ({}): {}", label, ex.getMessage()); + } + } + + if (isPdfJsonRepeatScanEnabled()) { + logRepeatedJsonStrings(label, jsonBytes); + } + } + + private boolean isPdfJsonDebugDumpEnabled() { + String env = System.getenv("SPDF_PDFJSON_DUMP"); + if (env != null && env.equalsIgnoreCase("true")) { + return true; + } + return Boolean.getBoolean("spdf.pdfjson.dump"); + } + + private boolean isPdfJsonRepeatScanEnabled() { + String env = System.getenv("SPDF_PDFJSON_REPEAT_SCAN"); + if (env != null && env.equalsIgnoreCase("true")) { + return true; + } + return Boolean.getBoolean("spdf.pdfjson.repeatScan"); + } + + private void logRepeatedJsonStrings(String label, byte[] jsonBytes) { + final int minLen = 12; + final int maxLen = 200; + final int maxUnique = 50000; + java.util.Map counts = new java.util.HashMap<>(); + boolean inString = false; + boolean escape = false; + boolean tooLong = false; + StringBuilder current = new StringBuilder(64); + boolean capped = false; + + for (byte b : jsonBytes) { + char ch = (char) (b & 0xFF); + if (!inString) { + if (ch == '"') { + inString = true; + escape = false; + tooLong = false; + current.setLength(0); + } + continue; + } + + if (escape) { + escape = false; + if (!tooLong && current.length() < maxLen) { + current.append(ch); + } + continue; + } + if (ch == '\\') { + escape = true; + continue; + } + if (ch == '"') { + inString = false; + if (!tooLong) { + int len = current.length(); + if (len >= minLen && len <= maxLen) { + String value = current.toString(); + if (!looksLikeBase64(value)) { + if (!capped || counts.containsKey(value)) { + counts.merge(value, 1, Integer::sum); + if (!capped && counts.size() >= maxUnique) { + capped = true; + } + } + } + } + } + continue; + } + if (!tooLong) { + if (current.length() < maxLen) { + current.append(ch); + } else { + tooLong = true; + } + } + } + + java.util.List> top = + counts.entrySet().stream() + .filter(e -> e.getValue() > 1) + .sorted((a, b) -> Integer.compare(b.getValue(), a.getValue())) + .limit(20) + .toList(); + + if (!top.isEmpty()) { + String summary = + top.stream() + .map( + e -> + String.format( + "\"%s\"(len=%d,count=%d)", + truncateForLog(e.getKey()), + e.getKey().length(), + e.getValue())) + .collect(java.util.stream.Collectors.joining("; ")); + log.debug( + "PDF JSON repeat scan ({}): top strings -> {}{}", + label, + summary, + capped ? " (capped)" : ""); + } else { + log.debug( + "PDF JSON repeat scan ({}): no repeated strings found{}", + label, + capped ? " (capped)" : ""); + } + } + + private boolean looksLikeBase64(String value) { + if (value.length() < 32) { + return false; + } + int base64Chars = 0; + for (int i = 0; i < value.length(); i++) { + char c = value.charAt(i); + if ((c >= 'A' && c <= 'Z') + || (c >= 'a' && c <= 'z') + || (c >= '0' && c <= '9') + || c == '+' + || c == '/' + || c == '=') { + base64Chars++; + } + } + return base64Chars >= value.length() * 0.9; + } + + private String truncateForLog(String value) { + int max = 64; + if (value.length() <= max) { + return value.replaceAll("[\\r\\n\\t]+", " "); + } + return value.substring(0, max).replaceAll("[\\r\\n\\t]+", " ") + "..."; + } + /** * Validate that the current user has access to the given job. * diff --git a/app/core/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java b/app/core/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java index 1d1e5ae8d..ae8e7f706 100644 --- a/app/core/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java +++ b/app/core/src/main/java/stirling/software/SPDF/service/PdfJsonConversionService.java @@ -239,12 +239,12 @@ public class PdfJsonConversionService { } cacheBudgetBytes = effective; if (cacheBudgetBytes > 0) { - log.info( + log.debug( "PDF JSON cache budget configured: {} bytes (source: {})", cacheBudgetBytes, cacheMaxBytes > 0 ? "max-bytes" : "max-percent"); } else { - log.info("PDF JSON cache budget: unlimited"); + log.debug("PDF JSON cache budget: unlimited"); } } @@ -431,8 +431,10 @@ public class PdfJsonConversionService { progress.accept( PdfJsonConversionProgress.of( 80, "annotations", "Collecting annotations and form fields")); + boolean includeAnnotationRawData = !(lightweight && isRealJobId); Map> annotationsByPage = - collectAnnotations(document, totalPages, progress); + collectAnnotations( + document, totalPages, progress, includeAnnotationRawData); progress.accept( PdfJsonConversionProgress.of(90, "metadata", "Extracting metadata")); @@ -440,19 +442,40 @@ public class PdfJsonConversionService { pdfJson.setMetadata(extractMetadata(document)); pdfJson.setXmpMetadata(extractXmpMetadata(document)); pdfJson.setLazyImages(useLazyImages); - List serializedFonts = cloneFontList(fonts.values()); - serializedFonts.sort( + List cachedFonts = cloneFontList(fonts.values()); + cachedFonts.sort( Comparator.comparing( PdfJsonFont::getUid, Comparator.nullsLast(Comparator.naturalOrder()))); - pdfJson.setFonts(serializedFonts); + dedupeFontPayloads(cachedFonts); + Map cachedFontMap = new LinkedHashMap<>(); + for (PdfJsonFont cachedFont : cachedFonts) { + String cacheKey = resolveFontCacheKey(cachedFont); + if (cacheKey != null) { + cachedFontMap.put(cacheKey, cachedFont); + } + } + List responseFonts = cloneFontList(cachedFonts); + pdfJson.setFonts(responseFonts); pdfJson.setPages( - extractPages(document, textByPage, imagesByPage, annotationsByPage)); - pdfJson.setFormFields(collectFormFields(document)); + extractPages( + document, + textByPage, + imagesByPage, + annotationsByPage, + lightweight && isRealJobId)); + if (lightweight && isRealJobId) { + // Lightweight async editor flow does not use form fields and this payload can + // be + // very large due nested raw dictionaries. + pdfJson.setFormFields(null); + } else { + pdfJson.setFormFields(collectFormFields(document)); + } // Only cache for real async jobIds, not synthetic synchronous ones if (useLazyImages && isRealJobId) { - log.info( + log.debug( "Creating cache for jobId: {} (useLazyImages={}, isRealJobId={})", jobId, useLazyImages, @@ -460,7 +483,7 @@ public class PdfJsonConversionService { PdfJsonDocumentMetadata docMetadata = new PdfJsonDocumentMetadata(); docMetadata.setMetadata(pdfJson.getMetadata()); docMetadata.setXmpMetadata(pdfJson.getXmpMetadata()); - docMetadata.setFonts(serializedFonts); + docMetadata.setFonts(cloneFontList(responseFonts)); docMetadata.setFormFields(pdfJson.getFormFields()); docMetadata.setLazyImages(Boolean.TRUE); @@ -490,7 +513,11 @@ public class PdfJsonConversionService { } CachedPdfDocument cached = buildCachedDocument( - jobId, cachedPdfBytes, docMetadata, fonts, pageFontResources); + jobId, + cachedPdfBytes, + docMetadata, + cachedFontMap, + pageFontResources); putCachedDocument(jobId, cached); log.info( "Successfully cached PDF ({} bytes, {} pages, {} fonts) for jobId: {} (diskBacked={})", @@ -511,18 +538,26 @@ public class PdfJsonConversionService { if (lightweight) { applyLightweightTransformations(pdfJson); } + if (lightweight && isRealJobId) { + stripFontProgramPayloads(responseFonts); + stripFontCosStreamData(responseFonts); + } + + logFontPayloadStats(responseFonts, "pdf/text-editor"); + analyzePdfJson(pdfJson, "pdf/text-editor"); progress.accept( PdfJsonConversionProgress.of(95, "serializing", "Generating JSON output")); // Collect font issues for summary java.util.List fontsWithMissingProgram = - serializedFonts.stream() + responseFonts.stream() .filter( f -> Boolean.TRUE.equals(f.getEmbedded()) - && (f.getProgram() == null - || f.getProgram().isEmpty())) + && !(hasPayload(f.getProgram()) + || hasPayload(f.getPdfProgram()) + || hasPayload(f.getWebProgram()))) .map( f -> { String name = @@ -543,14 +578,12 @@ public class PdfJsonConversionService { }) .collect(java.util.stream.Collectors.toList()); long type3Fonts = - serializedFonts.stream() - .filter(f -> "Type3".equals(f.getSubtype())) - .count(); + responseFonts.stream().filter(f -> "Type3".equals(f.getSubtype())).count(); if (!fontsWithMissingProgram.isEmpty()) { log.warn( "PDF->JSON conversion complete: {} fonts ({} Type3), {} pages. Missing font programs for {} embedded font(s): {}", - serializedFonts.size(), + responseFonts.size(), type3Fonts, pdfJson.getPages().size(), fontsWithMissingProgram.size(), @@ -558,7 +591,7 @@ public class PdfJsonConversionService { } else { log.info( "PDF->JSON conversion complete: {} fonts ({} Type3), {} pages", - serializedFonts.size(), + responseFonts.size(), type3Fonts, pdfJson.getPages().size()); } @@ -960,40 +993,59 @@ public class PdfJsonConversionService { if (font == null) { return null; } - return PdfJsonFont.builder() - .id(font.getId()) - .pageNumber(font.getPageNumber()) - .uid(font.getUid()) - .baseName(font.getBaseName()) - .subtype(font.getSubtype()) - .encoding(font.getEncoding()) - .cidSystemInfo(font.getCidSystemInfo()) - .embedded(font.getEmbedded()) - .program(font.getProgram()) - .programFormat(font.getProgramFormat()) - .webProgram(font.getWebProgram()) - .webProgramFormat(font.getWebProgramFormat()) - .pdfProgram(font.getPdfProgram()) - .pdfProgramFormat(font.getPdfProgramFormat()) - .type3Glyphs( - font.getType3Glyphs() == null - ? null - : new ArrayList<>(font.getType3Glyphs())) - .conversionCandidates( - font.getConversionCandidates() == null - ? null - : new ArrayList<>(font.getConversionCandidates())) - .toUnicode(font.getToUnicode()) - .standard14Name(font.getStandard14Name()) - .fontDescriptorFlags(font.getFontDescriptorFlags()) - .ascent(font.getAscent()) - .descent(font.getDescent()) - .capHeight(font.getCapHeight()) - .xHeight(font.getXHeight()) - .italicAngle(font.getItalicAngle()) - .unitsPerEm(font.getUnitsPerEm()) - .cosDictionary(font.getCosDictionary()) - .build(); + try { + byte[] bytes = objectMapper.writeValueAsBytes(font); + return objectMapper.readValue(bytes, PdfJsonFont.class); + } catch (Exception ex) { + log.debug( + "Failed deep-cloning font {} via roundtrip: {}", font.getId(), ex.getMessage()); + PdfJsonCosValue cosClone = null; + try { + if (font.getCosDictionary() != null) { + byte[] cosBytes = objectMapper.writeValueAsBytes(font.getCosDictionary()); + cosClone = objectMapper.readValue(cosBytes, PdfJsonCosValue.class); + } + } catch (Exception cosEx) { + log.debug( + "Failed deep-cloning font cosDictionary {}: {}", + font.getId(), + cosEx.getMessage()); + } + return PdfJsonFont.builder() + .id(font.getId()) + .pageNumber(font.getPageNumber()) + .uid(font.getUid()) + .baseName(font.getBaseName()) + .subtype(font.getSubtype()) + .encoding(font.getEncoding()) + .cidSystemInfo(font.getCidSystemInfo()) + .embedded(font.getEmbedded()) + .program(font.getProgram()) + .programFormat(font.getProgramFormat()) + .webProgram(font.getWebProgram()) + .webProgramFormat(font.getWebProgramFormat()) + .pdfProgram(font.getPdfProgram()) + .pdfProgramFormat(font.getPdfProgramFormat()) + .type3Glyphs( + font.getType3Glyphs() == null + ? null + : new ArrayList<>(font.getType3Glyphs())) + .conversionCandidates( + font.getConversionCandidates() == null + ? null + : new ArrayList<>(font.getConversionCandidates())) + .toUnicode(font.getToUnicode()) + .standard14Name(font.getStandard14Name()) + .fontDescriptorFlags(font.getFontDescriptorFlags()) + .ascent(font.getAscent()) + .descent(font.getDescent()) + .capHeight(font.getCapHeight()) + .xHeight(font.getXHeight()) + .italicAngle(font.getItalicAngle()) + .unitsPerEm(font.getUnitsPerEm()) + .cosDictionary(cosClone) + .build(); + } } private void applyLightweightTransformations(PdfJsonDocument document) { @@ -1046,6 +1098,391 @@ public class PdfJsonConversionService { return toPdfJsonFont(cacheEntry, fontId, pageNumber, jobId); } + private void logFontPayloadStats(List fonts, String label) { + if (!log.isDebugEnabled()) { + return; + } + if (fonts == null || fonts.isEmpty()) { + return; + } + long programBytes = 0; + long webProgramBytes = 0; + long pdfProgramBytes = 0; + long toUnicodeBytes = 0; + long maxFontPayload = 0; + String maxFontId = null; + + for (PdfJsonFont font : fonts) { + if (font == null) { + continue; + } + long fontBytes = 0; + if (font.getProgram() != null) { + long len = font.getProgram().length(); + programBytes += len; + fontBytes += len; + } + if (font.getWebProgram() != null) { + long len = font.getWebProgram().length(); + webProgramBytes += len; + fontBytes += len; + } + if (font.getPdfProgram() != null) { + long len = font.getPdfProgram().length(); + pdfProgramBytes += len; + fontBytes += len; + } + if (font.getToUnicode() != null) { + long len = font.getToUnicode().length(); + toUnicodeBytes += len; + fontBytes += len; + } + if (fontBytes > maxFontPayload) { + maxFontPayload = fontBytes; + maxFontId = font.getUid() != null ? font.getUid() : font.getId(); + } + } + + log.debug( + "Font payload stats ({}): fonts={}, programBytes={}, webProgramBytes={}, pdfProgramBytes={}, toUnicodeBytes={}, maxFontPayloadBytes={} (fontId={})", + label, + fonts.size(), + programBytes, + webProgramBytes, + pdfProgramBytes, + toUnicodeBytes, + maxFontPayload, + maxFontId); + } + + private void analyzePdfJson(PdfJsonDocument pdfJson, String label) { + if (!isPdfJsonDebugAnalyzeEnabled() || pdfJson == null) { + return; + } + + try { + Map resourceStats = new HashMap<>(); + Map fontDictStats = new HashMap<>(); + Map annotationStats = new HashMap<>(); + long imageDataBytes = 0; + long imageCount = 0; + long textElementCount = 0; + long textCharCount = 0; + + List pages = pdfJson.getPages(); + if (pages != null) { + for (PdfJsonPage page : pages) { + if (page == null) { + continue; + } + recordDuplicate(resourceStats, page.getResources()); + + List annotations = page.getAnnotations(); + if (annotations != null) { + for (PdfJsonAnnotation annotation : annotations) { + recordDuplicate(annotationStats, annotation.getRawData()); + } + } + + List images = page.getImageElements(); + if (images != null) { + for (PdfJsonImageElement image : images) { + if (image == null) { + continue; + } + String data = image.getImageData(); + if (data != null) { + imageDataBytes += data.length(); + } + imageCount++; + } + } + + List textElements = page.getTextElements(); + if (textElements != null) { + for (PdfJsonTextElement element : textElements) { + if (element == null) { + continue; + } + textElementCount++; + String text = element.getText(); + if (text != null) { + textCharCount += text.length(); + } + } + } + } + } + + List fonts = pdfJson.getFonts(); + if (fonts != null) { + for (PdfJsonFont font : fonts) { + recordDuplicate(fontDictStats, font.getCosDictionary()); + } + } + + logDuplicateSummary("resources", label, resourceStats); + logDuplicateSummary("fontCosDictionary", label, fontDictStats); + logDuplicateSummary("annotationRawData", label, annotationStats); + log.debug( + "PDF JSON analysis ({}): images={} imageDataBytes={} textElements={} textChars={}", + label, + imageCount, + imageDataBytes, + textElementCount, + textCharCount); + + long fontsBytes = sizeOfObject(pdfJson.getFonts()); + long pagesBytes = sizeOfObject(pdfJson.getPages()); + long metadataBytes = sizeOfObject(pdfJson.getMetadata()); + long xmpBytes = sizeOfObject(pdfJson.getXmpMetadata()); + long formFieldsBytes = sizeOfObject(pdfJson.getFormFields()); + log.debug( + "PDF JSON analysis ({}): sectionSizes fonts={} pages={} metadata={} xmp={} formFields={}", + label, + fontsBytes, + pagesBytes, + metadataBytes, + xmpBytes, + formFieldsBytes); + + if (pages != null && !pages.isEmpty()) { + List topPages = new ArrayList<>(); + int pageIndex = 0; + for (PdfJsonPage page : pages) { + if (page == null) { + pageIndex++; + continue; + } + long size = sizeOfObject(page); + int pageNumber = + page.getPageNumber() != null ? page.getPageNumber() : pageIndex + 1; + topPages.add(new PageSizeStat(pageNumber, size, page)); + pageIndex++; + } + topPages.sort((a, b) -> Long.compare(b.sizeBytes, a.sizeBytes)); + String top = + topPages.stream() + .limit(5) + .map( + s -> + String.format( + "page=%d size=%d", + s.pageNumber, s.sizeBytes)) + .collect(java.util.stream.Collectors.joining("; ")); + log.debug("PDF JSON analysis ({}): topPageSizes -> {}", label, top); + + topPages.stream() + .limit(3) + .forEach( + s -> { + PdfJsonPage page = s.page; + long resources = sizeOfObject(page.getResources()); + long contentStreams = sizeOfObject(page.getContentStreams()); + long annotations = sizeOfObject(page.getAnnotations()); + long textElements = sizeOfObject(page.getTextElements()); + long imageElements = sizeOfObject(page.getImageElements()); + log.debug( + "PDF JSON analysis ({}): pageBreakdown page={} total={} resources={} contentStreams={} annotations={} textElements={} imageElements={}", + label, + s.pageNumber, + s.sizeBytes, + resources, + contentStreams, + annotations, + textElements, + imageElements); + }); + } + } catch (Exception ex) { + log.warn("PDF JSON analysis failed ({}): {}", label, ex.getMessage()); + } + } + + private void recordDuplicate(Map stats, PdfJsonCosValue value) + throws IOException, java.security.NoSuchAlgorithmException { + if (value == null) { + return; + } + byte[] bytes = objectMapper.writeValueAsBytes(value); + if (bytes.length == 0) { + return; + } + String hash = + Base64.getEncoder() + .encodeToString( + java.security.MessageDigest.getInstance("SHA-256").digest(bytes)); + DuplicateStats entry = stats.computeIfAbsent(hash, k -> new DuplicateStats()); + entry.count++; + if (entry.sizeBytes == 0) { + entry.sizeBytes = bytes.length; + } + } + + private void logDuplicateSummary( + String category, String label, Map stats) { + if (stats.isEmpty()) { + return; + } + List duplicates = + stats.values().stream() + .filter(s -> s.count > 1) + .sorted((a, b) -> Long.compare(b.totalBytesSaved(), a.totalBytesSaved())) + .limit(5) + .toList(); + + if (duplicates.isEmpty()) { + return; + } + + String summary = + duplicates.stream() + .map( + s -> + String.format( + "count=%d size=%d potentialSavings=%d", + s.count, s.sizeBytes, s.totalBytesSaved())) + .collect(java.util.stream.Collectors.joining("; ")); + log.debug("PDF JSON analysis ({}): top duplicates for {} -> {}", label, category, summary); + } + + private boolean isPdfJsonDebugAnalyzeEnabled() { + String env = System.getenv("SPDF_PDFJSON_ANALYZE"); + if (env != null && env.equalsIgnoreCase("true")) { + return true; + } + return Boolean.getBoolean("spdf.pdfjson.analyze"); + } + + private long sizeOfObject(Object value) { + if (value == null) { + return 0; + } + try { + return objectMapper.writeValueAsBytes(value).length; + } catch (Exception ex) { + log.warn("Failed to serialize object for size analysis: {}", ex.getMessage()); + return -1; + } + } + + private static final class DuplicateStats { + private int count; + private long sizeBytes; + + private long totalBytesSaved() { + return sizeBytes * (long) (count - 1); + } + } + + private static final class PageSizeStat { + private final int pageNumber; + private final long sizeBytes; + private final PdfJsonPage page; + + private PageSizeStat(int pageNumber, long sizeBytes, PdfJsonPage page) { + this.pageNumber = pageNumber; + this.sizeBytes = sizeBytes; + this.page = page; + } + } + + private void dedupeFontPayloads(List fonts) { + if (fonts == null || fonts.isEmpty()) { + return; + } + for (PdfJsonFont font : fonts) { + if (font == null) { + continue; + } + String program = font.getProgram(); + String pdfProgram = font.getPdfProgram(); + String webProgram = font.getWebProgram(); + + if (pdfProgram != null && !pdfProgram.isBlank()) { + if (program != null && program.equals(pdfProgram)) { + font.setProgram(null); + font.setProgramFormat(null); + } + if (webProgram != null && webProgram.equals(pdfProgram)) { + font.setWebProgram(null); + font.setWebProgramFormat(null); + } + continue; + } + + if (program != null && webProgram != null && program.equals(webProgram)) { + font.setWebProgram(null); + font.setWebProgramFormat(null); + } + } + } + + private void stripFontProgramPayloads(List fonts) { + if (fonts == null || fonts.isEmpty()) { + return; + } + for (PdfJsonFont font : fonts) { + if (font == null) { + continue; + } + font.setProgram(null); + font.setProgramFormat(null); + font.setWebProgram(null); + font.setWebProgramFormat(null); + font.setPdfProgram(null); + font.setPdfProgramFormat(null); + } + } + + private void stripFontCosStreamData(List fonts) { + if (fonts == null || fonts.isEmpty()) { + return; + } + Set visited = Collections.newSetFromMap(new IdentityHashMap<>()); + for (PdfJsonFont font : fonts) { + if (font == null) { + continue; + } + PdfJsonCosValue cosDictionary = font.getCosDictionary(); + if (cosDictionary != null) { + stripStreamRawData(cosDictionary, visited); + } + } + } + + private void stripStreamRawData(PdfJsonCosValue value, Set visited) { + if (value == null || value.getType() == null) { + return; + } + if (!visited.add(value)) { + return; + } + switch (value.getType()) { + case STREAM: + if (value.getStream() != null) { + value.getStream().setRawData(null); + } + break; + case ARRAY: + if (value.getItems() != null) { + for (PdfJsonCosValue item : value.getItems()) { + stripStreamRawData(item, visited); + } + } + break; + case DICTIONARY: + if (value.getEntries() != null) { + for (PdfJsonCosValue entry : value.getEntries().values()) { + stripStreamRawData(entry, visited); + } + } + break; + default: + break; + } + } + private FontModelCacheEntry createFontCacheEntry( PDDocument document, PDFont font, String fontId, int pageNumber, String jobId) throws IOException { @@ -1534,12 +1971,12 @@ public class PdfJsonConversionService { "[FALLBACK-DEBUG] Reusing cached fallback font {} (key: {})", effectiveId, key); return font; } - log.info( + log.debug( "[FALLBACK-DEBUG] Loading fallback font {} (key: {}) via fallbackFontService", effectiveId, key); PDFont loaded = fallbackFontService.loadFallbackPdfFont(document, effectiveId); - log.info( + log.debug( "[FALLBACK-DEBUG] Loaded fallback font {} - PDFont class: {}, name: {}", effectiveId, loaded.getClass().getSimpleName(), @@ -1734,7 +2171,7 @@ public class PdfJsonConversionService { PDStream fontFile3 = descriptor.getFontFile3(); if (fontFile3 != null) { String subtype = fontFile3.getCOSObject().getNameAsString(COSName.SUBTYPE); - log.info( + log.debug( "[FONT-DEBUG] Font {}: Found FontFile3 with subtype {}", font.getName(), subtype); @@ -1985,7 +2422,8 @@ public class PdfJsonConversionService { PDDocument document, Map> textByPage, Map> imagesByPage, - Map> annotationsByPage) + Map> annotationsByPage, + boolean omitResourceStreamData) throws IOException { List pages = new ArrayList<>(); int pageIndex = 0; @@ -2008,8 +2446,15 @@ public class PdfJsonConversionService { // imageElements COSBase resourcesBase = page.getCOSObject().getDictionaryObject(COSName.RESOURCES); COSBase filteredResources = filterImageXObjectsFromResources(resourcesBase); - pageModel.setResources(cosMapper.serializeCosValue(filteredResources)); - pageModel.setContentStreams(extractContentStreams(page)); + if (omitResourceStreamData) { + // In lightweight editor mode, omit heavy resource/content stream payloads entirely. + // Partial export preserves originals from cached PDF when these fields are missing. + pageModel.setResources(null); + pageModel.setContentStreams(null); + } else { + pageModel.setResources(cosMapper.serializeCosValue(filteredResources)); + pageModel.setContentStreams(extractContentStreams(page, false)); + } pages.add(pageModel); pageIndex++; } @@ -2040,7 +2485,10 @@ public class PdfJsonConversionService { } private Map> collectAnnotations( - PDDocument document, int totalPages, Consumer progress) + PDDocument document, + int totalPages, + Consumer progress, + boolean includeRawData) throws IOException { Map> annotationsByPage = new LinkedHashMap<>(); int pageNumber = 1; @@ -2112,8 +2560,13 @@ public class PdfJsonConversionService { } } - // Store raw dictionary for lossless round-trip - ann.setRawData(cosMapper.serializeCosValue(annotDict)); + if (includeRawData) { + // Store raw dictionary for lossless round-trip + ann.setRawData( + cosMapper.serializeCosValue( + annotDict, + PdfJsonCosMapper.SerializationContext.ANNOTATION_RAW_DATA)); + } annotations.add(ann); } catch (Exception e) { @@ -2193,7 +2646,10 @@ public class PdfJsonConversionService { } // Store raw dictionary for lossless round-trip - formField.setRawData(cosMapper.serializeCosValue(field.getCOSObject())); + formField.setRawData( + cosMapper.serializeCosValue( + field.getCOSObject(), + PdfJsonCosMapper.SerializationContext.FORM_FIELD_RAW_DATA)); formFields.add(formField); } catch (Exception e) { @@ -2527,7 +2983,8 @@ public class PdfJsonConversionService { return streams; } - private List extractContentStreams(PDPage page) throws IOException { + private List extractContentStreams(PDPage page, boolean omitRawData) + throws IOException { List streams = new ArrayList<>(); Iterator iterator = page.getContentStreams(); if (iterator == null) { @@ -2535,7 +2992,13 @@ public class PdfJsonConversionService { } while (iterator.hasNext()) { PDStream stream = iterator.next(); - PdfJsonStream model = cosMapper.serializeStream(stream); + PdfJsonStream model = + omitRawData + ? cosMapper.serializeStream( + stream, + PdfJsonCosMapper.SerializationContext + .CONTENT_STREAMS_LIGHTWEIGHT) + : cosMapper.serializeStream(stream); if (model != null) { streams.add(model); } @@ -2543,6 +3006,10 @@ public class PdfJsonConversionService { return streams; } + private List extractContentStreams(PDPage page) throws IOException { + return extractContentStreams(page, false); + } + private PDStream extractVectorGraphics( PDDocument document, List preservedStreams, @@ -2871,8 +3338,8 @@ public class PdfJsonConversionService { baseFontModel.getUid(), Collections.emptySet()) : Collections.emptySet(); boolean hasNormalizedType3 = baseIsType3 && normalizedType3Font != null; - if (hasNormalizedType3 && log.isInfoEnabled()) { - log.info( + if (hasNormalizedType3 && log.isDebugEnabled()) { + log.debug( "[TYPE3-RUNTIME] Using normalized library font {} for Type3 resource {} on page {}", normalizedType3Font.getName(), baseFontModel != null ? baseFontModel.getId() : baseFontId, @@ -3002,7 +3469,7 @@ public class PdfJsonConversionService { } if (rawType3CodesUsed) { - log.info( + log.debug( "[TYPE3-RUNTIME] Reused original Type3 charCodes for font {} on page {} ({} glyphs)", baseFontModel != null ? baseFontModel.getId() : baseFontId, pageNumber, @@ -3642,22 +4109,22 @@ public class PdfJsonConversionService { // NOTE: Do NOT sanitize encoded bytes for normalized Type3 fonts // Multi-byte encodings (UTF-16BE, CID fonts) have null bytes that are essential // Removing them corrupts the byte boundaries and produces garbled text - log.info( + log.debug( "[TYPE3] Encoded text '{}' for normalized font {}: encoded={} bytes", text.length() > 20 ? text.substring(0, 20) + "..." : text, fontModel.getId(), encoded != null ? encoded.length : 0); if (encoded != null && encoded.length > 0) { - log.info( + log.debug( "[TYPE3] Successfully encoded text for normalized Type3 font {} using standard encoding", fontModel.getId()); return encoded; } - log.info( + log.debug( "[TYPE3] Standard encoding produced empty result for normalized Type3 font {}, falling through to Type3 mapping", fontModel.getId()); } catch (IOException | IllegalArgumentException ex) { - log.info( + log.debug( "[TYPE3] Standard encoding failed for normalized Type3 font {}: {}", fontModel.getId(), ex.getMessage()); @@ -4183,7 +4650,7 @@ public class PdfJsonConversionService { // Last resort: Fuzzy match baseName against Standard14 fonts Standard14Fonts.FontName fuzzyMatch = fuzzyMatchStandard14(fontModel.getBaseName()); if (fuzzyMatch != null) { - log.info( + log.debug( "Fuzzy-matched font {} (baseName: {}) to Standard14 font {}", fontModel.getId(), fontModel.getBaseName(), @@ -4217,7 +4684,7 @@ public class PdfJsonConversionService { document, fontModel, source, originalFormat, true, true, true); if (font != null) { type3NormalizedFontCache.put(cacheKey, font); - log.info( + log.debug( "Cached normalized font {} for Type3 {} (key: {})", source.originLabel(), fontModel.getId(), @@ -4264,7 +4731,7 @@ public class PdfJsonConversionService { String originLabel = source.originLabel(); try { if (!skipMetadataLog) { - log.info( + log.debug( "[FONT-DEBUG] Attempting to load font {} using payload {} (format={}, size={} bytes)", fontModel.getId(), originLabel, @@ -4291,7 +4758,7 @@ public class PdfJsonConversionService { // so all glyphs are available for editing boolean willBeSubset = !originLabel.contains("type3-library"); if (!willBeSubset) { - log.info( + log.debug( "[TYPE3-RUNTIME] Loading library font {} WITHOUT subsetting (full glyph set) from {}", fontModel.getId(), originLabel); @@ -4343,7 +4810,7 @@ public class PdfJsonConversionService { try { restored = cosMapper.deserializeCosValue(fontModel.getCosDictionary(), document); } catch (Exception ex) { - log.warn( + log.debug( "[FONT-RESTORE] Font {} cosDictionary deserialization failed: {}", fontModel.getId(), ex.getMessage()); @@ -4351,7 +4818,7 @@ public class PdfJsonConversionService { } if (!(restored instanceof COSDictionary cosDictionary)) { - log.warn( + log.debug( "[FONT-RESTORE] Font {} cosDictionary deserialized to {} instead of COSDictionary", fontModel.getId(), restored != null ? restored.getClass().getSimpleName() : "null"); @@ -4361,7 +4828,7 @@ public class PdfJsonConversionService { // Validate that dictionary contains required font keys if (!cosDictionary.containsKey(org.apache.pdfbox.cos.COSName.TYPE) || !cosDictionary.containsKey(org.apache.pdfbox.cos.COSName.SUBTYPE)) { - log.warn( + log.debug( "[FONT-RESTORE] Font {} cosDictionary missing required Type or Subtype keys", fontModel.getId()); return null; @@ -4370,14 +4837,14 @@ public class PdfJsonConversionService { try { PDFont font = PDFontFactory.createFont(cosDictionary); if (font == null) { - log.warn( + log.debug( "[FONT-RESTORE] Font {} PDFontFactory returned null for valid dictionary", fontModel.getId()); return null; } if (!font.isEmbedded()) { - log.warn( + log.debug( "[FONT-RESTORE] Font {} restored from dictionary but is not embedded; rejecting to avoid system font substitution", fontModel.getId()); return null; @@ -4391,7 +4858,7 @@ public class PdfJsonConversionService { return font; } catch (IOException ex) { - log.warn( + log.debug( "[FONT-RESTORE] Failed to restore font {} from dictionary ({}): {}", fontModel.getId(), fontModel.getSubtype(), @@ -5657,10 +6124,13 @@ public class PdfJsonConversionService { docMetadata.setMetadata(extractMetadata(document)); docMetadata.setXmpMetadata(extractXmpMetadata(document)); - List serializedFonts = new ArrayList<>(fonts.values()); + List serializedFonts = cloneFontList(fonts.values()); serializedFonts.sort( Comparator.comparing( PdfJsonFont::getUid, Comparator.nullsLast(Comparator.naturalOrder()))); + dedupeFontPayloads(serializedFonts); + stripFontProgramPayloads(serializedFonts); + stripFontCosStreamData(serializedFonts); docMetadata.setFonts(serializedFonts); // Extract page dimensions @@ -5677,7 +6147,9 @@ public class PdfJsonConversionService { pageIndex++; } docMetadata.setPageDimensions(pageDimensions); - docMetadata.setFormFields(collectFormFields(document)); + // Metadata endpoint is used for lazy editor bootstrapping; omit form fields to avoid + // shipping large duplicate raw dictionaries before any edit occurs. + docMetadata.setFormFields(null); docMetadata.setLazyImages(Boolean.TRUE); // Cache PDF bytes, metadata, and fonts for lazy page loading @@ -5826,7 +6298,7 @@ public class PdfJsonConversionService { } } - ann.setRawData(cosMapper.serializeCosValue(annotDict)); + // For cached page extraction, skip rawData to avoid huge payloads annotations.add(ann); } catch (Exception e) { log.warn( @@ -5849,8 +6321,8 @@ public class PdfJsonConversionService { // Extract resources and content streams COSBase resourcesBase = page.getCOSObject().getDictionaryObject(COSName.RESOURCES); COSBase filteredResources = filterImageXObjectsFromResources(resourcesBase); - pageModel.setResources(cosMapper.serializeCosValue(filteredResources)); - pageModel.setContentStreams(extractContentStreams(page)); + pageModel.setResources(null); + pageModel.setContentStreams(null); log.debug( "Extracted page {} (text: {}, images: {}, annotations: {}) for jobId: {}", @@ -5864,11 +6336,59 @@ public class PdfJsonConversionService { } } + public byte[] extractPageFonts(String jobId, int pageNumber) throws IOException { + CachedPdfDocument cached = getCachedDocument(jobId); + if (cached == null) { + throw new stirling.software.SPDF.exception.CacheUnavailableException( + "No cached document found for jobId: " + jobId); + } + + int totalPages = cached.getMetadata().getPageDimensions().size(); + if (pageNumber < 1 || pageNumber > totalPages) { + throw new IllegalArgumentException( + String.format("pageNumber must be between 1 and %d", totalPages)); + } + + Map> pageFontResources = cached.getPageFontResources(); + Map pageMap = + pageFontResources != null ? pageFontResources.get(pageNumber) : null; + if (pageMap == null || pageMap.isEmpty()) { + return objectMapper.writeValueAsBytes(Collections.emptyList()); + } + + Map cachedFonts = cached.getFonts(); + List pageFonts = new ArrayList<>(); + Set seen = new LinkedHashSet<>(); + for (String fontId : pageMap.values()) { + if (fontId == null || fontId.isBlank()) { + continue; + } + String key = buildFontKey(jobId, pageNumber, fontId); + if (!seen.add(key)) { + continue; + } + PdfJsonFont font = cachedFonts.get(key); + if (font == null) { + // Fallback to unscoped key for resilience with legacy cached entries. + font = cachedFonts.get(buildFontKey(null, pageNumber, fontId)); + } + if (font == null) { + continue; + } + PdfJsonFont clone = cloneFont(font); + pageFonts.add(clone != null ? clone : font); + } + pageFonts.sort( + Comparator.comparing( + PdfJsonFont::getUid, Comparator.nullsLast(Comparator.naturalOrder()))); + return objectMapper.writeValueAsBytes(pageFonts); + } + public byte[] exportUpdatedPages(String jobId, PdfJsonDocument updates) throws IOException { if (jobId == null || jobId.isBlank()) { throw new IllegalArgumentException("jobId is required for incremental export"); } - log.info("Looking up cache for jobId: {}", jobId); + log.debug("Looking up cache for jobId: {}", jobId); CachedPdfDocument cached = getCachedDocument(jobId); if (cached == null) { log.error( @@ -5878,7 +6398,7 @@ public class PdfJsonConversionService { throw new stirling.software.SPDF.exception.CacheUnavailableException( "No cached document available for jobId: " + jobId); } - log.info( + log.debug( "Found cached document for jobId: {} (size={}, diskBacked={})", jobId, cached.getPdfSize(), @@ -6055,6 +6575,13 @@ public class PdfJsonConversionService { List fontModels, int pageNumberValue) throws IOException { + boolean preserveExistingAnnotations = + shouldPreserveExistingAnnotations(pageModel.getAnnotations()); + boolean preserveExistingContentStreams = + shouldPreserveExistingContentStreams(pageModel.getContentStreams()); + boolean preserveExistingResources = + shouldPreserveExistingResources(pageModel.getResources()); + PDRectangle currentBox = page.getMediaBox(); float fallbackWidth = currentBox != null ? currentBox.getWidth() : 612f; float fallbackHeight = currentBox != null ? currentBox.getHeight() : 792f; @@ -6069,14 +6596,20 @@ public class PdfJsonConversionService { page.setRotation(pageModel.getRotation()); } - applyPageResources(document, page, pageModel.getResources()); + if (!preserveExistingResources) { + applyPageResources(document, page, pageModel.getResources()); + } - List preservedStreams = - buildContentStreams(document, pageModel.getContentStreams()); - if (preservedStreams.isEmpty()) { - page.setContents(new ArrayList<>()); + List preservedStreams; + if (preserveExistingContentStreams) { + preservedStreams = snapshotExistingContentStreams(page); } else { - page.setContents(preservedStreams); + preservedStreams = buildContentStreams(document, pageModel.getContentStreams()); + if (preservedStreams.isEmpty()) { + page.setContents(new ArrayList<>()); + } else { + page.setContents(preservedStreams); + } } List imageElements = @@ -6116,12 +6649,14 @@ public class PdfJsonConversionService { pageNumberValue); if (regenerateMode == RegenerateMode.REUSE_EXISTING) { - page.getAnnotations().clear(); - List annotations = - pageModel.getAnnotations() != null - ? new ArrayList<>(pageModel.getAnnotations()) - : new ArrayList<>(); - restoreAnnotations(document, page, annotations); + if (!preserveExistingAnnotations) { + page.getAnnotations().clear(); + List annotations = + pageModel.getAnnotations() != null + ? new ArrayList<>(pageModel.getAnnotations()) + : new ArrayList<>(); + restoreAnnotations(document, page, annotations); + } return; } @@ -6150,12 +6685,14 @@ public class PdfJsonConversionService { pageNumberValue, appendMode); - page.getAnnotations().clear(); - List annotations = - pageModel.getAnnotations() != null - ? new ArrayList<>(pageModel.getAnnotations()) - : new ArrayList<>(); - restoreAnnotations(document, page, annotations); + if (!preserveExistingAnnotations) { + page.getAnnotations().clear(); + List annotations = + pageModel.getAnnotations() != null + ? new ArrayList<>(pageModel.getAnnotations()) + : new ArrayList<>(); + restoreAnnotations(document, page, annotations); + } } private RegenerateMode determineRegenerateMode( @@ -6202,6 +6739,99 @@ public class PdfJsonConversionService { REGENERATE_CLEAR } + private boolean shouldPreserveExistingAnnotations(List annotations) { + if (annotations == null) { + return true; + } + if (annotations.isEmpty()) { + return false; + } + for (PdfJsonAnnotation annotation : annotations) { + if (annotation == null || annotation.getRawData() == null) { + return true; + } + if (hasMissingStreamData(annotation.getRawData())) { + return true; + } + } + return false; + } + + private boolean shouldPreserveExistingContentStreams(List streams) { + if (streams == null) { + return true; + } + if (streams.isEmpty()) { + return false; + } + for (PdfJsonStream stream : streams) { + if (stream == null || stream.getRawData() == null) { + return true; + } + } + return false; + } + + private boolean shouldPreserveExistingResources(PdfJsonCosValue resources) { + if (resources == null) { + return true; + } + return hasMissingStreamData(resources); + } + + private List snapshotExistingContentStreams(PDPage page) throws IOException { + List streams = new ArrayList<>(); + Iterator iterator = page.getContentStreams(); + if (iterator == null) { + return streams; + } + while (iterator.hasNext()) { + PDStream stream = iterator.next(); + if (stream != null) { + streams.add(stream); + } + } + return streams; + } + + private boolean hasMissingStreamData(PdfJsonCosValue value) { + return hasMissingStreamData(value, Collections.newSetFromMap(new IdentityHashMap<>())); + } + + private boolean hasMissingStreamData(PdfJsonCosValue value, Set visited) { + if (value == null || value.getType() == null) { + return false; + } + if (!visited.add(value)) { + return false; + } + switch (value.getType()) { + case STREAM: + PdfJsonStream stream = value.getStream(); + return stream == null || stream.getRawData() == null; + case ARRAY: + if (value.getItems() != null) { + for (PdfJsonCosValue item : value.getItems()) { + if (hasMissingStreamData(item, visited)) { + return true; + } + } + } + return false; + case DICTIONARY: + if (value.getEntries() != null) { + for (PdfJsonCosValue entry : value.getEntries().values()) { + if (hasMissingStreamData(entry, visited)) { + return true; + } + } + } + return false; + default: + return false; + } + } + /** Schedules automatic cleanup of cached documents after 30 minutes. */ private void scheduleDocumentCleanup(String jobId) { new Thread( diff --git a/app/core/src/main/java/stirling/software/SPDF/service/PdfJsonCosMapper.java b/app/core/src/main/java/stirling/software/SPDF/service/PdfJsonCosMapper.java index c990c568b..070600329 100644 --- a/app/core/src/main/java/stirling/software/SPDF/service/PdfJsonCosMapper.java +++ b/app/core/src/main/java/stirling/software/SPDF/service/PdfJsonCosMapper.java @@ -37,23 +37,68 @@ import stirling.software.SPDF.model.json.PdfJsonStream; @Component public class PdfJsonCosMapper { + public enum SerializationContext { + DEFAULT, + ANNOTATION_RAW_DATA, + FORM_FIELD_RAW_DATA, + CONTENT_STREAMS_LIGHTWEIGHT, + RESOURCES_LIGHTWEIGHT; + + public boolean omitStreamData() { + return this == CONTENT_STREAMS_LIGHTWEIGHT || this == RESOURCES_LIGHTWEIGHT; + } + } + public PdfJsonStream serializeStream(PDStream stream) throws IOException { if (stream == null) { return null; } return serializeStream( - stream.getCOSObject(), Collections.newSetFromMap(new IdentityHashMap<>())); + stream.getCOSObject(), + Collections.newSetFromMap(new IdentityHashMap<>()), + SerializationContext.DEFAULT); } public PdfJsonStream serializeStream(COSStream cosStream) throws IOException { if (cosStream == null) { return null; } - return serializeStream(cosStream, Collections.newSetFromMap(new IdentityHashMap<>())); + return serializeStream( + cosStream, + Collections.newSetFromMap(new IdentityHashMap<>()), + SerializationContext.DEFAULT); + } + + public PdfJsonStream serializeStream(COSStream cosStream, SerializationContext context) + throws IOException { + if (cosStream == null) { + return null; + } + SerializationContext effective = context != null ? context : SerializationContext.DEFAULT; + return serializeStream( + cosStream, Collections.newSetFromMap(new IdentityHashMap<>()), effective); + } + + public PdfJsonStream serializeStream(PDStream stream, SerializationContext context) + throws IOException { + if (stream == null) { + return null; + } + return serializeStream(stream.getCOSObject(), context); } public PdfJsonCosValue serializeCosValue(COSBase base) throws IOException { - return serializeCosValue(base, Collections.newSetFromMap(new IdentityHashMap<>())); + return serializeCosValue( + base, + Collections.newSetFromMap(new IdentityHashMap<>()), + SerializationContext.DEFAULT); + } + + public PdfJsonCosValue serializeCosValue(COSBase base, SerializationContext context) + throws IOException { + SerializationContext effective = context != null ? context : SerializationContext.DEFAULT; + return serializeCosValue( + base, Collections.newSetFromMap(new IdentityHashMap<>()), effective); } public COSBase deserializeCosValue(PdfJsonCosValue value, PDDocument document) @@ -165,8 +210,8 @@ public class PdfJsonCosMapper { return cosStream; } - private PdfJsonCosValue serializeCosValue(COSBase base, Set visited) - throws IOException { + private PdfJsonCosValue serializeCosValue( + COSBase base, Set visited, SerializationContext context) throws IOException { if (base == null) { return null; } @@ -220,21 +265,23 @@ public class PdfJsonCosMapper { if (base instanceof COSArray array) { List items = new ArrayList<>(array.size()); for (COSBase item : array) { - PdfJsonCosValue serialized = serializeCosValue(item, visited); + PdfJsonCosValue serialized = serializeCosValue(item, visited, context); items.add(serialized); } builder.type(PdfJsonCosValue.Type.ARRAY).items(items); return builder.build(); } if (base instanceof COSStream stream) { - builder.type(PdfJsonCosValue.Type.STREAM).stream(serializeStream(stream, visited)); + builder.type(PdfJsonCosValue.Type.STREAM).stream( + serializeStream(stream, visited, context)); return builder.build(); } if (base instanceof COSDictionary dictionary) { Map entries = new LinkedHashMap<>(); for (COSName key : dictionary.keySet()) { PdfJsonCosValue serialized = - serializeCosValue(dictionary.getDictionaryObject(key), visited); + serializeCosValue( + dictionary.getDictionaryObject(key), visited, context); entries.put(key.getName(), serialized); } builder.type(PdfJsonCosValue.Type.DICTIONARY).entries(entries); @@ -248,16 +295,23 @@ public class PdfJsonCosMapper { } } - private PdfJsonStream serializeStream(COSStream cosStream, Set visited) + private PdfJsonStream serializeStream( + COSStream cosStream, Set visited, SerializationContext context) throws IOException { Map dictionary = new LinkedHashMap<>(); for (COSName key : cosStream.keySet()) { COSBase value = cosStream.getDictionaryObject(key); - PdfJsonCosValue serialized = serializeCosValue(value, visited); + PdfJsonCosValue serialized = serializeCosValue(value, visited, context); if (serialized != null) { dictionary.put(key.getName(), serialized); } } + + if (context != null && context.omitStreamData()) { + log.debug("Omitting stream rawData during {} serialization", context); + return PdfJsonStream.builder().dictionary(dictionary).rawData(null).build(); + } + String rawData = null; try (InputStream inputStream = cosStream.createRawInputStream(); ByteArrayOutputStream baos = new ByteArrayOutputStream()) { diff --git a/frontend/src/core/tools/pdfTextEditor/PdfTextEditor.tsx b/frontend/src/core/tools/pdfTextEditor/PdfTextEditor.tsx index fa6156d81..deeb28ca7 100644 --- a/frontend/src/core/tools/pdfTextEditor/PdfTextEditor.tsx +++ b/frontend/src/core/tools/pdfTextEditor/PdfTextEditor.tsx @@ -16,6 +16,7 @@ import { pdfWorkerManager } from '@app/services/pdfWorkerManager'; import { Util } from 'pdfjs-dist/legacy/build/pdf.mjs'; import { PdfJsonDocument, + PdfJsonFont, PdfJsonImageElement, PdfJsonPage, TextGroup, @@ -450,14 +451,25 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => { const start = performance.now(); try { - const response = await apiClient.get( - `/api/v1/convert/pdf/text-editor/page/${cachedJobId}/${pageNumber}`, - { - responseType: 'json', - }, - ); + const [pageResponse, pageFontsResponse] = await Promise.all([ + apiClient.get( + `/api/v1/convert/pdf/text-editor/page/${cachedJobId}/${pageNumber}`, + { + responseType: 'json', + }, + ), + apiClient.get( + `/api/v1/convert/pdf/text-editor/fonts/${cachedJobId}/${pageNumber}`, + { + responseType: 'json', + }, + ), + ]); - const pageData = response.data as PdfJsonPage; + const pageData = pageResponse.data as PdfJsonPage; + const pageFonts = Array.isArray(pageFontsResponse.data) + ? (pageFontsResponse.data as PdfJsonFont[]) + : []; const normalizedImages = (pageData.imageElements ?? []).map(cloneImageElement); if (imagesByPageRef.current.length <= pageIndex) { @@ -471,12 +483,31 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => { } const nextPages = [...prevDoc.pages]; const existingPage = nextPages[pageIndex] ?? {}; + const fontMap = new Map(); + for (const existingFont of prevDoc.fonts ?? []) { + if (!existingFont) { + continue; + } + const existingKey = existingFont.uid || `${existingFont.pageNumber ?? -1}:${existingFont.id ?? ''}`; + fontMap.set(existingKey, existingFont); + } + if (pageFonts.length > 0) { + for (const font of pageFonts) { + if (!font) { + continue; + } + const key = font.uid || `${font.pageNumber ?? -1}:${font.id ?? ''}`; + fontMap.set(key, font); + } + } + const nextFonts = Array.from(fontMap.values()); nextPages[pageIndex] = { ...existingPage, imageElements: normalizedImages.map(cloneImageElement), }; return { ...prevDoc, + fonts: nextFonts, pages: nextPages, }; }); @@ -1087,8 +1118,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => { const canUseIncremental = isLazyMode && cachedJobId && - dirtyPageIndices.length > 0 && - dirtyPageIndices.length < totalPages; + dirtyPageIndices.length > 0; if (canUseIncremental) { await ensureImagesForPages(dirtyPageIndices); @@ -1105,10 +1135,8 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => { document.pages?.filter((_, index) => dirtyPageSet.has(index)) ?? []; const partialDocument: PdfJsonDocument = { - metadata: document.metadata, - xmpMetadata: document.xmpMetadata, - fonts: document.fonts, - lazyImages: true, + // Incremental export only needs changed pages. + // Fonts/resources/content streams are resolved from server-side cache. pages: partialPages, }; @@ -1135,11 +1163,13 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => { setErrorMessage(null); return; } catch (incrementalError) { + if (isLazyMode && cachedJobIdRef.current) { + throw new Error('Incremental export failed for cached document. Please reload and retry.'); + } console.warn( '[handleGeneratePdf] Incremental export failed, falling back to full export', incrementalError, ); - // Fall through to full export below } } @@ -1272,8 +1302,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => { const canUseIncremental = isLazyMode && cachedJobId && - dirtyPageIndices.length > 0 && - dirtyPageIndices.length < totalPages; + dirtyPageIndices.length > 0; if (canUseIncremental) { await ensureImagesForPages(dirtyPageIndices); @@ -1290,10 +1319,8 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => { document.pages?.filter((_, index) => dirtyPageSet.has(index)) ?? []; const partialDocument: PdfJsonDocument = { - metadata: document.metadata, - xmpMetadata: document.xmpMetadata, - fonts: document.fonts, - lazyImages: true, + // Incremental export only needs changed pages. + // Fonts/resources/content streams are resolved from server-side cache. pages: partialPages, }; @@ -1312,6 +1339,9 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => { downloadName = detectedName || expectedName; pdfBlob = response.data; } catch (incrementalError) { + if (isLazyMode && cachedJobId) { + throw new Error('Incremental export failed for cached document. Please reload and retry.'); + } console.warn( '[handleSaveToWorkbench] Incremental export failed, falling back to full export', incrementalError, diff --git a/frontend/src/core/tools/pdfTextEditor/pdfTextEditorUtils.ts b/frontend/src/core/tools/pdfTextEditor/pdfTextEditorUtils.ts index bfc2e112f..07fde0bef 100644 --- a/frontend/src/core/tools/pdfTextEditor/pdfTextEditorUtils.ts +++ b/frontend/src/core/tools/pdfTextEditor/pdfTextEditorUtils.ts @@ -1209,7 +1209,7 @@ export const buildUpdatedDocument = ( ...page, textElements: updatedElements, imageElements: images.map(cloneImageElement), - contentStreams: page.contentStreams ?? [], + contentStreams: page.contentStreams ?? null, }; }); @@ -1282,7 +1282,7 @@ export const restoreGlyphElements = ( ...page, textElements: rebuiltElements, imageElements: images.map(cloneImageElement), - contentStreams: page.contentStreams ?? [], + contentStreams: page.contentStreams ?? null, }; });