PDF Text editor changes (#5726)

# Description of Changes

- Reduced lightweight editor JSON size:
  - Omit heavy page resources and `contentStreams` in lazy/lightweight flows.
  - Omit form fields in lazy metadata/editor bootstrapping flows.
  - Strip inline font program blobs from lazy initial payloads.
- Added page-based font loading:
  - New endpoint to fetch the fonts used by a specific cached page:
    `GET /api/v1/convert/pdf/text-editor/fonts/{jobId}/{pageNumber}`
  - The frontend now loads page fonts alongside page data and merges them into local document state (see the sketch below).
- Reduced save payload duplication:
  - Partial export now sends only the changed pages, instead of repeating the full-document font/metadata payload on every save (see the payload sketch after the Outcome section).
- Preserved round-trip/export safety:
  - Missing lightweight fields (`resources`/`contentStreams`) are interpreted as "preserve existing from cached PDF."
  - Annotation semantics fixed so an explicit empty annotation list can clear annotations.
  - Fixed a regression where lazy mode could fall back to full export and lose overlays; lazy mode now stays on the cached partial-export path when dirty pages exist.
- Reduced logging noise.
- Transport optimization:
  - Enabled HTTP compression for JSON/problem responses (may be removed later, pending testing).
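
To make the page-based font loading concrete, here is a minimal TypeScript sketch of the client-side flow: fetch a page and its fonts in parallel, then merge the fonts into local document state using the same keying as the editor (prefer the stable `uid`, fall back to `pageNumber:id`). It uses plain `fetch` instead of the project's `apiClient`, and the interfaces are trimmed to just the fields the merge needs, so treat it as an illustration of the flow rather than the shipped code.

```typescript
// Trimmed shapes; the real PdfJsonFont/PdfJsonPage interfaces carry more fields.
interface PdfJsonFont {
  uid?: string;
  id?: string;
  pageNumber?: number;
}

interface PdfJsonPage {
  imageElements?: unknown[];
}

// Same keying the editor uses: prefer the stable uid, fall back to "page:id".
const fontKey = (font: PdfJsonFont): string =>
  font.uid || `${font.pageNumber ?? -1}:${font.id ?? ''}`;

async function loadPageWithFonts(
  baseUrl: string,
  jobId: string,
  pageNumber: number,
  existingFonts: PdfJsonFont[],
): Promise<{ page: PdfJsonPage; fonts: PdfJsonFont[] }> {
  // Page data and page fonts travel as parallel requests.
  const [pageRes, fontsRes] = await Promise.all([
    fetch(`${baseUrl}/api/v1/convert/pdf/text-editor/page/${jobId}/${pageNumber}`),
    fetch(`${baseUrl}/api/v1/convert/pdf/text-editor/fonts/${jobId}/${pageNumber}`),
  ]);
  if (!pageRes.ok || !fontsRes.ok) {
    throw new Error(`Page load failed: ${pageRes.status}/${fontsRes.status}`);
  }
  const page = (await pageRes.json()) as PdfJsonPage;
  const rawFonts: unknown = await fontsRes.json();
  const pageFonts = Array.isArray(rawFonts) ? (rawFonts as PdfJsonFont[]) : [];

  // Later entries win, so freshly loaded page fonts replace stale duplicates.
  const merged = new Map<string, PdfJsonFont>();
  for (const font of [...existingFonts, ...pageFonts]) {
    if (font) {
      merged.set(fontKey(font), font);
    }
  }
  return { page, fonts: Array.from(merged.values()) };
}
```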
      
      
### Outcome

- Much smaller JSON payloads for large PDFs.
- Fewer duplicated bytes over the wire.
- Page-scoped loading of heavy font data.
- Better reliability in preserving overlay/vector/background content during export.
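
For the save-payload reduction, the sketch below shows how an incremental payload might be assembled (the `PdfJsonDocument` shape is trimmed for illustration, and `buildPartialDocument` is a hypothetical helper): only the dirty pages go over the wire, while fonts, resources, and content streams are omitted because the server resolves them from its cache.

```typescript
// Trimmed shape; the real PdfJsonDocument interface has more fields.
interface PdfJsonDocument {
  metadata?: Record<string, unknown>;
  fonts?: unknown[];
  lazyImages?: boolean;
  pages?: unknown[];
}

// Hypothetical helper mirroring the incremental-export path: only dirty pages
// are included, and the heavy font/resource/content-stream payloads are
// deliberately absent (the server restores them from its cache).
function buildPartialDocument(
  document: PdfJsonDocument,
  dirtyPageIndices: number[],
): PdfJsonDocument {
  const dirtyPageSet = new Set(dirtyPageIndices);
  return {
    metadata: document.metadata,
    lazyImages: true,
    pages: document.pages?.filter((_, index) => dirtyPageSet.has(index)) ?? [],
  };
}
```

This pairs with the round-trip rule above: fields that are missing or `null` mean "preserve the existing data from the cached PDF", while an explicitly empty list (for example, an empty annotation array) means "clear".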


## Checklist

### General

- [ ] I have read the [Contribution
Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md)
- [ ] I have read the [Stirling-PDF Developer
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md)
(if applicable)
- [ ] I have read the [How to add new languages to
Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md)
(if applicable)
- [ ] I have performed a self-review of my own code
- [ ] My changes generate no new warnings

### Documentation

- [ ] I have updated relevant docs on [Stirling-PDF's doc
repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/)
(if functionality has heavily changed)
- [ ] I have read the section [Add New Translation
Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md#add-new-translation-tags)
(for new translation tags only)

### Translations (if applicable)

- [ ] I ran
[`scripts/counter_translation.py`](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/docs/counter_translation.md)

### UI Changes (if applicable)

- [ ] Screenshots or videos demonstrating the UI changes are attached
(e.g., as comments or direct attachments in the PR)

### Testing (if applicable)

- [ ] I have tested my changes locally. Refer to the [Testing
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md#6-testing)
for more details.
Commit 772dd4632e (parent d5cf77cf50) by Anthony Stirling, 2026-02-16 17:36:13 +00:00, committed by GitHub. 5 changed files with 1062 additions and 141 deletions.

View File: ConvertPdfJsonController.java

@@ -1,5 +1,6 @@
package stirling.software.SPDF.controller.api.converters;
import java.nio.charset.StandardCharsets;
import java.util.Optional;
import java.util.UUID;
import java.util.regex.Pattern;
@@ -58,6 +59,7 @@ public class ConvertPdfJsonController {
}
byte[] jsonBytes = pdfJsonConversionService.convertPdfToJson(inputFile, lightweight);
logJsonResponse("pdf/text-editor", jsonBytes);
String originalName = inputFile.getOriginalFilename();
String baseName =
(originalName != null && !originalName.isBlank())
@@ -114,10 +116,11 @@ public class ConvertPdfJsonController {
// Scope job to authenticated user if security is enabled
String scopedJobKey = getScopedJobKey(baseJobId);
log.info("Extracting metadata for PDF, assigned jobId: {}", scopedJobKey);
log.debug("Extracting metadata for PDF, assigned jobId: {}", scopedJobKey);
byte[] jsonBytes =
pdfJsonConversionService.extractDocumentMetadata(inputFile, scopedJobKey);
logJsonResponse("pdf/text-editor/metadata", jsonBytes);
String originalName = inputFile.getOriginalFilename();
String baseName =
(originalName != null && !originalName.isBlank())
@@ -185,11 +188,33 @@ public class ConvertPdfJsonController {
validateJobAccess(jobId);
byte[] jsonBytes = pdfJsonConversionService.extractSinglePage(jobId, pageNumber);
logJsonResponse("pdf/text-editor/page", jsonBytes);
String docName = "page_" + pageNumber + ".json";
return WebResponseUtils.bytesToWebResponse(jsonBytes, docName, MediaType.APPLICATION_JSON);
}
@AutoJobPostMapping(value = "/pdf/text-editor/clear-cache/{jobId}")
@GetMapping(value = "/pdf/text-editor/fonts/{jobId}/{pageNumber}")
@Operation(
summary = "Extract fonts used by a single cached page for text editor",
description =
"Retrieves the font payloads used by a single page from a previously cached PDF document."
+ " Requires prior call to /pdf/text-editor/metadata. The jobId must belong to the"
+ " authenticated user. Output:JSON")
public ResponseEntity<byte[]> extractPageFonts(
@PathVariable String jobId, @PathVariable int pageNumber) throws Exception {
// Validate job ownership
validateJobAccess(jobId);
byte[] jsonBytes = pdfJsonConversionService.extractPageFonts(jobId, pageNumber);
logJsonResponse("pdf/text-editor/fonts/page", jsonBytes);
String docName = "page_fonts_" + pageNumber + ".json";
return WebResponseUtils.bytesToWebResponse(jsonBytes, docName, MediaType.APPLICATION_JSON);
}
@AutoJobPostMapping(
value = "/pdf/text-editor/clear-cache/{jobId}",
consumes = MediaType.ALL_VALUE)
@Operation(
summary = "Clear cached PDF document for text editor",
description =
@@ -219,6 +244,188 @@ public class ConvertPdfJsonController {
return baseJobId;
}
private void logJsonResponse(String label, byte[] jsonBytes) {
if (jsonBytes == null) {
log.warn("Returning {} JSON response: null bytes", label);
return;
}
// Only perform expensive tail extraction if debug logging is enabled
if (log.isDebugEnabled()) {
int length = jsonBytes.length;
boolean endsWithJson =
length > 0 && (jsonBytes[length - 1] == '}' || jsonBytes[length - 1] == ']');
String tail = "";
if (length > 0) {
int start = Math.max(0, length - 64);
tail = new String(jsonBytes, start, length - start, StandardCharsets.UTF_8);
tail = tail.replaceAll("[\\r\\n\\t]+", " ").replaceAll("[^\\x20-\\x7E]", "?");
}
log.debug(
"Returning {} JSON response ({} bytes, endsWithJson={}, tail='{}')",
label,
length,
endsWithJson,
tail);
}
if (isPdfJsonDebugDumpEnabled()) {
try {
String tmpDir = System.getProperty("java.io.tmpdir");
String customDir = System.getenv("SPDF_PDFJSON_DUMP_DIR");
java.nio.file.Path dumpDir =
customDir != null && !customDir.isBlank()
? java.nio.file.Path.of(customDir)
: java.nio.file.Path.of(tmpDir);
java.nio.file.Path dumpPath =
java.nio.file.Files.createTempFile(dumpDir, "pdfjson_", ".json");
java.nio.file.Files.write(dumpPath, jsonBytes);
log.debug("PDF JSON debug dump ({}): {}", label, dumpPath);
} catch (Exception ex) {
log.warn("Failed to write PDF JSON debug dump ({}): {}", label, ex.getMessage());
}
}
if (isPdfJsonRepeatScanEnabled()) {
logRepeatedJsonStrings(label, jsonBytes);
}
}
private boolean isPdfJsonDebugDumpEnabled() {
String env = System.getenv("SPDF_PDFJSON_DUMP");
if (env != null && env.equalsIgnoreCase("true")) {
return true;
}
return Boolean.getBoolean("spdf.pdfjson.dump");
}
private boolean isPdfJsonRepeatScanEnabled() {
String env = System.getenv("SPDF_PDFJSON_REPEAT_SCAN");
if (env != null && env.equalsIgnoreCase("true")) {
return true;
}
return Boolean.getBoolean("spdf.pdfjson.repeatScan");
}
private void logRepeatedJsonStrings(String label, byte[] jsonBytes) {
final int minLen = 12;
final int maxLen = 200;
final int maxUnique = 50000;
java.util.Map<String, Integer> counts = new java.util.HashMap<>();
boolean inString = false;
boolean escape = false;
boolean tooLong = false;
StringBuilder current = new StringBuilder(64);
boolean capped = false;
for (byte b : jsonBytes) {
char ch = (char) (b & 0xFF);
if (!inString) {
if (ch == '"') {
inString = true;
escape = false;
tooLong = false;
current.setLength(0);
}
continue;
}
if (escape) {
escape = false;
if (!tooLong && current.length() < maxLen) {
current.append(ch);
}
continue;
}
if (ch == '\\') {
escape = true;
continue;
}
if (ch == '"') {
inString = false;
if (!tooLong) {
int len = current.length();
if (len >= minLen && len <= maxLen) {
String value = current.toString();
if (!looksLikeBase64(value)) {
if (!capped || counts.containsKey(value)) {
counts.merge(value, 1, Integer::sum);
if (!capped && counts.size() >= maxUnique) {
capped = true;
}
}
}
}
}
continue;
}
if (!tooLong) {
if (current.length() < maxLen) {
current.append(ch);
} else {
tooLong = true;
}
}
}
java.util.List<java.util.Map.Entry<String, Integer>> top =
counts.entrySet().stream()
.filter(e -> e.getValue() > 1)
.sorted((a, b) -> Integer.compare(b.getValue(), a.getValue()))
.limit(20)
.toList();
if (!top.isEmpty()) {
String summary =
top.stream()
.map(
e ->
String.format(
"\"%s\"(len=%d,count=%d)",
truncateForLog(e.getKey()),
e.getKey().length(),
e.getValue()))
.collect(java.util.stream.Collectors.joining("; "));
log.debug(
"PDF JSON repeat scan ({}): top strings -> {}{}",
label,
summary,
capped ? " (capped)" : "");
} else {
log.debug(
"PDF JSON repeat scan ({}): no repeated strings found{}",
label,
capped ? " (capped)" : "");
}
}
private boolean looksLikeBase64(String value) {
if (value.length() < 32) {
return false;
}
int base64Chars = 0;
for (int i = 0; i < value.length(); i++) {
char c = value.charAt(i);
if ((c >= 'A' && c <= 'Z')
|| (c >= 'a' && c <= 'z')
|| (c >= '0' && c <= '9')
|| c == '+'
|| c == '/'
|| c == '=') {
base64Chars++;
}
}
return base64Chars >= value.length() * 0.9;
}
private String truncateForLog(String value) {
int max = 64;
if (value.length() <= max) {
return value.replaceAll("[\\r\\n\\t]+", " ");
}
return value.substring(0, max).replaceAll("[\\r\\n\\t]+", " ") + "...";
}
/**
* Validate that the current user has access to the given job.
*

View File: PdfJsonCosMapper.java

@@ -37,23 +37,68 @@ import stirling.software.SPDF.model.json.PdfJsonStream;
@Component
public class PdfJsonCosMapper {
public enum SerializationContext {
DEFAULT,
ANNOTATION_RAW_DATA,
FORM_FIELD_RAW_DATA,
CONTENT_STREAMS_LIGHTWEIGHT,
RESOURCES_LIGHTWEIGHT;
public boolean omitStreamData() {
return this == CONTENT_STREAMS_LIGHTWEIGHT || this == RESOURCES_LIGHTWEIGHT;
}
}
public PdfJsonStream serializeStream(PDStream stream) throws IOException {
if (stream == null) {
return null;
}
return serializeStream(
stream.getCOSObject(), Collections.newSetFromMap(new IdentityHashMap<>()));
stream.getCOSObject(),
Collections.newSetFromMap(new IdentityHashMap<>()),
SerializationContext.DEFAULT);
}
public PdfJsonStream serializeStream(COSStream cosStream) throws IOException {
if (cosStream == null) {
return null;
}
return serializeStream(cosStream, Collections.newSetFromMap(new IdentityHashMap<>()));
return serializeStream(
cosStream,
Collections.newSetFromMap(new IdentityHashMap<>()),
SerializationContext.DEFAULT);
}
public PdfJsonStream serializeStream(COSStream cosStream, SerializationContext context)
throws IOException {
if (cosStream == null) {
return null;
}
SerializationContext effective = context != null ? context : SerializationContext.DEFAULT;
return serializeStream(
cosStream, Collections.newSetFromMap(new IdentityHashMap<>()), effective);
}
public PdfJsonStream serializeStream(PDStream stream, SerializationContext context)
throws IOException {
if (stream == null) {
return null;
}
return serializeStream(stream.getCOSObject(), context);
}
public PdfJsonCosValue serializeCosValue(COSBase base) throws IOException {
return serializeCosValue(base, Collections.newSetFromMap(new IdentityHashMap<>()));
return serializeCosValue(
base,
Collections.newSetFromMap(new IdentityHashMap<>()),
SerializationContext.DEFAULT);
}
public PdfJsonCosValue serializeCosValue(COSBase base, SerializationContext context)
throws IOException {
SerializationContext effective = context != null ? context : SerializationContext.DEFAULT;
return serializeCosValue(
base, Collections.newSetFromMap(new IdentityHashMap<>()), effective);
}
public COSBase deserializeCosValue(PdfJsonCosValue value, PDDocument document)
@@ -165,8 +210,8 @@ public class PdfJsonCosMapper {
return cosStream;
}
private PdfJsonCosValue serializeCosValue(COSBase base, Set<COSBase> visited)
throws IOException {
private PdfJsonCosValue serializeCosValue(
COSBase base, Set<COSBase> visited, SerializationContext context) throws IOException {
if (base == null) {
return null;
}
@@ -220,21 +265,23 @@ public class PdfJsonCosMapper {
if (base instanceof COSArray array) {
List<PdfJsonCosValue> items = new ArrayList<>(array.size());
for (COSBase item : array) {
PdfJsonCosValue serialized = serializeCosValue(item, visited);
PdfJsonCosValue serialized = serializeCosValue(item, visited, context);
items.add(serialized);
}
builder.type(PdfJsonCosValue.Type.ARRAY).items(items);
return builder.build();
}
if (base instanceof COSStream stream) {
builder.type(PdfJsonCosValue.Type.STREAM).stream(serializeStream(stream, visited));
builder.type(PdfJsonCosValue.Type.STREAM).stream(
serializeStream(stream, visited, context));
return builder.build();
}
if (base instanceof COSDictionary dictionary) {
Map<String, PdfJsonCosValue> entries = new LinkedHashMap<>();
for (COSName key : dictionary.keySet()) {
PdfJsonCosValue serialized =
serializeCosValue(dictionary.getDictionaryObject(key), visited);
serializeCosValue(
dictionary.getDictionaryObject(key), visited, context);
entries.put(key.getName(), serialized);
}
builder.type(PdfJsonCosValue.Type.DICTIONARY).entries(entries);
@@ -248,16 +295,23 @@ public class PdfJsonCosMapper {
}
}
private PdfJsonStream serializeStream(COSStream cosStream, Set<COSBase> visited)
private PdfJsonStream serializeStream(
COSStream cosStream, Set<COSBase> visited, SerializationContext context)
throws IOException {
Map<String, PdfJsonCosValue> dictionary = new LinkedHashMap<>();
for (COSName key : cosStream.keySet()) {
COSBase value = cosStream.getDictionaryObject(key);
PdfJsonCosValue serialized = serializeCosValue(value, visited);
PdfJsonCosValue serialized = serializeCosValue(value, visited, context);
if (serialized != null) {
dictionary.put(key.getName(), serialized);
}
}
if (context != null && context.omitStreamData()) {
log.debug("Omitting stream rawData during {} serialization", context);
return PdfJsonStream.builder().dictionary(dictionary).rawData(null).build();
}
String rawData = null;
try (InputStream inputStream = cosStream.createRawInputStream();
ByteArrayOutputStream baos = new ByteArrayOutputStream()) {

View File: PdfTextEditor (frontend component)

@@ -16,6 +16,7 @@ import { pdfWorkerManager } from '@app/services/pdfWorkerManager';
import { Util } from 'pdfjs-dist/legacy/build/pdf.mjs';
import {
PdfJsonDocument,
PdfJsonFont,
PdfJsonImageElement,
PdfJsonPage,
TextGroup,
@@ -450,14 +451,25 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
const start = performance.now();
try {
const response = await apiClient.get(
`/api/v1/convert/pdf/text-editor/page/${cachedJobId}/${pageNumber}`,
{
responseType: 'json',
},
);
const [pageResponse, pageFontsResponse] = await Promise.all([
apiClient.get(
`/api/v1/convert/pdf/text-editor/page/${cachedJobId}/${pageNumber}`,
{
responseType: 'json',
},
),
apiClient.get(
`/api/v1/convert/pdf/text-editor/fonts/${cachedJobId}/${pageNumber}`,
{
responseType: 'json',
},
),
]);
const pageData = response.data as PdfJsonPage;
const pageData = pageResponse.data as PdfJsonPage;
const pageFonts = Array.isArray(pageFontsResponse.data)
? (pageFontsResponse.data as PdfJsonFont[])
: [];
const normalizedImages = (pageData.imageElements ?? []).map(cloneImageElement);
if (imagesByPageRef.current.length <= pageIndex) {
@@ -471,12 +483,31 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
}
const nextPages = [...prevDoc.pages];
const existingPage = nextPages[pageIndex] ?? {};
const fontMap = new Map<string, PdfJsonFont>();
for (const existingFont of prevDoc.fonts ?? []) {
if (!existingFont) {
continue;
}
const existingKey = existingFont.uid || `${existingFont.pageNumber ?? -1}:${existingFont.id ?? ''}`;
fontMap.set(existingKey, existingFont);
}
if (pageFonts.length > 0) {
for (const font of pageFonts) {
if (!font) {
continue;
}
const key = font.uid || `${font.pageNumber ?? -1}:${font.id ?? ''}`;
fontMap.set(key, font);
}
}
const nextFonts = Array.from(fontMap.values());
nextPages[pageIndex] = {
...existingPage,
imageElements: normalizedImages.map(cloneImageElement),
};
return {
...prevDoc,
fonts: nextFonts,
pages: nextPages,
};
});
@@ -1087,8 +1118,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
const canUseIncremental =
isLazyMode &&
cachedJobId &&
dirtyPageIndices.length > 0 &&
dirtyPageIndices.length < totalPages;
dirtyPageIndices.length > 0;
if (canUseIncremental) {
await ensureImagesForPages(dirtyPageIndices);
@@ -1105,10 +1135,8 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
document.pages?.filter((_, index) => dirtyPageSet.has(index)) ?? [];
const partialDocument: PdfJsonDocument = {
metadata: document.metadata,
xmpMetadata: document.xmpMetadata,
fonts: document.fonts,
lazyImages: true,
// Incremental export only needs changed pages.
// Fonts/resources/content streams are resolved from server-side cache.
pages: partialPages,
};
@@ -1135,11 +1163,13 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
setErrorMessage(null);
return;
} catch (incrementalError) {
if (isLazyMode && cachedJobIdRef.current) {
throw new Error('Incremental export failed for cached document. Please reload and retry.');
}
console.warn(
'[handleGeneratePdf] Incremental export failed, falling back to full export',
incrementalError,
);
// Fall through to full export below
}
}
@@ -1272,8 +1302,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
const canUseIncremental =
isLazyMode &&
cachedJobId &&
dirtyPageIndices.length > 0 &&
dirtyPageIndices.length < totalPages;
dirtyPageIndices.length > 0;
if (canUseIncremental) {
await ensureImagesForPages(dirtyPageIndices);
@@ -1290,10 +1319,8 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
document.pages?.filter((_, index) => dirtyPageSet.has(index)) ?? [];
const partialDocument: PdfJsonDocument = {
metadata: document.metadata,
xmpMetadata: document.xmpMetadata,
fonts: document.fonts,
lazyImages: true,
// Incremental export only needs changed pages.
// Fonts/resources/content streams are resolved from server-side cache.
pages: partialPages,
};
@@ -1312,6 +1339,9 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
downloadName = detectedName || expectedName;
pdfBlob = response.data;
} catch (incrementalError) {
if (isLazyMode && cachedJobId) {
throw new Error('Incremental export failed for cached document. Please reload and retry.');
}
console.warn(
'[handleSaveToWorkbench] Incremental export failed, falling back to full export',
incrementalError,

View File: frontend document utilities (buildUpdatedDocument / restoreGlyphElements)

@@ -1209,7 +1209,7 @@ export const buildUpdatedDocument = (
...page,
textElements: updatedElements,
imageElements: images.map(cloneImageElement),
contentStreams: page.contentStreams ?? [],
contentStreams: page.contentStreams ?? null,
};
});
@@ -1282,7 +1282,7 @@ export const restoreGlyphElements = (
...page,
textElements: rebuiltElements,
imageElements: images.map(cloneImageElement),
contentStreams: page.contentStreams ?? [],
contentStreams: page.contentStreams ?? null,
};
});