This commit is contained in:
Anthony Stirling 2025-10-23 11:20:16 +01:00
parent 5780b3a119
commit 4d9cf45009
20 changed files with 2628 additions and 176 deletions

View File

@ -31,7 +31,8 @@ public class ConvertPdfJsonController {
description = description =
"Extracts PDF text, fonts, and metadata into an editable JSON structure that can be" "Extracts PDF text, fonts, and metadata into an editable JSON structure that can be"
+ " transformed back into a PDF. Input:PDF Output:JSON Type:SISO") + " transformed back into a PDF. Input:PDF Output:JSON Type:SISO")
public ResponseEntity<byte[]> convertPdfToJson(@ModelAttribute PDFFile request) throws Exception { public ResponseEntity<byte[]> convertPdfToJson(@ModelAttribute PDFFile request)
throws Exception {
MultipartFile inputFile = request.getFileInput(); MultipartFile inputFile = request.getFileInput();
if (inputFile == null) { if (inputFile == null) {
throw ExceptionUtils.createNullArgumentException("fileInput"); throw ExceptionUtils.createNullArgumentException("fileInput");
@ -44,8 +45,7 @@ public class ConvertPdfJsonController {
? Filenames.toSimpleFileName(originalName).replaceFirst("[.][^.]+$", "") ? Filenames.toSimpleFileName(originalName).replaceFirst("[.][^.]+$", "")
: "document"; : "document";
String docName = baseName + ".json"; String docName = baseName + ".json";
return WebResponseUtils.bytesToWebResponse( return WebResponseUtils.bytesToWebResponse(jsonBytes, docName, MediaType.APPLICATION_JSON);
jsonBytes, docName, MediaType.APPLICATION_JSON);
} }
@AutoJobPostMapping(consumes = "multipart/form-data", value = "/json/pdf") @AutoJobPostMapping(consumes = "multipart/form-data", value = "/json/pdf")
@ -55,7 +55,8 @@ public class ConvertPdfJsonController {
description = description =
"Rebuilds a PDF from the editable JSON structure generated by the PDF to JSON" "Rebuilds a PDF from the editable JSON structure generated by the PDF to JSON"
+ " endpoint. Input:JSON Output:PDF Type:SISO") + " endpoint. Input:JSON Output:PDF Type:SISO")
public ResponseEntity<byte[]> convertJsonToPdf(@ModelAttribute GeneralFile request) throws Exception { public ResponseEntity<byte[]> convertJsonToPdf(@ModelAttribute GeneralFile request)
throws Exception {
MultipartFile jsonFile = request.getFileInput(); MultipartFile jsonFile = request.getFileInput();
if (jsonFile == null) { if (jsonFile == null) {
throw ExceptionUtils.createNullArgumentException("fileInput"); throw ExceptionUtils.createNullArgumentException("fileInput");

View File

@ -0,0 +1,49 @@
package stirling.software.SPDF.model.json;
import java.util.List;
import java.util.Map;
import com.fasterxml.jackson.annotation.JsonInclude;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * JSON model of a single PDF COS value.
 *
 * <p>{@link #type} names the kind of value represented. According to the field documentation
 * below, primitives use {@link #value}, arrays use {@link #items}, dictionaries use {@link
 * #entries}, and streams use {@link #stream}. Fields that are {@code null} are left out of the
 * serialized JSON ({@code JsonInclude.Include.NON_NULL}).
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@JsonInclude(JsonInclude.Include.NON_NULL)
public class PdfJsonCosValue {
/** The kinds of COS value this model can describe. */
public enum Type {
NULL,
BOOLEAN,
INTEGER,
FLOAT,
NAME,
STRING,
ARRAY,
DICTIONARY,
STREAM
}
/** Kind of COS value held by this instance. */
private Type type;
/**
 * Holds the decoded value for primitives (boolean, integer, float, name, string). For name
 * values the stored value is the PDF name literal. For string values the content is Base64
 * encoded so that arbitrary binary data can be transported safely.
 */
private Object value;
/** Nested values when this instance represents an array. */
private List<PdfJsonCosValue> items;
/** Nested values, keyed by entry name, when this instance represents a dictionary. */
private Map<String, PdfJsonCosValue> entries;
/** Stream payload when {@code type == STREAM}. */
private PdfJsonStream stream;
}

View File

@ -19,6 +19,9 @@ public class PdfJsonDocument {
private PdfJsonMetadata metadata; private PdfJsonMetadata metadata;
/** Optional XMP metadata packet stored as Base64. */
private String xmpMetadata;
@Builder.Default private List<PdfJsonFont> fonts = new ArrayList<>(); @Builder.Default private List<PdfJsonFont> fonts = new ArrayList<>();
@Builder.Default private List<PdfJsonPage> pages = new ArrayList<>(); @Builder.Default private List<PdfJsonPage> pages = new ArrayList<>();

View File

@ -14,12 +14,42 @@ import lombok.NoArgsConstructor;
@JsonInclude(JsonInclude.Include.NON_NULL) @JsonInclude(JsonInclude.Include.NON_NULL)
public class PdfJsonFont { public class PdfJsonFont {
/** PDF resource name (e.g. F1) used as the primary identifier. */
private String id; private String id;
private String name;
/** Logical page number that owns this font resource. */
private Integer pageNumber;
/** Stable UID combining page number and resource for diagnostics. */
private String uid;
/** Reported PostScript/Base font name. */
private String baseName;
/** Declared subtype in the COS dictionary. */
private String subtype; private String subtype;
/** Encoding dictionary or name. */
private String encoding; private String encoding;
/** CID system info for Type0 fonts. */
private PdfJsonFontCidSystemInfo cidSystemInfo;
/** True when the original PDF embedded the font program. */
private Boolean embedded; private Boolean embedded;
/** Font program bytes (TTF/OTF/CFF/PFB) encoded as Base64. */
private String program;
/** Hint describing the font program type (ttf, otf, cff, pfb, etc.). */
private String programFormat;
/** ToUnicode stream encoded as Base64 when present. */
private String toUnicode;
/** Mapped Standard 14 font name when available. */
private String standard14Name; private String standard14Name;
/** Font descriptor flags copied from the source document. */
private Integer fontDescriptorFlags; private Integer fontDescriptorFlags;
private String base64Data;
} }

View File

@ -0,0 +1,20 @@
package stirling.software.SPDF.model.json;
import com.fasterxml.jackson.annotation.JsonInclude;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * JSON model for a font's CID system info, carrying a registry, an ordering and a supplement.
 *
 * <p>NOTE(review): the three fields presumably mirror the Registry / Ordering / Supplement
 * entries of a PDF CIDSystemInfo dictionary; confirm against the conversion service. Fields that
 * are {@code null} are left out of the serialized JSON ({@code JsonInclude.Include.NON_NULL}).
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@JsonInclude(JsonInclude.Include.NON_NULL)
public class PdfJsonFontCidSystemInfo {
/** Registry value of the CID system info. */
private String registry;
/** Ordering value of the CID system info. */
private String ordering;
/** Supplement number of the CID system info. */
private Integer supplement;
}

View File

@ -23,4 +23,10 @@ public class PdfJsonPage {
private Integer rotation; private Integer rotation;
@Builder.Default private List<PdfJsonTextElement> textElements = new ArrayList<>(); @Builder.Default private List<PdfJsonTextElement> textElements = new ArrayList<>();
/** Serialized representation of the page resources dictionary. */
private PdfJsonCosValue resources;
/** Raw content streams associated with the page, preserved for lossless round-tripping. */
@Builder.Default private List<PdfJsonStream> contentStreams = new ArrayList<>();
} }

View File

@ -0,0 +1,27 @@
package stirling.software.SPDF.model.json;
import java.util.Map;
import com.fasterxml.jackson.annotation.JsonInclude;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * JSON model of a PDF stream: the stream dictionary plus the stream's raw bytes.
 *
 * <p>Fields that are {@code null} are left out of the serialized JSON ({@code
 * JsonInclude.Include.NON_NULL}).
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@JsonInclude(JsonInclude.Include.NON_NULL)
public class PdfJsonStream {
/**
 * A dictionary of entries that describe the stream metadata (Filter, DecodeParms, etc). Each
 * entry is represented using {@link PdfJsonCosValue} so nested structures are supported.
 */
private Map<String, PdfJsonCosValue> dictionary;
/** Raw stream bytes in Base64 form. Data is stored exactly as it appeared in the source PDF. */
private String rawData;
}

View File

@ -20,6 +20,8 @@ public class PdfJsonTextElement {
private String text; private String text;
private String fontId; private String fontId;
private Float fontSize; private Float fontSize;
private Float fontMatrixSize;
private Float fontSizeInPt;
private Float x; private Float x;
private Float y; private Float y;
private Float width; private Float width;

View File

@ -7,6 +7,7 @@ logging.level.org.eclipse.jetty=WARN
#logging.level.org.opensaml=DEBUG #logging.level.org.opensaml=DEBUG
#logging.level.stirling.software.proprietary.security=DEBUG #logging.level.stirling.software.proprietary.security=DEBUG
logging.level.com.zaxxer.hikari=WARN logging.level.com.zaxxer.hikari=WARN
# Uncomment to trace the PDF <-> JSON conversion service while debugging; left disabled by
# default, like the other debug loggers above.
#logging.level.stirling.software.SPDF.service.PdfJsonConversionService=TRACE
spring.jpa.open-in-view=false spring.jpa.open-in-view=false
server.forward-headers-strategy=NATIVE server.forward-headers-strategy=NATIVE
server.error.path=/error server.error.path=/error

43
compare_json.py Normal file
View File

@ -0,0 +1,43 @@
import json
import sys
from pathlib import Path
def load(path):
    """Read and parse the UTF-8 encoded JSON file at ``path`` (a ``pathlib.Path``)."""
    with path.open('r', encoding='utf-8') as fh:
        return json.load(fh)


def describe_differences(doc1, doc2):
    """Return the report lines describing how two parsed JSON documents differ.

    Only ``pages[*].textElements`` is inspected in detail. Pages are compared pairwise;
    for every page visited, an element-count mismatch is reported. The first pair of
    differing elements is listed key by key and the scan stops there.

    Returns ``['Documents identical']`` when the documents compare equal.
    """
    if doc1 == doc2:
        return ['Documents identical']

    report = []
    # ``or []`` also covers an explicit ``"pages": null``, which zip() cannot iterate.
    pages1 = doc1.get('pages') or []
    pages2 = doc2.get('pages') or []
    if len(pages1) != len(pages2):
        # zip() below silently ignores surplus pages, so surface the mismatch explicitly.
        report.append(f'Page count {len(pages1)} vs {len(pages2)}')

    for page_index, (p1, p2) in enumerate(zip(pages1, pages2), start=1):
        elems1 = p1.get('textElements') or []
        elems2 = p2.get('textElements') or []
        if len(elems1) != len(elems2):
            report.append(f'Page {page_index}: element count {len(elems1)} vs {len(elems2)}')
        diff_found = False
        for elem_index, (e1, e2) in enumerate(zip(elems1, elems2)):
            if e1 == e2:
                continue
            diff_found = True
            report.append(f'Page {page_index} element {elem_index} differs')
            # Union of keys, so fields present on only one side are listed as well.
            all_keys = sorted(set(e1) | set(e2))
            for key in all_keys:
                if e1.get(key) != e2.get(key):
                    report.append(f' {key}: {e1.get(key)!r} -> {e2.get(key)!r}')
            # Only the first differing element is detailed.
            break
        if diff_found:
            break

    if not report:
        # The documents are unequal, but nothing in the inspected text elements explains it.
        report.append('Documents differ outside of the compared text elements')
    return report


def main(argv):
    """Command-line entry point: ``compare_json.py <file1> <file2>``.

    Prints the comparison report and returns the process exit status: 1 for a usage
    error, 0 otherwise (whether or not the documents differ).
    """
    if len(argv) != 3:
        print('Usage: compare_json.py <file1> <file2>')
        return 1
    path1, path2 = map(Path, argv[1:])
    for line in describe_differences(load(path1), load(path2)):
        print(line)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))

View File

@ -4006,5 +4006,8 @@
"finish": "Finish", "finish": "Finish",
"startTour": "Start Tour", "startTour": "Start Tour",
"startTourDescription": "Take a guided tour of Stirling PDF's key features" "startTourDescription": "Take a guided tour of Stirling PDF's key features"
},
"pdfJsonEditor": {
"viewLabel": "JSON Editor"
} }
} }

View File

@ -0,0 +1,463 @@
import React, { useEffect, useMemo, useState } from 'react';
import {
Alert,
Badge,
Box,
Button,
Card,
Divider,
FileButton,
Group,
Pagination,
ScrollArea,
Stack,
Text,
Title,
} from '@mantine/core';
import { useTranslation } from 'react-i18next';
import DescriptionIcon from '@mui/icons-material/DescriptionOutlined';
import FileDownloadIcon from '@mui/icons-material/FileDownloadOutlined';
import PictureAsPdfIcon from '@mui/icons-material/PictureAsPdfOutlined';
import AutorenewIcon from '@mui/icons-material/Autorenew';
import WarningAmberIcon from '@mui/icons-material/WarningAmber';
import UploadIcon from '@mui/icons-material/Upload';
import {
PdfJsonEditorViewData,
PdfJsonPage,
} from '../../../tools/pdfJsonEditorTypes';
import { pageDimensions } from '../../../tools/pdfJsonEditorUtils';
/** Target page width, in CSS pixels, used when deriving the render scale. */
const MAX_RENDER_WIDTH = 820;
/** Minimum rendered width of a text box in CSS pixels; half of it is the minimum height. */
const MIN_BOX_SIZE = 18;

/** Props accepted by the PDF JSON editor view. */
interface PdfJsonEditorViewProps {
  data: PdfJsonEditorViewData;
}

/**
 * Maps a text group's bounds to the position and size of its overlay box.
 *
 * @param page - Page the bounds belong to (currently not read by this function).
 * @param pageHeight - Page height, in the same units as `bounds`.
 * @param scale - Factor applied to convert those units to CSS pixels.
 * @param bounds - Box edges; `right - left` and `bottom - top` give its size.
 * @returns `left`, `top`, `width` and `height` in CSS pixels. The raw size is floored at one
 *   unit per axis, the scaled size at `MIN_BOX_SIZE` (width) and `MIN_BOX_SIZE / 2` (height),
 *   and `top` is the clamped, scaled distance from `pageHeight` down to `bounds.bottom`.
 */
const toCssBounds = (
  page: PdfJsonPage | null | undefined,
  pageHeight: number,
  scale: number,
  bounds: { left: number; right: number; top: number; bottom: number },
) => {
  const { left: leftEdge, right: rightEdge, top: topEdge, bottom: bottomEdge } = bounds;

  // Degenerate (zero or negative) extents still occupy at least one unit.
  const rawWidth = Math.max(rightEdge - leftEdge, 1);
  const rawHeight = Math.max(bottomEdge - topEdge, 1);

  // Never let the vertical offset go negative when the box extends past the page height.
  const offsetFromPageTop = Math.max(pageHeight - bottomEdge, 0);

  return {
    left: leftEdge * scale,
    top: offsetFromPageTop * scale,
    width: Math.max(rawWidth * scale, MIN_BOX_SIZE),
    height: Math.max(rawHeight * scale, MIN_BOX_SIZE / 2),
  };
};
/**
 * Workbench view of the PDF JSON editor.
 *
 * Renders a toolbar (load / reset / download JSON / generate PDF), an optional error alert, and,
 * once a document is loaded, a scaled page canvas with one overlay box per text group, plus a
 * list of the same groups. Hovering a box or list entry highlights the group; clicking it opens
 * the group for inline editing. All document state and callbacks come from `data`; this
 * component only keeps the hover and editing selection locally.
 *
 * @param props.data - View model and callbacks supplied by the owning tool.
 */
const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
  const { t } = useTranslation();
  // Group currently highlighted, and group currently open for inline editing.
  const [activeGroupId, setActiveGroupId] = useState<string | null>(null);
  const [editingGroupId, setEditingGroupId] = useState<string | null>(null);

  const {
    document: pdfDocument,
    groupsByPage,
    selectedPage,
    dirtyPages,
    hasDocument,
    fileName,
    errorMessage,
    isGeneratingPdf,
    hasChanges,
    onLoadJson,
    onSelectPage,
    onGroupEdit,
    onReset,
    onDownloadJson,
    onGeneratePdf,
  } = data;

  const pages = pdfDocument?.pages ?? [];
  const currentPage = pages[selectedPage] ?? null;
  const pageGroups = groupsByPage[selectedPage] ?? [];

  // Hide groups whose current and original text are both blank, unless one is being edited.
  const visibleGroups = useMemo(
    () =>
      pageGroups.filter((group) => {
        const hasContent = ((group.text ?? '').trim().length > 0) || ((group.originalText ?? '').trim().length > 0);
        return hasContent || editingGroupId === group.id;
      }),
    [editingGroupId, pageGroups]
  );

  const { width: pageWidth, height: pageHeight } = pageDimensions(currentPage);
  // Fit the page to MAX_RENDER_WIDTH, but never magnify beyond 1.5x.
  const scale = useMemo(() => Math.min(MAX_RENDER_WIDTH / pageWidth, 1.5), [pageWidth]);
  const scaledWidth = pageWidth * scale;
  const scaledHeight = pageHeight * scale;

  // Drop hover and edit selection whenever another page is shown.
  useEffect(() => {
    setActiveGroupId(null);
    setEditingGroupId(null);
  }, [selectedPage]);

  // When a group enters edit mode, focus its editor and place the caret at the end of the text.
  useEffect(() => {
    if (!editingGroupId) {
      return;
    }
    const editor = document.querySelector<HTMLElement>(`[data-editor-group="${editingGroupId}"]`);
    if (editor) {
      editor.focus();
      const selection = window.getSelection();
      if (selection) {
        selection.removeAllRanges();
        const range = document.createRange();
        range.selectNodeContents(editor);
        range.collapse(false);
        selection.addRange(range);
      }
    }
  }, [editingGroupId]);

  // Pagination is 1-based; the view model's page selection is 0-based.
  const handlePageChange = (pageNumber: number) => {
    setActiveGroupId(null);
    setEditingGroupId(null);
    onSelectPage(pageNumber - 1);
  };

  // Clicking the canvas outside of any group leaves edit mode.
  const handleBackgroundClick = () => {
    setEditingGroupId(null);
    setActiveGroupId(null);
  };

  /**
   * Wraps a group's content in the bordered, hover-aware frame shared by the read-only and
   * editing renderings. Clicks are stopped from reaching the canvas background handler.
   */
  const renderGroupContainer = (
    groupId: string,
    isActive: boolean,
    isChanged: boolean,
    content: React.ReactNode,
    onActivate?: (event: React.MouseEvent) => void,
  ) => (
    <Box
      component="div"
      style={{
        width: '100%',
        height: '100%',
        border: isActive
          ? '2px solid var(--mantine-color-blue-5)'
          : isChanged
            ? '1px solid var(--mantine-color-yellow-5)'
            : '1px solid transparent',
        borderRadius: 6,
        backgroundColor: isChanged || isActive ? 'rgba(250,255,189,0.28)' : 'transparent',
        transition: 'border 120ms ease, background-color 120ms ease',
        pointerEvents: 'auto',
        overflow: 'hidden',
        display: 'flex',
        alignItems: 'flex-start',
        justifyContent: 'flex-start',
        padding: 0,
      }}
      onClick={(event) => {
        event.stopPropagation();
        onActivate?.(event);
      }}
      onMouseEnter={() => setActiveGroupId(groupId)}
      onMouseLeave={() => {
        // Keep the highlight on the group that is being edited.
        if (editingGroupId !== groupId) {
          setActiveGroupId((current) => (current === groupId ? null : current));
        }
      }}
    >
      {content}
    </Box>
  );

  return (
    <Stack gap="xl" className="h-full" style={{ padding: '1.5rem', overflow: 'auto' }}>
      <Card withBorder radius="md" shadow="xs" padding="lg">
        <Stack gap="sm">
          <Group justify="space-between" align="center">
            <Group gap="xs" align="center">
              <DescriptionIcon fontSize="small" />
              <Title order={3}>{t('pdfJsonEditor.title', 'PDF JSON Editor')}</Title>
              {hasChanges && <Badge color="yellow" size="sm">{t('pdfJsonEditor.badges.unsaved', 'Edited')}</Badge>}
            </Group>
            <Group gap="sm">
              <FileButton onChange={onLoadJson} accept="application/json">
                {(props) => (
                  <Button variant="light" leftSection={<UploadIcon fontSize="small" />} {...props}>
                    {t('pdfJsonEditor.actions.load', 'Load JSON')}
                  </Button>
                )}
              </FileButton>
              <Button
                variant="subtle"
                leftSection={<AutorenewIcon fontSize="small" />}
                onClick={onReset}
                disabled={!hasDocument}
              >
                {t('pdfJsonEditor.actions.reset', 'Reset Changes')}
              </Button>
              <Button
                variant="default"
                leftSection={<FileDownloadIcon fontSize="small" />}
                onClick={onDownloadJson}
                disabled={!hasDocument}
              >
                {t('pdfJsonEditor.actions.downloadJson', 'Download JSON')}
              </Button>
              <Button
                leftSection={<PictureAsPdfIcon fontSize="small" />}
                onClick={onGeneratePdf}
                loading={isGeneratingPdf}
                disabled={!hasDocument || !hasChanges}
              >
                {t('pdfJsonEditor.actions.generatePdf', 'Generate PDF')}
              </Button>
            </Group>
          </Group>

          {fileName && (
            <Text size="sm" c="dimmed">
              {t('pdfJsonEditor.currentFile', 'Current file: {{name}}', { name: fileName })}
            </Text>
          )}
        </Stack>
      </Card>

      {errorMessage && (
        <Alert icon={<WarningAmberIcon fontSize="small" />} color="red" radius="md">
          {errorMessage}
        </Alert>
      )}

      {!hasDocument && (
        <Card withBorder radius="md" padding="xl">
          <Stack align="center" gap="md">
            <DescriptionIcon sx={{ fontSize: 48 }} />
            <Text size="lg" fw={600}>
              {t('pdfJsonEditor.empty.title', 'No JSON loaded yet')}
            </Text>
            <Text size="sm" c="dimmed" ta="center" maw={420}>
              {t('pdfJsonEditor.empty.subtitle', 'Use the Load JSON button above to open a file generated by the PDF → JSON converter.')}
            </Text>
          </Stack>
        </Card>
      )}

      {hasDocument && (
        <Stack gap="lg" className="flex-1" style={{ minHeight: 0 }}>
          <Group justify="space-between" align="center">
            <Group gap="sm">
              <Text fw={500}>
                {t('pdfJsonEditor.pageSummary', 'Page {{number}} of {{total}}', {
                  number: selectedPage + 1,
                  total: pages.length,
                })}
              </Text>
              {dirtyPages[selectedPage] && (
                <Badge color="yellow" size="xs">
                  {t('pdfJsonEditor.badges.modified', 'Edited')}
                </Badge>
              )}
            </Group>
            {pages.length > 1 && (
              <Pagination
                value={selectedPage + 1}
                onChange={handlePageChange}
                total={pages.length}
                size="sm"
              />
            )}
          </Group>

          <Card withBorder padding="md" radius="md" shadow="xs" style={{ flex: 1, minHeight: 0 }}>
            <ScrollArea h="100%" offsetScrollbars>
              <Box
                style={{
                  margin: '0 auto',
                  background: '#f3f4f6',
                  padding: '1.5rem',
                  borderRadius: '0.75rem',
                }}
                onClick={handleBackgroundClick}
              >
                <Box
                  style={{
                    position: 'relative',
                    width: `${scaledWidth}px`,
                    height: `${scaledHeight}px`,
                    backgroundColor: '#ffffff',
                    boxShadow: '0 0 12px rgba(15, 23, 42, 0.12)',
                    borderRadius: '0.5rem',
                    overflow: 'hidden',
                  }}
                >
                  {visibleGroups.length === 0 ? (
                    <Group justify="center" align="center" style={{ height: '100%' }}>
                      <Stack gap={4} align="center">
                        <Text size="sm" c="dimmed">
                          {t('pdfJsonEditor.noTextOnPage', 'No editable text was detected on this page.')}
                        </Text>
                      </Stack>
                    </Group>
                  ) : (
                    visibleGroups.map((group) => {
                      const bounds = toCssBounds(currentPage, pageHeight, scale, group.bounds);
                      const changed = group.text !== group.originalText;
                      const isActive = activeGroupId === group.id || editingGroupId === group.id;
                      const isEditing = editingGroupId === group.id;

                      // Fall back to 12 when the group has no font size; never render below 8px.
                      const fontSizePx = Math.max((group.fontSize ?? 12) * scale, 8);
                      // Make the box tall enough for one line of text at that font size.
                      const visualHeight = Math.max(bounds.height, fontSizePx * 1.35);

                      const containerStyle: React.CSSProperties = {
                        position: 'absolute',
                        left: `${bounds.left}px`,
                        top: `${bounds.top}px`,
                        width: `${bounds.width}px`,
                        height: `${visualHeight}px`,
                        display: 'flex',
                        alignItems: 'flex-start',
                        justifyContent: 'flex-start',
                        pointerEvents: 'auto',
                        cursor: 'text',
                      };

                      // `key` is passed directly on the element: React warns when a props object
                      // containing `key` is spread into JSX.
                      if (isEditing) {
                        return (
                          <Box key={group.id} style={containerStyle}>
                            {renderGroupContainer(
                              group.id,
                              true,
                              changed,
                              // NOTE(review): every input event pushes the text back into the
                              // parent state, which re-renders the children of this
                              // contentEditable node. Verify the caret does not jump while typing.
                              <div
                                contentEditable
                                suppressContentEditableWarning
                                data-editor-group={group.id}
                                onBlur={(event) => {
                                  const value = event.currentTarget.innerText.replace(/\u00A0/g, ' ');
                                  onGroupEdit(group.pageIndex, group.id, value);
                                  setEditingGroupId(null);
                                }}
                                onInput={(event) => {
                                  const value = event.currentTarget.innerText.replace(/\u00A0/g, ' ');
                                  onGroupEdit(group.pageIndex, group.id, value);
                                }}
                                style={{
                                  width: '100%',
                                  height: '100%',
                                  padding: '3px 4px',
                                  backgroundColor: 'rgba(255,255,255,0.95)',
                                  color: '#111827',
                                  fontSize: `${fontSizePx}px`,
                                  lineHeight: 1.25,
                                  outline: 'none',
                                  border: 'none',
                                  display: 'block',
                                  whiteSpace: 'pre-wrap',
                                  overflowWrap: 'anywhere',
                                  cursor: 'text',
                                }}
                              >
                                {group.text || '\u00A0'}
                              </div>,
                            )}
                          </Box>
                        );
                      }

                      return (
                        <Box key={group.id} style={containerStyle}>
                          {renderGroupContainer(
                            group.id,
                            isActive,
                            changed,
                            <div
                              style={{
                                width: '100%',
                                minHeight: '100%',
                                padding: '2px 4px',
                                whiteSpace: 'pre-wrap',
                                fontSize: `${fontSizePx}px`,
                                lineHeight: 1.25,
                                color: '#111827',
                                display: 'block',
                                cursor: 'text',
                              }}
                            >
                              <span style={{ pointerEvents: 'none' }}>{group.text || '\u00A0'}</span>
                            </div>,
                            () => {
                              setEditingGroupId(group.id);
                              setActiveGroupId(group.id);
                            },
                          )}
                        </Box>
                      );
                    })
                  )}
                </Box>
              </Box>
            </ScrollArea>
          </Card>

          <Card padding="md" withBorder radius="md">
            <Stack gap="xs">
              <Text fw={500}>{t('pdfJsonEditor.groupList', 'Detected Text Groups')}</Text>
              <Divider />
              <ScrollArea h={180} offsetScrollbars>
                <Stack gap="sm">
                  {visibleGroups.map((group) => {
                    const changed = group.text !== group.originalText;
                    return (
                      <Card
                        key={`list-${group.id}`}
                        padding="sm"
                        radius="md"
                        withBorder
                        shadow={changed ? 'sm' : 'none'}
                        onMouseEnter={() => setActiveGroupId(group.id)}
                        onMouseLeave={() => setActiveGroupId((current) => (current === group.id ? null : current))}
                        style={{ cursor: 'pointer' }}
                        onClick={() => {
                          setActiveGroupId(group.id);
                          setEditingGroupId(group.id);
                        }}
                      >
                        <Stack gap={4}>
                          <Group gap="xs">
                            {changed && <Badge color="yellow" size="xs">{t('pdfJsonEditor.badges.modified', 'Edited')}</Badge>}
                            {group.fontId && (
                              <Badge size="xs" variant="outline">{group.fontId}</Badge>
                            )}
                            {/* Ternary instead of `&&`: a font size of 0 would otherwise render a literal "0". */}
                            {group.fontSize ? (
                              <Badge size="xs" variant="light">
                                {t('pdfJsonEditor.fontSizeValue', '{{size}}pt', { size: group.fontSize.toFixed(1) })}
                              </Badge>
                            ) : null}
                          </Group>
                          <Text size="sm" c="dimmed" lineClamp={2}>
                            {group.text || t('pdfJsonEditor.emptyGroup', '[Empty Group]')}
                          </Text>
                        </Stack>
                      </Card>
                    );
                  })}
                </Stack>
              </ScrollArea>
            </Stack>
          </Card>
        </Stack>
      )}
    </Stack>
  );
};
export default PdfJsonEditorView;

View File

@ -31,7 +31,9 @@ export const CONVERSION_ENDPOINTS = {
'pdf-pdfa': '/api/v1/convert/pdf/pdfa', 'pdf-pdfa': '/api/v1/convert/pdf/pdfa',
'html-pdf': '/api/v1/convert/html/pdf', 'html-pdf': '/api/v1/convert/html/pdf',
'markdown-pdf': '/api/v1/convert/markdown/pdf', 'markdown-pdf': '/api/v1/convert/markdown/pdf',
'eml-pdf': '/api/v1/convert/eml/pdf' 'eml-pdf': '/api/v1/convert/eml/pdf',
'pdf-json': '/api/v1/convert/pdf/json',
'json-pdf': '/api/v1/convert/json/pdf'
} as const; } as const;
export const ENDPOINT_NAMES = { export const ENDPOINT_NAMES = {
@ -48,7 +50,9 @@ export const ENDPOINT_NAMES = {
'pdf-pdfa': 'pdf-to-pdfa', 'pdf-pdfa': 'pdf-to-pdfa',
'html-pdf': 'html-to-pdf', 'html-pdf': 'html-to-pdf',
'markdown-pdf': 'markdown-to-pdf', 'markdown-pdf': 'markdown-to-pdf',
'eml-pdf': 'eml-to-pdf' 'eml-pdf': 'eml-to-pdf',
'pdf-json': 'pdf-to-json',
'json-pdf': 'json-to-pdf'
} as const; } as const;
@ -80,6 +84,7 @@ export const FROM_FORMAT_OPTIONS = [
{ value: 'txt', label: 'TXT', group: 'Text' }, { value: 'txt', label: 'TXT', group: 'Text' },
{ value: 'rtf', label: 'RTF', group: 'Text' }, { value: 'rtf', label: 'RTF', group: 'Text' },
{ value: 'eml', label: 'EML', group: 'Email' }, { value: 'eml', label: 'EML', group: 'Email' },
{ value: 'json', label: 'JSON', group: 'Data' },
]; ];
export const TO_FORMAT_OPTIONS = [ export const TO_FORMAT_OPTIONS = [
@ -101,13 +106,14 @@ export const TO_FORMAT_OPTIONS = [
{ value: 'webp', label: 'WEBP', group: 'Image' }, { value: 'webp', label: 'WEBP', group: 'Image' },
{ value: 'html', label: 'HTML', group: 'Web' }, { value: 'html', label: 'HTML', group: 'Web' },
{ value: 'xml', label: 'XML', group: 'Web' }, { value: 'xml', label: 'XML', group: 'Web' },
{ value: 'json', label: 'JSON', group: 'Data' },
]; ];
// Conversion matrix - what each source format can convert to // Conversion matrix - what each source format can convert to
export const CONVERSION_MATRIX: Record<string, string[]> = { export const CONVERSION_MATRIX: Record<string, string[]> = {
'any': ['pdf'], // Mixed files always convert to PDF 'any': ['pdf'], // Mixed files always convert to PDF
'image': ['pdf'], // Multiple images always convert to PDF 'image': ['pdf'], // Multiple images always convert to PDF
'pdf': ['png', 'jpg', 'gif', 'tiff', 'bmp', 'webp', 'docx', 'odt', 'pptx', 'odp', 'csv', 'txt', 'rtf', 'md', 'html', 'xml', 'pdfa'], 'pdf': ['png', 'jpg', 'gif', 'tiff', 'bmp', 'webp', 'docx', 'odt', 'pptx', 'odp', 'csv', 'txt', 'rtf', 'md', 'html', 'xml', 'pdfa', 'json'],
'docx': ['pdf'], 'doc': ['pdf'], 'odt': ['pdf'], 'docx': ['pdf'], 'doc': ['pdf'], 'odt': ['pdf'],
'xlsx': ['pdf'], 'xls': ['pdf'], 'ods': ['pdf'], 'xlsx': ['pdf'], 'xls': ['pdf'], 'ods': ['pdf'],
'pptx': ['pdf'], 'ppt': ['pdf'], 'odp': ['pdf'], 'pptx': ['pdf'], 'ppt': ['pdf'], 'odp': ['pdf'],
@ -116,7 +122,8 @@ export const CONVERSION_MATRIX: Record<string, string[]> = {
'zip': ['pdf'], 'zip': ['pdf'],
'md': ['pdf'], 'md': ['pdf'],
'txt': ['pdf'], 'rtf': ['pdf'], 'txt': ['pdf'], 'rtf': ['pdf'],
'eml': ['pdf'] 'eml': ['pdf'],
'json': ['pdf']
}; };
// Map extensions to endpoint keys // Map extensions to endpoint keys
@ -130,7 +137,8 @@ export const EXTENSION_TO_ENDPOINT: Record<string, Record<string, string>> = {
'csv': 'pdf-to-csv', 'csv': 'pdf-to-csv',
'txt': 'pdf-to-text', 'rtf': 'pdf-to-text', 'md': 'pdf-to-markdown', 'txt': 'pdf-to-text', 'rtf': 'pdf-to-text', 'md': 'pdf-to-markdown',
'html': 'pdf-to-html', 'xml': 'pdf-to-xml', 'html': 'pdf-to-html', 'xml': 'pdf-to-xml',
'pdfa': 'pdf-to-pdfa' 'pdfa': 'pdf-to-pdfa',
'json': 'pdf-to-json'
}, },
'docx': { 'pdf': 'file-to-pdf' }, 'doc': { 'pdf': 'file-to-pdf' }, 'odt': { 'pdf': 'file-to-pdf' }, 'docx': { 'pdf': 'file-to-pdf' }, 'doc': { 'pdf': 'file-to-pdf' }, 'odt': { 'pdf': 'file-to-pdf' },
'xlsx': { 'pdf': 'file-to-pdf' }, 'xls': { 'pdf': 'file-to-pdf' }, 'ods': { 'pdf': 'file-to-pdf' }, 'xlsx': { 'pdf': 'file-to-pdf' }, 'xls': { 'pdf': 'file-to-pdf' }, 'ods': { 'pdf': 'file-to-pdf' },
@ -141,7 +149,8 @@ export const EXTENSION_TO_ENDPOINT: Record<string, Record<string, string>> = {
'zip': { 'pdf': 'html-to-pdf' }, 'zip': { 'pdf': 'html-to-pdf' },
'md': { 'pdf': 'markdown-to-pdf' }, 'md': { 'pdf': 'markdown-to-pdf' },
'txt': { 'pdf': 'file-to-pdf' }, 'rtf': { 'pdf': 'file-to-pdf' }, 'txt': { 'pdf': 'file-to-pdf' }, 'rtf': { 'pdf': 'file-to-pdf' },
'eml': { 'pdf': 'eml-to-pdf' } 'eml': { 'pdf': 'eml-to-pdf' },
'json': { 'pdf': 'json-to-pdf' }
}; };
export type ColorType = typeof COLOR_TYPES[keyof typeof COLOR_TYPES]; export type ColorType = typeof COLOR_TYPES[keyof typeof COLOR_TYPES];

View File

@ -5,7 +5,7 @@ export const CONVERT_SUPPORTED_FORMATS = [
// OpenDocument // OpenDocument
'odt', 'ott', 'ods', 'ots', 'odp', 'otp', 'odg', 'otg', 'odt', 'ott', 'ods', 'ots', 'odp', 'otp', 'odg', 'otg',
// Text formats // Text formats
'txt', 'text', 'xml', 'rtf', 'html', 'lwp', 'md', 'txt', 'text', 'xml', 'rtf', 'html', 'lwp', 'md', 'json',
// Images // Images
'bmp', 'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'pbm', 'pgm', 'ppm', 'ras', 'xbm', 'xpm', 'svg', 'svm', 'wmf', 'webp', 'bmp', 'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'pbm', 'pgm', 'ppm', 'ras', 'xbm', 'xpm', 'svg', 'svm', 'wmf', 'webp',
// StarOffice // StarOffice

View File

@ -5,6 +5,7 @@ import SplitPdfPanel from "../tools/Split";
import CompressPdfPanel from "../tools/Compress"; import CompressPdfPanel from "../tools/Compress";
import OCRPanel from "../tools/OCR"; import OCRPanel from "../tools/OCR";
import ConvertPanel from "../tools/Convert"; import ConvertPanel from "../tools/Convert";
import PdfJsonEditor from "../tools/PdfJsonEditor";
import Sanitize from "../tools/Sanitize"; import Sanitize from "../tools/Sanitize";
import AddPassword from "../tools/AddPassword"; import AddPassword from "../tools/AddPassword";
import ChangePermissions from "../tools/ChangePermissions"; import ChangePermissions from "../tools/ChangePermissions";
@ -710,6 +711,19 @@ export function useTranslatedToolCatalog(): TranslatedToolCatalog {
supportsAutomate: false, supportsAutomate: false,
automationSettings: null automationSettings: null
}, },
pdfJsonEditor: {
icon: <LocalIcon icon="code-rounded" width="1.5rem" height="1.5rem" />,
name: t("home.pdfJsonEditor.title", "PDF JSON Editor"),
component: PdfJsonEditor,
description: t("home.pdfJsonEditor.desc", "Review and edit Stirling PDF JSON exports with grouped text editing and PDF regeneration"),
categoryId: ToolCategoryId.ADVANCED_TOOLS,
subcategoryId: SubcategoryId.DEVELOPER_TOOLS,
workbench: 'custom:pdfJsonEditor',
endpoints: ["json-pdf"],
synonyms: getSynonyms(t, "pdfJsonEditor"),
supportsAutomate: false,
automationSettings: null
},
devApi: { devApi: {
icon: <LocalIcon icon="open-in-new-rounded" width="1.5rem" height="1.5rem" style={{ color: "#2F7BF6" }} />, icon: <LocalIcon icon="open-in-new-rounded" width="1.5rem" height="1.5rem" style={{ color: "#2F7BF6" }} />,
name: t("home.devApi.title", "API"), name: t("home.devApi.title", "API"),

View File

@ -0,0 +1,289 @@
import { useCallback, useEffect, useMemo, useState, useRef } from 'react';
import { useTranslation } from 'react-i18next';
import DescriptionIcon from '@mui/icons-material/DescriptionOutlined';
import { useToolWorkflow } from '../contexts/ToolWorkflowContext';
import { useNavigationActions, useNavigationState } from '../contexts/NavigationContext';
import { BaseToolProps, ToolComponent } from '../types/tool';
import { CONVERSION_ENDPOINTS } from '../constants/convertConstants';
import apiClient from '../services/apiClient';
import { downloadBlob, downloadTextAsFile } from '../utils/downloadUtils';
import { getFilenameFromHeaders } from '../utils/fileResponseUtils';
import {
PdfJsonDocument,
TextGroup,
PdfJsonEditorViewData,
} from './pdfJsonEditorTypes';
import {
deepCloneDocument,
getDirtyPages,
groupDocumentText,
restoreGlyphElements,
} from './pdfJsonEditorUtils';
import PdfJsonEditorView from '../components/tools/pdfJsonEditor/PdfJsonEditorView';
// Identifier of this tool's custom workbench view.
// NOTE(review): presumably the id used when registering the view with the tool workflow; confirm further down.
const VIEW_ID = 'pdfJsonEditorView';
// Workbench identifier; same literal as the `workbench` value of the pdfJsonEditor tool catalog entry.
const WORKBENCH_ID = 'custom:pdfJsonEditor' as const;
/**
 * Derives a base file name (without its final extension) for generated downloads.
 *
 * @param name - Candidate file name or title; may be missing.
 * @returns `name` with its last `.ext` suffix removed, or `'document'` when `name` is missing,
 *   blank, or consists of nothing but an extension (e.g. `.json`).
 */
const sanitizeBaseName = (name?: string | null): string => {
  if (!name || name.trim().length === 0) {
    return 'document';
  }
  const withoutExtension = name.replace(/\.[^.]+$/u, '');
  // An extension-only name such as ".json" would otherwise collapse to '' and yield a download
  // called ".json"; fall back to the default base name instead.
  return withoutExtension.trim().length > 0 ? withoutExtension : 'document';
};
const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
const { t } = useTranslation();
const {
registerCustomWorkbenchView,
unregisterCustomWorkbenchView,
setCustomWorkbenchViewData,
clearCustomWorkbenchViewData,
setLeftPanelView,
} = useToolWorkflow();
const { actions: navigationActions } = useNavigationActions();
const navigationState = useNavigationState();
const [loadedDocument, setLoadedDocument] = useState<PdfJsonDocument | null>(null);
const [groupsByPage, setGroupsByPage] = useState<TextGroup[][]>([]);
const [selectedPage, setSelectedPage] = useState(0);
const [fileName, setFileName] = useState('');
const [errorMessage, setErrorMessage] = useState<string | null>(null);
const [isGeneratingPdf, setIsGeneratingPdf] = useState(false);
const dirtyPages = useMemo(() => getDirtyPages(groupsByPage), [groupsByPage]);
const hasChanges = useMemo(() => dirtyPages.some(Boolean), [dirtyPages]);
const hasDocument = loadedDocument !== null;
const viewLabel = useMemo(() => t('pdfJsonEditor.viewLabel', 'JSON Editor'), [t]);
const resetToDocument = useCallback((document: PdfJsonDocument | null) => {
if (!document) {
setGroupsByPage([]);
setSelectedPage(0);
return;
}
const cloned = deepCloneDocument(document);
const groups = groupDocumentText(cloned);
setGroupsByPage(groups);
setSelectedPage(0);
}, []);
const handleLoadFile = useCallback(
async (file: File | null) => {
if (!file) {
return;
}
try {
const content = await file.text();
const parsed = JSON.parse(content) as PdfJsonDocument;
setLoadedDocument(parsed);
resetToDocument(parsed);
setFileName(file.name);
setErrorMessage(null);
} catch (error) {
console.error('Failed to parse JSON', error);
setLoadedDocument(null);
setGroupsByPage([]);
setErrorMessage(
t(
'pdfJsonEditor.errors.invalidJson',
'Unable to read the JSON file. Ensure it was generated by the PDF to JSON tool.'
)
);
}
},
[resetToDocument, t]
);
const handleSelectPage = useCallback((pageIndex: number) => {
setSelectedPage(pageIndex);
}, []);
const handleGroupTextChange = useCallback((pageIndex: number, groupId: string, value: string) => {
setGroupsByPage((previous) =>
previous.map((groups, idx) =>
idx !== pageIndex
? groups
: groups.map((group) => (group.id === groupId ? { ...group, text: value } : group))
)
);
}, []);
const handleResetEdits = useCallback(() => {
if (!loadedDocument) {
return;
}
resetToDocument(loadedDocument);
setErrorMessage(null);
}, [loadedDocument, resetToDocument]);
const buildPayload = useCallback(() => {
if (!loadedDocument) {
return null;
}
const updatedDocument = restoreGlyphElements(loadedDocument, groupsByPage);
const baseName = sanitizeBaseName(fileName || loadedDocument.metadata?.title || undefined);
return {
document: updatedDocument,
filename: `${baseName}.json`,
};
}, [fileName, groupsByPage, loadedDocument]);
/**
 * Serializes the edited document as pretty-printed JSON, triggers a browser download
 * and, when an `onComplete` callback was supplied, hands it the same content as a File.
 */
const handleDownloadJson = useCallback(() => {
  const payload = buildPayload();
  if (payload === null) {
    return;
  }
  const json = JSON.stringify(payload.document, null, 2);
  downloadTextAsFile(json, payload.filename, 'application/json');
  // Optional call: the File is only constructed when a callback exists.
  onComplete?.([new File([json], payload.filename, { type: 'application/json' })]);
}, [buildPayload, onComplete]);
/**
 * Sends the edited JSON to the JSON-to-PDF endpoint, downloads the resulting PDF and
 * reports it through `onComplete`. Failures are surfaced through the error message
 * state and `onError`.
 *
 * The request asks for a `blob` response, so on failure the server's body also
 * arrives as a Blob. It is read as text before being shown; stringifying it directly
 * would display "[object Blob]".
 */
const handleGeneratePdf = useCallback(async () => {
  const payload = buildPayload();
  if (!payload) {
    return;
  }
  const { document, filename } = payload;
  const serialized = JSON.stringify(document, null, 2);
  const jsonFile = new File([serialized], filename, { type: 'application/json' });
  const formData = new FormData();
  formData.append('fileInput', jsonFile);
  try {
    setIsGeneratingPdf(true);
    const response = await apiClient.post(CONVERSION_ENDPOINTS['json-pdf'], formData, {
      responseType: 'blob',
    });
    const contentDisposition = response.headers?.['content-disposition'] ?? '';
    const detectedName = getFilenameFromHeaders(contentDisposition);
    const baseName = sanitizeBaseName(filename).replace(/-edited$/u, '');
    // Prefer the server-provided name; otherwise derive one from the JSON file name.
    const downloadName = detectedName || `${baseName || 'document'}.pdf`;
    downloadBlob(response.data, downloadName);
    if (onComplete) {
      const pdfFile = new File([response.data], downloadName, { type: 'application/pdf' });
      onComplete([pdfFile]);
    }
    setErrorMessage(null);
  } catch (error: unknown) {
    console.error('Failed to convert JSON back to PDF', error);
    // NOTE(review): assumes apiClient rejects with an axios-style error
    // ({ message, response: { data } }) — confirm against the client wrapper.
    const details = (typeof error === 'object' && error !== null ? error : {}) as {
      message?: unknown;
      response?: { data?: unknown };
    };
    let serverMessage: unknown = details.response?.data;
    if (typeof Blob !== 'undefined' && serverMessage instanceof Blob) {
      try {
        serverMessage = await serverMessage.text();
      } catch {
        // Unreadable body: fall through to the generic messages below.
        serverMessage = undefined;
      }
    }
    // Only non-empty strings are usable; objects would render as "[object Object]".
    const usableMessage = [serverMessage, details.message].find(
      (candidate): candidate is string =>
        typeof candidate === 'string' && candidate.trim().length > 0
    );
    const msgString =
      usableMessage ??
      String(
        t('pdfJsonEditor.errors.pdfConversion', 'Unable to convert the edited JSON back into a PDF.')
      );
    setErrorMessage(msgString);
    if (onError) {
      onError(msgString);
    }
  } finally {
    setIsGeneratingPdf(false);
  }
}, [buildPayload, onComplete, onError, t]);
/**
 * Snapshot of editor state and callbacks handed to the custom workbench view.
 * Memoized so its identity changes only when one of its fields changes.
 */
const viewData = useMemo<PdfJsonEditorViewData>(() => {
  const snapshot: PdfJsonEditorViewData = {
    document: loadedDocument,
    groupsByPage,
    selectedPage,
    dirtyPages,
    hasDocument,
    fileName,
    errorMessage,
    isGeneratingPdf,
    hasChanges,
    onLoadJson: handleLoadFile,
    onSelectPage: handleSelectPage,
    onGroupEdit: handleGroupTextChange,
    onReset: handleResetEdits,
    onDownloadJson: handleDownloadJson,
    onGeneratePdf: handleGeneratePdf,
  };
  return snapshot;
}, [
  // Listed in the same order as the fields above.
  loadedDocument,
  groupsByPage,
  selectedPage,
  dirtyPages,
  hasDocument,
  fileName,
  errorMessage,
  isGeneratingPdf,
  hasChanges,
  handleLoadFile,
  handleSelectPage,
  handleGroupTextChange,
  handleResetEdits,
  handleDownloadJson,
  handleGeneratePdf,
]);
// Mirror the most recent view data into a ref on every render so the registration
// effect below can seed the view without listing `viewData` as a dependency.
const latestViewDataRef = useRef<PdfJsonEditorViewData>(viewData);
latestViewDataRef.current = viewData;
// Registers the custom workbench view, hides the left panel and seeds the view with
// the latest data. The cleanup clears the data, unregisters the view and restores
// the tool picker panel.
useEffect(() => {
registerCustomWorkbenchView({
id: VIEW_ID,
workbenchId: WORKBENCH_ID,
label: viewLabel,
icon: <DescriptionIcon fontSize="small" />,
component: PdfJsonEditorView,
});
setLeftPanelView('hidden');
// Read through the ref: `viewData` is deliberately absent from the dependency list.
setCustomWorkbenchViewData(VIEW_ID, latestViewDataRef.current);
return () => {
clearCustomWorkbenchViewData(VIEW_ID);
unregisterCustomWorkbenchView(VIEW_ID);
setLeftPanelView('toolPicker');
};
}, [
clearCustomWorkbenchViewData,
registerCustomWorkbenchView,
setCustomWorkbenchViewData,
setLeftPanelView,
viewLabel,
unregisterCustomWorkbenchView,
]);
// While this tool is selected, switch navigation to the editor's workbench if it is not already active.
useEffect(() => {
  const editorIsSelected = navigationState.selectedTool === 'pdfJsonEditor';
  const alreadyOnWorkbench = navigationState.workbench === WORKBENCH_ID;
  if (!editorIsSelected || alreadyOnWorkbench) {
    return;
  }
  navigationActions.setWorkbench(WORKBENCH_ID);
}, [navigationActions, navigationState.selectedTool, navigationState.workbench]);
// Remembers the last snapshot pushed to the workbench so an identical one is never sent twice.
const lastSentViewDataRef = useRef<PdfJsonEditorViewData | null>(null);
useEffect(() => {
  const alreadyPublished = lastSentViewDataRef.current === viewData;
  if (!alreadyPublished) {
    lastSentViewDataRef.current = viewData;
    setCustomWorkbenchViewData(VIEW_ID, viewData);
  }
}, [setCustomWorkbenchViewData, viewData]);
// All editing happens in the custom workbench view.
return null;
};
// Attach the static members required by ToolComponent and export the component.
const editorAsToolComponent = PdfJsonEditor as ToolComponent;
// The editor is interactive only: asking it for an automation operation is an error.
editorAsToolComponent.tool = () => {
  throw new Error('PDF JSON Editor does not support automation operations.');
};
editorAsToolComponent.getDefaultParameters = () => ({
  groups: [],
});
export default editorAsToolComponent;

View File

@ -0,0 +1,110 @@
/**
 * CID system information attached to a font (see `PdfJsonFont.cidSystemInfo`).
 * Every field is optional and may be `null`.
 */
export interface PdfJsonFontCidSystemInfo {
registry?: string | null;
ordering?: string | null;
supplement?: number | null;
}
/**
 * Description of one font carried in the PDF JSON document (`PdfJsonDocument.fonts`).
 * All fields are optional; most may also be `null`.
 */
export interface PdfJsonFont {
// Presumably the identifier that `PdfJsonTextElement.fontId` refers to — confirm against the backend.
id?: string;
pageNumber?: number | null;
uid?: string | null;
baseName?: string | null;
subtype?: string | null;
encoding?: string | null;
cidSystemInfo?: PdfJsonFontCidSystemInfo | null;
embedded?: boolean | null;
// NOTE(review): `program` and `toUnicode` look like encoded binary payloads; the encoding is not visible here.
program?: string | null;
programFormat?: string | null;
toUnicode?: string | null;
standard14Name?: string | null;
fontDescriptorFlags?: number | null;
}
/**
 * One piece of positioned text on a page.
 *
 * The grouping utilities read its position from `textMatrix` when that array has
 * exactly six entries (index 4 as the x position, index 5 as the baseline) and fall
 * back to `x` / `y` otherwise. A missing `fontSize` is treated as 12.
 */
export interface PdfJsonTextElement {
text?: string | null;
fontId?: string | null;
fontSize?: number | null;
fontMatrixSize?: number | null;
fontSizeInPt?: number | null;
renderingMode?: number | null;
x?: number | null;
y?: number | null;
// A missing or zero width/height is replaced by an estimate derived from the font size.
width?: number | null;
height?: number | null;
textMatrix?: number[] | null;
}
/**
 * A stream carried in the JSON document (used for `PdfJsonPage.contentStreams`):
 * a free-form dictionary plus the stream data as a string.
 */
export interface PdfJsonStream {
dictionary?: Record<string, unknown> | null;
// NOTE(review): the string encoding of `rawData` is not visible from here — confirm with the backend.
rawData?: string | null;
}
/**
 * One page of the PDF JSON document. When `width` or `height` is missing,
 * `pageDimensions` substitutes `DEFAULT_PAGE_WIDTH` / `DEFAULT_PAGE_HEIGHT`.
 */
export interface PdfJsonPage {
pageNumber?: number | null;
width?: number | null;
height?: number | null;
rotation?: number | null;
// The text the editor groups into editable runs.
textElements?: PdfJsonTextElement[] | null;
// Opaque to the editor: typed as `unknown` and passed through untouched.
resources?: unknown;
contentStreams?: PdfJsonStream[] | null;
}
/**
 * Document-level metadata. All fields are optional and nullable.
 * The editor falls back to `title` for the export file name when no file name is known.
 */
export interface PdfJsonMetadata {
title?: string | null;
author?: string | null;
subject?: string | null;
keywords?: string | null;
creator?: string | null;
producer?: string | null;
// Dates are carried as strings; their format is not established by this module.
creationDate?: string | null;
modificationDate?: string | null;
trapped?: string | null;
numberOfPages?: number | null;
}
/**
 * Root of the editable JSON representation of a PDF: metadata, XMP metadata,
 * fonts and pages. Every section is optional and may be `null`.
 */
export interface PdfJsonDocument {
metadata?: PdfJsonMetadata | null;
xmpMetadata?: string | null;
fonts?: PdfJsonFont[] | null;
pages?: PdfJsonPage[] | null;
}
/**
 * Axis-aligned rectangle around text. As produced by the grouping utilities,
 * `bottom` is the element baseline and `top` is `bottom - height`, so `top <= bottom`.
 */
export interface BoundingBox {
left: number;
right: number;
top: number;
bottom: number;
}
/**
 * A run of adjacent text elements on one line that is edited as a single string.
 * A group is considered edited when `text` differs from `originalText`.
 */
export interface TextGroup {
// Built as `${pageIndex}-${n}`, where n counts groups within the page.
id: string;
pageIndex: number;
// Font id and size are taken from the first element of the group.
fontId?: string | null;
fontSize?: number | null;
// `elements` and `originalElements` are independent copies of the same source elements.
elements: PdfJsonTextElement[];
originalElements: PdfJsonTextElement[];
text: string;
originalText: string;
bounds: BoundingBox;
}
// Fallback page size used when a page does not declare its own width/height.
// Presumably US Letter (8.5in x 11in) expressed in PDF points — confirm.
export const DEFAULT_PAGE_WIDTH = 612;
export const DEFAULT_PAGE_HEIGHT = 792;
/**
 * Everything the custom workbench view needs to render the editor: the current
 * state plus the callbacks that mutate it.
 */
export interface PdfJsonEditorViewData {
// The document as loaded, or `null` when nothing is loaded.
document: PdfJsonDocument | null;
// Editable text groups, indexed by page.
groupsByPage: TextGroup[][];
selectedPage: number;
// One flag per page (see `getDirtyPages`).
dirtyPages: boolean[];
hasDocument: boolean;
fileName: string;
errorMessage: string | null;
isGeneratingPdf: boolean;
hasChanges: boolean;
onLoadJson: (file: File | null) => Promise<void> | void;
onSelectPage: (pageIndex: number) => void;
onGroupEdit: (pageIndex: number, groupId: string, value: string) => void;
onReset: () => void;
onDownloadJson: () => void;
onGeneratePdf: () => void;
}

View File

@ -0,0 +1,344 @@
import {
BoundingBox,
PdfJsonDocument,
PdfJsonPage,
PdfJsonTextElement,
TextGroup,
DEFAULT_PAGE_HEIGHT,
DEFAULT_PAGE_WIDTH,
} from './pdfJsonEditorTypes';
// Minimum baseline difference within which two elements count as the same line
// (the effective tolerance also scales with font size).
const LINE_TOLERANCE = 2;
// Fraction of the average font size that a horizontal gap must exceed to count as a space/split.
const GAP_FACTOR = 0.6;
// Lower bound for that gap threshold, whatever the font size.
const SPACE_MIN_GAP = 1.5;
/**
 * Returns `value` unless it is `null`, `undefined` or `NaN`, in which case `fallback` is returned.
 *
 * @param value Number that may be missing.
 * @param fallback Replacement for a missing value (defaults to 0).
 */
export const valueOr = (value: number | null | undefined, fallback = 0): number =>
  value == null || Number.isNaN(value) ? fallback : value;
/**
 * Shallow-copies a text element and gives the copy its own `textMatrix` array.
 * A missing or `null` matrix becomes `undefined` on the copy.
 */
export const cloneTextElement = (element: PdfJsonTextElement): PdfJsonTextElement => {
  const sourceMatrix = element.textMatrix;
  const matrixCopy = sourceMatrix ? [...sourceMatrix] : undefined;
  return { ...element, textMatrix: matrixCopy };
};
/** Baseline of an element: entry 5 of a six-entry text matrix when present, otherwise `y`. */
const getBaseline = (element: PdfJsonTextElement): number => {
  const matrix = element.textMatrix;
  return valueOr(matrix && matrix.length === 6 ? matrix[5] : element.y);
};
/** Horizontal position of an element: entry 4 of a six-entry text matrix when present, otherwise `x`. */
const getX = (element: PdfJsonTextElement): number => {
  const matrix = element.textMatrix;
  return valueOr(matrix && matrix.length === 6 ? matrix[4] : element.x);
};
/**
 * Width of an element. When no usable width is recorded but the element has text,
 * the width is estimated from the font size and the text length.
 */
const getWidth = (element: PdfJsonTextElement): number => {
  const declared = valueOr(element.width, 0);
  const content = element.text;
  if (declared !== 0 || !content) {
    return declared;
  }
  // Rough estimate: 0.45 of the font size per character, never less than half a font size.
  return valueOr(element.fontSize, 12) * Math.max(content.length * 0.45, 0.5);
};
/** Font size of an element, defaulting to 12 when it is missing. */
const getFontSize = (element: PdfJsonTextElement): number => {
  return valueOr(element.fontSize, 12);
};
/** Height of an element; a missing or zero height is replaced by 1.05 times the font size. */
const getHeight = (element: PdfJsonTextElement): number => {
  const declared = valueOr(element.height);
  return declared === 0 ? getFontSize(element) * 1.05 : declared;
};
/**
 * Bounding box of a single element: it starts at the element's x position, spans
 * its width, and extends from the baseline (`bottom`) by the element height (`top`).
 */
const getElementBounds = (element: PdfJsonTextElement): BoundingBox => {
  const startX = getX(element);
  const baseline = getBaseline(element);
  return {
    left: startX,
    right: startX + getWidth(element),
    top: baseline - getHeight(element),
    bottom: baseline,
  };
};
/**
 * Smallest box that contains every box in `bounds`.
 * An empty list yields an all-zero box.
 */
const mergeBounds = (bounds: BoundingBox[]): BoundingBox => {
  if (bounds.length === 0) {
    return { left: 0, right: 0, top: 0, bottom: 0 };
  }
  // Seed with the first box, then widen it with every box in the list.
  let { left, right, top, bottom } = bounds[0];
  for (const box of bounds) {
    left = Math.min(left, box.left);
    right = Math.max(right, box.right);
    top = Math.min(top, box.top);
    bottom = Math.max(bottom, box.bottom);
  }
  return { left, right, top, bottom };
};
/**
 * Decides whether the horizontal gap between two consecutive elements is wide
 * enough to be read as a space. The threshold is `GAP_FACTOR` times the average
 * font size, but never below `SPACE_MIN_GAP`.
 */
const shouldInsertSpace = (prev: PdfJsonTextElement, current: PdfJsonTextElement): boolean => {
  const gapWidth = getX(current) - (getX(prev) + getWidth(prev));
  const meanFontSize = (getFontSize(prev) + getFontSize(current)) / 2;
  return gapWidth > Math.max(SPACE_MIN_GAP, meanFontSize * GAP_FACTOR);
};
/**
 * Joins the text of consecutive elements into one string, adding a single space
 * wherever the gap to the previous element warrants one and the next piece does
 * not already start with whitespace.
 */
const buildGroupText = (elements: PdfJsonTextElement[]): string =>
  elements.reduce((joined, element, index) => {
    const piece = element.text ?? '';
    if (index === 0) {
      return joined + piece;
    }
    const needsSeparator =
      shouldInsertSpace(elements[index - 1], element) && !/^\s/u.test(piece);
    return joined + (needsSeparator ? ' ' : '') + piece;
  }, '');
/**
 * Builds a TextGroup from a run of elements.
 *
 * @param pageIndex Index of the page the elements belong to.
 * @param idSuffix Counter that makes the id unique within the page.
 * @param elements The elements forming the group, in reading order.
 */
const createGroup = (
  pageIndex: number,
  idSuffix: number,
  elements: PdfJsonTextElement[],
): TextGroup => {
  const [lead] = elements;
  // The joined text is the same for `text` and `originalText`, so compute it once.
  const joinedText = buildGroupText(elements);
  const workingCopies = elements.map(cloneTextElement);
  return {
    id: `${pageIndex}-${idSuffix}`,
    pageIndex,
    fontId: lead?.fontId,
    fontSize: lead?.fontSize,
    elements: workingCopies,
    // A second, independent set of copies kept as the pristine reference.
    originalElements: workingCopies.map(cloneTextElement),
    text: joinedText,
    originalText: joinedText,
    bounds: mergeBounds(elements.map(getElementBounds)),
  };
};
/**
 * Splits a page's text elements into editable groups.
 *
 * Elements without text are dropped. The rest are clustered into lines by baseline,
 * each line is ordered left to right, and a line is cut into groups wherever the
 * horizontal gap between neighbours exceeds a font-size-dependent threshold.
 *
 * @param page The page to group; a missing page or one without text yields `[]`.
 * @param pageIndex Index recorded on every group and used in group ids.
 */
export const groupPageTextElements = (page: PdfJsonPage | null | undefined, pageIndex: number): TextGroup[] => {
  const rawElements = page?.textElements;
  if (!rawElements || rawElements.length === 0) {
    return [];
  }

  const withText = rawElements
    .map(cloneTextElement)
    .filter((candidate) => candidate.text !== null && candidate.text !== undefined);
  // Highest baseline first.
  withText.sort((a, b) => getBaseline(b) - getBaseline(a));

  // Cluster into lines: an element joins the first line whose baseline is within tolerance.
  const lines: { baseline: number; elements: PdfJsonTextElement[] }[] = [];
  for (const element of withText) {
    const baseline = getBaseline(element);
    const tolerance = Math.max(LINE_TOLERANCE, getFontSize(element) * 0.12);
    const matchingLine = lines.find((line) => Math.abs(line.baseline - baseline) <= tolerance);
    if (matchingLine) {
      matchingLine.elements.push(element);
    } else {
      lines.push({ baseline, elements: [element] });
    }
  }

  const groups: TextGroup[] = [];
  for (const line of lines) {
    line.elements.sort((a, b) => getX(a) - getX(b));

    let bucket: PdfJsonTextElement[] = [];
    // The id counter always equals the number of groups emitted so far.
    const flushBucket = (): void => {
      if (bucket.length > 0) {
        groups.push(createGroup(pageIndex, groups.length, bucket));
        bucket = [];
      }
    };

    for (const element of line.elements) {
      const previous = bucket[bucket.length - 1];
      if (previous !== undefined) {
        const gap = getX(element) - (getX(previous) + getWidth(previous));
        const avgFontSize = (getFontSize(previous) + getFontSize(element)) / 2;
        const splitThreshold = Math.max(SPACE_MIN_GAP, avgFontSize * GAP_FACTOR);
        // Same-font neighbours tolerate a wider gap before being split apart.
        const fontFactor = previous.fontId === element.fontId ? 1.4 : 1.0;
        if (gap > splitThreshold * fontFactor) {
          flushBucket();
        }
      }
      bucket.push(element);
    }
    flushBucket();
  }
  return groups;
};
/** Groups the text of every page; the result is indexed by page. A missing document or page list yields `[]`. */
export const groupDocumentText = (document: PdfJsonDocument | null | undefined): TextGroup[][] => {
  const pages = document?.pages;
  if (pages == null) {
    return [];
  }
  return pages.map((page, pageIndex) => groupPageTextElements(page, pageIndex));
};
/**
 * Returns a deep copy of the document. Uses `structuredClone` when the runtime
 * provides it and falls back to a JSON round-trip otherwise.
 */
export const deepCloneDocument = (document: PdfJsonDocument): PdfJsonDocument => {
  const canStructuredClone = typeof structuredClone === 'function';
  return canStructuredClone ? structuredClone(document) : JSON.parse(JSON.stringify(document));
};
/** Width and height of a page, using the default page size for any missing value. */
export const pageDimensions = (page: PdfJsonPage | null | undefined): { width: number; height: number } => {
  const width = valueOr(page?.width, DEFAULT_PAGE_WIDTH);
  const height = valueOr(page?.height, DEFAULT_PAGE_HEIGHT);
  return { width, height };
};
/**
 * Collapses a group into a single element: a copy of the group's first original
 * element carrying the group's current text.
 */
export const createMergedElement = (group: TextGroup): PdfJsonTextElement => {
  const [firstOriginal] = group.originalElements;
  const merged = cloneTextElement(firstOriginal);
  merged.text = group.text;
  const sourceMatrix = firstOriginal.textMatrix;
  const hasFullMatrix = Boolean(sourceMatrix) && sourceMatrix?.length === 6;
  if (hasFullMatrix && sourceMatrix) {
    // Give the merged element its own copy of the six-entry matrix.
    merged.textMatrix = sourceMatrix.slice();
  }
  return merged;
};
/**
 * Writes an edited group string back onto the group's elements, in place.
 *
 * Each element except the last receives as many characters as it originally held
 * (at least one); the last element receives whatever remains. Elements beyond the
 * end of a shortened text end up with an empty string.
 *
 * The group string shown to the user contains spaces that `buildGroupText` added
 * between widely separated elements. Those spaces belong to no element, so they are
 * skipped at the same boundaries here. Without that, every synthetic space would
 * shift the following characters into the wrong element.
 *
 * @param text The edited text of the whole group; `undefined` is treated as empty.
 * @param elements The elements to update, in the same order used to build the group text.
 */
const distributeTextAcrossElements = (text: string | undefined, elements: PdfJsonTextElement[]): void => {
  if (elements.length === 0) {
    return;
  }
  // Split by code point so surrogate pairs are never cut in half.
  const targetChars = Array.from(text ?? '');

  // Derive everything that depends on the ORIGINAL texts before mutating anything:
  // the width estimate behind shouldInsertSpace can depend on an element's text.
  const plan = elements.map((element, index) => {
    const originalText = element.text ?? '';
    const previous = index > 0 ? elements[index - 1] : undefined;
    // Mirrors buildGroupText: a space was added only when the gap warranted it and
    // the element did not already start with whitespace.
    const hadSyntheticSpace =
      previous !== undefined && !/^\s/u.test(originalText) && shouldInsertSpace(previous, element);
    return {
      // An originally empty element still takes one character.
      length: Math.max(Array.from(originalText).length, 1),
      hadSyntheticSpace,
    };
  });

  const lastIndex = elements.length - 1;
  let cursor = 0;
  elements.forEach((element, index) => {
    const { length, hadSyntheticSpace } = plan[index];
    // Drop the synthetic separator if the user left it in place.
    if (hadSyntheticSpace && targetChars[cursor] === ' ') {
      cursor += 1;
    }
    const end =
      index === lastIndex ? targetChars.length : Math.min(cursor + length, targetChars.length);
    element.text = targetChars.slice(cursor, end).join('');
    cursor = end;
  });
};
/**
 * Returns a copy of `source` in which every page that has groups gets its text
 * elements rebuilt from those groups: unedited groups contribute copies of their
 * original elements, edited groups contribute one merged element with the new text.
 *
 * Pages without groups are returned as cloned, otherwise untouched. `source` is not mutated.
 *
 * @param source The document as loaded.
 * @param groupsByPage Editable groups, indexed by page.
 */
export const buildUpdatedDocument = (
  source: PdfJsonDocument,
  groupsByPage: TextGroup[][],
): PdfJsonDocument => {
  const updated = deepCloneDocument(source);
  const pages = updated.pages ?? [];
  updated.pages = pages.map((page, pageIndex) => {
    const groups = groupsByPage[pageIndex] ?? [];
    if (!groups.length) {
      return page;
    }
    const updatedElements: PdfJsonTextElement[] = groups.flatMap((group) =>
      group.text === group.originalText
        ? group.originalElements.map(cloneTextElement)
        : [createMergedElement(group)]
    );
    return {
      ...page,
      textElements: updatedElements,
      contentStreams: page.contentStreams ?? [],
    };
  });
  return updated;
};
/**
 * Returns a copy of `source` in which every page that has groups gets its text
 * elements rebuilt from those groups, keeping one output element per original
 * element. For edited groups the new text is spread across the original elements;
 * unedited groups are copied as they were.
 *
 * Pages without groups are returned as cloned, otherwise untouched. `source` is not mutated.
 *
 * @param source The document as loaded.
 * @param groupsByPage Editable groups, indexed by page.
 */
export const restoreGlyphElements = (
  source: PdfJsonDocument,
  groupsByPage: TextGroup[][],
): PdfJsonDocument => {
  const updated = deepCloneDocument(source);
  const pages = updated.pages ?? [];
  updated.pages = pages.map((page, pageIndex) => {
    const groups = groupsByPage[pageIndex] ?? [];
    if (!groups.length) {
      return page;
    }
    const rebuiltElements: PdfJsonTextElement[] = groups.flatMap((group) => {
      // Work on copies so the group's pristine reference elements stay intact.
      const originals = group.originalElements.map(cloneTextElement);
      if (group.text !== group.originalText) {
        distributeTextAcrossElements(group.text, originals);
      }
      return originals;
    });
    return {
      ...page,
      textElements: rebuiltElements,
      contentStreams: page.contentStreams ?? [],
    };
  });
  return updated;
};
/** For each page, reports whether at least one of its groups has text that differs from the original. */
export const getDirtyPages = (groupsByPage: TextGroup[][]): boolean[] => {
  const isEdited = (group: TextGroup): boolean => group.text !== group.originalText;
  return groupsByPage.map((pageGroups) => pageGroups.some((group) => isEdited(group)));
};

View File

@ -46,6 +46,7 @@ export const REGULAR_TOOL_IDS = [
'validateSignature', 'validateSignature',
'replaceColor', 'replaceColor',
'showJS', 'showJS',
'pdfJsonEditor',
'bookletImposition', 'bookletImposition',
] as const; ] as const;
@ -92,4 +93,3 @@ type Disjoint<A, B> = [A & B] extends [never] ? true : false;
type _Check1 = Assert<Disjoint<RegularToolId, SuperToolId>>; type _Check1 = Assert<Disjoint<RegularToolId, SuperToolId>>;
type _Check2 = Assert<Disjoint<RegularToolId, LinkToolId>>; type _Check2 = Assert<Disjoint<RegularToolId, LinkToolId>>;
type _Check3 = Assert<Disjoint<SuperToolId, LinkToolId>>; type _Check3 = Assert<Disjoint<SuperToolId, LinkToolId>>;