paragraphs? :o

This commit is contained in:
Anthony Stirling 2025-11-10 22:55:16 +00:00
parent 5fadb92f51
commit 3ed62c8dbf
7 changed files with 443 additions and 93 deletions

View File

@ -1,5 +1,6 @@
multipart.enabled=true multipart.enabled=true
logging.level.org.springframework=WARN logging.level.org.springframework=WARN
logging.level.org.springframework.security=WARN
logging.level.org.hibernate=WARN logging.level.org.hibernate=WARN
logging.level.org.eclipse.jetty=WARN logging.level.org.eclipse.jetty=WARN
#logging.level.org.springframework.security.oauth2=DEBUG #logging.level.org.springframework.security.oauth2=DEBUG

View File

@ -4498,8 +4498,17 @@
"forceSingleElement": { "forceSingleElement": {
"title": "Lock edited text to a single PDF element", "title": "Lock edited text to a single PDF element",
"description": "When enabled, the editor exports each edited text box as one PDF text element to avoid overlapping glyphs or mixed fonts." "description": "When enabled, the editor exports each edited text box as one PDF text element to avoid overlapping glyphs or mixed fonts."
},
"textGroupingMode": {
"title": "Text grouping mode",
"description": "Paragraph mode merges aligned lines into one text box; single-line mode keeps every PDF line separate. Auto picks the best option per page."
} }
}, },
"grouping": {
"auto": "Auto",
"paragraph": "Paragraph",
"single": "Single Line"
},
"disclaimer": { "disclaimer": {
"heading": "Preview limitations", "heading": "Preview limitations",
"textFocus": "This workspace focuses on editing text and repositioning embedded images. Complex page artwork, form widgets, and layered graphics are preserved for export but are not fully editable here.", "textFocus": "This workspace focuses on editing text and repositioning embedded images. Complex page artwork, form widgets, and layered graphics are preserved for export but are not fully editable here.",

View File

@ -36,7 +36,7 @@ function persistRedirectPath(path: string): void {
try { try {
document.cookie = `${OAUTH_REDIRECT_COOKIE}=${encodeURIComponent(path)}; path=/; max-age=${OAUTH_REDIRECT_COOKIE_MAX_AGE}; SameSite=Lax`; document.cookie = `${OAUTH_REDIRECT_COOKIE}=${encodeURIComponent(path)}; path=/; max-age=${OAUTH_REDIRECT_COOKIE_MAX_AGE}; SameSite=Lax`;
} catch (error) { } catch (error) {
console.warn('[SpringAuth] Failed to persist OAuth redirect path', error); // console.warn('[SpringAuth] Failed to persist OAuth redirect path', error);
} }
} }
@ -113,21 +113,21 @@ class SpringAuthClient {
const token = localStorage.getItem('stirling_jwt'); const token = localStorage.getItem('stirling_jwt');
if (!token) { if (!token) {
console.debug('[SpringAuth] getSession: No JWT in localStorage'); // console.debug('[SpringAuth] getSession: No JWT in localStorage');
return { data: { session: null }, error: null }; return { data: { session: null }, error: null };
} }
// Verify with backend // Verify with backend
console.debug('[SpringAuth] getSession: Verifying JWT with /api/v1/auth/me'); // console.debug('[SpringAuth] getSession: Verifying JWT with /api/v1/auth/me');
const response = await fetch('/api/v1/auth/me', { const response = await fetch('/api/v1/auth/me', {
headers: { headers: {
'Authorization': `Bearer ${token}`, 'Authorization': `Bearer ${token}`,
}, },
}); });
console.debug('[SpringAuth] /me response status:', response.status); // console.debug('[SpringAuth] /me response status:', response.status);
const contentType = response.headers.get('content-type'); const contentType = response.headers.get('content-type');
console.debug('[SpringAuth] /me content-type:', contentType); // console.debug('[SpringAuth] /me content-type:', contentType);
if (!response.ok) { if (!response.ok) {
// Log the error response for debugging // Log the error response for debugging
@ -140,7 +140,7 @@ class SpringAuthClient {
// Token invalid or expired - clear it // Token invalid or expired - clear it
localStorage.removeItem('stirling_jwt'); localStorage.removeItem('stirling_jwt');
console.warn('[SpringAuth] getSession: Cleared invalid JWT from localStorage'); // console.warn('[SpringAuth] getSession: Cleared invalid JWT from localStorage');
return { data: { session: null }, error: { message: `Auth failed: ${response.status}` } }; return { data: { session: null }, error: { message: `Auth failed: ${response.status}` } };
} }
@ -155,7 +155,7 @@ class SpringAuthClient {
} }
const data = await response.json(); const data = await response.json();
console.debug('[SpringAuth] /me response data:', data); // console.debug('[SpringAuth] /me response data:', data);
// Create session object // Create session object
const session: Session = { const session: Session = {
@ -165,7 +165,7 @@ class SpringAuthClient {
expires_at: Date.now() + 3600 * 1000, expires_at: Date.now() + 3600 * 1000,
}; };
console.debug('[SpringAuth] getSession: Session retrieved successfully'); // console.debug('[SpringAuth] getSession: Session retrieved successfully');
return { data: { session }, error: null }; return { data: { session }, error: null };
} catch (error) { } catch (error) {
console.error('[SpringAuth] getSession error:', error); console.error('[SpringAuth] getSession error:', error);
@ -206,7 +206,7 @@ class SpringAuthClient {
// Store JWT in localStorage // Store JWT in localStorage
localStorage.setItem('stirling_jwt', token); localStorage.setItem('stirling_jwt', token);
console.log('[SpringAuth] JWT stored in localStorage'); // console.log('[SpringAuth] JWT stored in localStorage');
// Dispatch custom event for other components to react to JWT availability // Dispatch custom event for other components to react to JWT availability
window.dispatchEvent(new CustomEvent('jwt-available')); window.dispatchEvent(new CustomEvent('jwt-available'));
@ -285,7 +285,7 @@ class SpringAuthClient {
// Redirect to Spring OAuth2 endpoint (Vite will proxy to backend) // Redirect to Spring OAuth2 endpoint (Vite will proxy to backend)
const redirectUrl = `/oauth2/authorization/${params.provider}`; const redirectUrl = `/oauth2/authorization/${params.provider}`;
console.log('[SpringAuth] Redirecting to OAuth:', redirectUrl); // console.log('[SpringAuth] Redirecting to OAuth:', redirectUrl);
// Use window.location.assign for full page navigation // Use window.location.assign for full page navigation
window.location.assign(redirectUrl); window.location.assign(redirectUrl);
return { error: null }; return { error: null };
@ -303,7 +303,7 @@ class SpringAuthClient {
try { try {
// Clear JWT from localStorage immediately // Clear JWT from localStorage immediately
localStorage.removeItem('stirling_jwt'); localStorage.removeItem('stirling_jwt');
console.log('[SpringAuth] JWT removed from localStorage'); // console.log('[SpringAuth] JWT removed from localStorage');
const csrfToken = this.getCsrfToken(); const csrfToken = this.getCsrfToken();
const headers: HeadersInit = {}; const headers: HeadersInit = {};
@ -446,7 +446,7 @@ class SpringAuthClient {
// Refresh if token expires soon // Refresh if token expires soon
if (timeUntilExpiry > 0 && timeUntilExpiry < this.TOKEN_REFRESH_THRESHOLD) { if (timeUntilExpiry > 0 && timeUntilExpiry < this.TOKEN_REFRESH_THRESHOLD) {
console.log('[SpringAuth] Proactively refreshing token'); // console.log('[SpringAuth] Proactively refreshing token');
await this.refreshSession(); await this.refreshSession();
} }
} }

View File

@ -245,6 +245,26 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
onForceSingleTextElementChange, onForceSingleTextElementChange,
} = data; } = data;
// Reports the editor element's current text for a group and re-applies the
// recorded caret offset on the next animation frame.
const syncEditorValue = useCallback(
  (element: HTMLElement, pageIndex: number, groupId: string) => {
    // Non-breaking spaces are swapped for regular spaces before the text is reported.
    const plainText = element.innerText.replace(/\u00A0/g, ' ');
    caretOffsetsRef.current.set(groupId, getCaretOffset(element));
    onGroupEdit(pageIndex, groupId, plainText);

    const restoreCaret = () => {
      // Only touch the caret when this group is the one recorded as being edited.
      const activeEditor = editingGroupId === groupId ? editorRefs.current.get(groupId) : undefined;
      if (!activeEditor) {
        return;
      }
      // Fall back to the end of the text when no offset was recorded.
      const caret = caretOffsetsRef.current.get(groupId) ?? activeEditor.innerText.length;
      setCaretOffset(activeEditor, caret);
    };
    requestAnimationFrame(restoreCaret);
  },
  [editingGroupId, onGroupEdit],
);
const resolveFont = (fontId: string | null | undefined, pageIndex: number | null | undefined): PdfJsonFont | null => { const resolveFont = (fontId: string | null | undefined, pageIndex: number | null | undefined): PdfJsonFont | null => {
if (!fontId || !pdfDocument?.fonts) { if (!fontId || !pdfDocument?.fonts) {
return null; return null;
@ -646,7 +666,14 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
// Measure text widths once per page/configuration and apply static scaling // Measure text widths once per page/configuration and apply static scaling
useLayoutEffect(() => { useLayoutEffect(() => {
if (!autoScaleText || visibleGroups.length === 0) { if (!autoScaleText) {
// Clear all scales when auto-scale is disabled
setTextScales(new Map());
measurementKeyRef.current = '';
return;
}
if (visibleGroups.length === 0) {
return; return;
} }
@ -667,6 +694,13 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
return; return;
} }
// Skip multi-line paragraphs - auto-scaling doesn't work well with wrapped text
const lineCount = (group.text || '').split('\n').length;
if (lineCount > 1) {
newScales.set(group.id, 1);
return;
}
const element = document.querySelector<HTMLElement>(`[data-text-group="${group.id}"]`); const element = document.querySelector<HTMLElement>(`[data-text-group="${group.id}"]`);
if (!element) { if (!element) {
return; return;
@ -705,7 +739,16 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
// Delay measurement to ensure fonts and layout are ready // Delay measurement to ensure fonts and layout are ready
const timer = setTimeout(measureTextScales, 150); const timer = setTimeout(measureTextScales, 150);
return () => clearTimeout(timer); return () => clearTimeout(timer);
}, [autoScaleText, visibleGroups, editingGroupId, currentPage, pageHeight, scale, fontFamilies.size, selectedPage]); }, [
autoScaleText,
visibleGroups,
editingGroupId,
currentPage,
pageHeight,
scale,
fontFamilies.size,
selectedPage,
]);
useLayoutEffect(() => { useLayoutEffect(() => {
// Only restore caret position during re-renders while already editing // Only restore caret position during re-renders while already editing
@ -792,7 +835,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
}} }}
> >
{content} {content}
{activeGroupId === groupId && editingGroupId !== groupId && ( {activeGroupId === groupId && (
<ActionIcon <ActionIcon
size="xs" size="xs"
variant="filled" variant="filled"
@ -956,6 +999,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
onChange={(event) => onForceSingleTextElementChange(event.currentTarget.checked)} onChange={(event) => onForceSingleTextElementChange(event.currentTarget.checked)}
/> />
</Group> </Group>
</Stack> </Stack>
</Card> </Card>
@ -1325,11 +1369,24 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
if (fontSizePx > 0) { if (fontSizePx > 0) {
lineHeightRatio = Math.max(lineHeightRatio, lineHeightPx / fontSizePx); lineHeightRatio = Math.max(lineHeightRatio, lineHeightPx / fontSizePx);
} }
const detectedSpacingPx =
group.lineSpacing && group.lineSpacing > 0 ? group.lineSpacing * scale : undefined;
if (detectedSpacingPx && detectedSpacingPx > 0) {
lineHeightPx = Math.max(lineHeightPx, detectedSpacingPx);
if (fontSizePx > 0) {
lineHeightRatio = Math.max(lineHeightRatio, detectedSpacingPx / fontSizePx);
}
}
const lineCount = Math.max(group.text.split('\n').length, 1);
const paragraphHeightPx =
lineCount > 1
? lineHeightPx + (lineCount - 1) * (detectedSpacingPx ?? lineHeightPx)
: lineHeightPx;
let containerLeft = bounds.left; let containerLeft = bounds.left;
let containerTop = bounds.top; let containerTop = bounds.top;
let containerWidth = Math.max(bounds.width, fontSizePx); let containerWidth = Math.max(bounds.width, fontSizePx);
let containerHeight = Math.max(bounds.height, lineHeightPx); let containerHeight = Math.max(bounds.height, paragraphHeightPx);
let transform: string | undefined; let transform: string | undefined;
let transformOrigin: React.CSSProperties['transformOrigin']; let transformOrigin: React.CSSProperties['transformOrigin'];
@ -1349,7 +1406,13 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
containerTop = anchorTop - containerHeight; containerTop = anchorTop - containerHeight;
} }
if (!hasRotation && group.baseline !== null && group.baseline !== undefined && geometry) { if (
lineCount === 1 &&
!hasRotation &&
group.baseline !== null &&
group.baseline !== undefined &&
geometry
) {
const cssBaselineTop = (pageHeight - group.baseline) * scale; const cssBaselineTop = (pageHeight - group.baseline) * scale;
containerTop = Math.max(cssBaselineTop - ascentPx, 0); containerTop = Math.max(cssBaselineTop - ascentPx, 0);
containerHeight = Math.max(containerHeight, ascentPx + descentPx); containerHeight = Math.max(containerHeight, ascentPx + descentPx);
@ -1364,7 +1427,8 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
left: `${containerLeft}px`, left: `${containerLeft}px`,
top: `${containerTop}px`, top: `${containerTop}px`,
width: `${containerWidth}px`, width: `${containerWidth}px`,
height: `${containerHeight}px`, height: isEditing ? 'auto' : `${containerHeight}px`,
minHeight: `${containerHeight}px`,
display: 'flex', display: 'flex',
alignItems: 'flex-start', alignItems: 'flex-start',
justifyContent: 'flex-start', justifyContent: 'flex-start',
@ -1423,23 +1487,12 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
setEditingGroupId(null); setEditingGroupId(null);
}} }}
onInput={(event) => { onInput={(event) => {
const value = event.currentTarget.innerText.replace(/\u00A0/g, ' '); syncEditorValue(event.currentTarget, group.pageIndex, group.id);
const offset = getCaretOffset(event.currentTarget);
caretOffsetsRef.current.set(group.id, offset);
onGroupEdit(group.pageIndex, group.id, value);
requestAnimationFrame(() => {
if (editingGroupId !== group.id) {
return;
}
const editor = editorRefs.current.get(group.id);
if (editor) {
setCaretOffset(editor, caretOffsetsRef.current.get(group.id) ?? editor.innerText.length);
}
});
}} }}
style={{ style={{
width: '100%', width: '100%',
height: '100%', minHeight: '100%',
height: 'auto',
padding: 0, padding: 0,
backgroundColor: 'rgba(255,255,255,0.95)', backgroundColor: 'rgba(255,255,255,0.95)',
color: textColor, color: textColor,
@ -1486,7 +1539,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
color: textColor, color: textColor,
display: 'block', display: 'block',
cursor: 'text', cursor: 'text',
overflow: 'visible', overflow: 'hidden',
}} }}
> >
<span <span
@ -1496,6 +1549,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
display: 'inline-block', display: 'inline-block',
transform: shouldScale ? `scaleX(${textScale})` : undefined, transform: shouldScale ? `scaleX(${textScale})` : undefined,
transformOrigin: 'left center', transformOrigin: 'left center',
whiteSpace: 'pre',
}} }}
> >
{group.text || '\u00A0'} {group.text || '\u00A0'}
@ -1503,57 +1557,43 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
</div>, </div>,
undefined, undefined,
(event: React.MouseEvent) => { (event: React.MouseEvent) => {
// Double-click to edit const clickX = event.clientX;
if (event.detail === 2) { const clickY = event.clientY;
// Capture click position BEFORE switching to edit mode
const clickX = event.clientX;
const clickY = event.clientY;
setEditingGroupId(group.id); setActiveGroupId(group.id);
setActiveGroupId(group.id); setEditingGroupId(group.id);
caretOffsetsRef.current.delete(group.id);
// Clear any stored offset to prevent interference requestAnimationFrame(() => {
caretOffsetsRef.current.delete(group.id); const editor = document.querySelector<HTMLElement>(`[data-editor-group="${group.id}"]`);
if (!editor) return;
editor.focus();
// Wait for editor to render, then position cursor at click location setTimeout(() => {
requestAnimationFrame(() => { if (document.caretRangeFromPoint) {
const editor = document.querySelector<HTMLElement>(`[data-editor-group="${group.id}"]`); const range = document.caretRangeFromPoint(clickX, clickY);
if (!editor) return; if (range) {
const selection = window.getSelection();
// Focus the editor first if (selection) {
editor.focus(); selection.removeAllRanges();
selection.addRange(range);
// Use caretRangeFromPoint to position cursor at click coordinates
setTimeout(() => {
if (document.caretRangeFromPoint) {
const range = document.caretRangeFromPoint(clickX, clickY);
if (range) {
const selection = window.getSelection();
if (selection) {
selection.removeAllRanges();
selection.addRange(range);
}
}
} else if ((document as any).caretPositionFromPoint) {
// Firefox fallback
const pos = (document as any).caretPositionFromPoint(clickX, clickY);
if (pos) {
const range = document.createRange();
range.setStart(pos.offsetNode, pos.offset);
range.collapse(true);
const selection = window.getSelection();
if (selection) {
selection.removeAllRanges();
selection.addRange(range);
}
} }
} }
}, 10); } else if ((document as any).caretPositionFromPoint) {
}); const pos = (document as any).caretPositionFromPoint(clickX, clickY);
} else { if (pos) {
// Single click just selects const range = document.createRange();
setActiveGroupId(group.id); range.setStart(pos.offsetNode, pos.offset);
} range.collapse(true);
const selection = window.getSelection();
if (selection) {
selection.removeAllRanges();
selection.addRange(range);
}
}
}
}, 10);
});
}, },
)} )}
</Box> </Box>

View File

@ -1028,6 +1028,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
selectedPage, selectedPage,
forceSingleTextElement, forceSingleTextElement,
requestPagePreview, requestPagePreview,
setForceSingleTextElement,
]); ]);
const latestViewDataRef = useRef<PdfJsonEditorViewData>(viewData); const latestViewDataRef = useRef<PdfJsonEditorViewData>(viewData);

View File

@ -155,6 +155,8 @@ export interface TextGroup {
fontId?: string | null; fontId?: string | null;
fontSize?: number | null; fontSize?: number | null;
fontMatrixSize?: number | null; fontMatrixSize?: number | null;
lineSpacing?: number | null;
lineElementCounts?: number[] | null;
color?: string | null; color?: string | null;
fontWeight?: number | 'normal' | 'bold' | null; fontWeight?: number | 'normal' | 'bold' | null;
rotation?: number | null; rotation?: number | null;

View File

@ -24,6 +24,54 @@ type FontMetrics = {
type FontMetricsMap = Map<string, FontMetrics>; type FontMetricsMap = Map<string, FontMetrics>;
/**
 * Flattens paragraph text onto a single line by deleting every line-break
 * character. Breaks are removed, not replaced with spaces.
 *
 * Handles `\r\n`, lone `\n` and lone `\r`, matching `splitParagraphIntoLines`,
 * which also discards every carriage return.
 *
 * @param text Paragraph text; `null`, `undefined` and `''` all yield `''`.
 * @returns The text with all CR/LF characters stripped.
 */
const sanitizeParagraphText = (text: string | undefined | null): string => {
  if (!text) {
    return '';
  }
  // A character class catches stray carriage returns that `/\r?\n/` would leave behind.
  return text.replace(/[\r\n]/g, '');
};
/**
 * Breaks paragraph text into its individual lines.
 *
 * Carriage returns are dropped first, so only `\n` acts as a separator.
 * `null`/`undefined` produce a single empty line, as does the empty string.
 */
const splitParagraphIntoLines = (text: string | undefined | null): string[] =>
  text == null ? [''] : text.split('\r').join('').split('\n');
/**
 * Reads a baseline value from a text element.
 *
 * When the element carries a `textMatrix` with at least six entries, the entry
 * at index 5 is used (or `null` if it is not a number; `y` is NOT consulted in
 * that case). Otherwise the numeric `y` is returned, or `null` when absent.
 */
const extractElementBaseline = (element: PdfJsonTextElement): number | null => {
  if (!element) {
    return null;
  }

  const matrix = element.textMatrix;
  if (matrix && matrix.length >= 6) {
    const matrixBaseline = matrix[5];
    return typeof matrixBaseline === 'number' ? matrixBaseline : null;
  }

  return typeof element.y === 'number' ? element.y : null;
};
/**
 * Returns clones of `elements` with `delta` added to their vertical position.
 *
 * For each clone, index 5 of a six-entry `textMatrix` is offset (on a copied
 * matrix), and `y` is offset as well; a missing `y` is set to `delta`.
 * The input elements are never modified.
 */
const shiftElementsBy = (elements: PdfJsonTextElement[], delta: number): PdfJsonTextElement[] => {
  // Nothing to move: plain clones are sufficient.
  if (delta === 0) {
    return elements.map(cloneTextElement);
  }

  const shifted: PdfJsonTextElement[] = [];
  for (const source of elements) {
    const copy = cloneTextElement(source);

    const currentMatrix = copy.textMatrix;
    if (currentMatrix && currentMatrix.length >= 6) {
      // Work on a fresh array so the matrix is not shared with the source.
      const movedMatrix = [...currentMatrix];
      movedMatrix[5] = (movedMatrix[5] ?? 0) + delta;
      copy.textMatrix = movedMatrix;
    }

    if (typeof copy.y === 'number') {
      copy.y += delta;
    } else if (copy.y == null) {
      copy.y = delta;
    }

    shifted.push(copy);
  }
  return shifted;
};
const countGraphemes = (text: string): number => { const countGraphemes = (text: string): number => {
if (!text) { if (!text) {
return 0; return 0;
@ -472,6 +520,123 @@ const createGroup = (
}; };
}; };
/**
 * Merges consecutive line groups into paragraph groups.
 *
 * Two neighbouring lines are joined when all of the following hold:
 *  - their left edges differ by at most 0.3x the average font size,
 *  - they share the same `fontId`,
 *  - their baselines are at most 3x the average font size apart.
 * Missing font sizes count as 12 and missing baselines as 0 for this decision.
 *
 * A paragraph made of a single line is returned as the original group object.
 * A merged paragraph keeps the first line's id, joins the line texts with
 * `\n`, records how many elements each line contributed, and stores the mean
 * of the positive baseline gaps as `lineSpacing` (`null` when there are none).
 *
 * @param lineGroups Line groups in reading order.
 * @param metrics Optional font metrics forwarded to the baseline-length helper.
 */
const groupLinesIntoParagraphs = (
  lineGroups: TextGroup[],
  metrics?: FontMetricsMap,
): TextGroup[] => {
  if (lineGroups.length === 0) {
    return [];
  }

  // Decides whether `next` continues the paragraph that `prev` belongs to.
  const continuesParagraph = (prev: TextGroup, next: TextGroup): boolean => {
    const meanFontSize = ((prev.fontSize ?? 12) + (next.fontSize ?? 12)) / 2;
    const baselineGap = Math.abs((prev.baseline ?? 0) - (next.baseline ?? 0));
    const leftOffset = Math.abs(prev.bounds.left - next.bounds.left);

    const alignedLeft = leftOffset <= meanFontSize * 0.3;
    const fontsMatch = prev.fontId === next.fontId;
    // Anything beyond ~3x the font size is treated as a paragraph break.
    const gapIsReasonable = baselineGap <= meanFontSize * 3.0;
    return alignedLeft && fontsMatch && gapIsReasonable;
  };

  // Walk the lines once, closing the open bucket whenever a line does not continue it.
  const buckets: TextGroup[][] = [];
  let open: TextGroup[] = [lineGroups[0]];
  lineGroups.slice(1).forEach((line, offset) => {
    // `offset` indexes the sliced array, so lineGroups[offset] is the preceding line.
    if (continuesParagraph(lineGroups[offset], line)) {
      open.push(line);
    } else {
      buckets.push(open);
      open = [line];
    }
  });
  buckets.push(open);

  return buckets.map((lines) => {
    if (lines.length === 1) {
      return lines[0];
    }

    const allElements = lines.flatMap((line) => line.originalElements);
    const pageIndex = lines[0].pageIndex;
    const lineElementCounts = lines.map((line) => line.originalElements.length);
    const paragraphText = lines.map((line) => line.text).join('\n');
    const mergedBounds = mergeBounds(lines.map((line) => line.bounds));

    // Collect the positive baseline distances between neighbouring lines.
    const gaps: number[] = [];
    lines.slice(1).forEach((line, offset) => {
      const above = lines[offset];
      const upper = above.baseline ?? above.bounds.bottom;
      const lower = line.baseline ?? line.bounds.bottom;
      const gap = Math.abs(upper - lower);
      if (gap > 0) {
        gaps.push(gap);
      }
    });
    const averageSpacing =
      gaps.length > 0 ? gaps.reduce((total, gap) => total + gap, 0) / gaps.length : null;

    const firstElement = allElements[0];
    const rotation = computeGroupRotation(allElements);
    const anchor = rotation !== null ? getAnchorPoint(firstElement) : null;
    const baselineLength = computeBaselineLength(allElements, metrics);
    const baseline = computeAverageBaseline(allElements);

    return {
      id: lines[0].id, // The paragraph inherits the first line's id.
      pageIndex,
      fontId: firstElement?.fontId,
      fontSize: firstElement?.fontSize,
      fontMatrixSize: firstElement?.fontMatrixSize,
      lineSpacing: averageSpacing,
      // Single-line buckets returned above, so there are always 2+ lines here.
      lineElementCounts,
      color: firstElement ? extractColor(firstElement) : null,
      fontWeight: null,
      rotation,
      anchor,
      baselineLength,
      baseline,
      elements: allElements.map(cloneTextElement),
      originalElements: allElements.map(cloneTextElement),
      text: paragraphText,
      originalText: paragraphText,
      bounds: mergedBounds,
    };
  });
};
export const groupPageTextElements = ( export const groupPageTextElements = (
page: PdfJsonPage | null | undefined, page: PdfJsonPage | null | undefined,
pageIndex: number, pageIndex: number,
@ -508,7 +673,7 @@ export const groupPageTextElements = (
}); });
let groupCounter = 0; let groupCounter = 0;
const groups: TextGroup[] = []; const lineGroups: TextGroup[] = [];
lines.forEach((line) => { lines.forEach((line) => {
let currentBucket: PdfJsonTextElement[] = []; let currentBucket: PdfJsonTextElement[] = [];
@ -527,6 +692,19 @@ export const groupPageTextElements = (
const sameFont = previous.fontId === element.fontId; const sameFont = previous.fontId === element.fontId;
let shouldSplit = gap > splitThreshold * (sameFont ? 1.4 : 1.0); let shouldSplit = gap > splitThreshold * (sameFont ? 1.4 : 1.0);
if (shouldSplit) {
const prevBaseline = getBaseline(previous);
const currentBaseline = getBaseline(element);
const baselineDelta = Math.abs(prevBaseline - currentBaseline);
const prevEndX = getX(previous) + getWidth(previous, metrics);
const prevEndY = prevBaseline;
const diagonalGap = Math.hypot(Math.max(0, getX(element) - prevEndX), baselineDelta);
const diagonalThreshold = Math.max(avgFontSize * 0.8, splitThreshold);
if (diagonalGap <= diagonalThreshold) {
shouldSplit = false;
}
}
const previousRotation = extractElementRotation(previous); const previousRotation = extractElementRotation(previous);
const currentRotation = extractElementRotation(element); const currentRotation = extractElementRotation(element);
if ( if (
@ -539,7 +717,7 @@ export const groupPageTextElements = (
} }
if (shouldSplit) { if (shouldSplit) {
groups.push(createGroup(pageIndex, groupCounter, currentBucket, metrics)); lineGroups.push(createGroup(pageIndex, groupCounter, currentBucket, metrics));
groupCounter += 1; groupCounter += 1;
currentBucket = [element]; currentBucket = [element];
} else { } else {
@ -548,15 +726,17 @@ export const groupPageTextElements = (
}); });
if (currentBucket.length > 0) { if (currentBucket.length > 0) {
groups.push(createGroup(pageIndex, groupCounter, currentBucket, metrics)); lineGroups.push(createGroup(pageIndex, groupCounter, currentBucket, metrics));
groupCounter += 1; groupCounter += 1;
} }
}); });
return groups; return groupLinesIntoParagraphs(lineGroups, metrics);
}; };
export const groupDocumentText = (document: PdfJsonDocument | null | undefined): TextGroup[][] => { export const groupDocumentText = (
document: PdfJsonDocument | null | undefined,
): TextGroup[][] => {
const pages = document?.pages ?? []; const pages = document?.pages ?? [];
const metrics = buildFontMetrics(document); const metrics = buildFontMetrics(document);
return pages.map((page, index) => groupPageTextElements(page, index, metrics)); return pages.map((page, index) => groupPageTextElements(page, index, metrics));
@ -600,7 +780,7 @@ export const pageDimensions = (page: PdfJsonPage | null | undefined): { width: n
export const createMergedElement = (group: TextGroup): PdfJsonTextElement => { export const createMergedElement = (group: TextGroup): PdfJsonTextElement => {
const reference = group.originalElements[0]; const reference = group.originalElements[0];
const merged = cloneTextElement(reference); const merged = cloneTextElement(reference);
merged.text = group.text; merged.text = sanitizeParagraphText(group.text);
clearGlyphHints(merged); clearGlyphHints(merged);
if (reference.textMatrix && reference.textMatrix.length === 6) { if (reference.textMatrix && reference.textMatrix.length === 6) {
merged.textMatrix = [...reference.textMatrix]; merged.textMatrix = [...reference.textMatrix];
@ -613,7 +793,8 @@ const distributeTextAcrossElements = (text: string | undefined, elements: PdfJso
return true; return true;
} }
const targetChars = Array.from(text ?? ''); const normalizedText = sanitizeParagraphText(text);
const targetChars = Array.from(normalizedText);
if (targetChars.length === 0) { if (targetChars.length === 0) {
elements.forEach((element) => { elements.forEach((element) => {
element.text = ''; element.text = '';
@ -627,10 +808,6 @@ const distributeTextAcrossElements = (text: string | undefined, elements: PdfJso
const graphemeCount = Array.from(originalText).length; const graphemeCount = Array.from(originalText).length;
return graphemeCount > 0 ? graphemeCount : 1; return graphemeCount > 0 ? graphemeCount : 1;
}); });
const totalCapacity = capacities.reduce((sum, value) => sum + value, 0);
if (targetChars.length > totalCapacity) {
return false;
}
let cursor = 0; let cursor = 0;
elements.forEach((element, index) => { elements.forEach((element, index) => {
@ -640,7 +817,9 @@ const distributeTextAcrossElements = (text: string | undefined, elements: PdfJso
if (index === elements.length - 1) { if (index === elements.length - 1) {
sliceLength = remaining; sliceLength = remaining;
} else { } else {
sliceLength = Math.min(capacities[index], remaining); const capacity = Math.max(capacities[index], 1);
const minRemainingForRest = Math.max(elements.length - index - 1, 0);
sliceLength = Math.min(capacity, Math.max(remaining - minRemainingForRest, 1));
} }
} }
@ -658,6 +837,118 @@ const distributeTextAcrossElements = (text: string | undefined, elements: PdfJso
return true; return true;
}; };
/**
 * Partitions a group's `originalElements` into per-line chunks according to
 * `lineElementCounts`.
 *
 * Without counts, all elements form one chunk (or no chunk when there are no
 * elements). Non-positive counts are skipped, and chunks that come out empty
 * because the elements ran out are omitted.
 */
const sliceElementsByLineCounts = (group: TextGroup): PdfJsonTextElement[][] => {
  const { lineElementCounts, originalElements } = group;

  if (!lineElementCounts || lineElementCounts.length === 0) {
    return originalElements.length ? [originalElements] : [];
  }

  const perLine: PdfJsonTextElement[][] = [];
  let start = 0;
  for (const count of lineElementCounts) {
    if (count <= 0) {
      continue;
    }
    const end = start + count;
    const chunk = originalElements.slice(start, end);
    if (chunk.length > 0) {
      perLine.push(chunk);
    }
    start = end;
  }
  return perLine;
};
/**
 * Rebuilds the text elements of a multi-line paragraph group, one set of
 * elements per line of the edited text.
 *
 * Each text line reuses the original elements of the line at the same index
 * as a template. Lines beyond the last original line reuse the last template,
 * shifted by a whole number of line spacings in the detected direction.
 *
 * @returns The rebuilt elements, or `null` when the group text has no `\n`,
 *          no template lines are available, or a template line is empty.
 */
const rebuildParagraphLineElements = (group: TextGroup): PdfJsonTextElement[] | null => {
  const paragraphText = group.text;
  if (!paragraphText || !paragraphText.includes('\n')) {
    return null;
  }

  const lineTexts = splitParagraphIntoLines(paragraphText);
  if (lineTexts.length === 0) {
    return [];
  }

  const templates = sliceElementsByLineCounts(group);
  if (templates.length === 0) {
    return null;
  }

  // One baseline per template line: the first element that exposes one, else the group's.
  const baselines = templates.map((lineElements) => {
    const found = lineElements
      .map((element) => extractElementBaseline(element))
      .find((value) => value !== null);
    return found ?? group.baseline ?? null;
  });

  // Signed baseline steps between neighbouring lines where both baselines are known.
  const steps: number[] = [];
  for (let i = 1; i < baselines.length; i += 1) {
    const above = baselines[i - 1];
    const below = baselines[i];
    if (above !== null && below !== null) {
      steps.push(below - above);
    }
  }

  const firstNonZeroStep = steps.find((step) => Math.abs(step) > 0);
  const spacingFromBaselines = firstNonZeroStep !== undefined ? Math.abs(firstNonZeroStep) : null;

  const preferredSpacing =
    group.lineSpacing && group.lineSpacing > 0 ? group.lineSpacing : spacingFromBaselines;
  // Last resort: 1.2x the font size, with the font size floored at 6.
  const spacing =
    preferredSpacing ?? Math.max(group.fontMatrixSize ?? group.fontSize ?? 12, 6) * 1.2;

  // Direction follows the first clearly non-zero step; baselines decrease by default.
  const firstClearStep = steps.find((step) => Math.abs(step) > 0.05);
  const direction = firstClearStep !== undefined && firstClearStep > 0 ? 1 : -1;

  const lastTemplateIndex = Math.max(templates.length - 1, 0);
  const rebuilt: PdfJsonTextElement[] = [];

  for (const [lineIndex, rawLine] of lineTexts.entries()) {
    const templateIndex = Math.min(lineIndex, lastTemplateIndex);
    const templateElements = templates[templateIndex];
    if (!templateElements || templateElements.length === 0) {
      return null;
    }

    // Non-zero only for lines that have no template of their own.
    const extraLines = lineIndex - templateIndex;
    const clones = shiftElementsBy(templateElements, extraLines * spacing * direction);

    const lineText = sanitizeParagraphText(rawLine);
    if (!distributeTextAcrossElements(lineText, clones)) {
      // Distribution failed: put the whole line on the first clone and blank the rest.
      clones.forEach((clone, cloneIndex) => {
        clone.text = cloneIndex === 0 ? lineText : '';
        clearGlyphHints(clone);
      });
    }

    rebuilt.push(...clones);
  }

  return rebuilt;
};
export const buildUpdatedDocument = ( export const buildUpdatedDocument = (
source: PdfJsonDocument, source: PdfJsonDocument,
groupsByPage: TextGroup[][], groupsByPage: TextGroup[][],
@ -724,11 +1015,17 @@ export const restoreGlyphElements = (
rebuiltElements.push(createMergedElement(group)); rebuiltElements.push(createMergedElement(group));
return; return;
} }
const paragraphElements = rebuildParagraphLineElements(group);
if (paragraphElements && paragraphElements.length > 0) {
rebuiltElements.push(...paragraphElements);
return;
}
const originalGlyphCount = group.originalElements.reduce( const originalGlyphCount = group.originalElements.reduce(
(sum, element) => sum + countGraphemes(element.text ?? ''), (sum, element) => sum + countGraphemes(element.text ?? ''),
0, 0,
); );
const targetGlyphCount = countGraphemes(group.text); const normalizedText = sanitizeParagraphText(group.text);
const targetGlyphCount = countGraphemes(normalizedText);
if (targetGlyphCount !== originalGlyphCount) { if (targetGlyphCount !== originalGlyphCount) {
rebuiltElements.push(createMergedElement(group)); rebuiltElements.push(createMergedElement(group));
@ -736,7 +1033,7 @@ export const restoreGlyphElements = (
} }
const originals = group.originalElements.map(cloneTextElement); const originals = group.originalElements.map(cloneTextElement);
const distributed = distributeTextAcrossElements(group.text, originals); const distributed = distributeTextAcrossElements(normalizedText, originals);
if (distributed) { if (distributed) {
rebuiltElements.push(...originals); rebuiltElements.push(...originals);
} else { } else {