From 3ed62c8dbfbc1a132edb71b887eda4eb43a232c5 Mon Sep 17 00:00:00 2001 From: Anthony Stirling <77850077+Frooodle@users.noreply.github.com> Date: Mon, 10 Nov 2025 22:55:16 +0000 Subject: [PATCH] paragraphs? :o --- .../src/main/resources/application.properties | 1 + .../public/locales/en-GB/translation.json | 9 + .../src/proprietary/auth/springAuthClient.ts | 24 +- .../tools/pdfJsonEditor/PdfJsonEditorView.tsx | 174 ++++++---- .../tools/pdfJsonEditor/PdfJsonEditor.tsx | 1 + .../tools/pdfJsonEditor/pdfJsonEditorTypes.ts | 2 + .../tools/pdfJsonEditor/pdfJsonEditorUtils.ts | 325 +++++++++++++++++- 7 files changed, 443 insertions(+), 93 deletions(-) diff --git a/app/core/src/main/resources/application.properties b/app/core/src/main/resources/application.properties index 7cc321b2d..b16e1cc77 100644 --- a/app/core/src/main/resources/application.properties +++ b/app/core/src/main/resources/application.properties @@ -1,5 +1,6 @@ multipart.enabled=true logging.level.org.springframework=WARN +logging.level.org.springframework.security=WARN logging.level.org.hibernate=WARN logging.level.org.eclipse.jetty=WARN #logging.level.org.springframework.security.oauth2=DEBUG diff --git a/frontend/public/locales/en-GB/translation.json b/frontend/public/locales/en-GB/translation.json index f9ff17cea..5e0bc62d1 100644 --- a/frontend/public/locales/en-GB/translation.json +++ b/frontend/public/locales/en-GB/translation.json @@ -4498,8 +4498,17 @@ "forceSingleElement": { "title": "Lock edited text to a single PDF element", "description": "When enabled, the editor exports each edited text box as one PDF text element to avoid overlapping glyphs or mixed fonts." + }, + "textGroupingMode": { + "title": "Text grouping mode", + "description": "Paragraph mode merges aligned lines into one textbox; single-line mode keeps every PDF line separate. Auto picks the best option per page." } }, + "grouping": { + "auto": "Auto", + "paragraph": "Paragraph", + "single": "Single Line" + }, "disclaimer": { "heading": "Preview limitations", "textFocus": "This workspace focuses on editing text and repositioning embedded images. Complex page artwork, form widgets, and layered graphics are preserved for export but are not fully editable here.", diff --git a/frontend/src/proprietary/auth/springAuthClient.ts b/frontend/src/proprietary/auth/springAuthClient.ts index 73c883b56..40ed9848a 100644 --- a/frontend/src/proprietary/auth/springAuthClient.ts +++ b/frontend/src/proprietary/auth/springAuthClient.ts @@ -36,7 +36,7 @@ function persistRedirectPath(path: string): void { try { document.cookie = `${OAUTH_REDIRECT_COOKIE}=${encodeURIComponent(path)}; path=/; max-age=${OAUTH_REDIRECT_COOKIE_MAX_AGE}; SameSite=Lax`; } catch (error) { - console.warn('[SpringAuth] Failed to persist OAuth redirect path', error); + // console.warn('[SpringAuth] Failed to persist OAuth redirect path', error); } } @@ -113,21 +113,21 @@ class SpringAuthClient { const token = localStorage.getItem('stirling_jwt'); if (!token) { - console.debug('[SpringAuth] getSession: No JWT in localStorage'); + // console.debug('[SpringAuth] getSession: No JWT in localStorage'); return { data: { session: null }, error: null }; } // Verify with backend - console.debug('[SpringAuth] getSession: Verifying JWT with /api/v1/auth/me'); + // console.debug('[SpringAuth] getSession: Verifying JWT with /api/v1/auth/me'); const response = await fetch('/api/v1/auth/me', { headers: { 'Authorization': `Bearer ${token}`, }, }); - console.debug('[SpringAuth] /me response status:', response.status); + // console.debug('[SpringAuth] /me response status:', response.status); const contentType = response.headers.get('content-type'); - console.debug('[SpringAuth] /me content-type:', contentType); + // console.debug('[SpringAuth] /me content-type:', contentType); if (!response.ok) { // Log the error response for debugging @@ -140,7 +140,7 @@ class SpringAuthClient { // Token invalid or expired - clear it localStorage.removeItem('stirling_jwt'); - console.warn('[SpringAuth] getSession: Cleared invalid JWT from localStorage'); + // console.warn('[SpringAuth] getSession: Cleared invalid JWT from localStorage'); return { data: { session: null }, error: { message: `Auth failed: ${response.status}` } }; } @@ -155,7 +155,7 @@ class SpringAuthClient { } const data = await response.json(); - console.debug('[SpringAuth] /me response data:', data); + // console.debug('[SpringAuth] /me response data:', data); // Create session object const session: Session = { @@ -165,7 +165,7 @@ class SpringAuthClient { expires_at: Date.now() + 3600 * 1000, }; - console.debug('[SpringAuth] getSession: Session retrieved successfully'); + // console.debug('[SpringAuth] getSession: Session retrieved successfully'); return { data: { session }, error: null }; } catch (error) { console.error('[SpringAuth] getSession error:', error); @@ -206,7 +206,7 @@ class SpringAuthClient { // Store JWT in localStorage localStorage.setItem('stirling_jwt', token); - console.log('[SpringAuth] JWT stored in localStorage'); + // console.log('[SpringAuth] JWT stored in localStorage'); // Dispatch custom event for other components to react to JWT availability window.dispatchEvent(new CustomEvent('jwt-available')); @@ -285,7 +285,7 @@ class SpringAuthClient { // Redirect to Spring OAuth2 endpoint (Vite will proxy to backend) const redirectUrl = `/oauth2/authorization/${params.provider}`; - console.log('[SpringAuth] Redirecting to OAuth:', redirectUrl); + // console.log('[SpringAuth] Redirecting to OAuth:', redirectUrl); // Use window.location.assign for full page navigation window.location.assign(redirectUrl); return { error: null }; @@ -303,7 +303,7 @@ class SpringAuthClient { try { // Clear JWT from localStorage immediately localStorage.removeItem('stirling_jwt'); - console.log('[SpringAuth] JWT removed from localStorage'); + // console.log('[SpringAuth] JWT removed from localStorage'); const csrfToken = this.getCsrfToken(); const headers: HeadersInit = {}; @@ -446,7 +446,7 @@ class SpringAuthClient { // Refresh if token expires soon if (timeUntilExpiry > 0 && timeUntilExpiry < this.TOKEN_REFRESH_THRESHOLD) { - console.log('[SpringAuth] Proactively refreshing token'); + // console.log('[SpringAuth] Proactively refreshing token'); await this.refreshSession(); } } diff --git a/frontend/src/proprietary/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx b/frontend/src/proprietary/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx index 3832f2c10..869d6229d 100644 --- a/frontend/src/proprietary/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx +++ b/frontend/src/proprietary/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx @@ -245,6 +245,26 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { onForceSingleTextElementChange, } = data; + const syncEditorValue = useCallback( + (element: HTMLElement, pageIndex: number, groupId: string) => { + const value = element.innerText.replace(/\u00A0/g, ' '); + const offset = getCaretOffset(element); + caretOffsetsRef.current.set(groupId, offset); + onGroupEdit(pageIndex, groupId, value); + requestAnimationFrame(() => { + if (editingGroupId !== groupId) { + return; + } + const editor = editorRefs.current.get(groupId); + if (editor) { + const savedOffset = caretOffsetsRef.current.get(groupId) ?? editor.innerText.length; + setCaretOffset(editor, savedOffset); + } + }); + }, + [editingGroupId, onGroupEdit], + ); + const resolveFont = (fontId: string | null | undefined, pageIndex: number | null | undefined): PdfJsonFont | null => { if (!fontId || !pdfDocument?.fonts) { return null; @@ -646,7 +666,14 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { // Measure text widths once per page/configuration and apply static scaling useLayoutEffect(() => { - if (!autoScaleText || visibleGroups.length === 0) { + if (!autoScaleText) { + // Clear all scales when auto-scale is disabled + setTextScales(new Map()); + measurementKeyRef.current = ''; + return; + } + + if (visibleGroups.length === 0) { return; } @@ -667,6 +694,13 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { return; } + // Skip multi-line paragraphs - auto-scaling doesn't work well with wrapped text + const lineCount = (group.text || '').split('\n').length; + if (lineCount > 1) { + newScales.set(group.id, 1); + return; + } + const element = document.querySelector(`[data-text-group="${group.id}"]`); if (!element) { return; @@ -705,7 +739,16 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { // Delay measurement to ensure fonts and layout are ready const timer = setTimeout(measureTextScales, 150); return () => clearTimeout(timer); - }, [autoScaleText, visibleGroups, editingGroupId, currentPage, pageHeight, scale, fontFamilies.size, selectedPage]); + }, [ + autoScaleText, + visibleGroups, + editingGroupId, + currentPage, + pageHeight, + scale, + fontFamilies.size, + selectedPage, + ]); useLayoutEffect(() => { // Only restore caret position during re-renders while already editing @@ -792,7 +835,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { }} > {content} - {activeGroupId === groupId && editingGroupId !== groupId && ( + {activeGroupId === groupId && ( { onChange={(event) => onForceSingleTextElementChange(event.currentTarget.checked)} /> + @@ -1325,11 +1369,24 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { if (fontSizePx > 0) { lineHeightRatio = Math.max(lineHeightRatio, lineHeightPx / fontSizePx); } + const detectedSpacingPx = + group.lineSpacing && group.lineSpacing > 0 ? group.lineSpacing * scale : undefined; + if (detectedSpacingPx && detectedSpacingPx > 0) { + lineHeightPx = Math.max(lineHeightPx, detectedSpacingPx); + if (fontSizePx > 0) { + lineHeightRatio = Math.max(lineHeightRatio, detectedSpacingPx / fontSizePx); + } + } + const lineCount = Math.max(group.text.split('\n').length, 1); + const paragraphHeightPx = + lineCount > 1 + ? lineHeightPx + (lineCount - 1) * (detectedSpacingPx ?? lineHeightPx) + : lineHeightPx; let containerLeft = bounds.left; let containerTop = bounds.top; let containerWidth = Math.max(bounds.width, fontSizePx); - let containerHeight = Math.max(bounds.height, lineHeightPx); + let containerHeight = Math.max(bounds.height, paragraphHeightPx); let transform: string | undefined; let transformOrigin: React.CSSProperties['transformOrigin']; @@ -1349,7 +1406,13 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { containerTop = anchorTop - containerHeight; } - if (!hasRotation && group.baseline !== null && group.baseline !== undefined && geometry) { + if ( + lineCount === 1 && + !hasRotation && + group.baseline !== null && + group.baseline !== undefined && + geometry + ) { const cssBaselineTop = (pageHeight - group.baseline) * scale; containerTop = Math.max(cssBaselineTop - ascentPx, 0); containerHeight = Math.max(containerHeight, ascentPx + descentPx); @@ -1364,7 +1427,8 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { left: `${containerLeft}px`, top: `${containerTop}px`, width: `${containerWidth}px`, - height: `${containerHeight}px`, + height: isEditing ? 'auto' : `${containerHeight}px`, + minHeight: `${containerHeight}px`, display: 'flex', alignItems: 'flex-start', justifyContent: 'flex-start', @@ -1423,23 +1487,12 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { setEditingGroupId(null); }} onInput={(event) => { - const value = event.currentTarget.innerText.replace(/\u00A0/g, ' '); - const offset = getCaretOffset(event.currentTarget); - caretOffsetsRef.current.set(group.id, offset); - onGroupEdit(group.pageIndex, group.id, value); - requestAnimationFrame(() => { - if (editingGroupId !== group.id) { - return; - } - const editor = editorRefs.current.get(group.id); - if (editor) { - setCaretOffset(editor, caretOffsetsRef.current.get(group.id) ?? editor.innerText.length); - } - }); + syncEditorValue(event.currentTarget, group.pageIndex, group.id); }} style={{ width: '100%', - height: '100%', + minHeight: '100%', + height: 'auto', padding: 0, backgroundColor: 'rgba(255,255,255,0.95)', color: textColor, @@ -1486,7 +1539,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { color: textColor, display: 'block', cursor: 'text', - overflow: 'visible', + overflow: 'hidden', }} > { display: 'inline-block', transform: shouldScale ? `scaleX(${textScale})` : undefined, transformOrigin: 'left center', + whiteSpace: 'pre', }} > {group.text || '\u00A0'} @@ -1503,57 +1557,43 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { , undefined, (event: React.MouseEvent) => { - // Double-click to edit - if (event.detail === 2) { - // Capture click position BEFORE switching to edit mode - const clickX = event.clientX; - const clickY = event.clientY; + const clickX = event.clientX; + const clickY = event.clientY; - setEditingGroupId(group.id); - setActiveGroupId(group.id); + setActiveGroupId(group.id); + setEditingGroupId(group.id); + caretOffsetsRef.current.delete(group.id); - // Clear any stored offset to prevent interference - caretOffsetsRef.current.delete(group.id); + requestAnimationFrame(() => { + const editor = document.querySelector(`[data-editor-group="${group.id}"]`); + if (!editor) return; + editor.focus(); - // Wait for editor to render, then position cursor at click location - requestAnimationFrame(() => { - const editor = document.querySelector(`[data-editor-group="${group.id}"]`); - if (!editor) return; - - // Focus the editor first - editor.focus(); - - // Use caretRangeFromPoint to position cursor at click coordinates - setTimeout(() => { - if (document.caretRangeFromPoint) { - const range = document.caretRangeFromPoint(clickX, clickY); - if (range) { - const selection = window.getSelection(); - if (selection) { - selection.removeAllRanges(); - selection.addRange(range); - } - } - } else if ((document as any).caretPositionFromPoint) { - // Firefox fallback - const pos = (document as any).caretPositionFromPoint(clickX, clickY); - if (pos) { - const range = document.createRange(); - range.setStart(pos.offsetNode, pos.offset); - range.collapse(true); - const selection = window.getSelection(); - if (selection) { - selection.removeAllRanges(); - selection.addRange(range); - } + setTimeout(() => { + if (document.caretRangeFromPoint) { + const range = document.caretRangeFromPoint(clickX, clickY); + if (range) { + const selection = window.getSelection(); + if (selection) { + selection.removeAllRanges(); + selection.addRange(range); } } - }, 10); - }); - } else { - // Single click just selects - setActiveGroupId(group.id); - } + } else if ((document as any).caretPositionFromPoint) { + const pos = (document as any).caretPositionFromPoint(clickX, clickY); + if (pos) { + const range = document.createRange(); + range.setStart(pos.offsetNode, pos.offset); + range.collapse(true); + const selection = window.getSelection(); + if (selection) { + selection.removeAllRanges(); + selection.addRange(range); + } + } + } + }, 10); + }); }, )} diff --git a/frontend/src/proprietary/tools/pdfJsonEditor/PdfJsonEditor.tsx b/frontend/src/proprietary/tools/pdfJsonEditor/PdfJsonEditor.tsx index 14648e3af..8fbddd8dc 100644 --- a/frontend/src/proprietary/tools/pdfJsonEditor/PdfJsonEditor.tsx +++ b/frontend/src/proprietary/tools/pdfJsonEditor/PdfJsonEditor.tsx @@ -1028,6 +1028,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => { selectedPage, forceSingleTextElement, requestPagePreview, + setForceSingleTextElement, ]); const latestViewDataRef = useRef(viewData); diff --git a/frontend/src/proprietary/tools/pdfJsonEditor/pdfJsonEditorTypes.ts b/frontend/src/proprietary/tools/pdfJsonEditor/pdfJsonEditorTypes.ts index fcb8aa83c..b75cdf762 100644 --- a/frontend/src/proprietary/tools/pdfJsonEditor/pdfJsonEditorTypes.ts +++ b/frontend/src/proprietary/tools/pdfJsonEditor/pdfJsonEditorTypes.ts @@ -155,6 +155,8 @@ export interface TextGroup { fontId?: string | null; fontSize?: number | null; fontMatrixSize?: number | null; + lineSpacing?: number | null; + lineElementCounts?: number[] | null; color?: string | null; fontWeight?: number | 'normal' | 'bold' | null; rotation?: number | null; diff --git a/frontend/src/proprietary/tools/pdfJsonEditor/pdfJsonEditorUtils.ts b/frontend/src/proprietary/tools/pdfJsonEditor/pdfJsonEditorUtils.ts index 52cec1675..813155810 100644 --- a/frontend/src/proprietary/tools/pdfJsonEditor/pdfJsonEditorUtils.ts +++ b/frontend/src/proprietary/tools/pdfJsonEditor/pdfJsonEditorUtils.ts @@ -24,6 +24,54 @@ type FontMetrics = { type FontMetricsMap = Map; +const sanitizeParagraphText = (text: string | undefined | null): string => { + if (!text) { + return ''; + } + return text.replace(/\r?\n/g, ''); +}; + +const splitParagraphIntoLines = (text: string | undefined | null): string[] => { + if (text === null || text === undefined) { + return ['']; + } + return text.replace(/\r/g, '').split('\n'); +}; + +const extractElementBaseline = (element: PdfJsonTextElement): number | null => { + if (!element) { + return null; + } + if (element.textMatrix && element.textMatrix.length >= 6) { + const baseline = element.textMatrix[5]; + return typeof baseline === 'number' ? baseline : null; + } + if (typeof element.y === 'number') { + return element.y; + } + return null; +}; + +const shiftElementsBy = (elements: PdfJsonTextElement[], delta: number): PdfJsonTextElement[] => { + if (delta === 0) { + return elements.map(cloneTextElement); + } + return elements.map((element) => { + const clone = cloneTextElement(element); + if (clone.textMatrix && clone.textMatrix.length >= 6) { + const matrix = [...clone.textMatrix]; + matrix[5] = (matrix[5] ?? 0) + delta; + clone.textMatrix = matrix; + } + if (typeof clone.y === 'number') { + clone.y += delta; + } else if (clone.y === null || clone.y === undefined) { + clone.y = delta; + } + return clone; + }); +}; + const countGraphemes = (text: string): number => { if (!text) { return 0; @@ -472,6 +520,123 @@ const createGroup = ( }; }; +const groupLinesIntoParagraphs = ( + lineGroups: TextGroup[], + metrics?: FontMetricsMap, +): TextGroup[] => { + if (lineGroups.length === 0) { + return []; + } + + const paragraphs: TextGroup[][] = []; + let currentParagraph: TextGroup[] = [lineGroups[0]]; + + for (let i = 1; i < lineGroups.length; i++) { + const prevLine = lineGroups[i - 1]; + const currentLine = lineGroups[i]; + + // Calculate line spacing + const prevBaseline = prevLine.baseline ?? 0; + const currentBaseline = currentLine.baseline ?? 0; + const lineSpacing = Math.abs(prevBaseline - currentBaseline); + + // Calculate average font size + const prevFontSize = prevLine.fontSize ?? 12; + const currentFontSize = currentLine.fontSize ?? 12; + const avgFontSize = (prevFontSize + currentFontSize) / 2; + + // Check horizontal alignment (left edge) + const prevLeft = prevLine.bounds.left; + const currentLeft = currentLine.bounds.left; + const leftAlignmentTolerance = avgFontSize * 0.3; + const isLeftAligned = Math.abs(prevLeft - currentLeft) <= leftAlignmentTolerance; + + // Check if fonts match + const sameFont = prevLine.fontId === currentLine.fontId; + + // Check for consistent spacing rather than expected spacing + // Line spacing in PDFs can range from 1.0x to 3.0x font size + // We just want to ensure spacing is consistent between consecutive lines + // and not excessively large (which would indicate a paragraph break) + const maxReasonableSpacing = avgFontSize * 3.0; // Max ~3x font size for normal line spacing + const hasReasonableSpacing = lineSpacing <= maxReasonableSpacing; + + // Merge into paragraph if: + // 1. Left aligned + // 2. Same font + // 3. Reasonable line spacing (not a large gap indicating paragraph break) + const shouldMerge = isLeftAligned && sameFont && hasReasonableSpacing; + + if (shouldMerge) { + currentParagraph.push(currentLine); + } else { + paragraphs.push(currentParagraph); + currentParagraph = [currentLine]; + } + } + + // Don't forget the last paragraph + if (currentParagraph.length > 0) { + paragraphs.push(currentParagraph); + } + + // Merge line groups into single paragraph groups + return paragraphs.map((lines, paragraphIndex) => { + if (lines.length === 1) { + return lines[0]; + } + + // Combine all elements from all lines + const allElements = lines.flatMap(line => line.originalElements); + const pageIndex = lines[0].pageIndex; + const lineElementCounts = lines.map((line) => line.originalElements.length); + + // Create merged group with newlines between lines + const paragraphText = lines.map(line => line.text).join('\n'); + const mergedBounds = mergeBounds(lines.map(line => line.bounds)); + const spacingValues: number[] = []; + for (let i = 1; i < lines.length; i++) { + const prevBaseline = lines[i - 1].baseline ?? lines[i - 1].bounds.bottom; + const currentBaseline = lines[i].baseline ?? lines[i].bounds.bottom; + const spacing = Math.abs(prevBaseline - currentBaseline); + if (spacing > 0) { + spacingValues.push(spacing); + } + } + const averageSpacing = + spacingValues.length > 0 + ? spacingValues.reduce((sum, value) => sum + value, 0) / spacingValues.length + : null; + + const firstElement = allElements[0]; + const rotation = computeGroupRotation(allElements); + const anchor = rotation !== null ? getAnchorPoint(firstElement) : null; + const baselineLength = computeBaselineLength(allElements, metrics); + const baseline = computeAverageBaseline(allElements); + + return { + id: lines[0].id, // Keep the first line's ID + pageIndex, + fontId: firstElement?.fontId, + fontSize: firstElement?.fontSize, + fontMatrixSize: firstElement?.fontMatrixSize, + lineSpacing: averageSpacing, + lineElementCounts: lines.length > 1 ? lineElementCounts : null, + color: firstElement ? extractColor(firstElement) : null, + fontWeight: null, + rotation, + anchor, + baselineLength, + baseline, + elements: allElements.map(cloneTextElement), + originalElements: allElements.map(cloneTextElement), + text: paragraphText, + originalText: paragraphText, + bounds: mergedBounds, + }; + }); +}; + export const groupPageTextElements = ( page: PdfJsonPage | null | undefined, pageIndex: number, @@ -508,7 +673,7 @@ export const groupPageTextElements = ( }); let groupCounter = 0; - const groups: TextGroup[] = []; + const lineGroups: TextGroup[] = []; lines.forEach((line) => { let currentBucket: PdfJsonTextElement[] = []; @@ -527,6 +692,19 @@ export const groupPageTextElements = ( const sameFont = previous.fontId === element.fontId; let shouldSplit = gap > splitThreshold * (sameFont ? 1.4 : 1.0); + if (shouldSplit) { + const prevBaseline = getBaseline(previous); + const currentBaseline = getBaseline(element); + const baselineDelta = Math.abs(prevBaseline - currentBaseline); + const prevEndX = getX(previous) + getWidth(previous, metrics); + const prevEndY = prevBaseline; + const diagonalGap = Math.hypot(Math.max(0, getX(element) - prevEndX), baselineDelta); + const diagonalThreshold = Math.max(avgFontSize * 0.8, splitThreshold); + if (diagonalGap <= diagonalThreshold) { + shouldSplit = false; + } + } + const previousRotation = extractElementRotation(previous); const currentRotation = extractElementRotation(element); if ( @@ -539,7 +717,7 @@ export const groupPageTextElements = ( } if (shouldSplit) { - groups.push(createGroup(pageIndex, groupCounter, currentBucket, metrics)); + lineGroups.push(createGroup(pageIndex, groupCounter, currentBucket, metrics)); groupCounter += 1; currentBucket = [element]; } else { @@ -548,15 +726,17 @@ export const groupPageTextElements = ( }); if (currentBucket.length > 0) { - groups.push(createGroup(pageIndex, groupCounter, currentBucket, metrics)); + lineGroups.push(createGroup(pageIndex, groupCounter, currentBucket, metrics)); groupCounter += 1; } }); - return groups; + return groupLinesIntoParagraphs(lineGroups, metrics); }; -export const groupDocumentText = (document: PdfJsonDocument | null | undefined): TextGroup[][] => { +export const groupDocumentText = ( + document: PdfJsonDocument | null | undefined, +): TextGroup[][] => { const pages = document?.pages ?? []; const metrics = buildFontMetrics(document); return pages.map((page, index) => groupPageTextElements(page, index, metrics)); @@ -600,7 +780,7 @@ export const pageDimensions = (page: PdfJsonPage | null | undefined): { width: n export const createMergedElement = (group: TextGroup): PdfJsonTextElement => { const reference = group.originalElements[0]; const merged = cloneTextElement(reference); - merged.text = group.text; + merged.text = sanitizeParagraphText(group.text); clearGlyphHints(merged); if (reference.textMatrix && reference.textMatrix.length === 6) { merged.textMatrix = [...reference.textMatrix]; @@ -613,7 +793,8 @@ const distributeTextAcrossElements = (text: string | undefined, elements: PdfJso return true; } - const targetChars = Array.from(text ?? ''); + const normalizedText = sanitizeParagraphText(text); + const targetChars = Array.from(normalizedText); if (targetChars.length === 0) { elements.forEach((element) => { element.text = ''; @@ -627,10 +808,6 @@ const distributeTextAcrossElements = (text: string | undefined, elements: PdfJso const graphemeCount = Array.from(originalText).length; return graphemeCount > 0 ? graphemeCount : 1; }); - const totalCapacity = capacities.reduce((sum, value) => sum + value, 0); - if (targetChars.length > totalCapacity) { - return false; - } let cursor = 0; elements.forEach((element, index) => { @@ -640,7 +817,9 @@ const distributeTextAcrossElements = (text: string | undefined, elements: PdfJso if (index === elements.length - 1) { sliceLength = remaining; } else { - sliceLength = Math.min(capacities[index], remaining); + const capacity = Math.max(capacities[index], 1); + const minRemainingForRest = Math.max(elements.length - index - 1, 0); + sliceLength = Math.min(capacity, Math.max(remaining - minRemainingForRest, 1)); } } @@ -658,6 +837,118 @@ const distributeTextAcrossElements = (text: string | undefined, elements: PdfJso return true; }; +const sliceElementsByLineCounts = (group: TextGroup): PdfJsonTextElement[][] => { + const counts = group.lineElementCounts; + if (!counts || counts.length === 0) { + if (!group.originalElements.length) { + return []; + } + return [group.originalElements]; + } + + const result: PdfJsonTextElement[][] = []; + let cursor = 0; + counts.forEach((count) => { + if (count <= 0) { + return; + } + const slice = group.originalElements.slice(cursor, cursor + count); + if (slice.length > 0) { + result.push(slice); + } + cursor += count; + }); + return result; +}; + +const rebuildParagraphLineElements = (group: TextGroup): PdfJsonTextElement[] | null => { + if (!group.text || !group.text.includes('\n')) { + return null; + } + + const lineTexts = splitParagraphIntoLines(group.text); + if (lineTexts.length === 0) { + return []; + } + + const lineElementGroups = sliceElementsByLineCounts(group); + if (!lineElementGroups.length) { + return null; + } + + const lineBaselines = lineElementGroups.map((elements) => { + for (const element of elements) { + const baseline = extractElementBaseline(element); + if (baseline !== null) { + return baseline; + } + } + return group.baseline ?? null; + }); + + const spacingFromBaselines = (() => { + for (let i = 1; i < lineBaselines.length; i += 1) { + const prev = lineBaselines[i - 1]; + const current = lineBaselines[i]; + if (prev !== null && current !== null) { + const diff = Math.abs(prev - current); + if (diff > 0) { + return diff; + } + } + } + return null; + })(); + + const spacing = + (group.lineSpacing && group.lineSpacing > 0 + ? group.lineSpacing + : spacingFromBaselines) ?? + Math.max(group.fontMatrixSize ?? group.fontSize ?? 12, 6) * 1.2; + + let direction = -1; + for (let i = 1; i < lineBaselines.length; i += 1) { + const prev = lineBaselines[i - 1]; + const current = lineBaselines[i]; + if (prev !== null && current !== null && Math.abs(prev - current) > 0.05) { + direction = current < prev ? -1 : 1; + break; + } + } + + const templateCount = lineElementGroups.length; + const lastTemplateIndex = Math.max(templateCount - 1, 0); + const rebuilt: PdfJsonTextElement[] = []; + + for (let index = 0; index < lineTexts.length; index += 1) { + const templateIndex = Math.min(index, lastTemplateIndex); + const templateElements = lineElementGroups[templateIndex]; + if (!templateElements || templateElements.length === 0) { + return null; + } + + const shiftSteps = index - templateIndex; + const delta = shiftSteps * spacing * direction; + const clones = shiftElementsBy(templateElements, delta); + const normalizedLine = sanitizeParagraphText(lineTexts[index]); + const distributed = distributeTextAcrossElements(normalizedLine, clones); + + if (!distributed) { + const primary = clones[0]; + primary.text = normalizedLine; + clearGlyphHints(primary); + for (let i = 1; i < clones.length; i += 1) { + clones[i].text = ''; + clearGlyphHints(clones[i]); + } + } + + rebuilt.push(...clones); + } + + return rebuilt; +}; + export const buildUpdatedDocument = ( source: PdfJsonDocument, groupsByPage: TextGroup[][], @@ -724,11 +1015,17 @@ export const restoreGlyphElements = ( rebuiltElements.push(createMergedElement(group)); return; } + const paragraphElements = rebuildParagraphLineElements(group); + if (paragraphElements && paragraphElements.length > 0) { + rebuiltElements.push(...paragraphElements); + return; + } const originalGlyphCount = group.originalElements.reduce( (sum, element) => sum + countGraphemes(element.text ?? ''), 0, ); - const targetGlyphCount = countGraphemes(group.text); + const normalizedText = sanitizeParagraphText(group.text); + const targetGlyphCount = countGraphemes(normalizedText); if (targetGlyphCount !== originalGlyphCount) { rebuiltElements.push(createMergedElement(group)); @@ -736,7 +1033,7 @@ export const restoreGlyphElements = ( } const originals = group.originalElements.map(cloneTextElement); - const distributed = distributeTextAcrossElements(group.text, originals); + const distributed = distributeTextAcrossElements(normalizedText, originals); if (distributed) { rebuiltElements.push(...originals); } else {