import {
  BoundingBox,
  PdfJsonDocument,
  PdfJsonPage,
  PdfJsonTextElement,
  PdfJsonImageElement,
  TextGroup,
  DEFAULT_PAGE_HEIGHT,
  DEFAULT_PAGE_WIDTH,
} from './pdfTextEditorTypes';

/** Minimum baseline distance (same units as element coordinates) within which elements share a line. */
const LINE_TOLERANCE = 2;
/** Fraction of the average font size used as the base horizontal gap threshold. */
const GAP_FACTOR = 0.6;
/** Absolute lower bound for any gap threshold. */
const SPACE_MIN_GAP = 1.5;
/** Lower clamp (as a fraction of font size) applied by `estimateCharWidth`. */
const MIN_CHAR_WIDTH_FACTOR = 0.35;
/** Upper clamp (as a fraction of font size) applied by `estimateCharWidth`. */
const MAX_CHAR_WIDTH_FACTOR = 1.25;
/** Scale applied to the spacing hint when testing the inferred (advance-based) gap. */
const EXTRA_GAP_RATIO = 0.8;

/** Vertical font metrics expressed in font units. */
type FontMetrics = {
  unitsPerEm: number;
  ascent: number;
  descent: number;
};

/** Font metrics keyed by font `id` and/or `uid` (see `buildFontMetrics`). */
type FontMetricsMap = Map<string, FontMetrics>;

/**
 * Removes every line break (`\n` or `\r\n`) from the text.
 * Line breaks are dropped, not replaced by a space. Nullish or empty input yields `''`.
 */
const sanitizeParagraphText = (text: string | undefined | null): string => {
  if (!text) {
    return '';
  }
  return text.replace(/\r?\n/g, '');
};

/**
 * Splits text on `\n` after stripping all `\r` characters.
 * Nullish input yields a single empty line.
 */
const splitParagraphIntoLines = (text: string | undefined | null): string[] => {
  if (text === null || text === undefined) {
    return [''];
  }
  return text.replace(/\r/g, '').split('\n');
};

/**
 * Reads the baseline of an element: `textMatrix[5]` when the matrix has at least six
 * entries (and that entry is a number), otherwise the numeric `y`, otherwise `null`.
 */
const extractElementBaseline = (element: PdfJsonTextElement): number | null => {
  if (!element) {
    return null;
  }
  if (element.textMatrix && element.textMatrix.length >= 6) {
    const baseline = element.textMatrix[5];
    return typeof baseline === 'number' ? baseline : null;
  }
  if (typeof element.y === 'number') {
    return element.y;
  }
  return null;
};

/**
 * Returns clones of `elements` with `delta` added to both `textMatrix[5]` and `y`.
 * The inputs are never mutated. A nullish `y` becomes `delta`.
 */
const shiftElementsBy = (elements: PdfJsonTextElement[], delta: number): PdfJsonTextElement[] => {
  if (delta === 0) {
    return elements.map(cloneTextElement);
  }
  return elements.map((element) => {
    const clone = cloneTextElement(element);
    if (clone.textMatrix && clone.textMatrix.length >= 6) {
      const matrix = [...clone.textMatrix];
      matrix[5] = (matrix[5] ?? 0) + delta;
      clone.textMatrix = matrix;
    }
    if (typeof clone.y === 'number') {
      clone.y += delta;
    } else if (clone.y === null || clone.y === undefined) {
      clone.y = delta;
    }
    return clone;
  });
};

/**
 * Counts the items produced by iterating the string with `Array.from`, i.e. Unicode
 * code points. Combined grapheme clusters are therefore counted per code point.
 */
const countGraphemes = (text: string): number => {
  if (!text) {
    return 0;
  }
  return Array.from(text).length;
};

/** Looks up the metrics for `fontId`; returns `undefined` when either argument is missing. */
const metricsFor = (metrics: FontMetricsMap | undefined, fontId?: string | null): FontMetrics | undefined => {
  if (!metrics || !fontId) {
    return undefined;
  }
  return metrics.get(fontId) ?? undefined;
};

/**
 * Builds the font metrics map for a document.
 * Defaults: `unitsPerEm` 1000 when missing or non-positive, ascent 0.8 em, descent -0.2 em.
 * Each font is registered under its `id` and its `uid` when present.
 */
const buildFontMetrics = (document: PdfJsonDocument | null | undefined): FontMetricsMap => {
  const metrics: FontMetricsMap = new Map();
  document?.fonts?.forEach((font) => {
    if (!font) {
      return;
    }
    const unitsPerEm = font.unitsPerEm && font.unitsPerEm > 0 ? font.unitsPerEm : 1000;
    const ascent = font.ascent ?? unitsPerEm * 0.8;
    const descent = font.descent ?? -(unitsPerEm * 0.2);
    const metric: FontMetrics = { unitsPerEm, ascent, descent };
    if (font.id) {
      metrics.set(font.id, metric);
    }
    if (font.uid) {
      metrics.set(font.uid, metric);
    }
  });
  return metrics;
};

/** Returns `value`, or `fallback` (default 0) when `value` is `null`, `undefined` or `NaN`. */
export const valueOr = (value: number | null | undefined, fallback = 0): number => {
  if (value === null || value === undefined || Number.isNaN(value)) {
    return fallback;
  }
  return value;
};

/**
 * Shallow-clones a text element and copies its `textMatrix` array.
 * A `null` matrix is normalised to `undefined`.
 */
export const cloneTextElement = (element: PdfJsonTextElement): PdfJsonTextElement => ({
  ...element,
  textMatrix: element.textMatrix ? [...element.textMatrix] : element.textMatrix ?? undefined,
});

/** Clears `charCodes` on the element in place; called wherever the element's text is rewritten. */
const clearGlyphHints = (element: PdfJsonTextElement): void => {
  if (!element) {
    return;
  }
  element.charCodes = undefined;
};

/**
 * Shallow-clones an image element and copies its `transform` array.
 * A `null` transform is normalised to `undefined`.
 */
export const cloneImageElement = (element: PdfJsonImageElement): PdfJsonImageElement => ({
  ...element,
  transform: element.transform ? [...element.transform] : element.transform ?? undefined,
});

/**
 * Baseline used for layout: `textMatrix[5]` when the matrix has exactly six entries,
 * otherwise `y`; missing values become 0.
 * NOTE(review): `extractElementBaseline` accepts `length >= 6` while this requires `=== 6` — confirm intended.
 */
const getBaseline = (element: PdfJsonTextElement): number => {
  if (element.textMatrix && element.textMatrix.length === 6) {
    return valueOr(element.textMatrix[5]);
  }
  return valueOr(element.y);
};

/** Horizontal origin: `textMatrix[4]` when the matrix has exactly six entries, otherwise `x`. */
const getX = (element: PdfJsonTextElement): number => {
  if (element.textMatrix && element.textMatrix.length === 6) {
    return valueOr(element.textMatrix[4]);
  }
  return valueOr(element.x);
};

/**
 * Width of an element. Resolution order:
 * 1. the positive `width` recorded on the element;
 * 2. for whitespace-only text, the largest positive spacing value on the element;
 * 3. an estimate derived from font metrics (at least 0.5 em per glyph);
 * 4. `fontSize * glyphCount * 0.5`.
 */
const getWidth = (element: PdfJsonTextElement, metrics?: FontMetricsMap): number => {
  const width = valueOr(element.width, 0);
  if (width > 0) {
    return width;
  }

  const text = element.text ?? '';
  const glyphCount = Math.max(1, countGraphemes(text));
  const spacingFallback = Math.max(
    valueOr(element.spaceWidth, 0),
    valueOr(element.wordSpacing, 0),
    valueOr(element.characterSpacing, 0),
  );
  if (spacingFallback > 0 && text.trim().length === 0) {
    return spacingFallback;
  }

  const fontSize = getFontSize(element);
  const fontMetrics = metricsFor(metrics, element.fontId);
  if (fontMetrics) {
    const unitsPerEm = fontMetrics.unitsPerEm > 0 ? fontMetrics.unitsPerEm : 1000;
    const ascentUnits = fontMetrics.ascent ?? unitsPerEm * 0.8;
    const descentUnits = Math.abs(fontMetrics.descent ?? -(unitsPerEm * 0.2));
    const combinedUnits = Math.max(unitsPerEm * 0.8, ascentUnits + descentUnits);
    const averageAdvanceUnits = Math.max(unitsPerEm * 0.5, combinedUnits / Math.max(1, glyphCount));
    const fallbackWidth = (averageAdvanceUnits / unitsPerEm) * glyphCount * fontSize;
    if (fallbackWidth > 0) {
      return fallbackWidth;
    }
  }

  return fontSize * glyphCount * 0.5;
};

/** Effective font size: `fontMatrixSize`, else `fontSize`, else 12. */
const getFontSize = (element: PdfJsonTextElement): number =>
  valueOr(element.fontMatrixSize ?? element.fontSize, 12);

/**
 * Height of an element: the positive `height` on the element, else a value scaled from
 * font metrics (at least one em), else the font size.
 */
const getHeight = (element: PdfJsonTextElement, metrics?: FontMetricsMap): number => {
  const height = valueOr(element.height, 0);
  if (height > 0) {
    return height;
  }

  const fontSize = getFontSize(element);
  const fontMetrics = metricsFor(metrics, element.fontId);
  if (fontMetrics) {
    const unitsPerEm = fontMetrics.unitsPerEm > 0 ? fontMetrics.unitsPerEm : 1000;
    const ascentUnits = fontMetrics.ascent ?? unitsPerEm * 0.8;
    const descentUnits = Math.abs(fontMetrics.descent ?? -(unitsPerEm * 0.2));
    const totalUnits = Math.max(unitsPerEm, ascentUnits + descentUnits);
    if (totalUnits > 0) {
      return (totalUnits / unitsPerEm) * fontSize;
    }
  }

  return fontSize;
};

/**
 * Bounding box of a text element around its baseline.
 * Here `bottom` is `baseline + height * ascentRatio` and `top` is `baseline - height * descentRatio`,
 * so `bottom` is numerically the larger edge; `mergeBounds` relies on this (min top, max bottom).
 * NOTE(review): `getImageBounds` returns the opposite orientation (`top` larger) — confirm consumers expect that.
 */
const getElementBounds = (
  element: PdfJsonTextElement,
  metrics?: FontMetricsMap,
): BoundingBox => {
  const left = getX(element);
  const width = getWidth(element, metrics);
  const baseline = getBaseline(element);
  const height = getHeight(element, metrics);

  // Default split of the height above/below the baseline when no metrics are known.
  let ascentRatio = 0.8;
  let descentRatio = 0.2;
  const fontMetrics = metricsFor(metrics, element.fontId);
  if (fontMetrics) {
    const unitsPerEm = fontMetrics.unitsPerEm > 0 ? fontMetrics.unitsPerEm : 1000;
    const ascentUnits = fontMetrics.ascent ?? unitsPerEm * 0.8;
    const descentUnits = Math.abs(fontMetrics.descent ?? -(unitsPerEm * 0.2));
    const totalUnits = Math.max(unitsPerEm, ascentUnits + descentUnits);
    if (totalUnits > 0) {
      ascentRatio = ascentUnits / totalUnits;
      descentRatio = descentUnits / totalUnits;
    }
  }

  const bottom = baseline + height * ascentRatio;
  const top = baseline - height * descentRatio;
  return {
    left,
    right: left + width,
    top,
    bottom,
  };
};

/**
 * Bounding box of an image element.
 * `left`/`bottom` fall back to `x`/`y`; `right`/`top` fall back to `left + width` /
 * `bottom + height`, where width/height are themselves derived from `right`/`top` if absent.
 */
export const getImageBounds = (element: PdfJsonImageElement): BoundingBox => {
  const left = valueOr(element.left ?? element.x, 0);
  const computedWidth = valueOr(element.width, Math.max(valueOr(element.right, left) - left, 0));
  const right = valueOr(element.right ?? left + computedWidth, left + computedWidth);
  const bottom = valueOr(element.bottom ?? element.y, 0);
  const computedHeight = valueOr(element.height, Math.max(valueOr(element.top, bottom) - bottom, 0));
  const top = valueOr(element.top ?? bottom + computedHeight, bottom + computedHeight);
  return {
    left,
    right,
    bottom,
    top,
  };
};

/**
 * First positive value among `spaceWidth` and `wordSpacing`; otherwise `characterSpacing`
 * clamped to be non-negative.
 */
const getSpacingHint = (element: PdfJsonTextElement): number => {
  const spaceWidth = valueOr(element.spaceWidth, 0);
  if (spaceWidth > 0) {
    return spaceWidth;
  }
  const wordSpacing = valueOr(element.wordSpacing, 0);
  if (wordSpacing > 0) {
    return wordSpacing;
  }
  const characterSpacing = valueOr(element.characterSpacing, 0);
  return Math.max(characterSpacing, 0);
};

/** Element width clamped to `[MIN_CHAR_WIDTH_FACTOR, MAX_CHAR_WIDTH_FACTOR] * avgFontSize`. */
const estimateCharWidth = (
  element: PdfJsonTextElement,
  avgFontSize: number,
  metrics?: FontMetricsMap,
): number => {
  const rawWidth = getWidth(element, metrics);
  const minWidth = avgFontSize * MIN_CHAR_WIDTH_FACTOR;
  const maxWidth = avgFontSize * MAX_CHAR_WIDTH_FACTOR;
  return Math.min(Math.max(rawWidth, minWidth), maxWidth);
};

/**
 * Union of bounding boxes: min `left`/`top`, max `right`/`bottom`.
 * An empty list yields an all-zero box.
 */
const mergeBounds = (bounds: BoundingBox[]): BoundingBox => {
  if (bounds.length === 0) {
    return { left: 0, right: 0, top: 0, bottom: 0 };
  }
  return bounds.reduce(
    (acc, current) => ({
      left: Math.min(acc.left, current.left),
      right: Math.max(acc.right, current.right),
      top: Math.min(acc.top, current.top),
      bottom: Math.max(acc.bottom, current.bottom),
    }),
    { ...bounds[0] },
  );
};

/**
 * Decides whether a space should be inserted between two consecutive elements.
 * A space is inserted when either the trailing gap (current x minus previous right edge)
 * exceeds the spacing hint, or the inferred gap (x advance minus a clamped character width)
 * exceeds `EXTRA_GAP_RATIO` of that hint.
 */
const shouldInsertSpace = (
  prev: PdfJsonTextElement,
  current: PdfJsonTextElement,
  metrics?: FontMetricsMap,
): boolean => {
  const prevRight = getX(prev) + getWidth(prev, metrics);
  const trailingGap = Math.max(0, getX(current) - prevRight);
  const avgFontSize = (getFontSize(prev) + getFontSize(current)) / 2;
  const baselineAdvance = Math.max(0, getX(current) - getX(prev));
  const charWidthEstimate = estimateCharWidth(prev, avgFontSize, metrics);
  const inferredGap = Math.max(0, baselineAdvance - charWidthEstimate);
  const spacingHint = Math.max(
    SPACE_MIN_GAP,
    getSpacingHint(prev),
    getSpacingHint(current),
    avgFontSize * GAP_FACTOR,
  );

  if (trailingGap > spacingHint) {
    return true;
  }
  if (inferredGap > spacingHint * EXTRA_GAP_RATIO) {
    return true;
  }
  // Neither gap is large enough. (A previous special case for a trailing '-' returned
  // the same result as this fall-through and has been removed as dead code.)
  return false;
};

/**
 * Concatenates element texts, inserting a single space between neighbours when
 * `shouldInsertSpace` says so and the next text does not already start with whitespace.
 */
const buildGroupText = (elements: PdfJsonTextElement[], metrics?: FontMetricsMap): string => {
  let result = '';
  elements.forEach((element, index) => {
    const value = element.text ?? '';
    if (index === 0) {
      result += value;
      return;
    }
    const previous = elements[index - 1];
    const needsSpace = shouldInsertSpace(previous, element, metrics);
    const startsWithWhitespace = /^\s/u.test(value);
    if (needsSpace && !startsWithWhitespace) {
      result += ' ';
    }
    result += value;
  });
  return result;
};

/** Converts the first three components (clamped to 0..1) to `rgb(r, g, b)`; black if fewer than three. */
const rgbToCss = (components: number[]): string => {
  if (components.length >= 3) {
    const r = Math.round(Math.max(0, Math.min(1, components[0])) * 255);
    const g = Math.round(Math.max(0, Math.min(1, components[1])) * 255);
    const b = Math.round(Math.max(0, Math.min(1, components[2])) * 255);
    return `rgb(${r}, ${g}, ${b})`;
  }
  return 'rgb(0, 0, 0)';
};

/** Converts four CMYK components (clamped to 0..1) to `rgb(r, g, b)`; black if fewer than four. */
const cmykToCss = (components: number[]): string => {
  if (components.length >= 4) {
    const c = Math.max(0, Math.min(1, components[0]));
    const m = Math.max(0, Math.min(1, components[1]));
    const y = Math.max(0, Math.min(1, components[2]));
    const k = Math.max(0, Math.min(1, components[3]));
    const r = Math.round(255 * (1 - c) * (1 - k));
    const g = Math.round(255 * (1 - m) * (1 - k));
    const b = Math.round(255 * (1 - y) * (1 - k));
    return `rgb(${r}, ${g}, ${b})`;
  }
  return 'rgb(0, 0, 0)';
};

/** Converts a single gray component (clamped to 0..1) to `rgb(g, g, g)`; black if empty. */
const grayToCss = (components: number[]): string => {
  if (components.length >= 1) {
    const gray = Math.round(Math.max(0, Math.min(1, components[0])) * 255);
    return `rgb(${gray}, ${gray}, ${gray})`;
  }
  return 'rgb(0, 0, 0)';
};

/**
 * Derives a CSS colour from the element's fill colour, or `null` when there is none.
 * The colour space name is matched case-insensitively by substring; unknown spaces are
 * interpreted by component count (>= 3 as RGB, exactly 1 as gray).
 */
const extractColor = (element: PdfJsonTextElement): string | null => {
  const fillColor = element.fillColor;
  if (!fillColor || !fillColor.components || fillColor.components.length === 0) {
    return null;
  }

  const colorSpace = (fillColor.colorSpace ?? '').toLowerCase();
  if (colorSpace.includes('rgb') || colorSpace.includes('srgb')) {
    return rgbToCss(fillColor.components);
  }
  if (colorSpace.includes('cmyk')) {
    return cmykToCss(fillColor.components);
  }
  if (colorSpace.includes('gray') || colorSpace.includes('grey')) {
    return grayToCss(fillColor.components);
  }

  // Default to RGB interpretation
  if (fillColor.components.length >= 3) {
    return rgbToCss(fillColor.components);
  }
  if (fillColor.components.length === 1) {
    return grayToCss(fillColor.components);
  }
  return null;
};

const RAD_TO_DEG = 180 / Math.PI;

/** Maps an angle in degrees into the range (-180, 180]. */
const normalizeAngle = (angle: number): number => {
  let normalized = angle % 360;
  if (normalized > 180) {
    normalized -= 360;
  } else if (normalized <= -180) {
    normalized += 360;
  }
  return normalized;
};

/**
 * Rotation (degrees) derived from `atan2(matrix[1], matrix[0])` of a six-entry text matrix.
 * Returns `null` when there is no such matrix, when both entries are ~0, or when the
 * rotation is under half a degree.
 */
const extractElementRotation = (element: PdfJsonTextElement): number | null => {
  const matrix = element.textMatrix;
  if (!matrix || matrix.length !== 6) {
    return null;
  }
  const a = matrix[0];
  const b = matrix[1];
  if (Math.abs(a) < 1e-6 && Math.abs(b) < 1e-6) {
    return null;
  }
  const angle = Math.atan2(b, a) * RAD_TO_DEG;
  if (Math.abs(angle) < 0.5) {
    return null;
  }
  return normalizeAngle(angle);
};

/**
 * Mean rotation of the rotated elements in a group, averaged as unit vectors so that
 * angles near +/-180 do not cancel arithmetically. Returns `null` when no element is
 * rotated, the vectors cancel out, or the mean is under half a degree.
 */
const computeGroupRotation = (elements: PdfJsonTextElement[]): number | null => {
  const angles = elements
    .map(extractElementRotation)
    .filter((angle): angle is number => angle !== null);
  if (angles.length === 0) {
    return null;
  }

  const vector = angles.reduce(
    (acc, angle) => {
      const radians = (angle * Math.PI) / 180;
      acc.x += Math.cos(radians);
      acc.y += Math.sin(radians);
      return acc;
    },
    { x: 0, y: 0 },
  );
  if (Math.abs(vector.x) < 1e-6 && Math.abs(vector.y) < 1e-6) {
    return null;
  }

  const average = Math.atan2(vector.y, vector.x) * RAD_TO_DEG;
  const normalized = normalizeAngle(average);
  return Math.abs(normalized) < 0.5 ? null : normalized;
};

/** Origin of an element: `textMatrix[4..5]` for a six-entry matrix, otherwise `x`/`y`. */
const getAnchorPoint = (element: PdfJsonTextElement): { x: number; y: number } => {
  if (element.textMatrix && element.textMatrix.length === 6) {
    return {
      x: valueOr(element.textMatrix[4]),
      y: valueOr(element.textMatrix[5]),
    };
  }
  return {
    x: valueOr(element.x),
    y: valueOr(element.y),
  };
};

/** Sum of the widths of all elements. */
const computeBaselineLength = (
  elements: PdfJsonTextElement[],
  metrics?: FontMetricsMap,
): number => elements.reduce((acc, current) => acc + getWidth(current, metrics), 0);

/** Arithmetic mean of the element baselines, or `null` for an empty list. */
const computeAverageBaseline = (elements: PdfJsonTextElement[]): number | null => {
  if (elements.length === 0) {
    return null;
  }
  let sum = 0;
  elements.forEach((element) => {
    sum += getBaseline(element);
  });
  return sum / elements.length;
};

/**
 * Builds a single-line `TextGroup` from a run of elements.
 * Font and colour attributes come from the first element; `elements` and
 * `originalElements` are independent clones of the inputs.
 */
const createGroup = (
  pageIndex: number,
  idSuffix: number,
  elements: PdfJsonTextElement[],
  metrics?: FontMetricsMap,
): TextGroup => {
  const clones = elements.map(cloneTextElement);
  const originalClones = clones.map(cloneTextElement);
  const bounds = mergeBounds(elements.map((element) => getElementBounds(element, metrics)));
  const firstElement = elements[0];
  const rotation = computeGroupRotation(elements);
  const anchor = rotation !== null ? getAnchorPoint(firstElement) : null;
  const baselineLength = computeBaselineLength(elements, metrics);
  const baseline = computeAverageBaseline(elements);
  // Text and original text start out identical, so build the string once.
  const text = buildGroupText(elements, metrics);

  return {
    id: `${pageIndex}-${idSuffix}`,
    pageIndex,
    fontId: firstElement?.fontId,
    fontSize: firstElement?.fontSize,
    fontMatrixSize: firstElement?.fontMatrixSize,
    color: firstElement ? extractColor(firstElement) : null,
    fontWeight: null, // Will be determined from font descriptor
    rotation,
    anchor,
    baselineLength,
    baseline,
    elements: clones,
    originalElements: originalClones,
    text,
    originalText: text,
    bounds,
  };
};

/**
 * Merges consecutive line groups into paragraph groups.
 * Two neighbouring lines are merged when their left edges align (within 30% of the average
 * font size), they share a `fontId`, and their baselines are at most 3x the average font
 * size apart. Single-line paragraphs are returned as the original group object; merged
 * paragraphs keep the first line's id and join line texts with `\n`.
 */
const groupLinesIntoParagraphs = (
  lineGroups: TextGroup[],
  metrics?: FontMetricsMap,
): TextGroup[] => {
  if (lineGroups.length === 0) {
    return [];
  }

  const paragraphs: TextGroup[][] = [];
  let currentParagraph: TextGroup[] = [lineGroups[0]];

  for (let i = 1; i < lineGroups.length; i++) {
    const prevLine = lineGroups[i - 1];
    const currentLine = lineGroups[i];

    // Calculate line spacing
    const prevBaseline = prevLine.baseline ?? 0;
    const currentBaseline = currentLine.baseline ?? 0;
    const lineSpacing = Math.abs(prevBaseline - currentBaseline);

    // Calculate average font size
    const prevFontSize = prevLine.fontSize ?? 12;
    const currentFontSize = currentLine.fontSize ?? 12;
    const avgFontSize = (prevFontSize + currentFontSize) / 2;

    // Check horizontal alignment (left edge)
    const prevLeft = prevLine.bounds.left;
    const currentLeft = currentLine.bounds.left;
    const leftAlignmentTolerance = avgFontSize * 0.3;
    const isLeftAligned = Math.abs(prevLeft - currentLeft) <= leftAlignmentTolerance;

    // Check if fonts match
    const sameFont = prevLine.fontId === currentLine.fontId;

    // Check for consistent spacing rather than expected spacing
    // Line spacing in PDFs can range from 1.0x to 3.0x font size
    // We just want to ensure spacing is consistent between consecutive lines
    // and not excessively large (which would indicate a paragraph break)
    const maxReasonableSpacing = avgFontSize * 3.0; // Max ~3x font size for normal line spacing
    const hasReasonableSpacing = lineSpacing <= maxReasonableSpacing;

    // Merge into paragraph if:
    // 1. Left aligned
    // 2. Same font
    // 3. Reasonable line spacing (not a large gap indicating paragraph break)
    const shouldMerge = isLeftAligned && sameFont && hasReasonableSpacing;

    if (shouldMerge) {
      currentParagraph.push(currentLine);
    } else {
      paragraphs.push(currentParagraph);
      currentParagraph = [currentLine];
    }
  }

  // Don't forget the last paragraph
  if (currentParagraph.length > 0) {
    paragraphs.push(currentParagraph);
  }

  // Merge line groups into single paragraph groups
  return paragraphs.map((lines) => {
    if (lines.length === 1) {
      return lines[0];
    }

    // Combine all elements from all lines
    const allElements = lines.flatMap((line) => line.originalElements);
    const pageIndex = lines[0].pageIndex;
    // Remember how many elements each line contributed so the lines can be split again later.
    const lineElementCounts = lines.map((line) => line.originalElements.length);

    // Create merged group with newlines between lines
    const paragraphText = lines.map((line) => line.text).join('\n');
    const mergedBounds = mergeBounds(lines.map((line) => line.bounds));

    const spacingValues: number[] = [];
    for (let i = 1; i < lines.length; i++) {
      const prevBaseline = lines[i - 1].baseline ?? lines[i - 1].bounds.bottom;
      const currentBaseline = lines[i].baseline ?? lines[i].bounds.bottom;
      const spacing = Math.abs(prevBaseline - currentBaseline);
      if (spacing > 0) {
        spacingValues.push(spacing);
      }
    }
    const averageSpacing =
      spacingValues.length > 0
        ? spacingValues.reduce((sum, value) => sum + value, 0) / spacingValues.length
        : null;

    const firstElement = allElements[0];
    const rotation = computeGroupRotation(allElements);
    const anchor = rotation !== null ? getAnchorPoint(firstElement) : null;
    const baselineLength = computeBaselineLength(allElements, metrics);
    const baseline = computeAverageBaseline(allElements);

    return {
      id: lines[0].id, // Keep the first line's ID
      pageIndex,
      fontId: firstElement?.fontId,
      fontSize: firstElement?.fontSize,
      fontMatrixSize: firstElement?.fontMatrixSize,
      lineSpacing: averageSpacing,
      lineElementCounts: lines.length > 1 ? lineElementCounts : null,
      color: firstElement ? extractColor(firstElement) : null,
      fontWeight: null,
      rotation,
      anchor,
      baselineLength,
      baseline,
      elements: allElements.map(cloneTextElement),
      originalElements: allElements.map(cloneTextElement),
      text: paragraphText,
      originalText: paragraphText,
      bounds: mergedBounds,
    };
  });
};

/**
 * Groups the text elements of one page into editable text groups.
 *
 * Steps: clone elements that have text, cluster them into lines by baseline, sort each
 * line by x, split each line into runs at large horizontal gaps, then optionally merge
 * runs into paragraphs.
 *
 * @param page - Page to process; a missing page or one without text elements yields `[]`.
 * @param pageIndex - Index used in group ids and log messages.
 * @param metrics - Optional font metrics used for width/height estimates.
 * @param groupingMode - `'singleLine'` never merges, `'paragraph'` always merges, `'auto'`
 *   merges only when the page statistics look paragraph-like (and logs the analysis).
 */
export const groupPageTextElements = (
  page: PdfJsonPage | null | undefined,
  pageIndex: number,
  metrics?: FontMetricsMap,
  groupingMode: 'auto' | 'paragraph' | 'singleLine' = 'auto',
): TextGroup[] => {
  if (!page?.textElements || page.textElements.length === 0) {
    return [];
  }

  const pageWidth = valueOr(page.width, DEFAULT_PAGE_WIDTH);

  const elements = page.textElements
    .map(cloneTextElement)
    .filter((element) => element.text !== null && element.text !== undefined);

  // Descending baseline order, so the element with the largest baseline comes first.
  elements.sort((a, b) => getBaseline(b) - getBaseline(a));

  // Cluster elements into lines: an element joins the first line whose baseline is within tolerance.
  const lines: { baseline: number; elements: PdfJsonTextElement[] }[] = [];
  elements.forEach((element) => {
    const baseline = getBaseline(element);
    const fontSize = getFontSize(element);
    const tolerance = Math.max(LINE_TOLERANCE, fontSize * 0.12);
    const existingLine = lines.find((line) => Math.abs(line.baseline - baseline) <= tolerance);
    if (existingLine) {
      existingLine.elements.push(element);
    } else {
      lines.push({ baseline, elements: [element] });
    }
  });

  lines.forEach((line) => {
    line.elements.sort((a, b) => getX(a) - getX(b));
  });

  let groupCounter = 0;
  const lineGroups: TextGroup[] = [];

  lines.forEach((line) => {
    let currentBucket: PdfJsonTextElement[] = [];

    line.elements.forEach((element) => {
      if (currentBucket.length === 0) {
        currentBucket.push(element);
        return;
      }

      const previous = currentBucket[currentBucket.length - 1];
      const gap = getX(element) - (getX(previous) + getWidth(previous, metrics));
      const avgFontSize = (getFontSize(previous) + getFontSize(element)) / 2;
      const splitThreshold = Math.max(SPACE_MIN_GAP, avgFontSize * GAP_FACTOR);
      const sameFont = previous.fontId === element.fontId;
      // Same-font neighbours tolerate a 40% larger gap before being split.
      let shouldSplit = gap > splitThreshold * (sameFont ? 1.4 : 1.0);

      if (shouldSplit) {
        // Re-check using the straight-line distance from the previous element's end to
        // the current element's start; keep them together when that distance is small.
        const prevBaseline = getBaseline(previous);
        const currentBaseline = getBaseline(element);
        const baselineDelta = Math.abs(prevBaseline - currentBaseline);
        const prevEndX = getX(previous) + getWidth(previous, metrics);
        const diagonalGap = Math.hypot(Math.max(0, getX(element) - prevEndX), baselineDelta);
        const diagonalThreshold = Math.max(avgFontSize * 0.8, splitThreshold);
        if (diagonalGap <= diagonalThreshold) {
          shouldSplit = false;
        }
      }

      // Two rotated elements with (nearly) the same rotation are never split.
      const previousRotation = extractElementRotation(previous);
      const currentRotation = extractElementRotation(element);
      if (
        shouldSplit &&
        previousRotation !== null &&
        currentRotation !== null &&
        Math.abs(normalizeAngle(previousRotation - currentRotation)) < 1
      ) {
        shouldSplit = false;
      }

      if (shouldSplit) {
        lineGroups.push(createGroup(pageIndex, groupCounter, currentBucket, metrics));
        groupCounter += 1;
        currentBucket = [element];
      } else {
        currentBucket.push(element);
      }
    });

    if (currentBucket.length > 0) {
      lineGroups.push(createGroup(pageIndex, groupCounter, currentBucket, metrics));
      groupCounter += 1;
    }
  });

  // Apply paragraph grouping based on mode
  if (groupingMode === 'singleLine') {
    // Single line mode: skip paragraph grouping
    return lineGroups;
  }

  if (groupingMode === 'paragraph') {
    // Paragraph mode: always apply grouping
    return groupLinesIntoParagraphs(lineGroups, metrics);
  }

  // Auto mode: use heuristic to determine if we should group
  // Analyze the page content to decide
  let multiLineGroups = 0;
  let totalWords = 0;
  let longTextGroups = 0;
  let totalGroups = 0;
  const wordCounts: number[] = [];
  let fullWidthLines = 0;

  // Define "full width" as extending to at least 70% of page width
  const fullWidthThreshold = pageWidth * 0.7;

  lineGroups.forEach((group) => {
    const text = (group.text || '').trim();
    if (text.length === 0) return;

    totalGroups++;
    const textLines = text.split('\n');
    const lineCount = textLines.length;
    const wordCount = text.split(/\s+/).filter((w) => w.length > 0).length;

    totalWords += wordCount;
    wordCounts.push(wordCount);

    if (lineCount > 1) {
      multiLineGroups++;
    }

    if (wordCount >= 10 || text.length >= 50) {
      longTextGroups++;
    }

    // Check if this line extends close to the right margin (paragraph-like)
    const rightEdge = group.bounds.right;
    if (rightEdge >= fullWidthThreshold) {
      fullWidthLines++;
    }
  });

  if (totalGroups === 0) {
    return lineGroups;
  }

  const avgWordsPerGroup = totalWords / totalGroups;
  const longTextRatio = longTextGroups / totalGroups;
  const fullWidthRatio = fullWidthLines / totalGroups;

  // Calculate variance in line lengths (paragraphs have varying lengths, lists are uniform)
  const variance =
    wordCounts.reduce((sum, count) => {
      const diff = count - avgWordsPerGroup;
      return sum + diff * diff;
    }, 0) / totalGroups;
  const stdDev = Math.sqrt(variance);
  const coefficientOfVariation = avgWordsPerGroup > 0 ? stdDev / avgWordsPerGroup : 0;

  // Check each criterion
  // NOTE(review): the groups analysed here are single-line runs whose text only contains
  // '\n' if an element's own text does, so criterion 1 may rarely pass — confirm intended.
  const criterion1 = multiLineGroups >= 2 && avgWordsPerGroup > 8;
  const criterion2 = avgWordsPerGroup > 5;
  const criterion3 = longTextRatio > 0.4;
  const criterion4 = coefficientOfVariation > 0.5 || fullWidthRatio > 0.6; // High variance OR many full-width lines = paragraph text

  const isParagraphPage = criterion1 && criterion2 && criterion3 && criterion4;

  // Log detection stats
  console.log(`📄 Page ${pageIndex} Grouping Analysis (mode: ${groupingMode}):`);
  console.log(` Stats:`);
  console.log(` • Page width: ${pageWidth.toFixed(1)}pt (full-width threshold: ${fullWidthThreshold.toFixed(1)}pt)`);
  console.log(` • Multi-line groups: ${multiLineGroups}`);
  console.log(` • Total groups: ${totalGroups}`);
  console.log(` • Total words: ${totalWords}`);
  console.log(` • Long text groups (≥10 words or ≥50 chars): ${longTextGroups}`);
  console.log(` • Full-width lines (≥70% page width): ${fullWidthLines}`);
  console.log(` • Avg words per group: ${avgWordsPerGroup.toFixed(2)}`);
  console.log(` • Long text ratio: ${(longTextRatio * 100).toFixed(1)}%`);
  console.log(` • Full-width ratio: ${(fullWidthRatio * 100).toFixed(1)}%`);
  console.log(` • Std deviation: ${stdDev.toFixed(2)}`);
  console.log(` • Coefficient of variation: ${coefficientOfVariation.toFixed(2)}`);
  console.log(` Criteria:`);
  console.log(` 1. Multi-line + Avg Words: ${criterion1 ? '✅ PASS' : '❌ FAIL'}`);
  console.log(` (${multiLineGroups} >= 2 AND ${avgWordsPerGroup.toFixed(2)} > 8)`);
  console.log(` 2. Avg Words Only: ${criterion2 ? '✅ PASS' : '❌ FAIL'}`);
  console.log(` (${avgWordsPerGroup.toFixed(2)} > 5)`);
  console.log(` 3. Long Text Ratio: ${criterion3 ? '✅ PASS' : '❌ FAIL'}`);
  console.log(` (${(longTextRatio * 100).toFixed(1)}% > 40%)`);
  console.log(` 4. Line Width Pattern: ${criterion4 ? '✅ PASS' : '❌ FAIL'}`);
  console.log(` (CV ${coefficientOfVariation.toFixed(2)} > 0.5 OR ${(fullWidthRatio * 100).toFixed(1)}% > 60%)`);
  console.log(` ${coefficientOfVariation > 0.5 ? '✓ High variance (varying line lengths)' : '✗ Low variance'} ${fullWidthRatio > 0.6 ? '✓ Many full-width lines (paragraph-like)' : '✗ Few full-width lines (list-like)'}`);
  console.log(` Decision: ${isParagraphPage ? '📝 PARAGRAPH MODE' : '📋 LINE MODE'}`);
  if (isParagraphPage) {
    console.log(` Reason: All criteria passed (AND logic)`);
  } else {
    const failedReasons: string[] = [];
    if (!criterion1) failedReasons.push('insufficient multi-line groups or word density');
    if (!criterion2) failedReasons.push('low average words per group');
    if (!criterion3) failedReasons.push('low ratio of long text groups');
    if (!criterion4) failedReasons.push('low variance and few full-width lines (list-like structure)');
    console.log(` Reason: ${failedReasons.join(', ')}`);
  }
  console.log('');

  // Only apply paragraph grouping if it looks like a paragraph-heavy page
  if (isParagraphPage) {
    console.log(`🔀 Applying paragraph grouping to page ${pageIndex}`);
    return groupLinesIntoParagraphs(lineGroups, metrics);
  }

  // For sparse pages, keep lines separate
  console.log(`📋 Keeping lines separate for page ${pageIndex}`);
  return lineGroups;
};

/** Groups the text of every page in the document; returns one group list per page. */
export const groupDocumentText = (
  document: PdfJsonDocument | null | undefined,
  groupingMode: 'auto' | 'paragraph' | 'singleLine' = 'auto',
): TextGroup[][] => {
  const pages = document?.pages ?? [];
  const metrics = buildFontMetrics(document);
  return pages.map((page, index) => groupPageTextElements(page, index, metrics, groupingMode));
};

/**
 * Clones the images of a page, assigning `page-<pageIndex>-image-<index>` as the id of
 * any image whose id is missing or blank.
 */
export const extractPageImages = (
  page: PdfJsonPage | null | undefined,
  pageIndex: number,
): PdfJsonImageElement[] => {
  const images = page?.imageElements ?? [];
  return images.map((image, imageIndex) => {
    const clone = cloneImageElement(image);
    if (!clone.id || clone.id.trim().length === 0) {
      clone.id = `page-${pageIndex}-image-${imageIndex}`;
    }
    return clone;
  });
};

/** Extracts images for every page in the document; returns one image list per page. */
export const extractDocumentImages = (
  document: PdfJsonDocument | null | undefined,
): PdfJsonImageElement[][] => {
  const pages = document?.pages ?? [];
  return pages.map((page, index) => extractPageImages(page, index));
};

/** Deep-clones a document with `structuredClone` when available, else via a JSON round-trip. */
export const deepCloneDocument = (document: PdfJsonDocument): PdfJsonDocument => {
  if (typeof structuredClone === 'function') {
    return structuredClone(document);
  }
  return JSON.parse(JSON.stringify(document));
};

/** Page width/height, falling back to `DEFAULT_PAGE_WIDTH` / `DEFAULT_PAGE_HEIGHT`. */
export const pageDimensions = (page: PdfJsonPage | null | undefined): { width: number; height: number } => {
  return {
    width: valueOr(page?.width, DEFAULT_PAGE_WIDTH),
    height: valueOr(page?.height, DEFAULT_PAGE_HEIGHT),
  };
};

/**
 * Produces one element carrying the group's whole text (line breaks removed), based on a
 * clone of the group's first original element with its glyph hints cleared.
 */
export const createMergedElement = (group: TextGroup): PdfJsonTextElement => {
  const reference = group.originalElements[0];
  const merged = cloneTextElement(reference);
  merged.text = sanitizeParagraphText(group.text);
  clearGlyphHints(merged);
  if (reference.textMatrix && reference.textMatrix.length === 6) {
    merged.textMatrix = [...reference.textMatrix];
  }
  return merged;
};

/**
 * Distributes `text` (line breaks removed) across `elements` in place.
 * Each element receives up to as many code points as its current text holds (at least one),
 * while leaving at least one for every later element when enough remain; the last element
 * takes whatever is left. Glyph hints are cleared on every element.
 *
 * @returns Always `true` in the current implementation.
 */
const distributeTextAcrossElements = (text: string | undefined, elements: PdfJsonTextElement[]): boolean => {
  if (elements.length === 0) {
    return true;
  }

  const normalizedText = sanitizeParagraphText(text);
  const targetChars = Array.from(normalizedText);
  if (targetChars.length === 0) {
    elements.forEach((element) => {
      element.text = '';
      clearGlyphHints(element);
    });
    return true;
  }

  const capacities = elements.map((element) => {
    const originalText = element.text ?? '';
    const graphemeCount = Array.from(originalText).length;
    return graphemeCount > 0 ? graphemeCount : 1;
  });

  let cursor = 0;
  elements.forEach((element, index) => {
    const remaining = targetChars.length - cursor;
    let sliceLength = 0;
    if (remaining > 0) {
      if (index === elements.length - 1) {
        sliceLength = remaining;
      } else {
        const capacity = Math.max(capacities[index], 1);
        const minRemainingForRest = Math.max(elements.length - index - 1, 0);
        sliceLength = Math.min(capacity, Math.max(remaining - minRemainingForRest, 1));
      }
    }
    element.text = sliceLength > 0 ? targetChars.slice(cursor, cursor + sliceLength).join('') : '';
    clearGlyphHints(element);
    cursor += sliceLength;
  });

  elements.forEach((element) => {
    if (element.text == null) {
      element.text = '';
    }
  });

  return true;
};

/**
 * Splits `group.originalElements` back into per-line slices using `lineElementCounts`.
 * Without counts, all elements form a single line (or no lines when there are no elements).
 * Non-positive counts and empty slices are skipped.
 */
const sliceElementsByLineCounts = (group: TextGroup): PdfJsonTextElement[][] => {
  const counts = group.lineElementCounts;
  if (!counts || counts.length === 0) {
    if (!group.originalElements.length) {
      return [];
    }
    return [group.originalElements];
  }

  const result: PdfJsonTextElement[][] = [];
  let cursor = 0;
  counts.forEach((count) => {
    if (count <= 0) {
      return;
    }
    const slice = group.originalElements.slice(cursor, cursor + count);
    if (slice.length > 0) {
      result.push(slice);
    }
    cursor += count;
  });
  return result;
};

/**
 * Rebuilds the elements of a multi-line group from its edited text.
 * Each text line reuses the elements of the original line at the same index; lines beyond
 * the original count reuse the last original line, shifted by a whole number of line
 * spacings in the direction the original baselines progress.
 *
 * @returns `null` when the text has no line break or no template elements are available.
 */
const rebuildParagraphLineElements = (group: TextGroup): PdfJsonTextElement[] | null => {
  if (!group.text || !group.text.includes('\n')) {
    return null;
  }

  const lineTexts = splitParagraphIntoLines(group.text);
  if (lineTexts.length === 0) {
    return [];
  }

  const lineElementGroups = sliceElementsByLineCounts(group);
  if (!lineElementGroups.length) {
    return null;
  }

  // Baseline of each original line: first element with a usable baseline, else the group's.
  const lineBaselines = lineElementGroups.map((elements) => {
    for (const element of elements) {
      const baseline = extractElementBaseline(element);
      if (baseline !== null) {
        return baseline;
      }
    }
    return group.baseline ?? null;
  });

  // First non-zero distance between consecutive original baselines, if any.
  const spacingFromBaselines = (() => {
    for (let i = 1; i < lineBaselines.length; i += 1) {
      const prev = lineBaselines[i - 1];
      const current = lineBaselines[i];
      if (prev !== null && current !== null) {
        const diff = Math.abs(prev - current);
        if (diff > 0) {
          return diff;
        }
      }
    }
    return null;
  })();

  const spacing =
    (group.lineSpacing && group.lineSpacing > 0 ? group.lineSpacing : spacingFromBaselines) ??
    Math.max(group.fontMatrixSize ?? group.fontSize ?? 12, 6) * 1.2;

  // Sign of the baseline step between lines; defaults to decreasing (-1).
  let direction = -1;
  for (let i = 1; i < lineBaselines.length; i += 1) {
    const prev = lineBaselines[i - 1];
    const current = lineBaselines[i];
    if (prev !== null && current !== null && Math.abs(prev - current) > 0.05) {
      direction = current < prev ? -1 : 1;
      break;
    }
  }

  const templateCount = lineElementGroups.length;
  const lastTemplateIndex = Math.max(templateCount - 1, 0);
  const rebuilt: PdfJsonTextElement[] = [];

  for (let index = 0; index < lineTexts.length; index += 1) {
    const templateIndex = Math.min(index, lastTemplateIndex);
    const templateElements = lineElementGroups[templateIndex];
    if (!templateElements || templateElements.length === 0) {
      return null;
    }

    // Lines past the last template are offset by the number of extra lines.
    const shiftSteps = index - templateIndex;
    const delta = shiftSteps * spacing * direction;
    const clones = shiftElementsBy(templateElements, delta);
    const normalizedLine = sanitizeParagraphText(lineTexts[index]);
    const distributed = distributeTextAcrossElements(normalizedLine, clones);

    if (!distributed) {
      // Fallback: put the whole line on the first element and blank the rest.
      const primary = clones[0];
      primary.text = normalizedLine;
      clearGlyphHints(primary);
      for (let i = 1; i < clones.length; i += 1) {
        clones[i].text = '';
        clearGlyphHints(clones[i]);
      }
    }

    rebuilt.push(...clones);
  }

  return rebuilt;
};

/**
 * Builds an updated copy of `source` from the edited groups and images.
 * Unchanged groups contribute clones of their original elements; changed groups contribute
 * a single merged element. Pages without groups keep their existing text elements and only
 * have their images replaced.
 */
export const buildUpdatedDocument = (
  source: PdfJsonDocument,
  groupsByPage: TextGroup[][],
  imagesByPage: PdfJsonImageElement[][],
): PdfJsonDocument => {
  const updated = deepCloneDocument(source);
  const pages = updated.pages ?? [];

  updated.pages = pages.map((page, pageIndex) => {
    const groups = groupsByPage[pageIndex] ?? [];
    const images = imagesByPage[pageIndex] ?? [];

    if (!groups.length) {
      return {
        ...page,
        imageElements: images.map(cloneImageElement),
      };
    }

    const updatedElements: PdfJsonTextElement[] = groups.flatMap((group) => {
      if (group.text === group.originalText) {
        return group.originalElements.map(cloneTextElement);
      }
      return [createMergedElement(group)];
    });

    return {
      ...page,
      textElements: updatedElements,
      imageElements: images.map(cloneImageElement),
      contentStreams: page.contentStreams ?? [],
    };
  });

  return updated;
};

/**
 * Builds an updated copy of `source`, preserving per-glyph elements where possible.
 *
 * For each changed group, in order: rebuild per-line elements when the text contains line
 * breaks; otherwise emit one merged element when `forceMergedGroups` is set or the glyph
 * count changed; otherwise redistribute the new text across clones of the original elements.
 * Unchanged groups contribute clones of their original elements.
 *
 * @param originalImagesByPage - Currently unused; retained for signature compatibility.
 * @param forceMergedGroups - Emit a single merged element for changed single-line groups.
 */
export const restoreGlyphElements = (
  source: PdfJsonDocument,
  groupsByPage: TextGroup[][],
  imagesByPage: PdfJsonImageElement[][],
  originalImagesByPage: PdfJsonImageElement[][],
  forceMergedGroups: boolean = false,
): PdfJsonDocument => {
  const updated = deepCloneDocument(source);
  const pages = updated.pages ?? [];

  updated.pages = pages.map((page, pageIndex) => {
    const groups = groupsByPage[pageIndex] ?? [];
    const images = imagesByPage[pageIndex] ?? [];

    if (!groups.length) {
      return {
        ...page,
        imageElements: images.map(cloneImageElement),
      };
    }

    const rebuiltElements: PdfJsonTextElement[] = [];

    groups.forEach((group) => {
      if (group.text !== group.originalText) {
        // Always try to rebuild paragraph lines if text has newlines
        const paragraphElements = rebuildParagraphLineElements(group);
        if (paragraphElements && paragraphElements.length > 0) {
          rebuiltElements.push(...paragraphElements);
          return;
        }

        // If no newlines or rebuilding failed, check if we should force merge
        if (forceMergedGroups) {
          rebuiltElements.push(createMergedElement(group));
          return;
        }

        const originalGlyphCount = group.originalElements.reduce(
          (sum, element) => sum + countGraphemes(element.text ?? ''),
          0,
        );
        const normalizedText = sanitizeParagraphText(group.text);
        const targetGlyphCount = countGraphemes(normalizedText);

        if (targetGlyphCount !== originalGlyphCount) {
          rebuiltElements.push(createMergedElement(group));
          return;
        }

        const originals = group.originalElements.map(cloneTextElement);
        const distributed = distributeTextAcrossElements(normalizedText, originals);
        if (distributed) {
          rebuiltElements.push(...originals);
        } else {
          rebuiltElements.push(createMergedElement(group));
        }
        return;
      }

      rebuiltElements.push(...group.originalElements.map(cloneTextElement));
    });

    return {
      ...page,
      textElements: rebuiltElements,
      imageElements: images.map(cloneImageElement),
      contentStreams: page.contentStreams ?? [],
    };
  });

  return updated;
};

/**
 * Compares two numbers within `tolerance`. Non-numeric or non-finite inputs are treated
 * as 0, so a missing value equals 0 within the tolerance.
 */
const approxEqual = (a: number | null | undefined, b: number | null | undefined, tolerance = 0.25): boolean => {
  const first = typeof a === 'number' && Number.isFinite(a) ? a : 0;
  const second = typeof b === 'number' && Number.isFinite(b) ? b : 0;
  return Math.abs(first - second) <= tolerance;
};

/**
 * Element-wise `approxEqual` for arrays. Two missing arrays are equal; one missing array
 * or differing lengths are not.
 */
const arrayApproxEqual = (
  first: number[] | null | undefined,
  second: number[] | null | undefined,
  tolerance = 0.25,
): boolean => {
  if (!first && !second) {
    return true;
  }
  if (!first || !second) {
    return false;
  }
  if (first.length !== second.length) {
    return false;
  }
  for (let index = 0; index < first.length; index += 1) {
    if (!approxEqual(first[index], second[index], tolerance)) {
      return false;
    }
  }
  return true;
};

/**
 * Compares two image elements: image data, format and z-order must match exactly;
 * geometry and the transform are compared with `approxEqual`.
 */
const areImageElementsEqual = (
  current: PdfJsonImageElement,
  original: PdfJsonImageElement,
): boolean => {
  if (current === original) {
    return true;
  }
  if (!current || !original) {
    return false;
  }

  const sameData = (current.imageData ?? null) === (original.imageData ?? null);
  const sameFormat = (current.imageFormat ?? null) === (original.imageFormat ?? null);

  return (
    sameData &&
    sameFormat &&
    approxEqual(current.x, original.x) &&
    approxEqual(current.y, original.y) &&
    approxEqual(current.width, original.width) &&
    approxEqual(current.height, original.height) &&
    approxEqual(current.left, original.left) &&
    approxEqual(current.right, original.right) &&
    approxEqual(current.top, original.top) &&
    approxEqual(current.bottom, original.bottom) &&
    (current.zOrder ?? null) === (original.zOrder ?? null) &&
    arrayApproxEqual(current.transform, original.transform)
  );
};

/** Returns `true` when the lists differ in length or any pair at the same index differs. */
export const areImageListsDifferent = (
  current: PdfJsonImageElement[],
  original: PdfJsonImageElement[],
): boolean => {
  if (current.length !== original.length) {
    return true;
  }
  for (let index = 0; index < current.length; index += 1) {
    if (!areImageElementsEqual(current[index], original[index])) {
      return true;
    }
  }
  return false;
};

/**
 * Computes a dirty flag per page (indexed like `groupsByPage`).
 * A page is dirty when any group's text differs from its original text, the number of
 * groups changed, or its image list differs. Text or group-count changes are logged.
 */
export const getDirtyPages = (
  groupsByPage: TextGroup[][],
  imagesByPage: PdfJsonImageElement[][],
  originalGroupsByPage: TextGroup[][],
  originalImagesByPage: PdfJsonImageElement[][],
): boolean[] => {
  return groupsByPage.map((groups, index) => {
    // Check if any text was modified
    const textDirty = groups.some((group) => group.text !== group.originalText);

    // Check if any groups were deleted by comparing with original groups
    const originalGroups = originalGroupsByPage[index] ?? [];
    const groupCountChanged = groups.length !== originalGroups.length;

    const imageDirty = areImageListsDifferent(
      imagesByPage[index] ?? [],
      originalImagesByPage[index] ?? [],
    );

    const isDirty = textDirty || groupCountChanged || imageDirty;

    if (groupCountChanged || textDirty) {
      console.log(`📄 Page ${index} dirty check:`, {
        textDirty,
        groupCountChanged,
        originalGroupsLength: originalGroups.length,
        currentGroupsLength: groups.length,
        imageDirty,
        isDirty,
      });
    }

    return isDirty;
  });
};