From 2d8113b3f64e7ac873faef2657035c20a7ef51c9 Mon Sep 17 00:00:00 2001 From: Anthony Stirling <77850077+Frooodle@users.noreply.github.com> Date: Tue, 11 Nov 2025 12:09:40 +0000 Subject: [PATCH] auto paragraph mode --- .../public/locales/en-GB/translation.json | 19 ++- .../tools/pdfJsonEditor/PdfJsonEditorView.tsx | 141 +++++++++++++++++- .../tools/pdfJsonEditor/PdfJsonEditor.tsx | 25 +++- .../tools/pdfJsonEditor/pdfJsonEditorTypes.ts | 2 + .../tools/pdfJsonEditor/pdfJsonEditorUtils.ts | 62 +++++++- 5 files changed, 233 insertions(+), 16 deletions(-) diff --git a/frontend/public/locales/en-GB/translation.json b/frontend/public/locales/en-GB/translation.json index 5e0bc62d1..86e105329 100644 --- a/frontend/public/locales/en-GB/translation.json +++ b/frontend/public/locales/en-GB/translation.json @@ -4495,19 +4495,25 @@ "title": "Auto-scale text to fit boxes", "description": "Automatically scales text horizontally to fit within its original bounding box when font rendering differs from PDF." }, + "groupingMode": { + "title": "Text Grouping Mode", + "autoDescription": "Automatically detects page type and groups text appropriately.", + "paragraphDescription": "Groups aligned lines into multi-line paragraph text boxes.", + "singleLineDescription": "Keeps each PDF text line as a separate text box." + }, "forceSingleElement": { "title": "Lock edited text to a single PDF element", "description": "When enabled, the editor exports each edited text box as one PDF text element to avoid overlapping glyphs or mixed fonts." - }, - "textGroupingMode": { - "title": "Text grouping mode", - "description": "Paragraph mode merges aligned lines into one textbox; single-line mode keeps every PDF line separate. Auto picks the best option per page." 
} }, - "grouping": { + "pageType": { + "paragraph": "Paragraph page", + "sparse": "Sparse text" + }, + "groupingMode": { "auto": "Auto", "paragraph": "Paragraph", - "single": "Single Line" + "singleLine": "Single Line" }, "disclaimer": { "heading": "Preview limitations", @@ -4521,6 +4527,7 @@ "loading": "Loading", "normalizing": "Normalizing", "parsing": "Parsing", + "processing": "Processing", "fonts": "Fonts", "text": "Text Extraction", "images": "Images", diff --git a/frontend/src/proprietary/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx b/frontend/src/proprietary/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx index 869d6229d..6b2aee7d5 100644 --- a/frontend/src/proprietary/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx +++ b/frontend/src/proprietary/components/tools/pdfJsonEditor/PdfJsonEditorView.tsx @@ -13,6 +13,7 @@ import { Pagination, Progress, ScrollArea, + SegmentedControl, Stack, Switch, Text, @@ -202,6 +203,95 @@ const buildFontLookupKeys = ( return Array.from(new Set(keys.filter((value) => value && value.length > 0))); }; +/** + * Analyzes text groups on a page to determine if it's paragraph-heavy or sparse. + * Returns true if the page appears to be document-like with substantial text content. 
+ */ +const analyzePageContentType = (groups: TextGroup[]): boolean => { + if (groups.length === 0) return false; + + let multiLineGroups = 0; + let totalWords = 0; + let longTextGroups = 0; + let totalGroups = 0; + const groupDetails: Array<{ + id: string; + lines: number; + words: number; + chars: number; + text: string; + }> = []; + + groups.forEach((group) => { + const text = (group.text || '').trim(); + if (text.length === 0) return; + + totalGroups++; + const lines = text.split('\n'); + const lineCount = lines.length; + const wordCount = text.split(/\s+/).filter((w) => w.length > 0).length; + + totalWords += wordCount; + + // Count multi-line paragraphs + if (lineCount > 1) { + multiLineGroups++; + } + + // Count text groups with substantial content (more than a few words) + if (wordCount >= 5 || text.length >= 30) { + longTextGroups++; + } + + groupDetails.push({ + id: group.id, + lines: lineCount, + words: wordCount, + chars: text.length, + text: text.substring(0, 50) + (text.length > 50 ? '...' : ''), + }); + }); + + if (totalGroups === 0) return false; + + // Heuristics for paragraph mode: + // 1. Has multiple substantial multi-line groups (2+) AND decent average words + // 2. Average words per group > 12 (strong indicator of document text) + // 3. 
More than 40% of groups have substantial text (typical of documents) + const avgWordsPerGroup = totalWords / totalGroups; + const longTextRatio = longTextGroups / totalGroups; + + const isParagraphPage = + (multiLineGroups >= 2 && avgWordsPerGroup > 8) || + avgWordsPerGroup > 12 || + longTextRatio > 0.4; + + // Log detailed statistics + console.group(`📊 Page Content Analysis`); + console.log('📄 Overall Statistics:'); + console.log(` Total text groups: ${totalGroups}`); + console.log(` Total words: ${totalWords}`); + console.log(` Average words per group: ${avgWordsPerGroup.toFixed(2)}`); + console.log(` Multi-line groups: ${multiLineGroups}`); + console.log(` Long text groups (≥5 words or ≥30 chars): ${longTextGroups}`); + console.log(` Long text ratio: ${(longTextRatio * 100).toFixed(1)}%`); + console.log(''); + console.log('🔍 Detection Criteria:'); + console.log(` ✓ Multi-line groups ≥ 2 AND avg words > 8? ${multiLineGroups >= 2 && avgWordsPerGroup > 8 ? '✅ YES' : '❌ NO'} (multi-line: ${multiLineGroups}, avg: ${avgWordsPerGroup.toFixed(2)})`); + console.log(` ✓ Avg words/group > 12? ${avgWordsPerGroup > 12 ? '✅ YES' : '❌ NO'} (current: ${avgWordsPerGroup.toFixed(2)})`); + console.log(` ✓ Long text ratio > 40%? ${longTextRatio > 0.4 ? '✅ YES' : '❌ NO'} (current: ${(longTextRatio * 100).toFixed(1)}%)`); + console.log(''); + console.log(`📋 Result: ${isParagraphPage ? 
'📝 PARAGRAPH PAGE' : '📄 SPARSE PAGE'}`); + console.log(''); + console.log('📦 Individual Groups:'); + console.table(groupDetails); + console.groupEnd(); + + return isParagraphPage; +}; + +type GroupingMode = 'auto' | 'paragraph' | 'singleLine'; + const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { const { t } = useTranslation(); const [activeGroupId, setActiveGroupId] = useState(null); @@ -232,6 +322,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { conversionProgress, hasChanges, forceSingleTextElement, + groupingMode: externalGroupingMode, requestPagePreview, onLoadJson, onSelectPage, @@ -243,6 +334,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { onDownloadJson, onGeneratePdf, onForceSingleTextElementChange, + onGroupingModeChange, } = data; const syncEditorValue = useCallback( @@ -430,6 +522,9 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { const pageImages = imagesByPage[selectedPage] ?? []; const pagePreview = pagePreviews.get(selectedPage); + // Detect if current page contains paragraph-heavy content + const isParagraphPage = useMemo(() => analyzePageContentType(pageGroups), [pageGroups]); + const extractPreferredFontId = useCallback((target?: TextGroup | null) => { if (!target) { return undefined; @@ -981,6 +1076,50 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { /> + + + + {t('pdfJsonEditor.options.groupingMode.title', 'Text Grouping Mode')} + + {externalGroupingMode === 'auto' && isParagraphPage && ( + + {t('pdfJsonEditor.pageType.paragraph', 'Paragraph page')} + + )} + {externalGroupingMode === 'auto' && !isParagraphPage && hasDocument && ( + + {t('pdfJsonEditor.pageType.sparse', 'Sparse text')} + + )} + + + {externalGroupingMode === 'auto' + ? t( + 'pdfJsonEditor.options.groupingMode.autoDescription', + 'Automatically detects page type and groups text appropriately.' + ) + : externalGroupingMode === 'paragraph' + ? 
t( + 'pdfJsonEditor.options.groupingMode.paragraphDescription', + 'Groups aligned lines into multi-line paragraph text boxes.' + ) + : t( + 'pdfJsonEditor.options.groupingMode.singleLineDescription', + 'Keeps each PDF text line as a separate text box.' + )} + + onGroupingModeChange(value as GroupingMode)} + data={[ + { label: t('pdfJsonEditor.groupingMode.auto', 'Auto'), value: 'auto' }, + { label: t('pdfJsonEditor.groupingMode.paragraph', 'Paragraph'), value: 'paragraph' }, + { label: t('pdfJsonEditor.groupingMode.singleLine', 'Single Line'), value: 'singleLine' }, + ]} + fullWidth + /> + +
@@ -1547,7 +1686,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => { style={{ pointerEvents: 'none', display: 'inline-block', - transform: shouldScale ? `scaleX(${textScale})` : undefined, + transform: shouldScale ? `scaleX(${textScale})` : 'none', transformOrigin: 'left center', whiteSpace: 'pre', }} diff --git a/frontend/src/proprietary/tools/pdfJsonEditor/PdfJsonEditor.tsx b/frontend/src/proprietary/tools/pdfJsonEditor/PdfJsonEditor.tsx index 8fbddd8dc..8cec6c5a2 100644 --- a/frontend/src/proprietary/tools/pdfJsonEditor/PdfJsonEditor.tsx +++ b/frontend/src/proprietary/tools/pdfJsonEditor/PdfJsonEditor.tsx @@ -78,6 +78,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => { message: string; } | null>(null); const [forceSingleTextElement, setForceSingleTextElement] = useState(false); + const [groupingMode, setGroupingMode] = useState<'auto' | 'paragraph' | 'singleLine'>('auto'); const [hasVectorPreview, setHasVectorPreview] = useState(false); const [pagePreviews, setPagePreviews] = useState>(new Map()); @@ -136,7 +137,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => { const viewLabel = useMemo(() => t('pdfJsonEditor.viewLabel', 'PDF Editor'), [t]); const { selectedFiles } = useFileSelection(); - const resetToDocument = useCallback((document: PdfJsonDocument | null) => { + const resetToDocument = useCallback((document: PdfJsonDocument | null, mode: 'auto' | 'paragraph' | 'singleLine') => { if (!document) { setGroupsByPage([]); setImagesByPage([]); @@ -150,7 +151,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => { return; } const cloned = deepCloneDocument(document); - const groups = groupDocumentText(cloned); + const groups = groupDocumentText(cloned, mode); const images = extractDocumentImages(cloned); const originalImages = images.map((page) => page.map(cloneImageElement)); originalImagesRef.current = originalImages; @@ -513,7 +514,7 @@ const PdfJsonEditor = ({ onComplete, onError }: 
BaseToolProps) => { } setLoadedDocument(parsed); - resetToDocument(parsed); + resetToDocument(parsed, groupingMode); setIsLazyMode(shouldUseLazyMode); setCachedJobId(shouldUseLazyMode ? pendingJobId : null); setFileName(file.name); @@ -532,7 +533,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => { } setLoadedDocument(null); - resetToDocument(null); + resetToDocument(null, groupingMode); clearPdfPreview(); if (isPdf) { @@ -555,7 +556,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => { } } }, - [resetToDocument, t], + [groupingMode, resetToDocument, t], ); const handleSelectPage = useCallback((pageIndex: number) => { @@ -686,9 +687,9 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => { if (!loadedDocument) { return; } - resetToDocument(loadedDocument); + resetToDocument(loadedDocument, groupingMode); setErrorMessage(null); - }, [loadedDocument, resetToDocument]); + }, [groupingMode, loadedDocument, resetToDocument]); const buildPayload = useCallback(() => { if (!loadedDocument) { @@ -975,6 +976,13 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => { [hasVectorPreview], ); + // Re-group text when grouping mode changes + useEffect(() => { + if (loadedDocument) { + resetToDocument(loadedDocument, groupingMode); + } + }, [groupingMode, loadedDocument, resetToDocument]); + const viewData = useMemo(() => ({ document: loadedDocument, groupsByPage, @@ -991,6 +999,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => { conversionProgress, hasChanges, forceSingleTextElement, + groupingMode, requestPagePreview, onLoadJson: handleLoadFile, onSelectPage: handleSelectPage, @@ -1002,6 +1011,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => { onDownloadJson: handleDownloadJson, onGeneratePdf: handleGeneratePdf, onForceSingleTextElementChange: setForceSingleTextElement, + onGroupingModeChange: setGroupingMode, }), [ handleImageTransform, imagesByPage, @@ -1027,6 +1037,7 
@@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => { loadedDocument, selectedPage, forceSingleTextElement, + groupingMode, requestPagePreview, setForceSingleTextElement, ]); diff --git a/frontend/src/proprietary/tools/pdfJsonEditor/pdfJsonEditorTypes.ts b/frontend/src/proprietary/tools/pdfJsonEditor/pdfJsonEditorTypes.ts index b75cdf762..d230850aa 100644 --- a/frontend/src/proprietary/tools/pdfJsonEditor/pdfJsonEditorTypes.ts +++ b/frontend/src/proprietary/tools/pdfJsonEditor/pdfJsonEditorTypes.ts @@ -197,6 +197,7 @@ export interface PdfJsonEditorViewData { conversionProgress: ConversionProgress | null; hasChanges: boolean; forceSingleTextElement: boolean; + groupingMode: 'auto' | 'paragraph' | 'singleLine'; requestPagePreview: (pageIndex: number, scale: number) => void; onLoadJson: (file: File | null) => Promise | void; onSelectPage: (pageIndex: number) => void; @@ -218,4 +219,5 @@ export interface PdfJsonEditorViewData { onDownloadJson: () => void; onGeneratePdf: () => void; onForceSingleTextElementChange: (value: boolean) => void; + onGroupingModeChange: (value: 'auto' | 'paragraph' | 'singleLine') => void; } diff --git a/frontend/src/proprietary/tools/pdfJsonEditor/pdfJsonEditorUtils.ts b/frontend/src/proprietary/tools/pdfJsonEditor/pdfJsonEditorUtils.ts index 813155810..d5d2864fe 100644 --- a/frontend/src/proprietary/tools/pdfJsonEditor/pdfJsonEditorUtils.ts +++ b/frontend/src/proprietary/tools/pdfJsonEditor/pdfJsonEditorUtils.ts @@ -641,6 +641,7 @@ export const groupPageTextElements = ( page: PdfJsonPage | null | undefined, pageIndex: number, metrics?: FontMetricsMap, + groupingMode: 'auto' | 'paragraph' | 'singleLine' = 'auto', ): TextGroup[] => { if (!page?.textElements || page.textElements.length === 0) { return []; @@ -731,15 +732,72 @@ export const groupPageTextElements = ( } }); - return groupLinesIntoParagraphs(lineGroups, metrics); + // Apply paragraph grouping based on mode + if (groupingMode === 'singleLine') { + // Single line 
mode: skip paragraph grouping + return lineGroups; + } + + if (groupingMode === 'paragraph') { + // Paragraph mode: always apply grouping + return groupLinesIntoParagraphs(lineGroups, metrics); + } + + // Auto mode: use heuristic to determine if we should group + // Analyze the page content to decide + let multiLineGroups = 0; + let totalWords = 0; + let longTextGroups = 0; + let totalGroups = 0; + + lineGroups.forEach((group) => { + const text = (group.text || '').trim(); + if (text.length === 0) return; + + totalGroups++; + const lines = text.split('\n'); + const lineCount = lines.length; + const wordCount = text.split(/\s+/).filter((w) => w.length > 0).length; + + totalWords += wordCount; + + if (lineCount > 1) { + multiLineGroups++; + } + + if (wordCount >= 5 || text.length >= 30) { + longTextGroups++; + } + }); + + if (totalGroups === 0) { + return lineGroups; + } + + const avgWordsPerGroup = totalWords / totalGroups; + const longTextRatio = longTextGroups / totalGroups; + + const isParagraphPage = + (multiLineGroups >= 2 && avgWordsPerGroup > 8) || + avgWordsPerGroup > 12 || + longTextRatio > 0.4; + + // Only apply paragraph grouping if it looks like a paragraph-heavy page + if (isParagraphPage) { + return groupLinesIntoParagraphs(lineGroups, metrics); + } + + // For sparse pages, keep lines separate + return lineGroups; }; export const groupDocumentText = ( document: PdfJsonDocument | null | undefined, + groupingMode: 'auto' | 'paragraph' | 'singleLine' = 'auto', ): TextGroup[][] => { const pages = document?.pages ?? []; const metrics = buildFontMetrics(document); - return pages.map((page, index) => groupPageTextElements(page, index, metrics)); + return pages.map((page, index) => groupPageTextElements(page, index, metrics, groupingMode)); }; export const extractPageImages = (