diff --git a/frontend/src/proprietary/components/tools/pdfTextEditor/FontStatusPanel.tsx b/frontend/src/proprietary/components/tools/pdfTextEditor/FontStatusPanel.tsx index 4bb8ce65c..6b8be300c 100644 --- a/frontend/src/proprietary/components/tools/pdfTextEditor/FontStatusPanel.tsx +++ b/frontend/src/proprietary/components/tools/pdfTextEditor/FontStatusPanel.tsx @@ -199,14 +199,30 @@ const FontStatusPanel: React.FC = ({ document, pageIndex } - - {statusIcon} - - {pageLabel} - - - {fonts.length} - + + + {statusIcon} + + {pageLabel} + + + {fonts.length} + + + + {/* Warning badges BEFORE expansion */} + + {summary.systemFallback > 0 && ( + }> + {summary.systemFallback} {t('pdfTextEditor.fontAnalysis.fallback', 'fallback')} + + )} + {summary.missing > 0 && ( + }> + {summary.missing} {t('pdfTextEditor.fontAnalysis.missing', 'missing')} + + )} + diff --git a/frontend/src/proprietary/components/tools/pdfTextEditor/PdfTextEditorView.tsx b/frontend/src/proprietary/components/tools/pdfTextEditor/PdfTextEditorView.tsx index 169841d60..6678ae498 100644 --- a/frontend/src/proprietary/components/tools/pdfTextEditor/PdfTextEditorView.tsx +++ b/frontend/src/proprietary/components/tools/pdfTextEditor/PdfTextEditorView.tsx @@ -247,20 +247,16 @@ const buildFontLookupKeys = ( * Analyzes text groups on a page to determine if it's paragraph-heavy or sparse. * Returns true if the page appears to be document-like with substantial text content. */ -const analyzePageContentType = (groups: TextGroup[]): boolean => { +const analyzePageContentType = (groups: TextGroup[], pageWidth: number): boolean => { if (groups.length === 0) return false; let multiLineGroups = 0; let totalWords = 0; let longTextGroups = 0; let totalGroups = 0; - const groupDetails: Array<{ - id: string; - lines: number; - words: number; - chars: number; - text: string; - }> = []; + let fullWidthLines = 0; + const wordCounts: number[] = []; + const fullWidthThreshold = pageWidth * 0.7; groups.forEach((group) => { const text = (group.text || '').trim(); @@ -272,39 +268,46 @@ const analyzePageContentType = (groups: TextGroup[]): boolean => { const wordCount = text.split(/\s+/).filter((w) => w.length > 0).length; totalWords += wordCount; + wordCounts.push(wordCount); // Count multi-line paragraphs if (lineCount > 1) { multiLineGroups++; } - // Count text groups with substantial content (more than a few words) - if (wordCount >= 5 || text.length >= 30) { + // Count text groups with substantial content (≥10 words or ≥50 chars) + if (wordCount >= 10 || text.length >= 50) { longTextGroups++; } - groupDetails.push({ - id: group.id, - lines: lineCount, - words: wordCount, - chars: text.length, - text: text.substring(0, 50) + (text.length > 50 ? '...' : ''), - }); + // Check if this line extends close to the right margin + const rightEdge = group.bounds.right; + if (rightEdge >= fullWidthThreshold) { + fullWidthLines++; + } }); if (totalGroups === 0) return false; - // Heuristics for paragraph mode: - // 1. Has multiple substantial multi-line groups (2+) AND decent average words - // 2. Average words per group > 12 (strong indicator of document text) - // 3. More than 40% of groups have substantial text (typical of documents) const avgWordsPerGroup = totalWords / totalGroups; const longTextRatio = longTextGroups / totalGroups; + const fullWidthRatio = fullWidthLines / totalGroups; - const isParagraphPage = - (multiLineGroups >= 2 && avgWordsPerGroup > 8) || - avgWordsPerGroup > 12 || - longTextRatio > 0.4; + // Calculate variance in line lengths + const variance = wordCounts.reduce((sum, count) => { + const diff = count - avgWordsPerGroup; + return sum + diff * diff; + }, 0) / totalGroups; + const stdDev = Math.sqrt(variance); + const coefficientOfVariation = avgWordsPerGroup > 0 ? stdDev / avgWordsPerGroup : 0; + + // All 4 criteria must pass for paragraph mode + const criterion1 = multiLineGroups >= 2 && avgWordsPerGroup > 8; + const criterion2 = avgWordsPerGroup > 5; + const criterion3 = longTextRatio > 0.4; + const criterion4 = coefficientOfVariation > 0.5 || fullWidthRatio > 0.6; + + const isParagraphPage = criterion1 && criterion2 && criterion3 && criterion4; return isParagraphPage; }; @@ -543,9 +546,10 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => { const pageGroups = groupsByPage[selectedPage] ?? []; const pageImages = imagesByPage[selectedPage] ?? []; const pagePreview = pagePreviews.get(selectedPage); + const { width: pageWidth, height: pageHeight } = pageDimensions(currentPage); // Detect if current page contains paragraph-heavy content - const isParagraphPage = useMemo(() => analyzePageContentType(pageGroups), [pageGroups]); + const isParagraphPage = useMemo(() => analyzePageContentType(pageGroups, pageWidth), [pageGroups, pageWidth]); const isParagraphLayout = externalGroupingMode === 'paragraph' || (externalGroupingMode === 'auto' && isParagraphPage); @@ -788,7 +792,6 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => { ), [pageImages], ); - const { width: pageWidth, height: pageHeight } = pageDimensions(currentPage); const scale = useMemo(() => Math.min(MAX_RENDER_WIDTH / pageWidth, 2.5), [pageWidth]); const scaledWidth = pageWidth * scale; const scaledHeight = pageHeight * scale; @@ -1036,14 +1039,37 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => { position: 'absolute', top: -8, right: -8, - zIndex: 10, + zIndex: 9999, cursor: 'pointer', + pointerEvents: 'auto', + }} + onMouseDown={(event) => { + console.log(`❌ MOUSEDOWN on X button for group ${groupId}`); + event.stopPropagation(); + event.preventDefault(); + + // Find the current group to check if it's already empty + const currentGroups = groupsByPage[pageIndex] ?? []; + const currentGroup = currentGroups.find(g => g.id === groupId); + const currentText = (currentGroup?.text ?? '').trim(); + + if (currentText.length === 0) { + // Already empty - remove the textbox entirely + console.log(` Text already empty, removing textbox`); + onGroupDelete(pageIndex, groupId); + setActiveGroupId(null); + setEditingGroupId(null); + } else { + // Has text - clear it but keep the textbox + console.log(` Clearing text (textbox remains)`); + onGroupEdit(pageIndex, groupId, ''); + } + console.log(` Operation completed`); }} onClick={(event) => { + console.log(`❌ X button ONCLICK fired for group ${groupId} on page ${pageIndex}`); event.stopPropagation(); - onGroupDelete(pageIndex, groupId); - setActiveGroupId(null); - setEditingGroupId(null); + event.preventDefault(); }} > diff --git a/frontend/src/proprietary/tools/pdfTextEditor/PdfTextEditor.tsx b/frontend/src/proprietary/tools/pdfTextEditor/PdfTextEditor.tsx index f3666f4fd..73f8dc7ab 100644 --- a/frontend/src/proprietary/tools/pdfTextEditor/PdfTextEditor.tsx +++ b/frontend/src/proprietary/tools/pdfTextEditor/PdfTextEditor.tsx @@ -89,6 +89,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => { const [loadingImagePages, setLoadingImagePages] = useState>(new Set()); const originalImagesRef = useRef([]); + const originalGroupsRef = useRef([]); const imagesByPageRef = useRef([]); const autoLoadKeyRef = useRef(null); const loadRequestIdRef = useRef(0); @@ -131,7 +132,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => { }, []); const dirtyPages = useMemo( - () => getDirtyPages(groupsByPage, imagesByPage, originalImagesRef.current), + () => getDirtyPages(groupsByPage, imagesByPage, originalGroupsRef.current, originalImagesRef.current), [groupsByPage, imagesByPage], ); const hasChanges = useMemo(() => dirtyPages.some(Boolean), [dirtyPages]); @@ -157,6 +158,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => { const images = extractDocumentImages(cloned); const originalImages = images.map((page) => page.map(cloneImageElement)); originalImagesRef.current = originalImages; + originalGroupsRef.current = groups.map((page) => page.map((group) => ({ ...group }))); imagesByPageRef.current = images.map((page) => page.map(cloneImageElement)); const initialLoaded = new Set(); originalImages.forEach((pageImages, index) => { @@ -595,13 +597,16 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => { }, []); const handleGroupDelete = useCallback((pageIndex: number, groupId: string) => { - setGroupsByPage((previous) => - previous.map((groups, idx) => - idx !== pageIndex - ? groups - : groups.map((group) => (group.id === groupId ? { ...group, text: '' } : group)) - ) - ); + console.log(`🗑️ Deleting group ${groupId} from page ${pageIndex}`); + setGroupsByPage((previous) => { + const updated = previous.map((groups, idx) => { + if (idx !== pageIndex) return groups; + const filtered = groups.filter((group) => group.id !== groupId); + console.log(` Before: ${groups.length} groups, After: ${filtered.length} groups`); + return filtered; + }); + return updated; + }); }, []); const handleImageTransform = useCallback( diff --git a/frontend/src/proprietary/tools/pdfTextEditor/pdfTextEditorUtils.ts b/frontend/src/proprietary/tools/pdfTextEditor/pdfTextEditorUtils.ts index f7e005125..67bdae96b 100644 --- a/frontend/src/proprietary/tools/pdfTextEditor/pdfTextEditorUtils.ts +++ b/frontend/src/proprietary/tools/pdfTextEditor/pdfTextEditorUtils.ts @@ -647,6 +647,8 @@ export const groupPageTextElements = ( return []; } + const pageWidth = valueOr(page.width, DEFAULT_PAGE_WIDTH); + const elements = page.textElements .map(cloneTextElement) .filter((element) => element.text !== null && element.text !== undefined); @@ -749,6 +751,11 @@ export const groupPageTextElements = ( let totalWords = 0; let longTextGroups = 0; let totalGroups = 0; + const wordCounts: number[] = []; + let fullWidthLines = 0; + + // Define "full width" as extending to at least 70% of page width + const fullWidthThreshold = pageWidth * 0.7; lineGroups.forEach((group) => { const text = (group.text || '').trim(); @@ -760,14 +767,21 @@ export const groupPageTextElements = ( const wordCount = text.split(/\s+/).filter((w) => w.length > 0).length; totalWords += wordCount; + wordCounts.push(wordCount); if (lineCount > 1) { multiLineGroups++; } - if (wordCount >= 5 || text.length >= 30) { + if (wordCount >= 10 || text.length >= 50) { longTextGroups++; } + + // Check if this line extends close to the right margin (paragraph-like) + const rightEdge = group.bounds.right; + if (rightEdge >= fullWidthThreshold) { + fullWidthLines++; + } }); if (totalGroups === 0) { @@ -776,18 +790,69 @@ export const groupPageTextElements = ( const avgWordsPerGroup = totalWords / totalGroups; const longTextRatio = longTextGroups / totalGroups; + const fullWidthRatio = fullWidthLines / totalGroups; - const isParagraphPage = - (multiLineGroups >= 2 && avgWordsPerGroup > 8) || - avgWordsPerGroup > 12 || - longTextRatio > 0.4; + // Calculate variance in line lengths (paragraphs have varying lengths, lists are uniform) + const variance = wordCounts.reduce((sum, count) => { + const diff = count - avgWordsPerGroup; + return sum + diff * diff; + }, 0) / totalGroups; + const stdDev = Math.sqrt(variance); + const coefficientOfVariation = avgWordsPerGroup > 0 ? stdDev / avgWordsPerGroup : 0; + + // Check each criterion + const criterion1 = multiLineGroups >= 2 && avgWordsPerGroup > 8; + const criterion2 = avgWordsPerGroup > 5; + const criterion3 = longTextRatio > 0.4; + const criterion4 = coefficientOfVariation > 0.5 || fullWidthRatio > 0.6; // High variance OR many full-width lines = paragraph text + + const isParagraphPage = criterion1 && criterion2 && criterion3 && criterion4; + + // Log detection stats + console.log(`📄 Page ${pageIndex} Grouping Analysis (mode: ${groupingMode}):`); + console.log(` Stats:`); + console.log(` • Page width: ${pageWidth.toFixed(1)}pt (full-width threshold: ${fullWidthThreshold.toFixed(1)}pt)`); + console.log(` • Multi-line groups: ${multiLineGroups}`); + console.log(` • Total groups: ${totalGroups}`); + console.log(` • Total words: ${totalWords}`); + console.log(` • Long text groups (≥10 words or ≥50 chars): ${longTextGroups}`); + console.log(` • Full-width lines (≥70% page width): ${fullWidthLines}`); + console.log(` • Avg words per group: ${avgWordsPerGroup.toFixed(2)}`); + console.log(` • Long text ratio: ${(longTextRatio * 100).toFixed(1)}%`); + console.log(` • Full-width ratio: ${(fullWidthRatio * 100).toFixed(1)}%`); + console.log(` • Std deviation: ${stdDev.toFixed(2)}`); + console.log(` • Coefficient of variation: ${coefficientOfVariation.toFixed(2)}`); + console.log(` Criteria:`); + console.log(` 1. Multi-line + Avg Words: ${criterion1 ? '✅ PASS' : '❌ FAIL'}`); + console.log(` (${multiLineGroups} >= 2 AND ${avgWordsPerGroup.toFixed(2)} > 8)`); + console.log(` 2. Avg Words Only: ${criterion2 ? '✅ PASS' : '❌ FAIL'}`); + console.log(` (${avgWordsPerGroup.toFixed(2)} > 5)`); + console.log(` 3. Long Text Ratio: ${criterion3 ? '✅ PASS' : '❌ FAIL'}`); + console.log(` (${(longTextRatio * 100).toFixed(1)}% > 40%)`); + console.log(` 4. Line Width Pattern: ${criterion4 ? '✅ PASS' : '❌ FAIL'}`); + console.log(` (CV ${coefficientOfVariation.toFixed(2)} > 0.5 OR ${(fullWidthRatio * 100).toFixed(1)}% > 60%)`); + console.log(` ${coefficientOfVariation > 0.5 ? '✓ High variance (varying line lengths)' : '✗ Low variance'} ${fullWidthRatio > 0.6 ? '✓ Many full-width lines (paragraph-like)' : '✗ Few full-width lines (list-like)'}`); + console.log(` Decision: ${isParagraphPage ? '📝 PARAGRAPH MODE' : '📋 LINE MODE'}`); + if (isParagraphPage) { + console.log(` Reason: All criteria passed (AND logic)`); + } else { + const failedReasons = []; + if (!criterion1) failedReasons.push('insufficient multi-line groups or word density'); + if (!criterion2) failedReasons.push('low average words per group'); + if (!criterion3) failedReasons.push('low ratio of long text groups'); + if (!criterion4) failedReasons.push('low variance and few full-width lines (list-like structure)'); + console.log(` Reason: ${failedReasons.join(', ')}`); + } + console.log(''); // Only apply paragraph grouping if it looks like a paragraph-heavy page if (isParagraphPage) { + console.log(`🔀 Applying paragraph grouping to page ${pageIndex}`); return groupLinesIntoParagraphs(lineGroups, metrics); } // For sparse pages, keep lines separate + console.log(`📋 Keeping lines separate for page ${pageIndex}`); return lineGroups; }; @@ -1192,14 +1257,35 @@ export const areImageListsDifferent = ( export const getDirtyPages = ( groupsByPage: TextGroup[][], imagesByPage: PdfJsonImageElement[][], + originalGroupsByPage: TextGroup[][], originalImagesByPage: PdfJsonImageElement[][], ): boolean[] => { return groupsByPage.map((groups, index) => { + // Check if any text was modified const textDirty = groups.some((group) => group.text !== group.originalText); + + // Check if any groups were deleted by comparing with original groups + const originalGroups = originalGroupsByPage[index] ?? []; + const groupCountChanged = groups.length !== originalGroups.length; + const imageDirty = areImageListsDifferent( imagesByPage[index] ?? [], originalImagesByPage[index] ?? [], ); - return textDirty || imageDirty; + + const isDirty = textDirty || groupCountChanged || imageDirty; + + if (groupCountChanged || textDirty) { + console.log(`📄 Page ${index} dirty check:`, { + textDirty, + groupCountChanged, + originalGroupsLength: originalGroups.length, + currentGroupsLength: groups.length, + imageDirty, + isDirty, + }); + } + + return isDirty; }); };