diff --git a/frontend/src/proprietary/components/tools/pdfTextEditor/PdfTextEditorView.tsx b/frontend/src/proprietary/components/tools/pdfTextEditor/PdfTextEditorView.tsx index 6678ae498..0f11f6210 100644 --- a/frontend/src/proprietary/components/tools/pdfTextEditor/PdfTextEditorView.tsx +++ b/frontend/src/proprietary/components/tools/pdfTextEditor/PdfTextEditorView.tsx @@ -34,6 +34,7 @@ import { PdfJsonFont, PdfJsonPage, ConversionProgress, + TextGroup, } from '@app/tools/pdfTextEditor/pdfTextEditorTypes'; import { getImageBounds, pageDimensions } from '@app/tools/pdfTextEditor/pdfTextEditorUtils'; import FontStatusPanel from '@app/components/tools/pdfTextEditor/FontStatusPanel'; @@ -301,13 +302,12 @@ const analyzePageContentType = (groups: TextGroup[], pageWidth: number): boolean const stdDev = Math.sqrt(variance); const coefficientOfVariation = avgWordsPerGroup > 0 ? stdDev / avgWordsPerGroup : 0; - // All 4 criteria must pass for paragraph mode - const criterion1 = multiLineGroups >= 2 && avgWordsPerGroup > 8; - const criterion2 = avgWordsPerGroup > 5; - const criterion3 = longTextRatio > 0.4; - const criterion4 = coefficientOfVariation > 0.5 || fullWidthRatio > 0.6; + // All 3 criteria must pass for paragraph mode + const criterion1 = avgWordsPerGroup > 5; + const criterion2 = longTextRatio > 0.4; + const criterion3 = coefficientOfVariation > 0.5 || fullWidthRatio > 0.6; - const isParagraphPage = criterion1 && criterion2 && criterion3 && criterion4; + const isParagraphPage = criterion1 && criterion2 && criterion3; return isParagraphPage; }; @@ -319,6 +319,8 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => { const [activeGroupId, setActiveGroupId] = useState(null); const [editingGroupId, setEditingGroupId] = useState(null); const [activeImageId, setActiveImageId] = useState(null); + const [selectedGroupIds, setSelectedGroupIds] = useState>(new Set()); + const [widthOverrides, setWidthOverrides] = useState>(new Map()); const draggingImageRef = useRef(null); const rndRefs = useRef>(new Map()); const pendingDragUpdateRef = useRef(null); @@ -330,6 +332,15 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => { const containerRef = useRef(null); const editorRefs = useRef>(new Map()); const caretOffsetsRef = useRef>(new Map()); + const lastSelectedGroupIdRef = useRef(null); + const widthOverridesRef = useRef>(widthOverrides); + const resizingRef = useRef<{ + groupId: string; + startX: number; + startWidth: number; + baseWidth: number; + maxWidth: number; + } | null>(null); const { document: pdfDocument, @@ -359,6 +370,8 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => { onGeneratePdf, onForceSingleTextElementChange, onGroupingModeChange, + onMergeGroups, + onUngroupGroup, } = data; const handleModeChangeRequest = useCallback((newMode: GroupingMode) => { @@ -382,6 +395,15 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => { setPendingModeChange(null); }, []); + const clearSelection = useCallback(() => { + setSelectedGroupIds(new Set()); + lastSelectedGroupIdRef.current = null; + }, []); + + useEffect(() => { + widthOverridesRef.current = widthOverrides; + }, [widthOverrides]); + const resolveFont = (fontId: string | null | undefined, pageIndex: number | null | undefined): PdfJsonFont | null => { if (!fontId || !pdfDocument?.fonts) { return null; @@ -548,11 +570,78 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => { const pagePreview = pagePreviews.get(selectedPage); const { width: pageWidth, height: pageHeight } = pageDimensions(currentPage); + useEffect(() => { + clearSelection(); + }, [clearSelection, selectedPage]); + + useEffect(() => { + clearSelection(); + }, [clearSelection, externalGroupingMode]); + + useEffect(() => { + setWidthOverrides(new Map()); + }, [pdfDocument]); + + useEffect(() => { + setSelectedGroupIds((prev) => { + const filtered = Array.from(prev).filter((id) => pageGroups.some((group) => group.id === id)); + if (filtered.length === prev.size) { + return prev; + } + return new Set(filtered); + }); + setWidthOverrides((prev) => { + const filtered = new Map(); + pageGroups.forEach((group) => { + if (prev.has(group.id)) { + filtered.set(group.id, prev.get(group.id) ?? 0); + } + }); + if (filtered.size === prev.size) { + return prev; + } + return filtered; + }); + }, [pageGroups]); + // Detect if current page contains paragraph-heavy content - const isParagraphPage = useMemo(() => analyzePageContentType(pageGroups, pageWidth), [pageGroups, pageWidth]); + const isParagraphPage = useMemo(() => { + const result = analyzePageContentType(pageGroups, pageWidth); + console.log(`🏷️ Page ${selectedPage} badge: ${result ? 'PARAGRAPH' : 'SPARSE'} (${pageGroups.length} groups)`); + return result; + }, [pageGroups, pageWidth, selectedPage]); const isParagraphLayout = externalGroupingMode === 'paragraph' || (externalGroupingMode === 'auto' && isParagraphPage); + const resolveGroupWidth = useCallback( + (group: TextGroup): { width: number; base: number; max: number } => { + const baseWidth = Math.max(group.bounds.right - group.bounds.left, 1); + const maxWidth = Math.max(pageWidth - group.bounds.left, baseWidth); + const override = widthOverrides.get(group.id); + const resolved = override ? Math.min(Math.max(override, baseWidth), maxWidth) : baseWidth; + return { width: resolved, base: baseWidth, max: maxWidth }; + }, + [pageWidth, widthOverrides], + ); + + const selectedGroupIdsArray = useMemo(() => Array.from(selectedGroupIds), [selectedGroupIds]); + const selectionIndices = useMemo(() => { + return selectedGroupIdsArray + .map((id) => pageGroups.findIndex((group) => group.id === id)) + .filter((index) => index >= 0) + .sort((a, b) => a - b); + }, [pageGroups, selectedGroupIdsArray]); + const canMergeSelection = selectionIndices.length >= 2 && selectionIndices.every((value, idx, array) => idx === 0 || value === array[idx - 1] + 1); + const paragraphSelectionIds = useMemo(() => + selectedGroupIdsArray.filter((id) => { + const target = pageGroups.find((group) => group.id === id); + return target ? (target.childLineGroups?.length ?? 0) > 1 : false; + }), + [pageGroups, selectedGroupIdsArray]); + const canUngroupSelection = paragraphSelectionIds.length > 0; + const hasWidthOverrides = selectedGroupIdsArray.some((id) => widthOverrides.has(id)); + const hasSelection = selectedGroupIdsArray.length > 0; + const syncEditorValue = useCallback( ( element: HTMLElement, @@ -581,6 +670,69 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => { [editingGroupId, onGroupEdit], ); + const handleMergeSelection = useCallback(() => { + if (!canMergeSelection) { + return; + } + const orderedIds = selectionIndices + .map((index) => pageGroups[index]?.id) + .filter((value): value is string => Boolean(value)); + if (orderedIds.length < 2) { + return; + } + const merged = onMergeGroups(selectedPage, orderedIds); + if (merged) { + clearSelection(); + } + }, [canMergeSelection, selectionIndices, pageGroups, onMergeGroups, selectedPage, clearSelection]); + + const handleUngroupSelection = useCallback(() => { + if (!canUngroupSelection) { + return; + } + let changed = false; + paragraphSelectionIds.forEach((id) => { + const result = onUngroupGroup(selectedPage, id); + if (result) { + changed = true; + } + }); + if (changed) { + clearSelection(); + } + }, [canUngroupSelection, paragraphSelectionIds, onUngroupGroup, selectedPage, clearSelection]); + + const handleWidthAdjustment = useCallback( + (mode: 'expand' | 'reset') => { + if (mode === 'expand' && !hasSelection) { + return; + } + if (mode === 'reset' && !hasWidthOverrides) { + return; + } + const selectedGroups = selectedGroupIdsArray + .map((id) => pageGroups.find((group) => group.id === id)) + .filter((group): group is TextGroup => Boolean(group)); + if (selectedGroups.length === 0) { + return; + } + setWidthOverrides((prev) => { + const next = new Map(prev); + selectedGroups.forEach((group) => { + const baseWidth = Math.max(group.bounds.right - group.bounds.left, 1); + const maxWidth = Math.max(pageWidth - group.bounds.left, baseWidth); + if (mode === 'expand') { + next.set(group.id, maxWidth); + } else { + next.delete(group.id); + } + }); + return next; + }); + }, + [hasSelection, hasWidthOverrides, selectedGroupIdsArray, pageGroups, pageWidth], + ); + const extractPreferredFontId = useCallback((target?: TextGroup | null) => { if (!target) { return undefined; @@ -874,7 +1026,8 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => { textSpan.style.transform = 'none'; const bounds = toCssBounds(currentPage, pageHeight, scale, group.bounds); - const containerWidth = bounds.width; + const { width: resolvedWidth } = resolveGroupWidth(group); + const containerWidth = resolvedWidth * scale; const textWidth = textSpan.getBoundingClientRect().width; // Restore original transform @@ -907,6 +1060,7 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => { fontFamilies.size, selectedPage, isParagraphLayout, + resolveGroupWidth, ]); useLayoutEffect(() => { @@ -977,6 +1131,7 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => { const handlePageChange = (pageNumber: number) => { setActiveGroupId(null); setEditingGroupId(null); + clearSelection(); onSelectPage(pageNumber - 1); }; @@ -984,8 +1139,97 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => { setEditingGroupId(null); setActiveGroupId(null); setActiveImageId(null); + clearSelection(); }; + const handleSelectionInteraction = useCallback( + (groupId: string, groupIndex: number, event: React.MouseEvent): boolean => { + const multiSelect = event.metaKey || event.ctrlKey; + const rangeSelect = event.shiftKey && lastSelectedGroupIdRef.current !== null; + setSelectedGroupIds((previous) => { + if (multiSelect) { + const next = new Set(previous); + if (next.has(groupId)) { + next.delete(groupId); + } else { + next.add(groupId); + } + return next; + } + if (rangeSelect) { + const anchorId = lastSelectedGroupIdRef.current; + const anchorIndex = anchorId ? pageGroups.findIndex((group) => group.id === anchorId) : -1; + if (anchorIndex === -1) { + return new Set([groupId]); + } + const start = Math.min(anchorIndex, groupIndex); + const end = Math.max(anchorIndex, groupIndex); + const next = new Set(); + for (let idx = start; idx <= end; idx += 1) { + const candidate = pageGroups[idx]; + if (candidate) { + next.add(candidate.id); + } + } + return next; + } + return new Set([groupId]); + }); + if (!rangeSelect) { + lastSelectedGroupIdRef.current = groupId; + } + return !(multiSelect || rangeSelect); + }, + [pageGroups], + ); + + const handleResizeStart = useCallback( + (event: React.MouseEvent, group: TextGroup, currentWidth: number) => { + const baseWidth = Math.max(group.bounds.right - group.bounds.left, 1); + const maxWidth = Math.max(pageWidth - group.bounds.left, baseWidth); + event.stopPropagation(); + event.preventDefault(); + const startX = event.clientX; + const handleMouseMove = (moveEvent: MouseEvent) => { + const context = resizingRef.current; + if (!context) { + return; + } + moveEvent.preventDefault(); + const deltaPx = moveEvent.clientX - context.startX; + const deltaWidth = deltaPx / scale; + const nextWidth = Math.min( + Math.max(context.startWidth + deltaWidth, context.baseWidth), + context.maxWidth, + ); + setWidthOverrides((prev) => { + const next = new Map(prev); + if (Math.abs(nextWidth - context.baseWidth) <= 0.5) { + next.delete(context.groupId); + } else { + next.set(context.groupId, nextWidth); + } + return next; + }); + }; + const handleMouseUp = () => { + resizingRef.current = null; + window.removeEventListener('mousemove', handleMouseMove); + window.removeEventListener('mouseup', handleMouseUp); + }; + resizingRef.current = { + groupId: group.id, + startX, + startWidth: currentWidth, + baseWidth, + maxWidth, + }; + window.addEventListener('mousemove', handleMouseMove); + window.addEventListener('mouseup', handleMouseUp); + }, + [pageWidth, scale], + ); + const renderGroupContainer = ( groupId: string, pageIndex: number, @@ -994,6 +1238,8 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => { content: React.ReactNode, onActivate?: (event: React.MouseEvent) => void, onClick?: (event: React.MouseEvent) => void, + isSelected = false, + resizeHandle?: React.ReactNode, ) => ( { marginTop: '-3px', outline: isActive ? '2px solid var(--mantine-color-blue-5)' - : isChanged - ? '1px solid var(--mantine-color-yellow-5)' - : 'none', + : isSelected + ? '1px solid var(--mantine-color-violet-5)' + : isChanged + ? '1px solid var(--mantine-color-yellow-5)' + : 'none', outlineOffset: '-1px', borderRadius: 6, - backgroundColor: isChanged || isActive ? 'rgba(250,255,189,0.28)' : 'transparent', + backgroundColor: isActive + ? 'rgba(184,212,255,0.35)' + : isSelected + ? 'rgba(206,190,255,0.32)' + : isChanged + ? 'rgba(250,255,189,0.28)' + : 'transparent', transition: 'outline 120ms ease, background-color 120ms ease', pointerEvents: 'auto', overflow: 'visible', @@ -1029,6 +1283,7 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => { }} > {content} + {resizeHandle} {activeGroupId === groupId && ( { {t('pdfTextEditor.options.groupingMode.title', 'Text Grouping Mode')} {externalGroupingMode === 'auto' && isParagraphPage && ( - + {t('pdfTextEditor.pageType.paragraph', 'Paragraph page')} )} {externalGroupingMode === 'auto' && !isParagraphPage && hasDocument && ( - + {t('pdfTextEditor.pageType.sparse', 'Sparse text')} )} @@ -1239,6 +1494,59 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => { /> + + + + {t('pdfTextEditor.options.manualGrouping.title', 'Manual Text Grouping')} + + + {t('pdfTextEditor.badges.beta', 'Beta')} + + + + {t( + 'pdfTextEditor.options.manualGrouping.description', + 'Hold Ctrl (Cmd) or Shift while clicking to multi-select text boxes, then merge or ungroup them manually.', + )} + + + + + + + + + + +
@@ -1615,7 +1923,8 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => { let containerLeft = bounds.left; let containerTop = bounds.top; - let containerWidth = Math.max(bounds.width, fontSizePx); + const { width: resolvedWidth, base: baseWidth, max: maxWidth } = resolveGroupWidth(group); + let containerWidth = Math.max(resolvedWidth * scale, fontSizePx); let containerHeight = Math.max(bounds.height, paragraphHeightPx); let transform: string | undefined; let transformOrigin: React.CSSProperties['transformOrigin']; @@ -1654,14 +1963,15 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => { // Determine text wrapping behavior based on whether text has been changed const hasChanges = changed; - const shouldWrap = hasChanges && isParagraphLayout; - const whiteSpace = shouldWrap ? 'pre-wrap' : 'pre'; - const wordBreak = shouldWrap ? 'break-word' : 'normal'; - const overflowWrap = shouldWrap ? 'break-word' : 'normal'; + const widthExtended = resolvedWidth - baseWidth > 0.5; + const enableWrap = isParagraphLayout || widthExtended || isEditing || hasChanges; + const whiteSpace = enableWrap ? 'pre-wrap' : 'pre'; + const wordBreak = enableWrap ? 'break-word' : 'normal'; + const overflowWrap = enableWrap ? 'break-word' : 'normal'; // For paragraph mode, allow height to grow to accommodate lines without wrapping // For single-line mode, maintain fixed height based on PDF bounds - const useFlexibleHeight = isEditing || shouldWrap || (isParagraphLayout && lineCount > 1); + const useFlexibleHeight = isEditing || enableWrap || (isParagraphLayout && lineCount > 1); // The renderGroupContainer wrapper adds 4px horizontal padding (2px left + 2px right) // We need to add this to the container width to compensate, so the inner content @@ -1685,6 +1995,35 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => { transformOrigin, }; + const showResizeHandle = !hasRotation && (selectedGroupIds.has(group.id) || activeGroupId === group.id); + const resizeHandle = showResizeHandle ? ( + handleResizeStart(event, group, resolvedWidth)} + style={{ + position: 'absolute', + top: '50%', + right: -6, + width: 12, + height: 32, + marginTop: -16, + cursor: 'ew-resize', + borderRadius: 6, + backgroundColor: 'rgba(76, 110, 245, 0.35)', + border: '1px solid rgba(76, 110, 245, 0.8)', + display: 'flex', + alignItems: 'center', + justifyContent: 'center', + color: 'white', + fontSize: 9, + userSelect: 'none', + }} + > + || + + ) : null; + if (isEditing) { return ( @@ -1741,7 +2080,7 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => { minHeight: '100%', height: 'auto', padding: 0, - backgroundColor: 'rgba(255,255,255,0.95)', + backgroundColor: 'rgba(255,255,255,0.95)', color: textColor, fontSize: `${fontSizePx}px`, fontFamily, @@ -1750,15 +2089,19 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => { outline: 'none', border: 'none', display: 'block', - whiteSpace: isParagraphLayout ? 'pre-wrap' : 'pre', - wordBreak: isParagraphLayout ? 'break-word' : 'normal', - overflowWrap: isParagraphLayout ? 'break-word' : 'normal', + whiteSpace, + wordBreak, + overflowWrap, cursor: 'text', overflow: 'visible', }} > {group.text || '\u00A0'}
, + undefined, + undefined, + selectedGroupIds.has(group.id), + resizeHandle, )}
); @@ -1790,14 +2133,14 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => { color: textColor, display: 'block', cursor: 'text', - overflow: shouldWrap ? 'visible' : 'hidden', + overflow: enableWrap ? 'visible' : 'hidden', }} > { , undefined, (event: React.MouseEvent) => { + const shouldActivate = handleSelectionInteraction(group.id, pageGroupIndex, event); + if (!shouldActivate) { + setActiveGroupId(null); + setEditingGroupId(null); + return; + } + const clickX = event.clientX; const clickY = event.clientY; @@ -1815,6 +2165,22 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => { setEditingGroupId(group.id); caretOffsetsRef.current.delete(group.id); + // Log group stats when selected + const lines = (group.text ?? '').split('\n'); + const words = (group.text ?? '').split(/\s+/).filter(w => w.length > 0).length; + const chars = (group.text ?? '').length; + const width = group.bounds.right - group.bounds.left; + const height = group.bounds.bottom - group.bounds.top; + const isMultiLine = lines.length > 1; + console.log(`πŸ“ Selected Text Group "${group.id}":`); + console.log(` Lines: ${lines.length}, Words: ${words}, Chars: ${chars}`); + console.log(` Dimensions: ${width.toFixed(1)}pt Γ— ${height.toFixed(1)}pt`); + console.log(` Type: ${isMultiLine ? 'MULTI-LINE (paragraph)' : 'SINGLE-LINE'}`); + console.log(` Text preview: "${(group.text ?? '').substring(0, 80)}${(group.text ?? '').length > 80 ? '...' : ''}"`); + if (isMultiLine) { + console.log(` Line spacing: ${group.lineSpacing?.toFixed(1) ?? 'unknown'}pt`); + } + requestAnimationFrame(() => { const editor = document.querySelector(`[data-editor-group="${group.id}"]`); if (!editor) return; @@ -1846,6 +2212,8 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => { }, 10); }); }, + selectedGroupIds.has(group.id), + resizeHandle, )} ); diff --git a/frontend/src/proprietary/tools/pdfTextEditor/PdfTextEditor.tsx b/frontend/src/proprietary/tools/pdfTextEditor/PdfTextEditor.tsx index 73f8dc7ab..4fded970e 100644 --- a/frontend/src/proprietary/tools/pdfTextEditor/PdfTextEditor.tsx +++ b/frontend/src/proprietary/tools/pdfTextEditor/PdfTextEditor.tsx @@ -18,6 +18,7 @@ import { PdfJsonPage, TextGroup, PdfTextEditorViewData, + BoundingBox, } from './pdfTextEditorTypes'; import { deepCloneDocument, @@ -26,6 +27,7 @@ import { restoreGlyphElements, extractDocumentImages, cloneImageElement, + cloneTextElement, valueOr, } from './pdfTextEditorUtils'; import PdfTextEditorView from '@app/components/tools/pdfTextEditor/PdfTextEditorView'; @@ -52,6 +54,148 @@ const getAutoLoadKey = (file: File): string => { return `${file.name}|${file.size}|${file.lastModified}`; }; +const normalizeLineArray = (value: string | undefined | null, expected: number): string[] => { + const normalized = (value ?? '').replace(/\r/g, ''); + if (expected <= 0) { + return [normalized]; + } + const parts = normalized.split('\n'); + if (parts.length === expected) { + return parts; + } + if (parts.length < expected) { + return parts.concat(Array(expected - parts.length).fill('')); + } + const head = parts.slice(0, Math.max(expected - 1, 0)); + const tail = parts.slice(Math.max(expected - 1, 0)).join('\n'); + return [...head, tail]; +}; + +const cloneLineTemplate = (line: TextGroup, text?: string, originalText?: string): TextGroup => ({ + ...line, + text: text ?? line.text, + originalText: originalText ?? line.originalText, + childLineGroups: null, + lineElementCounts: null, + lineSpacing: null, + elements: line.elements.map(cloneTextElement), + originalElements: line.originalElements.map(cloneTextElement), +}); + +const expandGroupToLines = (group: TextGroup): TextGroup[] => { + if (group.childLineGroups && group.childLineGroups.length > 0) { + const textLines = normalizeLineArray(group.text, group.childLineGroups.length); + const originalLines = normalizeLineArray(group.originalText, group.childLineGroups.length); + return group.childLineGroups.map((child, index) => + cloneLineTemplate(child, textLines[index], originalLines[index]), + ); + } + return [cloneLineTemplate(group)]; +}; + +const mergeBoundingBoxes = (boxes: BoundingBox[]): BoundingBox => { + if (boxes.length === 0) { + return { left: 0, right: 0, top: 0, bottom: 0 }; + } + return boxes.reduce( + (acc, box) => ({ + left: Math.min(acc.left, box.left), + right: Math.max(acc.right, box.right), + top: Math.min(acc.top, box.top), + bottom: Math.max(acc.bottom, box.bottom), + }), + { ...boxes[0] }, + ); +}; + +const buildMergedGroupFromSelection = (groups: TextGroup[]): TextGroup | null => { + if (groups.length === 0) { + return null; + } + + const lineTemplates = groups.flatMap(expandGroupToLines); + if (lineTemplates.length <= 1) { + return null; + } + + const lineTexts = lineTemplates.map((line) => line.text ?? ''); + const lineOriginalTexts = lineTemplates.map((line) => line.originalText ?? ''); + const combinedOriginals = lineTemplates.flatMap((line) => line.originalElements.map(cloneTextElement)); + const combinedElements = combinedOriginals.map(cloneTextElement); + const mergedBounds = mergeBoundingBoxes(lineTemplates.map((line) => line.bounds)); + + const spacingValues: number[] = []; + for (let index = 1; index < lineTemplates.length; index += 1) { + const prevBaseline = lineTemplates[index - 1].baseline ?? lineTemplates[index - 1].bounds.bottom; + const currentBaseline = lineTemplates[index].baseline ?? lineTemplates[index].bounds.bottom; + const spacing = Math.abs(prevBaseline - currentBaseline); + if (spacing > 0) { + spacingValues.push(spacing); + } + } + const averageSpacing = + spacingValues.length > 0 + ? spacingValues.reduce((sum, value) => sum + value, 0) / spacingValues.length + : null; + + const first = groups[0]; + const lineElementCounts = lineTemplates.map((line) => Math.max(line.originalElements.length, 1)); + const paragraph: TextGroup = { + ...first, + text: lineTexts.join('\n'), + originalText: lineOriginalTexts.join('\n'), + elements: combinedElements, + originalElements: combinedOriginals, + bounds: mergedBounds, + lineSpacing: averageSpacing, + lineElementCounts: lineElementCounts.length > 1 ? lineElementCounts : null, + childLineGroups: lineTemplates.map((line, index) => + cloneLineTemplate(line, lineTexts[index], lineOriginalTexts[index]), + ), + }; + + return paragraph; +}; + +const splitParagraphGroup = (group: TextGroup): TextGroup[] => { + if (!group.childLineGroups || group.childLineGroups.length <= 1) { + return []; + } + + const templateLines = group.childLineGroups.map((child) => cloneLineTemplate(child)); + const lineCount = templateLines.length; + const textLines = normalizeLineArray(group.text, lineCount); + const originalLines = normalizeLineArray(group.originalText, lineCount); + const baseCounts = + group.lineElementCounts && group.lineElementCounts.length === lineCount + ? [...group.lineElementCounts] + : templateLines.map((line) => Math.max(line.originalElements.length, 1)); + + const totalOriginals = group.originalElements.length; + const counted = baseCounts.reduce((sum, count) => sum + count, 0); + if (counted < totalOriginals && baseCounts.length > 0) { + baseCounts[baseCounts.length - 1] += totalOriginals - counted; + } + + let offset = 0; + return templateLines.map((template, index) => { + const take = Math.max(1, baseCounts[index] ?? 1); + const slice = group.originalElements.slice(offset, offset + take).map(cloneTextElement); + offset += take; + return { + ...template, + id: `${group.id}-line-${index + 1}-${Date.now()}-${index}`, + text: textLines[index] ?? '', + originalText: originalLines[index] ?? '', + elements: slice.map(cloneTextElement), + originalElements: slice, + lineElementCounts: null, + lineSpacing: null, + childLineGroups: null, + }; + }); +}; + const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => { const { t } = useTranslation(); const { @@ -609,6 +753,73 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => { }); }, []); + const handleMergeGroups = useCallback((pageIndex: number, groupIds: string[]): boolean => { + if (groupIds.length < 2) { + return false; + } + let updated = false; + setGroupsByPage((previous) => + previous.map((groups, idx) => { + if (idx !== pageIndex) { + return groups; + } + const indices = groupIds + .map((id) => groups.findIndex((group) => group.id === id)) + .filter((index) => index >= 0); + if (indices.length !== groupIds.length) { + return groups; + } + const sorted = [...indices].sort((a, b) => a - b); + for (let i = 1; i < sorted.length; i += 1) { + if (sorted[i] !== sorted[i - 1] + 1) { + return groups; + } + } + const selection = sorted.map((position) => groups[position]); + const merged = buildMergedGroupFromSelection(selection); + if (!merged) { + return groups; + } + const next = [ + ...groups.slice(0, sorted[0]), + merged, + ...groups.slice(sorted[sorted.length - 1] + 1), + ]; + updated = true; + return next; + }), + ); + return updated; + }, []); + + const handleUngroupGroup = useCallback((pageIndex: number, groupId: string): boolean => { + let updated = false; + setGroupsByPage((previous) => + previous.map((groups, idx) => { + if (idx !== pageIndex) { + return groups; + } + const targetIndex = groups.findIndex((group) => group.id === groupId); + if (targetIndex < 0) { + return groups; + } + const targetGroup = groups[targetIndex]; + const splits = splitParagraphGroup(targetGroup); + if (splits.length <= 1) { + return groups; + } + const next = [ + ...groups.slice(0, targetIndex), + ...splits, + ...groups.slice(targetIndex + 1), + ]; + updated = true; + return next; + }), + ); + return updated; + }, []); + const handleImageTransform = useCallback( ( pageIndex: number, @@ -1064,7 +1275,11 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => { onGeneratePdf: handleGeneratePdf, onForceSingleTextElementChange: setForceSingleTextElement, onGroupingModeChange: setGroupingMode, + onMergeGroups: handleMergeGroups, + onUngroupGroup: handleUngroupGroup, }), [ + handleMergeGroups, + handleUngroupGroup, handleImageTransform, imagesByPage, pagePreviews, diff --git a/frontend/src/proprietary/tools/pdfTextEditor/pdfTextEditorTypes.ts b/frontend/src/proprietary/tools/pdfTextEditor/pdfTextEditorTypes.ts index a90ae7774..46639f11a 100644 --- a/frontend/src/proprietary/tools/pdfTextEditor/pdfTextEditorTypes.ts +++ b/frontend/src/proprietary/tools/pdfTextEditor/pdfTextEditorTypes.ts @@ -168,6 +168,7 @@ export interface TextGroup { text: string; originalText: string; bounds: BoundingBox; + childLineGroups?: TextGroup[] | null; } export const DEFAULT_PAGE_WIDTH = 612; @@ -219,4 +220,6 @@ export interface PdfTextEditorViewData { onGeneratePdf: () => void; onForceSingleTextElementChange: (value: boolean) => void; onGroupingModeChange: (value: 'auto' | 'paragraph' | 'singleLine') => void; + onMergeGroups: (pageIndex: number, groupIds: string[]) => boolean; + onUngroupGroup: (pageIndex: number, groupId: string) => boolean; } diff --git a/frontend/src/proprietary/tools/pdfTextEditor/pdfTextEditorUtils.ts b/frontend/src/proprietary/tools/pdfTextEditor/pdfTextEditorUtils.ts index 67bdae96b..3ce10b2cb 100644 --- a/frontend/src/proprietary/tools/pdfTextEditor/pdfTextEditorUtils.ts +++ b/frontend/src/proprietary/tools/pdfTextEditor/pdfTextEditorUtils.ts @@ -520,8 +520,18 @@ const createGroup = ( }; }; +const cloneLineTemplate = (line: TextGroup): TextGroup => ({ + ...line, + childLineGroups: null, + lineElementCounts: null, + lineSpacing: null, + elements: line.elements.map(cloneTextElement), + originalElements: line.originalElements.map(cloneTextElement), +}); + const groupLinesIntoParagraphs = ( lineGroups: TextGroup[], + pageWidth: number, metrics?: FontMetricsMap, ): TextGroup[] => { if (lineGroups.length === 0) { @@ -530,6 +540,8 @@ const groupLinesIntoParagraphs = ( const paragraphs: TextGroup[][] = []; let currentParagraph: TextGroup[] = [lineGroups[0]]; + const bulletFlags = new Map(); + bulletFlags.set(lineGroups[0].id, false); for (let i = 1; i < lineGroups.length; i++) { const prevLine = lineGroups[i - 1]; @@ -561,11 +573,85 @@ const groupLinesIntoParagraphs = ( const maxReasonableSpacing = avgFontSize * 3.0; // Max ~3x font size for normal line spacing const hasReasonableSpacing = lineSpacing <= maxReasonableSpacing; + // Check if current line looks like a bullet/list item + const prevRight = prevLine.bounds.right; + const currentRight = currentLine.bounds.right; + const prevWidth = prevRight - prevLeft; + const currentWidth = currentRight - currentLeft; + + // Count word count to help identify bullets (typically short) + const prevWords = (prevLine.text ?? '').split(/\s+/).filter(w => w.length > 0).length; + const currentWords = (currentLine.text ?? '').split(/\s+/).filter(w => w.length > 0).length; + const prevText = (prevLine.text ?? '').trim(); + const currentText = (currentLine.text ?? '').trim(); + + // Bullet detection - look for bullet markers or very short lines + const bulletMarkerRegex = /^[\u2022\u2023\u25E6\u2043\u2219‒·◦‣⁃\-\*]\s|^\d+[\.\)]\s|^[a-z][\.\)]\s/i; + const prevHasBulletMarker = bulletMarkerRegex.test(prevText); + const currentHasBulletMarker = bulletMarkerRegex.test(currentText); + + // True bullets are: + // 1. Have bullet markers/numbers OR + // 2. Very short (< 10 words) AND much narrower than average (< 60% of page width) + const headingKeywords = ['action items', 'next steps', 'notes', 'logistics', 'tasks']; + const normalizedPageWidth = pageWidth > 0 ? pageWidth : avgFontSize * 70; + const maxReferenceWidth = normalizedPageWidth > 0 ? normalizedPageWidth : avgFontSize * 70; + const indentDelta = currentLeft - prevLeft; + const indentThreshold = Math.max(avgFontSize * 0.6, 8); + const hasIndent = indentDelta > indentThreshold; + const currentWidthRatio = maxReferenceWidth > 0 ? currentWidth / maxReferenceWidth : 0; + const prevWidthRatio = maxReferenceWidth > 0 ? prevWidth / maxReferenceWidth : 0; + const prevLooksLikeHeading = + prevText.endsWith(':') || + (prevWords <= 4 && prevWidthRatio < 0.4) || + headingKeywords.some((keyword) => prevText.toLowerCase().includes(keyword)); + + const wrapCandidate = + !currentHasBulletMarker && + !hasIndent && + !prevLooksLikeHeading && + currentWords <= 12 && + currentWidthRatio < 0.45 && + Math.abs(prevLeft - currentLeft) <= leftAlignmentTolerance && + currentWidth < prevWidth * 0.85; + + const currentIsBullet = wrapCandidate + ? false + : currentHasBulletMarker || + (hasIndent && (currentWords <= 14 || currentWidthRatio <= 0.65)) || + (prevLooksLikeHeading && (currentWords <= 16 || currentWidthRatio <= 0.8 || prevWidthRatio < 0.35)) || + (currentWords <= 8 && currentWidthRatio <= 0.45 && prevWidth - currentWidth > avgFontSize * 4); + + const prevIsBullet = bulletFlags.get(prevLine.id) ?? prevHasBulletMarker; + bulletFlags.set(currentLine.id, currentIsBullet); + + // Detect paragraphβ†’bullet transition + const likelyBulletStart = !prevIsBullet && currentIsBullet; + + // Don't merge two consecutive bullets + const bothAreBullets = prevIsBullet && currentIsBullet; + // Merge into paragraph if: // 1. Left aligned // 2. Same font - // 3. Reasonable line spacing (not a large gap indicating paragraph break) - const shouldMerge = isLeftAligned && sameFont && hasReasonableSpacing; + // 3. Reasonable line spacing + // 4. NOT transitioning to bullets + // 5. NOT both are bullets + const shouldMerge = + isLeftAligned && + sameFont && + hasReasonableSpacing && + !likelyBulletStart && + !bothAreBullets && + !currentIsBullet; + + if (i < 10 || likelyBulletStart || bothAreBullets || !shouldMerge) { + console.log(` Line ${i}:`); + console.log(` prev: "${prevText.substring(0, 40)}" (${prevWords}w, ${prevWidth.toFixed(0)}pt, marker:${prevHasBulletMarker}, bullet:${prevIsBullet})`); + console.log(` curr: "${currentText.substring(0, 40)}" (${currentWords}w, ${currentWidth.toFixed(0)}pt, marker:${currentHasBulletMarker}, bullet:${currentIsBullet})`); + console.log(` checks: leftAlign:${isLeftAligned} (${Math.abs(prevLeft - currentLeft).toFixed(1)}pt), sameFont:${sameFont}, spacing:${hasReasonableSpacing} (${lineSpacing.toFixed(1)}pt/${maxReasonableSpacing.toFixed(1)}pt)`); + console.log(` decision: merge=${shouldMerge} (bulletStart:${likelyBulletStart}, bothBullets:${bothAreBullets})`); + } if (shouldMerge) { currentParagraph.push(currentLine); @@ -587,17 +673,24 @@ const groupLinesIntoParagraphs = ( } // Combine all elements from all lines - const allElements = lines.flatMap(line => line.originalElements); + const lineTemplates = lines.map(line => cloneLineTemplate(line)); + const flattenedLineTemplates = lineTemplates.flatMap((line) => + line.childLineGroups && line.childLineGroups.length > 0 + ? line.childLineGroups + : [line], + ); + const allLines = flattenedLineTemplates.length > 0 ? flattenedLineTemplates : lineTemplates; + const allElements = allLines.flatMap(line => line.originalElements); const pageIndex = lines[0].pageIndex; - const lineElementCounts = lines.map((line) => line.originalElements.length); + const lineElementCounts = allLines.map((line) => line.originalElements.length); // Create merged group with newlines between lines - const paragraphText = lines.map(line => line.text).join('\n'); - const mergedBounds = mergeBounds(lines.map(line => line.bounds)); + const paragraphText = allLines.map(line => line.text).join('\n'); + const mergedBounds = mergeBounds(allLines.map(line => line.bounds)); const spacingValues: number[] = []; - for (let i = 1; i < lines.length; i++) { - const prevBaseline = lines[i - 1].baseline ?? lines[i - 1].bounds.bottom; - const currentBaseline = lines[i].baseline ?? lines[i].bounds.bottom; + for (let i = 1; i < allLines.length; i++) { + const prevBaseline = allLines[i - 1].baseline ?? allLines[i - 1].bounds.bottom; + const currentBaseline = allLines[i].baseline ?? allLines[i].bounds.bottom; const spacing = Math.abs(prevBaseline - currentBaseline); if (spacing > 0) { spacingValues.push(spacing); @@ -633,6 +726,7 @@ const groupLinesIntoParagraphs = ( text: paragraphText, originalText: paragraphText, bounds: mergedBounds, + childLineGroups: allLines, }; }); }; @@ -742,7 +836,7 @@ export const groupPageTextElements = ( if (groupingMode === 'paragraph') { // Paragraph mode: always apply grouping - return groupLinesIntoParagraphs(lineGroups, metrics); + return groupLinesIntoParagraphs(lineGroups, pageWidth, metrics); } // Auto mode: use heuristic to determine if we should group @@ -801,12 +895,11 @@ export const groupPageTextElements = ( const coefficientOfVariation = avgWordsPerGroup > 0 ? stdDev / avgWordsPerGroup : 0; // Check each criterion - const criterion1 = multiLineGroups >= 2 && avgWordsPerGroup > 8; - const criterion2 = avgWordsPerGroup > 5; - const criterion3 = longTextRatio > 0.4; - const criterion4 = coefficientOfVariation > 0.5 || fullWidthRatio > 0.6; // High variance OR many full-width lines = paragraph text + const criterion1 = avgWordsPerGroup > 5; + const criterion2 = longTextRatio > 0.4; + const criterion3 = coefficientOfVariation > 0.5 || fullWidthRatio > 0.6; // High variance OR many full-width lines = paragraph text - const isParagraphPage = criterion1 && criterion2 && criterion3 && criterion4; + const isParagraphPage = criterion1 && criterion2 && criterion3; // Log detection stats console.log(`πŸ“„ Page ${pageIndex} Grouping Analysis (mode: ${groupingMode}):`); @@ -823,24 +916,21 @@ export const groupPageTextElements = ( console.log(` β€’ Std deviation: ${stdDev.toFixed(2)}`); console.log(` β€’ Coefficient of variation: ${coefficientOfVariation.toFixed(2)}`); console.log(` Criteria:`); - console.log(` 1. Multi-line + Avg Words: ${criterion1 ? 'βœ… PASS' : '❌ FAIL'}`); - console.log(` (${multiLineGroups} >= 2 AND ${avgWordsPerGroup.toFixed(2)} > 8)`); - console.log(` 2. Avg Words Only: ${criterion2 ? 'βœ… PASS' : '❌ FAIL'}`); + console.log(` 1. Avg Words Per Group: ${criterion1 ? 'βœ… PASS' : '❌ FAIL'}`); console.log(` (${avgWordsPerGroup.toFixed(2)} > 5)`); - console.log(` 3. Long Text Ratio: ${criterion3 ? 'βœ… PASS' : '❌ FAIL'}`); + console.log(` 2. Long Text Ratio: ${criterion2 ? 'βœ… PASS' : '❌ FAIL'}`); console.log(` (${(longTextRatio * 100).toFixed(1)}% > 40%)`); - console.log(` 4. Line Width Pattern: ${criterion4 ? 'βœ… PASS' : '❌ FAIL'}`); + console.log(` 3. Line Width Pattern: ${criterion3 ? 'βœ… PASS' : '❌ FAIL'}`); console.log(` (CV ${coefficientOfVariation.toFixed(2)} > 0.5 OR ${(fullWidthRatio * 100).toFixed(1)}% > 60%)`); console.log(` ${coefficientOfVariation > 0.5 ? 'βœ“ High variance (varying line lengths)' : 'βœ— Low variance'} ${fullWidthRatio > 0.6 ? 'βœ“ Many full-width lines (paragraph-like)' : 'βœ— Few full-width lines (list-like)'}`); console.log(` Decision: ${isParagraphPage ? 'πŸ“ PARAGRAPH MODE' : 'πŸ“‹ LINE MODE'}`); if (isParagraphPage) { - console.log(` Reason: All criteria passed (AND logic)`); + console.log(` Reason: All three criteria passed (AND logic)`); } else { const failedReasons = []; - if (!criterion1) failedReasons.push('insufficient multi-line groups or word density'); - if (!criterion2) failedReasons.push('low average words per group'); - if (!criterion3) failedReasons.push('low ratio of long text groups'); - if (!criterion4) failedReasons.push('low variance and few full-width lines (list-like structure)'); + if (!criterion1) failedReasons.push('low average words per group'); + if (!criterion2) failedReasons.push('low ratio of long text groups'); + if (!criterion3) failedReasons.push('low variance and few full-width lines (list-like structure)'); console.log(` Reason: ${failedReasons.join(', ')}`); } console.log(''); @@ -848,7 +938,7 @@ export const groupPageTextElements = ( // Only apply paragraph grouping if it looks like a paragraph-heavy page if (isParagraphPage) { console.log(`πŸ”€ Applying paragraph grouping to page ${pageIndex}`); - return groupLinesIntoParagraphs(lineGroups, metrics); + return groupLinesIntoParagraphs(lineGroups, pageWidth, metrics); } // For sparse pages, keep lines separate