init cool stuff

This commit is contained in:
Anthony Stirling 2025-11-14 01:09:04 +00:00
parent a58c69016b
commit 8247156f96
4 changed files with 728 additions and 52 deletions

View File

@ -34,6 +34,7 @@ import {
PdfJsonFont, PdfJsonFont,
PdfJsonPage, PdfJsonPage,
ConversionProgress, ConversionProgress,
TextGroup,
} from '@app/tools/pdfTextEditor/pdfTextEditorTypes'; } from '@app/tools/pdfTextEditor/pdfTextEditorTypes';
import { getImageBounds, pageDimensions } from '@app/tools/pdfTextEditor/pdfTextEditorUtils'; import { getImageBounds, pageDimensions } from '@app/tools/pdfTextEditor/pdfTextEditorUtils';
import FontStatusPanel from '@app/components/tools/pdfTextEditor/FontStatusPanel'; import FontStatusPanel from '@app/components/tools/pdfTextEditor/FontStatusPanel';
@ -301,13 +302,12 @@ const analyzePageContentType = (groups: TextGroup[], pageWidth: number): boolean
const stdDev = Math.sqrt(variance); const stdDev = Math.sqrt(variance);
const coefficientOfVariation = avgWordsPerGroup > 0 ? stdDev / avgWordsPerGroup : 0; const coefficientOfVariation = avgWordsPerGroup > 0 ? stdDev / avgWordsPerGroup : 0;
// All 4 criteria must pass for paragraph mode // All 3 criteria must pass for paragraph mode
const criterion1 = multiLineGroups >= 2 && avgWordsPerGroup > 8; const criterion1 = avgWordsPerGroup > 5;
const criterion2 = avgWordsPerGroup > 5; const criterion2 = longTextRatio > 0.4;
const criterion3 = longTextRatio > 0.4; const criterion3 = coefficientOfVariation > 0.5 || fullWidthRatio > 0.6;
const criterion4 = coefficientOfVariation > 0.5 || fullWidthRatio > 0.6;
const isParagraphPage = criterion1 && criterion2 && criterion3 && criterion4; const isParagraphPage = criterion1 && criterion2 && criterion3;
return isParagraphPage; return isParagraphPage;
}; };
@ -319,6 +319,8 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => {
const [activeGroupId, setActiveGroupId] = useState<string | null>(null); const [activeGroupId, setActiveGroupId] = useState<string | null>(null);
const [editingGroupId, setEditingGroupId] = useState<string | null>(null); const [editingGroupId, setEditingGroupId] = useState<string | null>(null);
const [activeImageId, setActiveImageId] = useState<string | null>(null); const [activeImageId, setActiveImageId] = useState<string | null>(null);
// Ids of the text groups that are currently part of the multi-selection (starts empty).
const [selectedGroupIds, setSelectedGroupIds] = useState<Set<string>>(new Set());
// Manual width overrides keyed by group id (starts empty).
const [widthOverrides, setWidthOverrides] = useState<Map<string, number>>(new Map());
const draggingImageRef = useRef<string | null>(null); const draggingImageRef = useRef<string | null>(null);
const rndRefs = useRef<Map<string, any>>(new Map()); const rndRefs = useRef<Map<string, any>>(new Map());
const pendingDragUpdateRef = useRef<number | null>(null); const pendingDragUpdateRef = useRef<number | null>(null);
@ -330,6 +332,15 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => {
const containerRef = useRef<HTMLDivElement | null>(null); const containerRef = useRef<HTMLDivElement | null>(null);
const editorRefs = useRef<Map<string, HTMLDivElement>>(new Map()); const editorRefs = useRef<Map<string, HTMLDivElement>>(new Map());
const caretOffsetsRef = useRef<Map<string, number>>(new Map()); const caretOffsetsRef = useRef<Map<string, number>>(new Map());
// Id of the group used as the anchor for shift-click range selection; null when there is no anchor.
const lastSelectedGroupIdRef = useRef<string | null>(null);
// Ref initialised with the widthOverrides map so it can be read without a re-render.
const widthOverridesRef = useRef<Map<string, number>>(widthOverrides);
// State of an in-progress width drag; null while no drag is active.
const resizingRef = useRef<{
// Group whose width is being dragged.
groupId: string;
// Pointer clientX captured when the drag started.
startX: number;
// Width of the group when the drag started.
startWidth: number;
// Lower clamp for the dragged width.
baseWidth: number;
// Upper clamp for the dragged width.
maxWidth: number;
} | null>(null);
const { const {
document: pdfDocument, document: pdfDocument,
@ -359,6 +370,8 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => {
onGeneratePdf, onGeneratePdf,
onForceSingleTextElementChange, onForceSingleTextElementChange,
onGroupingModeChange, onGroupingModeChange,
onMergeGroups,
onUngroupGroup,
} = data; } = data;
const handleModeChangeRequest = useCallback((newMode: GroupingMode) => { const handleModeChangeRequest = useCallback((newMode: GroupingMode) => {
@ -382,6 +395,15 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => {
setPendingModeChange(null); setPendingModeChange(null);
}, []); }, []);
/**
 * Empties the multi-selection and forgets the range-selection anchor.
 * Stable across renders (no dependencies).
 */
const clearSelection = useCallback(() => {
  lastSelectedGroupIdRef.current = null;
  setSelectedGroupIds(new Set());
}, []);
// Mirror the latest widthOverrides map into widthOverridesRef whenever the state changes.
useEffect(() => {
widthOverridesRef.current = widthOverrides;
}, [widthOverrides]);
const resolveFont = (fontId: string | null | undefined, pageIndex: number | null | undefined): PdfJsonFont | null => { const resolveFont = (fontId: string | null | undefined, pageIndex: number | null | undefined): PdfJsonFont | null => {
if (!fontId || !pdfDocument?.fonts) { if (!fontId || !pdfDocument?.fonts) {
return null; return null;
@ -548,11 +570,78 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => {
const pagePreview = pagePreviews.get(selectedPage); const pagePreview = pagePreviews.get(selectedPage);
const { width: pageWidth, height: pageHeight } = pageDimensions(currentPage); const { width: pageWidth, height: pageHeight } = pageDimensions(currentPage);
// Drop the selection whenever the selected page changes.
useEffect(() => {
clearSelection();
}, [clearSelection, selectedPage]);
// Drop the selection whenever the grouping mode changes.
useEffect(() => {
clearSelection();
}, [clearSelection, externalGroupingMode]);
// Discard every manual width override when the document object changes.
useEffect(() => {
setWidthOverrides(new Map());
}, [pdfDocument]);
// When the page's groups change, drop selected ids and width overrides that no
// longer match a group on the page. The previous state object is returned
// unchanged when nothing was dropped, so no extra re-render is triggered.
useEffect(() => {
  const liveIds = new Set(pageGroups.map((group) => group.id));

  setSelectedGroupIds((prev) => {
    const surviving = Array.from(prev).filter((id) => liveIds.has(id));
    return surviving.length === prev.size ? prev : new Set(surviving);
  });

  setWidthOverrides((prev) => {
    const surviving = new Map<string, number>();
    for (const { id } of pageGroups) {
      if (prev.has(id)) {
        surviving.set(id, prev.get(id) ?? 0);
      }
    }
    return surviving.size === prev.size ? prev : surviving;
  });
}, [pageGroups]);
// Detect if current page contains paragraph-heavy content // Detect if current page contains paragraph-heavy content
const isParagraphPage = useMemo(() => analyzePageContentType(pageGroups, pageWidth), [pageGroups, pageWidth]); const isParagraphPage = useMemo(() => {
const result = analyzePageContentType(pageGroups, pageWidth);
console.log(`🏷️ Page ${selectedPage} badge: ${result ? 'PARAGRAPH' : 'SPARSE'} (${pageGroups.length} groups)`);
return result;
}, [pageGroups, pageWidth, selectedPage]);
const isParagraphLayout = const isParagraphLayout =
externalGroupingMode === 'paragraph' || (externalGroupingMode === 'auto' && isParagraphPage); externalGroupingMode === 'paragraph' || (externalGroupingMode === 'auto' && isParagraphPage);
/**
 * Resolves the effective width of a text group.
 *
 * @param group - Group whose bounds and id are used.
 * @returns `base` (bounds width, at least 1), `max` (distance from the group's
 *   left edge to `pageWidth`, never below `base`) and `width` (the override
 *   clamped into [base, max], or `base` when no usable override exists).
 */
const resolveGroupWidth = useCallback(
  (group: TextGroup): { width: number; base: number; max: number } => {
    const { left, right } = group.bounds;
    const base = Math.max(right - left, 1);
    const max = Math.max(pageWidth - left, base);

    let width = base;
    const override = widthOverrides.get(group.id);
    // Truthiness check on purpose: undefined, 0 and NaN all fall back to the base width.
    if (override) {
      width = Math.min(Math.max(override, base), max);
    }
    return { width, base, max };
  },
  [pageWidth, widthOverrides],
);
// Selection as an array, in Set iteration order.
const selectedGroupIdsArray = useMemo(() => Array.from(selectedGroupIds), [selectedGroupIds]);

// Ascending positions (within pageGroups) of the selected groups that exist on this page.
const selectionIndices = useMemo(() => {
  const positions: number[] = [];
  for (const id of selectedGroupIdsArray) {
    const position = pageGroups.findIndex((group) => group.id === id);
    if (position >= 0) {
      positions.push(position);
    }
  }
  return positions.sort((a, b) => a - b);
}, [pageGroups, selectedGroupIdsArray]);

// Merging needs at least two groups that sit next to each other in pageGroups.
const canMergeSelection =
  selectionIndices.length >= 2 &&
  selectionIndices.slice(1).every((value, offset) => value === selectionIndices[offset] + 1);

// Selected ids whose group carries more than one child line group.
const paragraphSelectionIds = useMemo(
  () =>
    selectedGroupIdsArray.filter((id) => {
      const childCount = pageGroups.find((group) => group.id === id)?.childLineGroups?.length ?? 0;
      return childCount > 1;
    }),
  [pageGroups, selectedGroupIdsArray],
);
const canUngroupSelection = paragraphSelectionIds.length > 0;

// True when at least one selected group has a manual width override.
const hasWidthOverrides = selectedGroupIdsArray.some((id) => widthOverrides.has(id));
const hasSelection = selectedGroupIdsArray.length > 0;
const syncEditorValue = useCallback( const syncEditorValue = useCallback(
( (
element: HTMLElement, element: HTMLElement,
@ -581,6 +670,69 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => {
[editingGroupId, onGroupEdit], [editingGroupId, onGroupEdit],
); );
/**
 * Asks the parent to merge the selected groups, passing their ids in page
 * order. The selection is cleared only when the parent reports success.
 */
const handleMergeSelection = useCallback(() => {
  if (!canMergeSelection) {
    return;
  }

  const orderedIds: string[] = [];
  selectionIndices.forEach((index) => {
    const id = pageGroups[index]?.id;
    if (id) {
      orderedIds.push(id);
    }
  });

  if (orderedIds.length >= 2 && onMergeGroups(selectedPage, orderedIds)) {
    clearSelection();
  }
}, [canMergeSelection, selectionIndices, pageGroups, onMergeGroups, selectedPage, clearSelection]);
/**
 * Asks the parent to ungroup every selected paragraph group. Each id is
 * submitted regardless of earlier results; the selection is cleared when at
 * least one call reports a change.
 */
const handleUngroupSelection = useCallback(() => {
  if (!canUngroupSelection) {
    return;
  }
  // map (not some) so that every id is processed before results are inspected.
  const outcomes = paragraphSelectionIds.map((id) => Boolean(onUngroupGroup(selectedPage, id)));
  if (outcomes.includes(true)) {
    clearSelection();
  }
}, [canUngroupSelection, paragraphSelectionIds, onUngroupGroup, selectedPage, clearSelection]);
/**
 * Applies a width action to every selected group on the page.
 *
 * @param mode - 'expand' stores the maximum width (up to the page edge) as the
 *   override; 'reset' removes the override.
 */
const handleWidthAdjustment = useCallback(
  (mode: 'expand' | 'reset') => {
    const nothingToExpand = mode === 'expand' && !hasSelection;
    const nothingToReset = mode === 'reset' && !hasWidthOverrides;
    if (nothingToExpand || nothingToReset) {
      return;
    }

    const targets: TextGroup[] = [];
    for (const id of selectedGroupIdsArray) {
      const match = pageGroups.find((group) => group.id === id);
      if (match) {
        targets.push(match);
      }
    }
    if (targets.length === 0) {
      return;
    }

    setWidthOverrides((prev) => {
      const updated = new Map(prev);
      for (const target of targets) {
        if (mode === 'expand') {
          const baseWidth = Math.max(target.bounds.right - target.bounds.left, 1);
          updated.set(target.id, Math.max(pageWidth - target.bounds.left, baseWidth));
        } else {
          updated.delete(target.id);
        }
      }
      return updated;
    });
  },
  [hasSelection, hasWidthOverrides, selectedGroupIdsArray, pageGroups, pageWidth],
);
const extractPreferredFontId = useCallback((target?: TextGroup | null) => { const extractPreferredFontId = useCallback((target?: TextGroup | null) => {
if (!target) { if (!target) {
return undefined; return undefined;
@ -874,7 +1026,8 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => {
textSpan.style.transform = 'none'; textSpan.style.transform = 'none';
const bounds = toCssBounds(currentPage, pageHeight, scale, group.bounds); const bounds = toCssBounds(currentPage, pageHeight, scale, group.bounds);
const containerWidth = bounds.width; const { width: resolvedWidth } = resolveGroupWidth(group);
const containerWidth = resolvedWidth * scale;
const textWidth = textSpan.getBoundingClientRect().width; const textWidth = textSpan.getBoundingClientRect().width;
// Restore original transform // Restore original transform
@ -907,6 +1060,7 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => {
fontFamilies.size, fontFamilies.size,
selectedPage, selectedPage,
isParagraphLayout, isParagraphLayout,
resolveGroupWidth,
]); ]);
useLayoutEffect(() => { useLayoutEffect(() => {
@ -977,6 +1131,7 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => {
const handlePageChange = (pageNumber: number) => { const handlePageChange = (pageNumber: number) => {
setActiveGroupId(null); setActiveGroupId(null);
setEditingGroupId(null); setEditingGroupId(null);
clearSelection();
onSelectPage(pageNumber - 1); onSelectPage(pageNumber - 1);
}; };
@ -984,8 +1139,97 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => {
setEditingGroupId(null); setEditingGroupId(null);
setActiveGroupId(null); setActiveGroupId(null);
setActiveImageId(null); setActiveImageId(null);
clearSelection();
}; };
/**
 * Updates the multi-selection in response to a click on a text group.
 *
 * - Ctrl/Cmd click toggles the group in the selection.
 * - Shift click (with an existing anchor) selects the contiguous range between
 *   the anchor and the clicked group.
 * - A plain click selects only the clicked group.
 *
 * @param groupId - Id of the clicked group.
 * @param groupIndex - Position of the clicked group within pageGroups.
 * @param event - The originating mouse event (modifier keys are read from it).
 * @returns true for a plain click (the caller may activate the group), false
 *   when a modifier turned the click into a selection gesture.
 */
const handleSelectionInteraction = useCallback(
  (groupId: string, groupIndex: number, event: React.MouseEvent): boolean => {
    const isToggle = event.metaKey || event.ctrlKey;
    const isRange = event.shiftKey && lastSelectedGroupIdRef.current !== null;

    setSelectedGroupIds((previous) => {
      if (isToggle) {
        const toggled = new Set(previous);
        // delete() reports whether the id was present; add it when it was not.
        if (!toggled.delete(groupId)) {
          toggled.add(groupId);
        }
        return toggled;
      }

      if (!isRange) {
        return new Set([groupId]);
      }

      const anchorId = lastSelectedGroupIdRef.current;
      const anchorIndex = anchorId ? pageGroups.findIndex((group) => group.id === anchorId) : -1;
      if (anchorIndex === -1) {
        // The anchor is no longer on the page: fall back to a single selection.
        return new Set([groupId]);
      }

      const from = Math.min(anchorIndex, groupIndex);
      const to = Math.max(anchorIndex, groupIndex);
      const range = new Set<string>();
      for (let position = from; position <= to; position += 1) {
        const member = pageGroups[position];
        if (member) {
          range.add(member.id);
        }
      }
      return range;
    });

    // A range click keeps the existing anchor; any other click moves it.
    if (!isRange) {
      lastSelectedGroupIdRef.current = groupId;
    }
    return !isToggle && !isRange;
  },
  [pageGroups],
);
/**
 * Starts a horizontal drag that resizes a text group's width.
 *
 * Window-level mousemove/mouseup listeners are installed for the duration of
 * the drag and removed on mouseup. Pointer movement is divided by `scale`
 * before being added to the starting width, and the result is clamped between
 * the group's base width and its maximum width.
 *
 * @param event - Mouse-down event on the resize handle.
 * @param group - Group being resized.
 * @param currentWidth - Width of the group when the drag begins.
 */
const handleResizeStart = useCallback(
  (event: React.MouseEvent, group: TextGroup, currentWidth: number) => {
    const baseWidth = Math.max(group.bounds.right - group.bounds.left, 1);
    const maxWidth = Math.max(pageWidth - group.bounds.left, baseWidth);
    event.stopPropagation();
    event.preventDefault();

    const onMove = (moveEvent: MouseEvent) => {
      const drag = resizingRef.current;
      if (!drag) {
        return;
      }
      moveEvent.preventDefault();

      const widthDelta = (moveEvent.clientX - drag.startX) / scale;
      const clamped = Math.min(Math.max(drag.startWidth + widthDelta, drag.baseWidth), drag.maxWidth);
      // Within half a unit of the base width counts as "no override".
      const backAtBase = Math.abs(clamped - drag.baseWidth) <= 0.5;

      setWidthOverrides((prev) => {
        const updated = new Map(prev);
        if (backAtBase) {
          updated.delete(drag.groupId);
        } else {
          updated.set(drag.groupId, clamped);
        }
        return updated;
      });
    };

    const onUp = () => {
      resizingRef.current = null;
      window.removeEventListener('mousemove', onMove);
      window.removeEventListener('mouseup', onUp);
    };

    resizingRef.current = {
      groupId: group.id,
      startX: event.clientX,
      startWidth: currentWidth,
      baseWidth,
      maxWidth,
    };
    window.addEventListener('mousemove', onMove);
    window.addEventListener('mouseup', onUp);
  },
  [pageWidth, scale],
);
const renderGroupContainer = ( const renderGroupContainer = (
groupId: string, groupId: string,
pageIndex: number, pageIndex: number,
@ -994,6 +1238,8 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => {
content: React.ReactNode, content: React.ReactNode,
onActivate?: (event: React.MouseEvent) => void, onActivate?: (event: React.MouseEvent) => void,
onClick?: (event: React.MouseEvent) => void, onClick?: (event: React.MouseEvent) => void,
isSelected = false,
resizeHandle?: React.ReactNode,
) => ( ) => (
<Box <Box
component="div" component="div"
@ -1004,12 +1250,20 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => {
marginTop: '-3px', marginTop: '-3px',
outline: isActive outline: isActive
? '2px solid var(--mantine-color-blue-5)' ? '2px solid var(--mantine-color-blue-5)'
: isChanged : isSelected
? '1px solid var(--mantine-color-yellow-5)' ? '1px solid var(--mantine-color-violet-5)'
: 'none', : isChanged
? '1px solid var(--mantine-color-yellow-5)'
: 'none',
outlineOffset: '-1px', outlineOffset: '-1px',
borderRadius: 6, borderRadius: 6,
backgroundColor: isChanged || isActive ? 'rgba(250,255,189,0.28)' : 'transparent', backgroundColor: isActive
? 'rgba(184,212,255,0.35)'
: isSelected
? 'rgba(206,190,255,0.32)'
: isChanged
? 'rgba(250,255,189,0.28)'
: 'transparent',
transition: 'outline 120ms ease, background-color 120ms ease', transition: 'outline 120ms ease, background-color 120ms ease',
pointerEvents: 'auto', pointerEvents: 'auto',
overflow: 'visible', overflow: 'visible',
@ -1029,6 +1283,7 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => {
}} }}
> >
{content} {content}
{resizeHandle}
{activeGroupId === groupId && ( {activeGroupId === groupId && (
<ActionIcon <ActionIcon
size="xs" size="xs"
@ -1201,12 +1456,12 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => {
{t('pdfTextEditor.options.groupingMode.title', 'Text Grouping Mode')} {t('pdfTextEditor.options.groupingMode.title', 'Text Grouping Mode')}
</Text> </Text>
{externalGroupingMode === 'auto' && isParagraphPage && ( {externalGroupingMode === 'auto' && isParagraphPage && (
<Badge size="xs" color="blue" variant="light"> <Badge size="xs" color="blue" variant="light" key={`para-${selectedPage}`}>
{t('pdfTextEditor.pageType.paragraph', 'Paragraph page')} {t('pdfTextEditor.pageType.paragraph', 'Paragraph page')}
</Badge> </Badge>
)} )}
{externalGroupingMode === 'auto' && !isParagraphPage && hasDocument && ( {externalGroupingMode === 'auto' && !isParagraphPage && hasDocument && (
<Badge size="xs" color="gray" variant="light"> <Badge size="xs" color="gray" variant="light" key={`sparse-${selectedPage}`}>
{t('pdfTextEditor.pageType.sparse', 'Sparse text')} {t('pdfTextEditor.pageType.sparse', 'Sparse text')}
</Badge> </Badge>
)} )}
@ -1239,6 +1494,59 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => {
/> />
</Stack> </Stack>
<Stack gap="xs">
<Group gap={4} align="center">
<Text fw={500} size="sm">
{t('pdfTextEditor.options.manualGrouping.title', 'Manual Text Grouping')}
</Text>
<Badge size="xs" color="violet" variant="light">
{t('pdfTextEditor.badges.beta', 'Beta')}
</Badge>
</Group>
<Text size="xs" c="dimmed">
{t(
'pdfTextEditor.options.manualGrouping.description',
'Hold Ctrl (Cmd) or Shift while clicking to multi-select text boxes, then merge or ungroup them manually.',
)}
</Text>
<Group grow>
<Button
size="xs"
variant="subtle"
disabled={!canMergeSelection}
onClick={handleMergeSelection}
>
{t('pdfTextEditor.manual.merge', 'Merge selection')}
</Button>
<Button
size="xs"
variant="subtle"
disabled={!canUngroupSelection}
onClick={handleUngroupSelection}
>
{t('pdfTextEditor.manual.ungroup', 'Ungroup selection')}
</Button>
</Group>
<Group grow>
<Button
size="xs"
variant="light"
disabled={!hasSelection}
onClick={() => handleWidthAdjustment('expand')}
>
{t('pdfTextEditor.manual.expandWidth', 'Expand to page edge')}
</Button>
<Button
size="xs"
variant="light"
disabled={!hasWidthOverrides}
onClick={() => handleWidthAdjustment('reset')}
>
{t('pdfTextEditor.manual.resetWidth', 'Reset width')}
</Button>
</Group>
</Stack>
<Group justify="space-between" align="center"> <Group justify="space-between" align="center">
<div> <div>
<Text fw={500} size="sm"> <Text fw={500} size="sm">
@ -1615,7 +1923,8 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => {
let containerLeft = bounds.left; let containerLeft = bounds.left;
let containerTop = bounds.top; let containerTop = bounds.top;
let containerWidth = Math.max(bounds.width, fontSizePx); const { width: resolvedWidth, base: baseWidth, max: maxWidth } = resolveGroupWidth(group);
let containerWidth = Math.max(resolvedWidth * scale, fontSizePx);
let containerHeight = Math.max(bounds.height, paragraphHeightPx); let containerHeight = Math.max(bounds.height, paragraphHeightPx);
let transform: string | undefined; let transform: string | undefined;
let transformOrigin: React.CSSProperties['transformOrigin']; let transformOrigin: React.CSSProperties['transformOrigin'];
@ -1654,14 +1963,15 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => {
// Determine text wrapping behavior based on whether text has been changed // Determine text wrapping behavior based on whether text has been changed
const hasChanges = changed; const hasChanges = changed;
const shouldWrap = hasChanges && isParagraphLayout; const widthExtended = resolvedWidth - baseWidth > 0.5;
const whiteSpace = shouldWrap ? 'pre-wrap' : 'pre'; const enableWrap = isParagraphLayout || widthExtended || isEditing || hasChanges;
const wordBreak = shouldWrap ? 'break-word' : 'normal'; const whiteSpace = enableWrap ? 'pre-wrap' : 'pre';
const overflowWrap = shouldWrap ? 'break-word' : 'normal'; const wordBreak = enableWrap ? 'break-word' : 'normal';
const overflowWrap = enableWrap ? 'break-word' : 'normal';
// For paragraph mode, allow height to grow to accommodate lines without wrapping // For paragraph mode, allow height to grow to accommodate lines without wrapping
// For single-line mode, maintain fixed height based on PDF bounds // For single-line mode, maintain fixed height based on PDF bounds
const useFlexibleHeight = isEditing || shouldWrap || (isParagraphLayout && lineCount > 1); const useFlexibleHeight = isEditing || enableWrap || (isParagraphLayout && lineCount > 1);
// The renderGroupContainer wrapper adds 4px horizontal padding (2px left + 2px right) // The renderGroupContainer wrapper adds 4px horizontal padding (2px left + 2px right)
// We need to add this to the container width to compensate, so the inner content // We need to add this to the container width to compensate, so the inner content
@ -1685,6 +1995,35 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => {
transformOrigin, transformOrigin,
}; };
const showResizeHandle = !hasRotation && (selectedGroupIds.has(group.id) || activeGroupId === group.id);
const resizeHandle = showResizeHandle ? (
<Box
role="button"
aria-label={t('pdfTextEditor.manual.resizeHandle', 'Adjust text width')}
onMouseDown={(event) => handleResizeStart(event, group, resolvedWidth)}
style={{
position: 'absolute',
top: '50%',
right: -6,
width: 12,
height: 32,
marginTop: -16,
cursor: 'ew-resize',
borderRadius: 6,
backgroundColor: 'rgba(76, 110, 245, 0.35)',
border: '1px solid rgba(76, 110, 245, 0.8)',
display: 'flex',
alignItems: 'center',
justifyContent: 'center',
color: 'white',
fontSize: 9,
userSelect: 'none',
}}
>
||
</Box>
) : null;
if (isEditing) { if (isEditing) {
return ( return (
<Box key={group.id} style={containerStyle}> <Box key={group.id} style={containerStyle}>
@ -1741,7 +2080,7 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => {
minHeight: '100%', minHeight: '100%',
height: 'auto', height: 'auto',
padding: 0, padding: 0,
backgroundColor: 'rgba(255,255,255,0.95)', backgroundColor: 'rgba(255,255,255,0.95)',
color: textColor, color: textColor,
fontSize: `${fontSizePx}px`, fontSize: `${fontSizePx}px`,
fontFamily, fontFamily,
@ -1750,15 +2089,19 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => {
outline: 'none', outline: 'none',
border: 'none', border: 'none',
display: 'block', display: 'block',
whiteSpace: isParagraphLayout ? 'pre-wrap' : 'pre', whiteSpace,
wordBreak: isParagraphLayout ? 'break-word' : 'normal', wordBreak,
overflowWrap: isParagraphLayout ? 'break-word' : 'normal', overflowWrap,
cursor: 'text', cursor: 'text',
overflow: 'visible', overflow: 'visible',
}} }}
> >
{group.text || '\u00A0'} {group.text || '\u00A0'}
</div>, </div>,
undefined,
undefined,
selectedGroupIds.has(group.id),
resizeHandle,
)} )}
</Box> </Box>
); );
@ -1790,14 +2133,14 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => {
color: textColor, color: textColor,
display: 'block', display: 'block',
cursor: 'text', cursor: 'text',
overflow: shouldWrap ? 'visible' : 'hidden', overflow: enableWrap ? 'visible' : 'hidden',
}} }}
> >
<span <span
data-text-content data-text-content
style={{ style={{
pointerEvents: 'none', pointerEvents: 'none',
display: shouldWrap ? 'inline' : 'inline-block', display: enableWrap ? 'inline' : 'inline-block',
transform: shouldScale ? `scaleX(${textScale})` : 'none', transform: shouldScale ? `scaleX(${textScale})` : 'none',
transformOrigin: 'left center', transformOrigin: 'left center',
whiteSpace, whiteSpace,
@ -1808,6 +2151,13 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => {
</div>, </div>,
undefined, undefined,
(event: React.MouseEvent) => { (event: React.MouseEvent) => {
const shouldActivate = handleSelectionInteraction(group.id, pageGroupIndex, event);
if (!shouldActivate) {
setActiveGroupId(null);
setEditingGroupId(null);
return;
}
const clickX = event.clientX; const clickX = event.clientX;
const clickY = event.clientY; const clickY = event.clientY;
@ -1815,6 +2165,22 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => {
setEditingGroupId(group.id); setEditingGroupId(group.id);
caretOffsetsRef.current.delete(group.id); caretOffsetsRef.current.delete(group.id);
// Log group stats when selected
const lines = (group.text ?? '').split('\n');
const words = (group.text ?? '').split(/\s+/).filter(w => w.length > 0).length;
const chars = (group.text ?? '').length;
const width = group.bounds.right - group.bounds.left;
const height = group.bounds.bottom - group.bounds.top;
const isMultiLine = lines.length > 1;
console.log(`📝 Selected Text Group "${group.id}":`);
console.log(` Lines: ${lines.length}, Words: ${words}, Chars: ${chars}`);
console.log(` Dimensions: ${width.toFixed(1)}pt × ${height.toFixed(1)}pt`);
console.log(` Type: ${isMultiLine ? 'MULTI-LINE (paragraph)' : 'SINGLE-LINE'}`);
console.log(` Text preview: "${(group.text ?? '').substring(0, 80)}${(group.text ?? '').length > 80 ? '...' : ''}"`);
if (isMultiLine) {
console.log(` Line spacing: ${group.lineSpacing?.toFixed(1) ?? 'unknown'}pt`);
}
requestAnimationFrame(() => { requestAnimationFrame(() => {
const editor = document.querySelector<HTMLElement>(`[data-editor-group="${group.id}"]`); const editor = document.querySelector<HTMLElement>(`[data-editor-group="${group.id}"]`);
if (!editor) return; if (!editor) return;
@ -1846,6 +2212,8 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => {
}, 10); }, 10);
}); });
}, },
selectedGroupIds.has(group.id),
resizeHandle,
)} )}
</Box> </Box>
); );

View File

@ -18,6 +18,7 @@ import {
PdfJsonPage, PdfJsonPage,
TextGroup, TextGroup,
PdfTextEditorViewData, PdfTextEditorViewData,
BoundingBox,
} from './pdfTextEditorTypes'; } from './pdfTextEditorTypes';
import { import {
deepCloneDocument, deepCloneDocument,
@ -26,6 +27,7 @@ import {
restoreGlyphElements, restoreGlyphElements,
extractDocumentImages, extractDocumentImages,
cloneImageElement, cloneImageElement,
cloneTextElement,
valueOr, valueOr,
} from './pdfTextEditorUtils'; } from './pdfTextEditorUtils';
import PdfTextEditorView from '@app/components/tools/pdfTextEditor/PdfTextEditorView'; import PdfTextEditorView from '@app/components/tools/pdfTextEditor/PdfTextEditorView';
@ -52,6 +54,148 @@ const getAutoLoadKey = (file: File): string => {
return `${file.name}|${file.size}|${file.lastModified}`; return `${file.name}|${file.size}|${file.lastModified}`;
}; };
/**
 * Splits text into exactly `expected` lines.
 *
 * Carriage returns are stripped first. Missing lines are padded with empty
 * strings; surplus lines are folded (newline-joined) into the last slot.
 *
 * @param value - Text to split; null/undefined is treated as an empty string.
 * @param expected - Desired number of lines. When it is zero or negative the
 *   whole normalized text is returned as a single entry.
 * @returns The lines, `expected` entries long when `expected` is positive.
 */
const normalizeLineArray = (value: string | undefined | null, expected: number): string[] => {
  const text = (value ?? '').replace(/\r/g, '');
  if (expected <= 0) {
    return [text];
  }

  const lines = text.split('\n');
  const surplus = lines.length - expected;
  if (surplus === 0) {
    return lines;
  }
  if (surplus < 0) {
    const padding: string[] = new Array(-surplus).fill('');
    return [...lines, ...padding];
  }

  // Too many lines: keep the first expected-1 and merge the rest into the last slot.
  const keep = Math.max(expected - 1, 0);
  return [...lines.slice(0, keep), lines.slice(keep).join('\n')];
};
/**
 * Copies a group as a standalone single-line group.
 *
 * Paragraph-only fields (childLineGroups, lineElementCounts, lineSpacing) are
 * nulled, and both element arrays are re-created via cloneTextElement.
 *
 * @param line - Group to copy.
 * @param text - Replacement text; falls back to `line.text` when null/undefined.
 * @param originalText - Replacement original text; falls back to `line.originalText`.
 */
const cloneLineTemplate = (line: TextGroup, text?: string, originalText?: string): TextGroup => {
  const copy: TextGroup = {
    ...line,
    text: text ?? line.text,
    originalText: originalText ?? line.originalText,
    childLineGroups: null,
    lineElementCounts: null,
    lineSpacing: null,
    elements: line.elements.map(cloneTextElement),
    originalElements: line.originalElements.map(cloneTextElement),
  };
  return copy;
};
/**
 * Breaks a group into standalone line groups.
 *
 * A group without child lines yields a single copy of itself. Otherwise each
 * child line is copied and given its share of the group's current and original
 * text, split to match the number of children.
 */
const expandGroupToLines = (group: TextGroup): TextGroup[] => {
  const children = group.childLineGroups;
  if (!children || children.length === 0) {
    return [cloneLineTemplate(group)];
  }

  const count = children.length;
  const currentLines = normalizeLineArray(group.text, count);
  const originalLines = normalizeLineArray(group.originalText, count);
  return children.map((child, position) =>
    cloneLineTemplate(child, currentLines[position], originalLines[position]),
  );
};
/**
 * Computes the smallest box enclosing all given boxes: minimum left/top and
 * maximum right/bottom.
 *
 * @param boxes - Boxes to combine.
 * @returns A new box; an all-zero box when `boxes` is empty.
 */
const mergeBoundingBoxes = (boxes: BoundingBox[]): BoundingBox => {
  if (boxes.length === 0) {
    return { left: 0, right: 0, top: 0, bottom: 0 };
  }

  const seed = boxes[0];
  let minLeft = seed.left;
  let maxRight = seed.right;
  let minTop = seed.top;
  let maxBottom = seed.bottom;
  // The first box is visited again on purpose so every box goes through the same min/max path.
  for (const box of boxes) {
    minLeft = Math.min(minLeft, box.left);
    maxRight = Math.max(maxRight, box.right);
    minTop = Math.min(minTop, box.top);
    maxBottom = Math.max(maxBottom, box.bottom);
  }
  return { left: minLeft, right: maxRight, top: minTop, bottom: maxBottom };
};
/**
 * Builds one paragraph group out of several groups.
 *
 * Every input group is first expanded to its individual lines. The result
 * inherits the remaining fields of the first input group and carries the
 * newline-joined texts, the union of all line bounds, cloned elements taken
 * from the lines' original elements, the mean positive baseline gap as line
 * spacing, and the lines themselves as child line groups.
 *
 * @param groups - Groups to merge, in the order their lines should appear.
 * @returns The merged group, or null when the input expands to fewer than two lines.
 */
const buildMergedGroupFromSelection = (groups: TextGroup[]): TextGroup | null => {
  if (groups.length === 0) {
    return null;
  }
  const lines = groups.flatMap(expandGroupToLines);
  if (lines.length <= 1) {
    return null;
  }

  const texts = lines.map((line) => line.text ?? '');
  const originalTexts = lines.map((line) => line.originalText ?? '');
  const originals = lines.flatMap((line) => line.originalElements.map(cloneTextElement));
  const elements = originals.map(cloneTextElement);
  const bounds = mergeBoundingBoxes(lines.map((line) => line.bounds));

  // Baseline distance between consecutive lines; a missing baseline falls back to the box bottom.
  const baselineOf = (line: TextGroup) => line.baseline ?? line.bounds.bottom;
  const gaps = lines
    .slice(1)
    .map((line, offset) => Math.abs(baselineOf(lines[offset]) - baselineOf(line)))
    .filter((gap) => gap > 0);
  const lineSpacing =
    gaps.length > 0 ? gaps.reduce((total, gap) => total + gap, 0) / gaps.length : null;

  const elementCounts = lines.map((line) => Math.max(line.originalElements.length, 1));
  const childLineGroups = lines.map((line, position) =>
    cloneLineTemplate(line, texts[position], originalTexts[position]),
  );

  const merged: TextGroup = {
    ...groups[0],
    text: texts.join('\n'),
    originalText: originalTexts.join('\n'),
    elements,
    originalElements: originals,
    bounds,
    lineSpacing,
    lineElementCounts: elementCounts.length > 1 ? elementCounts : null,
    childLineGroups,
  };
  return merged;
};
/**
 * Splits a paragraph group back into one group per child line.
 *
 * The group's original elements are handed out to the lines in order, using
 * `lineElementCounts` when it matches the number of lines and each child's own
 * original element count otherwise. Any originals left over are assigned to
 * the last line. Each resulting group gets a fresh id derived from the
 * parent id, the line number and the current time.
 *
 * @param group - Paragraph group to split.
 * @returns The line groups, or an empty array when the group has at most one child line.
 */
const splitParagraphGroup = (group: TextGroup): TextGroup[] => {
  const children = group.childLineGroups;
  if (!children || children.length <= 1) {
    return [];
  }

  const templates = children.map((child) => cloneLineTemplate(child));
  const lineCount = templates.length;
  const currentLines = normalizeLineArray(group.text, lineCount);
  const originalLines = normalizeLineArray(group.originalText, lineCount);

  const storedCounts = group.lineElementCounts;
  const counts =
    storedCounts && storedCounts.length === lineCount
      ? [...storedCounts]
      : templates.map((line) => Math.max(line.originalElements.length, 1));

  // Give any originals not covered by the counts to the last line.
  const uncovered = group.originalElements.length - counts.reduce((sum, count) => sum + count, 0);
  if (uncovered > 0 && counts.length > 0) {
    counts[counts.length - 1] += uncovered;
  }

  const result: TextGroup[] = [];
  let cursor = 0;
  templates.forEach((template, index) => {
    const take = Math.max(1, counts[index] ?? 1);
    const originals = group.originalElements.slice(cursor, cursor + take).map(cloneTextElement);
    cursor += take;
    result.push({
      ...template,
      id: `${group.id}-line-${index + 1}-${Date.now()}-${index}`,
      text: currentLines[index] ?? '',
      originalText: originalLines[index] ?? '',
      elements: originals.map(cloneTextElement),
      originalElements: originals,
      lineElementCounts: null,
      lineSpacing: null,
      childLineGroups: null,
    });
  });
  return result;
};
const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => { const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
const { t } = useTranslation(); const { t } = useTranslation();
const { const {
@ -609,6 +753,73 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
}); });
}, []); }, []);
/**
 * Merges a run of adjacent groups on one page into a single paragraph group.
 *
 * The page's groups are left untouched when fewer than two ids are given, when
 * any id is not found on the page, when the groups are not contiguous, or when
 * buildMergedGroupFromSelection returns null.
 *
 * @param pageIndex - Index of the page within groupsByPage.
 * @param groupIds - Ids of the groups to merge.
 * @returns The value of the `updated` flag at the time of return.
 *
 * NOTE(review): `updated` is assigned inside the setGroupsByPage updater and
 * read right after the setter call. React queues updater functions and does
 * not guarantee that they run before the setter returns, so this may report
 * false even though the merge is applied on the next render. Confirm, and
 * consider deriving the result from the current state instead.
 */
const handleMergeGroups = useCallback((pageIndex: number, groupIds: string[]): boolean => {
if (groupIds.length < 2) {
return false;
}
let updated = false;
setGroupsByPage((previous) =>
previous.map((groups, idx) => {
// Only the targeted page is rewritten; other pages keep their array identity.
if (idx !== pageIndex) {
return groups;
}
const indices = groupIds
.map((id) => groups.findIndex((group) => group.id === id))
.filter((index) => index >= 0);
// Bail out if any requested id is missing from this page.
if (indices.length !== groupIds.length) {
return groups;
}
const sorted = [...indices].sort((a, b) => a - b);
// Require a gap-free run of positions.
for (let i = 1; i < sorted.length; i += 1) {
if (sorted[i] !== sorted[i - 1] + 1) {
return groups;
}
}
const selection = sorted.map((position) => groups[position]);
const merged = buildMergedGroupFromSelection(selection);
if (!merged) {
return groups;
}
// Replace the whole run with the single merged group.
const next = [
...groups.slice(0, sorted[0]),
merged,
...groups.slice(sorted[sorted.length - 1] + 1),
];
updated = true;
return next;
}),
);
return updated;
}, []);
/**
 * Replaces one text group on a page with the groups returned by
 * `splitParagraphGroup` for it.
 *
 * The page is left untouched when the id is not found on the page or when the
 * split yields one group or fewer.
 *
 * @param pageIndex - Index of the page containing the group.
 * @param groupId - Id of the group to split apart.
 * @returns The value of the local success flag when this call returns.
 */
const handleUngroupGroup = useCallback((pageIndex: number, groupId: string): boolean => {
  // NOTE(review): this flag is written from inside the state updater below, so
  // the returned value is only meaningful if the updater has already executed
  // by the time this callback returns - confirm callers can rely on that.
  let didSplit = false;

  setGroupsByPage((pages) =>
    pages.map((pageGroups, pageIdx) => {
      if (pageIdx !== pageIndex) {
        return pageGroups;
      }

      const position = pageGroups.findIndex((candidate) => candidate.id === groupId);
      if (position < 0) {
        return pageGroups;
      }

      const pieces = splitParagraphGroup(pageGroups[position]);
      if (pieces.length <= 1) {
        return pageGroups;
      }

      // Swap the single target entry for its pieces on a copy of the page array.
      const nextGroups = [...pageGroups];
      nextGroups.splice(position, 1, ...pieces);
      didSplit = true;
      return nextGroups;
    }),
  );

  return didSplit;
}, []);
const handleImageTransform = useCallback( const handleImageTransform = useCallback(
( (
pageIndex: number, pageIndex: number,
@ -1064,7 +1275,11 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
onGeneratePdf: handleGeneratePdf, onGeneratePdf: handleGeneratePdf,
onForceSingleTextElementChange: setForceSingleTextElement, onForceSingleTextElementChange: setForceSingleTextElement,
onGroupingModeChange: setGroupingMode, onGroupingModeChange: setGroupingMode,
onMergeGroups: handleMergeGroups,
onUngroupGroup: handleUngroupGroup,
}), [ }), [
handleMergeGroups,
handleUngroupGroup,
handleImageTransform, handleImageTransform,
imagesByPage, imagesByPage,
pagePreviews, pagePreviews,

View File

@ -168,6 +168,7 @@ export interface TextGroup {
text: string; text: string;
originalText: string; originalText: string;
bounds: BoundingBox; bounds: BoundingBox;
childLineGroups?: TextGroup[] | null;
} }
export const DEFAULT_PAGE_WIDTH = 612; export const DEFAULT_PAGE_WIDTH = 612;
@ -219,4 +220,6 @@ export interface PdfTextEditorViewData {
onGeneratePdf: () => void; onGeneratePdf: () => void;
onForceSingleTextElementChange: (value: boolean) => void; onForceSingleTextElementChange: (value: boolean) => void;
onGroupingModeChange: (value: 'auto' | 'paragraph' | 'singleLine') => void; onGroupingModeChange: (value: 'auto' | 'paragraph' | 'singleLine') => void;
onMergeGroups: (pageIndex: number, groupIds: string[]) => boolean;
onUngroupGroup: (pageIndex: number, groupId: string) => boolean;
} }

View File

@ -520,8 +520,18 @@ const createGroup = (
}; };
}; };
/**
 * Builds a copy of a line group with its paragraph-level fields cleared.
 *
 * All properties of `line` are carried over, `childLineGroups`,
 * `lineElementCounts` and `lineSpacing` are reset to `null`, and both element
 * arrays are rebuilt by passing each entry through `cloneTextElement`.
 *
 * @param line - The line group to copy.
 * @returns A new group object; the input object is not modified here.
 */
const cloneLineTemplate = (line: TextGroup): TextGroup => {
  const clonedElements = line.elements.map(cloneTextElement);
  const clonedOriginalElements = line.originalElements.map(cloneTextElement);

  return {
    ...line,
    childLineGroups: null,
    lineElementCounts: null,
    lineSpacing: null,
    elements: clonedElements,
    originalElements: clonedOriginalElements,
  };
};
const groupLinesIntoParagraphs = ( const groupLinesIntoParagraphs = (
lineGroups: TextGroup[], lineGroups: TextGroup[],
pageWidth: number,
metrics?: FontMetricsMap, metrics?: FontMetricsMap,
): TextGroup[] => { ): TextGroup[] => {
if (lineGroups.length === 0) { if (lineGroups.length === 0) {
@ -530,6 +540,8 @@ const groupLinesIntoParagraphs = (
const paragraphs: TextGroup[][] = []; const paragraphs: TextGroup[][] = [];
let currentParagraph: TextGroup[] = [lineGroups[0]]; let currentParagraph: TextGroup[] = [lineGroups[0]];
const bulletFlags = new Map<string, boolean>();
bulletFlags.set(lineGroups[0].id, false);
for (let i = 1; i < lineGroups.length; i++) { for (let i = 1; i < lineGroups.length; i++) {
const prevLine = lineGroups[i - 1]; const prevLine = lineGroups[i - 1];
@ -561,11 +573,85 @@ const groupLinesIntoParagraphs = (
const maxReasonableSpacing = avgFontSize * 3.0; // Max ~3x font size for normal line spacing const maxReasonableSpacing = avgFontSize * 3.0; // Max ~3x font size for normal line spacing
const hasReasonableSpacing = lineSpacing <= maxReasonableSpacing; const hasReasonableSpacing = lineSpacing <= maxReasonableSpacing;
// Check if current line looks like a bullet/list item
const prevRight = prevLine.bounds.right;
const currentRight = currentLine.bounds.right;
const prevWidth = prevRight - prevLeft;
const currentWidth = currentRight - currentLeft;
// Count word count to help identify bullets (typically short)
const prevWords = (prevLine.text ?? '').split(/\s+/).filter(w => w.length > 0).length;
const currentWords = (currentLine.text ?? '').split(/\s+/).filter(w => w.length > 0).length;
const prevText = (prevLine.text ?? '').trim();
const currentText = (currentLine.text ?? '').trim();
// Bullet detection - look for bullet markers or very short lines
const bulletMarkerRegex = /^[\u2022\u2023\u25E6\u2043\u2219•·◦‣\-\*]\s|^\d+[\.\)]\s|^[a-z][\.\)]\s/i;
const prevHasBulletMarker = bulletMarkerRegex.test(prevText);
const currentHasBulletMarker = bulletMarkerRegex.test(currentText);
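// The current line is classified as a bullet/list item (`currentIsBullet`) when it is
// not a wrapped continuation of the previous line (`wrapCandidate`) and at least one holds:
// 1. It starts with a bullet marker, a number, or a single letter followed by "." or ")", OR
// 2. It is indented past the previous line and is short (<= 14 words) or narrow (<= 65% of the reference width), OR
// 3. The previous line looks like a heading and the current line is short (<= 16 words),
//    not near full width (<= 80%), or the heading itself is narrow (< 35%), OR
// 4. It is very short (<= 8 words), narrow (<= 45% of the reference width), and more than
//    `avgFontSize * 4` narrower than the previous line.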
const headingKeywords = ['action items', 'next steps', 'notes', 'logistics', 'tasks'];
const normalizedPageWidth = pageWidth > 0 ? pageWidth : avgFontSize * 70;
const maxReferenceWidth = normalizedPageWidth > 0 ? normalizedPageWidth : avgFontSize * 70;
const indentDelta = currentLeft - prevLeft;
const indentThreshold = Math.max(avgFontSize * 0.6, 8);
const hasIndent = indentDelta > indentThreshold;
const currentWidthRatio = maxReferenceWidth > 0 ? currentWidth / maxReferenceWidth : 0;
const prevWidthRatio = maxReferenceWidth > 0 ? prevWidth / maxReferenceWidth : 0;
const prevLooksLikeHeading =
prevText.endsWith(':') ||
(prevWords <= 4 && prevWidthRatio < 0.4) ||
headingKeywords.some((keyword) => prevText.toLowerCase().includes(keyword));
const wrapCandidate =
!currentHasBulletMarker &&
!hasIndent &&
!prevLooksLikeHeading &&
currentWords <= 12 &&
currentWidthRatio < 0.45 &&
Math.abs(prevLeft - currentLeft) <= leftAlignmentTolerance &&
currentWidth < prevWidth * 0.85;
const currentIsBullet = wrapCandidate
? false
: currentHasBulletMarker ||
(hasIndent && (currentWords <= 14 || currentWidthRatio <= 0.65)) ||
(prevLooksLikeHeading && (currentWords <= 16 || currentWidthRatio <= 0.8 || prevWidthRatio < 0.35)) ||
(currentWords <= 8 && currentWidthRatio <= 0.45 && prevWidth - currentWidth > avgFontSize * 4);
const prevIsBullet = bulletFlags.get(prevLine.id) ?? prevHasBulletMarker;
bulletFlags.set(currentLine.id, currentIsBullet);
// Detect paragraph→bullet transition
const likelyBulletStart = !prevIsBullet && currentIsBullet;
// Don't merge two consecutive bullets
const bothAreBullets = prevIsBullet && currentIsBullet;
// Merge into paragraph if: // Merge into paragraph if:
// 1. Left aligned // 1. Left aligned
// 2. Same font // 2. Same font
// 3. Reasonable line spacing (not a large gap indicating paragraph break) // 3. Reasonable line spacing
const shouldMerge = isLeftAligned && sameFont && hasReasonableSpacing; // 4. NOT transitioning to bullets
// 5. NOT both are bullets
const shouldMerge =
isLeftAligned &&
sameFont &&
hasReasonableSpacing &&
!likelyBulletStart &&
!bothAreBullets &&
!currentIsBullet;
if (i < 10 || likelyBulletStart || bothAreBullets || !shouldMerge) {
console.log(` Line ${i}:`);
console.log(` prev: "${prevText.substring(0, 40)}" (${prevWords}w, ${prevWidth.toFixed(0)}pt, marker:${prevHasBulletMarker}, bullet:${prevIsBullet})`);
console.log(` curr: "${currentText.substring(0, 40)}" (${currentWords}w, ${currentWidth.toFixed(0)}pt, marker:${currentHasBulletMarker}, bullet:${currentIsBullet})`);
console.log(` checks: leftAlign:${isLeftAligned} (${Math.abs(prevLeft - currentLeft).toFixed(1)}pt), sameFont:${sameFont}, spacing:${hasReasonableSpacing} (${lineSpacing.toFixed(1)}pt/${maxReasonableSpacing.toFixed(1)}pt)`);
console.log(` decision: merge=${shouldMerge} (bulletStart:${likelyBulletStart}, bothBullets:${bothAreBullets})`);
}
if (shouldMerge) { if (shouldMerge) {
currentParagraph.push(currentLine); currentParagraph.push(currentLine);
@ -587,17 +673,24 @@ const groupLinesIntoParagraphs = (
} }
// Combine all elements from all lines // Combine all elements from all lines
const allElements = lines.flatMap(line => line.originalElements); const lineTemplates = lines.map(line => cloneLineTemplate(line));
const flattenedLineTemplates = lineTemplates.flatMap((line) =>
line.childLineGroups && line.childLineGroups.length > 0
? line.childLineGroups
: [line],
);
const allLines = flattenedLineTemplates.length > 0 ? flattenedLineTemplates : lineTemplates;
const allElements = allLines.flatMap(line => line.originalElements);
const pageIndex = lines[0].pageIndex; const pageIndex = lines[0].pageIndex;
const lineElementCounts = lines.map((line) => line.originalElements.length); const lineElementCounts = allLines.map((line) => line.originalElements.length);
// Create merged group with newlines between lines // Create merged group with newlines between lines
const paragraphText = lines.map(line => line.text).join('\n'); const paragraphText = allLines.map(line => line.text).join('\n');
const mergedBounds = mergeBounds(lines.map(line => line.bounds)); const mergedBounds = mergeBounds(allLines.map(line => line.bounds));
const spacingValues: number[] = []; const spacingValues: number[] = [];
for (let i = 1; i < lines.length; i++) { for (let i = 1; i < allLines.length; i++) {
const prevBaseline = lines[i - 1].baseline ?? lines[i - 1].bounds.bottom; const prevBaseline = allLines[i - 1].baseline ?? allLines[i - 1].bounds.bottom;
const currentBaseline = lines[i].baseline ?? lines[i].bounds.bottom; const currentBaseline = allLines[i].baseline ?? allLines[i].bounds.bottom;
const spacing = Math.abs(prevBaseline - currentBaseline); const spacing = Math.abs(prevBaseline - currentBaseline);
if (spacing > 0) { if (spacing > 0) {
spacingValues.push(spacing); spacingValues.push(spacing);
@ -633,6 +726,7 @@ const groupLinesIntoParagraphs = (
text: paragraphText, text: paragraphText,
originalText: paragraphText, originalText: paragraphText,
bounds: mergedBounds, bounds: mergedBounds,
childLineGroups: allLines,
}; };
}); });
}; };
@ -742,7 +836,7 @@ export const groupPageTextElements = (
if (groupingMode === 'paragraph') { if (groupingMode === 'paragraph') {
// Paragraph mode: always apply grouping // Paragraph mode: always apply grouping
return groupLinesIntoParagraphs(lineGroups, metrics); return groupLinesIntoParagraphs(lineGroups, pageWidth, metrics);
} }
// Auto mode: use heuristic to determine if we should group // Auto mode: use heuristic to determine if we should group
@ -801,12 +895,11 @@ export const groupPageTextElements = (
const coefficientOfVariation = avgWordsPerGroup > 0 ? stdDev / avgWordsPerGroup : 0; const coefficientOfVariation = avgWordsPerGroup > 0 ? stdDev / avgWordsPerGroup : 0;
// Check each criterion // Check each criterion
const criterion1 = multiLineGroups >= 2 && avgWordsPerGroup > 8; const criterion1 = avgWordsPerGroup > 5;
const criterion2 = avgWordsPerGroup > 5; const criterion2 = longTextRatio > 0.4;
const criterion3 = longTextRatio > 0.4; const criterion3 = coefficientOfVariation > 0.5 || fullWidthRatio > 0.6; // High variance OR many full-width lines = paragraph text
const criterion4 = coefficientOfVariation > 0.5 || fullWidthRatio > 0.6; // High variance OR many full-width lines = paragraph text
const isParagraphPage = criterion1 && criterion2 && criterion3 && criterion4; const isParagraphPage = criterion1 && criterion2 && criterion3;
// Log detection stats // Log detection stats
console.log(`📄 Page ${pageIndex} Grouping Analysis (mode: ${groupingMode}):`); console.log(`📄 Page ${pageIndex} Grouping Analysis (mode: ${groupingMode}):`);
@ -823,24 +916,21 @@ export const groupPageTextElements = (
console.log(` • Std deviation: ${stdDev.toFixed(2)}`); console.log(` • Std deviation: ${stdDev.toFixed(2)}`);
console.log(` • Coefficient of variation: ${coefficientOfVariation.toFixed(2)}`); console.log(` • Coefficient of variation: ${coefficientOfVariation.toFixed(2)}`);
console.log(` Criteria:`); console.log(` Criteria:`);
console.log(` 1. Multi-line + Avg Words: ${criterion1 ? '✅ PASS' : '❌ FAIL'}`); console.log(` 1. Avg Words Per Group: ${criterion1 ? '✅ PASS' : '❌ FAIL'}`);
console.log(` (${multiLineGroups} >= 2 AND ${avgWordsPerGroup.toFixed(2)} > 8)`);
console.log(` 2. Avg Words Only: ${criterion2 ? '✅ PASS' : '❌ FAIL'}`);
console.log(` (${avgWordsPerGroup.toFixed(2)} > 5)`); console.log(` (${avgWordsPerGroup.toFixed(2)} > 5)`);
console.log(` 3. Long Text Ratio: ${criterion3 ? '✅ PASS' : '❌ FAIL'}`); console.log(` 2. Long Text Ratio: ${criterion2 ? '✅ PASS' : '❌ FAIL'}`);
console.log(` (${(longTextRatio * 100).toFixed(1)}% > 40%)`); console.log(` (${(longTextRatio * 100).toFixed(1)}% > 40%)`);
console.log(` 4. Line Width Pattern: ${criterion4 ? '✅ PASS' : '❌ FAIL'}`); console.log(` 3. Line Width Pattern: ${criterion3 ? '✅ PASS' : '❌ FAIL'}`);
console.log(` (CV ${coefficientOfVariation.toFixed(2)} > 0.5 OR ${(fullWidthRatio * 100).toFixed(1)}% > 60%)`); console.log(` (CV ${coefficientOfVariation.toFixed(2)} > 0.5 OR ${(fullWidthRatio * 100).toFixed(1)}% > 60%)`);
console.log(` ${coefficientOfVariation > 0.5 ? '✓ High variance (varying line lengths)' : '✗ Low variance'} ${fullWidthRatio > 0.6 ? '✓ Many full-width lines (paragraph-like)' : '✗ Few full-width lines (list-like)'}`); console.log(` ${coefficientOfVariation > 0.5 ? '✓ High variance (varying line lengths)' : '✗ Low variance'} ${fullWidthRatio > 0.6 ? '✓ Many full-width lines (paragraph-like)' : '✗ Few full-width lines (list-like)'}`);
console.log(` Decision: ${isParagraphPage ? '📝 PARAGRAPH MODE' : '📋 LINE MODE'}`); console.log(` Decision: ${isParagraphPage ? '📝 PARAGRAPH MODE' : '📋 LINE MODE'}`);
if (isParagraphPage) { if (isParagraphPage) {
console.log(` Reason: All criteria passed (AND logic)`); console.log(` Reason: All three criteria passed (AND logic)`);
} else { } else {
const failedReasons = []; const failedReasons = [];
if (!criterion1) failedReasons.push('insufficient multi-line groups or word density'); if (!criterion1) failedReasons.push('low average words per group');
if (!criterion2) failedReasons.push('low average words per group'); if (!criterion2) failedReasons.push('low ratio of long text groups');
if (!criterion3) failedReasons.push('low ratio of long text groups'); if (!criterion3) failedReasons.push('low variance and few full-width lines (list-like structure)');
if (!criterion4) failedReasons.push('low variance and few full-width lines (list-like structure)');
console.log(` Reason: ${failedReasons.join(', ')}`); console.log(` Reason: ${failedReasons.join(', ')}`);
} }
console.log(''); console.log('');
@ -848,7 +938,7 @@ export const groupPageTextElements = (
// Only apply paragraph grouping if it looks like a paragraph-heavy page // Only apply paragraph grouping if it looks like a paragraph-heavy page
if (isParagraphPage) { if (isParagraphPage) {
console.log(`🔀 Applying paragraph grouping to page ${pageIndex}`); console.log(`🔀 Applying paragraph grouping to page ${pageIndex}`);
return groupLinesIntoParagraphs(lineGroups, metrics); return groupLinesIntoParagraphs(lineGroups, pageWidth, metrics);
} }
// For sparse pages, keep lines separate // For sparse pages, keep lines separate