paragraphs

This commit is contained in:
Anthony Stirling 2025-11-13 22:55:39 +00:00
parent c74f71af3a
commit a58c69016b
4 changed files with 186 additions and 53 deletions

View File

@ -199,14 +199,30 @@ const FontStatusPanel: React.FC<FontStatusPanelProps> = ({ document, pageIndex }
<Accordion variant="contained" defaultValue={hasWarnings ? 'fonts' : undefined}>
<Accordion.Item value="fonts">
<Accordion.Control>
<Group gap="xs" wrap="nowrap">
{statusIcon}
<Text size="sm" fw={500}>
{pageLabel}
</Text>
<Badge size="xs" color={statusColor} variant="dot">
{fonts.length}
</Badge>
<Group gap="xs" wrap="wrap" style={{ flex: 1 }}>
<Group gap="xs" wrap="nowrap">
{statusIcon}
<Text size="sm" fw={500}>
{pageLabel}
</Text>
<Badge size="xs" color={statusColor} variant="dot">
{fonts.length}
</Badge>
</Group>
{/* Warning badges BEFORE expansion */}
<Group gap={4} wrap="wrap">
{summary.systemFallback > 0 && (
<Badge size="xs" color="yellow" variant="filled" leftSection={<WarningIcon sx={{ fontSize: 12 }} />}>
{summary.systemFallback} {t('pdfTextEditor.fontAnalysis.fallback', 'fallback')}
</Badge>
)}
{summary.missing > 0 && (
<Badge size="xs" color="red" variant="filled" leftSection={<ErrorIcon sx={{ fontSize: 12 }} />}>
{summary.missing} {t('pdfTextEditor.fontAnalysis.missing', 'missing')}
</Badge>
)}
</Group>
</Group>
</Accordion.Control>
<Accordion.Panel>

View File

@ -247,20 +247,16 @@ const buildFontLookupKeys = (
* Analyzes text groups on a page to determine if it's paragraph-heavy or sparse.
* Returns true if the page appears to be document-like with substantial text content.
*/
const analyzePageContentType = (groups: TextGroup[]): boolean => {
const analyzePageContentType = (groups: TextGroup[], pageWidth: number): boolean => {
if (groups.length === 0) return false;
let multiLineGroups = 0;
let totalWords = 0;
let longTextGroups = 0;
let totalGroups = 0;
const groupDetails: Array<{
id: string;
lines: number;
words: number;
chars: number;
text: string;
}> = [];
let fullWidthLines = 0;
const wordCounts: number[] = [];
const fullWidthThreshold = pageWidth * 0.7;
groups.forEach((group) => {
const text = (group.text || '').trim();
@ -272,39 +268,46 @@ const analyzePageContentType = (groups: TextGroup[]): boolean => {
const wordCount = text.split(/\s+/).filter((w) => w.length > 0).length;
totalWords += wordCount;
wordCounts.push(wordCount);
// Count multi-line paragraphs
if (lineCount > 1) {
multiLineGroups++;
}
// Count text groups with substantial content (more than a few words)
if (wordCount >= 5 || text.length >= 30) {
// Count text groups with substantial content (≥10 words or ≥50 chars)
if (wordCount >= 10 || text.length >= 50) {
longTextGroups++;
}
groupDetails.push({
id: group.id,
lines: lineCount,
words: wordCount,
chars: text.length,
text: text.substring(0, 50) + (text.length > 50 ? '...' : ''),
});
// Check if this line extends close to the right margin
const rightEdge = group.bounds.right;
if (rightEdge >= fullWidthThreshold) {
fullWidthLines++;
}
});
if (totalGroups === 0) return false;
// Heuristics for paragraph mode:
// 1. Has multiple substantial multi-line groups (2+) AND decent average words
// 2. Average words per group > 12 (strong indicator of document text)
// 3. More than 40% of groups have substantial text (typical of documents)
const avgWordsPerGroup = totalWords / totalGroups;
const longTextRatio = longTextGroups / totalGroups;
const fullWidthRatio = fullWidthLines / totalGroups;
const isParagraphPage =
(multiLineGroups >= 2 && avgWordsPerGroup > 8) ||
avgWordsPerGroup > 12 ||
longTextRatio > 0.4;
// Calculate variance in line lengths
const variance = wordCounts.reduce((sum, count) => {
const diff = count - avgWordsPerGroup;
return sum + diff * diff;
}, 0) / totalGroups;
const stdDev = Math.sqrt(variance);
const coefficientOfVariation = avgWordsPerGroup > 0 ? stdDev / avgWordsPerGroup : 0;
// All 4 criteria must pass for paragraph mode
const criterion1 = multiLineGroups >= 2 && avgWordsPerGroup > 8;
const criterion2 = avgWordsPerGroup > 5;
const criterion3 = longTextRatio > 0.4;
const criterion4 = coefficientOfVariation > 0.5 || fullWidthRatio > 0.6;
const isParagraphPage = criterion1 && criterion2 && criterion3 && criterion4;
return isParagraphPage;
};
@ -543,9 +546,10 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => {
const pageGroups = groupsByPage[selectedPage] ?? [];
const pageImages = imagesByPage[selectedPage] ?? [];
const pagePreview = pagePreviews.get(selectedPage);
const { width: pageWidth, height: pageHeight } = pageDimensions(currentPage);
// Detect if current page contains paragraph-heavy content
const isParagraphPage = useMemo(() => analyzePageContentType(pageGroups), [pageGroups]);
const isParagraphPage = useMemo(() => analyzePageContentType(pageGroups, pageWidth), [pageGroups, pageWidth]);
const isParagraphLayout =
externalGroupingMode === 'paragraph' || (externalGroupingMode === 'auto' && isParagraphPage);
@ -788,7 +792,6 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => {
),
[pageImages],
);
const { width: pageWidth, height: pageHeight } = pageDimensions(currentPage);
const scale = useMemo(() => Math.min(MAX_RENDER_WIDTH / pageWidth, 2.5), [pageWidth]);
const scaledWidth = pageWidth * scale;
const scaledHeight = pageHeight * scale;
@ -1036,14 +1039,37 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => {
position: 'absolute',
top: -8,
right: -8,
zIndex: 10,
zIndex: 9999,
cursor: 'pointer',
pointerEvents: 'auto',
}}
onMouseDown={(event) => {
console.log(`❌ MOUSEDOWN on X button for group ${groupId}`);
event.stopPropagation();
event.preventDefault();
// Find the current group to check if it's already empty
const currentGroups = groupsByPage[pageIndex] ?? [];
const currentGroup = currentGroups.find(g => g.id === groupId);
const currentText = (currentGroup?.text ?? '').trim();
if (currentText.length === 0) {
// Already empty - remove the textbox entirely
console.log(` Text already empty, removing textbox`);
onGroupDelete(pageIndex, groupId);
setActiveGroupId(null);
setEditingGroupId(null);
} else {
// Has text - clear it but keep the textbox
console.log(` Clearing text (textbox remains)`);
onGroupEdit(pageIndex, groupId, '');
}
console.log(` Operation completed`);
}}
onClick={(event) => {
console.log(`❌ X button ONCLICK fired for group ${groupId} on page ${pageIndex}`);
event.stopPropagation();
onGroupDelete(pageIndex, groupId);
setActiveGroupId(null);
setEditingGroupId(null);
event.preventDefault();
}}
>
<CloseIcon style={{ fontSize: 12 }} />

View File

@ -89,6 +89,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
const [loadingImagePages, setLoadingImagePages] = useState<Set<number>>(new Set());
const originalImagesRef = useRef<PdfJsonImageElement[][]>([]);
const originalGroupsRef = useRef<TextGroup[][]>([]);
const imagesByPageRef = useRef<PdfJsonImageElement[][]>([]);
const autoLoadKeyRef = useRef<string | null>(null);
const loadRequestIdRef = useRef(0);
@ -131,7 +132,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
}, []);
// Per-page dirty flags, recomputed whenever the current groups or images change.
// The original groups/images are read from refs at compute time; ref contents are
// not reactive, so they are intentionally absent from the dependency list.
const dirtyPages = useMemo(
  () => getDirtyPages(groupsByPage, imagesByPage, originalGroupsRef.current, originalImagesRef.current),
  [groupsByPage, imagesByPage],
);
const hasChanges = useMemo(() => dirtyPages.some(Boolean), [dirtyPages]);
@ -157,6 +158,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
const images = extractDocumentImages(cloned);
const originalImages = images.map((page) => page.map(cloneImageElement));
originalImagesRef.current = originalImages;
originalGroupsRef.current = groups.map((page) => page.map((group) => ({ ...group })));
imagesByPageRef.current = images.map((page) => page.map(cloneImageElement));
const initialLoaded = new Set<number>();
originalImages.forEach((pageImages, index) => {
@ -595,13 +597,16 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
}, []);
/**
 * Removes the text group with the given id from the given page.
 *
 * Pages other than `pageIndex` are returned untouched (same array reference);
 * the target page is replaced by a filtered copy without the matching group.
 *
 * @param pageIndex - Index of the page that owns the group.
 * @param groupId - Id of the group to remove.
 */
const handleGroupDelete = useCallback((pageIndex: number, groupId: string) => {
  console.log(`🗑️ Deleting group ${groupId} from page ${pageIndex}`);
  setGroupsByPage((previous) => {
    const updated = previous.map((groups, idx) => {
      if (idx !== pageIndex) return groups;
      // Drop the group entirely rather than blanking its text.
      const filtered = groups.filter((group) => group.id !== groupId);
      console.log(` Before: ${groups.length} groups, After: ${filtered.length} groups`);
      return filtered;
    });
    return updated;
  });
}, []);
const handleImageTransform = useCallback(

View File

@ -647,6 +647,8 @@ export const groupPageTextElements = (
return [];
}
const pageWidth = valueOr(page.width, DEFAULT_PAGE_WIDTH);
const elements = page.textElements
.map(cloneTextElement)
.filter((element) => element.text !== null && element.text !== undefined);
@ -749,6 +751,11 @@ export const groupPageTextElements = (
let totalWords = 0;
let longTextGroups = 0;
let totalGroups = 0;
const wordCounts: number[] = [];
let fullWidthLines = 0;
// Define "full width" as extending to at least 70% of page width
const fullWidthThreshold = pageWidth * 0.7;
lineGroups.forEach((group) => {
const text = (group.text || '').trim();
@ -760,14 +767,21 @@ export const groupPageTextElements = (
const wordCount = text.split(/\s+/).filter((w) => w.length > 0).length;
totalWords += wordCount;
wordCounts.push(wordCount);
if (lineCount > 1) {
multiLineGroups++;
}
if (wordCount >= 5 || text.length >= 30) {
if (wordCount >= 10 || text.length >= 50) {
longTextGroups++;
}
// Check if this line extends close to the right margin (paragraph-like)
const rightEdge = group.bounds.right;
if (rightEdge >= fullWidthThreshold) {
fullWidthLines++;
}
});
if (totalGroups === 0) {
@ -776,18 +790,69 @@ export const groupPageTextElements = (
const avgWordsPerGroup = totalWords / totalGroups;
const longTextRatio = longTextGroups / totalGroups;
const fullWidthRatio = fullWidthLines / totalGroups;
const isParagraphPage =
(multiLineGroups >= 2 && avgWordsPerGroup > 8) ||
avgWordsPerGroup > 12 ||
longTextRatio > 0.4;
// Calculate variance in line lengths (paragraphs have varying lengths, lists are uniform)
const variance = wordCounts.reduce((sum, count) => {
const diff = count - avgWordsPerGroup;
return sum + diff * diff;
}, 0) / totalGroups;
const stdDev = Math.sqrt(variance);
const coefficientOfVariation = avgWordsPerGroup > 0 ? stdDev / avgWordsPerGroup : 0;
// Check each criterion
const criterion1 = multiLineGroups >= 2 && avgWordsPerGroup > 8;
const criterion2 = avgWordsPerGroup > 5;
const criterion3 = longTextRatio > 0.4;
const criterion4 = coefficientOfVariation > 0.5 || fullWidthRatio > 0.6; // High variance OR many full-width lines = paragraph text
const isParagraphPage = criterion1 && criterion2 && criterion3 && criterion4;
// Log detection stats
console.log(`📄 Page ${pageIndex} Grouping Analysis (mode: ${groupingMode}):`);
console.log(` Stats:`);
console.log(` • Page width: ${pageWidth.toFixed(1)}pt (full-width threshold: ${fullWidthThreshold.toFixed(1)}pt)`);
console.log(` • Multi-line groups: ${multiLineGroups}`);
console.log(` • Total groups: ${totalGroups}`);
console.log(` • Total words: ${totalWords}`);
console.log(` • Long text groups (≥10 words or ≥50 chars): ${longTextGroups}`);
console.log(` • Full-width lines (≥70% page width): ${fullWidthLines}`);
console.log(` • Avg words per group: ${avgWordsPerGroup.toFixed(2)}`);
console.log(` • Long text ratio: ${(longTextRatio * 100).toFixed(1)}%`);
console.log(` • Full-width ratio: ${(fullWidthRatio * 100).toFixed(1)}%`);
console.log(` • Std deviation: ${stdDev.toFixed(2)}`);
console.log(` • Coefficient of variation: ${coefficientOfVariation.toFixed(2)}`);
console.log(` Criteria:`);
console.log(` 1. Multi-line + Avg Words: ${criterion1 ? '✅ PASS' : '❌ FAIL'}`);
console.log(` (${multiLineGroups} >= 2 AND ${avgWordsPerGroup.toFixed(2)} > 8)`);
console.log(` 2. Avg Words Only: ${criterion2 ? '✅ PASS' : '❌ FAIL'}`);
console.log(` (${avgWordsPerGroup.toFixed(2)} > 5)`);
console.log(` 3. Long Text Ratio: ${criterion3 ? '✅ PASS' : '❌ FAIL'}`);
console.log(` (${(longTextRatio * 100).toFixed(1)}% > 40%)`);
console.log(` 4. Line Width Pattern: ${criterion4 ? '✅ PASS' : '❌ FAIL'}`);
console.log(` (CV ${coefficientOfVariation.toFixed(2)} > 0.5 OR ${(fullWidthRatio * 100).toFixed(1)}% > 60%)`);
console.log(` ${coefficientOfVariation > 0.5 ? '✓ High variance (varying line lengths)' : '✗ Low variance'} ${fullWidthRatio > 0.6 ? '✓ Many full-width lines (paragraph-like)' : '✗ Few full-width lines (list-like)'}`);
console.log(` Decision: ${isParagraphPage ? '📝 PARAGRAPH MODE' : '📋 LINE MODE'}`);
if (isParagraphPage) {
console.log(` Reason: All criteria passed (AND logic)`);
} else {
const failedReasons = [];
if (!criterion1) failedReasons.push('insufficient multi-line groups or word density');
if (!criterion2) failedReasons.push('low average words per group');
if (!criterion3) failedReasons.push('low ratio of long text groups');
if (!criterion4) failedReasons.push('low variance and few full-width lines (list-like structure)');
console.log(` Reason: ${failedReasons.join(', ')}`);
}
console.log('');
// Only apply paragraph grouping if it looks like a paragraph-heavy page
if (isParagraphPage) {
console.log(`🔀 Applying paragraph grouping to page ${pageIndex}`);
return groupLinesIntoParagraphs(lineGroups, metrics);
}
// For sparse pages, keep lines separate
console.log(`📋 Keeping lines separate for page ${pageIndex}`);
return lineGroups;
};
@ -1192,14 +1257,35 @@ export const areImageListsDifferent = (
/**
 * Computes, for every page, whether it differs from its originally loaded state.
 *
 * A page is dirty when any of the following holds:
 *  - at least one group's `text` differs from its `originalText`;
 *  - the number of groups differs from the original number of groups on that page
 *    (this is how deleted groups are detected — only the counts are compared);
 *  - `areImageListsDifferent` reports a difference between current and original images.
 *
 * @param groupsByPage - Current text groups, one array per page.
 * @param imagesByPage - Current images, one array per page.
 * @param originalGroupsByPage - Text groups as originally loaded, one array per page.
 * @param originalImagesByPage - Images as originally loaded, one array per page.
 * @returns One boolean per entry of `groupsByPage`; `true` means the page is dirty.
 */
export const getDirtyPages = (
  groupsByPage: TextGroup[][],
  imagesByPage: PdfJsonImageElement[][],
  originalGroupsByPage: TextGroup[][],
  originalImagesByPage: PdfJsonImageElement[][],
): boolean[] => {
  return groupsByPage.map((groups, index) => {
    // Check if any text was modified
    const textDirty = groups.some((group) => group.text !== group.originalText);

    // Check if any groups were deleted by comparing with original groups.
    // Pages missing from the original list are treated as having had no groups.
    const originalGroups = originalGroupsByPage[index] ?? [];
    const groupCountChanged = groups.length !== originalGroups.length;

    const imageDirty = areImageListsDifferent(
      imagesByPage[index] ?? [],
      originalImagesByPage[index] ?? [],
    );

    const isDirty = textDirty || groupCountChanged || imageDirty;

    // Diagnostic output for text/group changes only; image-only changes are not logged.
    if (groupCountChanged || textDirty) {
      console.log(`📄 Page ${index} dirty check:`, {
        textDirty,
        groupCountChanged,
        originalGroupsLength: originalGroups.length,
        currentGroupsLength: groups.length,
        imageDirty,
        isDirty,
      });
    }

    return isDirty;
  });
};