mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-11-16 01:21:16 +01:00
paragraphs
This commit is contained in:
parent
c74f71af3a
commit
a58c69016b
@ -199,14 +199,30 @@ const FontStatusPanel: React.FC<FontStatusPanelProps> = ({ document, pageIndex }
|
||||
<Accordion variant="contained" defaultValue={hasWarnings ? 'fonts' : undefined}>
|
||||
<Accordion.Item value="fonts">
|
||||
<Accordion.Control>
|
||||
<Group gap="xs" wrap="nowrap">
|
||||
{statusIcon}
|
||||
<Text size="sm" fw={500}>
|
||||
{pageLabel}
|
||||
</Text>
|
||||
<Badge size="xs" color={statusColor} variant="dot">
|
||||
{fonts.length}
|
||||
</Badge>
|
||||
<Group gap="xs" wrap="wrap" style={{ flex: 1 }}>
|
||||
<Group gap="xs" wrap="nowrap">
|
||||
{statusIcon}
|
||||
<Text size="sm" fw={500}>
|
||||
{pageLabel}
|
||||
</Text>
|
||||
<Badge size="xs" color={statusColor} variant="dot">
|
||||
{fonts.length}
|
||||
</Badge>
|
||||
</Group>
|
||||
|
||||
{/* Warning badges BEFORE expansion */}
|
||||
<Group gap={4} wrap="wrap">
|
||||
{summary.systemFallback > 0 && (
|
||||
<Badge size="xs" color="yellow" variant="filled" leftSection={<WarningIcon sx={{ fontSize: 12 }} />}>
|
||||
{summary.systemFallback} {t('pdfTextEditor.fontAnalysis.fallback', 'fallback')}
|
||||
</Badge>
|
||||
)}
|
||||
{summary.missing > 0 && (
|
||||
<Badge size="xs" color="red" variant="filled" leftSection={<ErrorIcon sx={{ fontSize: 12 }} />}>
|
||||
{summary.missing} {t('pdfTextEditor.fontAnalysis.missing', 'missing')}
|
||||
</Badge>
|
||||
)}
|
||||
</Group>
|
||||
</Group>
|
||||
</Accordion.Control>
|
||||
<Accordion.Panel>
|
||||
|
||||
@ -247,20 +247,16 @@ const buildFontLookupKeys = (
|
||||
* Analyzes text groups on a page to determine if it's paragraph-heavy or sparse.
|
||||
* Returns true if the page appears to be document-like with substantial text content.
|
||||
*/
|
||||
const analyzePageContentType = (groups: TextGroup[]): boolean => {
|
||||
const analyzePageContentType = (groups: TextGroup[], pageWidth: number): boolean => {
|
||||
if (groups.length === 0) return false;
|
||||
|
||||
let multiLineGroups = 0;
|
||||
let totalWords = 0;
|
||||
let longTextGroups = 0;
|
||||
let totalGroups = 0;
|
||||
const groupDetails: Array<{
|
||||
id: string;
|
||||
lines: number;
|
||||
words: number;
|
||||
chars: number;
|
||||
text: string;
|
||||
}> = [];
|
||||
let fullWidthLines = 0;
|
||||
const wordCounts: number[] = [];
|
||||
const fullWidthThreshold = pageWidth * 0.7;
|
||||
|
||||
groups.forEach((group) => {
|
||||
const text = (group.text || '').trim();
|
||||
@ -272,39 +268,46 @@ const analyzePageContentType = (groups: TextGroup[]): boolean => {
|
||||
const wordCount = text.split(/\s+/).filter((w) => w.length > 0).length;
|
||||
|
||||
totalWords += wordCount;
|
||||
wordCounts.push(wordCount);
|
||||
|
||||
// Count multi-line paragraphs
|
||||
if (lineCount > 1) {
|
||||
multiLineGroups++;
|
||||
}
|
||||
|
||||
// Count text groups with substantial content (more than a few words)
|
||||
if (wordCount >= 5 || text.length >= 30) {
|
||||
// Count text groups with substantial content (≥10 words or ≥50 chars)
|
||||
if (wordCount >= 10 || text.length >= 50) {
|
||||
longTextGroups++;
|
||||
}
|
||||
|
||||
groupDetails.push({
|
||||
id: group.id,
|
||||
lines: lineCount,
|
||||
words: wordCount,
|
||||
chars: text.length,
|
||||
text: text.substring(0, 50) + (text.length > 50 ? '...' : ''),
|
||||
});
|
||||
// Check if this line extends close to the right margin
|
||||
const rightEdge = group.bounds.right;
|
||||
if (rightEdge >= fullWidthThreshold) {
|
||||
fullWidthLines++;
|
||||
}
|
||||
});
|
||||
|
||||
if (totalGroups === 0) return false;
|
||||
|
||||
// Heuristics for paragraph mode:
|
||||
// 1. Has multiple substantial multi-line groups (2+) AND decent average words
|
||||
// 2. Average words per group > 12 (strong indicator of document text)
|
||||
// 3. More than 40% of groups have substantial text (typical of documents)
|
||||
const avgWordsPerGroup = totalWords / totalGroups;
|
||||
const longTextRatio = longTextGroups / totalGroups;
|
||||
const fullWidthRatio = fullWidthLines / totalGroups;
|
||||
|
||||
const isParagraphPage =
|
||||
(multiLineGroups >= 2 && avgWordsPerGroup > 8) ||
|
||||
avgWordsPerGroup > 12 ||
|
||||
longTextRatio > 0.4;
|
||||
// Calculate variance in line lengths
|
||||
const variance = wordCounts.reduce((sum, count) => {
|
||||
const diff = count - avgWordsPerGroup;
|
||||
return sum + diff * diff;
|
||||
}, 0) / totalGroups;
|
||||
const stdDev = Math.sqrt(variance);
|
||||
const coefficientOfVariation = avgWordsPerGroup > 0 ? stdDev / avgWordsPerGroup : 0;
|
||||
|
||||
// All 4 criteria must pass for paragraph mode
|
||||
const criterion1 = multiLineGroups >= 2 && avgWordsPerGroup > 8;
|
||||
const criterion2 = avgWordsPerGroup > 5;
|
||||
const criterion3 = longTextRatio > 0.4;
|
||||
const criterion4 = coefficientOfVariation > 0.5 || fullWidthRatio > 0.6;
|
||||
|
||||
const isParagraphPage = criterion1 && criterion2 && criterion3 && criterion4;
|
||||
|
||||
return isParagraphPage;
|
||||
};
|
||||
@ -543,9 +546,10 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => {
|
||||
const pageGroups = groupsByPage[selectedPage] ?? [];
|
||||
const pageImages = imagesByPage[selectedPage] ?? [];
|
||||
const pagePreview = pagePreviews.get(selectedPage);
|
||||
const { width: pageWidth, height: pageHeight } = pageDimensions(currentPage);
|
||||
|
||||
// Detect if current page contains paragraph-heavy content
|
||||
const isParagraphPage = useMemo(() => analyzePageContentType(pageGroups), [pageGroups]);
|
||||
const isParagraphPage = useMemo(() => analyzePageContentType(pageGroups, pageWidth), [pageGroups, pageWidth]);
|
||||
const isParagraphLayout =
|
||||
externalGroupingMode === 'paragraph' || (externalGroupingMode === 'auto' && isParagraphPage);
|
||||
|
||||
@ -788,7 +792,6 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => {
|
||||
),
|
||||
[pageImages],
|
||||
);
|
||||
const { width: pageWidth, height: pageHeight } = pageDimensions(currentPage);
|
||||
const scale = useMemo(() => Math.min(MAX_RENDER_WIDTH / pageWidth, 2.5), [pageWidth]);
|
||||
const scaledWidth = pageWidth * scale;
|
||||
const scaledHeight = pageHeight * scale;
|
||||
@ -1036,14 +1039,37 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => {
|
||||
position: 'absolute',
|
||||
top: -8,
|
||||
right: -8,
|
||||
zIndex: 10,
|
||||
zIndex: 9999,
|
||||
cursor: 'pointer',
|
||||
pointerEvents: 'auto',
|
||||
}}
|
||||
onMouseDown={(event) => {
|
||||
console.log(`❌ MOUSEDOWN on X button for group ${groupId}`);
|
||||
event.stopPropagation();
|
||||
event.preventDefault();
|
||||
|
||||
// Find the current group to check if it's already empty
|
||||
const currentGroups = groupsByPage[pageIndex] ?? [];
|
||||
const currentGroup = currentGroups.find(g => g.id === groupId);
|
||||
const currentText = (currentGroup?.text ?? '').trim();
|
||||
|
||||
if (currentText.length === 0) {
|
||||
// Already empty - remove the textbox entirely
|
||||
console.log(` Text already empty, removing textbox`);
|
||||
onGroupDelete(pageIndex, groupId);
|
||||
setActiveGroupId(null);
|
||||
setEditingGroupId(null);
|
||||
} else {
|
||||
// Has text - clear it but keep the textbox
|
||||
console.log(` Clearing text (textbox remains)`);
|
||||
onGroupEdit(pageIndex, groupId, '');
|
||||
}
|
||||
console.log(` Operation completed`);
|
||||
}}
|
||||
onClick={(event) => {
|
||||
console.log(`❌ X button ONCLICK fired for group ${groupId} on page ${pageIndex}`);
|
||||
event.stopPropagation();
|
||||
onGroupDelete(pageIndex, groupId);
|
||||
setActiveGroupId(null);
|
||||
setEditingGroupId(null);
|
||||
event.preventDefault();
|
||||
}}
|
||||
>
|
||||
<CloseIcon style={{ fontSize: 12 }} />
|
||||
|
||||
@ -89,6 +89,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
|
||||
const [loadingImagePages, setLoadingImagePages] = useState<Set<number>>(new Set());
|
||||
|
||||
const originalImagesRef = useRef<PdfJsonImageElement[][]>([]);
|
||||
const originalGroupsRef = useRef<TextGroup[][]>([]);
|
||||
const imagesByPageRef = useRef<PdfJsonImageElement[][]>([]);
|
||||
const autoLoadKeyRef = useRef<string | null>(null);
|
||||
const loadRequestIdRef = useRef(0);
|
||||
@ -131,7 +132,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
|
||||
}, []);
|
||||
|
||||
const dirtyPages = useMemo(
|
||||
() => getDirtyPages(groupsByPage, imagesByPage, originalImagesRef.current),
|
||||
() => getDirtyPages(groupsByPage, imagesByPage, originalGroupsRef.current, originalImagesRef.current),
|
||||
[groupsByPage, imagesByPage],
|
||||
);
|
||||
const hasChanges = useMemo(() => dirtyPages.some(Boolean), [dirtyPages]);
|
||||
@ -157,6 +158,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
|
||||
const images = extractDocumentImages(cloned);
|
||||
const originalImages = images.map((page) => page.map(cloneImageElement));
|
||||
originalImagesRef.current = originalImages;
|
||||
originalGroupsRef.current = groups.map((page) => page.map((group) => ({ ...group })));
|
||||
imagesByPageRef.current = images.map((page) => page.map(cloneImageElement));
|
||||
const initialLoaded = new Set<number>();
|
||||
originalImages.forEach((pageImages, index) => {
|
||||
@ -595,13 +597,16 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
|
||||
}, []);
|
||||
|
||||
const handleGroupDelete = useCallback((pageIndex: number, groupId: string) => {
|
||||
setGroupsByPage((previous) =>
|
||||
previous.map((groups, idx) =>
|
||||
idx !== pageIndex
|
||||
? groups
|
||||
: groups.map((group) => (group.id === groupId ? { ...group, text: '' } : group))
|
||||
)
|
||||
);
|
||||
console.log(`🗑️ Deleting group ${groupId} from page ${pageIndex}`);
|
||||
setGroupsByPage((previous) => {
|
||||
const updated = previous.map((groups, idx) => {
|
||||
if (idx !== pageIndex) return groups;
|
||||
const filtered = groups.filter((group) => group.id !== groupId);
|
||||
console.log(` Before: ${groups.length} groups, After: ${filtered.length} groups`);
|
||||
return filtered;
|
||||
});
|
||||
return updated;
|
||||
});
|
||||
}, []);
|
||||
|
||||
const handleImageTransform = useCallback(
|
||||
|
||||
@ -647,6 +647,8 @@ export const groupPageTextElements = (
|
||||
return [];
|
||||
}
|
||||
|
||||
const pageWidth = valueOr(page.width, DEFAULT_PAGE_WIDTH);
|
||||
|
||||
const elements = page.textElements
|
||||
.map(cloneTextElement)
|
||||
.filter((element) => element.text !== null && element.text !== undefined);
|
||||
@ -749,6 +751,11 @@ export const groupPageTextElements = (
|
||||
let totalWords = 0;
|
||||
let longTextGroups = 0;
|
||||
let totalGroups = 0;
|
||||
const wordCounts: number[] = [];
|
||||
let fullWidthLines = 0;
|
||||
|
||||
// Define "full width" as extending to at least 70% of page width
|
||||
const fullWidthThreshold = pageWidth * 0.7;
|
||||
|
||||
lineGroups.forEach((group) => {
|
||||
const text = (group.text || '').trim();
|
||||
@ -760,14 +767,21 @@ export const groupPageTextElements = (
|
||||
const wordCount = text.split(/\s+/).filter((w) => w.length > 0).length;
|
||||
|
||||
totalWords += wordCount;
|
||||
wordCounts.push(wordCount);
|
||||
|
||||
if (lineCount > 1) {
|
||||
multiLineGroups++;
|
||||
}
|
||||
|
||||
if (wordCount >= 5 || text.length >= 30) {
|
||||
if (wordCount >= 10 || text.length >= 50) {
|
||||
longTextGroups++;
|
||||
}
|
||||
|
||||
// Check if this line extends close to the right margin (paragraph-like)
|
||||
const rightEdge = group.bounds.right;
|
||||
if (rightEdge >= fullWidthThreshold) {
|
||||
fullWidthLines++;
|
||||
}
|
||||
});
|
||||
|
||||
if (totalGroups === 0) {
|
||||
@ -776,18 +790,69 @@ export const groupPageTextElements = (
|
||||
|
||||
const avgWordsPerGroup = totalWords / totalGroups;
|
||||
const longTextRatio = longTextGroups / totalGroups;
|
||||
const fullWidthRatio = fullWidthLines / totalGroups;
|
||||
|
||||
const isParagraphPage =
|
||||
(multiLineGroups >= 2 && avgWordsPerGroup > 8) ||
|
||||
avgWordsPerGroup > 12 ||
|
||||
longTextRatio > 0.4;
|
||||
// Calculate variance in line lengths (paragraphs have varying lengths, lists are uniform)
|
||||
const variance = wordCounts.reduce((sum, count) => {
|
||||
const diff = count - avgWordsPerGroup;
|
||||
return sum + diff * diff;
|
||||
}, 0) / totalGroups;
|
||||
const stdDev = Math.sqrt(variance);
|
||||
const coefficientOfVariation = avgWordsPerGroup > 0 ? stdDev / avgWordsPerGroup : 0;
|
||||
|
||||
// Check each criterion
|
||||
const criterion1 = multiLineGroups >= 2 && avgWordsPerGroup > 8;
|
||||
const criterion2 = avgWordsPerGroup > 5;
|
||||
const criterion3 = longTextRatio > 0.4;
|
||||
const criterion4 = coefficientOfVariation > 0.5 || fullWidthRatio > 0.6; // High variance OR many full-width lines = paragraph text
|
||||
|
||||
const isParagraphPage = criterion1 && criterion2 && criterion3 && criterion4;
|
||||
|
||||
// Log detection stats
|
||||
console.log(`📄 Page ${pageIndex} Grouping Analysis (mode: ${groupingMode}):`);
|
||||
console.log(` Stats:`);
|
||||
console.log(` • Page width: ${pageWidth.toFixed(1)}pt (full-width threshold: ${fullWidthThreshold.toFixed(1)}pt)`);
|
||||
console.log(` • Multi-line groups: ${multiLineGroups}`);
|
||||
console.log(` • Total groups: ${totalGroups}`);
|
||||
console.log(` • Total words: ${totalWords}`);
|
||||
console.log(` • Long text groups (≥10 words or ≥50 chars): ${longTextGroups}`);
|
||||
console.log(` • Full-width lines (≥70% page width): ${fullWidthLines}`);
|
||||
console.log(` • Avg words per group: ${avgWordsPerGroup.toFixed(2)}`);
|
||||
console.log(` • Long text ratio: ${(longTextRatio * 100).toFixed(1)}%`);
|
||||
console.log(` • Full-width ratio: ${(fullWidthRatio * 100).toFixed(1)}%`);
|
||||
console.log(` • Std deviation: ${stdDev.toFixed(2)}`);
|
||||
console.log(` • Coefficient of variation: ${coefficientOfVariation.toFixed(2)}`);
|
||||
console.log(` Criteria:`);
|
||||
console.log(` 1. Multi-line + Avg Words: ${criterion1 ? '✅ PASS' : '❌ FAIL'}`);
|
||||
console.log(` (${multiLineGroups} >= 2 AND ${avgWordsPerGroup.toFixed(2)} > 8)`);
|
||||
console.log(` 2. Avg Words Only: ${criterion2 ? '✅ PASS' : '❌ FAIL'}`);
|
||||
console.log(` (${avgWordsPerGroup.toFixed(2)} > 5)`);
|
||||
console.log(` 3. Long Text Ratio: ${criterion3 ? '✅ PASS' : '❌ FAIL'}`);
|
||||
console.log(` (${(longTextRatio * 100).toFixed(1)}% > 40%)`);
|
||||
console.log(` 4. Line Width Pattern: ${criterion4 ? '✅ PASS' : '❌ FAIL'}`);
|
||||
console.log(` (CV ${coefficientOfVariation.toFixed(2)} > 0.5 OR ${(fullWidthRatio * 100).toFixed(1)}% > 60%)`);
|
||||
console.log(` ${coefficientOfVariation > 0.5 ? '✓ High variance (varying line lengths)' : '✗ Low variance'} ${fullWidthRatio > 0.6 ? '✓ Many full-width lines (paragraph-like)' : '✗ Few full-width lines (list-like)'}`);
|
||||
console.log(` Decision: ${isParagraphPage ? '📝 PARAGRAPH MODE' : '📋 LINE MODE'}`);
|
||||
if (isParagraphPage) {
|
||||
console.log(` Reason: All criteria passed (AND logic)`);
|
||||
} else {
|
||||
const failedReasons = [];
|
||||
if (!criterion1) failedReasons.push('insufficient multi-line groups or word density');
|
||||
if (!criterion2) failedReasons.push('low average words per group');
|
||||
if (!criterion3) failedReasons.push('low ratio of long text groups');
|
||||
if (!criterion4) failedReasons.push('low variance and few full-width lines (list-like structure)');
|
||||
console.log(` Reason: ${failedReasons.join(', ')}`);
|
||||
}
|
||||
console.log('');
|
||||
|
||||
// Only apply paragraph grouping if it looks like a paragraph-heavy page
|
||||
if (isParagraphPage) {
|
||||
console.log(`🔀 Applying paragraph grouping to page ${pageIndex}`);
|
||||
return groupLinesIntoParagraphs(lineGroups, metrics);
|
||||
}
|
||||
|
||||
// For sparse pages, keep lines separate
|
||||
console.log(`📋 Keeping lines separate for page ${pageIndex}`);
|
||||
return lineGroups;
|
||||
};
|
||||
|
||||
@ -1192,14 +1257,35 @@ export const areImageListsDifferent = (
|
||||
export const getDirtyPages = (
|
||||
groupsByPage: TextGroup[][],
|
||||
imagesByPage: PdfJsonImageElement[][],
|
||||
originalGroupsByPage: TextGroup[][],
|
||||
originalImagesByPage: PdfJsonImageElement[][],
|
||||
): boolean[] => {
|
||||
return groupsByPage.map((groups, index) => {
|
||||
// Check if any text was modified
|
||||
const textDirty = groups.some((group) => group.text !== group.originalText);
|
||||
|
||||
// Check if any groups were deleted by comparing with original groups
|
||||
const originalGroups = originalGroupsByPage[index] ?? [];
|
||||
const groupCountChanged = groups.length !== originalGroups.length;
|
||||
|
||||
const imageDirty = areImageListsDifferent(
|
||||
imagesByPage[index] ?? [],
|
||||
originalImagesByPage[index] ?? [],
|
||||
);
|
||||
return textDirty || imageDirty;
|
||||
|
||||
const isDirty = textDirty || groupCountChanged || imageDirty;
|
||||
|
||||
if (groupCountChanged || textDirty) {
|
||||
console.log(`📄 Page ${index} dirty check:`, {
|
||||
textDirty,
|
||||
groupCountChanged,
|
||||
originalGroupsLength: originalGroups.length,
|
||||
currentGroupsLength: groups.length,
|
||||
imageDirty,
|
||||
isDirty,
|
||||
});
|
||||
}
|
||||
|
||||
return isDirty;
|
||||
});
|
||||
};
|
||||
|
||||
Loading…
Reference in New Issue
Block a user