mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-11-16 01:21:16 +01:00
paragraphs
This commit is contained in:
parent
c74f71af3a
commit
a58c69016b
@ -199,14 +199,30 @@ const FontStatusPanel: React.FC<FontStatusPanelProps> = ({ document, pageIndex }
|
|||||||
<Accordion variant="contained" defaultValue={hasWarnings ? 'fonts' : undefined}>
|
<Accordion variant="contained" defaultValue={hasWarnings ? 'fonts' : undefined}>
|
||||||
<Accordion.Item value="fonts">
|
<Accordion.Item value="fonts">
|
||||||
<Accordion.Control>
|
<Accordion.Control>
|
||||||
<Group gap="xs" wrap="nowrap">
|
<Group gap="xs" wrap="wrap" style={{ flex: 1 }}>
|
||||||
{statusIcon}
|
<Group gap="xs" wrap="nowrap">
|
||||||
<Text size="sm" fw={500}>
|
{statusIcon}
|
||||||
{pageLabel}
|
<Text size="sm" fw={500}>
|
||||||
</Text>
|
{pageLabel}
|
||||||
<Badge size="xs" color={statusColor} variant="dot">
|
</Text>
|
||||||
{fonts.length}
|
<Badge size="xs" color={statusColor} variant="dot">
|
||||||
</Badge>
|
{fonts.length}
|
||||||
|
</Badge>
|
||||||
|
</Group>
|
||||||
|
|
||||||
|
{/* Warning badges BEFORE expansion */}
|
||||||
|
<Group gap={4} wrap="wrap">
|
||||||
|
{summary.systemFallback > 0 && (
|
||||||
|
<Badge size="xs" color="yellow" variant="filled" leftSection={<WarningIcon sx={{ fontSize: 12 }} />}>
|
||||||
|
{summary.systemFallback} {t('pdfTextEditor.fontAnalysis.fallback', 'fallback')}
|
||||||
|
</Badge>
|
||||||
|
)}
|
||||||
|
{summary.missing > 0 && (
|
||||||
|
<Badge size="xs" color="red" variant="filled" leftSection={<ErrorIcon sx={{ fontSize: 12 }} />}>
|
||||||
|
{summary.missing} {t('pdfTextEditor.fontAnalysis.missing', 'missing')}
|
||||||
|
</Badge>
|
||||||
|
)}
|
||||||
|
</Group>
|
||||||
</Group>
|
</Group>
|
||||||
</Accordion.Control>
|
</Accordion.Control>
|
||||||
<Accordion.Panel>
|
<Accordion.Panel>
|
||||||
|
|||||||
@ -247,20 +247,16 @@ const buildFontLookupKeys = (
|
|||||||
* Analyzes text groups on a page to determine if it's paragraph-heavy or sparse.
|
* Analyzes text groups on a page to determine if it's paragraph-heavy or sparse.
|
||||||
* Returns true if the page appears to be document-like with substantial text content.
|
* Returns true if the page appears to be document-like with substantial text content.
|
||||||
*/
|
*/
|
||||||
const analyzePageContentType = (groups: TextGroup[]): boolean => {
|
const analyzePageContentType = (groups: TextGroup[], pageWidth: number): boolean => {
|
||||||
if (groups.length === 0) return false;
|
if (groups.length === 0) return false;
|
||||||
|
|
||||||
let multiLineGroups = 0;
|
let multiLineGroups = 0;
|
||||||
let totalWords = 0;
|
let totalWords = 0;
|
||||||
let longTextGroups = 0;
|
let longTextGroups = 0;
|
||||||
let totalGroups = 0;
|
let totalGroups = 0;
|
||||||
const groupDetails: Array<{
|
let fullWidthLines = 0;
|
||||||
id: string;
|
const wordCounts: number[] = [];
|
||||||
lines: number;
|
const fullWidthThreshold = pageWidth * 0.7;
|
||||||
words: number;
|
|
||||||
chars: number;
|
|
||||||
text: string;
|
|
||||||
}> = [];
|
|
||||||
|
|
||||||
groups.forEach((group) => {
|
groups.forEach((group) => {
|
||||||
const text = (group.text || '').trim();
|
const text = (group.text || '').trim();
|
||||||
@ -272,39 +268,46 @@ const analyzePageContentType = (groups: TextGroup[]): boolean => {
|
|||||||
const wordCount = text.split(/\s+/).filter((w) => w.length > 0).length;
|
const wordCount = text.split(/\s+/).filter((w) => w.length > 0).length;
|
||||||
|
|
||||||
totalWords += wordCount;
|
totalWords += wordCount;
|
||||||
|
wordCounts.push(wordCount);
|
||||||
|
|
||||||
// Count multi-line paragraphs
|
// Count multi-line paragraphs
|
||||||
if (lineCount > 1) {
|
if (lineCount > 1) {
|
||||||
multiLineGroups++;
|
multiLineGroups++;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Count text groups with substantial content (more than a few words)
|
// Count text groups with substantial content (≥10 words or ≥50 chars)
|
||||||
if (wordCount >= 5 || text.length >= 30) {
|
if (wordCount >= 10 || text.length >= 50) {
|
||||||
longTextGroups++;
|
longTextGroups++;
|
||||||
}
|
}
|
||||||
|
|
||||||
groupDetails.push({
|
// Check if this line extends close to the right margin
|
||||||
id: group.id,
|
const rightEdge = group.bounds.right;
|
||||||
lines: lineCount,
|
if (rightEdge >= fullWidthThreshold) {
|
||||||
words: wordCount,
|
fullWidthLines++;
|
||||||
chars: text.length,
|
}
|
||||||
text: text.substring(0, 50) + (text.length > 50 ? '...' : ''),
|
|
||||||
});
|
|
||||||
});
|
});
|
||||||
|
|
||||||
if (totalGroups === 0) return false;
|
if (totalGroups === 0) return false;
|
||||||
|
|
||||||
// Heuristics for paragraph mode:
|
|
||||||
// 1. Has multiple substantial multi-line groups (2+) AND decent average words
|
|
||||||
// 2. Average words per group > 12 (strong indicator of document text)
|
|
||||||
// 3. More than 40% of groups have substantial text (typical of documents)
|
|
||||||
const avgWordsPerGroup = totalWords / totalGroups;
|
const avgWordsPerGroup = totalWords / totalGroups;
|
||||||
const longTextRatio = longTextGroups / totalGroups;
|
const longTextRatio = longTextGroups / totalGroups;
|
||||||
|
const fullWidthRatio = fullWidthLines / totalGroups;
|
||||||
|
|
||||||
const isParagraphPage =
|
// Calculate variance in line lengths
|
||||||
(multiLineGroups >= 2 && avgWordsPerGroup > 8) ||
|
const variance = wordCounts.reduce((sum, count) => {
|
||||||
avgWordsPerGroup > 12 ||
|
const diff = count - avgWordsPerGroup;
|
||||||
longTextRatio > 0.4;
|
return sum + diff * diff;
|
||||||
|
}, 0) / totalGroups;
|
||||||
|
const stdDev = Math.sqrt(variance);
|
||||||
|
const coefficientOfVariation = avgWordsPerGroup > 0 ? stdDev / avgWordsPerGroup : 0;
|
||||||
|
|
||||||
|
// All 4 criteria must pass for paragraph mode
|
||||||
|
const criterion1 = multiLineGroups >= 2 && avgWordsPerGroup > 8;
|
||||||
|
const criterion2 = avgWordsPerGroup > 5;
|
||||||
|
const criterion3 = longTextRatio > 0.4;
|
||||||
|
const criterion4 = coefficientOfVariation > 0.5 || fullWidthRatio > 0.6;
|
||||||
|
|
||||||
|
const isParagraphPage = criterion1 && criterion2 && criterion3 && criterion4;
|
||||||
|
|
||||||
return isParagraphPage;
|
return isParagraphPage;
|
||||||
};
|
};
|
||||||
@ -543,9 +546,10 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => {
|
|||||||
const pageGroups = groupsByPage[selectedPage] ?? [];
|
const pageGroups = groupsByPage[selectedPage] ?? [];
|
||||||
const pageImages = imagesByPage[selectedPage] ?? [];
|
const pageImages = imagesByPage[selectedPage] ?? [];
|
||||||
const pagePreview = pagePreviews.get(selectedPage);
|
const pagePreview = pagePreviews.get(selectedPage);
|
||||||
|
const { width: pageWidth, height: pageHeight } = pageDimensions(currentPage);
|
||||||
|
|
||||||
// Detect if current page contains paragraph-heavy content
|
// Detect if current page contains paragraph-heavy content
|
||||||
const isParagraphPage = useMemo(() => analyzePageContentType(pageGroups), [pageGroups]);
|
const isParagraphPage = useMemo(() => analyzePageContentType(pageGroups, pageWidth), [pageGroups, pageWidth]);
|
||||||
const isParagraphLayout =
|
const isParagraphLayout =
|
||||||
externalGroupingMode === 'paragraph' || (externalGroupingMode === 'auto' && isParagraphPage);
|
externalGroupingMode === 'paragraph' || (externalGroupingMode === 'auto' && isParagraphPage);
|
||||||
|
|
||||||
@ -788,7 +792,6 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => {
|
|||||||
),
|
),
|
||||||
[pageImages],
|
[pageImages],
|
||||||
);
|
);
|
||||||
const { width: pageWidth, height: pageHeight } = pageDimensions(currentPage);
|
|
||||||
const scale = useMemo(() => Math.min(MAX_RENDER_WIDTH / pageWidth, 2.5), [pageWidth]);
|
const scale = useMemo(() => Math.min(MAX_RENDER_WIDTH / pageWidth, 2.5), [pageWidth]);
|
||||||
const scaledWidth = pageWidth * scale;
|
const scaledWidth = pageWidth * scale;
|
||||||
const scaledHeight = pageHeight * scale;
|
const scaledHeight = pageHeight * scale;
|
||||||
@ -1036,14 +1039,37 @@ const PdfTextEditorView = ({ data }: PdfTextEditorViewProps) => {
|
|||||||
position: 'absolute',
|
position: 'absolute',
|
||||||
top: -8,
|
top: -8,
|
||||||
right: -8,
|
right: -8,
|
||||||
zIndex: 10,
|
zIndex: 9999,
|
||||||
cursor: 'pointer',
|
cursor: 'pointer',
|
||||||
|
pointerEvents: 'auto',
|
||||||
|
}}
|
||||||
|
onMouseDown={(event) => {
|
||||||
|
console.log(`❌ MOUSEDOWN on X button for group ${groupId}`);
|
||||||
|
event.stopPropagation();
|
||||||
|
event.preventDefault();
|
||||||
|
|
||||||
|
// Find the current group to check if it's already empty
|
||||||
|
const currentGroups = groupsByPage[pageIndex] ?? [];
|
||||||
|
const currentGroup = currentGroups.find(g => g.id === groupId);
|
||||||
|
const currentText = (currentGroup?.text ?? '').trim();
|
||||||
|
|
||||||
|
if (currentText.length === 0) {
|
||||||
|
// Already empty - remove the textbox entirely
|
||||||
|
console.log(` Text already empty, removing textbox`);
|
||||||
|
onGroupDelete(pageIndex, groupId);
|
||||||
|
setActiveGroupId(null);
|
||||||
|
setEditingGroupId(null);
|
||||||
|
} else {
|
||||||
|
// Has text - clear it but keep the textbox
|
||||||
|
console.log(` Clearing text (textbox remains)`);
|
||||||
|
onGroupEdit(pageIndex, groupId, '');
|
||||||
|
}
|
||||||
|
console.log(` Operation completed`);
|
||||||
}}
|
}}
|
||||||
onClick={(event) => {
|
onClick={(event) => {
|
||||||
|
console.log(`❌ X button ONCLICK fired for group ${groupId} on page ${pageIndex}`);
|
||||||
event.stopPropagation();
|
event.stopPropagation();
|
||||||
onGroupDelete(pageIndex, groupId);
|
event.preventDefault();
|
||||||
setActiveGroupId(null);
|
|
||||||
setEditingGroupId(null);
|
|
||||||
}}
|
}}
|
||||||
>
|
>
|
||||||
<CloseIcon style={{ fontSize: 12 }} />
|
<CloseIcon style={{ fontSize: 12 }} />
|
||||||
|
|||||||
@ -89,6 +89,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
|
|||||||
const [loadingImagePages, setLoadingImagePages] = useState<Set<number>>(new Set());
|
const [loadingImagePages, setLoadingImagePages] = useState<Set<number>>(new Set());
|
||||||
|
|
||||||
const originalImagesRef = useRef<PdfJsonImageElement[][]>([]);
|
const originalImagesRef = useRef<PdfJsonImageElement[][]>([]);
|
||||||
|
const originalGroupsRef = useRef<TextGroup[][]>([]);
|
||||||
const imagesByPageRef = useRef<PdfJsonImageElement[][]>([]);
|
const imagesByPageRef = useRef<PdfJsonImageElement[][]>([]);
|
||||||
const autoLoadKeyRef = useRef<string | null>(null);
|
const autoLoadKeyRef = useRef<string | null>(null);
|
||||||
const loadRequestIdRef = useRef(0);
|
const loadRequestIdRef = useRef(0);
|
||||||
@ -131,7 +132,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
|
|||||||
}, []);
|
}, []);
|
||||||
|
|
||||||
const dirtyPages = useMemo(
|
const dirtyPages = useMemo(
|
||||||
() => getDirtyPages(groupsByPage, imagesByPage, originalImagesRef.current),
|
() => getDirtyPages(groupsByPage, imagesByPage, originalGroupsRef.current, originalImagesRef.current),
|
||||||
[groupsByPage, imagesByPage],
|
[groupsByPage, imagesByPage],
|
||||||
);
|
);
|
||||||
const hasChanges = useMemo(() => dirtyPages.some(Boolean), [dirtyPages]);
|
const hasChanges = useMemo(() => dirtyPages.some(Boolean), [dirtyPages]);
|
||||||
@ -157,6 +158,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
|
|||||||
const images = extractDocumentImages(cloned);
|
const images = extractDocumentImages(cloned);
|
||||||
const originalImages = images.map((page) => page.map(cloneImageElement));
|
const originalImages = images.map((page) => page.map(cloneImageElement));
|
||||||
originalImagesRef.current = originalImages;
|
originalImagesRef.current = originalImages;
|
||||||
|
originalGroupsRef.current = groups.map((page) => page.map((group) => ({ ...group })));
|
||||||
imagesByPageRef.current = images.map((page) => page.map(cloneImageElement));
|
imagesByPageRef.current = images.map((page) => page.map(cloneImageElement));
|
||||||
const initialLoaded = new Set<number>();
|
const initialLoaded = new Set<number>();
|
||||||
originalImages.forEach((pageImages, index) => {
|
originalImages.forEach((pageImages, index) => {
|
||||||
@ -595,13 +597,16 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
|
|||||||
}, []);
|
}, []);
|
||||||
|
|
||||||
const handleGroupDelete = useCallback((pageIndex: number, groupId: string) => {
|
const handleGroupDelete = useCallback((pageIndex: number, groupId: string) => {
|
||||||
setGroupsByPage((previous) =>
|
console.log(`🗑️ Deleting group ${groupId} from page ${pageIndex}`);
|
||||||
previous.map((groups, idx) =>
|
setGroupsByPage((previous) => {
|
||||||
idx !== pageIndex
|
const updated = previous.map((groups, idx) => {
|
||||||
? groups
|
if (idx !== pageIndex) return groups;
|
||||||
: groups.map((group) => (group.id === groupId ? { ...group, text: '' } : group))
|
const filtered = groups.filter((group) => group.id !== groupId);
|
||||||
)
|
console.log(` Before: ${groups.length} groups, After: ${filtered.length} groups`);
|
||||||
);
|
return filtered;
|
||||||
|
});
|
||||||
|
return updated;
|
||||||
|
});
|
||||||
}, []);
|
}, []);
|
||||||
|
|
||||||
const handleImageTransform = useCallback(
|
const handleImageTransform = useCallback(
|
||||||
|
|||||||
@ -647,6 +647,8 @@ export const groupPageTextElements = (
|
|||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const pageWidth = valueOr(page.width, DEFAULT_PAGE_WIDTH);
|
||||||
|
|
||||||
const elements = page.textElements
|
const elements = page.textElements
|
||||||
.map(cloneTextElement)
|
.map(cloneTextElement)
|
||||||
.filter((element) => element.text !== null && element.text !== undefined);
|
.filter((element) => element.text !== null && element.text !== undefined);
|
||||||
@ -749,6 +751,11 @@ export const groupPageTextElements = (
|
|||||||
let totalWords = 0;
|
let totalWords = 0;
|
||||||
let longTextGroups = 0;
|
let longTextGroups = 0;
|
||||||
let totalGroups = 0;
|
let totalGroups = 0;
|
||||||
|
const wordCounts: number[] = [];
|
||||||
|
let fullWidthLines = 0;
|
||||||
|
|
||||||
|
// Define "full width" as extending to at least 70% of page width
|
||||||
|
const fullWidthThreshold = pageWidth * 0.7;
|
||||||
|
|
||||||
lineGroups.forEach((group) => {
|
lineGroups.forEach((group) => {
|
||||||
const text = (group.text || '').trim();
|
const text = (group.text || '').trim();
|
||||||
@ -760,14 +767,21 @@ export const groupPageTextElements = (
|
|||||||
const wordCount = text.split(/\s+/).filter((w) => w.length > 0).length;
|
const wordCount = text.split(/\s+/).filter((w) => w.length > 0).length;
|
||||||
|
|
||||||
totalWords += wordCount;
|
totalWords += wordCount;
|
||||||
|
wordCounts.push(wordCount);
|
||||||
|
|
||||||
if (lineCount > 1) {
|
if (lineCount > 1) {
|
||||||
multiLineGroups++;
|
multiLineGroups++;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (wordCount >= 5 || text.length >= 30) {
|
if (wordCount >= 10 || text.length >= 50) {
|
||||||
longTextGroups++;
|
longTextGroups++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check if this line extends close to the right margin (paragraph-like)
|
||||||
|
const rightEdge = group.bounds.right;
|
||||||
|
if (rightEdge >= fullWidthThreshold) {
|
||||||
|
fullWidthLines++;
|
||||||
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
if (totalGroups === 0) {
|
if (totalGroups === 0) {
|
||||||
@ -776,18 +790,69 @@ export const groupPageTextElements = (
|
|||||||
|
|
||||||
const avgWordsPerGroup = totalWords / totalGroups;
|
const avgWordsPerGroup = totalWords / totalGroups;
|
||||||
const longTextRatio = longTextGroups / totalGroups;
|
const longTextRatio = longTextGroups / totalGroups;
|
||||||
|
const fullWidthRatio = fullWidthLines / totalGroups;
|
||||||
|
|
||||||
const isParagraphPage =
|
// Calculate variance in line lengths (paragraphs have varying lengths, lists are uniform)
|
||||||
(multiLineGroups >= 2 && avgWordsPerGroup > 8) ||
|
const variance = wordCounts.reduce((sum, count) => {
|
||||||
avgWordsPerGroup > 12 ||
|
const diff = count - avgWordsPerGroup;
|
||||||
longTextRatio > 0.4;
|
return sum + diff * diff;
|
||||||
|
}, 0) / totalGroups;
|
||||||
|
const stdDev = Math.sqrt(variance);
|
||||||
|
const coefficientOfVariation = avgWordsPerGroup > 0 ? stdDev / avgWordsPerGroup : 0;
|
||||||
|
|
||||||
|
// Check each criterion
|
||||||
|
const criterion1 = multiLineGroups >= 2 && avgWordsPerGroup > 8;
|
||||||
|
const criterion2 = avgWordsPerGroup > 5;
|
||||||
|
const criterion3 = longTextRatio > 0.4;
|
||||||
|
const criterion4 = coefficientOfVariation > 0.5 || fullWidthRatio > 0.6; // High variance OR many full-width lines = paragraph text
|
||||||
|
|
||||||
|
const isParagraphPage = criterion1 && criterion2 && criterion3 && criterion4;
|
||||||
|
|
||||||
|
// Log detection stats
|
||||||
|
console.log(`📄 Page ${pageIndex} Grouping Analysis (mode: ${groupingMode}):`);
|
||||||
|
console.log(` Stats:`);
|
||||||
|
console.log(` • Page width: ${pageWidth.toFixed(1)}pt (full-width threshold: ${fullWidthThreshold.toFixed(1)}pt)`);
|
||||||
|
console.log(` • Multi-line groups: ${multiLineGroups}`);
|
||||||
|
console.log(` • Total groups: ${totalGroups}`);
|
||||||
|
console.log(` • Total words: ${totalWords}`);
|
||||||
|
console.log(` • Long text groups (≥10 words or ≥50 chars): ${longTextGroups}`);
|
||||||
|
console.log(` • Full-width lines (≥70% page width): ${fullWidthLines}`);
|
||||||
|
console.log(` • Avg words per group: ${avgWordsPerGroup.toFixed(2)}`);
|
||||||
|
console.log(` • Long text ratio: ${(longTextRatio * 100).toFixed(1)}%`);
|
||||||
|
console.log(` • Full-width ratio: ${(fullWidthRatio * 100).toFixed(1)}%`);
|
||||||
|
console.log(` • Std deviation: ${stdDev.toFixed(2)}`);
|
||||||
|
console.log(` • Coefficient of variation: ${coefficientOfVariation.toFixed(2)}`);
|
||||||
|
console.log(` Criteria:`);
|
||||||
|
console.log(` 1. Multi-line + Avg Words: ${criterion1 ? '✅ PASS' : '❌ FAIL'}`);
|
||||||
|
console.log(` (${multiLineGroups} >= 2 AND ${avgWordsPerGroup.toFixed(2)} > 8)`);
|
||||||
|
console.log(` 2. Avg Words Only: ${criterion2 ? '✅ PASS' : '❌ FAIL'}`);
|
||||||
|
console.log(` (${avgWordsPerGroup.toFixed(2)} > 5)`);
|
||||||
|
console.log(` 3. Long Text Ratio: ${criterion3 ? '✅ PASS' : '❌ FAIL'}`);
|
||||||
|
console.log(` (${(longTextRatio * 100).toFixed(1)}% > 40%)`);
|
||||||
|
console.log(` 4. Line Width Pattern: ${criterion4 ? '✅ PASS' : '❌ FAIL'}`);
|
||||||
|
console.log(` (CV ${coefficientOfVariation.toFixed(2)} > 0.5 OR ${(fullWidthRatio * 100).toFixed(1)}% > 60%)`);
|
||||||
|
console.log(` ${coefficientOfVariation > 0.5 ? '✓ High variance (varying line lengths)' : '✗ Low variance'} ${fullWidthRatio > 0.6 ? '✓ Many full-width lines (paragraph-like)' : '✗ Few full-width lines (list-like)'}`);
|
||||||
|
console.log(` Decision: ${isParagraphPage ? '📝 PARAGRAPH MODE' : '📋 LINE MODE'}`);
|
||||||
|
if (isParagraphPage) {
|
||||||
|
console.log(` Reason: All criteria passed (AND logic)`);
|
||||||
|
} else {
|
||||||
|
const failedReasons = [];
|
||||||
|
if (!criterion1) failedReasons.push('insufficient multi-line groups or word density');
|
||||||
|
if (!criterion2) failedReasons.push('low average words per group');
|
||||||
|
if (!criterion3) failedReasons.push('low ratio of long text groups');
|
||||||
|
if (!criterion4) failedReasons.push('low variance and few full-width lines (list-like structure)');
|
||||||
|
console.log(` Reason: ${failedReasons.join(', ')}`);
|
||||||
|
}
|
||||||
|
console.log('');
|
||||||
|
|
||||||
// Only apply paragraph grouping if it looks like a paragraph-heavy page
|
// Only apply paragraph grouping if it looks like a paragraph-heavy page
|
||||||
if (isParagraphPage) {
|
if (isParagraphPage) {
|
||||||
|
console.log(`🔀 Applying paragraph grouping to page ${pageIndex}`);
|
||||||
return groupLinesIntoParagraphs(lineGroups, metrics);
|
return groupLinesIntoParagraphs(lineGroups, metrics);
|
||||||
}
|
}
|
||||||
|
|
||||||
// For sparse pages, keep lines separate
|
// For sparse pages, keep lines separate
|
||||||
|
console.log(`📋 Keeping lines separate for page ${pageIndex}`);
|
||||||
return lineGroups;
|
return lineGroups;
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -1192,14 +1257,35 @@ export const areImageListsDifferent = (
|
|||||||
export const getDirtyPages = (
|
export const getDirtyPages = (
|
||||||
groupsByPage: TextGroup[][],
|
groupsByPage: TextGroup[][],
|
||||||
imagesByPage: PdfJsonImageElement[][],
|
imagesByPage: PdfJsonImageElement[][],
|
||||||
|
originalGroupsByPage: TextGroup[][],
|
||||||
originalImagesByPage: PdfJsonImageElement[][],
|
originalImagesByPage: PdfJsonImageElement[][],
|
||||||
): boolean[] => {
|
): boolean[] => {
|
||||||
return groupsByPage.map((groups, index) => {
|
return groupsByPage.map((groups, index) => {
|
||||||
|
// Check if any text was modified
|
||||||
const textDirty = groups.some((group) => group.text !== group.originalText);
|
const textDirty = groups.some((group) => group.text !== group.originalText);
|
||||||
|
|
||||||
|
// Check if any groups were deleted by comparing with original groups
|
||||||
|
const originalGroups = originalGroupsByPage[index] ?? [];
|
||||||
|
const groupCountChanged = groups.length !== originalGroups.length;
|
||||||
|
|
||||||
const imageDirty = areImageListsDifferent(
|
const imageDirty = areImageListsDifferent(
|
||||||
imagesByPage[index] ?? [],
|
imagesByPage[index] ?? [],
|
||||||
originalImagesByPage[index] ?? [],
|
originalImagesByPage[index] ?? [],
|
||||||
);
|
);
|
||||||
return textDirty || imageDirty;
|
|
||||||
|
const isDirty = textDirty || groupCountChanged || imageDirty;
|
||||||
|
|
||||||
|
if (groupCountChanged || textDirty) {
|
||||||
|
console.log(`📄 Page ${index} dirty check:`, {
|
||||||
|
textDirty,
|
||||||
|
groupCountChanged,
|
||||||
|
originalGroupsLength: originalGroups.length,
|
||||||
|
currentGroupsLength: groups.length,
|
||||||
|
imageDirty,
|
||||||
|
isDirty,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
return isDirty;
|
||||||
});
|
});
|
||||||
};
|
};
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user