mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-11-16 01:21:16 +01:00
auto paragraph mode
This commit is contained in:
parent
3ed62c8dbf
commit
2d8113b3f6
@ -4495,19 +4495,25 @@
|
|||||||
"title": "Auto-scale text to fit boxes",
|
"title": "Auto-scale text to fit boxes",
|
||||||
"description": "Automatically scales text horizontally to fit within its original bounding box when font rendering differs from PDF."
|
"description": "Automatically scales text horizontally to fit within its original bounding box when font rendering differs from PDF."
|
||||||
},
|
},
|
||||||
|
"groupingMode": {
|
||||||
|
"title": "Text Grouping Mode",
|
||||||
|
"autoDescription": "Automatically detects page type and groups text appropriately.",
|
||||||
|
"paragraphDescription": "Groups aligned lines into multi-line paragraph text boxes.",
|
||||||
|
"singleLineDescription": "Keeps each PDF text line as a separate text box."
|
||||||
|
},
|
||||||
"forceSingleElement": {
|
"forceSingleElement": {
|
||||||
"title": "Lock edited text to a single PDF element",
|
"title": "Lock edited text to a single PDF element",
|
||||||
"description": "When enabled, the editor exports each edited text box as one PDF text element to avoid overlapping glyphs or mixed fonts."
|
"description": "When enabled, the editor exports each edited text box as one PDF text element to avoid overlapping glyphs or mixed fonts."
|
||||||
},
|
|
||||||
"textGroupingMode": {
|
|
||||||
"title": "Text grouping mode",
|
|
||||||
"description": "Paragraph mode merges aligned lines into one textbox; single-line mode keeps every PDF line separate. Auto picks the best option per page."
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"grouping": {
|
"pageType": {
|
||||||
|
"paragraph": "Paragraph page",
|
||||||
|
"sparse": "Sparse text"
|
||||||
|
},
|
||||||
|
"groupingMode": {
|
||||||
"auto": "Auto",
|
"auto": "Auto",
|
||||||
"paragraph": "Paragraph",
|
"paragraph": "Paragraph",
|
||||||
"single": "Single Line"
|
"singleLine": "Single Line"
|
||||||
},
|
},
|
||||||
"disclaimer": {
|
"disclaimer": {
|
||||||
"heading": "Preview limitations",
|
"heading": "Preview limitations",
|
||||||
@ -4521,6 +4527,7 @@
|
|||||||
"loading": "Loading",
|
"loading": "Loading",
|
||||||
"normalizing": "Normalizing",
|
"normalizing": "Normalizing",
|
||||||
"parsing": "Parsing",
|
"parsing": "Parsing",
|
||||||
|
"processing": "Processing",
|
||||||
"fonts": "Fonts",
|
"fonts": "Fonts",
|
||||||
"text": "Text Extraction",
|
"text": "Text Extraction",
|
||||||
"images": "Images",
|
"images": "Images",
|
||||||
|
|||||||
@ -13,6 +13,7 @@ import {
|
|||||||
Pagination,
|
Pagination,
|
||||||
Progress,
|
Progress,
|
||||||
ScrollArea,
|
ScrollArea,
|
||||||
|
SegmentedControl,
|
||||||
Stack,
|
Stack,
|
||||||
Switch,
|
Switch,
|
||||||
Text,
|
Text,
|
||||||
@ -202,6 +203,95 @@ const buildFontLookupKeys = (
|
|||||||
return Array.from(new Set(keys.filter((value) => value && value.length > 0)));
|
return Array.from(new Set(keys.filter((value) => value && value.length > 0)));
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Analyzes text groups on a page to determine if it's paragraph-heavy or sparse.
|
||||||
|
* Returns true if the page appears to be document-like with substantial text content.
|
||||||
|
*/
|
||||||
|
const analyzePageContentType = (groups: TextGroup[]): boolean => {
|
||||||
|
if (groups.length === 0) return false;
|
||||||
|
|
||||||
|
let multiLineGroups = 0;
|
||||||
|
let totalWords = 0;
|
||||||
|
let longTextGroups = 0;
|
||||||
|
let totalGroups = 0;
|
||||||
|
const groupDetails: Array<{
|
||||||
|
id: string;
|
||||||
|
lines: number;
|
||||||
|
words: number;
|
||||||
|
chars: number;
|
||||||
|
text: string;
|
||||||
|
}> = [];
|
||||||
|
|
||||||
|
groups.forEach((group) => {
|
||||||
|
const text = (group.text || '').trim();
|
||||||
|
if (text.length === 0) return;
|
||||||
|
|
||||||
|
totalGroups++;
|
||||||
|
const lines = text.split('\n');
|
||||||
|
const lineCount = lines.length;
|
||||||
|
const wordCount = text.split(/\s+/).filter((w) => w.length > 0).length;
|
||||||
|
|
||||||
|
totalWords += wordCount;
|
||||||
|
|
||||||
|
// Count multi-line paragraphs
|
||||||
|
if (lineCount > 1) {
|
||||||
|
multiLineGroups++;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Count text groups with substantial content (more than a few words)
|
||||||
|
if (wordCount >= 5 || text.length >= 30) {
|
||||||
|
longTextGroups++;
|
||||||
|
}
|
||||||
|
|
||||||
|
groupDetails.push({
|
||||||
|
id: group.id,
|
||||||
|
lines: lineCount,
|
||||||
|
words: wordCount,
|
||||||
|
chars: text.length,
|
||||||
|
text: text.substring(0, 50) + (text.length > 50 ? '...' : ''),
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
if (totalGroups === 0) return false;
|
||||||
|
|
||||||
|
// Heuristics for paragraph mode:
|
||||||
|
// 1. Has multiple substantial multi-line groups (2+) AND decent average words
|
||||||
|
// 2. Average words per group > 12 (strong indicator of document text)
|
||||||
|
// 3. More than 40% of groups have substantial text (typical of documents)
|
||||||
|
const avgWordsPerGroup = totalWords / totalGroups;
|
||||||
|
const longTextRatio = longTextGroups / totalGroups;
|
||||||
|
|
||||||
|
const isParagraphPage =
|
||||||
|
(multiLineGroups >= 2 && avgWordsPerGroup > 8) ||
|
||||||
|
avgWordsPerGroup > 12 ||
|
||||||
|
longTextRatio > 0.4;
|
||||||
|
|
||||||
|
// Log detailed statistics
|
||||||
|
console.group(`📊 Page Content Analysis`);
|
||||||
|
console.log('📄 Overall Statistics:');
|
||||||
|
console.log(` Total text groups: ${totalGroups}`);
|
||||||
|
console.log(` Total words: ${totalWords}`);
|
||||||
|
console.log(` Average words per group: ${avgWordsPerGroup.toFixed(2)}`);
|
||||||
|
console.log(` Multi-line groups: ${multiLineGroups}`);
|
||||||
|
console.log(` Long text groups (≥5 words or ≥30 chars): ${longTextGroups}`);
|
||||||
|
console.log(` Long text ratio: ${(longTextRatio * 100).toFixed(1)}%`);
|
||||||
|
console.log('');
|
||||||
|
console.log('🔍 Detection Criteria:');
|
||||||
|
console.log(` ✓ Multi-line groups ≥ 2 AND avg words > 8? ${multiLineGroups >= 2 && avgWordsPerGroup > 8 ? '✅ YES' : '❌ NO'} (multi-line: ${multiLineGroups}, avg: ${avgWordsPerGroup.toFixed(2)})`);
|
||||||
|
console.log(` ✓ Avg words/group > 12? ${avgWordsPerGroup > 12 ? '✅ YES' : '❌ NO'} (current: ${avgWordsPerGroup.toFixed(2)})`);
|
||||||
|
console.log(` ✓ Long text ratio > 40%? ${longTextRatio > 0.4 ? '✅ YES' : '❌ NO'} (current: ${(longTextRatio * 100).toFixed(1)}%)`);
|
||||||
|
console.log('');
|
||||||
|
console.log(`📋 Result: ${isParagraphPage ? '📝 PARAGRAPH PAGE' : '📄 SPARSE PAGE'}`);
|
||||||
|
console.log('');
|
||||||
|
console.log('📦 Individual Groups:');
|
||||||
|
console.table(groupDetails);
|
||||||
|
console.groupEnd();
|
||||||
|
|
||||||
|
return isParagraphPage;
|
||||||
|
};
|
||||||
|
|
||||||
|
type GroupingMode = 'auto' | 'paragraph' | 'singleLine';
|
||||||
|
|
||||||
const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
|
const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
|
||||||
const { t } = useTranslation();
|
const { t } = useTranslation();
|
||||||
const [activeGroupId, setActiveGroupId] = useState<string | null>(null);
|
const [activeGroupId, setActiveGroupId] = useState<string | null>(null);
|
||||||
@ -232,6 +322,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
|
|||||||
conversionProgress,
|
conversionProgress,
|
||||||
hasChanges,
|
hasChanges,
|
||||||
forceSingleTextElement,
|
forceSingleTextElement,
|
||||||
|
groupingMode: externalGroupingMode,
|
||||||
requestPagePreview,
|
requestPagePreview,
|
||||||
onLoadJson,
|
onLoadJson,
|
||||||
onSelectPage,
|
onSelectPage,
|
||||||
@ -243,6 +334,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
|
|||||||
onDownloadJson,
|
onDownloadJson,
|
||||||
onGeneratePdf,
|
onGeneratePdf,
|
||||||
onForceSingleTextElementChange,
|
onForceSingleTextElementChange,
|
||||||
|
onGroupingModeChange,
|
||||||
} = data;
|
} = data;
|
||||||
|
|
||||||
const syncEditorValue = useCallback(
|
const syncEditorValue = useCallback(
|
||||||
@ -430,6 +522,9 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
|
|||||||
const pageImages = imagesByPage[selectedPage] ?? [];
|
const pageImages = imagesByPage[selectedPage] ?? [];
|
||||||
const pagePreview = pagePreviews.get(selectedPage);
|
const pagePreview = pagePreviews.get(selectedPage);
|
||||||
|
|
||||||
|
// Detect if current page contains paragraph-heavy content
|
||||||
|
const isParagraphPage = useMemo(() => analyzePageContentType(pageGroups), [pageGroups]);
|
||||||
|
|
||||||
const extractPreferredFontId = useCallback((target?: TextGroup | null) => {
|
const extractPreferredFontId = useCallback((target?: TextGroup | null) => {
|
||||||
if (!target) {
|
if (!target) {
|
||||||
return undefined;
|
return undefined;
|
||||||
@ -981,6 +1076,50 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
|
|||||||
/>
|
/>
|
||||||
</Group>
|
</Group>
|
||||||
|
|
||||||
|
<Stack gap="xs">
|
||||||
|
<Group gap={4} align="center">
|
||||||
|
<Text fw={500} size="sm">
|
||||||
|
{t('pdfJsonEditor.options.groupingMode.title', 'Text Grouping Mode')}
|
||||||
|
</Text>
|
||||||
|
{externalGroupingMode === 'auto' && isParagraphPage && (
|
||||||
|
<Badge size="xs" color="blue" variant="light">
|
||||||
|
{t('pdfJsonEditor.pageType.paragraph', 'Paragraph page')}
|
||||||
|
</Badge>
|
||||||
|
)}
|
||||||
|
{externalGroupingMode === 'auto' && !isParagraphPage && hasDocument && (
|
||||||
|
<Badge size="xs" color="gray" variant="light">
|
||||||
|
{t('pdfJsonEditor.pageType.sparse', 'Sparse text')}
|
||||||
|
</Badge>
|
||||||
|
)}
|
||||||
|
</Group>
|
||||||
|
<Text size="xs" c="dimmed">
|
||||||
|
{externalGroupingMode === 'auto'
|
||||||
|
? t(
|
||||||
|
'pdfJsonEditor.options.groupingMode.autoDescription',
|
||||||
|
'Automatically detects page type and groups text appropriately.'
|
||||||
|
)
|
||||||
|
: externalGroupingMode === 'paragraph'
|
||||||
|
? t(
|
||||||
|
'pdfJsonEditor.options.groupingMode.paragraphDescription',
|
||||||
|
'Groups aligned lines into multi-line paragraph text boxes.'
|
||||||
|
)
|
||||||
|
: t(
|
||||||
|
'pdfJsonEditor.options.groupingMode.singleLineDescription',
|
||||||
|
'Keeps each PDF text line as a separate text box.'
|
||||||
|
)}
|
||||||
|
</Text>
|
||||||
|
<SegmentedControl
|
||||||
|
value={externalGroupingMode}
|
||||||
|
onChange={(value) => onGroupingModeChange(value as GroupingMode)}
|
||||||
|
data={[
|
||||||
|
{ label: t('pdfJsonEditor.groupingMode.auto', 'Auto'), value: 'auto' },
|
||||||
|
{ label: t('pdfJsonEditor.groupingMode.paragraph', 'Paragraph'), value: 'paragraph' },
|
||||||
|
{ label: t('pdfJsonEditor.groupingMode.singleLine', 'Single Line'), value: 'singleLine' },
|
||||||
|
]}
|
||||||
|
fullWidth
|
||||||
|
/>
|
||||||
|
</Stack>
|
||||||
|
|
||||||
<Group justify="space-between" align="center">
|
<Group justify="space-between" align="center">
|
||||||
<div>
|
<div>
|
||||||
<Text fw={500} size="sm">
|
<Text fw={500} size="sm">
|
||||||
@ -1547,7 +1686,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
|
|||||||
style={{
|
style={{
|
||||||
pointerEvents: 'none',
|
pointerEvents: 'none',
|
||||||
display: 'inline-block',
|
display: 'inline-block',
|
||||||
transform: shouldScale ? `scaleX(${textScale})` : undefined,
|
transform: shouldScale ? `scaleX(${textScale})` : 'none',
|
||||||
transformOrigin: 'left center',
|
transformOrigin: 'left center',
|
||||||
whiteSpace: 'pre',
|
whiteSpace: 'pre',
|
||||||
}}
|
}}
|
||||||
|
|||||||
@ -78,6 +78,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
|
|||||||
message: string;
|
message: string;
|
||||||
} | null>(null);
|
} | null>(null);
|
||||||
const [forceSingleTextElement, setForceSingleTextElement] = useState(false);
|
const [forceSingleTextElement, setForceSingleTextElement] = useState(false);
|
||||||
|
const [groupingMode, setGroupingMode] = useState<'auto' | 'paragraph' | 'singleLine'>('auto');
|
||||||
const [hasVectorPreview, setHasVectorPreview] = useState(false);
|
const [hasVectorPreview, setHasVectorPreview] = useState(false);
|
||||||
const [pagePreviews, setPagePreviews] = useState<Map<number, string>>(new Map());
|
const [pagePreviews, setPagePreviews] = useState<Map<number, string>>(new Map());
|
||||||
|
|
||||||
@ -136,7 +137,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
|
|||||||
const viewLabel = useMemo(() => t('pdfJsonEditor.viewLabel', 'PDF Editor'), [t]);
|
const viewLabel = useMemo(() => t('pdfJsonEditor.viewLabel', 'PDF Editor'), [t]);
|
||||||
const { selectedFiles } = useFileSelection();
|
const { selectedFiles } = useFileSelection();
|
||||||
|
|
||||||
const resetToDocument = useCallback((document: PdfJsonDocument | null) => {
|
const resetToDocument = useCallback((document: PdfJsonDocument | null, mode: 'auto' | 'paragraph' | 'singleLine') => {
|
||||||
if (!document) {
|
if (!document) {
|
||||||
setGroupsByPage([]);
|
setGroupsByPage([]);
|
||||||
setImagesByPage([]);
|
setImagesByPage([]);
|
||||||
@ -150,7 +151,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
const cloned = deepCloneDocument(document);
|
const cloned = deepCloneDocument(document);
|
||||||
const groups = groupDocumentText(cloned);
|
const groups = groupDocumentText(cloned, mode);
|
||||||
const images = extractDocumentImages(cloned);
|
const images = extractDocumentImages(cloned);
|
||||||
const originalImages = images.map((page) => page.map(cloneImageElement));
|
const originalImages = images.map((page) => page.map(cloneImageElement));
|
||||||
originalImagesRef.current = originalImages;
|
originalImagesRef.current = originalImages;
|
||||||
@ -513,7 +514,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
|
|||||||
}
|
}
|
||||||
|
|
||||||
setLoadedDocument(parsed);
|
setLoadedDocument(parsed);
|
||||||
resetToDocument(parsed);
|
resetToDocument(parsed, groupingMode);
|
||||||
setIsLazyMode(shouldUseLazyMode);
|
setIsLazyMode(shouldUseLazyMode);
|
||||||
setCachedJobId(shouldUseLazyMode ? pendingJobId : null);
|
setCachedJobId(shouldUseLazyMode ? pendingJobId : null);
|
||||||
setFileName(file.name);
|
setFileName(file.name);
|
||||||
@ -532,7 +533,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
|
|||||||
}
|
}
|
||||||
|
|
||||||
setLoadedDocument(null);
|
setLoadedDocument(null);
|
||||||
resetToDocument(null);
|
resetToDocument(null, groupingMode);
|
||||||
clearPdfPreview();
|
clearPdfPreview();
|
||||||
|
|
||||||
if (isPdf) {
|
if (isPdf) {
|
||||||
@ -555,7 +556,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
[resetToDocument, t],
|
[groupingMode, resetToDocument, t],
|
||||||
);
|
);
|
||||||
|
|
||||||
const handleSelectPage = useCallback((pageIndex: number) => {
|
const handleSelectPage = useCallback((pageIndex: number) => {
|
||||||
@ -686,9 +687,9 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
|
|||||||
if (!loadedDocument) {
|
if (!loadedDocument) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
resetToDocument(loadedDocument);
|
resetToDocument(loadedDocument, groupingMode);
|
||||||
setErrorMessage(null);
|
setErrorMessage(null);
|
||||||
}, [loadedDocument, resetToDocument]);
|
}, [groupingMode, loadedDocument, resetToDocument]);
|
||||||
|
|
||||||
const buildPayload = useCallback(() => {
|
const buildPayload = useCallback(() => {
|
||||||
if (!loadedDocument) {
|
if (!loadedDocument) {
|
||||||
@ -975,6 +976,13 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
|
|||||||
[hasVectorPreview],
|
[hasVectorPreview],
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Re-group text when grouping mode changes
|
||||||
|
useEffect(() => {
|
||||||
|
if (loadedDocument) {
|
||||||
|
resetToDocument(loadedDocument, groupingMode);
|
||||||
|
}
|
||||||
|
}, [groupingMode, loadedDocument, resetToDocument]);
|
||||||
|
|
||||||
const viewData = useMemo<PdfJsonEditorViewData>(() => ({
|
const viewData = useMemo<PdfJsonEditorViewData>(() => ({
|
||||||
document: loadedDocument,
|
document: loadedDocument,
|
||||||
groupsByPage,
|
groupsByPage,
|
||||||
@ -991,6 +999,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
|
|||||||
conversionProgress,
|
conversionProgress,
|
||||||
hasChanges,
|
hasChanges,
|
||||||
forceSingleTextElement,
|
forceSingleTextElement,
|
||||||
|
groupingMode,
|
||||||
requestPagePreview,
|
requestPagePreview,
|
||||||
onLoadJson: handleLoadFile,
|
onLoadJson: handleLoadFile,
|
||||||
onSelectPage: handleSelectPage,
|
onSelectPage: handleSelectPage,
|
||||||
@ -1002,6 +1011,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
|
|||||||
onDownloadJson: handleDownloadJson,
|
onDownloadJson: handleDownloadJson,
|
||||||
onGeneratePdf: handleGeneratePdf,
|
onGeneratePdf: handleGeneratePdf,
|
||||||
onForceSingleTextElementChange: setForceSingleTextElement,
|
onForceSingleTextElementChange: setForceSingleTextElement,
|
||||||
|
onGroupingModeChange: setGroupingMode,
|
||||||
}), [
|
}), [
|
||||||
handleImageTransform,
|
handleImageTransform,
|
||||||
imagesByPage,
|
imagesByPage,
|
||||||
@ -1027,6 +1037,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
|
|||||||
loadedDocument,
|
loadedDocument,
|
||||||
selectedPage,
|
selectedPage,
|
||||||
forceSingleTextElement,
|
forceSingleTextElement,
|
||||||
|
groupingMode,
|
||||||
requestPagePreview,
|
requestPagePreview,
|
||||||
setForceSingleTextElement,
|
setForceSingleTextElement,
|
||||||
]);
|
]);
|
||||||
|
|||||||
@ -197,6 +197,7 @@ export interface PdfJsonEditorViewData {
|
|||||||
conversionProgress: ConversionProgress | null;
|
conversionProgress: ConversionProgress | null;
|
||||||
hasChanges: boolean;
|
hasChanges: boolean;
|
||||||
forceSingleTextElement: boolean;
|
forceSingleTextElement: boolean;
|
||||||
|
groupingMode: 'auto' | 'paragraph' | 'singleLine';
|
||||||
requestPagePreview: (pageIndex: number, scale: number) => void;
|
requestPagePreview: (pageIndex: number, scale: number) => void;
|
||||||
onLoadJson: (file: File | null) => Promise<void> | void;
|
onLoadJson: (file: File | null) => Promise<void> | void;
|
||||||
onSelectPage: (pageIndex: number) => void;
|
onSelectPage: (pageIndex: number) => void;
|
||||||
@ -218,4 +219,5 @@ export interface PdfJsonEditorViewData {
|
|||||||
onDownloadJson: () => void;
|
onDownloadJson: () => void;
|
||||||
onGeneratePdf: () => void;
|
onGeneratePdf: () => void;
|
||||||
onForceSingleTextElementChange: (value: boolean) => void;
|
onForceSingleTextElementChange: (value: boolean) => void;
|
||||||
|
onGroupingModeChange: (value: 'auto' | 'paragraph' | 'singleLine') => void;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -641,6 +641,7 @@ export const groupPageTextElements = (
|
|||||||
page: PdfJsonPage | null | undefined,
|
page: PdfJsonPage | null | undefined,
|
||||||
pageIndex: number,
|
pageIndex: number,
|
||||||
metrics?: FontMetricsMap,
|
metrics?: FontMetricsMap,
|
||||||
|
groupingMode: 'auto' | 'paragraph' | 'singleLine' = 'auto',
|
||||||
): TextGroup[] => {
|
): TextGroup[] => {
|
||||||
if (!page?.textElements || page.textElements.length === 0) {
|
if (!page?.textElements || page.textElements.length === 0) {
|
||||||
return [];
|
return [];
|
||||||
@ -731,15 +732,72 @@ export const groupPageTextElements = (
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
return groupLinesIntoParagraphs(lineGroups, metrics);
|
// Apply paragraph grouping based on mode
|
||||||
|
if (groupingMode === 'singleLine') {
|
||||||
|
// Single line mode: skip paragraph grouping
|
||||||
|
return lineGroups;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (groupingMode === 'paragraph') {
|
||||||
|
// Paragraph mode: always apply grouping
|
||||||
|
return groupLinesIntoParagraphs(lineGroups, metrics);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Auto mode: use heuristic to determine if we should group
|
||||||
|
// Analyze the page content to decide
|
||||||
|
let multiLineGroups = 0;
|
||||||
|
let totalWords = 0;
|
||||||
|
let longTextGroups = 0;
|
||||||
|
let totalGroups = 0;
|
||||||
|
|
||||||
|
lineGroups.forEach((group) => {
|
||||||
|
const text = (group.text || '').trim();
|
||||||
|
if (text.length === 0) return;
|
||||||
|
|
||||||
|
totalGroups++;
|
||||||
|
const lines = text.split('\n');
|
||||||
|
const lineCount = lines.length;
|
||||||
|
const wordCount = text.split(/\s+/).filter((w) => w.length > 0).length;
|
||||||
|
|
||||||
|
totalWords += wordCount;
|
||||||
|
|
||||||
|
if (lineCount > 1) {
|
||||||
|
multiLineGroups++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (wordCount >= 5 || text.length >= 30) {
|
||||||
|
longTextGroups++;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
if (totalGroups === 0) {
|
||||||
|
return lineGroups;
|
||||||
|
}
|
||||||
|
|
||||||
|
const avgWordsPerGroup = totalWords / totalGroups;
|
||||||
|
const longTextRatio = longTextGroups / totalGroups;
|
||||||
|
|
||||||
|
const isParagraphPage =
|
||||||
|
(multiLineGroups >= 2 && avgWordsPerGroup > 8) ||
|
||||||
|
avgWordsPerGroup > 12 ||
|
||||||
|
longTextRatio > 0.4;
|
||||||
|
|
||||||
|
// Only apply paragraph grouping if it looks like a paragraph-heavy page
|
||||||
|
if (isParagraphPage) {
|
||||||
|
return groupLinesIntoParagraphs(lineGroups, metrics);
|
||||||
|
}
|
||||||
|
|
||||||
|
// For sparse pages, keep lines separate
|
||||||
|
return lineGroups;
|
||||||
};
|
};
|
||||||
|
|
||||||
export const groupDocumentText = (
|
export const groupDocumentText = (
|
||||||
document: PdfJsonDocument | null | undefined,
|
document: PdfJsonDocument | null | undefined,
|
||||||
|
groupingMode: 'auto' | 'paragraph' | 'singleLine' = 'auto',
|
||||||
): TextGroup[][] => {
|
): TextGroup[][] => {
|
||||||
const pages = document?.pages ?? [];
|
const pages = document?.pages ?? [];
|
||||||
const metrics = buildFontMetrics(document);
|
const metrics = buildFontMetrics(document);
|
||||||
return pages.map((page, index) => groupPageTextElements(page, index, metrics));
|
return pages.map((page, index) => groupPageTextElements(page, index, metrics, groupingMode));
|
||||||
};
|
};
|
||||||
|
|
||||||
export const extractPageImages = (
|
export const extractPageImages = (
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user