mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-11-16 01:21:16 +01:00
auto paragraph mode
This commit is contained in:
parent
3ed62c8dbf
commit
2d8113b3f6
@ -4495,19 +4495,25 @@
|
||||
"title": "Auto-scale text to fit boxes",
|
||||
"description": "Automatically scales text horizontally to fit within its original bounding box when font rendering differs from PDF."
|
||||
},
|
||||
"groupingMode": {
|
||||
"title": "Text Grouping Mode",
|
||||
"autoDescription": "Automatically detects page type and groups text appropriately.",
|
||||
"paragraphDescription": "Groups aligned lines into multi-line paragraph text boxes.",
|
||||
"singleLineDescription": "Keeps each PDF text line as a separate text box."
|
||||
},
|
||||
"forceSingleElement": {
|
||||
"title": "Lock edited text to a single PDF element",
|
||||
"description": "When enabled, the editor exports each edited text box as one PDF text element to avoid overlapping glyphs or mixed fonts."
|
||||
},
|
||||
"textGroupingMode": {
|
||||
"title": "Text grouping mode",
|
||||
"description": "Paragraph mode merges aligned lines into one textbox; single-line mode keeps every PDF line separate. Auto picks the best option per page."
|
||||
}
|
||||
},
|
||||
"grouping": {
|
||||
"pageType": {
|
||||
"paragraph": "Paragraph page",
|
||||
"sparse": "Sparse text"
|
||||
},
|
||||
"groupingMode": {
|
||||
"auto": "Auto",
|
||||
"paragraph": "Paragraph",
|
||||
"single": "Single Line"
|
||||
"singleLine": "Single Line"
|
||||
},
|
||||
"disclaimer": {
|
||||
"heading": "Preview limitations",
|
||||
@ -4521,6 +4527,7 @@
|
||||
"loading": "Loading",
|
||||
"normalizing": "Normalizing",
|
||||
"parsing": "Parsing",
|
||||
"processing": "Processing",
|
||||
"fonts": "Fonts",
|
||||
"text": "Text Extraction",
|
||||
"images": "Images",
|
||||
|
||||
@ -13,6 +13,7 @@ import {
|
||||
Pagination,
|
||||
Progress,
|
||||
ScrollArea,
|
||||
SegmentedControl,
|
||||
Stack,
|
||||
Switch,
|
||||
Text,
|
||||
@ -202,6 +203,95 @@ const buildFontLookupKeys = (
|
||||
return Array.from(new Set(keys.filter((value) => value && value.length > 0)));
|
||||
};
|
||||
|
||||
/**
 * Analyzes the text groups on a page to decide whether the page is
 * paragraph-heavy (document-like) or sparse.
 *
 * Groups whose trimmed text is empty are ignored. A page counts as a
 * paragraph page when ANY of the following holds:
 *   1. at least 2 multi-line groups AND more than 8 words per group on average;
 *   2. more than 12 words per group on average;
 *   3. more than 40% of the groups are "long" (>= 5 words or >= 30 characters).
 *
 * @param groups Text groups of a single page.
 * @param debug  When true, prints the collected statistics and a per-group
 *               table to the console. Off by default so that page analysis does
 *               not write document text to the console on every run.
 * @returns true if the page looks paragraph-heavy; false for sparse pages or
 *          when there is no non-empty text group.
 */
const analyzePageContentType = (groups: TextGroup[], debug = false): boolean => {
  if (groups.length === 0) return false;

  let multiLineGroups = 0;
  let totalWords = 0;
  let longTextGroups = 0;
  let totalGroups = 0;
  // Only populated when `debug` is set; it exists purely for the console table.
  const groupDetails: Array<{
    id: string;
    lines: number;
    words: number;
    chars: number;
    text: string;
  }> = [];

  for (const group of groups) {
    const text = (group.text || '').trim();
    if (text.length === 0) continue;

    totalGroups++;
    const lineCount = text.split('\n').length;
    const wordCount = text.split(/\s+/).filter((w) => w.length > 0).length;

    totalWords += wordCount;

    // Count multi-line paragraphs
    if (lineCount > 1) {
      multiLineGroups++;
    }

    // Count text groups with substantial content (more than a few words)
    if (wordCount >= 5 || text.length >= 30) {
      longTextGroups++;
    }

    if (debug) {
      groupDetails.push({
        id: group.id,
        lines: lineCount,
        words: wordCount,
        chars: text.length,
        // Truncate so the table stays readable for long paragraphs.
        text: text.substring(0, 50) + (text.length > 50 ? '...' : ''),
      });
    }
  }

  if (totalGroups === 0) return false;

  const avgWordsPerGroup = totalWords / totalGroups;
  const longTextRatio = longTextGroups / totalGroups;

  const hasDenseMultiLineGroups = multiLineGroups >= 2 && avgWordsPerGroup > 8;
  const hasHighAverageWords = avgWordsPerGroup > 12;
  const hasHighLongTextRatio = longTextRatio > 0.4;

  const isParagraphPage = hasDenseMultiLineGroups || hasHighAverageWords || hasHighLongTextRatio;

  if (debug) {
    const yesNo = (flag: boolean): string => (flag ? '✅ YES' : '❌ NO');
    const avgLabel = avgWordsPerGroup.toFixed(2);
    const ratioLabel = `${(longTextRatio * 100).toFixed(1)}%`;

    console.group(`📊 Page Content Analysis`);
    console.log('📄 Overall Statistics:');
    console.log(`  Total text groups: ${totalGroups}`);
    console.log(`  Total words: ${totalWords}`);
    console.log(`  Average words per group: ${avgLabel}`);
    console.log(`  Multi-line groups: ${multiLineGroups}`);
    console.log(`  Long text groups (≥5 words or ≥30 chars): ${longTextGroups}`);
    console.log(`  Long text ratio: ${ratioLabel}`);
    console.log('');
    console.log('🔍 Detection Criteria:');
    console.log(`  ✓ Multi-line groups ≥ 2 AND avg words > 8? ${yesNo(hasDenseMultiLineGroups)} (multi-line: ${multiLineGroups}, avg: ${avgLabel})`);
    console.log(`  ✓ Avg words/group > 12? ${yesNo(hasHighAverageWords)} (current: ${avgLabel})`);
    console.log(`  ✓ Long text ratio > 40%? ${yesNo(hasHighLongTextRatio)} (current: ${ratioLabel})`);
    console.log('');
    console.log(`📋 Result: ${isParagraphPage ? '📝 PARAGRAPH PAGE' : '📄 SPARSE PAGE'}`);
    console.log('');
    console.log('📦 Individual Groups:');
    console.table(groupDetails);
    console.groupEnd();
  }

  return isParagraphPage;
};
|
||||
|
||||
// Text grouping strategy selectable in the editor: 'auto' decides per page,
// 'paragraph' merges aligned lines into multi-line text boxes, and
// 'singleLine' keeps each PDF text line as its own text box.
type GroupingMode = 'auto' | 'paragraph' | 'singleLine';
|
||||
|
||||
const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
|
||||
const { t } = useTranslation();
|
||||
const [activeGroupId, setActiveGroupId] = useState<string | null>(null);
|
||||
@ -232,6 +322,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
|
||||
conversionProgress,
|
||||
hasChanges,
|
||||
forceSingleTextElement,
|
||||
groupingMode: externalGroupingMode,
|
||||
requestPagePreview,
|
||||
onLoadJson,
|
||||
onSelectPage,
|
||||
@ -243,6 +334,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
|
||||
onDownloadJson,
|
||||
onGeneratePdf,
|
||||
onForceSingleTextElementChange,
|
||||
onGroupingModeChange,
|
||||
} = data;
|
||||
|
||||
const syncEditorValue = useCallback(
|
||||
@ -430,6 +522,9 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
|
||||
const pageImages = imagesByPage[selectedPage] ?? [];
|
||||
const pagePreview = pagePreviews.get(selectedPage);
|
||||
|
||||
// Detect if current page contains paragraph-heavy content
|
||||
const isParagraphPage = useMemo(() => analyzePageContentType(pageGroups), [pageGroups]);
|
||||
|
||||
const extractPreferredFontId = useCallback((target?: TextGroup | null) => {
|
||||
if (!target) {
|
||||
return undefined;
|
||||
@ -981,6 +1076,50 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
|
||||
/>
|
||||
</Group>
|
||||
|
||||
<Stack gap="xs">
|
||||
<Group gap={4} align="center">
|
||||
<Text fw={500} size="sm">
|
||||
{t('pdfJsonEditor.options.groupingMode.title', 'Text Grouping Mode')}
|
||||
</Text>
|
||||
{externalGroupingMode === 'auto' && isParagraphPage && (
|
||||
<Badge size="xs" color="blue" variant="light">
|
||||
{t('pdfJsonEditor.pageType.paragraph', 'Paragraph page')}
|
||||
</Badge>
|
||||
)}
|
||||
{externalGroupingMode === 'auto' && !isParagraphPage && hasDocument && (
|
||||
<Badge size="xs" color="gray" variant="light">
|
||||
{t('pdfJsonEditor.pageType.sparse', 'Sparse text')}
|
||||
</Badge>
|
||||
)}
|
||||
</Group>
|
||||
<Text size="xs" c="dimmed">
|
||||
{externalGroupingMode === 'auto'
|
||||
? t(
|
||||
'pdfJsonEditor.options.groupingMode.autoDescription',
|
||||
'Automatically detects page type and groups text appropriately.'
|
||||
)
|
||||
: externalGroupingMode === 'paragraph'
|
||||
? t(
|
||||
'pdfJsonEditor.options.groupingMode.paragraphDescription',
|
||||
'Groups aligned lines into multi-line paragraph text boxes.'
|
||||
)
|
||||
: t(
|
||||
'pdfJsonEditor.options.groupingMode.singleLineDescription',
|
||||
'Keeps each PDF text line as a separate text box.'
|
||||
)}
|
||||
</Text>
|
||||
<SegmentedControl
|
||||
value={externalGroupingMode}
|
||||
onChange={(value) => onGroupingModeChange(value as GroupingMode)}
|
||||
data={[
|
||||
{ label: t('pdfJsonEditor.groupingMode.auto', 'Auto'), value: 'auto' },
|
||||
{ label: t('pdfJsonEditor.groupingMode.paragraph', 'Paragraph'), value: 'paragraph' },
|
||||
{ label: t('pdfJsonEditor.groupingMode.singleLine', 'Single Line'), value: 'singleLine' },
|
||||
]}
|
||||
fullWidth
|
||||
/>
|
||||
</Stack>
|
||||
|
||||
<Group justify="space-between" align="center">
|
||||
<div>
|
||||
<Text fw={500} size="sm">
|
||||
@ -1547,7 +1686,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
|
||||
style={{
|
||||
pointerEvents: 'none',
|
||||
display: 'inline-block',
|
||||
transform: shouldScale ? `scaleX(${textScale})` : undefined,
|
||||
transform: shouldScale ? `scaleX(${textScale})` : 'none',
|
||||
transformOrigin: 'left center',
|
||||
whiteSpace: 'pre',
|
||||
}}
|
||||
|
||||
@ -78,6 +78,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
|
||||
message: string;
|
||||
} | null>(null);
|
||||
const [forceSingleTextElement, setForceSingleTextElement] = useState(false);
|
||||
const [groupingMode, setGroupingMode] = useState<'auto' | 'paragraph' | 'singleLine'>('auto');
|
||||
const [hasVectorPreview, setHasVectorPreview] = useState(false);
|
||||
const [pagePreviews, setPagePreviews] = useState<Map<number, string>>(new Map());
|
||||
|
||||
@ -136,7 +137,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
|
||||
const viewLabel = useMemo(() => t('pdfJsonEditor.viewLabel', 'PDF Editor'), [t]);
|
||||
const { selectedFiles } = useFileSelection();
|
||||
|
||||
const resetToDocument = useCallback((document: PdfJsonDocument | null) => {
|
||||
const resetToDocument = useCallback((document: PdfJsonDocument | null, mode: 'auto' | 'paragraph' | 'singleLine') => {
|
||||
if (!document) {
|
||||
setGroupsByPage([]);
|
||||
setImagesByPage([]);
|
||||
@ -150,7 +151,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
|
||||
return;
|
||||
}
|
||||
const cloned = deepCloneDocument(document);
|
||||
const groups = groupDocumentText(cloned);
|
||||
const groups = groupDocumentText(cloned, mode);
|
||||
const images = extractDocumentImages(cloned);
|
||||
const originalImages = images.map((page) => page.map(cloneImageElement));
|
||||
originalImagesRef.current = originalImages;
|
||||
@ -513,7 +514,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
|
||||
}
|
||||
|
||||
setLoadedDocument(parsed);
|
||||
resetToDocument(parsed);
|
||||
resetToDocument(parsed, groupingMode);
|
||||
setIsLazyMode(shouldUseLazyMode);
|
||||
setCachedJobId(shouldUseLazyMode ? pendingJobId : null);
|
||||
setFileName(file.name);
|
||||
@ -532,7 +533,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
|
||||
}
|
||||
|
||||
setLoadedDocument(null);
|
||||
resetToDocument(null);
|
||||
resetToDocument(null, groupingMode);
|
||||
clearPdfPreview();
|
||||
|
||||
if (isPdf) {
|
||||
@ -555,7 +556,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
|
||||
}
|
||||
}
|
||||
},
|
||||
[resetToDocument, t],
|
||||
[groupingMode, resetToDocument, t],
|
||||
);
|
||||
|
||||
const handleSelectPage = useCallback((pageIndex: number) => {
|
||||
@ -686,9 +687,9 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
|
||||
if (!loadedDocument) {
|
||||
return;
|
||||
}
|
||||
resetToDocument(loadedDocument);
|
||||
resetToDocument(loadedDocument, groupingMode);
|
||||
setErrorMessage(null);
|
||||
}, [loadedDocument, resetToDocument]);
|
||||
}, [groupingMode, loadedDocument, resetToDocument]);
|
||||
|
||||
const buildPayload = useCallback(() => {
|
||||
if (!loadedDocument) {
|
||||
@ -975,6 +976,13 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
|
||||
[hasVectorPreview],
|
||||
);
|
||||
|
||||
// Re-group text when grouping mode changes
|
||||
useEffect(() => {
|
||||
if (loadedDocument) {
|
||||
resetToDocument(loadedDocument, groupingMode);
|
||||
}
|
||||
}, [groupingMode, loadedDocument, resetToDocument]);
|
||||
|
||||
const viewData = useMemo<PdfJsonEditorViewData>(() => ({
|
||||
document: loadedDocument,
|
||||
groupsByPage,
|
||||
@ -991,6 +999,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
|
||||
conversionProgress,
|
||||
hasChanges,
|
||||
forceSingleTextElement,
|
||||
groupingMode,
|
||||
requestPagePreview,
|
||||
onLoadJson: handleLoadFile,
|
||||
onSelectPage: handleSelectPage,
|
||||
@ -1002,6 +1011,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
|
||||
onDownloadJson: handleDownloadJson,
|
||||
onGeneratePdf: handleGeneratePdf,
|
||||
onForceSingleTextElementChange: setForceSingleTextElement,
|
||||
onGroupingModeChange: setGroupingMode,
|
||||
}), [
|
||||
handleImageTransform,
|
||||
imagesByPage,
|
||||
@ -1027,6 +1037,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
|
||||
loadedDocument,
|
||||
selectedPage,
|
||||
forceSingleTextElement,
|
||||
groupingMode,
|
||||
requestPagePreview,
|
||||
setForceSingleTextElement,
|
||||
]);
|
||||
|
||||
@ -197,6 +197,7 @@ export interface PdfJsonEditorViewData {
|
||||
conversionProgress: ConversionProgress | null;
|
||||
hasChanges: boolean;
|
||||
forceSingleTextElement: boolean;
|
||||
groupingMode: 'auto' | 'paragraph' | 'singleLine';
|
||||
requestPagePreview: (pageIndex: number, scale: number) => void;
|
||||
onLoadJson: (file: File | null) => Promise<void> | void;
|
||||
onSelectPage: (pageIndex: number) => void;
|
||||
@ -218,4 +219,5 @@ export interface PdfJsonEditorViewData {
|
||||
onDownloadJson: () => void;
|
||||
onGeneratePdf: () => void;
|
||||
onForceSingleTextElementChange: (value: boolean) => void;
|
||||
onGroupingModeChange: (value: 'auto' | 'paragraph' | 'singleLine') => void;
|
||||
}
|
||||
|
||||
@ -641,6 +641,7 @@ export const groupPageTextElements = (
|
||||
page: PdfJsonPage | null | undefined,
|
||||
pageIndex: number,
|
||||
metrics?: FontMetricsMap,
|
||||
groupingMode: 'auto' | 'paragraph' | 'singleLine' = 'auto',
|
||||
): TextGroup[] => {
|
||||
if (!page?.textElements || page.textElements.length === 0) {
|
||||
return [];
|
||||
@ -731,15 +732,72 @@ export const groupPageTextElements = (
|
||||
}
|
||||
});
|
||||
|
||||
return groupLinesIntoParagraphs(lineGroups, metrics);
|
||||
// Apply paragraph grouping based on mode
|
||||
if (groupingMode === 'singleLine') {
|
||||
// Single line mode: skip paragraph grouping
|
||||
return lineGroups;
|
||||
}
|
||||
|
||||
if (groupingMode === 'paragraph') {
|
||||
// Paragraph mode: always apply grouping
|
||||
return groupLinesIntoParagraphs(lineGroups, metrics);
|
||||
}
|
||||
|
||||
// Auto mode: use heuristic to determine if we should group
|
||||
// Analyze the page content to decide
|
||||
let multiLineGroups = 0;
|
||||
let totalWords = 0;
|
||||
let longTextGroups = 0;
|
||||
let totalGroups = 0;
|
||||
|
||||
lineGroups.forEach((group) => {
|
||||
const text = (group.text || '').trim();
|
||||
if (text.length === 0) return;
|
||||
|
||||
totalGroups++;
|
||||
const lines = text.split('\n');
|
||||
const lineCount = lines.length;
|
||||
const wordCount = text.split(/\s+/).filter((w) => w.length > 0).length;
|
||||
|
||||
totalWords += wordCount;
|
||||
|
||||
if (lineCount > 1) {
|
||||
multiLineGroups++;
|
||||
}
|
||||
|
||||
if (wordCount >= 5 || text.length >= 30) {
|
||||
longTextGroups++;
|
||||
}
|
||||
});
|
||||
|
||||
if (totalGroups === 0) {
|
||||
return lineGroups;
|
||||
}
|
||||
|
||||
const avgWordsPerGroup = totalWords / totalGroups;
|
||||
const longTextRatio = longTextGroups / totalGroups;
|
||||
|
||||
const isParagraphPage =
|
||||
(multiLineGroups >= 2 && avgWordsPerGroup > 8) ||
|
||||
avgWordsPerGroup > 12 ||
|
||||
longTextRatio > 0.4;
|
||||
|
||||
// Only apply paragraph grouping if it looks like a paragraph-heavy page
|
||||
if (isParagraphPage) {
|
||||
return groupLinesIntoParagraphs(lineGroups, metrics);
|
||||
}
|
||||
|
||||
// For sparse pages, keep lines separate
|
||||
return lineGroups;
|
||||
};
|
||||
|
||||
/**
 * Groups the text elements of every page of a document into text groups.
 *
 * Font metrics are built once for the whole document and shared across pages.
 *
 * @param document     Parsed PDF JSON document; when it is null/undefined or has
 *                     no pages, an empty array is returned.
 * @param groupingMode Grouping strategy forwarded to `groupPageTextElements`
 *                     for each page. Defaults to 'auto'.
 * @returns One array of text groups per page, in page order.
 */
export const groupDocumentText = (
  document: PdfJsonDocument | null | undefined,
  groupingMode: 'auto' | 'paragraph' | 'singleLine' = 'auto',
): TextGroup[][] => {
  const pages = document?.pages ?? [];
  const metrics = buildFontMetrics(document);
  // Forward the mode; without it every page would silently fall back to the
  // callee's default and the caller's selection would have no effect.
  return pages.map((page, index) => groupPageTextElements(page, index, metrics, groupingMode));
};
|
||||
|
||||
export const extractPageImages = (
|
||||
|
||||
Loading…
Reference in New Issue
Block a user