auto paragraph mode

This commit is contained in:
Anthony Stirling 2025-11-11 12:09:40 +00:00
parent 3ed62c8dbf
commit 2d8113b3f6
5 changed files with 233 additions and 16 deletions

View File

@ -4495,19 +4495,25 @@
"title": "Auto-scale text to fit boxes",
"description": "Automatically scales text horizontally to fit within its original bounding box when font rendering differs from PDF."
},
"groupingMode": {
"title": "Text Grouping Mode",
"autoDescription": "Automatically detects page type and groups text appropriately.",
"paragraphDescription": "Groups aligned lines into multi-line paragraph text boxes.",
"singleLineDescription": "Keeps each PDF text line as a separate text box."
},
"forceSingleElement": {
"title": "Lock edited text to a single PDF element",
"description": "When enabled, the editor exports each edited text box as one PDF text element to avoid overlapping glyphs or mixed fonts."
},
"textGroupingMode": {
"title": "Text grouping mode",
"description": "Paragraph mode merges aligned lines into one textbox; single-line mode keeps every PDF line separate. Auto picks the best option per page."
}
},
"grouping": {
"pageType": {
"paragraph": "Paragraph page",
"sparse": "Sparse text"
},
"groupingMode": {
"auto": "Auto",
"paragraph": "Paragraph",
"single": "Single Line"
"singleLine": "Single Line"
},
"disclaimer": {
"heading": "Preview limitations",
@ -4521,6 +4527,7 @@
"loading": "Loading",
"normalizing": "Normalizing",
"parsing": "Parsing",
"processing": "Processing",
"fonts": "Fonts",
"text": "Text Extraction",
"images": "Images",

View File

@ -13,6 +13,7 @@ import {
Pagination,
Progress,
ScrollArea,
SegmentedControl,
Stack,
Switch,
Text,
@ -202,6 +203,95 @@ const buildFontLookupKeys = (
return Array.from(new Set(keys.filter((value) => value && value.length > 0)));
};
/**
 * Classifies the text groups of a page as paragraph-heavy (document-like) or sparse.
 *
 * Groups whose trimmed text is empty are ignored. The page is considered a
 * paragraph page when ANY of the following holds for the remaining groups:
 *  1. at least two groups span multiple lines AND the average word count per
 *     group is greater than 8;
 *  2. the average word count per group is greater than 12;
 *  3. more than 40% of the groups are "long" (5+ words or 30+ characters).
 *
 * The function is pure (no logging, no mutation of its input), so it is safe
 * to call from a memoised render-time calculation.
 *
 * @param groups - Text groups belonging to a single page.
 * @returns `true` for a paragraph-heavy page; `false` for a sparse page or
 *          when the page has no non-empty text groups.
 */
const analyzePageContentType = (groups: TextGroup[]): boolean => {
  let nonEmptyGroups = 0;
  let multiLineGroups = 0;
  let longTextGroups = 0;
  let totalWords = 0;

  for (const group of groups) {
    const text = (group.text || '').trim();
    if (text.length === 0) {
      continue;
    }
    nonEmptyGroups++;

    const wordCount = text.split(/\s+/).filter((word) => word.length > 0).length;
    totalWords += wordCount;

    // A newline inside the trimmed text means the group spans more than one line.
    if (text.includes('\n')) {
      multiLineGroups++;
    }

    // "Long" groups carry substantial content: several words or a long run of characters.
    if (wordCount >= 5 || text.length >= 30) {
      longTextGroups++;
    }
  }

  // Covers both an empty input array and a page made up solely of blank groups;
  // also guards the divisions below against a zero denominator.
  if (nonEmptyGroups === 0) {
    return false;
  }

  const avgWordsPerGroup = totalWords / nonEmptyGroups;
  const longTextRatio = longTextGroups / nonEmptyGroups;

  return (
    (multiLineGroups >= 2 && avgWordsPerGroup > 8) ||
    avgWordsPerGroup > 12 ||
    longTextRatio > 0.4
  );
};
/**
 * Text grouping strategy offered by the editor's grouping-mode control.
 *
 * - `'auto'`: detect the page type and group text accordingly.
 * - `'paragraph'`: group aligned lines into multi-line paragraph text boxes.
 * - `'singleLine'`: keep each PDF text line as a separate text box.
 */
type GroupingMode = 'auto' | 'paragraph' | 'singleLine';
const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
const { t } = useTranslation();
const [activeGroupId, setActiveGroupId] = useState<string | null>(null);
@ -232,6 +322,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
conversionProgress,
hasChanges,
forceSingleTextElement,
groupingMode: externalGroupingMode,
requestPagePreview,
onLoadJson,
onSelectPage,
@ -243,6 +334,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
onDownloadJson,
onGeneratePdf,
onForceSingleTextElementChange,
onGroupingModeChange,
} = data;
const syncEditorValue = useCallback(
@ -430,6 +522,9 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
const pageImages = imagesByPage[selectedPage] ?? [];
const pagePreview = pagePreviews.get(selectedPage);
// Detect if current page contains paragraph-heavy content
const isParagraphPage = useMemo(() => analyzePageContentType(pageGroups), [pageGroups]);
const extractPreferredFontId = useCallback((target?: TextGroup | null) => {
if (!target) {
return undefined;
@ -981,6 +1076,50 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
/>
</Group>
<Stack gap="xs">
<Group gap={4} align="center">
<Text fw={500} size="sm">
{t('pdfJsonEditor.options.groupingMode.title', 'Text Grouping Mode')}
</Text>
{externalGroupingMode === 'auto' && isParagraphPage && (
<Badge size="xs" color="blue" variant="light">
{t('pdfJsonEditor.pageType.paragraph', 'Paragraph page')}
</Badge>
)}
{externalGroupingMode === 'auto' && !isParagraphPage && hasDocument && (
<Badge size="xs" color="gray" variant="light">
{t('pdfJsonEditor.pageType.sparse', 'Sparse text')}
</Badge>
)}
</Group>
<Text size="xs" c="dimmed">
{externalGroupingMode === 'auto'
? t(
'pdfJsonEditor.options.groupingMode.autoDescription',
'Automatically detects page type and groups text appropriately.'
)
: externalGroupingMode === 'paragraph'
? t(
'pdfJsonEditor.options.groupingMode.paragraphDescription',
'Groups aligned lines into multi-line paragraph text boxes.'
)
: t(
'pdfJsonEditor.options.groupingMode.singleLineDescription',
'Keeps each PDF text line as a separate text box.'
)}
</Text>
<SegmentedControl
value={externalGroupingMode}
onChange={(value) => onGroupingModeChange(value as GroupingMode)}
data={[
{ label: t('pdfJsonEditor.groupingMode.auto', 'Auto'), value: 'auto' },
{ label: t('pdfJsonEditor.groupingMode.paragraph', 'Paragraph'), value: 'paragraph' },
{ label: t('pdfJsonEditor.groupingMode.singleLine', 'Single Line'), value: 'singleLine' },
]}
fullWidth
/>
</Stack>
<Group justify="space-between" align="center">
<div>
<Text fw={500} size="sm">
@ -1547,7 +1686,7 @@ const PdfJsonEditorView = ({ data }: PdfJsonEditorViewProps) => {
style={{
pointerEvents: 'none',
display: 'inline-block',
transform: shouldScale ? `scaleX(${textScale})` : undefined,
transform: shouldScale ? `scaleX(${textScale})` : 'none',
transformOrigin: 'left center',
whiteSpace: 'pre',
}}

View File

@ -78,6 +78,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
message: string;
} | null>(null);
const [forceSingleTextElement, setForceSingleTextElement] = useState(false);
const [groupingMode, setGroupingMode] = useState<'auto' | 'paragraph' | 'singleLine'>('auto');
const [hasVectorPreview, setHasVectorPreview] = useState(false);
const [pagePreviews, setPagePreviews] = useState<Map<number, string>>(new Map());
@ -136,7 +137,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
const viewLabel = useMemo(() => t('pdfJsonEditor.viewLabel', 'PDF Editor'), [t]);
const { selectedFiles } = useFileSelection();
const resetToDocument = useCallback((document: PdfJsonDocument | null) => {
const resetToDocument = useCallback((document: PdfJsonDocument | null, mode: 'auto' | 'paragraph' | 'singleLine') => {
if (!document) {
setGroupsByPage([]);
setImagesByPage([]);
@ -150,7 +151,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
return;
}
const cloned = deepCloneDocument(document);
const groups = groupDocumentText(cloned);
const groups = groupDocumentText(cloned, mode);
const images = extractDocumentImages(cloned);
const originalImages = images.map((page) => page.map(cloneImageElement));
originalImagesRef.current = originalImages;
@ -513,7 +514,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
}
setLoadedDocument(parsed);
resetToDocument(parsed);
resetToDocument(parsed, groupingMode);
setIsLazyMode(shouldUseLazyMode);
setCachedJobId(shouldUseLazyMode ? pendingJobId : null);
setFileName(file.name);
@ -532,7 +533,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
}
setLoadedDocument(null);
resetToDocument(null);
resetToDocument(null, groupingMode);
clearPdfPreview();
if (isPdf) {
@ -555,7 +556,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
}
}
},
[resetToDocument, t],
[groupingMode, resetToDocument, t],
);
const handleSelectPage = useCallback((pageIndex: number) => {
@ -686,9 +687,9 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
if (!loadedDocument) {
return;
}
resetToDocument(loadedDocument);
resetToDocument(loadedDocument, groupingMode);
setErrorMessage(null);
}, [loadedDocument, resetToDocument]);
}, [groupingMode, loadedDocument, resetToDocument]);
const buildPayload = useCallback(() => {
if (!loadedDocument) {
@ -975,6 +976,13 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
[hasVectorPreview],
);
// Re-group text when grouping mode changes
useEffect(() => {
if (loadedDocument) {
resetToDocument(loadedDocument, groupingMode);
}
}, [groupingMode, loadedDocument, resetToDocument]);
const viewData = useMemo<PdfJsonEditorViewData>(() => ({
document: loadedDocument,
groupsByPage,
@ -991,6 +999,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
conversionProgress,
hasChanges,
forceSingleTextElement,
groupingMode,
requestPagePreview,
onLoadJson: handleLoadFile,
onSelectPage: handleSelectPage,
@ -1002,6 +1011,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
onDownloadJson: handleDownloadJson,
onGeneratePdf: handleGeneratePdf,
onForceSingleTextElementChange: setForceSingleTextElement,
onGroupingModeChange: setGroupingMode,
}), [
handleImageTransform,
imagesByPage,
@ -1027,6 +1037,7 @@ const PdfJsonEditor = ({ onComplete, onError }: BaseToolProps) => {
loadedDocument,
selectedPage,
forceSingleTextElement,
groupingMode,
requestPagePreview,
setForceSingleTextElement,
]);

View File

@ -197,6 +197,7 @@ export interface PdfJsonEditorViewData {
conversionProgress: ConversionProgress | null;
hasChanges: boolean;
forceSingleTextElement: boolean;
groupingMode: 'auto' | 'paragraph' | 'singleLine';
requestPagePreview: (pageIndex: number, scale: number) => void;
onLoadJson: (file: File | null) => Promise<void> | void;
onSelectPage: (pageIndex: number) => void;
@ -218,4 +219,5 @@ export interface PdfJsonEditorViewData {
onDownloadJson: () => void;
onGeneratePdf: () => void;
onForceSingleTextElementChange: (value: boolean) => void;
onGroupingModeChange: (value: 'auto' | 'paragraph' | 'singleLine') => void;
}

View File

@ -641,6 +641,7 @@ export const groupPageTextElements = (
page: PdfJsonPage | null | undefined,
pageIndex: number,
metrics?: FontMetricsMap,
groupingMode: 'auto' | 'paragraph' | 'singleLine' = 'auto',
): TextGroup[] => {
if (!page?.textElements || page.textElements.length === 0) {
return [];
@ -731,15 +732,72 @@ export const groupPageTextElements = (
}
});
return groupLinesIntoParagraphs(lineGroups, metrics);
// Apply paragraph grouping based on mode
if (groupingMode === 'singleLine') {
// Single line mode: skip paragraph grouping
return lineGroups;
}
if (groupingMode === 'paragraph') {
// Paragraph mode: always apply grouping
return groupLinesIntoParagraphs(lineGroups, metrics);
}
// Auto mode: use heuristic to determine if we should group
// Analyze the page content to decide
let multiLineGroups = 0;
let totalWords = 0;
let longTextGroups = 0;
let totalGroups = 0;
lineGroups.forEach((group) => {
const text = (group.text || '').trim();
if (text.length === 0) return;
totalGroups++;
const lines = text.split('\n');
const lineCount = lines.length;
const wordCount = text.split(/\s+/).filter((w) => w.length > 0).length;
totalWords += wordCount;
if (lineCount > 1) {
multiLineGroups++;
}
if (wordCount >= 5 || text.length >= 30) {
longTextGroups++;
}
});
if (totalGroups === 0) {
return lineGroups;
}
const avgWordsPerGroup = totalWords / totalGroups;
const longTextRatio = longTextGroups / totalGroups;
const isParagraphPage =
(multiLineGroups >= 2 && avgWordsPerGroup > 8) ||
avgWordsPerGroup > 12 ||
longTextRatio > 0.4;
// Only apply paragraph grouping if it looks like a paragraph-heavy page
if (isParagraphPage) {
return groupLinesIntoParagraphs(lineGroups, metrics);
}
// For sparse pages, keep lines separate
return lineGroups;
};
/**
 * Groups the text elements of every page in a document.
 *
 * Font metrics are built once from the document and shared across all pages.
 * Each page is then grouped with `groupPageTextElements` using the requested
 * grouping mode.
 *
 * @param document - Document to group; `null`/`undefined` (or a document
 *                   without pages) yields an empty array.
 * @param groupingMode - Grouping strategy forwarded to every page. Defaults
 *                       to `'auto'`.
 * @returns One array of text groups per page, in page order.
 */
export const groupDocumentText = (
  document: PdfJsonDocument | null | undefined,
  groupingMode: 'auto' | 'paragraph' | 'singleLine' = 'auto',
): TextGroup[][] => {
  const pages = document?.pages ?? [];
  const metrics = buildFontMetrics(document);
  // Forward the mode so the caller's choice is honoured; without it every page
  // would silently fall back to the per-page default of 'auto'.
  return pages.map((page, index) => groupPageTextElements(page, index, metrics, groupingMode));
};
export const extractPageImages = (