mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-11-16 01:21:16 +01:00
Merge remote-tracking branch 'origin/codex/add-pdf-to-json-and-json-to-pdf-features' into demo
This commit is contained in:
commit
2c1d93887a
@ -415,9 +415,16 @@ public class PdfJsonConversionService {
|
||||
for (PDPage page : document.getPages()) {
|
||||
PdfJsonPageDimension dim = new PdfJsonPageDimension();
|
||||
dim.setPageNumber(pageIndex + 1);
|
||||
PDRectangle mediaBox = page.getMediaBox();
|
||||
dim.setWidth(mediaBox.getWidth());
|
||||
dim.setHeight(mediaBox.getHeight());
|
||||
// Use CropBox if present (defines visible page area), otherwise fall back
|
||||
// to MediaBox
|
||||
PDRectangle pageBox = page.getCropBox();
|
||||
if (pageBox == null
|
||||
|| pageBox.getWidth() == 0
|
||||
|| pageBox.getHeight() == 0) {
|
||||
pageBox = page.getMediaBox();
|
||||
}
|
||||
dim.setWidth(pageBox.getWidth());
|
||||
dim.setHeight(pageBox.getHeight());
|
||||
dim.setRotation(page.getRotation());
|
||||
pageDimensions.add(dim);
|
||||
pageIndex++;
|
||||
@ -1851,9 +1858,13 @@ public class PdfJsonConversionService {
|
||||
for (PDPage page : document.getPages()) {
|
||||
PdfJsonPage pageModel = new PdfJsonPage();
|
||||
pageModel.setPageNumber(pageIndex + 1);
|
||||
PDRectangle mediaBox = page.getMediaBox();
|
||||
pageModel.setWidth(mediaBox.getWidth());
|
||||
pageModel.setHeight(mediaBox.getHeight());
|
||||
// Use CropBox if present (defines visible page area), otherwise fall back to MediaBox
|
||||
PDRectangle pageBox = page.getCropBox();
|
||||
if (pageBox == null || pageBox.getWidth() == 0 || pageBox.getHeight() == 0) {
|
||||
pageBox = page.getMediaBox();
|
||||
}
|
||||
pageModel.setWidth(pageBox.getWidth());
|
||||
pageModel.setHeight(pageBox.getHeight());
|
||||
pageModel.setRotation(page.getRotation());
|
||||
pageModel.setTextElements(textByPage.getOrDefault(pageIndex + 1, new ArrayList<>()));
|
||||
pageModel.setImageElements(imagesByPage.getOrDefault(pageIndex + 1, new ArrayList<>()));
|
||||
|
||||
@ -4533,6 +4533,32 @@
|
||||
"cancel": "Cancel",
|
||||
"confirm": "Reset and Change Mode"
|
||||
},
|
||||
"welcomeBanner": {
|
||||
"title": "Welcome to PDF Text Editor (Early Access)",
|
||||
"experimental": "This is an experimental feature in active development. Expect some instability and issues during use.",
|
||||
"howItWorks": "This tool converts your PDF to an editable format where you can modify text content and reposition images. Changes are saved back as a new PDF.",
|
||||
"bestFor": "Works Best With:",
|
||||
"bestFor1": "Simple PDFs containing primarily text and images",
|
||||
"bestFor2": "Documents with standard paragraph formatting",
|
||||
"bestFor3": "Letters, essays, reports, and basic documents",
|
||||
"notIdealFor": "Not Ideal For:",
|
||||
"notIdealFor1": "PDFs with special formatting like bullet points, tables, or multi-column layouts",
|
||||
"notIdealFor2": "Magazines, brochures, or heavily designed documents",
|
||||
"notIdealFor3": "Instruction manuals with complex layouts",
|
||||
"limitations": "Current Limitations:",
|
||||
"limitation1": "Font rendering may differ slightly from the original PDF",
|
||||
"limitation2": "Complex graphics, form fields, and annotations are preserved but not editable",
|
||||
"limitation3": "Large files may take time to convert and process",
|
||||
"knownIssues": "Known Issues (Being Fixed):",
|
||||
"issue1": "Text colour is not currently preserved (will be added soon)",
|
||||
"issue2": "Paragraph mode has more alignment and spacing issues - Single Line mode recommended",
|
||||
"issue3": "The preview display differs from the exported PDF - exported PDFs are closer to the original",
|
||||
"issue4": "Rotated text alignment may need manual adjustment",
|
||||
"issue5": "Transparency and layering effects may vary from original",
|
||||
"feedback": "This is an early access feature. Please report any issues you encounter to help us improve!",
|
||||
"gotIt": "Got it",
|
||||
"dontShowAgain": "Don't show again"
|
||||
},
|
||||
"disclaimer": {
|
||||
"heading": "Preview limitations",
|
||||
"textFocus": "This workspace focuses on editing text and repositioning embedded images. Complex page artwork, form widgets, and layered graphics are preserved for export but are not fully editable here.",
|
||||
@ -4579,6 +4605,21 @@
|
||||
"standard14": "Standard PDF Font",
|
||||
"warnings": "Warnings",
|
||||
"suggestions": "Notes"
|
||||
},
|
||||
"manual": {
|
||||
"mergeTooltip": "Merge selected boxes into a single paragraph",
|
||||
"merge": "Merge selection",
|
||||
"ungroupTooltip": "Split paragraph back into separate lines",
|
||||
"ungroup": "Ungroup selection",
|
||||
"widthMenu": "Width options",
|
||||
"expandWidth": "Expand to page edge",
|
||||
"resetWidth": "Reset width",
|
||||
"resizeHandle": "Adjust text width"
|
||||
},
|
||||
"options": {
|
||||
"manualGrouping": {
|
||||
"descriptionInline": "Tip: Hold Ctrl (Cmd) or Shift to multi-select text boxes. A floating toolbar will appear above the selection so you can merge, ungroup, or adjust widths."
|
||||
}
|
||||
}
|
||||
},
|
||||
"workspace": {
|
||||
|
||||
@ -121,10 +121,11 @@ export const NavigationProvider: React.FC<{
|
||||
hasUnsavedChanges
|
||||
});
|
||||
|
||||
// If we're leaving pageEditor or viewer workbench and have unsaved changes, request navigation
|
||||
// If we're leaving pageEditor, viewer, or custom workbench and have unsaved changes, request navigation
|
||||
const leavingWorkbenchWithChanges =
|
||||
(state.workbench === 'pageEditor' && workbench !== 'pageEditor' && hasUnsavedChanges) ||
|
||||
(state.workbench === 'viewer' && workbench !== 'viewer' && hasUnsavedChanges);
|
||||
(state.workbench === 'viewer' && workbench !== 'viewer' && hasUnsavedChanges) ||
|
||||
(state.workbench.startsWith('custom:') && workbench !== state.workbench && hasUnsavedChanges);
|
||||
|
||||
if (leavingWorkbenchWithChanges) {
|
||||
// Update state to reflect unsaved changes so modal knows
|
||||
@ -132,7 +133,19 @@ export const NavigationProvider: React.FC<{
|
||||
dispatch({ type: 'SET_UNSAVED_CHANGES', payload: { hasChanges: true } });
|
||||
}
|
||||
const performWorkbenchChange = () => {
|
||||
dispatch({ type: 'SET_WORKBENCH', payload: { workbench } });
|
||||
// When leaving a custom workbench, clear the selected tool
|
||||
console.log('[NavigationContext] performWorkbenchChange executing', {
|
||||
from: state.workbench,
|
||||
to: workbench,
|
||||
isCustom: state.workbench.startsWith('custom:')
|
||||
});
|
||||
if (state.workbench.startsWith('custom:')) {
|
||||
console.log('[NavigationContext] Clearing tool and changing workbench to:', workbench);
|
||||
dispatch({ type: 'SET_TOOL_AND_WORKBENCH', payload: { toolId: null, workbench } });
|
||||
} else {
|
||||
console.log('[NavigationContext] Just changing workbench to:', workbench);
|
||||
dispatch({ type: 'SET_WORKBENCH', payload: { workbench } });
|
||||
}
|
||||
};
|
||||
dispatch({ type: 'SET_PENDING_NAVIGATION', payload: { navigationFn: performWorkbenchChange } });
|
||||
dispatch({ type: 'SHOW_NAVIGATION_WARNING', payload: { show: true } });
|
||||
@ -149,10 +162,11 @@ export const NavigationProvider: React.FC<{
|
||||
// Check for unsaved changes using registered checker or state
|
||||
const hasUnsavedChanges = unsavedChangesCheckerRef.current?.() || state.hasUnsavedChanges;
|
||||
|
||||
// If we're leaving pageEditor or viewer workbench and have unsaved changes, request navigation
|
||||
// If we're leaving pageEditor, viewer, or custom workbench and have unsaved changes, request navigation
|
||||
const leavingWorkbenchWithChanges =
|
||||
(state.workbench === 'pageEditor' && workbench !== 'pageEditor' && hasUnsavedChanges) ||
|
||||
(state.workbench === 'viewer' && workbench !== 'viewer' && hasUnsavedChanges);
|
||||
(state.workbench === 'viewer' && workbench !== 'viewer' && hasUnsavedChanges) ||
|
||||
(state.workbench.startsWith('custom:') && workbench !== state.workbench && hasUnsavedChanges);
|
||||
|
||||
if (leavingWorkbenchWithChanges) {
|
||||
const performWorkbenchChange = () => {
|
||||
@ -192,13 +206,19 @@ export const NavigationProvider: React.FC<{
|
||||
}, [state.hasUnsavedChanges]),
|
||||
|
||||
confirmNavigation: useCallback(() => {
|
||||
console.log('[NavigationContext] confirmNavigation called', {
|
||||
hasPendingNav: !!state.pendingNavigation,
|
||||
currentWorkbench: state.workbench,
|
||||
currentTool: state.selectedTool
|
||||
});
|
||||
if (state.pendingNavigation) {
|
||||
state.pendingNavigation();
|
||||
}
|
||||
|
||||
dispatch({ type: 'SET_PENDING_NAVIGATION', payload: { navigationFn: null } });
|
||||
dispatch({ type: 'SHOW_NAVIGATION_WARNING', payload: { show: false } });
|
||||
}, [state.pendingNavigation]),
|
||||
console.log('[NavigationContext] confirmNavigation completed');
|
||||
}, [state.pendingNavigation, state.workbench, state.selectedTool]),
|
||||
|
||||
cancelNavigation: useCallback(() => {
|
||||
dispatch({ type: 'SET_PENDING_NAVIGATION', payload: { navigationFn: null } });
|
||||
|
||||
@ -218,15 +218,25 @@ export function ToolWorkflowProvider({ children }: ToolWorkflowProviderProps) {
|
||||
}, [customViewRegistry, customViewData]);
|
||||
|
||||
useEffect(() => {
|
||||
if (isBaseWorkbench(navigationState.workbench)) {
|
||||
const { workbench } = navigationState;
|
||||
if (isBaseWorkbench(workbench)) {
|
||||
return;
|
||||
}
|
||||
|
||||
const currentCustomView = customWorkbenchViews.find(view => view.workbenchId === navigationState.workbench);
|
||||
const currentCustomView = customWorkbenchViews.find(view => view.workbenchId === workbench);
|
||||
const expectedWorkbench = selectedTool?.workbench;
|
||||
const workbenchOwnedBySelectedTool = expectedWorkbench === workbench;
|
||||
|
||||
if (!currentCustomView || currentCustomView.data == null) {
|
||||
// If the currently selected tool expects this custom workbench, allow it
|
||||
// some time to register/populate the view instead of immediately bouncing
|
||||
// the user back to Active Files.
|
||||
if (workbenchOwnedBySelectedTool) {
|
||||
return;
|
||||
}
|
||||
actions.setWorkbench(getDefaultWorkbench());
|
||||
}
|
||||
}, [actions, customWorkbenchViews, navigationState.workbench]);
|
||||
}, [actions, customWorkbenchViews, navigationState.workbench, selectedTool]);
|
||||
|
||||
// Persisted via PreferencesContext; no direct localStorage writes needed here
|
||||
|
||||
@ -421,4 +431,4 @@ export function useToolWorkflow(): ToolWorkflowContextValue {
|
||||
throw new Error('useToolWorkflow must be used within a ToolWorkflowProvider');
|
||||
}
|
||||
return context;
|
||||
}
|
||||
}
|
||||
|
||||
@ -173,10 +173,6 @@ const FontStatusPanel: React.FC<FontStatusPanelProps> = ({ document, pageIndex }
|
||||
[document, pageIndex]
|
||||
);
|
||||
|
||||
if (!document || fontAnalysis.fonts.length === 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const { canReproducePerfectly, hasWarnings, summary, fonts } = fontAnalysis;
|
||||
|
||||
const statusIcon = useMemo(() => {
|
||||
@ -189,6 +185,11 @@ const FontStatusPanel: React.FC<FontStatusPanelProps> = ({ document, pageIndex }
|
||||
return <InfoIcon sx={{ fontSize: 16 }} />;
|
||||
}, [canReproducePerfectly, hasWarnings]);
|
||||
|
||||
// Early return AFTER all hooks are declared
|
||||
if (!document || fontAnalysis.fonts.length === 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const statusColor = canReproducePerfectly ? 'green' : hasWarnings ? 'yellow' : 'blue';
|
||||
|
||||
const pageLabel = pageIndex !== undefined
|
||||
@ -199,14 +200,30 @@ const FontStatusPanel: React.FC<FontStatusPanelProps> = ({ document, pageIndex }
|
||||
<Accordion variant="contained" defaultValue={hasWarnings ? 'fonts' : undefined}>
|
||||
<Accordion.Item value="fonts">
|
||||
<Accordion.Control>
|
||||
<Group gap="xs" wrap="nowrap">
|
||||
{statusIcon}
|
||||
<Text size="sm" fw={500}>
|
||||
{pageLabel}
|
||||
</Text>
|
||||
<Badge size="xs" color={statusColor} variant="dot">
|
||||
{fonts.length}
|
||||
</Badge>
|
||||
<Group gap="xs" wrap="wrap" style={{ flex: 1 }}>
|
||||
<Group gap="xs" wrap="nowrap">
|
||||
{statusIcon}
|
||||
<Text size="sm" fw={500}>
|
||||
{pageLabel}
|
||||
</Text>
|
||||
<Badge size="xs" color={statusColor} variant="dot">
|
||||
{fonts.length}
|
||||
</Badge>
|
||||
</Group>
|
||||
|
||||
{/* Warning badges BEFORE expansion */}
|
||||
<Group gap={4} wrap="wrap">
|
||||
{summary.systemFallback > 0 && (
|
||||
<Badge size="xs" color="yellow" variant="filled" leftSection={<WarningIcon sx={{ fontSize: 12 }} />}>
|
||||
{summary.systemFallback} {t('pdfTextEditor.fontAnalysis.fallback', 'fallback')}
|
||||
</Badge>
|
||||
)}
|
||||
{summary.missing > 0 && (
|
||||
<Badge size="xs" color="red" variant="filled" leftSection={<ErrorIcon sx={{ fontSize: 12 }} />}>
|
||||
{summary.missing} {t('pdfTextEditor.fontAnalysis.missing', 'missing')}
|
||||
</Badge>
|
||||
)}
|
||||
</Group>
|
||||
</Group>
|
||||
</Accordion.Control>
|
||||
<Accordion.Panel>
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -18,6 +18,7 @@ import {
|
||||
PdfJsonPage,
|
||||
TextGroup,
|
||||
PdfTextEditorViewData,
|
||||
BoundingBox,
|
||||
} from './pdfTextEditorTypes';
|
||||
import {
|
||||
deepCloneDocument,
|
||||
@ -26,6 +27,7 @@ import {
|
||||
restoreGlyphElements,
|
||||
extractDocumentImages,
|
||||
cloneImageElement,
|
||||
cloneTextElement,
|
||||
valueOr,
|
||||
} from './pdfTextEditorUtils';
|
||||
import PdfTextEditorView from '@app/components/tools/pdfTextEditor/PdfTextEditorView';
|
||||
@ -52,6 +54,148 @@ const getAutoLoadKey = (file: File): string => {
|
||||
return `${file.name}|${file.size}|${file.lastModified}`;
|
||||
};
|
||||
|
||||
/**
 * Splits `value` into a fixed number of lines.
 *
 * Carriage returns are removed before splitting on '\n'. When fewer than
 * `expected` lines are present the result is padded with empty strings; when
 * more are present the surplus is folded into the final entry, re-joined with
 * '\n'. A non-positive `expected` yields the whole cleaned string as the only
 * entry. `null`/`undefined` input is treated as the empty string.
 */
const normalizeLineArray = (value: string | undefined | null, expected: number): string[] => {
  const cleaned = (value ?? '').replace(/\r/g, '');
  if (expected <= 0) {
    return [cleaned];
  }

  const lines = cleaned.split('\n');
  const missing = expected - lines.length;
  if (missing === 0) {
    return lines;
  }
  if (missing > 0) {
    // Too few lines: pad the tail with blanks.
    return [...lines, ...new Array<string>(missing).fill('')];
  }

  // Too many lines: keep the first (expected - 1) and merge the rest into one entry.
  const keep = Math.max(expected - 1, 0);
  return [...lines.slice(0, keep), lines.slice(keep).join('\n')];
};
|
||||
|
||||
/**
 * Produces a stand-alone copy of a single line group.
 *
 * `text` / `originalText` override the line's own values when provided.
 * Paragraph metadata (`childLineGroups`, `lineElementCounts`, `lineSpacing`)
 * is cleared, and both element arrays are rebuilt by passing every entry
 * through `cloneTextElement`.
 */
const cloneLineTemplate = (line: TextGroup, text?: string, originalText?: string): TextGroup => {
  const clonedElements = line.elements.map(cloneTextElement);
  const clonedOriginals = line.originalElements.map(cloneTextElement);

  return {
    ...line,
    text: text ?? line.text,
    originalText: originalText ?? line.originalText,
    childLineGroups: null,
    lineElementCounts: null,
    lineSpacing: null,
    elements: clonedElements,
    originalElements: clonedOriginals,
  };
};
|
||||
|
||||
/**
 * Flattens a group into its individual line groups.
 *
 * A group without child lines is returned as a single cloned line. Otherwise
 * the group's current and original text are each split into one entry per
 * child (via `normalizeLineArray`) and every child is cloned with the
 * matching pair of strings.
 */
const expandGroupToLines = (group: TextGroup): TextGroup[] => {
  const children = group.childLineGroups;
  if (!children || children.length === 0) {
    return [cloneLineTemplate(group)];
  }

  const lineCount = children.length;
  const currentLines = normalizeLineArray(group.text, lineCount);
  const sourceLines = normalizeLineArray(group.originalText, lineCount);

  return children.map((child, position) =>
    cloneLineTemplate(child, currentLines[position], sourceLines[position]),
  );
};
|
||||
|
||||
/**
 * Computes the smallest box enclosing every box in `boxes`.
 *
 * The result takes the minimum `left`/`top` and the maximum `right`/`bottom`
 * across all inputs. An empty input yields an all-zero box.
 */
const mergeBoundingBoxes = (boxes: BoundingBox[]): BoundingBox => {
  if (boxes.length === 0) {
    return { left: 0, right: 0, top: 0, bottom: 0 };
  }

  // Seed the extremes from the first box, then widen them with every box.
  let { left, right, top, bottom } = boxes[0];
  for (const box of boxes) {
    left = Math.min(left, box.left);
    right = Math.max(right, box.right);
    top = Math.min(top, box.top);
    bottom = Math.max(bottom, box.bottom);
  }

  return { left, right, top, bottom };
};
|
||||
|
||||
/**
 * Builds one paragraph group out of the selected groups.
 *
 * Every selected group is first expanded into its line groups. If that yields
 * fewer than two lines there is nothing to merge and `null` is returned.
 * Otherwise the result is based on the first selected group, with:
 *  - `text` / `originalText` joined from the lines with '\n',
 *  - `originalElements` concatenated from every line (cloned) and `elements`
 *    cloned from that concatenation,
 *  - `bounds` enclosing every line,
 *  - `lineSpacing` set to the mean of the non-zero baseline gaps between
 *    consecutive lines (or `null` when there are none),
 *  - `lineElementCounts` and `childLineGroups` recording the per-line layout.
 */
const buildMergedGroupFromSelection = (groups: TextGroup[]): TextGroup | null => {
  if (groups.length === 0) {
    return null;
  }

  const lines = groups.flatMap(expandGroupToLines);
  if (lines.length <= 1) {
    return null;
  }

  const texts = lines.map((line) => line.text ?? '');
  const sourceTexts = lines.map((line) => line.originalText ?? '');
  const mergedOriginals = lines.flatMap((line) => line.originalElements.map(cloneTextElement));
  const mergedElements = mergedOriginals.map(cloneTextElement);
  const enclosingBounds = mergeBoundingBoxes(lines.map((line) => line.bounds));

  // A line's baseline falls back to the bottom of its bounds when unset.
  const baselineOf = (line: TextGroup): number => line.baseline ?? line.bounds.bottom;

  // Gap between each line and its predecessor; zero gaps are ignored.
  const gaps = lines
    .slice(1)
    .map((line, previousIndex) => Math.abs(baselineOf(lines[previousIndex]) - baselineOf(line)))
    .filter((gap) => gap > 0);

  let gapTotal = 0;
  for (const gap of gaps) {
    gapTotal += gap;
  }
  const meanSpacing = gaps.length > 0 ? gapTotal / gaps.length : null;

  const [base] = groups;
  const perLineCounts = lines.map((line) => Math.max(line.originalElements.length, 1));
  const children = lines.map((line, position) =>
    cloneLineTemplate(line, texts[position], sourceTexts[position]),
  );

  return {
    ...base,
    text: texts.join('\n'),
    originalText: sourceTexts.join('\n'),
    elements: mergedElements,
    originalElements: mergedOriginals,
    bounds: enclosingBounds,
    lineSpacing: meanSpacing,
    lineElementCounts: perLineCounts.length > 1 ? perLineCounts : null,
    childLineGroups: children,
  };
};
|
||||
|
||||
/**
 * Splits a paragraph group back into one group per child line.
 *
 * Returns an empty array when the group has no child lines or only one.
 * Each returned line is based on a clone of the matching child, receives a
 * fresh id, the matching slice of the paragraph's current/original text, and a
 * consecutive run of the paragraph's `originalElements`. Run lengths come from
 * `lineElementCounts` when it has one entry per line, otherwise from each
 * child's own original-element count (minimum 1); any elements left
 * unaccounted for are added to the last line's run.
 */
const splitParagraphGroup = (group: TextGroup): TextGroup[] => {
  const children = group.childLineGroups;
  if (!children || children.length <= 1) {
    return [];
  }

  const templates = children.map((child) => cloneLineTemplate(child));
  const total = templates.length;
  const currentTexts = normalizeLineArray(group.text, total);
  const sourceTexts = normalizeLineArray(group.originalText, total);

  const storedCounts = group.lineElementCounts;
  const counts =
    storedCounts && storedCounts.length === total
      ? [...storedCounts]
      : templates.map((line) => Math.max(line.originalElements.length, 1));

  // Hand any elements not covered by the counts to the final line.
  let assigned = 0;
  for (const count of counts) {
    assigned += count;
  }
  const unassigned = group.originalElements.length - assigned;
  if (unassigned > 0 && counts.length > 0) {
    counts[counts.length - 1] += unassigned;
  }

  const result: TextGroup[] = [];
  let cursor = 0;
  templates.forEach((template, index) => {
    const size = Math.max(1, counts[index] ?? 1);
    const originals = group.originalElements.slice(cursor, cursor + size).map(cloneTextElement);
    cursor += size;
    result.push({
      ...template,
      id: `${group.id}-line-${index + 1}-${Date.now()}-${index}`,
      text: currentTexts[index] ?? '',
      originalText: sourceTexts[index] ?? '',
      elements: originals.map(cloneTextElement),
      originalElements: originals,
      lineElementCounts: null,
      lineSpacing: null,
      childLineGroups: null,
    });
  });
  return result;
};
|
||||
|
||||
const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
|
||||
const { t } = useTranslation();
|
||||
const {
|
||||
@ -63,6 +207,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
|
||||
} = useToolWorkflow();
|
||||
const { actions: navigationActions } = useNavigationActions();
|
||||
const navigationState = useNavigationState();
|
||||
const { registerUnsavedChangesChecker, unregisterUnsavedChangesChecker } = navigationActions;
|
||||
|
||||
const [loadedDocument, setLoadedDocument] = useState<PdfJsonDocument | null>(null);
|
||||
const [groupsByPage, setGroupsByPage] = useState<TextGroup[][]>([]);
|
||||
@ -89,6 +234,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
|
||||
const [loadingImagePages, setLoadingImagePages] = useState<Set<number>>(new Set());
|
||||
|
||||
const originalImagesRef = useRef<PdfJsonImageElement[][]>([]);
|
||||
const originalGroupsRef = useRef<TextGroup[][]>([]);
|
||||
const imagesByPageRef = useRef<PdfJsonImageElement[][]>([]);
|
||||
const autoLoadKeyRef = useRef<string | null>(null);
|
||||
const loadRequestIdRef = useRef(0);
|
||||
@ -131,7 +277,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
|
||||
}, []);
|
||||
|
||||
const dirtyPages = useMemo(
|
||||
() => getDirtyPages(groupsByPage, imagesByPage, originalImagesRef.current),
|
||||
() => getDirtyPages(groupsByPage, imagesByPage, originalGroupsRef.current, originalImagesRef.current),
|
||||
[groupsByPage, imagesByPage],
|
||||
);
|
||||
const hasChanges = useMemo(() => dirtyPages.some(Boolean), [dirtyPages]);
|
||||
@ -157,6 +303,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
|
||||
const images = extractDocumentImages(cloned);
|
||||
const originalImages = images.map((page) => page.map(cloneImageElement));
|
||||
originalImagesRef.current = originalImages;
|
||||
originalGroupsRef.current = groups.map((page) => page.map((group) => ({ ...group })));
|
||||
imagesByPageRef.current = images.map((page) => page.map(cloneImageElement));
|
||||
const initialLoaded = new Set<number>();
|
||||
originalImages.forEach((pageImages, index) => {
|
||||
@ -351,8 +498,6 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
|
||||
let shouldUseLazyMode = false;
|
||||
let pendingJobId: string | null = null;
|
||||
|
||||
setErrorMessage(null);
|
||||
|
||||
if (isPdf) {
|
||||
latestPdfRequestIdRef.current = requestId;
|
||||
setIsConverting(true);
|
||||
@ -539,7 +684,6 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
|
||||
setCachedJobId(shouldUseLazyMode ? pendingJobId : null);
|
||||
setFileName(file.name);
|
||||
setErrorMessage(null);
|
||||
autoLoadKeyRef.current = fileKey;
|
||||
} catch (error: any) {
|
||||
console.error('Failed to load file', error);
|
||||
console.error('Error details:', {
|
||||
@ -598,13 +742,83 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
|
||||
}, []);
|
||||
|
||||
// Removes the group with `groupId` from the given page; other pages are left untouched.
const handleGroupDelete = useCallback((pageIndex: number, groupId: string) => {
  console.log(`🗑️ Deleting group ${groupId} from page ${pageIndex}`);
  setGroupsByPage((previous) =>
    previous.map((pageGroups, idx) => {
      if (idx === pageIndex) {
        const remaining = pageGroups.filter((candidate) => candidate.id !== groupId);
        console.log(`   Before: ${pageGroups.length} groups, After: ${remaining.length} groups`);
        return remaining;
      }
      return pageGroups;
    }),
  );
}, []);
|
||||
|
||||
const handleMergeGroups = useCallback((pageIndex: number, groupIds: string[]): boolean => {
|
||||
if (groupIds.length < 2) {
|
||||
return false;
|
||||
}
|
||||
let updated = false;
|
||||
setGroupsByPage((previous) =>
|
||||
previous.map((groups, idx) =>
|
||||
idx !== pageIndex
|
||||
? groups
|
||||
: groups.map((group) => (group.id === groupId ? { ...group, text: '' } : group))
|
||||
)
|
||||
previous.map((groups, idx) => {
|
||||
if (idx !== pageIndex) {
|
||||
return groups;
|
||||
}
|
||||
const indices = groupIds
|
||||
.map((id) => groups.findIndex((group) => group.id === id))
|
||||
.filter((index) => index >= 0);
|
||||
if (indices.length !== groupIds.length) {
|
||||
return groups;
|
||||
}
|
||||
const sorted = [...indices].sort((a, b) => a - b);
|
||||
for (let i = 1; i < sorted.length; i += 1) {
|
||||
if (sorted[i] !== sorted[i - 1] + 1) {
|
||||
return groups;
|
||||
}
|
||||
}
|
||||
const selection = sorted.map((position) => groups[position]);
|
||||
const merged = buildMergedGroupFromSelection(selection);
|
||||
if (!merged) {
|
||||
return groups;
|
||||
}
|
||||
const next = [
|
||||
...groups.slice(0, sorted[0]),
|
||||
merged,
|
||||
...groups.slice(sorted[sorted.length - 1] + 1),
|
||||
];
|
||||
updated = true;
|
||||
return next;
|
||||
}),
|
||||
);
|
||||
return updated;
|
||||
}, []);
|
||||
|
||||
/**
 * Splits a paragraph group on `pageIndex` back into its individual lines.
 *
 * @param pageIndex index of the page that owns the group
 * @param groupId   id of the paragraph group to split
 * @returns `true` when an ungroup was requested, `false` when the group is
 *          missing or has fewer than two child lines
 *
 * The return value is decided from the currently rendered `groupsByPage`
 * rather than from a flag flipped inside the state updater: React queues
 * updater functions and normally runs them during the next render, so such a
 * flag may still be `false` when this function returns.
 */
const handleUngroupGroup = useCallback((pageIndex: number, groupId: string): boolean => {
  const target = groupsByPage[pageIndex]?.find((group) => group.id === groupId);
  // splitParagraphGroup only yields lines for groups with more than one child line.
  if (!target || (target.childLineGroups?.length ?? 0) <= 1) {
    return false;
  }

  // The state change itself still goes through a functional update so it is
  // applied against the latest state, re-locating the group by id.
  setGroupsByPage((previous) =>
    previous.map((groups, idx) => {
      if (idx !== pageIndex) {
        return groups;
      }
      const targetIndex = groups.findIndex((group) => group.id === groupId);
      if (targetIndex < 0) {
        return groups;
      }
      const splits = splitParagraphGroup(groups[targetIndex]);
      if (splits.length <= 1) {
        return groups;
      }
      return [
        ...groups.slice(0, targetIndex),
        ...splits,
        ...groups.slice(targetIndex + 1),
      ];
    }),
  );
  return true;
}, [groupsByPage]);
|
||||
|
||||
const handleImageTransform = useCallback(
|
||||
@ -746,7 +960,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
|
||||
}
|
||||
}, [buildPayload, onComplete]);
|
||||
|
||||
const handleGeneratePdf = useCallback(async () => {
|
||||
const handleGeneratePdf = useCallback(async (skipComplete = false) => {
|
||||
try {
|
||||
setIsGeneratingPdf(true);
|
||||
|
||||
@ -840,7 +1054,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
|
||||
|
||||
downloadBlob(response.data, downloadName);
|
||||
|
||||
if (onComplete) {
|
||||
if (onComplete && !skipComplete) {
|
||||
const pdfFile = new File([response.data], downloadName, { type: 'application/pdf' });
|
||||
onComplete([pdfFile]);
|
||||
}
|
||||
@ -881,7 +1095,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
|
||||
|
||||
downloadBlob(response.data, downloadName);
|
||||
|
||||
if (onComplete) {
|
||||
if (onComplete && !skipComplete) {
|
||||
const pdfFile = new File([response.data], downloadName, { type: 'application/pdf' });
|
||||
onComplete([pdfFile]);
|
||||
}
|
||||
@ -1052,7 +1266,6 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
|
||||
forceSingleTextElement,
|
||||
groupingMode,
|
||||
requestPagePreview,
|
||||
onLoadJson: handleLoadFile,
|
||||
onSelectPage: handleSelectPage,
|
||||
onGroupEdit: handleGroupTextChange,
|
||||
onGroupDelete: handleGroupDelete,
|
||||
@ -1061,9 +1274,17 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
|
||||
onReset: handleResetEdits,
|
||||
onDownloadJson: handleDownloadJson,
|
||||
onGeneratePdf: handleGeneratePdf,
|
||||
onGeneratePdfForNavigation: async () => {
|
||||
// Generate PDF without triggering tool completion
|
||||
await handleGeneratePdf(true);
|
||||
},
|
||||
onForceSingleTextElementChange: setForceSingleTextElement,
|
||||
onGroupingModeChange: setGroupingMode,
|
||||
onMergeGroups: handleMergeGroups,
|
||||
onUngroupGroup: handleUngroupGroup,
|
||||
}), [
|
||||
handleMergeGroups,
|
||||
handleUngroupGroup,
|
||||
handleImageTransform,
|
||||
imagesByPage,
|
||||
pagePreviews,
|
||||
@ -1076,7 +1297,6 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
|
||||
handleGroupTextChange,
|
||||
handleGroupDelete,
|
||||
handleImageReset,
|
||||
handleLoadFile,
|
||||
handleResetEdits,
|
||||
handleSelectPage,
|
||||
hasChanges,
|
||||
@ -1155,14 +1375,30 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
|
||||
unregisterCustomWorkbenchView,
|
||||
]);
|
||||
|
||||
// Note: Compare tool doesn't auto-force workbench, and neither should we
|
||||
// The workbench should be set when the tool is selected via proper channels
|
||||
// (tool registry, tool picker, etc.) - not forced here
|
||||
|
||||
// Keep hasChanges in a ref for the checker to access
|
||||
const hasChangesRef = useRef(hasChanges);
|
||||
useEffect(() => {
|
||||
if (
|
||||
navigationState.selectedTool === 'pdfTextEditor' &&
|
||||
navigationState.workbench !== WORKBENCH_ID
|
||||
) {
|
||||
navigationActions.setWorkbench(WORKBENCH_ID);
|
||||
}
|
||||
}, [navigationActions, navigationState.selectedTool, navigationState.workbench]);
|
||||
hasChangesRef.current = hasChanges;
|
||||
console.log('[PdfTextEditor] hasChanges updated to:', hasChanges);
|
||||
}, [hasChanges]);
|
||||
|
||||
// Register unsaved changes checker for navigation guard
|
||||
useEffect(() => {
|
||||
const checker = () => {
|
||||
console.log('[PdfTextEditor] Checking unsaved changes:', hasChangesRef.current);
|
||||
return hasChangesRef.current;
|
||||
};
|
||||
registerUnsavedChangesChecker(checker);
|
||||
console.log('[PdfTextEditor] Registered unsaved changes checker');
|
||||
return () => {
|
||||
console.log('[PdfTextEditor] Unregistered unsaved changes checker');
|
||||
unregisterUnsavedChangesChecker();
|
||||
};
|
||||
}, [registerUnsavedChangesChecker, unregisterUnsavedChangesChecker]);
|
||||
|
||||
const lastSentViewDataRef = useRef<PdfTextEditorViewData | null>(null);
|
||||
|
||||
|
||||
@ -168,6 +168,7 @@ export interface TextGroup {
|
||||
text: string;
|
||||
originalText: string;
|
||||
bounds: BoundingBox;
|
||||
childLineGroups?: TextGroup[] | null;
|
||||
}
|
||||
|
||||
export const DEFAULT_PAGE_WIDTH = 612;
|
||||
@ -199,7 +200,6 @@ export interface PdfTextEditorViewData {
|
||||
forceSingleTextElement: boolean;
|
||||
groupingMode: 'auto' | 'paragraph' | 'singleLine';
|
||||
requestPagePreview: (pageIndex: number, scale: number) => void;
|
||||
onLoadJson: (file: File | null) => Promise<void> | void;
|
||||
onSelectPage: (pageIndex: number) => void;
|
||||
onGroupEdit: (pageIndex: number, groupId: string, value: string) => void;
|
||||
onGroupDelete: (pageIndex: number, groupId: string) => void;
|
||||
@ -218,6 +218,9 @@ export interface PdfTextEditorViewData {
|
||||
onReset: () => void;
|
||||
onDownloadJson: () => void;
|
||||
onGeneratePdf: () => void;
|
||||
onGeneratePdfForNavigation: () => Promise<void>;
|
||||
onForceSingleTextElementChange: (value: boolean) => void;
|
||||
onGroupingModeChange: (value: 'auto' | 'paragraph' | 'singleLine') => void;
|
||||
onMergeGroups: (pageIndex: number, groupIds: string[]) => boolean;
|
||||
onUngroupGroup: (pageIndex: number, groupId: string) => boolean;
|
||||
}
|
||||
|
||||
@ -520,8 +520,18 @@ const createGroup = (
|
||||
};
|
||||
};
|
||||
|
||||
/**
 * Copies a line group for use as a paragraph child: paragraph metadata
 * (`childLineGroups`, `lineElementCounts`, `lineSpacing`) is cleared and both
 * element arrays are rebuilt by passing every entry through `cloneTextElement`.
 */
const cloneLineTemplate = (line: TextGroup): TextGroup => {
  const clonedElements = line.elements.map(cloneTextElement);
  const clonedOriginals = line.originalElements.map(cloneTextElement);

  return {
    ...line,
    childLineGroups: null,
    lineElementCounts: null,
    lineSpacing: null,
    elements: clonedElements,
    originalElements: clonedOriginals,
  };
};
|
||||
|
||||
const groupLinesIntoParagraphs = (
|
||||
lineGroups: TextGroup[],
|
||||
pageWidth: number,
|
||||
metrics?: FontMetricsMap,
|
||||
): TextGroup[] => {
|
||||
if (lineGroups.length === 0) {
|
||||
@ -530,6 +540,8 @@ const groupLinesIntoParagraphs = (
|
||||
|
||||
const paragraphs: TextGroup[][] = [];
|
||||
let currentParagraph: TextGroup[] = [lineGroups[0]];
|
||||
const bulletFlags = new Map<string, boolean>();
|
||||
bulletFlags.set(lineGroups[0].id, false);
|
||||
|
||||
for (let i = 1; i < lineGroups.length; i++) {
|
||||
const prevLine = lineGroups[i - 1];
|
||||
@ -561,11 +573,85 @@ const groupLinesIntoParagraphs = (
|
||||
const maxReasonableSpacing = avgFontSize * 3.0; // Max ~3x font size for normal line spacing
|
||||
const hasReasonableSpacing = lineSpacing <= maxReasonableSpacing;
|
||||
|
||||
// Check if current line looks like a bullet/list item
|
||||
const prevRight = prevLine.bounds.right;
|
||||
const currentRight = currentLine.bounds.right;
|
||||
const prevWidth = prevRight - prevLeft;
|
||||
const currentWidth = currentRight - currentLeft;
|
||||
|
||||
// Count word count to help identify bullets (typically short)
|
||||
const prevWords = (prevLine.text ?? '').split(/\s+/).filter(w => w.length > 0).length;
|
||||
const currentWords = (currentLine.text ?? '').split(/\s+/).filter(w => w.length > 0).length;
|
||||
const prevText = (prevLine.text ?? '').trim();
|
||||
const currentText = (currentLine.text ?? '').trim();
|
||||
|
||||
// Bullet detection - look for bullet markers or very short lines
|
||||
const bulletMarkerRegex = /^[\u2022\u2023\u25E6\u2043\u2219•·◦‣⁃\-\*]\s|^\d+[\.\)]\s|^[a-z][\.\)]\s/i;
|
||||
const prevHasBulletMarker = bulletMarkerRegex.test(prevText);
|
||||
const currentHasBulletMarker = bulletMarkerRegex.test(currentText);
|
||||
|
||||
// True bullets are:
|
||||
// 1. Have bullet markers/numbers OR
|
||||
// 2. Very short (< 10 words) AND much narrower than average (< 60% of page width)
|
||||
const headingKeywords = ['action items', 'next steps', 'notes', 'logistics', 'tasks'];
|
||||
const normalizedPageWidth = pageWidth > 0 ? pageWidth : avgFontSize * 70;
|
||||
const maxReferenceWidth = normalizedPageWidth > 0 ? normalizedPageWidth : avgFontSize * 70;
|
||||
const indentDelta = currentLeft - prevLeft;
|
||||
const indentThreshold = Math.max(avgFontSize * 0.6, 8);
|
||||
const hasIndent = indentDelta > indentThreshold;
|
||||
const currentWidthRatio = maxReferenceWidth > 0 ? currentWidth / maxReferenceWidth : 0;
|
||||
const prevWidthRatio = maxReferenceWidth > 0 ? prevWidth / maxReferenceWidth : 0;
|
||||
const prevLooksLikeHeading =
|
||||
prevText.endsWith(':') ||
|
||||
(prevWords <= 4 && prevWidthRatio < 0.4) ||
|
||||
headingKeywords.some((keyword) => prevText.toLowerCase().includes(keyword));
|
||||
|
||||
const wrapCandidate =
|
||||
!currentHasBulletMarker &&
|
||||
!hasIndent &&
|
||||
!prevLooksLikeHeading &&
|
||||
currentWords <= 12 &&
|
||||
currentWidthRatio < 0.45 &&
|
||||
Math.abs(prevLeft - currentLeft) <= leftAlignmentTolerance &&
|
||||
currentWidth < prevWidth * 0.85;
|
||||
|
||||
const currentIsBullet = wrapCandidate
|
||||
? false
|
||||
: currentHasBulletMarker ||
|
||||
(hasIndent && (currentWords <= 14 || currentWidthRatio <= 0.65)) ||
|
||||
(prevLooksLikeHeading && (currentWords <= 16 || currentWidthRatio <= 0.8 || prevWidthRatio < 0.35)) ||
|
||||
(currentWords <= 8 && currentWidthRatio <= 0.45 && prevWidth - currentWidth > avgFontSize * 4);
|
||||
|
||||
const prevIsBullet = bulletFlags.get(prevLine.id) ?? prevHasBulletMarker;
|
||||
bulletFlags.set(currentLine.id, currentIsBullet);
|
||||
|
||||
// Detect paragraph→bullet transition
|
||||
const likelyBulletStart = !prevIsBullet && currentIsBullet;
|
||||
|
||||
// Don't merge two consecutive bullets
|
||||
const bothAreBullets = prevIsBullet && currentIsBullet;
|
||||
|
||||
// Merge into paragraph if:
|
||||
// 1. Left aligned
|
||||
// 2. Same font
|
||||
// 3. Reasonable line spacing (not a large gap indicating paragraph break)
|
||||
const shouldMerge = isLeftAligned && sameFont && hasReasonableSpacing;
|
||||
// 3. Reasonable line spacing
|
||||
// 4. NOT transitioning to bullets
|
||||
// 5. NOT both are bullets
|
||||
const shouldMerge =
|
||||
isLeftAligned &&
|
||||
sameFont &&
|
||||
hasReasonableSpacing &&
|
||||
!likelyBulletStart &&
|
||||
!bothAreBullets &&
|
||||
!currentIsBullet;
|
||||
|
||||
if (i < 10 || likelyBulletStart || bothAreBullets || !shouldMerge) {
|
||||
console.log(` Line ${i}:`);
|
||||
console.log(` prev: "${prevText.substring(0, 40)}" (${prevWords}w, ${prevWidth.toFixed(0)}pt, marker:${prevHasBulletMarker}, bullet:${prevIsBullet})`);
|
||||
console.log(` curr: "${currentText.substring(0, 40)}" (${currentWords}w, ${currentWidth.toFixed(0)}pt, marker:${currentHasBulletMarker}, bullet:${currentIsBullet})`);
|
||||
console.log(` checks: leftAlign:${isLeftAligned} (${Math.abs(prevLeft - currentLeft).toFixed(1)}pt), sameFont:${sameFont}, spacing:${hasReasonableSpacing} (${lineSpacing.toFixed(1)}pt/${maxReasonableSpacing.toFixed(1)}pt)`);
|
||||
console.log(` decision: merge=${shouldMerge} (bulletStart:${likelyBulletStart}, bothBullets:${bothAreBullets})`);
|
||||
}
|
||||
|
||||
if (shouldMerge) {
|
||||
currentParagraph.push(currentLine);
|
||||
@ -587,17 +673,24 @@ const groupLinesIntoParagraphs = (
|
||||
}
|
||||
|
||||
// Combine all elements from all lines
|
||||
const allElements = lines.flatMap(line => line.originalElements);
|
||||
const lineTemplates = lines.map(line => cloneLineTemplate(line));
|
||||
const flattenedLineTemplates = lineTemplates.flatMap((line) =>
|
||||
line.childLineGroups && line.childLineGroups.length > 0
|
||||
? line.childLineGroups
|
||||
: [line],
|
||||
);
|
||||
const allLines = flattenedLineTemplates.length > 0 ? flattenedLineTemplates : lineTemplates;
|
||||
const allElements = allLines.flatMap(line => line.originalElements);
|
||||
const pageIndex = lines[0].pageIndex;
|
||||
const lineElementCounts = lines.map((line) => line.originalElements.length);
|
||||
const lineElementCounts = allLines.map((line) => line.originalElements.length);
|
||||
|
||||
// Create merged group with newlines between lines
|
||||
const paragraphText = lines.map(line => line.text).join('\n');
|
||||
const mergedBounds = mergeBounds(lines.map(line => line.bounds));
|
||||
const paragraphText = allLines.map(line => line.text).join('\n');
|
||||
const mergedBounds = mergeBounds(allLines.map(line => line.bounds));
|
||||
const spacingValues: number[] = [];
|
||||
for (let i = 1; i < lines.length; i++) {
|
||||
const prevBaseline = lines[i - 1].baseline ?? lines[i - 1].bounds.bottom;
|
||||
const currentBaseline = lines[i].baseline ?? lines[i].bounds.bottom;
|
||||
for (let i = 1; i < allLines.length; i++) {
|
||||
const prevBaseline = allLines[i - 1].baseline ?? allLines[i - 1].bounds.bottom;
|
||||
const currentBaseline = allLines[i].baseline ?? allLines[i].bounds.bottom;
|
||||
const spacing = Math.abs(prevBaseline - currentBaseline);
|
||||
if (spacing > 0) {
|
||||
spacingValues.push(spacing);
|
||||
@ -633,6 +726,7 @@ const groupLinesIntoParagraphs = (
|
||||
text: paragraphText,
|
||||
originalText: paragraphText,
|
||||
bounds: mergedBounds,
|
||||
childLineGroups: allLines,
|
||||
};
|
||||
});
|
||||
};
|
||||
@ -647,6 +741,8 @@ export const groupPageTextElements = (
|
||||
return [];
|
||||
}
|
||||
|
||||
const pageWidth = valueOr(page.width, DEFAULT_PAGE_WIDTH);
|
||||
|
||||
const elements = page.textElements
|
||||
.map(cloneTextElement)
|
||||
.filter((element) => element.text !== null && element.text !== undefined);
|
||||
@ -740,7 +836,7 @@ export const groupPageTextElements = (
|
||||
|
||||
if (groupingMode === 'paragraph') {
|
||||
// Paragraph mode: always apply grouping
|
||||
return groupLinesIntoParagraphs(lineGroups, metrics);
|
||||
return groupLinesIntoParagraphs(lineGroups, pageWidth, metrics);
|
||||
}
|
||||
|
||||
// Auto mode: use heuristic to determine if we should group
|
||||
@ -749,6 +845,11 @@ export const groupPageTextElements = (
|
||||
let totalWords = 0;
|
||||
let longTextGroups = 0;
|
||||
let totalGroups = 0;
|
||||
const wordCounts: number[] = [];
|
||||
let fullWidthLines = 0;
|
||||
|
||||
// Define "full width" as extending to at least 70% of page width
|
||||
const fullWidthThreshold = pageWidth * 0.7;
|
||||
|
||||
lineGroups.forEach((group) => {
|
||||
const text = (group.text || '').trim();
|
||||
@ -760,14 +861,21 @@ export const groupPageTextElements = (
|
||||
const wordCount = text.split(/\s+/).filter((w) => w.length > 0).length;
|
||||
|
||||
totalWords += wordCount;
|
||||
wordCounts.push(wordCount);
|
||||
|
||||
if (lineCount > 1) {
|
||||
multiLineGroups++;
|
||||
}
|
||||
|
||||
if (wordCount >= 5 || text.length >= 30) {
|
||||
if (wordCount >= 10 || text.length >= 50) {
|
||||
longTextGroups++;
|
||||
}
|
||||
|
||||
// Check if this line extends close to the right margin (paragraph-like)
|
||||
const rightEdge = group.bounds.right;
|
||||
if (rightEdge >= fullWidthThreshold) {
|
||||
fullWidthLines++;
|
||||
}
|
||||
});
|
||||
|
||||
if (totalGroups === 0) {
|
||||
@ -776,18 +884,65 @@ export const groupPageTextElements = (
|
||||
|
||||
const avgWordsPerGroup = totalWords / totalGroups;
|
||||
const longTextRatio = longTextGroups / totalGroups;
|
||||
const fullWidthRatio = fullWidthLines / totalGroups;
|
||||
|
||||
const isParagraphPage =
|
||||
(multiLineGroups >= 2 && avgWordsPerGroup > 8) ||
|
||||
avgWordsPerGroup > 12 ||
|
||||
longTextRatio > 0.4;
|
||||
// Calculate variance in line lengths (paragraphs have varying lengths, lists are uniform)
|
||||
const variance = wordCounts.reduce((sum, count) => {
|
||||
const diff = count - avgWordsPerGroup;
|
||||
return sum + diff * diff;
|
||||
}, 0) / totalGroups;
|
||||
const stdDev = Math.sqrt(variance);
|
||||
const coefficientOfVariation = avgWordsPerGroup > 0 ? stdDev / avgWordsPerGroup : 0;
|
||||
|
||||
// Check each criterion
|
||||
const criterion1 = avgWordsPerGroup > 5;
|
||||
const criterion2 = longTextRatio > 0.4;
|
||||
const criterion3 = coefficientOfVariation > 0.5 || fullWidthRatio > 0.6; // High variance OR many full-width lines = paragraph text
|
||||
|
||||
const isParagraphPage = criterion1 && criterion2 && criterion3;
|
||||
|
||||
// Log detection stats
|
||||
console.log(`📄 Page ${pageIndex} Grouping Analysis (mode: ${groupingMode}):`);
|
||||
console.log(` Stats:`);
|
||||
console.log(` • Page width: ${pageWidth.toFixed(1)}pt (full-width threshold: ${fullWidthThreshold.toFixed(1)}pt)`);
|
||||
console.log(` • Multi-line groups: ${multiLineGroups}`);
|
||||
console.log(` • Total groups: ${totalGroups}`);
|
||||
console.log(` • Total words: ${totalWords}`);
|
||||
console.log(` • Long text groups (≥10 words or ≥50 chars): ${longTextGroups}`);
|
||||
console.log(` • Full-width lines (≥70% page width): ${fullWidthLines}`);
|
||||
console.log(` • Avg words per group: ${avgWordsPerGroup.toFixed(2)}`);
|
||||
console.log(` • Long text ratio: ${(longTextRatio * 100).toFixed(1)}%`);
|
||||
console.log(` • Full-width ratio: ${(fullWidthRatio * 100).toFixed(1)}%`);
|
||||
console.log(` • Std deviation: ${stdDev.toFixed(2)}`);
|
||||
console.log(` • Coefficient of variation: ${coefficientOfVariation.toFixed(2)}`);
|
||||
console.log(` Criteria:`);
|
||||
console.log(` 1. Avg Words Per Group: ${criterion1 ? '✅ PASS' : '❌ FAIL'}`);
|
||||
console.log(` (${avgWordsPerGroup.toFixed(2)} > 5)`);
|
||||
console.log(` 2. Long Text Ratio: ${criterion2 ? '✅ PASS' : '❌ FAIL'}`);
|
||||
console.log(` (${(longTextRatio * 100).toFixed(1)}% > 40%)`);
|
||||
console.log(` 3. Line Width Pattern: ${criterion3 ? '✅ PASS' : '❌ FAIL'}`);
|
||||
console.log(` (CV ${coefficientOfVariation.toFixed(2)} > 0.5 OR ${(fullWidthRatio * 100).toFixed(1)}% > 60%)`);
|
||||
console.log(` ${coefficientOfVariation > 0.5 ? '✓ High variance (varying line lengths)' : '✗ Low variance'} ${fullWidthRatio > 0.6 ? '✓ Many full-width lines (paragraph-like)' : '✗ Few full-width lines (list-like)'}`);
|
||||
console.log(` Decision: ${isParagraphPage ? '📝 PARAGRAPH MODE' : '📋 LINE MODE'}`);
|
||||
if (isParagraphPage) {
|
||||
console.log(` Reason: All three criteria passed (AND logic)`);
|
||||
} else {
|
||||
const failedReasons = [];
|
||||
if (!criterion1) failedReasons.push('low average words per group');
|
||||
if (!criterion2) failedReasons.push('low ratio of long text groups');
|
||||
if (!criterion3) failedReasons.push('low variance and few full-width lines (list-like structure)');
|
||||
console.log(` Reason: ${failedReasons.join(', ')}`);
|
||||
}
|
||||
console.log('');
|
||||
|
||||
// Only apply paragraph grouping if it looks like a paragraph-heavy page
|
||||
if (isParagraphPage) {
|
||||
return groupLinesIntoParagraphs(lineGroups, metrics);
|
||||
console.log(`🔀 Applying paragraph grouping to page ${pageIndex}`);
|
||||
return groupLinesIntoParagraphs(lineGroups, pageWidth, metrics);
|
||||
}
|
||||
|
||||
// For sparse pages, keep lines separate
|
||||
console.log(`📋 Keeping lines separate for page ${pageIndex}`);
|
||||
return lineGroups;
|
||||
};
|
||||
|
||||
@ -829,10 +984,28 @@ export const deepCloneDocument = (document: PdfJsonDocument): PdfJsonDocument =>
|
||||
};
|
||||
|
||||
/**
 * Resolves the width and height to use for a page.
 *
 * Each dimension is taken from the page via `valueOr`, with `DEFAULT_PAGE_WIDTH` /
 * `DEFAULT_PAGE_HEIGHT` supplied as the fallback argument.
 * NOTE(review): `valueOr` is defined elsewhere; presumably it yields the fallback when the
 * value is missing — confirm its exact rules (e.g. for 0 or NaN).
 *
 * A diagnostic snapshot of the inputs and the computed size is written with `console.log`
 * on every call.
 *
 * @param page the page model, or `null`/`undefined` when no page is available
 * @returns the resolved `{ width, height }` pair
 */
export const pageDimensions = (page: PdfJsonPage | null | undefined): { width: number; height: number } => {
  const width = valueOr(page?.width, DEFAULT_PAGE_WIDTH);
  const height = valueOr(page?.height, DEFAULT_PAGE_HEIGHT);

  // Diagnostic output: raw page values next to the computed size and some
  // well-known page formats for quick visual comparison.
  console.log(`📏 [pageDimensions] Calculating page size:`, {
    hasPage: !!page,
    rawWidth: page?.width,
    rawHeight: page?.height,
    mediaBox: page?.mediaBox,
    cropBox: page?.cropBox,
    rotation: page?.rotation,
    calculatedWidth: width,
    calculatedHeight: height,
    DEFAULT_PAGE_WIDTH,
    DEFAULT_PAGE_HEIGHT,
    commonFormats: {
      'US Letter': '612 × 792 pt',
      'A4': '595 × 842 pt',
      'Legal': '612 × 1008 pt',
    },
  });

  return { width, height };
};
|
||||
|
||||
export const createMergedElement = (group: TextGroup): PdfJsonTextElement => {
|
||||
@ -1192,14 +1365,35 @@ export const areImageListsDifferent = (
|
||||
/**
 * Determines, page by page, whether the current editor state differs from the original.
 *
 * A page is reported dirty when at least one of the following holds:
 * - some group's `text` differs from its `originalText`;
 * - the number of groups on the page differs from the number of original groups
 *   (this is what detects a deleted group, which leaves no edited text behind);
 * - `areImageListsDifferent` reports a difference between current and original images.
 *
 * Pages missing from `imagesByPage`, `originalGroupsByPage` or `originalImagesByPage`
 * are treated as having an empty list.
 *
 * @param groupsByPage current text groups, one array per page
 * @param imagesByPage current image elements, one array per page
 * @param originalGroupsByPage text groups as originally loaded, one array per page
 * @param originalImagesByPage image elements as originally loaded, one array per page
 * @returns one boolean per entry of `groupsByPage`; `true` means the page was modified
 */
export const getDirtyPages = (
  groupsByPage: TextGroup[][],
  imagesByPage: PdfJsonImageElement[][],
  originalGroupsByPage: TextGroup[][],
  originalImagesByPage: PdfJsonImageElement[][],
): boolean[] => {
  return groupsByPage.map((groups, index) => {
    // Check if any text was modified
    const textDirty = groups.some((group) => group.text !== group.originalText);

    // Check if any groups were deleted by comparing with original groups
    const originalGroups = originalGroupsByPage[index] ?? [];
    const groupCountChanged = groups.length !== originalGroups.length;

    const imageDirty = areImageListsDifferent(
      imagesByPage[index] ?? [],
      originalImagesByPage[index] ?? [],
    );

    // The group-count check must take part in the result; otherwise a page whose only
    // change is a removed group would be reported as clean.
    const isDirty = textDirty || groupCountChanged || imageDirty;

    if (groupCountChanged || textDirty) {
      console.log(`📄 Page ${index} dirty check:`, {
        textDirty,
        groupCountChanged,
        originalGroupsLength: originalGroups.length,
        currentGroupsLength: groups.length,
        imageDirty,
        isDirty,
      });
    }

    return isDirty;
  });
};
|
||||
|
||||
Loading…
Reference in New Issue
Block a user