Merge remote-tracking branch 'origin/codex/add-pdf-to-json-and-json-to-pdf-features' into demo

This commit is contained in:
Anthony Stirling 2025-11-14 15:35:54 +00:00
commit 2c1d93887a
9 changed files with 1395 additions and 266 deletions

View File

@ -415,9 +415,16 @@ public class PdfJsonConversionService {
for (PDPage page : document.getPages()) {
PdfJsonPageDimension dim = new PdfJsonPageDimension();
dim.setPageNumber(pageIndex + 1);
PDRectangle mediaBox = page.getMediaBox();
dim.setWidth(mediaBox.getWidth());
dim.setHeight(mediaBox.getHeight());
// Use CropBox if present (defines visible page area), otherwise fall back
// to MediaBox
PDRectangle pageBox = page.getCropBox();
if (pageBox == null
|| pageBox.getWidth() == 0
|| pageBox.getHeight() == 0) {
pageBox = page.getMediaBox();
}
dim.setWidth(pageBox.getWidth());
dim.setHeight(pageBox.getHeight());
dim.setRotation(page.getRotation());
pageDimensions.add(dim);
pageIndex++;
@ -1851,9 +1858,13 @@ public class PdfJsonConversionService {
for (PDPage page : document.getPages()) {
PdfJsonPage pageModel = new PdfJsonPage();
pageModel.setPageNumber(pageIndex + 1);
PDRectangle mediaBox = page.getMediaBox();
pageModel.setWidth(mediaBox.getWidth());
pageModel.setHeight(mediaBox.getHeight());
// Use CropBox if present (defines visible page area), otherwise fall back to MediaBox
PDRectangle pageBox = page.getCropBox();
if (pageBox == null || pageBox.getWidth() == 0 || pageBox.getHeight() == 0) {
pageBox = page.getMediaBox();
}
pageModel.setWidth(pageBox.getWidth());
pageModel.setHeight(pageBox.getHeight());
pageModel.setRotation(page.getRotation());
pageModel.setTextElements(textByPage.getOrDefault(pageIndex + 1, new ArrayList<>()));
pageModel.setImageElements(imagesByPage.getOrDefault(pageIndex + 1, new ArrayList<>()));

View File

@ -4533,6 +4533,32 @@
"cancel": "Cancel",
"confirm": "Reset and Change Mode"
},
"welcomeBanner": {
"title": "Welcome to PDF Text Editor (Early Access)",
"experimental": "This is an experimental feature in active development. Expect some instability and issues during use.",
"howItWorks": "This tool converts your PDF to an editable format where you can modify text content and reposition images. Changes are saved back as a new PDF.",
"bestFor": "Works Best With:",
"bestFor1": "Simple PDFs containing primarily text and images",
"bestFor2": "Documents with standard paragraph formatting",
"bestFor3": "Letters, essays, reports, and basic documents",
"notIdealFor": "Not Ideal For:",
"notIdealFor1": "PDFs with special formatting like bullet points, tables, or multi-column layouts",
"notIdealFor2": "Magazines, brochures, or heavily designed documents",
"notIdealFor3": "Instruction manuals with complex layouts",
"limitations": "Current Limitations:",
"limitation1": "Font rendering may differ slightly from the original PDF",
"limitation2": "Complex graphics, form fields, and annotations are preserved but not editable",
"limitation3": "Large files may take time to convert and process",
"knownIssues": "Known Issues (Being Fixed):",
"issue1": "Text colour is not currently preserved (will be added soon)",
"issue2": "Paragraph mode has more alignment and spacing issues - Single Line mode recommended",
"issue3": "The preview display differs from the exported PDF - exported PDFs are closer to the original",
"issue4": "Rotated text alignment may need manual adjustment",
"issue5": "Transparency and layering effects may vary from original",
"feedback": "This is an early access feature. Please report any issues you encounter to help us improve!",
"gotIt": "Got it",
"dontShowAgain": "Don't show again"
},
"disclaimer": {
"heading": "Preview limitations",
"textFocus": "This workspace focuses on editing text and repositioning embedded images. Complex page artwork, form widgets, and layered graphics are preserved for export but are not fully editable here.",
@ -4579,6 +4605,21 @@
"standard14": "Standard PDF Font",
"warnings": "Warnings",
"suggestions": "Notes"
},
"manual": {
"mergeTooltip": "Merge selected boxes into a single paragraph",
"merge": "Merge selection",
"ungroupTooltip": "Split paragraph back into separate lines",
"ungroup": "Ungroup selection",
"widthMenu": "Width options",
"expandWidth": "Expand to page edge",
"resetWidth": "Reset width",
"resizeHandle": "Adjust text width"
},
"options": {
"manualGrouping": {
"descriptionInline": "Tip: Hold Ctrl (Cmd) or Shift to multi-select text boxes. A floating toolbar will appear above the selection so you can merge, ungroup, or adjust widths."
}
}
},
"workspace": {

View File

@ -121,10 +121,11 @@ export const NavigationProvider: React.FC<{
hasUnsavedChanges
});
// If we're leaving pageEditor or viewer workbench and have unsaved changes, request navigation
// If we're leaving pageEditor, viewer, or custom workbench and have unsaved changes, request navigation
const leavingWorkbenchWithChanges =
(state.workbench === 'pageEditor' && workbench !== 'pageEditor' && hasUnsavedChanges) ||
(state.workbench === 'viewer' && workbench !== 'viewer' && hasUnsavedChanges);
(state.workbench === 'viewer' && workbench !== 'viewer' && hasUnsavedChanges) ||
(state.workbench.startsWith('custom:') && workbench !== state.workbench && hasUnsavedChanges);
if (leavingWorkbenchWithChanges) {
// Update state to reflect unsaved changes so modal knows
@ -132,7 +133,19 @@ export const NavigationProvider: React.FC<{
dispatch({ type: 'SET_UNSAVED_CHANGES', payload: { hasChanges: true } });
}
const performWorkbenchChange = () => {
dispatch({ type: 'SET_WORKBENCH', payload: { workbench } });
// When leaving a custom workbench, clear the selected tool
console.log('[NavigationContext] performWorkbenchChange executing', {
from: state.workbench,
to: workbench,
isCustom: state.workbench.startsWith('custom:')
});
if (state.workbench.startsWith('custom:')) {
console.log('[NavigationContext] Clearing tool and changing workbench to:', workbench);
dispatch({ type: 'SET_TOOL_AND_WORKBENCH', payload: { toolId: null, workbench } });
} else {
console.log('[NavigationContext] Just changing workbench to:', workbench);
dispatch({ type: 'SET_WORKBENCH', payload: { workbench } });
}
};
dispatch({ type: 'SET_PENDING_NAVIGATION', payload: { navigationFn: performWorkbenchChange } });
dispatch({ type: 'SHOW_NAVIGATION_WARNING', payload: { show: true } });
@ -149,10 +162,11 @@ export const NavigationProvider: React.FC<{
// Check for unsaved changes using registered checker or state
const hasUnsavedChanges = unsavedChangesCheckerRef.current?.() || state.hasUnsavedChanges;
// If we're leaving pageEditor or viewer workbench and have unsaved changes, request navigation
// If we're leaving pageEditor, viewer, or custom workbench and have unsaved changes, request navigation
const leavingWorkbenchWithChanges =
(state.workbench === 'pageEditor' && workbench !== 'pageEditor' && hasUnsavedChanges) ||
(state.workbench === 'viewer' && workbench !== 'viewer' && hasUnsavedChanges);
(state.workbench === 'viewer' && workbench !== 'viewer' && hasUnsavedChanges) ||
(state.workbench.startsWith('custom:') && workbench !== state.workbench && hasUnsavedChanges);
if (leavingWorkbenchWithChanges) {
const performWorkbenchChange = () => {
@ -192,13 +206,19 @@ export const NavigationProvider: React.FC<{
}, [state.hasUnsavedChanges]),
confirmNavigation: useCallback(() => {
console.log('[NavigationContext] confirmNavigation called', {
hasPendingNav: !!state.pendingNavigation,
currentWorkbench: state.workbench,
currentTool: state.selectedTool
});
if (state.pendingNavigation) {
state.pendingNavigation();
}
dispatch({ type: 'SET_PENDING_NAVIGATION', payload: { navigationFn: null } });
dispatch({ type: 'SHOW_NAVIGATION_WARNING', payload: { show: false } });
}, [state.pendingNavigation]),
console.log('[NavigationContext] confirmNavigation completed');
}, [state.pendingNavigation, state.workbench, state.selectedTool]),
cancelNavigation: useCallback(() => {
dispatch({ type: 'SET_PENDING_NAVIGATION', payload: { navigationFn: null } });

View File

@ -218,15 +218,25 @@ export function ToolWorkflowProvider({ children }: ToolWorkflowProviderProps) {
}, [customViewRegistry, customViewData]);
useEffect(() => {
if (isBaseWorkbench(navigationState.workbench)) {
const { workbench } = navigationState;
if (isBaseWorkbench(workbench)) {
return;
}
const currentCustomView = customWorkbenchViews.find(view => view.workbenchId === navigationState.workbench);
const currentCustomView = customWorkbenchViews.find(view => view.workbenchId === workbench);
const expectedWorkbench = selectedTool?.workbench;
const workbenchOwnedBySelectedTool = expectedWorkbench === workbench;
if (!currentCustomView || currentCustomView.data == null) {
// If the currently selected tool expects this custom workbench, allow it
// some time to register/populate the view instead of immediately bouncing
// the user back to Active Files.
if (workbenchOwnedBySelectedTool) {
return;
}
actions.setWorkbench(getDefaultWorkbench());
}
}, [actions, customWorkbenchViews, navigationState.workbench]);
}, [actions, customWorkbenchViews, navigationState.workbench, selectedTool]);
// Persisted via PreferencesContext; no direct localStorage writes needed here
@ -421,4 +431,4 @@ export function useToolWorkflow(): ToolWorkflowContextValue {
throw new Error('useToolWorkflow must be used within a ToolWorkflowProvider');
}
return context;
}
}

View File

@ -173,10 +173,6 @@ const FontStatusPanel: React.FC<FontStatusPanelProps> = ({ document, pageIndex }
[document, pageIndex]
);
if (!document || fontAnalysis.fonts.length === 0) {
return null;
}
const { canReproducePerfectly, hasWarnings, summary, fonts } = fontAnalysis;
const statusIcon = useMemo(() => {
@ -189,6 +185,11 @@ const FontStatusPanel: React.FC<FontStatusPanelProps> = ({ document, pageIndex }
return <InfoIcon sx={{ fontSize: 16 }} />;
}, [canReproducePerfectly, hasWarnings]);
// Early return AFTER all hooks are declared
if (!document || fontAnalysis.fonts.length === 0) {
return null;
}
const statusColor = canReproducePerfectly ? 'green' : hasWarnings ? 'yellow' : 'blue';
const pageLabel = pageIndex !== undefined
@ -199,14 +200,30 @@ const FontStatusPanel: React.FC<FontStatusPanelProps> = ({ document, pageIndex }
<Accordion variant="contained" defaultValue={hasWarnings ? 'fonts' : undefined}>
<Accordion.Item value="fonts">
<Accordion.Control>
<Group gap="xs" wrap="nowrap">
{statusIcon}
<Text size="sm" fw={500}>
{pageLabel}
</Text>
<Badge size="xs" color={statusColor} variant="dot">
{fonts.length}
</Badge>
<Group gap="xs" wrap="wrap" style={{ flex: 1 }}>
<Group gap="xs" wrap="nowrap">
{statusIcon}
<Text size="sm" fw={500}>
{pageLabel}
</Text>
<Badge size="xs" color={statusColor} variant="dot">
{fonts.length}
</Badge>
</Group>
{/* Warning badges BEFORE expansion */}
<Group gap={4} wrap="wrap">
{summary.systemFallback > 0 && (
<Badge size="xs" color="yellow" variant="filled" leftSection={<WarningIcon sx={{ fontSize: 12 }} />}>
{summary.systemFallback} {t('pdfTextEditor.fontAnalysis.fallback', 'fallback')}
</Badge>
)}
{summary.missing > 0 && (
<Badge size="xs" color="red" variant="filled" leftSection={<ErrorIcon sx={{ fontSize: 12 }} />}>
{summary.missing} {t('pdfTextEditor.fontAnalysis.missing', 'missing')}
</Badge>
)}
</Group>
</Group>
</Accordion.Control>
<Accordion.Panel>

View File

@ -18,6 +18,7 @@ import {
PdfJsonPage,
TextGroup,
PdfTextEditorViewData,
BoundingBox,
} from './pdfTextEditorTypes';
import {
deepCloneDocument,
@ -26,6 +27,7 @@ import {
restoreGlyphElements,
extractDocumentImages,
cloneImageElement,
cloneTextElement,
valueOr,
} from './pdfTextEditorUtils';
import PdfTextEditorView from '@app/components/tools/pdfTextEditor/PdfTextEditorView';
@ -52,6 +54,148 @@ const getAutoLoadKey = (file: File): string => {
return `${file.name}|${file.size}|${file.lastModified}`;
};
/**
 * Splits `value` into exactly `expected` lines.
 *
 * Carriage returns are stripped first. Missing trailing lines are padded with
 * empty strings; surplus lines are folded (newline-joined) into the final
 * entry. When `expected` is zero or negative the whole cleaned string is
 * returned as a single entry.
 *
 * @param value - Text to split; `null`/`undefined` is treated as an empty string.
 * @param expected - Number of lines the result should contain.
 * @returns The normalized list of lines.
 */
const normalizeLineArray = (value: string | undefined | null, expected: number): string[] => {
  const cleaned = (value ?? '').replace(/\r/g, '');
  if (expected <= 0) {
    return [cleaned];
  }

  const lines = cleaned.split('\n');

  if (lines.length < expected) {
    // Too few lines: pad the tail with blanks.
    return [...lines, ...Array(expected - lines.length).fill('')];
  }
  if (lines.length === expected) {
    return lines;
  }

  // Too many lines: keep the first (expected - 1) and merge the remainder.
  const keep = Math.max(expected - 1, 0);
  return [...lines.slice(0, keep), lines.slice(keep).join('\n')];
};
/**
 * Produces a standalone single-line copy of `line`.
 *
 * The paragraph-level fields (`childLineGroups`, `lineElementCounts`,
 * `lineSpacing`) are reset to `null`, and both element lists are re-created
 * via `cloneTextElement` so the copy does not share element objects with the
 * source group.
 *
 * @param line - Group to copy.
 * @param text - Replacement text; falls back to `line.text` when omitted.
 * @param originalText - Replacement original text; falls back to `line.originalText` when omitted.
 * @returns The cloned line group.
 */
const cloneLineTemplate = (line: TextGroup, text?: string, originalText?: string): TextGroup => {
  const copy: TextGroup = { ...line };
  copy.text = text ?? line.text;
  copy.originalText = originalText ?? line.originalText;
  copy.childLineGroups = null;
  copy.lineElementCounts = null;
  copy.lineSpacing = null;
  copy.elements = line.elements.map(cloneTextElement);
  copy.originalElements = line.originalElements.map(cloneTextElement);
  return copy;
};
/**
 * Expands a group into one cloned group per visual line.
 *
 * A group without child line groups is returned as a single cloned entry.
 * Otherwise each child is cloned and given the matching line of the parent's
 * current and original text (normalized to the child count).
 *
 * @param group - Group to expand.
 * @returns Cloned line groups, in order.
 */
const expandGroupToLines = (group: TextGroup): TextGroup[] => {
  const children = group.childLineGroups;
  if (!children || children.length === 0) {
    return [cloneLineTemplate(group)];
  }

  const lineCount = children.length;
  const currentLines = normalizeLineArray(group.text, lineCount);
  const sourceLines = normalizeLineArray(group.originalText, lineCount);

  return children.map((child, position) =>
    cloneLineTemplate(child, currentLines[position], sourceLines[position]),
  );
};
/**
 * Computes the smallest box that contains every box in `boxes`.
 *
 * @param boxes - Boxes to combine.
 * @returns The enclosing box, or an all-zero box when `boxes` is empty.
 */
const mergeBoundingBoxes = (boxes: BoundingBox[]): BoundingBox => {
  if (boxes.length === 0) {
    return { left: 0, right: 0, top: 0, bottom: 0 };
  }

  // Seed with the first box, then widen the extents box by box.
  let { left, right, top, bottom } = boxes[0];
  for (const box of boxes) {
    left = Math.min(left, box.left);
    right = Math.max(right, box.right);
    top = Math.min(top, box.top);
    bottom = Math.max(bottom, box.bottom);
  }

  return { left, right, top, bottom };
};
/**
 * Builds a single paragraph group out of the selected groups.
 *
 * Every selected group is first expanded into its individual lines. The
 * resulting paragraph inherits the remaining fields of the first selected
 * group, joins the line texts with newlines, rebuilds both element lists from
 * cloned original elements, and records the per-line groups in
 * `childLineGroups`.
 *
 * @param groups - Groups to merge, in the order they should appear.
 * @returns The merged paragraph, or `null` when the selection is empty or
 *          expands to a single line or fewer.
 */
const buildMergedGroupFromSelection = (groups: TextGroup[]): TextGroup | null => {
  if (groups.length === 0) {
    return null;
  }

  const lines = groups.flatMap(expandGroupToLines);
  if (lines.length <= 1) {
    return null;
  }

  const currentTexts = lines.map(({ text }) => text ?? '');
  const sourceTexts = lines.map(({ originalText }) => originalText ?? '');

  // Both element lists start from (independent clones of) the original elements.
  const originals = lines.flatMap(({ originalElements }) => originalElements.map(cloneTextElement));
  const working = originals.map(cloneTextElement);

  const bounds = mergeBoundingBoxes(lines.map((line) => line.bounds));

  // Average the non-zero baseline distance between consecutive lines.
  const baselineOf = (line: TextGroup) => line.baseline ?? line.bounds.bottom;
  let spacingTotal = 0;
  let spacingSamples = 0;
  lines.slice(1).forEach((line, previousPosition) => {
    const gap = Math.abs(baselineOf(lines[previousPosition]) - baselineOf(line));
    if (gap > 0) {
      spacingTotal += gap;
      spacingSamples += 1;
    }
  });
  const lineSpacing = spacingSamples > 0 ? spacingTotal / spacingSamples : null;

  const [template] = groups;
  // Each line accounts for at least one element.
  const elementCounts = lines.map((line) => Math.max(line.originalElements.length, 1));
  const childLineGroups = lines.map((line, position) =>
    cloneLineTemplate(line, currentTexts[position], sourceTexts[position]),
  );

  return {
    ...template,
    text: currentTexts.join('\n'),
    originalText: sourceTexts.join('\n'),
    elements: working,
    originalElements: originals,
    bounds,
    lineSpacing,
    lineElementCounts: elementCounts.length > 1 ? elementCounts : null,
    childLineGroups,
  };
};
/**
 * Splits a paragraph group back into one group per line.
 *
 * The paragraph's text and original text are divided across the child line
 * groups, and the paragraph's original elements are handed out to the lines
 * according to `lineElementCounts` (or, when that is missing or has the wrong
 * length, the element count of each child, minimum one). Any original
 * elements left unassigned are given to the last line.
 *
 * @param group - Paragraph group to split.
 * @returns The per-line groups with fresh ids, or an empty array when the
 *          group has fewer than two child line groups.
 */
const splitParagraphGroup = (group: TextGroup): TextGroup[] => {
  const children = group.childLineGroups;
  if (!children || children.length <= 1) {
    return [];
  }

  const templates = children.map((child) => cloneLineTemplate(child));
  const total = templates.length;
  const currentLines = normalizeLineArray(group.text, total);
  const sourceLines = normalizeLineArray(group.originalText, total);

  const storedCounts = group.lineElementCounts;
  const counts =
    storedCounts && storedCounts.length === total
      ? [...storedCounts]
      : templates.map((line) => Math.max(line.originalElements.length, 1));

  const available = group.originalElements.length;
  let assigned = 0;
  for (const count of counts) {
    assigned += count;
  }
  if (assigned < available && counts.length > 0) {
    // Leftover elements go to the final line.
    counts[counts.length - 1] += available - assigned;
  }

  const result: TextGroup[] = [];
  let cursor = 0;
  for (let index = 0; index < total; index += 1) {
    const take = Math.max(1, counts[index] ?? 1);
    const originals = group.originalElements.slice(cursor, cursor + take).map(cloneTextElement);
    cursor += take;

    result.push({
      ...templates[index],
      id: `${group.id}-line-${index + 1}-${Date.now()}-${index}`,
      text: currentLines[index] ?? '',
      originalText: sourceLines[index] ?? '',
      elements: originals.map(cloneTextElement),
      originalElements: originals,
      lineElementCounts: null,
      lineSpacing: null,
      childLineGroups: null,
    });
  }
  return result;
};
const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
const { t } = useTranslation();
const {
@ -63,6 +207,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
} = useToolWorkflow();
const { actions: navigationActions } = useNavigationActions();
const navigationState = useNavigationState();
const { registerUnsavedChangesChecker, unregisterUnsavedChangesChecker } = navigationActions;
const [loadedDocument, setLoadedDocument] = useState<PdfJsonDocument | null>(null);
const [groupsByPage, setGroupsByPage] = useState<TextGroup[][]>([]);
@ -89,6 +234,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
const [loadingImagePages, setLoadingImagePages] = useState<Set<number>>(new Set());
const originalImagesRef = useRef<PdfJsonImageElement[][]>([]);
const originalGroupsRef = useRef<TextGroup[][]>([]);
const imagesByPageRef = useRef<PdfJsonImageElement[][]>([]);
const autoLoadKeyRef = useRef<string | null>(null);
const loadRequestIdRef = useRef(0);
@ -131,7 +277,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
}, []);
const dirtyPages = useMemo(
() => getDirtyPages(groupsByPage, imagesByPage, originalImagesRef.current),
() => getDirtyPages(groupsByPage, imagesByPage, originalGroupsRef.current, originalImagesRef.current),
[groupsByPage, imagesByPage],
);
const hasChanges = useMemo(() => dirtyPages.some(Boolean), [dirtyPages]);
@ -157,6 +303,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
const images = extractDocumentImages(cloned);
const originalImages = images.map((page) => page.map(cloneImageElement));
originalImagesRef.current = originalImages;
originalGroupsRef.current = groups.map((page) => page.map((group) => ({ ...group })));
imagesByPageRef.current = images.map((page) => page.map(cloneImageElement));
const initialLoaded = new Set<number>();
originalImages.forEach((pageImages, index) => {
@ -351,8 +498,6 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
let shouldUseLazyMode = false;
let pendingJobId: string | null = null;
setErrorMessage(null);
if (isPdf) {
latestPdfRequestIdRef.current = requestId;
setIsConverting(true);
@ -539,7 +684,6 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
setCachedJobId(shouldUseLazyMode ? pendingJobId : null);
setFileName(file.name);
setErrorMessage(null);
autoLoadKeyRef.current = fileKey;
} catch (error: any) {
console.error('Failed to load file', error);
console.error('Error details:', {
@ -598,13 +742,83 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
}, []);
/**
 * Removes the group with id `groupId` from page `pageIndex`.
 *
 * The state updater is kept free of side effects (no logging), since React
 * may invoke updater functions more than once. When the page does not exist
 * or holds no matching group, the previous state object is returned unchanged
 * so no new array is created for a no-op.
 *
 * @param pageIndex - Zero-based index into the per-page group lists.
 * @param groupId - Id of the group to remove.
 */
const handleGroupDelete = useCallback((pageIndex: number, groupId: string) => {
  setGroupsByPage((previous) => {
    const pageGroups = previous[pageIndex];
    if (!pageGroups || !pageGroups.some((group) => group.id === groupId)) {
      // Nothing to delete: keep the existing reference.
      return previous;
    }
    return previous.map((groups, idx) =>
      idx === pageIndex ? groups.filter((group) => group.id !== groupId) : groups,
    );
  });
}, []);
const handleMergeGroups = useCallback((pageIndex: number, groupIds: string[]): boolean => {
if (groupIds.length < 2) {
return false;
}
let updated = false;
setGroupsByPage((previous) =>
previous.map((groups, idx) =>
idx !== pageIndex
? groups
: groups.map((group) => (group.id === groupId ? { ...group, text: '' } : group))
)
previous.map((groups, idx) => {
if (idx !== pageIndex) {
return groups;
}
const indices = groupIds
.map((id) => groups.findIndex((group) => group.id === id))
.filter((index) => index >= 0);
if (indices.length !== groupIds.length) {
return groups;
}
const sorted = [...indices].sort((a, b) => a - b);
for (let i = 1; i < sorted.length; i += 1) {
if (sorted[i] !== sorted[i - 1] + 1) {
return groups;
}
}
const selection = sorted.map((position) => groups[position]);
const merged = buildMergedGroupFromSelection(selection);
if (!merged) {
return groups;
}
const next = [
...groups.slice(0, sorted[0]),
merged,
...groups.slice(sorted[sorted.length - 1] + 1),
];
updated = true;
return next;
}),
);
return updated;
}, []);
/**
 * Splits the group `groupId` on page `pageIndex` back into separate line
 * groups (via `splitParagraphGroup`) and replaces it in place with the result.
 * Pages other than `pageIndex` are left untouched, as is the target page when
 * the group is not found or the split yields one group or fewer.
 *
 * Returns the `updated` flag, which is only set to true inside the state
 * updater passed to `setGroupsByPage`.
 *
 * NOTE(review): `updated` is read immediately after `setGroupsByPage(...)`
 * returns. If React has not yet executed the updater at that point (updaters
 * are queued and may run during the next render), this returns false even
 * though the split is later applied - confirm callers do not depend on the
 * return value, or derive it from current state instead.
 */
const handleUngroupGroup = useCallback((pageIndex: number, groupId: string): boolean => {
let updated = false;
setGroupsByPage((previous) =>
previous.map((groups, idx) => {
if (idx !== pageIndex) {
return groups;
}
const targetIndex = groups.findIndex((group) => group.id === groupId);
if (targetIndex < 0) {
return groups;
}
const targetGroup = groups[targetIndex];
const splits = splitParagraphGroup(targetGroup);
// A result of one group or fewer means there is nothing to ungroup.
if (splits.length <= 1) {
return groups;
}
// Replace the paragraph with its lines, keeping neighbours in order.
const next = [
...groups.slice(0, targetIndex),
...splits,
...groups.slice(targetIndex + 1),
];
updated = true;
return next;
}),
);
return updated;
}, []);
const handleImageTransform = useCallback(
@ -746,7 +960,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
}
}, [buildPayload, onComplete]);
const handleGeneratePdf = useCallback(async () => {
const handleGeneratePdf = useCallback(async (skipComplete = false) => {
try {
setIsGeneratingPdf(true);
@ -840,7 +1054,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
downloadBlob(response.data, downloadName);
if (onComplete) {
if (onComplete && !skipComplete) {
const pdfFile = new File([response.data], downloadName, { type: 'application/pdf' });
onComplete([pdfFile]);
}
@ -881,7 +1095,7 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
downloadBlob(response.data, downloadName);
if (onComplete) {
if (onComplete && !skipComplete) {
const pdfFile = new File([response.data], downloadName, { type: 'application/pdf' });
onComplete([pdfFile]);
}
@ -1052,7 +1266,6 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
forceSingleTextElement,
groupingMode,
requestPagePreview,
onLoadJson: handleLoadFile,
onSelectPage: handleSelectPage,
onGroupEdit: handleGroupTextChange,
onGroupDelete: handleGroupDelete,
@ -1061,9 +1274,17 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
onReset: handleResetEdits,
onDownloadJson: handleDownloadJson,
onGeneratePdf: handleGeneratePdf,
onGeneratePdfForNavigation: async () => {
// Generate PDF without triggering tool completion
await handleGeneratePdf(true);
},
onForceSingleTextElementChange: setForceSingleTextElement,
onGroupingModeChange: setGroupingMode,
onMergeGroups: handleMergeGroups,
onUngroupGroup: handleUngroupGroup,
}), [
handleMergeGroups,
handleUngroupGroup,
handleImageTransform,
imagesByPage,
pagePreviews,
@ -1076,7 +1297,6 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
handleGroupTextChange,
handleGroupDelete,
handleImageReset,
handleLoadFile,
handleResetEdits,
handleSelectPage,
hasChanges,
@ -1155,14 +1375,30 @@ const PdfTextEditor = ({ onComplete, onError }: BaseToolProps) => {
unregisterCustomWorkbenchView,
]);
// Note: Compare tool doesn't auto-force workbench, and neither should we
// The workbench should be set when the tool is selected via proper channels
// (tool registry, tool picker, etc.) - not forced here
// Keep hasChanges in a ref for the checker to access
const hasChangesRef = useRef(hasChanges);
useEffect(() => {
if (
navigationState.selectedTool === 'pdfTextEditor' &&
navigationState.workbench !== WORKBENCH_ID
) {
navigationActions.setWorkbench(WORKBENCH_ID);
}
}, [navigationActions, navigationState.selectedTool, navigationState.workbench]);
hasChangesRef.current = hasChanges;
console.log('[PdfTextEditor] hasChanges updated to:', hasChanges);
}, [hasChanges]);
// Register an unsaved-changes checker with the navigation actions so the
// navigation guard can ask this tool whether it has pending edits.
// The checker reads `hasChangesRef.current` rather than `hasChanges`, so the
// effect only depends on the register/unregister functions and does not need
// to re-register each time the edit state changes.
useEffect(() => {
const checker = () => {
console.log('[PdfTextEditor] Checking unsaved changes:', hasChangesRef.current);
return hasChangesRef.current;
};
registerUnsavedChangesChecker(checker);
console.log('[PdfTextEditor] Registered unsaved changes checker');
// Cleanup: remove the checker when the effect is torn down.
return () => {
console.log('[PdfTextEditor] Unregistered unsaved changes checker');
unregisterUnsavedChangesChecker();
};
}, [registerUnsavedChangesChecker, unregisterUnsavedChangesChecker]);
const lastSentViewDataRef = useRef<PdfTextEditorViewData | null>(null);

View File

@ -168,6 +168,7 @@ export interface TextGroup {
text: string;
originalText: string;
bounds: BoundingBox;
childLineGroups?: TextGroup[] | null;
}
export const DEFAULT_PAGE_WIDTH = 612;
@ -199,7 +200,6 @@ export interface PdfTextEditorViewData {
forceSingleTextElement: boolean;
groupingMode: 'auto' | 'paragraph' | 'singleLine';
requestPagePreview: (pageIndex: number, scale: number) => void;
onLoadJson: (file: File | null) => Promise<void> | void;
onSelectPage: (pageIndex: number) => void;
onGroupEdit: (pageIndex: number, groupId: string, value: string) => void;
onGroupDelete: (pageIndex: number, groupId: string) => void;
@ -218,6 +218,9 @@ export interface PdfTextEditorViewData {
onReset: () => void;
onDownloadJson: () => void;
onGeneratePdf: () => void;
onGeneratePdfForNavigation: () => Promise<void>;
onForceSingleTextElementChange: (value: boolean) => void;
onGroupingModeChange: (value: 'auto' | 'paragraph' | 'singleLine') => void;
onMergeGroups: (pageIndex: number, groupIds: string[]) => boolean;
onUngroupGroup: (pageIndex: number, groupId: string) => boolean;
}

View File

@ -520,8 +520,18 @@ const createGroup = (
};
};
/**
 * Copies a line group for use as a paragraph child.
 *
 * Paragraph-only fields (`childLineGroups`, `lineElementCounts`,
 * `lineSpacing`) are cleared to `null`, and both element lists are rebuilt
 * with `cloneTextElement` so the copy holds its own element objects.
 *
 * @param line - Line group to copy.
 * @returns The cloned line group.
 */
const cloneLineTemplate = (line: TextGroup): TextGroup => {
  const { elements, originalElements } = line;
  return {
    ...line,
    childLineGroups: null,
    lineElementCounts: null,
    lineSpacing: null,
    elements: elements.map(cloneTextElement),
    originalElements: originalElements.map(cloneTextElement),
  };
};
const groupLinesIntoParagraphs = (
lineGroups: TextGroup[],
pageWidth: number,
metrics?: FontMetricsMap,
): TextGroup[] => {
if (lineGroups.length === 0) {
@ -530,6 +540,8 @@ const groupLinesIntoParagraphs = (
const paragraphs: TextGroup[][] = [];
let currentParagraph: TextGroup[] = [lineGroups[0]];
const bulletFlags = new Map<string, boolean>();
bulletFlags.set(lineGroups[0].id, false);
for (let i = 1; i < lineGroups.length; i++) {
const prevLine = lineGroups[i - 1];
@ -561,11 +573,85 @@ const groupLinesIntoParagraphs = (
const maxReasonableSpacing = avgFontSize * 3.0; // Max ~3x font size for normal line spacing
const hasReasonableSpacing = lineSpacing <= maxReasonableSpacing;
// Check if current line looks like a bullet/list item
const prevRight = prevLine.bounds.right;
const currentRight = currentLine.bounds.right;
const prevWidth = prevRight - prevLeft;
const currentWidth = currentRight - currentLeft;
// Count word count to help identify bullets (typically short)
const prevWords = (prevLine.text ?? '').split(/\s+/).filter(w => w.length > 0).length;
const currentWords = (currentLine.text ?? '').split(/\s+/).filter(w => w.length > 0).length;
const prevText = (prevLine.text ?? '').trim();
const currentText = (currentLine.text ?? '').trim();
// Bullet detection - look for bullet markers or very short lines
const bulletMarkerRegex = /^[\u2022\u2023\u25E6\u2043\u2219•·◦‣\-\*]\s|^\d+[\.\)]\s|^[a-z][\.\)]\s/i;
const prevHasBulletMarker = bulletMarkerRegex.test(prevText);
const currentHasBulletMarker = bulletMarkerRegex.test(currentText);
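// A line is treated as a bullet when it is not a wrapped continuation of the
// previous line and at least one of the following holds:
// 1. It starts with a bullet marker or a list number/letter, OR
// 2. It is indented relative to the previous line and is short
//    (<= 14 words or <= 65% of the page width), OR
// 3. The previous line looks like a heading and this line is reasonably short or narrow, OR
// 4. It is very short (<= 8 words, <= 45% of the page width) and much narrower
//    than the previous line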
const headingKeywords = ['action items', 'next steps', 'notes', 'logistics', 'tasks'];
const normalizedPageWidth = pageWidth > 0 ? pageWidth : avgFontSize * 70;
const maxReferenceWidth = normalizedPageWidth > 0 ? normalizedPageWidth : avgFontSize * 70;
const indentDelta = currentLeft - prevLeft;
const indentThreshold = Math.max(avgFontSize * 0.6, 8);
const hasIndent = indentDelta > indentThreshold;
const currentWidthRatio = maxReferenceWidth > 0 ? currentWidth / maxReferenceWidth : 0;
const prevWidthRatio = maxReferenceWidth > 0 ? prevWidth / maxReferenceWidth : 0;
const prevLooksLikeHeading =
prevText.endsWith(':') ||
(prevWords <= 4 && prevWidthRatio < 0.4) ||
headingKeywords.some((keyword) => prevText.toLowerCase().includes(keyword));
const wrapCandidate =
!currentHasBulletMarker &&
!hasIndent &&
!prevLooksLikeHeading &&
currentWords <= 12 &&
currentWidthRatio < 0.45 &&
Math.abs(prevLeft - currentLeft) <= leftAlignmentTolerance &&
currentWidth < prevWidth * 0.85;
const currentIsBullet = wrapCandidate
? false
: currentHasBulletMarker ||
(hasIndent && (currentWords <= 14 || currentWidthRatio <= 0.65)) ||
(prevLooksLikeHeading && (currentWords <= 16 || currentWidthRatio <= 0.8 || prevWidthRatio < 0.35)) ||
(currentWords <= 8 && currentWidthRatio <= 0.45 && prevWidth - currentWidth > avgFontSize * 4);
const prevIsBullet = bulletFlags.get(prevLine.id) ?? prevHasBulletMarker;
bulletFlags.set(currentLine.id, currentIsBullet);
// Detect paragraph→bullet transition
const likelyBulletStart = !prevIsBullet && currentIsBullet;
// Don't merge two consecutive bullets
const bothAreBullets = prevIsBullet && currentIsBullet;
// Merge into paragraph if:
// 1. Left aligned
// 2. Same font
// 3. Reasonable line spacing (not a large gap indicating paragraph break)
const shouldMerge = isLeftAligned && sameFont && hasReasonableSpacing;
// 3. Reasonable line spacing
// 4. NOT transitioning to bullets
// 5. NOT both are bullets
const shouldMerge =
isLeftAligned &&
sameFont &&
hasReasonableSpacing &&
!likelyBulletStart &&
!bothAreBullets &&
!currentIsBullet;
if (i < 10 || likelyBulletStart || bothAreBullets || !shouldMerge) {
console.log(` Line ${i}:`);
console.log(` prev: "${prevText.substring(0, 40)}" (${prevWords}w, ${prevWidth.toFixed(0)}pt, marker:${prevHasBulletMarker}, bullet:${prevIsBullet})`);
console.log(` curr: "${currentText.substring(0, 40)}" (${currentWords}w, ${currentWidth.toFixed(0)}pt, marker:${currentHasBulletMarker}, bullet:${currentIsBullet})`);
console.log(` checks: leftAlign:${isLeftAligned} (${Math.abs(prevLeft - currentLeft).toFixed(1)}pt), sameFont:${sameFont}, spacing:${hasReasonableSpacing} (${lineSpacing.toFixed(1)}pt/${maxReasonableSpacing.toFixed(1)}pt)`);
console.log(` decision: merge=${shouldMerge} (bulletStart:${likelyBulletStart}, bothBullets:${bothAreBullets})`);
}
if (shouldMerge) {
currentParagraph.push(currentLine);
@ -587,17 +673,24 @@ const groupLinesIntoParagraphs = (
}
// Combine all elements from all lines
const allElements = lines.flatMap(line => line.originalElements);
const lineTemplates = lines.map(line => cloneLineTemplate(line));
const flattenedLineTemplates = lineTemplates.flatMap((line) =>
line.childLineGroups && line.childLineGroups.length > 0
? line.childLineGroups
: [line],
);
const allLines = flattenedLineTemplates.length > 0 ? flattenedLineTemplates : lineTemplates;
const allElements = allLines.flatMap(line => line.originalElements);
const pageIndex = lines[0].pageIndex;
const lineElementCounts = lines.map((line) => line.originalElements.length);
const lineElementCounts = allLines.map((line) => line.originalElements.length);
// Create merged group with newlines between lines
const paragraphText = lines.map(line => line.text).join('\n');
const mergedBounds = mergeBounds(lines.map(line => line.bounds));
const paragraphText = allLines.map(line => line.text).join('\n');
const mergedBounds = mergeBounds(allLines.map(line => line.bounds));
const spacingValues: number[] = [];
for (let i = 1; i < lines.length; i++) {
const prevBaseline = lines[i - 1].baseline ?? lines[i - 1].bounds.bottom;
const currentBaseline = lines[i].baseline ?? lines[i].bounds.bottom;
for (let i = 1; i < allLines.length; i++) {
const prevBaseline = allLines[i - 1].baseline ?? allLines[i - 1].bounds.bottom;
const currentBaseline = allLines[i].baseline ?? allLines[i].bounds.bottom;
const spacing = Math.abs(prevBaseline - currentBaseline);
if (spacing > 0) {
spacingValues.push(spacing);
@ -633,6 +726,7 @@ const groupLinesIntoParagraphs = (
text: paragraphText,
originalText: paragraphText,
bounds: mergedBounds,
childLineGroups: allLines,
};
});
};
@ -647,6 +741,8 @@ export const groupPageTextElements = (
return [];
}
const pageWidth = valueOr(page.width, DEFAULT_PAGE_WIDTH);
const elements = page.textElements
.map(cloneTextElement)
.filter((element) => element.text !== null && element.text !== undefined);
@ -740,7 +836,7 @@ export const groupPageTextElements = (
if (groupingMode === 'paragraph') {
// Paragraph mode: always apply grouping
return groupLinesIntoParagraphs(lineGroups, metrics);
return groupLinesIntoParagraphs(lineGroups, pageWidth, metrics);
}
// Auto mode: use heuristic to determine if we should group
@ -749,6 +845,11 @@ export const groupPageTextElements = (
let totalWords = 0;
let longTextGroups = 0;
let totalGroups = 0;
const wordCounts: number[] = [];
let fullWidthLines = 0;
// Define "full width" as extending to at least 70% of page width
const fullWidthThreshold = pageWidth * 0.7;
lineGroups.forEach((group) => {
const text = (group.text || '').trim();
@ -760,14 +861,21 @@ export const groupPageTextElements = (
const wordCount = text.split(/\s+/).filter((w) => w.length > 0).length;
totalWords += wordCount;
wordCounts.push(wordCount);
if (lineCount > 1) {
multiLineGroups++;
}
if (wordCount >= 5 || text.length >= 30) {
if (wordCount >= 10 || text.length >= 50) {
longTextGroups++;
}
// Check if this line extends close to the right margin (paragraph-like)
const rightEdge = group.bounds.right;
if (rightEdge >= fullWidthThreshold) {
fullWidthLines++;
}
});
if (totalGroups === 0) {
@ -776,18 +884,65 @@ export const groupPageTextElements = (
const avgWordsPerGroup = totalWords / totalGroups;
const longTextRatio = longTextGroups / totalGroups;
const fullWidthRatio = fullWidthLines / totalGroups;
const isParagraphPage =
(multiLineGroups >= 2 && avgWordsPerGroup > 8) ||
avgWordsPerGroup > 12 ||
longTextRatio > 0.4;
// Calculate variance in line lengths (paragraphs have varying lengths, lists are uniform)
const variance = wordCounts.reduce((sum, count) => {
const diff = count - avgWordsPerGroup;
return sum + diff * diff;
}, 0) / totalGroups;
const stdDev = Math.sqrt(variance);
const coefficientOfVariation = avgWordsPerGroup > 0 ? stdDev / avgWordsPerGroup : 0;
// Check each criterion
const criterion1 = avgWordsPerGroup > 5;
const criterion2 = longTextRatio > 0.4;
const criterion3 = coefficientOfVariation > 0.5 || fullWidthRatio > 0.6; // High variance OR many full-width lines = paragraph text
const isParagraphPage = criterion1 && criterion2 && criterion3;
// Log detection stats
console.log(`📄 Page ${pageIndex} Grouping Analysis (mode: ${groupingMode}):`);
console.log(` Stats:`);
console.log(` • Page width: ${pageWidth.toFixed(1)}pt (full-width threshold: ${fullWidthThreshold.toFixed(1)}pt)`);
console.log(` • Multi-line groups: ${multiLineGroups}`);
console.log(` • Total groups: ${totalGroups}`);
console.log(` • Total words: ${totalWords}`);
console.log(` • Long text groups (≥10 words or ≥50 chars): ${longTextGroups}`);
console.log(` • Full-width lines (≥70% page width): ${fullWidthLines}`);
console.log(` • Avg words per group: ${avgWordsPerGroup.toFixed(2)}`);
console.log(` • Long text ratio: ${(longTextRatio * 100).toFixed(1)}%`);
console.log(` • Full-width ratio: ${(fullWidthRatio * 100).toFixed(1)}%`);
console.log(` • Std deviation: ${stdDev.toFixed(2)}`);
console.log(` • Coefficient of variation: ${coefficientOfVariation.toFixed(2)}`);
console.log(` Criteria:`);
console.log(` 1. Avg Words Per Group: ${criterion1 ? '✅ PASS' : '❌ FAIL'}`);
console.log(` (${avgWordsPerGroup.toFixed(2)} > 5)`);
console.log(` 2. Long Text Ratio: ${criterion2 ? '✅ PASS' : '❌ FAIL'}`);
console.log(` (${(longTextRatio * 100).toFixed(1)}% > 40%)`);
console.log(` 3. Line Width Pattern: ${criterion3 ? '✅ PASS' : '❌ FAIL'}`);
console.log(` (CV ${coefficientOfVariation.toFixed(2)} > 0.5 OR ${(fullWidthRatio * 100).toFixed(1)}% > 60%)`);
console.log(` ${coefficientOfVariation > 0.5 ? '✓ High variance (varying line lengths)' : '✗ Low variance'} ${fullWidthRatio > 0.6 ? '✓ Many full-width lines (paragraph-like)' : '✗ Few full-width lines (list-like)'}`);
console.log(` Decision: ${isParagraphPage ? '📝 PARAGRAPH MODE' : '📋 LINE MODE'}`);
if (isParagraphPage) {
console.log(` Reason: All three criteria passed (AND logic)`);
} else {
const failedReasons = [];
if (!criterion1) failedReasons.push('low average words per group');
if (!criterion2) failedReasons.push('low ratio of long text groups');
if (!criterion3) failedReasons.push('low variance and few full-width lines (list-like structure)');
console.log(` Reason: ${failedReasons.join(', ')}`);
}
console.log('');
// Only apply paragraph grouping if it looks like a paragraph-heavy page
if (isParagraphPage) {
return groupLinesIntoParagraphs(lineGroups, metrics);
console.log(`🔀 Applying paragraph grouping to page ${pageIndex}`);
return groupLinesIntoParagraphs(lineGroups, pageWidth, metrics);
}
// For sparse pages, keep lines separate
console.log(`📋 Keeping lines separate for page ${pageIndex}`);
return lineGroups;
};
@ -829,10 +984,28 @@ export const deepCloneDocument = (document: PdfJsonDocument): PdfJsonDocument =>
};
/**
 * Resolves the width and height to use for a page.
 *
 * Each dimension is obtained by passing the page's raw value to `valueOr`
 * together with DEFAULT_PAGE_WIDTH / DEFAULT_PAGE_HEIGHT as the fallback
 * argument. Optional chaining lets a null or undefined `page` flow through
 * the same path.
 *
 * The function also writes one diagnostic entry to the console describing the
 * raw inputs and the resolved size.
 *
 * Previously a stale `return { ... }` preceded this implementation, which left
 * everything after it unreachable; there is now a single code path.
 *
 * @param page - Page model, or null/undefined when no page is available.
 * @returns The resolved `{ width, height }` pair.
 */
export const pageDimensions = (page: PdfJsonPage | null | undefined): { width: number; height: number } => {
  const width = valueOr(page?.width, DEFAULT_PAGE_WIDTH);
  const height = valueOr(page?.height, DEFAULT_PAGE_HEIGHT);

  // Diagnostic dump of everything that influenced the result.
  console.log(`📏 [pageDimensions] Calculating page size:`, {
    hasPage: !!page,
    rawWidth: page?.width,
    rawHeight: page?.height,
    mediaBox: page?.mediaBox,
    cropBox: page?.cropBox,
    rotation: page?.rotation,
    calculatedWidth: width,
    calculatedHeight: height,
    DEFAULT_PAGE_WIDTH,
    DEFAULT_PAGE_HEIGHT,
    // Reference sizes to compare the logged numbers against.
    commonFormats: {
      'US Letter': '612 × 792 pt',
      'A4': '595 × 842 pt',
      'Legal': '612 × 1008 pt',
    },
  });

  return { width, height };
};
export const createMergedElement = (group: TextGroup): PdfJsonTextElement => {
@ -1192,14 +1365,35 @@ export const areImageListsDifferent = (
/**
 * Computes, for every page in `groupsByPage`, whether the page differs from
 * its original state.
 *
 * A page is reported dirty when at least one of the following holds:
 *  - some group's `text` differs from its `originalText`;
 *  - the number of groups on the page differs from the number of original
 *    groups at the same index (for example, a group was deleted);
 *  - `areImageListsDifferent` reports a difference between the current and
 *    original image lists for the page.
 *
 * Previously a stale `return textDirty || imageDirty;` preceded the combined
 * check, so the group-count comparison never influenced the result.
 *
 * @param groupsByPage - Current text groups, one array per page.
 * @param imagesByPage - Current image elements, one array per page.
 * @param originalGroupsByPage - Original text groups, one array per page.
 * @param originalImagesByPage - Original image elements, one array per page.
 * @returns One boolean per entry of `groupsByPage`, in the same order.
 */
export const getDirtyPages = (
  groupsByPage: TextGroup[][],
  imagesByPage: PdfJsonImageElement[][],
  originalGroupsByPage: TextGroup[][],
  originalImagesByPage: PdfJsonImageElement[][],
): boolean[] => {
  return groupsByPage.map((groups, index) => {
    // Any in-place text edit marks the page dirty.
    const textDirty = groups.some((group) => group.text !== group.originalText);

    // A differing group count catches deletions, which leave no edited group behind.
    const originalGroups = originalGroupsByPage[index] ?? [];
    const groupCountChanged = groups.length !== originalGroups.length;

    // Missing per-page image lists are compared as empty lists.
    const imageDirty = areImageListsDifferent(
      imagesByPage[index] ?? [],
      originalImagesByPage[index] ?? [],
    );

    const isDirty = textDirty || groupCountChanged || imageDirty;

    // Only text-related changes are logged; image-only changes stay silent.
    if (groupCountChanged || textDirty) {
      console.log(`📄 Page ${index} dirty check:`, {
        textDirty,
        groupCountChanged,
        originalGroupsLength: originalGroups.length,
        currentGroupsLength: groups.length,
        imageDirty,
        isDirty,
      });
    }

    return isDirty;
  });
};