fix problems with rendering totally different documents

This commit is contained in:
EthanHealy01 2025-11-06 17:04:04 +00:00
parent ebc8e0e737
commit 0dc376c436
7 changed files with 262 additions and 30 deletions

View File

@ -203,8 +203,12 @@ const CompareDocumentPane = ({
if (!dragRef.current.active || dragRef.current.page !== page.pageNumber) return;
const dx = e.clientX - dragRef.current.startX;
const dy = e.clientY - dragRef.current.startY;
const maxX = Math.max(0, Math.round(baseWidth * innerScale - containerWidth));
const maxY = Math.max(0, Math.round(baseHeight * innerScale - containerHeight));
// Clamp panning based on the actual rendered content size.
// The inner layer is width/height of the container, then scaled by innerScale.
const contentWidth = Math.max(0, Math.round(containerWidth * innerScale));
const contentHeight = Math.max(0, Math.round(containerHeight * innerScale));
const maxX = Math.max(0, contentWidth - Math.round(containerWidth));
const maxY = Math.max(0, contentHeight - Math.round(containerHeight));
const candX = dragRef.current.startPanX - dx;
const candY = dragRef.current.startPanY - dy;
const next = { x: Math.max(0, Math.min(maxX, candX)), y: Math.max(0, Math.min(maxY, candY)) };

View File

@ -107,9 +107,39 @@ export const computePageLayoutMetrics = (args: {
const baseWidth = isStackedPortrait ? stackedWidth : Math.round(page.width * fit);
const baseHeight = isStackedPortrait ? stackedHeight : Math.round(targetHeight);
const containerMaxW = scrollRefWidth ?? viewportWidth;
const containerWidth = Math.min(baseWidth, Math.max(120, containerMaxW));
const containerHeight = Math.round(baseHeight * (containerWidth / baseWidth));
const innerScale = Math.max(1, zoom);
// Container-first zooming with a stable baseline:
// Treat zoom=1 as "fit to available width" for the page's base size so
// the initial render is fully visible and centered (no cropping), regardless
// of rotation or pane/container width. When zoom < 1, shrink the container;
// when zoom > 1, keep the container at fit width and scale inner content.
const MIN_CONTAINER_WIDTH = 120;
const minScaleByWidth = MIN_CONTAINER_WIDTH / Math.max(1, baseWidth);
const fitScaleByContainer = containerMaxW / Math.max(1, baseWidth);
// Effective baseline scale used at zoom=1 (ensures at least the min width)
const baselineContainerScale = Math.max(minScaleByWidth, fitScaleByContainer);
// Lower bound the zoom so interactions remain stable
const desiredZoom = Math.max(0.1, zoom);
let containerScale: number;
let innerScale: number;
if (desiredZoom >= 1) {
// At or above baseline: keep container at fit width and scale inner content
containerScale = baselineContainerScale;
innerScale = +Math.max(0.1, desiredZoom).toFixed(4);
} else {
// Below baseline: shrink container proportionally, do not upscale inner
const scaled = baselineContainerScale * desiredZoom;
// Never smaller than minimum readable width
containerScale = Math.max(minScaleByWidth, scaled);
innerScale = 1;
}
const containerWidth = Math.max(
MIN_CONTAINER_WIDTH,
Math.min(containerMaxW, Math.round(baseWidth * containerScale))
);
const containerHeight = Math.round(baseHeight * (containerWidth / Math.max(1, baseWidth)));
return {
targetHeight,

View File

@ -259,9 +259,9 @@ export const createSummaryFile = (result: CompareResultData): File => {
/**
 * Restricts a number to the closed unit interval [0, 1].
 *
 * Values below 0 come back as 0 and values above 1 come back as 1.
 * NaN is not normalised: it propagates through both Math calls and is returned as NaN.
 *
 * @param value - The number to restrict.
 * @returns `value` limited to the range 0..1.
 */
export const clamp = (value: number): number => {
  const cappedAbove = Math.min(1, value);
  return Math.max(0, cappedAbove);
};
export const getWorkerErrorCode = (value: unknown): 'EMPTY_TEXT' | 'TOO_LARGE' | undefined => {
export const getWorkerErrorCode = (value: unknown): 'EMPTY_TEXT' | 'TOO_LARGE' | 'TOO_DISSIMILAR' | undefined => {
if (typeof value === 'object' && value !== null && 'code' in value) {
const potentialCode = (value as { code?: 'EMPTY_TEXT' | 'TOO_LARGE' }).code;
const potentialCode = (value as { code?: 'EMPTY_TEXT' | 'TOO_LARGE' | 'TOO_DISSIMILAR' }).code;
return potentialCode;
}
return undefined;

View File

@ -36,7 +36,7 @@ export interface CompareOperationHook extends ToolOperationHook<CompareParameter
export const useCompareOperation = (): CompareOperationHook => {
const { t } = useTranslation();
const { selectors } = useFileContext();
const { selectors, actions: fileActions } = useFileContext();
const workerRef = useRef<Worker | null>(null);
const previousUrl = useRef<string | null>(null);
const activeRunIdRef = useRef(0);
@ -53,6 +53,8 @@ export const useCompareOperation = (): CompareOperationHook => {
const [result, setResult] = useState<CompareResultData | null>(null);
const [warnings, setWarnings] = useState<string[]>([]);
const longRunningToastIdRef = useRef<string | null>(null);
const dissimilarityToastIdRef = useRef<string | null>(null);
const dissimilarityToastShownRef = useRef<boolean>(false);
const ensureWorker = useCallback(() => {
if (!workerRef.current) {
@ -139,7 +141,7 @@ export const useCompareOperation = (): CompareOperationHook => {
dismissToast(longRunningToastIdRef.current);
longRunningToastIdRef.current = null;
}
const error: Error & { code?: 'EMPTY_TEXT' | 'TOO_LARGE' } = new Error(message.message);
const error: Error & { code?: 'EMPTY_TEXT' | 'TOO_LARGE' | 'TOO_DISSIMILAR' } = new Error(message.message);
error.code = message.code;
reject(error);
break;
@ -228,6 +230,10 @@ export const useCompareOperation = (): CompareOperationHook => {
'compare.no.text.message',
'One or both of the selected PDFs have no text content. Please choose PDFs with text for comparison.'
),
tooDissimilarMessage: t(
'compare.too.dissimilar.message',
'These documents appear highly dissimilar. Comparison was stopped to save time.'
),
};
const operationStart = performance.now();
@ -271,10 +277,57 @@ export const useCompareOperation = (): CompareOperationHook => {
longRunningToastIdRef.current = toastId || null;
}
// Heuristic: surface an early warning toast when we observe a very high ratio of differences
const EARLY_TOAST_MIN_TOKENS = 15000; // wait for some signal before warning
const EARLY_TOAST_DIFF_RATIO = 0.8; // 80% added/removed vs unchanged
let observedAddedRemoved = 0;
let observedUnchanged = 0;
/**
 * Raises the persistent "these PDFs look highly different" warning toast.
 *
 * Does nothing when the toast has already been shown (shown-flag ref set) or a
 * toast id is still being tracked, so repeated calls from the chunk callback
 * do not stack toasts.
 *
 * The toast button cancels the running comparison and broadcasts a
 * `compare:clear-selected` window event, then dismisses the toast. Both steps
 * are best-effort: a failure is logged and the remaining steps still run.
 */
const handleEarlyDissimilarity = () => {
  if (dissimilarityToastShownRef.current || dissimilarityToastIdRef.current) return;
  const toastId = alert({
    alertType: 'warning',
    title: t('compare.earlyDissimilarity.title', 'These PDFs look highly different'),
    body: t(
      'compare.earlyDissimilarity.body',
      "We're seeing very few similarities so far. You can stop the comparison if these aren't related documents."
    ),
    location: 'bottom-right' as ToastLocation,
    isPersistentPopup: true,
    expandable: false,
    buttonText: t('compare.earlyDissimilarity.stopButton', 'Stop comparison'),
    buttonCallback: () => {
      // Log instead of silently swallowing, so a failed stop is diagnosable.
      try {
        cancelOperation();
      } catch (err) {
        console.error('Failed to cancel operation', err);
      }
      try {
        window.dispatchEvent(new CustomEvent('compare:clear-selected'));
      } catch (err) {
        console.error('Failed to dispatch compare:clear-selected event', err);
      }
      // Always dismiss the toast, even if one of the steps above failed.
      if (dissimilarityToastIdRef.current) {
        dismissToast(dissimilarityToastIdRef.current);
        dissimilarityToastIdRef.current = null;
      }
    },
  });
  // alert() may return a falsy id; normalise to null so the guard above stays simple.
  dissimilarityToastIdRef.current = toastId || null;
  dissimilarityToastShownRef.current = true;
};
const { tokens, stats, warnings: workerWarnings } = await runCompareWorker(
baseFiltered.tokens,
comparisonFiltered.tokens,
warningMessages
warningMessages,
(chunk) => {
// Incremental ratio tracking for early warning
for (const tok of chunk) {
if (tok.type === 'unchanged') observedUnchanged += 1;
else observedAddedRemoved += 1;
}
const seen = observedAddedRemoved + observedUnchanged;
if (
!dissimilarityToastShownRef.current &&
seen >= EARLY_TOAST_MIN_TOKENS &&
observedAddedRemoved / Math.max(1, seen) >= EARLY_TOAST_DIFF_RATIO
) {
handleEarlyDissimilarity();
}
}
);
if (cancelledRef.current || activeRunIdRef.current !== runId) return;
@ -409,6 +462,11 @@ export const useCompareOperation = (): CompareOperationHook => {
dismissToast(longRunningToastIdRef.current);
longRunningToastIdRef.current = null;
}
if (dissimilarityToastIdRef.current) {
dismissToast(dissimilarityToastIdRef.current);
dissimilarityToastIdRef.current = null;
}
dissimilarityToastShownRef.current = false;
}
},
[cleanupDownloadUrl, runCompareWorker, selectors, t]

View File

@ -53,6 +53,25 @@ const Compare = (props: BaseToolProps) => {
const compareIcon = useMemo(() => <CompareRoundedIcon fontSize="small" />, []);
const [swapConfirmOpen, setSwapConfirmOpen] = useState(false);
const [clearConfirmOpen, setClearConfirmOpen] = useState(false);
// Tears down the current comparison and returns to the default workbench.
// Steps, in order: cancel the running operation, reset its results, null out
// both selected file ids, clear file selections, drop the custom workbench
// view data, and switch the workbench back to the default.
// Memoised with useCallback so it can be used as an effect dependency and
// shared by more than one caller.
const performClearSelected = useCallback(() => {
// Each fallible step has its own try/catch: a failure is logged (message only)
// and the remaining steps still run.
try { base.operation.cancelOperation(); } catch { console.error('Failed to cancel operation'); }
try { base.operation.resetResults(); } catch { console.error('Failed to reset results'); }
// Keep every other parameter; only the two file slots are cleared.
base.params.setParameters(prev => ({ ...prev, baseFileId: null, comparisonFileId: null }));
try { fileActions.clearSelections(); } catch { console.error('Failed to clear selections'); }
clearCustomWorkbenchViewData(CUSTOM_VIEW_ID);
navigationActions.setWorkbench(getDefaultWorkbench());
}, [base.operation, base.params, clearCustomWorkbenchViewData, fileActions, navigationActions]);
// Listens for the window-level 'compare:clear-selected' event and runs
// performClearSelected when it fires. The listener is re-registered whenever
// performClearSelected changes identity and removed on cleanup.
useEffect(() => {
  // A zero-argument function is assignable to an event-listener type, so no
  // type assertion is needed here.
  const handler = (): void => {
    performClearSelected();
  };
  window.addEventListener('compare:clear-selected', handler);
  return () => {
    // Must pass the same function reference that was registered above.
    window.removeEventListener('compare:clear-selected', handler);
  };
}, [performClearSelected]);
useEffect(() => {
registerCustomWorkbenchView({
@ -500,12 +519,7 @@ const Compare = (props: BaseToolProps) => {
variant="filled"
onClick={() => {
setClearConfirmOpen(false);
try { base.operation.cancelOperation(); } catch {console.error('Failed to cancel operation');}
try { base.operation.resetResults(); } catch {console.error('Failed to reset results');}
base.params.setParameters(prev => ({ ...prev, baseFileId: null, comparisonFileId: null }));
try { fileActions.clearSelections(); } catch {console.error('Failed to clear selections');}
clearCustomWorkbenchViewData(CUSTOM_VIEW_ID);
navigationActions.setWorkbench(getDefaultWorkbench());
performClearSelected();
}}
>
{t('compare.clear.confirm', 'Clear and return')}

View File

@ -98,6 +98,7 @@ export interface CompareWorkerWarnings {
complexMessage?: string;
tooLargeMessage?: string;
emptyTextMessage?: string;
tooDissimilarMessage?: string;
}
export interface CompareWorkerRequest {
@ -110,6 +111,14 @@ export interface CompareWorkerRequest {
batchSize?: number;
complexThreshold?: number;
maxWordThreshold?: number;
// Early-stop and runtime controls (optional)
earlyStopEnabled?: boolean;
minJaccardUnigram?: number;
minJaccardBigram?: number;
minTokensForEarlyStop?: number;
sampleLimit?: number;
runtimeMaxProcessedTokens?: number;
runtimeMinUnchangedRatio?: number;
};
};
}
@ -134,7 +143,7 @@ export type CompareWorkerResponse =
| {
type: 'error';
message: string;
code?: 'EMPTY_TEXT' | 'TOO_LARGE';
code?: 'EMPTY_TEXT' | 'TOO_LARGE' | 'TOO_DISSIMILAR';
};
export interface CompareDocumentPaneProps {

View File

@ -12,6 +12,18 @@ const DEFAULT_SETTINGS = {
batchSize: 5000,
complexThreshold: 25000,
maxWordThreshold: 60000,
// Early stop configuration
earlyStopEnabled: true,
// Jaccard thresholds for quick prefilter (unigram/bigram)
minJaccardUnigram: 0.005,
minJaccardBigram: 0.003,
// Only consider early stop when docs are reasonably large
minTokensForEarlyStop: 20000,
// Sampling cap for similarity estimation
sampleLimit: 50000,
// Runtime stop-loss during chunked diff
runtimeMaxProcessedTokens: 150000,
runtimeMinUnchangedRatio: 0.001,
};
const buildMatrix = (words1: string[], words2: string[]) => {
@ -87,7 +99,8 @@ const chunkedDiff = (
words1: string[],
words2: string[],
chunkSize: number,
emit: (tokens: CompareDiffToken[]) => void
emit: (tokens: CompareDiffToken[]) => void,
runtimeStop?: { maxProcessedTokens: number; minUnchangedRatio: number }
) => {
if (words1.length === 0 && words2.length === 0) {
return;
@ -123,6 +136,12 @@ const chunkedDiff = (
let index2 = 0;
let buffer1: string[] = [];
let buffer2: string[] = [];
let totalProcessedBase = 0;
let totalProcessedComp = 0;
let totalUnchanged = 0;
// Counts how many tokens in a diff segment are marked 'unchanged'.
const countUnchanged = (segment: CompareDiffToken[]) => {
  let unchangedCount = 0;
  for (const entry of segment) {
    if (entry.type === 'unchanged') {
      unchangedCount += 1;
    }
  }
  return unchangedCount;
};
const flushRemainder = () => {
if (buffer1.length === 0 && buffer2.length === 0) {
@ -233,6 +252,24 @@ const chunkedDiff = (
buffer1 = window1.slice(baseConsumed);
buffer2 = window2.slice(comparisonConsumed);
// Update runtime counters and early stop if necessary
totalProcessedBase += baseConsumed;
totalProcessedComp += comparisonConsumed;
totalUnchanged += countUnchanged(commitTokens);
if (runtimeStop) {
const processedTotal = totalProcessedBase + totalProcessedComp;
if (processedTotal >= runtimeStop.maxProcessedTokens) {
const unchangedRatio = totalUnchanged / Math.max(1, processedTotal);
if (unchangedRatio < runtimeStop.minUnchangedRatio) {
// Signal early termination for extreme dissimilarity
const err = new Error('EARLY_STOP_TOO_DISSIMILAR');
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(err as any).__earlyStop = true;
throw err;
}
}
}
if (reachedEnd) {
flushRemainder();
@ -264,6 +301,40 @@ const chunkedDiff = (
flushRemainder();
};
// Fast similarity estimation using sampled unigrams and bigrams with Jaccard
/**
 * Collects a strided sample of n-grams from `tokens` into a set.
 *
 * The stride is `ceil(tokens.length / sampleLimit)` (minimum 1), so at most
 * roughly `sampleLimit` start positions are visited. Empty-string tokens are
 * skipped; a bigram is skipped when either of its two tokens is empty.
 *
 * @param tokens - Token sequence to sample from.
 * @param sampleLimit - Approximate cap on the number of sampled positions.
 * @param ngram - 1 for single tokens, 2 for adjacent pairs joined as `a|b`.
 * @returns The set of sampled unigrams or bigram keys (empty for empty input).
 */
const buildSampledSet = (tokens: string[], sampleLimit: number, ngram: 1 | 2): Set<string> => {
  const sampled = new Set<string>();
  const total = tokens.length;
  if (total === 0) return sampled;
  const step = Math.max(1, Math.ceil(total / sampleLimit));
  // Unigrams may start at any index; bigrams need a successor, so stop one short.
  const lastStart = ngram === 1 ? total - 1 : total - 2;
  for (let pos = 0; pos <= lastStart; pos += step) {
    const first = tokens[pos];
    if (ngram === 1) {
      if (first) sampled.add(first);
      continue;
    }
    const second = tokens[pos + 1];
    if (first && second) sampled.add(`${first}|${second}`);
  }
  return sampled;
};
/**
 * Jaccard similarity of two string sets: |A ∩ B| / |A ∪ B|.
 *
 * Two empty sets are treated as identical (1); exactly one empty set gives 0.
 * The intersection is counted by iterating the smaller set and probing the larger.
 *
 * @param a - First set.
 * @param b - Second set.
 * @returns A value in [0, 1].
 */
const jaccard = (a: Set<string>, b: Set<string>): number => {
  const sizeA = a.size;
  const sizeB = b.size;
  if (sizeA === 0 || sizeB === 0) {
    return sizeA === 0 && sizeB === 0 ? 1 : 0;
  }
  const [probe, lookup] = sizeA <= sizeB ? [a, b] : [b, a];
  let shared = 0;
  probe.forEach((item) => {
    if (lookup.has(item)) shared += 1;
  });
  const unionSize = sizeA + sizeB - shared;
  return unionSize > 0 ? shared / unionSize : 0;
};
self.onmessage = (event: MessageEvent<CompareWorkerRequest>) => {
const { data } = event;
if (!data || data.type !== 'compare') {
@ -275,6 +346,13 @@ self.onmessage = (event: MessageEvent<CompareWorkerRequest>) => {
batchSize = DEFAULT_SETTINGS.batchSize,
complexThreshold = DEFAULT_SETTINGS.complexThreshold,
maxWordThreshold = DEFAULT_SETTINGS.maxWordThreshold,
earlyStopEnabled = DEFAULT_SETTINGS.earlyStopEnabled,
minJaccardUnigram = DEFAULT_SETTINGS.minJaccardUnigram,
minJaccardBigram = DEFAULT_SETTINGS.minJaccardBigram,
minTokensForEarlyStop = DEFAULT_SETTINGS.minTokensForEarlyStop,
sampleLimit = DEFAULT_SETTINGS.sampleLimit,
runtimeMaxProcessedTokens = DEFAULT_SETTINGS.runtimeMaxProcessedTokens,
runtimeMinUnchangedRatio = DEFAULT_SETTINGS.runtimeMinUnchangedRatio,
} = settings ?? {};
if (!baseTokens || !comparisonTokens || baseTokens.length === 0 || comparisonTokens.length === 0) {
@ -306,22 +384,61 @@ self.onmessage = (event: MessageEvent<CompareWorkerRequest>) => {
self.postMessage(warningResponse);
}
const start = performance.now();
chunkedDiff(
baseTokens,
comparisonTokens,
batchSize,
(tokens) => {
if (tokens.length === 0) {
return;
}
// Quick prefilter to avoid heavy diff on extremely dissimilar large docs
if (earlyStopEnabled && Math.min(baseTokens.length, comparisonTokens.length) >= minTokensForEarlyStop) {
const set1u = buildSampledSet(baseTokens, sampleLimit, 1);
const set2u = buildSampledSet(comparisonTokens, sampleLimit, 1);
const jUni = jaccard(set1u, set2u);
const set1b = buildSampledSet(baseTokens, sampleLimit, 2);
const set2b = buildSampledSet(comparisonTokens, sampleLimit, 2);
const jBi = jaccard(set1b, set2b);
if (jUni < minJaccardUnigram && jBi < minJaccardBigram) {
const response: CompareWorkerResponse = {
type: 'chunk',
tokens,
type: 'error',
message:
warnings.tooDissimilarMessage ??
'These documents appear highly dissimilar. Comparison was stopped to save time.',
code: 'TOO_DISSIMILAR',
};
self.postMessage(response);
return;
}
);
}
const start = performance.now();
try {
chunkedDiff(
baseTokens,
comparisonTokens,
batchSize,
(tokens) => {
if (tokens.length === 0) {
return;
}
const response: CompareWorkerResponse = {
type: 'chunk',
tokens,
};
self.postMessage(response);
},
{ maxProcessedTokens: runtimeMaxProcessedTokens, minUnchangedRatio: runtimeMinUnchangedRatio }
);
} catch (err) {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const anyErr = err as any;
if (anyErr && (anyErr.__earlyStop || anyErr?.message === 'EARLY_STOP_TOO_DISSIMILAR')) {
const response: CompareWorkerResponse = {
type: 'error',
message:
warnings.tooDissimilarMessage ??
'These documents appear highly dissimilar. Comparison was stopped to save time.',
code: 'TOO_DISSIMILAR',
};
self.postMessage(response);
return;
}
throw err;
}
const durationMs = performance.now() - start;
const response: CompareWorkerResponse = {