mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2026-02-17 13:52:14 +01:00
fix problems with rendering totally different documents
This commit is contained in:
parent
ebc8e0e737
commit
0dc376c436
@ -203,8 +203,12 @@ const CompareDocumentPane = ({
|
||||
if (!dragRef.current.active || dragRef.current.page !== page.pageNumber) return;
|
||||
const dx = e.clientX - dragRef.current.startX;
|
||||
const dy = e.clientY - dragRef.current.startY;
|
||||
const maxX = Math.max(0, Math.round(baseWidth * innerScale - containerWidth));
|
||||
const maxY = Math.max(0, Math.round(baseHeight * innerScale - containerHeight));
|
||||
// Clamp panning based on the actual rendered content size.
|
||||
// The inner layer is width/height of the container, then scaled by innerScale.
|
||||
const contentWidth = Math.max(0, Math.round(containerWidth * innerScale));
|
||||
const contentHeight = Math.max(0, Math.round(containerHeight * innerScale));
|
||||
const maxX = Math.max(0, contentWidth - Math.round(containerWidth));
|
||||
const maxY = Math.max(0, contentHeight - Math.round(containerHeight));
|
||||
const candX = dragRef.current.startPanX - dx;
|
||||
const candY = dragRef.current.startPanY - dy;
|
||||
const next = { x: Math.max(0, Math.min(maxX, candX)), y: Math.max(0, Math.min(maxY, candY)) };
|
||||
|
||||
@ -107,9 +107,39 @@ export const computePageLayoutMetrics = (args: {
|
||||
const baseWidth = isStackedPortrait ? stackedWidth : Math.round(page.width * fit);
|
||||
const baseHeight = isStackedPortrait ? stackedHeight : Math.round(targetHeight);
|
||||
const containerMaxW = scrollRefWidth ?? viewportWidth;
|
||||
const containerWidth = Math.min(baseWidth, Math.max(120, containerMaxW));
|
||||
const containerHeight = Math.round(baseHeight * (containerWidth / baseWidth));
|
||||
const innerScale = Math.max(1, zoom);
|
||||
|
||||
// Container-first zooming with a stable baseline:
|
||||
// Treat zoom=1 as "fit to available width" for the page's base size so
|
||||
// the initial render is fully visible and centered (no cropping), regardless
|
||||
// of rotation or pane/container width. When zoom < 1, shrink the container;
|
||||
// when zoom > 1, keep the container at fit width and scale inner content.
|
||||
const MIN_CONTAINER_WIDTH = 120;
|
||||
const minScaleByWidth = MIN_CONTAINER_WIDTH / Math.max(1, baseWidth);
|
||||
const fitScaleByContainer = containerMaxW / Math.max(1, baseWidth);
|
||||
// Effective baseline scale used at zoom=1 (ensures at least the min width)
|
||||
const baselineContainerScale = Math.max(minScaleByWidth, fitScaleByContainer);
|
||||
// Lower bound the zoom so interactions remain stable
|
||||
const desiredZoom = Math.max(0.1, zoom);
|
||||
|
||||
let containerScale: number;
|
||||
let innerScale: number;
|
||||
if (desiredZoom >= 1) {
|
||||
// At or above baseline: keep container at fit width and scale inner content
|
||||
containerScale = baselineContainerScale;
|
||||
innerScale = +Math.max(0.1, desiredZoom).toFixed(4);
|
||||
} else {
|
||||
// Below baseline: shrink container proportionally, do not upscale inner
|
||||
const scaled = baselineContainerScale * desiredZoom;
|
||||
// Never smaller than minimum readable width
|
||||
containerScale = Math.max(minScaleByWidth, scaled);
|
||||
innerScale = 1;
|
||||
}
|
||||
|
||||
const containerWidth = Math.max(
|
||||
MIN_CONTAINER_WIDTH,
|
||||
Math.min(containerMaxW, Math.round(baseWidth * containerScale))
|
||||
);
|
||||
const containerHeight = Math.round(baseHeight * (containerWidth / Math.max(1, baseWidth)));
|
||||
|
||||
return {
|
||||
targetHeight,
|
||||
|
||||
@ -259,9 +259,9 @@ export const createSummaryFile = (result: CompareResultData): File => {
|
||||
|
||||
/** Restricts `value` to the closed interval [0, 1]. */
export const clamp = (value: number): number => {
  const atLeastZero = Math.max(0, value);
  return Math.min(1, atLeastZero);
};
|
||||
|
||||
export const getWorkerErrorCode = (value: unknown): 'EMPTY_TEXT' | 'TOO_LARGE' | undefined => {
|
||||
export const getWorkerErrorCode = (value: unknown): 'EMPTY_TEXT' | 'TOO_LARGE' | 'TOO_DISSIMILAR' | undefined => {
|
||||
if (typeof value === 'object' && value !== null && 'code' in value) {
|
||||
const potentialCode = (value as { code?: 'EMPTY_TEXT' | 'TOO_LARGE' }).code;
|
||||
const potentialCode = (value as { code?: 'EMPTY_TEXT' | 'TOO_LARGE' | 'TOO_DISSIMILAR' }).code;
|
||||
return potentialCode;
|
||||
}
|
||||
return undefined;
|
||||
|
||||
@ -36,7 +36,7 @@ export interface CompareOperationHook extends ToolOperationHook<CompareParameter
|
||||
|
||||
export const useCompareOperation = (): CompareOperationHook => {
|
||||
const { t } = useTranslation();
|
||||
const { selectors } = useFileContext();
|
||||
const { selectors, actions: fileActions } = useFileContext();
|
||||
const workerRef = useRef<Worker | null>(null);
|
||||
const previousUrl = useRef<string | null>(null);
|
||||
const activeRunIdRef = useRef(0);
|
||||
@ -53,6 +53,8 @@ export const useCompareOperation = (): CompareOperationHook => {
|
||||
const [result, setResult] = useState<CompareResultData | null>(null);
|
||||
const [warnings, setWarnings] = useState<string[]>([]);
|
||||
const longRunningToastIdRef = useRef<string | null>(null);
|
||||
const dissimilarityToastIdRef = useRef<string | null>(null);
|
||||
const dissimilarityToastShownRef = useRef<boolean>(false);
|
||||
|
||||
const ensureWorker = useCallback(() => {
|
||||
if (!workerRef.current) {
|
||||
@ -139,7 +141,7 @@ export const useCompareOperation = (): CompareOperationHook => {
|
||||
dismissToast(longRunningToastIdRef.current);
|
||||
longRunningToastIdRef.current = null;
|
||||
}
|
||||
const error: Error & { code?: 'EMPTY_TEXT' | 'TOO_LARGE' } = new Error(message.message);
|
||||
const error: Error & { code?: 'EMPTY_TEXT' | 'TOO_LARGE' | 'TOO_DISSIMILAR' } = new Error(message.message);
|
||||
error.code = message.code;
|
||||
reject(error);
|
||||
break;
|
||||
@ -228,6 +230,10 @@ export const useCompareOperation = (): CompareOperationHook => {
|
||||
'compare.no.text.message',
|
||||
'One or both of the selected PDFs have no text content. Please choose PDFs with text for comparison.'
|
||||
),
|
||||
tooDissimilarMessage: t(
|
||||
'compare.too.dissimilar.message',
|
||||
'These documents appear highly dissimilar. Comparison was stopped to save time.'
|
||||
),
|
||||
};
|
||||
|
||||
const operationStart = performance.now();
|
||||
@ -271,10 +277,57 @@ export const useCompareOperation = (): CompareOperationHook => {
|
||||
longRunningToastIdRef.current = toastId || null;
|
||||
}
|
||||
|
||||
// Heuristic: surface an early warning toast when we observe a very high ratio of differences
|
||||
const EARLY_TOAST_MIN_TOKENS = 15000; // wait for some signal before warning
|
||||
const EARLY_TOAST_DIFF_RATIO = 0.8; // 80% added/removed vs unchanged
|
||||
let observedAddedRemoved = 0;
|
||||
let observedUnchanged = 0;
|
||||
|
||||
// Raises a persistent warning toast offering to stop the comparison.
// The toast is raised at most once while the "shown" flag stays set.
const handleEarlyDissimilarity = () => {
  // Skip if the warning was already raised or a toast id is still being tracked.
  if (dissimilarityToastShownRef.current || dissimilarityToastIdRef.current) return;

  const toastId = alert({
    alertType: 'warning',
    title: t('compare.earlyDissimilarity.title', 'These PDFs look highly different'),
    body: t(
      'compare.earlyDissimilarity.body',
      "We're seeing very few similarities so far. You can stop the comparison if these aren't related documents."
    ),
    location: 'bottom-right' as ToastLocation,
    isPersistentPopup: true,
    expandable: false,
    buttonText: t('compare.earlyDissimilarity.stopButton', 'Stop comparison'),
    buttonCallback: () => {
      // Both steps are best-effort: a failure in one must not prevent the
      // other (or the toast dismissal) from running, but it is logged
      // instead of being swallowed silently.
      try {
        cancelOperation();
      } catch (error) {
        console.warn('Failed to cancel comparison from dissimilarity toast', error);
      }
      try {
        window.dispatchEvent(new CustomEvent('compare:clear-selected'));
      } catch (error) {
        console.warn('Failed to dispatch compare:clear-selected', error);
      }
      if (dissimilarityToastIdRef.current) {
        dismissToast(dissimilarityToastIdRef.current);
        dissimilarityToastIdRef.current = null;
      }
    },
  });

  // alert() may return a falsy id; normalise to null but still record that
  // the warning has been shown so it is not raised again.
  dissimilarityToastIdRef.current = toastId || null;
  dissimilarityToastShownRef.current = true;
};
|
||||
|
||||
const { tokens, stats, warnings: workerWarnings } = await runCompareWorker(
|
||||
baseFiltered.tokens,
|
||||
comparisonFiltered.tokens,
|
||||
warningMessages
|
||||
warningMessages,
|
||||
(chunk) => {
|
||||
// Incremental ratio tracking for early warning
|
||||
for (const tok of chunk) {
|
||||
if (tok.type === 'unchanged') observedUnchanged += 1;
|
||||
else observedAddedRemoved += 1;
|
||||
}
|
||||
const seen = observedAddedRemoved + observedUnchanged;
|
||||
if (
|
||||
!dissimilarityToastShownRef.current &&
|
||||
seen >= EARLY_TOAST_MIN_TOKENS &&
|
||||
observedAddedRemoved / Math.max(1, seen) >= EARLY_TOAST_DIFF_RATIO
|
||||
) {
|
||||
handleEarlyDissimilarity();
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
if (cancelledRef.current || activeRunIdRef.current !== runId) return;
|
||||
@ -409,6 +462,11 @@ export const useCompareOperation = (): CompareOperationHook => {
|
||||
dismissToast(longRunningToastIdRef.current);
|
||||
longRunningToastIdRef.current = null;
|
||||
}
|
||||
if (dissimilarityToastIdRef.current) {
|
||||
dismissToast(dissimilarityToastIdRef.current);
|
||||
dissimilarityToastIdRef.current = null;
|
||||
}
|
||||
dissimilarityToastShownRef.current = false;
|
||||
}
|
||||
},
|
||||
[cleanupDownloadUrl, runCompareWorker, selectors, t]
|
||||
|
||||
@ -53,6 +53,25 @@ const Compare = (props: BaseToolProps) => {
|
||||
const compareIcon = useMemo(() => <CompareRoundedIcon fontSize="small" />, []);
|
||||
const [swapConfirmOpen, setSwapConfirmOpen] = useState(false);
|
||||
const [clearConfirmOpen, setClearConfirmOpen] = useState(false);
|
||||
// Cancels the running operation, resets its results, clears both selected
// file ids and the file selection, then drops the custom workbench view data
// and switches back to the default workbench.
const performClearSelected = useCallback(() => {
  // Each guarded step is best-effort so a failure in one does not block the
  // rest; the caught error is logged alongside the message for diagnosis.
  try {
    base.operation.cancelOperation();
  } catch (error) {
    console.error('Failed to cancel operation', error);
  }
  try {
    base.operation.resetResults();
  } catch (error) {
    console.error('Failed to reset results', error);
  }
  base.params.setParameters(prev => ({ ...prev, baseFileId: null, comparisonFileId: null }));
  try {
    fileActions.clearSelections();
  } catch (error) {
    console.error('Failed to clear selections', error);
  }
  clearCustomWorkbenchViewData(CUSTOM_VIEW_ID);
  navigationActions.setWorkbench(getDefaultWorkbench());
}, [base.operation, base.params, clearCustomWorkbenchViewData, fileActions, navigationActions]);
|
||||
|
||||
// Runs the clear-selection flow whenever a window-level
// 'compare:clear-selected' event is dispatched. The listener is re-registered
// when performClearSelected changes and removed on cleanup.
useEffect(() => {
  // A zero-argument handler is assignable to the DOM listener type, so no
  // cast is needed.
  const handleClearRequest = (): void => {
    performClearSelected();
  };
  window.addEventListener('compare:clear-selected', handleClearRequest);
  return () => {
    window.removeEventListener('compare:clear-selected', handleClearRequest);
  };
}, [performClearSelected]);
|
||||
|
||||
|
||||
useEffect(() => {
|
||||
registerCustomWorkbenchView({
|
||||
@ -500,12 +519,7 @@ const Compare = (props: BaseToolProps) => {
|
||||
variant="filled"
|
||||
onClick={() => {
|
||||
setClearConfirmOpen(false);
|
||||
try { base.operation.cancelOperation(); } catch {console.error('Failed to cancel operation');}
|
||||
try { base.operation.resetResults(); } catch {console.error('Failed to reset results');}
|
||||
base.params.setParameters(prev => ({ ...prev, baseFileId: null, comparisonFileId: null }));
|
||||
try { fileActions.clearSelections(); } catch {console.error('Failed to clear selections');}
|
||||
clearCustomWorkbenchViewData(CUSTOM_VIEW_ID);
|
||||
navigationActions.setWorkbench(getDefaultWorkbench());
|
||||
performClearSelected();
|
||||
}}
|
||||
>
|
||||
{t('compare.clear.confirm', 'Clear and return')}
|
||||
|
||||
@ -98,6 +98,7 @@ export interface CompareWorkerWarnings {
|
||||
complexMessage?: string;
|
||||
tooLargeMessage?: string;
|
||||
emptyTextMessage?: string;
|
||||
tooDissimilarMessage?: string;
|
||||
}
|
||||
|
||||
export interface CompareWorkerRequest {
|
||||
@ -110,6 +111,14 @@ export interface CompareWorkerRequest {
|
||||
batchSize?: number;
|
||||
complexThreshold?: number;
|
||||
maxWordThreshold?: number;
|
||||
// Early-stop and runtime controls (optional)
|
||||
earlyStopEnabled?: boolean;
|
||||
minJaccardUnigram?: number;
|
||||
minJaccardBigram?: number;
|
||||
minTokensForEarlyStop?: number;
|
||||
sampleLimit?: number;
|
||||
runtimeMaxProcessedTokens?: number;
|
||||
runtimeMinUnchangedRatio?: number;
|
||||
};
|
||||
};
|
||||
}
|
||||
@ -134,7 +143,7 @@ export type CompareWorkerResponse =
|
||||
| {
|
||||
type: 'error';
|
||||
message: string;
|
||||
code?: 'EMPTY_TEXT' | 'TOO_LARGE';
|
||||
code?: 'EMPTY_TEXT' | 'TOO_LARGE' | 'TOO_DISSIMILAR';
|
||||
};
|
||||
|
||||
export interface CompareDocumentPaneProps {
|
||||
|
||||
@ -12,6 +12,18 @@ const DEFAULT_SETTINGS = {
|
||||
batchSize: 5000,
|
||||
complexThreshold: 25000,
|
||||
maxWordThreshold: 60000,
|
||||
// Early stop configuration
|
||||
earlyStopEnabled: true,
|
||||
// Jaccard thresholds for quick prefilter (unigram/bigram)
|
||||
minJaccardUnigram: 0.005,
|
||||
minJaccardBigram: 0.003,
|
||||
// Only consider early stop when docs are reasonably large
|
||||
minTokensForEarlyStop: 20000,
|
||||
// Sampling cap for similarity estimation
|
||||
sampleLimit: 50000,
|
||||
// Runtime stop-loss during chunked diff
|
||||
runtimeMaxProcessedTokens: 150000,
|
||||
runtimeMinUnchangedRatio: 0.001,
|
||||
};
|
||||
|
||||
const buildMatrix = (words1: string[], words2: string[]) => {
|
||||
@ -87,7 +99,8 @@ const chunkedDiff = (
|
||||
words1: string[],
|
||||
words2: string[],
|
||||
chunkSize: number,
|
||||
emit: (tokens: CompareDiffToken[]) => void
|
||||
emit: (tokens: CompareDiffToken[]) => void,
|
||||
runtimeStop?: { maxProcessedTokens: number; minUnchangedRatio: number }
|
||||
) => {
|
||||
if (words1.length === 0 && words2.length === 0) {
|
||||
return;
|
||||
@ -123,6 +136,12 @@ const chunkedDiff = (
|
||||
let index2 = 0;
|
||||
let buffer1: string[] = [];
|
||||
let buffer2: string[] = [];
|
||||
let totalProcessedBase = 0;
|
||||
let totalProcessedComp = 0;
|
||||
let totalUnchanged = 0;
|
||||
|
||||
const countUnchanged = (segment: CompareDiffToken[]) =>
|
||||
segment.reduce((acc, token) => acc + (token.type === 'unchanged' ? 1 : 0), 0);
|
||||
|
||||
const flushRemainder = () => {
|
||||
if (buffer1.length === 0 && buffer2.length === 0) {
|
||||
@ -233,6 +252,24 @@ const chunkedDiff = (
|
||||
|
||||
buffer1 = window1.slice(baseConsumed);
|
||||
buffer2 = window2.slice(comparisonConsumed);
|
||||
// Update runtime counters and early stop if necessary
|
||||
totalProcessedBase += baseConsumed;
|
||||
totalProcessedComp += comparisonConsumed;
|
||||
totalUnchanged += countUnchanged(commitTokens);
|
||||
|
||||
if (runtimeStop) {
|
||||
const processedTotal = totalProcessedBase + totalProcessedComp;
|
||||
if (processedTotal >= runtimeStop.maxProcessedTokens) {
|
||||
const unchangedRatio = totalUnchanged / Math.max(1, processedTotal);
|
||||
if (unchangedRatio < runtimeStop.minUnchangedRatio) {
|
||||
// Signal early termination for extreme dissimilarity
|
||||
const err = new Error('EARLY_STOP_TOO_DISSIMILAR');
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
(err as any).__earlyStop = true;
|
||||
throw err;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (reachedEnd) {
|
||||
flushRemainder();
|
||||
@ -264,6 +301,40 @@ const chunkedDiff = (
|
||||
flushRemainder();
|
||||
};
|
||||
|
||||
// Fast similarity estimation using sampled unigrams and bigrams with Jaccard
|
||||
/**
 * Collects the distinct n-grams found at evenly strided positions of `tokens`.
 *
 * The stride is `ceil(tokens.length / sampleLimit)` (never below 1), so for a
 * positive `sampleLimit` at most `sampleLimit` positions are visited, and
 * every position is visited when the input is no longer than the limit.
 *
 * @param tokens - Token sequence to sample from.
 * @param sampleLimit - Cap on the number of positions visited.
 * @param ngram - `1` to collect single tokens, `2` to collect adjacent pairs.
 * @returns Set of sampled n-gram keys. Empty tokens, and pairs containing an
 *   empty token, are skipped. Bigram keys are opaque and only meaningful for
 *   comparison against other sets built by this function.
 */
const buildSampledSet = (tokens: string[], sampleLimit: number, ngram: 1 | 2): Set<string> => {
  const sampled = new Set<string>();
  if (tokens.length === 0) return sampled;

  const stride = Math.max(1, Math.ceil(tokens.length / sampleLimit));

  if (ngram === 1) {
    for (let i = 0; i < tokens.length; i += stride) {
      const token = tokens[i];
      if (token) sampled.add(token);
    }
    return sampled;
  }

  // ngram === 2: pair each sampled position with its immediate successor.
  for (let i = 0; i + 1 < tokens.length; i += stride) {
    const first = tokens[i];
    const second = tokens[i + 1];
    if (first && second) {
      // Prefix the first token's length so the key decodes unambiguously.
      // A plain separator would make ['a|b', 'c'] and ['a', 'b|c'] collide
      // and overstate the similarity of the two documents.
      sampled.add(`${first.length}:${first}${second}`);
    }
  }
  return sampled;
};
|
||||
|
||||
/**
 * Jaccard similarity of two string sets: |A ∩ B| / |A ∪ B|.
 *
 * @param a - First set.
 * @param b - Second set.
 * @returns A value in [0, 1]. Two empty sets are treated as identical (1);
 *   exactly one empty set yields 0.
 */
const jaccard = (a: Set<string>, b: Set<string>): number => {
  if (a.size === 0 && b.size === 0) return 1;
  if (a.size === 0 || b.size === 0) return 0;

  // Walk the smaller set and probe the larger one so the loop does the
  // minimum number of lookups.
  const [smaller, larger] = a.size <= b.size ? [a, b] : [b, a];
  let shared = 0;
  for (const value of smaller) {
    if (larger.has(value)) shared += 1;
  }

  // Both sets are non-empty here, so the union size is at least 1 and the
  // division is always defined.
  const unionSize = a.size + b.size - shared;
  return shared / unionSize;
};
|
||||
|
||||
self.onmessage = (event: MessageEvent<CompareWorkerRequest>) => {
|
||||
const { data } = event;
|
||||
if (!data || data.type !== 'compare') {
|
||||
@ -275,6 +346,13 @@ self.onmessage = (event: MessageEvent<CompareWorkerRequest>) => {
|
||||
batchSize = DEFAULT_SETTINGS.batchSize,
|
||||
complexThreshold = DEFAULT_SETTINGS.complexThreshold,
|
||||
maxWordThreshold = DEFAULT_SETTINGS.maxWordThreshold,
|
||||
earlyStopEnabled = DEFAULT_SETTINGS.earlyStopEnabled,
|
||||
minJaccardUnigram = DEFAULT_SETTINGS.minJaccardUnigram,
|
||||
minJaccardBigram = DEFAULT_SETTINGS.minJaccardBigram,
|
||||
minTokensForEarlyStop = DEFAULT_SETTINGS.minTokensForEarlyStop,
|
||||
sampleLimit = DEFAULT_SETTINGS.sampleLimit,
|
||||
runtimeMaxProcessedTokens = DEFAULT_SETTINGS.runtimeMaxProcessedTokens,
|
||||
runtimeMinUnchangedRatio = DEFAULT_SETTINGS.runtimeMinUnchangedRatio,
|
||||
} = settings ?? {};
|
||||
|
||||
if (!baseTokens || !comparisonTokens || baseTokens.length === 0 || comparisonTokens.length === 0) {
|
||||
@ -306,22 +384,61 @@ self.onmessage = (event: MessageEvent<CompareWorkerRequest>) => {
|
||||
self.postMessage(warningResponse);
|
||||
}
|
||||
|
||||
const start = performance.now();
|
||||
chunkedDiff(
|
||||
baseTokens,
|
||||
comparisonTokens,
|
||||
batchSize,
|
||||
(tokens) => {
|
||||
if (tokens.length === 0) {
|
||||
return;
|
||||
}
|
||||
// Quick prefilter to avoid heavy diff on extremely dissimilar large docs
|
||||
if (earlyStopEnabled && Math.min(baseTokens.length, comparisonTokens.length) >= minTokensForEarlyStop) {
|
||||
const set1u = buildSampledSet(baseTokens, sampleLimit, 1);
|
||||
const set2u = buildSampledSet(comparisonTokens, sampleLimit, 1);
|
||||
const jUni = jaccard(set1u, set2u);
|
||||
const set1b = buildSampledSet(baseTokens, sampleLimit, 2);
|
||||
const set2b = buildSampledSet(comparisonTokens, sampleLimit, 2);
|
||||
const jBi = jaccard(set1b, set2b);
|
||||
if (jUni < minJaccardUnigram && jBi < minJaccardBigram) {
|
||||
const response: CompareWorkerResponse = {
|
||||
type: 'chunk',
|
||||
tokens,
|
||||
type: 'error',
|
||||
message:
|
||||
warnings.tooDissimilarMessage ??
|
||||
'These documents appear highly dissimilar. Comparison was stopped to save time.',
|
||||
code: 'TOO_DISSIMILAR',
|
||||
};
|
||||
self.postMessage(response);
|
||||
return;
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
const start = performance.now();
|
||||
try {
|
||||
chunkedDiff(
|
||||
baseTokens,
|
||||
comparisonTokens,
|
||||
batchSize,
|
||||
(tokens) => {
|
||||
if (tokens.length === 0) {
|
||||
return;
|
||||
}
|
||||
const response: CompareWorkerResponse = {
|
||||
type: 'chunk',
|
||||
tokens,
|
||||
};
|
||||
self.postMessage(response);
|
||||
},
|
||||
{ maxProcessedTokens: runtimeMaxProcessedTokens, minUnchangedRatio: runtimeMinUnchangedRatio }
|
||||
);
|
||||
} catch (err) {
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
const anyErr = err as any;
|
||||
if (anyErr && (anyErr.__earlyStop || anyErr?.message === 'EARLY_STOP_TOO_DISSIMILAR')) {
|
||||
const response: CompareWorkerResponse = {
|
||||
type: 'error',
|
||||
message:
|
||||
warnings.tooDissimilarMessage ??
|
||||
'These documents appear highly dissimilar. Comparison was stopped to save time.',
|
||||
code: 'TOO_DISSIMILAR',
|
||||
};
|
||||
self.postMessage(response);
|
||||
return;
|
||||
}
|
||||
throw err;
|
||||
}
|
||||
const durationMs = performance.now() - start;
|
||||
|
||||
const response: CompareWorkerResponse = {
|
||||
|
||||
Loading…
Reference in New Issue
Block a user