fix problems with rendering totally different documents

2026-03-13 02:18:16 +01:00 · 2025-11-06 17:04:04 +00:00
parent ebc8e0e737
commit 0dc376c436
7 changed files with 262 additions and 30 deletions
--- a/frontend/src/core/components/tools/compare/CompareDocumentPane.tsx
+++ b/frontend/src/core/components/tools/compare/CompareDocumentPane.tsx
@@ -203,8 +203,12 @@ const CompareDocumentPane = ({
                      if (!dragRef.current.active || dragRef.current.page !== page.pageNumber) return;
                      const dx = e.clientX - dragRef.current.startX;
                      const dy = e.clientY - dragRef.current.startY;
-                      const maxX = Math.max(0, Math.round(baseWidth * innerScale - containerWidth));
-                      const maxY = Math.max(0, Math.round(baseHeight * innerScale - containerHeight));
+                      // Clamp panning based on the actual rendered content size.
+                      // The inner layer is width/height of the container, then scaled by innerScale.
+                      const contentWidth = Math.max(0, Math.round(containerWidth * innerScale));
+                      const contentHeight = Math.max(0, Math.round(containerHeight * innerScale));
+                      const maxX = Math.max(0, contentWidth - Math.round(containerWidth));
+                      const maxY = Math.max(0, contentHeight - Math.round(containerHeight));
                      const candX = dragRef.current.startPanX - dx;
                      const candY = dragRef.current.startPanY - dy;
                      const next = { x: Math.max(0, Math.min(maxX, candX)), y: Math.max(0, Math.min(maxY, candY)) };
--- a/frontend/src/core/components/tools/compare/compare.ts
+++ b/frontend/src/core/components/tools/compare/compare.ts
@@ -107,9 +107,39 @@ export const computePageLayoutMetrics = (args: {
  const baseWidth = isStackedPortrait ? stackedWidth : Math.round(page.width * fit);
  const baseHeight = isStackedPortrait ? stackedHeight : Math.round(targetHeight);
  const containerMaxW = scrollRefWidth ?? viewportWidth;
-  const containerWidth = Math.min(baseWidth, Math.max(120, containerMaxW));
-  const containerHeight = Math.round(baseHeight * (containerWidth / baseWidth));
-  const innerScale = Math.max(1, zoom);
+
+  // Container-first zooming with a stable baseline:
+  // Treat zoom=1 as "fit to available width" for the page's base size so
+  // the initial render is fully visible and centered (no cropping), regardless
+  // of rotation or pane/container width. When zoom < 1, shrink the container;
+  // when zoom > 1, keep the container at fit width and scale inner content.
+  const MIN_CONTAINER_WIDTH = 120;
+  const minScaleByWidth = MIN_CONTAINER_WIDTH / Math.max(1, baseWidth);
+  const fitScaleByContainer = containerMaxW / Math.max(1, baseWidth);
+  // Effective baseline scale used at zoom=1 (ensures at least the min width)
+  const baselineContainerScale = Math.max(minScaleByWidth, fitScaleByContainer);
+  // Lower bound the zoom so interactions remain stable
+  const desiredZoom = Math.max(0.1, zoom);
+
+  let containerScale: number;
+  let innerScale: number;
+  if (desiredZoom >= 1) {
+    // At or above baseline: keep container at fit width and scale inner content
+    containerScale = baselineContainerScale;
+    innerScale = +Math.max(0.1, desiredZoom).toFixed(4);
+  } else {
+    // Below baseline: shrink container proportionally, do not upscale inner
+    const scaled = baselineContainerScale * desiredZoom;
+    // Never smaller than minimum readable width
+    containerScale = Math.max(minScaleByWidth, scaled);
+    innerScale = 1;
+  }
+
+  const containerWidth = Math.max(
+    MIN_CONTAINER_WIDTH,
+    Math.min(containerMaxW, Math.round(baseWidth * containerScale))
+  );
+  const containerHeight = Math.round(baseHeight * (containerWidth / Math.max(1, baseWidth)));

  return {
    targetHeight,
--- a/frontend/src/core/hooks/tools/compare/operationUtils.ts
+++ b/frontend/src/core/hooks/tools/compare/operationUtils.ts
@@ -259,9 +259,9 @@ export const createSummaryFile = (result: CompareResultData): File => {

 export const clamp = (value: number): number => Math.min(1, Math.max(0, value));

-export const getWorkerErrorCode = (value: unknown): 'EMPTY_TEXT' | 'TOO_LARGE' | undefined => {
+export const getWorkerErrorCode = (value: unknown): 'EMPTY_TEXT' | 'TOO_LARGE' | 'TOO_DISSIMILAR' | undefined => {
  if (typeof value === 'object' && value !== null && 'code' in value) {
-    const potentialCode = (value as { code?: 'EMPTY_TEXT' | 'TOO_LARGE' }).code;
+    const potentialCode = (value as { code?: 'EMPTY_TEXT' | 'TOO_LARGE' | 'TOO_DISSIMILAR' }).code;
    return potentialCode;
  }
  return undefined;
--- a/frontend/src/core/hooks/tools/compare/useCompareOperation.ts
+++ b/frontend/src/core/hooks/tools/compare/useCompareOperation.ts
@@ -36,7 +36,7 @@ export interface CompareOperationHook extends ToolOperationHook<CompareParameter

 export const useCompareOperation = (): CompareOperationHook => {
  const { t } = useTranslation();
-  const { selectors } = useFileContext();
+  const { selectors, actions: fileActions } = useFileContext();
  const workerRef = useRef<Worker | null>(null);
  const previousUrl = useRef<string | null>(null);
  const activeRunIdRef = useRef(0);
@@ -53,6 +53,8 @@ export const useCompareOperation = (): CompareOperationHook => {
  const [result, setResult] = useState<CompareResultData | null>(null);
  const [warnings, setWarnings] = useState<string[]>([]);
  const longRunningToastIdRef = useRef<string | null>(null);
+  const dissimilarityToastIdRef = useRef<string | null>(null);
+  const dissimilarityToastShownRef = useRef<boolean>(false);

  const ensureWorker = useCallback(() => {
    if (!workerRef.current) {
@@ -139,7 +141,7 @@ export const useCompareOperation = (): CompareOperationHook => {
                dismissToast(longRunningToastIdRef.current);
                longRunningToastIdRef.current = null;
              }
-              const error: Error & { code?: 'EMPTY_TEXT' | 'TOO_LARGE' } = new Error(message.message);
+              const error: Error & { code?: 'EMPTY_TEXT' | 'TOO_LARGE' | 'TOO_DISSIMILAR' } = new Error(message.message);
              error.code = message.code;
              reject(error);
              break;
@@ -228,6 +230,10 @@ export const useCompareOperation = (): CompareOperationHook => {
          'compare.no.text.message',
          'One or both of the selected PDFs have no text content. Please choose PDFs with text for comparison.'
        ),
+        tooDissimilarMessage: t(
+          'compare.too.dissimilar.message',
+          'These documents appear highly dissimilar. Comparison was stopped to save time.'
+        ),
      };

      const operationStart = performance.now();
@@ -271,10 +277,57 @@ export const useCompareOperation = (): CompareOperationHook => {
          longRunningToastIdRef.current = toastId || null;
        }

+        // Heuristic: surface an early warning toast when we observe a very high ratio of differences
+        const EARLY_TOAST_MIN_TOKENS = 15000; // wait for some signal before warning
+        const EARLY_TOAST_DIFF_RATIO = 0.8;   // 80% added/removed vs unchanged
+        let observedAddedRemoved = 0;
+        let observedUnchanged = 0;
+
+        const handleEarlyDissimilarity = () => { // Raises the persistent "highly different" warning toast; a no-op while the shown flag or a toast id is already set
+          if (dissimilarityToastShownRef.current || dissimilarityToastIdRef.current) return;
+          const toastId = alert({
+            alertType: 'warning',
+            title: t('compare.earlyDissimilarity.title', 'These PDFs look highly different'),
+            body: t(
+              'compare.earlyDissimilarity.body',
+              "We're seeing very few similarities so far. You can stop the comparison if these aren't related documents."
+            ),
+            location: 'bottom-right' as ToastLocation,
+            isPersistentPopup: true,
+            expandable: false,
+            buttonText: t('compare.earlyDissimilarity.stopButton', 'Stop comparison'),
+            buttonCallback: () => { // Best-effort teardown: every step is attempted even if an earlier one throws, and failures are logged rather than swallowed
+              try { cancelOperation(); } catch (error) { console.error('Failed to cancel operation', error); }
+              try { window.dispatchEvent(new CustomEvent('compare:clear-selected')); } catch (error) { console.error('Failed to dispatch compare:clear-selected', error); }
+              if (dissimilarityToastIdRef.current) {
+                dismissToast(dissimilarityToastIdRef.current);
+                dissimilarityToastIdRef.current = null;
+              }
+            },
+          });
+          dissimilarityToastIdRef.current = toastId || null; // remember the id so the toast can be dismissed later
+          dissimilarityToastShownRef.current = true; // set even when no id came back, so the warning is not raised again
+        };
+
        const { tokens, stats, warnings: workerWarnings } = await runCompareWorker(
          baseFiltered.tokens,
          comparisonFiltered.tokens,
-          warningMessages
+          warningMessages,
+          (chunk) => {
+            // Incremental ratio tracking for early warning
+            for (const tok of chunk) {
+              if (tok.type === 'unchanged') observedUnchanged += 1;
+              else observedAddedRemoved += 1;
+            }
+            const seen = observedAddedRemoved + observedUnchanged;
+            if (
+              !dissimilarityToastShownRef.current &&
+              seen >= EARLY_TOAST_MIN_TOKENS &&
+              observedAddedRemoved / Math.max(1, seen) >= EARLY_TOAST_DIFF_RATIO
+            ) {
+              handleEarlyDissimilarity();
+            }
+          }
        );

        if (cancelledRef.current || activeRunIdRef.current !== runId) return;
@@ -409,6 +462,11 @@ export const useCompareOperation = (): CompareOperationHook => {
          dismissToast(longRunningToastIdRef.current);
          longRunningToastIdRef.current = null;
        }
+        if (dissimilarityToastIdRef.current) {
+          dismissToast(dissimilarityToastIdRef.current);
+          dissimilarityToastIdRef.current = null;
+        }
+        dissimilarityToastShownRef.current = false;
      }
    },
    [cleanupDownloadUrl, runCompareWorker, selectors, t]
--- a/frontend/src/core/tools/Compare.tsx
+++ b/frontend/src/core/tools/Compare.tsx
@@ -53,6 +53,25 @@ const Compare = (props: BaseToolProps) => {
  const compareIcon = useMemo(() => <CompareRoundedIcon fontSize="small" />, []);
  const [swapConfirmOpen, setSwapConfirmOpen] = useState(false);
  const [clearConfirmOpen, setClearConfirmOpen] = useState(false);
+  const performClearSelected = useCallback(() => { // Cancels and resets the compare operation, clears both selected file ids, then switches to the default workbench
+    try { base.operation.cancelOperation(); } catch (error) { console.error('Failed to cancel operation', error); }
+    try { base.operation.resetResults(); } catch (error) { console.error('Failed to reset results', error); }
+    base.params.setParameters(prev => ({ ...prev, baseFileId: null, comparisonFileId: null })); // keep every other parameter as is
+    try { fileActions.clearSelections(); } catch (error) { console.error('Failed to clear selections', error); }
+    clearCustomWorkbenchViewData(CUSTOM_VIEW_ID);
+    navigationActions.setWorkbench(getDefaultWorkbench());
+  }, [base.operation, base.params, clearCustomWorkbenchViewData, fileActions, navigationActions]);
+
+  useEffect(() => { // Runs performClearSelected whenever a 'compare:clear-selected' event is dispatched on window
+    const handler = () => {
+      performClearSelected();
+    };
+    window.addEventListener('compare:clear-selected', handler); // a zero-argument function is already a valid listener, so no double assertion is needed
+    return () => {
+      window.removeEventListener('compare:clear-selected', handler); // same function reference as registered above
+    };
+  }, [performClearSelected]);
+

  useEffect(() => {
    registerCustomWorkbenchView({
@@ -500,12 +519,7 @@ const Compare = (props: BaseToolProps) => {
                    variant="filled"
                    onClick={() => {
                      setClearConfirmOpen(false);
-                      try { base.operation.cancelOperation(); } catch {console.error('Failed to cancel operation');}
-                      try { base.operation.resetResults(); } catch {console.error('Failed to reset results');}
-                      base.params.setParameters(prev => ({ ...prev, baseFileId: null, comparisonFileId: null }));
-                      try { fileActions.clearSelections(); } catch {console.error('Failed to clear selections');}
-                      clearCustomWorkbenchViewData(CUSTOM_VIEW_ID);
-                      navigationActions.setWorkbench(getDefaultWorkbench());
+                      performClearSelected();
                    }}
                  >
                    {t('compare.clear.confirm', 'Clear and return')}
--- a/frontend/src/core/types/compare.ts
+++ b/frontend/src/core/types/compare.ts
@@ -98,6 +98,7 @@ export interface CompareWorkerWarnings {
  complexMessage?: string;
  tooLargeMessage?: string;
  emptyTextMessage?: string;
+  tooDissimilarMessage?: string;
 }

 export interface CompareWorkerRequest {
@@ -110,6 +111,14 @@ export interface CompareWorkerRequest {
      batchSize?: number;
      complexThreshold?: number;
      maxWordThreshold?: number;
+      // Early-stop and runtime controls (optional)
+      earlyStopEnabled?: boolean;
+      minJaccardUnigram?: number;
+      minJaccardBigram?: number;
+      minTokensForEarlyStop?: number;
+      sampleLimit?: number;
+      runtimeMaxProcessedTokens?: number;
+      runtimeMinUnchangedRatio?: number;
    };
  };
 }
@@ -134,7 +143,7 @@ export type CompareWorkerResponse =
  | {
      type: 'error';
      message: string;
-      code?: 'EMPTY_TEXT' | 'TOO_LARGE';
+      code?: 'EMPTY_TEXT' | 'TOO_LARGE' | 'TOO_DISSIMILAR';
    };

 export interface CompareDocumentPaneProps {
--- a/frontend/src/workers/compareWorker.ts
+++ b/frontend/src/workers/compareWorker.ts
@@ -12,6 +12,18 @@ const DEFAULT_SETTINGS = {
  batchSize: 5000,
  complexThreshold: 25000,
  maxWordThreshold: 60000,
+  // Early stop configuration
+  earlyStopEnabled: true,
+  // Jaccard thresholds for quick prefilter (unigram/bigram)
+  minJaccardUnigram: 0.005,
+  minJaccardBigram: 0.003,
+  // Only consider early stop when docs are reasonably large
+  minTokensForEarlyStop: 20000,
+  // Sampling cap for similarity estimation
+  sampleLimit: 50000,
+  // Runtime stop-loss during chunked diff
+  runtimeMaxProcessedTokens: 150000,
+  runtimeMinUnchangedRatio: 0.001,
 };

 const buildMatrix = (words1: string[], words2: string[]) => {
@@ -87,7 +99,8 @@ const chunkedDiff = (
  words1: string[],
  words2: string[],
  chunkSize: number,
-  emit: (tokens: CompareDiffToken[]) => void
+  emit: (tokens: CompareDiffToken[]) => void,
+  runtimeStop?: { maxProcessedTokens: number; minUnchangedRatio: number }
 ) => {
  if (words1.length === 0 && words2.length === 0) {
    return;
@@ -123,6 +136,12 @@ const chunkedDiff = (
  let index2 = 0;
  let buffer1: string[] = [];
  let buffer2: string[] = [];
+  let totalProcessedBase = 0;
+  let totalProcessedComp = 0;
+  let totalUnchanged = 0;
+
+  const countUnchanged = (segment: CompareDiffToken[]) => // number of tokens in `segment` whose type is 'unchanged'
+    segment.filter(({ type }) => type === 'unchanged').length;

  const flushRemainder = () => {
    if (buffer1.length === 0 && buffer2.length === 0) {
@@ -233,6 +252,24 @@ const chunkedDiff = (

    buffer1 = window1.slice(baseConsumed);
    buffer2 = window2.slice(comparisonConsumed);
+    // Update runtime counters and early stop if necessary
+    totalProcessedBase += baseConsumed;
+    totalProcessedComp += comparisonConsumed;
+    totalUnchanged += countUnchanged(commitTokens);
+
+    if (runtimeStop) {
+      const processedTotal = totalProcessedBase + totalProcessedComp;
+      if (processedTotal >= runtimeStop.maxProcessedTokens) {
+        const unchangedRatio = totalUnchanged / Math.max(1, processedTotal);
+        if (unchangedRatio < runtimeStop.minUnchangedRatio) {
+          // Signal early termination for extreme dissimilarity
+          const err = new Error('EARLY_STOP_TOO_DISSIMILAR');
+          // eslint-disable-next-line @typescript-eslint/no-explicit-any
+          (err as any).__earlyStop = true;
+          throw err;
+        }
+      }
+    }

    if (reachedEnd) {
      flushRemainder();
@@ -264,6 +301,40 @@ const chunkedDiff = (
  flushRemainder();
 };

+// Collects the distinct unigrams (ngram = 1) or adjacent-pair bigrams (ngram = 2) found at every `stride`-th position of `tokens`; returns an empty set for empty input.
+const buildSampledSet = (tokens: string[], sampleLimit: number, ngram: 1 | 2): Set<string> => {
+  const result = new Set<string>();
+  if (tokens.length === 0) return result;
+  const stride = Math.max(1, Math.ceil(tokens.length / sampleLimit)); // for a positive sampleLimit this visits at most sampleLimit positions
+  if (ngram === 1) {
+    for (let i = 0; i < tokens.length; i += stride) {
+      const t = tokens[i];
+      if (t) result.add(t); // empty tokens are skipped
+    }
+    return result;
+  }
+  // ngram === 2: pair each sampled token with its immediate successor
+  for (let i = 0; i + 1 < tokens.length; i += stride) {
+    const a = tokens[i];
+    const b = tokens[i + 1];
+    if (a && b) result.add(`${a.length}:${a}${b}`); // length prefix keeps the key unambiguous even when a token itself contains a separator such as '|'
+  }
+  return result;
+};
+
+const jaccard = (a: Set<string>, b: Set<string>): number => { // Jaccard index: size of the intersection divided by size of the union
+  const sizeA = a.size;
+  const sizeB = b.size;
+  if (sizeA === 0 || sizeB === 0) return sizeA === sizeB ? 1 : 0; // two empty sets count as identical; a single empty set shares nothing
+  const [probe, target] = sizeA <= sizeB ? [a, b] : [b, a]; // walk the smaller set and look each member up in the larger one
+  let shared = 0;
+  probe.forEach((member) => {
+    if (target.has(member)) shared += 1;
+  });
+  const unionSize = sizeA + sizeB - shared;
+  return unionSize > 0 ? shared / unionSize : 0;
+};
+
 self.onmessage = (event: MessageEvent<CompareWorkerRequest>) => {
  const { data } = event;
  if (!data || data.type !== 'compare') {
@@ -275,6 +346,13 @@ self.onmessage = (event: MessageEvent<CompareWorkerRequest>) => {
    batchSize = DEFAULT_SETTINGS.batchSize,
    complexThreshold = DEFAULT_SETTINGS.complexThreshold,
    maxWordThreshold = DEFAULT_SETTINGS.maxWordThreshold,
+    earlyStopEnabled = DEFAULT_SETTINGS.earlyStopEnabled,
+    minJaccardUnigram = DEFAULT_SETTINGS.minJaccardUnigram,
+    minJaccardBigram = DEFAULT_SETTINGS.minJaccardBigram,
+    minTokensForEarlyStop = DEFAULT_SETTINGS.minTokensForEarlyStop,
+    sampleLimit = DEFAULT_SETTINGS.sampleLimit,
+    runtimeMaxProcessedTokens = DEFAULT_SETTINGS.runtimeMaxProcessedTokens,
+    runtimeMinUnchangedRatio = DEFAULT_SETTINGS.runtimeMinUnchangedRatio,
  } = settings ?? {};

  if (!baseTokens || !comparisonTokens || baseTokens.length === 0 || comparisonTokens.length === 0) {
@@ -306,22 +384,61 @@ self.onmessage = (event: MessageEvent<CompareWorkerRequest>) => {
    self.postMessage(warningResponse);
  }

-  const start = performance.now();
-  chunkedDiff(
-    baseTokens,
-    comparisonTokens,
-    batchSize,
-    (tokens) => {
-      if (tokens.length === 0) {
-        return;
-      }
+  // Quick prefilter to avoid heavy diff on extremely dissimilar large docs
+  if (earlyStopEnabled && Math.min(baseTokens.length, comparisonTokens.length) >= minTokensForEarlyStop) {
+    const set1u = buildSampledSet(baseTokens, sampleLimit, 1);
+    const set2u = buildSampledSet(comparisonTokens, sampleLimit, 1);
+    const jUni = jaccard(set1u, set2u);
+    const set1b = buildSampledSet(baseTokens, sampleLimit, 2);
+    const set2b = buildSampledSet(comparisonTokens, sampleLimit, 2);
+    const jBi = jaccard(set1b, set2b);
+    if (jUni < minJaccardUnigram && jBi < minJaccardBigram) {
      const response: CompareWorkerResponse = {
-        type: 'chunk',
-        tokens,
+        type: 'error',
+        message:
+          warnings.tooDissimilarMessage ??
+          'These documents appear highly dissimilar. Comparison was stopped to save time.',
+        code: 'TOO_DISSIMILAR',
      };
      self.postMessage(response);
+      return;
    }
-  );
+  }
+
+  const start = performance.now();
+  try {
+    chunkedDiff(
+      baseTokens,
+      comparisonTokens,
+      batchSize,
+      (tokens) => {
+        if (tokens.length === 0) {
+          return;
+        }
+        const response: CompareWorkerResponse = {
+          type: 'chunk',
+          tokens,
+        };
+        self.postMessage(response);
+      },
+      { maxProcessedTokens: runtimeMaxProcessedTokens, minUnchangedRatio: runtimeMinUnchangedRatio }
+    );
+  } catch (err) {
+    // eslint-disable-next-line @typescript-eslint/no-explicit-any
+    const anyErr = err as any;
+    if (anyErr && (anyErr.__earlyStop || anyErr?.message === 'EARLY_STOP_TOO_DISSIMILAR')) {
+      const response: CompareWorkerResponse = {
+        type: 'error',
+        message:
+          warnings.tooDissimilarMessage ??
+          'These documents appear highly dissimilar. Comparison was stopped to save time.',
+        code: 'TOO_DISSIMILAR',
+      };
+      self.postMessage(response);
+      return;
+    }
+    throw err;
+  }
  const durationMs = performance.now() - start;

  const response: CompareWorkerResponse = {