From fe9268efafd846ad268d2c76be840195c643840f Mon Sep 17 00:00:00 2001 From: EthanHealy01 Date: Tue, 28 Oct 2025 02:10:38 +0000 Subject: [PATCH] fix false positive problem caused by batch size --- frontend/src/components/shared/Tooltip.tsx | 16 +- .../compare/hooks/useCompareHighlights.ts | 15 +- .../tools/compare/useCompareOperation.ts | 13 +- frontend/src/workers/compareWorker.ts | 160 +++++++++++++++--- 4 files changed, 161 insertions(+), 43 deletions(-) diff --git a/frontend/src/components/shared/Tooltip.tsx b/frontend/src/components/shared/Tooltip.tsx index 775cbbee1..ed4a18f17 100644 --- a/frontend/src/components/shared/Tooltip.tsx +++ b/frontend/src/components/shared/Tooltip.tsx @@ -65,6 +65,10 @@ export const Tooltip: React.FC = ({ const clickPendingRef = useRef(false); const tooltipIdRef = useRef(`tooltip-${Math.random().toString(36).slice(2)}`); + // Runtime guard: some browsers may surface non-Node EventTargets for relatedTarget/target + const isDomNode = (value: unknown): value is Node => + typeof Node !== 'undefined' && value instanceof Node; + const clearTimers = useCallback(() => { if (openTimeoutRef.current) { clearTimeout(openTimeoutRef.current); @@ -103,9 +107,9 @@ export const Tooltip: React.FC = ({ (e: MouseEvent) => { const tEl = tooltipRef.current; const trg = triggerRef.current; - const target = e.target as Node | null; - const insideTooltip = tEl && target && tEl.contains(target); - const insideTrigger = trg && target && trg.contains(target); + const target = e.target as unknown; + const insideTooltip = Boolean(tEl && isDomNode(target) && tEl.contains(target)); + const insideTrigger = Boolean(trg && isDomNode(target) && trg.contains(target)); // If pinned: only close when clicking outside BOTH tooltip & trigger if (isPinned) { @@ -172,7 +176,7 @@ export const Tooltip: React.FC = ({ const related = e.relatedTarget as Node | null; // Moving into the tooltip → keep open - if (related && tooltipRef.current && tooltipRef.current.contains(related)) { + if (isDomNode(related) && tooltipRef.current && tooltipRef.current.contains(related)) { (children.props as any)?.onPointerLeave?.(e); return; } @@ -236,7 +240,7 @@ export const Tooltip: React.FC = ({ const handleBlur = useCallback( (e: React.FocusEvent) => { const related = e.relatedTarget as Node | null; - if (related && tooltipRef.current && tooltipRef.current.contains(related)) { + if (isDomNode(related) && tooltipRef.current && tooltipRef.current.contains(related)) { (children.props as any)?.onBlur?.(e); return; } @@ -258,7 +262,7 @@ export const Tooltip: React.FC = ({ const handleTooltipPointerLeave = useCallback( (e: React.PointerEvent) => { const related = e.relatedTarget as Node | null; - if (related && triggerRef.current && triggerRef.current.contains(related)) return; + if (isDomNode(related) && triggerRef.current && triggerRef.current.contains(related)) return; if (!isPinned) setOpen(false); }, [isPinned, setOpen] diff --git a/frontend/src/components/tools/compare/hooks/useCompareHighlights.ts b/frontend/src/components/tools/compare/hooks/useCompareHighlights.ts index c0bf49f08..ccc639a7f 100644 --- a/frontend/src/components/tools/compare/hooks/useCompareHighlights.ts +++ b/frontend/src/components/tools/compare/hooks/useCompareHighlights.ts @@ -45,7 +45,8 @@ const buildWordChanges = ( if (token.type === targetType) { const parts: string[] = []; const runIndices: number[] = []; - const pageNumber = metadata[metadataIndex]?.page ?? 1; + // We'll compute the page number from the first token in the run that has a bbox + let firstPageWithBox: number | null = null; while (i < tokens.length && tokens[i].type === targetType) { const t = tokens[i].text; const isPara = t === PARAGRAPH_SENTINEL || t.startsWith('\uE000') || t.includes('PARA'); @@ -53,8 +54,15 @@ const buildWordChanges = ( if (!isPara) { parts.push(t); // Only add to grouping if there is a corresponding metadata index - if (metadata[metadataIndex]) { - runIndices.push(metadataIndex); + // AND there is a bounding box to anchor highlights to + const meta = metadata[metadataIndex]; + if (meta) { + if (meta.bbox) { + runIndices.push(metadataIndex); + if (firstPageWithBox == null && typeof meta.page === 'number') { + firstPageWithBox = meta.page; + } + } } } metadataIndex += 1; @@ -67,6 +75,7 @@ const buildWordChanges = ( const endIndexForId = runIndices[runIndices.length - 1]; const groupId = `${groupPrefix}-${startIndexForId}-${endIndexForId}`; runIndices.forEach((idx) => tokenIndexToGroupId.set(idx, groupId)); + const pageNumber = firstPageWithBox ?? (metadata[startIndexForId]?.page ?? 1); items.push({ value: groupId, label, pageNumber }); } continue; diff --git a/frontend/src/hooks/tools/compare/useCompareOperation.ts b/frontend/src/hooks/tools/compare/useCompareOperation.ts index d70980da2..257f4e466 100644 --- a/frontend/src/hooks/tools/compare/useCompareOperation.ts +++ b/frontend/src/hooks/tools/compare/useCompareOperation.ts @@ -27,12 +27,6 @@ export interface CompareOperationHook extends ToolOperationHook { @@ -142,7 +136,12 @@ export const useCompareOperation = (): CompareOperationHook => { baseTokens, comparisonTokens, warnings: warningMessages, - settings: DEFAULT_WORKER_SETTINGS, + // Static worker settings to support large documents + settings: { + batchSize: 5000, + complexThreshold: 120000, + maxWordThreshold: 200000, + }, }, }; diff --git a/frontend/src/workers/compareWorker.ts b/frontend/src/workers/compareWorker.ts index 85b02a874..b4abd86e6 100644 --- a/frontend/src/workers/compareWorker.ts +++ b/frontend/src/workers/compareWorker.ts @@ -68,6 +68,21 @@ const diff = (words1: string[], words2: string[]): CompareDiffToken[] => { return backtrack(matrix, words1, words2); }; +const countBaseTokens = (segment: CompareDiffToken[]) => + segment.reduce((acc, token) => acc + (token.type !== 'added' ? 1 : 0), 0); + +const countComparisonTokens = (segment: CompareDiffToken[]) => + segment.reduce((acc, token) => acc + (token.type !== 'removed' ? 1 : 0), 0); + +const findLastUnchangedIndex = (segment: CompareDiffToken[]) => { + for (let i = segment.length - 1; i >= 0; i -= 1) { + if (segment[i].type === 'unchanged') { + return i; + } + } + return -1; +}; + const chunkedDiff = ( words1: string[], words2: string[], @@ -78,42 +93,133 @@ const chunkedDiff = ( } const tokens: CompareDiffToken[] = []; - let start1 = 0; - let start2 = 0; - const overlap = Math.max(0, Math.min(500, Math.floor(chunkSize * 0.1))); + const maxWindow = Math.max(chunkSize * 6, chunkSize + 512); + const minCommit = Math.max(1, Math.floor(chunkSize * 0.1)); - // Advance by the actual number of tokens consumed per chunk to maintain alignment - while (start1 < words1.length || start2 < words2.length) { - const end1 = Math.min(start1 + chunkSize, words1.length); - const end2 = Math.min(start2 + chunkSize, words2.length); - const slice1 = words1.slice(start1, end1); - const slice2 = words2.slice(start2, end2); + let index1 = 0; + let index2 = 0; + let buffer1: string[] = []; + let buffer2: string[] = []; - const chunkTokens = diff(slice1, slice2); - tokens.push(...chunkTokens); + const flushRemainder = () => { + if (buffer1.length === 0 && buffer2.length === 0) { + return; + } + const finalTokens = diff(buffer1, buffer2); + tokens.push(...finalTokens); + buffer1 = []; + buffer2 = []; + index1 = words1.length; + index2 = words2.length; + }; - // Count how many tokens from each side were consumed in this chunk - let consumed1 = 0; - let consumed2 = 0; - for (const t of chunkTokens) { - if (t.type === 'unchanged') { consumed1 += 1; consumed2 += 1; } - else if (t.type === 'removed') { consumed1 += 1; } - else if (t.type === 'added') { consumed2 += 1; } + while ( + index1 < words1.length || + index2 < words2.length || + buffer1.length > 0 || + buffer2.length > 0 + ) { + const remaining1 = Math.max(0, words1.length - index1); + const remaining2 = Math.max(0, words2.length - index2); + + let windowSize = Math.max(chunkSize, buffer1.length, buffer2.length); + let window1: string[] = []; + let window2: string[] = []; + let chunkTokens: CompareDiffToken[] = []; + let reachedEnd = false; + + while (true) { + const take1 = Math.min(Math.max(0, windowSize - buffer1.length), remaining1); + const take2 = Math.min(Math.max(0, windowSize - buffer2.length), remaining2); + + const slice1 = take1 > 0 ? words1.slice(index1, index1 + take1) : []; + const slice2 = take2 > 0 ? words2.slice(index2, index2 + take2) : []; + + window1 = buffer1.length > 0 ? [...buffer1, ...slice1] : slice1; + window2 = buffer2.length > 0 ? [...buffer2, ...slice2] : slice2; + + if (window1.length === 0 && window2.length === 0) { + flushRemainder(); + return tokens; + } + + chunkTokens = diff(window1, window2); + const lastStableIndex = findLastUnchangedIndex(chunkTokens); + + reachedEnd = + index1 + take1 >= words1.length && + index2 + take2 >= words2.length; + + const windowTooLarge = + window1.length >= maxWindow || + window2.length >= maxWindow; + + if (lastStableIndex >= 0 || reachedEnd || windowTooLarge) { + break; + } + + const canGrow1 = take1 < remaining1; + const canGrow2 = take2 < remaining2; + + if (!canGrow1 && !canGrow2) { + break; + } + + windowSize = Math.min( + maxWindow, + windowSize + Math.max(64, Math.floor(chunkSize * 0.5)) + ); } - // Fallback to ensure forward progress - if (consumed1 === 0 && consumed2 === 0) { - consumed1 = Math.min(chunkSize, words1.length - start1); - consumed2 = Math.min(chunkSize, words2.length - start2); + if (chunkTokens.length === 0) { + if (reachedEnd) { + flushRemainder(); + return tokens; + } + windowSize = Math.min(windowSize + Math.max(64, Math.floor(chunkSize * 0.5)), maxWindow); + continue; } - // Advance with overlap to allow re-synchronization across chunk boundaries - const nextStart1 = Math.min(words1.length, Math.max(start1 + consumed1 - overlap, start1 + 1)); - const nextStart2 = Math.min(words2.length, Math.max(start2 + consumed2 - overlap, start2 + 1)); - start1 = nextStart1; - start2 = nextStart2; + let commitIndex = reachedEnd ? chunkTokens.length - 1 : findLastUnchangedIndex(chunkTokens); + if (commitIndex < 0) { + commitIndex = reachedEnd + ? chunkTokens.length - 1 + : Math.min(chunkTokens.length - 1, minCommit - 1); + } + + const commitTokens = commitIndex >= 0 ? chunkTokens.slice(0, commitIndex + 1) : []; + const baseConsumed = countBaseTokens(commitTokens); + const comparisonConsumed = countComparisonTokens(commitTokens); + + tokens.push(...commitTokens); + + const consumedFromNew1 = Math.max(0, baseConsumed - buffer1.length); + const consumedFromNew2 = Math.max(0, comparisonConsumed - buffer2.length); + + index1 += consumedFromNew1; + index2 += consumedFromNew2; + + buffer1 = window1.slice(baseConsumed); + buffer2 = window2.slice(comparisonConsumed); + + if (reachedEnd) { + flushRemainder(); + break; + } + + // Prevent runaway buffers: if we made no progress, forcibly consume one token + if (commitTokens.length === 0 && buffer1.length + buffer2.length > 0) { + if (buffer1.length > 0 && index1 < words1.length) { + buffer1 = buffer1.slice(1); + index1 += 1; + } else if (buffer2.length > 0 && index2 < words2.length) { + buffer2 = buffer2.slice(1); + index2 += 1; + } + } } + flushRemainder(); return tokens; };