fix false-positive diff results caused by batch size

EthanHealy01 2025-10-28 02:10:38 +00:00
parent 3afb6f7ac7
commit fe9268efaf
4 changed files with 161 additions and 43 deletions
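
For context, a simplified illustration (not part of the commit) of the false-positive mode being fixed: cutting both token streams at a fixed batch size lets an insertion in one document shift the alignment, so identical words land in different chunks and get reported as removed and re-added.

// Hypothetical example; the words and chunk size are made up to show the failure mode.
const base = ['a', 'b', 'c', 'd', 'e', 'f'];
const revised = ['X', 'a', 'b', 'c', 'd', 'e', 'f']; // one word inserted at the front
// Naive fixed chunks of 3 pair ['a','b','c'] with ['X','a','b'], ['d','e','f'] with ['c','d','e'],
// and [] with ['f'], so 'c' and 'f' each show up as removed and re-added even though both arrays
// contain them. The chunked diff rewrite below commits each window only up to its last unchanged
// token so later windows can re-synchronize across the boundary.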

View File

@ -65,6 +65,10 @@ export const Tooltip: React.FC<TooltipProps> = ({
const clickPendingRef = useRef(false);
const tooltipIdRef = useRef(`tooltip-${Math.random().toString(36).slice(2)}`);
// Runtime guard: some browsers may surface non-Node EventTargets for relatedTarget/target
const isDomNode = (value: unknown): value is Node =>
typeof Node !== 'undefined' && value instanceof Node;
const clearTimers = useCallback(() => {
if (openTimeoutRef.current) {
clearTimeout(openTimeoutRef.current);
@ -103,9 +107,9 @@ export const Tooltip: React.FC<TooltipProps> = ({
(e: MouseEvent) => {
const tEl = tooltipRef.current;
const trg = triggerRef.current;
const target = e.target as Node | null;
const insideTooltip = tEl && target && tEl.contains(target);
const insideTrigger = trg && target && trg.contains(target);
const target = e.target as unknown;
const insideTooltip = Boolean(tEl && isDomNode(target) && tEl.contains(target));
const insideTrigger = Boolean(trg && isDomNode(target) && trg.contains(target));
// If pinned: only close when clicking outside BOTH tooltip & trigger
if (isPinned) {
@ -172,7 +176,7 @@ export const Tooltip: React.FC<TooltipProps> = ({
const related = e.relatedTarget as Node | null;
// Moving into the tooltip → keep open
if (related && tooltipRef.current && tooltipRef.current.contains(related)) {
if (isDomNode(related) && tooltipRef.current && tooltipRef.current.contains(related)) {
(children.props as any)?.onPointerLeave?.(e);
return;
}
@ -236,7 +240,7 @@ export const Tooltip: React.FC<TooltipProps> = ({
const handleBlur = useCallback(
(e: React.FocusEvent) => {
const related = e.relatedTarget as Node | null;
if (related && tooltipRef.current && tooltipRef.current.contains(related)) {
if (isDomNode(related) && tooltipRef.current && tooltipRef.current.contains(related)) {
(children.props as any)?.onBlur?.(e);
return;
}
@ -258,7 +262,7 @@ export const Tooltip: React.FC<TooltipProps> = ({
const handleTooltipPointerLeave = useCallback(
(e: React.PointerEvent) => {
const related = e.relatedTarget as Node | null;
if (related && triggerRef.current && triggerRef.current.contains(related)) return;
if (isDomNode(related) && triggerRef.current && triggerRef.current.contains(related)) return;
if (!isPinned) setOpen(false);
},
[isPinned, setOpen]
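
A minimal standalone sketch of the type-guard pattern introduced above (isDomNode mirrors the diff; containsSafely is a hypothetical helper, not part of the component): Node.prototype.contains expects a Node, so guarding first keeps null and non-Node EventTargets from ever reaching it.

const isDomNode = (value: unknown): value is Node =>
  typeof Node !== 'undefined' && value instanceof Node;

// Hypothetical helper: contains() only runs once the candidate is confirmed to be a Node.
const containsSafely = (container: Element | null, candidate: unknown): boolean =>
  Boolean(container && isDomNode(candidate) && container.contains(candidate));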

View File

@ -45,7 +45,8 @@ const buildWordChanges = (
if (token.type === targetType) {
const parts: string[] = [];
const runIndices: number[] = [];
const pageNumber = metadata[metadataIndex]?.page ?? 1;
// We'll compute the page number from the first token in the run that has a bbox
let firstPageWithBox: number | null = null;
while (i < tokens.length && tokens[i].type === targetType) {
const t = tokens[i].text;
const isPara = t === PARAGRAPH_SENTINEL || t.startsWith('\uE000') || t.includes('PARA');
@ -53,8 +54,15 @@ const buildWordChanges = (
if (!isPara) {
parts.push(t);
// Only add to grouping if there is a corresponding metadata index
if (metadata[metadataIndex]) {
runIndices.push(metadataIndex);
// AND there is a bounding box to anchor highlights to
const meta = metadata[metadataIndex];
if (meta) {
if (meta.bbox) {
runIndices.push(metadataIndex);
if (firstPageWithBox == null && typeof meta.page === 'number') {
firstPageWithBox = meta.page;
}
}
}
}
metadataIndex += 1;
@ -67,6 +75,7 @@ const buildWordChanges = (
const endIndexForId = runIndices[runIndices.length - 1];
const groupId = `${groupPrefix}-${startIndexForId}-${endIndexForId}`;
runIndices.forEach((idx) => tokenIndexToGroupId.set(idx, groupId));
const pageNumber = firstPageWithBox ?? (metadata[startIndexForId]?.page ?? 1);
items.push({ value: groupId, label, pageNumber });
}
continue;
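
A simplified sketch of the page-selection rule above, under an assumed metadata shape (the page and bbox field names come from the diff; the interface and helper are illustrative): prefer the page of the first token in the run that carries a bounding box, then fall back to the run's starting entry, then to page 1.

// Illustrative types; the project's real metadata type is not shown in this diff.
interface TokenMeta {
  page?: number;
  bbox?: { x: number; y: number; width: number; height: number };
}

const pageForRun = (metadata: TokenMeta[], runIndices: number[], startIndex: number): number => {
  for (const idx of runIndices) {
    const meta = metadata[idx];
    // runIndices only holds entries that have a bbox, so the first hit anchors the page number
    if (meta?.bbox && typeof meta.page === 'number') return meta.page;
  }
  return metadata[startIndex]?.page ?? 1;
};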

View File

@ -27,12 +27,6 @@ export interface CompareOperationHook extends ToolOperationHook<CompareParameter
warnings: string[];
}
const DEFAULT_WORKER_SETTINGS = {
batchSize: 6000,
complexThreshold: 120000,
maxWordThreshold: 200000,
};
// extractContentFromPdf moved to utils
export const useCompareOperation = (): CompareOperationHook => {
@ -142,7 +136,12 @@ export const useCompareOperation = (): CompareOperationHook => {
baseTokens,
comparisonTokens,
warnings: warningMessages,
settings: DEFAULT_WORKER_SETTINGS,
// Static worker settings to support large documents
settings: {
batchSize: 5000,
complexThreshold: 120000,
maxWordThreshold: 200000,
},
},
};
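
A hedged sketch of the worker payload implied by the hunk above (property names are taken from the diff; the type name, element types, and the comments on each threshold are assumptions):

// Illustrative shape only, not the hook's actual type.
interface CompareWorkerRequest {
  baseTokens: unknown[];        // token element type not shown in this diff
  comparisonTokens: unknown[];
  warnings: string[];
  settings: {
    batchSize: number;          // lowered from 6000 to 5000 by this commit
    complexThreshold: number;   // 120000, unchanged
    maxWordThreshold: number;   // 200000, unchanged
  };
}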

View File

@ -68,6 +68,21 @@ const diff = (words1: string[], words2: string[]): CompareDiffToken[] => {
return backtrack(matrix, words1, words2);
};
const countBaseTokens = (segment: CompareDiffToken[]) =>
segment.reduce((acc, token) => acc + (token.type !== 'added' ? 1 : 0), 0);
const countComparisonTokens = (segment: CompareDiffToken[]) =>
segment.reduce((acc, token) => acc + (token.type !== 'removed' ? 1 : 0), 0);
const findLastUnchangedIndex = (segment: CompareDiffToken[]) => {
for (let i = segment.length - 1; i >= 0; i -= 1) {
if (segment[i].type === 'unchanged') {
return i;
}
}
return -1;
};
const chunkedDiff = (
words1: string[],
words2: string[],
@ -78,42 +93,133 @@ const chunkedDiff = (
}
const tokens: CompareDiffToken[] = [];
let start1 = 0;
let start2 = 0;
const overlap = Math.max(0, Math.min(500, Math.floor(chunkSize * 0.1)));
const maxWindow = Math.max(chunkSize * 6, chunkSize + 512);
const minCommit = Math.max(1, Math.floor(chunkSize * 0.1));
// Advance by the actual number of tokens consumed per chunk to maintain alignment
while (start1 < words1.length || start2 < words2.length) {
const end1 = Math.min(start1 + chunkSize, words1.length);
const end2 = Math.min(start2 + chunkSize, words2.length);
const slice1 = words1.slice(start1, end1);
const slice2 = words2.slice(start2, end2);
let index1 = 0;
let index2 = 0;
let buffer1: string[] = [];
let buffer2: string[] = [];
const chunkTokens = diff(slice1, slice2);
tokens.push(...chunkTokens);
const flushRemainder = () => {
if (buffer1.length === 0 && buffer2.length === 0) {
return;
}
const finalTokens = diff(buffer1, buffer2);
tokens.push(...finalTokens);
buffer1 = [];
buffer2 = [];
index1 = words1.length;
index2 = words2.length;
};
// Count how many tokens from each side were consumed in this chunk
let consumed1 = 0;
let consumed2 = 0;
for (const t of chunkTokens) {
if (t.type === 'unchanged') { consumed1 += 1; consumed2 += 1; }
else if (t.type === 'removed') { consumed1 += 1; }
else if (t.type === 'added') { consumed2 += 1; }
while (
index1 < words1.length ||
index2 < words2.length ||
buffer1.length > 0 ||
buffer2.length > 0
) {
const remaining1 = Math.max(0, words1.length - index1);
const remaining2 = Math.max(0, words2.length - index2);
let windowSize = Math.max(chunkSize, buffer1.length, buffer2.length);
let window1: string[] = [];
let window2: string[] = [];
let chunkTokens: CompareDiffToken[] = [];
let reachedEnd = false;
while (true) {
const take1 = Math.min(Math.max(0, windowSize - buffer1.length), remaining1);
const take2 = Math.min(Math.max(0, windowSize - buffer2.length), remaining2);
const slice1 = take1 > 0 ? words1.slice(index1, index1 + take1) : [];
const slice2 = take2 > 0 ? words2.slice(index2, index2 + take2) : [];
window1 = buffer1.length > 0 ? [...buffer1, ...slice1] : slice1;
window2 = buffer2.length > 0 ? [...buffer2, ...slice2] : slice2;
if (window1.length === 0 && window2.length === 0) {
flushRemainder();
return tokens;
}
chunkTokens = diff(window1, window2);
const lastStableIndex = findLastUnchangedIndex(chunkTokens);
reachedEnd =
index1 + take1 >= words1.length &&
index2 + take2 >= words2.length;
const windowTooLarge =
window1.length >= maxWindow ||
window2.length >= maxWindow;
if (lastStableIndex >= 0 || reachedEnd || windowTooLarge) {
break;
}
const canGrow1 = take1 < remaining1;
const canGrow2 = take2 < remaining2;
if (!canGrow1 && !canGrow2) {
break;
}
windowSize = Math.min(
maxWindow,
windowSize + Math.max(64, Math.floor(chunkSize * 0.5))
);
}
// Fallback to ensure forward progress
if (consumed1 === 0 && consumed2 === 0) {
consumed1 = Math.min(chunkSize, words1.length - start1);
consumed2 = Math.min(chunkSize, words2.length - start2);
if (chunkTokens.length === 0) {
if (reachedEnd) {
flushRemainder();
return tokens;
}
windowSize = Math.min(windowSize + Math.max(64, Math.floor(chunkSize * 0.5)), maxWindow);
continue;
}
// Advance with overlap to allow re-synchronization across chunk boundaries
const nextStart1 = Math.min(words1.length, Math.max(start1 + consumed1 - overlap, start1 + 1));
const nextStart2 = Math.min(words2.length, Math.max(start2 + consumed2 - overlap, start2 + 1));
start1 = nextStart1;
start2 = nextStart2;
let commitIndex = reachedEnd ? chunkTokens.length - 1 : findLastUnchangedIndex(chunkTokens);
if (commitIndex < 0) {
commitIndex = reachedEnd
? chunkTokens.length - 1
: Math.min(chunkTokens.length - 1, minCommit - 1);
}
const commitTokens = commitIndex >= 0 ? chunkTokens.slice(0, commitIndex + 1) : [];
const baseConsumed = countBaseTokens(commitTokens);
const comparisonConsumed = countComparisonTokens(commitTokens);
tokens.push(...commitTokens);
const consumedFromNew1 = Math.max(0, baseConsumed - buffer1.length);
const consumedFromNew2 = Math.max(0, comparisonConsumed - buffer2.length);
index1 += consumedFromNew1;
index2 += consumedFromNew2;
buffer1 = window1.slice(baseConsumed);
buffer2 = window2.slice(comparisonConsumed);
if (reachedEnd) {
flushRemainder();
break;
}
// Prevent runaway buffers: if we made no progress, forcibly consume one token
if (commitTokens.length === 0 && buffer1.length + buffer2.length > 0) {
if (buffer1.length > 0 && index1 < words1.length) {
buffer1 = buffer1.slice(1);
index1 += 1;
} else if (buffer2.length > 0 && index2 < words2.length) {
buffer2 = buffer2.slice(1);
index2 += 1;
}
}
}
flushRemainder();
return tokens;
};
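
A minimal standalone sketch (not the project's code) of the commit rule at the heart of the rewrite: after diffing a window, only the prefix ending at the last unchanged token is emitted, and the tail is carried into the next window, so a change that straddles a chunk boundary is re-diffed instead of being reported twice.

// Sketch under assumed names: splitAtLastUnchanged is hypothetical and the token type is simplified.
type SketchToken = { type: 'unchanged' | 'added' | 'removed'; text: string };

const splitAtLastUnchanged = (windowTokens: SketchToken[]) => {
  let last = -1;
  for (let i = windowTokens.length - 1; i >= 0; i -= 1) {
    if (windowTokens[i].type === 'unchanged') { last = i; break; }
  }
  // Nothing stable yet: commit nothing and let the caller grow the window or flush at the end.
  return last < 0
    ? { commit: [] as SketchToken[], carry: windowTokens }
    : { commit: windowTokens.slice(0, last + 1), carry: windowTokens.slice(last + 1) };
};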