mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-12-18 20:04:17 +01:00
fix false positive problem caused by batch size
This commit is contained in:
parent
3afb6f7ac7
commit
fe9268efaf
@ -65,6 +65,10 @@ export const Tooltip: React.FC<TooltipProps> = ({
|
||||
const clickPendingRef = useRef(false);
|
||||
const tooltipIdRef = useRef(`tooltip-${Math.random().toString(36).slice(2)}`);
|
||||
|
||||
// Runtime guard: some browsers may surface non-Node EventTargets for relatedTarget/target
|
||||
const isDomNode = (value: unknown): value is Node =>
|
||||
typeof Node !== 'undefined' && value instanceof Node;
|
||||
|
||||
const clearTimers = useCallback(() => {
|
||||
if (openTimeoutRef.current) {
|
||||
clearTimeout(openTimeoutRef.current);
|
||||
@ -103,9 +107,9 @@ export const Tooltip: React.FC<TooltipProps> = ({
|
||||
(e: MouseEvent) => {
|
||||
const tEl = tooltipRef.current;
|
||||
const trg = triggerRef.current;
|
||||
const target = e.target as Node | null;
|
||||
const insideTooltip = tEl && target && tEl.contains(target);
|
||||
const insideTrigger = trg && target && trg.contains(target);
|
||||
const target = e.target as unknown;
|
||||
const insideTooltip = Boolean(tEl && isDomNode(target) && tEl.contains(target));
|
||||
const insideTrigger = Boolean(trg && isDomNode(target) && trg.contains(target));
|
||||
|
||||
// If pinned: only close when clicking outside BOTH tooltip & trigger
|
||||
if (isPinned) {
|
||||
@ -172,7 +176,7 @@ export const Tooltip: React.FC<TooltipProps> = ({
|
||||
const related = e.relatedTarget as Node | null;
|
||||
|
||||
// Moving into the tooltip → keep open
|
||||
if (related && tooltipRef.current && tooltipRef.current.contains(related)) {
|
||||
if (isDomNode(related) && tooltipRef.current && tooltipRef.current.contains(related)) {
|
||||
(children.props as any)?.onPointerLeave?.(e);
|
||||
return;
|
||||
}
|
||||
@ -236,7 +240,7 @@ export const Tooltip: React.FC<TooltipProps> = ({
|
||||
const handleBlur = useCallback(
|
||||
(e: React.FocusEvent) => {
|
||||
const related = e.relatedTarget as Node | null;
|
||||
if (related && tooltipRef.current && tooltipRef.current.contains(related)) {
|
||||
if (isDomNode(related) && tooltipRef.current && tooltipRef.current.contains(related)) {
|
||||
(children.props as any)?.onBlur?.(e);
|
||||
return;
|
||||
}
|
||||
@ -258,7 +262,7 @@ export const Tooltip: React.FC<TooltipProps> = ({
|
||||
const handleTooltipPointerLeave = useCallback(
|
||||
(e: React.PointerEvent) => {
|
||||
const related = e.relatedTarget as Node | null;
|
||||
if (related && triggerRef.current && triggerRef.current.contains(related)) return;
|
||||
if (isDomNode(related) && triggerRef.current && triggerRef.current.contains(related)) return;
|
||||
if (!isPinned) setOpen(false);
|
||||
},
|
||||
[isPinned, setOpen]
|
||||
|
||||
@ -45,7 +45,8 @@ const buildWordChanges = (
|
||||
if (token.type === targetType) {
|
||||
const parts: string[] = [];
|
||||
const runIndices: number[] = [];
|
||||
const pageNumber = metadata[metadataIndex]?.page ?? 1;
|
||||
// We'll compute the page number from the first token in the run that has a bbox
|
||||
let firstPageWithBox: number | null = null;
|
||||
while (i < tokens.length && tokens[i].type === targetType) {
|
||||
const t = tokens[i].text;
|
||||
const isPara = t === PARAGRAPH_SENTINEL || t.startsWith('\uE000') || t.includes('PARA');
|
||||
@ -53,8 +54,15 @@ const buildWordChanges = (
|
||||
if (!isPara) {
|
||||
parts.push(t);
|
||||
// Only add to grouping if there is a corresponding metadata index
|
||||
if (metadata[metadataIndex]) {
|
||||
runIndices.push(metadataIndex);
|
||||
// AND there is a bounding box to anchor highlights to
|
||||
const meta = metadata[metadataIndex];
|
||||
if (meta) {
|
||||
if (meta.bbox) {
|
||||
runIndices.push(metadataIndex);
|
||||
if (firstPageWithBox == null && typeof meta.page === 'number') {
|
||||
firstPageWithBox = meta.page;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
metadataIndex += 1;
|
||||
@ -67,6 +75,7 @@ const buildWordChanges = (
|
||||
const endIndexForId = runIndices[runIndices.length - 1];
|
||||
const groupId = `${groupPrefix}-${startIndexForId}-${endIndexForId}`;
|
||||
runIndices.forEach((idx) => tokenIndexToGroupId.set(idx, groupId));
|
||||
const pageNumber = firstPageWithBox ?? (metadata[startIndexForId]?.page ?? 1);
|
||||
items.push({ value: groupId, label, pageNumber });
|
||||
}
|
||||
continue;
|
||||
|
||||
@ -27,12 +27,6 @@ export interface CompareOperationHook extends ToolOperationHook<CompareParameter
|
||||
warnings: string[];
|
||||
}
|
||||
|
||||
const DEFAULT_WORKER_SETTINGS = {
|
||||
batchSize: 6000,
|
||||
complexThreshold: 120000,
|
||||
maxWordThreshold: 200000,
|
||||
};
|
||||
|
||||
// extractContentFromPdf moved to utils
|
||||
|
||||
export const useCompareOperation = (): CompareOperationHook => {
|
||||
@ -142,7 +136,12 @@ export const useCompareOperation = (): CompareOperationHook => {
|
||||
baseTokens,
|
||||
comparisonTokens,
|
||||
warnings: warningMessages,
|
||||
settings: DEFAULT_WORKER_SETTINGS,
|
||||
// Static worker settings to support large documents
|
||||
settings: {
|
||||
batchSize: 5000,
|
||||
complexThreshold: 120000,
|
||||
maxWordThreshold: 200000,
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
|
||||
@ -68,6 +68,21 @@ const diff = (words1: string[], words2: string[]): CompareDiffToken[] => {
|
||||
return backtrack(matrix, words1, words2);
|
||||
};
|
||||
|
||||
const countBaseTokens = (segment: CompareDiffToken[]) =>
|
||||
segment.reduce((acc, token) => acc + (token.type !== 'added' ? 1 : 0), 0);
|
||||
|
||||
const countComparisonTokens = (segment: CompareDiffToken[]) =>
|
||||
segment.reduce((acc, token) => acc + (token.type !== 'removed' ? 1 : 0), 0);
|
||||
|
||||
const findLastUnchangedIndex = (segment: CompareDiffToken[]) => {
|
||||
for (let i = segment.length - 1; i >= 0; i -= 1) {
|
||||
if (segment[i].type === 'unchanged') {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
};
|
||||
|
||||
const chunkedDiff = (
|
||||
words1: string[],
|
||||
words2: string[],
|
||||
@ -78,42 +93,133 @@ const chunkedDiff = (
|
||||
}
|
||||
|
||||
const tokens: CompareDiffToken[] = [];
|
||||
let start1 = 0;
|
||||
let start2 = 0;
|
||||
const overlap = Math.max(0, Math.min(500, Math.floor(chunkSize * 0.1)));
|
||||
const maxWindow = Math.max(chunkSize * 6, chunkSize + 512);
|
||||
const minCommit = Math.max(1, Math.floor(chunkSize * 0.1));
|
||||
|
||||
// Advance by the actual number of tokens consumed per chunk to maintain alignment
|
||||
while (start1 < words1.length || start2 < words2.length) {
|
||||
const end1 = Math.min(start1 + chunkSize, words1.length);
|
||||
const end2 = Math.min(start2 + chunkSize, words2.length);
|
||||
const slice1 = words1.slice(start1, end1);
|
||||
const slice2 = words2.slice(start2, end2);
|
||||
let index1 = 0;
|
||||
let index2 = 0;
|
||||
let buffer1: string[] = [];
|
||||
let buffer2: string[] = [];
|
||||
|
||||
const chunkTokens = diff(slice1, slice2);
|
||||
tokens.push(...chunkTokens);
|
||||
const flushRemainder = () => {
|
||||
if (buffer1.length === 0 && buffer2.length === 0) {
|
||||
return;
|
||||
}
|
||||
const finalTokens = diff(buffer1, buffer2);
|
||||
tokens.push(...finalTokens);
|
||||
buffer1 = [];
|
||||
buffer2 = [];
|
||||
index1 = words1.length;
|
||||
index2 = words2.length;
|
||||
};
|
||||
|
||||
// Count how many tokens from each side were consumed in this chunk
|
||||
let consumed1 = 0;
|
||||
let consumed2 = 0;
|
||||
for (const t of chunkTokens) {
|
||||
if (t.type === 'unchanged') { consumed1 += 1; consumed2 += 1; }
|
||||
else if (t.type === 'removed') { consumed1 += 1; }
|
||||
else if (t.type === 'added') { consumed2 += 1; }
|
||||
while (
|
||||
index1 < words1.length ||
|
||||
index2 < words2.length ||
|
||||
buffer1.length > 0 ||
|
||||
buffer2.length > 0
|
||||
) {
|
||||
const remaining1 = Math.max(0, words1.length - index1);
|
||||
const remaining2 = Math.max(0, words2.length - index2);
|
||||
|
||||
let windowSize = Math.max(chunkSize, buffer1.length, buffer2.length);
|
||||
let window1: string[] = [];
|
||||
let window2: string[] = [];
|
||||
let chunkTokens: CompareDiffToken[] = [];
|
||||
let reachedEnd = false;
|
||||
|
||||
while (true) {
|
||||
const take1 = Math.min(Math.max(0, windowSize - buffer1.length), remaining1);
|
||||
const take2 = Math.min(Math.max(0, windowSize - buffer2.length), remaining2);
|
||||
|
||||
const slice1 = take1 > 0 ? words1.slice(index1, index1 + take1) : [];
|
||||
const slice2 = take2 > 0 ? words2.slice(index2, index2 + take2) : [];
|
||||
|
||||
window1 = buffer1.length > 0 ? [...buffer1, ...slice1] : slice1;
|
||||
window2 = buffer2.length > 0 ? [...buffer2, ...slice2] : slice2;
|
||||
|
||||
if (window1.length === 0 && window2.length === 0) {
|
||||
flushRemainder();
|
||||
return tokens;
|
||||
}
|
||||
|
||||
chunkTokens = diff(window1, window2);
|
||||
const lastStableIndex = findLastUnchangedIndex(chunkTokens);
|
||||
|
||||
reachedEnd =
|
||||
index1 + take1 >= words1.length &&
|
||||
index2 + take2 >= words2.length;
|
||||
|
||||
const windowTooLarge =
|
||||
window1.length >= maxWindow ||
|
||||
window2.length >= maxWindow;
|
||||
|
||||
if (lastStableIndex >= 0 || reachedEnd || windowTooLarge) {
|
||||
break;
|
||||
}
|
||||
|
||||
const canGrow1 = take1 < remaining1;
|
||||
const canGrow2 = take2 < remaining2;
|
||||
|
||||
if (!canGrow1 && !canGrow2) {
|
||||
break;
|
||||
}
|
||||
|
||||
windowSize = Math.min(
|
||||
maxWindow,
|
||||
windowSize + Math.max(64, Math.floor(chunkSize * 0.5))
|
||||
);
|
||||
}
|
||||
|
||||
// Fallback to ensure forward progress
|
||||
if (consumed1 === 0 && consumed2 === 0) {
|
||||
consumed1 = Math.min(chunkSize, words1.length - start1);
|
||||
consumed2 = Math.min(chunkSize, words2.length - start2);
|
||||
if (chunkTokens.length === 0) {
|
||||
if (reachedEnd) {
|
||||
flushRemainder();
|
||||
return tokens;
|
||||
}
|
||||
windowSize = Math.min(windowSize + Math.max(64, Math.floor(chunkSize * 0.5)), maxWindow);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Advance with overlap to allow re-synchronization across chunk boundaries
|
||||
const nextStart1 = Math.min(words1.length, Math.max(start1 + consumed1 - overlap, start1 + 1));
|
||||
const nextStart2 = Math.min(words2.length, Math.max(start2 + consumed2 - overlap, start2 + 1));
|
||||
start1 = nextStart1;
|
||||
start2 = nextStart2;
|
||||
let commitIndex = reachedEnd ? chunkTokens.length - 1 : findLastUnchangedIndex(chunkTokens);
|
||||
if (commitIndex < 0) {
|
||||
commitIndex = reachedEnd
|
||||
? chunkTokens.length - 1
|
||||
: Math.min(chunkTokens.length - 1, minCommit - 1);
|
||||
}
|
||||
|
||||
const commitTokens = commitIndex >= 0 ? chunkTokens.slice(0, commitIndex + 1) : [];
|
||||
const baseConsumed = countBaseTokens(commitTokens);
|
||||
const comparisonConsumed = countComparisonTokens(commitTokens);
|
||||
|
||||
tokens.push(...commitTokens);
|
||||
|
||||
const consumedFromNew1 = Math.max(0, baseConsumed - buffer1.length);
|
||||
const consumedFromNew2 = Math.max(0, comparisonConsumed - buffer2.length);
|
||||
|
||||
index1 += consumedFromNew1;
|
||||
index2 += consumedFromNew2;
|
||||
|
||||
buffer1 = window1.slice(baseConsumed);
|
||||
buffer2 = window2.slice(comparisonConsumed);
|
||||
|
||||
if (reachedEnd) {
|
||||
flushRemainder();
|
||||
break;
|
||||
}
|
||||
|
||||
// Prevent runaway buffers: if we made no progress, forcibly consume one token
|
||||
if (commitTokens.length === 0 && buffer1.length + buffer2.length > 0) {
|
||||
if (buffer1.length > 0 && index1 < words1.length) {
|
||||
buffer1 = buffer1.slice(1);
|
||||
index1 += 1;
|
||||
} else if (buffer2.length > 0 && index2 < words2.length) {
|
||||
buffer2 = buffer2.slice(1);
|
||||
index2 += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
flushRemainder();
|
||||
return tokens;
|
||||
};
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user