diff --git a/frontend/src/components/tools/compare/compareView.css b/frontend/src/components/tools/compare/compareView.css
index 322602697..39563767b 100644
--- a/frontend/src/components/tools/compare/compareView.css
+++ b/frontend/src/components/tools/compare/compareView.css
@@ -122,6 +122,17 @@
font-weight: 500 !important;
}
+/* Wider dropdown menu for long block text.
+ * The minimum is written as min(520px, 70vw) so it can never exceed the
+ * 70vw cap: with a fixed 520px minimum, min-width overrides max-width on
+ * narrow viewports and the menu overflows the screen. */
+.compare-changes-dropdown {
+  min-width: min(520px, 70vw) !important;
+  max-width: 70vw !important;
+}
+
+/* Cap option text at the width of its container so long labels can use the
+ * full width of the dropdown without exceeding it */
+.compare-dropdown-option__text {
+ max-width: 100%;
+}
+
/* Style the dropdown container */
.compare-changes-select .mantine-Combobox-dropdown {
border: 1px solid var(--mantine-color-gray-3) !important;
diff --git a/frontend/src/hooks/tools/compare/operationUtils.ts b/frontend/src/hooks/tools/compare/operationUtils.ts
index cc0aae4fe..a9c9f3a8b 100644
--- a/frontend/src/hooks/tools/compare/operationUtils.ts
+++ b/frontend/src/hooks/tools/compare/operationUtils.ts
@@ -1,5 +1,6 @@
import { pdfWorkerManager } from '../../../services/pdfWorkerManager';
import { appendWord as sharedAppendWord } from '../../../utils/textDiff';
+import { PARAGRAPH_SENTINEL } from '../../../types/compare';
import type { StirlingFile } from '../../../types/fileContext';
import type { PDFPageProxy, TextContent, TextItem } from 'pdfjs-dist/types/src/display/api';
import type {
@@ -266,6 +267,27 @@ export const getWorkerErrorCode = (value: unknown): 'EMPTY_TEXT' | 'TOO_LARGE' |
return undefined;
};
+/**
+ * Builds a view of `tokens`/`metadata` with paragraph sentinel markers removed,
+ * so that paragraph breaks are not fed to the diff as content.
+ *
+ * A token counts as a sentinel when it equals `PARAGRAPH_SENTINEL` or starts
+ * with the private-use code point U+E000. Sentinels are never detected by
+ * substring: a test such as `includes('PARA')` would silently drop real
+ * document words like "PARAGRAPH" or "SEPARATE" from the comparison.
+ *
+ * @param tokens - Extracted tokens, possibly interleaved with sentinels.
+ * @param metadata - Per-token metadata, read at the same index as `tokens`.
+ * @returns The kept tokens, the metadata found for kept tokens, and
+ *   `filteredToOriginal`, whose entry `k` is the index in `tokens` of the
+ *   `k`-th kept token.
+ */
+export const filterTokensForDiff = (
+  tokens: string[],
+  metadata: TokenMetadata[],
+): { tokens: string[]; metadata: TokenMetadata[]; filteredToOriginal: number[] } => {
+  const outTokens: string[] = [];
+  const outMeta: TokenMetadata[] = [];
+  const filteredToOriginal: number[] = [];
+  tokens.forEach((token, index) => {
+    const isSentinel = token === PARAGRAPH_SENTINEL || token.startsWith('\uE000');
+    if (isSentinel) return;
+    outTokens.push(token);
+    // NOTE(review): a kept token without a metadata entry is not padded, so the
+    // returned metadata only lines up with the returned tokens when the inputs
+    // are fully parallel - confirm against the extractor.
+    const meta = metadata[index];
+    if (meta) outMeta.push(meta);
+    filteredToOriginal.push(index);
+  });
+  return { tokens: outTokens, metadata: outMeta, filteredToOriginal };
+};
+
export const extractContentFromPdf = async (file: StirlingFile): Promise => {
const arrayBuffer = await file.arrayBuffer();
const pdfDoc = await pdfWorkerManager.createDocument(arrayBuffer, {
@@ -299,16 +321,23 @@ export const extractContentFromPdf = async (file: StirlingFile): Promise {
+ // Decides whether `curr` starts a new paragraph relative to `prev`.
+ // Returns true when `curr` carries a truthy `hasEOL`, false when there is no
+ // previous item, and otherwise true only when the vertical distance between
+ // the two items exceeds 1.8x an estimated line height.
+ const isParagraphBreak = (curr: TextItem, prev: TextItem | null) => {
const hasHardBreak = 'hasEOL' in curr && (curr as TextItem).hasEOL;
if (hasHardBreak) return true;
if (!prev) return false;
const prevY = prev.transform[5];
const currY = curr.transform[5];
- return Math.abs(currY - prevY) > yJumpThreshold;
+ const dy = Math.abs(currY - prevY);
+ // Estimated line height: 90% of the item's height, never below 10 units.
+ const approxLine = Math.max(10, Math.abs(curr.height ?? 0) * 0.9);
+ // No separate soft-wrap test is needed: a wrap within 0.6x the line height
+ // can never also exceed the 1.8x paragraph threshold.
+ return dy > approxLine * 1.8;
};
const adjustBoundingBox = (left: number, top: number, width: number, height: number): TokenBoundingBox | null => {
diff --git a/frontend/src/hooks/tools/compare/useCompareOperation.ts b/frontend/src/hooks/tools/compare/useCompareOperation.ts
index 1deb53da6..d70980da2 100644
--- a/frontend/src/hooks/tools/compare/useCompareOperation.ts
+++ b/frontend/src/hooks/tools/compare/useCompareOperation.ts
@@ -19,6 +19,7 @@ import {
createSummaryFile,
extractContentFromPdf,
getWorkerErrorCode,
+ filterTokensForDiff,
} from './operationUtils';
export interface CompareOperationHook extends ToolOperationHook {
@@ -207,17 +208,21 @@ export const useCompareOperation = (): CompareOperationHook => {
setStatus(t('compare.status.processing', 'Analyzing differences...'));
+ // Filter out paragraph sentinels before diffing to avoid large false-positive runs
+ const baseFiltered = filterTokensForDiff(baseContent.tokens, baseContent.metadata);
+ const comparisonFiltered = filterTokensForDiff(comparisonContent.tokens, comparisonContent.metadata);
+
const { tokens, stats, warnings: workerWarnings } = await runCompareWorker(
- baseContent.tokens,
- comparisonContent.tokens,
+ baseFiltered.tokens,
+ comparisonFiltered.tokens,
warningMessages
);
const totals = aggregateTotals(tokens);
const processedAt = Date.now();
- const baseMetadata = baseContent.metadata;
- const comparisonMetadata = comparisonContent.metadata;
+ const baseMetadata = baseFiltered.metadata;
+ const comparisonMetadata = comparisonFiltered.metadata;
const changes = buildChanges(tokens, baseMetadata, comparisonMetadata);
diff --git a/frontend/src/workers/compareWorker.ts b/frontend/src/workers/compareWorker.ts
index c9c8d4829..85b02a874 100644
--- a/frontend/src/workers/compareWorker.ts
+++ b/frontend/src/workers/compareWorker.ts
@@ -80,11 +80,14 @@ const chunkedDiff = (
const tokens: CompareDiffToken[] = [];
let start1 = 0;
let start2 = 0;
+ const overlap = Math.max(0, Math.min(500, Math.floor(chunkSize * 0.1)));
// Advance by the actual number of tokens consumed per chunk to maintain alignment
while (start1 < words1.length || start2 < words2.length) {
- const slice1 = words1.slice(start1, Math.min(start1 + chunkSize, words1.length));
- const slice2 = words2.slice(start2, Math.min(start2 + chunkSize, words2.length));
+ const end1 = Math.min(start1 + chunkSize, words1.length);
+ const end2 = Math.min(start2 + chunkSize, words2.length);
+ const slice1 = words1.slice(start1, end1);
+ const slice2 = words2.slice(start2, end2);
const chunkTokens = diff(slice1, slice2);
tokens.push(...chunkTokens);
@@ -93,23 +96,22 @@ const chunkedDiff = (
let consumed1 = 0;
let consumed2 = 0;
for (const t of chunkTokens) {
- if (t.type === 'unchanged') {
- consumed1 += 1; consumed2 += 1;
- } else if (t.type === 'removed') {
- consumed1 += 1;
- } else if (t.type === 'added') {
- consumed2 += 1;
- }
+ if (t.type === 'unchanged') { consumed1 += 1; consumed2 += 1; }
+ else if (t.type === 'removed') { consumed1 += 1; }
+ else if (t.type === 'added') { consumed2 += 1; }
}
- // Fallback to progress by a small step if diff returned nothing (shouldn't happen)
+ // Fallback to ensure forward progress
if (consumed1 === 0 && consumed2 === 0) {
consumed1 = Math.min(chunkSize, words1.length - start1);
consumed2 = Math.min(chunkSize, words2.length - start2);
}
- start1 += consumed1;
- start2 += consumed2;
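+ // Advance with overlap to allow re-synchronization across chunk boundaries.
+ // NOTE(review): each chunk's tokens are pushed in full above, so the words in
+ // the overlapped region are diffed and emitted again by the next chunk. Once a
+ // start index is within `overlap` of the end it also advances only one word
+ // per iteration. Confirm duplicates are removed downstream, or advance by the
+ // consumed count instead.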
+ const nextStart1 = Math.min(words1.length, Math.max(start1 + consumed1 - overlap, start1 + 1));
+ const nextStart2 = Math.min(words2.length, Math.max(start2 + consumed2 - overlap, start2 + 1));
+ start1 = nextStart1;
+ start2 = nextStart2;
}
return tokens;