Feature/298 improve compare performance (#2124)

* Implement Diff.js

* Compare feature - add service worker and improve efficiency for large files

* Compare - messages updated to be compatible with language packs

* Compare - Acknowledge Diff.js usage

* Add a warning to the messages file for uploaded PDFs that contain no text

---------

Co-authored-by: Anthony Stirling <77850077+Frooodle@users.noreply.github.com>
This commit is contained in:
reecebrowne 2024-10-29 15:56:45 +00:00 committed by GitHub
parent 4922ab700e
commit a9ce0e80ee
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 2497 additions and 224 deletions

View File

@ -79,6 +79,7 @@ info=Info
pro=Pro
page=Page
pages=Pages
loading=Loading...
legal.privacy=Privacy Policy
legal.terms=Terms and Conditions
@ -782,6 +783,9 @@ compare.highlightColor.2=Highlight Color 2:
compare.document.1=Document 1
compare.document.2=Document 2
compare.submit=Compare
compare.complex.message=One or both of the provided documents are large files, accuracy of comparison may be reduced
compare.large.file.message=One or Both of the provided documents are too large to process
compare.no.text.message=One or both of the selected PDFs have no text content. Please choose PDFs with text for comparison.
#BookToPDF
BookToPDF.title=Books and Comics to PDF
@ -1220,5 +1224,3 @@ splitByChapters.desc.2=Bookmark Level: Choose the level of bookmarks to use for
splitByChapters.desc.3=Include Metadata: If checked, the original PDF's metadata will be included in each split PDF.
splitByChapters.desc.4=Allow Duplicates: If checked, allows multiple bookmarks on the same page to create separate PDFs.
splitByChapters.submit=Split PDF

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,145 @@
importScripts('./diff.js');
// Warning texts shown to the user. They are kept at worker scope so that values delivered by
// SET_COMPLEX_MESSAGE / SET_TOO_LARGE_MESSAGE are still available when the comparison request
// arrives in a later message. The English strings are only fallbacks.
let complexMessage = 'One or both of the provided documents are large files, accuracy of comparison may be reduced';
let tooLargeMessage = 'One or Both of the provided documents are too large to process';

/**
 * Worker entry point. Handles three kinds of message:
 *   - { type: 'SET_TOO_LARGE_MESSAGE', message }  stores the localised "too large" warning
 *   - { type: 'SET_COMPLEX_MESSAGE', message }    stores the localised "complex" warning
 *   - { text1, text2, color1, color2 }            runs the comparison
 *
 * Replies with { status: 'error' | 'warning', message } or { status: 'success', differences },
 * where differences is the list of [color, word] pairs produced by diff/staggeredBatchDiff.
 */
self.onmessage = async function (e) {
  const data = e.data || {};

  // Configuration messages carry no text; handle them before touching text1/text2.
  if (data.type === 'SET_TOO_LARGE_MESSAGE') {
    tooLargeMessage = data.message;
    return;
  }
  if (data.type === 'SET_COMPLEX_MESSAGE') {
    complexMessage = data.message;
    return;
  }

  const { text1, text2, color1, color2 } = data;

  // Anything that is not a pair of non-blank strings cannot be compared.
  if (typeof text1 !== 'string' || typeof text2 !== 'string' || text1.trim() === "" || text2.trim() === "") {
    self.postMessage({ status: 'error', message: 'One or both of the texts are empty.' });
    return;
  }

  // Log sizes only: the texts can be whole documents.
  console.log('Received text for comparison:', { text1Length: text1.length, text2Length: text2.length });
  const startTime = performance.now();

  const words1 = text1.split(' ');
  const words2 = text2.split(' ');

  const MAX_WORD_COUNT = 150000;
  const COMPLEX_WORD_COUNT = 50000;
  const BATCH_SIZE = 5000; // Define a suitable batch size for processing
  const OVERLAP_SIZE = 200; // Number of words to overlap - bigger increases accuracy but affects performance

  const isComplex = words1.length > COMPLEX_WORD_COUNT || words2.length > COMPLEX_WORD_COUNT;
  const isTooLarge = words1.length > MAX_WORD_COUNT || words2.length > MAX_WORD_COUNT;

  if (isTooLarge) {
    self.postMessage({
      status: 'warning',
      message: tooLargeMessage,
    });
    return;
  }

  if (isComplex) {
    // Non-terminal warning: the comparison still runs, in batches.
    self.postMessage({
      status: 'warning',
      message: complexMessage,
    });
  }

  // Perform diff operation depending on document size
  const differences = isComplex
    ? await staggeredBatchDiff(words1, words2, color1, color2, BATCH_SIZE, OVERLAP_SIZE)
    : diff(words1, words2, color1, color2);

  console.log(`Diff operation took ${performance.now() - startTime} milliseconds`);
  self.postMessage({ status: 'success', differences });
};
/**
 * Diffs two large word lists window by window so that the LCS table built by diff() stays
 * bounded, instead of being sized by the whole documents.
 *
 * Each window holds up to batchSize + overlapSize words from the current position of each
 * document. Only the leading part of a window's result is committed: entries are emitted until
 * one side has contributed batchSize words. The remaining tail is re-diffed as the start of the
 * next window, so a match is not forced across an arbitrary cut. Each cursor advances by the
 * number of words actually emitted for that side, so every word of both inputs appears in the
 * result exactly once and in order.
 *
 * @param {string[]} words1 - words of the first document
 * @param {string[]} words2 - words of the second document
 * @param {*} color1 - tag for words only present in words1
 * @param {*} color2 - tag for words only present in words2
 * @param {number} batchSize - maximum words per side committed from one window (minimum 1)
 * @param {number} overlapSize - extra look-ahead words per side included in each window
 * @returns {Promise<Array>} list of [tag, word] pairs; the tag is color1, color2, or whatever
 *   diff() uses for unchanged words
 */
async function staggeredBatchDiff(words1, words2, color1, color2, batchSize, overlapSize) {
  // Private side markers handed to diff() in place of the colours, so that the side an entry
  // belongs to can be identified even when color1 and color2 are equal.
  const ONLY_IN_FIRST = 1;
  const ONLY_IN_SECOND = 2;

  const step = Math.max(1, Math.floor(batchSize) || 1);
  const lookahead = Math.max(0, Math.floor(overlapSize) || 0);
  const windowSize = step + lookahead;

  const differences = [];
  let position1 = 0;
  let position2 = 0;

  while (position1 < words1.length || position2 < words2.length) {
    const window1 = words1.slice(position1, position1 + windowSize);
    const window2 = words2.slice(position2, position2 + windowSize);

    // When both windows reach the end of their documents nothing is left to re-diff,
    // so the whole result is committed.
    const isLastWindow =
      position1 + window1.length >= words1.length &&
      position2 + window2.length >= words2.length;

    const entries = diff(window1, window2, ONLY_IN_FIRST, ONLY_IN_SECOND);

    let consumed1 = 0;
    let consumed2 = 0;
    for (const [marker, word] of entries) {
      // step >= 1, so at least one entry is committed per window and the loop always advances.
      if (!isLastWindow && (consumed1 >= step || consumed2 >= step)) {
        break;
      }
      if (marker === ONLY_IN_FIRST) {
        differences.push([color1, word]);
        consumed1++;
      } else if (marker === ONLY_IN_SECOND) {
        differences.push([color2, word]);
        consumed2++;
      } else {
        // Unchanged word: keep the tag chosen by diff() and advance both documents.
        differences.push([marker, word]);
        consumed1++;
        consumed2++;
      }
    }

    position1 += consumed1;
    position2 += consumed2;
  }

  return differences;
}
/**
 * Word-level diff for inputs small enough to compare in one pass.
 *
 * Fills a longest-common-subsequence table in which cell [row][col] holds the LCS length of the
 * first `row` words of words1 and the first `col` words of words2, then passes the table to
 * backtrack() to produce the tagged word list.
 *
 * @param {string[]} words1 - words of the first document
 * @param {string[]} words2 - words of the second document
 * @param {*} color1 - tag forwarded to backtrack() for words only in words1
 * @param {*} color2 - tag forwarded to backtrack() for words only in words2
 * @returns whatever backtrack() returns for the filled table
 */
function diff(words1, words2, color1, color2) {
  const rowCount = words1.length;
  const colCount = words2.length;
  console.log(`Starting diff between ${rowCount} words and ${colCount} words`);

  // Row 0 and column 0 stay at zero: an empty prefix has nothing in common with anything.
  const matrix = [];
  for (let row = 0; row <= rowCount; row++) {
    matrix.push(new Array(colCount + 1).fill(0));
  }

  for (let row = 1; row <= rowCount; row++) {
    const current = matrix[row];
    const above = matrix[row - 1];
    const word = words1[row - 1];
    for (let col = 1; col <= colCount; col++) {
      if (word === words2[col - 1]) {
        current[col] = above[col - 1] + 1;
      } else {
        current[col] = Math.max(current[col - 1], above[col]);
      }
    }
  }

  return backtrack(matrix, words1, words2, color1, color2);
}
/**
 * Walks a filled LCS table from its bottom-right corner back to the origin and turns the path
 * into a tagged word list.
 *
 * @param {number[][]} matrix - LCS table of size (words1.length + 1) x (words2.length + 1)
 * @param {string[]} words1 - words of the first document
 * @param {string[]} words2 - words of the second document
 * @param {*} color1 - tag for words that appear only in words1
 * @param {*} color2 - tag for words that appear only in words2
 * @returns {Array} [tag, word] pairs in document order; the tag is 'black' for words common to
 *   both inputs, color1 for words only in words1 and color2 for words only in words2
 */
function backtrack(matrix, words1, words2, color1, color2) {
  let i = words1.length;
  let j = words2.length;

  // The walk visits entries last-to-first. They are appended and reversed once at the end;
  // inserting each entry at the front would make building the list quadratic.
  const reversed = [];
  while (i > 0 || j > 0) {
    if (i > 0 && j > 0 && words1[i - 1] === words2[j - 1]) {
      reversed.push(['black', words1[i - 1]]);
      i--;
      j--;
    } else if (j > 0 && (i === 0 || matrix[i][j] === matrix[i][j - 1])) {
      // Dropping words2[j - 1] does not shorten the LCS, so it is a word only in words2.
      reversed.push([color2, words2[j - 1]]);
      j--;
    } else {
      reversed.push([color1, words1[i - 1]]);
      i--;
    }
  }
  return reversed.reverse();
}

View File

@ -1,236 +1,256 @@
<!DOCTYPE html>
<html th:lang="${#locale.language}" th:dir="#{language.direction}" th:data-language="${#locale.toString()}" xmlns:th="https://www.thymeleaf.org">
<head>
<html th:lang="${#locale.language}" th:dir="#{language.direction}" th:data-language="${#locale.toString()}"
xmlns:th="https://www.thymeleaf.org">
<head>
<th:block th:insert="~{fragments/common :: head(title=#{compare.title}, header=#{compare.header})}"></th:block>
<style>
.result-column {
border: 1px solid #ccc;
padding: 15px;
margin-bottom: 15px;
overflow-y: auto;
height: calc(100vh - 400px);
white-space: pre-wrap;
}
.flex-container {
display: flex;
flex-direction: row;
}
.color-selector {
display: flex;
flex-direction: row;
align-items: center;
width: 50%;
max-height: 100px;
margin-bottom: 2rem;
}
#color-box1, #color-box2 {
-webkit-appearance: none;
-moz-appearance: none;
appearance: none;
border: none;
background-color: transparent;
}
.spacer1 {
padding-right: calc(var(--bs-gutter-x) * .5);
}
.spacer2 {
padding-left: calc(var(--bs-gutter-x) * .5);
}
</style>
</head>
<style>
.result-column {
border: 1px solid #ccc;
padding: 15px;
margin-bottom: 15px;
overflow-y: auto;
height: calc(100vh - 400px);
white-space: pre-wrap;
}
<body>
<div id="page-container">
<div id="content-wrap">
<th:block th:insert="~{fragments/navbar.html :: navbar}"></th:block>
<br><br>
<div class="container">
<div class="row justify-content-center">
<div class="col-md-9 bg-card">
<div class="tool-header">
<span class="material-symbols-rounded tool-header-icon other">compare</span>
<span class="tool-header-text" th:text="#{compare.header}"></span>
</div>
<div th:replace="~{fragments/common :: fileSelector(name='fileInput', multipleInputsForSingleRequest=false, accept='application/pdf', remoteCall='false')}"></div>
<div th:replace="~{fragments/common :: fileSelector(name='fileInput2', multipleInputsForSingleRequest=false, accept='application/pdf', remoteCall='false')}"></div>
.flex-container {
display: flex;
flex-direction: row;
}
<div class="row">
<div class="flex-container">
<div class="color-selector spacer1">
<label th:text="#{compare.highlightColor.1}"></label>
<label for="color-box1"></label><input type="color" id="color-box1" value="#ff0000">
</div>
<div class="color-selector spacer2">
<label th:text="#{compare.highlightColor.2}"></label>
<label for="color-box2"></label><input type="color" id="color-box2" value="#008000">
</div>
</div>
</div>
.color-selector {
display: flex;
flex-direction: row;
align-items: center;
width: 50%;
max-height: 100px;
margin-bottom: 2rem;
}
<button class="btn btn-primary" onclick="comparePDFs()" th:text="#{compare.submit}"></button>
#color-box1,
#color-box2 {
-webkit-appearance: none;
-moz-appearance: none;
appearance: none;
border: none;
background-color: transparent;
}
<div class="row">
<div class="col-md-6">
<h3 th:text="#{compare.document.1}"></h3>
<div id="result1" class="result-column"></div>
</div>
<div class="col-md-6">
<h3 th:text="#{compare.document.2}"></h3>
<div id="result2" class="result-column"></div>
</div>
</div>
<script type="module" th:src="@{'/pdfjs-legacy/pdf.mjs'}"></script>
<script>
// get the elements
var result1 = document.getElementById('result1');
var result2 = document.getElementById('result2');
.spacer1 {
padding-right: calc(var(--bs-gutter-x) * .5);
}
// add event listeners
result1.addEventListener('scroll', function() {
result2.scrollTop = result1.scrollTop;
});
.spacer2 {
padding-left: calc(var(--bs-gutter-x) * .5);
}
</style>
</head>
result2.addEventListener('scroll', function() {
result1.scrollTop = result2.scrollTop;
});
async function comparePDFs() {
const file1 = document.getElementById("fileInput-input").files[0];
const file2 = document.getElementById("fileInput2-input").files[0];
var color1 = document.getElementById('color-box1').value;
var color2 = document.getElementById('color-box2').value;
if (!file1 || !file2) {
console.error("Please select two PDF files to compare");
return;
}
pdfjsLib.GlobalWorkerOptions.workerSrc = './pdfjs-legacy/pdf.worker.mjs'
const [pdf1, pdf2] = await Promise.all([
pdfjsLib.getDocument(URL.createObjectURL(file1)).promise,
pdfjsLib.getDocument(URL.createObjectURL(file2)).promise
]);
const extractText = async (pdf) => {
const pages = [];
for (let i = 1; i <= pdf.numPages; i++) {
const page = await pdf.getPage(i);
const content = await page.getTextContent();
const strings = content.items.map(item => item.str);
pages.push(strings.join(" "));
}
return pages.join(" ");
};
const [text1, text2] = await Promise.all([
extractText(pdf1),
extractText(pdf2)
]);
if (text1.trim() === "" || text2.trim() === "") {
alert("One or both of the selected PDFs have no text content. Please choose PDFs with text for comparison.");
return;
}
const diff = (text1, text2) => {
const words1 = text1.split(' ');
const words2 = text2.split(' ');
// Create a 2D array to hold our "matrix"
const matrix = Array(words1.length + 1).fill(null).map(() => Array(words2.length + 1).fill(0));
// Perform standard LCS algorithm
for (let i = 1; i <= words1.length; i++) {
for (let j = 1; j <= words2.length; j++) {
if (words1[i - 1] === words2[j - 1]) {
matrix[i][j] = matrix[i - 1][j - 1] + 1;
} else {
matrix[i][j] = Math.max(matrix[i][j - 1], matrix[i - 1][j]);
}
}
}
let i = words1.length;
let j = words2.length;
const differences = [];
// Backtrack through the matrix to create the diff
while (i > 0 || j > 0) {
if (i > 0 && j > 0 && words1[i - 1] === words2[j - 1]) {
differences.unshift(['black', words1[i - 1]]);
i--;
j--;
} else if (j > 0 && (i === 0 || matrix[i][j - 1] >= matrix[i - 1][j])) {
differences.unshift([color2, words2[j - 1]]);
j--;
} else if (i > 0 && (j === 0 || matrix[i][j - 1] < matrix[i - 1][j])) {
differences.unshift([color1, words1[i - 1]]);
i--;
}
}
console.log(differences);
return differences;
};
const differences = diff(text1, text2);
const displayDifferences = (differences) => {
const resultDiv1 = document.getElementById("result1");
const resultDiv2 = document.getElementById("result2");
resultDiv1.innerHTML = "";
resultDiv2.innerHTML = "";
differences.forEach(([color, word]) => {
const span1 = document.createElement("span");
const span2 = document.createElement("span");
// If it's an addition, show it in color2 in the second document and transparent in the first
if (color === color2) {
span1.style.color = "transparent";
span1.style.userSelect = "none";
span2.style.color = color;
}
// If it's a deletion, show it in color1 in the first document and transparent in the second
else if (color === color1) {
span1.style.color = color;
span2.style.color = "transparent";
span2.style.userSelect = "none";
}
// If it's unchanged, show it in black in both
else {
span1.style.color = color;
span2.style.color = color;
}
span1.textContent = word;
span2.textContent = word;
resultDiv1.appendChild(span1);
resultDiv2.appendChild(span2);
// Add space after each word, or a new line if the word ends with a full stop
const spaceOrNewline1 = document.createElement("span");
const spaceOrNewline2 = document.createElement("span");
if (word.endsWith(".")) {
spaceOrNewline1.innerHTML = "<br>";
spaceOrNewline2.innerHTML = "<br>";
} else {
spaceOrNewline1.textContent = " ";
spaceOrNewline2.textContent = " ";
}
resultDiv1.appendChild(spaceOrNewline1);
resultDiv2.appendChild(spaceOrNewline2);
});
};
console.log('Differences:', differences);
displayDifferences(differences);
}
</script>
<body>
<div id="page-container">
<div id="content-wrap">
<th:block th:insert="~{fragments/navbar.html :: navbar}"></th:block>
<br><br>
<div class="container">
<div class="row justify-content-center">
<div class="col-md-9 bg-card">
<div class="tool-header">
<span class="material-symbols-rounded tool-header-icon other">compare</span>
<span class="tool-header-text" th:text="#{compare.header}"></span>
</div>
<div
th:replace="~{fragments/common :: fileSelector(name='fileInput', disableMultipleFiles=true, multipleInputsForSingleRequest=false, accept='application/pdf', remoteCall='false')}">
</div>
<div
th:replace="~{fragments/common :: fileSelector(name='fileInput2', disableMultipleFiles=true, multipleInputsForSingleRequest=false, accept='application/pdf', remoteCall='false')}">
</div>
<div class="row">
<div class="flex-container">
<div class="color-selector spacer1">
<label th:text="#{compare.highlightColor.1}"></label>
<label for="color-box1"></label><input type="color" id="color-box1" value="#ff0000">
</div>
<div class="color-selector spacer2">
<label th:text="#{compare.highlightColor.2}"></label>
<label for="color-box2"></label><input type="color" id="color-box2" value="#008000">
</div>
</div>
</div>
<button class="btn btn-primary" onclick="comparePDFs()" th:text="#{compare.submit}"></button>
<div class="row">
<div class="col-md-6">
<h3 th:text="#{compare.document.1}"></h3>
<div id="result1" class="result-column"></div>
</div>
<div class="col-md-6">
<h3 th:text="#{compare.document.2}"></h3>
<div id="result2" class="result-column"></div>
</div>
</div>
<script type="module" th:src="@{'/pdfjs-legacy/pdf.mjs'}"></script>
<script th:inline="javascript">
// get the elements
var result1 = document.getElementById('result1');
var result2 = document.getElementById('result2');
// add event listeners
result1.addEventListener('scroll', function () {
result2.scrollTop = result1.scrollTop;
});
result2.addEventListener('scroll', function () {
result1.scrollTop = result2.scrollTop;
});
/**
 * Click handler for the Compare button.
 *
 * Extracts the text of the two selected PDFs with pdf.js, sends both texts and the two highlight
 * colours to the comparison worker, and renders the [color, word] list the worker returns into
 * the #result1 / #result2 columns.
 */
async function comparePDFs() {
  const file1 = document.getElementById("fileInput-input").files[0];
  const file2 = document.getElementById("fileInput2-input").files[0];
  const color1 = document.getElementById('color-box1').value;
  const color2 = document.getElementById('color-box2').value;

  // Thymeleaf replaces each quoted fallback below with the localised message at render time.
  const complexMessage = /*[[#{compare.complex.message}]]*/ 'One or both of the provided documents are large files, accuracy of comparison may be reduced';
  const largeFilesMessage = /*[[#{compare.large.file.message}]]*/ 'One or Both of the provided documents are too large to process';
  const noTextMessage = /*[[#{compare.no.text.message}]]*/ 'One or both of the selected PDFs have no text content. Please choose PDFs with text for comparison.';
  const loading = /*[[#{loading}]]*/ 'Loading...';

  if (!file1 || !file2) {
    console.error("Please select two PDF files to compare");
    return;
  }

  pdfjsLib.GlobalWorkerOptions.workerSrc = './pdfjs-legacy/pdf.worker.mjs';

  // Joins the text items of every page into one space-separated string.
  const extractText = async (pdf) => {
    const pages = [];
    for (let i = 1; i <= pdf.numPages; i++) {
      const page = await pdf.getPage(i);
      const content = await page.getTextContent();
      const strings = content.items.map(item => item.str);
      pages.push(strings.join(" "));
    }
    return pages.join(" ");
  };

  const url1 = URL.createObjectURL(file1);
  const url2 = URL.createObjectURL(file2);
  let text1;
  let text2;
  try {
    const [pdf1, pdf2] = await Promise.all([
      pdfjsLib.getDocument(url1).promise,
      pdfjsLib.getDocument(url2).promise
    ]);
    [text1, text2] = await Promise.all([
      extractText(pdf1),
      extractText(pdf2)
    ]);
  } finally {
    // Released only after text extraction, in case pdf.js still reads from the URLs
    // while pages are being fetched.
    URL.revokeObjectURL(url1);
    URL.revokeObjectURL(url2);
  }

  if (text1.trim() === "" || text2.trim() === "") {
    alert(noTextMessage);
    return;
  }

  const resultDiv1 = document.getElementById("result1");
  const resultDiv2 = document.getElementById("result2");
  resultDiv1.textContent = loading;
  resultDiv2.textContent = loading;

  // Renders the diff into both columns. A word belonging to only one document is drawn
  // transparent and unselectable in the other column so the two columns stay aligned.
  const displayDifferences = (differences) => {
    const fragment1 = document.createDocumentFragment();
    const fragment2 = document.createDocumentFragment();

    differences.forEach(([color, word]) => {
      const span1 = document.createElement("span");
      const span2 = document.createElement("span");

      // If it's an addition, show it in the second document and transparent in the first
      if (color === color2) {
        span1.style.color = "transparent";
        span1.style.userSelect = "none";
        span2.style.color = color;
      }
      // If it's a deletion, show it in the first document and transparent in the second
      else if (color === color1) {
        span1.style.color = color;
        span2.style.color = "transparent";
        span2.style.userSelect = "none";
      }
      // If it's unchanged, show it in black in both
      else {
        span1.style.color = color;
        span2.style.color = color;
      }

      span1.textContent = word;
      span2.textContent = word;
      fragment1.appendChild(span1);
      fragment2.appendChild(span2);

      // Add space after each word, or a new line if the word ends with a full stop
      const spaceOrNewline1 = document.createElement("span");
      const spaceOrNewline2 = document.createElement("span");
      if (word.endsWith(".")) {
        spaceOrNewline1.innerHTML = "<br>";
        spaceOrNewline2.innerHTML = "<br>";
      } else {
        spaceOrNewline1.textContent = " ";
        spaceOrNewline2.textContent = " ";
      }
      fragment1.appendChild(spaceOrNewline1);
      fragment2.appendChild(spaceOrNewline2);
    });

    // Build off-document and attach once, so the page is not updated per word.
    resultDiv1.innerHTML = "";
    resultDiv2.innerHTML = "";
    resultDiv1.appendChild(fragment1);
    resultDiv2.appendChild(fragment2);
  };

  // Create a new Worker
  const worker = new Worker('/js/compare/pdfWorker.js');

  // Hand the localised warning texts to the worker before the comparison request
  worker.postMessage({
    type: 'SET_COMPLEX_MESSAGE',
    message: complexMessage
  });
  worker.postMessage({
    type: 'SET_TOO_LARGE_MESSAGE',
    message: largeFilesMessage
  });

  // Error handling for the worker. The worker is not terminated here because an error raised
  // while it handles one message does not prevent it from handling the next one.
  worker.onerror = function (error) {
    console.error('Worker error:', error);
  };

  worker.onmessage = function (e) {
    const { status, differences, message } = e.data;

    if (status === 'error') {
      resultDiv1.innerHTML = '';
      resultDiv2.innerHTML = '';
      alert(message);
      worker.terminate(); // final reply for this comparison
      return;
    }

    if (status === 'success' && differences) {
      console.log('Differences:', differences);
      displayDifferences(differences);
      worker.terminate(); // final reply for this comparison
    }

    if (status === 'warning') {
      console.warn(message);
      alert(message);
    }
  };

  worker.postMessage({ text1, text2, color1, color2 });
}
</script>
</div>
</div>
</div>
<th:block th:insert="~{fragments/footer.html :: footer}"></th:block>
</div>
</body>
<th:block th:insert="~{fragments/footer.html :: footer}"></th:block>
</div>
</body>
</html>