diff --git a/src/main/java/stirling/software/SPDF/controller/OCRController.java b/src/main/java/stirling/software/SPDF/controller/OCRController.java index 0c579cb44..4f2066f1a 100644 --- a/src/main/java/stirling/software/SPDF/controller/OCRController.java +++ b/src/main/java/stirling/software/SPDF/controller/OCRController.java @@ -47,9 +47,13 @@ public class OCRController { @PostMapping("/ocr-pdf") public ResponseEntity processPdfWithOCR(@RequestParam("fileInput") MultipartFile inputFile, @RequestParam("languages") List selectedLanguages, - @RequestParam(name = "sidecar", required = false) Boolean sidecar) throws IOException, InterruptedException { - + @RequestParam(name = "sidecar", required = false) Boolean sidecar, + @RequestParam(name = "deskew", required = false) Boolean deskew, + @RequestParam(name = "clean", required = false) Boolean clean, + @RequestParam(name = "clean-final", required = false) Boolean cleanFinal, + @RequestParam(name = "ocrType", required = false) String ocrType) throws IOException, InterruptedException { + //--output-type pdfa if (selectedLanguages == null || selectedLanguages.size() < 1) { throw new IOException("Please select at least one language."); @@ -62,18 +66,40 @@ public class OCRController { // Prepare the output file path Path tempOutputFile = Files.createTempFile("output_", ".pdf"); + // Prepare the output file path + Path sidecarTextPath = null; + // Run OCR Command String languageOption = String.join("+", selectedLanguages); List command = new ArrayList<>(Arrays.asList("ocrmypdf","--verbose", "2")); - String sidecarFile = tempOutputFile.toString().replace(".pdf", ".txt"); if (sidecar != null && sidecar) { + sidecarTextPath = Files.createTempFile("sidecar", ".txt"); command.add("--sidecar"); - command.add(sidecarFile); + command.add(sidecarTextPath.toString()); } + if (deskew != null && deskew) { + command.add("--deskew"); + } + if (clean != null && clean) { + command.add("--clean"); + } + if (cleanFinal != null && cleanFinal) 
{ + command.add("--clean-final"); + } + if (ocrType != null && !ocrType.equals("")) { + if("skip-text".equals(ocrType)) { + command.add("--skip-text"); + } else if("force-ocr".equals(ocrType)) { + command.add("--force-ocr"); + } else if("Normal".equals(ocrType)) { + + } + } + command.addAll(Arrays.asList("--language", languageOption, tempInputFile.toString(), tempOutputFile.toString())); @@ -104,9 +130,9 @@ public class OCRController { zipOut.closeEntry(); // Add text file to the zip - ZipEntry txtEntry = new ZipEntry(sidecarFile); + ZipEntry txtEntry = new ZipEntry(outputFilename.replace(".pdf", ".txt")); zipOut.putNextEntry(txtEntry); - Files.copy(Paths.get(sidecarFile), zipOut); + Files.copy(sidecarTextPath, zipOut); zipOut.closeEntry(); } @@ -115,7 +141,7 @@ public class OCRController { // Clean up the temporary zip file Files.delete(tempZipFile); Files.delete(tempOutputFile); - Files.delete(Paths.get(sidecarFile)); + Files.delete(sidecarTextPath); // Return the zip file containing both the PDF and the text file headers.setContentType(MediaType.APPLICATION_OCTET_STREAM); diff --git a/src/main/java/stirling/software/SPDF/controller/converters/ConvertImgPDFController.java b/src/main/java/stirling/software/SPDF/controller/converters/ConvertImgPDFController.java index 8d0d43b26..5a57cc7de 100644 --- a/src/main/java/stirling/software/SPDF/controller/converters/ConvertImgPDFController.java +++ b/src/main/java/stirling/software/SPDF/controller/converters/ConvertImgPDFController.java @@ -73,7 +73,6 @@ public class ConvertImgPDFController { if (singleImage) { HttpHeaders headers = new HttpHeaders(); headers.setContentType(MediaType.parseMediaType(getMediaType(imageFormat))); - headers.setCacheControl("must-revalidate, post-check=0, pre-check=0"); ResponseEntity response = new ResponseEntity<>(new ByteArrayResource(result), headers, HttpStatus.OK); return response; } else { diff --git a/src/main/java/stirling/software/SPDF/utils/ProcessExecutor.java 
b/src/main/java/stirling/software/SPDF/utils/ProcessExecutor.java index 97addfd83..70a018fa1 100644 --- a/src/main/java/stirling/software/SPDF/utils/ProcessExecutor.java +++ b/src/main/java/stirling/software/SPDF/utils/ProcessExecutor.java @@ -40,7 +40,7 @@ public class ProcessExecutor { semaphore.acquire(); try { - + System.out.print("Running command: " + String.join(" ", command)); ProcessBuilder processBuilder = new ProcessBuilder(command); Process process = processBuilder.start(); diff --git a/src/main/resources/messages_en_GB.properties b/src/main/resources/messages_en_GB.properties index 8a171bab2..6288c5cb8 100644 --- a/src/main/resources/messages_en_GB.properties +++ b/src/main/resources/messages_en_GB.properties @@ -93,11 +93,22 @@ settings.downloadOption.2=Open in new window settings.downloadOption.3=Download file settings.zipThreshold=Zip files when the number of downloaded files exceeds + + + #OCR ocr.title=OCR ocr.header=OCR (Optical Character Recognition) ocr.selectText.1=Select languages that are to be detected within the PDF (Ones listed are the ones currently detected): ocr.selectText.2=Produce text file containing OCR text alongside the OCR'ed PDF +ocr.selectText.3=Correct pages that were scanned at a skewed angle by rotating them back into place +ocr.selectText.4=Clean page so it is less likely that OCR will find text in background noise. (No output change) +ocr.selectText.5=Clean page so it is less likely that OCR will find text in background noise, maintains cleanup in output. +ocr.selectText.6=Ignores pages that have interactive text on them, only OCRs pages that are images +ocr.selectText.7=Force OCR, will OCR Every page removing all original text elements +ocr.selectText.8=Normal (Will error if PDF contains text) +ocr.selectText.9=Additional Settings +ocr.selectText.10=OCR Mode ocr.help=Please read this documentation on how to use this for other languages and/or use not in docker ocr.credit=This service uses OCRmyPDF and Tesseract for OCR. 
ocr.submit=Process PDF with OCR diff --git a/src/main/resources/templates/fragments/common.html b/src/main/resources/templates/fragments/common.html index 3279469e0..938c1c808 100644 --- a/src/main/resources/templates/fragments/common.html +++ b/src/main/resources/templates/fragments/common.html @@ -194,98 +194,106 @@ function toggleDarkMode() { } }); - async function submitMultiPdfForm(event,url) { + async function submitMultiPdfForm(event, url) { // Get the selected PDF files - var files = $('#fileInput-input')[0].files; + let files = $('#fileInput-input')[0].files; // Get the existing form data - var formData = new FormData($('form')[0]); + let formData = new FormData($('form')[0]); formData.delete('fileInput'); - + // Show the progress bar $('#progressBarContainer').show(); // Initialize the progress bar - var progressBar = $('#progressBar'); + let progressBar = $('#progressBar'); progressBar.css('width', '0%'); progressBar.attr('aria-valuenow', 0); progressBar.attr('aria-valuemax', files.length); - - // Check the flag in localStorage, default to 4 + + // Check the flag in localStorage, default to 4 const zipThreshold = parseInt(localStorage.getItem('zipThreshold'), 10) || 4; const zipFiles = files.length > zipThreshold; // Initialize JSZip instance if needed let jszip = null; if (zipFiles) { - jszip = new JSZip(); + jszip = new JSZip(); } - - + // Submit each PDF file in parallel - var promises = []; - for (var i = 0; i < files.length; i++) { - var promise = new Promise(function(resolve, reject) { - var fileFormData = new FormData(); + let promises = []; + for (let i = 0; i < files.length; i++) { + let promise = new Promise(async function(resolve, reject) { + let fileFormData = new FormData(); fileFormData.append('fileInput', files[i]); - for (var pair of formData.entries()) { + for (let pair of formData.entries()) { fileFormData.append(pair[0], pair[1]); } console.log(fileFormData); - fetch(url, { - method: 'POST', - body: fileFormData - 
}).then(function(response) { + try { + let response = await fetch(url, { + method: 'POST', + body: fileFormData + }); + if (!response) { throw new Error('Received null response for file ' + i); } + + if (!response.ok) { + throw new Error(`Error submitting request for file ${i}: ${response.status} ${response.statusText}`); + } + + let contentDisposition = response.headers.get('content-disposition'); + let fileName = "file.pdf" + if (!contentDisposition) { + //throw new Error('Content-Disposition header not found for file ' + i); + } else { + fileName = contentDisposition.split('filename=')[1].replace(/"/g, ''); + } console.log('Received response for file ' + i + ': ' + response); - var contentDisposition = response.headers.get('content-disposition'); - var fileName = contentDisposition.split('filename=')[1].replace(/"/g, ''); + - response.blob().then(function (blob) { - if (zipFiles) { - // Add the file to the ZIP archive - jszip.file(fileName, blob); - resolve(); - } else { - // Download the file directly - var url = window.URL.createObjectURL(blob); - var a = document.createElement('a'); - a.href = url; - a.download = fileName; - document.body.appendChild(a); - a.click(); - a.remove(); - resolve(); - } - }); - - }).catch(function(error) { + let blob = await response.blob(); + if (zipFiles) { + // Add the file to the ZIP archive + jszip.file(fileName, blob); + resolve(); + } else { + // Download the file directly + let url = window.URL.createObjectURL(blob); + let a = document.createElement('a'); + a.href = url; + a.download = fileName; + document.body.appendChild(a); + a.click(); + a.remove(); + resolve(); + } + } catch (error) { console.error('Error submitting request for file ' + i + ': ' + error); - + // Set default values or fallbacks for error properties - var status = error && error.status || 500; - var statusText = error && error.statusText || 'Internal Server Error'; - var message = error && error.message || 'An error occurred while processing your 
request.'; - - // Reject the Promise to signal that the request has failed + let status = error && error.status || 500; + let statusText = error && error.statusText || 'Internal Server Error'; + let message = error && error.message || 'An error occurred while processing your request.'; + + // Reject the Promise to signal that the request has failed reject(); - // Redirect to error page with Spring Boot error parameters - var url = '/error?status=' + status + '&error=' + encodeURIComponent(statusText) + '&message=' + encodeURIComponent(message); + // Redirect to error page with Spring Boot error parameters + let url = '/error?status=' + status + '&error=' + encodeURIComponent(statusText) + '&message=' + encodeURIComponent(message); window.location.href = url; - }); + } }); - + // Update the progress bar as each request finishes promise.then(function() { - var progress = ((progressBar.attr('aria-valuenow') / files.length) * 100) + (100 / files.length); - progressBar.css('width', progress + '%'); - progressBar.attr('aria-valuenow', parseInt(progressBar.attr('aria-valuenow')) + 1); + updateProgressBar(progressBar, files); }); - + promises.push(promise); } @@ -295,24 +303,33 @@ function toggleDarkMode() { } catch (error) { console.error('Error while uploading files: ' + error); } - + // Update the progress bar progressBar.css('width', '100%'); progressBar.attr('aria-valuenow', files.length); - - // After all requests are finished, download the ZIP file if needed + + // After all requests are finished, download the ZIP file if needed if (zipFiles) { - jszip.generateAsync({ type: "blob" }).then(function (content) { - var url = window.URL.createObjectURL(content); - var a = document.createElement('a'); - a.href = url; - a.download = "files.zip"; - document.body.appendChild(a); - a.click(); - a.remove(); - }); + try { + let content = await jszip.generateAsync({ type: "blob" }); + let url = window.URL.createObjectURL(content); + let a = document.createElement('a'); + a.href 
= url; + a.download = "files.zip"; + document.body.appendChild(a); + a.click(); + a.remove(); + } catch (error) { + console.error('Error generating ZIP file: ' + error); + } } } + function updateProgressBar(progressBar, files) { + let progress = ((progressBar.attr('aria-valuenow') / files.length) * 100) + (100 / files.length); + progressBar.css('width', progress + '%'); + progressBar.attr('aria-valuenow', parseInt(progressBar.attr('aria-valuenow')) + 1); + } + diff --git a/src/main/resources/templates/ocr-pdf.html b/src/main/resources/templates/ocr-pdf.html index 0b5bf48d7..0badec164 100644 --- a/src/main/resources/templates/ocr-pdf.html +++ b/src/main/resources/templates/ocr-pdf.html @@ -18,36 +18,40 @@
- -
-
- - -
-
-
-
+ +
+
+
+ + + +
+
+
+
+ +
-
+
- +
-
+
- +
-
+
- +
- +