ocr changes

This commit is contained in:
Anthony Stirling 2023-04-01 00:58:28 +01:00
parent 078d17374c
commit d575372b53
6 changed files with 151 additions and 94 deletions

View File

@ -47,9 +47,13 @@ public class OCRController {
@PostMapping("/ocr-pdf") @PostMapping("/ocr-pdf")
public ResponseEntity<byte[]> processPdfWithOCR(@RequestParam("fileInput") MultipartFile inputFile, public ResponseEntity<byte[]> processPdfWithOCR(@RequestParam("fileInput") MultipartFile inputFile,
@RequestParam("languages") List<String> selectedLanguages, @RequestParam("languages") List<String> selectedLanguages,
@RequestParam(name = "sidecar", required = false) Boolean sidecar) throws IOException, InterruptedException { @RequestParam(name = "sidecar", required = false) Boolean sidecar,
@RequestParam(name = "deskew", required = false) Boolean deskew,
@RequestParam(name = "clean", required = false) Boolean clean,
@RequestParam(name = "clean-final", required = false) Boolean cleanFinal,
@RequestParam(name = "ocrType", required = false) String ocrType) throws IOException, InterruptedException {
//--output-type pdfa //--output-type pdfa
if (selectedLanguages == null || selectedLanguages.size() < 1) { if (selectedLanguages == null || selectedLanguages.size() < 1) {
throw new IOException("Please select at least one language."); throw new IOException("Please select at least one language.");
@ -62,18 +66,40 @@ public class OCRController {
// Prepare the output file path // Prepare the output file path
Path tempOutputFile = Files.createTempFile("output_", ".pdf"); Path tempOutputFile = Files.createTempFile("output_", ".pdf");
// Prepare the output file path
Path sidecarTextPath = null;
// Run OCR Command // Run OCR Command
String languageOption = String.join("+", selectedLanguages); String languageOption = String.join("+", selectedLanguages);
List<String> command = new ArrayList<>(Arrays.asList("ocrmypdf","--verbose", "2")); List<String> command = new ArrayList<>(Arrays.asList("ocrmypdf","--verbose", "2"));
String sidecarFile = tempOutputFile.toString().replace(".pdf", ".txt");
if (sidecar != null && sidecar) { if (sidecar != null && sidecar) {
sidecarTextPath = Files.createTempFile("sidecar", ".txt");
command.add("--sidecar"); command.add("--sidecar");
command.add(sidecarFile); command.add(sidecarTextPath.toString());
} }
if (deskew != null && deskew) {
command.add("--deskew");
}
if (clean != null && clean) {
command.add("--clean");
}
if (cleanFinal != null && cleanFinal) {
command.add("--clean-final");
}
if (ocrType != null && !ocrType.equals("")) {
if("skip-text".equals(ocrType)) {
command.add("--skip-text");
} else if("force-ocr".equals(ocrType)) {
command.add("--force-ocr");
} else if("Normal".equals(ocrType)) {
}
}
command.addAll(Arrays.asList("--language", languageOption, command.addAll(Arrays.asList("--language", languageOption,
tempInputFile.toString(), tempOutputFile.toString())); tempInputFile.toString(), tempOutputFile.toString()));
@ -104,9 +130,9 @@ public class OCRController {
zipOut.closeEntry(); zipOut.closeEntry();
// Add text file to the zip // Add text file to the zip
ZipEntry txtEntry = new ZipEntry(sidecarFile); ZipEntry txtEntry = new ZipEntry(outputFilename.replace(".pdf", ".txt"));
zipOut.putNextEntry(txtEntry); zipOut.putNextEntry(txtEntry);
Files.copy(Paths.get(sidecarFile), zipOut); Files.copy(sidecarTextPath, zipOut);
zipOut.closeEntry(); zipOut.closeEntry();
} }
@ -115,7 +141,7 @@ public class OCRController {
// Clean up the temporary zip file // Clean up the temporary zip file
Files.delete(tempZipFile); Files.delete(tempZipFile);
Files.delete(tempOutputFile); Files.delete(tempOutputFile);
Files.delete(Paths.get(sidecarFile)); Files.delete(sidecarTextPath);
// Return the zip file containing both the PDF and the text file // Return the zip file containing both the PDF and the text file
headers.setContentType(MediaType.APPLICATION_OCTET_STREAM); headers.setContentType(MediaType.APPLICATION_OCTET_STREAM);

View File

@ -73,7 +73,6 @@ public class ConvertImgPDFController {
if (singleImage) { if (singleImage) {
HttpHeaders headers = new HttpHeaders(); HttpHeaders headers = new HttpHeaders();
headers.setContentType(MediaType.parseMediaType(getMediaType(imageFormat))); headers.setContentType(MediaType.parseMediaType(getMediaType(imageFormat)));
headers.setCacheControl("must-revalidate, post-check=0, pre-check=0");
ResponseEntity<Resource> response = new ResponseEntity<>(new ByteArrayResource(result), headers, HttpStatus.OK); ResponseEntity<Resource> response = new ResponseEntity<>(new ByteArrayResource(result), headers, HttpStatus.OK);
return response; return response;
} else { } else {

View File

@ -40,7 +40,7 @@ public class ProcessExecutor {
semaphore.acquire(); semaphore.acquire();
try { try {
System.out.print("Running command: " + String.join(" ", command));
ProcessBuilder processBuilder = new ProcessBuilder(command); ProcessBuilder processBuilder = new ProcessBuilder(command);
Process process = processBuilder.start(); Process process = processBuilder.start();

View File

@ -93,11 +93,22 @@ settings.downloadOption.2=Open in new window
settings.downloadOption.3=Download file settings.downloadOption.3=Download file
settings.zipThreshold=Zip files when the number of downloaded files exceeds settings.zipThreshold=Zip files when the number of downloaded files exceeds
#OCR #OCR
ocr.title=OCR ocr.title=OCR
ocr.header=OCR (Optical Character Recognition) ocr.header=OCR (Optical Character Recognition)
ocr.selectText.1=Select languages that are to be detected within the PDF (Ones listed are the ones currently detected): ocr.selectText.1=Select languages that are to be detected within the PDF (Ones listed are the ones currently detected):
ocr.selectText.2=Produce text file containing OCR text alongside the OCR'ed PDF ocr.selectText.2=Produce text file containing OCR text alongside the OCR'ed PDF
ocr.selectText.3=Correct pages that were scanned at a skewed angle by rotating them back into place
ocr.selectText.4=Clean page so it's less likely that OCR will find text in background noise. (No output change)
ocr.selectText.5=Clean page so it's less likely that OCR will find text in background noise, maintains cleanup in output.
ocr.selectText.6=Ignores pages that have interactive text on them, only OCRs pages that are images
ocr.selectText.7=Force OCR, will OCR every page, removing all original text elements
ocr.selectText.8=Normal (Will error if PDF contains text)
ocr.selectText.9=Additional Settings
ocr.selectText.10=OCR Mode
ocr.help=Please read this documentation on how to use this for other languages and/or use not in docker ocr.help=Please read this documentation on how to use this for other languages and/or use not in docker
ocr.credit=This service uses OCRmyPDF and Tesseract for OCR. ocr.credit=This service uses OCRmyPDF and Tesseract for OCR.
ocr.submit=Process PDF with OCR ocr.submit=Process PDF with OCR

View File

@ -194,98 +194,106 @@ function toggleDarkMode() {
} }
}); });
async function submitMultiPdfForm(event,url) { async function submitMultiPdfForm(event, url) {
// Get the selected PDF files // Get the selected PDF files
var files = $('#fileInput-input')[0].files; let files = $('#fileInput-input')[0].files;
// Get the existing form data // Get the existing form data
var formData = new FormData($('form')[0]); let formData = new FormData($('form')[0]);
formData.delete('fileInput'); formData.delete('fileInput');
// Show the progress bar // Show the progress bar
$('#progressBarContainer').show(); $('#progressBarContainer').show();
// Initialize the progress bar // Initialize the progress bar
var progressBar = $('#progressBar'); let progressBar = $('#progressBar');
progressBar.css('width', '0%'); progressBar.css('width', '0%');
progressBar.attr('aria-valuenow', 0); progressBar.attr('aria-valuenow', 0);
progressBar.attr('aria-valuemax', files.length); progressBar.attr('aria-valuemax', files.length);
// Check the flag in localStorage, default to 4 // Check the flag in localStorage, default to 4
const zipThreshold = parseInt(localStorage.getItem('zipThreshold'), 10) || 4; const zipThreshold = parseInt(localStorage.getItem('zipThreshold'), 10) || 4;
const zipFiles = files.length > zipThreshold; const zipFiles = files.length > zipThreshold;
// Initialize JSZip instance if needed // Initialize JSZip instance if needed
let jszip = null; let jszip = null;
if (zipFiles) { if (zipFiles) {
jszip = new JSZip(); jszip = new JSZip();
} }
// Submit each PDF file in parallel // Submit each PDF file in parallel
var promises = []; let promises = [];
for (var i = 0; i < files.length; i++) { for (let i = 0; i < files.length; i++) {
var promise = new Promise(function(resolve, reject) { let promise = new Promise(async function(resolve, reject) {
var fileFormData = new FormData(); let fileFormData = new FormData();
fileFormData.append('fileInput', files[i]); fileFormData.append('fileInput', files[i]);
for (var pair of formData.entries()) { for (let pair of formData.entries()) {
fileFormData.append(pair[0], pair[1]); fileFormData.append(pair[0], pair[1]);
} }
console.log(fileFormData); console.log(fileFormData);
fetch(url, { try {
method: 'POST', let response = await fetch(url, {
body: fileFormData method: 'POST',
}).then(function(response) { body: fileFormData
});
if (!response) { if (!response) {
throw new Error('Received null response for file ' + i); throw new Error('Received null response for file ' + i);
} }
if (!response.ok) {
throw new Error(`Error submitting request for file ${i}: ${response.status} ${response.statusText}`);
}
let contentDisposition = response.headers.get('content-disposition');
let fileName = "file.pdf"
if (!contentDisposition) {
//throw new Error('Content-Disposition header not found for file ' + i);
} else {
fileName = contentDisposition.split('filename=')[1].replace(/"/g, '');
}
console.log('Received response for file ' + i + ': ' + response); console.log('Received response for file ' + i + ': ' + response);
var contentDisposition = response.headers.get('content-disposition');
var fileName = contentDisposition.split('filename=')[1].replace(/"/g, '');
response.blob().then(function (blob) { let blob = await response.blob();
if (zipFiles) { if (zipFiles) {
// Add the file to the ZIP archive // Add the file to the ZIP archive
jszip.file(fileName, blob); jszip.file(fileName, blob);
resolve(); resolve();
} else { } else {
// Download the file directly // Download the file directly
var url = window.URL.createObjectURL(blob); let url = window.URL.createObjectURL(blob);
var a = document.createElement('a'); let a = document.createElement('a');
a.href = url; a.href = url;
a.download = fileName; a.download = fileName;
document.body.appendChild(a); document.body.appendChild(a);
a.click(); a.click();
a.remove(); a.remove();
resolve(); resolve();
} }
}); } catch (error) {
}).catch(function(error) {
console.error('Error submitting request for file ' + i + ': ' + error); console.error('Error submitting request for file ' + i + ': ' + error);
// Set default values or fallbacks for error properties // Set default values or fallbacks for error properties
var status = error && error.status || 500; let status = error && error.status || 500;
var statusText = error && error.statusText || 'Internal Server Error'; let statusText = error && error.statusText || 'Internal Server Error';
var message = error && error.message || 'An error occurred while processing your request.'; let message = error && error.message || 'An error occurred while processing your request.';
// Reject the Promise to signal that the request has failed // Reject the Promise to signal that the request has failed
reject(); reject();
// Redirect to error page with Spring Boot error parameters // Redirect to error page with Spring Boot error parameters
var url = '/error?status=' + status + '&error=' + encodeURIComponent(statusText) + '&message=' + encodeURIComponent(message); let url = '/error?status=' + status + '&error=' + encodeURIComponent(statusText) + '&message=' + encodeURIComponent(message);
window.location.href = url; window.location.href = url;
}); }
}); });
// Update the progress bar as each request finishes // Update the progress bar as each request finishes
promise.then(function() { promise.then(function() {
var progress = ((progressBar.attr('aria-valuenow') / files.length) * 100) + (100 / files.length); updateProgressBar(progressBar, files);
progressBar.css('width', progress + '%');
progressBar.attr('aria-valuenow', parseInt(progressBar.attr('aria-valuenow')) + 1);
}); });
promises.push(promise); promises.push(promise);
} }
@ -295,24 +303,33 @@ function toggleDarkMode() {
} catch (error) { } catch (error) {
console.error('Error while uploading files: ' + error); console.error('Error while uploading files: ' + error);
} }
// Update the progress bar // Update the progress bar
progressBar.css('width', '100%'); progressBar.css('width', '100%');
progressBar.attr('aria-valuenow', files.length); progressBar.attr('aria-valuenow', files.length);
// After all requests are finished, download the ZIP file if needed // After all requests are finished, download the ZIP file if needed
if (zipFiles) { if (zipFiles) {
jszip.generateAsync({ type: "blob" }).then(function (content) { try {
var url = window.URL.createObjectURL(content); let content = await jszip.generateAsync({ type: "blob" });
var a = document.createElement('a'); let url = window.URL.createObjectURL(content);
a.href = url; let a = document.createElement('a');
a.download = "files.zip"; a.href = url;
document.body.appendChild(a); a.download = "files.zip";
a.click(); document.body.appendChild(a);
a.remove(); a.click();
}); a.remove();
} catch (error) {
console.error('Error generating ZIP file: ' + error);
}
} }
} }
/**
 * Advance the upload progress bar by one completed file.
 *
 * Reads the current count from the bar's `aria-valuenow` attribute, widens the
 * bar by one file's share of the total, and stores the incremented count back
 * into the same attribute.
 *
 * @param {Object} progressBar - jQuery-style wrapper of the progress bar element;
 *     only its `attr` and `css` methods are used.
 * @param {{length: number}} files - Collection of files being submitted; only
 *     `length` is read.
 */
function updateProgressBar(progressBar, files) {
    const total = files.length;
    if (!(total > 0)) {
        // Nothing to measure against; avoid writing 'NaN%' or 'Infinity%' as the width.
        return;
    }

    // Parse once, with an explicit radix; a missing or malformed attribute counts as 0.
    const completed = parseInt(progressBar.attr('aria-valuenow'), 10) || 0;

    // Same formula as before, capped so an extra call cannot push the bar past full width.
    const progress = Math.min(100, ((completed / total) * 100) + (100 / total));
    progressBar.css('width', progress + '%');
    progressBar.attr('aria-valuenow', completed + 1);
}

View File

@ -18,36 +18,40 @@
<form action="#" th:action="@{/ocr-pdf}" method="post" enctype="multipart/form-data" class="mb-3"> <form action="#" th:action="@{/ocr-pdf}" method="post" enctype="multipart/form-data" class="mb-3">
<div th:replace="~{fragments/common :: fileSelector(name='fileInput', multiple=false, accept='application/pdf')}"></div> <div th:replace="~{fragments/common :: fileSelector(name='fileInput', multiple=false, accept='application/pdf')}"></div>
<div class="form-group"> <div class="form-group">
<label for="languages" class="form-label" th:text="#{ocr.selectText.1}"></label> <label for="languages" class="form-label" th:text="#{ocr.selectText.1}"></label>
<div id="languages"> <hr>
<div th:each="language: ${languages}"> <div id="languages">
<input type="checkbox" class="form-check-input" th:name="languages" th:value="${language}" th:id="${'language-' + language}" /> <div th:each="language, iterStat : ${languages}" >
<label class="form-check-label" th:for="${'language-' + language}" th:text="${language}"></label> <span th:text=" ${iterStat.index + 1} + '.'"></span>
</div> <input type="checkbox" class="form-check-input" th:name="languages" th:value="${language}" th:id="${'language-' + language}" />
</div> <label class="form-check-label" th:for="${'language-' + language}" th:text=" ${language}"></label>
</div> </div>
<div class="form-group"> </div>
<hr>
</div>
<label for="languages" class="form-label" th:text="#{ocr.selectText.9}"></label>
<div class="form-check">
<input type="checkbox" class="form-check-input" name="sidecar" id="sidecar" /> <input type="checkbox" class="form-check-input" name="sidecar" id="sidecar" />
<label class="form-check-label" for="sidecar" th:text="#{ocr.selectText.2}"></label> <label class="form-check-label" for="sidecar" th:text="#{ocr.selectText.2}"></label>
</div> </div>
<div class="form-group"> <div class="form-check">
<input type="checkbox" class="form-check-input" name="deskew" id="deskew" /> <input type="checkbox" class="form-check-input" name="deskew" id="deskew" />
<label class="form-check-label" for="deskew" th:text="#{ocr.selectText.2}"></label> <label class="form-check-label" for="deskew" th:text="#{ocr.selectText.3}"></label>
</div> </div>
<div class="form-group"> <div class="form-check">
<input type="checkbox" class="form-check-input" name="clean" id="clean" /> <input type="checkbox" class="form-check-input" name="clean" id="clean" />
<label class="form-check-label" for="clean" th:text="#{ocr.selectText.2}"></label> <label class="form-check-label" for="clean" th:text="#{ocr.selectText.4}"></label>
</div> </div>
<div class="form-group"> <div class="form-check">
<input type="checkbox" class="form-check-input" name="clean-final" id="clean-final" /> <input type="checkbox" class="form-check-input" name="clean-final" id="clean-final" />
<label class="form-check-label" for="clean-final" th:text="#{ocr.selectText.2}"></label> <label class="form-check-label" for="clean-final" th:text="#{ocr.selectText.5}"></label>
</div> </div>
<div class="form-group"> <div class="form-group">
<label th:text="#{pdfToImage.selectText}"></label> <label th:text="#{ocr.selectText.10}"></label>
<select class="form-control" name="ocrType"> <select class="form-control" name="ocrType">
<option value="skip-text">Ignores pages that have interacive text on them, only OCRs pages that are images</option> <option value="skip-text" th:text="#{ocr.selectText.6}"></option>
<option value="force-ocr">Force OCR, will OCR Every page removing all original text</option> <option value="force-ocr" th:text="#{ocr.selectText.7}"></option>
<option value="Normal">Normal (Will error if contains text)</option> <option value="Normal" th:text="#{ocr.selectText.8}"></option>
</select> </select>
</div> </div>
<button type="submit" class="btn btn-primary" th:text="#{ocr.submit}"></button> <button type="submit" class="btn btn-primary" th:text="#{ocr.submit}"></button>