ocr changes

This commit is contained in:
Anthony Stirling 2023-04-01 00:58:28 +01:00
parent 078d17374c
commit d575372b53
6 changed files with 151 additions and 94 deletions

View File

@ -47,9 +47,13 @@ public class OCRController {
@PostMapping("/ocr-pdf")
public ResponseEntity<byte[]> processPdfWithOCR(@RequestParam("fileInput") MultipartFile inputFile,
@RequestParam("languages") List<String> selectedLanguages,
@RequestParam(name = "sidecar", required = false) Boolean sidecar) throws IOException, InterruptedException {
@RequestParam(name = "sidecar", required = false) Boolean sidecar,
@RequestParam(name = "deskew", required = false) Boolean deskew,
@RequestParam(name = "clean", required = false) Boolean clean,
@RequestParam(name = "clean-final", required = false) Boolean cleanFinal,
@RequestParam(name = "ocrType", required = false) String ocrType) throws IOException, InterruptedException {
//--output-type pdfa
if (selectedLanguages == null || selectedLanguages.size() < 1) {
throw new IOException("Please select at least one language.");
@ -62,18 +66,40 @@ public class OCRController {
// Prepare the output file path
Path tempOutputFile = Files.createTempFile("output_", ".pdf");
// Prepare the output file path
Path sidecarTextPath = null;
// Run OCR Command
String languageOption = String.join("+", selectedLanguages);
List<String> command = new ArrayList<>(Arrays.asList("ocrmypdf","--verbose", "2"));
String sidecarFile = tempOutputFile.toString().replace(".pdf", ".txt");
if (sidecar != null && sidecar) {
sidecarTextPath = Files.createTempFile("sidecar", ".txt");
command.add("--sidecar");
command.add(sidecarFile);
command.add(sidecarTextPath.toString());
}
if (deskew != null && deskew) {
command.add("--deskew");
}
if (clean != null && clean) {
command.add("--clean");
}
if (cleanFinal != null && cleanFinal) {
command.add("--clean-final");
}
if (ocrType != null && !ocrType.equals("")) {
if("skip-text".equals(ocrType)) {
command.add("--skip-text");
} else if("force-ocr".equals(ocrType)) {
command.add("--force-ocr");
} else if("Normal".equals(ocrType)) {
}
}
command.addAll(Arrays.asList("--language", languageOption,
tempInputFile.toString(), tempOutputFile.toString()));
@ -104,9 +130,9 @@ public class OCRController {
zipOut.closeEntry();
// Add text file to the zip
ZipEntry txtEntry = new ZipEntry(sidecarFile);
ZipEntry txtEntry = new ZipEntry(outputFilename.replace(".pdf", ".txt"));
zipOut.putNextEntry(txtEntry);
Files.copy(Paths.get(sidecarFile), zipOut);
Files.copy(sidecarTextPath, zipOut);
zipOut.closeEntry();
}
@ -115,7 +141,7 @@ public class OCRController {
// Clean up the temporary zip file
Files.delete(tempZipFile);
Files.delete(tempOutputFile);
Files.delete(Paths.get(sidecarFile));
Files.delete(sidecarTextPath);
// Return the zip file containing both the PDF and the text file
headers.setContentType(MediaType.APPLICATION_OCTET_STREAM);

View File

@ -73,7 +73,6 @@ public class ConvertImgPDFController {
if (singleImage) {
HttpHeaders headers = new HttpHeaders();
headers.setContentType(MediaType.parseMediaType(getMediaType(imageFormat)));
headers.setCacheControl("must-revalidate, post-check=0, pre-check=0");
ResponseEntity<Resource> response = new ResponseEntity<>(new ByteArrayResource(result), headers, HttpStatus.OK);
return response;
} else {

View File

@ -40,7 +40,7 @@ public class ProcessExecutor {
semaphore.acquire();
try {
System.out.print("Running command: " + String.join(" ", command));
ProcessBuilder processBuilder = new ProcessBuilder(command);
Process process = processBuilder.start();

View File

@ -93,11 +93,22 @@ settings.downloadOption.2=Open in new window
settings.downloadOption.3=Download file
settings.zipThreshold=Zip files when the number of downloaded files exceeds
#OCR
ocr.title=OCR
ocr.header=OCR (Optical Character Recognition)
ocr.selectText.1=Select languages that are to be detected within the PDF (Ones listed are the ones currently detected):
ocr.selectText.2=Produce text file containing OCR text alongside the OCR'ed PDF
ocr.selectText.3=Correct pages were scanned at a skewed angle by rotating them back into place
ocr.selectText.4=Clean page so its less likely that OCR will find text in background noise. (No output change)
ocr.selectText.5=Clean page so its less likely that OCR will find text in background noise, maintains cleanup in output.
ocr.selectText.6=Ignores pages that have interacive text on them, only OCRs pages that are images
ocr.selectText.7=Force OCR, will OCR Every page removing all original text elements
ocr.selectText.8=Normal (Will error if PDF contains text)
ocr.selectText.9=Additional Settings
ocr.selectText.10=OCR Mode
ocr.help=Please read this documentation on how to use this for other languages and/or use not in docker
ocr.credit=This service uses OCRmyPDF and Tesseract for OCR.
ocr.submit=Process PDF with OCR

View File

@ -194,98 +194,106 @@ function toggleDarkMode() {
}
});
async function submitMultiPdfForm(event,url) {
async function submitMultiPdfForm(event, url) {
// Get the selected PDF files
var files = $('#fileInput-input')[0].files;
let files = $('#fileInput-input')[0].files;
// Get the existing form data
var formData = new FormData($('form')[0]);
let formData = new FormData($('form')[0]);
formData.delete('fileInput');
// Show the progress bar
$('#progressBarContainer').show();
// Initialize the progress bar
var progressBar = $('#progressBar');
let progressBar = $('#progressBar');
progressBar.css('width', '0%');
progressBar.attr('aria-valuenow', 0);
progressBar.attr('aria-valuemax', files.length);
// Check the flag in localStorage, default to 4
// Check the flag in localStorage, default to 4
const zipThreshold = parseInt(localStorage.getItem('zipThreshold'), 10) || 4;
const zipFiles = files.length > zipThreshold;
// Initialize JSZip instance if needed
let jszip = null;
if (zipFiles) {
jszip = new JSZip();
jszip = new JSZip();
}
// Submit each PDF file in parallel
var promises = [];
for (var i = 0; i < files.length; i++) {
var promise = new Promise(function(resolve, reject) {
var fileFormData = new FormData();
let promises = [];
for (let i = 0; i < files.length; i++) {
let promise = new Promise(async function(resolve, reject) {
let fileFormData = new FormData();
fileFormData.append('fileInput', files[i]);
for (var pair of formData.entries()) {
for (let pair of formData.entries()) {
fileFormData.append(pair[0], pair[1]);
}
console.log(fileFormData);
fetch(url, {
method: 'POST',
body: fileFormData
}).then(function(response) {
try {
let response = await fetch(url, {
method: 'POST',
body: fileFormData
});
if (!response) {
throw new Error('Received null response for file ' + i);
}
if (!response.ok) {
throw new Error(`Error submitting request for file ${i}: ${response.status} ${response.statusText}`);
}
let contentDisposition = response.headers.get('content-disposition');
let fileName = "file.pdf"
if (!contentDisposition) {
//throw new Error('Content-Disposition header not found for file ' + i);
} else {
fileName = contentDisposition.split('filename=')[1].replace(/"/g, '');
}
console.log('Received response for file ' + i + ': ' + response);
var contentDisposition = response.headers.get('content-disposition');
var fileName = contentDisposition.split('filename=')[1].replace(/"/g, '');
response.blob().then(function (blob) {
if (zipFiles) {
// Add the file to the ZIP archive
jszip.file(fileName, blob);
resolve();
} else {
// Download the file directly
var url = window.URL.createObjectURL(blob);
var a = document.createElement('a');
a.href = url;
a.download = fileName;
document.body.appendChild(a);
a.click();
a.remove();
resolve();
}
});
}).catch(function(error) {
let blob = await response.blob();
if (zipFiles) {
// Add the file to the ZIP archive
jszip.file(fileName, blob);
resolve();
} else {
// Download the file directly
let url = window.URL.createObjectURL(blob);
let a = document.createElement('a');
a.href = url;
a.download = fileName;
document.body.appendChild(a);
a.click();
a.remove();
resolve();
}
} catch (error) {
console.error('Error submitting request for file ' + i + ': ' + error);
// Set default values or fallbacks for error properties
var status = error && error.status || 500;
var statusText = error && error.statusText || 'Internal Server Error';
var message = error && error.message || 'An error occurred while processing your request.';
// Reject the Promise to signal that the request has failed
let status = error && error.status || 500;
let statusText = error && error.statusText || 'Internal Server Error';
let message = error && error.message || 'An error occurred while processing your request.';
// Reject the Promise to signal that the request has failed
reject();
// Redirect to error page with Spring Boot error parameters
var url = '/error?status=' + status + '&error=' + encodeURIComponent(statusText) + '&message=' + encodeURIComponent(message);
// Redirect to error page with Spring Boot error parameters
let url = '/error?status=' + status + '&error=' + encodeURIComponent(statusText) + '&message=' + encodeURIComponent(message);
window.location.href = url;
});
}
});
// Update the progress bar as each request finishes
promise.then(function() {
var progress = ((progressBar.attr('aria-valuenow') / files.length) * 100) + (100 / files.length);
progressBar.css('width', progress + '%');
progressBar.attr('aria-valuenow', parseInt(progressBar.attr('aria-valuenow')) + 1);
updateProgressBar(progressBar, files);
});
promises.push(promise);
}
@ -295,24 +303,33 @@ function toggleDarkMode() {
} catch (error) {
console.error('Error while uploading files: ' + error);
}
// Update the progress bar
progressBar.css('width', '100%');
progressBar.attr('aria-valuenow', files.length);
// After all requests are finished, download the ZIP file if needed
// After all requests are finished, download the ZIP file if needed
if (zipFiles) {
jszip.generateAsync({ type: "blob" }).then(function (content) {
var url = window.URL.createObjectURL(content);
var a = document.createElement('a');
a.href = url;
a.download = "files.zip";
document.body.appendChild(a);
a.click();
a.remove();
});
try {
let content = await jszip.generateAsync({ type: "blob" });
let url = window.URL.createObjectURL(content);
let a = document.createElement('a');
a.href = url;
a.download = "files.zip";
document.body.appendChild(a);
a.click();
a.remove();
} catch (error) {
console.error('Error generating ZIP file: ' + error);
}
}
}
function updateProgressBar(progressBar, files) {
let progress = ((progressBar.attr('aria-valuenow') / files.length) * 100) + (100 / files.length);
progressBar.css('width', progress + '%');
progressBar.attr('aria-valuenow', parseInt(progressBar.attr('aria-valuenow')) + 1);
}

View File

@ -18,36 +18,40 @@
<form action="#" th:action="@{/ocr-pdf}" method="post" enctype="multipart/form-data" class="mb-3">
<div th:replace="~{fragments/common :: fileSelector(name='fileInput', multiple=false, accept='application/pdf')}"></div>
<div class="form-group">
<label for="languages" class="form-label" th:text="#{ocr.selectText.1}"></label>
<div id="languages">
<div th:each="language: ${languages}">
<input type="checkbox" class="form-check-input" th:name="languages" th:value="${language}" th:id="${'language-' + language}" />
<label class="form-check-label" th:for="${'language-' + language}" th:text="${language}"></label>
</div>
</div>
</div>
<div class="form-group">
<label for="languages" class="form-label" th:text="#{ocr.selectText.1}"></label>
<hr>
<div id="languages">
<div th:each="language, iterStat : ${languages}" >
<span th:text=" ${iterStat.index + 1} + '.'"></span>
<input type="checkbox" class="form-check-input" th:name="languages" th:value="${language}" th:id="${'language-' + language}" />
<label class="form-check-label" th:for="${'language-' + language}" th:text=" ${language}"></label>
</div>
</div>
<hr>
</div>
<label for="languages" class="form-label" th:text="#{ocr.selectText.9}"></label>
<div class="form-check">
<input type="checkbox" class="form-check-input" name="sidecar" id="sidecar" />
<label class="form-check-label" for="sidecar" th:text="#{ocr.selectText.2}"></label>
</div>
<div class="form-group">
<div class="form-check">
<input type="checkbox" class="form-check-input" name="deskew" id="deskew" />
<label class="form-check-label" for="deskew" th:text="#{ocr.selectText.2}"></label>
<label class="form-check-label" for="deskew" th:text="#{ocr.selectText.3}"></label>
</div>
<div class="form-group">
<div class="form-check">
<input type="checkbox" class="form-check-input" name="clean" id="clean" />
<label class="form-check-label" for="clean" th:text="#{ocr.selectText.2}"></label>
<label class="form-check-label" for="clean" th:text="#{ocr.selectText.4}"></label>
</div>
<div class="form-group">
<div class="form-check">
<input type="checkbox" class="form-check-input" name="clean-final" id="clean-final" />
<label class="form-check-label" for="clean-final" th:text="#{ocr.selectText.2}"></label>
<label class="form-check-label" for="clean-final" th:text="#{ocr.selectText.5}"></label>
</div>
<div class="form-group">
<label th:text="#{pdfToImage.selectText}"></label>
<label th:text="#{ocr.selectText.10}"></label>
<select class="form-control" name="ocrType">
<option value="skip-text">Ignores pages that have interacive text on them, only OCRs pages that are images</option>
<option value="force-ocr">Force OCR, will OCR Every page removing all original text</option>
<option value="Normal">Normal (Will error if contains text)</option>
<option value="skip-text" th:text="#{ocr.selectText.6}"></option>
<option value="force-ocr" th:text="#{ocr.selectText.7}"></option>
<option value="Normal" th:text="#{ocr.selectText.8}"></option>
</select>
</div>
<button type="submit" class="btn btn-primary" th:text="#{ocr.submit}"></button>