mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-09-03 17:52:30 +02:00
ocr changes
This commit is contained in:
parent
078d17374c
commit
d575372b53
@ -47,9 +47,13 @@ public class OCRController {
|
||||
@PostMapping("/ocr-pdf")
|
||||
public ResponseEntity<byte[]> processPdfWithOCR(@RequestParam("fileInput") MultipartFile inputFile,
|
||||
@RequestParam("languages") List<String> selectedLanguages,
|
||||
@RequestParam(name = "sidecar", required = false) Boolean sidecar) throws IOException, InterruptedException {
|
||||
|
||||
@RequestParam(name = "sidecar", required = false) Boolean sidecar,
|
||||
@RequestParam(name = "deskew", required = false) Boolean deskew,
|
||||
@RequestParam(name = "clean", required = false) Boolean clean,
|
||||
@RequestParam(name = "clean-final", required = false) Boolean cleanFinal,
|
||||
@RequestParam(name = "ocrType", required = false) String ocrType) throws IOException, InterruptedException {
|
||||
|
||||
|
||||
//--output-type pdfa
|
||||
if (selectedLanguages == null || selectedLanguages.size() < 1) {
|
||||
throw new IOException("Please select at least one language.");
|
||||
@ -62,18 +66,40 @@ public class OCRController {
|
||||
// Prepare the output file path
|
||||
Path tempOutputFile = Files.createTempFile("output_", ".pdf");
|
||||
|
||||
// Prepare the output file path
|
||||
Path sidecarTextPath = null;
|
||||
|
||||
// Run OCR Command
|
||||
String languageOption = String.join("+", selectedLanguages);
|
||||
|
||||
List<String> command = new ArrayList<>(Arrays.asList("ocrmypdf","--verbose", "2"));
|
||||
|
||||
|
||||
String sidecarFile = tempOutputFile.toString().replace(".pdf", ".txt");
|
||||
if (sidecar != null && sidecar) {
|
||||
sidecarTextPath = Files.createTempFile("sidecar", ".txt");
|
||||
command.add("--sidecar");
|
||||
command.add(sidecarFile);
|
||||
command.add(sidecarTextPath.toString());
|
||||
}
|
||||
|
||||
if (deskew != null && deskew) {
|
||||
command.add("--deskew");
|
||||
}
|
||||
if (clean != null && clean) {
|
||||
command.add("--clean");
|
||||
}
|
||||
if (cleanFinal != null && cleanFinal) {
|
||||
command.add("--clean-final");
|
||||
}
|
||||
if (ocrType != null && !ocrType.equals("")) {
|
||||
if("skip-text".equals(ocrType)) {
|
||||
command.add("--skip-text");
|
||||
} else if("force-ocr".equals(ocrType)) {
|
||||
command.add("--force-ocr");
|
||||
} else if("Normal".equals(ocrType)) {
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
command.addAll(Arrays.asList("--language", languageOption,
|
||||
tempInputFile.toString(), tempOutputFile.toString()));
|
||||
|
||||
@ -104,9 +130,9 @@ public class OCRController {
|
||||
zipOut.closeEntry();
|
||||
|
||||
// Add text file to the zip
|
||||
ZipEntry txtEntry = new ZipEntry(sidecarFile);
|
||||
ZipEntry txtEntry = new ZipEntry(outputFilename.replace(".pdf", ".txt"));
|
||||
zipOut.putNextEntry(txtEntry);
|
||||
Files.copy(Paths.get(sidecarFile), zipOut);
|
||||
Files.copy(sidecarTextPath, zipOut);
|
||||
zipOut.closeEntry();
|
||||
}
|
||||
|
||||
@ -115,7 +141,7 @@ public class OCRController {
|
||||
// Clean up the temporary zip file
|
||||
Files.delete(tempZipFile);
|
||||
Files.delete(tempOutputFile);
|
||||
Files.delete(Paths.get(sidecarFile));
|
||||
Files.delete(sidecarTextPath);
|
||||
|
||||
// Return the zip file containing both the PDF and the text file
|
||||
headers.setContentType(MediaType.APPLICATION_OCTET_STREAM);
|
||||
|
@ -73,7 +73,6 @@ public class ConvertImgPDFController {
|
||||
if (singleImage) {
|
||||
HttpHeaders headers = new HttpHeaders();
|
||||
headers.setContentType(MediaType.parseMediaType(getMediaType(imageFormat)));
|
||||
headers.setCacheControl("must-revalidate, post-check=0, pre-check=0");
|
||||
ResponseEntity<Resource> response = new ResponseEntity<>(new ByteArrayResource(result), headers, HttpStatus.OK);
|
||||
return response;
|
||||
} else {
|
||||
|
@ -40,7 +40,7 @@ public class ProcessExecutor {
|
||||
semaphore.acquire();
|
||||
try {
|
||||
|
||||
|
||||
System.out.print("Running command: " + String.join(" ", command));
|
||||
ProcessBuilder processBuilder = new ProcessBuilder(command);
|
||||
Process process = processBuilder.start();
|
||||
|
||||
|
@ -93,11 +93,22 @@ settings.downloadOption.2=Open in new window
|
||||
settings.downloadOption.3=Download file
|
||||
settings.zipThreshold=Zip files when the number of downloaded files exceeds
|
||||
|
||||
|
||||
|
||||
|
||||
#OCR
|
||||
ocr.title=OCR
|
||||
ocr.header=OCR (Optical Character Recognition)
|
||||
ocr.selectText.1=Select languages that are to be detected within the PDF (Ones listed are the ones currently detected):
|
||||
ocr.selectText.2=Produce text file containing OCR text alongside the OCR'ed PDF
|
||||
ocr.selectText.3=Correct pages that were scanned at a skewed angle by rotating them back into place
|
||||
ocr.selectText.4=Clean page so it is less likely that OCR will find text in background noise. (No output change)
|
||||
ocr.selectText.5=Clean page so it is less likely that OCR will find text in background noise, maintains cleanup in output.
|
||||
ocr.selectText.6=Ignores pages that have interactive text on them, only OCRs pages that are images
|
||||
ocr.selectText.7=Force OCR, will OCR Every page removing all original text elements
|
||||
ocr.selectText.8=Normal (Will error if PDF contains text)
|
||||
ocr.selectText.9=Additional Settings
|
||||
ocr.selectText.10=OCR Mode
|
||||
ocr.help=Please read this documentation on how to use this for other languages and/or use not in docker
|
||||
ocr.credit=This service uses OCRmyPDF and Tesseract for OCR.
|
||||
ocr.submit=Process PDF with OCR
|
||||
|
@ -194,98 +194,106 @@ function toggleDarkMode() {
|
||||
}
|
||||
});
|
||||
|
||||
async function submitMultiPdfForm(event,url) {
|
||||
async function submitMultiPdfForm(event, url) {
|
||||
// Get the selected PDF files
|
||||
var files = $('#fileInput-input')[0].files;
|
||||
let files = $('#fileInput-input')[0].files;
|
||||
|
||||
// Get the existing form data
|
||||
var formData = new FormData($('form')[0]);
|
||||
let formData = new FormData($('form')[0]);
|
||||
formData.delete('fileInput');
|
||||
|
||||
|
||||
// Show the progress bar
|
||||
$('#progressBarContainer').show();
|
||||
|
||||
// Initialize the progress bar
|
||||
var progressBar = $('#progressBar');
|
||||
let progressBar = $('#progressBar');
|
||||
progressBar.css('width', '0%');
|
||||
progressBar.attr('aria-valuenow', 0);
|
||||
progressBar.attr('aria-valuemax', files.length);
|
||||
|
||||
// Check the flag in localStorage, default to 4
|
||||
|
||||
// Check the flag in localStorage, default to 4
|
||||
const zipThreshold = parseInt(localStorage.getItem('zipThreshold'), 10) || 4;
|
||||
const zipFiles = files.length > zipThreshold;
|
||||
|
||||
// Initialize JSZip instance if needed
|
||||
let jszip = null;
|
||||
if (zipFiles) {
|
||||
jszip = new JSZip();
|
||||
jszip = new JSZip();
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Submit each PDF file in parallel
|
||||
var promises = [];
|
||||
for (var i = 0; i < files.length; i++) {
|
||||
var promise = new Promise(function(resolve, reject) {
|
||||
var fileFormData = new FormData();
|
||||
let promises = [];
|
||||
for (let i = 0; i < files.length; i++) {
|
||||
let promise = new Promise(async function(resolve, reject) {
|
||||
let fileFormData = new FormData();
|
||||
fileFormData.append('fileInput', files[i]);
|
||||
for (var pair of formData.entries()) {
|
||||
for (let pair of formData.entries()) {
|
||||
fileFormData.append(pair[0], pair[1]);
|
||||
}
|
||||
console.log(fileFormData);
|
||||
|
||||
fetch(url, {
|
||||
method: 'POST',
|
||||
body: fileFormData
|
||||
}).then(function(response) {
|
||||
try {
|
||||
let response = await fetch(url, {
|
||||
method: 'POST',
|
||||
body: fileFormData
|
||||
});
|
||||
|
||||
if (!response) {
|
||||
throw new Error('Received null response for file ' + i);
|
||||
}
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`Error submitting request for file ${i}: ${response.status} ${response.statusText}`);
|
||||
}
|
||||
|
||||
let contentDisposition = response.headers.get('content-disposition');
|
||||
let fileName = "file.pdf"
|
||||
if (!contentDisposition) {
|
||||
//throw new Error('Content-Disposition header not found for file ' + i);
|
||||
} else {
|
||||
fileName = contentDisposition.split('filename=')[1].replace(/"/g, '');
|
||||
}
|
||||
console.log('Received response for file ' + i + ': ' + response);
|
||||
|
||||
var contentDisposition = response.headers.get('content-disposition');
|
||||
var fileName = contentDisposition.split('filename=')[1].replace(/"/g, '');
|
||||
|
||||
|
||||
response.blob().then(function (blob) {
|
||||
if (zipFiles) {
|
||||
// Add the file to the ZIP archive
|
||||
jszip.file(fileName, blob);
|
||||
resolve();
|
||||
} else {
|
||||
// Download the file directly
|
||||
var url = window.URL.createObjectURL(blob);
|
||||
var a = document.createElement('a');
|
||||
a.href = url;
|
||||
a.download = fileName;
|
||||
document.body.appendChild(a);
|
||||
a.click();
|
||||
a.remove();
|
||||
resolve();
|
||||
}
|
||||
});
|
||||
|
||||
}).catch(function(error) {
|
||||
let blob = await response.blob();
|
||||
if (zipFiles) {
|
||||
// Add the file to the ZIP archive
|
||||
jszip.file(fileName, blob);
|
||||
resolve();
|
||||
} else {
|
||||
// Download the file directly
|
||||
let url = window.URL.createObjectURL(blob);
|
||||
let a = document.createElement('a');
|
||||
a.href = url;
|
||||
a.download = fileName;
|
||||
document.body.appendChild(a);
|
||||
a.click();
|
||||
a.remove();
|
||||
resolve();
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Error submitting request for file ' + i + ': ' + error);
|
||||
|
||||
|
||||
// Set default values or fallbacks for error properties
|
||||
var status = error && error.status || 500;
|
||||
var statusText = error && error.statusText || 'Internal Server Error';
|
||||
var message = error && error.message || 'An error occurred while processing your request.';
|
||||
|
||||
// Reject the Promise to signal that the request has failed
|
||||
let status = error && error.status || 500;
|
||||
let statusText = error && error.statusText || 'Internal Server Error';
|
||||
let message = error && error.message || 'An error occurred while processing your request.';
|
||||
|
||||
// Reject the Promise to signal that the request has failed
|
||||
reject();
|
||||
// Redirect to error page with Spring Boot error parameters
|
||||
var url = '/error?status=' + status + '&error=' + encodeURIComponent(statusText) + '&message=' + encodeURIComponent(message);
|
||||
// Redirect to error page with Spring Boot error parameters
|
||||
let url = '/error?status=' + status + '&error=' + encodeURIComponent(statusText) + '&message=' + encodeURIComponent(message);
|
||||
window.location.href = url;
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
// Update the progress bar as each request finishes
|
||||
promise.then(function() {
|
||||
var progress = ((progressBar.attr('aria-valuenow') / files.length) * 100) + (100 / files.length);
|
||||
progressBar.css('width', progress + '%');
|
||||
progressBar.attr('aria-valuenow', parseInt(progressBar.attr('aria-valuenow')) + 1);
|
||||
updateProgressBar(progressBar, files);
|
||||
});
|
||||
|
||||
|
||||
promises.push(promise);
|
||||
}
|
||||
|
||||
@ -295,24 +303,33 @@ function toggleDarkMode() {
|
||||
} catch (error) {
|
||||
console.error('Error while uploading files: ' + error);
|
||||
}
|
||||
|
||||
|
||||
// Update the progress bar
|
||||
progressBar.css('width', '100%');
|
||||
progressBar.attr('aria-valuenow', files.length);
|
||||
|
||||
// After all requests are finished, download the ZIP file if needed
|
||||
|
||||
// After all requests are finished, download the ZIP file if needed
|
||||
if (zipFiles) {
|
||||
jszip.generateAsync({ type: "blob" }).then(function (content) {
|
||||
var url = window.URL.createObjectURL(content);
|
||||
var a = document.createElement('a');
|
||||
a.href = url;
|
||||
a.download = "files.zip";
|
||||
document.body.appendChild(a);
|
||||
a.click();
|
||||
a.remove();
|
||||
});
|
||||
try {
|
||||
let content = await jszip.generateAsync({ type: "blob" });
|
||||
let url = window.URL.createObjectURL(content);
|
||||
let a = document.createElement('a');
|
||||
a.href = url;
|
||||
a.download = "files.zip";
|
||||
document.body.appendChild(a);
|
||||
a.click();
|
||||
a.remove();
|
||||
} catch (error) {
|
||||
console.error('Error generating ZIP file: ' + error);
|
||||
}
|
||||
}
|
||||
}
|
||||
/**
 * Advances the upload progress bar by one finished file.
 *
 * Reads the current 'aria-valuenow' attribute as the number of files already
 * counted, sets the bar width to the percentage that results after one more
 * file, and then stores the incremented count back in 'aria-valuenow'.
 *
 * @param progressBar object exposing attr(name[, value]) and css(name, value),
 *                    as called below (a jQuery-wrapped element in the caller)
 * @param files       collection whose length is the total number of files
 */
function updateProgressBar(progressBar, files) {
    const countedSoFar = progressBar.attr('aria-valuenow');
    const total = files.length;

    // Share already counted, plus the share contributed by the file that just finished.
    const widthPercent = ((countedSoFar / total) * 100) + (100 / total);
    progressBar.css('width', widthPercent + '%');

    const nextCount = parseInt(countedSoFar) + 1;
    progressBar.attr('aria-valuenow', nextCount);
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -18,36 +18,40 @@
|
||||
<form action="#" th:action="@{/ocr-pdf}" method="post" enctype="multipart/form-data" class="mb-3">
|
||||
<div th:replace="~{fragments/common :: fileSelector(name='fileInput', multiple=false, accept='application/pdf')}"></div>
|
||||
<div class="form-group">
|
||||
<label for="languages" class="form-label" th:text="#{ocr.selectText.1}"></label>
|
||||
<div id="languages">
|
||||
<div th:each="language: ${languages}">
|
||||
<input type="checkbox" class="form-check-input" th:name="languages" th:value="${language}" th:id="${'language-' + language}" />
|
||||
<label class="form-check-label" th:for="${'language-' + language}" th:text="${language}"></label>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label for="languages" class="form-label" th:text="#{ocr.selectText.1}"></label>
|
||||
<hr>
|
||||
<div id="languages">
|
||||
<div th:each="language, iterStat : ${languages}" >
|
||||
<span th:text=" ${iterStat.index + 1} + '.'"></span>
|
||||
<input type="checkbox" class="form-check-input" th:name="languages" th:value="${language}" th:id="${'language-' + language}" />
|
||||
<label class="form-check-label" th:for="${'language-' + language}" th:text=" ${language}"></label>
|
||||
</div>
|
||||
</div>
|
||||
<hr>
|
||||
</div>
|
||||
<label for="languages" class="form-label" th:text="#{ocr.selectText.9}"></label>
|
||||
<div class="form-check">
|
||||
<input type="checkbox" class="form-check-input" name="sidecar" id="sidecar" />
|
||||
<label class="form-check-label" for="sidecar" th:text="#{ocr.selectText.2}"></label>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<div class="form-check">
|
||||
<input type="checkbox" class="form-check-input" name="deskew" id="deskew" />
|
||||
<label class="form-check-label" for="deskew" th:text="#{ocr.selectText.2}"></label>
|
||||
<label class="form-check-label" for="deskew" th:text="#{ocr.selectText.3}"></label>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<div class="form-check">
|
||||
<input type="checkbox" class="form-check-input" name="clean" id="clean" />
|
||||
<label class="form-check-label" for="clean" th:text="#{ocr.selectText.2}"></label>
|
||||
<label class="form-check-label" for="clean" th:text="#{ocr.selectText.4}"></label>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<div class="form-check">
|
||||
<input type="checkbox" class="form-check-input" name="clean-final" id="clean-final" />
|
||||
<label class="form-check-label" for="clean-final" th:text="#{ocr.selectText.2}"></label>
|
||||
<label class="form-check-label" for="clean-final" th:text="#{ocr.selectText.5}"></label>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label th:text="#{pdfToImage.selectText}"></label>
|
||||
<label th:text="#{ocr.selectText.10}"></label>
|
||||
<select class="form-control" name="ocrType">
|
||||
<option value="skip-text">Ignores pages that have interacive text on them, only OCRs pages that are images</option>
|
||||
<option value="force-ocr">Force OCR, will OCR Every page removing all original text</option>
|
||||
<option value="Normal">Normal (Will error if contains text)</option>
|
||||
<option value="skip-text" th:text="#{ocr.selectText.6}"></option>
|
||||
<option value="force-ocr" th:text="#{ocr.selectText.7}"></option>
|
||||
<option value="Normal" th:text="#{ocr.selectText.8}"></option>
|
||||
</select>
|
||||
</div>
|
||||
<button type="submit" class="btn btn-primary" th:text="#{ocr.submit}"></button>
|
||||
|
Loading…
Reference in New Issue
Block a user