mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-09-03 17:52:30 +02:00
ocr changes
This commit is contained in:
parent
078d17374c
commit
d575372b53
@ -47,9 +47,13 @@ public class OCRController {
|
|||||||
@PostMapping("/ocr-pdf")
|
@PostMapping("/ocr-pdf")
|
||||||
public ResponseEntity<byte[]> processPdfWithOCR(@RequestParam("fileInput") MultipartFile inputFile,
|
public ResponseEntity<byte[]> processPdfWithOCR(@RequestParam("fileInput") MultipartFile inputFile,
|
||||||
@RequestParam("languages") List<String> selectedLanguages,
|
@RequestParam("languages") List<String> selectedLanguages,
|
||||||
@RequestParam(name = "sidecar", required = false) Boolean sidecar) throws IOException, InterruptedException {
|
@RequestParam(name = "sidecar", required = false) Boolean sidecar,
|
||||||
|
@RequestParam(name = "deskew", required = false) Boolean deskew,
|
||||||
|
@RequestParam(name = "clean", required = false) Boolean clean,
|
||||||
|
@RequestParam(name = "clean-final", required = false) Boolean cleanFinal,
|
||||||
|
@RequestParam(name = "ocrType", required = false) String ocrType) throws IOException, InterruptedException {
|
||||||
|
|
||||||
|
|
||||||
//--output-type pdfa
|
//--output-type pdfa
|
||||||
if (selectedLanguages == null || selectedLanguages.size() < 1) {
|
if (selectedLanguages == null || selectedLanguages.size() < 1) {
|
||||||
throw new IOException("Please select at least one language.");
|
throw new IOException("Please select at least one language.");
|
||||||
@ -62,18 +66,40 @@ public class OCRController {
|
|||||||
// Prepare the output file path
|
// Prepare the output file path
|
||||||
Path tempOutputFile = Files.createTempFile("output_", ".pdf");
|
Path tempOutputFile = Files.createTempFile("output_", ".pdf");
|
||||||
|
|
||||||
|
// Path of the optional sidecar text file (only created when sidecar output is requested)
|
||||||
|
Path sidecarTextPath = null;
|
||||||
|
|
||||||
// Run OCR Command
|
// Run OCR Command
|
||||||
String languageOption = String.join("+", selectedLanguages);
|
String languageOption = String.join("+", selectedLanguages);
|
||||||
|
|
||||||
List<String> command = new ArrayList<>(Arrays.asList("ocrmypdf","--verbose", "2"));
|
List<String> command = new ArrayList<>(Arrays.asList("ocrmypdf","--verbose", "2"));
|
||||||
|
|
||||||
|
|
||||||
String sidecarFile = tempOutputFile.toString().replace(".pdf", ".txt");
|
|
||||||
if (sidecar != null && sidecar) {
|
if (sidecar != null && sidecar) {
|
||||||
|
sidecarTextPath = Files.createTempFile("sidecar", ".txt");
|
||||||
command.add("--sidecar");
|
command.add("--sidecar");
|
||||||
command.add(sidecarFile);
|
command.add(sidecarTextPath.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (deskew != null && deskew) {
|
||||||
|
command.add("--deskew");
|
||||||
|
}
|
||||||
|
if (clean != null && clean) {
|
||||||
|
command.add("--clean");
|
||||||
|
}
|
||||||
|
if (cleanFinal != null && cleanFinal) {
|
||||||
|
command.add("--clean-final");
|
||||||
|
}
|
||||||
|
if (ocrType != null && !ocrType.equals("")) {
|
||||||
|
if("skip-text".equals(ocrType)) {
|
||||||
|
command.add("--skip-text");
|
||||||
|
} else if("force-ocr".equals(ocrType)) {
|
||||||
|
command.add("--force-ocr");
|
||||||
|
} else if("Normal".equals(ocrType)) {
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
command.addAll(Arrays.asList("--language", languageOption,
|
command.addAll(Arrays.asList("--language", languageOption,
|
||||||
tempInputFile.toString(), tempOutputFile.toString()));
|
tempInputFile.toString(), tempOutputFile.toString()));
|
||||||
|
|
||||||
@ -104,9 +130,9 @@ public class OCRController {
|
|||||||
zipOut.closeEntry();
|
zipOut.closeEntry();
|
||||||
|
|
||||||
// Add text file to the zip
|
// Add text file to the zip
|
||||||
ZipEntry txtEntry = new ZipEntry(sidecarFile);
|
ZipEntry txtEntry = new ZipEntry(outputFilename.replace(".pdf", ".txt"));
|
||||||
zipOut.putNextEntry(txtEntry);
|
zipOut.putNextEntry(txtEntry);
|
||||||
Files.copy(Paths.get(sidecarFile), zipOut);
|
Files.copy(sidecarTextPath, zipOut);
|
||||||
zipOut.closeEntry();
|
zipOut.closeEntry();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -115,7 +141,7 @@ public class OCRController {
|
|||||||
// Clean up the temporary zip file
|
// Clean up the temporary zip file
|
||||||
Files.delete(tempZipFile);
|
Files.delete(tempZipFile);
|
||||||
Files.delete(tempOutputFile);
|
Files.delete(tempOutputFile);
|
||||||
Files.delete(Paths.get(sidecarFile));
|
Files.delete(sidecarTextPath);
|
||||||
|
|
||||||
// Return the zip file containing both the PDF and the text file
|
// Return the zip file containing both the PDF and the text file
|
||||||
headers.setContentType(MediaType.APPLICATION_OCTET_STREAM);
|
headers.setContentType(MediaType.APPLICATION_OCTET_STREAM);
|
||||||
|
@ -73,7 +73,6 @@ public class ConvertImgPDFController {
|
|||||||
if (singleImage) {
|
if (singleImage) {
|
||||||
HttpHeaders headers = new HttpHeaders();
|
HttpHeaders headers = new HttpHeaders();
|
||||||
headers.setContentType(MediaType.parseMediaType(getMediaType(imageFormat)));
|
headers.setContentType(MediaType.parseMediaType(getMediaType(imageFormat)));
|
||||||
headers.setCacheControl("must-revalidate, post-check=0, pre-check=0");
|
|
||||||
ResponseEntity<Resource> response = new ResponseEntity<>(new ByteArrayResource(result), headers, HttpStatus.OK);
|
ResponseEntity<Resource> response = new ResponseEntity<>(new ByteArrayResource(result), headers, HttpStatus.OK);
|
||||||
return response;
|
return response;
|
||||||
} else {
|
} else {
|
||||||
|
@ -40,7 +40,7 @@ public class ProcessExecutor {
|
|||||||
semaphore.acquire();
|
semaphore.acquire();
|
||||||
try {
|
try {
|
||||||
|
|
||||||
|
System.out.print("Running command: " + String.join(" ", command));
|
||||||
ProcessBuilder processBuilder = new ProcessBuilder(command);
|
ProcessBuilder processBuilder = new ProcessBuilder(command);
|
||||||
Process process = processBuilder.start();
|
Process process = processBuilder.start();
|
||||||
|
|
||||||
|
@ -93,11 +93,22 @@ settings.downloadOption.2=Open in new window
|
|||||||
settings.downloadOption.3=Download file
|
settings.downloadOption.3=Download file
|
||||||
settings.zipThreshold=Zip files when the number of downloaded files exceeds
|
settings.zipThreshold=Zip files when the number of downloaded files exceeds
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#OCR
|
#OCR
|
||||||
ocr.title=OCR
|
ocr.title=OCR
|
||||||
ocr.header=OCR (Optical Character Recognition)
|
ocr.header=OCR (Optical Character Recognition)
|
||||||
ocr.selectText.1=Select languages that are to be detected within the PDF (Ones listed are the ones currently detected):
|
ocr.selectText.1=Select languages that are to be detected within the PDF (Ones listed are the ones currently detected):
|
||||||
ocr.selectText.2=Produce text file containing OCR text alongside the OCR'ed PDF
|
ocr.selectText.2=Produce text file containing OCR text alongside the OCR'ed PDF
|
||||||
|
ocr.selectText.3=Correct pages that were scanned at a skewed angle by rotating them back into place
|
||||||
|
ocr.selectText.4=Clean page so it's less likely that OCR will find text in background noise. (No output change)
|
||||||
|
ocr.selectText.5=Clean page so it's less likely that OCR will find text in background noise, maintains cleanup in output.
|
||||||
|
ocr.selectText.6=Ignores pages that have interactive text on them, only OCRs pages that are images
|
||||||
|
ocr.selectText.7=Force OCR, will OCR Every page removing all original text elements
|
||||||
|
ocr.selectText.8=Normal (Will error if PDF contains text)
|
||||||
|
ocr.selectText.9=Additional Settings
|
||||||
|
ocr.selectText.10=OCR Mode
|
||||||
ocr.help=Please read this documentation on how to use this for other languages and/or use not in docker
|
ocr.help=Please read this documentation on how to use this for other languages and/or use not in docker
|
||||||
ocr.credit=This service uses OCRmyPDF and Tesseract for OCR.
|
ocr.credit=This service uses OCRmyPDF and Tesseract for OCR.
|
||||||
ocr.submit=Process PDF with OCR
|
ocr.submit=Process PDF with OCR
|
||||||
|
@ -194,98 +194,106 @@ function toggleDarkMode() {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
async function submitMultiPdfForm(event,url) {
|
async function submitMultiPdfForm(event, url) {
|
||||||
// Get the selected PDF files
|
// Get the selected PDF files
|
||||||
var files = $('#fileInput-input')[0].files;
|
let files = $('#fileInput-input')[0].files;
|
||||||
|
|
||||||
// Get the existing form data
|
// Get the existing form data
|
||||||
var formData = new FormData($('form')[0]);
|
let formData = new FormData($('form')[0]);
|
||||||
formData.delete('fileInput');
|
formData.delete('fileInput');
|
||||||
|
|
||||||
// Show the progress bar
|
// Show the progress bar
|
||||||
$('#progressBarContainer').show();
|
$('#progressBarContainer').show();
|
||||||
|
|
||||||
// Initialize the progress bar
|
// Initialize the progress bar
|
||||||
var progressBar = $('#progressBar');
|
let progressBar = $('#progressBar');
|
||||||
progressBar.css('width', '0%');
|
progressBar.css('width', '0%');
|
||||||
progressBar.attr('aria-valuenow', 0);
|
progressBar.attr('aria-valuenow', 0);
|
||||||
progressBar.attr('aria-valuemax', files.length);
|
progressBar.attr('aria-valuemax', files.length);
|
||||||
|
|
||||||
// Check the flag in localStorage, default to 4
|
// Check the flag in localStorage, default to 4
|
||||||
const zipThreshold = parseInt(localStorage.getItem('zipThreshold'), 10) || 4;
|
const zipThreshold = parseInt(localStorage.getItem('zipThreshold'), 10) || 4;
|
||||||
const zipFiles = files.length > zipThreshold;
|
const zipFiles = files.length > zipThreshold;
|
||||||
|
|
||||||
// Initialize JSZip instance if needed
|
// Initialize JSZip instance if needed
|
||||||
let jszip = null;
|
let jszip = null;
|
||||||
if (zipFiles) {
|
if (zipFiles) {
|
||||||
jszip = new JSZip();
|
jszip = new JSZip();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Submit each PDF file in parallel
|
// Submit each PDF file in parallel
|
||||||
var promises = [];
|
let promises = [];
|
||||||
for (var i = 0; i < files.length; i++) {
|
for (let i = 0; i < files.length; i++) {
|
||||||
var promise = new Promise(function(resolve, reject) {
|
let promise = new Promise(async function(resolve, reject) {
|
||||||
var fileFormData = new FormData();
|
let fileFormData = new FormData();
|
||||||
fileFormData.append('fileInput', files[i]);
|
fileFormData.append('fileInput', files[i]);
|
||||||
for (var pair of formData.entries()) {
|
for (let pair of formData.entries()) {
|
||||||
fileFormData.append(pair[0], pair[1]);
|
fileFormData.append(pair[0], pair[1]);
|
||||||
}
|
}
|
||||||
console.log(fileFormData);
|
console.log(fileFormData);
|
||||||
|
|
||||||
fetch(url, {
|
try {
|
||||||
method: 'POST',
|
let response = await fetch(url, {
|
||||||
body: fileFormData
|
method: 'POST',
|
||||||
}).then(function(response) {
|
body: fileFormData
|
||||||
|
});
|
||||||
|
|
||||||
if (!response) {
|
if (!response) {
|
||||||
throw new Error('Received null response for file ' + i);
|
throw new Error('Received null response for file ' + i);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!response.ok) {
|
||||||
|
throw new Error(`Error submitting request for file ${i}: ${response.status} ${response.statusText}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
let contentDisposition = response.headers.get('content-disposition');
|
||||||
|
let fileName = "file.pdf"
|
||||||
|
if (!contentDisposition) {
|
||||||
|
//throw new Error('Content-Disposition header not found for file ' + i);
|
||||||
|
} else {
|
||||||
|
fileName = contentDisposition.split('filename=')[1].replace(/"/g, '');
|
||||||
|
}
|
||||||
console.log('Received response for file ' + i + ': ' + response);
|
console.log('Received response for file ' + i + ': ' + response);
|
||||||
|
|
||||||
var contentDisposition = response.headers.get('content-disposition');
|
|
||||||
var fileName = contentDisposition.split('filename=')[1].replace(/"/g, '');
|
|
||||||
|
|
||||||
response.blob().then(function (blob) {
|
let blob = await response.blob();
|
||||||
if (zipFiles) {
|
if (zipFiles) {
|
||||||
// Add the file to the ZIP archive
|
// Add the file to the ZIP archive
|
||||||
jszip.file(fileName, blob);
|
jszip.file(fileName, blob);
|
||||||
resolve();
|
resolve();
|
||||||
} else {
|
} else {
|
||||||
// Download the file directly
|
// Download the file directly
|
||||||
var url = window.URL.createObjectURL(blob);
|
let url = window.URL.createObjectURL(blob);
|
||||||
var a = document.createElement('a');
|
let a = document.createElement('a');
|
||||||
a.href = url;
|
a.href = url;
|
||||||
a.download = fileName;
|
a.download = fileName;
|
||||||
document.body.appendChild(a);
|
document.body.appendChild(a);
|
||||||
a.click();
|
a.click();
|
||||||
a.remove();
|
a.remove();
|
||||||
resolve();
|
resolve();
|
||||||
}
|
}
|
||||||
});
|
} catch (error) {
|
||||||
|
|
||||||
}).catch(function(error) {
|
|
||||||
console.error('Error submitting request for file ' + i + ': ' + error);
|
console.error('Error submitting request for file ' + i + ': ' + error);
|
||||||
|
|
||||||
// Set default values or fallbacks for error properties
|
// Set default values or fallbacks for error properties
|
||||||
var status = error && error.status || 500;
|
let status = error && error.status || 500;
|
||||||
var statusText = error && error.statusText || 'Internal Server Error';
|
let statusText = error && error.statusText || 'Internal Server Error';
|
||||||
var message = error && error.message || 'An error occurred while processing your request.';
|
let message = error && error.message || 'An error occurred while processing your request.';
|
||||||
|
|
||||||
// Reject the Promise to signal that the request has failed
|
// Reject the Promise to signal that the request has failed
|
||||||
reject();
|
reject();
|
||||||
// Redirect to error page with Spring Boot error parameters
|
// Redirect to error page with Spring Boot error parameters
|
||||||
var url = '/error?status=' + status + '&error=' + encodeURIComponent(statusText) + '&message=' + encodeURIComponent(message);
|
let url = '/error?status=' + status + '&error=' + encodeURIComponent(statusText) + '&message=' + encodeURIComponent(message);
|
||||||
window.location.href = url;
|
window.location.href = url;
|
||||||
});
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
// Update the progress bar as each request finishes
|
// Update the progress bar as each request finishes
|
||||||
promise.then(function() {
|
promise.then(function() {
|
||||||
var progress = ((progressBar.attr('aria-valuenow') / files.length) * 100) + (100 / files.length);
|
updateProgressBar(progressBar, files);
|
||||||
progressBar.css('width', progress + '%');
|
|
||||||
progressBar.attr('aria-valuenow', parseInt(progressBar.attr('aria-valuenow')) + 1);
|
|
||||||
});
|
});
|
||||||
|
|
||||||
promises.push(promise);
|
promises.push(promise);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -295,24 +303,33 @@ function toggleDarkMode() {
|
|||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error('Error while uploading files: ' + error);
|
console.error('Error while uploading files: ' + error);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Update the progress bar
|
// Update the progress bar
|
||||||
progressBar.css('width', '100%');
|
progressBar.css('width', '100%');
|
||||||
progressBar.attr('aria-valuenow', files.length);
|
progressBar.attr('aria-valuenow', files.length);
|
||||||
|
|
||||||
// After all requests are finished, download the ZIP file if needed
|
// After all requests are finished, download the ZIP file if needed
|
||||||
if (zipFiles) {
|
if (zipFiles) {
|
||||||
jszip.generateAsync({ type: "blob" }).then(function (content) {
|
try {
|
||||||
var url = window.URL.createObjectURL(content);
|
let content = await jszip.generateAsync({ type: "blob" });
|
||||||
var a = document.createElement('a');
|
let url = window.URL.createObjectURL(content);
|
||||||
a.href = url;
|
let a = document.createElement('a');
|
||||||
a.download = "files.zip";
|
a.href = url;
|
||||||
document.body.appendChild(a);
|
a.download = "files.zip";
|
||||||
a.click();
|
document.body.appendChild(a);
|
||||||
a.remove();
|
a.click();
|
||||||
});
|
a.remove();
|
||||||
|
} catch (error) {
|
||||||
|
console.error('Error generating ZIP file: ' + error);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
function updateProgressBar(progressBar, files) {
|
||||||
|
let progress = ((progressBar.attr('aria-valuenow') / files.length) * 100) + (100 / files.length);
|
||||||
|
progressBar.css('width', progress + '%');
|
||||||
|
progressBar.attr('aria-valuenow', parseInt(progressBar.attr('aria-valuenow')) + 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -18,36 +18,40 @@
|
|||||||
<form action="#" th:action="@{/ocr-pdf}" method="post" enctype="multipart/form-data" class="mb-3">
|
<form action="#" th:action="@{/ocr-pdf}" method="post" enctype="multipart/form-data" class="mb-3">
|
||||||
<div th:replace="~{fragments/common :: fileSelector(name='fileInput', multiple=false, accept='application/pdf')}"></div>
|
<div th:replace="~{fragments/common :: fileSelector(name='fileInput', multiple=false, accept='application/pdf')}"></div>
|
||||||
<div class="form-group">
|
<div class="form-group">
|
||||||
<label for="languages" class="form-label" th:text="#{ocr.selectText.1}"></label>
|
<label for="languages" class="form-label" th:text="#{ocr.selectText.1}"></label>
|
||||||
<div id="languages">
|
<hr>
|
||||||
<div th:each="language: ${languages}">
|
<div id="languages">
|
||||||
<input type="checkbox" class="form-check-input" th:name="languages" th:value="${language}" th:id="${'language-' + language}" />
|
<div th:each="language, iterStat : ${languages}" >
|
||||||
<label class="form-check-label" th:for="${'language-' + language}" th:text="${language}"></label>
|
<span th:text=" ${iterStat.index + 1} + '.'"></span>
|
||||||
</div>
|
<input type="checkbox" class="form-check-input" th:name="languages" th:value="${language}" th:id="${'language-' + language}" />
|
||||||
</div>
|
<label class="form-check-label" th:for="${'language-' + language}" th:text=" ${language}"></label>
|
||||||
</div>
|
</div>
|
||||||
<div class="form-group">
|
</div>
|
||||||
|
<hr>
|
||||||
|
</div>
|
||||||
|
<label for="languages" class="form-label" th:text="#{ocr.selectText.9}"></label>
|
||||||
|
<div class="form-check">
|
||||||
<input type="checkbox" class="form-check-input" name="sidecar" id="sidecar" />
|
<input type="checkbox" class="form-check-input" name="sidecar" id="sidecar" />
|
||||||
<label class="form-check-label" for="sidecar" th:text="#{ocr.selectText.2}"></label>
|
<label class="form-check-label" for="sidecar" th:text="#{ocr.selectText.2}"></label>
|
||||||
</div>
|
</div>
|
||||||
<div class="form-group">
|
<div class="form-check">
|
||||||
<input type="checkbox" class="form-check-input" name="deskew" id="deskew" />
|
<input type="checkbox" class="form-check-input" name="deskew" id="deskew" />
|
||||||
<label class="form-check-label" for="deskew" th:text="#{ocr.selectText.2}"></label>
|
<label class="form-check-label" for="deskew" th:text="#{ocr.selectText.3}"></label>
|
||||||
</div>
|
</div>
|
||||||
<div class="form-group">
|
<div class="form-check">
|
||||||
<input type="checkbox" class="form-check-input" name="clean" id="clean" />
|
<input type="checkbox" class="form-check-input" name="clean" id="clean" />
|
||||||
<label class="form-check-label" for="clean" th:text="#{ocr.selectText.2}"></label>
|
<label class="form-check-label" for="clean" th:text="#{ocr.selectText.4}"></label>
|
||||||
</div>
|
</div>
|
||||||
<div class="form-group">
|
<div class="form-check">
|
||||||
<input type="checkbox" class="form-check-input" name="clean-final" id="clean-final" />
|
<input type="checkbox" class="form-check-input" name="clean-final" id="clean-final" />
|
||||||
<label class="form-check-label" for="clean-final" th:text="#{ocr.selectText.2}"></label>
|
<label class="form-check-label" for="clean-final" th:text="#{ocr.selectText.5}"></label>
|
||||||
</div>
|
</div>
|
||||||
<div class="form-group">
|
<div class="form-group">
|
||||||
<label th:text="#{pdfToImage.selectText}"></label>
|
<label th:text="#{ocr.selectText.10}"></label>
|
||||||
<select class="form-control" name="ocrType">
|
<select class="form-control" name="ocrType">
|
||||||
<option value="skip-text">Ignores pages that have interacive text on them, only OCRs pages that are images</option>
|
<option value="skip-text" th:text="#{ocr.selectText.6}"></option>
|
||||||
<option value="force-ocr">Force OCR, will OCR Every page removing all original text</option>
|
<option value="force-ocr" th:text="#{ocr.selectText.7}"></option>
|
||||||
<option value="Normal">Normal (Will error if contains text)</option>
|
<option value="Normal" th:text="#{ocr.selectText.8}"></option>
|
||||||
</select>
|
</select>
|
||||||
</div>
|
</div>
|
||||||
<button type="submit" class="btn btn-primary" th:text="#{ocr.submit}"></button>
|
<button type="submit" class="btn btn-primary" th:text="#{ocr.submit}"></button>
|
||||||
|
Loading…
Reference in New Issue
Block a user