test stuff

This commit is contained in:
Anthony Stirling 2023-03-28 22:43:58 +01:00
parent 245f76792d
commit 146331b3ac
7 changed files with 134 additions and 64 deletions

View File

@ -19,6 +19,10 @@ dependencies {
implementation 'org.apache.logging.log4j:log4j-core:2.20.0'
// https://mvnrepository.com/artifact/org.apache.pdfbox/jbig2-imageio
implementation group: 'org.apache.pdfbox', name: 'jbig2-imageio', version: '3.0.4'
//general PDF
implementation 'org.apache.pdfbox:pdfbox:2.0.27'

View File

@ -69,7 +69,7 @@ public class CompressController {
command.add(tempInputFile.toString());
command.add(tempOutputFile.toString());
int returnCode = ProcessExecutor.runCommandWithOutputHandling(command);
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF).runCommandWithOutputHandling(command);
// Read the optimized PDF file
byte[] pdfBytes = Files.readAllBytes(tempOutputFile);

View File

@ -28,6 +28,7 @@ import org.springframework.web.servlet.ModelAndView;
import stirling.software.SPDF.utils.ProcessExecutor;
//import com.spire.pdf.*;
import java.util.concurrent.Semaphore;
@Controller
public class OCRController {
@ -41,11 +42,14 @@ public class OCRController {
return modelAndView;
}
private final Semaphore semaphore = new Semaphore(2);
@PostMapping("/ocr-pdf")
public ResponseEntity<byte[]> processPdfWithOCR(@RequestParam("fileInput") MultipartFile inputFile,
@RequestParam("languages") List<String> selectedLanguages,
@RequestParam(name = "sidecar", required = false) Boolean sidecar) throws IOException, InterruptedException {
//--output-type pdfa
if (selectedLanguages == null || selectedLanguages.size() < 1) {
throw new IOException("Please select at least one language.");
@ -60,18 +64,26 @@ public class OCRController {
// Run OCR Command
String languageOption = String.join("+", selectedLanguages);
List<String> command = new ArrayList<>(Arrays.asList("ocrmypdf","--verbose", "2", "--language", languageOption,
tempInputFile.toString(), tempOutputFile.toString()));
List<String> command = new ArrayList<>(Arrays.asList("ocrmypdf","--verbose", "2"));
String sidecarFile = tempOutputFile.toString().replace(".pdf", ".txt");
if (sidecar != null && sidecar) {
command.add("--sidecar");
command.add(sidecarFile);
}
int returnCode = ProcessExecutor.runCommandWithOutputHandling(command);
command.addAll(Arrays.asList("--language", languageOption,
tempInputFile.toString(), tempOutputFile.toString()));
//Run CLI command
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF).runCommandWithOutputHandling(command);
// Read the OCR processed PDF file
byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
// Clean up the temporary files
Files.delete(tempInputFile);
// Return the OCR processed PDF as a response

View File

@ -53,7 +53,7 @@ public byte[] convertToPdf(MultipartFile inputFile) throws IOException, Interrup
"-o",
tempOutputFile.toString(),
tempInputFile.toString()));
int returnCode = ProcessExecutor.runCommandWithOutputHandling(command);
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.LIBRE_OFFICE).runCommandWithOutputHandling(command);
// Read the converted PDF file
byte[] pdfBytes = Files.readAllBytes(tempOutputFile);

View File

@ -6,8 +6,40 @@ import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.HashMap;
import java.util.concurrent.Semaphore;
public class ProcessExecutor {
public static int runCommandWithOutputHandling(List<String> command) throws IOException, InterruptedException {
/**
 * Identifies which external command-line tool an executor is created for.
 * A value of this enum is the key passed to {@code getInstance}.
 */
public enum Processes {
// Requested by the convertToPdf caller for its conversion command.
LIBRE_OFFICE,
// Requested by the callers that run the "ocrmypdf" command (OCR and compression controllers).
OCR_MY_PDF
}
private static final Map<Processes, ProcessExecutor> instances = new HashMap<>();
private final Semaphore semaphore;
/**
 * Creates an executor whose semaphore starts with the given number of permits.
 * Each call to {@code runCommandWithOutputHandling} acquires one permit before
 * starting the process and releases it in a {@code finally} block.
 * The constructor is private; instances are handed out by {@code getInstance}.
 *
 * @param semaphoreLimit number of permits the semaphore is initialised with
 */
private ProcessExecutor(int semaphoreLimit) {
this.semaphore = new Semaphore(semaphoreLimit);
}
/**
 * Returns the shared executor for the given process type, creating it on first use.
 *
 * <p>The method is {@code synchronized} because the backing {@code instances} map is a
 * plain {@link java.util.HashMap}, which does not support concurrent structural
 * modification. Without the lock, two threads making the first request for the same
 * tool at the same time could corrupt the map or each build their own executor, and
 * therefore their own semaphore, so the per-tool limit would no longer hold.
 *
 * @param processType the external tool the caller is about to run
 * @return the single {@code ProcessExecutor} associated with {@code processType}
 */
public static synchronized ProcessExecutor getInstance(Processes processType) {
    return instances.computeIfAbsent(processType, key -> {
        // Maximum number of commands of this kind allowed to run at the same time.
        // Kept as separate cases so each tool's limit can be tuned independently.
        int semaphoreLimit = switch (key) {
            case LIBRE_OFFICE -> 2;
            case OCR_MY_PDF -> 2;
        };
        return new ProcessExecutor(semaphoreLimit);
    });
}
public int runCommandWithOutputHandling(List<String> command) throws IOException, InterruptedException {
int exitCode = 1;
semaphore.acquire();
try {
ProcessBuilder processBuilder = new ProcessBuilder(command);
Process process = processBuilder.start();
@ -41,7 +73,7 @@ public class ProcessExecutor {
outputReaderThread.start();
// Wait for the conversion process to complete
int exitCode = process.waitFor();
exitCode = process.waitFor();
// Wait for the reader threads to finish
errorReaderThread.join();
@ -59,7 +91,9 @@ public class ProcessExecutor {
throw new IOException("Command process failed with exit code " + exitCode + ". Error message: " + errorMessage);
}
}
} finally {
semaphore.release();
}
return exitCode;
}

View File

@ -1,12 +1,12 @@
spring.http.multipart.max-file-size=1GB
spring.http.multipart.max-request-size=1GB
spring.http.multipart.max-file-size=2GB
spring.http.multipart.max-request-size=2GB
multipart.enabled=true
multipart.max-file-size=1000MB
multipart.max-request-size=1000MB
multipart.max-file-size=2000MB
multipart.max-request-size=2000MB
spring.servlet.multipart.max-file-size=1000MB
spring.servlet.multipart.max-request-size=1000MB
spring.servlet.multipart.max-file-size=2000MB
spring.servlet.multipart.max-request-size=2000MB
server.forward-headers-strategy=NATIVE

View File

@ -26,10 +26,30 @@
</div>
</div>
</div>
<!-- <div class="form-group">
<div class="form-group">
<input type="checkbox" class="form-check-input" name="sidecar" id="sidecar" />
<label class="form-check-label" for="sidecar" th:text="#{ocr.selectText.2}"></label>
</div> -->
</div>
<div class="form-group">
<input type="checkbox" class="form-check-input" name="deskew" id="deskew" />
<label class="form-check-label" for="deskew" th:text="#{ocr.selectText.2}"></label>
</div>
<div class="form-group">
<input type="checkbox" class="form-check-input" name="clean" id="clean" />
<label class="form-check-label" for="clean" th:text="#{ocr.selectText.2}"></label>
</div>
<div class="form-group">
<input type="checkbox" class="form-check-input" name="clean-final" id="clean-final" />
<label class="form-check-label" for="clean-final" th:text="#{ocr.selectText.2}"></label>
</div>
<div class="form-group">
<label th:text="#{pdfToImage.selectText}"></label>
<select class="form-control" name="ocrType">
<option value="skip-text">Ignores pages that have interactive text on them, only OCRs pages that are images</option>
<option value="force-ocr">Force OCR, will OCR Every page removing all original text</option>
<option value="Normal">Normal (Will error if contains text)</option>
</select>
</div>
<button type="submit" class="btn btn-primary" th:text="#{ocr.submit}"></button>
</form>
<p th:text="#{ocr.credit}"></p>