From 146331b3ac9e50f1226acf15390211f7fc7bd7bd Mon Sep 17 00:00:00 2001 From: Anthony Stirling <77850077+Frooodle@users.noreply.github.com> Date: Tue, 28 Mar 2023 22:43:58 +0100 Subject: [PATCH] test stuff --- build.gradle | 4 + .../SPDF/controller/CompressController.java | 2 +- .../SPDF/controller/OCRController.java | 22 ++- .../converters/ConvertOfficeController.java | 2 +- .../software/SPDF/utils/ProcessExecutor.java | 132 +++++++++++------- src/main/resources/application.properties | 12 +- src/main/resources/templates/ocr-pdf.html | 24 +++- 7 files changed, 134 insertions(+), 64 deletions(-) diff --git a/build.gradle b/build.gradle index f7b096a89..5b835bb70 100644 --- a/build.gradle +++ b/build.gradle @@ -19,6 +19,10 @@ dependencies { implementation 'org.apache.logging.log4j:log4j-core:2.20.0' + // https://mvnrepository.com/artifact/org.apache.pdfbox/jbig2-imageio + implementation group: 'org.apache.pdfbox', name: 'jbig2-imageio', version: '3.0.4' + + //general PDF implementation 'org.apache.pdfbox:pdfbox:2.0.27' diff --git a/src/main/java/stirling/software/SPDF/controller/CompressController.java b/src/main/java/stirling/software/SPDF/controller/CompressController.java index d8028a757..f9ac49ffe 100644 --- a/src/main/java/stirling/software/SPDF/controller/CompressController.java +++ b/src/main/java/stirling/software/SPDF/controller/CompressController.java @@ -69,7 +69,7 @@ public class CompressController { command.add(tempInputFile.toString()); command.add(tempOutputFile.toString()); - int returnCode = ProcessExecutor.runCommandWithOutputHandling(command); + int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF).runCommandWithOutputHandling(command); // Read the optimized PDF file byte[] pdfBytes = Files.readAllBytes(tempOutputFile); diff --git a/src/main/java/stirling/software/SPDF/controller/OCRController.java b/src/main/java/stirling/software/SPDF/controller/OCRController.java index 93e0f6eb0..0c579cb44 100644 --- a/src/main/java/stirling/software/SPDF/controller/OCRController.java +++ b/src/main/java/stirling/software/SPDF/controller/OCRController.java @@ -28,6 +28,7 @@ import org.springframework.web.servlet.ModelAndView; import stirling.software.SPDF.utils.ProcessExecutor; //import com.spire.pdf.*; +import java.util.concurrent.Semaphore; @Controller public class OCRController { @@ -41,11 +42,14 @@ public class OCRController { return modelAndView; } + private final Semaphore semaphore = new Semaphore(2); + @PostMapping("/ocr-pdf") public ResponseEntity processPdfWithOCR(@RequestParam("fileInput") MultipartFile inputFile, @RequestParam("languages") List selectedLanguages, @RequestParam(name = "sidecar", required = false) Boolean sidecar) throws IOException, InterruptedException { + //--output-type pdfa if (selectedLanguages == null || selectedLanguages.size() < 1) { throw new IOException("Please select at least one language."); @@ -60,18 +64,26 @@ public class OCRController { // Run OCR Command String languageOption = String.join("+", selectedLanguages); - List command = new ArrayList<>(Arrays.asList("ocrmypdf","--verbose", "2", "--language", languageOption, - tempInputFile.toString(), tempOutputFile.toString())); + + List command = new ArrayList<>(Arrays.asList("ocrmypdf","--verbose", "2")); + + String sidecarFile = tempOutputFile.toString().replace(".pdf", ".txt"); if (sidecar != null && sidecar) { command.add("--sidecar"); command.add(sidecarFile); } - int returnCode = ProcessExecutor.runCommandWithOutputHandling(command); - + + command.addAll(Arrays.asList("--language", languageOption, + tempInputFile.toString(), tempOutputFile.toString())); + + //Run CLI command + int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF).runCommandWithOutputHandling(command); + // Read the OCR processed PDF file byte[] pdfBytes = Files.readAllBytes(tempOutputFile); - + + // Clean up the temporary files Files.delete(tempInputFile); // Return the OCR processed PDF as a response diff --git a/src/main/java/stirling/software/SPDF/controller/converters/ConvertOfficeController.java b/src/main/java/stirling/software/SPDF/controller/converters/ConvertOfficeController.java index 708f3df2b..71c8b4618 100644 --- a/src/main/java/stirling/software/SPDF/controller/converters/ConvertOfficeController.java +++ b/src/main/java/stirling/software/SPDF/controller/converters/ConvertOfficeController.java @@ -53,7 +53,7 @@ public byte[] convertToPdf(MultipartFile inputFile) throws IOException, Interrup "-o", tempOutputFile.toString(), tempInputFile.toString())); - int returnCode = ProcessExecutor.runCommandWithOutputHandling(command); + int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.LIBRE_OFFICE).runCommandWithOutputHandling(command); // Read the converted PDF file byte[] pdfBytes = Files.readAllBytes(tempOutputFile); diff --git a/src/main/java/stirling/software/SPDF/utils/ProcessExecutor.java b/src/main/java/stirling/software/SPDF/utils/ProcessExecutor.java index 73d597dcc..1e57ba8bc 100644 --- a/src/main/java/stirling/software/SPDF/utils/ProcessExecutor.java +++ b/src/main/java/stirling/software/SPDF/utils/ProcessExecutor.java @@ -6,60 +6,94 @@ import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; +import java.util.Map; +import java.util.HashMap; +import java.util.concurrent.Semaphore; public class ProcessExecutor { - public static int runCommandWithOutputHandling(List command) throws IOException, InterruptedException { - ProcessBuilder processBuilder = new ProcessBuilder(command); - Process process = processBuilder.start(); + + public enum Processes { + LIBRE_OFFICE, + OCR_MY_PDF + } - // Read the error stream and standard output stream concurrently - List errorLines = new ArrayList<>(); - List outputLines = new ArrayList<>(); + private static final Map instances = new HashMap<>(); - Thread errorReaderThread = new Thread(() -> { - try (BufferedReader errorReader = new BufferedReader(new InputStreamReader(process.getErrorStream(), StandardCharsets.UTF_8))) { - String line; - while ((line = errorReader.readLine()) != null) { - errorLines.add(line); - } - } catch (IOException e) { - e.printStackTrace(); - } - }); + private final Semaphore semaphore; - Thread outputReaderThread = new Thread(() -> { - try (BufferedReader outputReader = new BufferedReader(new InputStreamReader(process.getInputStream(), StandardCharsets.UTF_8))) { - String line; - while ((line = outputReader.readLine()) != null) { - outputLines.add(line); - } - } catch (IOException e) { - e.printStackTrace(); - } - }); - - errorReaderThread.start(); - outputReaderThread.start(); - - // Wait for the conversion process to complete - int exitCode = process.waitFor(); - - // Wait for the reader threads to finish - errorReaderThread.join(); - outputReaderThread.join(); - - if (outputLines.size() > 0) { - String outputMessage = String.join("\n", outputLines); - System.out.println("Command output:\n" + outputMessage); - } - - if (errorLines.size() > 0) { - String errorMessage = String.join("\n", errorLines); - System.out.println("Command error output:\n" + errorMessage); - if (exitCode != 0) { - throw new IOException("Command process failed with exit code " + exitCode + ". Error message: " + errorMessage); - } - } + private ProcessExecutor(int semaphoreLimit) { + this.semaphore = new Semaphore(semaphoreLimit); + } + public static ProcessExecutor getInstance(Processes processType) { + return instances.computeIfAbsent(processType, key -> { + int semaphoreLimit = switch (key) { + case LIBRE_OFFICE -> 2; + case OCR_MY_PDF -> 2; + }; + return new ProcessExecutor(semaphoreLimit); + }); + } + + public int runCommandWithOutputHandling(List command) throws IOException, InterruptedException { + int exitCode = 1; + semaphore.acquire(); + try { + + + ProcessBuilder processBuilder = new ProcessBuilder(command); + Process process = processBuilder.start(); + + // Read the error stream and standard output stream concurrently + List errorLines = new ArrayList<>(); + List outputLines = new ArrayList<>(); + + Thread errorReaderThread = new Thread(() -> { + try (BufferedReader errorReader = new BufferedReader(new InputStreamReader(process.getErrorStream(), StandardCharsets.UTF_8))) { + String line; + while ((line = errorReader.readLine()) != null) { + errorLines.add(line); + } + } catch (IOException e) { + e.printStackTrace(); + } + }); + + Thread outputReaderThread = new Thread(() -> { + try (BufferedReader outputReader = new BufferedReader(new InputStreamReader(process.getInputStream(), StandardCharsets.UTF_8))) { + String line; + while ((line = outputReader.readLine()) != null) { + outputLines.add(line); + } + } catch (IOException e) { + e.printStackTrace(); + } + }); + + errorReaderThread.start(); + outputReaderThread.start(); + + // Wait for the conversion process to complete + exitCode = process.waitFor(); + + // Wait for the reader threads to finish + errorReaderThread.join(); + outputReaderThread.join(); + + if (outputLines.size() > 0) { + String outputMessage = String.join("\n", outputLines); + System.out.println("Command output:\n" + outputMessage); + } + + if (errorLines.size() > 0) { + String errorMessage = String.join("\n", errorLines); + System.out.println("Command error output:\n" + errorMessage); + if (exitCode != 0) { + throw new IOException("Command process failed with exit code " + exitCode + ". Error message: " + errorMessage); + } + } + } finally { + semaphore.release(); + } return exitCode; } diff --git a/src/main/resources/application.properties b/src/main/resources/application.properties index 266a0f350..d374846f7 100644 --- a/src/main/resources/application.properties +++ b/src/main/resources/application.properties @@ -1,12 +1,12 @@ -spring.http.multipart.max-file-size=1GB -spring.http.multipart.max-request-size=1GB +spring.http.multipart.max-file-size=2GB +spring.http.multipart.max-request-size=2GB multipart.enabled=true -multipart.max-file-size=1000MB -multipart.max-request-size=1000MB +multipart.max-file-size=2000MB +multipart.max-request-size=2000MB -spring.servlet.multipart.max-file-size=1000MB -spring.servlet.multipart.max-request-size=1000MB +spring.servlet.multipart.max-file-size=2000MB +spring.servlet.multipart.max-request-size=2000MB server.forward-headers-strategy=NATIVE diff --git a/src/main/resources/templates/ocr-pdf.html b/src/main/resources/templates/ocr-pdf.html index 40dec58e2..0b5bf48d7 100644 --- a/src/main/resources/templates/ocr-pdf.html +++ b/src/main/resources/templates/ocr-pdf.html @@ -26,10 +26,30 @@ - + +
+ + +
+
+ + +
+
+ + +
+
+ + +