test stuff

This commit is contained in:
Anthony Stirling 2023-03-28 22:43:58 +01:00
parent 245f76792d
commit 146331b3ac
7 changed files with 134 additions and 64 deletions

View File

@ -19,6 +19,10 @@ dependencies {
implementation 'org.apache.logging.log4j:log4j-core:2.20.0' implementation 'org.apache.logging.log4j:log4j-core:2.20.0'
// https://mvnrepository.com/artifact/org.apache.pdfbox/jbig2-imageio
implementation group: 'org.apache.pdfbox', name: 'jbig2-imageio', version: '3.0.4'
//general PDF //general PDF
implementation 'org.apache.pdfbox:pdfbox:2.0.27' implementation 'org.apache.pdfbox:pdfbox:2.0.27'

View File

@ -69,7 +69,7 @@ public class CompressController {
command.add(tempInputFile.toString()); command.add(tempInputFile.toString());
command.add(tempOutputFile.toString()); command.add(tempOutputFile.toString());
int returnCode = ProcessExecutor.runCommandWithOutputHandling(command); int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF).runCommandWithOutputHandling(command);
// Read the optimized PDF file // Read the optimized PDF file
byte[] pdfBytes = Files.readAllBytes(tempOutputFile); byte[] pdfBytes = Files.readAllBytes(tempOutputFile);

View File

@ -28,6 +28,7 @@ import org.springframework.web.servlet.ModelAndView;
import stirling.software.SPDF.utils.ProcessExecutor; import stirling.software.SPDF.utils.ProcessExecutor;
//import com.spire.pdf.*; //import com.spire.pdf.*;
import java.util.concurrent.Semaphore;
@Controller @Controller
public class OCRController { public class OCRController {
@ -41,11 +42,14 @@ public class OCRController {
return modelAndView; return modelAndView;
} }
private final Semaphore semaphore = new Semaphore(2);
@PostMapping("/ocr-pdf") @PostMapping("/ocr-pdf")
public ResponseEntity<byte[]> processPdfWithOCR(@RequestParam("fileInput") MultipartFile inputFile, public ResponseEntity<byte[]> processPdfWithOCR(@RequestParam("fileInput") MultipartFile inputFile,
@RequestParam("languages") List<String> selectedLanguages, @RequestParam("languages") List<String> selectedLanguages,
@RequestParam(name = "sidecar", required = false) Boolean sidecar) throws IOException, InterruptedException { @RequestParam(name = "sidecar", required = false) Boolean sidecar) throws IOException, InterruptedException {
//--output-type pdfa //--output-type pdfa
if (selectedLanguages == null || selectedLanguages.size() < 1) { if (selectedLanguages == null || selectedLanguages.size() < 1) {
throw new IOException("Please select at least one language."); throw new IOException("Please select at least one language.");
@ -60,18 +64,26 @@ public class OCRController {
// Run OCR Command // Run OCR Command
String languageOption = String.join("+", selectedLanguages); String languageOption = String.join("+", selectedLanguages);
List<String> command = new ArrayList<>(Arrays.asList("ocrmypdf","--verbose", "2", "--language", languageOption,
tempInputFile.toString(), tempOutputFile.toString())); List<String> command = new ArrayList<>(Arrays.asList("ocrmypdf","--verbose", "2"));
String sidecarFile = tempOutputFile.toString().replace(".pdf", ".txt"); String sidecarFile = tempOutputFile.toString().replace(".pdf", ".txt");
if (sidecar != null && sidecar) { if (sidecar != null && sidecar) {
command.add("--sidecar"); command.add("--sidecar");
command.add(sidecarFile); command.add(sidecarFile);
} }
int returnCode = ProcessExecutor.runCommandWithOutputHandling(command);
command.addAll(Arrays.asList("--language", languageOption,
tempInputFile.toString(), tempOutputFile.toString()));
//Run CLI command
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF).runCommandWithOutputHandling(command);
// Read the OCR processed PDF file // Read the OCR processed PDF file
byte[] pdfBytes = Files.readAllBytes(tempOutputFile); byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
// Clean up the temporary files // Clean up the temporary files
Files.delete(tempInputFile); Files.delete(tempInputFile);
// Return the OCR processed PDF as a response // Return the OCR processed PDF as a response

View File

@ -53,7 +53,7 @@ public byte[] convertToPdf(MultipartFile inputFile) throws IOException, Interrup
"-o", "-o",
tempOutputFile.toString(), tempOutputFile.toString(),
tempInputFile.toString())); tempInputFile.toString()));
int returnCode = ProcessExecutor.runCommandWithOutputHandling(command); int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.LIBRE_OFFICE).runCommandWithOutputHandling(command);
// Read the converted PDF file // Read the converted PDF file
byte[] pdfBytes = Files.readAllBytes(tempOutputFile); byte[] pdfBytes = Files.readAllBytes(tempOutputFile);

View File

@ -6,60 +6,94 @@ import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.HashMap;
import java.util.concurrent.Semaphore;
public class ProcessExecutor { public class ProcessExecutor {
public static int runCommandWithOutputHandling(List<String> command) throws IOException, InterruptedException {
ProcessBuilder processBuilder = new ProcessBuilder(command); public enum Processes {
Process process = processBuilder.start(); LIBRE_OFFICE,
OCR_MY_PDF
}
// Read the error stream and standard output stream concurrently private static final Map<Processes, ProcessExecutor> instances = new HashMap<>();
List<String> errorLines = new ArrayList<>();
List<String> outputLines = new ArrayList<>();
Thread errorReaderThread = new Thread(() -> { private final Semaphore semaphore;
try (BufferedReader errorReader = new BufferedReader(new InputStreamReader(process.getErrorStream(), StandardCharsets.UTF_8))) {
String line;
while ((line = errorReader.readLine()) != null) {
errorLines.add(line);
}
} catch (IOException e) {
e.printStackTrace();
}
});
Thread outputReaderThread = new Thread(() -> { private ProcessExecutor(int semaphoreLimit) {
try (BufferedReader outputReader = new BufferedReader(new InputStreamReader(process.getInputStream(), StandardCharsets.UTF_8))) { this.semaphore = new Semaphore(semaphoreLimit);
String line; }
while ((line = outputReader.readLine()) != null) {
outputLines.add(line);
}
} catch (IOException e) {
e.printStackTrace();
}
});
errorReaderThread.start();
outputReaderThread.start();
// Wait for the conversion process to complete
int exitCode = process.waitFor();
// Wait for the reader threads to finish
errorReaderThread.join();
outputReaderThread.join();
if (outputLines.size() > 0) {
String outputMessage = String.join("\n", outputLines);
System.out.println("Command output:\n" + outputMessage);
}
if (errorLines.size() > 0) {
String errorMessage = String.join("\n", errorLines);
System.out.println("Command error output:\n" + errorMessage);
if (exitCode != 0) {
throw new IOException("Command process failed with exit code " + exitCode + ". Error message: " + errorMessage);
}
}
/**
 * Returns the shared {@code ProcessExecutor} for the given process type, creating it on
 * first use.
 *
 * <p>Every process type is mapped to exactly one executor, and each executor owns its own
 * semaphore, so the concurrency limit of one external tool is tracked independently of the
 * others.
 *
 * <p>The method is {@code synchronized} because the backing {@code instances} registry is a
 * plain {@code HashMap} and this factory is invoked from request-handling controller code,
 * which may run on several threads at once. Without mutual exclusion, racing
 * {@code computeIfAbsent} calls could corrupt the map or create two executors (and thus two
 * semaphores) for the same type, silently doubling the intended limit.
 *
 * @param processType the external tool the caller is about to run
 * @return the single executor instance associated with {@code processType}
 */
public static synchronized ProcessExecutor getInstance(Processes processType) {
    return instances.computeIfAbsent(processType, key -> {
        // Maximum number of processes of this type allowed to run at the same time.
        int semaphoreLimit = switch (key) {
            case LIBRE_OFFICE -> 2;
            case OCR_MY_PDF -> 2;
        };
        return new ProcessExecutor(semaphoreLimit);
    });
}
public int runCommandWithOutputHandling(List<String> command) throws IOException, InterruptedException {
int exitCode = 1;
semaphore.acquire();
try {
ProcessBuilder processBuilder = new ProcessBuilder(command);
Process process = processBuilder.start();
// Read the error stream and standard output stream concurrently
List<String> errorLines = new ArrayList<>();
List<String> outputLines = new ArrayList<>();
Thread errorReaderThread = new Thread(() -> {
try (BufferedReader errorReader = new BufferedReader(new InputStreamReader(process.getErrorStream(), StandardCharsets.UTF_8))) {
String line;
while ((line = errorReader.readLine()) != null) {
errorLines.add(line);
}
} catch (IOException e) {
e.printStackTrace();
}
});
Thread outputReaderThread = new Thread(() -> {
try (BufferedReader outputReader = new BufferedReader(new InputStreamReader(process.getInputStream(), StandardCharsets.UTF_8))) {
String line;
while ((line = outputReader.readLine()) != null) {
outputLines.add(line);
}
} catch (IOException e) {
e.printStackTrace();
}
});
errorReaderThread.start();
outputReaderThread.start();
// Wait for the conversion process to complete
exitCode = process.waitFor();
// Wait for the reader threads to finish
errorReaderThread.join();
outputReaderThread.join();
if (outputLines.size() > 0) {
String outputMessage = String.join("\n", outputLines);
System.out.println("Command output:\n" + outputMessage);
}
if (errorLines.size() > 0) {
String errorMessage = String.join("\n", errorLines);
System.out.println("Command error output:\n" + errorMessage);
if (exitCode != 0) {
throw new IOException("Command process failed with exit code " + exitCode + ". Error message: " + errorMessage);
}
}
} finally {
semaphore.release();
}
return exitCode; return exitCode;
} }

View File

@ -1,12 +1,12 @@
spring.http.multipart.max-file-size=1GB spring.http.multipart.max-file-size=2GB
spring.http.multipart.max-request-size=1GB spring.http.multipart.max-request-size=2GB
multipart.enabled=true multipart.enabled=true
multipart.max-file-size=1000MB multipart.max-file-size=2000MB
multipart.max-request-size=1000MB multipart.max-request-size=2000MB
spring.servlet.multipart.max-file-size=1000MB spring.servlet.multipart.max-file-size=2000MB
spring.servlet.multipart.max-request-size=1000MB spring.servlet.multipart.max-request-size=2000MB
server.forward-headers-strategy=NATIVE server.forward-headers-strategy=NATIVE

View File

@ -26,10 +26,30 @@
</div> </div>
</div> </div>
</div> </div>
<!-- <div class="form-group"> <div class="form-group">
<input type="checkbox" class="form-check-input" name="sidecar" id="sidecar" /> <input type="checkbox" class="form-check-input" name="sidecar" id="sidecar" />
<label class="form-check-label" for="sidecar" th:text="#{ocr.selectText.2}"></label> <label class="form-check-label" for="sidecar" th:text="#{ocr.selectText.2}"></label>
</div> --> </div>
<div class="form-group">
<input type="checkbox" class="form-check-input" name="deskew" id="deskew" />
<label class="form-check-label" for="deskew" th:text="#{ocr.selectText.2}"></label>
</div>
<div class="form-group">
<input type="checkbox" class="form-check-input" name="clean" id="clean" />
<label class="form-check-label" for="clean" th:text="#{ocr.selectText.2}"></label>
</div>
<div class="form-group">
<input type="checkbox" class="form-check-input" name="clean-final" id="clean-final" />
<label class="form-check-label" for="clean-final" th:text="#{ocr.selectText.2}"></label>
</div>
<div class="form-group">
<label th:text="#{pdfToImage.selectText}"></label>
<select class="form-control" name="ocrType">
<option value="skip-text">Ignores pages that have interactive text on them, only OCRs pages that are images</option>
<option value="force-ocr">Force OCR, will OCR Every page removing all original text</option>
<option value="Normal">Normal (Will error if contains text)</option>
</select>
</div>
<button type="submit" class="btn btn-primary" th:text="#{ocr.submit}"></button> <button type="submit" class="btn btn-primary" th:text="#{ocr.submit}"></button>
</form> </form>
<p th:text="#{ocr.credit}"></p> <p th:text="#{ocr.credit}"></p>