mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-09-03 17:52:30 +02:00
test stuff
This commit is contained in:
parent
245f76792d
commit
146331b3ac
@ -19,6 +19,10 @@ dependencies {
|
||||
|
||||
implementation 'org.apache.logging.log4j:log4j-core:2.20.0'
|
||||
|
||||
// https://mvnrepository.com/artifact/org.apache.pdfbox/jbig2-imageio
|
||||
implementation group: 'org.apache.pdfbox', name: 'jbig2-imageio', version: '3.0.4'
|
||||
|
||||
|
||||
//general PDF
|
||||
implementation 'org.apache.pdfbox:pdfbox:2.0.27'
|
||||
|
||||
|
@ -69,7 +69,7 @@ public class CompressController {
|
||||
command.add(tempInputFile.toString());
|
||||
command.add(tempOutputFile.toString());
|
||||
|
||||
int returnCode = ProcessExecutor.runCommandWithOutputHandling(command);
|
||||
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF).runCommandWithOutputHandling(command);
|
||||
|
||||
// Read the optimized PDF file
|
||||
byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
|
||||
|
@ -28,6 +28,7 @@ import org.springframework.web.servlet.ModelAndView;
|
||||
|
||||
import stirling.software.SPDF.utils.ProcessExecutor;
|
||||
//import com.spire.pdf.*;
|
||||
import java.util.concurrent.Semaphore;
|
||||
@Controller
|
||||
public class OCRController {
|
||||
|
||||
@ -41,11 +42,14 @@ public class OCRController {
|
||||
return modelAndView;
|
||||
}
|
||||
|
||||
private final Semaphore semaphore = new Semaphore(2);
|
||||
|
||||
@PostMapping("/ocr-pdf")
|
||||
public ResponseEntity<byte[]> processPdfWithOCR(@RequestParam("fileInput") MultipartFile inputFile,
|
||||
@RequestParam("languages") List<String> selectedLanguages,
|
||||
@RequestParam(name = "sidecar", required = false) Boolean sidecar) throws IOException, InterruptedException {
|
||||
|
||||
|
||||
//--output-type pdfa
|
||||
if (selectedLanguages == null || selectedLanguages.size() < 1) {
|
||||
throw new IOException("Please select at least one language.");
|
||||
@ -60,18 +64,26 @@ public class OCRController {
|
||||
|
||||
// Run OCR Command
|
||||
String languageOption = String.join("+", selectedLanguages);
|
||||
List<String> command = new ArrayList<>(Arrays.asList("ocrmypdf","--verbose", "2", "--language", languageOption,
|
||||
tempInputFile.toString(), tempOutputFile.toString()));
|
||||
|
||||
List<String> command = new ArrayList<>(Arrays.asList("ocrmypdf","--verbose", "2"));
|
||||
|
||||
|
||||
String sidecarFile = tempOutputFile.toString().replace(".pdf", ".txt");
|
||||
if (sidecar != null && sidecar) {
|
||||
command.add("--sidecar");
|
||||
command.add(sidecarFile);
|
||||
}
|
||||
int returnCode = ProcessExecutor.runCommandWithOutputHandling(command);
|
||||
|
||||
command.addAll(Arrays.asList("--language", languageOption,
|
||||
tempInputFile.toString(), tempOutputFile.toString()));
|
||||
|
||||
//Run CLI command
|
||||
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF).runCommandWithOutputHandling(command);
|
||||
|
||||
// Read the OCR processed PDF file
|
||||
byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
|
||||
|
||||
|
||||
// Clean up the temporary files
|
||||
Files.delete(tempInputFile);
|
||||
// Return the OCR processed PDF as a response
|
||||
|
@ -53,7 +53,7 @@ public byte[] convertToPdf(MultipartFile inputFile) throws IOException, Interrup
|
||||
"-o",
|
||||
tempOutputFile.toString(),
|
||||
tempInputFile.toString()));
|
||||
int returnCode = ProcessExecutor.runCommandWithOutputHandling(command);
|
||||
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.LIBRE_OFFICE).runCommandWithOutputHandling(command);
|
||||
|
||||
// Read the converted PDF file
|
||||
byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
|
||||
|
@ -6,60 +6,94 @@ import java.io.InputStreamReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.HashMap;
|
||||
import java.util.concurrent.Semaphore;
|
||||
public class ProcessExecutor {
|
||||
public static int runCommandWithOutputHandling(List<String> command) throws IOException, InterruptedException {
|
||||
ProcessBuilder processBuilder = new ProcessBuilder(command);
|
||||
Process process = processBuilder.start();
|
||||
|
||||
// Read the error stream and standard output stream concurrently
|
||||
List<String> errorLines = new ArrayList<>();
|
||||
List<String> outputLines = new ArrayList<>();
|
||||
public enum Processes {
|
||||
LIBRE_OFFICE,
|
||||
OCR_MY_PDF
|
||||
}
|
||||
|
||||
Thread errorReaderThread = new Thread(() -> {
|
||||
try (BufferedReader errorReader = new BufferedReader(new InputStreamReader(process.getErrorStream(), StandardCharsets.UTF_8))) {
|
||||
String line;
|
||||
while ((line = errorReader.readLine()) != null) {
|
||||
errorLines.add(line);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
});
|
||||
private static final Map<Processes, ProcessExecutor> instances = new HashMap<>();
|
||||
|
||||
Thread outputReaderThread = new Thread(() -> {
|
||||
try (BufferedReader outputReader = new BufferedReader(new InputStreamReader(process.getInputStream(), StandardCharsets.UTF_8))) {
|
||||
String line;
|
||||
while ((line = outputReader.readLine()) != null) {
|
||||
outputLines.add(line);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
});
|
||||
private final Semaphore semaphore;
|
||||
|
||||
errorReaderThread.start();
|
||||
outputReaderThread.start();
|
||||
private ProcessExecutor(int semaphoreLimit) {
|
||||
this.semaphore = new Semaphore(semaphoreLimit);
|
||||
}
|
||||
|
||||
// Wait for the conversion process to complete
|
||||
int exitCode = process.waitFor();
|
||||
public static ProcessExecutor getInstance(Processes processType) {
|
||||
return instances.computeIfAbsent(processType, key -> {
|
||||
int semaphoreLimit = switch (key) {
|
||||
case LIBRE_OFFICE -> 2;
|
||||
case OCR_MY_PDF -> 2;
|
||||
};
|
||||
return new ProcessExecutor(semaphoreLimit);
|
||||
});
|
||||
}
|
||||
|
||||
// Wait for the reader threads to finish
|
||||
errorReaderThread.join();
|
||||
outputReaderThread.join();
|
||||
public int runCommandWithOutputHandling(List<String> command) throws IOException, InterruptedException {
|
||||
int exitCode = 1;
|
||||
semaphore.acquire();
|
||||
try {
|
||||
|
||||
if (outputLines.size() > 0) {
|
||||
String outputMessage = String.join("\n", outputLines);
|
||||
System.out.println("Command output:\n" + outputMessage);
|
||||
}
|
||||
|
||||
if (errorLines.size() > 0) {
|
||||
String errorMessage = String.join("\n", errorLines);
|
||||
System.out.println("Command error output:\n" + errorMessage);
|
||||
if (exitCode != 0) {
|
||||
throw new IOException("Command process failed with exit code " + exitCode + ". Error message: " + errorMessage);
|
||||
}
|
||||
}
|
||||
ProcessBuilder processBuilder = new ProcessBuilder(command);
|
||||
Process process = processBuilder.start();
|
||||
|
||||
// Read the error stream and standard output stream concurrently
|
||||
List<String> errorLines = new ArrayList<>();
|
||||
List<String> outputLines = new ArrayList<>();
|
||||
|
||||
Thread errorReaderThread = new Thread(() -> {
|
||||
try (BufferedReader errorReader = new BufferedReader(new InputStreamReader(process.getErrorStream(), StandardCharsets.UTF_8))) {
|
||||
String line;
|
||||
while ((line = errorReader.readLine()) != null) {
|
||||
errorLines.add(line);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
});
|
||||
|
||||
Thread outputReaderThread = new Thread(() -> {
|
||||
try (BufferedReader outputReader = new BufferedReader(new InputStreamReader(process.getInputStream(), StandardCharsets.UTF_8))) {
|
||||
String line;
|
||||
while ((line = outputReader.readLine()) != null) {
|
||||
outputLines.add(line);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
});
|
||||
|
||||
errorReaderThread.start();
|
||||
outputReaderThread.start();
|
||||
|
||||
// Wait for the conversion process to complete
|
||||
exitCode = process.waitFor();
|
||||
|
||||
// Wait for the reader threads to finish
|
||||
errorReaderThread.join();
|
||||
outputReaderThread.join();
|
||||
|
||||
if (outputLines.size() > 0) {
|
||||
String outputMessage = String.join("\n", outputLines);
|
||||
System.out.println("Command output:\n" + outputMessage);
|
||||
}
|
||||
|
||||
if (errorLines.size() > 0) {
|
||||
String errorMessage = String.join("\n", errorLines);
|
||||
System.out.println("Command error output:\n" + errorMessage);
|
||||
if (exitCode != 0) {
|
||||
throw new IOException("Command process failed with exit code " + exitCode + ". Error message: " + errorMessage);
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
semaphore.release();
|
||||
}
|
||||
return exitCode;
|
||||
}
|
||||
|
||||
|
@ -1,12 +1,12 @@
|
||||
spring.http.multipart.max-file-size=1GB
|
||||
spring.http.multipart.max-request-size=1GB
|
||||
spring.http.multipart.max-file-size=2GB
|
||||
spring.http.multipart.max-request-size=2GB
|
||||
|
||||
multipart.enabled=true
|
||||
multipart.max-file-size=1000MB
|
||||
multipart.max-request-size=1000MB
|
||||
multipart.max-file-size=2000MB
|
||||
multipart.max-request-size=2000MB
|
||||
|
||||
spring.servlet.multipart.max-file-size=1000MB
|
||||
spring.servlet.multipart.max-request-size=1000MB
|
||||
spring.servlet.multipart.max-file-size=2000MB
|
||||
spring.servlet.multipart.max-request-size=2000MB
|
||||
|
||||
server.forward-headers-strategy=NATIVE
|
||||
|
||||
|
@ -26,10 +26,30 @@
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<!-- <div class="form-group">
|
||||
<div class="form-group">
|
||||
<input type="checkbox" class="form-check-input" name="sidecar" id="sidecar" />
|
||||
<label class="form-check-label" for="sidecar" th:text="#{ocr.selectText.2}"></label>
|
||||
</div> -->
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<input type="checkbox" class="form-check-input" name="deskew" id="deskew" />
|
||||
<label class="form-check-label" for="deskew" th:text="#{ocr.selectText.2}"></label>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<input type="checkbox" class="form-check-input" name="clean" id="clean" />
|
||||
<label class="form-check-label" for="clean" th:text="#{ocr.selectText.2}"></label>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<input type="checkbox" class="form-check-input" name="clean-final" id="clean-final" />
|
||||
<label class="form-check-label" for="clean-final" th:text="#{ocr.selectText.2}"></label>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label th:text="#{pdfToImage.selectText}"></label>
|
||||
<select class="form-control" name="ocrType">
|
||||
<option value="skip-text">Ignores pages that have interacive text on them, only OCRs pages that are images</option>
|
||||
<option value="force-ocr">Force OCR, will OCR Every page removing all original text</option>
|
||||
<option value="Normal">Normal (Will error if contains text)</option>
|
||||
</select>
|
||||
</div>
|
||||
<button type="submit" class="btn btn-primary" th:text="#{ocr.submit}"></button>
|
||||
</form>
|
||||
<p th:text="#{ocr.credit}"></p>
|
||||
|
Loading…
Reference in New Issue
Block a user