mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-09-03 17:52:30 +02:00
test stuff
This commit is contained in:
parent
245f76792d
commit
146331b3ac
@ -19,6 +19,10 @@ dependencies {
|
|||||||
|
|
||||||
implementation 'org.apache.logging.log4j:log4j-core:2.20.0'
|
implementation 'org.apache.logging.log4j:log4j-core:2.20.0'
|
||||||
|
|
||||||
|
// https://mvnrepository.com/artifact/org.apache.pdfbox/jbig2-imageio
|
||||||
|
implementation group: 'org.apache.pdfbox', name: 'jbig2-imageio', version: '3.0.4'
|
||||||
|
|
||||||
|
|
||||||
//general PDF
|
//general PDF
|
||||||
implementation 'org.apache.pdfbox:pdfbox:2.0.27'
|
implementation 'org.apache.pdfbox:pdfbox:2.0.27'
|
||||||
|
|
||||||
|
@ -69,7 +69,7 @@ public class CompressController {
|
|||||||
command.add(tempInputFile.toString());
|
command.add(tempInputFile.toString());
|
||||||
command.add(tempOutputFile.toString());
|
command.add(tempOutputFile.toString());
|
||||||
|
|
||||||
int returnCode = ProcessExecutor.runCommandWithOutputHandling(command);
|
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF).runCommandWithOutputHandling(command);
|
||||||
|
|
||||||
// Read the optimized PDF file
|
// Read the optimized PDF file
|
||||||
byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
|
byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
|
||||||
|
@ -28,6 +28,7 @@ import org.springframework.web.servlet.ModelAndView;
|
|||||||
|
|
||||||
import stirling.software.SPDF.utils.ProcessExecutor;
|
import stirling.software.SPDF.utils.ProcessExecutor;
|
||||||
//import com.spire.pdf.*;
|
//import com.spire.pdf.*;
|
||||||
|
import java.util.concurrent.Semaphore;
|
||||||
@Controller
|
@Controller
|
||||||
public class OCRController {
|
public class OCRController {
|
||||||
|
|
||||||
@ -41,11 +42,14 @@ public class OCRController {
|
|||||||
return modelAndView;
|
return modelAndView;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private final Semaphore semaphore = new Semaphore(2);
|
||||||
|
|
||||||
@PostMapping("/ocr-pdf")
|
@PostMapping("/ocr-pdf")
|
||||||
public ResponseEntity<byte[]> processPdfWithOCR(@RequestParam("fileInput") MultipartFile inputFile,
|
public ResponseEntity<byte[]> processPdfWithOCR(@RequestParam("fileInput") MultipartFile inputFile,
|
||||||
@RequestParam("languages") List<String> selectedLanguages,
|
@RequestParam("languages") List<String> selectedLanguages,
|
||||||
@RequestParam(name = "sidecar", required = false) Boolean sidecar) throws IOException, InterruptedException {
|
@RequestParam(name = "sidecar", required = false) Boolean sidecar) throws IOException, InterruptedException {
|
||||||
|
|
||||||
|
|
||||||
//--output-type pdfa
|
//--output-type pdfa
|
||||||
if (selectedLanguages == null || selectedLanguages.size() < 1) {
|
if (selectedLanguages == null || selectedLanguages.size() < 1) {
|
||||||
throw new IOException("Please select at least one language.");
|
throw new IOException("Please select at least one language.");
|
||||||
@ -60,18 +64,26 @@ public class OCRController {
|
|||||||
|
|
||||||
// Run OCR Command
|
// Run OCR Command
|
||||||
String languageOption = String.join("+", selectedLanguages);
|
String languageOption = String.join("+", selectedLanguages);
|
||||||
List<String> command = new ArrayList<>(Arrays.asList("ocrmypdf","--verbose", "2", "--language", languageOption,
|
|
||||||
tempInputFile.toString(), tempOutputFile.toString()));
|
List<String> command = new ArrayList<>(Arrays.asList("ocrmypdf","--verbose", "2"));
|
||||||
|
|
||||||
|
|
||||||
String sidecarFile = tempOutputFile.toString().replace(".pdf", ".txt");
|
String sidecarFile = tempOutputFile.toString().replace(".pdf", ".txt");
|
||||||
if (sidecar != null && sidecar) {
|
if (sidecar != null && sidecar) {
|
||||||
command.add("--sidecar");
|
command.add("--sidecar");
|
||||||
command.add(sidecarFile);
|
command.add(sidecarFile);
|
||||||
}
|
}
|
||||||
int returnCode = ProcessExecutor.runCommandWithOutputHandling(command);
|
|
||||||
|
command.addAll(Arrays.asList("--language", languageOption,
|
||||||
|
tempInputFile.toString(), tempOutputFile.toString()));
|
||||||
|
|
||||||
|
//Run CLI command
|
||||||
|
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF).runCommandWithOutputHandling(command);
|
||||||
|
|
||||||
// Read the OCR processed PDF file
|
// Read the OCR processed PDF file
|
||||||
byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
|
byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
|
||||||
|
|
||||||
|
|
||||||
// Clean up the temporary files
|
// Clean up the temporary files
|
||||||
Files.delete(tempInputFile);
|
Files.delete(tempInputFile);
|
||||||
// Return the OCR processed PDF as a response
|
// Return the OCR processed PDF as a response
|
||||||
|
@ -53,7 +53,7 @@ public byte[] convertToPdf(MultipartFile inputFile) throws IOException, Interrup
|
|||||||
"-o",
|
"-o",
|
||||||
tempOutputFile.toString(),
|
tempOutputFile.toString(),
|
||||||
tempInputFile.toString()));
|
tempInputFile.toString()));
|
||||||
int returnCode = ProcessExecutor.runCommandWithOutputHandling(command);
|
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.LIBRE_OFFICE).runCommandWithOutputHandling(command);
|
||||||
|
|
||||||
// Read the converted PDF file
|
// Read the converted PDF file
|
||||||
byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
|
byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
|
||||||
|
@ -6,8 +6,40 @@ import java.io.InputStreamReader;
|
|||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.concurrent.Semaphore;
|
||||||
public class ProcessExecutor {
|
public class ProcessExecutor {
|
||||||
public static int runCommandWithOutputHandling(List<String> command) throws IOException, InterruptedException {
|
|
||||||
|
public enum Processes {
|
||||||
|
LIBRE_OFFICE,
|
||||||
|
OCR_MY_PDF
|
||||||
|
}
|
||||||
|
|
||||||
|
private static final Map<Processes, ProcessExecutor> instances = new HashMap<>();
|
||||||
|
|
||||||
|
private final Semaphore semaphore;
|
||||||
|
|
||||||
|
private ProcessExecutor(int semaphoreLimit) {
|
||||||
|
this.semaphore = new Semaphore(semaphoreLimit);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static ProcessExecutor getInstance(Processes processType) {
|
||||||
|
return instances.computeIfAbsent(processType, key -> {
|
||||||
|
int semaphoreLimit = switch (key) {
|
||||||
|
case LIBRE_OFFICE -> 2;
|
||||||
|
case OCR_MY_PDF -> 2;
|
||||||
|
};
|
||||||
|
return new ProcessExecutor(semaphoreLimit);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
public int runCommandWithOutputHandling(List<String> command) throws IOException, InterruptedException {
|
||||||
|
int exitCode = 1;
|
||||||
|
semaphore.acquire();
|
||||||
|
try {
|
||||||
|
|
||||||
|
|
||||||
ProcessBuilder processBuilder = new ProcessBuilder(command);
|
ProcessBuilder processBuilder = new ProcessBuilder(command);
|
||||||
Process process = processBuilder.start();
|
Process process = processBuilder.start();
|
||||||
|
|
||||||
@ -41,7 +73,7 @@ public class ProcessExecutor {
|
|||||||
outputReaderThread.start();
|
outputReaderThread.start();
|
||||||
|
|
||||||
// Wait for the conversion process to complete
|
// Wait for the conversion process to complete
|
||||||
int exitCode = process.waitFor();
|
exitCode = process.waitFor();
|
||||||
|
|
||||||
// Wait for the reader threads to finish
|
// Wait for the reader threads to finish
|
||||||
errorReaderThread.join();
|
errorReaderThread.join();
|
||||||
@ -59,7 +91,9 @@ public class ProcessExecutor {
|
|||||||
throw new IOException("Command process failed with exit code " + exitCode + ". Error message: " + errorMessage);
|
throw new IOException("Command process failed with exit code " + exitCode + ". Error message: " + errorMessage);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} finally {
|
||||||
|
semaphore.release();
|
||||||
|
}
|
||||||
return exitCode;
|
return exitCode;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,12 +1,12 @@
|
|||||||
spring.http.multipart.max-file-size=1GB
|
spring.http.multipart.max-file-size=2GB
|
||||||
spring.http.multipart.max-request-size=1GB
|
spring.http.multipart.max-request-size=2GB
|
||||||
|
|
||||||
multipart.enabled=true
|
multipart.enabled=true
|
||||||
multipart.max-file-size=1000MB
|
multipart.max-file-size=2000MB
|
||||||
multipart.max-request-size=1000MB
|
multipart.max-request-size=2000MB
|
||||||
|
|
||||||
spring.servlet.multipart.max-file-size=1000MB
|
spring.servlet.multipart.max-file-size=2000MB
|
||||||
spring.servlet.multipart.max-request-size=1000MB
|
spring.servlet.multipart.max-request-size=2000MB
|
||||||
|
|
||||||
server.forward-headers-strategy=NATIVE
|
server.forward-headers-strategy=NATIVE
|
||||||
|
|
||||||
|
@ -26,10 +26,30 @@
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<!-- <div class="form-group">
|
<div class="form-group">
|
||||||
<input type="checkbox" class="form-check-input" name="sidecar" id="sidecar" />
|
<input type="checkbox" class="form-check-input" name="sidecar" id="sidecar" />
|
||||||
<label class="form-check-label" for="sidecar" th:text="#{ocr.selectText.2}"></label>
|
<label class="form-check-label" for="sidecar" th:text="#{ocr.selectText.2}"></label>
|
||||||
</div> -->
|
</div>
|
||||||
|
<div class="form-group">
|
||||||
|
<input type="checkbox" class="form-check-input" name="deskew" id="deskew" />
|
||||||
|
<label class="form-check-label" for="deskew" th:text="#{ocr.selectText.2}"></label>
|
||||||
|
</div>
|
||||||
|
<div class="form-group">
|
||||||
|
<input type="checkbox" class="form-check-input" name="clean" id="clean" />
|
||||||
|
<label class="form-check-label" for="clean" th:text="#{ocr.selectText.2}"></label>
|
||||||
|
</div>
|
||||||
|
<div class="form-group">
|
||||||
|
<input type="checkbox" class="form-check-input" name="clean-final" id="clean-final" />
|
||||||
|
<label class="form-check-label" for="clean-final" th:text="#{ocr.selectText.2}"></label>
|
||||||
|
</div>
|
||||||
|
<div class="form-group">
|
||||||
|
<label th:text="#{pdfToImage.selectText}"></label>
|
||||||
|
<select class="form-control" name="ocrType">
|
||||||
|
<option value="skip-text">Ignores pages that have interactive text on them, only OCRs pages that are images</option>
|
||||||
|
<option value="force-ocr">Force OCR, will OCR every page, removing all original text</option>
|
||||||
|
<option value="Normal">Normal (Will error if contains text)</option>
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
<button type="submit" class="btn btn-primary" th:text="#{ocr.submit}"></button>
|
<button type="submit" class="btn btn-primary" th:text="#{ocr.submit}"></button>
|
||||||
</form>
|
</form>
|
||||||
<p th:text="#{ocr.credit}"></p>
|
<p th:text="#{ocr.credit}"></p>
|
||||||
|
Loading…
Reference in New Issue
Block a user