Stirling-PDF/src/main/java/stirling/software/SPDF/controller/OCRController.java
Anthony Stirling a9145fe84c
Lots of changes (#70)
Image extraction and conversion to formats 

Multi parallel file execution for all forms so you can input multiple files quickly 

Any file at all pdf using libreoffice, super powerful
Sadly makes docker image larger but worth it 

OCR PDF using ocr my pdf
Works awesomely for adding text to a image

Improved compression using ocr my pdf app

Settings page with custom download options such as 
- open in same window
- open in new window
- download
- download as zip

Update detection in settings page it should show notification if there is a update (very hidden)

UI cleanups

Add other image formats to PDF to Image

Various fies to icons, and pdf.js usage
2023-03-20 21:55:11 +00:00

144 lines
5.4 KiB
Java

package stirling.software.SPDF.controller;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.http.HttpHeaders;
import org.springframework.http.HttpStatus;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.multipart.MultipartFile;
import org.springframework.web.servlet.ModelAndView;
import stirling.software.SPDF.utils.ProcessExecutor;
import java.io.FileOutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
//import com.spire.pdf.*;
@Controller
public class OCRController {
private static final Logger logger = LoggerFactory.getLogger(OCRController.class);
@GetMapping("/ocr-pdf")
public ModelAndView ocrPdfPage() {
ModelAndView modelAndView = new ModelAndView("ocr-pdf");
modelAndView.addObject("languages", getAvailableTesseractLanguages());
modelAndView.addObject("currentPage", "ocr-pdf");
return modelAndView;
}
@PostMapping("/ocr-pdf")
public ResponseEntity<byte[]> processPdfWithOCR(@RequestParam("fileInput") MultipartFile inputFile,
@RequestParam("languages") List<String> selectedLanguages,
@RequestParam(name = "sidecar", required = false) Boolean sidecar) throws IOException, InterruptedException {
//--output-type pdfa
if (selectedLanguages == null || selectedLanguages.size() < 1) {
throw new IOException("Please select at least one language.");
}
// Save the uploaded file to a temporary location
Path tempInputFile = Files.createTempFile("input_", ".pdf");
inputFile.transferTo(tempInputFile.toFile());
// Prepare the output file path
Path tempOutputFile = Files.createTempFile("output_", ".pdf");
// Run OCR Command
String languageOption = String.join("+", selectedLanguages);
List<String> command = new ArrayList<>(Arrays.asList("ocrmypdf","--verbose", "2", "--language", languageOption,
tempInputFile.toString(), tempOutputFile.toString()));
String sidecarFile = tempOutputFile.toString().replace(".pdf", ".txt");
if (sidecar != null && sidecar) {
command.add("--sidecar");
command.add(sidecarFile);
}
int returnCode = ProcessExecutor.runCommandWithOutputHandling(command);
// Read the OCR processed PDF file
byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
// Clean up the temporary files
Files.delete(tempInputFile);
// Return the OCR processed PDF as a response
String outputFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_OCR.pdf";
HttpHeaders headers = new HttpHeaders();
if (sidecar != null && sidecar) {
// Create a zip file containing both the PDF and the text file
String outputZipFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_OCR.zip";
Path tempZipFile = Files.createTempFile("output_", ".zip");
try (ZipOutputStream zipOut = new ZipOutputStream(new FileOutputStream(tempZipFile.toFile()))) {
// Add PDF file to the zip
ZipEntry pdfEntry = new ZipEntry(outputFilename);
zipOut.putNextEntry(pdfEntry);
Files.copy(tempOutputFile, zipOut);
zipOut.closeEntry();
// Add text file to the zip
ZipEntry txtEntry = new ZipEntry(sidecarFile);
zipOut.putNextEntry(txtEntry);
Files.copy(Paths.get(sidecarFile), zipOut);
zipOut.closeEntry();
}
byte[] zipBytes = Files.readAllBytes(tempZipFile);
// Clean up the temporary zip file
Files.delete(tempZipFile);
Files.delete(tempOutputFile);
Files.delete(Paths.get(sidecarFile));
// Return the zip file containing both the PDF and the text file
headers.setContentType(MediaType.APPLICATION_OCTET_STREAM);
headers.setContentDispositionFormData("attachment", outputZipFilename);
return ResponseEntity.ok().headers(headers).body(zipBytes);
} else {
// Return the OCR processed PDF as a response
Files.delete(tempOutputFile);
headers.setContentType(MediaType.APPLICATION_PDF);
headers.setContentDispositionFormData("attachment", outputFilename);
return ResponseEntity.ok().headers(headers).body(pdfBytes);
}
}
public List<String> getAvailableTesseractLanguages() {
String tessdataDir = "/usr/share/tesseract-ocr/4.00/tessdata";
File[] files = new File(tessdataDir).listFiles();
if (files == null) {
return Collections.emptyList();
}
return Arrays.stream(files)
.filter(file -> file.getName().endsWith(".traineddata"))
.map(file -> file.getName().replace(".traineddata", ""))
.filter(lang -> !lang.equalsIgnoreCase("osd"))
.collect(Collectors.toList());
}
}