From e13cb1943939c94118f10f469e59908710c522ca Mon Sep 17 00:00:00 2001 From: Anthony Stirling <77850077+Frooodle@users.noreply.github.com.> Date: Tue, 25 Mar 2025 13:35:34 +0000 Subject: [PATCH] revert OCR --- .../controller/api/misc/OCRController.java | 51 ++++--------------- 1 file changed, 10 insertions(+), 41 deletions(-) diff --git a/src/main/java/stirling/software/SPDF/controller/api/misc/OCRController.java b/src/main/java/stirling/software/SPDF/controller/api/misc/OCRController.java index c69d328c3..059e1f011 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/misc/OCRController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/misc/OCRController.java @@ -9,11 +9,7 @@ import java.util.stream.Collectors; import java.util.zip.ZipEntry; import java.util.zip.ZipOutputStream; -import javax.imageio.IIOImage; import javax.imageio.ImageIO; -import javax.imageio.ImageWriteParam; -import javax.imageio.ImageWriter; -import javax.imageio.stream.FileImageOutputStream; import org.apache.pdfbox.multipdf.PDFMergerUtility; import org.apache.pdfbox.pdmodel.PDDocument; @@ -92,6 +88,7 @@ public class OCRController { Files.createDirectories(tempImagesDir); Process process = null; try { + // Save input file inputFile.transferTo(tempInputFile.toFile()); PDFMergerUtility merger = new PDFMergerUtility(); merger.setDestinationFileName(finalOutputFile.toString()); @@ -101,6 +98,7 @@ public class OCRController { for (int pageNum = 0; pageNum < pageCount; pageNum++) { PDPage page = document.getPage(pageNum); boolean hasText = false; + // Check for existing text try (PDDocument tempDoc = new PDDocument()) { tempDoc.addPage(page); PDFTextStripper stripper = new PDFTextStripper(); @@ -115,42 +113,12 @@ public class OCRController { Path pageOutputPath = tempOutputDir.resolve(String.format("page_%d.pdf", pageNum)); if (shouldOcr) { - // Render with lower DPI (200 instead of 300) - BufferedImage image = pdfRenderer.renderImageWithDPI(pageNum, 200); - - // Convert to RGB to remove alpha channel if present - if (image.getType() != BufferedImage.TYPE_INT_RGB) { - BufferedImage rgbImage = - new BufferedImage( - image.getWidth(), - image.getHeight(), - BufferedImage.TYPE_INT_RGB); - rgbImage.getGraphics().drawImage(image, 0, 0, null); - image = rgbImage; - } - - // Save as JPEG with compression - String imageName = String.format("page_%d.jpg", pageNum); - Path imagePath = tempImagesDir.resolve(imageName); - - Iterator writers = ImageIO.getImageWritersByFormatName("jpg"); - if (!writers.hasNext()) { - throw new IllegalStateException("No JPG ImageWriter found"); - } - ImageWriter writer = writers.next(); - ImageWriteParam params = writer.getDefaultWriteParam(); - params.setCompressionMode(ImageWriteParam.MODE_EXPLICIT); - params.setCompressionQuality(0.7f); // Adjust quality here (0.7 = 70%) - - try (FileImageOutputStream output = - new FileImageOutputStream(imagePath.toFile())) { - writer.setOutput(output); - writer.write(null, new IIOImage(image, null, null), params); - } finally { - writer.dispose(); - } - - // Build OCR command with JPG image + // Convert page to image + BufferedImage image = pdfRenderer.renderImageWithDPI(pageNum, 300); + Path imagePath = + tempImagesDir.resolve(String.format("page_%d.png", pageNum)); + ImageIO.write(image, "png", imagePath.toFile()); + // Build OCR command List command = new ArrayList<>(); command.add("tesseract"); command.add(imagePath.toString()); @@ -160,6 +128,7 @@ public class OCRController { .toString()); command.add("-l"); command.add(String.join("+", languages)); + // Always output PDF command.add("pdf"); ProcessBuilder pb = new ProcessBuilder(command); process = pb.start(); @@ -246,4 +215,4 @@ public class OCRController { log.error("Error walking directory {}: {}", directory, e.getMessage()); } } -} +} \ No newline at end of file