mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-09-12 17:52:13 +02:00
revert OCR
This commit is contained in:
parent
1597d23f92
commit
e13cb19439
@ -9,11 +9,7 @@ import java.util.stream.Collectors;
|
|||||||
import java.util.zip.ZipEntry;
|
import java.util.zip.ZipEntry;
|
||||||
import java.util.zip.ZipOutputStream;
|
import java.util.zip.ZipOutputStream;
|
||||||
|
|
||||||
import javax.imageio.IIOImage;
|
|
||||||
import javax.imageio.ImageIO;
|
import javax.imageio.ImageIO;
|
||||||
import javax.imageio.ImageWriteParam;
|
|
||||||
import javax.imageio.ImageWriter;
|
|
||||||
import javax.imageio.stream.FileImageOutputStream;
|
|
||||||
|
|
||||||
import org.apache.pdfbox.multipdf.PDFMergerUtility;
|
import org.apache.pdfbox.multipdf.PDFMergerUtility;
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
@ -92,6 +88,7 @@ public class OCRController {
|
|||||||
Files.createDirectories(tempImagesDir);
|
Files.createDirectories(tempImagesDir);
|
||||||
Process process = null;
|
Process process = null;
|
||||||
try {
|
try {
|
||||||
|
// Save input file
|
||||||
inputFile.transferTo(tempInputFile.toFile());
|
inputFile.transferTo(tempInputFile.toFile());
|
||||||
PDFMergerUtility merger = new PDFMergerUtility();
|
PDFMergerUtility merger = new PDFMergerUtility();
|
||||||
merger.setDestinationFileName(finalOutputFile.toString());
|
merger.setDestinationFileName(finalOutputFile.toString());
|
||||||
@ -101,6 +98,7 @@ public class OCRController {
|
|||||||
for (int pageNum = 0; pageNum < pageCount; pageNum++) {
|
for (int pageNum = 0; pageNum < pageCount; pageNum++) {
|
||||||
PDPage page = document.getPage(pageNum);
|
PDPage page = document.getPage(pageNum);
|
||||||
boolean hasText = false;
|
boolean hasText = false;
|
||||||
|
// Check for existing text
|
||||||
try (PDDocument tempDoc = new PDDocument()) {
|
try (PDDocument tempDoc = new PDDocument()) {
|
||||||
tempDoc.addPage(page);
|
tempDoc.addPage(page);
|
||||||
PDFTextStripper stripper = new PDFTextStripper();
|
PDFTextStripper stripper = new PDFTextStripper();
|
||||||
@ -115,42 +113,12 @@ public class OCRController {
|
|||||||
Path pageOutputPath =
|
Path pageOutputPath =
|
||||||
tempOutputDir.resolve(String.format("page_%d.pdf", pageNum));
|
tempOutputDir.resolve(String.format("page_%d.pdf", pageNum));
|
||||||
if (shouldOcr) {
|
if (shouldOcr) {
|
||||||
// Render with lower DPI (200 instead of 300)
|
// Convert page to image
|
||||||
BufferedImage image = pdfRenderer.renderImageWithDPI(pageNum, 200);
|
BufferedImage image = pdfRenderer.renderImageWithDPI(pageNum, 300);
|
||||||
|
Path imagePath =
|
||||||
// Convert to RGB to remove alpha channel if present
|
tempImagesDir.resolve(String.format("page_%d.png", pageNum));
|
||||||
if (image.getType() != BufferedImage.TYPE_INT_RGB) {
|
ImageIO.write(image, "png", imagePath.toFile());
|
||||||
BufferedImage rgbImage =
|
// Build OCR command
|
||||||
new BufferedImage(
|
|
||||||
image.getWidth(),
|
|
||||||
image.getHeight(),
|
|
||||||
BufferedImage.TYPE_INT_RGB);
|
|
||||||
rgbImage.getGraphics().drawImage(image, 0, 0, null);
|
|
||||||
image = rgbImage;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Save as JPEG with compression
|
|
||||||
String imageName = String.format("page_%d.jpg", pageNum);
|
|
||||||
Path imagePath = tempImagesDir.resolve(imageName);
|
|
||||||
|
|
||||||
Iterator<ImageWriter> writers = ImageIO.getImageWritersByFormatName("jpg");
|
|
||||||
if (!writers.hasNext()) {
|
|
||||||
throw new IllegalStateException("No JPG ImageWriter found");
|
|
||||||
}
|
|
||||||
ImageWriter writer = writers.next();
|
|
||||||
ImageWriteParam params = writer.getDefaultWriteParam();
|
|
||||||
params.setCompressionMode(ImageWriteParam.MODE_EXPLICIT);
|
|
||||||
params.setCompressionQuality(0.7f); // Adjust quality here (0.7 = 70%)
|
|
||||||
|
|
||||||
try (FileImageOutputStream output =
|
|
||||||
new FileImageOutputStream(imagePath.toFile())) {
|
|
||||||
writer.setOutput(output);
|
|
||||||
writer.write(null, new IIOImage(image, null, null), params);
|
|
||||||
} finally {
|
|
||||||
writer.dispose();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Build OCR command with JPG image
|
|
||||||
List<String> command = new ArrayList<>();
|
List<String> command = new ArrayList<>();
|
||||||
command.add("tesseract");
|
command.add("tesseract");
|
||||||
command.add(imagePath.toString());
|
command.add(imagePath.toString());
|
||||||
@ -160,6 +128,7 @@ public class OCRController {
|
|||||||
.toString());
|
.toString());
|
||||||
command.add("-l");
|
command.add("-l");
|
||||||
command.add(String.join("+", languages));
|
command.add(String.join("+", languages));
|
||||||
|
// Always output PDF
|
||||||
command.add("pdf");
|
command.add("pdf");
|
||||||
ProcessBuilder pb = new ProcessBuilder(command);
|
ProcessBuilder pb = new ProcessBuilder(command);
|
||||||
process = pb.start();
|
process = pb.start();
|
||||||
|
Loading…
Reference in New Issue
Block a user