mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-09-12 17:52:13 +02:00
revert OCR
This commit is contained in:
parent
1597d23f92
commit
e13cb19439
@ -9,11 +9,7 @@ import java.util.stream.Collectors;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipOutputStream;
|
||||
|
||||
import javax.imageio.IIOImage;
|
||||
import javax.imageio.ImageIO;
|
||||
import javax.imageio.ImageWriteParam;
|
||||
import javax.imageio.ImageWriter;
|
||||
import javax.imageio.stream.FileImageOutputStream;
|
||||
|
||||
import org.apache.pdfbox.multipdf.PDFMergerUtility;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
@ -92,6 +88,7 @@ public class OCRController {
|
||||
Files.createDirectories(tempImagesDir);
|
||||
Process process = null;
|
||||
try {
|
||||
// Save input file
|
||||
inputFile.transferTo(tempInputFile.toFile());
|
||||
PDFMergerUtility merger = new PDFMergerUtility();
|
||||
merger.setDestinationFileName(finalOutputFile.toString());
|
||||
@ -101,6 +98,7 @@ public class OCRController {
|
||||
for (int pageNum = 0; pageNum < pageCount; pageNum++) {
|
||||
PDPage page = document.getPage(pageNum);
|
||||
boolean hasText = false;
|
||||
// Check for existing text
|
||||
try (PDDocument tempDoc = new PDDocument()) {
|
||||
tempDoc.addPage(page);
|
||||
PDFTextStripper stripper = new PDFTextStripper();
|
||||
@ -115,42 +113,12 @@ public class OCRController {
|
||||
Path pageOutputPath =
|
||||
tempOutputDir.resolve(String.format("page_%d.pdf", pageNum));
|
||||
if (shouldOcr) {
|
||||
// Render with lower DPI (200 instead of 300)
|
||||
BufferedImage image = pdfRenderer.renderImageWithDPI(pageNum, 200);
|
||||
|
||||
// Convert to RGB to remove alpha channel if present
|
||||
if (image.getType() != BufferedImage.TYPE_INT_RGB) {
|
||||
BufferedImage rgbImage =
|
||||
new BufferedImage(
|
||||
image.getWidth(),
|
||||
image.getHeight(),
|
||||
BufferedImage.TYPE_INT_RGB);
|
||||
rgbImage.getGraphics().drawImage(image, 0, 0, null);
|
||||
image = rgbImage;
|
||||
}
|
||||
|
||||
// Save as JPEG with compression
|
||||
String imageName = String.format("page_%d.jpg", pageNum);
|
||||
Path imagePath = tempImagesDir.resolve(imageName);
|
||||
|
||||
Iterator<ImageWriter> writers = ImageIO.getImageWritersByFormatName("jpg");
|
||||
if (!writers.hasNext()) {
|
||||
throw new IllegalStateException("No JPG ImageWriter found");
|
||||
}
|
||||
ImageWriter writer = writers.next();
|
||||
ImageWriteParam params = writer.getDefaultWriteParam();
|
||||
params.setCompressionMode(ImageWriteParam.MODE_EXPLICIT);
|
||||
params.setCompressionQuality(0.7f); // Adjust quality here (0.7 = 70%)
|
||||
|
||||
try (FileImageOutputStream output =
|
||||
new FileImageOutputStream(imagePath.toFile())) {
|
||||
writer.setOutput(output);
|
||||
writer.write(null, new IIOImage(image, null, null), params);
|
||||
} finally {
|
||||
writer.dispose();
|
||||
}
|
||||
|
||||
// Build OCR command with JPG image
|
||||
// Convert page to image
|
||||
BufferedImage image = pdfRenderer.renderImageWithDPI(pageNum, 300);
|
||||
Path imagePath =
|
||||
tempImagesDir.resolve(String.format("page_%d.png", pageNum));
|
||||
ImageIO.write(image, "png", imagePath.toFile());
|
||||
// Build OCR command
|
||||
List<String> command = new ArrayList<>();
|
||||
command.add("tesseract");
|
||||
command.add(imagePath.toString());
|
||||
@ -160,6 +128,7 @@ public class OCRController {
|
||||
.toString());
|
||||
command.add("-l");
|
||||
command.add(String.join("+", languages));
|
||||
// Always output PDF
|
||||
command.add("pdf");
|
||||
ProcessBuilder pb = new ProcessBuilder(command);
|
||||
process = pb.start();
|
||||
@ -246,4 +215,4 @@ public class OCRController {
|
||||
log.error("Error walking directory {}: {}", directory, e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user