Changes to blank detection, WIP for %

This commit is contained in:
Anthony Stirling 2023-05-11 23:05:33 +01:00
parent 2d42ae9a36
commit a647347e10
4 changed files with 85 additions and 52 deletions

View File

@ -1,8 +1,9 @@
import cv2
import numpy as np
import sys
import argparse
def is_blank_image(image_path, threshold=10, white_value=255, blur_size=5):
def is_blank_image(image_path, threshold=10, white_percent=99, white_value=255, blur_size=5):
image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
if image is None:
@ -19,21 +20,21 @@ def is_blank_image(image_path, threshold=10, white_value=255, blur_size=5):
total_pixels = thresholded_image.size
white_pixel_percentage = (white_pixels / total_pixels) * 100
return white_pixel_percentage > 99
return white_pixel_percentage > white_percent
if __name__ == "__main__":
if len(sys.argv) != 2:
print("Usage: python detect_blank_page.py <image_path>")
sys.exit(1)
parser = argparse.ArgumentParser(description='Detect if an image is considered blank or not.')
parser.add_argument('image_path', help='The path to the image file.')
parser.add_argument('-t', '--threshold', type=int, default=10, help='Threshold for determining white pixels. The default value is 10.')
parser.add_argument('-w', '--white_percent', type=int, default=99, help='The percentage of white pixels for an image to be considered blank. The default value is 99.')
args = parser.parse_args()
image_path = sys.argv[1]
blank = is_blank_image(image_path)
blank = is_blank_image(args.image_path, args.threshold, args.white_percent)
if blank:
# Return code 1: The image is considered blank.
sys.exit(1)
else:
# Return code 0: The image is not considered blank.
sys.exit(0)
sys.exit(0)

View File

@ -1,11 +1,15 @@
package stirling.software.SPDF.controller.api.other;
import java.awt.image.BufferedImage;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import javax.imageio.ImageIO;
@ -17,6 +21,7 @@ import org.apache.pdfbox.text.PDFTextStripper;
import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RequestPart;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
@ -29,7 +34,9 @@ import stirling.software.SPDF.utils.ProcessExecutor;
public class BlankPageController {
@PostMapping(consumes = "multipart/form-data", value = "/remove-blanks")
public ResponseEntity<byte[]> removeBlankPages(@RequestPart(required = true, value = "fileInput") MultipartFile inputFile) throws IOException, InterruptedException {
public ResponseEntity<byte[]> removeBlankPages(@RequestPart(required = true, value = "fileInput") MultipartFile inputFile,
@RequestParam(defaultValue = "10", name = "threshold") int threshold,
@RequestParam(defaultValue = "99", name = "whitePercent") int whitePercent) throws IOException, InterruptedException {
PDDocument document = null;
try {
document = PDDocument.load(inputFile.getInputStream());
@ -38,62 +45,67 @@ public class BlankPageController {
List<Integer> pagesToKeepIndex = new ArrayList<>();
int pageIndex = 0;
PDFRenderer pdfRenderer = new PDFRenderer(document);
for (PDPage page : pages) {
pageIndex++;
textStripper.setStartPage(pageIndex);
textStripper.setEndPage(pageIndex);
System.out.println("checking page " + pageIndex);
textStripper.setStartPage(pageIndex + 1);
textStripper.setEndPage(pageIndex + 1);
String pageText = textStripper.getText(document);
boolean hasText = !pageText.trim().isEmpty();
if (hasText) {
pagesToKeepIndex.add(pageIndex);
System.out.println("page " + pageIndex + " has text");
continue;
}
boolean hasImages = hasImagesOnPage(page);
if (hasImages) {
pagesToKeepIndex.add(pageIndex);
System.out.println("page " + pageIndex + " has image");
continue;
}
}
System.out.print(pagesToKeepIndex.size());
PDDocument outputDocument = new PDDocument();
PDFRenderer pdfRenderer = new PDFRenderer(document);
for (Integer i : pagesToKeepIndex) {
// Create temp file to save the image
Path tempFile = Files.createTempFile("image_", ".png");
// Render image and save as temp file
BufferedImage image = pdfRenderer.renderImageWithDPI(i - 1, 300);
ImageIO.write(image, "png", tempFile.toFile());
List<String> command = new ArrayList<>(Arrays.asList("python3", System.getProperty("user.dir") + "scripts/detect-blank-pages.py", tempFile.toString()));
// Run CLI command
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV).runCommandWithOutputHandling(command);
//does contain data
if(returnCode ==0) {
outputDocument.addPage(document.getPage(i - 1));
pagesToKeepIndex.add(pageIndex);
System.out.println("page " + pageIndex + " has text");
} else {
System.out.print("Found blank page skipping, page #" + i);
boolean hasImages = hasImagesOnPage(page);
if (hasImages) {
System.out.println("page " + pageIndex + " has image");
Path tempFile = Files.createTempFile("image_", ".png");
// Render image and save as temp file
BufferedImage image = pdfRenderer.renderImageWithDPI(pageIndex, 300);
ImageIO.write(image, "png", tempFile.toFile());
List<String> command = new ArrayList<>(Arrays.asList("python3", System.getProperty("user.dir") + "scripts/detect-blank-pages.py", tempFile.toString() ,"--threshold", String.valueOf(threshold), "--white_percent", String.valueOf(whitePercent)));
// Run CLI command
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV).runCommandWithOutputHandling(command);
// does contain data
if (returnCode == 0) {
System.out.println("page " + pageIndex + " has image which is not blank");
pagesToKeepIndex.add(pageIndex);
} else {
System.out.println("Skipping, Image was blank for page #" + pageIndex);
}
}
}
pageIndex++;
}
System.out.print("pagesToKeep=" + pagesToKeepIndex.size());
// Remove pages not present in pagesToKeepIndex
List<Integer> pageIndices = IntStream.range(0, pages.getCount()).boxed().collect(Collectors.toList());
Collections.reverse(pageIndices); // Reverse to prevent index shifting during removal
for (Integer i : pageIndices) {
if (!pagesToKeepIndex.contains(i)) {
pages.remove(i);
}
}
return PdfUtils.pdfDocToWebResponse(outputDocument, inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_blanksRemoved.pdf");
return PdfUtils.pdfDocToWebResponse(document, inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_blanksRemoved.pdf");
} catch (IOException e) {
e.printStackTrace();
return new ResponseEntity<>(HttpStatus.INTERNAL_SERVER_ERROR);
} finally {
if(document != null)
if (document != null)
document.close();
}
}
private static boolean hasImagesOnPage(PDPage page) throws IOException {
ImageFinder imageFinder = new ImageFinder(page);
imageFinder.processPage(page);

View File

@ -101,8 +101,18 @@ public class ExtractImageScansController {
for (int i = 0; i < images.size(); i++) {
Path tempDir = Files.createTempDirectory("openCV_output");
List<String> command = new ArrayList<>(Arrays.asList("python3", "./scripts/split_photos.py", images.get(i), tempDir.toString(), String.valueOf(angleThreshold),
String.valueOf(tolerance), String.valueOf(minArea), String.valueOf(minContourArea), String.valueOf(borderSize)));
List<String> command = new ArrayList<>(Arrays.asList(
"python3",
"./scripts/split_photos.py",
images.get(i),
tempDir.toString(),
"--angle_threshold", String.valueOf(angleThreshold),
"--tolerance", String.valueOf(tolerance),
"--min_area", String.valueOf(minArea),
"--min_contour_area", String.valueOf(minContourArea),
"--border_size", String.valueOf(borderSize)
));
// Run CLI command
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV).runCommandWithOutputHandling(command);

View File

@ -16,6 +16,16 @@
<form id="multiPdfForm" th:action="@{remove-blanks}" method="post" enctype="multipart/form-data">
<div th:replace="~{fragments/common :: fileSelector(name='fileInput', multiple=false, accept='application/pdf')}"></div>
<div class="form-group">
<label for="threshold">Threshold:</label>
<input type="number" class="form-control" id="threshold" name="threshold" value="10">
<small id="thresholdHelp" class="form-text text-muted">Threshold for determining how white a white pixel must be)</small>
</div>
<div class="form-group">
<label for="whitePercent">White Percent (%):</label>
<input type="number" class="form-control" id="whitePercent" name="whitePercent" value="99">
<small id="whitePercentHelp" class="form-text text-muted">Percent of page that must be white to be removed</small>
</div>
<button type="submit" id="submitBtn" class="btn btn-primary" th:text="#{removeBlanks.submit}"></button>
</form>
</div>