From 58618b3a21533fb7c071e34177fe7084d3f5bdcc Mon Sep 17 00:00:00 2001 From: Ludy Date: Tue, 20 Aug 2024 17:17:54 +0200 Subject: [PATCH] Add: Convert PDF to WebP (#1666) * Add PDF to WebP * add swagger param * back * creates a custom image for Docker from pymupdf * Converting with pdf2image and Pillow instead of pymupdf * webp remove to pdf-to-img * remove mupdf --- .github/labeler-config.yml | 1 + Dockerfile | 6 +- Dockerfile-fat | 6 +- Endpoint-groups.md | 4 +- scripts/png_to_webp.py | 174 ++++++++++++++++++ .../SPDF/config/EndpointConfiguration.java | 1 + .../converters/ConvertImgPDFController.java | 95 +++++++++- .../web/ConverterWebController.java | 30 +-- .../api/converters/ConvertToImageRequest.java | 2 +- src/main/resources/messages_en_GB.properties | 2 +- .../templates/convert/pdf-to-img.html | 1 + .../resources/templates/fragments/navbar.html | 2 +- 12 files changed, 296 insertions(+), 28 deletions(-) create mode 100644 scripts/png_to_webp.py diff --git a/.github/labeler-config.yml b/.github/labeler-config.yml index a6cce904..f777e3ea 100644 --- a/.github/labeler-config.yml +++ b/.github/labeler-config.yml @@ -2,6 +2,7 @@ Translation: - changed-files: - any-glob-to-any-file: 'src/main/resources/messages_*_*.properties' - any-glob-to-any-file: 'scripts/ignore_translation.toml' + - any-glob-to-any-file: 'src/main/resources/templates/fragments/languages.html' Front End: - changed-files: diff --git a/Dockerfile b/Dockerfile index 44031ba7..5ae8d7e4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -39,16 +39,16 @@ RUN echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /et libreoffice \ # pdftohtml poppler-utils \ -# OCR MY PDF (unpaper for descew and other advanced featues) +# OCR MY PDF (unpaper for descew and other advanced features) ocrmypdf \ tesseract-ocr-data-eng \ # CV py3-opencv \ # python3/pip python3 \ - py3-pip && \ + py3-pip && \ # uno unoconv and HTML - pip install --break-system-packages --no-cache-dir --upgrade unoconv WeasyPrint && \ + pip install --break-system-packages --no-cache-dir --upgrade unoconv WeasyPrint pdf2image pillow && \ mv /usr/share/tessdata /usr/share/tessdata-original && \ mkdir -p $HOME /configs /logs /customFiles /pipeline/watchedFolders /pipeline/finishedFolders && \ fc-cache -f -v && \ diff --git a/Dockerfile-fat b/Dockerfile-fat index c31fe348..1efb0eae 100644 --- a/Dockerfile-fat +++ b/Dockerfile-fat @@ -9,7 +9,7 @@ COPY . . # Build the application with DOCKER_ENABLE_SECURITY=false RUN DOCKER_ENABLE_SECURITY=true \ -./gradlew clean build +./gradlew clean build # Main stage FROM alpine:3.20.2 @@ -32,7 +32,7 @@ ENV DOCKER_ENABLE_SECURITY=false \ UMASK=022 \ FAT_DOCKER=true \ INSTALL_BOOK_AND_ADVANCED_HTML_OPS=false - + # JDK for app RUN echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/apk/repositories && \ @@ -64,7 +64,7 @@ RUN echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /et python3 \ py3-pip && \ # uno unoconv and HTML - pip install --break-system-packages --no-cache-dir --upgrade unoconv WeasyPrint && \ + pip install --break-system-packages --no-cache-dir --upgrade unoconv WeasyPrint pdf2image pillow && \ mv /usr/share/tessdata /usr/share/tessdata-original && \ mkdir -p $HOME /configs /logs /customFiles /pipeline/watchedFolders /pipeline/finishedFolders && \ fc-cache -f -v && \ diff --git a/Endpoint-groups.md b/Endpoint-groups.md index 9f906586..a1a29817 100644 --- a/Endpoint-groups.md +++ b/Endpoint-groups.md @@ -15,7 +15,7 @@ | file-to-pdf | | ✔️ | | | ✔️ | | | ✔️ | | | | | img-to-pdf | | ✔️ | | | | | | | | ✔️ | | | pdf-to-html | | ✔️ | | | ✔️ | | | ✔️ | | | | -| pdf-to-img | | ✔️ | | | | | | | | ✔️ | | +| pdf-to-img | | ✔️ | | | | ✔️ | | | | ✔️ | | | pdf-to-pdfa | | ✔️ | | | ✔️ | | | | ✔️ | | | | pdf-to-markdown | | ✔️ | | | | | | | | ✔️ | | | pdf-to-presentation | | ✔️ | | | ✔️ | | | ✔️ | | | | @@ -44,4 +44,4 @@ | remove-blanks | | | | ✔️ | ✔️ | ✔️ | ✔️ | | | | | | repair | | | | ✔️ | ✔️ | | | ✔️ | | | | | show-javascript | | | | ✔️ | | | | | | | ✔️ | -| sign | | | | ✔️ | | | | | | | ✔️ | \ No newline at end of file +| sign | | | | ✔️ | | | | | | | ✔️ | diff --git a/scripts/png_to_webp.py b/scripts/png_to_webp.py new file mode 100644 index 00000000..6f086d52 --- /dev/null +++ b/scripts/png_to_webp.py @@ -0,0 +1,174 @@ +""" +Author: Ludy87 +Description: This script converts a PDF file to WebP images. It includes functionality to resize images if they exceed specified dimensions and handle conversion of PDF pages to WebP format. + +Example +------- +To convert a PDF file to WebP images with each page as a separate WebP file: + python script.py input.pdf output_directory + +To convert a PDF file to a single WebP image: + python script.py input.pdf output_directory --single + +To adjust the DPI resolution for rendering PDF pages: + python script.py input.pdf output_directory --dpi 150 +""" + +import argparse +import os +from pdf2image import convert_from_path +from PIL import Image + + +def resize_image(input_image_path, output_image_path, max_size=(16383, 16383)): + """ + Resize the image if its dimensions exceed the maximum allowed size and save it as WebP. + + Parameters + ---------- + input_image_path : str + Path to the input image file. + output_image_path : str + Path where the output WebP image will be saved. + max_size : tuple of int, optional + Maximum allowed dimensions for the image (width, height). Default is (16383, 16383). + + Returns + ------- + None + """ + try: + # Open the image + image = Image.open(input_image_path) + width, height = image.size + max_width, max_height = max_size + + # Check if the image dimensions exceed the maximum allowed dimensions + if width > max_width or height > max_height: + # Calculate the scaling ratio + ratio = min(max_width / width, max_height / height) + new_width = int(width * ratio) + new_height = int(height * ratio) + + # Resize the image + resized_image = image.resize((new_width, new_height), Image.LANCZOS) + resized_image.save(output_image_path, format="WEBP", quality=100) + print( + f"The image was successfully resized to ({new_width}, {new_height}) and saved as WebP: {output_image_path}" + ) + else: + # If dimensions are within the allowed limits, save the image directly + image.save(output_image_path, format="WEBP", quality=100) + print(f"The image was successfully saved as WebP: {output_image_path}") + except Exception as e: + print(f"An error occurred: {e}") + + +def convert_image_to_webp(input_image, output_file): + """ + Convert an image to WebP format, resizing it if it exceeds the maximum dimensions. + + Parameters + ---------- + input_image : str + Path to the input image file. + output_file : str + Path where the output WebP image will be saved. + + Returns + ------- + None + """ + # Resize the image if it exceeds the maximum dimensions + resize_image(input_image, output_file, max_size=(16383, 16383)) + + +def pdf_to_webp(pdf_path, output_dir, dpi=300): + """ + Convert each page of a PDF file to WebP images. + + Parameters + ---------- + pdf_path : str + Path to the input PDF file. + output_dir : str + Directory where the WebP images will be saved. + dpi : int, optional + DPI resolution for rendering PDF pages. Default is 300. + + Returns + ------- + None + """ + # Convert the PDF to a list of images + images = convert_from_path(pdf_path, dpi=dpi) + + for page_number, image in enumerate(images): + # Define temporary PNG path + temp_png_path = os.path.join(output_dir, f"temp_page_{page_number + 1}.png") + image.save(temp_png_path, format="PNG") + + # Define the output path for WebP + output_path = os.path.join(output_dir, f"page_{page_number + 1}.webp") + + # Convert PNG to WebP + convert_image_to_webp(temp_png_path, output_path) + + # Delete the temporary PNG file + os.remove(temp_png_path) + + +def main(pdf_image_path, output_dir, dpi=300, single_images_flag=False): + """ + Main function to handle conversion from PDF to WebP images. + + Parameters + ---------- + pdf_image_path : str + Path to the input PDF file or image. + output_dir : str + Directory where the WebP images will be saved. + dpi : int, optional + DPI resolution for rendering PDF pages. Default is 300. + single_images_flag : bool, optional + If True, combine all pages into a single WebP image. Default is False. + + Returns + ------- + None + """ + if single_images_flag: + # Combine all pages into a single WebP image + output_path = os.path.join(output_dir, "combined_image.webp") + convert_image_to_webp(pdf_image_path, output_path) + else: + # Convert each PDF page to a separate WebP image + pdf_to_webp(pdf_image_path, output_dir, dpi) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Convert a PDF file to WebP images.") + parser.add_argument("pdf_path", help="The path to the input PDF file.") + parser.add_argument( + "output_dir", help="The directory where the WebP images should be saved." + ) + parser.add_argument( + "--dpi", + type=int, + default=300, + help="The DPI resolution for rendering the PDF pages (default: 300).", + ) + parser.add_argument( + "--single", + action="store_true", + help="Combine all pages into a single WebP image.", + ) + args = parser.parse_args() + + os.makedirs(args.output_dir, exist_ok=True) + main( + args.pdf_path, + args.output_dir, + dpi=args.dpi, + single_images_flag=args.single, + ) diff --git a/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java b/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java index 303de37f..e0ae56a1 100644 --- a/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java +++ b/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java @@ -166,6 +166,7 @@ public class EndpointConfiguration { addEndpointToGroup("Python", REMOVE_BLANKS); addEndpointToGroup("Python", "html-to-pdf"); addEndpointToGroup("Python", "url-to-pdf"); + addEndpointToGroup("Python", "pdf-to-img"); // openCV addEndpointToGroup("OpenCV", "extract-image-scans"); diff --git a/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertImgPDFController.java b/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertImgPDFController.java index 25c37aad..6d3afc6d 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertImgPDFController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertImgPDFController.java @@ -1,11 +1,23 @@ package stirling.software.SPDF.controller.api.converters; +import java.io.ByteArrayOutputStream; +import java.io.FileOutputStream; import java.io.IOException; import java.net.URLConnection; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; +import java.util.zip.ZipEntry; +import java.util.zip.ZipOutputStream; +import org.apache.commons.io.FileUtils; import org.apache.pdfbox.rendering.ImageType; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.springframework.http.HttpHeaders; import org.springframework.http.MediaType; import org.springframework.http.ResponseEntity; import org.springframework.web.bind.annotation.ModelAttribute; @@ -21,6 +33,8 @@ import io.swagger.v3.oas.annotations.tags.Tag; import stirling.software.SPDF.model.api.converters.ConvertToImageRequest; import stirling.software.SPDF.model.api.converters.ConvertToPdfRequest; import stirling.software.SPDF.utils.PdfUtils; +import stirling.software.SPDF.utils.ProcessExecutor; +import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult; import stirling.software.SPDF.utils.WebResponseUtils; @RestController @@ -60,15 +74,92 @@ public class ConvertImgPDFController { result = PdfUtils.convertFromPdf( pdfBytes, - imageFormat.toUpperCase(), + imageFormat.equalsIgnoreCase("webp") ? "png" : imageFormat.toUpperCase(), colorTypeResult, singleImage, Integer.valueOf(dpi), filename); - if (result == null || result.length == 0) { logger.error("resultant bytes for {} is null, error converting ", filename); } + if (imageFormat.equalsIgnoreCase("webp")) { + // Write the output stream to a temp file + Path tempFile = Files.createTempFile("temp_png", ".png"); + try (FileOutputStream fos = new FileOutputStream(tempFile.toFile())) { + fos.write(result); + fos.flush(); + } + + String pythonVersion = "python3"; + try { + ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV) + .runCommandWithOutputHandling(Arrays.asList("python3", "--version")); + } catch (IOException e) { + ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV) + .runCommandWithOutputHandling(Arrays.asList("python", "--version")); + pythonVersion = "python"; + } + + List command = new ArrayList<>(); + command.add(pythonVersion); + command.add("./scripts/png_to_webp.py"); // Python script to handle the conversion + + // Create a temporary directory for the output WebP files + Path tempOutputDir = Files.createTempDirectory("webp_output"); + if (singleImage) { + // Run the Python script to convert PNG to WebP + command.add(tempFile.toString()); + command.add(tempOutputDir.toString()); + command.add("--single"); + } else { + // Save the uploaded PDF to a temporary file + Path tempPdfPath = Files.createTempFile("temp_pdf", ".pdf"); + file.transferTo(tempPdfPath.toFile()); + // Run the Python script to convert PDF to WebP + command.add(tempPdfPath.toString()); + command.add(tempOutputDir.toString()); + } + command.add("--dpi"); + command.add(dpi); + ProcessExecutorResult resultProcess = + ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV) + .runCommandWithOutputHandling(command); + + // Find all WebP files in the output directory + List webpFiles = + Files.walk(tempOutputDir) + .filter(path -> path.toString().endsWith(".webp")) + .collect(Collectors.toList()); + + if (webpFiles.isEmpty()) { + logger.error("No WebP files were created in: {}", tempOutputDir.toString()); + throw new IOException("No WebP files were created. " + resultProcess.getMessages()); + } + + byte[] bodyBytes = new byte[0]; + + if (webpFiles.size() == 1) { + // Return the single WebP file directly + Path webpFilePath = webpFiles.get(0); + bodyBytes = Files.readAllBytes(webpFilePath); + } else { + // Create a ZIP file containing all WebP images + ByteArrayOutputStream zipOutputStream = new ByteArrayOutputStream(); + try (ZipOutputStream zos = new ZipOutputStream(zipOutputStream)) { + for (Path webpFile : webpFiles) { + zos.putNextEntry(new ZipEntry(webpFile.getFileName().toString())); + Files.copy(webpFile, zos); + zos.closeEntry(); + } + } + bodyBytes = zipOutputStream.toByteArray(); + } + // Clean up the temporary files + Files.deleteIfExists(tempFile); + if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile()); + result = bodyBytes; + } + if (singleImage) { String docName = filename + "." + imageFormat; MediaType mediaType = MediaType.parseMediaType(getMediaType(imageFormat)); diff --git a/src/main/java/stirling/software/SPDF/controller/web/ConverterWebController.java b/src/main/java/stirling/software/SPDF/controller/web/ConverterWebController.java index 23270df5..bfb18203 100644 --- a/src/main/java/stirling/software/SPDF/controller/web/ConverterWebController.java +++ b/src/main/java/stirling/software/SPDF/controller/web/ConverterWebController.java @@ -21,14 +21,6 @@ public class ConverterWebController { return "convert/book-to-pdf"; } - @ConditionalOnExpression("#{bookAndHtmlFormatsInstalled}") - @GetMapping("/pdf-to-book") - @Hidden - public String convertPdfToBookForm(Model model) { - model.addAttribute("currentPage", "pdf-to-book"); - return "convert/pdf-to-book"; - } - @GetMapping("/img-to-pdf") @Hidden public String convertImgToPdfForm(Model model) { @@ -57,13 +49,6 @@ public class ConverterWebController { return "convert/url-to-pdf"; } - @GetMapping("/pdf-to-img") - @Hidden - public String pdfToimgForm(Model model) { - model.addAttribute("currentPage", "pdf-to-img"); - return "convert/pdf-to-img"; - } - @GetMapping("/file-to-pdf") @Hidden public String convertToPdfForm(Model model) { @@ -73,6 +58,21 @@ public class ConverterWebController { // PDF TO...... + @ConditionalOnExpression("#{bookAndHtmlFormatsInstalled}") + @GetMapping("/pdf-to-book") + @Hidden + public String convertPdfToBookForm(Model model) { + model.addAttribute("currentPage", "pdf-to-book"); + return "convert/pdf-to-book"; + } + + @GetMapping("/pdf-to-img") + @Hidden + public String pdfToimgForm(Model model) { + model.addAttribute("currentPage", "pdf-to-img"); + return "convert/pdf-to-img"; + } + @GetMapping("/pdf-to-html") @Hidden public ModelAndView pdfToHTML() { diff --git a/src/main/java/stirling/software/SPDF/model/api/converters/ConvertToImageRequest.java b/src/main/java/stirling/software/SPDF/model/api/converters/ConvertToImageRequest.java index eaa8e361..3bde3e16 100644 --- a/src/main/java/stirling/software/SPDF/model/api/converters/ConvertToImageRequest.java +++ b/src/main/java/stirling/software/SPDF/model/api/converters/ConvertToImageRequest.java @@ -12,7 +12,7 @@ public class ConvertToImageRequest extends PDFFile { @Schema( description = "The output image format", - allowableValues = {"png", "jpeg", "jpg", "gif"}) + allowableValues = {"png", "jpeg", "jpg", "gif", "webp"}) private String imageFormat; @Schema( diff --git a/src/main/resources/messages_en_GB.properties b/src/main/resources/messages_en_GB.properties index 6ecafacc..7997abac 100644 --- a/src/main/resources/messages_en_GB.properties +++ b/src/main/resources/messages_en_GB.properties @@ -1147,4 +1147,4 @@ error.discordSubmit=Discord - Submit Support post removeImage.title=Remove image removeImage.header=Remove image removeImage.removeImage=Remove image -removeImage.submit=Remove image \ No newline at end of file +removeImage.submit=Remove image diff --git a/src/main/resources/templates/convert/pdf-to-img.html b/src/main/resources/templates/convert/pdf-to-img.html index 9907c301..e9f03c16 100644 --- a/src/main/resources/templates/convert/pdf-to-img.html +++ b/src/main/resources/templates/convert/pdf-to-img.html @@ -28,6 +28,7 @@ +
diff --git a/src/main/resources/templates/fragments/navbar.html b/src/main/resources/templates/fragments/navbar.html index 955fb3e6..d62dc172 100644 --- a/src/main/resources/templates/fragments/navbar.html +++ b/src/main/resources/templates/fragments/navbar.html @@ -335,7 +335,7 @@
- +