mirror of
				https://github.com/Frooodle/Stirling-PDF.git
				synced 2025-10-25 11:17:28 +02:00 
			
		
		
		
	Add: Convert PDF to WebP (#1666)
* Add PDF to WebP * add swagger param * back * creates a custom image for Docker from pymupdf * Converting with pdf2image and Pillow instead of pymupdf * webp remove to pdf-to-img * remove mupdf
This commit is contained in:
		
							parent
							
								
									4a4c7faf47
								
							
						
					
					
						commit
						58618b3a21
					
				
							
								
								
									
										1
									
								
								.github/labeler-config.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								.github/labeler-config.yml
									
									
									
									
										vendored
									
									
								
							| @ -2,6 +2,7 @@ Translation: | ||||
|   - changed-files: | ||||
|     - any-glob-to-any-file: 'src/main/resources/messages_*_*.properties' | ||||
|     - any-glob-to-any-file: 'scripts/ignore_translation.toml' | ||||
|     - any-glob-to-any-file: 'src/main/resources/templates/fragments/languages.html' | ||||
| 
 | ||||
| Front End: | ||||
|   - changed-files: | ||||
|  | ||||
| @ -39,7 +39,7 @@ RUN echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /et | ||||
|         libreoffice \ | ||||
| # pdftohtml | ||||
|         poppler-utils \ | ||||
| # OCR MY PDF (unpaper for descew and other advanced featues) | ||||
| # OCR MY PDF (unpaper for deskew and other advanced features) | ||||
|         ocrmypdf \ | ||||
|         tesseract-ocr-data-eng \ | ||||
| # CV | ||||
| @ -48,7 +48,7 @@ RUN echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /et | ||||
|         python3 \ | ||||
|         py3-pip && \ | ||||
| # uno unoconv and HTML | ||||
|     pip install --break-system-packages --no-cache-dir --upgrade unoconv WeasyPrint && \ | ||||
|     pip install --break-system-packages --no-cache-dir --upgrade unoconv WeasyPrint pdf2image pillow && \ | ||||
|     mv /usr/share/tessdata /usr/share/tessdata-original && \ | ||||
|     mkdir -p $HOME /configs /logs /customFiles /pipeline/watchedFolders /pipeline/finishedFolders && \ | ||||
|     fc-cache -f -v && \ | ||||
|  | ||||
| @ -64,7 +64,7 @@ RUN echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /et | ||||
|         python3 \ | ||||
|     py3-pip && \ | ||||
| # uno unoconv and HTML | ||||
|     pip install --break-system-packages --no-cache-dir --upgrade unoconv WeasyPrint && \ | ||||
|     pip install --break-system-packages --no-cache-dir --upgrade unoconv WeasyPrint pdf2image pillow && \ | ||||
|     mv /usr/share/tessdata /usr/share/tessdata-original && \ | ||||
|     mkdir -p $HOME /configs /logs /customFiles /pipeline/watchedFolders /pipeline/finishedFolders && \ | ||||
|     fc-cache -f -v && \ | ||||
|  | ||||
| @ -15,7 +15,7 @@ | ||||
| | file-to-pdf         |         | ✔️       |          |       | ✔️   |        |        | ✔️           |          |      |            | | ||||
| | img-to-pdf          |         | ✔️       |          |       |     |        |        |             |          | ✔️    |            | | ||||
| | pdf-to-html         |         | ✔️       |          |       | ✔️   |        |        | ✔️           |          |      |            | | ||||
| | pdf-to-img          |         | ✔️       |          |       |     |        |        |             |          | ✔️    |            | | ||||
| | pdf-to-img          |         | ✔️       |          |       |     | ✔️      |        |             |          | ✔️    |            | | ||||
| | pdf-to-pdfa         |         | ✔️       |          |       | ✔️   |        |        |             | ✔️        |      |            | | ||||
| | pdf-to-markdown     |         | ✔️       |          |       |     |        |        |             |          | ✔️    |            | | ||||
| | pdf-to-presentation |         | ✔️       |          |       | ✔️   |        |        | ✔️           |          |      |            | | ||||
|  | ||||
							
								
								
									
										174
									
								
								scripts/png_to_webp.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										174
									
								
								scripts/png_to_webp.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,174 @@ | ||||
| """ | ||||
| Author: Ludy87 | ||||
| Description: This script converts a PDF file to WebP images. It includes functionality to resize images if they exceed specified dimensions and handle conversion of PDF pages to WebP format. | ||||
| 
 | ||||
| Example | ||||
| ------- | ||||
| To convert a PDF file to WebP images with each page as a separate WebP file: | ||||
|     python script.py input.pdf output_directory | ||||
| 
 | ||||
| To convert a PDF file to a single WebP image: | ||||
|     python script.py input.pdf output_directory --single | ||||
| 
 | ||||
| To adjust the DPI resolution for rendering PDF pages: | ||||
|     python script.py input.pdf output_directory --dpi 150 | ||||
| """ | ||||
| 
 | ||||
| import argparse | ||||
| import os | ||||
| from pdf2image import convert_from_path | ||||
| from PIL import Image | ||||
| 
 | ||||
| 
 | ||||
def resize_image(input_image_path, output_image_path, max_size=(16383, 16383)):
    """
    Save an image as WebP, downscaling it first if it exceeds ``max_size``.

    The aspect ratio is preserved when downscaling. Any error is reported on
    standard output and swallowed, so this function never raises.

    Parameters
    ----------
    input_image_path : str
        Path to the input image file.
    output_image_path : str
        Path where the output WebP image will be saved.
    max_size : tuple of int, optional
        Maximum allowed dimensions for the image (width, height).
        Default is (16383, 16383), the largest size a WebP image can have.

    Returns
    -------
    None
    """
    try:
        # The context manager releases the source file handle on every path,
        # so the caller can safely delete the input file afterwards.
        with Image.open(input_image_path) as image:
            width, height = image.size
            max_width, max_height = max_size

            if width <= max_width and height <= max_height:
                # Dimensions are within the allowed limits: save as-is.
                image.save(output_image_path, format="WEBP", quality=100)
                print(f"The image was successfully saved as WebP: {output_image_path}")
                return

            # Scale by the tighter of the two constraints to keep the aspect ratio.
            ratio = min(max_width / width, max_height / height)
            # Clamp to 1 px: for extreme aspect ratios the truncation below
            # would otherwise yield 0, which is not a valid image dimension.
            new_width = max(1, int(width * ratio))
            new_height = max(1, int(height * ratio))

            resized_image = image.resize((new_width, new_height), Image.LANCZOS)
            resized_image.save(output_image_path, format="WEBP", quality=100)
            print(
                f"The image was successfully resized to ({new_width}, {new_height}) and saved as WebP: {output_image_path}"
            )
    except Exception as e:
        # Deliberately best-effort: report the failure and let the caller continue.
        print(f"An error occurred: {e}")
| 
 | ||||
| 
 | ||||
def convert_image_to_webp(input_image, output_file):
    """
    Write ``input_image`` to ``output_file`` in WebP format.

    Delegates to ``resize_image`` with a (16383, 16383) size limit, so an
    oversized image is scaled down before it is saved.

    Parameters
    ----------
    input_image : str
        Path to the input image file.
    output_file : str
        Path where the output WebP image will be saved.

    Returns
    -------
    None
    """
    size_limit = (16383, 16383)
    resize_image(input_image, output_file, max_size=size_limit)
| 
 | ||||
| 
 | ||||
def pdf_to_webp(pdf_path, output_dir, dpi=300):
    """
    Convert each page of a PDF file to a separate WebP image.

    Every page is rendered, written to a temporary PNG inside ``output_dir``,
    converted to ``page_<n>.webp`` (1-based page number) and the temporary PNG
    is removed again.

    Parameters
    ----------
    pdf_path : str
        Path to the input PDF file.
    output_dir : str
        Directory where the WebP images will be saved. It is created if it
        does not exist yet.
    dpi : int, optional
        DPI resolution for rendering PDF pages. Default is 300.

    Returns
    -------
    None
    """
    # Allow direct use of this function, not only via the CLI entry point.
    os.makedirs(output_dir, exist_ok=True)

    # Render the PDF into a list of page images
    images = convert_from_path(pdf_path, dpi=dpi)

    for page_number, image in enumerate(images, start=1):
        temp_png_path = os.path.join(output_dir, f"temp_page_{page_number}.png")
        output_path = os.path.join(output_dir, f"page_{page_number}.webp")

        try:
            image.save(temp_png_path, format="PNG")
            convert_image_to_webp(temp_png_path, output_path)
        finally:
            # Never leave the intermediate PNG behind, even if saving or
            # converting this page failed part-way through.
            if os.path.exists(temp_png_path):
                os.remove(temp_png_path)
| 
 | ||||
| 
 | ||||
def main(pdf_image_path, output_dir, dpi=300, single_images_flag=False):
    """
    Dispatch between per-page and single-image WebP conversion.

    Parameters
    ----------
    pdf_image_path : str
        Path to the input file. In per-page mode it is passed to
        ``pdf_to_webp`` as a PDF; in single mode it is passed to
        ``convert_image_to_webp``, which opens it as an image.
    output_dir : str
        Directory where the WebP images will be saved.
    dpi : int, optional
        DPI resolution for rendering PDF pages. Default is 300.
        Not used in single mode.
    single_images_flag : bool, optional
        If True, write one ``combined_image.webp`` from the input instead of
        one WebP per page. Default is False.

    Returns
    -------
    None
    """
    if not single_images_flag:
        # One WebP file per PDF page
        pdf_to_webp(pdf_image_path, output_dir, dpi)
        return

    # Single output file with a fixed name inside the output directory
    combined_target = os.path.join(output_dir, "combined_image.webp")
    convert_image_to_webp(pdf_image_path, combined_target)
| 
 | ||||
| 
 | ||||
if __name__ == "__main__":
    # Command-line entry point: parse the arguments, make sure the output
    # directory exists, then run the conversion.
    cli = argparse.ArgumentParser(description="Convert a PDF file to WebP images.")
    cli.add_argument("pdf_path", help="The path to the input PDF file.")
    cli.add_argument(
        "output_dir",
        help="The directory where the WebP images should be saved.",
    )
    cli.add_argument(
        "--dpi",
        default=300,
        type=int,
        help="The DPI resolution for rendering the PDF pages (default: 300).",
    )
    cli.add_argument(
        "--single",
        help="Combine all pages into a single WebP image.",
        action="store_true",
    )
    options = cli.parse_args()

    target_dir = options.output_dir
    os.makedirs(target_dir, exist_ok=True)
    main(
        options.pdf_path,
        target_dir,
        dpi=options.dpi,
        single_images_flag=options.single,
    )
| @ -166,6 +166,7 @@ public class EndpointConfiguration { | ||||
|         addEndpointToGroup("Python", REMOVE_BLANKS); | ||||
|         addEndpointToGroup("Python", "html-to-pdf"); | ||||
|         addEndpointToGroup("Python", "url-to-pdf"); | ||||
|         addEndpointToGroup("Python", "pdf-to-img"); | ||||
| 
 | ||||
|         // openCV | ||||
|         addEndpointToGroup("OpenCV", "extract-image-scans"); | ||||
|  | ||||
| @ -1,11 +1,23 @@ | ||||
| package stirling.software.SPDF.controller.api.converters; | ||||
| 
 | ||||
| import java.io.ByteArrayOutputStream; | ||||
| import java.io.FileOutputStream; | ||||
| import java.io.IOException; | ||||
| import java.net.URLConnection; | ||||
| import java.nio.file.Files; | ||||
| import java.nio.file.Path; | ||||
| import java.util.ArrayList; | ||||
| import java.util.Arrays; | ||||
| import java.util.List; | ||||
| import java.util.stream.Collectors; | ||||
| import java.util.zip.ZipEntry; | ||||
| import java.util.zip.ZipOutputStream; | ||||
| 
 | ||||
| import org.apache.commons.io.FileUtils; | ||||
| import org.apache.pdfbox.rendering.ImageType; | ||||
| import org.slf4j.Logger; | ||||
| import org.slf4j.LoggerFactory; | ||||
| import org.springframework.http.HttpHeaders; | ||||
| import org.springframework.http.MediaType; | ||||
| import org.springframework.http.ResponseEntity; | ||||
| import org.springframework.web.bind.annotation.ModelAttribute; | ||||
| @ -21,6 +33,8 @@ import io.swagger.v3.oas.annotations.tags.Tag; | ||||
| import stirling.software.SPDF.model.api.converters.ConvertToImageRequest; | ||||
| import stirling.software.SPDF.model.api.converters.ConvertToPdfRequest; | ||||
| import stirling.software.SPDF.utils.PdfUtils; | ||||
| import stirling.software.SPDF.utils.ProcessExecutor; | ||||
| import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult; | ||||
| import stirling.software.SPDF.utils.WebResponseUtils; | ||||
| 
 | ||||
| @RestController | ||||
| @ -60,15 +74,92 @@ public class ConvertImgPDFController { | ||||
|         result = | ||||
|                 PdfUtils.convertFromPdf( | ||||
|                         pdfBytes, | ||||
|                         imageFormat.toUpperCase(), | ||||
|                         imageFormat.equalsIgnoreCase("webp") ? "png" : imageFormat.toUpperCase(), | ||||
|                         colorTypeResult, | ||||
|                         singleImage, | ||||
|                         Integer.valueOf(dpi), | ||||
|                         filename); | ||||
| 
 | ||||
|         if (result == null || result.length == 0) { | ||||
|             logger.error("resultant bytes for {} is null, error converting ", filename); | ||||
|         } | ||||
|         if (imageFormat.equalsIgnoreCase("webp")) { | ||||
|             // Write the output stream to a temp file | ||||
|             Path tempFile = Files.createTempFile("temp_png", ".png"); | ||||
|             try (FileOutputStream fos = new FileOutputStream(tempFile.toFile())) { | ||||
|                 fos.write(result); | ||||
|                 fos.flush(); | ||||
|             } | ||||
| 
 | ||||
|             String pythonVersion = "python3"; | ||||
|             try { | ||||
|                 ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV) | ||||
|                         .runCommandWithOutputHandling(Arrays.asList("python3", "--version")); | ||||
|             } catch (IOException e) { | ||||
|                 ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV) | ||||
|                         .runCommandWithOutputHandling(Arrays.asList("python", "--version")); | ||||
|                 pythonVersion = "python"; | ||||
|             } | ||||
| 
 | ||||
|             List<String> command = new ArrayList<>(); | ||||
|             command.add(pythonVersion); | ||||
|             command.add("./scripts/png_to_webp.py"); // Python script to handle the conversion | ||||
| 
 | ||||
|                 // Create a temporary directory for the output WebP files | ||||
|             Path tempOutputDir = Files.createTempDirectory("webp_output"); | ||||
|             if (singleImage) { | ||||
|                 // Run the Python script to convert PNG to WebP | ||||
|                 command.add(tempFile.toString()); | ||||
|                 command.add(tempOutputDir.toString()); | ||||
|                 command.add("--single"); | ||||
|             } else { | ||||
|                 // Save the uploaded PDF to a temporary file | ||||
|                 Path tempPdfPath = Files.createTempFile("temp_pdf", ".pdf"); | ||||
|                 file.transferTo(tempPdfPath.toFile()); | ||||
|                 // Run the Python script to convert PDF to WebP | ||||
|                 command.add(tempPdfPath.toString()); | ||||
|                 command.add(tempOutputDir.toString()); | ||||
|             } | ||||
|             command.add("--dpi"); | ||||
|             command.add(dpi); | ||||
|             ProcessExecutorResult resultProcess = | ||||
|                     ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV) | ||||
|                             .runCommandWithOutputHandling(command); | ||||
| 
 | ||||
|             // Find all WebP files in the output directory | ||||
|             List<Path> webpFiles = | ||||
|                     Files.walk(tempOutputDir) | ||||
|                             .filter(path -> path.toString().endsWith(".webp")) | ||||
|                             .collect(Collectors.toList()); | ||||
| 
 | ||||
|             if (webpFiles.isEmpty()) { | ||||
|                 logger.error("No WebP files were created in: {}", tempOutputDir.toString()); | ||||
|                 throw new IOException("No WebP files were created. " + resultProcess.getMessages()); | ||||
|             } | ||||
| 
 | ||||
|             byte[] bodyBytes = new byte[0]; | ||||
| 
 | ||||
|             if (webpFiles.size() == 1) { | ||||
|                 // Return the single WebP file directly | ||||
|                 Path webpFilePath = webpFiles.get(0); | ||||
|                 bodyBytes = Files.readAllBytes(webpFilePath); | ||||
|             } else { | ||||
|                 // Create a ZIP file containing all WebP images | ||||
|                 ByteArrayOutputStream zipOutputStream = new ByteArrayOutputStream(); | ||||
|                 try (ZipOutputStream zos = new ZipOutputStream(zipOutputStream)) { | ||||
|                     for (Path webpFile : webpFiles) { | ||||
|                         zos.putNextEntry(new ZipEntry(webpFile.getFileName().toString())); | ||||
|                         Files.copy(webpFile, zos); | ||||
|                         zos.closeEntry(); | ||||
|                     } | ||||
|                 } | ||||
|                 bodyBytes = zipOutputStream.toByteArray(); | ||||
|             } | ||||
|             // Clean up the temporary files | ||||
|             Files.deleteIfExists(tempFile); | ||||
|             if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile()); | ||||
|             result = bodyBytes; | ||||
|         } | ||||
| 
 | ||||
|         if (singleImage) { | ||||
|             String docName = filename + "." + imageFormat; | ||||
|             MediaType mediaType = MediaType.parseMediaType(getMediaType(imageFormat)); | ||||
|  | ||||
| @ -21,14 +21,6 @@ public class ConverterWebController { | ||||
|         return "convert/book-to-pdf"; | ||||
|     } | ||||
| 
 | ||||
|     @ConditionalOnExpression("#{bookAndHtmlFormatsInstalled}") | ||||
|     @GetMapping("/pdf-to-book") | ||||
|     @Hidden | ||||
|     public String convertPdfToBookForm(Model model) { | ||||
|         model.addAttribute("currentPage", "pdf-to-book"); | ||||
|         return "convert/pdf-to-book"; | ||||
|     } | ||||
| 
 | ||||
|     @GetMapping("/img-to-pdf") | ||||
|     @Hidden | ||||
|     public String convertImgToPdfForm(Model model) { | ||||
| @ -57,13 +49,6 @@ public class ConverterWebController { | ||||
|         return "convert/url-to-pdf"; | ||||
|     } | ||||
| 
 | ||||
|     @GetMapping("/pdf-to-img") | ||||
|     @Hidden | ||||
|     public String pdfToimgForm(Model model) { | ||||
|         model.addAttribute("currentPage", "pdf-to-img"); | ||||
|         return "convert/pdf-to-img"; | ||||
|     } | ||||
| 
 | ||||
|     @GetMapping("/file-to-pdf") | ||||
|     @Hidden | ||||
|     public String convertToPdfForm(Model model) { | ||||
| @ -73,6 +58,21 @@ public class ConverterWebController { | ||||
| 
 | ||||
|     // PDF TO...... | ||||
| 
 | ||||
|     @ConditionalOnExpression("#{bookAndHtmlFormatsInstalled}") | ||||
|     @GetMapping("/pdf-to-book") | ||||
|     @Hidden | ||||
|     public String convertPdfToBookForm(Model model) { | ||||
|         model.addAttribute("currentPage", "pdf-to-book"); | ||||
|         return "convert/pdf-to-book"; | ||||
|     } | ||||
| 
 | ||||
|     @GetMapping("/pdf-to-img") | ||||
|     @Hidden | ||||
|     public String pdfToimgForm(Model model) { | ||||
|         model.addAttribute("currentPage", "pdf-to-img"); | ||||
|         return "convert/pdf-to-img"; | ||||
|     } | ||||
| 
 | ||||
|     @GetMapping("/pdf-to-html") | ||||
|     @Hidden | ||||
|     public ModelAndView pdfToHTML() { | ||||
|  | ||||
| @ -12,7 +12,7 @@ public class ConvertToImageRequest extends PDFFile { | ||||
| 
 | ||||
|     @Schema( | ||||
|             description = "The output image format", | ||||
|             allowableValues = {"png", "jpeg", "jpg", "gif"}) | ||||
|             allowableValues = {"png", "jpeg", "jpg", "gif", "webp"}) | ||||
|     private String imageFormat; | ||||
| 
 | ||||
|     @Schema( | ||||
|  | ||||
| @ -28,6 +28,7 @@ | ||||
|                     <option value="gif">GIF</option> | ||||
|                     <option value="tiff">TIFF</option> | ||||
|                     <option value="bmp">BMP</option> | ||||
|                     <option value="webp">WEBP</option> | ||||
|                   </select> | ||||
|                 </div> | ||||
|                 <div class="mb-3"> | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user