From 4367ae7934476e3d0515386aab647e2b41f43a37 Mon Sep 17 00:00:00 2001
From: Anthony Stirling <77850077+Frooodle@users.noreply.github.com>
Date: Sat, 22 Jul 2023 16:57:40 +0100
Subject: [PATCH] html and url to pdf init

---
Reviewer note: where this patch has been re-broken into lines, the leading
whitespace of removed and context lines is a four-space reconstruction (the
collapsed copy did not preserve it); confirm against the pre-image before
applying.

 DockerfileBase                                |   2 +-
 .../api/converters/ConvertHtmlToPDF.java      | 169 +++++++++++-------
 .../api/converters/ConvertWebsiteToPDF.java   |  73 ++++++++
 .../software/SPDF/utils/GeneralUtils.java     |  40 +++++
 .../software/SPDF/utils/ProcessExecutor.java  |  13 +-
 5 files changed, 228 insertions(+), 69 deletions(-)
 create mode 100644 src/main/java/stirling/software/SPDF/controller/api/converters/ConvertWebsiteToPDF.java

diff --git a/DockerfileBase b/DockerfileBase
index 8a43832f..d1c2df74 100644
--- a/DockerfileBase
+++ b/DockerfileBase
@@ -29,7 +29,7 @@ RUN apt-get update && \
     libjpeg-dev && \
     pip install --upgrade pip && \
     pip install --no-cache-dir \
-    opencv-python-headless && \
+    opencv-python-headless WeasyPrint && \
     rm -rf /var/lib/apt/lists/*
 
 # Final stage: Copy necessary files from the previous stage
diff --git a/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertHtmlToPDF.java b/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertHtmlToPDF.java
index 6b5ecb95..edaf3213 100644
--- a/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertHtmlToPDF.java
+++ b/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertHtmlToPDF.java
@@ -1,66 +1,103 @@
-package stirling.software.SPDF.controller.api.converters;
-
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.springframework.http.ResponseEntity;
-import org.springframework.web.bind.annotation.PostMapping;
-import org.springframework.web.bind.annotation.RequestPart;
-import org.springframework.web.bind.annotation.RestController;
-import org.springframework.web.multipart.MultipartFile;
-
-import io.swagger.v3.oas.annotations.Operation;
-import io.swagger.v3.oas.annotations.Parameter;
-import io.swagger.v3.oas.annotations.tags.Tag;
-import stirling.software.SPDF.utils.ProcessExecutor;
-import stirling.software.SPDF.utils.WebResponseUtils;
-
-@RestController
-@Tag(name = "Convert", description = "Convert APIs")
-public class ConvertHtmlToPDF {
-
-    @PostMapping(consumes = "multipart/form-data", value = "/pdf-to-pdfa")
-    @Operation(
-        summary = "Convert a PDF to a PDF/A",
-        description = "This endpoint converts a PDF file to a PDF/A file. PDF/A is a format designed for long-term archiving of digital documents. Input:PDF Output:PDF Type:SISO"
-    )
-    public ResponseEntity<byte[]> pdfToPdfA(
-            @RequestPart(required = true, value = "fileInput")
-            @Parameter(description = "The input PDF file to be converted to a PDF/A file", required = true)
-            MultipartFile inputFile) throws IOException, InterruptedException {
-
-        // Save the uploaded file to a temporary location
-        Path tempInputFile = Files.createTempFile("input_", ".pdf");
-        inputFile.transferTo(tempInputFile.toFile());
-
-        // Prepare the output file path
-        Path tempOutputFile = Files.createTempFile("output_", ".pdf");
-
-        // Prepare the OCRmyPDF command
-        List<String> command = new ArrayList<>();
-        command.add("ocrmypdf");
-        command.add("--skip-text");
-        command.add("--tesseract-timeout=0");
-        command.add("--output-type");
-        command.add("pdfa");
-        command.add(tempInputFile.toString());
-        command.add(tempOutputFile.toString());
-
-        int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF).runCommandWithOutputHandling(command);
-
-        // Read the optimized PDF file
-        byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
-
-        // Clean up the temporary files
-        Files.delete(tempInputFile);
-        Files.delete(tempOutputFile);
-
-        // Return the optimized PDF as a response
-        String outputFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_PDFA.pdf";
-        return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename);
-    }
-
-}
+package stirling.software.SPDF.controller.api.converters;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
+import java.util.*;
+import org.springframework.http.ResponseEntity;
+import org.springframework.web.bind.annotation.PostMapping;
+import org.springframework.web.bind.annotation.RequestPart;
+import org.springframework.web.bind.annotation.RestController;
+import org.springframework.web.multipart.MultipartFile;
+
+import io.swagger.v3.oas.annotations.Operation;
+import io.swagger.v3.oas.annotations.tags.Tag;
+import stirling.software.SPDF.utils.GeneralUtils;
+import stirling.software.SPDF.utils.ProcessExecutor;
+import stirling.software.SPDF.utils.WebResponseUtils;
+
+@RestController
+@Tag(name = "Convert", description = "Convert APIs")
+public class ConvertHtmlToPDF {
+
+    @PostMapping(consumes = "multipart/form-data", value = "/convert-to-pdf")
+    @Operation(
+        summary = "Convert an HTML or ZIP (containing HTML and CSS) to PDF",
+        description = "This endpoint takes an HTML or ZIP file input and converts it to a PDF format."
+    )
+    public ResponseEntity<byte[]> HtmlToPdf(
+            @RequestPart(required = true, value = "fileInput") MultipartFile fileInput) throws IOException, InterruptedException {
+
+        if (fileInput == null) {
+            throw new IllegalArgumentException("Please provide an HTML or ZIP file for conversion.");
+        }
+
+        String originalFilename = fileInput.getOriginalFilename();
+        if (originalFilename == null || (!originalFilename.endsWith(".html") && !originalFilename.endsWith(".zip"))) {
+            throw new IllegalArgumentException("File must be either .html or .zip format.");
+        }
+
+        Path tempOutputFile = Files.createTempFile("output_", ".pdf");
+        Path tempInputFile;
+
+        if (originalFilename.endsWith(".html")) {
+            tempInputFile = Files.createTempFile("input_", ".html");
+            Files.write(tempInputFile, fileInput.getBytes());
+        } else {
+            tempInputFile = unzipAndGetMainHtml(fileInput);
+        }
+
+        List<String> command = new ArrayList<>();
+        command.add("weasyprint");
+        command.add(tempInputFile.toString());
+        command.add(tempOutputFile.toString());
+        int returnCode = 0;
+        if (originalFilename.endsWith(".zip")) {
+            returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.WEASYPRINT)
+                    .runCommandWithOutputHandling(command, tempInputFile.getParent().toFile());
+        } else {
+
+            returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.WEASYPRINT)
+                    .runCommandWithOutputHandling(command);
+        }
+
+        byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
+
+        // Clean up temporary files
+        Files.delete(tempOutputFile);
+        Files.delete(tempInputFile);
+        if (originalFilename.endsWith(".zip")) {
+            GeneralUtils.deleteDirectory(tempInputFile.getParent());
+        }
+
+        String outputFilename = originalFilename.replaceFirst("[.][^.]+$", "") + ".pdf"; // Remove file extension and append .pdf
+        return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename);
+    }
+
+    private Path unzipAndGetMainHtml(MultipartFile zipFile) throws IOException {
+        Path tempDirectory = Files.createTempDirectory("unzipped_").toAbsolutePath().normalize();
+        try (ZipInputStream zipIn = new ZipInputStream(new ByteArrayInputStream(zipFile.getBytes()))) {
+            ZipEntry entry = zipIn.getNextEntry();
+            while (entry != null) {
+                // Reject entries (e.g. "../x") that would be written outside the extraction directory
+                Path filePath = tempDirectory.resolve(entry.getName()).normalize();
+                if (!filePath.startsWith(tempDirectory)) {
+                    throw new IOException("ZIP entry escapes the extraction directory: " + entry.getName());
+                }
+                if (entry.isDirectory()) {
+                    Files.createDirectories(filePath);
+                } else {
+                    Files.createDirectories(filePath.getParent());
+                    Files.copy(zipIn, filePath);
+                }
+                zipIn.closeEntry();
+                entry = zipIn.getNextEntry();
+            }
+        }
+        return tempDirectory.resolve("index.html");
+    }
+
+}
diff --git a/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertWebsiteToPDF.java b/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertWebsiteToPDF.java
new file mode 100644
index 00000000..f69adbd1
--- /dev/null
+++ b/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertWebsiteToPDF.java
@@ -0,0 +1,73 @@
+package stirling.software.SPDF.controller.api.converters;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.springframework.http.ResponseEntity;
+import org.springframework.web.bind.annotation.PostMapping;
+import org.springframework.web.bind.annotation.RequestPart;
+import org.springframework.web.bind.annotation.RestController;
+import org.springframework.web.multipart.MultipartFile;
+
+import io.swagger.v3.oas.annotations.Operation;
+import io.swagger.v3.oas.annotations.Parameter;
+import io.swagger.v3.oas.annotations.tags.Tag;
+import stirling.software.SPDF.utils.GeneralUtils;
+import stirling.software.SPDF.utils.ProcessExecutor;
+import stirling.software.SPDF.utils.WebResponseUtils;
+
+@RestController
+@Tag(name = "Convert", description = "Convert APIs")
+public class ConvertWebsiteToPDF {
+
+    @PostMapping(consumes = "multipart/form-data", value = "/url-to-pdf")
+    @Operation(
+        summary = "Convert a URL to a PDF",
+        description = "This endpoint fetches content from a URL and converts it to a PDF format."
+    )
+    public ResponseEntity<byte[]> urlToPdf(
+            @RequestPart(required = true, value = "urlInput")
+            @Parameter(description = "The input URL to be converted to a PDF file", required = true)
+            String URL) throws IOException, InterruptedException {
+
+        // Validate the URL: it is handed to weasyprint as its input argument, so only
+        // well-formed http(s) URLs are accepted (weasyprint also accepts local file names)
+        if (!URL.matches("^https?://.*") || !GeneralUtils.isValidURL(URL)) {
+            throw new IllegalArgumentException("Invalid URL format provided.");
+        }
+
+        // Prepare the output file path
+        Path tempOutputFile = Files.createTempFile("output_", ".pdf");
+
+        // Prepare the WeasyPrint command
+        List<String> command = new ArrayList<>();
+        command.add("weasyprint");
+        command.add(URL);
+        command.add(tempOutputFile.toString());
+
+        int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.WEASYPRINT).runCommandWithOutputHandling(command);
+
+        // Read the generated PDF file
+        byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
+
+        // Clean up the temporary file
+        Files.delete(tempOutputFile);
+
+        // Convert URL to a safe filename
+        String outputFilename = convertURLToFileName(URL);
+
+        return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename);
+    }
+
+    private String convertURLToFileName(String url) {
+        String safeName = url.replaceAll("[^a-zA-Z0-9]", "_");
+        if(safeName.length() > 50) {
+            safeName = safeName.substring(0, 50); // restrict to 50 characters
+        }
+        return safeName + ".pdf";
+    }
+
+}
diff --git a/src/main/java/stirling/software/SPDF/utils/GeneralUtils.java b/src/main/java/stirling/software/SPDF/utils/GeneralUtils.java
index 03eccf88..28a1e73e 100644
--- a/src/main/java/stirling/software/SPDF/utils/GeneralUtils.java
+++ b/src/main/java/stirling/software/SPDF/utils/GeneralUtils.java
@@ -1,14 +1,54 @@
 package stirling.software.SPDF.utils;
 
 import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.nio.file.FileVisitResult;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
+import java.nio.file.SimpleFileVisitor;
+import java.nio.file.attribute.BasicFileAttributes;
 import java.util.ArrayList;
 import java.util.List;
 
 public class GeneralUtils {
 
+    /** Recursively deletes {@code path}: each file first, then each directory after its contents. */
+    public static void deleteDirectory(Path path) throws IOException {
+        Files.walkFileTree(path, new SimpleFileVisitor<Path>() {
+            @Override
+            public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
+                Files.delete(file);
+                return FileVisitResult.CONTINUE;
+            }
+
+            @Override
+            public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
+                Files.delete(dir);
+                return FileVisitResult.CONTINUE;
+            }
+        });
+    }
+
+    public static String convertToFileName(String name) {
+        String safeName = name.replaceAll("[^a-zA-Z0-9]", "_");
+        if (safeName.length() > 50) {
+            safeName = safeName.substring(0, 50);
+        }
+        return safeName;
+    }
+
+    /** Returns true when {@code urlStr} parses as a {@link URL}; the protocol is not restricted here. */
+    public static boolean isValidURL(String urlStr) {
+        try {
+            new URL(urlStr);
+            return true;
+        } catch (MalformedURLException e) {
+            return false;
+        }
+    }
+
     public static Long convertSizeToBytes(String sizeStr) {
         if (sizeStr == null) {
             return null;
diff --git a/src/main/java/stirling/software/SPDF/utils/ProcessExecutor.java b/src/main/java/stirling/software/SPDF/utils/ProcessExecutor.java
index 33823507..f2a7ed55 100644
--- a/src/main/java/stirling/software/SPDF/utils/ProcessExecutor.java
+++ b/src/main/java/stirling/software/SPDF/utils/ProcessExecutor.java
@@ -1,6 +1,7 @@
 package stirling.software.SPDF.utils;
 
 import java.io.BufferedReader;
+import java.io.File;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.nio.charset.StandardCharsets;
@@ -13,7 +14,7 @@ import java.util.concurrent.Semaphore;
 public class ProcessExecutor {
 
     public enum Processes {
-        LIBRE_OFFICE, OCR_MY_PDF, PYTHON_OPENCV, GHOSTSCRIPT
+        LIBRE_OFFICE, OCR_MY_PDF, PYTHON_OPENCV, GHOSTSCRIPT, WEASYPRINT
     }
 
     private static final Map<Processes, ProcessExecutor> instances = new ConcurrentHashMap<>();
@@ -25,6 +26,7 @@ public class ProcessExecutor {
                 case OCR_MY_PDF -> 2;
                 case PYTHON_OPENCV -> 8;
                 case GHOSTSCRIPT -> 16;
+                case WEASYPRINT -> 16;
             };
             return new ProcessExecutor(semaphoreLimit);
         });
@@ -35,14 +37,21 @@ public class ProcessExecutor {
     private ProcessExecutor(int semaphoreLimit) {
         this.semaphore = new Semaphore(semaphoreLimit);
     }
-
     public int runCommandWithOutputHandling(List<String> command) throws IOException, InterruptedException {
+        return runCommandWithOutputHandling(command, null);
+    }
+    public int runCommandWithOutputHandling(List<String> command, File workingDirectory) throws IOException, InterruptedException {
         int exitCode = 1;
         semaphore.acquire();
         try {
 
             System.out.print("Running command: " + String.join(" ", command));
             ProcessBuilder processBuilder = new ProcessBuilder(command);
+
+            // Use the working directory if it's set
+            if (workingDirectory != null) {
+                processBuilder.directory(workingDirectory);
+            }
             Process process = processBuilder.start();
 
             // Read the error stream and standard output stream concurrently