html and url to pdf init

This commit is contained in:
Anthony Stirling 2023-07-22 16:57:40 +01:00
parent 749461334d
commit 4367ae7934
5 changed files with 228 additions and 69 deletions

View File

@ -29,7 +29,7 @@ RUN apt-get update && \
libjpeg-dev && \ libjpeg-dev && \
pip install --upgrade pip && \ pip install --upgrade pip && \
pip install --no-cache-dir \ pip install --no-cache-dir \
opencv-python-headless && \ opencv-python-headless WeasyPrint && \
rm -rf /var/lib/apt/lists/* rm -rf /var/lib/apt/lists/*
# Final stage: Copy necessary files from the previous stage # Final stage: Copy necessary files from the previous stage

View File

@ -1,66 +1,103 @@
package stirling.software.SPDF.controller.api.converters; package stirling.software.SPDF.controller.api.converters;
import java.io.IOException; import java.io.ByteArrayInputStream;
import java.nio.file.Files; import java.io.IOException;
import java.nio.file.Path; import java.nio.file.Files;
import java.util.ArrayList; import java.nio.file.Path;
import java.util.List; import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.springframework.http.ResponseEntity; import java.util.*;
import org.springframework.web.bind.annotation.PostMapping; import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.RequestPart; import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RestController; import org.springframework.web.bind.annotation.RequestPart;
import org.springframework.web.multipart.MultipartFile; import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.Parameter; import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag; import io.swagger.v3.oas.annotations.tags.Tag;
import stirling.software.SPDF.utils.ProcessExecutor; import stirling.software.SPDF.utils.GeneralUtils;
import stirling.software.SPDF.utils.WebResponseUtils; import stirling.software.SPDF.utils.ProcessExecutor;
import stirling.software.SPDF.utils.WebResponseUtils;
@RestController
@Tag(name = "Convert", description = "Convert APIs") @RestController
public class ConvertHtmlToPDF { @Tag(name = "Convert", description = "Convert APIs")
public class ConvertHtmlToPDF {
@PostMapping(consumes = "multipart/form-data", value = "/pdf-to-pdfa")
@Operation(
summary = "Convert a PDF to a PDF/A", @PostMapping(consumes = "multipart/form-data", value = "/convert-to-pdf")
description = "This endpoint converts a PDF file to a PDF/A file. PDF/A is a format designed for long-term archiving of digital documents. Input:PDF Output:PDF Type:SISO" @Operation(
) summary = "Convert an HTML or ZIP (containing HTML and CSS) to PDF",
public ResponseEntity<byte[]> pdfToPdfA( description = "This endpoint takes an HTML or ZIP file input and converts it to a PDF format."
@RequestPart(required = true, value = "fileInput") )
@Parameter(description = "The input PDF file to be converted to a PDF/A file", required = true) public ResponseEntity<byte[]> HtmlToPdf(
MultipartFile inputFile) throws IOException, InterruptedException { @RequestPart(required = true, value = "fileInput") MultipartFile fileInput) throws IOException, InterruptedException {
// Save the uploaded file to a temporary location if (fileInput == null) {
Path tempInputFile = Files.createTempFile("input_", ".pdf"); throw new IllegalArgumentException("Please provide an HTML or ZIP file for conversion.");
inputFile.transferTo(tempInputFile.toFile()); }
// Prepare the output file path String originalFilename = fileInput.getOriginalFilename();
Path tempOutputFile = Files.createTempFile("output_", ".pdf"); if (originalFilename == null || (!originalFilename.endsWith(".html") && !originalFilename.endsWith(".zip"))) {
throw new IllegalArgumentException("File must be either .html or .zip format.");
// Prepare the OCRmyPDF command }
List<String> command = new ArrayList<>();
command.add("ocrmypdf"); Path tempOutputFile = Files.createTempFile("output_", ".pdf");
command.add("--skip-text"); Path tempInputFile;
command.add("--tesseract-timeout=0");
command.add("--output-type"); if (originalFilename.endsWith(".html")) {
command.add("pdfa"); tempInputFile = Files.createTempFile("input_", ".html");
command.add(tempInputFile.toString()); Files.write(tempInputFile, fileInput.getBytes());
command.add(tempOutputFile.toString()); } else {
tempInputFile = unzipAndGetMainHtml(fileInput);
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF).runCommandWithOutputHandling(command); }
// Read the optimized PDF file List<String> command = new ArrayList<>();
byte[] pdfBytes = Files.readAllBytes(tempOutputFile); command.add("weasyprint");
command.add(tempInputFile.toString());
// Clean up the temporary files command.add(tempOutputFile.toString());
Files.delete(tempInputFile); int returnCode = 0;
Files.delete(tempOutputFile); if (originalFilename.endsWith(".zip")) {
returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.WEASYPRINT)
// Return the optimized PDF as a response .runCommandWithOutputHandling(command, tempInputFile.getParent().toFile());
String outputFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_PDFA.pdf"; } else {
return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename);
} returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.WEASYPRINT)
.runCommandWithOutputHandling(command);
} }
byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
// Clean up temporary files
Files.delete(tempOutputFile);
Files.delete(tempInputFile);
if (originalFilename.endsWith(".zip")) {
GeneralUtils.deleteDirectory(tempInputFile.getParent());
}
String outputFilename = originalFilename.replaceFirst("[.][^.]+$", "") + ".pdf"; // Remove file extension and append .pdf
return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename);
}
/**
 * Extracts an uploaded ZIP archive into a fresh temporary directory and returns the path
 * where its main page is expected.
 *
 * <p>Every entry is resolved against the temporary directory and rejected if the
 * normalized result falls outside of it, so a crafted name such as {@code ../../x}
 * cannot write elsewhere on disk. Directories are created on demand, which allows
 * archives with nested folders (for example {@code css/style.css}).
 *
 * <p>If extraction fails, the partially populated temporary directory is removed
 * before the exception is rethrown.
 *
 * @param zipFile the uploaded ZIP archive
 * @return the path {@code index.html} inside the extraction directory; the file is not
 *         checked for existence here
 * @throws IOException if the archive cannot be read, an entry escapes the extraction
 *         directory, or a file cannot be written
 */
private Path unzipAndGetMainHtml(MultipartFile zipFile) throws IOException {
    Path tempDirectory = Files.createTempDirectory("unzipped_");
    try (ZipInputStream zipIn = new ZipInputStream(new ByteArrayInputStream(zipFile.getBytes()))) {
        ZipEntry entry;
        while ((entry = zipIn.getNextEntry()) != null) {
            Path filePath = tempDirectory.resolve(entry.getName()).normalize();
            // Guard against "zip slip": the entry must stay inside the extraction directory.
            if (!filePath.startsWith(tempDirectory)) {
                throw new IOException("ZIP entry is outside of the target directory: " + entry.getName());
            }
            if (entry.isDirectory()) {
                Files.createDirectories(filePath);
            } else {
                // Archives are not required to list a directory before the files in it.
                Files.createDirectories(filePath.getParent());
                Files.copy(zipIn, filePath);
            }
            zipIn.closeEntry();
        }
    } catch (IOException | RuntimeException e) {
        // Do not leave a half-extracted directory behind when extraction fails.
        try {
            GeneralUtils.deleteDirectory(tempDirectory);
        } catch (IOException cleanupFailure) {
            e.addSuppressed(cleanupFailure);
        }
        throw e;
    }
    return tempDirectory.resolve("index.html");
}
}

View File

@ -0,0 +1,73 @@
package stirling.software.SPDF.controller.api.converters;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestPart;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.Parameter;
import io.swagger.v3.oas.annotations.tags.Tag;
import stirling.software.SPDF.utils.GeneralUtils;
import stirling.software.SPDF.utils.ProcessExecutor;
import stirling.software.SPDF.utils.WebResponseUtils;
/**
 * REST controller that converts a web page, identified by its URL, into a PDF by
 * invoking the external {@code weasyprint} command.
 */
@RestController
@Tag(name = "Convert", description = "Convert APIs")
public class ConvertWebsiteToPDF {

    /**
     * Renders the page at the given URL to a PDF and returns it as the response body.
     *
     * @param URL the absolute {@code http://} or {@code https://} URL to render
     * @return the generated PDF, named after a sanitized form of the URL
     * @throws IllegalArgumentException if the URL is missing, does not start with
     *         {@code http://} or {@code https://}, or cannot be parsed as a URL
     * @throws IOException if the temporary file cannot be created, read or removed, or
     *         if running the command fails with an I/O error
     * @throws InterruptedException if the thread is interrupted while running the command
     */
    @PostMapping(consumes = "multipart/form-data", value = "/url-to-pdf")
    @Operation(
        summary = "Convert a URL to a PDF",
        description = "This endpoint fetches content from a URL and converts it to a PDF format."
    )
    public ResponseEntity<byte[]> urlToPdf(
            @RequestPart(required = true, value = "urlInput")
            @Parameter(description = "The input URL to be converted to a PDF file", required = true)
            String URL) throws IOException, InterruptedException {

        // Reject anything that is not a well-formed http(s) URL. The value is handed to an
        // external command as an argument, so local paths and option-like strings must
        // never get through.
        if (URL == null || !URL.matches("^https?://.*") || !GeneralUtils.isValidURL(URL)) {
            throw new IllegalArgumentException("Invalid URL format provided.");
        }

        Path tempOutputFile = Files.createTempFile("output_", ".pdf");
        try {
            // Prepare the WeasyPrint command: weasyprint <url> <output.pdf>
            List<String> command = new ArrayList<>();
            command.add("weasyprint");
            command.add(URL);
            command.add(tempOutputFile.toString());

            // NOTE(review): the exit code is not inspected here; confirm that ProcessExecutor
            // reports a failed run itself.
            ProcessExecutor.getInstance(ProcessExecutor.Processes.WEASYPRINT)
                    .runCommandWithOutputHandling(command);

            byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
            return WebResponseUtils.bytesToWebResponse(pdfBytes, convertURLToFileName(URL));
        } finally {
            // Remove the temporary file on failure paths as well as on success.
            Files.deleteIfExists(tempOutputFile);
        }
    }

    /**
     * Builds a download filename from a URL: every character outside {@code [a-zA-Z0-9]}
     * becomes an underscore, the result is cut to at most 50 characters, and
     * {@code .pdf} is appended.
     *
     * @param url the URL to derive the name from
     * @return the sanitized filename ending in {@code .pdf}
     */
    private String convertURLToFileName(String url) {
        return GeneralUtils.convertToFileName(url) + ".pdf";
    }
}

View File

@ -1,14 +1,54 @@
package stirling.software.SPDF.utils; package stirling.software.SPDF.utils;
import java.io.IOException; import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.file.FileVisitResult;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.nio.file.Paths; import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
public class GeneralUtils { public class GeneralUtils {
/**
 * Recursively deletes a directory tree: every file first, then each directory once its
 * contents have been visited.
 *
 * @param path the root of the tree to delete
 * @throws IOException if the tree cannot be traversed or an entry cannot be deleted
 */
public static void deleteDirectory(Path path) throws IOException {
    Files.walkFileTree(path, new SimpleFileVisitor<Path>() {
        @Override
        public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
            Files.delete(file);
            return FileVisitResult.CONTINUE;
        }

        @Override
        public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
            // A non-null exc means iterating this directory failed; report that original
            // cause instead of masking it with a secondary failure from the delete below.
            if (exc != null) {
                throw exc;
            }
            Files.delete(dir);
            return FileVisitResult.CONTINUE;
        }
    });
}
/**
 * Turns an arbitrary string into a filename-safe token.
 *
 * <p>Each character that is not an ASCII letter or digit is replaced by an underscore,
 * and the result is truncated to at most 50 characters.
 *
 * @param name the text to sanitize
 * @return the sanitized text, never longer than 50 characters
 */
public static String convertToFileName(String name) {
    final int maxLength = 50;
    String sanitized = name.replaceAll("[^a-zA-Z0-9]", "_");
    return sanitized.length() > maxLength ? sanitized.substring(0, maxLength) : sanitized;
}
/**
 * Reports whether a string can be parsed by {@link URL#URL(String)}.
 *
 * @param urlStr the candidate URL text
 * @return {@code true} if constructing a {@link URL} from the text succeeds,
 *         {@code false} if it raises {@link MalformedURLException}
 */
public static boolean isValidURL(String urlStr) {
    boolean parsed;
    try {
        new URL(urlStr);
        parsed = true;
    } catch (MalformedURLException malformed) {
        parsed = false;
    }
    return parsed;
}
public static Long convertSizeToBytes(String sizeStr) { public static Long convertSizeToBytes(String sizeStr) {
if (sizeStr == null) { if (sizeStr == null) {
return null; return null;

View File

@ -1,6 +1,7 @@
package stirling.software.SPDF.utils; package stirling.software.SPDF.utils;
import java.io.BufferedReader; import java.io.BufferedReader;
import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
@ -13,7 +14,7 @@ import java.util.concurrent.Semaphore;
public class ProcessExecutor { public class ProcessExecutor {
public enum Processes { public enum Processes {
LIBRE_OFFICE, OCR_MY_PDF, PYTHON_OPENCV, GHOSTSCRIPT LIBRE_OFFICE, OCR_MY_PDF, PYTHON_OPENCV, GHOSTSCRIPT, WEASYPRINT
} }
private static final Map<Processes, ProcessExecutor> instances = new ConcurrentHashMap<>(); private static final Map<Processes, ProcessExecutor> instances = new ConcurrentHashMap<>();
@ -25,6 +26,7 @@ public class ProcessExecutor {
case OCR_MY_PDF -> 2; case OCR_MY_PDF -> 2;
case PYTHON_OPENCV -> 8; case PYTHON_OPENCV -> 8;
case GHOSTSCRIPT -> 16; case GHOSTSCRIPT -> 16;
case WEASYPRINT -> 16;
}; };
return new ProcessExecutor(semaphoreLimit); return new ProcessExecutor(semaphoreLimit);
}); });
@ -35,14 +37,21 @@ public class ProcessExecutor {
private ProcessExecutor(int semaphoreLimit) { private ProcessExecutor(int semaphoreLimit) {
this.semaphore = new Semaphore(semaphoreLimit); this.semaphore = new Semaphore(semaphoreLimit);
} }
public int runCommandWithOutputHandling(List<String> command) throws IOException, InterruptedException { public int runCommandWithOutputHandling(List<String> command) throws IOException, InterruptedException {
return runCommandWithOutputHandling(command, null);
}
public int runCommandWithOutputHandling(List<String> command, File workingDirectory) throws IOException, InterruptedException {
int exitCode = 1; int exitCode = 1;
semaphore.acquire(); semaphore.acquire();
try { try {
System.out.print("Running command: " + String.join(" ", command)); System.out.print("Running command: " + String.join(" ", command));
ProcessBuilder processBuilder = new ProcessBuilder(command); ProcessBuilder processBuilder = new ProcessBuilder(command);
// Use the working directory if it's set
if (workingDirectory != null) {
processBuilder.directory(workingDirectory);
}
Process process = processBuilder.start(); Process process = processBuilder.start();
// Read the error stream and standard output stream concurrently // Read the error stream and standard output stream concurrently