From 0652299bec3ac3a9268fcaffe3e2b511e3d199fb Mon Sep 17 00:00:00 2001 From: Anthony Stirling <77850077+Frooodle@users.noreply.github.com.> Date: Mon, 9 Dec 2024 20:40:59 +0000 Subject: [PATCH] fixes --- .../controller/api/SplitPDFController.java | 157 +++++++----- .../api/SplitPdfByChaptersController.java | 136 ++++++----- .../api/SplitPdfBySectionsController.java | 10 +- .../converters/ConvertImgPDFController.java | 223 ++++++++++-------- .../controller/api/misc/OCRController.java | 8 +- 5 files changed, 304 insertions(+), 230 deletions(-) diff --git a/src/main/java/stirling/software/SPDF/controller/api/SplitPDFController.java b/src/main/java/stirling/software/SPDF/controller/api/SplitPDFController.java index e27df103..a8e74e2a 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/SplitPDFController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/SplitPDFController.java @@ -52,84 +52,115 @@ public class SplitPDFController { "This endpoint splits a given PDF file into separate documents based on the specified page numbers or ranges. Users can specify pages using individual numbers, ranges, or 'all' for every page. Input:PDF Output:PDF Type:SIMO") public ResponseEntity splitPdf(@ModelAttribute PDFWithPageNums request) throws IOException { - MultipartFile file = request.getFileInput(); - String pages = request.getPageNumbers(); - // open the pdf document - PDDocument document = Loader.loadPDF(file.getBytes()); - // PdfMetadata metadata = PdfMetadataService.extractMetadataFromPdf(document); - int totalPages = document.getNumberOfPages(); - List pageNumbers = request.getPageNumbersList(document, false); - if (!pageNumbers.contains(totalPages - 1)) { - // Create a mutable ArrayList so we can add to it - pageNumbers = new ArrayList<>(pageNumbers); - pageNumbers.add(totalPages - 1); - } - - logger.info( - "Splitting PDF into pages: {}", - pageNumbers.stream().map(String::valueOf).collect(Collectors.joining(","))); - - // split the document + PDDocument document = null; + Path zipFile = null; List splitDocumentsBoas = new ArrayList<>(); - int previousPageNumber = 0; - for (int splitPoint : pageNumbers) { - try (PDDocument splitDocument = - pdfDocumentFactory.createNewDocumentBasedOnOldDocument(document)) { - for (int i = previousPageNumber; i <= splitPoint; i++) { - PDPage page = document.getPage(i); - splitDocument.addPage(page); - logger.info("Adding page {} to split document", i); + + try { + + MultipartFile file = request.getFileInput(); + String pages = request.getPageNumbers(); + // open the pdf document + + document = Loader.loadPDF(file.getBytes()); + // PdfMetadata metadata = PdfMetadataService.extractMetadataFromPdf(document); + int totalPages = document.getNumberOfPages(); + List pageNumbers = request.getPageNumbersList(document, false); + if (!pageNumbers.contains(totalPages - 1)) { + // Create a mutable ArrayList so we can add to it + pageNumbers = new ArrayList<>(pageNumbers); + pageNumbers.add(totalPages - 1); + } + + logger.info( + "Splitting PDF into pages: {}", + pageNumbers.stream().map(String::valueOf).collect(Collectors.joining(","))); + + // split the document + splitDocumentsBoas = new ArrayList<>(); + int previousPageNumber = 0; + for (int splitPoint : pageNumbers) { + try (PDDocument splitDocument = + pdfDocumentFactory.createNewDocumentBasedOnOldDocument(document)) { + for (int i = previousPageNumber; i <= splitPoint; i++) { + PDPage page = document.getPage(i); + splitDocument.addPage(page); + logger.info("Adding page {} to split document", i); + } + previousPageNumber = splitPoint + 1; + + // Transfer metadata to split pdf + // PdfMetadataService.setMetadataToPdf(splitDocument, metadata); + + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + splitDocument.save(baos); + + splitDocumentsBoas.add(baos); + } catch (Exception e) { + logger.error("Failed splitting documents and saving them", e); + throw e; } - previousPageNumber = splitPoint + 1; + } - // Transfer metadata to split pdf - // PdfMetadataService.setMetadataToPdf(splitDocument, metadata); + // closing the original document + document.close(); - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - splitDocument.save(baos); + zipFile = Files.createTempFile("split_documents", ".zip"); - splitDocumentsBoas.add(baos); + String filename = + Filenames.toSimpleFileName(file.getOriginalFilename()) + .replaceFirst("[.][^.]+$", ""); + try (ZipOutputStream zipOut = new ZipOutputStream(Files.newOutputStream(zipFile))) { + // loop through the split documents and write them to the zip file + for (int i = 0; i < splitDocumentsBoas.size(); i++) { + String fileName = filename + "_" + (i + 1) + ".pdf"; + ByteArrayOutputStream baos = splitDocumentsBoas.get(i); + byte[] pdf = baos.toByteArray(); + + // Add PDF file to the zip + ZipEntry pdfEntry = new ZipEntry(fileName); + zipOut.putNextEntry(pdfEntry); + zipOut.write(pdf); + zipOut.closeEntry(); + + logger.info("Wrote split document {} to zip file", fileName); + } } catch (Exception e) { - logger.error("Failed splitting documents and saving them", e); + logger.error("Failed writing to zip", e); throw e; } - } - // closing the original document - document.close(); + logger.info( + "Successfully created zip file with split documents: {}", zipFile.toString()); + byte[] data = Files.readAllBytes(zipFile); + Files.deleteIfExists(zipFile); - Path zipFile = Files.createTempFile("split_documents", ".zip"); + // return the Resource in the response + return WebResponseUtils.bytesToWebResponse( + data, filename + ".zip", MediaType.APPLICATION_OCTET_STREAM); - String filename = - Filenames.toSimpleFileName(file.getOriginalFilename()) - .replaceFirst("[.][^.]+$", ""); - try (ZipOutputStream zipOut = new ZipOutputStream(Files.newOutputStream(zipFile))) { - // loop through the split documents and write them to the zip file - for (int i = 0; i < splitDocumentsBoas.size(); i++) { - String fileName = filename + "_" + (i + 1) + ".pdf"; - ByteArrayOutputStream baos = splitDocumentsBoas.get(i); - byte[] pdf = baos.toByteArray(); + } finally { + try { + // Close the main document + if (document != null) { + document.close(); + } - // Add PDF file to the zip - ZipEntry pdfEntry = new ZipEntry(fileName); - zipOut.putNextEntry(pdfEntry); - zipOut.write(pdf); - zipOut.closeEntry(); + // Close all ByteArrayOutputStreams + for (ByteArrayOutputStream baos : splitDocumentsBoas) { + if (baos != null) { + baos.close(); + } + } - logger.info("Wrote split document {} to zip file", fileName); + // Delete temporary zip file + if (zipFile != null) { + Files.deleteIfExists(zipFile); + } + } catch (Exception e) { + logger.error("Error while cleaning up resources", e); } - } catch (Exception e) { - logger.error("Failed writing to zip", e); - throw e; } - - logger.info("Successfully created zip file with split documents: {}", zipFile.toString()); - byte[] data = Files.readAllBytes(zipFile); - Files.deleteIfExists(zipFile); - - // return the Resource in the response - return WebResponseUtils.bytesToWebResponse( - data, filename + ".zip", MediaType.APPLICATION_OCTET_STREAM); } } diff --git a/src/main/java/stirling/software/SPDF/controller/api/SplitPdfByChaptersController.java b/src/main/java/stirling/software/SPDF/controller/api/SplitPdfByChaptersController.java index f344c276..c74ed294 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/SplitPdfByChaptersController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/SplitPdfByChaptersController.java @@ -59,70 +59,86 @@ public class SplitPdfByChaptersController { public ResponseEntity splitPdf(@ModelAttribute SplitPdfByChaptersRequest request) throws Exception { MultipartFile file = request.getFileInput(); - boolean includeMetadata = request.getIncludeMetadata(); - Integer bookmarkLevel = - request.getBookmarkLevel(); // levels start from 0 (top most bookmarks) - if (bookmarkLevel < 0) { - return ResponseEntity.badRequest().body("Invalid bookmark level".getBytes()); - } - PDDocument sourceDocument = Loader.loadPDF(file.getBytes()); + PDDocument sourceDocument = null; + Path zipFile = null; - PDDocumentOutline outline = sourceDocument.getDocumentCatalog().getDocumentOutline(); - - if (outline == null) { - logger.warn("No outline found for {}", file.getOriginalFilename()); - return ResponseEntity.badRequest().body("No outline found".getBytes()); - } - List bookmarks = new ArrayList<>(); try { - bookmarks = - extractOutlineItems( - sourceDocument, - outline.getFirstChild(), - bookmarks, - outline.getFirstChild().getNextSibling(), - 0, - bookmarkLevel); - // to handle last page edge case - bookmarks.get(bookmarks.size() - 1).setEndPage(sourceDocument.getNumberOfPages()); - Bookmark lastBookmark = bookmarks.get(bookmarks.size() - 1); + boolean includeMetadata = request.getIncludeMetadata(); + Integer bookmarkLevel = + request.getBookmarkLevel(); // levels start from 0 (top most bookmarks) + if (bookmarkLevel < 0) { + return ResponseEntity.badRequest().body("Invalid bookmark level".getBytes()); + } + sourceDocument = Loader.loadPDF(file.getBytes()); - } catch (Exception e) { - logger.error("Unable to extract outline items", e); - return ResponseEntity.internalServerError() - .body("Unable to extract outline items".getBytes()); + PDDocumentOutline outline = sourceDocument.getDocumentCatalog().getDocumentOutline(); + + if (outline == null) { + logger.warn("No outline found for {}", file.getOriginalFilename()); + return ResponseEntity.badRequest().body("No outline found".getBytes()); + } + List bookmarks = new ArrayList<>(); + try { + bookmarks = + extractOutlineItems( + sourceDocument, + outline.getFirstChild(), + bookmarks, + outline.getFirstChild().getNextSibling(), + 0, + bookmarkLevel); + // to handle last page edge case + bookmarks.get(bookmarks.size() - 1).setEndPage(sourceDocument.getNumberOfPages()); + Bookmark lastBookmark = bookmarks.get(bookmarks.size() - 1); + + } catch (Exception e) { + logger.error("Unable to extract outline items", e); + return ResponseEntity.internalServerError() + .body("Unable to extract outline items".getBytes()); + } + + boolean allowDuplicates = request.getAllowDuplicates(); + if (!allowDuplicates) { + /* + duplicates are generated when multiple bookmarks correspond to the same page, + if the user doesn't want duplicates mergeBookmarksThatCorrespondToSamePage() method will merge the titles of all + the bookmarks that correspond to the same page, and treat them as a single bookmark + */ + bookmarks = mergeBookmarksThatCorrespondToSamePage(bookmarks); + } + for (Bookmark bookmark : bookmarks) { + logger.info( + "{}::::{} to {}", + bookmark.getTitle(), + bookmark.getStartPage(), + bookmark.getEndPage()); + } + List splitDocumentsBoas = + getSplitDocumentsBoas(sourceDocument, bookmarks, includeMetadata); + + zipFile = createZipFile(bookmarks, splitDocumentsBoas); + + byte[] data = Files.readAllBytes(zipFile); + Files.deleteIfExists(zipFile); + + String filename = + Filenames.toSimpleFileName(file.getOriginalFilename()) + .replaceFirst("[.][^.]+$", ""); + sourceDocument.close(); + return WebResponseUtils.bytesToWebResponse( + data, filename + ".zip", MediaType.APPLICATION_OCTET_STREAM); + } finally { + try { + if (sourceDocument != null) { + sourceDocument.close(); + } + if (zipFile != null) { + Files.deleteIfExists(zipFile); + } + } catch (Exception e) { + logger.error("Error while cleaning up resources", e); + } } - - boolean allowDuplicates = request.getAllowDuplicates(); - if (!allowDuplicates) { - /* - duplicates are generated when multiple bookmarks correspond to the same page, - if the user doesn't want duplicates mergeBookmarksThatCorrespondToSamePage() method will merge the titles of all - the bookmarks that correspond to the same page, and treat them as a single bookmark - */ - bookmarks = mergeBookmarksThatCorrespondToSamePage(bookmarks); - } - for (Bookmark bookmark : bookmarks) { - logger.info( - "{}::::{} to {}", - bookmark.getTitle(), - bookmark.getStartPage(), - bookmark.getEndPage()); - } - List splitDocumentsBoas = - getSplitDocumentsBoas(sourceDocument, bookmarks, includeMetadata); - - Path zipFile = createZipFile(bookmarks, splitDocumentsBoas); - - byte[] data = Files.readAllBytes(zipFile); - Files.deleteIfExists(zipFile); - - String filename = - Filenames.toSimpleFileName(file.getOriginalFilename()) - .replaceFirst("[.][^.]+$", ""); - sourceDocument.close(); - return WebResponseUtils.bytesToWebResponse( - data, filename + ".zip", MediaType.APPLICATION_OCTET_STREAM); } private List mergeBookmarksThatCorrespondToSamePage(List bookmarks) { diff --git a/src/main/java/stirling/software/SPDF/controller/api/SplitPdfBySectionsController.java b/src/main/java/stirling/software/SPDF/controller/api/SplitPdfBySectionsController.java index 2b4f1313..eaa9c86d 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/SplitPdfBySectionsController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/SplitPdfBySectionsController.java @@ -105,15 +105,13 @@ public class SplitPdfBySectionsController { if (sectionNum == horiz * verti) pageNum++; } - } catch (Exception e) { - logger.error("exception", e); - } finally { data = Files.readAllBytes(zipFile); + return WebResponseUtils.bytesToWebResponse( + data, filename + "_split.zip", MediaType.APPLICATION_OCTET_STREAM); + + } finally { Files.deleteIfExists(zipFile); } - - return WebResponseUtils.bytesToWebResponse( - data, filename + "_split.zip", MediaType.APPLICATION_OCTET_STREAM); } public List splitPdfPages( diff --git a/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertImgPDFController.java b/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertImgPDFController.java index be955dbd..b5eec392 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertImgPDFController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertImgPDFController.java @@ -65,112 +65,137 @@ public class ConvertImgPDFController { String colorType = request.getColorType(); String dpi = request.getDpi(); - byte[] pdfBytes = file.getBytes(); - ImageType colorTypeResult = ImageType.RGB; - if ("greyscale".equals(colorType)) { - colorTypeResult = ImageType.GRAY; - } else if ("blackwhite".equals(colorType)) { - colorTypeResult = ImageType.BINARY; - } - // returns bytes for image - boolean singleImage = "single".equals(singleOrMultiple); + Path tempFile = null; + Path tempOutputDir = null; + Path tempPdfPath = null; byte[] result = null; - String filename = - Filenames.toSimpleFileName(file.getOriginalFilename()) - .replaceFirst("[.][^.]+$", ""); - result = - PdfUtils.convertFromPdf( - pdfBytes, - "webp".equalsIgnoreCase(imageFormat) ? "png" : imageFormat.toUpperCase(), - colorTypeResult, - singleImage, - Integer.valueOf(dpi), - filename); - if (result == null || result.length == 0) { - logger.error("resultant bytes for {} is null, error converting ", filename); - } - if ("webp".equalsIgnoreCase(imageFormat) && !CheckProgramInstall.isPythonAvailable()) { - throw new IOException("Python is not installed. Required for WebP conversion."); - } else if ("webp".equalsIgnoreCase(imageFormat) - && CheckProgramInstall.isPythonAvailable()) { - // Write the output stream to a temp file - Path tempFile = Files.createTempFile("temp_png", ".png"); - try (FileOutputStream fos = new FileOutputStream(tempFile.toFile())) { - fos.write(result); - fos.flush(); + try { + byte[] pdfBytes = file.getBytes(); + ImageType colorTypeResult = ImageType.RGB; + if ("greyscale".equals(colorType)) { + colorTypeResult = ImageType.GRAY; + } else if ("blackwhite".equals(colorType)) { + colorTypeResult = ImageType.BINARY; } + // returns bytes for image + boolean singleImage = "single".equals(singleOrMultiple); + String filename = + Filenames.toSimpleFileName(file.getOriginalFilename()) + .replaceFirst("[.][^.]+$", ""); - String pythonVersion = CheckProgramInstall.getAvailablePythonCommand(); - - List command = new ArrayList<>(); - command.add(pythonVersion); - command.add("./scripts/png_to_webp.py"); // Python script to handle the conversion - - // Create a temporary directory for the output WebP files - Path tempOutputDir = Files.createTempDirectory("webp_output"); - if (singleImage) { - // Run the Python script to convert PNG to WebP - command.add(tempFile.toString()); - command.add(tempOutputDir.toString()); - command.add("--single"); - } else { - // Save the uploaded PDF to a temporary file - Path tempPdfPath = Files.createTempFile("temp_pdf", ".pdf"); - file.transferTo(tempPdfPath.toFile()); - // Run the Python script to convert PDF to WebP - command.add(tempPdfPath.toString()); - command.add(tempOutputDir.toString()); + result = + PdfUtils.convertFromPdf( + pdfBytes, + "webp".equalsIgnoreCase(imageFormat) + ? "png" + : imageFormat.toUpperCase(), + colorTypeResult, + singleImage, + Integer.valueOf(dpi), + filename); + if (result == null || result.length == 0) { + logger.error("resultant bytes for {} is null, error converting ", filename); } - command.add("--dpi"); - command.add(dpi); - ProcessExecutorResult resultProcess = - ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV) - .runCommandWithOutputHandling(command); - - // Find all WebP files in the output directory - List webpFiles = - Files.walk(tempOutputDir) - .filter(path -> path.toString().endsWith(".webp")) - .collect(Collectors.toList()); - - if (webpFiles.isEmpty()) { - logger.error("No WebP files were created in: {}", tempOutputDir.toString()); - throw new IOException("No WebP files were created. " + resultProcess.getMessages()); - } - - byte[] bodyBytes = new byte[0]; - - if (webpFiles.size() == 1) { - // Return the single WebP file directly - Path webpFilePath = webpFiles.get(0); - bodyBytes = Files.readAllBytes(webpFilePath); - } else { - // Create a ZIP file containing all WebP images - ByteArrayOutputStream zipOutputStream = new ByteArrayOutputStream(); - try (ZipOutputStream zos = new ZipOutputStream(zipOutputStream)) { - for (Path webpFile : webpFiles) { - zos.putNextEntry(new ZipEntry(webpFile.getFileName().toString())); - Files.copy(webpFile, zos); - zos.closeEntry(); - } + if ("webp".equalsIgnoreCase(imageFormat) && !CheckProgramInstall.isPythonAvailable()) { + throw new IOException("Python is not installed. Required for WebP conversion."); + } else if ("webp".equalsIgnoreCase(imageFormat) + && CheckProgramInstall.isPythonAvailable()) { + // Write the output stream to a temp file + tempFile = Files.createTempFile("temp_png", ".png"); + try (FileOutputStream fos = new FileOutputStream(tempFile.toFile())) { + fos.write(result); + fos.flush(); } - bodyBytes = zipOutputStream.toByteArray(); - } - // Clean up the temporary files - Files.deleteIfExists(tempFile); - if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile()); - result = bodyBytes; - } - if (singleImage) { - String docName = filename + "." + imageFormat; - MediaType mediaType = MediaType.parseMediaType(getMediaType(imageFormat)); - return WebResponseUtils.bytesToWebResponse(result, docName, mediaType); - } else { - String zipFilename = filename + "_convertedToImages.zip"; - return WebResponseUtils.bytesToWebResponse( - result, zipFilename, MediaType.APPLICATION_OCTET_STREAM); + String pythonVersion = CheckProgramInstall.getAvailablePythonCommand(); + + List command = new ArrayList<>(); + command.add(pythonVersion); + command.add("./scripts/png_to_webp.py"); // Python script to handle the conversion + + // Create a temporary directory for the output WebP files + tempOutputDir = Files.createTempDirectory("webp_output"); + if (singleImage) { + // Run the Python script to convert PNG to WebP + command.add(tempFile.toString()); + command.add(tempOutputDir.toString()); + command.add("--single"); + } else { + // Save the uploaded PDF to a temporary file + tempPdfPath = Files.createTempFile("temp_pdf", ".pdf"); + file.transferTo(tempPdfPath.toFile()); + // Run the Python script to convert PDF to WebP + command.add(tempPdfPath.toString()); + command.add(tempOutputDir.toString()); + } + command.add("--dpi"); + command.add(dpi); + ProcessExecutorResult resultProcess = + ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV) + .runCommandWithOutputHandling(command); + + // Find all WebP files in the output directory + List webpFiles = + Files.walk(tempOutputDir) + .filter(path -> path.toString().endsWith(".webp")) + .collect(Collectors.toList()); + + if (webpFiles.isEmpty()) { + logger.error("No WebP files were created in: {}", tempOutputDir.toString()); + throw new IOException( + "No WebP files were created. " + resultProcess.getMessages()); + } + + byte[] bodyBytes = new byte[0]; + + if (webpFiles.size() == 1) { + // Return the single WebP file directly + Path webpFilePath = webpFiles.get(0); + bodyBytes = Files.readAllBytes(webpFilePath); + } else { + // Create a ZIP file containing all WebP images + ByteArrayOutputStream zipOutputStream = new ByteArrayOutputStream(); + try (ZipOutputStream zos = new ZipOutputStream(zipOutputStream)) { + for (Path webpFile : webpFiles) { + zos.putNextEntry(new ZipEntry(webpFile.getFileName().toString())); + Files.copy(webpFile, zos); + zos.closeEntry(); + } + } + bodyBytes = zipOutputStream.toByteArray(); + } + // Clean up the temporary files + Files.deleteIfExists(tempFile); + if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile()); + result = bodyBytes; + } + + if (singleImage) { + String docName = filename + "." + imageFormat; + MediaType mediaType = MediaType.parseMediaType(getMediaType(imageFormat)); + return WebResponseUtils.bytesToWebResponse(result, docName, mediaType); + } else { + String zipFilename = filename + "_convertedToImages.zip"; + return WebResponseUtils.bytesToWebResponse( + result, zipFilename, MediaType.APPLICATION_OCTET_STREAM); + } + + } finally { + try { + // Clean up temporary files + if (tempFile != null) { + Files.deleteIfExists(tempFile); + } + if (tempPdfPath != null) { + Files.deleteIfExists(tempPdfPath); + } + if (tempOutputDir != null) { + FileUtils.deleteDirectory(tempOutputDir.toFile()); + } + } catch (Exception e) { + logger.error("Error cleaning up temporary files", e); + } } } diff --git a/src/main/java/stirling/software/SPDF/controller/api/misc/OCRController.java b/src/main/java/stirling/software/SPDF/controller/api/misc/OCRController.java index 6c5f3993..f503c107 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/misc/OCRController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/misc/OCRController.java @@ -87,7 +87,7 @@ public class OCRController { Files.createDirectories(tempOutputDir); Files.createDirectories(tempImagesDir); - + Process process = null; try { // Save input file inputFile.transferTo(tempInputFile.toFile()); @@ -139,7 +139,7 @@ public class OCRController { command.add("pdf"); // Always output PDF ProcessBuilder pb = new ProcessBuilder(command); - Process process = pb.start(); + process = pb.start(); // Capture any error output try (BufferedReader reader = @@ -188,6 +188,10 @@ public class OCRController { .body(pdfContent); } finally { + if (process != null) { + process.destroy(); + } + // Clean up temporary files deleteDirectory(tempDir); }