This commit is contained in:
Anthony Stirling 2024-12-09 20:40:59 +00:00
parent a400fe6015
commit 0652299bec
5 changed files with 304 additions and 230 deletions

View File

@ -52,84 +52,115 @@ public class SplitPDFController {
"This endpoint splits a given PDF file into separate documents based on the specified page numbers or ranges. Users can specify pages using individual numbers, ranges, or 'all' for every page. Input:PDF Output:PDF Type:SIMO") "This endpoint splits a given PDF file into separate documents based on the specified page numbers or ranges. Users can specify pages using individual numbers, ranges, or 'all' for every page. Input:PDF Output:PDF Type:SIMO")
public ResponseEntity<byte[]> splitPdf(@ModelAttribute PDFWithPageNums request) public ResponseEntity<byte[]> splitPdf(@ModelAttribute PDFWithPageNums request)
throws IOException { throws IOException {
MultipartFile file = request.getFileInput();
String pages = request.getPageNumbers();
// open the pdf document
PDDocument document = Loader.loadPDF(file.getBytes()); PDDocument document = null;
// PdfMetadata metadata = PdfMetadataService.extractMetadataFromPdf(document); Path zipFile = null;
int totalPages = document.getNumberOfPages();
List<Integer> pageNumbers = request.getPageNumbersList(document, false);
if (!pageNumbers.contains(totalPages - 1)) {
// Create a mutable ArrayList so we can add to it
pageNumbers = new ArrayList<>(pageNumbers);
pageNumbers.add(totalPages - 1);
}
logger.info(
"Splitting PDF into pages: {}",
pageNumbers.stream().map(String::valueOf).collect(Collectors.joining(",")));
// split the document
List<ByteArrayOutputStream> splitDocumentsBoas = new ArrayList<>(); List<ByteArrayOutputStream> splitDocumentsBoas = new ArrayList<>();
int previousPageNumber = 0;
for (int splitPoint : pageNumbers) { try {
try (PDDocument splitDocument =
pdfDocumentFactory.createNewDocumentBasedOnOldDocument(document)) { MultipartFile file = request.getFileInput();
for (int i = previousPageNumber; i <= splitPoint; i++) { String pages = request.getPageNumbers();
PDPage page = document.getPage(i); // open the pdf document
splitDocument.addPage(page);
logger.info("Adding page {} to split document", i); document = Loader.loadPDF(file.getBytes());
// PdfMetadata metadata = PdfMetadataService.extractMetadataFromPdf(document);
int totalPages = document.getNumberOfPages();
List<Integer> pageNumbers = request.getPageNumbersList(document, false);
if (!pageNumbers.contains(totalPages - 1)) {
// Create a mutable ArrayList so we can add to it
pageNumbers = new ArrayList<>(pageNumbers);
pageNumbers.add(totalPages - 1);
}
logger.info(
"Splitting PDF into pages: {}",
pageNumbers.stream().map(String::valueOf).collect(Collectors.joining(",")));
// split the document
splitDocumentsBoas = new ArrayList<>();
int previousPageNumber = 0;
for (int splitPoint : pageNumbers) {
try (PDDocument splitDocument =
pdfDocumentFactory.createNewDocumentBasedOnOldDocument(document)) {
for (int i = previousPageNumber; i <= splitPoint; i++) {
PDPage page = document.getPage(i);
splitDocument.addPage(page);
logger.info("Adding page {} to split document", i);
}
previousPageNumber = splitPoint + 1;
// Transfer metadata to split pdf
// PdfMetadataService.setMetadataToPdf(splitDocument, metadata);
ByteArrayOutputStream baos = new ByteArrayOutputStream();
splitDocument.save(baos);
splitDocumentsBoas.add(baos);
} catch (Exception e) {
logger.error("Failed splitting documents and saving them", e);
throw e;
} }
previousPageNumber = splitPoint + 1; }
// Transfer metadata to split pdf // closing the original document
// PdfMetadataService.setMetadataToPdf(splitDocument, metadata); document.close();
ByteArrayOutputStream baos = new ByteArrayOutputStream(); zipFile = Files.createTempFile("split_documents", ".zip");
splitDocument.save(baos);
splitDocumentsBoas.add(baos); String filename =
Filenames.toSimpleFileName(file.getOriginalFilename())
.replaceFirst("[.][^.]+$", "");
try (ZipOutputStream zipOut = new ZipOutputStream(Files.newOutputStream(zipFile))) {
// loop through the split documents and write them to the zip file
for (int i = 0; i < splitDocumentsBoas.size(); i++) {
String fileName = filename + "_" + (i + 1) + ".pdf";
ByteArrayOutputStream baos = splitDocumentsBoas.get(i);
byte[] pdf = baos.toByteArray();
// Add PDF file to the zip
ZipEntry pdfEntry = new ZipEntry(fileName);
zipOut.putNextEntry(pdfEntry);
zipOut.write(pdf);
zipOut.closeEntry();
logger.info("Wrote split document {} to zip file", fileName);
}
} catch (Exception e) { } catch (Exception e) {
logger.error("Failed splitting documents and saving them", e); logger.error("Failed writing to zip", e);
throw e; throw e;
} }
}
// closing the original document logger.info(
document.close(); "Successfully created zip file with split documents: {}", zipFile.toString());
byte[] data = Files.readAllBytes(zipFile);
Files.deleteIfExists(zipFile);
Path zipFile = Files.createTempFile("split_documents", ".zip"); // return the Resource in the response
return WebResponseUtils.bytesToWebResponse(
data, filename + ".zip", MediaType.APPLICATION_OCTET_STREAM);
String filename = } finally {
Filenames.toSimpleFileName(file.getOriginalFilename()) try {
.replaceFirst("[.][^.]+$", ""); // Close the main document
try (ZipOutputStream zipOut = new ZipOutputStream(Files.newOutputStream(zipFile))) { if (document != null) {
// loop through the split documents and write them to the zip file document.close();
for (int i = 0; i < splitDocumentsBoas.size(); i++) { }
String fileName = filename + "_" + (i + 1) + ".pdf";
ByteArrayOutputStream baos = splitDocumentsBoas.get(i);
byte[] pdf = baos.toByteArray();
// Add PDF file to the zip // Close all ByteArrayOutputStreams
ZipEntry pdfEntry = new ZipEntry(fileName); for (ByteArrayOutputStream baos : splitDocumentsBoas) {
zipOut.putNextEntry(pdfEntry); if (baos != null) {
zipOut.write(pdf); baos.close();
zipOut.closeEntry(); }
}
logger.info("Wrote split document {} to zip file", fileName); // Delete temporary zip file
if (zipFile != null) {
Files.deleteIfExists(zipFile);
}
} catch (Exception e) {
logger.error("Error while cleaning up resources", e);
} }
} catch (Exception e) {
logger.error("Failed writing to zip", e);
throw e;
} }
logger.info("Successfully created zip file with split documents: {}", zipFile.toString());
byte[] data = Files.readAllBytes(zipFile);
Files.deleteIfExists(zipFile);
// return the Resource in the response
return WebResponseUtils.bytesToWebResponse(
data, filename + ".zip", MediaType.APPLICATION_OCTET_STREAM);
} }
} }

View File

@ -59,70 +59,86 @@ public class SplitPdfByChaptersController {
public ResponseEntity<byte[]> splitPdf(@ModelAttribute SplitPdfByChaptersRequest request) public ResponseEntity<byte[]> splitPdf(@ModelAttribute SplitPdfByChaptersRequest request)
throws Exception { throws Exception {
MultipartFile file = request.getFileInput(); MultipartFile file = request.getFileInput();
boolean includeMetadata = request.getIncludeMetadata(); PDDocument sourceDocument = null;
Integer bookmarkLevel = Path zipFile = null;
request.getBookmarkLevel(); // levels start from 0 (top most bookmarks)
if (bookmarkLevel < 0) {
return ResponseEntity.badRequest().body("Invalid bookmark level".getBytes());
}
PDDocument sourceDocument = Loader.loadPDF(file.getBytes());
PDDocumentOutline outline = sourceDocument.getDocumentCatalog().getDocumentOutline();
if (outline == null) {
logger.warn("No outline found for {}", file.getOriginalFilename());
return ResponseEntity.badRequest().body("No outline found".getBytes());
}
List<Bookmark> bookmarks = new ArrayList<>();
try { try {
bookmarks = boolean includeMetadata = request.getIncludeMetadata();
extractOutlineItems( Integer bookmarkLevel =
sourceDocument, request.getBookmarkLevel(); // levels start from 0 (top most bookmarks)
outline.getFirstChild(), if (bookmarkLevel < 0) {
bookmarks, return ResponseEntity.badRequest().body("Invalid bookmark level".getBytes());
outline.getFirstChild().getNextSibling(), }
0, sourceDocument = Loader.loadPDF(file.getBytes());
bookmarkLevel);
// to handle last page edge case
bookmarks.get(bookmarks.size() - 1).setEndPage(sourceDocument.getNumberOfPages());
Bookmark lastBookmark = bookmarks.get(bookmarks.size() - 1);
} catch (Exception e) { PDDocumentOutline outline = sourceDocument.getDocumentCatalog().getDocumentOutline();
logger.error("Unable to extract outline items", e);
return ResponseEntity.internalServerError() if (outline == null) {
.body("Unable to extract outline items".getBytes()); logger.warn("No outline found for {}", file.getOriginalFilename());
return ResponseEntity.badRequest().body("No outline found".getBytes());
}
List<Bookmark> bookmarks = new ArrayList<>();
try {
bookmarks =
extractOutlineItems(
sourceDocument,
outline.getFirstChild(),
bookmarks,
outline.getFirstChild().getNextSibling(),
0,
bookmarkLevel);
// to handle last page edge case
bookmarks.get(bookmarks.size() - 1).setEndPage(sourceDocument.getNumberOfPages());
Bookmark lastBookmark = bookmarks.get(bookmarks.size() - 1);
} catch (Exception e) {
logger.error("Unable to extract outline items", e);
return ResponseEntity.internalServerError()
.body("Unable to extract outline items".getBytes());
}
boolean allowDuplicates = request.getAllowDuplicates();
if (!allowDuplicates) {
/*
duplicates are generated when multiple bookmarks correspond to the same page,
if the user doesn't want duplicates mergeBookmarksThatCorrespondToSamePage() method will merge the titles of all
the bookmarks that correspond to the same page, and treat them as a single bookmark
*/
bookmarks = mergeBookmarksThatCorrespondToSamePage(bookmarks);
}
for (Bookmark bookmark : bookmarks) {
logger.info(
"{}::::{} to {}",
bookmark.getTitle(),
bookmark.getStartPage(),
bookmark.getEndPage());
}
List<ByteArrayOutputStream> splitDocumentsBoas =
getSplitDocumentsBoas(sourceDocument, bookmarks, includeMetadata);
zipFile = createZipFile(bookmarks, splitDocumentsBoas);
byte[] data = Files.readAllBytes(zipFile);
Files.deleteIfExists(zipFile);
String filename =
Filenames.toSimpleFileName(file.getOriginalFilename())
.replaceFirst("[.][^.]+$", "");
sourceDocument.close();
return WebResponseUtils.bytesToWebResponse(
data, filename + ".zip", MediaType.APPLICATION_OCTET_STREAM);
} finally {
try {
if (sourceDocument != null) {
sourceDocument.close();
}
if (zipFile != null) {
Files.deleteIfExists(zipFile);
}
} catch (Exception e) {
logger.error("Error while cleaning up resources", e);
}
} }
boolean allowDuplicates = request.getAllowDuplicates();
if (!allowDuplicates) {
/*
duplicates are generated when multiple bookmarks correspond to the same page,
if the user doesn't want duplicates mergeBookmarksThatCorrespondToSamePage() method will merge the titles of all
the bookmarks that correspond to the same page, and treat them as a single bookmark
*/
bookmarks = mergeBookmarksThatCorrespondToSamePage(bookmarks);
}
for (Bookmark bookmark : bookmarks) {
logger.info(
"{}::::{} to {}",
bookmark.getTitle(),
bookmark.getStartPage(),
bookmark.getEndPage());
}
List<ByteArrayOutputStream> splitDocumentsBoas =
getSplitDocumentsBoas(sourceDocument, bookmarks, includeMetadata);
Path zipFile = createZipFile(bookmarks, splitDocumentsBoas);
byte[] data = Files.readAllBytes(zipFile);
Files.deleteIfExists(zipFile);
String filename =
Filenames.toSimpleFileName(file.getOriginalFilename())
.replaceFirst("[.][^.]+$", "");
sourceDocument.close();
return WebResponseUtils.bytesToWebResponse(
data, filename + ".zip", MediaType.APPLICATION_OCTET_STREAM);
} }
private List<Bookmark> mergeBookmarksThatCorrespondToSamePage(List<Bookmark> bookmarks) { private List<Bookmark> mergeBookmarksThatCorrespondToSamePage(List<Bookmark> bookmarks) {

View File

@ -105,15 +105,13 @@ public class SplitPdfBySectionsController {
if (sectionNum == horiz * verti) pageNum++; if (sectionNum == horiz * verti) pageNum++;
} }
} catch (Exception e) {
logger.error("exception", e);
} finally {
data = Files.readAllBytes(zipFile); data = Files.readAllBytes(zipFile);
return WebResponseUtils.bytesToWebResponse(
data, filename + "_split.zip", MediaType.APPLICATION_OCTET_STREAM);
} finally {
Files.deleteIfExists(zipFile); Files.deleteIfExists(zipFile);
} }
return WebResponseUtils.bytesToWebResponse(
data, filename + "_split.zip", MediaType.APPLICATION_OCTET_STREAM);
} }
public List<PDDocument> splitPdfPages( public List<PDDocument> splitPdfPages(

View File

@ -65,112 +65,137 @@ public class ConvertImgPDFController {
String colorType = request.getColorType(); String colorType = request.getColorType();
String dpi = request.getDpi(); String dpi = request.getDpi();
byte[] pdfBytes = file.getBytes(); Path tempFile = null;
ImageType colorTypeResult = ImageType.RGB; Path tempOutputDir = null;
if ("greyscale".equals(colorType)) { Path tempPdfPath = null;
colorTypeResult = ImageType.GRAY;
} else if ("blackwhite".equals(colorType)) {
colorTypeResult = ImageType.BINARY;
}
// returns bytes for image
boolean singleImage = "single".equals(singleOrMultiple);
byte[] result = null; byte[] result = null;
String filename =
Filenames.toSimpleFileName(file.getOriginalFilename())
.replaceFirst("[.][^.]+$", "");
result = try {
PdfUtils.convertFromPdf( byte[] pdfBytes = file.getBytes();
pdfBytes, ImageType colorTypeResult = ImageType.RGB;
"webp".equalsIgnoreCase(imageFormat) ? "png" : imageFormat.toUpperCase(), if ("greyscale".equals(colorType)) {
colorTypeResult, colorTypeResult = ImageType.GRAY;
singleImage, } else if ("blackwhite".equals(colorType)) {
Integer.valueOf(dpi), colorTypeResult = ImageType.BINARY;
filename);
if (result == null || result.length == 0) {
logger.error("resultant bytes for {} is null, error converting ", filename);
}
if ("webp".equalsIgnoreCase(imageFormat) && !CheckProgramInstall.isPythonAvailable()) {
throw new IOException("Python is not installed. Required for WebP conversion.");
} else if ("webp".equalsIgnoreCase(imageFormat)
&& CheckProgramInstall.isPythonAvailable()) {
// Write the output stream to a temp file
Path tempFile = Files.createTempFile("temp_png", ".png");
try (FileOutputStream fos = new FileOutputStream(tempFile.toFile())) {
fos.write(result);
fos.flush();
} }
// returns bytes for image
boolean singleImage = "single".equals(singleOrMultiple);
String filename =
Filenames.toSimpleFileName(file.getOriginalFilename())
.replaceFirst("[.][^.]+$", "");
String pythonVersion = CheckProgramInstall.getAvailablePythonCommand(); result =
PdfUtils.convertFromPdf(
List<String> command = new ArrayList<>(); pdfBytes,
command.add(pythonVersion); "webp".equalsIgnoreCase(imageFormat)
command.add("./scripts/png_to_webp.py"); // Python script to handle the conversion ? "png"
: imageFormat.toUpperCase(),
// Create a temporary directory for the output WebP files colorTypeResult,
Path tempOutputDir = Files.createTempDirectory("webp_output"); singleImage,
if (singleImage) { Integer.valueOf(dpi),
// Run the Python script to convert PNG to WebP filename);
command.add(tempFile.toString()); if (result == null || result.length == 0) {
command.add(tempOutputDir.toString()); logger.error("resultant bytes for {} is null, error converting ", filename);
command.add("--single");
} else {
// Save the uploaded PDF to a temporary file
Path tempPdfPath = Files.createTempFile("temp_pdf", ".pdf");
file.transferTo(tempPdfPath.toFile());
// Run the Python script to convert PDF to WebP
command.add(tempPdfPath.toString());
command.add(tempOutputDir.toString());
} }
command.add("--dpi"); if ("webp".equalsIgnoreCase(imageFormat) && !CheckProgramInstall.isPythonAvailable()) {
command.add(dpi); throw new IOException("Python is not installed. Required for WebP conversion.");
ProcessExecutorResult resultProcess = } else if ("webp".equalsIgnoreCase(imageFormat)
ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV) && CheckProgramInstall.isPythonAvailable()) {
.runCommandWithOutputHandling(command); // Write the output stream to a temp file
tempFile = Files.createTempFile("temp_png", ".png");
// Find all WebP files in the output directory try (FileOutputStream fos = new FileOutputStream(tempFile.toFile())) {
List<Path> webpFiles = fos.write(result);
Files.walk(tempOutputDir) fos.flush();
.filter(path -> path.toString().endsWith(".webp"))
.collect(Collectors.toList());
if (webpFiles.isEmpty()) {
logger.error("No WebP files were created in: {}", tempOutputDir.toString());
throw new IOException("No WebP files were created. " + resultProcess.getMessages());
}
byte[] bodyBytes = new byte[0];
if (webpFiles.size() == 1) {
// Return the single WebP file directly
Path webpFilePath = webpFiles.get(0);
bodyBytes = Files.readAllBytes(webpFilePath);
} else {
// Create a ZIP file containing all WebP images
ByteArrayOutputStream zipOutputStream = new ByteArrayOutputStream();
try (ZipOutputStream zos = new ZipOutputStream(zipOutputStream)) {
for (Path webpFile : webpFiles) {
zos.putNextEntry(new ZipEntry(webpFile.getFileName().toString()));
Files.copy(webpFile, zos);
zos.closeEntry();
}
} }
bodyBytes = zipOutputStream.toByteArray();
}
// Clean up the temporary files
Files.deleteIfExists(tempFile);
if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile());
result = bodyBytes;
}
if (singleImage) { String pythonVersion = CheckProgramInstall.getAvailablePythonCommand();
String docName = filename + "." + imageFormat;
MediaType mediaType = MediaType.parseMediaType(getMediaType(imageFormat)); List<String> command = new ArrayList<>();
return WebResponseUtils.bytesToWebResponse(result, docName, mediaType); command.add(pythonVersion);
} else { command.add("./scripts/png_to_webp.py"); // Python script to handle the conversion
String zipFilename = filename + "_convertedToImages.zip";
return WebResponseUtils.bytesToWebResponse( // Create a temporary directory for the output WebP files
result, zipFilename, MediaType.APPLICATION_OCTET_STREAM); tempOutputDir = Files.createTempDirectory("webp_output");
if (singleImage) {
// Run the Python script to convert PNG to WebP
command.add(tempFile.toString());
command.add(tempOutputDir.toString());
command.add("--single");
} else {
// Save the uploaded PDF to a temporary file
tempPdfPath = Files.createTempFile("temp_pdf", ".pdf");
file.transferTo(tempPdfPath.toFile());
// Run the Python script to convert PDF to WebP
command.add(tempPdfPath.toString());
command.add(tempOutputDir.toString());
}
command.add("--dpi");
command.add(dpi);
ProcessExecutorResult resultProcess =
ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV)
.runCommandWithOutputHandling(command);
// Find all WebP files in the output directory
List<Path> webpFiles =
Files.walk(tempOutputDir)
.filter(path -> path.toString().endsWith(".webp"))
.collect(Collectors.toList());
if (webpFiles.isEmpty()) {
logger.error("No WebP files were created in: {}", tempOutputDir.toString());
throw new IOException(
"No WebP files were created. " + resultProcess.getMessages());
}
byte[] bodyBytes = new byte[0];
if (webpFiles.size() == 1) {
// Return the single WebP file directly
Path webpFilePath = webpFiles.get(0);
bodyBytes = Files.readAllBytes(webpFilePath);
} else {
// Create a ZIP file containing all WebP images
ByteArrayOutputStream zipOutputStream = new ByteArrayOutputStream();
try (ZipOutputStream zos = new ZipOutputStream(zipOutputStream)) {
for (Path webpFile : webpFiles) {
zos.putNextEntry(new ZipEntry(webpFile.getFileName().toString()));
Files.copy(webpFile, zos);
zos.closeEntry();
}
}
bodyBytes = zipOutputStream.toByteArray();
}
// Clean up the temporary files
Files.deleteIfExists(tempFile);
if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile());
result = bodyBytes;
}
if (singleImage) {
String docName = filename + "." + imageFormat;
MediaType mediaType = MediaType.parseMediaType(getMediaType(imageFormat));
return WebResponseUtils.bytesToWebResponse(result, docName, mediaType);
} else {
String zipFilename = filename + "_convertedToImages.zip";
return WebResponseUtils.bytesToWebResponse(
result, zipFilename, MediaType.APPLICATION_OCTET_STREAM);
}
} finally {
try {
// Clean up temporary files
if (tempFile != null) {
Files.deleteIfExists(tempFile);
}
if (tempPdfPath != null) {
Files.deleteIfExists(tempPdfPath);
}
if (tempOutputDir != null) {
FileUtils.deleteDirectory(tempOutputDir.toFile());
}
} catch (Exception e) {
logger.error("Error cleaning up temporary files", e);
}
} }
} }

View File

@ -87,7 +87,7 @@ public class OCRController {
Files.createDirectories(tempOutputDir); Files.createDirectories(tempOutputDir);
Files.createDirectories(tempImagesDir); Files.createDirectories(tempImagesDir);
Process process = null;
try { try {
// Save input file // Save input file
inputFile.transferTo(tempInputFile.toFile()); inputFile.transferTo(tempInputFile.toFile());
@ -139,7 +139,7 @@ public class OCRController {
command.add("pdf"); // Always output PDF command.add("pdf"); // Always output PDF
ProcessBuilder pb = new ProcessBuilder(command); ProcessBuilder pb = new ProcessBuilder(command);
Process process = pb.start(); process = pb.start();
// Capture any error output // Capture any error output
try (BufferedReader reader = try (BufferedReader reader =
@ -188,6 +188,10 @@ public class OCRController {
.body(pdfContent); .body(pdfContent);
} finally { } finally {
if (process != null) {
process.destroy();
}
// Clean up temporary files // Clean up temporary files
deleteDirectory(tempDir); deleteDirectory(tempDir);
} }