This commit is contained in:
Anthony Stirling 2024-12-09 20:40:59 +00:00
parent a400fe6015
commit 0652299bec
5 changed files with 304 additions and 230 deletions

View File

@ -52,84 +52,115 @@ public class SplitPDFController {
"This endpoint splits a given PDF file into separate documents based on the specified page numbers or ranges. Users can specify pages using individual numbers, ranges, or 'all' for every page. Input:PDF Output:PDF Type:SIMO")
public ResponseEntity<byte[]> splitPdf(@ModelAttribute PDFWithPageNums request)
throws IOException {
MultipartFile file = request.getFileInput();
String pages = request.getPageNumbers();
// open the pdf document
PDDocument document = Loader.loadPDF(file.getBytes());
// PdfMetadata metadata = PdfMetadataService.extractMetadataFromPdf(document);
int totalPages = document.getNumberOfPages();
List<Integer> pageNumbers = request.getPageNumbersList(document, false);
if (!pageNumbers.contains(totalPages - 1)) {
// Create a mutable ArrayList so we can add to it
pageNumbers = new ArrayList<>(pageNumbers);
pageNumbers.add(totalPages - 1);
}
logger.info(
"Splitting PDF into pages: {}",
pageNumbers.stream().map(String::valueOf).collect(Collectors.joining(",")));
// split the document
PDDocument document = null;
Path zipFile = null;
List<ByteArrayOutputStream> splitDocumentsBoas = new ArrayList<>();
int previousPageNumber = 0;
for (int splitPoint : pageNumbers) {
try (PDDocument splitDocument =
pdfDocumentFactory.createNewDocumentBasedOnOldDocument(document)) {
for (int i = previousPageNumber; i <= splitPoint; i++) {
PDPage page = document.getPage(i);
splitDocument.addPage(page);
logger.info("Adding page {} to split document", i);
try {
MultipartFile file = request.getFileInput();
String pages = request.getPageNumbers();
// open the pdf document
document = Loader.loadPDF(file.getBytes());
// PdfMetadata metadata = PdfMetadataService.extractMetadataFromPdf(document);
int totalPages = document.getNumberOfPages();
List<Integer> pageNumbers = request.getPageNumbersList(document, false);
if (!pageNumbers.contains(totalPages - 1)) {
// Create a mutable ArrayList so we can add to it
pageNumbers = new ArrayList<>(pageNumbers);
pageNumbers.add(totalPages - 1);
}
logger.info(
"Splitting PDF into pages: {}",
pageNumbers.stream().map(String::valueOf).collect(Collectors.joining(",")));
// split the document
splitDocumentsBoas = new ArrayList<>();
int previousPageNumber = 0;
for (int splitPoint : pageNumbers) {
try (PDDocument splitDocument =
pdfDocumentFactory.createNewDocumentBasedOnOldDocument(document)) {
for (int i = previousPageNumber; i <= splitPoint; i++) {
PDPage page = document.getPage(i);
splitDocument.addPage(page);
logger.info("Adding page {} to split document", i);
}
previousPageNumber = splitPoint + 1;
// Transfer metadata to split pdf
// PdfMetadataService.setMetadataToPdf(splitDocument, metadata);
ByteArrayOutputStream baos = new ByteArrayOutputStream();
splitDocument.save(baos);
splitDocumentsBoas.add(baos);
} catch (Exception e) {
logger.error("Failed splitting documents and saving them", e);
throw e;
}
previousPageNumber = splitPoint + 1;
}
// Transfer metadata to split pdf
// PdfMetadataService.setMetadataToPdf(splitDocument, metadata);
// closing the original document
document.close();
ByteArrayOutputStream baos = new ByteArrayOutputStream();
splitDocument.save(baos);
zipFile = Files.createTempFile("split_documents", ".zip");
splitDocumentsBoas.add(baos);
String filename =
Filenames.toSimpleFileName(file.getOriginalFilename())
.replaceFirst("[.][^.]+$", "");
try (ZipOutputStream zipOut = new ZipOutputStream(Files.newOutputStream(zipFile))) {
// loop through the split documents and write them to the zip file
for (int i = 0; i < splitDocumentsBoas.size(); i++) {
String fileName = filename + "_" + (i + 1) + ".pdf";
ByteArrayOutputStream baos = splitDocumentsBoas.get(i);
byte[] pdf = baos.toByteArray();
// Add PDF file to the zip
ZipEntry pdfEntry = new ZipEntry(fileName);
zipOut.putNextEntry(pdfEntry);
zipOut.write(pdf);
zipOut.closeEntry();
logger.info("Wrote split document {} to zip file", fileName);
}
} catch (Exception e) {
logger.error("Failed splitting documents and saving them", e);
logger.error("Failed writing to zip", e);
throw e;
}
}
// closing the original document
document.close();
logger.info(
"Successfully created zip file with split documents: {}", zipFile.toString());
byte[] data = Files.readAllBytes(zipFile);
Files.deleteIfExists(zipFile);
Path zipFile = Files.createTempFile("split_documents", ".zip");
// return the Resource in the response
return WebResponseUtils.bytesToWebResponse(
data, filename + ".zip", MediaType.APPLICATION_OCTET_STREAM);
String filename =
Filenames.toSimpleFileName(file.getOriginalFilename())
.replaceFirst("[.][^.]+$", "");
try (ZipOutputStream zipOut = new ZipOutputStream(Files.newOutputStream(zipFile))) {
// loop through the split documents and write them to the zip file
for (int i = 0; i < splitDocumentsBoas.size(); i++) {
String fileName = filename + "_" + (i + 1) + ".pdf";
ByteArrayOutputStream baos = splitDocumentsBoas.get(i);
byte[] pdf = baos.toByteArray();
} finally {
try {
// Close the main document
if (document != null) {
document.close();
}
// Add PDF file to the zip
ZipEntry pdfEntry = new ZipEntry(fileName);
zipOut.putNextEntry(pdfEntry);
zipOut.write(pdf);
zipOut.closeEntry();
// Close all ByteArrayOutputStreams
for (ByteArrayOutputStream baos : splitDocumentsBoas) {
if (baos != null) {
baos.close();
}
}
logger.info("Wrote split document {} to zip file", fileName);
// Delete temporary zip file
if (zipFile != null) {
Files.deleteIfExists(zipFile);
}
} catch (Exception e) {
logger.error("Error while cleaning up resources", e);
}
} catch (Exception e) {
logger.error("Failed writing to zip", e);
throw e;
}
logger.info("Successfully created zip file with split documents: {}", zipFile.toString());
byte[] data = Files.readAllBytes(zipFile);
Files.deleteIfExists(zipFile);
// return the Resource in the response
return WebResponseUtils.bytesToWebResponse(
data, filename + ".zip", MediaType.APPLICATION_OCTET_STREAM);
}
}

View File

@ -59,70 +59,86 @@ public class SplitPdfByChaptersController {
/*
 * Splits the uploaded PDF into one document per bookmark (chapter) and returns them as a ZIP.
 *
 * Responds with 400 when the bookmark level is missing or negative, or when the document has
 * no outline, and with 500 when the outline items cannot be extracted. Otherwise the bookmarks
 * are optionally merged per page (when duplicates are not allowed), the source is split by
 * getSplitDocumentsBoas, zipped by createZipFile, and the ZIP bytes are returned as
 * "<name>.zip".
 *
 * Cleanup (closing the source document, deleting the temporary ZIP) happens in finally on
 * every exit path, including the early error responses. It is best-effort: failures are logged
 * only, and each step runs independently so a failing close cannot leave the ZIP file behind.
 */
public ResponseEntity<byte[]> splitPdf(@ModelAttribute SplitPdfByChaptersRequest request)
        throws Exception {
    MultipartFile file = request.getFileInput();
    PDDocument sourceDocument = null;
    Path zipFile = null;

    try {
        boolean includeMetadata = request.getIncludeMetadata();
        Integer bookmarkLevel =
                request.getBookmarkLevel(); // levels start from 0 (top most bookmarks)
        // Null check first: unboxing a missing level would throw a NullPointerException
        if (bookmarkLevel == null || bookmarkLevel < 0) {
            return ResponseEntity.badRequest().body("Invalid bookmark level".getBytes());
        }
        sourceDocument = Loader.loadPDF(file.getBytes());

        PDDocumentOutline outline = sourceDocument.getDocumentCatalog().getDocumentOutline();

        if (outline == null) {
            logger.warn("No outline found for {}", file.getOriginalFilename());
            return ResponseEntity.badRequest().body("No outline found".getBytes());
        }
        List<Bookmark> bookmarks = new ArrayList<>();
        try {
            bookmarks =
                    extractOutlineItems(
                            sourceDocument,
                            outline.getFirstChild(),
                            bookmarks,
                            outline.getFirstChild().getNextSibling(),
                            0,
                            bookmarkLevel);
            // to handle last page edge case; an empty bookmark list (or an outline without
            // children above) ends up in the catch below and is reported as a 500
            bookmarks.get(bookmarks.size() - 1).setEndPage(sourceDocument.getNumberOfPages());
        } catch (Exception e) {
            logger.error("Unable to extract outline items", e);
            return ResponseEntity.internalServerError()
                    .body("Unable to extract outline items".getBytes());
        }

        boolean allowDuplicates = request.getAllowDuplicates();
        if (!allowDuplicates) {
            /*
            duplicates are generated when multiple bookmarks correspond to the same page,
            if the user doesn't want duplicates mergeBookmarksThatCorrespondToSamePage() method will merge the titles of all
            the bookmarks that correspond to the same page, and treat them as a single bookmark
            */
            bookmarks = mergeBookmarksThatCorrespondToSamePage(bookmarks);
        }
        for (Bookmark bookmark : bookmarks) {
            logger.info(
                    "{}::::{} to {}",
                    bookmark.getTitle(),
                    bookmark.getStartPage(),
                    bookmark.getEndPage());
        }
        List<ByteArrayOutputStream> splitDocumentsBoas =
                getSplitDocumentsBoas(sourceDocument, bookmarks, includeMetadata);

        zipFile = createZipFile(bookmarks, splitDocumentsBoas);

        byte[] data = Files.readAllBytes(zipFile);

        String filename =
                Filenames.toSimpleFileName(file.getOriginalFilename())
                        .replaceFirst("[.][^.]+$", "");
        // The source document and the temporary zip are released in finally
        return WebResponseUtils.bytesToWebResponse(
                data, filename + ".zip", MediaType.APPLICATION_OCTET_STREAM);
    } finally {
        if (sourceDocument != null) {
            try {
                sourceDocument.close();
            } catch (Exception e) {
                logger.error("Error while cleaning up resources", e);
            }
        }
        // Runs even when the close above failed, so the temporary file is never leaked
        if (zipFile != null) {
            try {
                Files.deleteIfExists(zipFile);
            } catch (Exception e) {
                logger.error("Error while cleaning up resources", e);
            }
        }
    }
}
private List<Bookmark> mergeBookmarksThatCorrespondToSamePage(List<Bookmark> bookmarks) {

View File

@ -105,15 +105,13 @@ public class SplitPdfBySectionsController {
if (sectionNum == horiz * verti) pageNum++;
}
} catch (Exception e) {
logger.error("exception", e);
} finally {
data = Files.readAllBytes(zipFile);
return WebResponseUtils.bytesToWebResponse(
data, filename + "_split.zip", MediaType.APPLICATION_OCTET_STREAM);
} finally {
Files.deleteIfExists(zipFile);
}
return WebResponseUtils.bytesToWebResponse(
data, filename + "_split.zip", MediaType.APPLICATION_OCTET_STREAM);
}
public List<PDDocument> splitPdfPages(

View File

@ -65,112 +65,137 @@ public class ConvertImgPDFController {
String colorType = request.getColorType();
String dpi = request.getDpi();
byte[] pdfBytes = file.getBytes();
ImageType colorTypeResult = ImageType.RGB;
if ("greyscale".equals(colorType)) {
colorTypeResult = ImageType.GRAY;
} else if ("blackwhite".equals(colorType)) {
colorTypeResult = ImageType.BINARY;
}
// returns bytes for image
boolean singleImage = "single".equals(singleOrMultiple);
Path tempFile = null;
Path tempOutputDir = null;
Path tempPdfPath = null;
byte[] result = null;
String filename =
Filenames.toSimpleFileName(file.getOriginalFilename())
.replaceFirst("[.][^.]+$", "");
result =
PdfUtils.convertFromPdf(
pdfBytes,
"webp".equalsIgnoreCase(imageFormat) ? "png" : imageFormat.toUpperCase(),
colorTypeResult,
singleImage,
Integer.valueOf(dpi),
filename);
if (result == null || result.length == 0) {
logger.error("resultant bytes for {} is null, error converting ", filename);
}
if ("webp".equalsIgnoreCase(imageFormat) && !CheckProgramInstall.isPythonAvailable()) {
throw new IOException("Python is not installed. Required for WebP conversion.");
} else if ("webp".equalsIgnoreCase(imageFormat)
&& CheckProgramInstall.isPythonAvailable()) {
// Write the output stream to a temp file
Path tempFile = Files.createTempFile("temp_png", ".png");
try (FileOutputStream fos = new FileOutputStream(tempFile.toFile())) {
fos.write(result);
fos.flush();
try {
byte[] pdfBytes = file.getBytes();
ImageType colorTypeResult = ImageType.RGB;
if ("greyscale".equals(colorType)) {
colorTypeResult = ImageType.GRAY;
} else if ("blackwhite".equals(colorType)) {
colorTypeResult = ImageType.BINARY;
}
// returns bytes for image
boolean singleImage = "single".equals(singleOrMultiple);
String filename =
Filenames.toSimpleFileName(file.getOriginalFilename())
.replaceFirst("[.][^.]+$", "");
String pythonVersion = CheckProgramInstall.getAvailablePythonCommand();
List<String> command = new ArrayList<>();
command.add(pythonVersion);
command.add("./scripts/png_to_webp.py"); // Python script to handle the conversion
// Create a temporary directory for the output WebP files
Path tempOutputDir = Files.createTempDirectory("webp_output");
if (singleImage) {
// Run the Python script to convert PNG to WebP
command.add(tempFile.toString());
command.add(tempOutputDir.toString());
command.add("--single");
} else {
// Save the uploaded PDF to a temporary file
Path tempPdfPath = Files.createTempFile("temp_pdf", ".pdf");
file.transferTo(tempPdfPath.toFile());
// Run the Python script to convert PDF to WebP
command.add(tempPdfPath.toString());
command.add(tempOutputDir.toString());
result =
PdfUtils.convertFromPdf(
pdfBytes,
"webp".equalsIgnoreCase(imageFormat)
? "png"
: imageFormat.toUpperCase(),
colorTypeResult,
singleImage,
Integer.valueOf(dpi),
filename);
if (result == null || result.length == 0) {
logger.error("resultant bytes for {} is null, error converting ", filename);
}
command.add("--dpi");
command.add(dpi);
ProcessExecutorResult resultProcess =
ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV)
.runCommandWithOutputHandling(command);
// Find all WebP files in the output directory
List<Path> webpFiles =
Files.walk(tempOutputDir)
.filter(path -> path.toString().endsWith(".webp"))
.collect(Collectors.toList());
if (webpFiles.isEmpty()) {
logger.error("No WebP files were created in: {}", tempOutputDir.toString());
throw new IOException("No WebP files were created. " + resultProcess.getMessages());
}
byte[] bodyBytes = new byte[0];
if (webpFiles.size() == 1) {
// Return the single WebP file directly
Path webpFilePath = webpFiles.get(0);
bodyBytes = Files.readAllBytes(webpFilePath);
} else {
// Create a ZIP file containing all WebP images
ByteArrayOutputStream zipOutputStream = new ByteArrayOutputStream();
try (ZipOutputStream zos = new ZipOutputStream(zipOutputStream)) {
for (Path webpFile : webpFiles) {
zos.putNextEntry(new ZipEntry(webpFile.getFileName().toString()));
Files.copy(webpFile, zos);
zos.closeEntry();
}
if ("webp".equalsIgnoreCase(imageFormat) && !CheckProgramInstall.isPythonAvailable()) {
throw new IOException("Python is not installed. Required for WebP conversion.");
} else if ("webp".equalsIgnoreCase(imageFormat)
&& CheckProgramInstall.isPythonAvailable()) {
// Write the output stream to a temp file
tempFile = Files.createTempFile("temp_png", ".png");
try (FileOutputStream fos = new FileOutputStream(tempFile.toFile())) {
fos.write(result);
fos.flush();
}
bodyBytes = zipOutputStream.toByteArray();
}
// Clean up the temporary files
Files.deleteIfExists(tempFile);
if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile());
result = bodyBytes;
}
if (singleImage) {
String docName = filename + "." + imageFormat;
MediaType mediaType = MediaType.parseMediaType(getMediaType(imageFormat));
return WebResponseUtils.bytesToWebResponse(result, docName, mediaType);
} else {
String zipFilename = filename + "_convertedToImages.zip";
return WebResponseUtils.bytesToWebResponse(
result, zipFilename, MediaType.APPLICATION_OCTET_STREAM);
String pythonVersion = CheckProgramInstall.getAvailablePythonCommand();
List<String> command = new ArrayList<>();
command.add(pythonVersion);
command.add("./scripts/png_to_webp.py"); // Python script to handle the conversion
// Create a temporary directory for the output WebP files
tempOutputDir = Files.createTempDirectory("webp_output");
if (singleImage) {
// Run the Python script to convert PNG to WebP
command.add(tempFile.toString());
command.add(tempOutputDir.toString());
command.add("--single");
} else {
// Save the uploaded PDF to a temporary file
tempPdfPath = Files.createTempFile("temp_pdf", ".pdf");
file.transferTo(tempPdfPath.toFile());
// Run the Python script to convert PDF to WebP
command.add(tempPdfPath.toString());
command.add(tempOutputDir.toString());
}
command.add("--dpi");
command.add(dpi);
ProcessExecutorResult resultProcess =
ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV)
.runCommandWithOutputHandling(command);
// Find all WebP files in the output directory
List<Path> webpFiles =
Files.walk(tempOutputDir)
.filter(path -> path.toString().endsWith(".webp"))
.collect(Collectors.toList());
if (webpFiles.isEmpty()) {
logger.error("No WebP files were created in: {}", tempOutputDir.toString());
throw new IOException(
"No WebP files were created. " + resultProcess.getMessages());
}
byte[] bodyBytes = new byte[0];
if (webpFiles.size() == 1) {
// Return the single WebP file directly
Path webpFilePath = webpFiles.get(0);
bodyBytes = Files.readAllBytes(webpFilePath);
} else {
// Create a ZIP file containing all WebP images
ByteArrayOutputStream zipOutputStream = new ByteArrayOutputStream();
try (ZipOutputStream zos = new ZipOutputStream(zipOutputStream)) {
for (Path webpFile : webpFiles) {
zos.putNextEntry(new ZipEntry(webpFile.getFileName().toString()));
Files.copy(webpFile, zos);
zos.closeEntry();
}
}
bodyBytes = zipOutputStream.toByteArray();
}
// Clean up the temporary files
Files.deleteIfExists(tempFile);
if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile());
result = bodyBytes;
}
if (singleImage) {
String docName = filename + "." + imageFormat;
MediaType mediaType = MediaType.parseMediaType(getMediaType(imageFormat));
return WebResponseUtils.bytesToWebResponse(result, docName, mediaType);
} else {
String zipFilename = filename + "_convertedToImages.zip";
return WebResponseUtils.bytesToWebResponse(
result, zipFilename, MediaType.APPLICATION_OCTET_STREAM);
}
} finally {
try {
// Clean up temporary files
if (tempFile != null) {
Files.deleteIfExists(tempFile);
}
if (tempPdfPath != null) {
Files.deleteIfExists(tempPdfPath);
}
if (tempOutputDir != null) {
FileUtils.deleteDirectory(tempOutputDir.toFile());
}
} catch (Exception e) {
logger.error("Error cleaning up temporary files", e);
}
}
}

View File

@ -87,7 +87,7 @@ public class OCRController {
Files.createDirectories(tempOutputDir);
Files.createDirectories(tempImagesDir);
Process process = null;
try {
// Save input file
inputFile.transferTo(tempInputFile.toFile());
@ -139,7 +139,7 @@ public class OCRController {
command.add("pdf"); // Always output PDF
ProcessBuilder pb = new ProcessBuilder(command);
Process process = pb.start();
process = pb.start();
// Capture any error output
try (BufferedReader reader =
@ -188,6 +188,10 @@ public class OCRController {
.body(pdfContent);
} finally {
if (process != null) {
process.destroy();
}
// Clean up temporary files
deleteDirectory(tempDir);
}