diff --git a/app/common/src/main/java/stirling/software/common/util/PDFToFile.java b/app/common/src/main/java/stirling/software/common/util/PDFToFile.java index 6b3d61b11..32f2cc874 100644 --- a/app/common/src/main/java/stirling/software/common/util/PDFToFile.java +++ b/app/common/src/main/java/stirling/software/common/util/PDFToFile.java @@ -25,15 +25,19 @@ import com.vladsch.flexmark.util.data.MutableDataSet; import io.github.pixee.security.Filenames; -import lombok.NoArgsConstructor; import lombok.extern.slf4j.Slf4j; import stirling.software.common.util.ProcessExecutor.ProcessExecutorResult; @Slf4j -@NoArgsConstructor public class PDFToFile { + private final TempFileManager tempFileManager; + + public PDFToFile(TempFileManager tempFileManager) { + this.tempFileManager = tempFileManager; + } + public ResponseEntity processPdfToMarkdown(MultipartFile inputFile) throws IOException, InterruptedException { if (!MediaType.APPLICATION_PDF_VALUE.equals(inputFile.getContentType())) { @@ -71,15 +75,12 @@ public class PDFToFile { pdfBaseName = originalPdfFileName.substring(0, originalPdfFileName.lastIndexOf('.')); } - Path tempInputFile = null; - Path tempOutputDir = null; byte[] fileBytes; String fileName; - try { - tempInputFile = Files.createTempFile("input_", ".pdf"); - inputFile.transferTo(tempInputFile); - tempOutputDir = Files.createTempDirectory("output_"); + try (TempFile tempInputFile = new TempFile(tempFileManager, ".pdf"); + TempDirectory tempOutputDir = new TempDirectory(tempFileManager)) { + inputFile.transferTo(tempInputFile.getFile()); List command = new ArrayList<>( @@ -88,14 +89,16 @@ public class PDFToFile { "-s", "-noframes", "-c", - tempInputFile.toString(), + tempInputFile.getAbsolutePath(), pdfBaseName)); ProcessExecutorResult returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.PDFTOHTML) - .runCommandWithOutputHandling(command, tempOutputDir.toFile()); + .runCommandWithOutputHandling( + command, tempOutputDir.getPath().toFile()); // Process HTML files to Markdown - File[] outputFiles = Objects.requireNonNull(tempOutputDir.toFile().listFiles()); + File[] outputFiles = + Objects.requireNonNull(tempOutputDir.getPath().toFile().listFiles()); List markdownFiles = new ArrayList<>(); // Convert HTML files to Markdown @@ -105,7 +108,7 @@ public class PDFToFile { String markdown = htmlToMarkdownConverter.convert(html); String mdFileName = outputFile.getName().replace(".html", ".md"); - File mdFile = new File(tempOutputDir.toFile(), mdFileName); + File mdFile = new File(tempOutputDir.getPath().toFile(), mdFileName); Files.writeString(mdFile.toPath(), markdown); markdownFiles.add(mdFile); } @@ -142,10 +145,6 @@ public class PDFToFile { fileBytes = byteArrayOutputStream.toByteArray(); } - - } finally { - if (tempInputFile != null) Files.deleteIfExists(tempInputFile); - if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile()); } return WebResponseUtils.bytesToWebResponse( fileBytes, fileName, MediaType.APPLICATION_OCTET_STREAM); @@ -164,18 +163,17 @@ public class PDFToFile { pdfBaseName = originalPdfFileName.substring(0, originalPdfFileName.lastIndexOf('.')); } - Path tempInputFile = null; - Path tempOutputDir = null; byte[] fileBytes; String fileName; - try { - // Save the uploaded file to a temporary location - tempInputFile = Files.createTempFile("input_", ".pdf"); - inputFile.transferTo(tempInputFile); + try (TempFile inputFileTemp = new TempFile(tempFileManager, ".pdf"); + TempDirectory outputDirTemp = new TempDirectory(tempFileManager)) { - // Prepare the output directory - tempOutputDir = Files.createTempDirectory("output_"); + Path tempInputFile = inputFileTemp.getPath(); + Path tempOutputDir = outputDirTemp.getPath(); + + // Save the uploaded file to a temporary location + inputFile.transferTo(tempInputFile); // Run the pdftohtml command with complex output List command = @@ -208,11 +206,6 @@ public class PDFToFile { log.error("Exception writing zip", e); } fileBytes = byteArrayOutputStream.toByteArray(); - - } finally { - // Clean up the temporary files - if (tempInputFile != null) Files.deleteIfExists(tempInputFile); - if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile()); } return WebResponseUtils.bytesToWebResponse( @@ -245,18 +238,17 @@ public class PDFToFile { return new ResponseEntity<>(HttpStatus.BAD_REQUEST); } - Path tempInputFile = null; - Path tempOutputDir = null; byte[] fileBytes; String fileName; - try { - // Save the uploaded file to a temporary location - tempInputFile = Files.createTempFile("input_", ".pdf"); - inputFile.transferTo(tempInputFile); + try (TempFile inputFileTemp = new TempFile(tempFileManager, ".pdf"); + TempDirectory outputDirTemp = new TempDirectory(tempFileManager)) { - // Prepare the output directory - tempOutputDir = Files.createTempDirectory("output_"); + Path tempInputFile = inputFileTemp.getPath(); + Path tempOutputDir = outputDirTemp.getPath(); + + // Save the uploaded file to a temporary location + inputFile.transferTo(tempInputFile); // Run the LibreOffice command List command = @@ -308,11 +300,6 @@ public class PDFToFile { fileBytes = byteArrayOutputStream.toByteArray(); } - - } finally { - // Clean up the temporary files - Files.deleteIfExists(tempInputFile); - if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile()); } return WebResponseUtils.bytesToWebResponse( fileBytes, fileName, MediaType.APPLICATION_OCTET_STREAM); diff --git a/app/common/src/main/java/stirling/software/common/util/misc/ColorSpaceConversionStrategy.java b/app/common/src/main/java/stirling/software/common/util/misc/ColorSpaceConversionStrategy.java index c784bbed6..ca4970b71 100644 --- a/app/common/src/main/java/stirling/software/common/util/misc/ColorSpaceConversionStrategy.java +++ b/app/common/src/main/java/stirling/software/common/util/misc/ColorSpaceConversionStrategy.java @@ -15,22 +15,29 @@ import lombok.extern.slf4j.Slf4j; import stirling.software.common.model.api.misc.ReplaceAndInvert; import stirling.software.common.util.ProcessExecutor; import stirling.software.common.util.ProcessExecutor.ProcessExecutorResult; +import stirling.software.common.util.TempFile; +import stirling.software.common.util.TempFileManager; @Slf4j public class ColorSpaceConversionStrategy extends ReplaceAndInvertColorStrategy { - public ColorSpaceConversionStrategy(MultipartFile file, ReplaceAndInvert replaceAndInvert) { + private final TempFileManager tempFileManager; + + public ColorSpaceConversionStrategy( + MultipartFile file, + ReplaceAndInvert replaceAndInvert, + TempFileManager tempFileManager) { super(file, replaceAndInvert); + this.tempFileManager = tempFileManager; } @Override public InputStreamResource replace() throws IOException { - Path tempInputFile = null; - Path tempOutputFile = null; + try (TempFile tempInput = new TempFile(tempFileManager, ".pdf"); + TempFile tempOutput = new TempFile(tempFileManager, ".pdf")) { - try { - tempInputFile = Files.createTempFile("colorspace_input_", ".pdf"); - tempOutputFile = Files.createTempFile("colorspace_output_", ".pdf"); + Path tempInputFile = tempInput.getPath(); + Path tempOutputFile = tempOutput.getPath(); Files.write(tempInputFile, getFileInput().getBytes()); @@ -74,21 +81,6 @@ public class ColorSpaceConversionStrategy extends ReplaceAndInvertColorStrategy log.warn("CMYK color space conversion failed", e); throw new IOException( "Failed to convert PDF to CMYK color space: " + e.getMessage(), e); - } finally { - if (tempInputFile != null) { - try { - Files.deleteIfExists(tempInputFile); - } catch (IOException e) { - log.warn("Failed to delete temporary input file: {}", tempInputFile, e); - } - } - if (tempOutputFile != null) { - try { - Files.deleteIfExists(tempOutputFile); - } catch (IOException e) { - log.warn("Failed to delete temporary output file: {}", tempOutputFile, e); - } - } } } } diff --git a/app/common/src/test/java/stirling/software/common/util/PDFToFileTest.java b/app/common/src/test/java/stirling/software/common/util/PDFToFileTest.java index 9a178a400..2ebb58c0d 100644 --- a/app/common/src/test/java/stirling/software/common/util/PDFToFileTest.java +++ b/app/common/src/test/java/stirling/software/common/util/PDFToFileTest.java @@ -5,7 +5,9 @@ import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.anyList; +import static org.mockito.ArgumentMatchers.anyString; import static org.mockito.ArgumentMatchers.argThat; +import static org.mockito.Mockito.lenient; import static org.mockito.Mockito.mockStatic; import static org.mockito.Mockito.when; @@ -47,10 +49,21 @@ class PDFToFileTest { @Mock private ProcessExecutor mockProcessExecutor; @Mock private ProcessExecutorResult mockExecutorResult; + @Mock private TempFileManager mockTempFileManager; @BeforeEach - void setUp() { - pdfToFile = new PDFToFile(); + void setUp() throws IOException { + // Mock the TempFileManager to return real temp files + lenient() + .when(mockTempFileManager.createTempFile(anyString())) + .thenAnswer( + invocation -> + Files.createTempFile("test", invocation.getArgument(0)).toFile()); + lenient() + .when(mockTempFileManager.createTempDirectory()) + .thenAnswer(invocation -> Files.createTempDirectory("test")); + + pdfToFile = new PDFToFile(mockTempFileManager); } @Test diff --git a/app/core/src/main/java/stirling/software/SPDF/Factories/ReplaceAndInvertColorFactory.java b/app/core/src/main/java/stirling/software/SPDF/Factories/ReplaceAndInvertColorFactory.java index e53850ff8..6697beb79 100644 --- a/app/core/src/main/java/stirling/software/SPDF/Factories/ReplaceAndInvertColorFactory.java +++ b/app/core/src/main/java/stirling/software/SPDF/Factories/ReplaceAndInvertColorFactory.java @@ -3,16 +3,22 @@ package stirling.software.SPDF.Factories; import org.springframework.stereotype.Component; import org.springframework.web.multipart.MultipartFile; +import lombok.RequiredArgsConstructor; + import stirling.software.common.model.api.misc.HighContrastColorCombination; import stirling.software.common.model.api.misc.ReplaceAndInvert; +import stirling.software.common.util.TempFileManager; import stirling.software.common.util.misc.ColorSpaceConversionStrategy; import stirling.software.common.util.misc.CustomColorReplaceStrategy; import stirling.software.common.util.misc.InvertFullColorStrategy; import stirling.software.common.util.misc.ReplaceAndInvertColorStrategy; @Component +@RequiredArgsConstructor public class ReplaceAndInvertColorFactory { + private final TempFileManager tempFileManager; + public ReplaceAndInvertColorStrategy replaceAndInvert( MultipartFile file, ReplaceAndInvert replaceAndInvertOption, @@ -30,7 +36,7 @@ public class ReplaceAndInvertColorFactory { highContrastColorCombination); case FULL_INVERSION -> new InvertFullColorStrategy(file, replaceAndInvertOption); case COLOR_SPACE_CONVERSION -> - new ColorSpaceConversionStrategy(file, replaceAndInvertOption); + new ColorSpaceConversionStrategy(file, replaceAndInvertOption, tempFileManager); }; } } diff --git a/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToHtml.java b/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToHtml.java index ed8f30458..76414ca57 100644 --- a/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToHtml.java +++ b/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToHtml.java @@ -11,14 +11,20 @@ import org.springframework.web.multipart.MultipartFile; import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.tags.Tag; +import lombok.RequiredArgsConstructor; + import stirling.software.common.model.api.PDFFile; import stirling.software.common.util.PDFToFile; +import stirling.software.common.util.TempFileManager; @RestController @Tag(name = "Convert", description = "Convert APIs") @RequestMapping("/api/v1/convert") +@RequiredArgsConstructor public class ConvertPDFToHtml { + private final TempFileManager tempFileManager; + @PostMapping(consumes = MediaType.MULTIPART_FORM_DATA_VALUE, value = "/pdf/html") @Operation( summary = "Convert PDF to HTML", @@ -26,7 +32,7 @@ public class ConvertPDFToHtml { "This endpoint converts a PDF file to HTML format. Input:PDF Output:HTML Type:SISO") public ResponseEntity processPdfToHTML(@ModelAttribute PDFFile file) throws Exception { MultipartFile inputFile = file.getFileInput(); - PDFToFile pdfToFile = new PDFToFile(); + PDFToFile pdfToFile = new PDFToFile(tempFileManager); return pdfToFile.processPdfToHtml(inputFile); } } diff --git a/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToOffice.java b/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToOffice.java index ac72fb926..d9538de58 100644 --- a/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToOffice.java +++ b/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToOffice.java @@ -24,6 +24,7 @@ import stirling.software.common.model.api.PDFFile; import stirling.software.common.service.CustomPDFDocumentFactory; import stirling.software.common.util.GeneralUtils; import stirling.software.common.util.PDFToFile; +import stirling.software.common.util.TempFileManager; import stirling.software.common.util.WebResponseUtils; @RestController @@ -33,6 +34,7 @@ import stirling.software.common.util.WebResponseUtils; public class ConvertPDFToOffice { private final CustomPDFDocumentFactory pdfDocumentFactory; + private final TempFileManager tempFileManager; @PostMapping(consumes = MediaType.MULTIPART_FORM_DATA_VALUE, value = "/pdf/presentation") @Operation( @@ -45,7 +47,7 @@ public class ConvertPDFToOffice { throws IOException, InterruptedException { MultipartFile inputFile = request.getFileInput(); String outputFormat = request.getOutputFormat(); - PDFToFile pdfToFile = new PDFToFile(); + PDFToFile pdfToFile = new PDFToFile(tempFileManager); return pdfToFile.processPdfToOfficeFormat(inputFile, outputFormat, "impress_pdf_import"); } @@ -70,7 +72,7 @@ public class ConvertPDFToOffice { MediaType.TEXT_PLAIN); } } else { - PDFToFile pdfToFile = new PDFToFile(); + PDFToFile pdfToFile = new PDFToFile(tempFileManager); return pdfToFile.processPdfToOfficeFormat(inputFile, outputFormat, "writer_pdf_import"); } } @@ -85,7 +87,7 @@ public class ConvertPDFToOffice { throws IOException, InterruptedException { MultipartFile inputFile = request.getFileInput(); String outputFormat = request.getOutputFormat(); - PDFToFile pdfToFile = new PDFToFile(); + PDFToFile pdfToFile = new PDFToFile(tempFileManager); return pdfToFile.processPdfToOfficeFormat(inputFile, outputFormat, "writer_pdf_import"); } @@ -98,7 +100,7 @@ public class ConvertPDFToOffice { public ResponseEntity processPdfToXML(@ModelAttribute PDFFile file) throws Exception { MultipartFile inputFile = file.getFileInput(); - PDFToFile pdfToFile = new PDFToFile(); + PDFToFile pdfToFile = new PDFToFile(tempFileManager); return pdfToFile.processPdfToOfficeFormat(inputFile, "xml", "writer_pdf_import"); } } diff --git a/app/core/src/main/java/stirling/software/SPDF/controller/api/misc/OCRController.java b/app/core/src/main/java/stirling/software/SPDF/controller/api/misc/OCRController.java index 0338a76c4..79b65ca33 100644 --- a/app/core/src/main/java/stirling/software/SPDF/controller/api/misc/OCRController.java +++ b/app/core/src/main/java/stirling/software/SPDF/controller/api/misc/OCRController.java @@ -116,101 +116,82 @@ public class OCRController { // Use try-with-resources for proper temp file management try (TempFile tempInputFile = new TempFile(tempFileManager, ".pdf"); - TempFile tempOutputFile = new TempFile(tempFileManager, ".pdf")) { + TempFile tempOutputFile = new TempFile(tempFileManager, ".pdf"); + TempFile sidecarTextFile = sidecar ? new TempFile(tempFileManager, ".txt") : null) { inputFile.transferTo(tempInputFile.getFile()); - TempFile sidecarTextFile = null; + // Use OCRmyPDF if available (no fallback - error if it fails) + if (isOcrMyPdfEnabled()) { + processWithOcrMyPdf( + selectedLanguages, + sidecar, + deskew, + clean, + cleanFinal, + ocrType, + ocrRenderType, + removeImagesAfter, + tempInputFile.getPath(), + tempOutputFile.getPath(), + sidecarTextFile != null ? sidecarTextFile.getPath() : null); + log.info("OCRmyPDF processing completed successfully"); + } + // Use Tesseract only if OCRmyPDF is not available + else if (isTesseractEnabled()) { + processWithTesseract( + selectedLanguages, + ocrType, + tempInputFile.getPath(), + tempOutputFile.getPath()); + log.info("Tesseract processing completed successfully"); + } else { + throw ExceptionUtils.createOcrToolsUnavailableException(); + } - try { - // Use OCRmyPDF if available (no fallback - error if it fails) - if (isOcrMyPdfEnabled()) { - if (sidecar) { - sidecarTextFile = new TempFile(tempFileManager, ".txt"); - } + // Read the processed PDF file + byte[] pdfBytes = Files.readAllBytes(tempOutputFile.getPath()); - processWithOcrMyPdf( - selectedLanguages, - sidecar, - deskew, - clean, - cleanFinal, - ocrType, - ocrRenderType, - removeImagesAfter, - tempInputFile.getPath(), - tempOutputFile.getPath(), - sidecarTextFile != null ? sidecarTextFile.getPath() : null); - log.info("OCRmyPDF processing completed successfully"); - } - // Use Tesseract only if OCRmyPDF is not available - else if (isTesseractEnabled()) { - processWithTesseract( - selectedLanguages, - ocrType, - tempInputFile.getPath(), - tempOutputFile.getPath()); - log.info("Tesseract processing completed successfully"); - } else { - throw ExceptionUtils.createOcrToolsUnavailableException(); - } + // Return the OCR processed PDF as a response + String outputFilename = + GeneralUtils.removeExtension( + Filenames.toSimpleFileName(inputFile.getOriginalFilename())) + + "_OCR.pdf"; - // Read the processed PDF file - byte[] pdfBytes = Files.readAllBytes(tempOutputFile.getPath()); - - // Return the OCR processed PDF as a response - String outputFilename = + if (sidecar && sidecarTextFile != null) { + // Create a zip file containing both the PDF and the text file + String outputZipFilename = GeneralUtils.removeExtension( Filenames.toSimpleFileName(inputFile.getOriginalFilename())) - + "_OCR.pdf"; + + "_OCR.zip"; - if (sidecar && sidecarTextFile != null) { - // Create a zip file containing both the PDF and the text file - String outputZipFilename = - GeneralUtils.removeExtension( - Filenames.toSimpleFileName( - inputFile.getOriginalFilename())) - + "_OCR.zip"; + try (TempFile tempZipFile = new TempFile(tempFileManager, ".zip"); + ZipOutputStream zipOut = + new ZipOutputStream(Files.newOutputStream(tempZipFile.getPath()))) { - try (TempFile tempZipFile = new TempFile(tempFileManager, ".zip"); - ZipOutputStream zipOut = - new ZipOutputStream( - Files.newOutputStream(tempZipFile.getPath()))) { + // Add PDF file to the zip + ZipEntry pdfEntry = new ZipEntry(outputFilename); + zipOut.putNextEntry(pdfEntry); + zipOut.write(pdfBytes); + zipOut.closeEntry(); - // Add PDF file to the zip - ZipEntry pdfEntry = new ZipEntry(outputFilename); - zipOut.putNextEntry(pdfEntry); - zipOut.write(pdfBytes); - zipOut.closeEntry(); + // Add text file to the zip + ZipEntry txtEntry = new ZipEntry(outputFilename.replace(".pdf", ".txt")); + zipOut.putNextEntry(txtEntry); + Files.copy(sidecarTextFile.getPath(), zipOut); + zipOut.closeEntry(); - // Add text file to the zip - ZipEntry txtEntry = new ZipEntry(outputFilename.replace(".pdf", ".txt")); - zipOut.putNextEntry(txtEntry); - Files.copy(sidecarTextFile.getPath(), zipOut); - zipOut.closeEntry(); + zipOut.finish(); - zipOut.finish(); + byte[] zipBytes = Files.readAllBytes(tempZipFile.getPath()); - byte[] zipBytes = Files.readAllBytes(tempZipFile.getPath()); - - // Return the zip file containing both the PDF and the text file - return WebResponseUtils.bytesToWebResponse( - zipBytes, outputZipFilename, MediaType.APPLICATION_OCTET_STREAM); - } - } else { - // Return the OCR processed PDF as a response - return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename); - } - - } finally { - // Clean up sidecar temp file if created - if (sidecarTextFile != null) { - try { - sidecarTextFile.close(); - } catch (Exception e) { - log.warn("Failed to close sidecar temp file", e); - } + // Return the zip file containing both the PDF and the text file + return WebResponseUtils.bytesToWebResponse( + zipBytes, outputZipFilename, MediaType.APPLICATION_OCTET_STREAM); } + } else { + // Return the OCR processed PDF as a response + return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename); } } } diff --git a/app/core/src/main/java/stirling/software/SPDF/model/api/converters/ConvertPDFToMarkdown.java b/app/core/src/main/java/stirling/software/SPDF/model/api/converters/ConvertPDFToMarkdown.java index d5e4aa57a..c7a6bd7af 100644 --- a/app/core/src/main/java/stirling/software/SPDF/model/api/converters/ConvertPDFToMarkdown.java +++ b/app/core/src/main/java/stirling/software/SPDF/model/api/converters/ConvertPDFToMarkdown.java @@ -11,14 +11,20 @@ import org.springframework.web.multipart.MultipartFile; import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.tags.Tag; +import lombok.RequiredArgsConstructor; + import stirling.software.common.model.api.PDFFile; import stirling.software.common.util.PDFToFile; +import stirling.software.common.util.TempFileManager; @RestController @Tag(name = "Convert", description = "Convert APIs") @RequestMapping("/api/v1/convert") +@RequiredArgsConstructor public class ConvertPDFToMarkdown { + private final TempFileManager tempFileManager; + @PostMapping(consumes = MediaType.MULTIPART_FORM_DATA_VALUE, value = "/pdf/markdown") @Operation( summary = "Convert PDF to Markdown", @@ -27,7 +33,7 @@ public class ConvertPDFToMarkdown { public ResponseEntity processPdfToMarkdown(@ModelAttribute PDFFile file) throws Exception { MultipartFile inputFile = file.getFileInput(); - PDFToFile pdfToFile = new PDFToFile(); + PDFToFile pdfToFile = new PDFToFile(tempFileManager); return pdfToFile.processPdfToMarkdown(inputFile); } }