feat:centralise temp-file management & cleanup across app/docker (#3797)

# Description of Changes
Introduces TempFileManager, registry, and scheduled cleanup service;
aligns all Docker images and runtime scripts to use a dedicated
/tmp/stirling-pdf directory; updates controllers, utilities, and tests
to use the new API; adds configurable system.tempFileManagement section.

Closes #(issue_number)

---

## Checklist

### General

- [ ] I have read the [Contribution
Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md)
- [ ] I have read the [Stirling-PDF Developer
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md)
(if applicable)
- [ ] I have read the [How to add new languages to
Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md)
(if applicable)
- [ ] I have performed a self-review of my own code
- [ ] My changes generate no new warnings

### Documentation

- [ ] I have updated relevant docs on [Stirling-PDF's doc
repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/)
(if functionality has heavily changed)
- [ ] I have read the section [Add New Translation
Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md#add-new-translation-tags)
(for new translation tags only)

### UI Changes (if applicable)

- [ ] Screenshots or videos demonstrating the UI changes are attached
(e.g., as comments or direct attachments in the PR)

### Testing (if applicable)

- [ ] I have tested my changes locally. Refer to the [Testing
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md#6-testing)
for more details.

---------

Co-authored-by: a <a>
This commit is contained in:
Anthony Stirling
2025-06-25 18:32:28 +01:00
committed by GitHub
parent 32aa568196
commit bc9c127819
36 changed files with 2167 additions and 213 deletions

View File

@@ -24,6 +24,7 @@ import stirling.software.common.configuration.RuntimePathConfig;
import stirling.software.common.model.api.converters.EmlToPdfRequest;
import stirling.software.common.service.CustomPDFDocumentFactory;
import stirling.software.common.util.EmlToPdf;
import stirling.software.common.util.TempFileManager;
import stirling.software.common.util.WebResponseUtils;
@RestController
@@ -35,6 +36,7 @@ public class ConvertEmlToPDF {
private final CustomPDFDocumentFactory pdfDocumentFactory;
private final RuntimePathConfig runtimePathConfig;
private final TempFileManager tempFileManager;
@PostMapping(consumes = "multipart/form-data", value = "/eml/pdf")
@Operation(
@@ -102,7 +104,8 @@ public class ConvertEmlToPDF {
fileBytes,
originalFilename,
false,
pdfDocumentFactory);
pdfDocumentFactory,
tempFileManager);
if (pdfBytes == null || pdfBytes.length == 0) {
log.error("PDF conversion failed - empty output for {}", originalFilename);

View File

@@ -18,6 +18,7 @@ import stirling.software.common.model.ApplicationProperties;
import stirling.software.common.model.api.converters.HTMLToPdfRequest;
import stirling.software.common.service.CustomPDFDocumentFactory;
import stirling.software.common.util.FileToPdf;
import stirling.software.common.util.TempFileManager;
import stirling.software.common.util.WebResponseUtils;
@RestController
@@ -32,6 +33,8 @@ public class ConvertHtmlToPDF {
private final RuntimePathConfig runtimePathConfig;
private final TempFileManager tempFileManager;
@PostMapping(consumes = "multipart/form-data", value = "/html/pdf")
@Operation(
summary = "Convert an HTML or ZIP (containing HTML and CSS) to PDF",
@@ -62,7 +65,8 @@ public class ConvertHtmlToPDF {
request,
fileInput.getBytes(),
originalFilename,
disableSanitize);
disableSanitize,
tempFileManager);
pdfBytes = pdfDocumentFactory.createNewBytesBasedOnOldDocument(pdfBytes);

View File

@@ -28,6 +28,7 @@ import stirling.software.common.model.ApplicationProperties;
import stirling.software.common.model.api.GeneralFile;
import stirling.software.common.service.CustomPDFDocumentFactory;
import stirling.software.common.util.FileToPdf;
import stirling.software.common.util.TempFileManager;
import stirling.software.common.util.WebResponseUtils;
@RestController
@@ -41,6 +42,8 @@ public class ConvertMarkdownToPdf {
private final ApplicationProperties applicationProperties;
private final RuntimePathConfig runtimePathConfig;
private final TempFileManager tempFileManager;
@PostMapping(consumes = "multipart/form-data", value = "/markdown/pdf")
@Operation(
summary = "Convert a Markdown file to PDF",
@@ -82,7 +85,8 @@ public class ConvertMarkdownToPdf {
null,
htmlContent.getBytes(),
"converted.html",
disableSanitize);
disableSanitize,
tempFileManager);
pdfBytes = pdfDocumentFactory.createNewBytesBasedOnOldDocument(pdfBytes);
String outputFilename =
originalFilename.replaceFirst("[.][^.]+$", "")

View File

@@ -2,7 +2,6 @@ package stirling.software.SPDF.controller.api.misc;
import java.awt.image.BufferedImage;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.zip.ZipEntry;
@@ -23,7 +22,6 @@ import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
import io.github.pixee.security.BoundedLineReader;
import io.github.pixee.security.Filenames;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;
@@ -34,6 +32,9 @@ import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.api.misc.ProcessPdfWithOcrRequest;
import stirling.software.common.model.ApplicationProperties;
import stirling.software.common.service.CustomPDFDocumentFactory;
import stirling.software.common.util.ProcessExecutor;
import stirling.software.common.util.ProcessExecutor.ProcessExecutorResult;
import stirling.software.common.util.TempFileManager;
@RestController
@RequestMapping("/api/v1/misc")
@@ -43,8 +44,8 @@ import stirling.software.common.service.CustomPDFDocumentFactory;
public class OCRController {
private final ApplicationProperties applicationProperties;
private final CustomPDFDocumentFactory pdfDocumentFactory;
private final TempFileManager tempFileManager;
/** Gets the list of available Tesseract languages from the tessdata directory */
public List<String> getAvailableTesseractLanguages() {
@@ -73,93 +74,117 @@ public class OCRController {
MultipartFile inputFile = request.getFileInput();
List<String> languages = request.getLanguages();
String ocrType = request.getOcrType();
Path tempDir = Files.createTempDirectory("ocr_process");
Path tempInputFile = tempDir.resolve("input.pdf");
Path tempOutputDir = tempDir.resolve("output");
Path tempImagesDir = tempDir.resolve("images");
Path finalOutputFile = tempDir.resolve("final_output.pdf");
Files.createDirectories(tempOutputDir);
Files.createDirectories(tempImagesDir);
Process process = null;
// Create a temp directory using TempFileManager directly
Path tempDirPath = tempFileManager.createTempDirectory();
File tempDir = tempDirPath.toFile();
try {
File tempInputFile = new File(tempDir, "input.pdf");
File tempOutputDir = new File(tempDir, "output");
File tempImagesDir = new File(tempDir, "images");
File finalOutputFile = new File(tempDir, "final_output.pdf");
// Create directories
tempOutputDir.mkdirs();
tempImagesDir.mkdirs();
// Save input file
inputFile.transferTo(tempInputFile.toFile());
inputFile.transferTo(tempInputFile);
PDFMergerUtility merger = new PDFMergerUtility();
merger.setDestinationFileName(finalOutputFile.toString());
try (PDDocument document = pdfDocumentFactory.load(tempInputFile.toFile())) {
try (PDDocument document = pdfDocumentFactory.load(tempInputFile)) {
PDFRenderer pdfRenderer = new PDFRenderer(document);
int pageCount = document.getNumberOfPages();
for (int pageNum = 0; pageNum < pageCount; pageNum++) {
PDPage page = document.getPage(pageNum);
boolean hasText = false;
// Check for existing text
try (PDDocument tempDoc = new PDDocument()) {
tempDoc.addPage(page);
PDFTextStripper stripper = new PDFTextStripper();
hasText = !stripper.getText(tempDoc).trim().isEmpty();
}
boolean shouldOcr =
switch (ocrType) {
case "skip-text" -> !hasText;
case "force-ocr" -> true;
default -> true;
};
Path pageOutputPath =
tempOutputDir.resolve(String.format("page_%d.pdf", pageNum));
File pageOutputPath =
new File(tempOutputDir, String.format("page_%d.pdf", pageNum));
if (shouldOcr) {
// Convert page to image
BufferedImage image = pdfRenderer.renderImageWithDPI(pageNum, 300);
Path imagePath =
tempImagesDir.resolve(String.format("page_%d.png", pageNum));
ImageIO.write(image, "png", imagePath.toFile());
File imagePath =
new File(tempImagesDir, String.format("page_%d.png", pageNum));
ImageIO.write(image, "png", imagePath);
// Build OCR command
List<String> command = new ArrayList<>();
command.add("tesseract");
command.add(imagePath.toString());
command.add(
tempOutputDir
.resolve(String.format("page_%d", pageNum))
new File(tempOutputDir, String.format("page_%d", pageNum))
.toString());
command.add("-l");
command.add(String.join("+", languages));
// Always output PDF
command.add("pdf");
ProcessBuilder pb = new ProcessBuilder(command);
process = pb.start();
// Capture any error output
try (BufferedReader reader =
new BufferedReader(
new InputStreamReader(process.getErrorStream()))) {
String line;
while ((line = BoundedLineReader.readLine(reader, 5_000_000)) != null) {
log.debug("Tesseract: {}", line);
// Use ProcessExecutor to run tesseract command
try {
ProcessExecutorResult result =
ProcessExecutor.getInstance(ProcessExecutor.Processes.TESSERACT)
.runCommandWithOutputHandling(command);
log.debug(
"Tesseract OCR completed for page {} with exit code {}",
pageNum,
result.getRc());
// Add OCR'd PDF to merger
merger.addSource(pageOutputPath);
} catch (IOException | InterruptedException e) {
log.error(
"Error processing page {} with tesseract: {}",
pageNum,
e.getMessage());
// If OCR fails, fall back to the original page
try (PDDocument pageDoc = new PDDocument()) {
pageDoc.addPage(page);
pageDoc.save(pageOutputPath);
merger.addSource(pageOutputPath);
}
}
int exitCode = process.waitFor();
if (exitCode != 0) {
throw new RuntimeException(
"Tesseract failed with exit code: " + exitCode);
}
// Add OCR'd PDF to merger
merger.addSource(pageOutputPath.toFile());
} else {
// Save original page without OCR
try (PDDocument pageDoc = new PDDocument()) {
pageDoc.addPage(page);
pageDoc.save(pageOutputPath.toFile());
merger.addSource(pageOutputPath.toFile());
pageDoc.save(pageOutputPath);
merger.addSource(pageOutputPath);
}
}
}
}
// Merge all pages into final PDF
merger.mergeDocuments(null);
// Read the final PDF file
byte[] pdfContent = Files.readAllBytes(finalOutputFile);
byte[] pdfContent = java.nio.file.Files.readAllBytes(finalOutputFile.toPath());
String outputFilename =
Filenames.toSimpleFileName(inputFile.getOriginalFilename())
.replaceFirst("[.][^.]+$", "")
+ "_OCR.pdf";
return ResponseEntity.ok()
.header(
"Content-Disposition",
@@ -167,11 +192,8 @@ public class OCRController {
.contentType(MediaType.APPLICATION_PDF)
.body(pdfContent);
} finally {
if (process != null) {
process.destroy();
}
// Clean up temporary files
deleteDirectory(tempDir);
// Clean up the temp directory and all its contents
tempFileManager.deleteTempDirectory(tempDirPath);
}
}
@@ -192,21 +214,4 @@ public class OCRController {
zipOut.closeEntry();
}
}
private void deleteDirectory(Path directory) {
try {
Files.walk(directory)
.sorted(Comparator.reverseOrder())
.forEach(
path -> {
try {
Files.delete(path);
} catch (IOException e) {
log.error("Error deleting {}: {}", path, e.getMessage());
}
});
} catch (IOException e) {
log.error("Error walking directory {}: {}", directory, e.getMessage());
}
}
}

View File

@@ -1,8 +1,6 @@
package stirling.software.SPDF.controller.api.misc;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
@@ -23,6 +21,8 @@ import stirling.software.common.model.api.PDFFile;
import stirling.software.common.service.CustomPDFDocumentFactory;
import stirling.software.common.util.ProcessExecutor;
import stirling.software.common.util.ProcessExecutor.ProcessExecutorResult;
import stirling.software.common.util.TempFile;
import stirling.software.common.util.TempFileManager;
import stirling.software.common.util.WebResponseUtils;
@RestController
@@ -32,6 +32,7 @@ import stirling.software.common.util.WebResponseUtils;
public class RepairController {
private final CustomPDFDocumentFactory pdfDocumentFactory;
private final TempFileManager tempFileManager;
@PostMapping(consumes = "multipart/form-data", value = "/repair")
@Operation(
@@ -43,25 +44,25 @@ public class RepairController {
public ResponseEntity<byte[]> repairPdf(@ModelAttribute PDFFile file)
throws IOException, InterruptedException {
MultipartFile inputFile = file.getFileInput();
// Save the uploaded file to a temporary location
Path tempInputFile = Files.createTempFile("input_", ".pdf");
byte[] pdfBytes = null;
inputFile.transferTo(tempInputFile.toFile());
try {
// Use TempFile with try-with-resources for automatic cleanup
try (TempFile tempFile = new TempFile(tempFileManager, ".pdf")) {
// Save the uploaded file to the temporary location
inputFile.transferTo(tempFile.getFile());
List<String> command = new ArrayList<>();
command.add("qpdf");
command.add("--replace-input"); // Automatically fixes problems it can
command.add("--qdf"); // Linearizes and normalizes PDF structure
command.add("--object-streams=disable"); // Can help with some corruptions
command.add(tempInputFile.toString());
command.add(tempFile.getFile().getAbsolutePath());
ProcessExecutorResult returnCode =
ProcessExecutor.getInstance(ProcessExecutor.Processes.QPDF)
.runCommandWithOutputHandling(command);
// Read the optimized PDF file
pdfBytes = pdfDocumentFactory.loadToBytes(tempInputFile.toFile());
byte[] pdfBytes = pdfDocumentFactory.loadToBytes(tempFile.getFile());
// Return the optimized PDF as a response
String outputFilename =
@@ -69,9 +70,6 @@ public class RepairController {
.replaceFirst("[.][^.]+$", "")
+ "_repaired.pdf";
return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename);
} finally {
// Clean up the temporary files
Files.deleteIfExists(tempInputFile);
}
}
}

View File

@@ -6,7 +6,6 @@ import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.util.List;
import javax.imageio.ImageIO;
@@ -40,6 +39,8 @@ import lombok.RequiredArgsConstructor;
import stirling.software.SPDF.model.api.misc.AddStampRequest;
import stirling.software.common.service.CustomPDFDocumentFactory;
import stirling.software.common.util.TempFile;
import stirling.software.common.util.TempFileManager;
import stirling.software.common.util.WebResponseUtils;
@RestController
@@ -49,6 +50,7 @@ import stirling.software.common.util.WebResponseUtils;
public class StampController {
private final CustomPDFDocumentFactory pdfDocumentFactory;
private final TempFileManager tempFileManager;
@PostMapping(consumes = "multipart/form-data", value = "/add-stamp")
@Operation(
@@ -188,14 +190,14 @@ public class StampController {
if (!"".equals(resourceDir)) {
ClassPathResource classPathResource = new ClassPathResource(resourceDir);
String fileExtension = resourceDir.substring(resourceDir.lastIndexOf("."));
File tempFile = Files.createTempFile("NotoSansFont", fileExtension).toFile();
try (InputStream is = classPathResource.getInputStream();
FileOutputStream os = new FileOutputStream(tempFile)) {
IOUtils.copy(is, os);
font = PDType0Font.load(document, tempFile);
} finally {
if (tempFile != null) {
Files.deleteIfExists(tempFile.toPath());
// Use TempFile with try-with-resources for automatic cleanup
try (TempFile tempFileWrapper = new TempFile(tempFileManager, fileExtension)) {
File tempFile = tempFileWrapper.getFile();
try (InputStream is = classPathResource.getInputStream();
FileOutputStream os = new FileOutputStream(tempFile)) {
IOUtils.copy(is, os);
font = PDType0Font.load(document, tempFile);
}
}
}