improvements to tmp dir, OCR and others

This commit is contained in:
Anthony Stirling 2025-06-23 17:24:27 +01:00
parent 5aec97939e
commit 93ea611eda
5 changed files with 341 additions and 410 deletions

View File

@ -25,7 +25,7 @@ import stirling.software.common.util.TempFileRegistry;
@Configuration
public class TempFileConfiguration {
@Value("${stirling.tempfiles.directory:}")
@Value("${stirling.tempfiles.directory:${java.io.tmpdir}/stirling-pdf}")
private String customTempDirectory;
@Autowired
@ -48,42 +48,11 @@ public class TempFileConfiguration {
@PostConstruct
public void initTempFileConfig() {
try {
// If a custom temp directory is specified in the config, use it
if (customTempDirectory != null && !customTempDirectory.isEmpty()) {
Path tempDir = Path.of(customTempDirectory);
if (!Files.exists(tempDir)) {
Files.createDirectories(tempDir);
log.info("Created custom temporary directory: {}", tempDir);
}
// Set Java temp directory system property if in Docker/Kubernetes mode
if ("Docker".equals(machineType) || "Kubernetes".equals(machineType)) {
System.setProperty("java.io.tmpdir", customTempDirectory);
log.info(
"Set system temp directory to: {} for environment: {}",
customTempDirectory,
machineType);
}
} else {
// No custom directory specified, use java.io.tmpdir + application subfolder
String defaultTempDir;
if ("Docker".equals(machineType) || "Kubernetes".equals(machineType)) {
// Container environments should continue to use /tmp/stirling-pdf
defaultTempDir = "/tmp/stirling-pdf";
} else {
// Use system temp directory (java.io.tmpdir) with our application subfolder
// This automatically handles Windows (AppData\Local\Temp), macOS, and Linux systems
defaultTempDir = System.getProperty("java.io.tmpdir") + File.separator + "stirling-pdf";
}
customTempDirectory = defaultTempDir;
// Create the default temp directory
Path tempDir = Path.of(customTempDirectory);
if (!Files.exists(tempDir)) {
Files.createDirectories(tempDir);
log.info("Created default OS-specific temporary directory: {}", tempDir);
}
// Create the temp directory if it doesn't exist
Path tempDir = Path.of(customTempDirectory);
if (!Files.exists(tempDir)) {
Files.createDirectories(tempDir);
log.info("Created temporary directory: {}", tempDir);
}
log.info("Temporary file configuration initialized");
@ -93,4 +62,4 @@ public class TempFileConfiguration {
log.error("Failed to initialize temporary file configuration", e);
}
}
}
}

View File

@ -3,8 +3,12 @@ package stirling.software.common.service;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Consumer;
import java.util.function.Predicate;
import java.util.stream.Stream;
import org.springframework.beans.factory.annotation.Autowired;
@ -49,6 +53,36 @@ public class TempFileCleanupService {
@Value("${stirling.tempfiles.libreoffice-dir:/tmp/stirling-pdf/libreoffice}")
private String libreOfficeTempDir;
// Maximum recursion depth for directory traversal
private static final int MAX_RECURSION_DEPTH = 5;
// File patterns that identify our temp files
private static final Predicate<String> IS_OUR_TEMP_FILE = fileName ->
fileName.startsWith("stirling-pdf-") ||
fileName.startsWith("output_") ||
fileName.startsWith("compressedPDF") ||
fileName.startsWith("pdf-save-") ||
fileName.startsWith("pdf-stream-") ||
fileName.startsWith("PDFBox") ||
fileName.startsWith("input_") ||
fileName.startsWith("overlay-");
// File patterns that identify common system temp files
private static final Predicate<String> IS_SYSTEM_TEMP_FILE = fileName ->
fileName.matches("lu\\d+[a-z0-9]*\\.tmp") ||
fileName.matches("ocr_process\\d+") ||
(fileName.startsWith("tmp") && !fileName.contains("jetty")) ||
fileName.startsWith("OSL_PIPE_") ||
(fileName.endsWith(".tmp") && !fileName.contains("jetty"));
// File patterns that should be excluded from cleanup
private static final Predicate<String> SHOULD_SKIP = fileName ->
fileName.contains("jetty") ||
fileName.startsWith("jetty-") ||
fileName.equals("proc") ||
fileName.equals("sys") ||
fileName.equals("dev");
@Autowired
public TempFileCleanupService(TempFileRegistry registry, TempFileManager tempFileManager) {
this.registry = registry;
@ -114,41 +148,9 @@ public class TempFileCleanupService {
}
}
int unregisteredDeletedCount = 0;
try {
// Get all directories we need to clean
Path systemTempPath;
if (systemTempDir != null && !systemTempDir.isEmpty()) {
systemTempPath = Path.of(systemTempDir);
} else {
systemTempPath = Path.of(System.getProperty("java.io.tmpdir"));
}
Path[] dirsToScan = {
systemTempPath, Path.of(customTempDirectory), Path.of(libreOfficeTempDir)
};
boolean containerMode =
"Docker".equals(machineType) || "Kubernetes".equals(machineType);
// Process each directory
for (Path tempDir : dirsToScan) {
if (!Files.exists(tempDir)) {
continue;
}
int dirDeletedCount = cleanupDirectory(tempDir, containerMode, 0, maxAgeMillis);
unregisteredDeletedCount += dirDeletedCount;
if (dirDeletedCount > 0) {
log.info(
"Cleaned up {} unregistered files/directories in {}",
dirDeletedCount,
tempDir);
}
}
} catch (IOException e) {
log.error("Error during scheduled cleanup of unregistered files", e);
}
// Clean up unregistered temp files based on our cleanup strategy
boolean containerMode = isContainerMode();
int unregisteredDeletedCount = cleanupUnregisteredFiles(containerMode, true, maxAgeMillis);
log.info(
"Scheduled cleanup complete. Deleted {} registered files, {} unregistered files, {} directories",
@ -157,272 +159,217 @@ public class TempFileCleanupService {
directoriesDeletedCount);
}
/** Overload of cleanupDirectory that uses the specified max age for files */
private int cleanupDirectory(
Path directory, boolean containerMode, int depth, long maxAgeMillis)
throws IOException {
if (depth > 5) {
log.warn("Maximum directory recursion depth reached for: {}", directory);
return 0;
}
int deletedCount = 0;
try (Stream<Path> paths = Files.list(directory)) {
for (Path path : paths.toList()) {
String fileName = path.getFileName().toString();
// Skip registered files - these are handled by TempFileManager
if (registry.contains(path.toFile())) {
continue;
}
// Skip Jetty-related directories and files
if (fileName.contains("jetty") || fileName.startsWith("jetty-")) {
continue;
}
// Check if this is a directory we should recursively scan
if (Files.isDirectory(path)) {
// Don't recurse into certain system directories
if (!fileName.equals("proc")
&& !fileName.equals("sys")
&& !fileName.equals("dev")) {
deletedCount +=
cleanupDirectory(path, containerMode, depth + 1, maxAgeMillis);
}
continue;
}
// Determine if this file matches our temp file patterns
boolean isOurTempFile =
fileName.startsWith("stirling-pdf-")
|| fileName.startsWith("output_")
|| fileName.startsWith("compressedPDF")
|| fileName.startsWith("pdf-save-")
|| fileName.startsWith("pdf-stream-")
|| fileName.startsWith("PDFBox")
|| fileName.startsWith("input_")
|| fileName.startsWith("overlay-");
// Avoid touching Jetty files
boolean isSystemTempFile =
fileName.matches("lu\\d+[a-z0-9]*\\.tmp")
|| fileName.matches("ocr_process\\d+")
|| (fileName.startsWith("tmp") && !fileName.contains("jetty"))
|| fileName.startsWith("OSL_PIPE_")
|| (fileName.endsWith(".tmp") && !fileName.contains("jetty"));
boolean shouldDelete = isOurTempFile || (containerMode && isSystemTempFile);
// Special case for zero-byte files - these are often corrupted temp files
try {
if (Files.size(path) == 0) {
// For empty files, use a shorter timeout (5 minutes)
long lastModified = Files.getLastModifiedTime(path).toMillis();
long currentTime = System.currentTimeMillis();
// Delete empty files older than 5 minutes
if ((currentTime - lastModified) > 5 * 60 * 1000) {
shouldDelete = true;
}
}
} catch (IOException e) {
log.debug("Could not check file size, skipping: {}", path);
}
// Check file age against maxAgeMillis
if (shouldDelete) {
try {
long lastModified = Files.getLastModifiedTime(path).toMillis();
long currentTime = System.currentTimeMillis();
shouldDelete = (currentTime - lastModified) > maxAgeMillis;
} catch (IOException e) {
log.debug("Could not check file age, skipping: {}", path);
shouldDelete = false;
}
}
if (shouldDelete) {
try {
Files.deleteIfExists(path);
deletedCount++;
log.debug(
"Deleted unregistered temp file during scheduled cleanup: {}",
path);
} catch (IOException e) {
// Handle locked files more gracefully - just log at debug level
if (e.getMessage() != null
&& e.getMessage().contains("being used by another process")) {
log.debug("File locked, skipping delete: {}", path);
} else {
log.warn(
"Failed to delete temp file during scheduled cleanup: {}",
path,
e);
}
}
}
}
}
return deletedCount;
}
/**
* Perform startup cleanup of stale temporary files from previous runs. This is especially
* important in Docker environments where temp files persist between container restarts.
*/
private void runStartupCleanup() {
log.info("Running startup temporary file cleanup");
boolean containerMode = isContainerMode();
log.info(
"Running in {} mode, using {} cleanup strategy",
machineType,
containerMode ? "aggressive" : "conservative");
// For startup cleanup, we use a longer timeout for non-container environments
long maxAgeMillis = containerMode ? 0 : 24 * 60 * 60 * 1000; // 0 or 24 hours
int totalDeletedCount = cleanupUnregisteredFiles(containerMode, false, maxAgeMillis);
log.info(
"Startup cleanup complete. Deleted {} temporary files/directories",
totalDeletedCount);
}
/**
* Clean up unregistered temporary files across all configured temp directories.
*
* @param containerMode Whether we're in container mode (more aggressive cleanup)
* @param isScheduled Whether this is a scheduled cleanup or startup cleanup
* @param maxAgeMillis Maximum age of files to clean in milliseconds
* @return Number of files deleted
*/
private int cleanupUnregisteredFiles(boolean containerMode, boolean isScheduled, long maxAgeMillis) {
AtomicInteger totalDeletedCount = new AtomicInteger(0);
try {
// Get all directories we need to clean
Path systemTempPath;
if (systemTempDir != null && !systemTempDir.isEmpty()) {
systemTempPath = Path.of(systemTempDir);
} else {
systemTempPath = Path.of(System.getProperty("java.io.tmpdir"));
}
Path systemTempPath = getSystemTempPath();
Path[] dirsToScan = {
systemTempPath, Path.of(customTempDirectory), Path.of(libreOfficeTempDir)
systemTempPath,
Path.of(customTempDirectory),
Path.of(libreOfficeTempDir)
};
int totalDeletedCount = 0;
boolean containerMode =
"Docker".equals(machineType) || "Kubernetes".equals(machineType);
log.info(
"Running in {} mode, using {} cleanup strategy",
machineType,
containerMode ? "aggressive" : "conservative");
// Process each directory
for (Path tempDir : dirsToScan) {
if (!Files.exists(tempDir)) {
log.warn("Temporary directory does not exist: {}", tempDir);
continue;
Arrays.stream(dirsToScan)
.filter(Files::exists)
.forEach(tempDir -> {
try {
String phase = isScheduled ? "scheduled" : "startup";
log.info("Scanning directory for {} cleanup: {}", phase, tempDir);
AtomicInteger dirDeletedCount = new AtomicInteger(0);
cleanupDirectoryStreaming(
tempDir,
containerMode,
0,
maxAgeMillis,
isScheduled,
path -> {
dirDeletedCount.incrementAndGet();
if (log.isDebugEnabled()) {
log.debug("Deleted temp file during {} cleanup: {}", phase, path);
}
}
);
int count = dirDeletedCount.get();
totalDeletedCount.addAndGet(count);
if (count > 0) {
log.info("Cleaned up {} files/directories in {}", count, tempDir);
}
} catch (IOException e) {
log.error("Error during cleanup of directory: {}", tempDir, e);
}
});
} catch (Exception e) {
log.error("Error during cleanup of unregistered files", e);
}
return totalDeletedCount.get();
}
/**
* Get the system temp directory path based on configuration or system property.
*/
private Path getSystemTempPath() {
if (systemTempDir != null && !systemTempDir.isEmpty()) {
return Path.of(systemTempDir);
} else {
return Path.of(System.getProperty("java.io.tmpdir"));
}
}
/**
* Determine if we're running in a container environment.
*/
private boolean isContainerMode() {
return "Docker".equals(machineType) || "Kubernetes".equals(machineType);
}
/**
* Recursively clean up a directory using a streaming approach to reduce memory usage.
*
* @param directory The directory to clean
* @param containerMode Whether we're in container mode (more aggressive cleanup)
* @param depth Current recursion depth
* @param maxAgeMillis Maximum age of files to delete
* @param isScheduled Whether this is a scheduled cleanup (vs startup)
* @param onDeleteCallback Callback function when a file is deleted
* @throws IOException If an I/O error occurs
*/
private void cleanupDirectoryStreaming(
Path directory,
boolean containerMode,
int depth,
long maxAgeMillis,
boolean isScheduled,
Consumer<Path> onDeleteCallback) throws IOException {
// Check recursion depth limit
if (depth > MAX_RECURSION_DEPTH) {
log.warn("Maximum directory recursion depth reached for: {}", directory);
return;
}
// Use try-with-resources to ensure the stream is closed
try (Stream<Path> pathStream = Files.list(directory)) {
// Process files in a streaming fashion instead of materializing the whole list
pathStream.forEach(path -> {
try {
String fileName = path.getFileName().toString();
// Skip if file should be excluded
if (SHOULD_SKIP.test(fileName)) {
return;
}
// Handle directories recursively
if (Files.isDirectory(path)) {
try {
cleanupDirectoryStreaming(
path, containerMode, depth + 1, maxAgeMillis, isScheduled, onDeleteCallback);
} catch (IOException e) {
log.warn("Error processing subdirectory: {}", path, e);
}
return;
}
// Skip registered files - these are handled by TempFileManager
if (isScheduled && registry.contains(path.toFile())) {
return;
}
// Check if this file should be deleted
if (shouldDeleteFile(path, fileName, containerMode, maxAgeMillis)) {
try {
Files.deleteIfExists(path);
onDeleteCallback.accept(path);
} catch (IOException e) {
// Handle locked files more gracefully
if (e.getMessage() != null && e.getMessage().contains("being used by another process")) {
log.debug("File locked, skipping delete: {}", path);
} else {
log.warn("Failed to delete temp file: {}", path, e);
}
}
}
} catch (Exception e) {
log.warn("Error processing path: {}", path, e);
}
log.info("Scanning directory for cleanup: {}", tempDir);
int dirDeletedCount = cleanupDirectory(tempDir, containerMode, 0);
totalDeletedCount += dirDeletedCount;
log.info("Cleaned up {} files/directories in {}", dirDeletedCount, tempDir);
}
log.info(
"Startup cleanup complete. Deleted {} temporary files/directories",
totalDeletedCount);
} catch (IOException e) {
log.error("Error during startup cleanup", e);
});
}
}
/**
* Recursively clean up a directory for temporary files.
*
* @param directory The directory to clean
* @param containerMode Whether we're in container mode (more aggressive cleanup)
* @param depth Current recursion depth (to prevent excessive recursion)
* @return Number of files deleted
* Determine if a file should be deleted based on its name, age, and other criteria.
*/
private int cleanupDirectory(Path directory, boolean containerMode, int depth)
throws IOException {
if (depth > 5) {
log.warn("Maximum directory recursion depth reached for: {}", directory);
return 0;
private boolean shouldDeleteFile(Path path, String fileName, boolean containerMode, long maxAgeMillis) {
// First check if it matches our known temp file patterns
boolean isOurTempFile = IS_OUR_TEMP_FILE.test(fileName);
boolean isSystemTempFile = IS_SYSTEM_TEMP_FILE.test(fileName);
boolean shouldDelete = isOurTempFile || (containerMode && isSystemTempFile);
// Special case for zero-byte files - these are often corrupted temp files
try {
if (Files.size(path) == 0) {
// For empty files, use a shorter timeout (5 minutes)
long lastModified = Files.getLastModifiedTime(path).toMillis();
long currentTime = System.currentTimeMillis();
// Delete empty files older than 5 minutes
if ((currentTime - lastModified) > 5 * 60 * 1000) {
shouldDelete = true;
}
}
} catch (IOException e) {
log.debug("Could not check file size, skipping: {}", path);
}
int deletedCount = 0;
try (Stream<Path> paths = Files.list(directory)) {
for (Path path : paths.toList()) {
String fileName = path.getFileName().toString();
// Skip Jetty-related directories and files
if (fileName.contains("jetty") || fileName.startsWith("jetty-")) {
continue;
}
// Check if this is a directory we should recursively scan
if (Files.isDirectory(path)) {
// Don't recurse into certain system directories
if (!fileName.equals("proc")
&& !fileName.equals("sys")
&& !fileName.equals("dev")) {
deletedCount += cleanupDirectory(path, containerMode, depth + 1);
}
continue;
}
// Determine if this file matches our temp file patterns
boolean isOurTempFile =
fileName.startsWith("stirling-pdf-")
|| fileName.startsWith("output_")
|| fileName.startsWith("compressedPDF")
|| fileName.startsWith("pdf-save-")
|| fileName.startsWith("pdf-stream-")
|| fileName.startsWith("PDFBox")
|| fileName.startsWith("input_")
|| fileName.startsWith("overlay-");
// Avoid touching Jetty files
boolean isSystemTempFile =
fileName.matches("lu\\d+[a-z0-9]*\\.tmp")
|| fileName.matches("ocr_process\\d+")
|| (fileName.startsWith("tmp") && !fileName.contains("jetty"))
|| fileName.startsWith("OSL_PIPE_")
|| (fileName.endsWith(".tmp") && !fileName.contains("jetty"));
boolean shouldDelete = isOurTempFile || (containerMode && isSystemTempFile);
// Special case for zero-byte files - these are often corrupted temp files
boolean isEmptyFile = false;
try {
if (!Files.isDirectory(path) && Files.size(path) == 0) {
isEmptyFile = true;
// For empty files, use a shorter timeout (5 minutes)
long lastModified = Files.getLastModifiedTime(path).toMillis();
long currentTime = System.currentTimeMillis();
// Delete empty files older than 5 minutes
if ((currentTime - lastModified) > 5 * 60 * 1000) {
shouldDelete = true;
}
}
} catch (IOException e) {
log.debug("Could not check file size, skipping: {}", path);
}
// For non-container mode, check file age before deleting
if (!containerMode && (isOurTempFile || isSystemTempFile) && !isEmptyFile) {
try {
long lastModified = Files.getLastModifiedTime(path).toMillis();
long currentTime = System.currentTimeMillis();
// Only delete files older than 24 hours in non-container mode
shouldDelete = (currentTime - lastModified) > 24 * 60 * 60 * 1000;
} catch (IOException e) {
log.debug("Could not check file age, skipping: {}", path);
shouldDelete = false;
}
}
if (shouldDelete) {
try {
if (Files.isDirectory(path)) {
GeneralUtils.deleteDirectory(path);
} else {
Files.deleteIfExists(path);
}
deletedCount++;
log.debug("Deleted temp file during startup cleanup: {}", path);
} catch (IOException e) {
log.warn("Failed to delete temp file during startup cleanup: {}", path, e);
}
}
// Check file age against maxAgeMillis
if (shouldDelete && maxAgeMillis > 0) {
try {
long lastModified = Files.getLastModifiedTime(path).toMillis();
long currentTime = System.currentTimeMillis();
shouldDelete = (currentTime - lastModified) > maxAgeMillis;
} catch (IOException e) {
log.debug("Could not check file age, skipping: {}", path);
shouldDelete = false;
}
}
return deletedCount;
return shouldDelete;
}
/** Clean up LibreOffice temporary files. This method is called after LibreOffice operations. */
@ -431,18 +378,17 @@ public class TempFileCleanupService {
try {
Set<Path> directories = registry.getTempDirectories();
for (Path dir : directories) {
if (dir.getFileName().toString().contains("libreoffice")) {
if (dir.getFileName().toString().contains("libreoffice") && Files.exists(dir)) {
// For directories containing "libreoffice", delete all contents
// but keep the directory itself for future use
try (Stream<Path> files = Files.list(dir)) {
for (Path file : files.toList()) {
if (Files.isDirectory(file)) {
GeneralUtils.deleteDirectory(file);
} else {
Files.deleteIfExists(file);
}
}
}
cleanupDirectoryStreaming(
dir,
isContainerMode(),
0,
0, // age doesn't matter for LibreOffice cleanup
false,
path -> log.debug("Cleaned up LibreOffice temp file: {}", path)
);
log.debug("Cleaned up LibreOffice temp directory contents: {}", dir);
}
}
@ -450,4 +396,4 @@ public class TempFileCleanupService {
log.warn("Failed to clean up LibreOffice temp files", e);
}
}
}
}

View File

@ -14,6 +14,8 @@ import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Consumer;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@ -118,12 +120,14 @@ public class TempFileCleanupServiceTest {
// Create a file older than threshold
Path oldFile = Files.createFile(systemTempDir.resolve("output_old.pdf"));
Files.setLastModifiedTime(oldFile, FileTime.from( Files.getLastModifiedTime(oldFile).toMillis() - 5000000, TimeUnit.MILLISECONDS));
Files.setLastModifiedTime(oldFile, FileTime.from(
Files.getLastModifiedTime(oldFile).toMillis() - 5000000,
TimeUnit.MILLISECONDS));
// Act
invokeCleanupDirectory(systemTempDir, true, 0, 3600000);
invokeCleanupDirectory(customTempDir, true, 0, 3600000);
invokeCleanupDirectory(libreOfficeTempDir, true, 0, 3600000);
invokeCleanupDirectoryStreaming(systemTempDir, true, 0, 3600000);
invokeCleanupDirectoryStreaming(customTempDir, true, 0, 3600000);
invokeCleanupDirectoryStreaming(libreOfficeTempDir, true, 0, 3600000);
// Assert - Our temp files and system temp files should be deleted (if old enough)
assertFalse(Files.exists(oldFile), "Old temp file should be deleted");
@ -141,14 +145,15 @@ public class TempFileCleanupServiceTest {
// Arrange - Create an empty file
Path emptyFile = Files.createFile(systemTempDir.resolve("empty.tmp"));
// Make it "old enough" to be deleted (>5 minutes)
Files.setLastModifiedTime(emptyFile, FileTime.from( Files.getLastModifiedTime(emptyFile).toMillis() - 6 * 60 * 1000, TimeUnit.MILLISECONDS));
Files.setLastModifiedTime(emptyFile, FileTime.from(
Files.getLastModifiedTime(emptyFile).toMillis() - 6 * 60 * 1000,
TimeUnit.MILLISECONDS));
// Configure mock registry to say this file isn't registered
when(registry.contains(any(File.class))).thenReturn(false);
// Act
invokeCleanupDirectory(systemTempDir, true, 0, 3600000);
invokeCleanupDirectoryStreaming(systemTempDir, true, 0, 3600000);
// Assert
assertFalse(Files.exists(emptyFile), "Empty file older than 5 minutes should be deleted");
@ -166,13 +171,15 @@ public class TempFileCleanupServiceTest {
Path tempFile3 = Files.createFile(dir3.resolve("output_3.pdf"));
// Make the deepest file old enough to be deleted
Files.setLastModifiedTime(tempFile3, FileTime.from( Files.getLastModifiedTime(tempFile3).toMillis() - 5000000, TimeUnit.MILLISECONDS));
Files.setLastModifiedTime(tempFile3, FileTime.from(
Files.getLastModifiedTime(tempFile3).toMillis() - 5000000,
TimeUnit.MILLISECONDS));
// Configure mock registry to say these files aren't registered
when(registry.contains(any(File.class))).thenReturn(false);
// Act
invokeCleanupDirectory(systemTempDir, true, 0, 3600000);
invokeCleanupDirectoryStreaming(systemTempDir, true, 0, 3600000);
// Assert
assertTrue(Files.exists(tempFile1), "Recent temp file should be preserved");
@ -181,17 +188,25 @@ public class TempFileCleanupServiceTest {
}
/**
* Helper method to invoke the private cleanupDirectory method using reflection
* Helper method to invoke the private cleanupDirectoryStreaming method using reflection
*/
private int invokeCleanupDirectory(Path directory, boolean containerMode, int depth, long maxAgeMillis)
private void invokeCleanupDirectoryStreaming(Path directory, boolean containerMode, int depth, long maxAgeMillis)
throws IOException {
try {
// Create a consumer that tracks deleted files
AtomicInteger deleteCount = new AtomicInteger(0);
Consumer<Path> deleteCallback = path -> deleteCount.incrementAndGet();
// Get the new method with updated signature
var method = TempFileCleanupService.class.getDeclaredMethod(
"cleanupDirectory", Path.class, boolean.class, int.class, long.class);
"cleanupDirectoryStreaming",
Path.class, boolean.class, int.class, long.class, boolean.class, Consumer.class);
method.setAccessible(true);
return (int) method.invoke(cleanupService, directory, containerMode, depth, maxAgeMillis);
// Invoke the method with appropriate parameters
method.invoke(cleanupService, directory, containerMode, depth, maxAgeMillis, false, deleteCallback);
} catch (Exception e) {
throw new RuntimeException("Error invoking cleanupDirectory", e);
throw new RuntimeException("Error invoking cleanupDirectoryStreaming", e);
}
}
}

View File

@ -2,7 +2,6 @@ package stirling.software.SPDF.controller.api.misc;
import java.awt.image.BufferedImage;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.zip.ZipEntry;
@ -23,7 +22,6 @@ import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
import io.github.pixee.security.BoundedLineReader;
import io.github.pixee.security.Filenames;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;
@ -34,6 +32,11 @@ import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.api.misc.ProcessPdfWithOcrRequest;
import stirling.software.common.model.ApplicationProperties;
import stirling.software.common.service.CustomPDFDocumentFactory;
import stirling.software.common.util.ProcessExecutor;
import stirling.software.common.util.ProcessExecutor.ProcessExecutorResult;
import stirling.software.common.util.TempFileManager;
import stirling.software.common.util.TempFileUtil;
import stirling.software.common.util.TempFileUtil.TempFile;
@RestController
@RequestMapping("/api/v1/misc")
@ -43,8 +46,8 @@ import stirling.software.common.service.CustomPDFDocumentFactory;
public class OCRController {
private final ApplicationProperties applicationProperties;
private final CustomPDFDocumentFactory pdfDocumentFactory;
private final TempFileManager tempFileManager;
/** Gets the list of available Tesseract languages from the tessdata directory */
public List<String> getAvailableTesseractLanguages() {
@ -73,93 +76,108 @@ public class OCRController {
MultipartFile inputFile = request.getFileInput();
List<String> languages = request.getLanguages();
String ocrType = request.getOcrType();
Path tempDir = Files.createTempDirectory("ocr_process");
Path tempInputFile = tempDir.resolve("input.pdf");
Path tempOutputDir = tempDir.resolve("output");
Path tempImagesDir = tempDir.resolve("images");
Path finalOutputFile = tempDir.resolve("final_output.pdf");
Files.createDirectories(tempOutputDir);
Files.createDirectories(tempImagesDir);
Process process = null;
// Create a temp directory using TempFileManager directly
Path tempDirPath = tempFileManager.createTempDirectory();
File tempDir = tempDirPath.toFile();
try {
File tempInputFile = new File(tempDir, "input.pdf");
File tempOutputDir = new File(tempDir, "output");
File tempImagesDir = new File(tempDir, "images");
File finalOutputFile = new File(tempDir, "final_output.pdf");
// Create directories
tempOutputDir.mkdirs();
tempImagesDir.mkdirs();
// Save input file
inputFile.transferTo(tempInputFile.toFile());
inputFile.transferTo(tempInputFile);
PDFMergerUtility merger = new PDFMergerUtility();
merger.setDestinationFileName(finalOutputFile.toString());
try (PDDocument document = pdfDocumentFactory.load(tempInputFile.toFile())) {
try (PDDocument document = pdfDocumentFactory.load(tempInputFile)) {
PDFRenderer pdfRenderer = new PDFRenderer(document);
int pageCount = document.getNumberOfPages();
for (int pageNum = 0; pageNum < pageCount; pageNum++) {
PDPage page = document.getPage(pageNum);
boolean hasText = false;
// Check for existing text
try (PDDocument tempDoc = new PDDocument()) {
tempDoc.addPage(page);
PDFTextStripper stripper = new PDFTextStripper();
hasText = !stripper.getText(tempDoc).trim().isEmpty();
}
boolean shouldOcr =
switch (ocrType) {
case "skip-text" -> !hasText;
case "force-ocr" -> true;
default -> true;
};
Path pageOutputPath =
tempOutputDir.resolve(String.format("page_%d.pdf", pageNum));
boolean shouldOcr = switch (ocrType) {
case "skip-text" -> !hasText;
case "force-ocr" -> true;
default -> true;
};
File pageOutputPath = new File(tempOutputDir, String.format("page_%d.pdf", pageNum));
if (shouldOcr) {
// Convert page to image
BufferedImage image = pdfRenderer.renderImageWithDPI(pageNum, 300);
Path imagePath =
tempImagesDir.resolve(String.format("page_%d.png", pageNum));
ImageIO.write(image, "png", imagePath.toFile());
File imagePath = new File(tempImagesDir, String.format("page_%d.png", pageNum));
ImageIO.write(image, "png", imagePath);
// Build OCR command
List<String> command = new ArrayList<>();
command.add("tesseract");
command.add(imagePath.toString());
command.add(
tempOutputDir
.resolve(String.format("page_%d", pageNum))
new File(tempOutputDir, String.format("page_%d", pageNum))
.toString());
command.add("-l");
command.add(String.join("+", languages));
// Always output PDF
command.add("pdf");
ProcessBuilder pb = new ProcessBuilder(command);
process = pb.start();
// Capture any error output
try (BufferedReader reader =
new BufferedReader(
new InputStreamReader(process.getErrorStream()))) {
String line;
while ((line = BoundedLineReader.readLine(reader, 5_000_000)) != null) {
log.debug("Tesseract: {}", line);
// Use ProcessExecutor to run tesseract command
try {
ProcessExecutorResult result = ProcessExecutor.getInstance(ProcessExecutor.Processes.TESSERACT)
.runCommandWithOutputHandling(command);
log.debug("Tesseract OCR completed for page {} with exit code {}",
pageNum, result.getRc());
// Add OCR'd PDF to merger
merger.addSource(pageOutputPath);
} catch (IOException | InterruptedException e) {
log.error("Error processing page {} with tesseract: {}", pageNum, e.getMessage());
// If OCR fails, fall back to the original page
try (PDDocument pageDoc = new PDDocument()) {
pageDoc.addPage(page);
pageDoc.save(pageOutputPath);
merger.addSource(pageOutputPath);
}
}
int exitCode = process.waitFor();
if (exitCode != 0) {
throw new RuntimeException(
"Tesseract failed with exit code: " + exitCode);
}
// Add OCR'd PDF to merger
merger.addSource(pageOutputPath.toFile());
} else {
// Save original page without OCR
try (PDDocument pageDoc = new PDDocument()) {
pageDoc.addPage(page);
pageDoc.save(pageOutputPath.toFile());
merger.addSource(pageOutputPath.toFile());
pageDoc.save(pageOutputPath);
merger.addSource(pageOutputPath);
}
}
}
}
// Merge all pages into final PDF
merger.mergeDocuments(null);
// Read the final PDF file
byte[] pdfContent = Files.readAllBytes(finalOutputFile);
byte[] pdfContent = java.nio.file.Files.readAllBytes(finalOutputFile.toPath());
String outputFilename =
Filenames.toSimpleFileName(inputFile.getOriginalFilename())
.replaceFirst("[.][^.]+$", "")
+ "_OCR.pdf";
return ResponseEntity.ok()
.header(
"Content-Disposition",
@ -167,14 +185,11 @@ public class OCRController {
.contentType(MediaType.APPLICATION_PDF)
.body(pdfContent);
} finally {
if (process != null) {
process.destroy();
}
// Clean up temporary files
deleteDirectory(tempDir);
// Clean up the temp directory and all its contents
tempFileManager.deleteTempDirectory(tempDirPath);
}
}
private void addFileToZip(File file, String filename, ZipOutputStream zipOut)
throws IOException {
if (!file.exists()) {
@ -192,21 +207,4 @@ public class OCRController {
zipOut.closeEntry();
}
}
private void deleteDirectory(Path directory) {
try {
Files.walk(directory)
.sorted(Comparator.reverseOrder())
.forEach(
path -> {
try {
Files.delete(path);
} catch (IOException e) {
log.error("Error deleting {}: {}", path, e.getMessage());
}
});
} catch (IOException e) {
log.error("Error walking directory {}: {}", directory, e.getMessage());
}
}
}
}

View File

@ -44,4 +44,7 @@ springdoc.swagger-ui.path=/index.html
posthog.api.key=phc_fiR65u5j6qmXTYL56MNrLZSWqLaDW74OrZH0Insd2xq
posthog.host=https://eu.i.posthog.com
spring.main.allow-bean-definition-overriding=true
spring.main.allow-bean-definition-overriding=true
# Set up a consistent temporary directory location
java.io.tmpdir=${stirling.tempfiles.directory:${java.io.tmpdir}/stirling-pdf}