improvements to tmp dir, OCR and others

This commit is contained in:
Anthony Stirling 2025-06-23 17:24:27 +01:00
parent 5aec97939e
commit 93ea611eda
5 changed files with 341 additions and 410 deletions

View File

@ -25,7 +25,7 @@ import stirling.software.common.util.TempFileRegistry;
@Configuration @Configuration
public class TempFileConfiguration { public class TempFileConfiguration {
@Value("${stirling.tempfiles.directory:}") @Value("${stirling.tempfiles.directory:${java.io.tmpdir}/stirling-pdf}")
private String customTempDirectory; private String customTempDirectory;
@Autowired @Autowired
@ -48,42 +48,11 @@ public class TempFileConfiguration {
@PostConstruct @PostConstruct
public void initTempFileConfig() { public void initTempFileConfig() {
try { try {
// If a custom temp directory is specified in the config, use it // Create the temp directory if it doesn't exist
if (customTempDirectory != null && !customTempDirectory.isEmpty()) { Path tempDir = Path.of(customTempDirectory);
Path tempDir = Path.of(customTempDirectory); if (!Files.exists(tempDir)) {
if (!Files.exists(tempDir)) { Files.createDirectories(tempDir);
Files.createDirectories(tempDir); log.info("Created temporary directory: {}", tempDir);
log.info("Created custom temporary directory: {}", tempDir);
}
// Set Java temp directory system property if in Docker/Kubernetes mode
if ("Docker".equals(machineType) || "Kubernetes".equals(machineType)) {
System.setProperty("java.io.tmpdir", customTempDirectory);
log.info(
"Set system temp directory to: {} for environment: {}",
customTempDirectory,
machineType);
}
} else {
// No custom directory specified, use java.io.tmpdir + application subfolder
String defaultTempDir;
if ("Docker".equals(machineType) || "Kubernetes".equals(machineType)) {
// Container environments should continue to use /tmp/stirling-pdf
defaultTempDir = "/tmp/stirling-pdf";
} else {
// Use system temp directory (java.io.tmpdir) with our application subfolder
// This automatically handles Windows (AppData\Local\Temp), macOS, and Linux systems
defaultTempDir = System.getProperty("java.io.tmpdir") + File.separator + "stirling-pdf";
}
customTempDirectory = defaultTempDir;
// Create the default temp directory
Path tempDir = Path.of(customTempDirectory);
if (!Files.exists(tempDir)) {
Files.createDirectories(tempDir);
log.info("Created default OS-specific temporary directory: {}", tempDir);
}
} }
log.info("Temporary file configuration initialized"); log.info("Temporary file configuration initialized");
@ -93,4 +62,4 @@ public class TempFileConfiguration {
log.error("Failed to initialize temporary file configuration", e); log.error("Failed to initialize temporary file configuration", e);
} }
} }
} }

View File

@ -3,8 +3,12 @@ package stirling.software.common.service;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.Arrays;
import java.util.Set; import java.util.Set;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Consumer;
import java.util.function.Predicate;
import java.util.stream.Stream; import java.util.stream.Stream;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Autowired;
@ -49,6 +53,36 @@ public class TempFileCleanupService {
@Value("${stirling.tempfiles.libreoffice-dir:/tmp/stirling-pdf/libreoffice}") @Value("${stirling.tempfiles.libreoffice-dir:/tmp/stirling-pdf/libreoffice}")
private String libreOfficeTempDir; private String libreOfficeTempDir;
// Maximum recursion depth for directory traversal: cleanupDirectoryStreaming logs a warning
// and returns without scanning a directory once its depth argument exceeds this value.
private static final int MAX_RECURSION_DEPTH = 5;
// Matches file names that begin with one of the prefixes this application uses for its own
// temporary files. The comparison is case-sensitive and anchored at the start of the name.
private static final Predicate<String> IS_OUR_TEMP_FILE =
        fileName ->
                Stream.of(
                                "stirling-pdf-",
                                "output_",
                                "compressedPDF",
                                "pdf-save-",
                                "pdf-stream-",
                                "PDFBox",
                                "input_",
                                "overlay-")
                        .anyMatch(fileName::startsWith);
// Precompiled name patterns. String.matches(...) would recompile the regex on every call, and
// this predicate is evaluated once per file visited during a cleanup scan.
private static final java.util.regex.Pattern LU_TMP_FILE_PATTERN =
        java.util.regex.Pattern.compile("lu\\d+[a-z0-9]*\\.tmp");
private static final java.util.regex.Pattern OCR_PROCESS_PATTERN =
        java.util.regex.Pattern.compile("ocr_process\\d+");

// File patterns that identify common system temp files.
// A name qualifies when it fully matches one of the patterns above, starts with "OSL_PIPE_",
// or looks like a generic temp file (starts with "tmp" or ends with ".tmp") and does not
// contain "jetty".
private static final Predicate<String> IS_SYSTEM_TEMP_FILE =
        fileName -> {
            if (LU_TMP_FILE_PATTERN.matcher(fileName).matches()
                    || OCR_PROCESS_PATTERN.matcher(fileName).matches()
                    || fileName.startsWith("OSL_PIPE_")) {
                return true;
            }
            // The jetty exclusion only applies to the generic "tmp*" / "*.tmp" rules.
            boolean genericTempName = fileName.startsWith("tmp") || fileName.endsWith(".tmp");
            return genericTempName && !fileName.contains("jetty");
        };
// Entry names that cleanup must never touch: anything containing "jetty" (which already
// covers names starting with "jetty-") and the entries named exactly "proc", "sys" or "dev".
private static final Predicate<String> SHOULD_SKIP =
        fileName ->
                fileName.contains("jetty")
                        || fileName.equals("proc")
                        || fileName.equals("sys")
                        || fileName.equals("dev");
@Autowired @Autowired
public TempFileCleanupService(TempFileRegistry registry, TempFileManager tempFileManager) { public TempFileCleanupService(TempFileRegistry registry, TempFileManager tempFileManager) {
this.registry = registry; this.registry = registry;
@ -114,41 +148,9 @@ public class TempFileCleanupService {
} }
} }
int unregisteredDeletedCount = 0; // Clean up unregistered temp files based on our cleanup strategy
try { boolean containerMode = isContainerMode();
// Get all directories we need to clean int unregisteredDeletedCount = cleanupUnregisteredFiles(containerMode, true, maxAgeMillis);
Path systemTempPath;
if (systemTempDir != null && !systemTempDir.isEmpty()) {
systemTempPath = Path.of(systemTempDir);
} else {
systemTempPath = Path.of(System.getProperty("java.io.tmpdir"));
}
Path[] dirsToScan = {
systemTempPath, Path.of(customTempDirectory), Path.of(libreOfficeTempDir)
};
boolean containerMode =
"Docker".equals(machineType) || "Kubernetes".equals(machineType);
// Process each directory
for (Path tempDir : dirsToScan) {
if (!Files.exists(tempDir)) {
continue;
}
int dirDeletedCount = cleanupDirectory(tempDir, containerMode, 0, maxAgeMillis);
unregisteredDeletedCount += dirDeletedCount;
if (dirDeletedCount > 0) {
log.info(
"Cleaned up {} unregistered files/directories in {}",
dirDeletedCount,
tempDir);
}
}
} catch (IOException e) {
log.error("Error during scheduled cleanup of unregistered files", e);
}
log.info( log.info(
"Scheduled cleanup complete. Deleted {} registered files, {} unregistered files, {} directories", "Scheduled cleanup complete. Deleted {} registered files, {} unregistered files, {} directories",
@ -157,272 +159,217 @@ public class TempFileCleanupService {
directoriesDeletedCount); directoriesDeletedCount);
} }
/** Overload of cleanupDirectory that uses the specified max age for files */
private int cleanupDirectory(
Path directory, boolean containerMode, int depth, long maxAgeMillis)
throws IOException {
if (depth > 5) {
log.warn("Maximum directory recursion depth reached for: {}", directory);
return 0;
}
int deletedCount = 0;
try (Stream<Path> paths = Files.list(directory)) {
for (Path path : paths.toList()) {
String fileName = path.getFileName().toString();
// Skip registered files - these are handled by TempFileManager
if (registry.contains(path.toFile())) {
continue;
}
// Skip Jetty-related directories and files
if (fileName.contains("jetty") || fileName.startsWith("jetty-")) {
continue;
}
// Check if this is a directory we should recursively scan
if (Files.isDirectory(path)) {
// Don't recurse into certain system directories
if (!fileName.equals("proc")
&& !fileName.equals("sys")
&& !fileName.equals("dev")) {
deletedCount +=
cleanupDirectory(path, containerMode, depth + 1, maxAgeMillis);
}
continue;
}
// Determine if this file matches our temp file patterns
boolean isOurTempFile =
fileName.startsWith("stirling-pdf-")
|| fileName.startsWith("output_")
|| fileName.startsWith("compressedPDF")
|| fileName.startsWith("pdf-save-")
|| fileName.startsWith("pdf-stream-")
|| fileName.startsWith("PDFBox")
|| fileName.startsWith("input_")
|| fileName.startsWith("overlay-");
// Avoid touching Jetty files
boolean isSystemTempFile =
fileName.matches("lu\\d+[a-z0-9]*\\.tmp")
|| fileName.matches("ocr_process\\d+")
|| (fileName.startsWith("tmp") && !fileName.contains("jetty"))
|| fileName.startsWith("OSL_PIPE_")
|| (fileName.endsWith(".tmp") && !fileName.contains("jetty"));
boolean shouldDelete = isOurTempFile || (containerMode && isSystemTempFile);
// Special case for zero-byte files - these are often corrupted temp files
try {
if (Files.size(path) == 0) {
// For empty files, use a shorter timeout (5 minutes)
long lastModified = Files.getLastModifiedTime(path).toMillis();
long currentTime = System.currentTimeMillis();
// Delete empty files older than 5 minutes
if ((currentTime - lastModified) > 5 * 60 * 1000) {
shouldDelete = true;
}
}
} catch (IOException e) {
log.debug("Could not check file size, skipping: {}", path);
}
// Check file age against maxAgeMillis
if (shouldDelete) {
try {
long lastModified = Files.getLastModifiedTime(path).toMillis();
long currentTime = System.currentTimeMillis();
shouldDelete = (currentTime - lastModified) > maxAgeMillis;
} catch (IOException e) {
log.debug("Could not check file age, skipping: {}", path);
shouldDelete = false;
}
}
if (shouldDelete) {
try {
Files.deleteIfExists(path);
deletedCount++;
log.debug(
"Deleted unregistered temp file during scheduled cleanup: {}",
path);
} catch (IOException e) {
// Handle locked files more gracefully - just log at debug level
if (e.getMessage() != null
&& e.getMessage().contains("being used by another process")) {
log.debug("File locked, skipping delete: {}", path);
} else {
log.warn(
"Failed to delete temp file during scheduled cleanup: {}",
path,
e);
}
}
}
}
}
return deletedCount;
}
/** /**
* Perform startup cleanup of stale temporary files from previous runs. This is especially * Perform startup cleanup of stale temporary files from previous runs. This is especially
* important in Docker environments where temp files persist between container restarts. * important in Docker environments where temp files persist between container restarts.
*/ */
private void runStartupCleanup() { private void runStartupCleanup() {
log.info("Running startup temporary file cleanup"); log.info("Running startup temporary file cleanup");
boolean containerMode = isContainerMode();
log.info(
"Running in {} mode, using {} cleanup strategy",
machineType,
containerMode ? "aggressive" : "conservative");
// For startup cleanup, we use a longer timeout for non-container environments
long maxAgeMillis = containerMode ? 0 : 24 * 60 * 60 * 1000; // 0 or 24 hours
int totalDeletedCount = cleanupUnregisteredFiles(containerMode, false, maxAgeMillis);
log.info(
"Startup cleanup complete. Deleted {} temporary files/directories",
totalDeletedCount);
}
/**
* Clean up unregistered temporary files across all configured temp directories.
*
* @param containerMode Whether we're in container mode (more aggressive cleanup)
* @param isScheduled Whether this is a scheduled cleanup or startup cleanup
* @param maxAgeMillis Maximum age of files to clean in milliseconds
* @return Number of files deleted
*/
private int cleanupUnregisteredFiles(boolean containerMode, boolean isScheduled, long maxAgeMillis) {
AtomicInteger totalDeletedCount = new AtomicInteger(0);
try { try {
// Get all directories we need to clean // Get all directories we need to clean
Path systemTempPath; Path systemTempPath = getSystemTempPath();
if (systemTempDir != null && !systemTempDir.isEmpty()) {
systemTempPath = Path.of(systemTempDir);
} else {
systemTempPath = Path.of(System.getProperty("java.io.tmpdir"));
}
Path[] dirsToScan = { Path[] dirsToScan = {
systemTempPath, Path.of(customTempDirectory), Path.of(libreOfficeTempDir) systemTempPath,
Path.of(customTempDirectory),
Path.of(libreOfficeTempDir)
}; };
int totalDeletedCount = 0;
boolean containerMode =
"Docker".equals(machineType) || "Kubernetes".equals(machineType);
log.info(
"Running in {} mode, using {} cleanup strategy",
machineType,
containerMode ? "aggressive" : "conservative");
// Process each directory // Process each directory
for (Path tempDir : dirsToScan) { Arrays.stream(dirsToScan)
if (!Files.exists(tempDir)) { .filter(Files::exists)
log.warn("Temporary directory does not exist: {}", tempDir); .forEach(tempDir -> {
continue; try {
String phase = isScheduled ? "scheduled" : "startup";
log.info("Scanning directory for {} cleanup: {}", phase, tempDir);
AtomicInteger dirDeletedCount = new AtomicInteger(0);
cleanupDirectoryStreaming(
tempDir,
containerMode,
0,
maxAgeMillis,
isScheduled,
path -> {
dirDeletedCount.incrementAndGet();
if (log.isDebugEnabled()) {
log.debug("Deleted temp file during {} cleanup: {}", phase, path);
}
}
);
int count = dirDeletedCount.get();
totalDeletedCount.addAndGet(count);
if (count > 0) {
log.info("Cleaned up {} files/directories in {}", count, tempDir);
}
} catch (IOException e) {
log.error("Error during cleanup of directory: {}", tempDir, e);
}
});
} catch (Exception e) {
log.error("Error during cleanup of unregistered files", e);
}
return totalDeletedCount.get();
}
/**
 * Resolves the system temp directory: the configured {@code systemTempDir} when it is set and
 * non-empty, otherwise the JVM's {@code java.io.tmpdir} system property.
 */
private Path getSystemTempPath() {
    boolean configured = systemTempDir != null && !systemTempDir.isEmpty();
    String location = configured ? systemTempDir : System.getProperty("java.io.tmpdir");
    return Path.of(location);
}
/**
 * Reports whether {@code machineType} is "Docker" or "Kubernetes". The literal is the receiver
 * of each comparison, so a null {@code machineType} yields {@code false}.
 */
private boolean isContainerMode() {
    if ("Docker".equals(machineType)) {
        return true;
    }
    return "Kubernetes".equals(machineType);
}
/**
* Recursively clean up a directory using a streaming approach to reduce memory usage.
*
* @param directory The directory to clean
* @param containerMode Whether we're in container mode (more aggressive cleanup)
* @param depth Current recursion depth
* @param maxAgeMillis Maximum age of files to delete
* @param isScheduled Whether this is a scheduled cleanup (vs startup)
* @param onDeleteCallback Callback function when a file is deleted
* @throws IOException If an I/O error occurs
*/
private void cleanupDirectoryStreaming(
Path directory,
boolean containerMode,
int depth,
long maxAgeMillis,
boolean isScheduled,
Consumer<Path> onDeleteCallback) throws IOException {
// Check recursion depth limit
if (depth > MAX_RECURSION_DEPTH) {
log.warn("Maximum directory recursion depth reached for: {}", directory);
return;
}
// Use try-with-resources to ensure the stream is closed
try (Stream<Path> pathStream = Files.list(directory)) {
// Process files in a streaming fashion instead of materializing the whole list
pathStream.forEach(path -> {
try {
String fileName = path.getFileName().toString();
// Skip if file should be excluded
if (SHOULD_SKIP.test(fileName)) {
return;
}
// Handle directories recursively
if (Files.isDirectory(path)) {
try {
cleanupDirectoryStreaming(
path, containerMode, depth + 1, maxAgeMillis, isScheduled, onDeleteCallback);
} catch (IOException e) {
log.warn("Error processing subdirectory: {}", path, e);
}
return;
}
// Skip registered files - these are handled by TempFileManager
if (isScheduled && registry.contains(path.toFile())) {
return;
}
// Check if this file should be deleted
if (shouldDeleteFile(path, fileName, containerMode, maxAgeMillis)) {
try {
Files.deleteIfExists(path);
onDeleteCallback.accept(path);
} catch (IOException e) {
// Handle locked files more gracefully
if (e.getMessage() != null && e.getMessage().contains("being used by another process")) {
log.debug("File locked, skipping delete: {}", path);
} else {
log.warn("Failed to delete temp file: {}", path, e);
}
}
}
} catch (Exception e) {
log.warn("Error processing path: {}", path, e);
} }
});
log.info("Scanning directory for cleanup: {}", tempDir);
int dirDeletedCount = cleanupDirectory(tempDir, containerMode, 0);
totalDeletedCount += dirDeletedCount;
log.info("Cleaned up {} files/directories in {}", dirDeletedCount, tempDir);
}
log.info(
"Startup cleanup complete. Deleted {} temporary files/directories",
totalDeletedCount);
} catch (IOException e) {
log.error("Error during startup cleanup", e);
} }
} }
/** /**
* Recursively clean up a directory for temporary files. * Determine if a file should be deleted based on its name, age, and other criteria.
*
* @param directory The directory to clean
* @param containerMode Whether we're in container mode (more aggressive cleanup)
* @param depth Current recursion depth (to prevent excessive recursion)
* @return Number of files deleted
*/ */
private int cleanupDirectory(Path directory, boolean containerMode, int depth) private boolean shouldDeleteFile(Path path, String fileName, boolean containerMode, long maxAgeMillis) {
throws IOException { // First check if it matches our known temp file patterns
if (depth > 5) { boolean isOurTempFile = IS_OUR_TEMP_FILE.test(fileName);
log.warn("Maximum directory recursion depth reached for: {}", directory); boolean isSystemTempFile = IS_SYSTEM_TEMP_FILE.test(fileName);
return 0; boolean shouldDelete = isOurTempFile || (containerMode && isSystemTempFile);
// Special case for zero-byte files - these are often corrupted temp files
try {
if (Files.size(path) == 0) {
// For empty files, use a shorter timeout (5 minutes)
long lastModified = Files.getLastModifiedTime(path).toMillis();
long currentTime = System.currentTimeMillis();
// Delete empty files older than 5 minutes
if ((currentTime - lastModified) > 5 * 60 * 1000) {
shouldDelete = true;
}
}
} catch (IOException e) {
log.debug("Could not check file size, skipping: {}", path);
} }
int deletedCount = 0; // Check file age against maxAgeMillis
if (shouldDelete && maxAgeMillis > 0) {
try (Stream<Path> paths = Files.list(directory)) { try {
for (Path path : paths.toList()) { long lastModified = Files.getLastModifiedTime(path).toMillis();
String fileName = path.getFileName().toString(); long currentTime = System.currentTimeMillis();
shouldDelete = (currentTime - lastModified) > maxAgeMillis;
// Skip Jetty-related directories and files } catch (IOException e) {
if (fileName.contains("jetty") || fileName.startsWith("jetty-")) { log.debug("Could not check file age, skipping: {}", path);
continue; shouldDelete = false;
}
// Check if this is a directory we should recursively scan
if (Files.isDirectory(path)) {
// Don't recurse into certain system directories
if (!fileName.equals("proc")
&& !fileName.equals("sys")
&& !fileName.equals("dev")) {
deletedCount += cleanupDirectory(path, containerMode, depth + 1);
}
continue;
}
// Determine if this file matches our temp file patterns
boolean isOurTempFile =
fileName.startsWith("stirling-pdf-")
|| fileName.startsWith("output_")
|| fileName.startsWith("compressedPDF")
|| fileName.startsWith("pdf-save-")
|| fileName.startsWith("pdf-stream-")
|| fileName.startsWith("PDFBox")
|| fileName.startsWith("input_")
|| fileName.startsWith("overlay-");
// Avoid touching Jetty files
boolean isSystemTempFile =
fileName.matches("lu\\d+[a-z0-9]*\\.tmp")
|| fileName.matches("ocr_process\\d+")
|| (fileName.startsWith("tmp") && !fileName.contains("jetty"))
|| fileName.startsWith("OSL_PIPE_")
|| (fileName.endsWith(".tmp") && !fileName.contains("jetty"));
boolean shouldDelete = isOurTempFile || (containerMode && isSystemTempFile);
// Special case for zero-byte files - these are often corrupted temp files
boolean isEmptyFile = false;
try {
if (!Files.isDirectory(path) && Files.size(path) == 0) {
isEmptyFile = true;
// For empty files, use a shorter timeout (5 minutes)
long lastModified = Files.getLastModifiedTime(path).toMillis();
long currentTime = System.currentTimeMillis();
// Delete empty files older than 5 minutes
if ((currentTime - lastModified) > 5 * 60 * 1000) {
shouldDelete = true;
}
}
} catch (IOException e) {
log.debug("Could not check file size, skipping: {}", path);
}
// For non-container mode, check file age before deleting
if (!containerMode && (isOurTempFile || isSystemTempFile) && !isEmptyFile) {
try {
long lastModified = Files.getLastModifiedTime(path).toMillis();
long currentTime = System.currentTimeMillis();
// Only delete files older than 24 hours in non-container mode
shouldDelete = (currentTime - lastModified) > 24 * 60 * 60 * 1000;
} catch (IOException e) {
log.debug("Could not check file age, skipping: {}", path);
shouldDelete = false;
}
}
if (shouldDelete) {
try {
if (Files.isDirectory(path)) {
GeneralUtils.deleteDirectory(path);
} else {
Files.deleteIfExists(path);
}
deletedCount++;
log.debug("Deleted temp file during startup cleanup: {}", path);
} catch (IOException e) {
log.warn("Failed to delete temp file during startup cleanup: {}", path, e);
}
}
} }
} }
return deletedCount; return shouldDelete;
} }
/** Clean up LibreOffice temporary files. This method is called after LibreOffice operations. */ /** Clean up LibreOffice temporary files. This method is called after LibreOffice operations. */
@ -431,18 +378,17 @@ public class TempFileCleanupService {
try { try {
Set<Path> directories = registry.getTempDirectories(); Set<Path> directories = registry.getTempDirectories();
for (Path dir : directories) { for (Path dir : directories) {
if (dir.getFileName().toString().contains("libreoffice")) { if (dir.getFileName().toString().contains("libreoffice") && Files.exists(dir)) {
// For directories containing "libreoffice", delete all contents // For directories containing "libreoffice", delete all contents
// but keep the directory itself for future use // but keep the directory itself for future use
try (Stream<Path> files = Files.list(dir)) { cleanupDirectoryStreaming(
for (Path file : files.toList()) { dir,
if (Files.isDirectory(file)) { isContainerMode(),
GeneralUtils.deleteDirectory(file); 0,
} else { 0, // age doesn't matter for LibreOffice cleanup
Files.deleteIfExists(file); false,
} path -> log.debug("Cleaned up LibreOffice temp file: {}", path)
} );
}
log.debug("Cleaned up LibreOffice temp directory contents: {}", dir); log.debug("Cleaned up LibreOffice temp directory contents: {}", dir);
} }
} }
@ -450,4 +396,4 @@ public class TempFileCleanupService {
log.warn("Failed to clean up LibreOffice temp files", e); log.warn("Failed to clean up LibreOffice temp files", e);
} }
} }
} }

View File

@ -14,6 +14,8 @@ import java.util.Arrays;
import java.util.HashSet; import java.util.HashSet;
import java.util.Set; import java.util.Set;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Consumer;
import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
@ -118,12 +120,14 @@ public class TempFileCleanupServiceTest {
// Create a file older than threshold // Create a file older than threshold
Path oldFile = Files.createFile(systemTempDir.resolve("output_old.pdf")); Path oldFile = Files.createFile(systemTempDir.resolve("output_old.pdf"));
Files.setLastModifiedTime(oldFile, FileTime.from( Files.getLastModifiedTime(oldFile).toMillis() - 5000000, TimeUnit.MILLISECONDS)); Files.setLastModifiedTime(oldFile, FileTime.from(
Files.getLastModifiedTime(oldFile).toMillis() - 5000000,
TimeUnit.MILLISECONDS));
// Act // Act
invokeCleanupDirectory(systemTempDir, true, 0, 3600000); invokeCleanupDirectoryStreaming(systemTempDir, true, 0, 3600000);
invokeCleanupDirectory(customTempDir, true, 0, 3600000); invokeCleanupDirectoryStreaming(customTempDir, true, 0, 3600000);
invokeCleanupDirectory(libreOfficeTempDir, true, 0, 3600000); invokeCleanupDirectoryStreaming(libreOfficeTempDir, true, 0, 3600000);
// Assert - Our temp files and system temp files should be deleted (if old enough) // Assert - Our temp files and system temp files should be deleted (if old enough)
assertFalse(Files.exists(oldFile), "Old temp file should be deleted"); assertFalse(Files.exists(oldFile), "Old temp file should be deleted");
@ -141,14 +145,15 @@ public class TempFileCleanupServiceTest {
// Arrange - Create an empty file // Arrange - Create an empty file
Path emptyFile = Files.createFile(systemTempDir.resolve("empty.tmp")); Path emptyFile = Files.createFile(systemTempDir.resolve("empty.tmp"));
// Make it "old enough" to be deleted (>5 minutes) // Make it "old enough" to be deleted (>5 minutes)
Files.setLastModifiedTime(emptyFile, FileTime.from( Files.getLastModifiedTime(emptyFile).toMillis() - 6 * 60 * 1000, TimeUnit.MILLISECONDS)); Files.setLastModifiedTime(emptyFile, FileTime.from(
Files.getLastModifiedTime(emptyFile).toMillis() - 6 * 60 * 1000,
TimeUnit.MILLISECONDS));
// Configure mock registry to say this file isn't registered // Configure mock registry to say this file isn't registered
when(registry.contains(any(File.class))).thenReturn(false); when(registry.contains(any(File.class))).thenReturn(false);
// Act // Act
invokeCleanupDirectory(systemTempDir, true, 0, 3600000); invokeCleanupDirectoryStreaming(systemTempDir, true, 0, 3600000);
// Assert // Assert
assertFalse(Files.exists(emptyFile), "Empty file older than 5 minutes should be deleted"); assertFalse(Files.exists(emptyFile), "Empty file older than 5 minutes should be deleted");
@ -166,13 +171,15 @@ public class TempFileCleanupServiceTest {
Path tempFile3 = Files.createFile(dir3.resolve("output_3.pdf")); Path tempFile3 = Files.createFile(dir3.resolve("output_3.pdf"));
// Make the deepest file old enough to be deleted // Make the deepest file old enough to be deleted
Files.setLastModifiedTime(tempFile3, FileTime.from( Files.getLastModifiedTime(tempFile3).toMillis() - 5000000, TimeUnit.MILLISECONDS)); Files.setLastModifiedTime(tempFile3, FileTime.from(
Files.getLastModifiedTime(tempFile3).toMillis() - 5000000,
TimeUnit.MILLISECONDS));
// Configure mock registry to say these files aren't registered // Configure mock registry to say these files aren't registered
when(registry.contains(any(File.class))).thenReturn(false); when(registry.contains(any(File.class))).thenReturn(false);
// Act // Act
invokeCleanupDirectory(systemTempDir, true, 0, 3600000); invokeCleanupDirectoryStreaming(systemTempDir, true, 0, 3600000);
// Assert // Assert
assertTrue(Files.exists(tempFile1), "Recent temp file should be preserved"); assertTrue(Files.exists(tempFile1), "Recent temp file should be preserved");
@ -181,17 +188,25 @@ public class TempFileCleanupServiceTest {
} }
/** /**
* Helper method to invoke the private cleanupDirectory method using reflection * Helper method to invoke the private cleanupDirectoryStreaming method using reflection
*/ */
private int invokeCleanupDirectory(Path directory, boolean containerMode, int depth, long maxAgeMillis) private void invokeCleanupDirectoryStreaming(Path directory, boolean containerMode, int depth, long maxAgeMillis)
throws IOException { throws IOException {
try { try {
// Create a consumer that tracks deleted files
AtomicInteger deleteCount = new AtomicInteger(0);
Consumer<Path> deleteCallback = path -> deleteCount.incrementAndGet();
// Get the new method with updated signature
var method = TempFileCleanupService.class.getDeclaredMethod( var method = TempFileCleanupService.class.getDeclaredMethod(
"cleanupDirectory", Path.class, boolean.class, int.class, long.class); "cleanupDirectoryStreaming",
Path.class, boolean.class, int.class, long.class, boolean.class, Consumer.class);
method.setAccessible(true); method.setAccessible(true);
return (int) method.invoke(cleanupService, directory, containerMode, depth, maxAgeMillis);
// Invoke the method with appropriate parameters
method.invoke(cleanupService, directory, containerMode, depth, maxAgeMillis, false, deleteCallback);
} catch (Exception e) { } catch (Exception e) {
throw new RuntimeException("Error invoking cleanupDirectory", e); throw new RuntimeException("Error invoking cleanupDirectoryStreaming", e);
} }
} }
} }

View File

@ -2,7 +2,6 @@ package stirling.software.SPDF.controller.api.misc;
import java.awt.image.BufferedImage; import java.awt.image.BufferedImage;
import java.io.*; import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.*; import java.util.*;
import java.util.zip.ZipEntry; import java.util.zip.ZipEntry;
@ -23,7 +22,6 @@ import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController; import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile; import org.springframework.web.multipart.MultipartFile;
import io.github.pixee.security.BoundedLineReader;
import io.github.pixee.security.Filenames; import io.github.pixee.security.Filenames;
import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag; import io.swagger.v3.oas.annotations.tags.Tag;
@ -34,6 +32,11 @@ import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.api.misc.ProcessPdfWithOcrRequest; import stirling.software.SPDF.model.api.misc.ProcessPdfWithOcrRequest;
import stirling.software.common.model.ApplicationProperties; import stirling.software.common.model.ApplicationProperties;
import stirling.software.common.service.CustomPDFDocumentFactory; import stirling.software.common.service.CustomPDFDocumentFactory;
import stirling.software.common.util.ProcessExecutor;
import stirling.software.common.util.ProcessExecutor.ProcessExecutorResult;
import stirling.software.common.util.TempFileManager;
import stirling.software.common.util.TempFileUtil;
import stirling.software.common.util.TempFileUtil.TempFile;
@RestController @RestController
@RequestMapping("/api/v1/misc") @RequestMapping("/api/v1/misc")
@ -43,8 +46,8 @@ import stirling.software.common.service.CustomPDFDocumentFactory;
public class OCRController { public class OCRController {
private final ApplicationProperties applicationProperties; private final ApplicationProperties applicationProperties;
private final CustomPDFDocumentFactory pdfDocumentFactory; private final CustomPDFDocumentFactory pdfDocumentFactory;
private final TempFileManager tempFileManager;
/** Gets the list of available Tesseract languages from the tessdata directory */ /** Gets the list of available Tesseract languages from the tessdata directory */
public List<String> getAvailableTesseractLanguages() { public List<String> getAvailableTesseractLanguages() {
@ -73,93 +76,108 @@ public class OCRController {
MultipartFile inputFile = request.getFileInput(); MultipartFile inputFile = request.getFileInput();
List<String> languages = request.getLanguages(); List<String> languages = request.getLanguages();
String ocrType = request.getOcrType(); String ocrType = request.getOcrType();
Path tempDir = Files.createTempDirectory("ocr_process");
Path tempInputFile = tempDir.resolve("input.pdf"); // Create a temp directory using TempFileManager directly
Path tempOutputDir = tempDir.resolve("output"); Path tempDirPath = tempFileManager.createTempDirectory();
Path tempImagesDir = tempDir.resolve("images"); File tempDir = tempDirPath.toFile();
Path finalOutputFile = tempDir.resolve("final_output.pdf");
Files.createDirectories(tempOutputDir);
Files.createDirectories(tempImagesDir);
Process process = null;
try { try {
File tempInputFile = new File(tempDir, "input.pdf");
File tempOutputDir = new File(tempDir, "output");
File tempImagesDir = new File(tempDir, "images");
File finalOutputFile = new File(tempDir, "final_output.pdf");
// Create directories
tempOutputDir.mkdirs();
tempImagesDir.mkdirs();
// Save input file // Save input file
inputFile.transferTo(tempInputFile.toFile()); inputFile.transferTo(tempInputFile);
PDFMergerUtility merger = new PDFMergerUtility(); PDFMergerUtility merger = new PDFMergerUtility();
merger.setDestinationFileName(finalOutputFile.toString()); merger.setDestinationFileName(finalOutputFile.toString());
try (PDDocument document = pdfDocumentFactory.load(tempInputFile.toFile())) {
try (PDDocument document = pdfDocumentFactory.load(tempInputFile)) {
PDFRenderer pdfRenderer = new PDFRenderer(document); PDFRenderer pdfRenderer = new PDFRenderer(document);
int pageCount = document.getNumberOfPages(); int pageCount = document.getNumberOfPages();
for (int pageNum = 0; pageNum < pageCount; pageNum++) { for (int pageNum = 0; pageNum < pageCount; pageNum++) {
PDPage page = document.getPage(pageNum); PDPage page = document.getPage(pageNum);
boolean hasText = false; boolean hasText = false;
// Check for existing text // Check for existing text
try (PDDocument tempDoc = new PDDocument()) { try (PDDocument tempDoc = new PDDocument()) {
tempDoc.addPage(page); tempDoc.addPage(page);
PDFTextStripper stripper = new PDFTextStripper(); PDFTextStripper stripper = new PDFTextStripper();
hasText = !stripper.getText(tempDoc).trim().isEmpty(); hasText = !stripper.getText(tempDoc).trim().isEmpty();
} }
boolean shouldOcr =
switch (ocrType) { boolean shouldOcr = switch (ocrType) {
case "skip-text" -> !hasText; case "skip-text" -> !hasText;
case "force-ocr" -> true; case "force-ocr" -> true;
default -> true; default -> true;
}; };
Path pageOutputPath =
tempOutputDir.resolve(String.format("page_%d.pdf", pageNum)); File pageOutputPath = new File(tempOutputDir, String.format("page_%d.pdf", pageNum));
if (shouldOcr) { if (shouldOcr) {
// Convert page to image // Convert page to image
BufferedImage image = pdfRenderer.renderImageWithDPI(pageNum, 300); BufferedImage image = pdfRenderer.renderImageWithDPI(pageNum, 300);
Path imagePath = File imagePath = new File(tempImagesDir, String.format("page_%d.png", pageNum));
tempImagesDir.resolve(String.format("page_%d.png", pageNum)); ImageIO.write(image, "png", imagePath);
ImageIO.write(image, "png", imagePath.toFile());
// Build OCR command // Build OCR command
List<String> command = new ArrayList<>(); List<String> command = new ArrayList<>();
command.add("tesseract"); command.add("tesseract");
command.add(imagePath.toString()); command.add(imagePath.toString());
command.add( command.add(
tempOutputDir new File(tempOutputDir, String.format("page_%d", pageNum))
.resolve(String.format("page_%d", pageNum))
.toString()); .toString());
command.add("-l"); command.add("-l");
command.add(String.join("+", languages)); command.add(String.join("+", languages));
// Always output PDF // Always output PDF
command.add("pdf"); command.add("pdf");
ProcessBuilder pb = new ProcessBuilder(command);
process = pb.start(); // Use ProcessExecutor to run tesseract command
// Capture any error output try {
try (BufferedReader reader = ProcessExecutorResult result = ProcessExecutor.getInstance(ProcessExecutor.Processes.TESSERACT)
new BufferedReader( .runCommandWithOutputHandling(command);
new InputStreamReader(process.getErrorStream()))) {
String line; log.debug("Tesseract OCR completed for page {} with exit code {}",
while ((line = BoundedLineReader.readLine(reader, 5_000_000)) != null) { pageNum, result.getRc());
log.debug("Tesseract: {}", line);
// Add OCR'd PDF to merger
merger.addSource(pageOutputPath);
} catch (IOException | InterruptedException e) {
log.error("Error processing page {} with tesseract: {}", pageNum, e.getMessage());
// If OCR fails, fall back to the original page
try (PDDocument pageDoc = new PDDocument()) {
pageDoc.addPage(page);
pageDoc.save(pageOutputPath);
merger.addSource(pageOutputPath);
} }
} }
int exitCode = process.waitFor();
if (exitCode != 0) {
throw new RuntimeException(
"Tesseract failed with exit code: " + exitCode);
}
// Add OCR'd PDF to merger
merger.addSource(pageOutputPath.toFile());
} else { } else {
// Save original page without OCR // Save original page without OCR
try (PDDocument pageDoc = new PDDocument()) { try (PDDocument pageDoc = new PDDocument()) {
pageDoc.addPage(page); pageDoc.addPage(page);
pageDoc.save(pageOutputPath.toFile()); pageDoc.save(pageOutputPath);
merger.addSource(pageOutputPath.toFile()); merger.addSource(pageOutputPath);
} }
} }
} }
} }
// Merge all pages into final PDF // Merge all pages into final PDF
merger.mergeDocuments(null); merger.mergeDocuments(null);
// Read the final PDF file // Read the final PDF file
byte[] pdfContent = Files.readAllBytes(finalOutputFile); byte[] pdfContent = java.nio.file.Files.readAllBytes(finalOutputFile.toPath());
String outputFilename = String outputFilename =
Filenames.toSimpleFileName(inputFile.getOriginalFilename()) Filenames.toSimpleFileName(inputFile.getOriginalFilename())
.replaceFirst("[.][^.]+$", "") .replaceFirst("[.][^.]+$", "")
+ "_OCR.pdf"; + "_OCR.pdf";
return ResponseEntity.ok() return ResponseEntity.ok()
.header( .header(
"Content-Disposition", "Content-Disposition",
@ -167,14 +185,11 @@ public class OCRController {
.contentType(MediaType.APPLICATION_PDF) .contentType(MediaType.APPLICATION_PDF)
.body(pdfContent); .body(pdfContent);
} finally { } finally {
if (process != null) { // Clean up the temp directory and all its contents
process.destroy(); tempFileManager.deleteTempDirectory(tempDirPath);
}
// Clean up temporary files
deleteDirectory(tempDir);
} }
} }
private void addFileToZip(File file, String filename, ZipOutputStream zipOut) private void addFileToZip(File file, String filename, ZipOutputStream zipOut)
throws IOException { throws IOException {
if (!file.exists()) { if (!file.exists()) {
@ -192,21 +207,4 @@ public class OCRController {
zipOut.closeEntry(); zipOut.closeEntry();
} }
} }
}
/**
 * Recursively deletes {@code directory} and everything beneath it on a best-effort basis.
 *
 * <p>I/O failures are logged and not rethrown: a failure to delete one path does not stop the
 * remaining paths from being attempted, and a failure to start walking the tree is logged as
 * well.
 *
 * @param directory root of the tree to remove
 */
private void deleteDirectory(Path directory) {
    // Files.walk keeps directory handles open until the stream is closed, so it must be
    // managed with try-with-resources; otherwise every call leaks file descriptors.
    try (var paths = Files.walk(directory)) {
        // Reverse order visits children before their parent, so each directory is already
        // empty by the time it is deleted.
        paths.sorted(Comparator.reverseOrder())
                .forEach(
                        path -> {
                            try {
                                Files.delete(path);
                            } catch (IOException e) {
                                log.error("Error deleting {}: {}", path, e.getMessage());
                            }
                        });
    } catch (IOException e) {
        log.error("Error walking directory {}: {}", directory, e.getMessage());
    }
}
}

View File

@ -44,4 +44,7 @@ springdoc.swagger-ui.path=/index.html
posthog.api.key=phc_fiR65u5j6qmXTYL56MNrLZSWqLaDW74OrZH0Insd2xq posthog.api.key=phc_fiR65u5j6qmXTYL56MNrLZSWqLaDW74OrZH0Insd2xq
posthog.host=https://eu.i.posthog.com posthog.host=https://eu.i.posthog.com
spring.main.allow-bean-definition-overriding=true spring.main.allow-bean-definition-overriding=true
# Set up a consistent temporary directory location
java.io.tmpdir=${stirling.tempfiles.directory:${java.io.tmpdir}/stirling-pdf}