From 93ea611eda3a6558dcb5e66b9bf5b4466765488c Mon Sep 17 00:00:00 2001 From: Anthony Stirling <77850077+Frooodle@users.noreply.github.com.> Date: Mon, 23 Jun 2025 17:24:27 +0100 Subject: [PATCH] improvements to tmp dir, OCR and others --- .../common/config/TempFileConfiguration.java | 45 +- .../service/TempFileCleanupService.java | 522 ++++++++---------- .../service/TempFileCleanupServiceTest.java | 43 +- .../controller/api/misc/OCRController.java | 136 +++-- .../src/main/resources/application.properties | 5 +- 5 files changed, 341 insertions(+), 410 deletions(-) diff --git a/common/src/main/java/stirling/software/common/config/TempFileConfiguration.java b/common/src/main/java/stirling/software/common/config/TempFileConfiguration.java index 8939e1474..4cb8ec189 100644 --- a/common/src/main/java/stirling/software/common/config/TempFileConfiguration.java +++ b/common/src/main/java/stirling/software/common/config/TempFileConfiguration.java @@ -25,7 +25,7 @@ import stirling.software.common.util.TempFileRegistry; @Configuration public class TempFileConfiguration { - @Value("${stirling.tempfiles.directory:}") + @Value("${stirling.tempfiles.directory:${java.io.tmpdir}/stirling-pdf}") private String customTempDirectory; @Autowired @@ -48,42 +48,11 @@ public class TempFileConfiguration { @PostConstruct public void initTempFileConfig() { try { - // If a custom temp directory is specified in the config, use it - if (customTempDirectory != null && !customTempDirectory.isEmpty()) { - Path tempDir = Path.of(customTempDirectory); - if (!Files.exists(tempDir)) { - Files.createDirectories(tempDir); - log.info("Created custom temporary directory: {}", tempDir); - } - - // Set Java temp directory system property if in Docker/Kubernetes mode - if ("Docker".equals(machineType) || "Kubernetes".equals(machineType)) { - System.setProperty("java.io.tmpdir", customTempDirectory); - log.info( - "Set system temp directory to: {} for environment: {}", - customTempDirectory, - machineType); - } - } else { - // No custom directory specified, use java.io.tmpdir + application subfolder - String defaultTempDir; - - if ("Docker".equals(machineType) || "Kubernetes".equals(machineType)) { - // Container environments should continue to use /tmp/stirling-pdf - defaultTempDir = "/tmp/stirling-pdf"; - } else { - // Use system temp directory (java.io.tmpdir) with our application subfolder - // This automatically handles Windows (AppData\Local\Temp), macOS, and Linux systems - defaultTempDir = System.getProperty("java.io.tmpdir") + File.separator + "stirling-pdf"; - } - customTempDirectory = defaultTempDir; - - // Create the default temp directory - Path tempDir = Path.of(customTempDirectory); - if (!Files.exists(tempDir)) { - Files.createDirectories(tempDir); - log.info("Created default OS-specific temporary directory: {}", tempDir); - } + // Create the temp directory if it doesn't exist + Path tempDir = Path.of(customTempDirectory); + if (!Files.exists(tempDir)) { + Files.createDirectories(tempDir); + log.info("Created temporary directory: {}", tempDir); } log.info("Temporary file configuration initialized"); @@ -93,4 +62,4 @@ public class TempFileConfiguration { log.error("Failed to initialize temporary file configuration", e); } } -} +} \ No newline at end of file diff --git a/common/src/main/java/stirling/software/common/service/TempFileCleanupService.java b/common/src/main/java/stirling/software/common/service/TempFileCleanupService.java index ee6fdd3c9..9b496e90c 100644 --- a/common/src/main/java/stirling/software/common/service/TempFileCleanupService.java +++ b/common/src/main/java/stirling/software/common/service/TempFileCleanupService.java @@ -3,8 +3,12 @@ package stirling.software.common.service; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import java.util.Arrays; import java.util.Set; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.Consumer; +import java.util.function.Predicate; import java.util.stream.Stream; import org.springframework.beans.factory.annotation.Autowired; @@ -49,6 +53,36 @@ public class TempFileCleanupService { @Value("${stirling.tempfiles.libreoffice-dir:/tmp/stirling-pdf/libreoffice}") private String libreOfficeTempDir; + // Maximum recursion depth for directory traversal + private static final int MAX_RECURSION_DEPTH = 5; + + // File patterns that identify our temp files + private static final Predicate IS_OUR_TEMP_FILE = fileName -> + fileName.startsWith("stirling-pdf-") || + fileName.startsWith("output_") || + fileName.startsWith("compressedPDF") || + fileName.startsWith("pdf-save-") || + fileName.startsWith("pdf-stream-") || + fileName.startsWith("PDFBox") || + fileName.startsWith("input_") || + fileName.startsWith("overlay-"); + + // File patterns that identify common system temp files + private static final Predicate IS_SYSTEM_TEMP_FILE = fileName -> + fileName.matches("lu\\d+[a-z0-9]*\\.tmp") || + fileName.matches("ocr_process\\d+") || + (fileName.startsWith("tmp") && !fileName.contains("jetty")) || + fileName.startsWith("OSL_PIPE_") || + (fileName.endsWith(".tmp") && !fileName.contains("jetty")); + + // File patterns that should be excluded from cleanup + private static final Predicate SHOULD_SKIP = fileName -> + fileName.contains("jetty") || + fileName.startsWith("jetty-") || + fileName.equals("proc") || + fileName.equals("sys") || + fileName.equals("dev"); + @Autowired public TempFileCleanupService(TempFileRegistry registry, TempFileManager tempFileManager) { this.registry = registry; @@ -114,41 +148,9 @@ public class TempFileCleanupService { } } - int unregisteredDeletedCount = 0; - try { - // Get all directories we need to clean - Path systemTempPath; - if (systemTempDir != null && !systemTempDir.isEmpty()) { - systemTempPath = Path.of(systemTempDir); - } else { - systemTempPath = Path.of(System.getProperty("java.io.tmpdir")); - } - - Path[] dirsToScan = { - systemTempPath, Path.of(customTempDirectory), Path.of(libreOfficeTempDir) - }; - - boolean containerMode = - "Docker".equals(machineType) || "Kubernetes".equals(machineType); - - // Process each directory - for (Path tempDir : dirsToScan) { - if (!Files.exists(tempDir)) { - continue; - } - - int dirDeletedCount = cleanupDirectory(tempDir, containerMode, 0, maxAgeMillis); - unregisteredDeletedCount += dirDeletedCount; - if (dirDeletedCount > 0) { - log.info( - "Cleaned up {} unregistered files/directories in {}", - dirDeletedCount, - tempDir); - } - } - } catch (IOException e) { - log.error("Error during scheduled cleanup of unregistered files", e); - } + // Clean up unregistered temp files based on our cleanup strategy + boolean containerMode = isContainerMode(); + int unregisteredDeletedCount = cleanupUnregisteredFiles(containerMode, true, maxAgeMillis); log.info( "Scheduled cleanup complete. Deleted {} registered files, {} unregistered files, {} directories", @@ -157,272 +159,217 @@ public class TempFileCleanupService { directoriesDeletedCount); } - /** Overload of cleanupDirectory that uses the specified max age for files */ - private int cleanupDirectory( - Path directory, boolean containerMode, int depth, long maxAgeMillis) - throws IOException { - if (depth > 5) { - log.warn("Maximum directory recursion depth reached for: {}", directory); - return 0; - } - - int deletedCount = 0; - - try (Stream paths = Files.list(directory)) { - for (Path path : paths.toList()) { - String fileName = path.getFileName().toString(); - - // Skip registered files - these are handled by TempFileManager - if (registry.contains(path.toFile())) { - continue; - } - - // Skip Jetty-related directories and files - if (fileName.contains("jetty") || fileName.startsWith("jetty-")) { - continue; - } - - // Check if this is a directory we should recursively scan - if (Files.isDirectory(path)) { - // Don't recurse into certain system directories - if (!fileName.equals("proc") - && !fileName.equals("sys") - && !fileName.equals("dev")) { - deletedCount += - cleanupDirectory(path, containerMode, depth + 1, maxAgeMillis); - } - continue; - } - - // Determine if this file matches our temp file patterns - boolean isOurTempFile = - fileName.startsWith("stirling-pdf-") - || fileName.startsWith("output_") - || fileName.startsWith("compressedPDF") - || fileName.startsWith("pdf-save-") - || fileName.startsWith("pdf-stream-") - || fileName.startsWith("PDFBox") - || fileName.startsWith("input_") - || fileName.startsWith("overlay-"); - - // Avoid touching Jetty files - boolean isSystemTempFile = - fileName.matches("lu\\d+[a-z0-9]*\\.tmp") - || fileName.matches("ocr_process\\d+") - || (fileName.startsWith("tmp") && !fileName.contains("jetty")) - || fileName.startsWith("OSL_PIPE_") - || (fileName.endsWith(".tmp") && !fileName.contains("jetty")); - - boolean shouldDelete = isOurTempFile || (containerMode && isSystemTempFile); - - // Special case for zero-byte files - these are often corrupted temp files - try { - if (Files.size(path) == 0) { - // For empty files, use a shorter timeout (5 minutes) - long lastModified = Files.getLastModifiedTime(path).toMillis(); - long currentTime = System.currentTimeMillis(); - // Delete empty files older than 5 minutes - if ((currentTime - lastModified) > 5 * 60 * 1000) { - shouldDelete = true; - } - } - } catch (IOException e) { - log.debug("Could not check file size, skipping: {}", path); - } - - // Check file age against maxAgeMillis - if (shouldDelete) { - try { - long lastModified = Files.getLastModifiedTime(path).toMillis(); - long currentTime = System.currentTimeMillis(); - shouldDelete = (currentTime - lastModified) > maxAgeMillis; - } catch (IOException e) { - log.debug("Could not check file age, skipping: {}", path); - shouldDelete = false; - } - } - - if (shouldDelete) { - try { - Files.deleteIfExists(path); - deletedCount++; - log.debug( - "Deleted unregistered temp file during scheduled cleanup: {}", - path); - } catch (IOException e) { - // Handle locked files more gracefully - just log at debug level - if (e.getMessage() != null - && e.getMessage().contains("being used by another process")) { - log.debug("File locked, skipping delete: {}", path); - } else { - log.warn( - "Failed to delete temp file during scheduled cleanup: {}", - path, - e); - } - } - } - } - } - - return deletedCount; - } - /** * Perform startup cleanup of stale temporary files from previous runs. This is especially * important in Docker environments where temp files persist between container restarts. */ private void runStartupCleanup() { log.info("Running startup temporary file cleanup"); + boolean containerMode = isContainerMode(); + + log.info( + "Running in {} mode, using {} cleanup strategy", + machineType, + containerMode ? "aggressive" : "conservative"); + // For startup cleanup, we use a longer timeout for non-container environments + long maxAgeMillis = containerMode ? 0 : 24 * 60 * 60 * 1000; // 0 or 24 hours + + int totalDeletedCount = cleanupUnregisteredFiles(containerMode, false, maxAgeMillis); + + log.info( + "Startup cleanup complete. Deleted {} temporary files/directories", + totalDeletedCount); + } + + /** + * Clean up unregistered temporary files across all configured temp directories. + * + * @param containerMode Whether we're in container mode (more aggressive cleanup) + * @param isScheduled Whether this is a scheduled cleanup or startup cleanup + * @param maxAgeMillis Maximum age of files to clean in milliseconds + * @return Number of files deleted + */ + private int cleanupUnregisteredFiles(boolean containerMode, boolean isScheduled, long maxAgeMillis) { + AtomicInteger totalDeletedCount = new AtomicInteger(0); + try { // Get all directories we need to clean - Path systemTempPath; - if (systemTempDir != null && !systemTempDir.isEmpty()) { - systemTempPath = Path.of(systemTempDir); - } else { - systemTempPath = Path.of(System.getProperty("java.io.tmpdir")); - } - + Path systemTempPath = getSystemTempPath(); Path[] dirsToScan = { - systemTempPath, Path.of(customTempDirectory), Path.of(libreOfficeTempDir) + systemTempPath, + Path.of(customTempDirectory), + Path.of(libreOfficeTempDir) }; - int totalDeletedCount = 0; - - boolean containerMode = - "Docker".equals(machineType) || "Kubernetes".equals(machineType); - log.info( - "Running in {} mode, using {} cleanup strategy", - machineType, - containerMode ? "aggressive" : "conservative"); - // Process each directory - for (Path tempDir : dirsToScan) { - if (!Files.exists(tempDir)) { - log.warn("Temporary directory does not exist: {}", tempDir); - continue; + Arrays.stream(dirsToScan) + .filter(Files::exists) + .forEach(tempDir -> { + try { + String phase = isScheduled ? "scheduled" : "startup"; + log.info("Scanning directory for {} cleanup: {}", phase, tempDir); + + AtomicInteger dirDeletedCount = new AtomicInteger(0); + cleanupDirectoryStreaming( + tempDir, + containerMode, + 0, + maxAgeMillis, + isScheduled, + path -> { + dirDeletedCount.incrementAndGet(); + if (log.isDebugEnabled()) { + log.debug("Deleted temp file during {} cleanup: {}", phase, path); + } + } + ); + + int count = dirDeletedCount.get(); + totalDeletedCount.addAndGet(count); + if (count > 0) { + log.info("Cleaned up {} files/directories in {}", count, tempDir); + } + } catch (IOException e) { + log.error("Error during cleanup of directory: {}", tempDir, e); + } + }); + } catch (Exception e) { + log.error("Error during cleanup of unregistered files", e); + } + + return totalDeletedCount.get(); + } + + /** + * Get the system temp directory path based on configuration or system property. + */ + private Path getSystemTempPath() { + if (systemTempDir != null && !systemTempDir.isEmpty()) { + return Path.of(systemTempDir); + } else { + return Path.of(System.getProperty("java.io.tmpdir")); + } + } + + /** + * Determine if we're running in a container environment. + */ + private boolean isContainerMode() { + return "Docker".equals(machineType) || "Kubernetes".equals(machineType); + } + + /** + * Recursively clean up a directory using a streaming approach to reduce memory usage. + * + * @param directory The directory to clean + * @param containerMode Whether we're in container mode (more aggressive cleanup) + * @param depth Current recursion depth + * @param maxAgeMillis Maximum age of files to delete + * @param isScheduled Whether this is a scheduled cleanup (vs startup) + * @param onDeleteCallback Callback function when a file is deleted + * @throws IOException If an I/O error occurs + */ + private void cleanupDirectoryStreaming( + Path directory, + boolean containerMode, + int depth, + long maxAgeMillis, + boolean isScheduled, + Consumer onDeleteCallback) throws IOException { + + // Check recursion depth limit + if (depth > MAX_RECURSION_DEPTH) { + log.warn("Maximum directory recursion depth reached for: {}", directory); + return; + } + + // Use try-with-resources to ensure the stream is closed + try (Stream pathStream = Files.list(directory)) { + // Process files in a streaming fashion instead of materializing the whole list + pathStream.forEach(path -> { + try { + String fileName = path.getFileName().toString(); + + // Skip if file should be excluded + if (SHOULD_SKIP.test(fileName)) { + return; + } + + // Handle directories recursively + if (Files.isDirectory(path)) { + try { + cleanupDirectoryStreaming( + path, containerMode, depth + 1, maxAgeMillis, isScheduled, onDeleteCallback); + } catch (IOException e) { + log.warn("Error processing subdirectory: {}", path, e); + } + return; + } + + // Skip registered files - these are handled by TempFileManager + if (isScheduled && registry.contains(path.toFile())) { + return; + } + + // Check if this file should be deleted + if (shouldDeleteFile(path, fileName, containerMode, maxAgeMillis)) { + try { + Files.deleteIfExists(path); + onDeleteCallback.accept(path); + } catch (IOException e) { + // Handle locked files more gracefully + if (e.getMessage() != null && e.getMessage().contains("being used by another process")) { + log.debug("File locked, skipping delete: {}", path); + } else { + log.warn("Failed to delete temp file: {}", path, e); + } + } + } + } catch (Exception e) { + log.warn("Error processing path: {}", path, e); } - - log.info("Scanning directory for cleanup: {}", tempDir); - int dirDeletedCount = cleanupDirectory(tempDir, containerMode, 0); - totalDeletedCount += dirDeletedCount; - log.info("Cleaned up {} files/directories in {}", dirDeletedCount, tempDir); - } - - log.info( - "Startup cleanup complete. Deleted {} temporary files/directories", - totalDeletedCount); - } catch (IOException e) { - log.error("Error during startup cleanup", e); + }); } } /** - * Recursively clean up a directory for temporary files. - * - * @param directory The directory to clean - * @param containerMode Whether we're in container mode (more aggressive cleanup) - * @param depth Current recursion depth (to prevent excessive recursion) - * @return Number of files deleted + * Determine if a file should be deleted based on its name, age, and other criteria. */ - private int cleanupDirectory(Path directory, boolean containerMode, int depth) - throws IOException { - if (depth > 5) { - log.warn("Maximum directory recursion depth reached for: {}", directory); - return 0; + private boolean shouldDeleteFile(Path path, String fileName, boolean containerMode, long maxAgeMillis) { + // First check if it matches our known temp file patterns + boolean isOurTempFile = IS_OUR_TEMP_FILE.test(fileName); + boolean isSystemTempFile = IS_SYSTEM_TEMP_FILE.test(fileName); + boolean shouldDelete = isOurTempFile || (containerMode && isSystemTempFile); + + // Special case for zero-byte files - these are often corrupted temp files + try { + if (Files.size(path) == 0) { + // For empty files, use a shorter timeout (5 minutes) + long lastModified = Files.getLastModifiedTime(path).toMillis(); + long currentTime = System.currentTimeMillis(); + // Delete empty files older than 5 minutes + if ((currentTime - lastModified) > 5 * 60 * 1000) { + shouldDelete = true; + } + } + } catch (IOException e) { + log.debug("Could not check file size, skipping: {}", path); } - int deletedCount = 0; - - try (Stream paths = Files.list(directory)) { - for (Path path : paths.toList()) { - String fileName = path.getFileName().toString(); - - // Skip Jetty-related directories and files - if (fileName.contains("jetty") || fileName.startsWith("jetty-")) { - continue; - } - - // Check if this is a directory we should recursively scan - if (Files.isDirectory(path)) { - // Don't recurse into certain system directories - if (!fileName.equals("proc") - && !fileName.equals("sys") - && !fileName.equals("dev")) { - deletedCount += cleanupDirectory(path, containerMode, depth + 1); - } - continue; - } - - // Determine if this file matches our temp file patterns - boolean isOurTempFile = - fileName.startsWith("stirling-pdf-") - || fileName.startsWith("output_") - || fileName.startsWith("compressedPDF") - || fileName.startsWith("pdf-save-") - || fileName.startsWith("pdf-stream-") - || fileName.startsWith("PDFBox") - || fileName.startsWith("input_") - || fileName.startsWith("overlay-"); - - // Avoid touching Jetty files - boolean isSystemTempFile = - fileName.matches("lu\\d+[a-z0-9]*\\.tmp") - || fileName.matches("ocr_process\\d+") - || (fileName.startsWith("tmp") && !fileName.contains("jetty")) - || fileName.startsWith("OSL_PIPE_") - || (fileName.endsWith(".tmp") && !fileName.contains("jetty")); - - boolean shouldDelete = isOurTempFile || (containerMode && isSystemTempFile); - - // Special case for zero-byte files - these are often corrupted temp files - boolean isEmptyFile = false; - try { - if (!Files.isDirectory(path) && Files.size(path) == 0) { - isEmptyFile = true; - // For empty files, use a shorter timeout (5 minutes) - long lastModified = Files.getLastModifiedTime(path).toMillis(); - long currentTime = System.currentTimeMillis(); - // Delete empty files older than 5 minutes - if ((currentTime - lastModified) > 5 * 60 * 1000) { - shouldDelete = true; - } - } - } catch (IOException e) { - log.debug("Could not check file size, skipping: {}", path); - } - - // For non-container mode, check file age before deleting - if (!containerMode && (isOurTempFile || isSystemTempFile) && !isEmptyFile) { - try { - long lastModified = Files.getLastModifiedTime(path).toMillis(); - long currentTime = System.currentTimeMillis(); - // Only delete files older than 24 hours in non-container mode - shouldDelete = (currentTime - lastModified) > 24 * 60 * 60 * 1000; - } catch (IOException e) { - log.debug("Could not check file age, skipping: {}", path); - shouldDelete = false; - } - } - - if (shouldDelete) { - try { - if (Files.isDirectory(path)) { - GeneralUtils.deleteDirectory(path); - } else { - Files.deleteIfExists(path); - } - deletedCount++; - log.debug("Deleted temp file during startup cleanup: {}", path); - } catch (IOException e) { - log.warn("Failed to delete temp file during startup cleanup: {}", path, e); - } - } + // Check file age against maxAgeMillis + if (shouldDelete && maxAgeMillis > 0) { + try { + long lastModified = Files.getLastModifiedTime(path).toMillis(); + long currentTime = System.currentTimeMillis(); + shouldDelete = (currentTime - lastModified) > maxAgeMillis; + } catch (IOException e) { + log.debug("Could not check file age, skipping: {}", path); + shouldDelete = false; } } - return deletedCount; + return shouldDelete; } /** Clean up LibreOffice temporary files. This method is called after LibreOffice operations. */ @@ -431,18 +378,17 @@ public class TempFileCleanupService { try { Set directories = registry.getTempDirectories(); for (Path dir : directories) { - if (dir.getFileName().toString().contains("libreoffice")) { + if (dir.getFileName().toString().contains("libreoffice") && Files.exists(dir)) { // For directories containing "libreoffice", delete all contents // but keep the directory itself for future use - try (Stream files = Files.list(dir)) { - for (Path file : files.toList()) { - if (Files.isDirectory(file)) { - GeneralUtils.deleteDirectory(file); - } else { - Files.deleteIfExists(file); - } - } - } + cleanupDirectoryStreaming( + dir, + isContainerMode(), + 0, + 0, // age doesn't matter for LibreOffice cleanup + false, + path -> log.debug("Cleaned up LibreOffice temp file: {}", path) + ); log.debug("Cleaned up LibreOffice temp directory contents: {}", dir); } } @@ -450,4 +396,4 @@ public class TempFileCleanupService { log.warn("Failed to clean up LibreOffice temp files", e); } } -} +} \ No newline at end of file diff --git a/common/src/test/java/stirling/software/common/service/TempFileCleanupServiceTest.java b/common/src/test/java/stirling/software/common/service/TempFileCleanupServiceTest.java index ebb323d93..db0e8aaae 100644 --- a/common/src/test/java/stirling/software/common/service/TempFileCleanupServiceTest.java +++ b/common/src/test/java/stirling/software/common/service/TempFileCleanupServiceTest.java @@ -14,6 +14,8 @@ import java.util.Arrays; import java.util.HashSet; import java.util.Set; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.Consumer; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -118,12 +120,14 @@ public class TempFileCleanupServiceTest { // Create a file older than threshold Path oldFile = Files.createFile(systemTempDir.resolve("output_old.pdf")); - Files.setLastModifiedTime(oldFile, FileTime.from( Files.getLastModifiedTime(oldFile).toMillis() - 5000000, TimeUnit.MILLISECONDS)); + Files.setLastModifiedTime(oldFile, FileTime.from( + Files.getLastModifiedTime(oldFile).toMillis() - 5000000, + TimeUnit.MILLISECONDS)); // Act - invokeCleanupDirectory(systemTempDir, true, 0, 3600000); - invokeCleanupDirectory(customTempDir, true, 0, 3600000); - invokeCleanupDirectory(libreOfficeTempDir, true, 0, 3600000); + invokeCleanupDirectoryStreaming(systemTempDir, true, 0, 3600000); + invokeCleanupDirectoryStreaming(customTempDir, true, 0, 3600000); + invokeCleanupDirectoryStreaming(libreOfficeTempDir, true, 0, 3600000); // Assert - Our temp files and system temp files should be deleted (if old enough) assertFalse(Files.exists(oldFile), "Old temp file should be deleted"); @@ -141,14 +145,15 @@ public class TempFileCleanupServiceTest { // Arrange - Create an empty file Path emptyFile = Files.createFile(systemTempDir.resolve("empty.tmp")); // Make it "old enough" to be deleted (>5 minutes) - Files.setLastModifiedTime(emptyFile, FileTime.from( Files.getLastModifiedTime(emptyFile).toMillis() - 6 * 60 * 1000, TimeUnit.MILLISECONDS)); + Files.setLastModifiedTime(emptyFile, FileTime.from( + Files.getLastModifiedTime(emptyFile).toMillis() - 6 * 60 * 1000, + TimeUnit.MILLISECONDS)); - // Configure mock registry to say this file isn't registered when(registry.contains(any(File.class))).thenReturn(false); // Act - invokeCleanupDirectory(systemTempDir, true, 0, 3600000); + invokeCleanupDirectoryStreaming(systemTempDir, true, 0, 3600000); // Assert assertFalse(Files.exists(emptyFile), "Empty file older than 5 minutes should be deleted"); @@ -166,13 +171,15 @@ public class TempFileCleanupServiceTest { Path tempFile3 = Files.createFile(dir3.resolve("output_3.pdf")); // Make the deepest file old enough to be deleted - Files.setLastModifiedTime(tempFile3, FileTime.from( Files.getLastModifiedTime(tempFile3).toMillis() - 5000000, TimeUnit.MILLISECONDS)); + Files.setLastModifiedTime(tempFile3, FileTime.from( + Files.getLastModifiedTime(tempFile3).toMillis() - 5000000, + TimeUnit.MILLISECONDS)); // Configure mock registry to say these files aren't registered when(registry.contains(any(File.class))).thenReturn(false); // Act - invokeCleanupDirectory(systemTempDir, true, 0, 3600000); + invokeCleanupDirectoryStreaming(systemTempDir, true, 0, 3600000); // Assert assertTrue(Files.exists(tempFile1), "Recent temp file should be preserved"); @@ -181,17 +188,25 @@ public class TempFileCleanupServiceTest { } /** - * Helper method to invoke the private cleanupDirectory method using reflection + * Helper method to invoke the private cleanupDirectoryStreaming method using reflection */ - private int invokeCleanupDirectory(Path directory, boolean containerMode, int depth, long maxAgeMillis) + private void invokeCleanupDirectoryStreaming(Path directory, boolean containerMode, int depth, long maxAgeMillis) throws IOException { try { + // Create a consumer that tracks deleted files + AtomicInteger deleteCount = new AtomicInteger(0); + Consumer deleteCallback = path -> deleteCount.incrementAndGet(); + + // Get the new method with updated signature var method = TempFileCleanupService.class.getDeclaredMethod( - "cleanupDirectory", Path.class, boolean.class, int.class, long.class); + "cleanupDirectoryStreaming", + Path.class, boolean.class, int.class, long.class, boolean.class, Consumer.class); method.setAccessible(true); - return (int) method.invoke(cleanupService, directory, containerMode, depth, maxAgeMillis); + + // Invoke the method with appropriate parameters + method.invoke(cleanupService, directory, containerMode, depth, maxAgeMillis, false, deleteCallback); } catch (Exception e) { - throw new RuntimeException("Error invoking cleanupDirectory", e); + throw new RuntimeException("Error invoking cleanupDirectoryStreaming", e); } } } \ No newline at end of file diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/OCRController.java b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/OCRController.java index be6c4649c..00b08d2bb 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/OCRController.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/misc/OCRController.java @@ -2,7 +2,6 @@ package stirling.software.SPDF.controller.api.misc; import java.awt.image.BufferedImage; import java.io.*; -import java.nio.file.Files; import java.nio.file.Path; import java.util.*; import java.util.zip.ZipEntry; @@ -23,7 +22,6 @@ import org.springframework.web.bind.annotation.RequestMapping; import org.springframework.web.bind.annotation.RestController; import org.springframework.web.multipart.MultipartFile; -import io.github.pixee.security.BoundedLineReader; import io.github.pixee.security.Filenames; import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.tags.Tag; @@ -34,6 +32,11 @@ import lombok.extern.slf4j.Slf4j; import stirling.software.SPDF.model.api.misc.ProcessPdfWithOcrRequest; import stirling.software.common.model.ApplicationProperties; import stirling.software.common.service.CustomPDFDocumentFactory; +import stirling.software.common.util.ProcessExecutor; +import stirling.software.common.util.ProcessExecutor.ProcessExecutorResult; +import stirling.software.common.util.TempFileManager; +import stirling.software.common.util.TempFileUtil; +import stirling.software.common.util.TempFileUtil.TempFile; @RestController @RequestMapping("/api/v1/misc") @@ -43,8 +46,8 @@ import stirling.software.common.service.CustomPDFDocumentFactory; public class OCRController { private final ApplicationProperties applicationProperties; - private final CustomPDFDocumentFactory pdfDocumentFactory; + private final TempFileManager tempFileManager; /** Gets the list of available Tesseract languages from the tessdata directory */ public List getAvailableTesseractLanguages() { @@ -73,93 +76,108 @@ public class OCRController { MultipartFile inputFile = request.getFileInput(); List languages = request.getLanguages(); String ocrType = request.getOcrType(); - Path tempDir = Files.createTempDirectory("ocr_process"); - Path tempInputFile = tempDir.resolve("input.pdf"); - Path tempOutputDir = tempDir.resolve("output"); - Path tempImagesDir = tempDir.resolve("images"); - Path finalOutputFile = tempDir.resolve("final_output.pdf"); - Files.createDirectories(tempOutputDir); - Files.createDirectories(tempImagesDir); - Process process = null; + + // Create a temp directory using TempFileManager directly + Path tempDirPath = tempFileManager.createTempDirectory(); + File tempDir = tempDirPath.toFile(); + try { + File tempInputFile = new File(tempDir, "input.pdf"); + File tempOutputDir = new File(tempDir, "output"); + File tempImagesDir = new File(tempDir, "images"); + File finalOutputFile = new File(tempDir, "final_output.pdf"); + + // Create directories + tempOutputDir.mkdirs(); + tempImagesDir.mkdirs(); + // Save input file - inputFile.transferTo(tempInputFile.toFile()); + inputFile.transferTo(tempInputFile); + PDFMergerUtility merger = new PDFMergerUtility(); merger.setDestinationFileName(finalOutputFile.toString()); - try (PDDocument document = pdfDocumentFactory.load(tempInputFile.toFile())) { + + try (PDDocument document = pdfDocumentFactory.load(tempInputFile)) { PDFRenderer pdfRenderer = new PDFRenderer(document); int pageCount = document.getNumberOfPages(); + for (int pageNum = 0; pageNum < pageCount; pageNum++) { PDPage page = document.getPage(pageNum); boolean hasText = false; + // Check for existing text try (PDDocument tempDoc = new PDDocument()) { tempDoc.addPage(page); PDFTextStripper stripper = new PDFTextStripper(); hasText = !stripper.getText(tempDoc).trim().isEmpty(); } - boolean shouldOcr = - switch (ocrType) { - case "skip-text" -> !hasText; - case "force-ocr" -> true; - default -> true; - }; - Path pageOutputPath = - tempOutputDir.resolve(String.format("page_%d.pdf", pageNum)); + + boolean shouldOcr = switch (ocrType) { + case "skip-text" -> !hasText; + case "force-ocr" -> true; + default -> true; + }; + + File pageOutputPath = new File(tempOutputDir, String.format("page_%d.pdf", pageNum)); + if (shouldOcr) { // Convert page to image BufferedImage image = pdfRenderer.renderImageWithDPI(pageNum, 300); - Path imagePath = - tempImagesDir.resolve(String.format("page_%d.png", pageNum)); - ImageIO.write(image, "png", imagePath.toFile()); + File imagePath = new File(tempImagesDir, String.format("page_%d.png", pageNum)); + ImageIO.write(image, "png", imagePath); + // Build OCR command List command = new ArrayList<>(); command.add("tesseract"); command.add(imagePath.toString()); command.add( - tempOutputDir - .resolve(String.format("page_%d", pageNum)) + new File(tempOutputDir, String.format("page_%d", pageNum)) .toString()); command.add("-l"); command.add(String.join("+", languages)); // Always output PDF command.add("pdf"); - ProcessBuilder pb = new ProcessBuilder(command); - process = pb.start(); - // Capture any error output - try (BufferedReader reader = - new BufferedReader( - new InputStreamReader(process.getErrorStream()))) { - String line; - while ((line = BoundedLineReader.readLine(reader, 5_000_000)) != null) { - log.debug("Tesseract: {}", line); + + // Use ProcessExecutor to run tesseract command + try { + ProcessExecutorResult result = ProcessExecutor.getInstance(ProcessExecutor.Processes.TESSERACT) + .runCommandWithOutputHandling(command); + + log.debug("Tesseract OCR completed for page {} with exit code {}", + pageNum, result.getRc()); + + // Add OCR'd PDF to merger + merger.addSource(pageOutputPath); + } catch (IOException | InterruptedException e) { + log.error("Error processing page {} with tesseract: {}", pageNum, e.getMessage()); + // If OCR fails, fall back to the original page + try (PDDocument pageDoc = new PDDocument()) { + pageDoc.addPage(page); + pageDoc.save(pageOutputPath); + merger.addSource(pageOutputPath); } } - int exitCode = process.waitFor(); - if (exitCode != 0) { - throw new RuntimeException( - "Tesseract failed with exit code: " + exitCode); - } - // Add OCR'd PDF to merger - merger.addSource(pageOutputPath.toFile()); } else { // Save original page without OCR try (PDDocument pageDoc = new PDDocument()) { pageDoc.addPage(page); - pageDoc.save(pageOutputPath.toFile()); - merger.addSource(pageOutputPath.toFile()); + pageDoc.save(pageOutputPath); + merger.addSource(pageOutputPath); } } } } + // Merge all pages into final PDF merger.mergeDocuments(null); + // Read the final PDF file - byte[] pdfContent = Files.readAllBytes(finalOutputFile); + byte[] pdfContent = java.nio.file.Files.readAllBytes(finalOutputFile.toPath()); String outputFilename = Filenames.toSimpleFileName(inputFile.getOriginalFilename()) .replaceFirst("[.][^.]+$", "") + "_OCR.pdf"; + return ResponseEntity.ok() .header( "Content-Disposition", @@ -167,14 +185,11 @@ public class OCRController { .contentType(MediaType.APPLICATION_PDF) .body(pdfContent); } finally { - if (process != null) { - process.destroy(); - } - // Clean up temporary files - deleteDirectory(tempDir); + // Clean up the temp directory and all its contents + tempFileManager.deleteTempDirectory(tempDirPath); } } - + private void addFileToZip(File file, String filename, ZipOutputStream zipOut) throws IOException { if (!file.exists()) { @@ -192,21 +207,4 @@ public class OCRController { zipOut.closeEntry(); } } - - private void deleteDirectory(Path directory) { - try { - Files.walk(directory) - .sorted(Comparator.reverseOrder()) - .forEach( - path -> { - try { - Files.delete(path); - } catch (IOException e) { - log.error("Error deleting {}: {}", path, e.getMessage()); - } - }); - } catch (IOException e) { - log.error("Error walking directory {}: {}", directory, e.getMessage()); - } - } -} +} \ No newline at end of file diff --git a/stirling-pdf/src/main/resources/application.properties b/stirling-pdf/src/main/resources/application.properties index 00a2e87e1..ea30bf78e 100644 --- a/stirling-pdf/src/main/resources/application.properties +++ b/stirling-pdf/src/main/resources/application.properties @@ -44,4 +44,7 @@ springdoc.swagger-ui.path=/index.html posthog.api.key=phc_fiR65u5j6qmXTYL56MNrLZSWqLaDW74OrZH0Insd2xq posthog.host=https://eu.i.posthog.com -spring.main.allow-bean-definition-overriding=true \ No newline at end of file +spring.main.allow-bean-definition-overriding=true + +# Set up a consistent temporary directory location +java.io.tmpdir=${stirling.tempfiles.directory:${java.io.tmpdir}/stirling-pdf} \ No newline at end of file