mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-09-08 17:51:20 +02:00
improvements to tmp dir, OCR and others
This commit is contained in:
parent
5aec97939e
commit
93ea611eda
@ -25,7 +25,7 @@ import stirling.software.common.util.TempFileRegistry;
|
||||
@Configuration
|
||||
public class TempFileConfiguration {
|
||||
|
||||
@Value("${stirling.tempfiles.directory:}")
|
||||
@Value("${stirling.tempfiles.directory:${java.io.tmpdir}/stirling-pdf}")
|
||||
private String customTempDirectory;
|
||||
|
||||
@Autowired
|
||||
@ -48,42 +48,11 @@ public class TempFileConfiguration {
|
||||
@PostConstruct
|
||||
public void initTempFileConfig() {
|
||||
try {
|
||||
// If a custom temp directory is specified in the config, use it
|
||||
if (customTempDirectory != null && !customTempDirectory.isEmpty()) {
|
||||
Path tempDir = Path.of(customTempDirectory);
|
||||
if (!Files.exists(tempDir)) {
|
||||
Files.createDirectories(tempDir);
|
||||
log.info("Created custom temporary directory: {}", tempDir);
|
||||
}
|
||||
|
||||
// Set Java temp directory system property if in Docker/Kubernetes mode
|
||||
if ("Docker".equals(machineType) || "Kubernetes".equals(machineType)) {
|
||||
System.setProperty("java.io.tmpdir", customTempDirectory);
|
||||
log.info(
|
||||
"Set system temp directory to: {} for environment: {}",
|
||||
customTempDirectory,
|
||||
machineType);
|
||||
}
|
||||
} else {
|
||||
// No custom directory specified, use java.io.tmpdir + application subfolder
|
||||
String defaultTempDir;
|
||||
|
||||
if ("Docker".equals(machineType) || "Kubernetes".equals(machineType)) {
|
||||
// Container environments should continue to use /tmp/stirling-pdf
|
||||
defaultTempDir = "/tmp/stirling-pdf";
|
||||
} else {
|
||||
// Use system temp directory (java.io.tmpdir) with our application subfolder
|
||||
// This automatically handles Windows (AppData\Local\Temp), macOS, and Linux systems
|
||||
defaultTempDir = System.getProperty("java.io.tmpdir") + File.separator + "stirling-pdf";
|
||||
}
|
||||
customTempDirectory = defaultTempDir;
|
||||
|
||||
// Create the default temp directory
|
||||
Path tempDir = Path.of(customTempDirectory);
|
||||
if (!Files.exists(tempDir)) {
|
||||
Files.createDirectories(tempDir);
|
||||
log.info("Created default OS-specific temporary directory: {}", tempDir);
|
||||
}
|
||||
// Create the temp directory if it doesn't exist
|
||||
Path tempDir = Path.of(customTempDirectory);
|
||||
if (!Files.exists(tempDir)) {
|
||||
Files.createDirectories(tempDir);
|
||||
log.info("Created temporary directory: {}", tempDir);
|
||||
}
|
||||
|
||||
log.info("Temporary file configuration initialized");
|
||||
@ -93,4 +62,4 @@ public class TempFileConfiguration {
|
||||
log.error("Failed to initialize temporary file configuration", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -3,8 +3,12 @@ package stirling.software.common.service;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Arrays;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
@ -49,6 +53,36 @@ public class TempFileCleanupService {
|
||||
@Value("${stirling.tempfiles.libreoffice-dir:/tmp/stirling-pdf/libreoffice}")
|
||||
private String libreOfficeTempDir;
|
||||
|
||||
// Maximum recursion depth for directory traversal
|
||||
private static final int MAX_RECURSION_DEPTH = 5;
|
||||
|
||||
// File patterns that identify our temp files
|
||||
private static final Predicate<String> IS_OUR_TEMP_FILE = fileName ->
|
||||
fileName.startsWith("stirling-pdf-") ||
|
||||
fileName.startsWith("output_") ||
|
||||
fileName.startsWith("compressedPDF") ||
|
||||
fileName.startsWith("pdf-save-") ||
|
||||
fileName.startsWith("pdf-stream-") ||
|
||||
fileName.startsWith("PDFBox") ||
|
||||
fileName.startsWith("input_") ||
|
||||
fileName.startsWith("overlay-");
|
||||
|
||||
// File patterns that identify common system temp files
|
||||
private static final Predicate<String> IS_SYSTEM_TEMP_FILE = fileName ->
|
||||
fileName.matches("lu\\d+[a-z0-9]*\\.tmp") ||
|
||||
fileName.matches("ocr_process\\d+") ||
|
||||
(fileName.startsWith("tmp") && !fileName.contains("jetty")) ||
|
||||
fileName.startsWith("OSL_PIPE_") ||
|
||||
(fileName.endsWith(".tmp") && !fileName.contains("jetty"));
|
||||
|
||||
// File patterns that should be excluded from cleanup
|
||||
private static final Predicate<String> SHOULD_SKIP = fileName ->
|
||||
fileName.contains("jetty") ||
|
||||
fileName.startsWith("jetty-") ||
|
||||
fileName.equals("proc") ||
|
||||
fileName.equals("sys") ||
|
||||
fileName.equals("dev");
|
||||
|
||||
@Autowired
|
||||
public TempFileCleanupService(TempFileRegistry registry, TempFileManager tempFileManager) {
|
||||
this.registry = registry;
|
||||
@ -114,41 +148,9 @@ public class TempFileCleanupService {
|
||||
}
|
||||
}
|
||||
|
||||
int unregisteredDeletedCount = 0;
|
||||
try {
|
||||
// Get all directories we need to clean
|
||||
Path systemTempPath;
|
||||
if (systemTempDir != null && !systemTempDir.isEmpty()) {
|
||||
systemTempPath = Path.of(systemTempDir);
|
||||
} else {
|
||||
systemTempPath = Path.of(System.getProperty("java.io.tmpdir"));
|
||||
}
|
||||
|
||||
Path[] dirsToScan = {
|
||||
systemTempPath, Path.of(customTempDirectory), Path.of(libreOfficeTempDir)
|
||||
};
|
||||
|
||||
boolean containerMode =
|
||||
"Docker".equals(machineType) || "Kubernetes".equals(machineType);
|
||||
|
||||
// Process each directory
|
||||
for (Path tempDir : dirsToScan) {
|
||||
if (!Files.exists(tempDir)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
int dirDeletedCount = cleanupDirectory(tempDir, containerMode, 0, maxAgeMillis);
|
||||
unregisteredDeletedCount += dirDeletedCount;
|
||||
if (dirDeletedCount > 0) {
|
||||
log.info(
|
||||
"Cleaned up {} unregistered files/directories in {}",
|
||||
dirDeletedCount,
|
||||
tempDir);
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
log.error("Error during scheduled cleanup of unregistered files", e);
|
||||
}
|
||||
// Clean up unregistered temp files based on our cleanup strategy
|
||||
boolean containerMode = isContainerMode();
|
||||
int unregisteredDeletedCount = cleanupUnregisteredFiles(containerMode, true, maxAgeMillis);
|
||||
|
||||
log.info(
|
||||
"Scheduled cleanup complete. Deleted {} registered files, {} unregistered files, {} directories",
|
||||
@ -157,272 +159,217 @@ public class TempFileCleanupService {
|
||||
directoriesDeletedCount);
|
||||
}
|
||||
|
||||
/** Overload of cleanupDirectory that uses the specified max age for files */
|
||||
private int cleanupDirectory(
|
||||
Path directory, boolean containerMode, int depth, long maxAgeMillis)
|
||||
throws IOException {
|
||||
if (depth > 5) {
|
||||
log.warn("Maximum directory recursion depth reached for: {}", directory);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int deletedCount = 0;
|
||||
|
||||
try (Stream<Path> paths = Files.list(directory)) {
|
||||
for (Path path : paths.toList()) {
|
||||
String fileName = path.getFileName().toString();
|
||||
|
||||
// Skip registered files - these are handled by TempFileManager
|
||||
if (registry.contains(path.toFile())) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip Jetty-related directories and files
|
||||
if (fileName.contains("jetty") || fileName.startsWith("jetty-")) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check if this is a directory we should recursively scan
|
||||
if (Files.isDirectory(path)) {
|
||||
// Don't recurse into certain system directories
|
||||
if (!fileName.equals("proc")
|
||||
&& !fileName.equals("sys")
|
||||
&& !fileName.equals("dev")) {
|
||||
deletedCount +=
|
||||
cleanupDirectory(path, containerMode, depth + 1, maxAgeMillis);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Determine if this file matches our temp file patterns
|
||||
boolean isOurTempFile =
|
||||
fileName.startsWith("stirling-pdf-")
|
||||
|| fileName.startsWith("output_")
|
||||
|| fileName.startsWith("compressedPDF")
|
||||
|| fileName.startsWith("pdf-save-")
|
||||
|| fileName.startsWith("pdf-stream-")
|
||||
|| fileName.startsWith("PDFBox")
|
||||
|| fileName.startsWith("input_")
|
||||
|| fileName.startsWith("overlay-");
|
||||
|
||||
// Avoid touching Jetty files
|
||||
boolean isSystemTempFile =
|
||||
fileName.matches("lu\\d+[a-z0-9]*\\.tmp")
|
||||
|| fileName.matches("ocr_process\\d+")
|
||||
|| (fileName.startsWith("tmp") && !fileName.contains("jetty"))
|
||||
|| fileName.startsWith("OSL_PIPE_")
|
||||
|| (fileName.endsWith(".tmp") && !fileName.contains("jetty"));
|
||||
|
||||
boolean shouldDelete = isOurTempFile || (containerMode && isSystemTempFile);
|
||||
|
||||
// Special case for zero-byte files - these are often corrupted temp files
|
||||
try {
|
||||
if (Files.size(path) == 0) {
|
||||
// For empty files, use a shorter timeout (5 minutes)
|
||||
long lastModified = Files.getLastModifiedTime(path).toMillis();
|
||||
long currentTime = System.currentTimeMillis();
|
||||
// Delete empty files older than 5 minutes
|
||||
if ((currentTime - lastModified) > 5 * 60 * 1000) {
|
||||
shouldDelete = true;
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
log.debug("Could not check file size, skipping: {}", path);
|
||||
}
|
||||
|
||||
// Check file age against maxAgeMillis
|
||||
if (shouldDelete) {
|
||||
try {
|
||||
long lastModified = Files.getLastModifiedTime(path).toMillis();
|
||||
long currentTime = System.currentTimeMillis();
|
||||
shouldDelete = (currentTime - lastModified) > maxAgeMillis;
|
||||
} catch (IOException e) {
|
||||
log.debug("Could not check file age, skipping: {}", path);
|
||||
shouldDelete = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (shouldDelete) {
|
||||
try {
|
||||
Files.deleteIfExists(path);
|
||||
deletedCount++;
|
||||
log.debug(
|
||||
"Deleted unregistered temp file during scheduled cleanup: {}",
|
||||
path);
|
||||
} catch (IOException e) {
|
||||
// Handle locked files more gracefully - just log at debug level
|
||||
if (e.getMessage() != null
|
||||
&& e.getMessage().contains("being used by another process")) {
|
||||
log.debug("File locked, skipping delete: {}", path);
|
||||
} else {
|
||||
log.warn(
|
||||
"Failed to delete temp file during scheduled cleanup: {}",
|
||||
path,
|
||||
e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return deletedCount;
|
||||
}
|
||||
|
||||
/**
|
||||
* Perform startup cleanup of stale temporary files from previous runs. This is especially
|
||||
* important in Docker environments where temp files persist between container restarts.
|
||||
*/
|
||||
private void runStartupCleanup() {
|
||||
log.info("Running startup temporary file cleanup");
|
||||
boolean containerMode = isContainerMode();
|
||||
|
||||
log.info(
|
||||
"Running in {} mode, using {} cleanup strategy",
|
||||
machineType,
|
||||
containerMode ? "aggressive" : "conservative");
|
||||
|
||||
// For startup cleanup, we use a longer timeout for non-container environments
|
||||
long maxAgeMillis = containerMode ? 0 : 24 * 60 * 60 * 1000; // 0 or 24 hours
|
||||
|
||||
int totalDeletedCount = cleanupUnregisteredFiles(containerMode, false, maxAgeMillis);
|
||||
|
||||
log.info(
|
||||
"Startup cleanup complete. Deleted {} temporary files/directories",
|
||||
totalDeletedCount);
|
||||
}
|
||||
|
||||
/**
|
||||
* Clean up unregistered temporary files across all configured temp directories.
|
||||
*
|
||||
* @param containerMode Whether we're in container mode (more aggressive cleanup)
|
||||
* @param isScheduled Whether this is a scheduled cleanup or startup cleanup
|
||||
* @param maxAgeMillis Maximum age of files to clean in milliseconds
|
||||
* @return Number of files deleted
|
||||
*/
|
||||
private int cleanupUnregisteredFiles(boolean containerMode, boolean isScheduled, long maxAgeMillis) {
|
||||
AtomicInteger totalDeletedCount = new AtomicInteger(0);
|
||||
|
||||
try {
|
||||
// Get all directories we need to clean
|
||||
Path systemTempPath;
|
||||
if (systemTempDir != null && !systemTempDir.isEmpty()) {
|
||||
systemTempPath = Path.of(systemTempDir);
|
||||
} else {
|
||||
systemTempPath = Path.of(System.getProperty("java.io.tmpdir"));
|
||||
}
|
||||
|
||||
Path systemTempPath = getSystemTempPath();
|
||||
Path[] dirsToScan = {
|
||||
systemTempPath, Path.of(customTempDirectory), Path.of(libreOfficeTempDir)
|
||||
systemTempPath,
|
||||
Path.of(customTempDirectory),
|
||||
Path.of(libreOfficeTempDir)
|
||||
};
|
||||
|
||||
int totalDeletedCount = 0;
|
||||
|
||||
boolean containerMode =
|
||||
"Docker".equals(machineType) || "Kubernetes".equals(machineType);
|
||||
log.info(
|
||||
"Running in {} mode, using {} cleanup strategy",
|
||||
machineType,
|
||||
containerMode ? "aggressive" : "conservative");
|
||||
|
||||
// Process each directory
|
||||
for (Path tempDir : dirsToScan) {
|
||||
if (!Files.exists(tempDir)) {
|
||||
log.warn("Temporary directory does not exist: {}", tempDir);
|
||||
continue;
|
||||
Arrays.stream(dirsToScan)
|
||||
.filter(Files::exists)
|
||||
.forEach(tempDir -> {
|
||||
try {
|
||||
String phase = isScheduled ? "scheduled" : "startup";
|
||||
log.info("Scanning directory for {} cleanup: {}", phase, tempDir);
|
||||
|
||||
AtomicInteger dirDeletedCount = new AtomicInteger(0);
|
||||
cleanupDirectoryStreaming(
|
||||
tempDir,
|
||||
containerMode,
|
||||
0,
|
||||
maxAgeMillis,
|
||||
isScheduled,
|
||||
path -> {
|
||||
dirDeletedCount.incrementAndGet();
|
||||
if (log.isDebugEnabled()) {
|
||||
log.debug("Deleted temp file during {} cleanup: {}", phase, path);
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
int count = dirDeletedCount.get();
|
||||
totalDeletedCount.addAndGet(count);
|
||||
if (count > 0) {
|
||||
log.info("Cleaned up {} files/directories in {}", count, tempDir);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
log.error("Error during cleanup of directory: {}", tempDir, e);
|
||||
}
|
||||
});
|
||||
} catch (Exception e) {
|
||||
log.error("Error during cleanup of unregistered files", e);
|
||||
}
|
||||
|
||||
return totalDeletedCount.get();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the system temp directory path based on configuration or system property.
|
||||
*/
|
||||
private Path getSystemTempPath() {
|
||||
if (systemTempDir != null && !systemTempDir.isEmpty()) {
|
||||
return Path.of(systemTempDir);
|
||||
} else {
|
||||
return Path.of(System.getProperty("java.io.tmpdir"));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine if we're running in a container environment.
|
||||
*/
|
||||
private boolean isContainerMode() {
|
||||
return "Docker".equals(machineType) || "Kubernetes".equals(machineType);
|
||||
}
|
||||
|
||||
/**
|
||||
* Recursively clean up a directory using a streaming approach to reduce memory usage.
|
||||
*
|
||||
* @param directory The directory to clean
|
||||
* @param containerMode Whether we're in container mode (more aggressive cleanup)
|
||||
* @param depth Current recursion depth
|
||||
* @param maxAgeMillis Maximum age of files to delete
|
||||
* @param isScheduled Whether this is a scheduled cleanup (vs startup)
|
||||
* @param onDeleteCallback Callback function when a file is deleted
|
||||
* @throws IOException If an I/O error occurs
|
||||
*/
|
||||
private void cleanupDirectoryStreaming(
|
||||
Path directory,
|
||||
boolean containerMode,
|
||||
int depth,
|
||||
long maxAgeMillis,
|
||||
boolean isScheduled,
|
||||
Consumer<Path> onDeleteCallback) throws IOException {
|
||||
|
||||
// Check recursion depth limit
|
||||
if (depth > MAX_RECURSION_DEPTH) {
|
||||
log.warn("Maximum directory recursion depth reached for: {}", directory);
|
||||
return;
|
||||
}
|
||||
|
||||
// Use try-with-resources to ensure the stream is closed
|
||||
try (Stream<Path> pathStream = Files.list(directory)) {
|
||||
// Process files in a streaming fashion instead of materializing the whole list
|
||||
pathStream.forEach(path -> {
|
||||
try {
|
||||
String fileName = path.getFileName().toString();
|
||||
|
||||
// Skip if file should be excluded
|
||||
if (SHOULD_SKIP.test(fileName)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Handle directories recursively
|
||||
if (Files.isDirectory(path)) {
|
||||
try {
|
||||
cleanupDirectoryStreaming(
|
||||
path, containerMode, depth + 1, maxAgeMillis, isScheduled, onDeleteCallback);
|
||||
} catch (IOException e) {
|
||||
log.warn("Error processing subdirectory: {}", path, e);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// Skip registered files - these are handled by TempFileManager
|
||||
if (isScheduled && registry.contains(path.toFile())) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Check if this file should be deleted
|
||||
if (shouldDeleteFile(path, fileName, containerMode, maxAgeMillis)) {
|
||||
try {
|
||||
Files.deleteIfExists(path);
|
||||
onDeleteCallback.accept(path);
|
||||
} catch (IOException e) {
|
||||
// Handle locked files more gracefully
|
||||
if (e.getMessage() != null && e.getMessage().contains("being used by another process")) {
|
||||
log.debug("File locked, skipping delete: {}", path);
|
||||
} else {
|
||||
log.warn("Failed to delete temp file: {}", path, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.warn("Error processing path: {}", path, e);
|
||||
}
|
||||
|
||||
log.info("Scanning directory for cleanup: {}", tempDir);
|
||||
int dirDeletedCount = cleanupDirectory(tempDir, containerMode, 0);
|
||||
totalDeletedCount += dirDeletedCount;
|
||||
log.info("Cleaned up {} files/directories in {}", dirDeletedCount, tempDir);
|
||||
}
|
||||
|
||||
log.info(
|
||||
"Startup cleanup complete. Deleted {} temporary files/directories",
|
||||
totalDeletedCount);
|
||||
} catch (IOException e) {
|
||||
log.error("Error during startup cleanup", e);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Recursively clean up a directory for temporary files.
|
||||
*
|
||||
* @param directory The directory to clean
|
||||
* @param containerMode Whether we're in container mode (more aggressive cleanup)
|
||||
* @param depth Current recursion depth (to prevent excessive recursion)
|
||||
* @return Number of files deleted
|
||||
* Determine if a file should be deleted based on its name, age, and other criteria.
|
||||
*/
|
||||
private int cleanupDirectory(Path directory, boolean containerMode, int depth)
|
||||
throws IOException {
|
||||
if (depth > 5) {
|
||||
log.warn("Maximum directory recursion depth reached for: {}", directory);
|
||||
return 0;
|
||||
private boolean shouldDeleteFile(Path path, String fileName, boolean containerMode, long maxAgeMillis) {
|
||||
// First check if it matches our known temp file patterns
|
||||
boolean isOurTempFile = IS_OUR_TEMP_FILE.test(fileName);
|
||||
boolean isSystemTempFile = IS_SYSTEM_TEMP_FILE.test(fileName);
|
||||
boolean shouldDelete = isOurTempFile || (containerMode && isSystemTempFile);
|
||||
|
||||
// Special case for zero-byte files - these are often corrupted temp files
|
||||
try {
|
||||
if (Files.size(path) == 0) {
|
||||
// For empty files, use a shorter timeout (5 minutes)
|
||||
long lastModified = Files.getLastModifiedTime(path).toMillis();
|
||||
long currentTime = System.currentTimeMillis();
|
||||
// Delete empty files older than 5 minutes
|
||||
if ((currentTime - lastModified) > 5 * 60 * 1000) {
|
||||
shouldDelete = true;
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
log.debug("Could not check file size, skipping: {}", path);
|
||||
}
|
||||
|
||||
int deletedCount = 0;
|
||||
|
||||
try (Stream<Path> paths = Files.list(directory)) {
|
||||
for (Path path : paths.toList()) {
|
||||
String fileName = path.getFileName().toString();
|
||||
|
||||
// Skip Jetty-related directories and files
|
||||
if (fileName.contains("jetty") || fileName.startsWith("jetty-")) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check if this is a directory we should recursively scan
|
||||
if (Files.isDirectory(path)) {
|
||||
// Don't recurse into certain system directories
|
||||
if (!fileName.equals("proc")
|
||||
&& !fileName.equals("sys")
|
||||
&& !fileName.equals("dev")) {
|
||||
deletedCount += cleanupDirectory(path, containerMode, depth + 1);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Determine if this file matches our temp file patterns
|
||||
boolean isOurTempFile =
|
||||
fileName.startsWith("stirling-pdf-")
|
||||
|| fileName.startsWith("output_")
|
||||
|| fileName.startsWith("compressedPDF")
|
||||
|| fileName.startsWith("pdf-save-")
|
||||
|| fileName.startsWith("pdf-stream-")
|
||||
|| fileName.startsWith("PDFBox")
|
||||
|| fileName.startsWith("input_")
|
||||
|| fileName.startsWith("overlay-");
|
||||
|
||||
// Avoid touching Jetty files
|
||||
boolean isSystemTempFile =
|
||||
fileName.matches("lu\\d+[a-z0-9]*\\.tmp")
|
||||
|| fileName.matches("ocr_process\\d+")
|
||||
|| (fileName.startsWith("tmp") && !fileName.contains("jetty"))
|
||||
|| fileName.startsWith("OSL_PIPE_")
|
||||
|| (fileName.endsWith(".tmp") && !fileName.contains("jetty"));
|
||||
|
||||
boolean shouldDelete = isOurTempFile || (containerMode && isSystemTempFile);
|
||||
|
||||
// Special case for zero-byte files - these are often corrupted temp files
|
||||
boolean isEmptyFile = false;
|
||||
try {
|
||||
if (!Files.isDirectory(path) && Files.size(path) == 0) {
|
||||
isEmptyFile = true;
|
||||
// For empty files, use a shorter timeout (5 minutes)
|
||||
long lastModified = Files.getLastModifiedTime(path).toMillis();
|
||||
long currentTime = System.currentTimeMillis();
|
||||
// Delete empty files older than 5 minutes
|
||||
if ((currentTime - lastModified) > 5 * 60 * 1000) {
|
||||
shouldDelete = true;
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
log.debug("Could not check file size, skipping: {}", path);
|
||||
}
|
||||
|
||||
// For non-container mode, check file age before deleting
|
||||
if (!containerMode && (isOurTempFile || isSystemTempFile) && !isEmptyFile) {
|
||||
try {
|
||||
long lastModified = Files.getLastModifiedTime(path).toMillis();
|
||||
long currentTime = System.currentTimeMillis();
|
||||
// Only delete files older than 24 hours in non-container mode
|
||||
shouldDelete = (currentTime - lastModified) > 24 * 60 * 60 * 1000;
|
||||
} catch (IOException e) {
|
||||
log.debug("Could not check file age, skipping: {}", path);
|
||||
shouldDelete = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (shouldDelete) {
|
||||
try {
|
||||
if (Files.isDirectory(path)) {
|
||||
GeneralUtils.deleteDirectory(path);
|
||||
} else {
|
||||
Files.deleteIfExists(path);
|
||||
}
|
||||
deletedCount++;
|
||||
log.debug("Deleted temp file during startup cleanup: {}", path);
|
||||
} catch (IOException e) {
|
||||
log.warn("Failed to delete temp file during startup cleanup: {}", path, e);
|
||||
}
|
||||
}
|
||||
// Check file age against maxAgeMillis
|
||||
if (shouldDelete && maxAgeMillis > 0) {
|
||||
try {
|
||||
long lastModified = Files.getLastModifiedTime(path).toMillis();
|
||||
long currentTime = System.currentTimeMillis();
|
||||
shouldDelete = (currentTime - lastModified) > maxAgeMillis;
|
||||
} catch (IOException e) {
|
||||
log.debug("Could not check file age, skipping: {}", path);
|
||||
shouldDelete = false;
|
||||
}
|
||||
}
|
||||
|
||||
return deletedCount;
|
||||
return shouldDelete;
|
||||
}
|
||||
|
||||
/** Clean up LibreOffice temporary files. This method is called after LibreOffice operations. */
|
||||
@ -431,18 +378,17 @@ public class TempFileCleanupService {
|
||||
try {
|
||||
Set<Path> directories = registry.getTempDirectories();
|
||||
for (Path dir : directories) {
|
||||
if (dir.getFileName().toString().contains("libreoffice")) {
|
||||
if (dir.getFileName().toString().contains("libreoffice") && Files.exists(dir)) {
|
||||
// For directories containing "libreoffice", delete all contents
|
||||
// but keep the directory itself for future use
|
||||
try (Stream<Path> files = Files.list(dir)) {
|
||||
for (Path file : files.toList()) {
|
||||
if (Files.isDirectory(file)) {
|
||||
GeneralUtils.deleteDirectory(file);
|
||||
} else {
|
||||
Files.deleteIfExists(file);
|
||||
}
|
||||
}
|
||||
}
|
||||
cleanupDirectoryStreaming(
|
||||
dir,
|
||||
isContainerMode(),
|
||||
0,
|
||||
0, // age doesn't matter for LibreOffice cleanup
|
||||
false,
|
||||
path -> log.debug("Cleaned up LibreOffice temp file: {}", path)
|
||||
);
|
||||
log.debug("Cleaned up LibreOffice temp directory contents: {}", dir);
|
||||
}
|
||||
}
|
||||
@ -450,4 +396,4 @@ public class TempFileCleanupService {
|
||||
log.warn("Failed to clean up LibreOffice temp files", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -14,6 +14,8 @@ import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
@ -118,12 +120,14 @@ public class TempFileCleanupServiceTest {
|
||||
|
||||
// Create a file older than threshold
|
||||
Path oldFile = Files.createFile(systemTempDir.resolve("output_old.pdf"));
|
||||
Files.setLastModifiedTime(oldFile, FileTime.from( Files.getLastModifiedTime(oldFile).toMillis() - 5000000, TimeUnit.MILLISECONDS));
|
||||
Files.setLastModifiedTime(oldFile, FileTime.from(
|
||||
Files.getLastModifiedTime(oldFile).toMillis() - 5000000,
|
||||
TimeUnit.MILLISECONDS));
|
||||
|
||||
// Act
|
||||
invokeCleanupDirectory(systemTempDir, true, 0, 3600000);
|
||||
invokeCleanupDirectory(customTempDir, true, 0, 3600000);
|
||||
invokeCleanupDirectory(libreOfficeTempDir, true, 0, 3600000);
|
||||
invokeCleanupDirectoryStreaming(systemTempDir, true, 0, 3600000);
|
||||
invokeCleanupDirectoryStreaming(customTempDir, true, 0, 3600000);
|
||||
invokeCleanupDirectoryStreaming(libreOfficeTempDir, true, 0, 3600000);
|
||||
|
||||
// Assert - Our temp files and system temp files should be deleted (if old enough)
|
||||
assertFalse(Files.exists(oldFile), "Old temp file should be deleted");
|
||||
@ -141,14 +145,15 @@ public class TempFileCleanupServiceTest {
|
||||
// Arrange - Create an empty file
|
||||
Path emptyFile = Files.createFile(systemTempDir.resolve("empty.tmp"));
|
||||
// Make it "old enough" to be deleted (>5 minutes)
|
||||
Files.setLastModifiedTime(emptyFile, FileTime.from( Files.getLastModifiedTime(emptyFile).toMillis() - 6 * 60 * 1000, TimeUnit.MILLISECONDS));
|
||||
Files.setLastModifiedTime(emptyFile, FileTime.from(
|
||||
Files.getLastModifiedTime(emptyFile).toMillis() - 6 * 60 * 1000,
|
||||
TimeUnit.MILLISECONDS));
|
||||
|
||||
|
||||
// Configure mock registry to say this file isn't registered
|
||||
when(registry.contains(any(File.class))).thenReturn(false);
|
||||
|
||||
// Act
|
||||
invokeCleanupDirectory(systemTempDir, true, 0, 3600000);
|
||||
invokeCleanupDirectoryStreaming(systemTempDir, true, 0, 3600000);
|
||||
|
||||
// Assert
|
||||
assertFalse(Files.exists(emptyFile), "Empty file older than 5 minutes should be deleted");
|
||||
@ -166,13 +171,15 @@ public class TempFileCleanupServiceTest {
|
||||
Path tempFile3 = Files.createFile(dir3.resolve("output_3.pdf"));
|
||||
|
||||
// Make the deepest file old enough to be deleted
|
||||
Files.setLastModifiedTime(tempFile3, FileTime.from( Files.getLastModifiedTime(tempFile3).toMillis() - 5000000, TimeUnit.MILLISECONDS));
|
||||
Files.setLastModifiedTime(tempFile3, FileTime.from(
|
||||
Files.getLastModifiedTime(tempFile3).toMillis() - 5000000,
|
||||
TimeUnit.MILLISECONDS));
|
||||
|
||||
// Configure mock registry to say these files aren't registered
|
||||
when(registry.contains(any(File.class))).thenReturn(false);
|
||||
|
||||
// Act
|
||||
invokeCleanupDirectory(systemTempDir, true, 0, 3600000);
|
||||
invokeCleanupDirectoryStreaming(systemTempDir, true, 0, 3600000);
|
||||
|
||||
// Assert
|
||||
assertTrue(Files.exists(tempFile1), "Recent temp file should be preserved");
|
||||
@ -181,17 +188,25 @@ public class TempFileCleanupServiceTest {
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper method to invoke the private cleanupDirectory method using reflection
|
||||
* Helper method to invoke the private cleanupDirectoryStreaming method using reflection
|
||||
*/
|
||||
private int invokeCleanupDirectory(Path directory, boolean containerMode, int depth, long maxAgeMillis)
|
||||
private void invokeCleanupDirectoryStreaming(Path directory, boolean containerMode, int depth, long maxAgeMillis)
|
||||
throws IOException {
|
||||
try {
|
||||
// Create a consumer that tracks deleted files
|
||||
AtomicInteger deleteCount = new AtomicInteger(0);
|
||||
Consumer<Path> deleteCallback = path -> deleteCount.incrementAndGet();
|
||||
|
||||
// Get the new method with updated signature
|
||||
var method = TempFileCleanupService.class.getDeclaredMethod(
|
||||
"cleanupDirectory", Path.class, boolean.class, int.class, long.class);
|
||||
"cleanupDirectoryStreaming",
|
||||
Path.class, boolean.class, int.class, long.class, boolean.class, Consumer.class);
|
||||
method.setAccessible(true);
|
||||
return (int) method.invoke(cleanupService, directory, containerMode, depth, maxAgeMillis);
|
||||
|
||||
// Invoke the method with appropriate parameters
|
||||
method.invoke(cleanupService, directory, containerMode, depth, maxAgeMillis, false, deleteCallback);
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("Error invoking cleanupDirectory", e);
|
||||
throw new RuntimeException("Error invoking cleanupDirectoryStreaming", e);
|
||||
}
|
||||
}
|
||||
}
|
@ -2,7 +2,6 @@ package stirling.software.SPDF.controller.api.misc;
|
||||
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.*;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.*;
|
||||
import java.util.zip.ZipEntry;
|
||||
@ -23,7 +22,6 @@ import org.springframework.web.bind.annotation.RequestMapping;
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
|
||||
import io.github.pixee.security.BoundedLineReader;
|
||||
import io.github.pixee.security.Filenames;
|
||||
import io.swagger.v3.oas.annotations.Operation;
|
||||
import io.swagger.v3.oas.annotations.tags.Tag;
|
||||
@ -34,6 +32,11 @@ import lombok.extern.slf4j.Slf4j;
|
||||
import stirling.software.SPDF.model.api.misc.ProcessPdfWithOcrRequest;
|
||||
import stirling.software.common.model.ApplicationProperties;
|
||||
import stirling.software.common.service.CustomPDFDocumentFactory;
|
||||
import stirling.software.common.util.ProcessExecutor;
|
||||
import stirling.software.common.util.ProcessExecutor.ProcessExecutorResult;
|
||||
import stirling.software.common.util.TempFileManager;
|
||||
import stirling.software.common.util.TempFileUtil;
|
||||
import stirling.software.common.util.TempFileUtil.TempFile;
|
||||
|
||||
@RestController
|
||||
@RequestMapping("/api/v1/misc")
|
||||
@ -43,8 +46,8 @@ import stirling.software.common.service.CustomPDFDocumentFactory;
|
||||
public class OCRController {
|
||||
|
||||
private final ApplicationProperties applicationProperties;
|
||||
|
||||
private final CustomPDFDocumentFactory pdfDocumentFactory;
|
||||
private final TempFileManager tempFileManager;
|
||||
|
||||
/** Gets the list of available Tesseract languages from the tessdata directory */
|
||||
public List<String> getAvailableTesseractLanguages() {
|
||||
@ -73,93 +76,108 @@ public class OCRController {
|
||||
MultipartFile inputFile = request.getFileInput();
|
||||
List<String> languages = request.getLanguages();
|
||||
String ocrType = request.getOcrType();
|
||||
Path tempDir = Files.createTempDirectory("ocr_process");
|
||||
Path tempInputFile = tempDir.resolve("input.pdf");
|
||||
Path tempOutputDir = tempDir.resolve("output");
|
||||
Path tempImagesDir = tempDir.resolve("images");
|
||||
Path finalOutputFile = tempDir.resolve("final_output.pdf");
|
||||
Files.createDirectories(tempOutputDir);
|
||||
Files.createDirectories(tempImagesDir);
|
||||
Process process = null;
|
||||
|
||||
// Create a temp directory using TempFileManager directly
|
||||
Path tempDirPath = tempFileManager.createTempDirectory();
|
||||
File tempDir = tempDirPath.toFile();
|
||||
|
||||
try {
|
||||
File tempInputFile = new File(tempDir, "input.pdf");
|
||||
File tempOutputDir = new File(tempDir, "output");
|
||||
File tempImagesDir = new File(tempDir, "images");
|
||||
File finalOutputFile = new File(tempDir, "final_output.pdf");
|
||||
|
||||
// Create directories
|
||||
tempOutputDir.mkdirs();
|
||||
tempImagesDir.mkdirs();
|
||||
|
||||
// Save input file
|
||||
inputFile.transferTo(tempInputFile.toFile());
|
||||
inputFile.transferTo(tempInputFile);
|
||||
|
||||
PDFMergerUtility merger = new PDFMergerUtility();
|
||||
merger.setDestinationFileName(finalOutputFile.toString());
|
||||
try (PDDocument document = pdfDocumentFactory.load(tempInputFile.toFile())) {
|
||||
|
||||
try (PDDocument document = pdfDocumentFactory.load(tempInputFile)) {
|
||||
PDFRenderer pdfRenderer = new PDFRenderer(document);
|
||||
int pageCount = document.getNumberOfPages();
|
||||
|
||||
for (int pageNum = 0; pageNum < pageCount; pageNum++) {
|
||||
PDPage page = document.getPage(pageNum);
|
||||
boolean hasText = false;
|
||||
|
||||
// Check for existing text
|
||||
try (PDDocument tempDoc = new PDDocument()) {
|
||||
tempDoc.addPage(page);
|
||||
PDFTextStripper stripper = new PDFTextStripper();
|
||||
hasText = !stripper.getText(tempDoc).trim().isEmpty();
|
||||
}
|
||||
boolean shouldOcr =
|
||||
switch (ocrType) {
|
||||
case "skip-text" -> !hasText;
|
||||
case "force-ocr" -> true;
|
||||
default -> true;
|
||||
};
|
||||
Path pageOutputPath =
|
||||
tempOutputDir.resolve(String.format("page_%d.pdf", pageNum));
|
||||
|
||||
boolean shouldOcr = switch (ocrType) {
|
||||
case "skip-text" -> !hasText;
|
||||
case "force-ocr" -> true;
|
||||
default -> true;
|
||||
};
|
||||
|
||||
File pageOutputPath = new File(tempOutputDir, String.format("page_%d.pdf", pageNum));
|
||||
|
||||
if (shouldOcr) {
|
||||
// Convert page to image
|
||||
BufferedImage image = pdfRenderer.renderImageWithDPI(pageNum, 300);
|
||||
Path imagePath =
|
||||
tempImagesDir.resolve(String.format("page_%d.png", pageNum));
|
||||
ImageIO.write(image, "png", imagePath.toFile());
|
||||
File imagePath = new File(tempImagesDir, String.format("page_%d.png", pageNum));
|
||||
ImageIO.write(image, "png", imagePath);
|
||||
|
||||
// Build OCR command
|
||||
List<String> command = new ArrayList<>();
|
||||
command.add("tesseract");
|
||||
command.add(imagePath.toString());
|
||||
command.add(
|
||||
tempOutputDir
|
||||
.resolve(String.format("page_%d", pageNum))
|
||||
new File(tempOutputDir, String.format("page_%d", pageNum))
|
||||
.toString());
|
||||
command.add("-l");
|
||||
command.add(String.join("+", languages));
|
||||
// Always output PDF
|
||||
command.add("pdf");
|
||||
ProcessBuilder pb = new ProcessBuilder(command);
|
||||
process = pb.start();
|
||||
// Capture any error output
|
||||
try (BufferedReader reader =
|
||||
new BufferedReader(
|
||||
new InputStreamReader(process.getErrorStream()))) {
|
||||
String line;
|
||||
while ((line = BoundedLineReader.readLine(reader, 5_000_000)) != null) {
|
||||
log.debug("Tesseract: {}", line);
|
||||
|
||||
// Use ProcessExecutor to run tesseract command
|
||||
try {
|
||||
ProcessExecutorResult result = ProcessExecutor.getInstance(ProcessExecutor.Processes.TESSERACT)
|
||||
.runCommandWithOutputHandling(command);
|
||||
|
||||
log.debug("Tesseract OCR completed for page {} with exit code {}",
|
||||
pageNum, result.getRc());
|
||||
|
||||
// Add OCR'd PDF to merger
|
||||
merger.addSource(pageOutputPath);
|
||||
} catch (IOException | InterruptedException e) {
|
||||
log.error("Error processing page {} with tesseract: {}", pageNum, e.getMessage());
|
||||
// If OCR fails, fall back to the original page
|
||||
try (PDDocument pageDoc = new PDDocument()) {
|
||||
pageDoc.addPage(page);
|
||||
pageDoc.save(pageOutputPath);
|
||||
merger.addSource(pageOutputPath);
|
||||
}
|
||||
}
|
||||
int exitCode = process.waitFor();
|
||||
if (exitCode != 0) {
|
||||
throw new RuntimeException(
|
||||
"Tesseract failed with exit code: " + exitCode);
|
||||
}
|
||||
// Add OCR'd PDF to merger
|
||||
merger.addSource(pageOutputPath.toFile());
|
||||
} else {
|
||||
// Save original page without OCR
|
||||
try (PDDocument pageDoc = new PDDocument()) {
|
||||
pageDoc.addPage(page);
|
||||
pageDoc.save(pageOutputPath.toFile());
|
||||
merger.addSource(pageOutputPath.toFile());
|
||||
pageDoc.save(pageOutputPath);
|
||||
merger.addSource(pageOutputPath);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Merge all pages into final PDF
|
||||
merger.mergeDocuments(null);
|
||||
|
||||
// Read the final PDF file
|
||||
byte[] pdfContent = Files.readAllBytes(finalOutputFile);
|
||||
byte[] pdfContent = java.nio.file.Files.readAllBytes(finalOutputFile.toPath());
|
||||
String outputFilename =
|
||||
Filenames.toSimpleFileName(inputFile.getOriginalFilename())
|
||||
.replaceFirst("[.][^.]+$", "")
|
||||
+ "_OCR.pdf";
|
||||
|
||||
return ResponseEntity.ok()
|
||||
.header(
|
||||
"Content-Disposition",
|
||||
@ -167,14 +185,11 @@ public class OCRController {
|
||||
.contentType(MediaType.APPLICATION_PDF)
|
||||
.body(pdfContent);
|
||||
} finally {
|
||||
if (process != null) {
|
||||
process.destroy();
|
||||
}
|
||||
// Clean up temporary files
|
||||
deleteDirectory(tempDir);
|
||||
// Clean up the temp directory and all its contents
|
||||
tempFileManager.deleteTempDirectory(tempDirPath);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addFileToZip(File file, String filename, ZipOutputStream zipOut)
|
||||
throws IOException {
|
||||
if (!file.exists()) {
|
||||
@ -192,21 +207,4 @@ public class OCRController {
|
||||
zipOut.closeEntry();
|
||||
}
|
||||
}
|
||||
|
||||
private void deleteDirectory(Path directory) {
|
||||
try {
|
||||
Files.walk(directory)
|
||||
.sorted(Comparator.reverseOrder())
|
||||
.forEach(
|
||||
path -> {
|
||||
try {
|
||||
Files.delete(path);
|
||||
} catch (IOException e) {
|
||||
log.error("Error deleting {}: {}", path, e.getMessage());
|
||||
}
|
||||
});
|
||||
} catch (IOException e) {
|
||||
log.error("Error walking directory {}: {}", directory, e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -44,4 +44,7 @@ springdoc.swagger-ui.path=/index.html
|
||||
posthog.api.key=phc_fiR65u5j6qmXTYL56MNrLZSWqLaDW74OrZH0Insd2xq
|
||||
posthog.host=https://eu.i.posthog.com
|
||||
|
||||
spring.main.allow-bean-definition-overriding=true
|
||||
spring.main.allow-bean-definition-overriding=true
|
||||
|
||||
# Set up a consistent temporary directory location
|
||||
java.io.tmpdir=${stirling.tempfiles.directory:${java.io.tmpdir}/stirling-pdf}
|
Loading…
Reference in New Issue
Block a user