mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-09-08 17:51:20 +02:00
improvements to tmp dir, OCR and others
This commit is contained in:
parent
5aec97939e
commit
93ea611eda
@ -25,7 +25,7 @@ import stirling.software.common.util.TempFileRegistry;
|
|||||||
@Configuration
|
@Configuration
|
||||||
public class TempFileConfiguration {
|
public class TempFileConfiguration {
|
||||||
|
|
||||||
@Value("${stirling.tempfiles.directory:}")
|
@Value("${stirling.tempfiles.directory:${java.io.tmpdir}/stirling-pdf}")
|
||||||
private String customTempDirectory;
|
private String customTempDirectory;
|
||||||
|
|
||||||
@Autowired
|
@Autowired
|
||||||
@ -48,42 +48,11 @@ public class TempFileConfiguration {
|
|||||||
@PostConstruct
|
@PostConstruct
|
||||||
public void initTempFileConfig() {
|
public void initTempFileConfig() {
|
||||||
try {
|
try {
|
||||||
// If a custom temp directory is specified in the config, use it
|
// Create the temp directory if it doesn't exist
|
||||||
if (customTempDirectory != null && !customTempDirectory.isEmpty()) {
|
Path tempDir = Path.of(customTempDirectory);
|
||||||
Path tempDir = Path.of(customTempDirectory);
|
if (!Files.exists(tempDir)) {
|
||||||
if (!Files.exists(tempDir)) {
|
Files.createDirectories(tempDir);
|
||||||
Files.createDirectories(tempDir);
|
log.info("Created temporary directory: {}", tempDir);
|
||||||
log.info("Created custom temporary directory: {}", tempDir);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Set Java temp directory system property if in Docker/Kubernetes mode
|
|
||||||
if ("Docker".equals(machineType) || "Kubernetes".equals(machineType)) {
|
|
||||||
System.setProperty("java.io.tmpdir", customTempDirectory);
|
|
||||||
log.info(
|
|
||||||
"Set system temp directory to: {} for environment: {}",
|
|
||||||
customTempDirectory,
|
|
||||||
machineType);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// No custom directory specified, use java.io.tmpdir + application subfolder
|
|
||||||
String defaultTempDir;
|
|
||||||
|
|
||||||
if ("Docker".equals(machineType) || "Kubernetes".equals(machineType)) {
|
|
||||||
// Container environments should continue to use /tmp/stirling-pdf
|
|
||||||
defaultTempDir = "/tmp/stirling-pdf";
|
|
||||||
} else {
|
|
||||||
// Use system temp directory (java.io.tmpdir) with our application subfolder
|
|
||||||
// This automatically handles Windows (AppData\Local\Temp), macOS, and Linux systems
|
|
||||||
defaultTempDir = System.getProperty("java.io.tmpdir") + File.separator + "stirling-pdf";
|
|
||||||
}
|
|
||||||
customTempDirectory = defaultTempDir;
|
|
||||||
|
|
||||||
// Create the default temp directory
|
|
||||||
Path tempDir = Path.of(customTempDirectory);
|
|
||||||
if (!Files.exists(tempDir)) {
|
|
||||||
Files.createDirectories(tempDir);
|
|
||||||
log.info("Created default OS-specific temporary directory: {}", tempDir);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
log.info("Temporary file configuration initialized");
|
log.info("Temporary file configuration initialized");
|
||||||
@ -93,4 +62,4 @@ public class TempFileConfiguration {
|
|||||||
log.error("Failed to initialize temporary file configuration", e);
|
log.error("Failed to initialize temporary file configuration", e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -3,8 +3,12 @@ package stirling.software.common.service;
|
|||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
import java.util.Arrays;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
import java.util.function.Consumer;
|
||||||
|
import java.util.function.Predicate;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import org.springframework.beans.factory.annotation.Autowired;
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
@ -49,6 +53,36 @@ public class TempFileCleanupService {
|
|||||||
@Value("${stirling.tempfiles.libreoffice-dir:/tmp/stirling-pdf/libreoffice}")
|
@Value("${stirling.tempfiles.libreoffice-dir:/tmp/stirling-pdf/libreoffice}")
|
||||||
private String libreOfficeTempDir;
|
private String libreOfficeTempDir;
|
||||||
|
|
||||||
|
// Maximum recursion depth for directory traversal
|
||||||
|
private static final int MAX_RECURSION_DEPTH = 5;
|
||||||
|
|
||||||
|
// File patterns that identify our temp files
|
||||||
|
private static final Predicate<String> IS_OUR_TEMP_FILE = fileName ->
|
||||||
|
fileName.startsWith("stirling-pdf-") ||
|
||||||
|
fileName.startsWith("output_") ||
|
||||||
|
fileName.startsWith("compressedPDF") ||
|
||||||
|
fileName.startsWith("pdf-save-") ||
|
||||||
|
fileName.startsWith("pdf-stream-") ||
|
||||||
|
fileName.startsWith("PDFBox") ||
|
||||||
|
fileName.startsWith("input_") ||
|
||||||
|
fileName.startsWith("overlay-");
|
||||||
|
|
||||||
|
// File patterns that identify common system temp files
|
||||||
|
private static final Predicate<String> IS_SYSTEM_TEMP_FILE = fileName ->
|
||||||
|
fileName.matches("lu\\d+[a-z0-9]*\\.tmp") ||
|
||||||
|
fileName.matches("ocr_process\\d+") ||
|
||||||
|
(fileName.startsWith("tmp") && !fileName.contains("jetty")) ||
|
||||||
|
fileName.startsWith("OSL_PIPE_") ||
|
||||||
|
(fileName.endsWith(".tmp") && !fileName.contains("jetty"));
|
||||||
|
|
||||||
|
// File patterns that should be excluded from cleanup
|
||||||
|
private static final Predicate<String> SHOULD_SKIP = fileName ->
|
||||||
|
fileName.contains("jetty") ||
|
||||||
|
fileName.startsWith("jetty-") ||
|
||||||
|
fileName.equals("proc") ||
|
||||||
|
fileName.equals("sys") ||
|
||||||
|
fileName.equals("dev");
|
||||||
|
|
||||||
@Autowired
|
@Autowired
|
||||||
public TempFileCleanupService(TempFileRegistry registry, TempFileManager tempFileManager) {
|
public TempFileCleanupService(TempFileRegistry registry, TempFileManager tempFileManager) {
|
||||||
this.registry = registry;
|
this.registry = registry;
|
||||||
@ -114,41 +148,9 @@ public class TempFileCleanupService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int unregisteredDeletedCount = 0;
|
// Clean up unregistered temp files based on our cleanup strategy
|
||||||
try {
|
boolean containerMode = isContainerMode();
|
||||||
// Get all directories we need to clean
|
int unregisteredDeletedCount = cleanupUnregisteredFiles(containerMode, true, maxAgeMillis);
|
||||||
Path systemTempPath;
|
|
||||||
if (systemTempDir != null && !systemTempDir.isEmpty()) {
|
|
||||||
systemTempPath = Path.of(systemTempDir);
|
|
||||||
} else {
|
|
||||||
systemTempPath = Path.of(System.getProperty("java.io.tmpdir"));
|
|
||||||
}
|
|
||||||
|
|
||||||
Path[] dirsToScan = {
|
|
||||||
systemTempPath, Path.of(customTempDirectory), Path.of(libreOfficeTempDir)
|
|
||||||
};
|
|
||||||
|
|
||||||
boolean containerMode =
|
|
||||||
"Docker".equals(machineType) || "Kubernetes".equals(machineType);
|
|
||||||
|
|
||||||
// Process each directory
|
|
||||||
for (Path tempDir : dirsToScan) {
|
|
||||||
if (!Files.exists(tempDir)) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
int dirDeletedCount = cleanupDirectory(tempDir, containerMode, 0, maxAgeMillis);
|
|
||||||
unregisteredDeletedCount += dirDeletedCount;
|
|
||||||
if (dirDeletedCount > 0) {
|
|
||||||
log.info(
|
|
||||||
"Cleaned up {} unregistered files/directories in {}",
|
|
||||||
dirDeletedCount,
|
|
||||||
tempDir);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (IOException e) {
|
|
||||||
log.error("Error during scheduled cleanup of unregistered files", e);
|
|
||||||
}
|
|
||||||
|
|
||||||
log.info(
|
log.info(
|
||||||
"Scheduled cleanup complete. Deleted {} registered files, {} unregistered files, {} directories",
|
"Scheduled cleanup complete. Deleted {} registered files, {} unregistered files, {} directories",
|
||||||
@ -157,272 +159,217 @@ public class TempFileCleanupService {
|
|||||||
directoriesDeletedCount);
|
directoriesDeletedCount);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Overload of cleanupDirectory that uses the specified max age for files */
|
|
||||||
private int cleanupDirectory(
|
|
||||||
Path directory, boolean containerMode, int depth, long maxAgeMillis)
|
|
||||||
throws IOException {
|
|
||||||
if (depth > 5) {
|
|
||||||
log.warn("Maximum directory recursion depth reached for: {}", directory);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
int deletedCount = 0;
|
|
||||||
|
|
||||||
try (Stream<Path> paths = Files.list(directory)) {
|
|
||||||
for (Path path : paths.toList()) {
|
|
||||||
String fileName = path.getFileName().toString();
|
|
||||||
|
|
||||||
// Skip registered files - these are handled by TempFileManager
|
|
||||||
if (registry.contains(path.toFile())) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Skip Jetty-related directories and files
|
|
||||||
if (fileName.contains("jetty") || fileName.startsWith("jetty-")) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check if this is a directory we should recursively scan
|
|
||||||
if (Files.isDirectory(path)) {
|
|
||||||
// Don't recurse into certain system directories
|
|
||||||
if (!fileName.equals("proc")
|
|
||||||
&& !fileName.equals("sys")
|
|
||||||
&& !fileName.equals("dev")) {
|
|
||||||
deletedCount +=
|
|
||||||
cleanupDirectory(path, containerMode, depth + 1, maxAgeMillis);
|
|
||||||
}
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Determine if this file matches our temp file patterns
|
|
||||||
boolean isOurTempFile =
|
|
||||||
fileName.startsWith("stirling-pdf-")
|
|
||||||
|| fileName.startsWith("output_")
|
|
||||||
|| fileName.startsWith("compressedPDF")
|
|
||||||
|| fileName.startsWith("pdf-save-")
|
|
||||||
|| fileName.startsWith("pdf-stream-")
|
|
||||||
|| fileName.startsWith("PDFBox")
|
|
||||||
|| fileName.startsWith("input_")
|
|
||||||
|| fileName.startsWith("overlay-");
|
|
||||||
|
|
||||||
// Avoid touching Jetty files
|
|
||||||
boolean isSystemTempFile =
|
|
||||||
fileName.matches("lu\\d+[a-z0-9]*\\.tmp")
|
|
||||||
|| fileName.matches("ocr_process\\d+")
|
|
||||||
|| (fileName.startsWith("tmp") && !fileName.contains("jetty"))
|
|
||||||
|| fileName.startsWith("OSL_PIPE_")
|
|
||||||
|| (fileName.endsWith(".tmp") && !fileName.contains("jetty"));
|
|
||||||
|
|
||||||
boolean shouldDelete = isOurTempFile || (containerMode && isSystemTempFile);
|
|
||||||
|
|
||||||
// Special case for zero-byte files - these are often corrupted temp files
|
|
||||||
try {
|
|
||||||
if (Files.size(path) == 0) {
|
|
||||||
// For empty files, use a shorter timeout (5 minutes)
|
|
||||||
long lastModified = Files.getLastModifiedTime(path).toMillis();
|
|
||||||
long currentTime = System.currentTimeMillis();
|
|
||||||
// Delete empty files older than 5 minutes
|
|
||||||
if ((currentTime - lastModified) > 5 * 60 * 1000) {
|
|
||||||
shouldDelete = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (IOException e) {
|
|
||||||
log.debug("Could not check file size, skipping: {}", path);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check file age against maxAgeMillis
|
|
||||||
if (shouldDelete) {
|
|
||||||
try {
|
|
||||||
long lastModified = Files.getLastModifiedTime(path).toMillis();
|
|
||||||
long currentTime = System.currentTimeMillis();
|
|
||||||
shouldDelete = (currentTime - lastModified) > maxAgeMillis;
|
|
||||||
} catch (IOException e) {
|
|
||||||
log.debug("Could not check file age, skipping: {}", path);
|
|
||||||
shouldDelete = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (shouldDelete) {
|
|
||||||
try {
|
|
||||||
Files.deleteIfExists(path);
|
|
||||||
deletedCount++;
|
|
||||||
log.debug(
|
|
||||||
"Deleted unregistered temp file during scheduled cleanup: {}",
|
|
||||||
path);
|
|
||||||
} catch (IOException e) {
|
|
||||||
// Handle locked files more gracefully - just log at debug level
|
|
||||||
if (e.getMessage() != null
|
|
||||||
&& e.getMessage().contains("being used by another process")) {
|
|
||||||
log.debug("File locked, skipping delete: {}", path);
|
|
||||||
} else {
|
|
||||||
log.warn(
|
|
||||||
"Failed to delete temp file during scheduled cleanup: {}",
|
|
||||||
path,
|
|
||||||
e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return deletedCount;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Perform startup cleanup of stale temporary files from previous runs. This is especially
|
* Perform startup cleanup of stale temporary files from previous runs. This is especially
|
||||||
* important in Docker environments where temp files persist between container restarts.
|
* important in Docker environments where temp files persist between container restarts.
|
||||||
*/
|
*/
|
||||||
private void runStartupCleanup() {
|
private void runStartupCleanup() {
|
||||||
log.info("Running startup temporary file cleanup");
|
log.info("Running startup temporary file cleanup");
|
||||||
|
boolean containerMode = isContainerMode();
|
||||||
|
|
||||||
|
log.info(
|
||||||
|
"Running in {} mode, using {} cleanup strategy",
|
||||||
|
machineType,
|
||||||
|
containerMode ? "aggressive" : "conservative");
|
||||||
|
|
||||||
|
// For startup cleanup, we use a longer timeout for non-container environments
|
||||||
|
long maxAgeMillis = containerMode ? 0 : 24 * 60 * 60 * 1000; // 0 or 24 hours
|
||||||
|
|
||||||
|
int totalDeletedCount = cleanupUnregisteredFiles(containerMode, false, maxAgeMillis);
|
||||||
|
|
||||||
|
log.info(
|
||||||
|
"Startup cleanup complete. Deleted {} temporary files/directories",
|
||||||
|
totalDeletedCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Clean up unregistered temporary files across all configured temp directories.
|
||||||
|
*
|
||||||
|
* @param containerMode Whether we're in container mode (more aggressive cleanup)
|
||||||
|
* @param isScheduled Whether this is a scheduled cleanup or startup cleanup
|
||||||
|
* @param maxAgeMillis Maximum age of files to clean in milliseconds
|
||||||
|
* @return Number of files deleted
|
||||||
|
*/
|
||||||
|
private int cleanupUnregisteredFiles(boolean containerMode, boolean isScheduled, long maxAgeMillis) {
|
||||||
|
AtomicInteger totalDeletedCount = new AtomicInteger(0);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Get all directories we need to clean
|
// Get all directories we need to clean
|
||||||
Path systemTempPath;
|
Path systemTempPath = getSystemTempPath();
|
||||||
if (systemTempDir != null && !systemTempDir.isEmpty()) {
|
|
||||||
systemTempPath = Path.of(systemTempDir);
|
|
||||||
} else {
|
|
||||||
systemTempPath = Path.of(System.getProperty("java.io.tmpdir"));
|
|
||||||
}
|
|
||||||
|
|
||||||
Path[] dirsToScan = {
|
Path[] dirsToScan = {
|
||||||
systemTempPath, Path.of(customTempDirectory), Path.of(libreOfficeTempDir)
|
systemTempPath,
|
||||||
|
Path.of(customTempDirectory),
|
||||||
|
Path.of(libreOfficeTempDir)
|
||||||
};
|
};
|
||||||
|
|
||||||
int totalDeletedCount = 0;
|
|
||||||
|
|
||||||
boolean containerMode =
|
|
||||||
"Docker".equals(machineType) || "Kubernetes".equals(machineType);
|
|
||||||
log.info(
|
|
||||||
"Running in {} mode, using {} cleanup strategy",
|
|
||||||
machineType,
|
|
||||||
containerMode ? "aggressive" : "conservative");
|
|
||||||
|
|
||||||
// Process each directory
|
// Process each directory
|
||||||
for (Path tempDir : dirsToScan) {
|
Arrays.stream(dirsToScan)
|
||||||
if (!Files.exists(tempDir)) {
|
.filter(Files::exists)
|
||||||
log.warn("Temporary directory does not exist: {}", tempDir);
|
.forEach(tempDir -> {
|
||||||
continue;
|
try {
|
||||||
|
String phase = isScheduled ? "scheduled" : "startup";
|
||||||
|
log.info("Scanning directory for {} cleanup: {}", phase, tempDir);
|
||||||
|
|
||||||
|
AtomicInteger dirDeletedCount = new AtomicInteger(0);
|
||||||
|
cleanupDirectoryStreaming(
|
||||||
|
tempDir,
|
||||||
|
containerMode,
|
||||||
|
0,
|
||||||
|
maxAgeMillis,
|
||||||
|
isScheduled,
|
||||||
|
path -> {
|
||||||
|
dirDeletedCount.incrementAndGet();
|
||||||
|
if (log.isDebugEnabled()) {
|
||||||
|
log.debug("Deleted temp file during {} cleanup: {}", phase, path);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
int count = dirDeletedCount.get();
|
||||||
|
totalDeletedCount.addAndGet(count);
|
||||||
|
if (count > 0) {
|
||||||
|
log.info("Cleaned up {} files/directories in {}", count, tempDir);
|
||||||
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
log.error("Error during cleanup of directory: {}", tempDir, e);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.error("Error during cleanup of unregistered files", e);
|
||||||
|
}
|
||||||
|
|
||||||
|
return totalDeletedCount.get();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the system temp directory path based on configuration or system property.
|
||||||
|
*/
|
||||||
|
private Path getSystemTempPath() {
|
||||||
|
if (systemTempDir != null && !systemTempDir.isEmpty()) {
|
||||||
|
return Path.of(systemTempDir);
|
||||||
|
} else {
|
||||||
|
return Path.of(System.getProperty("java.io.tmpdir"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Determine if we're running in a container environment.
|
||||||
|
*/
|
||||||
|
private boolean isContainerMode() {
|
||||||
|
return "Docker".equals(machineType) || "Kubernetes".equals(machineType);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Recursively clean up a directory using a streaming approach to reduce memory usage.
|
||||||
|
*
|
||||||
|
* @param directory The directory to clean
|
||||||
|
* @param containerMode Whether we're in container mode (more aggressive cleanup)
|
||||||
|
* @param depth Current recursion depth
|
||||||
|
* @param maxAgeMillis Maximum age of files to delete
|
||||||
|
* @param isScheduled Whether this is a scheduled cleanup (vs startup)
|
||||||
|
* @param onDeleteCallback Callback function when a file is deleted
|
||||||
|
* @throws IOException If an I/O error occurs
|
||||||
|
*/
|
||||||
|
private void cleanupDirectoryStreaming(
|
||||||
|
Path directory,
|
||||||
|
boolean containerMode,
|
||||||
|
int depth,
|
||||||
|
long maxAgeMillis,
|
||||||
|
boolean isScheduled,
|
||||||
|
Consumer<Path> onDeleteCallback) throws IOException {
|
||||||
|
|
||||||
|
// Check recursion depth limit
|
||||||
|
if (depth > MAX_RECURSION_DEPTH) {
|
||||||
|
log.warn("Maximum directory recursion depth reached for: {}", directory);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use try-with-resources to ensure the stream is closed
|
||||||
|
try (Stream<Path> pathStream = Files.list(directory)) {
|
||||||
|
// Process files in a streaming fashion instead of materializing the whole list
|
||||||
|
pathStream.forEach(path -> {
|
||||||
|
try {
|
||||||
|
String fileName = path.getFileName().toString();
|
||||||
|
|
||||||
|
// Skip if file should be excluded
|
||||||
|
if (SHOULD_SKIP.test(fileName)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Handle directories recursively
|
||||||
|
if (Files.isDirectory(path)) {
|
||||||
|
try {
|
||||||
|
cleanupDirectoryStreaming(
|
||||||
|
path, containerMode, depth + 1, maxAgeMillis, isScheduled, onDeleteCallback);
|
||||||
|
} catch (IOException e) {
|
||||||
|
log.warn("Error processing subdirectory: {}", path, e);
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Skip registered files - these are handled by TempFileManager
|
||||||
|
if (isScheduled && registry.contains(path.toFile())) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if this file should be deleted
|
||||||
|
if (shouldDeleteFile(path, fileName, containerMode, maxAgeMillis)) {
|
||||||
|
try {
|
||||||
|
Files.deleteIfExists(path);
|
||||||
|
onDeleteCallback.accept(path);
|
||||||
|
} catch (IOException e) {
|
||||||
|
// Handle locked files more gracefully
|
||||||
|
if (e.getMessage() != null && e.getMessage().contains("being used by another process")) {
|
||||||
|
log.debug("File locked, skipping delete: {}", path);
|
||||||
|
} else {
|
||||||
|
log.warn("Failed to delete temp file: {}", path, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.warn("Error processing path: {}", path, e);
|
||||||
}
|
}
|
||||||
|
});
|
||||||
log.info("Scanning directory for cleanup: {}", tempDir);
|
|
||||||
int dirDeletedCount = cleanupDirectory(tempDir, containerMode, 0);
|
|
||||||
totalDeletedCount += dirDeletedCount;
|
|
||||||
log.info("Cleaned up {} files/directories in {}", dirDeletedCount, tempDir);
|
|
||||||
}
|
|
||||||
|
|
||||||
log.info(
|
|
||||||
"Startup cleanup complete. Deleted {} temporary files/directories",
|
|
||||||
totalDeletedCount);
|
|
||||||
} catch (IOException e) {
|
|
||||||
log.error("Error during startup cleanup", e);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Recursively clean up a directory for temporary files.
|
* Determine if a file should be deleted based on its name, age, and other criteria.
|
||||||
*
|
|
||||||
* @param directory The directory to clean
|
|
||||||
* @param containerMode Whether we're in container mode (more aggressive cleanup)
|
|
||||||
* @param depth Current recursion depth (to prevent excessive recursion)
|
|
||||||
* @return Number of files deleted
|
|
||||||
*/
|
*/
|
||||||
private int cleanupDirectory(Path directory, boolean containerMode, int depth)
|
private boolean shouldDeleteFile(Path path, String fileName, boolean containerMode, long maxAgeMillis) {
|
||||||
throws IOException {
|
// First check if it matches our known temp file patterns
|
||||||
if (depth > 5) {
|
boolean isOurTempFile = IS_OUR_TEMP_FILE.test(fileName);
|
||||||
log.warn("Maximum directory recursion depth reached for: {}", directory);
|
boolean isSystemTempFile = IS_SYSTEM_TEMP_FILE.test(fileName);
|
||||||
return 0;
|
boolean shouldDelete = isOurTempFile || (containerMode && isSystemTempFile);
|
||||||
|
|
||||||
|
// Special case for zero-byte files - these are often corrupted temp files
|
||||||
|
try {
|
||||||
|
if (Files.size(path) == 0) {
|
||||||
|
// For empty files, use a shorter timeout (5 minutes)
|
||||||
|
long lastModified = Files.getLastModifiedTime(path).toMillis();
|
||||||
|
long currentTime = System.currentTimeMillis();
|
||||||
|
// Delete empty files older than 5 minutes
|
||||||
|
if ((currentTime - lastModified) > 5 * 60 * 1000) {
|
||||||
|
shouldDelete = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
log.debug("Could not check file size, skipping: {}", path);
|
||||||
}
|
}
|
||||||
|
|
||||||
int deletedCount = 0;
|
// Check file age against maxAgeMillis
|
||||||
|
if (shouldDelete && maxAgeMillis > 0) {
|
||||||
try (Stream<Path> paths = Files.list(directory)) {
|
try {
|
||||||
for (Path path : paths.toList()) {
|
long lastModified = Files.getLastModifiedTime(path).toMillis();
|
||||||
String fileName = path.getFileName().toString();
|
long currentTime = System.currentTimeMillis();
|
||||||
|
shouldDelete = (currentTime - lastModified) > maxAgeMillis;
|
||||||
// Skip Jetty-related directories and files
|
} catch (IOException e) {
|
||||||
if (fileName.contains("jetty") || fileName.startsWith("jetty-")) {
|
log.debug("Could not check file age, skipping: {}", path);
|
||||||
continue;
|
shouldDelete = false;
|
||||||
}
|
|
||||||
|
|
||||||
// Check if this is a directory we should recursively scan
|
|
||||||
if (Files.isDirectory(path)) {
|
|
||||||
// Don't recurse into certain system directories
|
|
||||||
if (!fileName.equals("proc")
|
|
||||||
&& !fileName.equals("sys")
|
|
||||||
&& !fileName.equals("dev")) {
|
|
||||||
deletedCount += cleanupDirectory(path, containerMode, depth + 1);
|
|
||||||
}
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Determine if this file matches our temp file patterns
|
|
||||||
boolean isOurTempFile =
|
|
||||||
fileName.startsWith("stirling-pdf-")
|
|
||||||
|| fileName.startsWith("output_")
|
|
||||||
|| fileName.startsWith("compressedPDF")
|
|
||||||
|| fileName.startsWith("pdf-save-")
|
|
||||||
|| fileName.startsWith("pdf-stream-")
|
|
||||||
|| fileName.startsWith("PDFBox")
|
|
||||||
|| fileName.startsWith("input_")
|
|
||||||
|| fileName.startsWith("overlay-");
|
|
||||||
|
|
||||||
// Avoid touching Jetty files
|
|
||||||
boolean isSystemTempFile =
|
|
||||||
fileName.matches("lu\\d+[a-z0-9]*\\.tmp")
|
|
||||||
|| fileName.matches("ocr_process\\d+")
|
|
||||||
|| (fileName.startsWith("tmp") && !fileName.contains("jetty"))
|
|
||||||
|| fileName.startsWith("OSL_PIPE_")
|
|
||||||
|| (fileName.endsWith(".tmp") && !fileName.contains("jetty"));
|
|
||||||
|
|
||||||
boolean shouldDelete = isOurTempFile || (containerMode && isSystemTempFile);
|
|
||||||
|
|
||||||
// Special case for zero-byte files - these are often corrupted temp files
|
|
||||||
boolean isEmptyFile = false;
|
|
||||||
try {
|
|
||||||
if (!Files.isDirectory(path) && Files.size(path) == 0) {
|
|
||||||
isEmptyFile = true;
|
|
||||||
// For empty files, use a shorter timeout (5 minutes)
|
|
||||||
long lastModified = Files.getLastModifiedTime(path).toMillis();
|
|
||||||
long currentTime = System.currentTimeMillis();
|
|
||||||
// Delete empty files older than 5 minutes
|
|
||||||
if ((currentTime - lastModified) > 5 * 60 * 1000) {
|
|
||||||
shouldDelete = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (IOException e) {
|
|
||||||
log.debug("Could not check file size, skipping: {}", path);
|
|
||||||
}
|
|
||||||
|
|
||||||
// For non-container mode, check file age before deleting
|
|
||||||
if (!containerMode && (isOurTempFile || isSystemTempFile) && !isEmptyFile) {
|
|
||||||
try {
|
|
||||||
long lastModified = Files.getLastModifiedTime(path).toMillis();
|
|
||||||
long currentTime = System.currentTimeMillis();
|
|
||||||
// Only delete files older than 24 hours in non-container mode
|
|
||||||
shouldDelete = (currentTime - lastModified) > 24 * 60 * 60 * 1000;
|
|
||||||
} catch (IOException e) {
|
|
||||||
log.debug("Could not check file age, skipping: {}", path);
|
|
||||||
shouldDelete = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (shouldDelete) {
|
|
||||||
try {
|
|
||||||
if (Files.isDirectory(path)) {
|
|
||||||
GeneralUtils.deleteDirectory(path);
|
|
||||||
} else {
|
|
||||||
Files.deleteIfExists(path);
|
|
||||||
}
|
|
||||||
deletedCount++;
|
|
||||||
log.debug("Deleted temp file during startup cleanup: {}", path);
|
|
||||||
} catch (IOException e) {
|
|
||||||
log.warn("Failed to delete temp file during startup cleanup: {}", path, e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return deletedCount;
|
return shouldDelete;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Clean up LibreOffice temporary files. This method is called after LibreOffice operations. */
|
/** Clean up LibreOffice temporary files. This method is called after LibreOffice operations. */
|
||||||
@ -431,18 +378,17 @@ public class TempFileCleanupService {
|
|||||||
try {
|
try {
|
||||||
Set<Path> directories = registry.getTempDirectories();
|
Set<Path> directories = registry.getTempDirectories();
|
||||||
for (Path dir : directories) {
|
for (Path dir : directories) {
|
||||||
if (dir.getFileName().toString().contains("libreoffice")) {
|
if (dir.getFileName().toString().contains("libreoffice") && Files.exists(dir)) {
|
||||||
// For directories containing "libreoffice", delete all contents
|
// For directories containing "libreoffice", delete all contents
|
||||||
// but keep the directory itself for future use
|
// but keep the directory itself for future use
|
||||||
try (Stream<Path> files = Files.list(dir)) {
|
cleanupDirectoryStreaming(
|
||||||
for (Path file : files.toList()) {
|
dir,
|
||||||
if (Files.isDirectory(file)) {
|
isContainerMode(),
|
||||||
GeneralUtils.deleteDirectory(file);
|
0,
|
||||||
} else {
|
0, // age doesn't matter for LibreOffice cleanup
|
||||||
Files.deleteIfExists(file);
|
false,
|
||||||
}
|
path -> log.debug("Cleaned up LibreOffice temp file: {}", path)
|
||||||
}
|
);
|
||||||
}
|
|
||||||
log.debug("Cleaned up LibreOffice temp directory contents: {}", dir);
|
log.debug("Cleaned up LibreOffice temp directory contents: {}", dir);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -450,4 +396,4 @@ public class TempFileCleanupService {
|
|||||||
log.warn("Failed to clean up LibreOffice temp files", e);
|
log.warn("Failed to clean up LibreOffice temp files", e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -14,6 +14,8 @@ import java.util.Arrays;
|
|||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
import java.util.function.Consumer;
|
||||||
|
|
||||||
import org.junit.jupiter.api.BeforeEach;
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
@ -118,12 +120,14 @@ public class TempFileCleanupServiceTest {
|
|||||||
|
|
||||||
// Create a file older than threshold
|
// Create a file older than threshold
|
||||||
Path oldFile = Files.createFile(systemTempDir.resolve("output_old.pdf"));
|
Path oldFile = Files.createFile(systemTempDir.resolve("output_old.pdf"));
|
||||||
Files.setLastModifiedTime(oldFile, FileTime.from( Files.getLastModifiedTime(oldFile).toMillis() - 5000000, TimeUnit.MILLISECONDS));
|
Files.setLastModifiedTime(oldFile, FileTime.from(
|
||||||
|
Files.getLastModifiedTime(oldFile).toMillis() - 5000000,
|
||||||
|
TimeUnit.MILLISECONDS));
|
||||||
|
|
||||||
// Act
|
// Act
|
||||||
invokeCleanupDirectory(systemTempDir, true, 0, 3600000);
|
invokeCleanupDirectoryStreaming(systemTempDir, true, 0, 3600000);
|
||||||
invokeCleanupDirectory(customTempDir, true, 0, 3600000);
|
invokeCleanupDirectoryStreaming(customTempDir, true, 0, 3600000);
|
||||||
invokeCleanupDirectory(libreOfficeTempDir, true, 0, 3600000);
|
invokeCleanupDirectoryStreaming(libreOfficeTempDir, true, 0, 3600000);
|
||||||
|
|
||||||
// Assert - Our temp files and system temp files should be deleted (if old enough)
|
// Assert - Our temp files and system temp files should be deleted (if old enough)
|
||||||
assertFalse(Files.exists(oldFile), "Old temp file should be deleted");
|
assertFalse(Files.exists(oldFile), "Old temp file should be deleted");
|
||||||
@ -141,14 +145,15 @@ public class TempFileCleanupServiceTest {
|
|||||||
// Arrange - Create an empty file
|
// Arrange - Create an empty file
|
||||||
Path emptyFile = Files.createFile(systemTempDir.resolve("empty.tmp"));
|
Path emptyFile = Files.createFile(systemTempDir.resolve("empty.tmp"));
|
||||||
// Make it "old enough" to be deleted (>5 minutes)
|
// Make it "old enough" to be deleted (>5 minutes)
|
||||||
Files.setLastModifiedTime(emptyFile, FileTime.from( Files.getLastModifiedTime(emptyFile).toMillis() - 6 * 60 * 1000, TimeUnit.MILLISECONDS));
|
Files.setLastModifiedTime(emptyFile, FileTime.from(
|
||||||
|
Files.getLastModifiedTime(emptyFile).toMillis() - 6 * 60 * 1000,
|
||||||
|
TimeUnit.MILLISECONDS));
|
||||||
|
|
||||||
|
|
||||||
// Configure mock registry to say this file isn't registered
|
// Configure mock registry to say this file isn't registered
|
||||||
when(registry.contains(any(File.class))).thenReturn(false);
|
when(registry.contains(any(File.class))).thenReturn(false);
|
||||||
|
|
||||||
// Act
|
// Act
|
||||||
invokeCleanupDirectory(systemTempDir, true, 0, 3600000);
|
invokeCleanupDirectoryStreaming(systemTempDir, true, 0, 3600000);
|
||||||
|
|
||||||
// Assert
|
// Assert
|
||||||
assertFalse(Files.exists(emptyFile), "Empty file older than 5 minutes should be deleted");
|
assertFalse(Files.exists(emptyFile), "Empty file older than 5 minutes should be deleted");
|
||||||
@ -166,13 +171,15 @@ public class TempFileCleanupServiceTest {
|
|||||||
Path tempFile3 = Files.createFile(dir3.resolve("output_3.pdf"));
|
Path tempFile3 = Files.createFile(dir3.resolve("output_3.pdf"));
|
||||||
|
|
||||||
// Make the deepest file old enough to be deleted
|
// Make the deepest file old enough to be deleted
|
||||||
Files.setLastModifiedTime(tempFile3, FileTime.from( Files.getLastModifiedTime(tempFile3).toMillis() - 5000000, TimeUnit.MILLISECONDS));
|
Files.setLastModifiedTime(tempFile3, FileTime.from(
|
||||||
|
Files.getLastModifiedTime(tempFile3).toMillis() - 5000000,
|
||||||
|
TimeUnit.MILLISECONDS));
|
||||||
|
|
||||||
// Configure mock registry to say these files aren't registered
|
// Configure mock registry to say these files aren't registered
|
||||||
when(registry.contains(any(File.class))).thenReturn(false);
|
when(registry.contains(any(File.class))).thenReturn(false);
|
||||||
|
|
||||||
// Act
|
// Act
|
||||||
invokeCleanupDirectory(systemTempDir, true, 0, 3600000);
|
invokeCleanupDirectoryStreaming(systemTempDir, true, 0, 3600000);
|
||||||
|
|
||||||
// Assert
|
// Assert
|
||||||
assertTrue(Files.exists(tempFile1), "Recent temp file should be preserved");
|
assertTrue(Files.exists(tempFile1), "Recent temp file should be preserved");
|
||||||
@ -181,17 +188,25 @@ public class TempFileCleanupServiceTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Helper method to invoke the private cleanupDirectory method using reflection
|
* Helper method to invoke the private cleanupDirectoryStreaming method using reflection
|
||||||
*/
|
*/
|
||||||
private int invokeCleanupDirectory(Path directory, boolean containerMode, int depth, long maxAgeMillis)
|
private void invokeCleanupDirectoryStreaming(Path directory, boolean containerMode, int depth, long maxAgeMillis)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
try {
|
try {
|
||||||
|
// Callback that only counts deletions; the count is not read or asserted on in this helper
|
||||||
|
AtomicInteger deleteCount = new AtomicInteger(0);
|
||||||
|
Consumer<Path> deleteCallback = path -> deleteCount.incrementAndGet();
|
||||||
|
|
||||||
|
// Look up the private cleanupDirectoryStreaming method by its full parameter list
|
||||||
var method = TempFileCleanupService.class.getDeclaredMethod(
|
var method = TempFileCleanupService.class.getDeclaredMethod(
|
||||||
"cleanupDirectory", Path.class, boolean.class, int.class, long.class);
|
"cleanupDirectoryStreaming",
|
||||||
|
Path.class, boolean.class, int.class, long.class, boolean.class, Consumer.class);
|
||||||
method.setAccessible(true);
|
method.setAccessible(true);
|
||||||
return (int) method.invoke(cleanupService, directory, containerMode, depth, maxAgeMillis);
|
|
||||||
|
// Invoke the method with appropriate parameters
|
||||||
|
method.invoke(cleanupService, directory, containerMode, depth, maxAgeMillis, false, deleteCallback);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
throw new RuntimeException("Error invoking cleanupDirectory", e);
|
throw new RuntimeException("Error invoking cleanupDirectoryStreaming", e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -2,7 +2,6 @@ package stirling.software.SPDF.controller.api.misc;
|
|||||||
|
|
||||||
import java.awt.image.BufferedImage;
|
import java.awt.image.BufferedImage;
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
import java.nio.file.Files;
|
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.zip.ZipEntry;
|
import java.util.zip.ZipEntry;
|
||||||
@ -23,7 +22,6 @@ import org.springframework.web.bind.annotation.RequestMapping;
|
|||||||
import org.springframework.web.bind.annotation.RestController;
|
import org.springframework.web.bind.annotation.RestController;
|
||||||
import org.springframework.web.multipart.MultipartFile;
|
import org.springframework.web.multipart.MultipartFile;
|
||||||
|
|
||||||
import io.github.pixee.security.BoundedLineReader;
|
|
||||||
import io.github.pixee.security.Filenames;
|
import io.github.pixee.security.Filenames;
|
||||||
import io.swagger.v3.oas.annotations.Operation;
|
import io.swagger.v3.oas.annotations.Operation;
|
||||||
import io.swagger.v3.oas.annotations.tags.Tag;
|
import io.swagger.v3.oas.annotations.tags.Tag;
|
||||||
@ -34,6 +32,11 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
import stirling.software.SPDF.model.api.misc.ProcessPdfWithOcrRequest;
|
import stirling.software.SPDF.model.api.misc.ProcessPdfWithOcrRequest;
|
||||||
import stirling.software.common.model.ApplicationProperties;
|
import stirling.software.common.model.ApplicationProperties;
|
||||||
import stirling.software.common.service.CustomPDFDocumentFactory;
|
import stirling.software.common.service.CustomPDFDocumentFactory;
|
||||||
|
import stirling.software.common.util.ProcessExecutor;
|
||||||
|
import stirling.software.common.util.ProcessExecutor.ProcessExecutorResult;
|
||||||
|
import stirling.software.common.util.TempFileManager;
|
||||||
|
import stirling.software.common.util.TempFileUtil;
|
||||||
|
import stirling.software.common.util.TempFileUtil.TempFile;
|
||||||
|
|
||||||
@RestController
|
@RestController
|
||||||
@RequestMapping("/api/v1/misc")
|
@RequestMapping("/api/v1/misc")
|
||||||
@ -43,8 +46,8 @@ import stirling.software.common.service.CustomPDFDocumentFactory;
|
|||||||
public class OCRController {
|
public class OCRController {
|
||||||
|
|
||||||
private final ApplicationProperties applicationProperties;
|
private final ApplicationProperties applicationProperties;
|
||||||
|
|
||||||
private final CustomPDFDocumentFactory pdfDocumentFactory;
|
private final CustomPDFDocumentFactory pdfDocumentFactory;
|
||||||
|
private final TempFileManager tempFileManager;
|
||||||
|
|
||||||
/** Gets the list of available Tesseract languages from the tessdata directory */
|
/** Gets the list of available Tesseract languages from the tessdata directory */
|
||||||
public List<String> getAvailableTesseractLanguages() {
|
public List<String> getAvailableTesseractLanguages() {
|
||||||
@ -73,93 +76,108 @@ public class OCRController {
|
|||||||
MultipartFile inputFile = request.getFileInput();
|
MultipartFile inputFile = request.getFileInput();
|
||||||
List<String> languages = request.getLanguages();
|
List<String> languages = request.getLanguages();
|
||||||
String ocrType = request.getOcrType();
|
String ocrType = request.getOcrType();
|
||||||
Path tempDir = Files.createTempDirectory("ocr_process");
|
|
||||||
Path tempInputFile = tempDir.resolve("input.pdf");
|
// Create a temp directory using TempFileManager directly
|
||||||
Path tempOutputDir = tempDir.resolve("output");
|
Path tempDirPath = tempFileManager.createTempDirectory();
|
||||||
Path tempImagesDir = tempDir.resolve("images");
|
File tempDir = tempDirPath.toFile();
|
||||||
Path finalOutputFile = tempDir.resolve("final_output.pdf");
|
|
||||||
Files.createDirectories(tempOutputDir);
|
|
||||||
Files.createDirectories(tempImagesDir);
|
|
||||||
Process process = null;
|
|
||||||
try {
|
try {
|
||||||
|
File tempInputFile = new File(tempDir, "input.pdf");
|
||||||
|
File tempOutputDir = new File(tempDir, "output");
|
||||||
|
File tempImagesDir = new File(tempDir, "images");
|
||||||
|
File finalOutputFile = new File(tempDir, "final_output.pdf");
|
||||||
|
|
||||||
|
// Create directories
|
||||||
|
tempOutputDir.mkdirs();
|
||||||
|
tempImagesDir.mkdirs();
|
||||||
|
|
||||||
// Save input file
|
// Save input file
|
||||||
inputFile.transferTo(tempInputFile.toFile());
|
inputFile.transferTo(tempInputFile);
|
||||||
|
|
||||||
PDFMergerUtility merger = new PDFMergerUtility();
|
PDFMergerUtility merger = new PDFMergerUtility();
|
||||||
merger.setDestinationFileName(finalOutputFile.toString());
|
merger.setDestinationFileName(finalOutputFile.toString());
|
||||||
try (PDDocument document = pdfDocumentFactory.load(tempInputFile.toFile())) {
|
|
||||||
|
try (PDDocument document = pdfDocumentFactory.load(tempInputFile)) {
|
||||||
PDFRenderer pdfRenderer = new PDFRenderer(document);
|
PDFRenderer pdfRenderer = new PDFRenderer(document);
|
||||||
int pageCount = document.getNumberOfPages();
|
int pageCount = document.getNumberOfPages();
|
||||||
|
|
||||||
for (int pageNum = 0; pageNum < pageCount; pageNum++) {
|
for (int pageNum = 0; pageNum < pageCount; pageNum++) {
|
||||||
PDPage page = document.getPage(pageNum);
|
PDPage page = document.getPage(pageNum);
|
||||||
boolean hasText = false;
|
boolean hasText = false;
|
||||||
|
|
||||||
// Check for existing text
|
// Check for existing text
|
||||||
try (PDDocument tempDoc = new PDDocument()) {
|
try (PDDocument tempDoc = new PDDocument()) {
|
||||||
tempDoc.addPage(page);
|
tempDoc.addPage(page);
|
||||||
PDFTextStripper stripper = new PDFTextStripper();
|
PDFTextStripper stripper = new PDFTextStripper();
|
||||||
hasText = !stripper.getText(tempDoc).trim().isEmpty();
|
hasText = !stripper.getText(tempDoc).trim().isEmpty();
|
||||||
}
|
}
|
||||||
boolean shouldOcr =
|
|
||||||
switch (ocrType) {
|
boolean shouldOcr = switch (ocrType) {
|
||||||
case "skip-text" -> !hasText;
|
case "skip-text" -> !hasText;
|
||||||
case "force-ocr" -> true;
|
case "force-ocr" -> true;
|
||||||
default -> true;
|
default -> true;
|
||||||
};
|
};
|
||||||
Path pageOutputPath =
|
|
||||||
tempOutputDir.resolve(String.format("page_%d.pdf", pageNum));
|
File pageOutputPath = new File(tempOutputDir, String.format("page_%d.pdf", pageNum));
|
||||||
|
|
||||||
if (shouldOcr) {
|
if (shouldOcr) {
|
||||||
// Convert page to image
|
// Convert page to image
|
||||||
BufferedImage image = pdfRenderer.renderImageWithDPI(pageNum, 300);
|
BufferedImage image = pdfRenderer.renderImageWithDPI(pageNum, 300);
|
||||||
Path imagePath =
|
File imagePath = new File(tempImagesDir, String.format("page_%d.png", pageNum));
|
||||||
tempImagesDir.resolve(String.format("page_%d.png", pageNum));
|
ImageIO.write(image, "png", imagePath);
|
||||||
ImageIO.write(image, "png", imagePath.toFile());
|
|
||||||
// Build OCR command
|
// Build OCR command
|
||||||
List<String> command = new ArrayList<>();
|
List<String> command = new ArrayList<>();
|
||||||
command.add("tesseract");
|
command.add("tesseract");
|
||||||
command.add(imagePath.toString());
|
command.add(imagePath.toString());
|
||||||
command.add(
|
command.add(
|
||||||
tempOutputDir
|
new File(tempOutputDir, String.format("page_%d", pageNum))
|
||||||
.resolve(String.format("page_%d", pageNum))
|
|
||||||
.toString());
|
.toString());
|
||||||
command.add("-l");
|
command.add("-l");
|
||||||
command.add(String.join("+", languages));
|
command.add(String.join("+", languages));
|
||||||
// Always output PDF
|
// Always output PDF
|
||||||
command.add("pdf");
|
command.add("pdf");
|
||||||
ProcessBuilder pb = new ProcessBuilder(command);
|
|
||||||
process = pb.start();
|
// Use ProcessExecutor to run tesseract command
|
||||||
// Capture any error output
|
try {
|
||||||
try (BufferedReader reader =
|
ProcessExecutorResult result = ProcessExecutor.getInstance(ProcessExecutor.Processes.TESSERACT)
|
||||||
new BufferedReader(
|
.runCommandWithOutputHandling(command);
|
||||||
new InputStreamReader(process.getErrorStream()))) {
|
|
||||||
String line;
|
log.debug("Tesseract OCR completed for page {} with exit code {}",
|
||||||
while ((line = BoundedLineReader.readLine(reader, 5_000_000)) != null) {
|
pageNum, result.getRc());
|
||||||
log.debug("Tesseract: {}", line);
|
|
||||||
|
// Add OCR'd PDF to merger
|
||||||
|
merger.addSource(pageOutputPath);
|
||||||
|
} catch (IOException | InterruptedException e) {
|
||||||
|
log.error("Error processing page {} with tesseract: {}", pageNum, e.getMessage());
|
||||||
|
// If OCR fails, fall back to the original page
|
||||||
|
try (PDDocument pageDoc = new PDDocument()) {
|
||||||
|
pageDoc.addPage(page);
|
||||||
|
pageDoc.save(pageOutputPath);
|
||||||
|
merger.addSource(pageOutputPath);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
int exitCode = process.waitFor();
|
|
||||||
if (exitCode != 0) {
|
|
||||||
throw new RuntimeException(
|
|
||||||
"Tesseract failed with exit code: " + exitCode);
|
|
||||||
}
|
|
||||||
// Add OCR'd PDF to merger
|
|
||||||
merger.addSource(pageOutputPath.toFile());
|
|
||||||
} else {
|
} else {
|
||||||
// Save original page without OCR
|
// Save original page without OCR
|
||||||
try (PDDocument pageDoc = new PDDocument()) {
|
try (PDDocument pageDoc = new PDDocument()) {
|
||||||
pageDoc.addPage(page);
|
pageDoc.addPage(page);
|
||||||
pageDoc.save(pageOutputPath.toFile());
|
pageDoc.save(pageOutputPath);
|
||||||
merger.addSource(pageOutputPath.toFile());
|
merger.addSource(pageOutputPath);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Merge all pages into final PDF
|
// Merge all pages into final PDF
|
||||||
merger.mergeDocuments(null);
|
merger.mergeDocuments(null);
|
||||||
|
|
||||||
// Read the final PDF file
|
// Read the final PDF file
|
||||||
byte[] pdfContent = Files.readAllBytes(finalOutputFile);
|
byte[] pdfContent = java.nio.file.Files.readAllBytes(finalOutputFile.toPath());
|
||||||
String outputFilename =
|
String outputFilename =
|
||||||
Filenames.toSimpleFileName(inputFile.getOriginalFilename())
|
Filenames.toSimpleFileName(inputFile.getOriginalFilename())
|
||||||
.replaceFirst("[.][^.]+$", "")
|
.replaceFirst("[.][^.]+$", "")
|
||||||
+ "_OCR.pdf";
|
+ "_OCR.pdf";
|
||||||
|
|
||||||
return ResponseEntity.ok()
|
return ResponseEntity.ok()
|
||||||
.header(
|
.header(
|
||||||
"Content-Disposition",
|
"Content-Disposition",
|
||||||
@ -167,14 +185,11 @@ public class OCRController {
|
|||||||
.contentType(MediaType.APPLICATION_PDF)
|
.contentType(MediaType.APPLICATION_PDF)
|
||||||
.body(pdfContent);
|
.body(pdfContent);
|
||||||
} finally {
|
} finally {
|
||||||
if (process != null) {
|
// Clean up the temp directory and all its contents
|
||||||
process.destroy();
|
tempFileManager.deleteTempDirectory(tempDirPath);
|
||||||
}
|
|
||||||
// Clean up temporary files
|
|
||||||
deleteDirectory(tempDir);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void addFileToZip(File file, String filename, ZipOutputStream zipOut)
|
private void addFileToZip(File file, String filename, ZipOutputStream zipOut)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
if (!file.exists()) {
|
if (!file.exists()) {
|
||||||
@ -192,21 +207,4 @@ public class OCRController {
|
|||||||
zipOut.closeEntry();
|
zipOut.closeEntry();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
private void deleteDirectory(Path directory) {
|
|
||||||
try {
|
|
||||||
Files.walk(directory)
|
|
||||||
.sorted(Comparator.reverseOrder())
|
|
||||||
.forEach(
|
|
||||||
path -> {
|
|
||||||
try {
|
|
||||||
Files.delete(path);
|
|
||||||
} catch (IOException e) {
|
|
||||||
log.error("Error deleting {}: {}", path, e.getMessage());
|
|
||||||
}
|
|
||||||
});
|
|
||||||
} catch (IOException e) {
|
|
||||||
log.error("Error walking directory {}: {}", directory, e.getMessage());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
@ -44,4 +44,7 @@ springdoc.swagger-ui.path=/index.html
|
|||||||
posthog.api.key=phc_fiR65u5j6qmXTYL56MNrLZSWqLaDW74OrZH0Insd2xq
|
posthog.api.key=phc_fiR65u5j6qmXTYL56MNrLZSWqLaDW74OrZH0Insd2xq
|
||||||
posthog.host=https://eu.i.posthog.com
|
posthog.host=https://eu.i.posthog.com
|
||||||
|
|
||||||
spring.main.allow-bean-definition-overriding=true
|
spring.main.allow-bean-definition-overriding=true
|
||||||
|
|
||||||
|
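# Set up a consistent temporary directory location (defaults to <java.io.tmpdir>/stirling-pdf).
# NOTE(review): this defines a Spring property only; it presumably does not change the JVM's
# java.io.tmpdir system property, which normally takes precedence on lookup - confirm intent.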
|
||||||
|
java.io.tmpdir=${stirling.tempfiles.directory:${java.io.tmpdir}/stirling-pdf}
|
Loading…
Reference in New Issue
Block a user