Memory enhancements and PDF decompress API (#3129)

# Description of Changes - PDF split by size now checks the size of the PDF as it splits; this avoids an issue where a PDF's size differs when viewed vs. saved, due to compression caused by repeated data etc. - Additionally, memory enhancements for PDF loading: dynamically choose between loading in memory and using scratch files - PDF Decompress API for PDF testing ## Checklist ### General - [ ] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md) - [ ] I have read the [Stirling-PDF Developer Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md) (if applicable) - [ ] I have read the [How to add new languages to Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md) (if applicable) - [ ] I have performed a self-review of my own code - [ ] My changes generate no new warnings ### Documentation - [ ] I have updated relevant docs on [Stirling-PDF's doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) (if functionality has heavily changed) - [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only) ### UI Changes (if applicable) - [ ] Screenshots or videos demonstrating the UI changes are attached (e.g., as comments or direct attachments in the PR) ### Testing (if applicable) - [ ] I have tested my changes locally. Refer to the [Testing Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md#6-testing) for more details.
2026-04-06 03:19:39 +02:00 · 2025-03-08 00:03:27 +00:00
parent 33eb3fd034
commit ed2ef01690
43 changed files with 1042 additions and 321 deletions
--- a/src/main/java/stirling/software/SPDF/service/CustomPDDocumentFactory.java
+++ b/src/main/java/stirling/software/SPDF/service/CustomPDDocumentFactory.java
@@ -4,142 +4,355 @@ import java.io.ByteArrayOutputStream;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
+import java.util.concurrent.atomic.AtomicLong;

 import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.io.IOUtils;
+import org.apache.pdfbox.io.MemoryUsageSetting;
+import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
+import org.apache.pdfbox.io.RandomAccessStreamCache.StreamCacheCreateFunction;
+import org.apache.pdfbox.io.ScratchFile;
 import org.apache.pdfbox.pdmodel.PDDocument;
-import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.stereotype.Component;
 import org.springframework.web.multipart.MultipartFile;

 import lombok.extern.slf4j.Slf4j;

-import stirling.software.SPDF.model.PdfMetadata;
 import stirling.software.SPDF.model.api.PDFFile;

+/**
+ * Adaptive PDF document factory that optimizes memory usage based on file size and available system
+ * resources.
+ */
@Component
@Slf4j
 public class CustomPDDocumentFactory {

    private final PdfMetadataService pdfMetadataService;

-    @Autowired
+    // Memory thresholds and limits. Each explanatory comment below refers to the constant
+    // declared directly above it.
+
+    private static final long SMALL_FILE_THRESHOLD = 10 * 1024 * 1024; // 10 MB
+    // Content smaller than this threshold is loaded with IOUtils.createMemoryOnlyStreamCache(),
+    // unless the low-memory check in loadAdaptively forces a temp-file-only cache.
+    // Byte arrays at or above this threshold are written to a temp file before being parsed
+    // (see loadFromBytes).
+
+    private static final long LARGE_FILE_THRESHOLD = 50 * 1024 * 1024; // 50 MB
+    // Content between SMALL and LARGE thresholds uses a ScratchFile configured with
+    // MemoryUsageSetting.setupMixed(LARGE_FILE_USAGE).
+    // Content at or above this threshold uses a ScratchFile configured with
+    // MemoryUsageSetting.setupTempFileOnly().
+
+    private static final long LARGE_FILE_USAGE = 10 * 1024 * 1024;
+    // 10 MB; the value handed to MemoryUsageSetting.setupMixed(...) for medium-sized content.
+
+    private static final long EXTREMELY_LARGE_THRESHOLD = 100 * 1024 * 1024; // 100 MB
+    // Files at or above this threshold are opened through an explicit
+    // RandomAccessReadBufferedFile in loadFromFile. Because this threshold is above
+    // LARGE_FILE_THRESHOLD, such files always get the temp-file-only cache.
+    // The intent stated by the original author: trade some performance for a smaller memory
+    // footprint so that extremely large PDFs do not cause OutOfMemoryErrors, at the cost of
+    // being more I/O bound.
+
+    private static final double MIN_FREE_MEMORY_PERCENTAGE = 30.0; // 30%
+    private static final long MIN_FREE_MEMORY_BYTES = 4L * 1024 * 1024 * 1024; // 4 GB
+    // Low-memory guards used by loadAdaptively: if free heap (max heap minus used heap) is below
+    // either value, the temp-file-only cache is used regardless of content size.
+    // NOTE(review): free heap can never exceed max heap, so with a max heap below 4 GB the byte
+    // check is always true and the memory-only / mixed strategies are never chosen - confirm
+    // this is intended.
+
+    // Counter appended to temp file / directory name prefixes (see createTempFile and
+    // createTempDirectory)
+    private static final AtomicLong tempCounter = new AtomicLong(0);
+
+    /** Creates the factory with the metadata service that is applied to documents it produces. */
    public CustomPDDocumentFactory(PdfMetadataService pdfMetadataService) {
        this.pdfMetadataService = pdfMetadataService;
    }

-    public PDDocument createNewDocument() throws IOException {
-        PDDocument document = new PDDocument();
-        pdfMetadataService.setMetadataToPdf(document, PdfMetadata.builder().build(), true);
+    /**
+     * Loads a PDF document from a file on disk. The file length is measured once and handed,
+     * together with the file, to the adaptive loader which picks the caching strategy.
+     *
+     * @param file the PDF file to open; must not be null
+     * @return the loaded document
+     * @throws IllegalArgumentException if {@code file} is null
+     * @throws IOException if the document cannot be loaded
+     */
+    public PDDocument load(File file) throws IOException {
+        if (file == null) {
+            throw new IllegalArgumentException("File cannot be null");
+        }
+
+        final long sizeInBytes = file.length();
+        final long sizeInMegabytes = sizeInBytes / (1024 * 1024);
+        log.info("Loading PDF from file, size: {}MB", sizeInMegabytes);
+
+        return loadAdaptively(file, sizeInBytes);
+    }
+
+    /**
+     * Loads a PDF document held in a byte array, delegating the choice of caching strategy to the
+     * adaptive loader.
+     *
+     * @param input the raw PDF bytes; must not be null
+     * @return the loaded document
+     * @throws IllegalArgumentException if {@code input} is null
+     * @throws IOException if the document cannot be loaded
+     */
+    public PDDocument load(byte[] input) throws IOException {
+        if (input == null) {
+            throw new IllegalArgumentException("Input bytes cannot be null");
+        }
+
+        final long sizeInBytes = input.length;
+        final long sizeInMegabytes = sizeInBytes / (1024 * 1024);
+        log.info("Loading PDF from byte array, size: {}MB", sizeInMegabytes);
+
+        return loadAdaptively(input, sizeInBytes);
+    }
+
+    /**
+     * Load a PDF from InputStream with automatic optimization.
+     *
+     * <p>The stream length is not known up front, so the content is first copied to a temp file
+     * and then loaded through the file-based path. This method does not close {@code input}.
+     *
+     * <p>NOTE(review): when loading succeeds the temp file is left in place, because the returned
+     * document is loaded from it; nothing in this method removes it later - confirm who is
+     * expected to clean it up.
+     *
+     * @param input stream supplying the PDF data; must not be null
+     * @return the loaded document
+     * @throws IllegalArgumentException if {@code input} is null
+     * @throws IOException if copying the stream or loading the document fails
+     */
+    public PDDocument load(InputStream input) throws IOException {
+        if (input == null) {
+            throw new IllegalArgumentException("InputStream cannot be null");
+        }
+
+        // Since we don't know the size upfront, buffer to a temp file
+        Path tempFile = createTempFile("pdf-stream-");
+        try {
+            // Files.copy returns the number of bytes copied, so no second size lookup is needed
+            long copiedBytes = Files.copy(input, tempFile, StandardCopyOption.REPLACE_EXISTING);
+            return loadAdaptively(tempFile.toFile(), copiedBytes);
+        } catch (IOException | RuntimeException e) {
+            // Remove the temp file on every failure: the adaptive loader can also fail with
+            // unchecked exceptions (e.g. scratch-file initialization), not only IOException
+            cleanupFile(tempFile);
+            throw e;
+        }
+    }
+
+    /**
+     * Picks a stream-cache strategy from the current heap headroom and the content size, loads
+     * the document from the given source and then post-processes it (default metadata, security
+     * removal).
+     *
+     * <p>If post-processing fails, the freshly loaded document is closed before the exception is
+     * propagated so that its underlying resources are not leaked.
+     *
+     * @param source either a {@link File} or a {@code byte[]} containing the PDF
+     * @param contentSize size of the PDF content in bytes
+     * @return the loaded and post-processed document, still open
+     * @throws IllegalArgumentException if {@code source} is neither a File nor a byte array
+     * @throws IOException if loading or post-processing fails
+     */
+    private PDDocument loadAdaptively(Object source, long contentSize) throws IOException {
+        Runtime runtime = Runtime.getRuntime();
+        long maxMemory = runtime.maxMemory();
+        long usedMemory = runtime.totalMemory() - runtime.freeMemory();
+
+        // Headroom is measured against the maximum heap, not the currently committed heap
+        long actualFreeMemory = maxMemory - usedMemory;
+        double freeMemoryPercent = (double) actualFreeMemory / maxMemory * 100;
+
+        // Log memory status
+        log.info(
+                "Memory status - Free: {}MB ({}%), Used: {}MB, Max: {}MB",
+                actualFreeMemory / (1024 * 1024),
+                String.format("%.2f", freeMemoryPercent),
+                usedMemory / (1024 * 1024),
+                maxMemory / (1024 * 1024));
+
+        // Determine caching strategy based on both file size and available memory
+        StreamCacheCreateFunction cacheFunction;
+
+        // If free memory is critically low, always use file-based caching
+        if (freeMemoryPercent < MIN_FREE_MEMORY_PERCENTAGE
+                || actualFreeMemory < MIN_FREE_MEMORY_BYTES) {
+            log.info(
+                    "Low memory detected ({}%), forcing file-based cache",
+                    String.format("%.2f", freeMemoryPercent));
+            cacheFunction = createScratchFileCacheFunction(MemoryUsageSetting.setupTempFileOnly());
+        } else if (contentSize < SMALL_FILE_THRESHOLD) {
+            log.info("Using memory-only cache for small document ({}KB)", contentSize / 1024);
+            cacheFunction = IOUtils.createMemoryOnlyStreamCache();
+        } else if (contentSize < LARGE_FILE_THRESHOLD) {
+            // For medium files (10-50MB), use a mixed approach
+            log.info(
+                    "Using mixed memory/file cache for medium document ({}MB)",
+                    contentSize / (1024 * 1024));
+            cacheFunction =
+                    createScratchFileCacheFunction(MemoryUsageSetting.setupMixed(LARGE_FILE_USAGE));
+        } else {
+            log.info("Using file-based cache for large document");
+            cacheFunction = createScratchFileCacheFunction(MemoryUsageSetting.setupTempFileOnly());
+        }
+
+        PDDocument document;
+        if (source instanceof File file) {
+            document = loadFromFile(file, contentSize, cacheFunction);
+        } else if (source instanceof byte[] bytes) {
+            document = loadFromBytes(bytes, contentSize, cacheFunction);
+        } else {
+            // Guard the null case explicitly so the caller gets the documented exception type
+            throw new IllegalArgumentException(
+                    "Unsupported source type: " + (source == null ? "null" : source.getClass()));
+        }
+
+        try {
+            postProcessDocument(document);
+        } catch (IOException | RuntimeException e) {
+            // The caller never receives the document on failure, so it must be closed here
+            try {
+                document.close();
+            } catch (IOException closeFailure) {
+                e.addSuppressed(closeFailure);
+            }
+            throw e;
+        }
        return document;
    }

+    /**
+     * Returns a stream-cache factory that creates a new {@link ScratchFile} from the given
+     * settings each time it is invoked.
+     *
+     * @param settings memory usage configuration handed to every ScratchFile that is created
+     * @return factory wrapping ScratchFile creation; an IOException raised during creation is
+     *     rethrown as a RuntimeException with the original exception as cause
+     */
+    private StreamCacheCreateFunction createScratchFileCacheFunction(MemoryUsageSetting settings) {
+        return () -> openScratchFile(settings);
+    }
+
+    /** Creates a ScratchFile, converting the checked IOException into a RuntimeException. */
+    private static ScratchFile openScratchFile(MemoryUsageSetting settings) {
+        try {
+            return new ScratchFile(settings);
+        } catch (IOException ioe) {
+            throw new RuntimeException("ScratchFile initialization failed", ioe);
+        }
+    }
+
+    /**
+     * Applies the steps every freshly loaded document goes through: default metadata first, then
+     * removal of security settings for encrypted documents.
+     *
+     * @param loaded the document that was just loaded
+     * @throws IOException if either step fails
+     */
+    private void postProcessDocument(PDDocument loaded) throws IOException {
+        pdfMetadataService.setDefaultMetadata(loaded);
+        removePassword(loaded);
+    }
+
+    /**
+     * Loads a PDF from a file using the supplied stream cache and an empty password.
+     *
+     * <p>Files at or above {@code EXTREMELY_LARGE_THRESHOLD} are opened through an explicit
+     * {@link RandomAccessReadBufferedFile}; smaller files are handed to the loader directly.
+     *
+     * @param file the PDF file to parse
+     * @param size size of the file in bytes, used only to choose the loading path
+     * @param cache stream-cache factory the loader should use
+     * @return the parsed document
+     * @throws IOException if the file cannot be opened or parsed
+     */
+    private PDDocument loadFromFile(File file, long size, StreamCacheCreateFunction cache)
+            throws IOException {
+        if (size >= EXTREMELY_LARGE_THRESHOLD) {
+            log.info("Loading extremely large file via buffered access");
+            RandomAccessReadBufferedFile bufferedSource = new RandomAccessReadBufferedFile(file);
+            try {
+                return Loader.loadPDF(bufferedSource, "", null, null, cache);
+            } catch (IOException | RuntimeException e) {
+                // On failure no document is returned that could close the source later, so
+                // release the file handle here instead of leaking it
+                try {
+                    bufferedSource.close();
+                } catch (IOException closeFailure) {
+                    e.addSuppressed(closeFailure);
+                }
+                throw e;
+            }
+        }
+        return Loader.loadPDF(file, "", null, null, cache);
+    }
+
+    /**
+     * Loads a PDF held in a byte array using the supplied stream cache and an empty password.
+     *
+     * <p>Arrays below {@code SMALL_FILE_THRESHOLD} are parsed directly. Larger arrays are first
+     * written to a temp file and parsed from there; deletion of that temp file is attempted
+     * before this method returns, whether or not parsing succeeded.
+     *
+     * <p>NOTE(review): the temp file is deleted while the returned document may still read from
+     * it - presumably fine where open files can be unlinked, but the delete may fail on
+     * platforms that lock open files; confirm.
+     *
+     * @param bytes the raw PDF content
+     * @param size number of content bytes, used only to choose the loading path
+     * @param cache stream-cache factory the loader should use
+     * @return the parsed document
+     * @throws IOException if writing the temp file or parsing fails
+     */
+    private PDDocument loadFromBytes(byte[] bytes, long size, StreamCacheCreateFunction cache)
+            throws IOException {
+        if (size < SMALL_FILE_THRESHOLD) {
+            return Loader.loadPDF(bytes, "", null, null, cache);
+        }
+
+        log.info("Writing large byte array to temp file");
+        Path spooledCopy = createTempFile("pdf-bytes-");
+        try {
+            Files.write(spooledCopy, bytes);
+            File spooledFile = spooledCopy.toFile();
+            return Loader.loadPDF(spooledFile, "", null, null, cache);
+        } finally {
+            cleanupFile(spooledCopy);
+        }
+    }
+
+    /**
+     * Creates an empty document backed by a scratch file configured with the given settings and
+     * stamps it with the default metadata.
+     *
+     * @param settings memory usage configuration for the document's scratch file
+     * @return the new, empty document
+     * @throws IOException if applying the default metadata fails
+     */
+    public PDDocument createNewDocument(MemoryUsageSetting settings) throws IOException {
+        StreamCacheCreateFunction scratchCache = createScratchFileCacheFunction(settings);
+        PDDocument created = new PDDocument(scratchCache);
+        pdfMetadataService.setDefaultMetadata(created);
+        return created;
+    }
+
+    /**
+     * Creates an empty document using the temp-file-only memory setting.
+     *
+     * @return the new, empty document
+     * @throws IOException if document creation fails
+     */
+    public PDDocument createNewDocument() throws IOException {
+        MemoryUsageSetting tempFileOnly = MemoryUsageSetting.setupTempFileOnly();
+        return createNewDocument(tempFileOnly);
+    }
+
+    /**
+     * Serializes the document to a byte array. Documents with fewer than 10 pages are saved
+     * straight into an in-memory buffer; all others are saved to a temp file first and read back,
+     * after which deletion of the temp file is attempted.
+     *
+     * @param document the document to serialize; it is not closed by this method
+     * @return the saved PDF bytes
+     * @throws IOException if saving or reading back fails
+     */
+    public byte[] saveToBytes(PDDocument document) throws IOException {
+        boolean fewPages = document.getNumberOfPages() < 10; // Simple heuristic
+        if (!fewPages) {
+            Path spoolFile = createTempFile("pdf-save-");
+            try {
+                document.save(spoolFile.toFile());
+                return Files.readAllBytes(spoolFile);
+            } finally {
+                cleanupFile(spoolFile);
+            }
+        }
+
+        // Closing a ByteArrayOutputStream has no effect, so no try-with-resources is needed
+        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+        document.save(buffer);
+        return buffer.toByteArray();
+    }
+
+    /**
+     * Marks an encrypted document so that all security is removed; unencrypted documents are left
+     * untouched.
+     *
+     * @param document the document to process
+     * @throws IOException wrapping any exception raised while removing security
+     */
+    private void removePassword(PDDocument document) throws IOException {
+        if (!document.isEncrypted()) {
+            return;
+        }
+        try {
+            document.setAllSecurityToBeRemoved(true);
+        } catch (Exception e) {
+            log.error("Decryption failed", e);
+            throw new IOException("PDF decryption failed", e);
+        }
+    }
+
+    /**
+     * Creates a temp file whose name starts with the given prefix followed by the next value of
+     * the shared counter, and logs its path.
+     *
+     * @param prefix leading part of the temp file name
+     * @return path of the newly created file (suffix {@code .tmp})
+     * @throws IOException if the file cannot be created
+     */
+    private Path createTempFile(String prefix) throws IOException {
+        long sequence = tempCounter.incrementAndGet();
+        Path created = Files.createTempFile(prefix + sequence + "-", ".tmp");
+        log.info("Created temp file: {}", created);
+        return created;
+    }
+
+    /**
+     * Creates a temp directory whose name starts with the given prefix followed by the next value
+     * of the shared counter.
+     *
+     * @param prefix leading part of the directory name
+     * @return path of the newly created directory
+     * @throws IOException if the directory cannot be created
+     */
+    private Path createTempDirectory(String prefix) throws IOException {
+        String uniquePrefix = prefix + tempCounter.incrementAndGet() + "-";
+        return Files.createTempDirectory(uniquePrefix);
+    }
+
+    /**
+     * Best-effort removal of a temporary file. Never throws: a failed delete is logged as a
+     * warning so leftover temp files are visible in the logs, and a null path is ignored.
+     *
+     * @param file path of the temp file to delete; may be null
+     */
+    private void cleanupFile(Path file) {
+        if (file == null) {
+            return;
+        }
+        try {
+            if (Files.deleteIfExists(file)) {
+                log.info("Deleted temp file: {}", file);
+            }
+        } catch (IOException e) {
+            // A failed delete leaves data on disk, so report it above info level
+            log.warn("Error deleting temp file {}", file, e);
+        }
+    }
+
+    /**
+     * Create new document bytes based on an existing document.
+     *
+     * <p>The bytes are loaded through {@code load(byte[])} and re-serialized with {@code
+     * saveToBytes}; the intermediate document is closed before this method returns.
+     */
    public byte[] createNewBytesBasedOnOldDocument(byte[] oldDocument) throws IOException {
-        PDDocument document = Loader.loadPDF(oldDocument);
-        return createNewBytesBasedOnOldDocument(document);
+        try (PDDocument document = load(oldDocument)) {
+            return saveToBytes(document);
+        }
    }

+    /**
+     * Create new document bytes based on an existing document file.
+     *
+     * <p>The file is loaded through {@code load(File)} and re-serialized with {@code
+     * saveToBytes}; the intermediate document is closed before this method returns.
+     */
    public byte[] createNewBytesBasedOnOldDocument(File oldDocument) throws IOException {
-        PDDocument document = Loader.loadPDF(oldDocument);
-        return createNewBytesBasedOnOldDocument(document);
+        try (PDDocument document = load(oldDocument)) {
+            return saveToBytes(document);
+        }
    }

+    /**
+     * Create new document bytes based on an existing PDDocument.
+     *
+     * <p>The document's own metadata is extracted and written back through the metadata service
+     * before the document is serialized with {@code saveToBytes}.
+     *
+     * <p>Unlike the code it replaces, this version does not close {@code oldDocument}; closing it
+     * is left to the caller.
+     */
    public byte[] createNewBytesBasedOnOldDocument(PDDocument oldDocument) throws IOException {
        pdfMetadataService.setMetadataToPdf(
                oldDocument, pdfMetadataService.extractMetadataFromPdf(oldDocument), true);
-
-        ByteArrayOutputStream baos = new ByteArrayOutputStream();
-        oldDocument.save(baos);
-        oldDocument.close();
-        return baos.toByteArray();
+        return saveToBytes(oldDocument);
    }

+    /**
+     * Create a new document based on an existing document bytes.
+     *
+     * <p>The source bytes are loaded only to derive the new document from them; the loaded source
+     * document is closed before this method returns, while the returned document stays open.
+     */
    public PDDocument createNewDocumentBasedOnOldDocument(byte[] oldDocument) throws IOException {
-        PDDocument document = Loader.loadPDF(oldDocument);
-        return createNewDocumentBasedOnOldDocument(document);
+        try (PDDocument document = load(oldDocument)) {
+            return createNewDocumentBasedOnOldDocument(document);
+        }
    }

+    /**
+     * Create a new document based on an existing document file.
+     *
+     * <p>The source file is loaded only to derive the new document from it; the loaded source
+     * document is closed before this method returns, while the returned document stays open.
+     */
    public PDDocument createNewDocumentBasedOnOldDocument(File oldDocument) throws IOException {
-        PDDocument document = Loader.loadPDF(oldDocument);
-        return createNewDocumentBasedOnOldDocument(document);
+        try (PDDocument document = load(oldDocument)) {
+            return createNewDocumentBasedOnOldDocument(document);
+        }
    }

+    /**
+     * Create a new document based on an existing PDDocument.
+     *
+     * <p>A fresh empty document is created via {@code createNewDocument()} and the metadata
+     * extracted from {@code oldDocument} is written to it. No other content is carried over by
+     * this method, and {@code oldDocument} is not closed here.
+     */
    public PDDocument createNewDocumentBasedOnOldDocument(PDDocument oldDocument)
            throws IOException {
-        PDDocument document = new PDDocument();
+        PDDocument document = createNewDocument();
        pdfMetadataService.setMetadataToPdf(
                document, pdfMetadataService.extractMetadataFromPdf(oldDocument), true);
        return document;
    }

+    /**
+     * Load document from a file and convert it to bytes.
+     *
+     * <p>The document is loaded through {@code load(File)}, serialized with {@code saveToBytes}
+     * and closed before this method returns.
+     */
    public byte[] loadToBytes(File file) throws IOException {
-        PDDocument document = load(file);
-        ByteArrayOutputStream baos = new ByteArrayOutputStream();
-        document.save(baos);
-        // Close the document
-        document.close();
-        return baos.toByteArray();
+        try (PDDocument document = load(file)) {
+            return saveToBytes(document);
+        }
    }

+    /**
+     * Load document from bytes and convert it back to bytes.
+     *
+     * <p>The document is loaded through {@code load(byte[])}, serialized with {@code
+     * saveToBytes} and closed before this method returns.
+     */
    public byte[] loadToBytes(byte[] bytes) throws IOException {
-        PDDocument document = load(bytes);
-        ByteArrayOutputStream baos = new ByteArrayOutputStream();
-        document.save(baos);
-        // Close the document
-        document.close();
-        return baos.toByteArray();
-    }
-
-    // if loading from a file, assume the file has been made with Stirling-PDF
-    public PDDocument load(File file) throws IOException {
-        PDDocument document = Loader.loadPDF(file);
-        pdfMetadataService.setMetadataToPdf(document, PdfMetadata.builder().build(), true);
-        return document;
-    }
-
-    public PDDocument load(InputStream input) throws IOException {
-        return load(input.readAllBytes());
-    }
-
-    public PDDocument load(byte[] input) throws IOException {
-        PDDocument document = Loader.loadPDF(input);
-        pdfMetadataService.setDefaultMetadata(document);
-        removezeropassword(document);
-        return document;
-    }
-
-    public PDDocument load(PDFFile pdfFile) throws IOException {
-        return load(pdfFile.getFileInput());
-    }
-
-    public PDDocument load(MultipartFile pdfFile) throws IOException {
-        return load(pdfFile.getBytes());
+        try (PDDocument document = load(bytes)) {
+            return saveToBytes(document);
+        }
    }

+    /** Load from a file path string; the path is wrapped in a File and passed to load(File) */
    public PDDocument load(String path) throws IOException {
        return load(new File(path));
    }

+    /**
+     * Load from a PDFFile object by passing the value of {@code getFileInput()} to the matching
+     * {@code load} overload.
+     */
+    public PDDocument load(PDFFile pdfFile) throws IOException {
+        return load(pdfFile.getFileInput());
+    }
+
+    /**
+     * Load from a MultipartFile. The complete upload is read into a byte array first and then
+     * loaded through {@code load(byte[])}.
+     *
+     * @param pdfFile the uploaded PDF
+     * @return the loaded document
+     * @throws IOException if reading the upload or loading the document fails
+     */
+    public PDDocument load(MultipartFile pdfFile) throws IOException {
+        byte[] uploadedBytes = pdfFile.getBytes();
+        return load(uploadedBytes);
+    }
+
+    /**
+     * Load with password from MultipartFile. The upload is read fully into memory and handed to
+     * the private byte-array/password overload.
+     */
    public PDDocument load(MultipartFile fileInput, String password) throws IOException {
        return load(fileInput.getBytes(), password);
    }

+    /**
+     * Load with password from byte array.
+     *
+     * <p>This path bypasses {@code loadAdaptively}: no stream-cache strategy is selected and
+     * {@code removePassword} is not applied; only the default metadata is set on the result.
+     */
    private PDDocument load(byte[] bytes, String password) throws IOException {
+        // Since we don't have direct password support in the adaptive loader,
+        // we'll need to use PDFBox's Loader directly
        PDDocument document = Loader.loadPDF(bytes, password);
        pdfMetadataService.setDefaultMetadata(document);
        return document;
    }
-
-    private PDDocument removezeropassword(PDDocument document) throws IOException {
-        if (document.isEncrypted()) {
-            try {
-                log.info("Removing security from the source document");
-                document.setAllSecurityToBeRemoved(true);
-            } catch (Exception e) {
-                log.warn("Cannot decrypt the pdf");
-            }
-        }
-        return document;
-    }
-
-    // Add other load methods as needed, following the same pattern
 }