mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2026-04-06 03:19:39 +02:00
Memory enhancements and PDF decompress API (#3129)
# Description of Changes - PDF split by size to check size of PDF as it splits, avoids issue were a PDFs size is different viewed vs saved due to compression caused by repeated data etc. - Additionally memory enhancements for PDF load to dynamically load in memory vs scratch - PDF Decompress API for PDF testing ## Checklist ### General - [ ] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md) - [ ] I have read the [Stirling-PDF Developer Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md) (if applicable) - [ ] I have read the [How to add new languages to Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md) (if applicable) - [ ] I have performed a self-review of my own code - [ ] My changes generate no new warnings ### Documentation - [ ] I have updated relevant docs on [Stirling-PDF's doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) (if functionality has heavily changed) - [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only) ### UI Changes (if applicable) - [ ] Screenshots or videos demonstrating the UI changes are attached (e.g., as comments or direct attachments in the PR) ### Testing (if applicable) - [ ] I have tested my changes locally. Refer to the [Testing Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md#6-testing) for more details.
This commit is contained in:
@@ -4,142 +4,355 @@ import java.io.ByteArrayOutputStream;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.io.IOUtils;
|
||||
import org.apache.pdfbox.io.MemoryUsageSetting;
|
||||
import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
|
||||
import org.apache.pdfbox.io.RandomAccessStreamCache.StreamCacheCreateFunction;
|
||||
import org.apache.pdfbox.io.ScratchFile;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.stereotype.Component;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import stirling.software.SPDF.model.PdfMetadata;
|
||||
import stirling.software.SPDF.model.api.PDFFile;
|
||||
|
||||
/**
|
||||
* Adaptive PDF document factory that optimizes memory usage based on file size and available system
|
||||
* resources.
|
||||
*/
|
||||
@Component
|
||||
@Slf4j
|
||||
public class CustomPDDocumentFactory {
|
||||
|
||||
private final PdfMetadataService pdfMetadataService;
|
||||
|
||||
@Autowired
|
||||
// Memory thresholds and limits
|
||||
|
||||
private static final long SMALL_FILE_THRESHOLD = 10 * 1024 * 1024; // 10 MB
|
||||
// Files smaller than this threshold are loaded entirely in memory for better performance.
|
||||
// These files use IOUtils.createMemoryOnlyStreamCache() which keeps all document data in RAM.
|
||||
// No temp files are created for document data, reducing I/O operations but consuming more
|
||||
// memory.
|
||||
|
||||
private static final long LARGE_FILE_THRESHOLD = 50 * 1024 * 1024; // 50 MB
|
||||
// Files between SMALL and LARGE thresholds use file-based caching with ScratchFile,
|
||||
// but are loaded directly from byte arrays if provided that way.
|
||||
// When loading from byte arrays, once size exceeds this threshold, bytes are first
|
||||
// written to temp files before loading to reduce memory pressure.
|
||||
|
||||
private static final long LARGE_FILE_USAGE = 10 * 1024 * 1024;
|
||||
|
||||
private static final long EXTREMELY_LARGE_THRESHOLD = 100 * 1024 * 1024; // 100 MB
|
||||
// Files exceeding this threshold use specialized loading with RandomAccessReadBufferedFile
|
||||
// which provides buffered access to the file without loading the entire content at once.
|
||||
// These files are always processed using file-based caching with minimal memory footprint,
|
||||
// trading some performance for significantly reduced memory usage.
|
||||
// For extremely large PDFs, this prevents OutOfMemoryErrors at the cost of being more I/O
|
||||
// bound.
|
||||
|
||||
private static final double MIN_FREE_MEMORY_PERCENTAGE = 30.0; // 30%
|
||||
private static final long MIN_FREE_MEMORY_BYTES = 4L * 1024 * 1024 * 1024; // 4 GB
|
||||
|
||||
// Counter for tracking temporary resources
|
||||
private static final AtomicLong tempCounter = new AtomicLong(0);
|
||||
|
||||
public CustomPDDocumentFactory(PdfMetadataService pdfMetadataService) {
|
||||
this.pdfMetadataService = pdfMetadataService;
|
||||
}
|
||||
|
||||
public PDDocument createNewDocument() throws IOException {
|
||||
PDDocument document = new PDDocument();
|
||||
pdfMetadataService.setMetadataToPdf(document, PdfMetadata.builder().build(), true);
|
||||
/**
|
||||
* Main entry point for loading a PDF document from a file. Automatically selects the most
|
||||
* appropriate loading strategy.
|
||||
*/
|
||||
public PDDocument load(File file) throws IOException {
|
||||
if (file == null) {
|
||||
throw new IllegalArgumentException("File cannot be null");
|
||||
}
|
||||
|
||||
long fileSize = file.length();
|
||||
log.info("Loading PDF from file, size: {}MB", fileSize / (1024 * 1024));
|
||||
|
||||
return loadAdaptively(file, fileSize);
|
||||
}
|
||||
|
||||
/** Load a PDF from byte array with automatic optimization. */
|
||||
public PDDocument load(byte[] input) throws IOException {
|
||||
if (input == null) {
|
||||
throw new IllegalArgumentException("Input bytes cannot be null");
|
||||
}
|
||||
|
||||
long dataSize = input.length;
|
||||
log.info("Loading PDF from byte array, size: {}MB", dataSize / (1024 * 1024));
|
||||
|
||||
return loadAdaptively(input, dataSize);
|
||||
}
|
||||
|
||||
/** Load a PDF from InputStream with automatic optimization. */
|
||||
public PDDocument load(InputStream input) throws IOException {
|
||||
if (input == null) {
|
||||
throw new IllegalArgumentException("InputStream cannot be null");
|
||||
}
|
||||
|
||||
// Since we don't know the size upfront, buffer to a temp file
|
||||
Path tempFile = createTempFile("pdf-stream-");
|
||||
try {
|
||||
Files.copy(input, tempFile, StandardCopyOption.REPLACE_EXISTING);
|
||||
return loadAdaptively(tempFile.toFile(), Files.size(tempFile));
|
||||
} catch (IOException e) {
|
||||
cleanupFile(tempFile);
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
private PDDocument loadAdaptively(Object source, long contentSize) throws IOException {
|
||||
long maxMemory = Runtime.getRuntime().maxMemory();
|
||||
long freeMemory = Runtime.getRuntime().freeMemory();
|
||||
long totalMemory = Runtime.getRuntime().totalMemory();
|
||||
long usedMemory = totalMemory - freeMemory;
|
||||
|
||||
// Calculate percentage of free memory
|
||||
double freeMemoryPercent = (double) (maxMemory - usedMemory) / maxMemory * 100;
|
||||
long actualFreeMemory = maxMemory - usedMemory;
|
||||
|
||||
// Log memory status
|
||||
log.info(
|
||||
"Memory status - Free: {}MB ({}%), Used: {}MB, Max: {}MB",
|
||||
actualFreeMemory / (1024 * 1024),
|
||||
String.format("%.2f", freeMemoryPercent),
|
||||
usedMemory / (1024 * 1024),
|
||||
maxMemory / (1024 * 1024));
|
||||
|
||||
// Determine caching strategy based on both file size and available memory
|
||||
StreamCacheCreateFunction cacheFunction;
|
||||
|
||||
// If free memory is critically low, always use file-based caching
|
||||
// In loadAdaptively method, replace current caching strategy decision with:
|
||||
if (freeMemoryPercent < MIN_FREE_MEMORY_PERCENTAGE
|
||||
|| actualFreeMemory < MIN_FREE_MEMORY_BYTES) {
|
||||
log.info(
|
||||
"Low memory detected ({}%), forcing file-based cache",
|
||||
String.format("%.2f", freeMemoryPercent));
|
||||
cacheFunction = createScratchFileCacheFunction(MemoryUsageSetting.setupTempFileOnly());
|
||||
} else if (contentSize < SMALL_FILE_THRESHOLD) {
|
||||
log.info("Using memory-only cache for small document ({}KB)", contentSize / 1024);
|
||||
cacheFunction = IOUtils.createMemoryOnlyStreamCache();
|
||||
} else if (contentSize < LARGE_FILE_THRESHOLD) {
|
||||
// For medium files (10-50MB), use a mixed approach
|
||||
log.info(
|
||||
"Using mixed memory/file cache for medium document ({}MB)",
|
||||
contentSize / (1024 * 1024));
|
||||
cacheFunction =
|
||||
createScratchFileCacheFunction(MemoryUsageSetting.setupMixed(LARGE_FILE_USAGE));
|
||||
} else {
|
||||
log.info("Using file-based cache for large document");
|
||||
cacheFunction = createScratchFileCacheFunction(MemoryUsageSetting.setupTempFileOnly());
|
||||
}
|
||||
|
||||
PDDocument document;
|
||||
if (source instanceof File file) {
|
||||
document = loadFromFile(file, contentSize, cacheFunction);
|
||||
} else if (source instanceof byte[] bytes) {
|
||||
document = loadFromBytes(bytes, contentSize, cacheFunction);
|
||||
} else {
|
||||
throw new IllegalArgumentException("Unsupported source type: " + source.getClass());
|
||||
}
|
||||
|
||||
postProcessDocument(document);
|
||||
return document;
|
||||
}
|
||||
|
||||
private StreamCacheCreateFunction createScratchFileCacheFunction(MemoryUsageSetting settings) {
|
||||
return () -> {
|
||||
try {
|
||||
return new ScratchFile(settings);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("ScratchFile initialization failed", e);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
private void postProcessDocument(PDDocument doc) throws IOException {
|
||||
pdfMetadataService.setDefaultMetadata(doc);
|
||||
removePassword(doc);
|
||||
}
|
||||
|
||||
private PDDocument loadFromFile(File file, long size, StreamCacheCreateFunction cache)
|
||||
throws IOException {
|
||||
if (size >= EXTREMELY_LARGE_THRESHOLD) {
|
||||
log.info("Loading extremely large file via buffered access");
|
||||
return Loader.loadPDF(new RandomAccessReadBufferedFile(file), "", null, null, cache);
|
||||
}
|
||||
return Loader.loadPDF(file, "", null, null, cache);
|
||||
}
|
||||
|
||||
private PDDocument loadFromBytes(byte[] bytes, long size, StreamCacheCreateFunction cache)
|
||||
throws IOException {
|
||||
if (size >= SMALL_FILE_THRESHOLD) {
|
||||
log.info("Writing large byte array to temp file");
|
||||
Path tempFile = createTempFile("pdf-bytes-");
|
||||
try {
|
||||
Files.write(tempFile, bytes);
|
||||
return Loader.loadPDF(tempFile.toFile(), "", null, null, cache);
|
||||
} finally {
|
||||
cleanupFile(tempFile);
|
||||
}
|
||||
}
|
||||
return Loader.loadPDF(bytes, "", null, null, cache);
|
||||
}
|
||||
|
||||
public PDDocument createNewDocument(MemoryUsageSetting settings) throws IOException {
|
||||
PDDocument doc = new PDDocument(createScratchFileCacheFunction(settings));
|
||||
pdfMetadataService.setDefaultMetadata(doc);
|
||||
return doc;
|
||||
}
|
||||
|
||||
public PDDocument createNewDocument() throws IOException {
|
||||
return createNewDocument(MemoryUsageSetting.setupTempFileOnly());
|
||||
}
|
||||
|
||||
public byte[] saveToBytes(PDDocument document) throws IOException {
|
||||
if (document.getNumberOfPages() < 10) { // Simple heuristic
|
||||
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
|
||||
document.save(baos);
|
||||
return baos.toByteArray();
|
||||
}
|
||||
} else {
|
||||
Path tempFile = createTempFile("pdf-save-");
|
||||
try {
|
||||
document.save(tempFile.toFile());
|
||||
return Files.readAllBytes(tempFile);
|
||||
} finally {
|
||||
cleanupFile(tempFile);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Improved password handling
|
||||
private void removePassword(PDDocument document) throws IOException {
|
||||
if (document.isEncrypted()) {
|
||||
try {
|
||||
document.setAllSecurityToBeRemoved(true);
|
||||
} catch (Exception e) {
|
||||
log.error("Decryption failed", e);
|
||||
throw new IOException("PDF decryption failed", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Temp file handling with enhanced logging
|
||||
private Path createTempFile(String prefix) throws IOException {
|
||||
Path file = Files.createTempFile(prefix + tempCounter.incrementAndGet() + "-", ".tmp");
|
||||
log.info("Created temp file: {}", file);
|
||||
return file;
|
||||
}
|
||||
|
||||
/** Create a uniquely named temporary directory */
|
||||
private Path createTempDirectory(String prefix) throws IOException {
|
||||
return Files.createTempDirectory(prefix + tempCounter.incrementAndGet() + "-");
|
||||
}
|
||||
|
||||
/** Clean up a temporary file */
|
||||
private void cleanupFile(Path file) {
|
||||
try {
|
||||
if (Files.deleteIfExists(file)) {
|
||||
log.info("Deleted temp file: {}", file);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
log.info("Error deleting temp file {}", file, e);
|
||||
}
|
||||
}
|
||||
|
||||
/** Create new document bytes based on an existing document */
|
||||
public byte[] createNewBytesBasedOnOldDocument(byte[] oldDocument) throws IOException {
|
||||
PDDocument document = Loader.loadPDF(oldDocument);
|
||||
return createNewBytesBasedOnOldDocument(document);
|
||||
try (PDDocument document = load(oldDocument)) {
|
||||
return saveToBytes(document);
|
||||
}
|
||||
}
|
||||
|
||||
/** Create new document bytes based on an existing document file */
|
||||
public byte[] createNewBytesBasedOnOldDocument(File oldDocument) throws IOException {
|
||||
PDDocument document = Loader.loadPDF(oldDocument);
|
||||
return createNewBytesBasedOnOldDocument(document);
|
||||
try (PDDocument document = load(oldDocument)) {
|
||||
return saveToBytes(document);
|
||||
}
|
||||
}
|
||||
|
||||
/** Create new document bytes based on an existing PDDocument */
|
||||
public byte[] createNewBytesBasedOnOldDocument(PDDocument oldDocument) throws IOException {
|
||||
pdfMetadataService.setMetadataToPdf(
|
||||
oldDocument, pdfMetadataService.extractMetadataFromPdf(oldDocument), true);
|
||||
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
oldDocument.save(baos);
|
||||
oldDocument.close();
|
||||
return baos.toByteArray();
|
||||
return saveToBytes(oldDocument);
|
||||
}
|
||||
|
||||
/** Create a new document based on an existing document bytes */
|
||||
public PDDocument createNewDocumentBasedOnOldDocument(byte[] oldDocument) throws IOException {
|
||||
PDDocument document = Loader.loadPDF(oldDocument);
|
||||
return createNewDocumentBasedOnOldDocument(document);
|
||||
try (PDDocument document = load(oldDocument)) {
|
||||
return createNewDocumentBasedOnOldDocument(document);
|
||||
}
|
||||
}
|
||||
|
||||
/** Create a new document based on an existing document file */
|
||||
public PDDocument createNewDocumentBasedOnOldDocument(File oldDocument) throws IOException {
|
||||
PDDocument document = Loader.loadPDF(oldDocument);
|
||||
return createNewDocumentBasedOnOldDocument(document);
|
||||
try (PDDocument document = load(oldDocument)) {
|
||||
return createNewDocumentBasedOnOldDocument(document);
|
||||
}
|
||||
}
|
||||
|
||||
/** Create a new document based on an existing PDDocument */
|
||||
public PDDocument createNewDocumentBasedOnOldDocument(PDDocument oldDocument)
|
||||
throws IOException {
|
||||
PDDocument document = new PDDocument();
|
||||
PDDocument document = createNewDocument();
|
||||
pdfMetadataService.setMetadataToPdf(
|
||||
document, pdfMetadataService.extractMetadataFromPdf(oldDocument), true);
|
||||
return document;
|
||||
}
|
||||
|
||||
/** Load document from a file and convert it to bytes */
|
||||
public byte[] loadToBytes(File file) throws IOException {
|
||||
PDDocument document = load(file);
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
document.save(baos);
|
||||
// Close the document
|
||||
document.close();
|
||||
return baos.toByteArray();
|
||||
try (PDDocument document = load(file)) {
|
||||
return saveToBytes(document);
|
||||
}
|
||||
}
|
||||
|
||||
/** Load document from bytes and convert it back to bytes */
|
||||
public byte[] loadToBytes(byte[] bytes) throws IOException {
|
||||
PDDocument document = load(bytes);
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
document.save(baos);
|
||||
// Close the document
|
||||
document.close();
|
||||
return baos.toByteArray();
|
||||
}
|
||||
|
||||
// if loading from a file, assume the file has been made with Stirling-PDF
|
||||
public PDDocument load(File file) throws IOException {
|
||||
PDDocument document = Loader.loadPDF(file);
|
||||
pdfMetadataService.setMetadataToPdf(document, PdfMetadata.builder().build(), true);
|
||||
return document;
|
||||
}
|
||||
|
||||
public PDDocument load(InputStream input) throws IOException {
|
||||
return load(input.readAllBytes());
|
||||
}
|
||||
|
||||
public PDDocument load(byte[] input) throws IOException {
|
||||
PDDocument document = Loader.loadPDF(input);
|
||||
pdfMetadataService.setDefaultMetadata(document);
|
||||
removezeropassword(document);
|
||||
return document;
|
||||
}
|
||||
|
||||
public PDDocument load(PDFFile pdfFile) throws IOException {
|
||||
return load(pdfFile.getFileInput());
|
||||
}
|
||||
|
||||
public PDDocument load(MultipartFile pdfFile) throws IOException {
|
||||
return load(pdfFile.getBytes());
|
||||
try (PDDocument document = load(bytes)) {
|
||||
return saveToBytes(document);
|
||||
}
|
||||
}
|
||||
|
||||
/** Load from a file path string */
|
||||
public PDDocument load(String path) throws IOException {
|
||||
return load(new File(path));
|
||||
}
|
||||
|
||||
/** Load from a PDFFile object */
|
||||
public PDDocument load(PDFFile pdfFile) throws IOException {
|
||||
return load(pdfFile.getFileInput());
|
||||
}
|
||||
|
||||
/** Load from a MultipartFile */
|
||||
public PDDocument load(MultipartFile pdfFile) throws IOException {
|
||||
return load(pdfFile.getBytes());
|
||||
}
|
||||
|
||||
/** Load with password from MultipartFile */
|
||||
public PDDocument load(MultipartFile fileInput, String password) throws IOException {
|
||||
return load(fileInput.getBytes(), password);
|
||||
}
|
||||
|
||||
/** Load with password from byte array */
|
||||
private PDDocument load(byte[] bytes, String password) throws IOException {
|
||||
// Since we don't have direct password support in the adaptive loader,
|
||||
// we'll need to use PDFBox's Loader directly
|
||||
PDDocument document = Loader.loadPDF(bytes, password);
|
||||
pdfMetadataService.setDefaultMetadata(document);
|
||||
return document;
|
||||
}
|
||||
|
||||
private PDDocument removezeropassword(PDDocument document) throws IOException {
|
||||
if (document.isEncrypted()) {
|
||||
try {
|
||||
log.info("Removing security from the source document");
|
||||
document.setAllSecurityToBeRemoved(true);
|
||||
} catch (Exception e) {
|
||||
log.warn("Cannot decrypt the pdf");
|
||||
}
|
||||
}
|
||||
return document;
|
||||
}
|
||||
|
||||
// Add other load methods as needed, following the same pattern
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user