Sanitize PDF improvements (#3251)

# Description of Changes

Please provide a summary of the changes, including:

- Distinguish between document-info metadata removal and XMP metadata removal
- Change the file loaders to edit metadata only for certain operations

Closes #(issue_number)

---

## Checklist

### General

- [ ] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md)
- [ ] I have read the [Stirling-PDF Developer Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md) (if applicable)
- [ ] I have read the [How to add new languages to Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md) (if applicable)
- [ ] I have performed a self-review of my own code
- [ ] My changes generate no new warnings

### Documentation

- [ ] I have updated relevant docs on [Stirling-PDF's doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) (if functionality has heavily changed)
- [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only)

### UI Changes (if applicable)

- [ ] Screenshots or videos demonstrating the UI changes are attached (e.g., as comments or direct attachments in the PR)

### Testing (if applicable)

- [ ] I have tested my changes locally. Refer to the [Testing Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md#6-testing) for more details.
2026-04-16 23:08:38 +02:00 · 2025-03-26 10:53:22 +00:00
parent e2ba296320
commit 5ce941dda0
45 changed files with 175 additions and 83 deletions
--- a/src/main/java/stirling/software/SPDF/controller/api/misc/MetadataController.java
+++ b/src/main/java/stirling/software/SPDF/controller/api/misc/MetadataController.java
@@ -86,7 +86,7 @@ public class MetadataController {
            allRequestParams = new java.util.HashMap<String, String>();
        }
        // Load the PDF file into a PDDocument
-        PDDocument document = pdfDocumentFactory.load(pdfFile);
+        PDDocument document = pdfDocumentFactory.load(pdfFile, true);

        // Get the document information from the PDF
        PDDocumentInformation info = document.getDocumentInformation();
--- a/src/main/java/stirling/software/SPDF/controller/api/security/SanitizeController.java
+++ b/src/main/java/stirling/software/SPDF/controller/api/security/SanitizeController.java
@@ -51,11 +51,12 @@ public class SanitizeController {
        MultipartFile inputFile = request.getFileInput();
        boolean removeJavaScript = request.isRemoveJavaScript();
        boolean removeEmbeddedFiles = request.isRemoveEmbeddedFiles();
+        boolean removeXMPMetadata = request.isRemoveXMPMetadata();
        boolean removeMetadata = request.isRemoveMetadata();
        boolean removeLinks = request.isRemoveLinks();
        boolean removeFonts = request.isRemoveFonts();

-        PDDocument document = pdfDocumentFactory.load(inputFile);
+        PDDocument document = pdfDocumentFactory.load(inputFile, true);
        if (removeJavaScript) {
            sanitizeJavaScript(document);
        }
@@ -64,10 +65,14 @@ public class SanitizeController {
            sanitizeEmbeddedFiles(document);
        }

-        if (removeMetadata) {
-            sanitizeMetadata(document);
+        if (removeXMPMetadata) {
+        	sanitizeXMPMetadata(document);
        }

+        if (removeMetadata) {
+        	sanitizeDocumentInfoMetadata(document);
+        }
+      
        if (removeLinks) {
            sanitizeLinks(document);
        }
@@ -145,7 +150,7 @@ public class SanitizeController {
        }
    }

-    private void sanitizeMetadata(PDDocument document) {
+    private void sanitizeXMPMetadata(PDDocument document) {
        if (document.getDocumentCatalog() != null) {
            PDMetadata metadata = document.getDocumentCatalog().getMetadata();
            if (metadata != null) {
@@ -153,6 +158,16 @@ public class SanitizeController {
            }
        }
    }
+    
+    private void sanitizeDocumentInfoMetadata(PDDocument document) {
+        PDDocumentInformation docInfo = document.getDocumentInformation();
+        if (docInfo != null) {
+            PDDocumentInformation newInfo = new PDDocumentInformation();
+            document.setDocumentInformation(newInfo);
+        }
+    }
+
+

    private void sanitizeLinks(PDDocument document) throws IOException {
        for (PDPage page : document.getPages()) {
--- a/src/main/java/stirling/software/SPDF/model/api/security/SanitizePdfRequest.java
+++ b/src/main/java/stirling/software/SPDF/model/api/security/SanitizePdfRequest.java
@@ -17,9 +17,12 @@ public class SanitizePdfRequest extends PDFFile {
    @Schema(description = "Remove embedded files from the PDF", defaultValue = "false")
    private boolean removeEmbeddedFiles;

-    @Schema(description = "Remove metadata from the PDF", defaultValue = "false")
-    private boolean removeMetadata;
+    @Schema(description = "Remove XMP metadata from the PDF", defaultValue = "false")
+    private boolean removeXMPMetadata;

+    @Schema(description = "Remove document info metadata from the PDF", defaultValue = "false")
+    private boolean removeMetadata;
+    
    @Schema(description = "Remove links from the PDF", defaultValue = "false")
    private boolean removeLinks;

--- a/src/main/java/stirling/software/SPDF/service/CustomPDFDocumentFactory.java
+++ b/src/main/java/stirling/software/SPDF/service/CustomPDFDocumentFactory.java
@@ -68,10 +68,18 @@ public class CustomPDFDocumentFactory {
    }

    /**
-     * Main entry point for loading a PDF document from a file. Automatically selects the most
-     * appropriate loading strategy.
-     */
+    * Main entry point for loading a PDF document from a file. Automatically selects the most
+    * appropriate loading strategy.
+    */
    public PDDocument load(File file) throws IOException {
+        return load(file, false);
+    }
+
+    /**
+    * Main entry point for loading a PDF document from a file with read-only option.
+    * Automatically selects the most appropriate loading strategy.
+    */
+    public PDDocument load(File file, boolean readOnly) throws IOException {
        if (file == null) {
            throw new IllegalArgumentException("File cannot be null");
        }
@@ -79,14 +87,26 @@ public class CustomPDFDocumentFactory {
        long fileSize = file.length();
        log.debug("Loading PDF from file, size: {}MB", fileSize / (1024 * 1024));

-        return loadAdaptively(file, fileSize);
+        PDDocument doc = loadAdaptively(file, fileSize);
+        if (!readOnly) {
+            postProcessDocument(doc);
+        }
+        return doc;
    }

    /**
-     * Main entry point for loading a PDF document from a Path. Automatically selects the most
-     * appropriate loading strategy.
-     */
+    * Main entry point for loading a PDF document from a Path. Automatically selects the most
+    * appropriate loading strategy.
+    */
    public PDDocument load(Path path) throws IOException {
+        return load(path, false);
+    }
+
+    /**
+    * Main entry point for loading a PDF document from a Path with read-only option.
+    * Automatically selects the most appropriate loading strategy.
+    */
+    public PDDocument load(Path path, boolean readOnly) throws IOException {
        if (path == null) {
            throw new IllegalArgumentException("File cannot be null");
        }
@@ -94,11 +114,20 @@ public class CustomPDFDocumentFactory {
        long fileSize = Files.size(path);
        log.debug("Loading PDF from file, size: {}MB", fileSize / (1024 * 1024));

-        return loadAdaptively(path.toFile(), fileSize);
+        PDDocument doc = loadAdaptively(path.toFile(), fileSize);
+        if (!readOnly) {
+            postProcessDocument(doc);
+        }
+        return doc;
    }

    /** Load a PDF from byte array with automatic optimization. */
    public PDDocument load(byte[] input) throws IOException {
+        return load(input, false);
+    }
+
+    /** Load a PDF from byte array with automatic optimization and read-only option. */
+    public PDDocument load(byte[] input, boolean readOnly) throws IOException {
        if (input == null) {
            throw new IllegalArgumentException("Input bytes cannot be null");
        }
@@ -106,11 +135,20 @@ public class CustomPDFDocumentFactory {
        long dataSize = input.length;
        log.debug("Loading PDF from byte array, size: {}MB", dataSize / (1024 * 1024));

-        return loadAdaptively(input, dataSize);
+        PDDocument doc = loadAdaptively(input, dataSize);
+        if (!readOnly) {
+            postProcessDocument(doc);
+        }
+        return doc;
    }

    /** Load a PDF from InputStream with automatic optimization. */
    public PDDocument load(InputStream input) throws IOException {
+        return load(input, false);
+    }
+
+    /** Load a PDF from InputStream with automatic optimization and read-only option. */
+    public PDDocument load(InputStream input, boolean readOnly) throws IOException {
        if (input == null) {
            throw new IllegalArgumentException("InputStream cannot be null");
        }
@@ -119,11 +157,20 @@ public class CustomPDFDocumentFactory {
        Path tempFile = createTempFile("pdf-stream-");

        Files.copy(input, tempFile, StandardCopyOption.REPLACE_EXISTING);
-        return loadAdaptively(tempFile.toFile(), Files.size(tempFile));
+        PDDocument doc = loadAdaptively(tempFile.toFile(), Files.size(tempFile));
+        if (!readOnly) {
+            postProcessDocument(doc);
+        }
+        return doc;
    }

    /** Load with password from InputStream */
    public PDDocument load(InputStream input, String password) throws IOException {
+        return load(input, password, false);
+    }
+
+    /** Load with password from InputStream and read-only option */
+    public PDDocument load(InputStream input, String password, boolean readOnly) throws IOException {
        if (input == null) {
            throw new IllegalArgumentException("InputStream cannot be null");
        }
@@ -132,9 +179,54 @@ public class CustomPDFDocumentFactory {
        Path tempFile = createTempFile("pdf-stream-");

        Files.copy(input, tempFile, StandardCopyOption.REPLACE_EXISTING);
-        return loadAdaptivelyWithPassword(tempFile.toFile(), Files.size(tempFile), password);
+        PDDocument doc = loadAdaptivelyWithPassword(tempFile.toFile(), Files.size(tempFile), password);
+        if (!readOnly) {
+            postProcessDocument(doc);
+        }
+        return doc;
    }

+    /** Load from a file path string */
+    public PDDocument load(String path) throws IOException {
+        return load(path, false);
+    }
+
+    /** Load from a file path string with read-only option */
+    public PDDocument load(String path, boolean readOnly) throws IOException {
+        return load(new File(path), readOnly);
+    }
+
+    /** Load from a PDFFile object */
+    public PDDocument load(PDFFile pdfFile) throws IOException {
+        return load(pdfFile, false);
+    }
+
+    /** Load from a PDFFile object with read-only option */
+    public PDDocument load(PDFFile pdfFile, boolean readOnly) throws IOException {
+    	return load(pdfFile.getFileInput(), readOnly);
+    }
+
+    /** Load from a MultipartFile */
+    public PDDocument load(MultipartFile pdfFile) throws IOException {
+        return load(pdfFile, false);
+    }
+
+    /** Load from a MultipartFile with read-only option */
+    public PDDocument load(MultipartFile pdfFile, boolean readOnly) throws IOException {
+       return  load(pdfFile.getInputStream(), readOnly);
+
+    }
+
+    /** Load with password from MultipartFile */
+    public PDDocument load(MultipartFile fileInput, String password) throws IOException {
+        return load(fileInput, password, false);
+    }
+
+    /** Load with password from MultipartFile with read-only option */
+    public PDDocument load(MultipartFile fileInput, String password, boolean readOnly) throws IOException {
+    	return load(fileInput.getInputStream(), password, readOnly);
+    }
+    
    /**
     * Determine the appropriate caching strategy based on file size and available memory. This
     * common method is used by both password and non-password loading paths.
@@ -197,8 +289,6 @@ public class CustomPDFDocumentFactory {
        } else {
            throw new IllegalArgumentException("Unsupported source type: " + source.getClass());
        }
-
-        postProcessDocument(document);
        return document;
    }

@@ -220,8 +310,6 @@ public class CustomPDFDocumentFactory {
        } else {
            throw new IllegalArgumentException("Unsupported source type: " + source.getClass());
        }
-
-        postProcessDocument(document);
        return document;
    }

@@ -384,23 +472,4 @@ public class CustomPDFDocumentFactory {
        }
    }

-    /** Load from a file path string */
-    public PDDocument load(String path) throws IOException {
-        return load(new File(path));
-    }
-
-    /** Load from a PDFFile object */
-    public PDDocument load(PDFFile pdfFile) throws IOException {
-        return load(pdfFile.getFileInput());
-    }
-
-    /** Load from a MultipartFile */
-    public PDDocument load(MultipartFile pdfFile) throws IOException {
-        return load(pdfFile.getInputStream());
-    }
-
-    /** Load with password from MultipartFile */
-    public PDDocument load(MultipartFile fileInput, String password) throws IOException {
-        return load(fileInput.getInputStream(), password);
-    }
 }