Sanitize PDF improvements (#3251)

# Description of Changes

Please provide a summary of the changes, including:

- Distinguish between document-info metadata removal and XMP metadata removal
- Change the PDF file loaders so that they only edit metadata for certain operations

Closes #(issue_number)

---

## Checklist

### General

- [ ] I have read the [Contribution
Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md)
- [ ] I have read the [Stirling-PDF Developer
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md)
(if applicable)
- [ ] I have read the [How to add new languages to
Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md)
(if applicable)
- [ ] I have performed a self-review of my own code
- [ ] My changes generate no new warnings

### Documentation

- [ ] I have updated relevant docs on [Stirling-PDF's doc
repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/)
(if functionality has heavily changed)
- [ ] I have read the section [Add New Translation
Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md#add-new-translation-tags)
(for new translation tags only)

### UI Changes (if applicable)

- [ ] Screenshots or videos demonstrating the UI changes are attached
(e.g., as comments or direct attachments in the PR)

### Testing (if applicable)

- [ ] I have tested my changes locally. Refer to the [Testing
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md#6-testing)
for more details.
This commit is contained in:
Anthony Stirling
2025-03-26 10:53:22 +00:00
committed by GitHub
parent e2ba296320
commit 5ce941dda0
45 changed files with 175 additions and 83 deletions

View File

@@ -86,7 +86,7 @@ public class MetadataController {
allRequestParams = new java.util.HashMap<String, String>();
}
// Load the PDF file into a PDDocument
PDDocument document = pdfDocumentFactory.load(pdfFile);
PDDocument document = pdfDocumentFactory.load(pdfFile, true);
// Get the document information from the PDF
PDDocumentInformation info = document.getDocumentInformation();

View File

@@ -51,11 +51,12 @@ public class SanitizeController {
MultipartFile inputFile = request.getFileInput();
boolean removeJavaScript = request.isRemoveJavaScript();
boolean removeEmbeddedFiles = request.isRemoveEmbeddedFiles();
boolean removeXMPMetadata = request.isRemoveXMPMetadata();
boolean removeMetadata = request.isRemoveMetadata();
boolean removeLinks = request.isRemoveLinks();
boolean removeFonts = request.isRemoveFonts();
PDDocument document = pdfDocumentFactory.load(inputFile);
PDDocument document = pdfDocumentFactory.load(inputFile, true);
if (removeJavaScript) {
sanitizeJavaScript(document);
}
@@ -64,10 +65,14 @@ public class SanitizeController {
sanitizeEmbeddedFiles(document);
}
if (removeMetadata) {
sanitizeMetadata(document);
if (removeXMPMetadata) {
sanitizeXMPMetadata(document);
}
if (removeMetadata) {
sanitizeDocumentInfoMetadata(document);
}
if (removeLinks) {
sanitizeLinks(document);
}
@@ -145,7 +150,7 @@ public class SanitizeController {
}
}
private void sanitizeMetadata(PDDocument document) {
private void sanitizeXMPMetadata(PDDocument document) {
if (document.getDocumentCatalog() != null) {
PDMetadata metadata = document.getDocumentCatalog().getMetadata();
if (metadata != null) {
@@ -153,6 +158,16 @@ public class SanitizeController {
}
}
}
private void sanitizeDocumentInfoMetadata(PDDocument document) {
    // Nothing to reset when the document exposes no information object.
    if (document.getDocumentInformation() == null) {
        return;
    }
    // Swap in a freshly constructed information object in place of the existing one.
    document.setDocumentInformation(new PDDocumentInformation());
}
private void sanitizeLinks(PDDocument document) throws IOException {
for (PDPage page : document.getPages()) {

View File

@@ -17,9 +17,12 @@ public class SanitizePdfRequest extends PDFFile {
@Schema(description = "Remove embedded files from the PDF", defaultValue = "false")
private boolean removeEmbeddedFiles;
@Schema(description = "Remove metadata from the PDF", defaultValue = "false")
private boolean removeMetadata;
@Schema(description = "Remove XMP metadata from the PDF", defaultValue = "false")
private boolean removeXMPMetadata;
@Schema(description = "Remove document info metadata from the PDF", defaultValue = "false")
private boolean removeMetadata;
@Schema(description = "Remove links from the PDF", defaultValue = "false")
private boolean removeLinks;

View File

@@ -68,10 +68,18 @@ public class CustomPDFDocumentFactory {
}
/**
* Main entry point for loading a PDF document from a file. Automatically selects the most
* appropriate loading strategy.
*/
* Main entry point for loading a PDF document from a file. Automatically selects the most
* appropriate loading strategy.
*/
public PDDocument load(File file) throws IOException {
    // No read-only flag supplied: delegate with the option disabled.
    final boolean readOnly = false;
    return load(file, readOnly);
}
/**
* Main entry point for loading a PDF document from a file with read-only option.
* Automatically selects the most appropriate loading strategy.
*/
public PDDocument load(File file, boolean readOnly) throws IOException {
if (file == null) {
throw new IllegalArgumentException("File cannot be null");
}
@@ -79,14 +87,26 @@ public class CustomPDFDocumentFactory {
long fileSize = file.length();
log.debug("Loading PDF from file, size: {}MB", fileSize / (1024 * 1024));
return loadAdaptively(file, fileSize);
PDDocument doc = loadAdaptively(file, fileSize);
if (!readOnly) {
postProcessDocument(doc);
}
return doc;
}
/**
* Main entry point for loading a PDF document from a Path. Automatically selects the most
* appropriate loading strategy.
*/
* Main entry point for loading a PDF document from a Path. Automatically selects the most
* appropriate loading strategy.
*/
public PDDocument load(Path path) throws IOException {
    // No read-only flag supplied: delegate with the option disabled.
    final boolean readOnly = false;
    return load(path, readOnly);
}
/**
* Main entry point for loading a PDF document from a Path with read-only option.
* Automatically selects the most appropriate loading strategy.
*/
public PDDocument load(Path path, boolean readOnly) throws IOException {
if (path == null) {
throw new IllegalArgumentException("File cannot be null");
}
@@ -94,11 +114,20 @@ public class CustomPDFDocumentFactory {
long fileSize = Files.size(path);
log.debug("Loading PDF from file, size: {}MB", fileSize / (1024 * 1024));
return loadAdaptively(path.toFile(), fileSize);
PDDocument doc = loadAdaptively(path.toFile(), fileSize);
if (!readOnly) {
postProcessDocument(doc);
}
return doc;
}
/** Loads a PDF from a byte array, delegating with the read-only option disabled. */
public PDDocument load(byte[] input) throws IOException {
    final boolean readOnly = false;
    return load(input, readOnly);
}
/** Load a PDF from byte array with automatic optimization and read-only option. */
public PDDocument load(byte[] input, boolean readOnly) throws IOException {
if (input == null) {
throw new IllegalArgumentException("Input bytes cannot be null");
}
@@ -106,11 +135,20 @@ public class CustomPDFDocumentFactory {
long dataSize = input.length;
log.debug("Loading PDF from byte array, size: {}MB", dataSize / (1024 * 1024));
return loadAdaptively(input, dataSize);
PDDocument doc = loadAdaptively(input, dataSize);
if (!readOnly) {
postProcessDocument(doc);
}
return doc;
}
/** Loads a PDF from an InputStream, delegating with the read-only option disabled. */
public PDDocument load(InputStream input) throws IOException {
    final boolean readOnly = false;
    return load(input, readOnly);
}
/** Load a PDF from InputStream with automatic optimization and read-only option. */
public PDDocument load(InputStream input, boolean readOnly) throws IOException {
if (input == null) {
throw new IllegalArgumentException("InputStream cannot be null");
}
@@ -119,11 +157,20 @@ public class CustomPDFDocumentFactory {
Path tempFile = createTempFile("pdf-stream-");
Files.copy(input, tempFile, StandardCopyOption.REPLACE_EXISTING);
return loadAdaptively(tempFile.toFile(), Files.size(tempFile));
PDDocument doc = loadAdaptively(tempFile.toFile(), Files.size(tempFile));
if (!readOnly) {
postProcessDocument(doc);
}
return doc;
}
/** Loads a PDF from an InputStream using the given password, with read-only disabled. */
public PDDocument load(InputStream input, String password) throws IOException {
    final boolean readOnly = false;
    return load(input, password, readOnly);
}
/** Load with password from InputStream and read-only option */
public PDDocument load(InputStream input, String password, boolean readOnly) throws IOException {
if (input == null) {
throw new IllegalArgumentException("InputStream cannot be null");
}
@@ -132,9 +179,54 @@ public class CustomPDFDocumentFactory {
Path tempFile = createTempFile("pdf-stream-");
Files.copy(input, tempFile, StandardCopyOption.REPLACE_EXISTING);
return loadAdaptivelyWithPassword(tempFile.toFile(), Files.size(tempFile), password);
PDDocument doc = loadAdaptivelyWithPassword(tempFile.toFile(), Files.size(tempFile), password);
if (!readOnly) {
postProcessDocument(doc);
}
return doc;
}
/** Loads a PDF from a file path string, delegating with the read-only option disabled. */
public PDDocument load(String path) throws IOException {
    final boolean readOnly = false;
    return load(path, readOnly);
}
/** Loads a PDF from a file path string, forwarding the read-only option to the File overload. */
public PDDocument load(String path, boolean readOnly) throws IOException {
    File source = new File(path);
    return load(source, readOnly);
}
/** Loads the PDF carried by a PDFFile request object, with the read-only option disabled. */
public PDDocument load(PDFFile pdfFile) throws IOException {
    final boolean readOnly = false;
    return load(pdfFile, readOnly);
}
/** Loads the PDF carried by a PDFFile request object, forwarding the read-only option. */
public PDDocument load(PDFFile pdfFile, boolean readOnly) throws IOException {
    // Unwrap the uploaded file first, then hand it to the MultipartFile overload.
    MultipartFile upload = pdfFile.getFileInput();
    return load(upload, readOnly);
}
/** Loads a PDF from an uploaded MultipartFile, with the read-only option disabled. */
public PDDocument load(MultipartFile pdfFile) throws IOException {
    final boolean readOnly = false;
    return load(pdfFile, readOnly);
}
/**
 * Load from a MultipartFile with read-only option.
 *
 * <p>The upload's stream is opened by this method, so it is also closed by this method. The
 * InputStream overload copies the stream to a temporary file and loads the document from that
 * file, so the stream is no longer needed once that call returns.
 *
 * @param pdfFile uploaded PDF to load
 * @param readOnly forwarded to the InputStream overload; when {@code false} the loaded
 *     document is post-processed there
 * @return the loaded document
 * @throws IOException if the upload cannot be read or the PDF cannot be loaded
 */
public PDDocument load(MultipartFile pdfFile, boolean readOnly) throws IOException {
    try (InputStream uploadStream = pdfFile.getInputStream()) {
        return load(uploadStream, readOnly);
    }
}
/** Loads a PDF from an uploaded MultipartFile using the given password, read-only disabled. */
public PDDocument load(MultipartFile fileInput, String password) throws IOException {
    final boolean readOnly = false;
    return load(fileInput, password, readOnly);
}
/**
 * Load with password from MultipartFile with read-only option.
 *
 * <p>The upload's stream is opened by this method, so it is also closed by this method. The
 * password-taking InputStream overload copies the stream to a temporary file and loads the
 * document from that file, so the stream is no longer needed once that call returns.
 *
 * @param fileInput uploaded PDF to load
 * @param password password passed through to the InputStream overload
 * @param readOnly forwarded to the InputStream overload; when {@code false} the loaded
 *     document is post-processed there
 * @return the loaded document
 * @throws IOException if the upload cannot be read or the PDF cannot be loaded
 */
public PDDocument load(MultipartFile fileInput, String password, boolean readOnly)
        throws IOException {
    try (InputStream uploadStream = fileInput.getInputStream()) {
        return load(uploadStream, password, readOnly);
    }
}
/**
* Determine the appropriate caching strategy based on file size and available memory. This
* common method is used by both password and non-password loading paths.
@@ -197,8 +289,6 @@ public class CustomPDFDocumentFactory {
} else {
throw new IllegalArgumentException("Unsupported source type: " + source.getClass());
}
postProcessDocument(document);
return document;
}
@@ -220,8 +310,6 @@ public class CustomPDFDocumentFactory {
} else {
throw new IllegalArgumentException("Unsupported source type: " + source.getClass());
}
postProcessDocument(document);
return document;
}
@@ -384,23 +472,4 @@ public class CustomPDFDocumentFactory {
}
}
/** Load from a file path string */
public PDDocument load(String path) throws IOException {
return load(new File(path));
}
/** Load from a PDFFile object */
public PDDocument load(PDFFile pdfFile) throws IOException {
return load(pdfFile.getFileInput());
}
/** Load from a MultipartFile */
public PDDocument load(MultipartFile pdfFile) throws IOException {
return load(pdfFile.getInputStream());
}
/** Load with password from MultipartFile */
public PDDocument load(MultipartFile fileInput, String password) throws IOException {
return load(fileInput.getInputStream(), password);
}
}