From 575854952a39809023e7e487ad7916c921e83a4d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bal=C3=A1zs=20Sz=C3=BCcs?=
<127139797+balazs-szucs@users.noreply.github.com>
Date: Sat, 4 Oct 2025 11:22:29 +0200
Subject: [PATCH] feat(get-info-on-pdf): use PDFBox preflight to validate PDF
compliance level, and parse the compliance type (#4595)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
# Description of Changes
- Refactored methods for parsing and extracting PDF/A conformance levels
from XMP metadata.
- Implemented PDF/A validation using Preflight from Apache PDFBox.
- Enhanced PDF information generation to include PDF/A conformance level
and validation results.
- Updated compliance checks and JSON output to reflect new PDF/A
capabilities.
### Test files:
[lorem-ipsum_PDFA1b.pdf](https://github.com/user-attachments/files/22687689/lorem-ipsum_PDFA1b.pdf)
[lorem-ipsum_PDFA_2b.pdf](https://github.com/user-attachments/files/22687692/lorem-ipsum_PDFA_2b.pdf)
[lorem-ipsum_PD⁄A3a.pdf](https://github.com/user-attachments/files/22687693/lorem-ipsum_PD.A3a.pdf)
### New results:
I also validated with online tools. Should be good now!
I was also thinking about moving this to GeneralUtils; it may be useful
for the PDF/A converter in the future, or for other features. Not sure
yet; for now I think this is good as is.
Closes #4568
---
## Checklist
### General
- [x] I have read the [Contribution
Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md)
- [x] I have read the [Stirling-PDF Developer
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md)
(if applicable)
- [ ] I have read the [How to add new languages to
Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md)
(if applicable)
- [x] I have performed a self-review of my own code
- [x] My changes generate no new warnings
### Documentation
- [ ] I have updated relevant docs on [Stirling-PDF's doc
repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/)
(if functionality has heavily changed)
- [ ] I have read the section [Add New Translation
Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md#add-new-translation-tags)
(for new translation tags only)
### UI Changes (if applicable)
- [x] Screenshots or videos demonstrating the UI changes are attached
(e.g., as comments or direct attachments in the PR)
### Testing (if applicable)
- [x] I have tested my changes locally. Refer to the [Testing
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md#6-testing)
for more details.
---------
Signed-off-by: Balázs Szücs
---
.../common/util/RegexPatternUtils.java | 15 +-
.../controller/api/security/GetInfoOnPDF.java | 548 ++++++++++++------
2 files changed, 375 insertions(+), 188 deletions(-)
diff --git a/app/common/src/main/java/stirling/software/common/util/RegexPatternUtils.java b/app/common/src/main/java/stirling/software/common/util/RegexPatternUtils.java
index 4d119e179..858ad0605 100644
--- a/app/common/src/main/java/stirling/software/common/util/RegexPatternUtils.java
+++ b/app/common/src/main/java/stirling/software/common/util/RegexPatternUtils.java
@@ -447,7 +447,20 @@ public final class RegexPatternUtils {
return getPattern("@\\s*([^\\s\\(]+(?:\\.[a-zA-Z0-9]+)?)");
}
- // API doc parsing patterns
+ /** Pattern for matching pdfaid:part attribute in XMP metadata */
+ public Pattern getPdfAidPartPattern() {
+ return getPattern("pdfaid:part[\"\\s]*=[\"\\s]*([0-9]+)");
+ }
+
+ /** Pattern for matching pdfaid:conformance attribute in XMP metadata */
+ public Pattern getPdfAidConformancePattern() {
+ return getPattern("pdfaid:conformance[\"\\s]*=[\"\\s]*([A-Za-z]+)");
+ }
+
+ /** Pattern for matching slash in page mode description */
+ public Pattern getPageModePattern() {
+ return getPattern("/");
+ }
/**
* Pre-compile commonly used patterns for immediate availability. This eliminates first-call
diff --git a/app/core/src/main/java/stirling/software/SPDF/controller/api/security/GetInfoOnPDF.java b/app/core/src/main/java/stirling/software/SPDF/controller/api/security/GetInfoOnPDF.java
index e92cf1dd6..1e9038380 100644
--- a/app/core/src/main/java/stirling/software/SPDF/controller/api/security/GetInfoOnPDF.java
+++ b/app/core/src/main/java/stirling/software/SPDF/controller/api/security/GetInfoOnPDF.java
@@ -7,10 +7,13 @@ import java.time.ZoneId;
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;
import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import org.apache.pdfbox.cos.COSInputStream;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSString;
+import org.apache.pdfbox.io.RandomAccessReadBuffer;
import org.apache.pdfbox.pdmodel.*;
import org.apache.pdfbox.pdmodel.common.PDMetadata;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
@@ -40,8 +43,14 @@ import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlin
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode;
import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
import org.apache.pdfbox.pdmodel.interactive.form.PDField;
+import org.apache.pdfbox.preflight.PreflightDocument;
+import org.apache.pdfbox.preflight.ValidationResult;
+import org.apache.pdfbox.preflight.exception.SyntaxValidationException;
+import org.apache.pdfbox.preflight.exception.ValidationException;
+import org.apache.pdfbox.preflight.parser.PreflightParser;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.xmpbox.XMPMetadata;
+import org.apache.xmpbox.schema.PDFAIdentificationSchema;
import org.apache.xmpbox.xml.DomXmpParser;
import org.apache.xmpbox.xml.XmpParsingException;
import org.apache.xmpbox.xml.XmpSerializer;
@@ -95,60 +104,147 @@ public class GetInfoOnPDF {
}
}
- /**
- * Generates structured summary data about the PDF highlighting its unique characteristics such
- * as encryption status, permission restrictions, and standards compliance.
- *
- * @param document The PDF document to analyze
- * @return An ObjectNode containing structured summary data
- */
- private ObjectNode generatePDFSummaryData(PDDocument document) {
- ObjectNode summaryData = objectMapper.createObjectNode();
-
- // Check if encrypted
- if (document.isEncrypted()) {
- summaryData.put("encrypted", true);
+ public static boolean checkForStandard(PDDocument document, String standardKeyword) {
+ if ("PDF/A".equalsIgnoreCase(standardKeyword)) {
+ return getPdfAConformanceLevel(document) != null;
}
- // Check permissions
- AccessPermission ap = document.getCurrentAccessPermission();
- ArrayNode restrictedPermissions = objectMapper.createArrayNode();
-
- if (!ap.canAssembleDocument()) restrictedPermissions.add("document assembly");
- if (!ap.canExtractContent()) restrictedPermissions.add("content extraction");
- if (!ap.canExtractForAccessibility()) restrictedPermissions.add("accessibility extraction");
- if (!ap.canFillInForm()) restrictedPermissions.add("form filling");
- if (!ap.canModify()) restrictedPermissions.add("modification");
- if (!ap.canModifyAnnotations()) restrictedPermissions.add("annotation modification");
- if (!ap.canPrint()) restrictedPermissions.add("printing");
-
- if (!restrictedPermissions.isEmpty()) {
- summaryData.set("restrictedPermissions", restrictedPermissions);
- summaryData.put("restrictedPermissionsCount", restrictedPermissions.size());
- }
-
- // Check standard compliance
- if (checkForStandard(document, "PDF/A")) {
- summaryData.put("standardCompliance", "PDF/A");
- summaryData.put("standardPurpose", "long-term archiving");
- } else if (checkForStandard(document, "PDF/X")) {
- summaryData.put("standardCompliance", "PDF/X");
- summaryData.put("standardPurpose", "graphic exchange");
- } else if (checkForStandard(document, "PDF/UA")) {
- summaryData.put("standardCompliance", "PDF/UA");
- summaryData.put("standardPurpose", "universal accessibility");
- } else if (checkForStandard(document, "PDF/E")) {
- summaryData.put("standardCompliance", "PDF/E");
- summaryData.put("standardPurpose", "engineering workflows");
- } else if (checkForStandard(document, "PDF/VT")) {
- summaryData.put("standardCompliance", "PDF/VT");
- summaryData.put("standardPurpose", "variable and transactional printing");
- }
-
- return summaryData;
+ return checkStandardInMetadata(document, standardKeyword);
}
- public static boolean checkForStandard(PDDocument document, String standardKeyword) {
+ public static String getPdfAConformanceLevel(PDDocument document) {
+ if (document == null || document.isEncrypted()) {
+ return null;
+ }
+
+ return getPdfAVersionFromMetadata(document);
+ }
+
+ private static String getPdfAVersionFromMetadata(PDDocument document) {
+ try {
+ PDMetadata pdMetadata = document.getDocumentCatalog().getMetadata();
+ if (pdMetadata != null) {
+ try (COSInputStream metaStream = pdMetadata.createInputStream()) {
+ DomXmpParser domXmpParser = new DomXmpParser();
+ XMPMetadata xmpMeta = domXmpParser.parse(metaStream);
+
+ PDFAIdentificationSchema pdfId = xmpMeta.getPDFAIdentificationSchema();
+ if (pdfId != null) {
+ Integer part = pdfId.getPart();
+ String conformance = pdfId.getConformance();
+
+ if (part != null && conformance != null) {
+ return part + conformance.toUpperCase(Locale.ROOT);
+ }
+ } else {
+ try (COSInputStream rawStream = pdMetadata.createInputStream()) {
+ byte[] metadataBytes = rawStream.readAllBytes();
+ String rawMetadata = new String(metadataBytes, StandardCharsets.UTF_8);
+ String extracted = extractPdfAVersionFromRawXml(rawMetadata);
+ if (extracted != null) {
+ return extracted;
+ }
+ }
+ }
+ } catch (XmpParsingException e) {
+ log.debug("XMP parsing failed, trying raw metadata search: {}", e.getMessage());
+ try (COSInputStream metaStream = pdMetadata.createInputStream()) {
+ byte[] metadataBytes = metaStream.readAllBytes();
+ String rawMetadata = new String(metadataBytes, StandardCharsets.UTF_8);
+ String extracted = extractPdfAVersionFromRawXml(rawMetadata);
+ if (extracted != null) {
+ return extracted;
+ }
+ }
+ }
+ }
+ } catch (Exception e) {
+ log.debug("Error reading PDF/A metadata: {}", e.getMessage());
+ }
+
+ return null;
+ }
+
+ private static String extractPdfAVersionFromRawXml(String rawXml) {
+ if (rawXml == null || rawXml.isEmpty()) {
+ return null;
+ }
+
+ try {
+ Pattern partPattern = RegexPatternUtils.getInstance().getPdfAidPartPattern();
+ Pattern confPattern = RegexPatternUtils.getInstance().getPdfAidConformancePattern();
+
+ Matcher partMatcher = partPattern.matcher(rawXml);
+ Matcher confMatcher = confPattern.matcher(rawXml);
+
+ if (partMatcher.find() && confMatcher.find()) {
+ String part = partMatcher.group(1);
+ String conformance = confMatcher.group(1).toUpperCase(Locale.ROOT);
+ return part + conformance;
+ }
+ } catch (Exception e) {
+ log.debug("Error parsing raw XMP for PDF/A version: {}", e.getMessage());
+ }
+
+ return null;
+ }
+
+ private static boolean validatePdfAWithPreflight(PDDocument document, String version) {
+ if (document == null || document.isEncrypted()) {
+ return false;
+ }
+
+ try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
+ document.save(baos);
+
+ try (RandomAccessReadBuffer source = new RandomAccessReadBuffer(baos.toByteArray())) {
+ PreflightParser parser = new PreflightParser(source);
+
+ try (PDDocument parsedDocument = parser.parse()) {
+ if (!(parsedDocument instanceof PreflightDocument preflightDocument)) {
+ log.debug(
+ "Parsed document is not a PreflightDocument; unable to validate claimed PDF/A {}",
+ version);
+ return false;
+ }
+
+ try {
+ ValidationResult result = preflightDocument.validate();
+ if (!result.isValid() && log.isDebugEnabled()) {
+ log.debug(
+ "PDF/A validation found {} errors for claimed version {}",
+ result.getErrorsList().size(),
+ version);
+ int logged = 0;
+ for (ValidationResult.ValidationError error : result.getErrorsList()) {
+ log.debug(
+ " Error {}: {}", error.getErrorCode(), error.getDetails());
+ if (++logged >= 5) {
+ break;
+ }
+ }
+ }
+ return result.isValid();
+ } catch (ValidationException e) {
+ log.debug(
+ "Validation exception during PDF/A validation: {}", e.getMessage());
+ }
+ } catch (SyntaxValidationException e) {
+ log.debug(
+ "Syntax validation failed during PDF/A validation: {}", e.getMessage());
+ return false;
+ }
+ }
+ } catch (IOException e) {
+ log.debug("IOException during PDF/A validation: {}", e.getMessage());
+ } catch (Exception e) {
+ log.debug("Unexpected error during PDF/A validation: {}", e.getMessage());
+ }
+
+ return false;
+ }
+
+ private static boolean checkStandardInMetadata(PDDocument document, String standardKeyword) {
// Check XMP Metadata
try {
PDMetadata pdMetadata = document.getDocumentCatalog().getMetadata();
@@ -191,11 +287,197 @@ public class GetInfoOnPDF {
return false;
}
+ /**
+ * Generates structured summary data about the PDF highlighting its unique characteristics such
+ * as encryption status, permission restrictions, and standards compliance.
+ *
+ * @param document The PDF document to analyze
+ * @return An ObjectNode containing structured summary data
+ */
+ private ObjectNode generatePDFSummaryData(
+ PDDocument document, String pdfaConformanceLevel, Boolean pdfaValidationPassed) {
+ ObjectNode summaryData = objectMapper.createObjectNode();
+
+ // Check if encrypted
+ if (document.isEncrypted()) {
+ summaryData.put("encrypted", true);
+ }
+
+ // Check permissions
+ AccessPermission ap = document.getCurrentAccessPermission();
+ ArrayNode restrictedPermissions = objectMapper.createArrayNode();
+
+ if (!ap.canAssembleDocument()) restrictedPermissions.add("document assembly");
+ if (!ap.canExtractContent()) restrictedPermissions.add("content extraction");
+ if (!ap.canExtractForAccessibility()) restrictedPermissions.add("accessibility extraction");
+ if (!ap.canFillInForm()) restrictedPermissions.add("form filling");
+ if (!ap.canModify()) restrictedPermissions.add("modification");
+ if (!ap.canModifyAnnotations()) restrictedPermissions.add("annotation modification");
+ if (!ap.canPrint()) restrictedPermissions.add("printing");
+
+ if (!restrictedPermissions.isEmpty()) {
+ summaryData.set("restrictedPermissions", restrictedPermissions);
+ summaryData.put("restrictedPermissionsCount", restrictedPermissions.size());
+ }
+
+ // Check standard compliance
+ if (pdfaConformanceLevel != null) {
+ summaryData.put("standardCompliance", "PDF/A-" + pdfaConformanceLevel);
+ summaryData.put("standardPurpose", "long-term archiving");
+ if (pdfaValidationPassed != null) {
+ summaryData.put("standardValidationPassed", pdfaValidationPassed);
+ }
+ } else if (checkForStandard(document, "PDF/X")) {
+ summaryData.put("standardCompliance", "PDF/X");
+ summaryData.put("standardPurpose", "graphic exchange");
+ } else if (checkForStandard(document, "PDF/UA")) {
+ summaryData.put("standardCompliance", "PDF/UA");
+ summaryData.put("standardPurpose", "universal accessibility");
+ } else if (checkForStandard(document, "PDF/E")) {
+ summaryData.put("standardCompliance", "PDF/E");
+ summaryData.put("standardPurpose", "engineering workflows");
+ } else if (checkForStandard(document, "PDF/VT")) {
+ summaryData.put("standardCompliance", "PDF/VT");
+ summaryData.put("standardPurpose", "variable and transactional printing");
+ }
+
+ return summaryData;
+ }
+
+ private static void setNodePermissions(PDDocument pdfBoxDoc, ObjectNode permissionsNode) {
+ AccessPermission ap = pdfBoxDoc.getCurrentAccessPermission();
+
+ permissionsNode.put("Document Assembly", getPermissionState(ap.canAssembleDocument()));
+ permissionsNode.put("Extracting Content", getPermissionState(ap.canExtractContent()));
+ permissionsNode.put(
+ "Extracting for accessibility",
+ getPermissionState(ap.canExtractForAccessibility()));
+ permissionsNode.put("Form Filling", getPermissionState(ap.canFillInForm()));
+ permissionsNode.put("Modifying", getPermissionState(ap.canModify()));
+ permissionsNode.put("Modifying annotations", getPermissionState(ap.canModifyAnnotations()));
+ permissionsNode.put("Printing", getPermissionState(ap.canPrint()));
+ }
+
+ private static String getPermissionState(boolean state) {
+ return state ? "Allowed" : "Not Allowed";
+ }
+
+ public static String getPageOrientation(double width, double height) {
+ if (width > height) {
+ return "Landscape";
+ } else if (height > width) {
+ return "Portrait";
+ } else {
+ return "Square";
+ }
+ }
+
+ public static String getPageSize(float width, float height) {
+ // Define standard page sizes
+ Map standardSizes = new HashMap<>();
+ standardSizes.put("Letter", PDRectangle.LETTER);
+ standardSizes.put("LEGAL", PDRectangle.LEGAL);
+ standardSizes.put("A0", PDRectangle.A0);
+ standardSizes.put("A1", PDRectangle.A1);
+ standardSizes.put("A2", PDRectangle.A2);
+ standardSizes.put("A3", PDRectangle.A3);
+ standardSizes.put("A4", PDRectangle.A4);
+ standardSizes.put("A5", PDRectangle.A5);
+ standardSizes.put("A6", PDRectangle.A6);
+
+ for (Map.Entry entry : standardSizes.entrySet()) {
+ PDRectangle size = entry.getValue();
+ if (isCloseToSize(width, height, size.getWidth(), size.getHeight())) {
+ return entry.getKey();
+ }
+ }
+ return "Custom";
+ }
+
+ private static boolean isCloseToSize(
+ float width, float height, float standardWidth, float standardHeight) {
+ float tolerance = 1.0f; // You can adjust the tolerance as needed
+ return Math.abs(width - standardWidth) <= tolerance
+ && Math.abs(height - standardHeight) <= tolerance;
+ }
+
+ private static void setDimensionInfo(ObjectNode dimensionInfo, float width, float height) {
+ float ppi = 72; // Points Per Inch
+
+ float widthInInches = width / ppi;
+ float heightInInches = height / ppi;
+
+ float widthInCm = widthInInches * 2.54f;
+ float heightInCm = heightInInches * 2.54f;
+
+ dimensionInfo.put("Width (px)", String.format("%.2f", width));
+ dimensionInfo.put("Height (px)", String.format("%.2f", height));
+ dimensionInfo.put("Width (in)", String.format("%.2f", widthInInches));
+ dimensionInfo.put("Height (in)", String.format("%.2f", heightInInches));
+ dimensionInfo.put("Width (cm)", String.format("%.2f", widthInCm));
+ dimensionInfo.put("Height (cm)", String.format("%.2f", heightInCm));
+ }
+
+ private static ArrayNode exploreStructureTree(List