mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-11-16 01:21:16 +01:00
feat(get-info-on-pdf): use PDFBox preflight to validate PDF compliancy level, and parse in compliancy type (#4595)
# Description of Changes - Refactored methods for parsing and extracting PDF/A conformance levels from XMP metadata. - Implemented PDF/A validation using Preflight from Apache PDFBox. - Enhanced PDF information generation to include PDF/A conformance level and validation results. - Updated compliance checks and JSON output to reflect new PDF/A capabilities. ### Test files: [lorem-ipsum_PDFA1b.pdf](https://github.com/user-attachments/files/22687689/lorem-ipsum_PDFA1b.pdf) [lorem-ipsum_PDFA_2b.pdf](https://github.com/user-attachments/files/22687692/lorem-ipsum_PDFA_2b.pdf) [lorem-ipsum_PD⁄A3a.pdf](https://github.com/user-attachments/files/22687693/lorem-ipsum_PD.A3a.pdf) ### New results: <img width="699" height="257" alt="image" src="https://github.com/user-attachments/assets/b8cb5510-2908-4e08-97f6-d5799e0e1be7" /> <img width="699" height="257" alt="image" src="https://github.com/user-attachments/assets/d7af3731-ad19-4524-b1c1-32f47776e6af" /> <img width="699" height="257" alt="image" src="https://github.com/user-attachments/assets/6e48e65b-2ebc-402a-a222-bfdbf783e45d" /> I also validated with online tools. Should be good now! I was also thinking of moving this to GeneralUtils; it may be useful for the PDF/A converter in the future, or for other features. Not sure yet; I think this is good for now. 
Closes #4568 <!-- Please provide a summary of the changes, including: - What was changed - Why the change was made - Any challenges encountered Closes #(issue_number) --> --- ## Checklist ### General - [x] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md) - [x] I have read the [Stirling-PDF Developer Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md) (if applicable) - [ ] I have read the [How to add new languages to Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md) (if applicable) - [x] I have performed a self-review of my own code - [x] My changes generate no new warnings ### Documentation - [ ] I have updated relevant docs on [Stirling-PDF's doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) (if functionality has heavily changed) - [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only) ### UI Changes (if applicable) - [x] Screenshots or videos demonstrating the UI changes are attached (e.g., as comments or direct attachments in the PR) ### Testing (if applicable) - [x] I have tested my changes locally. Refer to the [Testing Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md#6-testing) for more details. --------- Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
parent
dabc52ef73
commit
575854952a
@ -447,7 +447,20 @@ public final class RegexPatternUtils {
|
||||
return getPattern("@\\s*([^\\s\\(]+(?:\\.[a-zA-Z0-9]+)?)");
|
||||
}
|
||||
|
||||
// API doc parsing patterns
|
||||
/** Pattern for matching pdfaid:part attribute in XMP metadata */
|
||||
public Pattern getPdfAidPartPattern() {
|
||||
return getPattern("pdfaid:part[\"\\s]*=[\"\\s]*([0-9]+)");
|
||||
}
|
||||
|
||||
/** Pattern for matching pdfaid:conformance attribute in XMP metadata */
|
||||
public Pattern getPdfAidConformancePattern() {
|
||||
return getPattern("pdfaid:conformance[\"\\s]*=[\"\\s]*([A-Za-z]+)");
|
||||
}
|
||||
|
||||
/** Pattern for matching slash in page mode description */
|
||||
public Pattern getPageModePattern() {
|
||||
return getPattern("/");
|
||||
}
|
||||
|
||||
/**
|
||||
* Pre-compile commonly used patterns for immediate availability. This eliminates first-call
|
||||
|
||||
@ -7,10 +7,13 @@ import java.time.ZoneId;
|
||||
import java.time.ZonedDateTime;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.*;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.pdfbox.cos.COSInputStream;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.cos.COSString;
|
||||
import org.apache.pdfbox.io.RandomAccessReadBuffer;
|
||||
import org.apache.pdfbox.pdmodel.*;
|
||||
import org.apache.pdfbox.pdmodel.common.PDMetadata;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
@ -40,8 +43,14 @@ import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlin
|
||||
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode;
|
||||
import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
|
||||
import org.apache.pdfbox.pdmodel.interactive.form.PDField;
|
||||
import org.apache.pdfbox.preflight.PreflightDocument;
|
||||
import org.apache.pdfbox.preflight.ValidationResult;
|
||||
import org.apache.pdfbox.preflight.exception.SyntaxValidationException;
|
||||
import org.apache.pdfbox.preflight.exception.ValidationException;
|
||||
import org.apache.pdfbox.preflight.parser.PreflightParser;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.apache.xmpbox.XMPMetadata;
|
||||
import org.apache.xmpbox.schema.PDFAIdentificationSchema;
|
||||
import org.apache.xmpbox.xml.DomXmpParser;
|
||||
import org.apache.xmpbox.xml.XmpParsingException;
|
||||
import org.apache.xmpbox.xml.XmpSerializer;
|
||||
@ -95,60 +104,147 @@ public class GetInfoOnPDF {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates structured summary data about the PDF highlighting its unique characteristics such
|
||||
* as encryption status, permission restrictions, and standards compliance.
|
||||
*
|
||||
* @param document The PDF document to analyze
|
||||
* @return An ObjectNode containing structured summary data
|
||||
*/
|
||||
private ObjectNode generatePDFSummaryData(PDDocument document) {
|
||||
ObjectNode summaryData = objectMapper.createObjectNode();
|
||||
|
||||
// Check if encrypted
|
||||
if (document.isEncrypted()) {
|
||||
summaryData.put("encrypted", true);
|
||||
public static boolean checkForStandard(PDDocument document, String standardKeyword) {
|
||||
if ("PDF/A".equalsIgnoreCase(standardKeyword)) {
|
||||
return getPdfAConformanceLevel(document) != null;
|
||||
}
|
||||
|
||||
// Check permissions
|
||||
AccessPermission ap = document.getCurrentAccessPermission();
|
||||
ArrayNode restrictedPermissions = objectMapper.createArrayNode();
|
||||
|
||||
if (!ap.canAssembleDocument()) restrictedPermissions.add("document assembly");
|
||||
if (!ap.canExtractContent()) restrictedPermissions.add("content extraction");
|
||||
if (!ap.canExtractForAccessibility()) restrictedPermissions.add("accessibility extraction");
|
||||
if (!ap.canFillInForm()) restrictedPermissions.add("form filling");
|
||||
if (!ap.canModify()) restrictedPermissions.add("modification");
|
||||
if (!ap.canModifyAnnotations()) restrictedPermissions.add("annotation modification");
|
||||
if (!ap.canPrint()) restrictedPermissions.add("printing");
|
||||
|
||||
if (!restrictedPermissions.isEmpty()) {
|
||||
summaryData.set("restrictedPermissions", restrictedPermissions);
|
||||
summaryData.put("restrictedPermissionsCount", restrictedPermissions.size());
|
||||
}
|
||||
|
||||
// Check standard compliance
|
||||
if (checkForStandard(document, "PDF/A")) {
|
||||
summaryData.put("standardCompliance", "PDF/A");
|
||||
summaryData.put("standardPurpose", "long-term archiving");
|
||||
} else if (checkForStandard(document, "PDF/X")) {
|
||||
summaryData.put("standardCompliance", "PDF/X");
|
||||
summaryData.put("standardPurpose", "graphic exchange");
|
||||
} else if (checkForStandard(document, "PDF/UA")) {
|
||||
summaryData.put("standardCompliance", "PDF/UA");
|
||||
summaryData.put("standardPurpose", "universal accessibility");
|
||||
} else if (checkForStandard(document, "PDF/E")) {
|
||||
summaryData.put("standardCompliance", "PDF/E");
|
||||
summaryData.put("standardPurpose", "engineering workflows");
|
||||
} else if (checkForStandard(document, "PDF/VT")) {
|
||||
summaryData.put("standardCompliance", "PDF/VT");
|
||||
summaryData.put("standardPurpose", "variable and transactional printing");
|
||||
}
|
||||
|
||||
return summaryData;
|
||||
return checkStandardInMetadata(document, standardKeyword);
|
||||
}
|
||||
|
||||
public static boolean checkForStandard(PDDocument document, String standardKeyword) {
|
||||
public static String getPdfAConformanceLevel(PDDocument document) {
|
||||
if (document == null || document.isEncrypted()) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return getPdfAVersionFromMetadata(document);
|
||||
}
|
||||
|
||||
private static String getPdfAVersionFromMetadata(PDDocument document) {
|
||||
try {
|
||||
PDMetadata pdMetadata = document.getDocumentCatalog().getMetadata();
|
||||
if (pdMetadata != null) {
|
||||
try (COSInputStream metaStream = pdMetadata.createInputStream()) {
|
||||
DomXmpParser domXmpParser = new DomXmpParser();
|
||||
XMPMetadata xmpMeta = domXmpParser.parse(metaStream);
|
||||
|
||||
PDFAIdentificationSchema pdfId = xmpMeta.getPDFAIdentificationSchema();
|
||||
if (pdfId != null) {
|
||||
Integer part = pdfId.getPart();
|
||||
String conformance = pdfId.getConformance();
|
||||
|
||||
if (part != null && conformance != null) {
|
||||
return part + conformance.toUpperCase(Locale.ROOT);
|
||||
}
|
||||
} else {
|
||||
try (COSInputStream rawStream = pdMetadata.createInputStream()) {
|
||||
byte[] metadataBytes = rawStream.readAllBytes();
|
||||
String rawMetadata = new String(metadataBytes, StandardCharsets.UTF_8);
|
||||
String extracted = extractPdfAVersionFromRawXml(rawMetadata);
|
||||
if (extracted != null) {
|
||||
return extracted;
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (XmpParsingException e) {
|
||||
log.debug("XMP parsing failed, trying raw metadata search: {}", e.getMessage());
|
||||
try (COSInputStream metaStream = pdMetadata.createInputStream()) {
|
||||
byte[] metadataBytes = metaStream.readAllBytes();
|
||||
String rawMetadata = new String(metadataBytes, StandardCharsets.UTF_8);
|
||||
String extracted = extractPdfAVersionFromRawXml(rawMetadata);
|
||||
if (extracted != null) {
|
||||
return extracted;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.debug("Error reading PDF/A metadata: {}", e.getMessage());
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
private static String extractPdfAVersionFromRawXml(String rawXml) {
|
||||
if (rawXml == null || rawXml.isEmpty()) {
|
||||
return null;
|
||||
}
|
||||
|
||||
try {
|
||||
Pattern partPattern = RegexPatternUtils.getInstance().getPdfAidPartPattern();
|
||||
Pattern confPattern = RegexPatternUtils.getInstance().getPdfAidConformancePattern();
|
||||
|
||||
Matcher partMatcher = partPattern.matcher(rawXml);
|
||||
Matcher confMatcher = confPattern.matcher(rawXml);
|
||||
|
||||
if (partMatcher.find() && confMatcher.find()) {
|
||||
String part = partMatcher.group(1);
|
||||
String conformance = confMatcher.group(1).toUpperCase(Locale.ROOT);
|
||||
return part + conformance;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.debug("Error parsing raw XMP for PDF/A version: {}", e.getMessage());
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
private static boolean validatePdfAWithPreflight(PDDocument document, String version) {
|
||||
if (document == null || document.isEncrypted()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
|
||||
document.save(baos);
|
||||
|
||||
try (RandomAccessReadBuffer source = new RandomAccessReadBuffer(baos.toByteArray())) {
|
||||
PreflightParser parser = new PreflightParser(source);
|
||||
|
||||
try (PDDocument parsedDocument = parser.parse()) {
|
||||
if (!(parsedDocument instanceof PreflightDocument preflightDocument)) {
|
||||
log.debug(
|
||||
"Parsed document is not a PreflightDocument; unable to validate claimed PDF/A {}",
|
||||
version);
|
||||
return false;
|
||||
}
|
||||
|
||||
try {
|
||||
ValidationResult result = preflightDocument.validate();
|
||||
if (!result.isValid() && log.isDebugEnabled()) {
|
||||
log.debug(
|
||||
"PDF/A validation found {} errors for claimed version {}",
|
||||
result.getErrorsList().size(),
|
||||
version);
|
||||
int logged = 0;
|
||||
for (ValidationResult.ValidationError error : result.getErrorsList()) {
|
||||
log.debug(
|
||||
" Error {}: {}", error.getErrorCode(), error.getDetails());
|
||||
if (++logged >= 5) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return result.isValid();
|
||||
} catch (ValidationException e) {
|
||||
log.debug(
|
||||
"Validation exception during PDF/A validation: {}", e.getMessage());
|
||||
}
|
||||
} catch (SyntaxValidationException e) {
|
||||
log.debug(
|
||||
"Syntax validation failed during PDF/A validation: {}", e.getMessage());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
log.debug("IOException during PDF/A validation: {}", e.getMessage());
|
||||
} catch (Exception e) {
|
||||
log.debug("Unexpected error during PDF/A validation: {}", e.getMessage());
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
private static boolean checkStandardInMetadata(PDDocument document, String standardKeyword) {
|
||||
// Check XMP Metadata
|
||||
try {
|
||||
PDMetadata pdMetadata = document.getDocumentCatalog().getMetadata();
|
||||
@ -191,11 +287,197 @@ public class GetInfoOnPDF {
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates structured summary data about the PDF highlighting its unique characteristics such
|
||||
* as encryption status, permission restrictions, and standards compliance.
|
||||
*
|
||||
* @param document The PDF document to analyze
|
||||
* @return An ObjectNode containing structured summary data
|
||||
*/
|
||||
private ObjectNode generatePDFSummaryData(
|
||||
PDDocument document, String pdfaConformanceLevel, Boolean pdfaValidationPassed) {
|
||||
ObjectNode summaryData = objectMapper.createObjectNode();
|
||||
|
||||
// Check if encrypted
|
||||
if (document.isEncrypted()) {
|
||||
summaryData.put("encrypted", true);
|
||||
}
|
||||
|
||||
// Check permissions
|
||||
AccessPermission ap = document.getCurrentAccessPermission();
|
||||
ArrayNode restrictedPermissions = objectMapper.createArrayNode();
|
||||
|
||||
if (!ap.canAssembleDocument()) restrictedPermissions.add("document assembly");
|
||||
if (!ap.canExtractContent()) restrictedPermissions.add("content extraction");
|
||||
if (!ap.canExtractForAccessibility()) restrictedPermissions.add("accessibility extraction");
|
||||
if (!ap.canFillInForm()) restrictedPermissions.add("form filling");
|
||||
if (!ap.canModify()) restrictedPermissions.add("modification");
|
||||
if (!ap.canModifyAnnotations()) restrictedPermissions.add("annotation modification");
|
||||
if (!ap.canPrint()) restrictedPermissions.add("printing");
|
||||
|
||||
if (!restrictedPermissions.isEmpty()) {
|
||||
summaryData.set("restrictedPermissions", restrictedPermissions);
|
||||
summaryData.put("restrictedPermissionsCount", restrictedPermissions.size());
|
||||
}
|
||||
|
||||
// Check standard compliance
|
||||
if (pdfaConformanceLevel != null) {
|
||||
summaryData.put("standardCompliance", "PDF/A-" + pdfaConformanceLevel);
|
||||
summaryData.put("standardPurpose", "long-term archiving");
|
||||
if (pdfaValidationPassed != null) {
|
||||
summaryData.put("standardValidationPassed", pdfaValidationPassed);
|
||||
}
|
||||
} else if (checkForStandard(document, "PDF/X")) {
|
||||
summaryData.put("standardCompliance", "PDF/X");
|
||||
summaryData.put("standardPurpose", "graphic exchange");
|
||||
} else if (checkForStandard(document, "PDF/UA")) {
|
||||
summaryData.put("standardCompliance", "PDF/UA");
|
||||
summaryData.put("standardPurpose", "universal accessibility");
|
||||
} else if (checkForStandard(document, "PDF/E")) {
|
||||
summaryData.put("standardCompliance", "PDF/E");
|
||||
summaryData.put("standardPurpose", "engineering workflows");
|
||||
} else if (checkForStandard(document, "PDF/VT")) {
|
||||
summaryData.put("standardCompliance", "PDF/VT");
|
||||
summaryData.put("standardPurpose", "variable and transactional printing");
|
||||
}
|
||||
|
||||
return summaryData;
|
||||
}
|
||||
|
||||
private static void setNodePermissions(PDDocument pdfBoxDoc, ObjectNode permissionsNode) {
|
||||
AccessPermission ap = pdfBoxDoc.getCurrentAccessPermission();
|
||||
|
||||
permissionsNode.put("Document Assembly", getPermissionState(ap.canAssembleDocument()));
|
||||
permissionsNode.put("Extracting Content", getPermissionState(ap.canExtractContent()));
|
||||
permissionsNode.put(
|
||||
"Extracting for accessibility",
|
||||
getPermissionState(ap.canExtractForAccessibility()));
|
||||
permissionsNode.put("Form Filling", getPermissionState(ap.canFillInForm()));
|
||||
permissionsNode.put("Modifying", getPermissionState(ap.canModify()));
|
||||
permissionsNode.put("Modifying annotations", getPermissionState(ap.canModifyAnnotations()));
|
||||
permissionsNode.put("Printing", getPermissionState(ap.canPrint()));
|
||||
}
|
||||
|
||||
private static String getPermissionState(boolean state) {
|
||||
return state ? "Allowed" : "Not Allowed";
|
||||
}
|
||||
|
||||
public static String getPageOrientation(double width, double height) {
|
||||
if (width > height) {
|
||||
return "Landscape";
|
||||
} else if (height > width) {
|
||||
return "Portrait";
|
||||
} else {
|
||||
return "Square";
|
||||
}
|
||||
}
|
||||
|
||||
public static String getPageSize(float width, float height) {
|
||||
// Define standard page sizes
|
||||
Map<String, PDRectangle> standardSizes = new HashMap<>();
|
||||
standardSizes.put("Letter", PDRectangle.LETTER);
|
||||
standardSizes.put("LEGAL", PDRectangle.LEGAL);
|
||||
standardSizes.put("A0", PDRectangle.A0);
|
||||
standardSizes.put("A1", PDRectangle.A1);
|
||||
standardSizes.put("A2", PDRectangle.A2);
|
||||
standardSizes.put("A3", PDRectangle.A3);
|
||||
standardSizes.put("A4", PDRectangle.A4);
|
||||
standardSizes.put("A5", PDRectangle.A5);
|
||||
standardSizes.put("A6", PDRectangle.A6);
|
||||
|
||||
for (Map.Entry<String, PDRectangle> entry : standardSizes.entrySet()) {
|
||||
PDRectangle size = entry.getValue();
|
||||
if (isCloseToSize(width, height, size.getWidth(), size.getHeight())) {
|
||||
return entry.getKey();
|
||||
}
|
||||
}
|
||||
return "Custom";
|
||||
}
|
||||
|
||||
private static boolean isCloseToSize(
|
||||
float width, float height, float standardWidth, float standardHeight) {
|
||||
float tolerance = 1.0f; // You can adjust the tolerance as needed
|
||||
return Math.abs(width - standardWidth) <= tolerance
|
||||
&& Math.abs(height - standardHeight) <= tolerance;
|
||||
}
|
||||
|
||||
private static void setDimensionInfo(ObjectNode dimensionInfo, float width, float height) {
|
||||
float ppi = 72; // Points Per Inch
|
||||
|
||||
float widthInInches = width / ppi;
|
||||
float heightInInches = height / ppi;
|
||||
|
||||
float widthInCm = widthInInches * 2.54f;
|
||||
float heightInCm = heightInInches * 2.54f;
|
||||
|
||||
dimensionInfo.put("Width (px)", String.format("%.2f", width));
|
||||
dimensionInfo.put("Height (px)", String.format("%.2f", height));
|
||||
dimensionInfo.put("Width (in)", String.format("%.2f", widthInInches));
|
||||
dimensionInfo.put("Height (in)", String.format("%.2f", heightInInches));
|
||||
dimensionInfo.put("Width (cm)", String.format("%.2f", widthInCm));
|
||||
dimensionInfo.put("Height (cm)", String.format("%.2f", heightInCm));
|
||||
}
|
||||
|
||||
private static ArrayNode exploreStructureTree(List<Object> nodes) {
|
||||
ArrayNode elementsArray = objectMapper.createArrayNode();
|
||||
if (nodes != null) {
|
||||
for (Object obj : nodes) {
|
||||
if (obj instanceof PDStructureNode node) {
|
||||
ObjectNode elementNode = objectMapper.createObjectNode();
|
||||
|
||||
if (node instanceof PDStructureElement structureElement) {
|
||||
elementNode.put("Type", structureElement.getStructureType());
|
||||
elementNode.put("Content", getContent(structureElement));
|
||||
|
||||
// Recursively explore child elements
|
||||
ArrayNode childElements = exploreStructureTree(structureElement.getKids());
|
||||
if (!childElements.isEmpty()) {
|
||||
elementNode.set("Children", childElements);
|
||||
}
|
||||
}
|
||||
elementsArray.add(elementNode);
|
||||
}
|
||||
}
|
||||
}
|
||||
return elementsArray;
|
||||
}
|
||||
|
||||
private static String getContent(PDStructureElement structureElement) {
|
||||
StringBuilder contentBuilder = new StringBuilder();
|
||||
|
||||
for (Object item : structureElement.getKids()) {
|
||||
if (item instanceof COSString cosString) {
|
||||
contentBuilder.append(cosString.getString());
|
||||
} else if (item instanceof PDStructureElement pdstructureelement) {
|
||||
// For simplicity, we're handling only COSString and PDStructureElement here
|
||||
// but a more comprehensive method would handle other types too
|
||||
contentBuilder.append(getContent(pdstructureelement));
|
||||
}
|
||||
}
|
||||
|
||||
return contentBuilder.toString();
|
||||
}
|
||||
|
||||
private static String formatDate(Calendar calendar) {
|
||||
if (calendar != null) {
|
||||
DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
|
||||
ZonedDateTime zonedDateTime =
|
||||
ZonedDateTime.ofInstant(calendar.toInstant(), ZoneId.systemDefault());
|
||||
return zonedDateTime.format(formatter);
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
@PostMapping(consumes = MediaType.MULTIPART_FORM_DATA_VALUE, value = "/get-info-on-pdf")
|
||||
@Operation(summary = "Summary here", description = "desc. Input:PDF Output:JSON Type:SISO")
|
||||
public ResponseEntity<byte[]> getPdfInfo(@ModelAttribute PDFFile request) throws IOException {
|
||||
MultipartFile inputFile = request.getFileInput();
|
||||
boolean readonly = true;
|
||||
final String pagePrefix = "Page ";
|
||||
final int prefixLength = pagePrefix.length();
|
||||
StringBuilder keyBuilder = new StringBuilder(prefixLength + 8);
|
||||
keyBuilder.append(pagePrefix);
|
||||
try (PDDocument pdfBoxDoc = pdfDocumentFactory.load(inputFile, readonly)) {
|
||||
ObjectMapper objectMapper = new ObjectMapper();
|
||||
ObjectNode jsonOutput = objectMapper.createObjectNode();
|
||||
@ -267,8 +549,15 @@ public class GetInfoOnPDF {
|
||||
}
|
||||
jsonOutput.set("FormFields", formFieldsNode);
|
||||
|
||||
String pdfaConformanceLevel = getPdfAConformanceLevel(pdfBoxDoc);
|
||||
Boolean pdfaValidationPassed = null;
|
||||
if (pdfaConformanceLevel != null) {
|
||||
pdfaValidationPassed = validatePdfAWithPreflight(pdfBoxDoc, pdfaConformanceLevel);
|
||||
}
|
||||
|
||||
// Generate structured summary data about PDF characteristics
|
||||
ObjectNode summaryData = generatePDFSummaryData(pdfBoxDoc);
|
||||
ObjectNode summaryData =
|
||||
generatePDFSummaryData(pdfBoxDoc, pdfaConformanceLevel, pdfaValidationPassed);
|
||||
if (summaryData != null && !summaryData.isEmpty()) {
|
||||
jsonOutput.set("SummaryData", summaryData);
|
||||
}
|
||||
@ -373,7 +662,7 @@ public class GetInfoOnPDF {
|
||||
log.error("exception", e);
|
||||
}
|
||||
|
||||
boolean isPdfACompliant = checkForStandard(pdfBoxDoc, "PDF/A");
|
||||
boolean isPdfACompliant = pdfaConformanceLevel != null;
|
||||
boolean isPdfXCompliant = checkForStandard(pdfBoxDoc, "PDF/X");
|
||||
boolean isPdfECompliant = checkForStandard(pdfBoxDoc, "PDF/E");
|
||||
boolean isPdfVTCompliant = checkForStandard(pdfBoxDoc, "PDF/VT");
|
||||
@ -390,6 +679,12 @@ public class GetInfoOnPDF {
|
||||
// development in 2021.
|
||||
|
||||
compliancy.put("IsPDF/ACompliant", isPdfACompliant);
|
||||
if (pdfaConformanceLevel != null) {
|
||||
compliancy.put("PDF/AConformanceLevel", pdfaConformanceLevel);
|
||||
if (pdfaValidationPassed != null) {
|
||||
compliancy.put("IsPDF/AValidated", pdfaValidationPassed);
|
||||
}
|
||||
}
|
||||
compliancy.put("IsPDF/XCompliant", isPdfXCompliant);
|
||||
compliancy.put("IsPDF/ECompliant", isPdfECompliant);
|
||||
compliancy.put("IsPDF/VTCompliant", isPdfVTCompliant);
|
||||
@ -466,7 +761,7 @@ public class GetInfoOnPDF {
|
||||
|
||||
ObjectNode sizeInfo = objectMapper.createObjectNode();
|
||||
|
||||
getDimensionInfo(sizeInfo, width, height);
|
||||
setDimensionInfo(sizeInfo, width, height);
|
||||
|
||||
sizeInfo.put("Standard Page", getPageSize(width, height));
|
||||
pageInfo.set("Size", sizeInfo);
|
||||
@ -552,11 +847,10 @@ public class GetInfoOnPDF {
|
||||
Set<String> uniqueURIs = new HashSet<>(); // To store unique URIs
|
||||
|
||||
for (PDAnnotation annotation : annotations) {
|
||||
if (annotation instanceof PDAnnotationLink linkAnnotation) {
|
||||
if (linkAnnotation.getAction() instanceof PDActionURI uriAction) {
|
||||
String uri = uriAction.getURI();
|
||||
uniqueURIs.add(uri); // Add to set to ensure uniqueness
|
||||
}
|
||||
if (annotation instanceof PDAnnotationLink linkAnnotation
|
||||
&& linkAnnotation.getAction() instanceof PDActionURI uriAction) {
|
||||
String uri = uriAction.getURI();
|
||||
uniqueURIs.add(uri); // Add to set to ensure uniqueness
|
||||
}
|
||||
}
|
||||
|
||||
@ -679,8 +973,10 @@ public class GetInfoOnPDF {
|
||||
}
|
||||
|
||||
pageInfo.set("Multimedia", multimediaArray);
|
||||
keyBuilder.setLength(prefixLength);
|
||||
keyBuilder.append(pageNum + 1);
|
||||
|
||||
pageInfoParent.set("Page " + (pageNum + 1), pageInfo);
|
||||
pageInfoParent.set(keyBuilder.toString(), pageInfo);
|
||||
}
|
||||
|
||||
jsonOutput.set("BasicInfo", basicInfo);
|
||||
@ -706,133 +1002,11 @@ public class GetInfoOnPDF {
|
||||
return null;
|
||||
}
|
||||
|
||||
private void setNodePermissions(PDDocument pdfBoxDoc, ObjectNode permissionsNode) {
|
||||
AccessPermission ap = pdfBoxDoc.getCurrentAccessPermission();
|
||||
|
||||
permissionsNode.put("Document Assembly", getPermissionState(ap.canAssembleDocument()));
|
||||
permissionsNode.put("Extracting Content", getPermissionState(ap.canExtractContent()));
|
||||
permissionsNode.put(
|
||||
"Extracting for accessibility",
|
||||
getPermissionState(ap.canExtractForAccessibility()));
|
||||
permissionsNode.put("Form Filling", getPermissionState(ap.canFillInForm()));
|
||||
permissionsNode.put("Modifying", getPermissionState(ap.canModify()));
|
||||
permissionsNode.put("Modifying annotations", getPermissionState(ap.canModifyAnnotations()));
|
||||
permissionsNode.put("Printing", getPermissionState(ap.canPrint()));
|
||||
}
|
||||
|
||||
private String getPermissionState(boolean state) {
|
||||
return state ? "Allowed" : "Not Allowed";
|
||||
}
|
||||
|
||||
public String getPageOrientation(double width, double height) {
|
||||
if (width > height) {
|
||||
return "Landscape";
|
||||
} else if (height > width) {
|
||||
return "Portrait";
|
||||
} else {
|
||||
return "Square";
|
||||
}
|
||||
}
|
||||
|
||||
public String getPageSize(float width, float height) {
|
||||
// Define standard page sizes
|
||||
Map<String, PDRectangle> standardSizes = new HashMap<>();
|
||||
standardSizes.put("Letter", PDRectangle.LETTER);
|
||||
standardSizes.put("LEGAL", PDRectangle.LEGAL);
|
||||
standardSizes.put("A0", PDRectangle.A0);
|
||||
standardSizes.put("A1", PDRectangle.A1);
|
||||
standardSizes.put("A2", PDRectangle.A2);
|
||||
standardSizes.put("A3", PDRectangle.A3);
|
||||
standardSizes.put("A4", PDRectangle.A4);
|
||||
standardSizes.put("A5", PDRectangle.A5);
|
||||
standardSizes.put("A6", PDRectangle.A6);
|
||||
|
||||
for (Map.Entry<String, PDRectangle> entry : standardSizes.entrySet()) {
|
||||
PDRectangle size = entry.getValue();
|
||||
if (isCloseToSize(width, height, size.getWidth(), size.getHeight())) {
|
||||
return entry.getKey();
|
||||
}
|
||||
}
|
||||
return "Custom";
|
||||
}
|
||||
|
||||
private boolean isCloseToSize(
|
||||
float width, float height, float standardWidth, float standardHeight) {
|
||||
float tolerance = 1.0f; // You can adjust the tolerance as needed
|
||||
return Math.abs(width - standardWidth) <= tolerance
|
||||
&& Math.abs(height - standardHeight) <= tolerance;
|
||||
}
|
||||
|
||||
public ObjectNode getDimensionInfo(ObjectNode dimensionInfo, float width, float height) {
|
||||
float ppi = 72; // Points Per Inch
|
||||
|
||||
float widthInInches = width / ppi;
|
||||
float heightInInches = height / ppi;
|
||||
|
||||
float widthInCm = widthInInches * 2.54f;
|
||||
float heightInCm = heightInInches * 2.54f;
|
||||
|
||||
dimensionInfo.put("Width (px)", String.format("%.2f", width));
|
||||
dimensionInfo.put("Height (px)", String.format("%.2f", height));
|
||||
dimensionInfo.put("Width (in)", String.format("%.2f", widthInInches));
|
||||
dimensionInfo.put("Height (in)", String.format("%.2f", heightInInches));
|
||||
dimensionInfo.put("Width (cm)", String.format("%.2f", widthInCm));
|
||||
dimensionInfo.put("Height (cm)", String.format("%.2f", heightInCm));
|
||||
return dimensionInfo;
|
||||
}
|
||||
|
||||
public ArrayNode exploreStructureTree(List<Object> nodes) {
|
||||
ArrayNode elementsArray = objectMapper.createArrayNode();
|
||||
if (nodes != null) {
|
||||
for (Object obj : nodes) {
|
||||
if (obj instanceof PDStructureNode node) {
|
||||
ObjectNode elementNode = objectMapper.createObjectNode();
|
||||
|
||||
if (node instanceof PDStructureElement structureElement) {
|
||||
elementNode.put("Type", structureElement.getStructureType());
|
||||
elementNode.put("Content", getContent(structureElement));
|
||||
|
||||
// Recursively explore child elements
|
||||
ArrayNode childElements = exploreStructureTree(structureElement.getKids());
|
||||
if (!childElements.isEmpty()) {
|
||||
elementNode.set("Children", childElements);
|
||||
}
|
||||
}
|
||||
elementsArray.add(elementNode);
|
||||
}
|
||||
}
|
||||
}
|
||||
return elementsArray;
|
||||
}
|
||||
|
||||
public String getContent(PDStructureElement structureElement) {
|
||||
StringBuilder contentBuilder = new StringBuilder();
|
||||
|
||||
for (Object item : structureElement.getKids()) {
|
||||
if (item instanceof COSString cosString) {
|
||||
contentBuilder.append(cosString.getString());
|
||||
} else if (item instanceof PDStructureElement) {
|
||||
// For simplicity, we're handling only COSString and PDStructureElement here
|
||||
// but a more comprehensive method would handle other types too
|
||||
contentBuilder.append(getContent((PDStructureElement) item));
|
||||
}
|
||||
}
|
||||
|
||||
return contentBuilder.toString();
|
||||
}
|
||||
|
||||
private String formatDate(Calendar calendar) {
|
||||
if (calendar != null) {
|
||||
DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
|
||||
ZonedDateTime zonedDateTime =
|
||||
ZonedDateTime.ofInstant(calendar.toInstant(), ZoneId.systemDefault());
|
||||
return zonedDateTime.format(formatter);
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private String getPageModeDescription(String pageMode) {
|
||||
return pageMode != null ? pageMode.toString().replaceFirst("/", "") : "Unknown";
|
||||
private static String getPageModeDescription(String pageMode) {
|
||||
if (pageMode == null) return "Unknown";
|
||||
return RegexPatternUtils.getInstance()
|
||||
.getPageModePattern()
|
||||
.matcher(pageMode)
|
||||
.replaceFirst("");
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user