From 2ef89101c26d6c37a5c6924ceba69c12cce43c70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20Sz=C3=BCcs?= <127139797+balazs-szucs@users.noreply.github.com> Date: Thu, 16 Oct 2025 23:54:25 +0200 Subject: [PATCH] feat(pdf-info): refactor and add more info on attachments, images, forms, and give technical overview (#4643) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Description of Changes ### UI and Summary Enhancements * Added a "Technical Details" section to the PDF summary UI, displaying information such as images, fonts, color spaces, form fields, embedded files, JavaScript, layers, bookmarks, and multimedia. This includes new HTML markup and the corresponding translation string. * Populated the new technical details fields in the summary using data from the backend response, including logic to count and display images, fonts, color spaces, form fields, embedded files, JavaScript scripts, layers, bookmarks, and multimedia items. ### Refactor: - Refactored the `getPdfInfo` function (which was 400 lines) into multiple manageably sized functions - Extracted magic numbers into named constants: - `72` (PPI) → `DEFAULT_PPI` - `1.0f` (tolerance) → `SIZE_TOLERANCE` - `5` (max logged errors) → `MAX_LOGGED_ERRORS` - Made the keyBuilder optimization for page prefixes effective - Used `String.format()` for better readability in complex string constructions - Replaced abbreviated variable names with descriptive alternatives: - `ap` (AccessPermission) → `accessPermission` - `is` (InputStream) → `inputStream` or `pdfInputStream` - Improved attachment processing and other features; see below: ### Features: #### Technical details overview: image #### Form fields: image #### New embedded file fields: MIME type, creation/modification date image #### Images: image And a few other minor improvements. 
--- ## Checklist ### General - [x] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md) - [x] I have read the [Stirling-PDF Developer Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md) (if applicable) - [ ] I have read the [How to add new languages to Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md) (if applicable) - [x] I have performed a self-review of my own code - [ ] My changes generate no new warnings ### Documentation - [ ] I have updated relevant docs on [Stirling-PDF's doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) (if functionality has heavily changed) - [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only) ### UI Changes (if applicable) - [x] Screenshots or videos demonstrating the UI changes are attached (e.g., as comments or direct attachments in the PR) ### Testing (if applicable) - [x] I have tested my changes locally. Refer to the [Testing Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md#6-testing) for more details. 
--------- Signed-off-by: Balázs Szücs --- .../controller/api/security/GetInfoOnPDF.java | 1163 +++++++++++------ .../main/resources/messages_en_GB.properties | 1 + .../templates/security/get-info-on-pdf.html | 237 +++- 3 files changed, 928 insertions(+), 473 deletions(-) diff --git a/app/core/src/main/java/stirling/software/SPDF/controller/api/security/GetInfoOnPDF.java b/app/core/src/main/java/stirling/software/SPDF/controller/api/security/GetInfoOnPDF.java index 1e9038380..205bed989 100644 --- a/app/core/src/main/java/stirling/software/SPDF/controller/api/security/GetInfoOnPDF.java +++ b/app/core/src/main/java/stirling/software/SPDF/controller/api/security/GetInfoOnPDF.java @@ -3,6 +3,7 @@ package stirling.software.SPDF.controller.api.security; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.nio.charset.StandardCharsets; +import java.time.Instant; import java.time.ZoneId; import java.time.ZonedDateTime; import java.time.format.DateTimeFormatter; @@ -17,7 +18,6 @@ import org.apache.pdfbox.io.RandomAccessReadBuffer; import org.apache.pdfbox.pdmodel.*; import org.apache.pdfbox.pdmodel.common.PDMetadata; import org.apache.pdfbox.pdmodel.common.PDRectangle; -import org.apache.pdfbox.pdmodel.common.PDStream; import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification; import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile; import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureElement; @@ -28,8 +28,6 @@ import org.apache.pdfbox.pdmodel.encryption.PDEncryption; import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.pdmodel.font.PDFontDescriptor; import org.apache.pdfbox.pdmodel.graphics.PDXObject; -import org.apache.pdfbox.pdmodel.graphics.color.PDColorSpace; -import org.apache.pdfbox.pdmodel.graphics.color.PDICCBased; import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject; import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; import 
org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentGroup; @@ -85,7 +83,13 @@ import stirling.software.common.util.WebResponseUtils; @RequiredArgsConstructor public class GetInfoOnPDF { - static ObjectMapper objectMapper = new ObjectMapper(); + private static final int DEFAULT_PPI = 72; + private static final float SIZE_TOLERANCE = 1.0f; + private static final int MAX_LOGGED_ERRORS = 5; + private static final String PAGE_PREFIX = "Page "; + private static final long MAX_FILE_SIZE = 100L * 1024 * 1024; + + private static final ObjectMapper objectMapper = new ObjectMapper(); private final CustomPDFDocumentFactory pdfDocumentFactory; @@ -219,7 +223,7 @@ public class GetInfoOnPDF { for (ValidationResult.ValidationError error : result.getErrorsList()) { log.debug( " Error {}: {}", error.getErrorCode(), error.getDetails()); - if (++logged >= 5) { + if (++logged >= MAX_LOGGED_ERRORS) { break; } } @@ -267,7 +271,7 @@ public class GetInfoOnPDF { ByteArrayOutputStream baos = new ByteArrayOutputStream(); new XmpSerializer().serialize(xmpMeta, baos, true); - String xmpString = new String(baos.toByteArray(), StandardCharsets.UTF_8); + String xmpString = baos.toString(StandardCharsets.UTF_8); if (xmpString.contains(standardKeyword)) { return true; @@ -287,14 +291,7 @@ public class GetInfoOnPDF { return false; } - /** - * Generates structured summary data about the PDF highlighting its unique characteristics such - * as encryption status, permission restrictions, and standards compliance. 
- * - * @param document The PDF document to analyze - * @return An ObjectNode containing structured summary data - */ - private ObjectNode generatePDFSummaryData( + private static ObjectNode generatePDFSummaryData( PDDocument document, String pdfaConformanceLevel, Boolean pdfaValidationPassed) { ObjectNode summaryData = objectMapper.createObjectNode(); @@ -304,16 +301,18 @@ public class GetInfoOnPDF { } // Check permissions - AccessPermission ap = document.getCurrentAccessPermission(); + AccessPermission accessPermission = document.getCurrentAccessPermission(); ArrayNode restrictedPermissions = objectMapper.createArrayNode(); - if (!ap.canAssembleDocument()) restrictedPermissions.add("document assembly"); - if (!ap.canExtractContent()) restrictedPermissions.add("content extraction"); - if (!ap.canExtractForAccessibility()) restrictedPermissions.add("accessibility extraction"); - if (!ap.canFillInForm()) restrictedPermissions.add("form filling"); - if (!ap.canModify()) restrictedPermissions.add("modification"); - if (!ap.canModifyAnnotations()) restrictedPermissions.add("annotation modification"); - if (!ap.canPrint()) restrictedPermissions.add("printing"); + if (!accessPermission.canAssembleDocument()) restrictedPermissions.add("document assembly"); + if (!accessPermission.canExtractContent()) restrictedPermissions.add("content extraction"); + if (!accessPermission.canExtractForAccessibility()) + restrictedPermissions.add("accessibility extraction"); + if (!accessPermission.canFillInForm()) restrictedPermissions.add("form filling"); + if (!accessPermission.canModify()) restrictedPermissions.add("modification"); + if (!accessPermission.canModifyAnnotations()) + restrictedPermissions.add("annotation modification"); + if (!accessPermission.canPrint()) restrictedPermissions.add("printing"); if (!restrictedPermissions.isEmpty()) { summaryData.set("restrictedPermissions", restrictedPermissions); @@ -345,17 +344,21 @@ public class GetInfoOnPDF { } private static void 
setNodePermissions(PDDocument pdfBoxDoc, ObjectNode permissionsNode) { - AccessPermission ap = pdfBoxDoc.getCurrentAccessPermission(); + AccessPermission accessPermission = pdfBoxDoc.getCurrentAccessPermission(); - permissionsNode.put("Document Assembly", getPermissionState(ap.canAssembleDocument())); - permissionsNode.put("Extracting Content", getPermissionState(ap.canExtractContent())); + permissionsNode.put( + "Document Assembly", getPermissionState(accessPermission.canAssembleDocument())); + permissionsNode.put( + "Extracting Content", getPermissionState(accessPermission.canExtractContent())); permissionsNode.put( "Extracting for accessibility", - getPermissionState(ap.canExtractForAccessibility())); - permissionsNode.put("Form Filling", getPermissionState(ap.canFillInForm())); - permissionsNode.put("Modifying", getPermissionState(ap.canModify())); - permissionsNode.put("Modifying annotations", getPermissionState(ap.canModifyAnnotations())); - permissionsNode.put("Printing", getPermissionState(ap.canPrint())); + getPermissionState(accessPermission.canExtractForAccessibility())); + permissionsNode.put("Form Filling", getPermissionState(accessPermission.canFillInForm())); + permissionsNode.put("Modifying", getPermissionState(accessPermission.canModify())); + permissionsNode.put( + "Modifying annotations", + getPermissionState(accessPermission.canModifyAnnotations())); + permissionsNode.put("Printing", getPermissionState(accessPermission.canPrint())); } private static String getPermissionState(boolean state) { @@ -396,16 +399,13 @@ public class GetInfoOnPDF { private static boolean isCloseToSize( float width, float height, float standardWidth, float standardHeight) { - float tolerance = 1.0f; // You can adjust the tolerance as needed - return Math.abs(width - standardWidth) <= tolerance - && Math.abs(height - standardHeight) <= tolerance; + return Math.abs(width - standardWidth) <= SIZE_TOLERANCE + && Math.abs(height - standardHeight) <= SIZE_TOLERANCE; } private 
static void setDimensionInfo(ObjectNode dimensionInfo, float width, float height) { - float ppi = 72; // Points Per Inch - - float widthInInches = width / ppi; - float heightInInches = height / ppi; + float widthInInches = width / DEFAULT_PPI; + float heightInInches = height / DEFAULT_PPI; float widthInCm = widthInInches * 2.54f; float heightInCm = heightInInches * 2.54f; @@ -458,115 +458,257 @@ public class GetInfoOnPDF { return contentBuilder.toString(); } - private static String formatDate(Calendar calendar) { - if (calendar != null) { + private static String formatDate(Instant instant) { + if (instant != null) { DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); - ZonedDateTime zonedDateTime = - ZonedDateTime.ofInstant(calendar.toInstant(), ZoneId.systemDefault()); + ZonedDateTime zonedDateTime = instant.atZone(ZoneId.systemDefault()); return zonedDateTime.format(formatter); } else { return null; } } - @PostMapping(consumes = MediaType.MULTIPART_FORM_DATA_VALUE, value = "/get-info-on-pdf") - @Operation(summary = "Summary here", description = "desc. 
Input:PDF Output:JSON Type:SISO") - public ResponseEntity getPdfInfo(@ModelAttribute PDFFile request) throws IOException { - MultipartFile inputFile = request.getFileInput(); - boolean readonly = true; - final String pagePrefix = "Page "; - final int prefixLength = pagePrefix.length(); - StringBuilder keyBuilder = new StringBuilder(prefixLength + 8); - keyBuilder.append(pagePrefix); - try (PDDocument pdfBoxDoc = pdfDocumentFactory.load(inputFile, readonly)) { - ObjectMapper objectMapper = new ObjectMapper(); - ObjectNode jsonOutput = objectMapper.createObjectNode(); + private static void validatePdfFile(MultipartFile file) { + if (file == null || file.isEmpty()) { + throw new IllegalArgumentException("PDF file is required"); + } - // Metadata using PDFBox - PDDocumentInformation info = pdfBoxDoc.getDocumentInformation(); - ObjectNode metadata = objectMapper.createObjectNode(); - ObjectNode basicInfo = objectMapper.createObjectNode(); - ObjectNode docInfoNode = objectMapper.createObjectNode(); - ObjectNode compliancy = objectMapper.createObjectNode(); - ObjectNode encryption = objectMapper.createObjectNode(); - ObjectNode other = objectMapper.createObjectNode(); + if (file.getSize() > MAX_FILE_SIZE) { + throw new IllegalArgumentException( + String.format( + "File size (%d bytes) exceeds maximum allowed size (%d bytes)", + file.getSize(), MAX_FILE_SIZE)); + } - metadata.put("Title", info.getTitle()); - metadata.put("Author", info.getAuthor()); - metadata.put("Subject", info.getSubject()); - metadata.put("Keywords", info.getKeywords()); - metadata.put("Producer", info.getProducer()); - metadata.put("Creator", info.getCreator()); - metadata.put("CreationDate", formatDate(info.getCreationDate())); - metadata.put("ModificationDate", formatDate(info.getModificationDate())); - jsonOutput.set("Metadata", metadata); + String contentType = file.getContentType(); + if (contentType != null && !"application/pdf".equals(contentType)) { + log.warn("File content type is {}, 
expected application/pdf", contentType); + } + } - // Total file size of the PDF - long fileSizeInBytes = inputFile.getSize(); - basicInfo.put("FileSizeInBytes", fileSizeInBytes); + private static ResponseEntity createErrorResponse(String errorMessage) { + try { + ObjectNode errorNode = objectMapper.createObjectNode(); + errorNode.put("error", errorMessage); + errorNode.put("timestamp", System.currentTimeMillis()); - // Number of words, paragraphs, and images in the entire document - String fullText = new PDFTextStripper().getText(pdfBoxDoc); - String[] words = RegexPatternUtils.getInstance().getWhitespacePattern().split(fullText); - int wordCount = words.length; - int paragraphCount = - RegexPatternUtils.getInstance() - .getMultiFormatNewlinePattern() - .split(fullText) - .length; - basicInfo.put("WordCount", wordCount); - basicInfo.put("ParagraphCount", paragraphCount); - // Number of characters in the entire document (including spaces and special characters) - int charCount = fullText.length(); - basicInfo.put("CharacterCount", charCount); + String jsonString = + objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(errorNode); + return WebResponseUtils.bytesToWebResponse( + jsonString.getBytes(StandardCharsets.UTF_8), + "error.json", + MediaType.APPLICATION_JSON); + } catch (Exception e) { + log.error("Failed to create error response", e); + return ResponseEntity.internalServerError().build(); + } + } - // Initialize the flags and types - boolean hasCompression = false; - String compressionType = "None"; + private static ObjectNode extractMetadata(PDDocument document) { + ObjectNode metadata = objectMapper.createObjectNode(); - basicInfo.put("Compression", hasCompression); - if (hasCompression) basicInfo.put("CompressionType", compressionType); + try { + PDDocumentInformation info = document.getDocumentInformation(); + if (info != null) { + if (info.getTitle() != null) metadata.put("Title", info.getTitle()); + if (info.getAuthor() != null) 
metadata.put("Author", info.getAuthor()); + if (info.getSubject() != null) metadata.put("Subject", info.getSubject()); + if (info.getKeywords() != null) metadata.put("Keywords", info.getKeywords()); + if (info.getProducer() != null) metadata.put("Producer", info.getProducer()); + if (info.getCreator() != null) metadata.put("Creator", info.getCreator()); - String language = pdfBoxDoc.getDocumentCatalog().getLanguage(); - basicInfo.put("Language", language); - basicInfo.put("Number of pages", pdfBoxDoc.getNumberOfPages()); + String creationDate = + formatDate( + info.getCreationDate() != null + ? info.getCreationDate().toInstant() + : null); + if (creationDate != null) { + metadata.put("CreationDate", creationDate); + } + + String modificationDate = + formatDate( + info.getModificationDate() != null + ? info.getModificationDate().toInstant() + : null); + if (modificationDate != null) { + metadata.put("ModificationDate", modificationDate); + } + } + } catch (Exception e) { + log.error("Error extracting metadata: {}", e.getMessage()); + } + + return metadata; + } + + private static ObjectNode extractDocumentInfo(PDDocument document) { + ObjectNode docInfoNode = objectMapper.createObjectNode(); + + try { + PDDocumentInformation info = document.getDocumentInformation(); + PDDocumentCatalog catalog = document.getDocumentCatalog(); + + docInfoNode.put("PDF version", document.getVersion()); + if (info != null) { + docInfoNode.put("Trapped", info.getTrapped()); + } - PDDocumentCatalog catalog = pdfBoxDoc.getDocumentCatalog(); String pageMode = catalog.getPageMode().name(); - - // Document Information using PDFBox - docInfoNode.put("PDF version", pdfBoxDoc.getVersion()); - docInfoNode.put("Trapped", info.getTrapped()); docInfoNode.put("Page Mode", getPageModeDescription(pageMode)); - PDAcroForm acroForm = pdfBoxDoc.getDocumentCatalog().getAcroForm(); + } catch (Exception e) { + log.error("Error extracting document info: {}", e.getMessage()); + } - ObjectNode formFieldsNode = 
objectMapper.createObjectNode(); + return docInfoNode; + } + + private static ObjectNode extractComplianceInfo(PDDocument document) { + ObjectNode compliancy = objectMapper.createObjectNode(); + + try { + String pdfaConformanceLevel = getPdfAConformanceLevel(document); + boolean isPdfACompliant = pdfaConformanceLevel != null; + boolean isPdfXCompliant = checkForStandard(document, "PDF/X"); + boolean isPdfECompliant = checkForStandard(document, "PDF/E"); + boolean isPdfVTCompliant = checkForStandard(document, "PDF/VT"); + boolean isPdfUACompliant = checkForStandard(document, "PDF/UA"); + boolean isPdfBCompliant = checkForStandard(document, "PDF/B"); + boolean isPdfSECCompliant = checkForStandard(document, "PDF/SEC"); + + compliancy.put("IsPDF/ACompliant", isPdfACompliant); + if (pdfaConformanceLevel != null) { + compliancy.put("PDF/AConformanceLevel", pdfaConformanceLevel); + Boolean pdfaValidationPassed = + validatePdfAWithPreflight(document, pdfaConformanceLevel); + compliancy.put("IsPDF/AValidated", pdfaValidationPassed); + } + compliancy.put("IsPDF/XCompliant", isPdfXCompliant); + compliancy.put("IsPDF/ECompliant", isPdfECompliant); + compliancy.put("IsPDF/VTCompliant", isPdfVTCompliant); + compliancy.put("IsPDF/UACompliant", isPdfUACompliant); + compliancy.put("IsPDF/BCompliant", isPdfBCompliant); + compliancy.put("IsPDF/SECCompliant", isPdfSECCompliant); + + } catch (Exception e) { + log.error("Error extracting compliance info: {}", e.getMessage()); + } + + return compliancy; + } + + private static ObjectNode extractEncryptionInfo(PDDocument document) { + ObjectNode encryption = objectMapper.createObjectNode(); + + try { + if (document.isEncrypted()) { + encryption.put("IsEncrypted", true); + + PDEncryption pdfEncryption = document.getEncryption(); + if (pdfEncryption != null) { + encryption.put("EncryptionAlgorithm", pdfEncryption.getFilter()); + encryption.put("KeyLength", pdfEncryption.getLength()); + encryption.put("Version", pdfEncryption.getVersion()); + 
encryption.put("Revision", pdfEncryption.getRevision()); + } + } else { + encryption.put("IsEncrypted", false); + } + } catch (Exception e) { + log.error("Error extracting encryption info: {}", e.getMessage()); + } + + return encryption; + } + + // Extracts permissions information + private static ObjectNode extractPermissions(PDDocument document) { + ObjectNode permissionsNode = objectMapper.createObjectNode(); + + try { + setNodePermissions(document, permissionsNode); + } catch (Exception e) { + log.error("Error extracting permissions: {}", e.getMessage()); + } + + return permissionsNode; + } + + private static ObjectNode extractFormFields(PDDocument document) { + ObjectNode formFieldsNode = objectMapper.createObjectNode(); + + try { + PDAcroForm acroForm = document.getDocumentCatalog().getAcroForm(); if (acroForm != null) { for (PDField field : acroForm.getFieldTree()) { formFieldsNode.put(field.getFullyQualifiedName(), field.getValueAsString()); } } - jsonOutput.set("FormFields", formFieldsNode); + } catch (Exception e) { + log.error("Error extracting form fields: {}", e.getMessage()); + } - String pdfaConformanceLevel = getPdfAConformanceLevel(pdfBoxDoc); - Boolean pdfaValidationPassed = null; - if (pdfaConformanceLevel != null) { - pdfaValidationPassed = validatePdfAWithPreflight(pdfBoxDoc, pdfaConformanceLevel); + return formFieldsNode; + } + + private static ObjectNode extractOtherInfo(PDDocument document) { + ObjectNode other = objectMapper.createObjectNode(); + + try { + PDDocumentCatalog catalog = document.getDocumentCatalog(); + + // Embedded files + ArrayNode embeddedFilesArray = extractEmbeddedFiles(catalog); + other.set("EmbeddedFiles", embeddedFilesArray); + + // Attachments + ArrayNode attachmentsArray = extractAttachments(document); + other.set("Attachments", attachmentsArray); + + // JavaScript with security analysis + ArrayNode javascriptArray = extractJavaScript(catalog); + other.set("JavaScript", javascriptArray); + + // Layers + ArrayNode 
layersArray = extractLayers(document); + other.set("Layers", layersArray); + + // Bookmarks + ArrayNode bookmarksArray = extractBookmarks(document); + other.set("Bookmarks/Outline/TOC", bookmarksArray); + + // XMP Metadata + String xmpString = extractXMPMetadata(document); + other.put("XMPMetadata", xmpString); + + // Structure tree + try { + PDStructureTreeRoot structureTreeRoot = + document.getDocumentCatalog().getStructureTreeRoot(); + if (structureTreeRoot != null) { + ArrayNode structureTreeArray = + exploreStructureTree(structureTreeRoot.getKids()); + other.set("StructureTree", structureTreeArray); + } + } catch (Exception e) { + log.error("Error extracting structure tree: {}", e.getMessage()); } - // Generate structured summary data about PDF characteristics - ObjectNode summaryData = - generatePDFSummaryData(pdfBoxDoc, pdfaConformanceLevel, pdfaValidationPassed); - if (summaryData != null && !summaryData.isEmpty()) { - jsonOutput.set("SummaryData", summaryData); - } + } catch (Exception e) { + log.error("Error extracting other info: {}", e.getMessage()); + } - // embeed files TODO size + return other; + } + + private static ArrayNode extractEmbeddedFiles(PDDocumentCatalog catalog) { + ArrayNode embeddedFilesArray = objectMapper.createArrayNode(); + + try { if (catalog.getNames() != null) { PDEmbeddedFilesNameTreeNode efTree = catalog.getNames().getEmbeddedFiles(); - ArrayNode embeddedFilesArray = objectMapper.createArrayNode(); if (efTree != null) { Map efMap = efTree.getNames(); if (efMap != null) { @@ -574,37 +716,75 @@ public class GetInfoOnPDF { efMap.entrySet()) { ObjectNode embeddedFileNode = objectMapper.createObjectNode(); embeddedFileNode.put("Name", entry.getKey()); + PDEmbeddedFile embeddedFile = entry.getValue().getEmbeddedFile(); if (embeddedFile != null) { + embeddedFileNode.put("FileSize", embeddedFile.getLength()); + embeddedFileNode.put("MimeType", embeddedFile.getSubtype()); embeddedFileNode.put( - "FileSize", embeddedFile.getLength()); // 
size in bytes + "CreationDate", + formatDate( + embeddedFile.getCreationDate() != null + ? embeddedFile.getCreationDate().toInstant() + : null)); + embeddedFileNode.put( + "ModificationDate", + formatDate( + embeddedFile.getModDate() != null + ? embeddedFile.getModDate().toInstant() + : null)); } embeddedFilesArray.add(embeddedFileNode); } } } - other.set("EmbeddedFiles", embeddedFilesArray); } + } catch (Exception e) { + log.error("Error extracting embedded files: {}", e.getMessage()); + } - // attachments TODO size - ArrayNode attachmentsArray = objectMapper.createArrayNode(); - for (PDPage page : pdfBoxDoc.getPages()) { + return embeddedFilesArray; + } + + private static ArrayNode extractAttachments(PDDocument document) { + ArrayNode attachmentsArray = objectMapper.createArrayNode(); + + try { + for (PDPage page : document.getPages()) { for (PDAnnotation annotation : page.getAnnotations()) { if (annotation instanceof PDAnnotationFileAttachment fileAttachmentAnnotation) { ObjectNode attachmentNode = objectMapper.createObjectNode(); attachmentNode.put("Name", fileAttachmentAnnotation.getAttachmentName()); attachmentNode.put("Description", fileAttachmentAnnotation.getContents()); + // Try to get file size + try { + PDComplexFileSpecification fileSpec = + (PDComplexFileSpecification) fileAttachmentAnnotation.getFile(); + if (fileSpec != null && fileSpec.getEmbeddedFile() != null) { + attachmentNode.put( + "FileSize", fileSpec.getEmbeddedFile().getLength()); + } + } catch (Exception e) { + log.debug("Could not get attachment file size: {}", e.getMessage()); + } + attachmentsArray.add(attachmentNode); } } } - other.set("Attachments", attachmentsArray); + } catch (Exception e) { + log.error("Error extracting attachments: {}", e.getMessage()); + } - // Javascript + return attachmentsArray; + } + + private static ArrayNode extractJavaScript(PDDocumentCatalog catalog) { + ArrayNode javascriptArray = objectMapper.createArrayNode(); + + try { PDDocumentNameDictionary 
namesDict = catalog.getNames(); - ArrayNode javascriptArray = objectMapper.createArrayNode(); - if (namesDict != null) { PDJavascriptNameTreeNode javascriptDict = namesDict.getJavaScript(); if (javascriptDict != null) { @@ -626,17 +806,23 @@ public class GetInfoOnPDF { javascriptArray.add(jsNode); } } catch (IOException e) { - log.error("exception", e); + log.error("Error reading JavaScript entries: {}", e.getMessage()); } } } - other.set("JavaScript", javascriptArray); + } catch (Exception e) { + log.error("Error extracting JavaScript: {}", e.getMessage()); + } - // TODO size + return javascriptArray; + } + + private static ArrayNode extractLayers(PDDocument document) { + ArrayNode layersArray = objectMapper.createArrayNode(); + + try { PDOptionalContentProperties ocProperties = - pdfBoxDoc.getDocumentCatalog().getOCProperties(); - ArrayNode layersArray = objectMapper.createArrayNode(); - + document.getDocumentCatalog().getOCProperties(); if (ocProperties != null) { for (PDOptionalContentGroup ocg : ocProperties.getOptionalContentGroups()) { ObjectNode layerNode = objectMapper.createObjectNode(); @@ -644,350 +830,422 @@ public class GetInfoOnPDF { layersArray.add(layerNode); } } + } catch (Exception e) { + log.error("Error extracting layers: {}", e.getMessage()); + } - other.set("Layers", layersArray); + return layersArray; + } - // TODO Security - - PDStructureTreeRoot structureTreeRoot = - pdfBoxDoc.getDocumentCatalog().getStructureTreeRoot(); - ArrayNode structureTreeArray; - try { - if (structureTreeRoot != null) { - structureTreeArray = exploreStructureTree(structureTreeRoot.getKids()); - other.set("StructureTree", structureTreeArray); - } - } catch (Exception e) { - // TODO Auto-generated catch block - log.error("exception", e); - } - - boolean isPdfACompliant = pdfaConformanceLevel != null; - boolean isPdfXCompliant = checkForStandard(pdfBoxDoc, "PDF/X"); - boolean isPdfECompliant = checkForStandard(pdfBoxDoc, "PDF/E"); - boolean isPdfVTCompliant = 
checkForStandard(pdfBoxDoc, "PDF/VT"); - boolean isPdfUACompliant = checkForStandard(pdfBoxDoc, "PDF/UA"); - boolean isPdfBCompliant = - checkForStandard( - pdfBoxDoc, - "PDF/B"); // If you want to check for PDF/Broadcast, though this isn't - // an official ISO standard. - boolean isPdfSECCompliant = - checkForStandard( - pdfBoxDoc, - "PDF/SEC"); // This might not be effective since PDF/SEC was under - // development in 2021. - - compliancy.put("IsPDF/ACompliant", isPdfACompliant); - if (pdfaConformanceLevel != null) { - compliancy.put("PDF/AConformanceLevel", pdfaConformanceLevel); - if (pdfaValidationPassed != null) { - compliancy.put("IsPDF/AValidated", pdfaValidationPassed); - } - } - compliancy.put("IsPDF/XCompliant", isPdfXCompliant); - compliancy.put("IsPDF/ECompliant", isPdfECompliant); - compliancy.put("IsPDF/VTCompliant", isPdfVTCompliant); - compliancy.put("IsPDF/UACompliant", isPdfUACompliant); - compliancy.put("IsPDF/BCompliant", isPdfBCompliant); - compliancy.put("IsPDF/SECCompliant", isPdfSECCompliant); - - PDOutlineNode root = pdfBoxDoc.getDocumentCatalog().getDocumentOutline(); - ArrayNode bookmarksArray = objectMapper.createArrayNode(); + private static ArrayNode extractBookmarks(PDDocument document) { + ArrayNode bookmarksArray = objectMapper.createArrayNode(); + try { + PDOutlineNode root = document.getDocumentCatalog().getDocumentOutline(); if (root != null) { for (PDOutlineItem child : root.children()) { addOutlinesToArray(child, bookmarksArray); } } + } catch (Exception e) { + log.error("Error extracting bookmarks: {}", e.getMessage()); + } - other.set("Bookmarks/Outline/TOC", bookmarksArray); + return bookmarksArray; + } - PDMetadata pdMetadata = pdfBoxDoc.getDocumentCatalog().getMetadata(); - - String xmpString = null; + private static String extractXMPMetadata(PDDocument document) { + String xmpString = null; + try { + PDMetadata pdMetadata = document.getDocumentCatalog().getMetadata(); if (pdMetadata != null) { try { - try (COSInputStream 
is = pdMetadata.createInputStream()) { + try (COSInputStream inputStream = pdMetadata.createInputStream()) { DomXmpParser domXmpParser = new DomXmpParser(); - XMPMetadata xmpMeta = domXmpParser.parse(is); + XMPMetadata xmpMeta = domXmpParser.parse(inputStream); - ByteArrayOutputStream os = new ByteArrayOutputStream(); - new XmpSerializer().serialize(xmpMeta, os, true); - xmpString = new String(os.toByteArray(), StandardCharsets.UTF_8); + ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + new XmpSerializer().serialize(xmpMeta, outputStream, true); + xmpString = outputStream.toString(StandardCharsets.UTF_8); } } catch (XmpParsingException e) { - // XMP parsing failed, try to read raw metadata instead log.debug("XMP parsing failed, reading raw metadata: {}", e.getMessage()); - try (COSInputStream is = pdMetadata.createInputStream()) { - byte[] metadataBytes = is.readAllBytes(); + try (COSInputStream inputStream = pdMetadata.createInputStream()) { + byte[] metadataBytes = inputStream.readAllBytes(); xmpString = new String(metadataBytes, StandardCharsets.UTF_8); } - } catch (IOException e) { - log.error("exception", e); + } + } + } catch (Exception e) { + log.error("Error extracting XMP metadata: {}", e.getMessage()); + } + + return xmpString; + } + + private static ObjectNode extractPerPageInfo(PDDocument document) { + ObjectNode pageInfoParent = objectMapper.createObjectNode(); + + try { + int pageCount = document.getNumberOfPages(); + StringBuilder keyBuilder = new StringBuilder(PAGE_PREFIX.length() + 8); + + for (int pageNum = 0; pageNum < pageCount; pageNum++) { + try { + PDPage page = document.getPage(pageNum); + ObjectNode pageInfo = extractSinglePageInfo(document, page, pageNum); + + keyBuilder.setLength(0); + keyBuilder.append(PAGE_PREFIX).append(pageNum + 1); + pageInfoParent.set(keyBuilder.toString(), pageInfo); + } catch (Exception e) { + log.error("Error extracting info for page {}: {}", pageNum + 1, e.getMessage()); + } + } + } catch 
(Exception e) { + log.error("Error extracting per-page info: {}", e.getMessage()); + } + + return pageInfoParent; + } + + private static ObjectNode extractSinglePageInfo(PDDocument document, PDPage page, int pageNum) + throws IOException { + ObjectNode pageInfo = objectMapper.createObjectNode(); + + // Page size and dimensions + PDRectangle mediaBox = page.getMediaBox(); + float width = mediaBox.getWidth(); + float height = mediaBox.getHeight(); + + ObjectNode sizeInfo = objectMapper.createObjectNode(); + setDimensionInfo(sizeInfo, width, height); + sizeInfo.put("Standard Page", getPageSize(width, height)); + pageInfo.set("Size", sizeInfo); + + pageInfo.put("Rotation", page.getRotation()); + pageInfo.put("Page Orientation", getPageOrientation(width, height)); + + // Page boxes + pageInfo.put("MediaBox", mediaBox.toString()); + pageInfo.put( + "CropBox", page.getCropBox() == null ? "Undefined" : page.getCropBox().toString()); + pageInfo.put( + "BleedBox", + page.getBleedBox() == null ? "Undefined" : page.getBleedBox().toString()); + pageInfo.put( + "TrimBox", page.getTrimBox() == null ? "Undefined" : page.getTrimBox().toString()); + pageInfo.put( + "ArtBox", page.getArtBox() == null ? 
"Undefined" : page.getArtBox().toString()); + + // Text content + PDFTextStripper textStripper = new PDFTextStripper(); + textStripper.setStartPage(pageNum + 1); + textStripper.setEndPage(pageNum + 1); + String pageText = textStripper.getText(document); + pageInfo.put("Text Characters Count", pageText.length()); + + // Annotations + ObjectNode annotationsObject = extractPageAnnotations(page); + pageInfo.set("Annotations", annotationsObject); + + // Resources + PDResources resources = page.getResources(); + if (resources != null) { + // Images + ArrayNode imagesArray = extractPageImages(resources); + pageInfo.set("Images", imagesArray); + + // Links + ArrayNode linksArray = extractPageLinks(page); + pageInfo.set("Links", linksArray); + + // Fonts + ArrayNode fontsArray = extractPageFonts(resources); + pageInfo.set("Fonts", fontsArray); + + // XObjects count + ObjectNode xObjectCountNode = extractPageXObjects(resources); + pageInfo.set("XObjectCounts", xObjectCountNode); + } + + // Multimedia + ArrayNode multimediaArray = extractPageMultimedia(page); + pageInfo.set("Multimedia", multimediaArray); + + return pageInfo; + } + + private static ObjectNode extractPageAnnotations(PDPage page) throws IOException { + ObjectNode annotationsObject = objectMapper.createObjectNode(); + + List annotations = page.getAnnotations(); + int subtypeCount = 0; + int contentsCount = 0; + + for (PDAnnotation annotation : annotations) { + if (annotation.getSubtype() != null) { + subtypeCount++; + } + if (annotation.getContents() != null) { + contentsCount++; + } + } + + annotationsObject.put("AnnotationsCount", annotations.size()); + annotationsObject.put("SubtypeCount", subtypeCount); + annotationsObject.put("ContentsCount", contentsCount); + + return annotationsObject; + } + + private static ArrayNode extractPageImages(PDResources resources) { + ArrayNode imagesArray = objectMapper.createArrayNode(); + + try { + for (COSName name : resources.getXObjectNames()) { + PDXObject xObject = 
resources.getXObject(name); + if (xObject instanceof PDImageXObject image) { + ObjectNode imageNode = objectMapper.createObjectNode(); + imageNode.put("Width", image.getWidth()); + imageNode.put("Height", image.getHeight()); + + if (image.getMetadata() != null + && image.getMetadata().getFile() != null + && image.getMetadata().getFile().getFile() != null) { + imageNode.put("Name", image.getMetadata().getFile().getFile()); + } + + if (image.getColorSpace() != null) { + imageNode.put("ColorSpace", image.getColorSpace().getName()); + } + + imageNode.put("BitsPerComponent", image.getBitsPerComponent()); + + imagesArray.add(imageNode); + } + } + } catch (Exception e) { + log.error("Error extracting page images: {}", e.getMessage()); + } + + return imagesArray; + } + + private static ArrayNode extractPageLinks(PDPage page) throws IOException { + ArrayNode linksArray = objectMapper.createArrayNode(); + Set uniqueURIs = new HashSet<>(); + + List annotations = page.getAnnotations(); + for (PDAnnotation annotation : annotations) { + if (annotation instanceof PDAnnotationLink linkAnnotation + && linkAnnotation.getAction() instanceof PDActionURI uriAction) { + String uri = uriAction.getURI(); + uniqueURIs.add(uri); + } + } + + for (String uri : uniqueURIs) { + ObjectNode linkNode = objectMapper.createObjectNode(); + linkNode.put("URI", uri); + linksArray.add(linkNode); + } + + return linksArray; + } + + private static ArrayNode extractPageFonts(PDResources resources) { + ArrayNode fontsArray = objectMapper.createArrayNode(); + Map uniqueFontsMap = new HashMap<>(); + + try { + for (COSName fontName : resources.getFontNames()) { + PDFont font = resources.getFont(fontName); + ObjectNode fontNode = objectMapper.createObjectNode(); + + fontNode.put("IsEmbedded", font.isEmbedded()); + fontNode.put("Name", font.getName()); + fontNode.put("Subtype", font.getType()); + + PDFontDescriptor fontDescriptor = font.getFontDescriptor(); + if (fontDescriptor != null) { + 
fontNode.put("ItalicAngle", fontDescriptor.getItalicAngle()); + int flags = fontDescriptor.getFlags(); + fontNode.put("IsItalic", (flags & 1) != 0); + fontNode.put("IsBold", (flags & 64) != 0); + fontNode.put("IsFixedPitch", (flags & 2) != 0); + fontNode.put("IsSerif", (flags & 4) != 0); + fontNode.put("IsSymbolic", (flags & 8) != 0); + fontNode.put("IsScript", (flags & 16) != 0); + fontNode.put("IsNonsymbolic", (flags & 32) != 0); + fontNode.put("FontFamily", fontDescriptor.getFontFamily()); + fontNode.put("FontWeight", fontDescriptor.getFontWeight()); + } + + String uniqueKey = fontNode.toString(); + if (uniqueFontsMap.containsKey(uniqueKey)) { + ObjectNode existingFontNode = uniqueFontsMap.get(uniqueKey); + int count = existingFontNode.get("Count").asInt() + 1; + existingFontNode.put("Count", count); + } else { + fontNode.put("Count", 1); + uniqueFontsMap.put(uniqueKey, fontNode); } } - other.put("XMPMetadata", xmpString); - - if (pdfBoxDoc.isEncrypted()) { - encryption.put("IsEncrypted", true); - - // Retrieve encryption details using getEncryption() - PDEncryption pdfEncryption = pdfBoxDoc.getEncryption(); - encryption.put("EncryptionAlgorithm", pdfEncryption.getFilter()); - encryption.put("KeyLength", pdfEncryption.getLength()); - // Add other encryption-related properties as needed - } else { - encryption.put("IsEncrypted", false); + for (ObjectNode uniqueFontNode : uniqueFontsMap.values()) { + fontsArray.add(uniqueFontNode); } + } catch (Exception e) { + log.error("Error extracting page fonts: {}", e.getMessage()); + } - ObjectNode permissionsNode = objectMapper.createObjectNode(); - setNodePermissions(pdfBoxDoc, permissionsNode); + return fontsArray; + } - ObjectNode pageInfoParent = objectMapper.createObjectNode(); - for (int pageNum = 0; pageNum < pdfBoxDoc.getNumberOfPages(); pageNum++) { - ObjectNode pageInfo = objectMapper.createObjectNode(); + private static ObjectNode extractPageXObjects(PDResources resources) { + ObjectNode xObjectCountNode = 
objectMapper.createObjectNode(); + Map xObjectCountMap = new HashMap<>(); - // Retrieve the page - PDPage page = pdfBoxDoc.getPage(pageNum); + try { + for (COSName name : resources.getXObjectNames()) { + PDXObject xObject = resources.getXObject(name); + String xObjectType; - // Page-level Information - PDRectangle mediaBox = page.getMediaBox(); - - float width = mediaBox.getWidth(); - float height = mediaBox.getHeight(); - - ObjectNode sizeInfo = objectMapper.createObjectNode(); - - setDimensionInfo(sizeInfo, width, height); - - sizeInfo.put("Standard Page", getPageSize(width, height)); - pageInfo.set("Size", sizeInfo); - - pageInfo.put("Rotation", page.getRotation()); - pageInfo.put("Page Orientation", getPageOrientation(width, height)); - - // Boxes - pageInfo.put("MediaBox", mediaBox.toString()); - - // Assuming the following boxes are defined for your document; if not, you may get - // null values. - PDRectangle cropBox = page.getCropBox(); - pageInfo.put("CropBox", cropBox == null ? "Undefined" : cropBox.toString()); - - PDRectangle bleedBox = page.getBleedBox(); - pageInfo.put("BleedBox", bleedBox == null ? "Undefined" : bleedBox.toString()); - - PDRectangle trimBox = page.getTrimBox(); - pageInfo.put("TrimBox", trimBox == null ? "Undefined" : trimBox.toString()); - - PDRectangle artBox = page.getArtBox(); - pageInfo.put("ArtBox", artBox == null ? 
"Undefined" : artBox.toString()); - - // Content Extraction - PDFTextStripper textStripper = new PDFTextStripper(); - textStripper.setStartPage(pageNum + 1); - textStripper.setEndPage(pageNum + 1); - String pageText = textStripper.getText(pdfBoxDoc); - - pageInfo.put("Text Characters Count", pageText.length()); // - - // Annotations - - List annotations = page.getAnnotations(); - - int subtypeCount = 0; - int contentsCount = 0; - - for (PDAnnotation annotation : annotations) { - if (annotation.getSubtype() != null) { - subtypeCount++; // Increase subtype count - } - if (annotation.getContents() != null) { - contentsCount++; // Increase contents count - } + if (xObject instanceof PDImageXObject) { + xObjectType = "Image"; + } else if (xObject instanceof PDFormXObject) { + xObjectType = "Form"; + } else { + xObjectType = "Other"; } - ObjectNode annotationsObject = objectMapper.createObjectNode(); - annotationsObject.put("AnnotationsCount", annotations.size()); - annotationsObject.put("SubtypeCount", subtypeCount); - annotationsObject.put("ContentsCount", contentsCount); - pageInfo.set("Annotations", annotationsObject); + xObjectCountMap.put(xObjectType, xObjectCountMap.getOrDefault(xObjectType, 0) + 1); + } - // Images (simplified) - // This part is non-trivial as images can be embedded in multiple ways in a PDF. - // Here is a basic structure to recognize image XObjects on a page. 
- ArrayNode imagesArray = objectMapper.createArrayNode(); + for (Map.Entry entry : xObjectCountMap.entrySet()) { + xObjectCountNode.put(entry.getKey(), entry.getValue()); + } + } catch (Exception e) { + log.error("Error extracting page XObjects: {}", e.getMessage()); + } + + return xObjectCountNode; + } + + private static ArrayNode extractPageMultimedia(PDPage page) throws IOException { + ArrayNode multimediaArray = objectMapper.createArrayNode(); + + List annotations = page.getAnnotations(); + for (PDAnnotation annotation : annotations) { + if ("RichMedia".equals(annotation.getSubtype())) { + ObjectNode multimediaNode = objectMapper.createObjectNode(); + multimediaNode.put("Subtype", annotation.getSubtype()); + multimediaNode.put("Contents", annotation.getContents()); + multimediaArray.add(multimediaNode); + } + } + + return multimediaArray; + } + + private static ImageStatistics calculateImageStatistics(PDDocument document) { + ImageStatistics stats = new ImageStatistics(); + stats.totalImages = 0; + stats.uniqueImages = 0; + + try { + Set uniqueImageHashes = new HashSet<>(); + + for (PDPage page : document.getPages()) { PDResources resources = page.getResources(); + if (resources != null) { + for (COSName xObjectName : resources.getXObjectNames()) { + PDXObject xObject = resources.getXObject(xObjectName); + if (xObject instanceof PDImageXObject image) { + stats.totalImages++; - for (COSName name : resources.getXObjectNames()) { - PDXObject xObject = resources.getXObject(name); - if (xObject instanceof PDImageXObject image) { - ObjectNode imageNode = objectMapper.createObjectNode(); - imageNode.put("Width", image.getWidth()); - imageNode.put("Height", image.getHeight()); - if (image.getMetadata() != null - && image.getMetadata().getFile() != null - && image.getMetadata().getFile().getFile() != null) { - imageNode.put("Name", image.getMetadata().getFile().getFile()); + // Create a hash based on image properties + String imageHash = + String.format( + 
"%d_%d_%d_%s", + image.getWidth(), + image.getHeight(), + image.getBitsPerComponent(), + image.getSuffix()); + uniqueImageHashes.add(imageHash); } - if (image.getColorSpace() != null) { - imageNode.put("ColorSpace", image.getColorSpace().getName()); - } - - imagesArray.add(imageNode); } } - pageInfo.set("Images", imagesArray); - - // Links - ArrayNode linksArray = objectMapper.createArrayNode(); - Set uniqueURIs = new HashSet<>(); // To store unique URIs - - for (PDAnnotation annotation : annotations) { - if (annotation instanceof PDAnnotationLink linkAnnotation - && linkAnnotation.getAction() instanceof PDActionURI uriAction) { - String uri = uriAction.getURI(); - uniqueURIs.add(uri); // Add to set to ensure uniqueness - } - } - - // Add unique URIs to linksArray - for (String uri : uniqueURIs) { - ObjectNode linkNode = objectMapper.createObjectNode(); - linkNode.put("URI", uri); - linksArray.add(linkNode); - } - pageInfo.set("Links", linksArray); - - // Fonts - ArrayNode fontsArray = objectMapper.createArrayNode(); - Map uniqueFontsMap = new HashMap<>(); - - for (COSName fontName : resources.getFontNames()) { - PDFont font = resources.getFont(fontName); - ObjectNode fontNode = objectMapper.createObjectNode(); - - fontNode.put("IsEmbedded", font.isEmbedded()); - - // PDFBox provides Font's BaseFont (i.e., the font name) directly - fontNode.put("Name", font.getName()); - - fontNode.put("Subtype", font.getType()); - - PDFontDescriptor fontDescriptor = font.getFontDescriptor(); - - if (fontDescriptor != null) { - fontNode.put("ItalicAngle", fontDescriptor.getItalicAngle()); - int flags = fontDescriptor.getFlags(); - fontNode.put("IsItalic", (flags & 1) != 0); - fontNode.put("IsBold", (flags & 64) != 0); - fontNode.put("IsFixedPitch", (flags & 2) != 0); - fontNode.put("IsSerif", (flags & 4) != 0); - fontNode.put("IsSymbolic", (flags & 8) != 0); - fontNode.put("IsScript", (flags & 16) != 0); - fontNode.put("IsNonsymbolic", (flags & 32) != 0); - - 
fontNode.put("FontFamily", fontDescriptor.getFontFamily()); - // Font stretch and BBox are not directly available in PDFBox's API, so - // these are omitted for simplicity - fontNode.put("FontWeight", fontDescriptor.getFontWeight()); - } - - // Create a unique key for this font node based on its attributes - String uniqueKey = fontNode.toString(); - - // Increment count if this font exists, or initialize it if new - if (uniqueFontsMap.containsKey(uniqueKey)) { - ObjectNode existingFontNode = uniqueFontsMap.get(uniqueKey); - int count = existingFontNode.get("Count").asInt() + 1; - existingFontNode.put("Count", count); - } else { - fontNode.put("Count", 1); - uniqueFontsMap.put(uniqueKey, fontNode); - } - } - - // Add unique font entries to fontsArray - for (ObjectNode uniqueFontNode : uniqueFontsMap.values()) { - fontsArray.add(uniqueFontNode); - } - - pageInfo.set("Fonts", fontsArray); - - // Access resources dictionary - ArrayNode colorSpacesArray = objectMapper.createArrayNode(); - - Iterable colorSpaceNames = resources.getColorSpaceNames(); - for (COSName name : colorSpaceNames) { - PDColorSpace colorSpace = resources.getColorSpace(name); - if (colorSpace instanceof PDICCBased iccBased) { - PDStream iccData = iccBased.getPDStream(); - byte[] iccBytes = iccData.toByteArray(); - - // TODO: Further decode and analyze the ICC data if needed - ObjectNode iccProfileNode = objectMapper.createObjectNode(); - iccProfileNode.put("ICC Profile Length", iccBytes.length); - colorSpacesArray.add(iccProfileNode); - } - } - pageInfo.set("Color Spaces & ICC Profiles", colorSpacesArray); - - // Other XObjects - Map xObjectCountMap = - new HashMap<>(); // To store the count for each type - for (COSName name : resources.getXObjectNames()) { - PDXObject xObject = resources.getXObject(name); - String xObjectType; - - if (xObject instanceof PDImageXObject) { - xObjectType = "Image"; - } else if (xObject instanceof PDFormXObject) { - xObjectType = "Form"; - } else { - xObjectType = 
"Other"; - } - - // Increment the count for this type in the map - xObjectCountMap.put( - xObjectType, xObjectCountMap.getOrDefault(xObjectType, 0) + 1); - } - - // Add the count map to pageInfo (or wherever you want to store it) - ObjectNode xObjectCountNode = objectMapper.createObjectNode(); - for (Map.Entry entry : xObjectCountMap.entrySet()) { - xObjectCountNode.put(entry.getKey(), entry.getValue()); - } - pageInfo.set("XObjectCounts", xObjectCountNode); - - ArrayNode multimediaArray = objectMapper.createArrayNode(); - - for (PDAnnotation annotation : annotations) { - if ("RichMedia".equals(annotation.getSubtype())) { - ObjectNode multimediaNode = objectMapper.createObjectNode(); - // Extract details from the annotation as needed - multimediaArray.add(multimediaNode); - } - } - - pageInfo.set("Multimedia", multimediaArray); - keyBuilder.setLength(prefixLength); - keyBuilder.append(pageNum + 1); - - pageInfoParent.set(keyBuilder.toString(), pageInfo); } + stats.uniqueImages = uniqueImageHashes.size(); + } catch (Exception e) { + log.error("Error calculating image statistics: {}", e.getMessage()); + } + + return stats; + } + + @PostMapping(consumes = MediaType.MULTIPART_FORM_DATA_VALUE, value = "/get-info-on-pdf") + @Operation( + summary = "Get comprehensive PDF information", + description = + "Extracts all available information from a PDF file. 
Input:PDF Output:JSON Type:SISO") + public ResponseEntity getPdfInfo(@ModelAttribute PDFFile request) throws IOException { + MultipartFile inputFile = request.getFileInput(); + + // Validate input + try { + validatePdfFile(inputFile); + } catch (IllegalArgumentException e) { + log.error("Invalid PDF file: {}", e.getMessage()); + return createErrorResponse("Invalid PDF file: " + e.getMessage()); + } + + boolean readonly = true; + + try (PDDocument pdfBoxDoc = pdfDocumentFactory.load(inputFile, readonly)) { + ObjectNode jsonOutput = objectMapper.createObjectNode(); + + ObjectNode metadata = extractMetadata(pdfBoxDoc); + ObjectNode basicInfo = extractBasicInfo(pdfBoxDoc, inputFile.getSize()); + ObjectNode docInfoNode = extractDocumentInfo(pdfBoxDoc); + ObjectNode compliancy = extractComplianceInfo(pdfBoxDoc); + ObjectNode encryption = extractEncryptionInfo(pdfBoxDoc); + ObjectNode permissionsNode = extractPermissions(pdfBoxDoc); + ObjectNode other = extractOtherInfo(pdfBoxDoc); + ObjectNode formFieldsNode = extractFormFields(pdfBoxDoc); + + // Generate summary data + String pdfaConformanceLevel = getPdfAConformanceLevel(pdfBoxDoc); + Boolean pdfaValidationPassed = null; + if (pdfaConformanceLevel != null) { + pdfaValidationPassed = validatePdfAWithPreflight(pdfBoxDoc, pdfaConformanceLevel); + } + ObjectNode summaryData = + generatePDFSummaryData(pdfBoxDoc, pdfaConformanceLevel, pdfaValidationPassed); + + // Extract per-page information + ObjectNode pageInfoParent = extractPerPageInfo(pdfBoxDoc); + + // Assemble final JSON output + jsonOutput.set("Metadata", metadata); jsonOutput.set("BasicInfo", basicInfo); jsonOutput.set("DocumentInfo", docInfoNode); jsonOutput.set("Compliancy", compliancy); jsonOutput.set("Encryption", encryption); - jsonOutput.set("Permissions", permissionsNode); // set the node under "Permissions" + jsonOutput.set("Permissions", permissionsNode); + jsonOutput.set("FormFields", formFieldsNode); jsonOutput.set("Other", other); 
jsonOutput.set("PerPageInfo", pageInfoParent); - // Save JSON to file + if (summaryData != null && !summaryData.isEmpty()) { + jsonOutput.set("SummaryData", summaryData); + } + + // Convert to JSON string String jsonString = objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(jsonOutput); @@ -996,10 +1254,53 @@ public class GetInfoOnPDF { "response.json", MediaType.APPLICATION_JSON); + } catch (IOException e) { + log.error("IO error while processing PDF: {}", e.getMessage(), e); + return createErrorResponse("Error reading PDF file: " + e.getMessage()); } catch (Exception e) { - log.error("exception", e); + log.error("Unexpected error while processing PDF: {}", e.getMessage(), e); + return createErrorResponse("Unexpected error processing PDF: " + e.getMessage()); } - return null; + } + + private ObjectNode extractBasicInfo(PDDocument document, long fileSizeInBytes) { + ObjectNode basicInfo = objectMapper.createObjectNode(); + + try { + basicInfo.put("FileSizeInBytes", fileSizeInBytes); + + String fullText = new PDFTextStripper().getText(document); + String[] words = RegexPatternUtils.getInstance().getWhitespacePattern().split(fullText); + int paragraphCount = + RegexPatternUtils.getInstance() + .getMultiFormatNewlinePattern() + .split(fullText) + .length; + + basicInfo.put("WordCount", words.length); + basicInfo.put("ParagraphCount", paragraphCount); + basicInfo.put("CharacterCount", fullText.length()); + + String language = document.getDocumentCatalog().getLanguage(); + if (language != null) { + basicInfo.put("Language", language); + } + basicInfo.put("Number of pages", document.getNumberOfPages()); + + ImageStatistics imageStats = calculateImageStatistics(document); + basicInfo.put("TotalImages", imageStats.totalImages); + basicInfo.put("UniqueImages", imageStats.uniqueImages); + + } catch (Exception e) { + log.error("Error extracting basic info: {}", e.getMessage()); + } + + return basicInfo; + } + + private static class ImageStatistics { + int 
totalImages; + int uniqueImages; } private static String getPageModeDescription(String pageMode) { diff --git a/app/core/src/main/resources/messages_en_GB.properties b/app/core/src/main/resources/messages_en_GB.properties index eb75856d3..508dc4c45 100644 --- a/app/core/src/main/resources/messages_en_GB.properties +++ b/app/core/src/main/resources/messages_en_GB.properties @@ -1017,6 +1017,7 @@ getPdfInfo.summary.all.permissions.alert=All Permissions Allowed getPdfInfo.summary.compliance.alert={0} Compliant getPdfInfo.summary.no.compliance.alert=No Compliance Standards getPdfInfo.summary.security.section=Security Status +getPdfInfo.summary.technical.section=Technical Details getPdfInfo.section.BasicInfo=Basic Information about the PDF document including file size, page count, and language getPdfInfo.section.Metadata=Document metadata including title, author, creation date and other document properties getPdfInfo.section.DocumentInfo=Technical details about the PDF document structure and version diff --git a/app/core/src/main/resources/templates/security/get-info-on-pdf.html b/app/core/src/main/resources/templates/security/get-info-on-pdf.html index 0b64bb679..4775de6d1 100644 --- a/app/core/src/main/resources/templates/security/get-info-on-pdf.html +++ b/app/core/src/main/resources/templates/security/get-info-on-pdf.html @@ -81,6 +81,29 @@ + +
+
Technical Details
+
+
+
    +
  • Images: -
  • +
  • Fonts: -
  • +
  • Form Fields: -
  • +
  • Embedded Files: -
  • +
+
+
+
    +
  • JavaScript: -
  • +
  • Layers: -
  • +
  • Bookmarks: -
  • +
  • Multimedia: -
  • +
+
+
+
+
@@ -121,6 +144,7 @@ const getPdfInfoSummaryAllPermissionsAlert = /*[[#{getPdfInfo.summary.all.permissions.alert}]]*/ "All Permissions Allowed"; const getPdfInfoSummaryComplianceAlert = /*[[#{getPdfInfo.summary.compliance.alert}]]*/ "{0} Compliant"; const getPdfInfoSummaryNoComplianceAlert = /*[[#{getPdfInfo.summary.no.compliance.alert}]]*/ "No Compliance Standards"; + const getPdfInfoSummaryTechnicalSection = /*[[#{getPdfInfo.summary.technical.section}]]*/ "Technical Details"; // Update the summary headings document.addEventListener('DOMContentLoaded', function() { @@ -128,6 +152,7 @@ document.getElementById('summary-basic-info-heading').textContent = getPdfInfoSummaryBasicInfo; document.getElementById('summary-doc-info-heading').textContent = getPdfInfoSummaryDocInfo; document.getElementById('summary-security-heading').textContent = getPdfInfoSummarySecuritySection; + document.getElementById('summary-technical-heading').textContent = getPdfInfoSummaryTechnicalSection; }); // Pre-load section descriptions @@ -141,6 +166,16 @@ const getPdfInfoSectionFormFields = /*[[#{getPdfInfo.section.FormFields}]]*/ "Interactive form fields present in the document"; const getPdfInfoSectionPerPageInfo = /*[[#{getPdfInfo.section.PerPageInfo}]]*/ "Detailed information about each page in the document"; + /** + * Form submission handler for PDF info extraction. + * + * Process: + * 1. Submit PDF to backend endpoint + * 2. Receive JSON with sections: Metadata, BasicInfo, DocumentInfo, etc. + * 3. Populate summary section from the detailed data + * 4. Display all sections in collapsible cards + * 5. Provide JSON download option + */ document.getElementById("pdfInfoForm").addEventListener("submit", function(event) { event.preventDefault(); @@ -154,14 +189,29 @@ fetchWithCsrf('api/v1/security/get-info-on-pdf', { method: 'POST', body: formData - }).then(response => response.json()).then(data => { + }).then(response => { + if (!response.ok) { + throw new Error(`HTTP error! 
status: ${response.status}`); + } + return response.json(); + }).then(data => { + // Check if response contains an error + if (data && data.error) { + console.error('Server error:', data.error); + alert('Error: ' + data.error); + return; + } + // Populate and display the enhanced PDF summary populateSummarySection(data); displayJsonData(data); setDownloadLink(data); document.getElementById("downloadJson").style.display = "block"; - }).catch(error => console.error('Error:', error)); + }).catch(error => { + console.error('Error:', error); + alert('An error occurred while processing the PDF. Please try again.'); + }); // Function to reset all summary elements to default state function resetSummaryElements() { @@ -177,6 +227,16 @@ document.getElementById('summary-created').textContent = '-'; document.getElementById('summary-modified').textContent = '-'; + // Reset technical details fields + document.getElementById('summary-images').textContent = '-'; + document.getElementById('summary-fonts').textContent = '-'; + document.getElementById('summary-form-fields').textContent = '-'; + document.getElementById('summary-embedded-files').textContent = '-'; + document.getElementById('summary-javascript').textContent = '-'; + document.getElementById('summary-layers').textContent = '-'; + document.getElementById('summary-bookmarks').textContent = '-'; + document.getElementById('summary-multimedia').textContent = '-'; + // Reset security status cards const cards = ['encryption-status', 'permissions-status', 'compliance-status']; cards.forEach(id => { @@ -208,12 +268,12 @@ resetSummaryElements(); // Get basic information - if (data.BasicInfo) { + if (data && data.BasicInfo) { document.getElementById('summary-pages').textContent = data.BasicInfo["Number of pages"] || "-"; // Format file size nicely let fileSize = data.BasicInfo["FileSizeInBytes"]; - if (fileSize) { + if (fileSize && fileSize > 0) { const sizes = ['Bytes', 'KB', 'MB', 'GB', 'TB']; const i = 
Math.floor(Math.log(fileSize) / Math.log(1024)); fileSize = (fileSize / Math.pow(1024, i)).toFixed(2) + ' ' + sizes[i]; @@ -224,18 +284,113 @@ } // Get document information - if (data.DocumentInfo) { + if (data && data.DocumentInfo) { document.getElementById('summary-version').textContent = data.DocumentInfo["PDF version"] || "-"; } // Get metadata - if (data.Metadata) { + if (data && data.Metadata) { document.getElementById('summary-title').textContent = data.Metadata["Title"] || "-"; document.getElementById('summary-author').textContent = data.Metadata["Author"] || "-"; document.getElementById('summary-created').textContent = data.Metadata["CreationDate"] || "-"; document.getElementById('summary-modified').textContent = data.Metadata["ModificationDate"] || "-"; } + // Populate technical details + if (data && data.BasicInfo) { + // Images + const totalImages = data.BasicInfo.TotalImages || 0; + const uniqueImages = data.BasicInfo.UniqueImages || 0; + if (totalImages > 0) { + document.getElementById('summary-images').textContent = `${totalImages} total (${uniqueImages} unique)`; + } else { + document.getElementById('summary-images').textContent = 'None'; + } + } + + // Count fonts from PerPageInfo + if (data && data.PerPageInfo) { + let totalFonts = 0; + let embeddedFonts = 0; + const fontSet = new Set(); + + for (const pageKey in data.PerPageInfo) { + const pageData = data.PerPageInfo[pageKey]; + + // Count fonts + if (pageData.Fonts && Array.isArray(pageData.Fonts)) { + pageData.Fonts.forEach(font => { + const fontKey = JSON.stringify([font.Name, font.Subtype]); + fontSet.add(fontKey); + if (font.IsEmbedded) { + embeddedFonts++; + } + }); + } + } + + // Display fonts + totalFonts = fontSet.size; + if (totalFonts > 0) { + document.getElementById('summary-fonts').textContent = `${totalFonts} (${embeddedFonts} embedded)`; + } else { + document.getElementById('summary-fonts').textContent = 'None'; + } + } + + // Form fields + if (data && data.FormFields) { + const 
formFieldCount = Object.keys(data.FormFields).length; + document.getElementById('summary-form-fields').textContent = formFieldCount > 0 ? formFieldCount : 'None'; + } + + // Other section data + if (data && data.Other) { + // Embedded files + const embeddedFiles = data.Other.EmbeddedFiles; + if (embeddedFiles && Array.isArray(embeddedFiles)) { + document.getElementById('summary-embedded-files').textContent = embeddedFiles.length > 0 ? embeddedFiles.length : 'None'; + } else { + document.getElementById('summary-embedded-files').textContent = 'None'; + } + + // JavaScript + const javascript = data.Other.JavaScript; + if (javascript && Array.isArray(javascript)) { + document.getElementById('summary-javascript').textContent = javascript.length > 0 ? `Yes (${javascript.length} scripts)` : 'None'; + } else { + document.getElementById('summary-javascript').textContent = 'None'; + } + + // Layers + const layers = data.Other.Layers; + if (layers && Array.isArray(layers)) { + document.getElementById('summary-layers').textContent = layers.length > 0 ? layers.length : 'None'; + } else { + document.getElementById('summary-layers').textContent = 'None'; + } + + // Bookmarks + const bookmarks = data.Other["Bookmarks/Outline/TOC"]; + if (bookmarks && Array.isArray(bookmarks)) { + document.getElementById('summary-bookmarks').textContent = bookmarks.length > 0 ? bookmarks.length : 'None'; + } else { + document.getElementById('summary-bookmarks').textContent = 'None'; + } + } + + // Count multimedia from pages + if (data && data.PerPageInfo) { + let multimediaCount = 0; + for (const pageKey in data.PerPageInfo) { + const pageData = data.PerPageInfo[pageKey]; + if (pageData.Multimedia && Array.isArray(pageData.Multimedia)) { + multimediaCount += pageData.Multimedia.length; + } + } + document.getElementById('summary-multimedia').textContent = multimediaCount > 0 ? 
multimediaCount : 'None'; + } + // Update security status cards // Encryption status @@ -257,7 +412,7 @@ const permissionsText = document.getElementById('permissions-text'); let restrictedPermissions = []; - if (data.Permissions) { + if (data && data.Permissions) { for (const [permission, state] of Object.entries(data.Permissions)) { if (state === "Not Allowed") { restrictedPermissions.push(permission); @@ -282,7 +437,7 @@ let hasCompliance = false; let compliantStandards = []; - if (data.Compliancy) { + if (data && data.Compliancy) { for (const [standard, compliant] of Object.entries(data.Compliancy)) { if (compliant === true) { hasCompliance = true; @@ -311,7 +466,7 @@ let hasSummaryInfo = false; // Create a consolidated security details card if there are security details worth highlighting - if ((data.Encryption && data.Encryption.IsEncrypted) || + if ((data && data.Encryption && data.Encryption.IsEncrypted) || restrictedPermissions.length > 0 || hasCompliance) { @@ -407,22 +562,22 @@ const summaryTextElement = document.getElementById('summary-text'); // Create a general summary for the document - let generalSummary = `This is a ${data.BasicInfo["Number of pages"] || "multi"}-page PDF`; + let generalSummary = `This is a ${(data && data.BasicInfo && data.BasicInfo["Number of pages"]) ? 
data.BasicInfo["Number of pages"] : "multi"}-page PDF`; - if (data.Metadata && data.Metadata["Title"]) { + if (data && data.Metadata && data.Metadata["Title"]) { generalSummary += ` titled "${data.Metadata["Title"]}"`; } - if (data.Metadata && data.Metadata["Author"]) { + if (data && data.Metadata && data.Metadata["Author"]) { generalSummary += ` created by ${data.Metadata["Author"]}`; } - if (data.DocumentInfo && data.DocumentInfo["PDF version"]) { + if (data && data.DocumentInfo && data.DocumentInfo["PDF version"]) { generalSummary += ` (PDF version ${data.DocumentInfo["PDF version"]})`; } // Add security information to the general summary if relevant - if (data.Encryption && data.Encryption.IsEncrypted) { + if (data && data.Encryption && data.Encryption.IsEncrypted) { generalSummary += '. The document is password protected'; if (data.Encryption.EncryptionAlgorithm) { @@ -443,42 +598,40 @@ generalSummary += `. This document complies with the ${compliantStandards.join(', ')} PDF standard${compliantStandards.length > 1 ? 's' : ''}`; } - generalSummary += '.'; + // Add interesting technical details + const technicalFeatures = []; - // Remove SummaryData from JSON to avoid duplication - if (data.SummaryData) { - delete data.SummaryData; + // Check for JavaScript + if (data && data.Other && data.Other.JavaScript && Array.isArray(data.Other.JavaScript) && data.Other.JavaScript.length > 0) { + technicalFeatures.push(`${data.Other.JavaScript.length} JavaScript script${data.Other.JavaScript.length > 1 ? 's' : ''}`); } + // Check for embedded files + if (data && data.Other && data.Other.EmbeddedFiles && Array.isArray(data.Other.EmbeddedFiles) && data.Other.EmbeddedFiles.length > 0) { + technicalFeatures.push(`${data.Other.EmbeddedFiles.length} embedded file${data.Other.EmbeddedFiles.length > 1 ? 
's' : ''}`); + } + + // Check for layers + if (data && data.Other && data.Other.Layers && Array.isArray(data.Other.Layers) && data.Other.Layers.length > 0) { + technicalFeatures.push(`${data.Other.Layers.length} layer${data.Other.Layers.length > 1 ? 's' : ''}`); + } + + // Check for form fields + if (data && data.FormFields && Object.keys(data.FormFields).length > 0) { + technicalFeatures.push(`${Object.keys(data.FormFields).length} form field${Object.keys(data.FormFields).length > 1 ? 's' : ''}`); + } + + if (technicalFeatures.length > 0) { + generalSummary += `. The PDF contains ${technicalFeatures.join(', ')}`; + } + + generalSummary += '.'; + summaryTextElement.innerHTML = generalSummary; // Display the summary section document.getElementById('pdf-summary').style.display = 'block'; } - - function generateSummaryFromData(summaryData) { - let summary = []; - - // Handle encryption information - if (summaryData.encrypted) { - summary.push(getPdfInfoSummaryEncrypted); - } - - // Handle permissions information - if (summaryData.restrictedPermissions && summaryData.restrictedPermissions.length > 0) { - const formattedPermissionsText = getPdfInfoSummaryPermissions.replace('{0}', summaryData.restrictedPermissionsCount); - summary.push(formattedPermissionsText); - } - - // Handle standard compliance information - if (summaryData.standardCompliance) { - const formattedComplianceText = getPdfInfoSummaryCompliance - .replace('{0}', summaryData.standardCompliance); - summary.push(formattedComplianceText); - } - - return summary.join(' '); - } }); function displayJsonData(jsonData) {