From 0da9c62ef84560cbab58b1a5a0bdd55aa3c96ab8 Mon Sep 17 00:00:00 2001 From: Anthony Stirling <77850077+Frooodle@users.noreply.github.com> Date: Sun, 30 Jul 2023 21:56:09 +0100 Subject: [PATCH] all info --- .../controller/api/security/PDFExtractor.java | 675 ++++++++++++++---- .../controller/web/SecurityWebController.java | 7 + .../templates/security/get-info-on-pdf.html | 33 + 3 files changed, 592 insertions(+), 123 deletions(-) create mode 100644 src/main/resources/templates/security/get-info-on-pdf.html diff --git a/src/main/java/stirling/software/SPDF/controller/api/security/PDFExtractor.java b/src/main/java/stirling/software/SPDF/controller/api/security/PDFExtractor.java index 4113691d..2cd429b4 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/security/PDFExtractor.java +++ b/src/main/java/stirling/software/SPDF/controller/api/security/PDFExtractor.java @@ -1,8 +1,23 @@ package stirling.software.SPDF.controller.api.security; +import org.apache.pdfbox.cos.COSArray; +import org.apache.pdfbox.cos.COSBase; +import org.apache.pdfbox.cos.COSDictionary; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.cos.COSString; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocumentInformation; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureElement; +import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureNode; +import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot; +import org.apache.pdfbox.pdmodel.encryption.PDEncryption; +import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm; +import org.apache.pdfbox.pdmodel.interactive.form.PDField; + import com.itextpdf.kernel.pdf.PdfObject; +import com.itextpdf.kernel.pdf.PdfOutline; import com.itextpdf.forms.PdfAcroForm; import com.itextpdf.forms.fields.PdfFormField; import com.itextpdf.kernel.geom.Rectangle; @@ -15,29 +30,64 @@ import com.itextpdf.kernel.pdf.PdfEncryption; import com.itextpdf.kernel.pdf.PdfReader; import com.itextpdf.kernel.pdf.PdfResources; import com.itextpdf.kernel.pdf.PdfStream; +import com.itextpdf.kernel.pdf.PdfString; import com.itextpdf.kernel.pdf.PdfName; import com.itextpdf.kernel.pdf.PdfViewerPreferences; import com.itextpdf.kernel.pdf.PdfWriter; import com.itextpdf.kernel.pdf.annot.PdfAnnotation; import com.itextpdf.kernel.pdf.annot.PdfFileAttachmentAnnotation; import com.itextpdf.kernel.pdf.annot.PdfLinkAnnotation; +import com.itextpdf.kernel.pdf.annot.PdfWidgetAnnotation; import com.itextpdf.kernel.pdf.layer.PdfLayer; import com.itextpdf.kernel.pdf.layer.PdfOCProperties; +import com.itextpdf.kernel.xmp.XMPException; +import com.itextpdf.kernel.xmp.XMPMeta; +import com.itextpdf.kernel.xmp.XMPMetaFactory; + +import io.swagger.v3.oas.annotations.Operation; +import io.swagger.v3.oas.annotations.Parameter; +import io.swagger.v3.oas.annotations.tags.Tag; +import stirling.software.SPDF.utils.WebResponseUtils; + import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.node.ArrayNode; import com.fasterxml.jackson.databind.node.ObjectNode; import org.apache.pdfbox.text.PDFTextStripper; +import org.springframework.http.MediaType; +import org.springframework.http.ResponseEntity; +import org.springframework.web.bind.annotation.PostMapping; +import org.springframework.web.bind.annotation.RequestPart; +import org.springframework.web.bind.annotation.RestController; +import org.springframework.web.multipart.MultipartFile; + import java.io.File; +import java.util.HashMap; import java.io.FileWriter; +import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.text.SimpleDateFormat; import java.util.Calendar; import java.util.List; import java.util.Map; - +import java.util.Set; +import java.util.HashSet; +@RestController +@Tag(name = "Security", description = "Security APIs") public class PDFExtractor { - public static void main(String[] args) { - try { - PDDocument pdfBoxDoc = PDDocument.load(new File("path_to_pdf.pdf")); + + static ObjectMapper objectMapper = new ObjectMapper(); + + @PostMapping(consumes = "multipart/form-data", value = "/get-info-on-pdf") + @Operation(summary = "Summary here", description = "desc. Input:PDF Output:JSON Type:SISO") + public ResponseEntity getPdfInfo( + @RequestPart(required = true, value = "fileInput") + @Parameter(description = "The input PDF file to get info on", required = true) MultipartFile inputFile) + throws IOException { + + try ( + PDDocument pdfBoxDoc = PDDocument.load(inputFile.getInputStream()); + PdfDocument itextDoc = new PdfDocument(new PdfReader(inputFile.getInputStream())) + ) { ObjectMapper objectMapper = new ObjectMapper(); ObjectNode jsonOutput = objectMapper.createObjectNode(); @@ -55,22 +105,256 @@ public class PDFExtractor { metadata.put("Trapped", info.getTrapped()); jsonOutput.set("Metadata", metadata); + + + // Total file size of the PDF + long fileSizeInBytes = inputFile.getSize(); + jsonOutput.put("FileSizeInBytes", fileSizeInBytes); + + // Number of words, paragraphs, and images in the entire document + String fullText = new PDFTextStripper().getText(pdfBoxDoc); + String[] words = fullText.split("\\s+"); + int wordCount = words.length; + int paragraphCount = fullText.split("\r\n|\r|\n").length; + jsonOutput.put("WordCount", wordCount); + jsonOutput.put("ParagraphCount", paragraphCount); + // Number of characters in the entire document (including spaces and special characters) + int charCount = fullText.length(); + jsonOutput.put("CharacterCount", charCount); + + + // Initialize the flags and types + boolean hasCompression = false; + String compressionType = "None"; + + // Check for object streams + for (int i = 1; i <= itextDoc.getNumberOfPdfObjects(); i++) { + PdfObject obj = itextDoc.getPdfObject(i); + if (obj != null && obj.isStream() && ((PdfStream) obj).get(PdfName.Type) == PdfName.ObjStm) { + hasCompression = true; + compressionType = "Object Streams"; + break; + } + } + + // If not compressed using object streams, check for compressed Xref tables + if (!hasCompression && itextDoc.getReader().hasRebuiltXref()) { + hasCompression = true; + compressionType = "Compressed Xref or Rebuilt Xref"; + } + jsonOutput.put("Compression", hasCompression); + if(hasCompression) + jsonOutput.put("CompressionType", compressionType); + + String language = pdfBoxDoc.getDocumentCatalog().getLanguage(); + jsonOutput.put("Language", language); + // Document Information using PDFBox ObjectNode docInfoNode = objectMapper.createObjectNode(); docInfoNode.put("Number of pages", pdfBoxDoc.getNumberOfPages()); docInfoNode.put("PDF version", pdfBoxDoc.getVersion()); - ; + // Page Mode using iText7 - PdfDocument itextDoc = new PdfDocument(new PdfReader("path_to_pdf.pdf")); PdfCatalog catalog = itextDoc.getCatalog(); PdfName pageMode = catalog.getPdfObject().getAsName(PdfName.PageMode); - ObjectNode itextDocInfo = objectMapper.createObjectNode(); + + + + PdfAcroForm acroForm = PdfAcroForm.getAcroForm(itextDoc, false); + ObjectNode formFieldsNode = objectMapper.createObjectNode(); + if (acroForm != null) { + for (Map.Entry entry : acroForm.getFormFields().entrySet()) { + formFieldsNode.put(entry.getKey(), entry.getValue().getValueAsString()); + } + } + jsonOutput.set("FormFields", formFieldsNode); + + + + + + //embeed files TODO size + ArrayNode embeddedFilesArray = objectMapper.createArrayNode(); + if(itextDoc.getCatalog().getPdfObject().getAsDictionary(PdfName.Names) != null) + { + PdfDictionary embeddedFiles = itextDoc.getCatalog().getPdfObject().getAsDictionary(PdfName.Names) + .getAsDictionary(PdfName.EmbeddedFiles); + if (embeddedFiles != null) { + + PdfArray namesArray = embeddedFiles.getAsArray(PdfName.Names); + for (int i = 0; i < namesArray.size(); i += 2) { + ObjectNode embeddedFileNode = objectMapper.createObjectNode(); + embeddedFileNode.put("Name", namesArray.getAsString(i).toString()); + // Add other details if required + embeddedFilesArray.add(embeddedFileNode); + } + + } + } + jsonOutput.set("EmbeddedFiles", embeddedFilesArray); + + //attachments TODO size + ArrayNode attachmentsArray = objectMapper.createArrayNode(); + for (int pageNum = 1; pageNum <= itextDoc.getNumberOfPages(); pageNum++) { + for (PdfAnnotation annotation : itextDoc.getPage(pageNum).getAnnotations()) { + if (annotation instanceof PdfFileAttachmentAnnotation) { + ObjectNode attachmentNode = objectMapper.createObjectNode(); + attachmentNode.put("Name", ((PdfFileAttachmentAnnotation) annotation).getName().toString()); + attachmentNode.put("Description", annotation.getContents().getValue()); + attachmentsArray.add(attachmentNode); + } + } + } + jsonOutput.set("Attachments", attachmentsArray); + + //Javascript + PdfDictionary namesDict = itextDoc.getCatalog().getPdfObject().getAsDictionary(PdfName.Names); + ArrayNode javascriptArray = objectMapper.createArrayNode(); + if (namesDict != null) { + PdfDictionary javascriptDict = namesDict.getAsDictionary(PdfName.JavaScript); + if (javascriptDict != null) { + + PdfArray namesArray = javascriptDict.getAsArray(PdfName.Names); + for (int i = 0; i < namesArray.size(); i += 2) { + ObjectNode jsNode = objectMapper.createObjectNode(); + jsNode.put("JS Name", namesArray.getAsString(i).toString()); + jsNode.put("JS Code", namesArray.getAsString(i + 1).toString()); + javascriptArray.add(jsNode); + } + + } + } + jsonOutput.set("JavaScript", javascriptArray); + + //TODO size + PdfOCProperties ocProperties = itextDoc.getCatalog().getOCProperties(false); + ArrayNode layersArray = objectMapper.createArrayNode(); + if (ocProperties != null) { + + for (PdfLayer layer : ocProperties.getLayers()) { + ObjectNode layerNode = objectMapper.createObjectNode(); + layerNode.put("Name", layer.getPdfObject().getAsString(PdfName.Name).toString()); + layersArray.add(layerNode); + } + + } + jsonOutput.set("Layers", layersArray); + + //TODO Security + + + + + + + // Digital Signatures using iText7 TODO + + + + + PDAcroForm pdAcroForm = pdfBoxDoc.getDocumentCatalog().getAcroForm(); + ArrayNode formFieldsArray2 = objectMapper.createArrayNode(); + if (pdAcroForm != null) { + + for (PDField field : pdAcroForm.getFields()) { + ObjectNode fieldNode = objectMapper.createObjectNode(); + fieldNode.put("FieldName", field.getFullyQualifiedName()); + fieldNode.put("FieldType", field.getFieldType()); + // Add more attributes as needed... + formFieldsArray2.add(fieldNode); + } + + } + jsonOutput.set("FormFields", formFieldsArray2); + + + PDStructureTreeRoot structureTreeRoot = pdfBoxDoc.getDocumentCatalog().getStructureTreeRoot(); + ArrayNode structureTreeArray; + try { + if(structureTreeRoot != null) { + structureTreeArray = exploreStructureTree(structureTreeRoot.getKids()); + jsonOutput.set("StructureTree", structureTreeArray); + } + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + + + + + + + + boolean isPdfACompliant = checkOutputIntent(itextDoc, "PDF/A"); + boolean isPdfXCompliant = checkOutputIntent(itextDoc, "PDF/X"); + boolean isPdfECompliant = checkForStandard(itextDoc, "PDF/E"); + boolean isPdfVTCompliant = checkForStandard(itextDoc, "PDF/VT"); + boolean isPdfUACompliant = checkForStandard(itextDoc, "PDF/UA"); + boolean isPdfBCompliant = checkForStandard(itextDoc, "PDF/B"); // If you want to check for PDF/Broadcast, though this isn't an official ISO standard. + boolean isPdfSECCompliant = checkForStandard(itextDoc, "PDF/SEC"); // This might not be effective since PDF/SEC was under development in 2021. + + ObjectNode compliancy = objectMapper.createObjectNode(); + compliancy.put("IsPDF/ACompliant", isPdfACompliant); + compliancy.put("IsPDF/XCompliant", isPdfXCompliant); + compliancy.put("IsPDF/ECompliant", isPdfECompliant); + compliancy.put("IsPDF/VTCompliant", isPdfVTCompliant); + compliancy.put("IsPDF/UACompliant", isPdfUACompliant); + compliancy.put("IsPDF/BCompliant", isPdfBCompliant); + compliancy.put("IsPDF/SECCompliant", isPdfSECCompliant); + + jsonOutput.set("Compliancy", compliancy); + + + + + ArrayNode bookmarksArray = objectMapper.createArrayNode(); + PdfOutline root = itextDoc.getOutlines(false); + if (root != null) { + for (PdfOutline child : root.getAllChildren()) { + addOutlinesToArray(child, bookmarksArray); + } + } + jsonOutput.set("Bookmarks/Outline/TOC", bookmarksArray); + + String xmpString = null; + try { + byte[] xmpBytes = itextDoc.getXmpMetadata(); + if (xmpBytes != null) { + XMPMeta xmpMeta = XMPMetaFactory.parseFromBuffer(xmpBytes); + xmpString = xmpMeta.dumpObject(); + + } + } catch (XMPException e) { + e.printStackTrace(); + } + jsonOutput.put("XMPMetadata", xmpString); + + + + ObjectNode encryptionNode = objectMapper.createObjectNode(); + if (pdfBoxDoc.isEncrypted()) { + encryptionNode.put("IsEncrypted", true); + + // Retrieve encryption details using getEncryption() + PDEncryption encryption = pdfBoxDoc.getEncryption(); + encryptionNode.put("EncryptionAlgorithm", encryption.getFilter()); + encryptionNode.put("KeyLength", encryption.getLength()); + encryptionNode.put("Permissions", pdfBoxDoc.getCurrentAccessPermission().toString()); + + // Add other encryption-related properties as needed + } else { + encryptionNode.put("IsEncrypted", false); + } + jsonOutput.set("Encryption", encryptionNode); + docInfoNode.put("Page Mode", getPageModeDescription(pageMode));; jsonOutput.set("Document Information", docInfoNode); - + ObjectNode pageInfoParent = objectMapper.createObjectNode(); for (int pageNum = 1; pageNum <= itextDoc.getNumberOfPages(); pageNum++) { ObjectNode pageInfo = objectMapper.createObjectNode(); @@ -79,7 +363,9 @@ public class PDFExtractor { pageInfo.put("Width", pageSize.getWidth()); pageInfo.put("Height", pageSize.getHeight()); pageInfo.put("Rotation", itextDoc.getPage(pageNum).getRotation()); - + pageInfo.put("Page Orientation", getPageOrientation(pageSize.getWidth(),pageSize.getHeight())); + pageInfo.put("Standard Size", getPageSize(pageSize.getWidth(),pageSize.getHeight())); + // Boxes pageInfo.put("MediaBox", itextDoc.getPage(pageNum).getMediaBox().toString()); pageInfo.put("CropBox", itextDoc.getPage(pageNum).getCropBox().toString()); @@ -98,14 +384,25 @@ public class PDFExtractor { // Annotations ArrayNode annotationsArray = objectMapper.createArrayNode(); List annotations = itextDoc.getPage(pageNum).getAnnotations(); - for (PdfAnnotation annotation : annotations) { - ObjectNode annotationNode = objectMapper.createObjectNode(); - annotationNode.put("Subtype", annotation.getSubtype().toString()); - annotationNode.put("Contents", annotation.getContents().getValue()); - annotationsArray.add(annotationNode); - } - pageInfo.set("Annotations", annotationsArray); + int subtypeCount = 0; + int contentsCount = 0; + + for (PdfAnnotation annotation : annotations) { + if(annotation.getSubtype() != null) { + subtypeCount++; // Increase subtype count + } + if(annotation.getContents() != null) { + contentsCount++; // Increase contents count + } + } + + ObjectNode annotationsObject = objectMapper.createObjectNode(); + annotationsObject.put("AnnotationsCount", annotations.size()); + annotationsObject.put("SubtypeCount", subtypeCount); + annotationsObject.put("ContentsCount", contentsCount); + pageInfo.set("Annotations", annotationsObject); + // Images (simplified) // This part is non-trivial as images can be embedded in multiple ways in a PDF. // Here is a basic structure to recognize image XObjects on a page. @@ -129,32 +426,62 @@ public class PDFExtractor { } pageInfo.set("Images", imagesArray); + // Links ArrayNode linksArray = objectMapper.createArrayNode(); + Set uniqueURIs = new HashSet<>(); // To store unique URIs + for (PdfAnnotation annotation : annotations) { if (annotation instanceof PdfLinkAnnotation) { PdfLinkAnnotation linkAnnotation = (PdfLinkAnnotation) annotation; - ObjectNode linkNode = objectMapper.createObjectNode(); - linkNode.put("URI", linkAnnotation.getAction().toString()); // Basic, might not work for all links - linksArray.add(linkNode); + String uri = linkAnnotation.getAction().toString(); + uniqueURIs.add(uri); // Add to set to ensure uniqueness } } + + // Add unique URIs to linksArray + for (String uri : uniqueURIs) { + ObjectNode linkNode = objectMapper.createObjectNode(); + linkNode.put("URI", uri); + linksArray.add(linkNode); + } pageInfo.set("Links", linksArray); //Fonts ArrayNode fontsArray = objectMapper.createArrayNode(); PdfDictionary fontDicts = resources.getResource(PdfName.Font); + Set uniqueSubtypes = new HashSet<>(); // To store unique subtypes + + if (fontDicts != null) { for (PdfName key : fontDicts.keySet()) { + ObjectNode fontNode = objectMapper.createObjectNode(); // Create a new font node for each font PdfDictionary font = fontDicts.getAsDictionary(key); - ObjectNode fontNode = objectMapper.createObjectNode(); - fontNode.put("Name", font.getAsString(PdfName.BaseFont).toString()); + boolean isEmbedded = font.containsKey(PdfName.FontFile) || + font.containsKey(PdfName.FontFile2) || + font.containsKey(PdfName.FontFile3); + fontNode.put("IsEmbedded", isEmbedded); + + + if (font.containsKey(PdfName.Encoding)) { + String encoding = font.getAsName(PdfName.Encoding).toString(); + fontNode.put("Encoding", encoding); + } + + + + if(font.getAsString(PdfName.BaseFont) != null) + fontNode.put("Name", font.getAsString(PdfName.BaseFont).toString()); + + String subtype = null; // Font Subtype (e.g., Type1, TrueType) if (font.containsKey(PdfName.Subtype)) { - fontNode.put("Subtype", font.getAsName(PdfName.Subtype).toString()); + subtype = font.getAsName(PdfName.Subtype).toString(); + uniqueSubtypes.add(subtype); // Add to set to ensure uniqueness } - + fontNode.put("Subtype", subtype); + // Font Descriptor PdfDictionary fontDescriptor = font.getAsDictionary(PdfName.FontDescriptor); if (fontDescriptor != null) { @@ -166,14 +493,53 @@ public class PDFExtractor { // Flags (e.g., italic, bold) if (fontDescriptor.containsKey(PdfName.Flags)) { int flags = fontDescriptor.getAsNumber(PdfName.Flags).intValue(); - fontNode.put("IsItalic", (flags & 64) != 0); - fontNode.put("IsBold", (flags & 1) != 0); + fontNode.put("IsItalic", (flags & 64) != 0); // Existing italic flag + fontNode.put("IsBold", (flags & 1 << 16) != 0); // Existing bold flag + fontNode.put("IsFixedPitch", (flags & 1) != 0); + fontNode.put("IsSerif", (flags & 2) != 0); + fontNode.put("IsSymbolic", (flags & 4) != 0); + fontNode.put("IsScript", (flags & 8) != 0); + fontNode.put("IsNonsymbolic", (flags & 16) != 0); + } + + if (fontDescriptor.containsKey(PdfName.FontFamily)) { + String fontFamily = fontDescriptor.getAsString(PdfName.FontFamily).toString(); + fontNode.put("FontFamily", fontFamily); } - } - fontsArray.add(fontNode); + if (fontDescriptor.containsKey(PdfName.FontStretch)) { + String fontStretch = fontDescriptor.getAsName(PdfName.FontStretch).toString(); + fontNode.put("FontStretch", fontStretch); + } + + if (fontDescriptor != null && fontDescriptor.containsKey(PdfName.FontBBox)) { + PdfArray bbox = fontDescriptor.getAsArray(PdfName.FontBBox); + fontNode.put("FontBoundingBox", bbox.toString()); + } + if (fontDescriptor != null && fontDescriptor.containsKey(PdfName.FontWeight)) { + float fontWeight = fontDescriptor.getAsNumber(PdfName.FontWeight).floatValue(); + fontNode.put("FontWeight", fontWeight); + } + + } + if (font.containsKey(PdfName.ToUnicode)) { + PdfStream toUnicodeStream = font.getAsStream(PdfName.ToUnicode); + // Handle the stream as needed, maybe extract some details or just note its existence + fontNode.put("HasToUnicodeMap", true); + } + if (fontNode.size() > 0) { + fontsArray.add(fontNode); // Add each font node to fontsArray + } } } + + // Add unique subtypes to fontsArray + ArrayNode subtypesArray = objectMapper.createArrayNode(); + for (String subtype : uniqueSubtypes) { + subtypesArray.add(subtype); + } + pageInfo.set("FontSubtypes", subtypesArray); // Changed from Fonts to FontSubtypes + pageInfo.set("Fonts", fontsArray); @@ -204,123 +570,186 @@ public class PDFExtractor { pageInfo.set("Color Spaces & ICC Profiles", colorSpacesArray); // Other XObjects - ArrayNode xObjectsArray = objectMapper.createArrayNode(); + Map xObjectCountMap = new HashMap<>(); // To store the count for each type PdfDictionary xObjects = resourcesDict.getAsDictionary(PdfName.XObject); if (xObjects != null) { for (PdfName name : xObjects.keySet()) { PdfStream xObjectStream = xObjects.getAsStream(name); - ObjectNode xObjectNode = objectMapper.createObjectNode(); - xObjectNode.put("Type", xObjectStream.getAsName(PdfName.Subtype).toString()); - // TODO: Extract further details depending on the XObject type - xObjectsArray.add(xObjectNode); + String xObjectType = xObjectStream.getAsName(PdfName.Subtype).toString(); + + // Increment the count for this type in the map + xObjectCountMap.put(xObjectType, xObjectCountMap.getOrDefault(xObjectType, 0) + 1); } } - pageInfo.set("XObjects", xObjectsArray); - jsonOutput.set("Page " + pageNum, pageInfo); - } - - PdfAcroForm acroForm = PdfAcroForm.getAcroForm(itextDoc, false); - if (acroForm != null) { - ObjectNode formFieldsNode = objectMapper.createObjectNode(); - for (Map.Entry entry : acroForm.getFormFields().entrySet()) { - formFieldsNode.put(entry.getKey(), entry.getValue().getValueAsString()); + // Add the count map to pageInfo (or wherever you want to store it) + ObjectNode xObjectCountNode = objectMapper.createObjectNode(); + for (Map.Entry entry : xObjectCountMap.entrySet()) { + xObjectCountNode.put(entry.getKey(), entry.getValue()); } - jsonOutput.set("FormFields", formFieldsNode); - } + pageInfo.set("XObjectCounts", xObjectCountNode); + + - - - //TODO bookmarks here - - - - - //embeed files TODO size - PdfDictionary embeddedFiles = itextDoc.getCatalog().getPdfObject().getAsDictionary(PdfName.Names) - .getAsDictionary(PdfName.EmbeddedFiles); - if (embeddedFiles != null) { - ArrayNode embeddedFilesArray = objectMapper.createArrayNode(); - PdfArray namesArray = embeddedFiles.getAsArray(PdfName.Names); - for (int i = 0; i < namesArray.size(); i += 2) { - ObjectNode embeddedFileNode = objectMapper.createObjectNode(); - embeddedFileNode.put("Name", namesArray.getAsString(i).toString()); - // Add other details if required - embeddedFilesArray.add(embeddedFileNode); - } - jsonOutput.set("EmbeddedFiles", embeddedFilesArray); - } - - - //attachments TODO size - ArrayNode attachmentsArray = objectMapper.createArrayNode(); - for (int pageNum = 1; pageNum <= itextDoc.getNumberOfPages(); pageNum++) { - for (PdfAnnotation annotation : itextDoc.getPage(pageNum).getAnnotations()) { - if (annotation instanceof PdfFileAttachmentAnnotation) { - ObjectNode attachmentNode = objectMapper.createObjectNode(); - attachmentNode.put("Name", ((PdfFileAttachmentAnnotation) annotation).getName().toString()); - attachmentNode.put("Description", annotation.getContents().getValue()); - attachmentsArray.add(attachmentNode); + ArrayNode multimediaArray = objectMapper.createArrayNode(); + for (PdfAnnotation annotation : annotations) { + if (PdfName.RichMedia.equals(annotation.getSubtype())) { + ObjectNode multimediaNode = objectMapper.createObjectNode(); + // Extract details from the dictionary as needed + multimediaArray.add(multimediaNode); } } + pageInfo.set("Multimedia", multimediaArray); + + + + pageInfoParent.set("Page " + pageNum, pageInfo); } - jsonOutput.set("Attachments", attachmentsArray); - - //Javascript - PdfDictionary namesDict = itextDoc.getCatalog().getPdfObject().getAsDictionary(PdfName.Names); - if (namesDict != null) { - PdfDictionary javascriptDict = namesDict.getAsDictionary(PdfName.JavaScript); - if (javascriptDict != null) { - ArrayNode javascriptArray = objectMapper.createArrayNode(); - PdfArray namesArray = javascriptDict.getAsArray(PdfName.Names); - for (int i = 0; i < namesArray.size(); i += 2) { - ObjectNode jsNode = objectMapper.createObjectNode(); - jsNode.put("JS Name", namesArray.getAsString(i).toString()); - jsNode.put("JS Code", namesArray.getAsString(i + 1).toString()); - javascriptArray.add(jsNode); - } - jsonOutput.set("JavaScripts", javascriptArray); - } - } - - //TODO size - PdfOCProperties ocProperties = itextDoc.getCatalog().getOCProperties(false); - if (ocProperties != null) { - ArrayNode layersArray = objectMapper.createArrayNode(); - for (PdfLayer layer : ocProperties.getLayers()) { - ObjectNode layerNode = objectMapper.createObjectNode(); - layerNode.put("Name", layer.getPdfObject().getAsString(PdfName.Name).toString()); - layersArray.add(layerNode); - } - jsonOutput.set("Layers", layersArray); - } - + jsonOutput.set("Per Page Info", pageInfoParent); - //TODO Security - - - - - - - // Digital Signatures using iText7 TODO // Save JSON to file - try (FileWriter file = new FileWriter("output.json")) { - file.write(objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(jsonOutput)); - file.flush(); - } - - pdfBoxDoc.close(); - itextDoc.close(); + String jsonString = objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(jsonOutput); + + + + return WebResponseUtils.bytesToWebResponse(jsonString.getBytes(StandardCharsets.UTF_8), "response.json", MediaType.APPLICATION_JSON); + } catch (Exception e) { e.printStackTrace(); } + return null; } - private static String formatDate(Calendar calendar) { + private static void addOutlinesToArray(PdfOutline outline, ArrayNode arrayNode) { + if (outline == null) return; + ObjectNode outlineNode = objectMapper.createObjectNode(); + outlineNode.put("Title", outline.getTitle()); + // You can add other properties if needed + arrayNode.add(outlineNode); + + for (PdfOutline child : outline.getAllChildren()) { + addOutlinesToArray(child, arrayNode); + } + } + public String getPageOrientation(double width, double height) { + if (width > height) { + return "Landscape"; + } else if (height > width) { + return "Portrait"; + } else { + return "Square"; + } + } + public String getPageSize(double width, double height) { + // Common aspect ratios used for standard paper sizes + double[] aspectRatios = {4.0 / 3.0, 3.0 / 2.0, Math.sqrt(2.0), 16.0 / 9.0}; + + // Check if the page matches any common aspect ratio + for (double aspectRatio : aspectRatios) { + if (isCloseToAspectRatio(width, height, aspectRatio)) { + return "Standard"; + } + } + + // If not a standard aspect ratio, consider it as a custom size + return "Custom"; + } + private boolean isCloseToAspectRatio(double width, double height, double aspectRatio) { + // Calculate the aspect ratio of the page + double pageAspectRatio = width / height; + + // Compare the page aspect ratio with the common aspect ratio within a threshold + return Math.abs(pageAspectRatio - aspectRatio) <= 0.05; + } + + public boolean checkForStandard(PdfDocument document, String standardKeyword) { + // Check Output Intents + boolean foundInOutputIntents = checkOutputIntent(document, standardKeyword); + if (foundInOutputIntents) return true; + + // Check XMP Metadata (rudimentary) + try { + byte[] metadataBytes = document.getXmpMetadata(); + if (metadataBytes != null) { + XMPMeta xmpMeta = XMPMetaFactory.parseFromBuffer(metadataBytes); + String xmpString = xmpMeta.dumpObject(); + if (xmpString.contains(standardKeyword)) { + return true; + } + } + } catch (XMPException e) { + e.printStackTrace(); + } + + return false; + } + + + public boolean checkOutputIntent(PdfDocument document, String standard) { + PdfArray outputIntents = document.getCatalog().getPdfObject().getAsArray(PdfName.OutputIntents); + if (outputIntents != null && !outputIntents.isEmpty()) { + for (int i = 0; i < outputIntents.size(); i++) { + PdfDictionary outputIntentDict = outputIntents.getAsDictionary(i); + if (outputIntentDict != null) { + PdfString s = outputIntentDict.getAsString(PdfName.S); + if (s != null && s.toString().contains(standard)) { + return true; + } + } + } + } + return false; + } + + public ArrayNode exploreStructureTree(List nodes) { + ArrayNode elementsArray = objectMapper.createArrayNode(); + if (nodes != null) { + for (Object obj : nodes) { + if (obj instanceof PDStructureNode) { + PDStructureNode node = (PDStructureNode) obj; + ObjectNode elementNode = objectMapper.createObjectNode(); + + if (node instanceof PDStructureElement) { + PDStructureElement structureElement = (PDStructureElement) node; + elementNode.put("Type", structureElement.getStructureType()); + elementNode.put("Content", getContent(structureElement)); + + // Recursively explore child elements + ArrayNode childElements = exploreStructureTree(structureElement.getKids()); + if (childElements.size() > 0) { + elementNode.set("Children", childElements); + } + } + elementsArray.add(elementNode); + } + } + } + return elementsArray; + } + + + public String getContent(PDStructureElement structureElement) { + StringBuilder contentBuilder = new StringBuilder(); + + for (Object item : structureElement.getKids()) { + if (item instanceof COSString) { + COSString cosString = (COSString) item; + contentBuilder.append(cosString.getString()); + } else if (item instanceof PDStructureElement) { + // For simplicity, we're handling only COSString and PDStructureElement here + // but a more comprehensive method would handle other types too + contentBuilder.append(getContent((PDStructureElement) item)); + } + } + + return contentBuilder.toString(); + } + + + private String formatDate(Calendar calendar) { if (calendar != null) { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); return sdf.format(calendar.getTime()); @@ -329,7 +758,7 @@ public class PDFExtractor { } } - private static String getPageModeDescription(PdfName pageMode) { + private String getPageModeDescription(PdfName pageMode) { return pageMode != null ? pageMode.toString().replaceFirst("/", "") : "Unknown"; } } diff --git a/src/main/java/stirling/software/SPDF/controller/web/SecurityWebController.java b/src/main/java/stirling/software/SPDF/controller/web/SecurityWebController.java index 66de400e..3857cc9e 100644 --- a/src/main/java/stirling/software/SPDF/controller/web/SecurityWebController.java +++ b/src/main/java/stirling/software/SPDF/controller/web/SecurityWebController.java @@ -52,4 +52,11 @@ public class SecurityWebController { model.addAttribute("currentPage", "sanitize-pdf"); return "security/sanitize-pdf"; } + + @GetMapping("/get-info-on-pdf") + @Hidden + public String getInfo(Model model) { + model.addAttribute("currentPage", "get-info-on-pdf"); + return "security/get-info-on-pdf"; + } } diff --git a/src/main/resources/templates/security/get-info-on-pdf.html b/src/main/resources/templates/security/get-info-on-pdf.html new file mode 100644 index 00000000..36a0bb6e --- /dev/null +++ b/src/main/resources/templates/security/get-info-on-pdf.html @@ -0,0 +1,33 @@ + + + + + + + + +
+
+
+

+
+
+
+

+

+

+
+
+ + +
+ +
+
+
+ +
+
+
+ + \ No newline at end of file