mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-01-19 00:07:17 +01:00
all info
This commit is contained in:
parent
52a7885f3c
commit
0da9c62ef8
@ -1,8 +1,23 @@
|
||||
package stirling.software.SPDF.controller.api.security;
|
||||
|
||||
import org.apache.pdfbox.cos.COSArray;
|
||||
import org.apache.pdfbox.cos.COSBase;
|
||||
import org.apache.pdfbox.cos.COSDictionary;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.cos.COSString;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureElement;
|
||||
import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureNode;
|
||||
import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot;
|
||||
import org.apache.pdfbox.pdmodel.encryption.PDEncryption;
|
||||
import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
|
||||
import org.apache.pdfbox.pdmodel.interactive.form.PDField;
|
||||
|
||||
import com.itextpdf.kernel.pdf.PdfObject;
|
||||
import com.itextpdf.kernel.pdf.PdfOutline;
|
||||
import com.itextpdf.forms.PdfAcroForm;
|
||||
import com.itextpdf.forms.fields.PdfFormField;
|
||||
import com.itextpdf.kernel.geom.Rectangle;
|
||||
@ -15,29 +30,64 @@ import com.itextpdf.kernel.pdf.PdfEncryption;
|
||||
import com.itextpdf.kernel.pdf.PdfReader;
|
||||
import com.itextpdf.kernel.pdf.PdfResources;
|
||||
import com.itextpdf.kernel.pdf.PdfStream;
|
||||
import com.itextpdf.kernel.pdf.PdfString;
|
||||
import com.itextpdf.kernel.pdf.PdfName;
|
||||
import com.itextpdf.kernel.pdf.PdfViewerPreferences;
|
||||
import com.itextpdf.kernel.pdf.PdfWriter;
|
||||
import com.itextpdf.kernel.pdf.annot.PdfAnnotation;
|
||||
import com.itextpdf.kernel.pdf.annot.PdfFileAttachmentAnnotation;
|
||||
import com.itextpdf.kernel.pdf.annot.PdfLinkAnnotation;
|
||||
import com.itextpdf.kernel.pdf.annot.PdfWidgetAnnotation;
|
||||
import com.itextpdf.kernel.pdf.layer.PdfLayer;
|
||||
import com.itextpdf.kernel.pdf.layer.PdfOCProperties;
|
||||
import com.itextpdf.kernel.xmp.XMPException;
|
||||
import com.itextpdf.kernel.xmp.XMPMeta;
|
||||
import com.itextpdf.kernel.xmp.XMPMetaFactory;
|
||||
|
||||
import io.swagger.v3.oas.annotations.Operation;
|
||||
import io.swagger.v3.oas.annotations.Parameter;
|
||||
import io.swagger.v3.oas.annotations.tags.Tag;
|
||||
import stirling.software.SPDF.utils.WebResponseUtils;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.fasterxml.jackson.databind.node.ArrayNode;
|
||||
import com.fasterxml.jackson.databind.node.ObjectNode;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.springframework.http.MediaType;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.web.bind.annotation.PostMapping;
|
||||
import org.springframework.web.bind.annotation.RequestPart;
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.HashMap;
|
||||
import java.io.FileWriter;
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Calendar;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import java.util.Set;
|
||||
import java.util.HashSet;
|
||||
@RestController
|
||||
@Tag(name = "Security", description = "Security APIs")
|
||||
public class PDFExtractor {
|
||||
public static void main(String[] args) {
|
||||
try {
|
||||
PDDocument pdfBoxDoc = PDDocument.load(new File("path_to_pdf.pdf"));
|
||||
|
||||
static ObjectMapper objectMapper = new ObjectMapper();
|
||||
|
||||
@PostMapping(consumes = "multipart/form-data", value = "/get-info-on-pdf")
|
||||
@Operation(summary = "Summary here", description = "desc. Input:PDF Output:JSON Type:SISO")
|
||||
public ResponseEntity<byte[]> getPdfInfo(
|
||||
@RequestPart(required = true, value = "fileInput")
|
||||
@Parameter(description = "The input PDF file to get info on", required = true) MultipartFile inputFile)
|
||||
throws IOException {
|
||||
|
||||
try (
|
||||
PDDocument pdfBoxDoc = PDDocument.load(inputFile.getInputStream());
|
||||
PdfDocument itextDoc = new PdfDocument(new PdfReader(inputFile.getInputStream()))
|
||||
) {
|
||||
ObjectMapper objectMapper = new ObjectMapper();
|
||||
ObjectNode jsonOutput = objectMapper.createObjectNode();
|
||||
|
||||
@ -55,22 +105,256 @@ public class PDFExtractor {
|
||||
metadata.put("Trapped", info.getTrapped());
|
||||
jsonOutput.set("Metadata", metadata);
|
||||
|
||||
|
||||
|
||||
// Total file size of the PDF
|
||||
long fileSizeInBytes = inputFile.getSize();
|
||||
jsonOutput.put("FileSizeInBytes", fileSizeInBytes);
|
||||
|
||||
// Number of words, paragraphs, and images in the entire document
|
||||
String fullText = new PDFTextStripper().getText(pdfBoxDoc);
|
||||
String[] words = fullText.split("\\s+");
|
||||
int wordCount = words.length;
|
||||
int paragraphCount = fullText.split("\r\n|\r|\n").length;
|
||||
jsonOutput.put("WordCount", wordCount);
|
||||
jsonOutput.put("ParagraphCount", paragraphCount);
|
||||
// Number of characters in the entire document (including spaces and special characters)
|
||||
int charCount = fullText.length();
|
||||
jsonOutput.put("CharacterCount", charCount);
|
||||
|
||||
|
||||
// Initialize the flags and types
|
||||
boolean hasCompression = false;
|
||||
String compressionType = "None";
|
||||
|
||||
// Check for object streams
|
||||
for (int i = 1; i <= itextDoc.getNumberOfPdfObjects(); i++) {
|
||||
PdfObject obj = itextDoc.getPdfObject(i);
|
||||
if (obj != null && obj.isStream() && ((PdfStream) obj).get(PdfName.Type) == PdfName.ObjStm) {
|
||||
hasCompression = true;
|
||||
compressionType = "Object Streams";
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// If not compressed using object streams, check for compressed Xref tables
|
||||
if (!hasCompression && itextDoc.getReader().hasRebuiltXref()) {
|
||||
hasCompression = true;
|
||||
compressionType = "Compressed Xref or Rebuilt Xref";
|
||||
}
|
||||
jsonOutput.put("Compression", hasCompression);
|
||||
if(hasCompression)
|
||||
jsonOutput.put("CompressionType", compressionType);
|
||||
|
||||
String language = pdfBoxDoc.getDocumentCatalog().getLanguage();
|
||||
jsonOutput.put("Language", language);
|
||||
|
||||
// Document Information using PDFBox
|
||||
ObjectNode docInfoNode = objectMapper.createObjectNode();
|
||||
docInfoNode.put("Number of pages", pdfBoxDoc.getNumberOfPages());
|
||||
docInfoNode.put("PDF version", pdfBoxDoc.getVersion());
|
||||
;
|
||||
|
||||
|
||||
// Page Mode using iText7
|
||||
PdfDocument itextDoc = new PdfDocument(new PdfReader("path_to_pdf.pdf"));
|
||||
PdfCatalog catalog = itextDoc.getCatalog();
|
||||
PdfName pageMode = catalog.getPdfObject().getAsName(PdfName.PageMode);
|
||||
|
||||
ObjectNode itextDocInfo = objectMapper.createObjectNode();
|
||||
|
||||
|
||||
|
||||
PdfAcroForm acroForm = PdfAcroForm.getAcroForm(itextDoc, false);
|
||||
ObjectNode formFieldsNode = objectMapper.createObjectNode();
|
||||
if (acroForm != null) {
|
||||
for (Map.Entry<String, PdfFormField> entry : acroForm.getFormFields().entrySet()) {
|
||||
formFieldsNode.put(entry.getKey(), entry.getValue().getValueAsString());
|
||||
}
|
||||
}
|
||||
jsonOutput.set("FormFields", formFieldsNode);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
//embeed files TODO size
|
||||
ArrayNode embeddedFilesArray = objectMapper.createArrayNode();
|
||||
if(itextDoc.getCatalog().getPdfObject().getAsDictionary(PdfName.Names) != null)
|
||||
{
|
||||
PdfDictionary embeddedFiles = itextDoc.getCatalog().getPdfObject().getAsDictionary(PdfName.Names)
|
||||
.getAsDictionary(PdfName.EmbeddedFiles);
|
||||
if (embeddedFiles != null) {
|
||||
|
||||
PdfArray namesArray = embeddedFiles.getAsArray(PdfName.Names);
|
||||
for (int i = 0; i < namesArray.size(); i += 2) {
|
||||
ObjectNode embeddedFileNode = objectMapper.createObjectNode();
|
||||
embeddedFileNode.put("Name", namesArray.getAsString(i).toString());
|
||||
// Add other details if required
|
||||
embeddedFilesArray.add(embeddedFileNode);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
jsonOutput.set("EmbeddedFiles", embeddedFilesArray);
|
||||
|
||||
//attachments TODO size
|
||||
ArrayNode attachmentsArray = objectMapper.createArrayNode();
|
||||
for (int pageNum = 1; pageNum <= itextDoc.getNumberOfPages(); pageNum++) {
|
||||
for (PdfAnnotation annotation : itextDoc.getPage(pageNum).getAnnotations()) {
|
||||
if (annotation instanceof PdfFileAttachmentAnnotation) {
|
||||
ObjectNode attachmentNode = objectMapper.createObjectNode();
|
||||
attachmentNode.put("Name", ((PdfFileAttachmentAnnotation) annotation).getName().toString());
|
||||
attachmentNode.put("Description", annotation.getContents().getValue());
|
||||
attachmentsArray.add(attachmentNode);
|
||||
}
|
||||
}
|
||||
}
|
||||
jsonOutput.set("Attachments", attachmentsArray);
|
||||
|
||||
//Javascript
|
||||
PdfDictionary namesDict = itextDoc.getCatalog().getPdfObject().getAsDictionary(PdfName.Names);
|
||||
ArrayNode javascriptArray = objectMapper.createArrayNode();
|
||||
if (namesDict != null) {
|
||||
PdfDictionary javascriptDict = namesDict.getAsDictionary(PdfName.JavaScript);
|
||||
if (javascriptDict != null) {
|
||||
|
||||
PdfArray namesArray = javascriptDict.getAsArray(PdfName.Names);
|
||||
for (int i = 0; i < namesArray.size(); i += 2) {
|
||||
ObjectNode jsNode = objectMapper.createObjectNode();
|
||||
jsNode.put("JS Name", namesArray.getAsString(i).toString());
|
||||
jsNode.put("JS Code", namesArray.getAsString(i + 1).toString());
|
||||
javascriptArray.add(jsNode);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
jsonOutput.set("JavaScript", javascriptArray);
|
||||
|
||||
//TODO size
|
||||
PdfOCProperties ocProperties = itextDoc.getCatalog().getOCProperties(false);
|
||||
ArrayNode layersArray = objectMapper.createArrayNode();
|
||||
if (ocProperties != null) {
|
||||
|
||||
for (PdfLayer layer : ocProperties.getLayers()) {
|
||||
ObjectNode layerNode = objectMapper.createObjectNode();
|
||||
layerNode.put("Name", layer.getPdfObject().getAsString(PdfName.Name).toString());
|
||||
layersArray.add(layerNode);
|
||||
}
|
||||
|
||||
}
|
||||
jsonOutput.set("Layers", layersArray);
|
||||
|
||||
//TODO Security
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// Digital Signatures using iText7 TODO
|
||||
|
||||
|
||||
|
||||
|
||||
PDAcroForm pdAcroForm = pdfBoxDoc.getDocumentCatalog().getAcroForm();
|
||||
ArrayNode formFieldsArray2 = objectMapper.createArrayNode();
|
||||
if (pdAcroForm != null) {
|
||||
|
||||
for (PDField field : pdAcroForm.getFields()) {
|
||||
ObjectNode fieldNode = objectMapper.createObjectNode();
|
||||
fieldNode.put("FieldName", field.getFullyQualifiedName());
|
||||
fieldNode.put("FieldType", field.getFieldType());
|
||||
// Add more attributes as needed...
|
||||
formFieldsArray2.add(fieldNode);
|
||||
}
|
||||
|
||||
}
|
||||
jsonOutput.set("FormFields", formFieldsArray2);
|
||||
|
||||
|
||||
PDStructureTreeRoot structureTreeRoot = pdfBoxDoc.getDocumentCatalog().getStructureTreeRoot();
|
||||
ArrayNode structureTreeArray;
|
||||
try {
|
||||
if(structureTreeRoot != null) {
|
||||
structureTreeArray = exploreStructureTree(structureTreeRoot.getKids());
|
||||
jsonOutput.set("StructureTree", structureTreeArray);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
// TODO Auto-generated catch block
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
boolean isPdfACompliant = checkOutputIntent(itextDoc, "PDF/A");
|
||||
boolean isPdfXCompliant = checkOutputIntent(itextDoc, "PDF/X");
|
||||
boolean isPdfECompliant = checkForStandard(itextDoc, "PDF/E");
|
||||
boolean isPdfVTCompliant = checkForStandard(itextDoc, "PDF/VT");
|
||||
boolean isPdfUACompliant = checkForStandard(itextDoc, "PDF/UA");
|
||||
boolean isPdfBCompliant = checkForStandard(itextDoc, "PDF/B"); // If you want to check for PDF/Broadcast, though this isn't an official ISO standard.
|
||||
boolean isPdfSECCompliant = checkForStandard(itextDoc, "PDF/SEC"); // This might not be effective since PDF/SEC was under development in 2021.
|
||||
|
||||
ObjectNode compliancy = objectMapper.createObjectNode();
|
||||
compliancy.put("IsPDF/ACompliant", isPdfACompliant);
|
||||
compliancy.put("IsPDF/XCompliant", isPdfXCompliant);
|
||||
compliancy.put("IsPDF/ECompliant", isPdfECompliant);
|
||||
compliancy.put("IsPDF/VTCompliant", isPdfVTCompliant);
|
||||
compliancy.put("IsPDF/UACompliant", isPdfUACompliant);
|
||||
compliancy.put("IsPDF/BCompliant", isPdfBCompliant);
|
||||
compliancy.put("IsPDF/SECCompliant", isPdfSECCompliant);
|
||||
|
||||
jsonOutput.set("Compliancy", compliancy);
|
||||
|
||||
|
||||
|
||||
|
||||
ArrayNode bookmarksArray = objectMapper.createArrayNode();
|
||||
PdfOutline root = itextDoc.getOutlines(false);
|
||||
if (root != null) {
|
||||
for (PdfOutline child : root.getAllChildren()) {
|
||||
addOutlinesToArray(child, bookmarksArray);
|
||||
}
|
||||
}
|
||||
jsonOutput.set("Bookmarks/Outline/TOC", bookmarksArray);
|
||||
|
||||
String xmpString = null;
|
||||
try {
|
||||
byte[] xmpBytes = itextDoc.getXmpMetadata();
|
||||
if (xmpBytes != null) {
|
||||
XMPMeta xmpMeta = XMPMetaFactory.parseFromBuffer(xmpBytes);
|
||||
xmpString = xmpMeta.dumpObject();
|
||||
|
||||
}
|
||||
} catch (XMPException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
jsonOutput.put("XMPMetadata", xmpString);
|
||||
|
||||
|
||||
|
||||
ObjectNode encryptionNode = objectMapper.createObjectNode();
|
||||
if (pdfBoxDoc.isEncrypted()) {
|
||||
encryptionNode.put("IsEncrypted", true);
|
||||
|
||||
// Retrieve encryption details using getEncryption()
|
||||
PDEncryption encryption = pdfBoxDoc.getEncryption();
|
||||
encryptionNode.put("EncryptionAlgorithm", encryption.getFilter());
|
||||
encryptionNode.put("KeyLength", encryption.getLength());
|
||||
encryptionNode.put("Permissions", pdfBoxDoc.getCurrentAccessPermission().toString());
|
||||
|
||||
// Add other encryption-related properties as needed
|
||||
} else {
|
||||
encryptionNode.put("IsEncrypted", false);
|
||||
}
|
||||
jsonOutput.set("Encryption", encryptionNode);
|
||||
|
||||
docInfoNode.put("Page Mode", getPageModeDescription(pageMode));;
|
||||
|
||||
jsonOutput.set("Document Information", docInfoNode);
|
||||
|
||||
ObjectNode pageInfoParent = objectMapper.createObjectNode();
|
||||
for (int pageNum = 1; pageNum <= itextDoc.getNumberOfPages(); pageNum++) {
|
||||
ObjectNode pageInfo = objectMapper.createObjectNode();
|
||||
|
||||
@ -79,7 +363,9 @@ public class PDFExtractor {
|
||||
pageInfo.put("Width", pageSize.getWidth());
|
||||
pageInfo.put("Height", pageSize.getHeight());
|
||||
pageInfo.put("Rotation", itextDoc.getPage(pageNum).getRotation());
|
||||
|
||||
pageInfo.put("Page Orientation", getPageOrientation(pageSize.getWidth(),pageSize.getHeight()));
|
||||
pageInfo.put("Standard Size", getPageSize(pageSize.getWidth(),pageSize.getHeight()));
|
||||
|
||||
// Boxes
|
||||
pageInfo.put("MediaBox", itextDoc.getPage(pageNum).getMediaBox().toString());
|
||||
pageInfo.put("CropBox", itextDoc.getPage(pageNum).getCropBox().toString());
|
||||
@ -98,14 +384,25 @@ public class PDFExtractor {
|
||||
// Annotations
|
||||
ArrayNode annotationsArray = objectMapper.createArrayNode();
|
||||
List<PdfAnnotation> annotations = itextDoc.getPage(pageNum).getAnnotations();
|
||||
for (PdfAnnotation annotation : annotations) {
|
||||
ObjectNode annotationNode = objectMapper.createObjectNode();
|
||||
annotationNode.put("Subtype", annotation.getSubtype().toString());
|
||||
annotationNode.put("Contents", annotation.getContents().getValue());
|
||||
annotationsArray.add(annotationNode);
|
||||
}
|
||||
pageInfo.set("Annotations", annotationsArray);
|
||||
|
||||
int subtypeCount = 0;
|
||||
int contentsCount = 0;
|
||||
|
||||
for (PdfAnnotation annotation : annotations) {
|
||||
if(annotation.getSubtype() != null) {
|
||||
subtypeCount++; // Increase subtype count
|
||||
}
|
||||
if(annotation.getContents() != null) {
|
||||
contentsCount++; // Increase contents count
|
||||
}
|
||||
}
|
||||
|
||||
ObjectNode annotationsObject = objectMapper.createObjectNode();
|
||||
annotationsObject.put("AnnotationsCount", annotations.size());
|
||||
annotationsObject.put("SubtypeCount", subtypeCount);
|
||||
annotationsObject.put("ContentsCount", contentsCount);
|
||||
pageInfo.set("Annotations", annotationsObject);
|
||||
|
||||
// Images (simplified)
|
||||
// This part is non-trivial as images can be embedded in multiple ways in a PDF.
|
||||
// Here is a basic structure to recognize image XObjects on a page.
|
||||
@ -129,32 +426,62 @@ public class PDFExtractor {
|
||||
}
|
||||
pageInfo.set("Images", imagesArray);
|
||||
|
||||
|
||||
// Links
|
||||
ArrayNode linksArray = objectMapper.createArrayNode();
|
||||
Set<String> uniqueURIs = new HashSet<>(); // To store unique URIs
|
||||
|
||||
for (PdfAnnotation annotation : annotations) {
|
||||
if (annotation instanceof PdfLinkAnnotation) {
|
||||
PdfLinkAnnotation linkAnnotation = (PdfLinkAnnotation) annotation;
|
||||
ObjectNode linkNode = objectMapper.createObjectNode();
|
||||
linkNode.put("URI", linkAnnotation.getAction().toString()); // Basic, might not work for all links
|
||||
linksArray.add(linkNode);
|
||||
String uri = linkAnnotation.getAction().toString();
|
||||
uniqueURIs.add(uri); // Add to set to ensure uniqueness
|
||||
}
|
||||
}
|
||||
|
||||
// Add unique URIs to linksArray
|
||||
for (String uri : uniqueURIs) {
|
||||
ObjectNode linkNode = objectMapper.createObjectNode();
|
||||
linkNode.put("URI", uri);
|
||||
linksArray.add(linkNode);
|
||||
}
|
||||
pageInfo.set("Links", linksArray);
|
||||
|
||||
//Fonts
|
||||
ArrayNode fontsArray = objectMapper.createArrayNode();
|
||||
PdfDictionary fontDicts = resources.getResource(PdfName.Font);
|
||||
Set<String> uniqueSubtypes = new HashSet<>(); // To store unique subtypes
|
||||
|
||||
|
||||
if (fontDicts != null) {
|
||||
for (PdfName key : fontDicts.keySet()) {
|
||||
ObjectNode fontNode = objectMapper.createObjectNode(); // Create a new font node for each font
|
||||
PdfDictionary font = fontDicts.getAsDictionary(key);
|
||||
ObjectNode fontNode = objectMapper.createObjectNode();
|
||||
fontNode.put("Name", font.getAsString(PdfName.BaseFont).toString());
|
||||
|
||||
boolean isEmbedded = font.containsKey(PdfName.FontFile) ||
|
||||
font.containsKey(PdfName.FontFile2) ||
|
||||
font.containsKey(PdfName.FontFile3);
|
||||
fontNode.put("IsEmbedded", isEmbedded);
|
||||
|
||||
|
||||
if (font.containsKey(PdfName.Encoding)) {
|
||||
String encoding = font.getAsName(PdfName.Encoding).toString();
|
||||
fontNode.put("Encoding", encoding);
|
||||
}
|
||||
|
||||
|
||||
|
||||
if(font.getAsString(PdfName.BaseFont) != null)
|
||||
fontNode.put("Name", font.getAsString(PdfName.BaseFont).toString());
|
||||
|
||||
String subtype = null;
|
||||
// Font Subtype (e.g., Type1, TrueType)
|
||||
if (font.containsKey(PdfName.Subtype)) {
|
||||
fontNode.put("Subtype", font.getAsName(PdfName.Subtype).toString());
|
||||
subtype = font.getAsName(PdfName.Subtype).toString();
|
||||
uniqueSubtypes.add(subtype); // Add to set to ensure uniqueness
|
||||
}
|
||||
|
||||
fontNode.put("Subtype", subtype);
|
||||
|
||||
// Font Descriptor
|
||||
PdfDictionary fontDescriptor = font.getAsDictionary(PdfName.FontDescriptor);
|
||||
if (fontDescriptor != null) {
|
||||
@ -166,14 +493,53 @@ public class PDFExtractor {
|
||||
// Flags (e.g., italic, bold)
|
||||
if (fontDescriptor.containsKey(PdfName.Flags)) {
|
||||
int flags = fontDescriptor.getAsNumber(PdfName.Flags).intValue();
|
||||
fontNode.put("IsItalic", (flags & 64) != 0);
|
||||
fontNode.put("IsBold", (flags & 1) != 0);
|
||||
fontNode.put("IsItalic", (flags & 64) != 0); // Existing italic flag
|
||||
fontNode.put("IsBold", (flags & 1 << 16) != 0); // Existing bold flag
|
||||
fontNode.put("IsFixedPitch", (flags & 1) != 0);
|
||||
fontNode.put("IsSerif", (flags & 2) != 0);
|
||||
fontNode.put("IsSymbolic", (flags & 4) != 0);
|
||||
fontNode.put("IsScript", (flags & 8) != 0);
|
||||
fontNode.put("IsNonsymbolic", (flags & 16) != 0);
|
||||
}
|
||||
|
||||
if (fontDescriptor.containsKey(PdfName.FontFamily)) {
|
||||
String fontFamily = fontDescriptor.getAsString(PdfName.FontFamily).toString();
|
||||
fontNode.put("FontFamily", fontFamily);
|
||||
}
|
||||
}
|
||||
|
||||
fontsArray.add(fontNode);
|
||||
if (fontDescriptor.containsKey(PdfName.FontStretch)) {
|
||||
String fontStretch = fontDescriptor.getAsName(PdfName.FontStretch).toString();
|
||||
fontNode.put("FontStretch", fontStretch);
|
||||
}
|
||||
|
||||
if (fontDescriptor != null && fontDescriptor.containsKey(PdfName.FontBBox)) {
|
||||
PdfArray bbox = fontDescriptor.getAsArray(PdfName.FontBBox);
|
||||
fontNode.put("FontBoundingBox", bbox.toString());
|
||||
}
|
||||
if (fontDescriptor != null && fontDescriptor.containsKey(PdfName.FontWeight)) {
|
||||
float fontWeight = fontDescriptor.getAsNumber(PdfName.FontWeight).floatValue();
|
||||
fontNode.put("FontWeight", fontWeight);
|
||||
}
|
||||
|
||||
}
|
||||
if (font.containsKey(PdfName.ToUnicode)) {
|
||||
PdfStream toUnicodeStream = font.getAsStream(PdfName.ToUnicode);
|
||||
// Handle the stream as needed, maybe extract some details or just note its existence
|
||||
fontNode.put("HasToUnicodeMap", true);
|
||||
}
|
||||
if (fontNode.size() > 0) {
|
||||
fontsArray.add(fontNode); // Add each font node to fontsArray
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Add unique subtypes to fontsArray
|
||||
ArrayNode subtypesArray = objectMapper.createArrayNode();
|
||||
for (String subtype : uniqueSubtypes) {
|
||||
subtypesArray.add(subtype);
|
||||
}
|
||||
pageInfo.set("FontSubtypes", subtypesArray); // Changed from Fonts to FontSubtypes
|
||||
|
||||
pageInfo.set("Fonts", fontsArray);
|
||||
|
||||
|
||||
@ -204,123 +570,186 @@ public class PDFExtractor {
|
||||
pageInfo.set("Color Spaces & ICC Profiles", colorSpacesArray);
|
||||
|
||||
// Other XObjects
|
||||
ArrayNode xObjectsArray = objectMapper.createArrayNode();
|
||||
Map<String, Integer> xObjectCountMap = new HashMap<>(); // To store the count for each type
|
||||
PdfDictionary xObjects = resourcesDict.getAsDictionary(PdfName.XObject);
|
||||
if (xObjects != null) {
|
||||
for (PdfName name : xObjects.keySet()) {
|
||||
PdfStream xObjectStream = xObjects.getAsStream(name);
|
||||
ObjectNode xObjectNode = objectMapper.createObjectNode();
|
||||
xObjectNode.put("Type", xObjectStream.getAsName(PdfName.Subtype).toString());
|
||||
// TODO: Extract further details depending on the XObject type
|
||||
xObjectsArray.add(xObjectNode);
|
||||
String xObjectType = xObjectStream.getAsName(PdfName.Subtype).toString();
|
||||
|
||||
// Increment the count for this type in the map
|
||||
xObjectCountMap.put(xObjectType, xObjectCountMap.getOrDefault(xObjectType, 0) + 1);
|
||||
}
|
||||
}
|
||||
pageInfo.set("XObjects", xObjectsArray);
|
||||
|
||||
jsonOutput.set("Page " + pageNum, pageInfo);
|
||||
}
|
||||
|
||||
PdfAcroForm acroForm = PdfAcroForm.getAcroForm(itextDoc, false);
|
||||
if (acroForm != null) {
|
||||
ObjectNode formFieldsNode = objectMapper.createObjectNode();
|
||||
for (Map.Entry<String, PdfFormField> entry : acroForm.getFormFields().entrySet()) {
|
||||
formFieldsNode.put(entry.getKey(), entry.getValue().getValueAsString());
|
||||
// Add the count map to pageInfo (or wherever you want to store it)
|
||||
ObjectNode xObjectCountNode = objectMapper.createObjectNode();
|
||||
for (Map.Entry<String, Integer> entry : xObjectCountMap.entrySet()) {
|
||||
xObjectCountNode.put(entry.getKey(), entry.getValue());
|
||||
}
|
||||
jsonOutput.set("FormFields", formFieldsNode);
|
||||
}
|
||||
pageInfo.set("XObjectCounts", xObjectCountNode);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
//TODO bookmarks here
|
||||
|
||||
|
||||
|
||||
|
||||
//embeed files TODO size
|
||||
PdfDictionary embeddedFiles = itextDoc.getCatalog().getPdfObject().getAsDictionary(PdfName.Names)
|
||||
.getAsDictionary(PdfName.EmbeddedFiles);
|
||||
if (embeddedFiles != null) {
|
||||
ArrayNode embeddedFilesArray = objectMapper.createArrayNode();
|
||||
PdfArray namesArray = embeddedFiles.getAsArray(PdfName.Names);
|
||||
for (int i = 0; i < namesArray.size(); i += 2) {
|
||||
ObjectNode embeddedFileNode = objectMapper.createObjectNode();
|
||||
embeddedFileNode.put("Name", namesArray.getAsString(i).toString());
|
||||
// Add other details if required
|
||||
embeddedFilesArray.add(embeddedFileNode);
|
||||
}
|
||||
jsonOutput.set("EmbeddedFiles", embeddedFilesArray);
|
||||
}
|
||||
|
||||
|
||||
//attachments TODO size
|
||||
ArrayNode attachmentsArray = objectMapper.createArrayNode();
|
||||
for (int pageNum = 1; pageNum <= itextDoc.getNumberOfPages(); pageNum++) {
|
||||
for (PdfAnnotation annotation : itextDoc.getPage(pageNum).getAnnotations()) {
|
||||
if (annotation instanceof PdfFileAttachmentAnnotation) {
|
||||
ObjectNode attachmentNode = objectMapper.createObjectNode();
|
||||
attachmentNode.put("Name", ((PdfFileAttachmentAnnotation) annotation).getName().toString());
|
||||
attachmentNode.put("Description", annotation.getContents().getValue());
|
||||
attachmentsArray.add(attachmentNode);
|
||||
ArrayNode multimediaArray = objectMapper.createArrayNode();
|
||||
for (PdfAnnotation annotation : annotations) {
|
||||
if (PdfName.RichMedia.equals(annotation.getSubtype())) {
|
||||
ObjectNode multimediaNode = objectMapper.createObjectNode();
|
||||
// Extract details from the dictionary as needed
|
||||
multimediaArray.add(multimediaNode);
|
||||
}
|
||||
}
|
||||
pageInfo.set("Multimedia", multimediaArray);
|
||||
|
||||
|
||||
|
||||
pageInfoParent.set("Page " + pageNum, pageInfo);
|
||||
}
|
||||
jsonOutput.set("Attachments", attachmentsArray);
|
||||
|
||||
//Javascript
|
||||
PdfDictionary namesDict = itextDoc.getCatalog().getPdfObject().getAsDictionary(PdfName.Names);
|
||||
if (namesDict != null) {
|
||||
PdfDictionary javascriptDict = namesDict.getAsDictionary(PdfName.JavaScript);
|
||||
if (javascriptDict != null) {
|
||||
ArrayNode javascriptArray = objectMapper.createArrayNode();
|
||||
PdfArray namesArray = javascriptDict.getAsArray(PdfName.Names);
|
||||
for (int i = 0; i < namesArray.size(); i += 2) {
|
||||
ObjectNode jsNode = objectMapper.createObjectNode();
|
||||
jsNode.put("JS Name", namesArray.getAsString(i).toString());
|
||||
jsNode.put("JS Code", namesArray.getAsString(i + 1).toString());
|
||||
javascriptArray.add(jsNode);
|
||||
}
|
||||
jsonOutput.set("JavaScripts", javascriptArray);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//TODO size
|
||||
PdfOCProperties ocProperties = itextDoc.getCatalog().getOCProperties(false);
|
||||
if (ocProperties != null) {
|
||||
ArrayNode layersArray = objectMapper.createArrayNode();
|
||||
for (PdfLayer layer : ocProperties.getLayers()) {
|
||||
ObjectNode layerNode = objectMapper.createObjectNode();
|
||||
layerNode.put("Name", layer.getPdfObject().getAsString(PdfName.Name).toString());
|
||||
layersArray.add(layerNode);
|
||||
}
|
||||
jsonOutput.set("Layers", layersArray);
|
||||
}
|
||||
|
||||
jsonOutput.set("Per Page Info", pageInfoParent);
|
||||
|
||||
//TODO Security
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// Digital Signatures using iText7 TODO
|
||||
|
||||
|
||||
// Save JSON to file
|
||||
try (FileWriter file = new FileWriter("output.json")) {
|
||||
file.write(objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(jsonOutput));
|
||||
file.flush();
|
||||
}
|
||||
|
||||
pdfBoxDoc.close();
|
||||
itextDoc.close();
|
||||
String jsonString = objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(jsonOutput);
|
||||
|
||||
|
||||
|
||||
return WebResponseUtils.bytesToWebResponse(jsonString.getBytes(StandardCharsets.UTF_8), "response.json", MediaType.APPLICATION_JSON);
|
||||
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private static String formatDate(Calendar calendar) {
|
||||
private static void addOutlinesToArray(PdfOutline outline, ArrayNode arrayNode) {
|
||||
if (outline == null) return;
|
||||
ObjectNode outlineNode = objectMapper.createObjectNode();
|
||||
outlineNode.put("Title", outline.getTitle());
|
||||
// You can add other properties if needed
|
||||
arrayNode.add(outlineNode);
|
||||
|
||||
for (PdfOutline child : outline.getAllChildren()) {
|
||||
addOutlinesToArray(child, arrayNode);
|
||||
}
|
||||
}
|
||||
public String getPageOrientation(double width, double height) {
|
||||
if (width > height) {
|
||||
return "Landscape";
|
||||
} else if (height > width) {
|
||||
return "Portrait";
|
||||
} else {
|
||||
return "Square";
|
||||
}
|
||||
}
|
||||
public String getPageSize(double width, double height) {
|
||||
// Common aspect ratios used for standard paper sizes
|
||||
double[] aspectRatios = {4.0 / 3.0, 3.0 / 2.0, Math.sqrt(2.0), 16.0 / 9.0};
|
||||
|
||||
// Check if the page matches any common aspect ratio
|
||||
for (double aspectRatio : aspectRatios) {
|
||||
if (isCloseToAspectRatio(width, height, aspectRatio)) {
|
||||
return "Standard";
|
||||
}
|
||||
}
|
||||
|
||||
// If not a standard aspect ratio, consider it as a custom size
|
||||
return "Custom";
|
||||
}
|
||||
private boolean isCloseToAspectRatio(double width, double height, double aspectRatio) {
|
||||
// Calculate the aspect ratio of the page
|
||||
double pageAspectRatio = width / height;
|
||||
|
||||
// Compare the page aspect ratio with the common aspect ratio within a threshold
|
||||
return Math.abs(pageAspectRatio - aspectRatio) <= 0.05;
|
||||
}
|
||||
|
||||
public boolean checkForStandard(PdfDocument document, String standardKeyword) {
|
||||
// Check Output Intents
|
||||
boolean foundInOutputIntents = checkOutputIntent(document, standardKeyword);
|
||||
if (foundInOutputIntents) return true;
|
||||
|
||||
// Check XMP Metadata (rudimentary)
|
||||
try {
|
||||
byte[] metadataBytes = document.getXmpMetadata();
|
||||
if (metadataBytes != null) {
|
||||
XMPMeta xmpMeta = XMPMetaFactory.parseFromBuffer(metadataBytes);
|
||||
String xmpString = xmpMeta.dumpObject();
|
||||
if (xmpString.contains(standardKeyword)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
} catch (XMPException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
public boolean checkOutputIntent(PdfDocument document, String standard) {
|
||||
PdfArray outputIntents = document.getCatalog().getPdfObject().getAsArray(PdfName.OutputIntents);
|
||||
if (outputIntents != null && !outputIntents.isEmpty()) {
|
||||
for (int i = 0; i < outputIntents.size(); i++) {
|
||||
PdfDictionary outputIntentDict = outputIntents.getAsDictionary(i);
|
||||
if (outputIntentDict != null) {
|
||||
PdfString s = outputIntentDict.getAsString(PdfName.S);
|
||||
if (s != null && s.toString().contains(standard)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public ArrayNode exploreStructureTree(List<Object> nodes) {
|
||||
ArrayNode elementsArray = objectMapper.createArrayNode();
|
||||
if (nodes != null) {
|
||||
for (Object obj : nodes) {
|
||||
if (obj instanceof PDStructureNode) {
|
||||
PDStructureNode node = (PDStructureNode) obj;
|
||||
ObjectNode elementNode = objectMapper.createObjectNode();
|
||||
|
||||
if (node instanceof PDStructureElement) {
|
||||
PDStructureElement structureElement = (PDStructureElement) node;
|
||||
elementNode.put("Type", structureElement.getStructureType());
|
||||
elementNode.put("Content", getContent(structureElement));
|
||||
|
||||
// Recursively explore child elements
|
||||
ArrayNode childElements = exploreStructureTree(structureElement.getKids());
|
||||
if (childElements.size() > 0) {
|
||||
elementNode.set("Children", childElements);
|
||||
}
|
||||
}
|
||||
elementsArray.add(elementNode);
|
||||
}
|
||||
}
|
||||
}
|
||||
return elementsArray;
|
||||
}
|
||||
|
||||
|
||||
public String getContent(PDStructureElement structureElement) {
|
||||
StringBuilder contentBuilder = new StringBuilder();
|
||||
|
||||
for (Object item : structureElement.getKids()) {
|
||||
if (item instanceof COSString) {
|
||||
COSString cosString = (COSString) item;
|
||||
contentBuilder.append(cosString.getString());
|
||||
} else if (item instanceof PDStructureElement) {
|
||||
// For simplicity, we're handling only COSString and PDStructureElement here
|
||||
// but a more comprehensive method would handle other types too
|
||||
contentBuilder.append(getContent((PDStructureElement) item));
|
||||
}
|
||||
}
|
||||
|
||||
return contentBuilder.toString();
|
||||
}
|
||||
|
||||
|
||||
private String formatDate(Calendar calendar) {
|
||||
if (calendar != null) {
|
||||
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
|
||||
return sdf.format(calendar.getTime());
|
||||
@ -329,7 +758,7 @@ public class PDFExtractor {
|
||||
}
|
||||
}
|
||||
|
||||
private static String getPageModeDescription(PdfName pageMode) {
|
||||
private String getPageModeDescription(PdfName pageMode) {
|
||||
return pageMode != null ? pageMode.toString().replaceFirst("/", "") : "Unknown";
|
||||
}
|
||||
}
|
||||
|
@ -52,4 +52,11 @@ public class SecurityWebController {
|
||||
model.addAttribute("currentPage", "sanitize-pdf");
|
||||
return "security/sanitize-pdf";
|
||||
}
|
||||
|
||||
@GetMapping("/get-info-on-pdf")
|
||||
@Hidden
|
||||
public String getInfo(Model model) {
|
||||
model.addAttribute("currentPage", "get-info-on-pdf");
|
||||
return "security/get-info-on-pdf";
|
||||
}
|
||||
}
|
||||
|
33
src/main/resources/templates/security/get-info-on-pdf.html
Normal file
33
src/main/resources/templates/security/get-info-on-pdf.html
Normal file
@ -0,0 +1,33 @@
|
||||
<!DOCTYPE html>
|
||||
<html th:lang="${#locale.toString()}" th:lang-direction="#{language.direction}" xmlns:th="http://www.thymeleaf.org">
|
||||
|
||||
<th:block th:insert="~{fragments/common :: head(title=#{getPdfInfo.title})}"></th:block>
|
||||
|
||||
|
||||
<body>
|
||||
<th:block th:insert="~{fragments/common :: game}"></th:block>
|
||||
<div id="page-container">
|
||||
<div id="content-wrap">
|
||||
<div th:insert="~{fragments/navbar.html :: navbar}"></div>
|
||||
<br> <br>
|
||||
<div class="container">
|
||||
<div class="row justify-content-center">
|
||||
<div class="col-md-6">
|
||||
<h2 th:text="#{getPdfInfo.header}"></h2>
|
||||
<p th:text="#{processTimeWarning}"></p>
|
||||
<form method="post" enctype="multipart/form-data" th:action="@{get-info-on-pdf}">
|
||||
<div th:replace="~{fragments/common :: fileSelector(name='fileInput', multiple=false)}"></div>
|
||||
<br>
|
||||
<button type="submit" id="submitBtn" class="btn btn-primary" th:text="#{getPdfInfo.submit}"></button>
|
||||
|
||||
</form>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<div th:insert="~{fragments/footer.html :: footer}"></div>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
Loading…
Reference in New Issue
Block a user