mirror of
				https://github.com/Frooodle/Stirling-PDF.git
				synced 2025-10-25 11:17:28 +02:00 
			
		
		
		
	redact
This commit is contained in:
		
							parent
							
								
									7c2f482b3b
								
							
						
					
					
						commit
						cd0e1a3962
					
				| @ -208,7 +208,11 @@ For API usage you must provide a header with 'X-API-Key' and the associated API | ||||
| - Progress bar/Tracking | ||||
| - Full custom logic pipelines to combine multiple operations together. | ||||
| - Folder support with auto scanning to perform operations on | ||||
| - Redact sections of pages | ||||
| - Redact text (not au | ||||
| - Add Forms | ||||
| - Annotations | ||||
| - Multi page layout (Stitch PDF pages together) support x rows y columns and custom page sizing  | ||||
| - Fill forms manually and automatically  | ||||
| 
 | ||||
| ### Q2: Why is my application downloading .htm files? | ||||
| This is an issue commonly caused by your NGINX configuration. The default file upload size for NGINX is 1MB; you need to add the following to your Nginx sites-available file: client_max_body_size SIZE; where "SIZE" is, for example, 50M for 50MB files. | ||||
|  | ||||
| @ -0,0 +1,106 @@ | ||||
| package stirling.software.SPDF.controller.api.security; | ||||
| 
 | ||||
| import java.awt.Color; | ||||
| import java.awt.image.BufferedImage; | ||||
| import java.io.ByteArrayInputStream; | ||||
| import java.io.ByteArrayOutputStream; | ||||
| import java.io.IOException; | ||||
| import java.util.List; | ||||
| 
 | ||||
| import org.apache.pdfbox.pdmodel.PDDocument; | ||||
| import org.apache.pdfbox.pdmodel.PDPage; | ||||
| import org.apache.pdfbox.pdmodel.PDPageContentStream; | ||||
| import org.apache.pdfbox.pdmodel.common.PDRectangle; | ||||
| import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory; | ||||
| import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; | ||||
| import org.apache.pdfbox.rendering.ImageType; | ||||
| import org.apache.pdfbox.rendering.PDFRenderer; | ||||
| import org.slf4j.Logger; | ||||
| import org.slf4j.LoggerFactory; | ||||
| import org.springframework.http.ResponseEntity; | ||||
| import org.springframework.web.bind.annotation.PostMapping; | ||||
| import org.springframework.web.bind.annotation.RequestParam; | ||||
| import org.springframework.web.bind.annotation.RestController; | ||||
| import org.springframework.web.multipart.MultipartFile; | ||||
| 
 | ||||
| import io.swagger.v3.oas.annotations.Operation; | ||||
| import io.swagger.v3.oas.annotations.Parameter; | ||||
| import io.swagger.v3.oas.annotations.media.Schema; | ||||
| import io.swagger.v3.oas.annotations.tags.Tag; | ||||
| import stirling.software.SPDF.model.PDFText; | ||||
| import stirling.software.SPDF.pdf.TextFinder; | ||||
| import stirling.software.SPDF.utils.WebResponseUtils; | ||||
| @RestController | ||||
| @Tag(name = "Security", description = "Security APIs") | ||||
| public class RedactController { | ||||
| 
 | ||||
|     private static final Logger logger = LoggerFactory.getLogger(RedactController.class); | ||||
| 
 | ||||
| 
 | ||||
|     @PostMapping(value = "/auto-redact", consumes = "multipart/form-data") | ||||
|     @Operation(summary = "Redacts listOfText in a PDF document",  | ||||
|                description = "This operation takes an input PDF file and redacts the provided listOfText. Input:PDF, Output:PDF, Type:SISO") | ||||
|     public ResponseEntity<byte[]> redactPdf( | ||||
|             @Parameter(description = "The input PDF file", required = true) @RequestParam("fileInput") MultipartFile file, | ||||
|             @Parameter(description = "List of listOfText to redact from the PDF", required = true, schema = @Schema(type = "string")) @RequestParam("listOfText") String listOfTextString, | ||||
|             @RequestParam(value = "useRegex", required = false) boolean useRegex, | ||||
|             @RequestParam(value = "wholeWordSearch", required = false) boolean wholeWordSearchBool, | ||||
|             @RequestParam(value = "customPadding", required = false) float customPadding, | ||||
|             @RequestParam(value = "convertPDFToImage", required = false) boolean convertPDFToImage) throws Exception { | ||||
|          | ||||
|     	System.out.println(listOfTextString); | ||||
|     	String[] listOfText = listOfTextString.split("\n"); | ||||
|         byte[] bytes = file.getBytes(); | ||||
|         PDDocument document = PDDocument.load(new ByteArrayInputStream(bytes)); | ||||
|         for (String text : listOfText) { | ||||
|         	text = text.trim(); | ||||
|         	System.out.println(text); | ||||
|         	TextFinder textFinder = new TextFinder(text, useRegex, wholeWordSearchBool); | ||||
|             List<PDFText> foundTexts = textFinder.getTextLocations(document); | ||||
|             redactFoundText(document, foundTexts, customPadding); | ||||
|         } | ||||
|          | ||||
|          | ||||
|          | ||||
|         if (convertPDFToImage) { | ||||
|             PDDocument imageDocument = new PDDocument(); | ||||
|             PDFRenderer pdfRenderer = new PDFRenderer(document); | ||||
|             for (int page = 0; page < document.getNumberOfPages(); ++page) { | ||||
|                 BufferedImage bim = pdfRenderer.renderImageWithDPI(page, 300, ImageType.RGB); | ||||
|                 PDPage newPage = new PDPage(new PDRectangle(bim.getWidth(), bim.getHeight())); | ||||
|                 imageDocument.addPage(newPage); | ||||
|                 PDImageXObject pdImage = LosslessFactory.createFromImage(imageDocument, bim); | ||||
|                 PDPageContentStream contentStream = new PDPageContentStream(imageDocument, newPage); | ||||
|                 contentStream.drawImage(pdImage, 0, 0); | ||||
|                 contentStream.close(); | ||||
|             } | ||||
|             document.close(); | ||||
|             document = imageDocument; | ||||
|         } | ||||
| 
 | ||||
|         ByteArrayOutputStream baos = new ByteArrayOutputStream(); | ||||
|         document.save(baos); | ||||
|         document.close(); | ||||
|          | ||||
|         byte[] pdfContent = baos.toByteArray(); | ||||
|         return WebResponseUtils.bytesToWebResponse(pdfContent, | ||||
|                 file.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_redacted.pdf"); | ||||
|     } | ||||
| 
 | ||||
|      | ||||
|     private void redactFoundText(PDDocument document, List<PDFText> blocks, float customPadding) throws IOException { | ||||
|         var allPages = document.getDocumentCatalog().getPages(); | ||||
| 
 | ||||
|         for (PDFText block : blocks) { | ||||
|             var page = allPages.get(block.getPageIndex()); | ||||
|             PDPageContentStream contentStream = new PDPageContentStream(document, page, PDPageContentStream.AppendMode.APPEND, true, true); | ||||
|             contentStream.setNonStrokingColor(Color.BLACK); | ||||
|             float padding = (block.getY2() - block.getY1()) * 0.3f + customPadding; | ||||
|             PDRectangle pageBox = page.getBBox(); | ||||
|             contentStream.addRect(block.getX1(), pageBox.getHeight() - block.getY1() - padding, block.getX2() - block.getX1(), block.getY2() - block.getY1() + 2 * padding); | ||||
|             contentStream.fill(); | ||||
|             contentStream.close(); | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
| } | ||||
| @ -11,6 +11,12 @@ import io.swagger.v3.oas.annotations.tags.Tag; | ||||
| @Tag(name = "Security", description = "Security APIs") | ||||
| public class SecurityWebController { | ||||
| 	 | ||||
| 	@GetMapping("/auto-redact") | ||||
|     @Hidden | ||||
|     public String autoRedactForm(Model model) { | ||||
|         model.addAttribute("currentPage", "auto-redact"); | ||||
|         return "security/auto-redact"; | ||||
|     } | ||||
| 	 | ||||
|     @GetMapping("/add-password") | ||||
|     @Hidden | ||||
|  | ||||
							
								
								
									
										42
									
								
								src/main/java/stirling/software/SPDF/model/PDFText.java
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										42
									
								
								src/main/java/stirling/software/SPDF/model/PDFText.java
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,42 @@ | ||||
| package stirling.software.SPDF.model; | ||||
/**
 * Immutable holder for a piece of text together with the page it was found on
 * and the two corner points of its bounding box.
 */
public class PDFText {

    private final int pageIndex;
    private final float x1, y1;
    private final float x2, y2;
    private final String text;

    /**
     * @param pageIndex index of the page the text belongs to
     * @param x1        x-coordinate of the first corner
     * @param y1        y-coordinate of the first corner
     * @param x2        x-coordinate of the second corner
     * @param y2        y-coordinate of the second corner
     * @param text      the text associated with the box
     */
    public PDFText(int pageIndex, float x1, float y1, float x2, float y2, String text) {
        this.text = text;
        this.pageIndex = pageIndex;
        this.x1 = x1;
        this.x2 = x2;
        this.y1 = y1;
        this.y2 = y2;
    }

    /** @return the text associated with the box */
    public String getText() { return this.text; }

    /** @return index of the page the text belongs to */
    public int getPageIndex() { return this.pageIndex; }

    /** @return x-coordinate of the first corner */
    public float getX1() { return this.x1; }

    /** @return x-coordinate of the second corner */
    public float getX2() { return this.x2; }

    /** @return y-coordinate of the first corner */
    public float getY1() { return this.y1; }

    /** @return y-coordinate of the second corner */
    public float getY2() { return this.y2; }
}
							
								
								
									
										91
									
								
								src/main/java/stirling/software/SPDF/pdf/TextFinder.java
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										91
									
								
								src/main/java/stirling/software/SPDF/pdf/TextFinder.java
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,91 @@ | ||||
| package stirling.software.SPDF.pdf; | ||||
| import org.apache.pdfbox.pdmodel.PDDocument; | ||||
| import org.apache.pdfbox.text.PDFTextStripper; | ||||
| import org.apache.pdfbox.text.TextPosition; | ||||
| import org.springframework.http.ResponseEntity; | ||||
| 
 | ||||
| import stirling.software.SPDF.model.PDFText; | ||||
| 
 | ||||
| import java.io.IOException; | ||||
| import java.util.ArrayList; | ||||
| import java.util.List; | ||||
| import java.util.regex.Matcher; | ||||
| import java.util.regex.Pattern; | ||||
| 
 | ||||
| public class TextFinder extends PDFTextStripper { | ||||
| 
 | ||||
| 	private final String searchText; | ||||
| 	private final boolean useRegex; | ||||
| 	private final boolean wholeWordSearch; | ||||
| 	private final List<PDFText> textOccurrences = new ArrayList<>(); | ||||
| 
 | ||||
| 	public TextFinder(String searchText, boolean useRegex, boolean wholeWordSearch) throws IOException { | ||||
| 	    this.searchText = searchText.toLowerCase(); | ||||
| 	    this.useRegex = useRegex; | ||||
| 	    this.wholeWordSearch = wholeWordSearch; | ||||
| 	    setSortByPosition(true); | ||||
| 	} | ||||
| 
 | ||||
| 	private List<Integer> findOccurrencesInText(String searchText, String content) { | ||||
| 	    List<Integer> indexes = new ArrayList<>(); | ||||
| 	    Pattern pattern; | ||||
| 
 | ||||
| 	    if (useRegex) { | ||||
| 	        // Use regex-based search | ||||
| 	        pattern = wholeWordSearch  | ||||
| 	            ? Pattern.compile("(\\b|_|\\.)" + searchText + "(\\b|_|\\.)")  | ||||
| 	            : Pattern.compile(searchText); | ||||
| 	    } else { | ||||
| 	        // Use normal text search | ||||
| 	        pattern = wholeWordSearch  | ||||
| 		            ? Pattern.compile("(\\b|_|\\.)" + Pattern.quote(searchText) + "(\\b|_|\\.)")  | ||||
| 		    	            : Pattern.compile(Pattern.quote(searchText)); | ||||
| 	    } | ||||
| 
 | ||||
| 	    Matcher matcher = pattern.matcher(content); | ||||
| 	    while (matcher.find()) { | ||||
| 	        indexes.add(matcher.start()); | ||||
| 	    } | ||||
| 	    return indexes; | ||||
| 	} | ||||
| 	 | ||||
| 	@Override | ||||
| 	protected void writeString(String text, List<TextPosition> textPositions) { | ||||
| 	    for (Integer index : findOccurrencesInText(searchText, text.toLowerCase())) { | ||||
| 	        if (index + searchText.length() <= textPositions.size()) { | ||||
| 	            // Initial values based on the first character | ||||
| 	            TextPosition first = textPositions.get(index); | ||||
| 	            float minX = first.getX(); | ||||
| 	            float minY = first.getY(); | ||||
| 	            float maxX = first.getX() + first.getWidth(); | ||||
| 	            float maxY = first.getY() + first.getHeight(); | ||||
| 
 | ||||
| 	            // Loop over the rest of the characters and adjust bounding box values | ||||
| 	            for (int i = index; i < index + searchText.length(); i++) { | ||||
| 	                TextPosition position = textPositions.get(i); | ||||
| 	                minX = Math.min(minX, position.getX()); | ||||
| 	                minY = Math.min(minY, position.getY()); | ||||
| 	                maxX = Math.max(maxX, position.getX() + position.getWidth()); | ||||
| 	                maxY = Math.max(maxY, position.getY() + position.getHeight()); | ||||
| 	            } | ||||
| 
 | ||||
| 	            textOccurrences.add(new PDFText( | ||||
| 	                    getCurrentPageNo() - 1, | ||||
| 	                    minX, | ||||
| 	                    minY, | ||||
| 	                    maxX, | ||||
| 	                    maxY, | ||||
| 	                    text | ||||
| 	            )); | ||||
| 	        } | ||||
| 	    } | ||||
| 	} | ||||
| 
 | ||||
| 	public List<PDFText> getTextLocations(PDDocument document) throws Exception { | ||||
| 		this.getText(document); | ||||
| 		System.out.println("Found " + textOccurrences.size() + " occurrences of '" + searchText + "' in the document."); | ||||
| 
 | ||||
| 		return textOccurrences; | ||||
| 	} | ||||
| 
 | ||||
| } | ||||
| @ -304,6 +304,10 @@ showJS.tags=JS | ||||
| #       WEB PAGES         # | ||||
| #                         # | ||||
| ########################### | ||||
| #auto-redact | ||||
| autoRedact.title=Auto Redact | ||||
| autoRedact.header=Auto Redact | ||||
| 
 | ||||
| #showJS | ||||
| showJS.title=Show Javascript | ||||
| showJS.header=Show Javascript | ||||
|  | ||||
							
								
								
									
										56
									
								
								src/main/resources/templates/security/auto-redact.html
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										56
									
								
								src/main/resources/templates/security/auto-redact.html
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,56 @@ | ||||
| <!DOCTYPE html> | ||||
| <html th:lang="${#locale.toString()}" th:lang-direction="#{language.direction}" xmlns:th="http://www.thymeleaf.org"> | ||||
| 
 | ||||
| <th:block th:insert="~{fragments/common :: head(title=#{autoRedact.title})}"></th:block> | ||||
| 
 | ||||
| 
 | ||||
| <body> | ||||
|   <div id="page-container"> | ||||
|     <div id="content-wrap"> | ||||
|       <div th:insert="~{fragments/navbar.html :: navbar}"></div> | ||||
|       <br> <br> | ||||
|       <div class="container"> | ||||
|         <div class="row justify-content-center"> | ||||
|           <div class="col-md-6"> | ||||
|             <h2 th:text="#{autoRedact.header}"></h2> | ||||
| 
 | ||||
|             <form action="/auto-redact" method="post" enctype="multipart/form-data"> | ||||
| 		        <div class="mb-3"> | ||||
| 		            <input type="file" class="form-control" id="fileInput" name="fileInput" required accept="application/pdf"> | ||||
| 		        </div> | ||||
| 		         | ||||
| 		        <div class="mb-3"> | ||||
| 		            <label for="listOfText" class="form-label">Texts to Redact (line-separated)</label> | ||||
| 		            <textarea class="form-control" id="listOfText" name="listOfText" rows="4" required placeholder="e.g. Confidential,Top-Secret"></textarea> | ||||
| 		        </div> | ||||
| 		         | ||||
| 		        <div class="mb-3 form-check"> | ||||
| 		            <input type="checkbox" class="form-check-input" id="useRegex" name="useRegex"> | ||||
| 		            <label class="form-check-label" for="useRegex">Use Regex</label> | ||||
| 		        </div> | ||||
| 		 | ||||
| 		        <div class="mb-3 form-check"> | ||||
| 		            <input type="checkbox" class="form-check-input" id="wholeWordSearch" name="wholeWordSearch"> | ||||
| 		            <label class="form-check-label" for="wholeWordSearch">Whole Word Search</label> | ||||
| 		        </div> | ||||
| 		         | ||||
| 		        <div class="mb-3"> | ||||
| 		            <label for="customPadding" class="form-label">Custom Padding</label> | ||||
| 		            <input type="number" step="0.1" class="form-control" id="customPadding" name="customPadding" placeholder="0.0" value="0.1"> | ||||
| 		        </div> | ||||
| 		 | ||||
| 		        <div class="mb-3 form-check"> | ||||
| 		            <input type="checkbox" class="form-check-input" id="convertPDFToImage" name="convertPDFToImage" checked> | ||||
| 		            <label class="form-check-label" for="convertPDFToImage">Convert PDF to Image</label> | ||||
| 		        </div> | ||||
| 		         | ||||
| 		        <button type="submit" class="btn btn-primary">Submit</button> | ||||
| 		    </form> | ||||
|           </div> | ||||
|         </div> | ||||
|       </div> | ||||
|     </div> | ||||
|     <div th:insert="~{fragments/footer.html :: footer}"></div> | ||||
|   </div> | ||||
| </body> | ||||
| </html> | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user