mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-08-06 13:48:58 +02:00
feat: auto-redact to support text removal on true PDFs
This commit is contained in:
parent
bbf5d5f6d4
commit
d7fb66bb79
@ -1,19 +1,33 @@
|
|||||||
package stirling.software.SPDF.controller.api.security;
|
package stirling.software.SPDF.controller.api.security;
|
||||||
|
|
||||||
import java.awt.*;
|
import java.awt.Color;
|
||||||
import java.io.ByteArrayOutputStream;
|
import java.io.ByteArrayOutputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
import org.apache.pdfbox.contentstream.operator.Operator;
|
||||||
|
import org.apache.pdfbox.cos.COSArray;
|
||||||
|
import org.apache.pdfbox.cos.COSBase;
|
||||||
|
import org.apache.pdfbox.cos.COSName;
|
||||||
|
import org.apache.pdfbox.cos.COSString;
|
||||||
|
import org.apache.pdfbox.pdfparser.PDFStreamParser;
|
||||||
|
import org.apache.pdfbox.pdfwriter.ContentStreamWriter;
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
import org.apache.pdfbox.pdmodel.PDPage;
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||||
import org.apache.pdfbox.pdmodel.PDPageTree;
|
import org.apache.pdfbox.pdmodel.PDPageTree;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDResources;
|
||||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||||
|
import org.apache.pdfbox.pdmodel.common.PDStream;
|
||||||
|
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||||
import org.springframework.http.ResponseEntity;
|
import org.springframework.http.ResponseEntity;
|
||||||
import org.springframework.web.bind.WebDataBinder;
|
import org.springframework.web.bind.WebDataBinder;
|
||||||
import org.springframework.web.bind.annotation.InitBinder;
|
import org.springframework.web.bind.annotation.InitBinder;
|
||||||
@ -27,6 +41,8 @@ import io.github.pixee.security.Filenames;
|
|||||||
import io.swagger.v3.oas.annotations.Operation;
|
import io.swagger.v3.oas.annotations.Operation;
|
||||||
import io.swagger.v3.oas.annotations.tags.Tag;
|
import io.swagger.v3.oas.annotations.tags.Tag;
|
||||||
|
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Data;
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
@ -48,6 +64,13 @@ import stirling.software.common.util.propertyeditor.StringToArrayListPropertyEdi
|
|||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
public class RedactController {
|
public class RedactController {
|
||||||
|
|
||||||
|
private static final float DEFAULT_TEXT_PADDING_MULTIPLIER = 0.3f;
|
||||||
|
private static final float PRECISION_THRESHOLD = 1e-3f;
|
||||||
|
private static final int FONT_SCALE_FACTOR = 1000;
|
||||||
|
|
||||||
|
// Text showing operators
|
||||||
|
private static final Set<String> TEXT_SHOWING_OPERATORS = Set.of("Tj", "TJ", "'", "\"");
|
||||||
|
|
||||||
private final CustomPDFDocumentFactory pdfDocumentFactory;
|
private final CustomPDFDocumentFactory pdfDocumentFactory;
|
||||||
|
|
||||||
@InitBinder
|
@InitBinder
|
||||||
@ -65,17 +88,30 @@ public class RedactController {
|
|||||||
+ " Type:SISO")
|
+ " Type:SISO")
|
||||||
public ResponseEntity<byte[]> redactPDF(@ModelAttribute ManualRedactPdfRequest request)
|
public ResponseEntity<byte[]> redactPDF(@ModelAttribute ManualRedactPdfRequest request)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
log.debug(
|
||||||
|
"Starting manual redaction for file: {}",
|
||||||
|
request.getFileInput().getOriginalFilename());
|
||||||
|
|
||||||
MultipartFile file = request.getFileInput();
|
MultipartFile file = request.getFileInput();
|
||||||
List<RedactionArea> redactionAreas = request.getRedactions();
|
List<RedactionArea> redactionAreas = request.getRedactions();
|
||||||
|
|
||||||
|
log.debug(
|
||||||
|
"Processing {} redaction areas",
|
||||||
|
redactionAreas != null ? redactionAreas.size() : 0);
|
||||||
|
|
||||||
PDDocument document = pdfDocumentFactory.load(file);
|
PDDocument document = pdfDocumentFactory.load(file);
|
||||||
|
log.debug("Loaded PDF document with {} pages", document.getNumberOfPages());
|
||||||
|
|
||||||
PDPageTree allPages = document.getDocumentCatalog().getPages();
|
PDPageTree allPages = document.getDocumentCatalog().getPages();
|
||||||
|
|
||||||
|
log.debug("Starting page redactions");
|
||||||
redactPages(request, document, allPages);
|
redactPages(request, document, allPages);
|
||||||
|
|
||||||
|
log.debug("Starting area redactions");
|
||||||
redactAreas(redactionAreas, document, allPages);
|
redactAreas(redactionAreas, document, allPages);
|
||||||
|
|
||||||
if (Boolean.TRUE.equals(request.getConvertPDFToImage())) {
|
if (Boolean.TRUE.equals(request.getConvertPDFToImage())) {
|
||||||
|
log.debug("Converting PDF to image format");
|
||||||
PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document);
|
PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document);
|
||||||
document.close();
|
document.close();
|
||||||
document = convertedPdf;
|
document = convertedPdf;
|
||||||
@ -86,6 +122,8 @@ public class RedactController {
|
|||||||
document.close();
|
document.close();
|
||||||
|
|
||||||
byte[] pdfContent = baos.toByteArray();
|
byte[] pdfContent = baos.toByteArray();
|
||||||
|
log.debug("Manual redaction completed. Output PDF size: {} bytes", pdfContent.length);
|
||||||
|
|
||||||
return WebResponseUtils.bytesToWebResponse(
|
return WebResponseUtils.bytesToWebResponse(
|
||||||
pdfContent,
|
pdfContent,
|
||||||
Filenames.toSimpleFileName(file.getOriginalFilename()).replaceFirst("[.][^.]+$", "")
|
Filenames.toSimpleFileName(file.getOriginalFilename()).replaceFirst("[.][^.]+$", "")
|
||||||
@ -95,17 +133,30 @@ public class RedactController {
|
|||||||
private void redactAreas(
|
private void redactAreas(
|
||||||
List<RedactionArea> redactionAreas, PDDocument document, PDPageTree allPages)
|
List<RedactionArea> redactionAreas, PDDocument document, PDPageTree allPages)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
log.debug("Processing redaction areas");
|
||||||
|
|
||||||
// Group redaction areas by page
|
// Group redaction areas by page
|
||||||
Map<Integer, List<RedactionArea>> redactionsByPage = new HashMap<>();
|
Map<Integer, List<RedactionArea>> redactionsByPage = new HashMap<>();
|
||||||
|
|
||||||
// Process and validate each redaction area
|
// Process and validate each redaction area
|
||||||
for (RedactionArea redactionArea : redactionAreas) {
|
for (RedactionArea redactionArea : redactionAreas) {
|
||||||
|
log.debug(
|
||||||
|
"Validating redaction area on page {}: x={}, y={}, width={}, height={}",
|
||||||
|
redactionArea.getPage(),
|
||||||
|
redactionArea.getX(),
|
||||||
|
redactionArea.getY(),
|
||||||
|
redactionArea.getWidth(),
|
||||||
|
redactionArea.getHeight());
|
||||||
|
|
||||||
if (redactionArea.getPage() == null
|
if (redactionArea.getPage() == null
|
||||||
|| redactionArea.getPage() <= 0
|
|| redactionArea.getPage() <= 0
|
||||||
|| redactionArea.getHeight() == null
|
|| redactionArea.getHeight() == null
|
||||||
|| redactionArea.getHeight() <= 0.0D
|
|| redactionArea.getHeight() <= 0.0D
|
||||||
|| redactionArea.getWidth() == null
|
|| redactionArea.getWidth() == null
|
||||||
|| redactionArea.getWidth() <= 0.0D) continue;
|
|| redactionArea.getWidth() <= 0.0D) {
|
||||||
|
log.debug("Skipping invalid redaction area: {}", redactionArea);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
// Group by page number
|
// Group by page number
|
||||||
redactionsByPage
|
redactionsByPage
|
||||||
@ -113,70 +164,151 @@ public class RedactController {
|
|||||||
.add(redactionArea);
|
.add(redactionArea);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
log.debug("Grouped redactions by page: {} pages affected", redactionsByPage.size());
|
||||||
|
|
||||||
// Process each page only once
|
// Process each page only once
|
||||||
for (Map.Entry<Integer, List<RedactionArea>> entry : redactionsByPage.entrySet()) {
|
for (Map.Entry<Integer, List<RedactionArea>> entry : redactionsByPage.entrySet()) {
|
||||||
Integer pageNumber = entry.getKey();
|
Integer pageNumber = entry.getKey();
|
||||||
List<RedactionArea> areasForPage = entry.getValue();
|
List<RedactionArea> areasForPage = entry.getValue();
|
||||||
|
|
||||||
|
log.debug(
|
||||||
|
"Processing page {} with {} redaction areas", pageNumber, areasForPage.size());
|
||||||
|
|
||||||
if (pageNumber > allPages.getCount()) {
|
if (pageNumber > allPages.getCount()) {
|
||||||
|
log.debug(
|
||||||
|
"Skipping page {} - out of bounds (total pages: {})",
|
||||||
|
pageNumber,
|
||||||
|
allPages.getCount());
|
||||||
continue; // Skip if page number is out of bounds
|
continue; // Skip if page number is out of bounds
|
||||||
}
|
}
|
||||||
|
|
||||||
PDPage page = allPages.get(pageNumber - 1);
|
PDPage page = allPages.get(pageNumber - 1);
|
||||||
PDRectangle box = page.getBBox();
|
|
||||||
|
|
||||||
// Create only one content stream per page
|
// Create only one content stream per page to draw all redaction boxes
|
||||||
PDPageContentStream contentStream =
|
try (PDPageContentStream contentStream =
|
||||||
new PDPageContentStream(
|
new PDPageContentStream(
|
||||||
document, page, PDPageContentStream.AppendMode.APPEND, true, true);
|
document, page, PDPageContentStream.AppendMode.APPEND, true, true)) {
|
||||||
|
|
||||||
// Process all redactions for this page
|
// Process all redactions for this page
|
||||||
for (RedactionArea redactionArea : areasForPage) {
|
for (RedactionArea redactionArea : areasForPage) {
|
||||||
Color redactColor = decodeOrDefault(redactionArea.getColor(), Color.BLACK);
|
Color redactColor = decodeOrDefault(redactionArea.getColor());
|
||||||
contentStream.setNonStrokingColor(redactColor);
|
log.debug(
|
||||||
|
"Applying redaction with color {} at ({}, {}) size {}x{}",
|
||||||
|
redactColor,
|
||||||
|
redactionArea.getX(),
|
||||||
|
redactionArea.getY(),
|
||||||
|
redactionArea.getWidth(),
|
||||||
|
redactionArea.getHeight());
|
||||||
|
|
||||||
float x = redactionArea.getX().floatValue();
|
contentStream.setNonStrokingColor(redactColor);
|
||||||
float y = redactionArea.getY().floatValue();
|
|
||||||
float width = redactionArea.getWidth().floatValue();
|
|
||||||
float height = redactionArea.getHeight().floatValue();
|
|
||||||
|
|
||||||
contentStream.addRect(x, box.getHeight() - y - height, width, height);
|
float x = redactionArea.getX().floatValue();
|
||||||
contentStream.fill();
|
float y = redactionArea.getY().floatValue();
|
||||||
|
float width = redactionArea.getWidth().floatValue();
|
||||||
|
float height = redactionArea.getHeight().floatValue();
|
||||||
|
|
||||||
|
// The y-coordinate needs to be transformed from a top-left origin to a
|
||||||
|
// bottom-left origin.
|
||||||
|
float pdfY = page.getBBox().getHeight() - y - height;
|
||||||
|
|
||||||
|
contentStream.addRect(x, pdfY, width, height);
|
||||||
|
contentStream.fill();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
contentStream.close();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
log.debug("Completed redaction areas processing");
|
||||||
}
|
}
|
||||||
|
|
||||||
private void redactPages(
|
private void redactPages(
|
||||||
ManualRedactPdfRequest request, PDDocument document, PDPageTree allPages)
|
ManualRedactPdfRequest request, PDDocument document, PDPageTree allPages)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
Color redactColor = decodeOrDefault(request.getPageRedactionColor(), Color.BLACK);
|
log.debug("Starting page redactions");
|
||||||
|
|
||||||
|
Color redactColor = decodeOrDefault(request.getPageRedactionColor());
|
||||||
List<Integer> pageNumbers = getPageNumbers(request, allPages.getCount());
|
List<Integer> pageNumbers = getPageNumbers(request, allPages.getCount());
|
||||||
|
|
||||||
|
log.debug("Redacting {} pages with color {}", pageNumbers.size(), redactColor);
|
||||||
|
|
||||||
for (Integer pageNumber : pageNumbers) {
|
for (Integer pageNumber : pageNumbers) {
|
||||||
|
log.debug("Redacting entire page {}", pageNumber + 1);
|
||||||
|
|
||||||
PDPage page = allPages.get(pageNumber);
|
PDPage page = allPages.get(pageNumber);
|
||||||
|
|
||||||
PDPageContentStream contentStream =
|
try (PDPageContentStream contentStream =
|
||||||
new PDPageContentStream(
|
new PDPageContentStream(
|
||||||
document, page, PDPageContentStream.AppendMode.APPEND, true, true);
|
document, page, PDPageContentStream.AppendMode.APPEND, true, true)) {
|
||||||
contentStream.setNonStrokingColor(redactColor);
|
contentStream.setNonStrokingColor(redactColor);
|
||||||
|
|
||||||
PDRectangle box = page.getBBox();
|
PDRectangle box = page.getBBox();
|
||||||
|
log.debug(
|
||||||
|
"Page {} dimensions: {}x{}",
|
||||||
|
pageNumber + 1,
|
||||||
|
box.getWidth(),
|
||||||
|
box.getHeight());
|
||||||
|
|
||||||
contentStream.addRect(0, 0, box.getWidth(), box.getHeight());
|
contentStream.addRect(0, 0, box.getWidth(), box.getHeight());
|
||||||
contentStream.fill();
|
contentStream.fill();
|
||||||
contentStream.close();
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
log.debug("Completed page redactions");
|
||||||
}
|
}
|
||||||
|
|
||||||
private Color decodeOrDefault(String hex, Color defaultColor) {
|
private void redactFoundText(
|
||||||
try {
|
PDDocument document, List<PDFText> blocks, float customPadding, Color redactColor)
|
||||||
if (hex != null && !hex.startsWith("#")) {
|
throws IOException {
|
||||||
hex = "#" + hex;
|
log.debug(
|
||||||
|
"Redacting {} text blocks with padding {} and color {}",
|
||||||
|
blocks.size(),
|
||||||
|
customPadding,
|
||||||
|
redactColor);
|
||||||
|
|
||||||
|
var allPages = document.getDocumentCatalog().getPages();
|
||||||
|
|
||||||
|
for (PDFText block : blocks) {
|
||||||
|
log.debug(
|
||||||
|
"Redacting text block on page {}: '{}' at ({}, {}) to ({}, {})",
|
||||||
|
block.getPageIndex() + 1,
|
||||||
|
block.getText(),
|
||||||
|
block.getX1(),
|
||||||
|
block.getY1(),
|
||||||
|
block.getX2(),
|
||||||
|
block.getY2());
|
||||||
|
|
||||||
|
var page = allPages.get(block.getPageIndex());
|
||||||
|
try (PDPageContentStream contentStream =
|
||||||
|
new PDPageContentStream(
|
||||||
|
document, page, PDPageContentStream.AppendMode.APPEND, true, true)) {
|
||||||
|
contentStream.setNonStrokingColor(redactColor);
|
||||||
|
float padding =
|
||||||
|
(block.getY2() - block.getY1()) * DEFAULT_TEXT_PADDING_MULTIPLIER
|
||||||
|
+ customPadding;
|
||||||
|
PDRectangle pageBox = page.getBBox();
|
||||||
|
contentStream.addRect(
|
||||||
|
block.getX1(),
|
||||||
|
pageBox.getHeight() - block.getY2() - padding,
|
||||||
|
block.getX2() - block.getX1(),
|
||||||
|
block.getY2() - block.getY1() + 2 * padding);
|
||||||
|
contentStream.fill();
|
||||||
}
|
}
|
||||||
return Color.decode(hex);
|
}
|
||||||
} catch (Exception e) {
|
|
||||||
return defaultColor;
|
log.debug("Completed text block redactions");
|
||||||
|
}
|
||||||
|
|
||||||
|
private Color decodeOrDefault(String hex) {
|
||||||
|
if (hex == null) {
|
||||||
|
return Color.BLACK;
|
||||||
|
}
|
||||||
|
|
||||||
|
String colorString = hex.startsWith("#") ? hex : "#" + hex;
|
||||||
|
|
||||||
|
try {
|
||||||
|
return Color.decode(colorString);
|
||||||
|
} catch (NumberFormatException e) {
|
||||||
|
log.warn("Invalid color string '{}'. Using default color BLACK.", hex);
|
||||||
|
return Color.BLACK;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -198,6 +330,10 @@ public class RedactController {
|
|||||||
+ " Input:PDF, Output:PDF, Type:SISO")
|
+ " Input:PDF, Output:PDF, Type:SISO")
|
||||||
public ResponseEntity<byte[]> redactPdf(@ModelAttribute RedactPdfRequest request)
|
public ResponseEntity<byte[]> redactPdf(@ModelAttribute RedactPdfRequest request)
|
||||||
throws Exception {
|
throws Exception {
|
||||||
|
log.debug(
|
||||||
|
"Starting auto-redaction for file: {}",
|
||||||
|
request.getFileInput().getOriginalFilename());
|
||||||
|
|
||||||
MultipartFile file = request.getFileInput();
|
MultipartFile file = request.getFileInput();
|
||||||
String listOfTextString = request.getListOfText();
|
String listOfTextString = request.getListOfText();
|
||||||
boolean useRegex = Boolean.TRUE.equals(request.getUseRegex());
|
boolean useRegex = Boolean.TRUE.equals(request.getUseRegex());
|
||||||
@ -206,28 +342,80 @@ public class RedactController {
|
|||||||
float customPadding = request.getCustomPadding();
|
float customPadding = request.getCustomPadding();
|
||||||
boolean convertPDFToImage = Boolean.TRUE.equals(request.getConvertPDFToImage());
|
boolean convertPDFToImage = Boolean.TRUE.equals(request.getConvertPDFToImage());
|
||||||
|
|
||||||
|
log.debug(
|
||||||
|
"Auto-redaction parameters: useRegex={}, wholeWordSearch={}, customPadding={}, convertToImage={}",
|
||||||
|
useRegex,
|
||||||
|
wholeWordSearchBool,
|
||||||
|
customPadding,
|
||||||
|
convertPDFToImage);
|
||||||
|
|
||||||
String[] listOfText = listOfTextString.split("\n");
|
String[] listOfText = listOfTextString.split("\n");
|
||||||
|
log.debug("Searching for {} text patterns", listOfText.length);
|
||||||
|
|
||||||
PDDocument document = pdfDocumentFactory.load(file);
|
PDDocument document = pdfDocumentFactory.load(file);
|
||||||
|
log.debug("Loaded PDF document with {} pages", document.getNumberOfPages());
|
||||||
|
|
||||||
Color redactColor;
|
Color redactColor;
|
||||||
try {
|
try {
|
||||||
if (!colorString.startsWith("#")) {
|
if (colorString != null && !colorString.startsWith("#")) {
|
||||||
colorString = "#" + colorString;
|
colorString = "#" + colorString;
|
||||||
}
|
}
|
||||||
redactColor = Color.decode(colorString);
|
redactColor = Color.decode(colorString);
|
||||||
|
log.debug("Using redaction color: {}", redactColor);
|
||||||
} catch (NumberFormatException e) {
|
} catch (NumberFormatException e) {
|
||||||
log.warn("Invalid color string provided. Using default color BLACK for redaction.");
|
log.warn("Invalid color string provided. Using default color BLACK for redaction.");
|
||||||
redactColor = Color.BLACK;
|
redactColor = Color.BLACK;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Step 1: Find all text locations for all search terms
|
||||||
|
log.debug("Step 1: Finding all text locations");
|
||||||
|
Map<Integer, List<PDFText>> allFoundTextsByPage = new HashMap<>();
|
||||||
|
Set<String> allSearchTerms = new HashSet<>();
|
||||||
for (String text : listOfText) {
|
for (String text : listOfText) {
|
||||||
text = text.trim();
|
text = text.trim();
|
||||||
|
if (text.isEmpty()) continue;
|
||||||
|
|
||||||
|
log.debug("Searching for text pattern: '{}'", text);
|
||||||
|
allSearchTerms.add(text);
|
||||||
TextFinder textFinder = new TextFinder(text, useRegex, wholeWordSearchBool);
|
TextFinder textFinder = new TextFinder(text, useRegex, wholeWordSearchBool);
|
||||||
List<PDFText> foundTexts = textFinder.getTextLocations(document);
|
textFinder.getText(document);
|
||||||
redactFoundText(document, foundTexts, customPadding, redactColor);
|
List<PDFText> foundTexts = textFinder.getFoundTexts();
|
||||||
|
|
||||||
|
log.debug("Found {} instances of pattern '{}'", foundTexts.size(), text);
|
||||||
|
|
||||||
|
for (PDFText found : foundTexts) {
|
||||||
|
allFoundTextsByPage
|
||||||
|
.computeIfAbsent(found.getPageIndex(), k -> new ArrayList<>())
|
||||||
|
.add(found);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
log.debug("Total pages with found text: {}", allFoundTextsByPage.size());
|
||||||
|
|
||||||
|
// Step 2: Process each page
|
||||||
|
log.debug("Step 2: Processing each page for text replacement");
|
||||||
|
for (PDPage page : document.getPages()) {
|
||||||
|
// Replace text content
|
||||||
|
List<Object> filteredTokens =
|
||||||
|
createTokensWithoutTargetText(
|
||||||
|
page, allSearchTerms, useRegex, wholeWordSearchBool);
|
||||||
|
writeFilteredContentStream(document, page, filteredTokens);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Draw redaction boxes for all found texts
|
||||||
|
List<PDFText> allFoundTexts = new ArrayList<>();
|
||||||
|
for (List<PDFText> pageTexts : allFoundTextsByPage.values()) {
|
||||||
|
allFoundTexts.addAll(pageTexts);
|
||||||
|
}
|
||||||
|
|
||||||
|
log.debug("Drawing redaction boxes for {} total found texts", allFoundTexts.size());
|
||||||
|
|
||||||
|
if (!allFoundTexts.isEmpty()) {
|
||||||
|
redactFoundText(document, allFoundTexts, customPadding, redactColor);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (convertPDFToImage) {
|
if (convertPDFToImage) {
|
||||||
|
log.debug("Converting redacted PDF to image format");
|
||||||
PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document);
|
PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document);
|
||||||
document.close();
|
document.close();
|
||||||
document = convertedPdf;
|
document = convertedPdf;
|
||||||
@ -238,32 +426,465 @@ public class RedactController {
|
|||||||
document.close();
|
document.close();
|
||||||
|
|
||||||
byte[] pdfContent = baos.toByteArray();
|
byte[] pdfContent = baos.toByteArray();
|
||||||
|
log.debug("Auto-redaction completed. Output PDF size: {} bytes", pdfContent.length);
|
||||||
|
|
||||||
return WebResponseUtils.bytesToWebResponse(
|
return WebResponseUtils.bytesToWebResponse(
|
||||||
pdfContent,
|
pdfContent,
|
||||||
Filenames.toSimpleFileName(file.getOriginalFilename()).replaceFirst("[.][^.]+$", "")
|
Filenames.toSimpleFileName(file.getOriginalFilename()).replaceFirst("[.][^.]+$", "")
|
||||||
+ "_redacted.pdf");
|
+ "_redacted.pdf");
|
||||||
}
|
}
|
||||||
|
|
||||||
private void redactFoundText(
|
private List<Object> createTokensWithoutTargetText(
|
||||||
PDDocument document, List<PDFText> blocks, float customPadding, Color redactColor)
|
PDPage page, Set<String> targetWords, boolean useRegex, boolean wholeWordSearch)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
var allPages = document.getDocumentCatalog().getPages();
|
log.debug(
|
||||||
|
"Creating tokens without target text for page, searching for {} words",
|
||||||
|
targetWords.size());
|
||||||
|
|
||||||
for (PDFText block : blocks) {
|
PDFStreamParser parser = new PDFStreamParser(page);
|
||||||
var page = allPages.get(block.getPageIndex());
|
List<Object> tokens = new ArrayList<>();
|
||||||
PDPageContentStream contentStream =
|
Object token;
|
||||||
new PDPageContentStream(
|
while ((token = parser.parseNextToken()) != null) {
|
||||||
document, page, PDPageContentStream.AppendMode.APPEND, true, true);
|
tokens.add(token);
|
||||||
contentStream.setNonStrokingColor(redactColor);
|
}
|
||||||
float padding = (block.getY2() - block.getY1()) * 0.3f + customPadding;
|
|
||||||
PDRectangle pageBox = page.getBBox();
|
log.debug("Parsed {} tokens from page content stream", tokens.size());
|
||||||
contentStream.addRect(
|
|
||||||
block.getX1(),
|
List<TextSegment> textSegments = extractTextSegments(page, tokens);
|
||||||
pageBox.getHeight() - block.getY1() - padding,
|
log.debug("Extracted {} text segments", textSegments.size());
|
||||||
block.getX2() - block.getX1(),
|
|
||||||
block.getY2() - block.getY1() + 2 * padding);
|
String completeText = buildCompleteText(textSegments);
|
||||||
contentStream.fill();
|
log.debug("Built complete text of {} characters", completeText.length());
|
||||||
contentStream.close();
|
|
||||||
|
List<MatchRange> matches =
|
||||||
|
findAllMatches(completeText, targetWords, useRegex, wholeWordSearch);
|
||||||
|
log.debug("Found {} matches in complete text", matches.size());
|
||||||
|
|
||||||
|
return applyRedactionsToTokens(tokens, textSegments, matches);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Data
|
||||||
|
private static class GraphicsState {
|
||||||
|
private PDFont font = null;
|
||||||
|
private float fontSize = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@AllArgsConstructor
|
||||||
|
private static class TextSegment {
|
||||||
|
private int tokenIndex;
|
||||||
|
private String operatorName;
|
||||||
|
private String text;
|
||||||
|
private int startPos;
|
||||||
|
private int endPos;
|
||||||
|
private PDFont font;
|
||||||
|
private float fontSize;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@AllArgsConstructor
|
||||||
|
private static class MatchRange {
|
||||||
|
private int startPos;
|
||||||
|
private int endPos;
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<TextSegment> extractTextSegments(PDPage page, List<Object> tokens)
|
||||||
|
throws IOException {
|
||||||
|
log.debug("Extracting text segments from {} tokens", tokens.size());
|
||||||
|
|
||||||
|
List<TextSegment> segments = new ArrayList<>();
|
||||||
|
int currentTextPos = 0;
|
||||||
|
GraphicsState graphicsState = new GraphicsState();
|
||||||
|
PDResources resources = page.getResources();
|
||||||
|
|
||||||
|
for (int i = 0; i < tokens.size(); i++) {
|
||||||
|
Object currentToken = tokens.get(i);
|
||||||
|
|
||||||
|
if (currentToken instanceof Operator op) {
|
||||||
|
String opName = op.getName();
|
||||||
|
|
||||||
|
if ("Tf".equals(opName) && i >= 2) {
|
||||||
|
try {
|
||||||
|
COSName fontName = (COSName) tokens.get(i - 2);
|
||||||
|
COSBase fontSizeBase = (COSBase) tokens.get(i - 1);
|
||||||
|
if (fontSizeBase instanceof org.apache.pdfbox.cos.COSNumber cosNumber) {
|
||||||
|
graphicsState.setFont(resources.getFont(fontName));
|
||||||
|
graphicsState.setFontSize(cosNumber.floatValue());
|
||||||
|
log.debug(
|
||||||
|
"Updated font state: {} size {}",
|
||||||
|
fontName.getName(),
|
||||||
|
graphicsState.getFontSize());
|
||||||
|
}
|
||||||
|
} catch (ClassCastException | IOException e) {
|
||||||
|
log.warn("Failed to update font state", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (isTextShowingOperator(opName) && i > 0) {
|
||||||
|
String textContent = extractTextFromToken(tokens.get(i - 1), opName);
|
||||||
|
if (!textContent.isEmpty()) {
|
||||||
|
log.debug(
|
||||||
|
"Found text segment '{}' at position {} with operator {}",
|
||||||
|
textContent,
|
||||||
|
currentTextPos,
|
||||||
|
opName);
|
||||||
|
segments.add(
|
||||||
|
new TextSegment(
|
||||||
|
i - 1,
|
||||||
|
opName,
|
||||||
|
textContent,
|
||||||
|
currentTextPos,
|
||||||
|
currentTextPos + textContent.length(),
|
||||||
|
graphicsState.font,
|
||||||
|
graphicsState.fontSize));
|
||||||
|
currentTextPos += textContent.length();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
log.debug("Extracted {} text segments from page", segments.size());
|
||||||
|
return segments;
|
||||||
|
}
|
||||||
|
|
||||||
|
private String buildCompleteText(List<TextSegment> segments) {
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
for (TextSegment segment : segments) {
|
||||||
|
sb.append(segment.text);
|
||||||
|
}
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<MatchRange> findAllMatches(
|
||||||
|
String completeText,
|
||||||
|
Set<String> targetWords,
|
||||||
|
boolean useRegex,
|
||||||
|
boolean wholeWordSearch) {
|
||||||
|
log.debug(
|
||||||
|
"Finding matches in text of {} characters for {} target words",
|
||||||
|
completeText.length(),
|
||||||
|
targetWords.size());
|
||||||
|
|
||||||
|
List<MatchRange> matches = new ArrayList<>();
|
||||||
|
|
||||||
|
for (String target : targetWords) {
|
||||||
|
log.debug("Searching for pattern: '{}'", target);
|
||||||
|
|
||||||
|
String patternString = useRegex ? target : Pattern.quote(target);
|
||||||
|
if (wholeWordSearch) {
|
||||||
|
patternString = "\\b" + patternString + "\\b";
|
||||||
|
}
|
||||||
|
Pattern pattern = Pattern.compile(patternString, Pattern.CASE_INSENSITIVE);
|
||||||
|
Matcher matcher = pattern.matcher(completeText);
|
||||||
|
|
||||||
|
int matchCount = 0;
|
||||||
|
while (matcher.find()) {
|
||||||
|
matches.add(new MatchRange(matcher.start(), matcher.end()));
|
||||||
|
matchCount++;
|
||||||
|
log.debug(
|
||||||
|
"Found match for '{}' at positions {}-{}",
|
||||||
|
target,
|
||||||
|
matcher.start(),
|
||||||
|
matcher.end());
|
||||||
|
}
|
||||||
|
|
||||||
|
log.debug("Total matches for '{}': {}", target, matchCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
matches.sort((a, b) -> Integer.compare(a.startPos, b.startPos));
|
||||||
|
log.debug("Found {} total matches across all patterns", matches.size());
|
||||||
|
|
||||||
|
return matches;
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<Object> applyRedactionsToTokens(
|
||||||
|
List<Object> tokens, List<TextSegment> textSegments, List<MatchRange> matches) {
|
||||||
|
log.debug(
|
||||||
|
"Applying redactions to {} tokens with {} text segments and {} matches",
|
||||||
|
tokens.size(),
|
||||||
|
textSegments.size(),
|
||||||
|
matches.size());
|
||||||
|
|
||||||
|
List<Object> newTokens = new ArrayList<>(tokens);
|
||||||
|
|
||||||
|
// Group matches by segment to pass to modification methods
|
||||||
|
Map<Integer, List<MatchRange>> matchesBySegment = new HashMap<>();
|
||||||
|
for (MatchRange match : matches) {
|
||||||
|
for (int i = 0; i < textSegments.size(); i++) {
|
||||||
|
TextSegment segment = textSegments.get(i);
|
||||||
|
int overlapStart = Math.max(match.startPos, segment.startPos);
|
||||||
|
int overlapEnd = Math.min(match.endPos, segment.endPos);
|
||||||
|
if (overlapStart < overlapEnd) {
|
||||||
|
matchesBySegment.computeIfAbsent(i, k -> new ArrayList<>()).add(match);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
log.debug("Grouped matches by segment: {} segments affected", matchesBySegment.size());
|
||||||
|
|
||||||
|
// Create a list of modification tasks
|
||||||
|
List<ModificationTask> tasks = new ArrayList<>();
|
||||||
|
for (Map.Entry<Integer, List<MatchRange>> entry : matchesBySegment.entrySet()) {
|
||||||
|
int segmentIndex = entry.getKey();
|
||||||
|
List<MatchRange> segmentMatches = entry.getValue();
|
||||||
|
TextSegment segment = textSegments.get(segmentIndex);
|
||||||
|
|
||||||
|
log.debug(
|
||||||
|
"Creating modification task for segment {} with {} matches",
|
||||||
|
segmentIndex,
|
||||||
|
segmentMatches.size());
|
||||||
|
|
||||||
|
if ("Tj".equals(segment.operatorName) || "'".equals(segment.operatorName)) {
|
||||||
|
String newText = applyRedactionsToSegmentText(segment, segmentMatches);
|
||||||
|
try {
|
||||||
|
float adjustment = calculateWidthAdjustment(segment, segmentMatches);
|
||||||
|
tasks.add(new ModificationTask(segment, newText, adjustment));
|
||||||
|
} catch (IOException e) {
|
||||||
|
log.warn("Failed to calculate width adjustment for redaction.", e);
|
||||||
|
}
|
||||||
|
} else if ("TJ".equals(segment.operatorName)) {
|
||||||
|
tasks.add(new ModificationTask(segment, null, 0));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sort tasks by token index in descending order to avoid index shifting issues
|
||||||
|
tasks.sort((a, b) -> Integer.compare(b.segment.tokenIndex, a.segment.tokenIndex));
|
||||||
|
|
||||||
|
log.debug("Applying {} modification tasks", tasks.size());
|
||||||
|
|
||||||
|
// Apply modifications
|
||||||
|
for (ModificationTask task : tasks) {
|
||||||
|
List<MatchRange> segmentMatches =
|
||||||
|
matchesBySegment.getOrDefault(
|
||||||
|
textSegments.indexOf(task.segment), Collections.emptyList());
|
||||||
|
modifyTokenForRedaction(
|
||||||
|
newTokens, task.segment, task.newText, task.adjustment, segmentMatches);
|
||||||
|
}
|
||||||
|
|
||||||
|
log.debug("Completed applying redactions to tokens");
|
||||||
|
return newTokens;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@AllArgsConstructor
|
||||||
|
private static class ModificationTask {
|
||||||
|
private TextSegment segment;
|
||||||
|
private String newText; // Only for Tj
|
||||||
|
private float adjustment; // Only for Tj
|
||||||
|
}
|
||||||
|
|
||||||
|
private String applyRedactionsToSegmentText(TextSegment segment, List<MatchRange> matches) {
|
||||||
|
String text = segment.getText();
|
||||||
|
StringBuilder result = new StringBuilder(text);
|
||||||
|
|
||||||
|
for (MatchRange match : matches) {
|
||||||
|
int segmentStart = Math.max(0, match.getStartPos() - segment.getStartPos());
|
||||||
|
int segmentEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos());
|
||||||
|
|
||||||
|
if (segmentStart >= 0 && segmentStart < text.length() && segmentEnd > segmentStart) {
|
||||||
|
String placeholder = createPlaceholder(text.substring(segmentStart, segmentEnd));
|
||||||
|
result.replace(segmentStart, segmentEnd, placeholder);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return result.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
private float calculateWidthAdjustment(TextSegment segment, List<MatchRange> matches)
|
||||||
|
throws IOException {
|
||||||
|
float totalOriginalWidth = 0;
|
||||||
|
float totalPlaceholderWidth = 0;
|
||||||
|
String text = segment.getText();
|
||||||
|
|
||||||
|
for (MatchRange match : matches) {
|
||||||
|
int segmentStart = Math.max(0, match.getStartPos() - segment.getStartPos());
|
||||||
|
int segmentEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos());
|
||||||
|
|
||||||
|
if (segmentStart >= 0 && segmentStart < text.length() && segmentEnd > segmentStart) {
|
||||||
|
String originalPart = text.substring(segmentStart, segmentEnd);
|
||||||
|
String placeholderPart = createPlaceholder(originalPart);
|
||||||
|
|
||||||
|
if (segment.getFont() != null) {
|
||||||
|
totalOriginalWidth +=
|
||||||
|
segment.getFont().getStringWidth(originalPart)
|
||||||
|
/ FONT_SCALE_FACTOR
|
||||||
|
* segment.getFontSize();
|
||||||
|
totalPlaceholderWidth +=
|
||||||
|
segment.getFont().getStringWidth(placeholderPart)
|
||||||
|
/ FONT_SCALE_FACTOR
|
||||||
|
* segment.getFontSize();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return totalOriginalWidth - totalPlaceholderWidth;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void modifyTokenForRedaction(
|
||||||
|
List<Object> tokens,
|
||||||
|
TextSegment segment,
|
||||||
|
String newText,
|
||||||
|
float adjustment,
|
||||||
|
List<MatchRange> matches) {
|
||||||
|
log.debug(
|
||||||
|
"Modifying token at index {} for segment '{}' with operator {}",
|
||||||
|
segment.getTokenIndex(),
|
||||||
|
segment.getText(),
|
||||||
|
segment.getOperatorName());
|
||||||
|
|
||||||
|
if (segment.getTokenIndex() < 0 || segment.getTokenIndex() >= tokens.size()) {
|
||||||
|
log.debug(
|
||||||
|
"Token index {} out of bounds (0-{})",
|
||||||
|
segment.getTokenIndex(),
|
||||||
|
tokens.size() - 1);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
Object token = tokens.get(segment.getTokenIndex());
|
||||||
|
String operatorName = segment.getOperatorName();
|
||||||
|
|
||||||
|
try {
|
||||||
|
if (("Tj".equals(operatorName) || "'".equals(operatorName))
|
||||||
|
&& token instanceof COSString) {
|
||||||
|
log.debug("Modifying Tj/quote operator with adjustment {}", adjustment);
|
||||||
|
|
||||||
|
if (Math.abs(adjustment) < PRECISION_THRESHOLD) {
|
||||||
|
tokens.set(segment.getTokenIndex(), new COSString(newText));
|
||||||
|
} else {
|
||||||
|
COSArray newArray = new COSArray();
|
||||||
|
newArray.add(new COSString(newText));
|
||||||
|
if (segment.getFontSize() > 0) {
|
||||||
|
float kerning = -FONT_SCALE_FACTOR * adjustment / segment.getFontSize();
|
||||||
|
newArray.add(new org.apache.pdfbox.cos.COSFloat(kerning));
|
||||||
|
log.debug("Applied kerning adjustment: {}", kerning);
|
||||||
|
}
|
||||||
|
tokens.set(segment.getTokenIndex(), newArray);
|
||||||
|
|
||||||
|
int operatorIndex = segment.getTokenIndex() + 1;
|
||||||
|
if (operatorIndex < tokens.size()
|
||||||
|
&& tokens.get(operatorIndex) instanceof Operator op
|
||||||
|
&& op.getName().equals(operatorName)) {
|
||||||
|
tokens.set(operatorIndex, Operator.getOperator("TJ"));
|
||||||
|
log.debug("Changed operator from {} to TJ", operatorName);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if ("TJ".equals(operatorName) && token instanceof COSArray) {
|
||||||
|
log.debug("Modifying TJ operator array");
|
||||||
|
COSArray newArray = createRedactedTJArray((COSArray) token, segment, matches);
|
||||||
|
tokens.set(segment.getTokenIndex(), newArray);
|
||||||
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
log.warn("Failed to modify token for redaction: {}", e.getMessage(), e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private COSArray createRedactedTJArray(
|
||||||
|
COSArray originalArray, TextSegment segment, List<MatchRange> matches)
|
||||||
|
throws IOException {
|
||||||
|
COSArray newArray = new COSArray();
|
||||||
|
int textOffsetInSegment = 0;
|
||||||
|
|
||||||
|
for (COSBase element : originalArray) {
|
||||||
|
if (element instanceof COSString cosString) {
|
||||||
|
String originalText = cosString.getString();
|
||||||
|
StringBuilder newText = new StringBuilder(originalText);
|
||||||
|
boolean modified = false;
|
||||||
|
|
||||||
|
for (MatchRange match : matches) {
|
||||||
|
int stringStartInPage = segment.getStartPos() + textOffsetInSegment;
|
||||||
|
int stringEndInPage = stringStartInPage + originalText.length();
|
||||||
|
|
||||||
|
int overlapStart = Math.max(match.getStartPos(), stringStartInPage);
|
||||||
|
int overlapEnd = Math.min(match.getEndPos(), stringEndInPage);
|
||||||
|
|
||||||
|
if (overlapStart < overlapEnd) {
|
||||||
|
modified = true;
|
||||||
|
int redactionStartInString = overlapStart - stringStartInPage;
|
||||||
|
int redactionEndInString = overlapEnd - stringStartInPage;
|
||||||
|
if (redactionStartInString >= 0
|
||||||
|
&& redactionEndInString <= originalText.length()) {
|
||||||
|
String placeholder =
|
||||||
|
createPlaceholder(
|
||||||
|
originalText.substring(
|
||||||
|
redactionStartInString, redactionEndInString));
|
||||||
|
newText.replace(
|
||||||
|
redactionStartInString, redactionEndInString, placeholder);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
String modifiedString = newText.toString();
|
||||||
|
newArray.add(new COSString(modifiedString));
|
||||||
|
|
||||||
|
if (modified && segment.getFont() != null && segment.getFontSize() > 0) {
|
||||||
|
float originalWidth =
|
||||||
|
segment.getFont().getStringWidth(originalText)
|
||||||
|
/ FONT_SCALE_FACTOR
|
||||||
|
* segment.getFontSize();
|
||||||
|
float modifiedWidth =
|
||||||
|
segment.getFont().getStringWidth(modifiedString)
|
||||||
|
/ FONT_SCALE_FACTOR
|
||||||
|
* segment.getFontSize();
|
||||||
|
float adjustment = originalWidth - modifiedWidth;
|
||||||
|
if (Math.abs(adjustment) > PRECISION_THRESHOLD) {
|
||||||
|
float kerning = -FONT_SCALE_FACTOR * adjustment / segment.getFontSize();
|
||||||
|
newArray.add(new org.apache.pdfbox.cos.COSFloat(kerning));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
textOffsetInSegment += originalText.length();
|
||||||
|
} else {
|
||||||
|
newArray.add(element);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return newArray;
|
||||||
|
}
|
||||||
|
|
||||||
|
private String extractTextFromToken(Object token, String operatorName) {
|
||||||
|
return switch (operatorName) {
|
||||||
|
case "Tj", "'" -> {
|
||||||
|
if (token instanceof COSString cosString) {
|
||||||
|
yield cosString.getString();
|
||||||
|
}
|
||||||
|
yield "";
|
||||||
|
}
|
||||||
|
case "TJ" -> {
|
||||||
|
if (token instanceof COSArray cosArray) {
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
for (COSBase element : cosArray) {
|
||||||
|
if (element instanceof COSString cosString) {
|
||||||
|
sb.append(cosString.getString());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
yield sb.toString();
|
||||||
|
}
|
||||||
|
yield "";
|
||||||
|
}
|
||||||
|
default -> "";
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
private String createPlaceholder(String originalWord) {
|
||||||
|
if (originalWord == null || originalWord.isEmpty()) {
|
||||||
|
return originalWord;
|
||||||
|
}
|
||||||
|
return "".repeat(originalWord.length());
|
||||||
|
}
|
||||||
|
|
||||||
|
private void writeFilteredContentStream(PDDocument document, PDPage page, List<Object> tokens)
|
||||||
|
throws IOException {
|
||||||
|
log.debug("Writing filtered content stream with {} tokens", tokens.size());
|
||||||
|
|
||||||
|
PDStream newStream = new PDStream(document);
|
||||||
|
try (var out = newStream.createOutputStream()) {
|
||||||
|
ContentStreamWriter writer = new ContentStreamWriter(out);
|
||||||
|
writer.writeTokens(tokens);
|
||||||
|
}
|
||||||
|
page.setContents(newStream);
|
||||||
|
|
||||||
|
log.debug("Successfully wrote filtered content stream");
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean isTextShowingOperator(String opName) {
|
||||||
|
return TEXT_SHOWING_OPERATORS.contains(opName);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -6,102 +6,109 @@ import java.util.List;
|
|||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
import org.apache.pdfbox.text.PDFTextStripper;
|
import org.apache.pdfbox.text.PDFTextStripper;
|
||||||
import org.apache.pdfbox.text.TextPosition;
|
import org.apache.pdfbox.text.TextPosition;
|
||||||
|
|
||||||
import lombok.extern.slf4j.Slf4j;
|
|
||||||
|
|
||||||
import stirling.software.SPDF.model.PDFText;
|
import stirling.software.SPDF.model.PDFText;
|
||||||
|
|
||||||
@Slf4j
|
|
||||||
public class TextFinder extends PDFTextStripper {
|
public class TextFinder extends PDFTextStripper {
|
||||||
|
|
||||||
private final String searchText;
|
private final String searchTerm;
|
||||||
private final boolean useRegex;
|
private final boolean useRegex;
|
||||||
private final boolean wholeWordSearch;
|
private final boolean wholeWordSearch;
|
||||||
private final List<PDFText> textOccurrences = new ArrayList<>();
|
private final List<PDFText> foundTexts = new ArrayList<>();
|
||||||
|
|
||||||
public TextFinder(String searchText, boolean useRegex, boolean wholeWordSearch)
|
private final List<TextPosition> pageTextPositions = new ArrayList<>();
|
||||||
|
private final StringBuilder pageTextBuilder = new StringBuilder();
|
||||||
|
|
||||||
|
public TextFinder(String searchTerm, boolean useRegex, boolean wholeWordSearch)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
this.searchText = searchText.toLowerCase();
|
super();
|
||||||
|
this.searchTerm = searchTerm;
|
||||||
this.useRegex = useRegex;
|
this.useRegex = useRegex;
|
||||||
this.wholeWordSearch = wholeWordSearch;
|
this.wholeWordSearch = wholeWordSearch;
|
||||||
setSortByPosition(true);
|
this.setWordSeparator(" ");
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<MatchInfo> findOccurrencesInText(String searchText, String content) {
|
@Override
|
||||||
List<MatchInfo> matches = new ArrayList<>();
|
protected void startPage(PDPage page) {
|
||||||
|
pageTextPositions.clear();
|
||||||
Pattern pattern;
|
pageTextBuilder.setLength(0);
|
||||||
|
|
||||||
if (useRegex) {
|
|
||||||
// Use regex-based search
|
|
||||||
pattern =
|
|
||||||
wholeWordSearch
|
|
||||||
? Pattern.compile("\\b" + searchText + "\\b")
|
|
||||||
: Pattern.compile(searchText);
|
|
||||||
} else {
|
|
||||||
// Use normal text search
|
|
||||||
pattern =
|
|
||||||
wholeWordSearch
|
|
||||||
? Pattern.compile("\\b" + Pattern.quote(searchText) + "\\b")
|
|
||||||
: Pattern.compile(Pattern.quote(searchText));
|
|
||||||
}
|
|
||||||
|
|
||||||
Matcher matcher = pattern.matcher(content);
|
|
||||||
while (matcher.find()) {
|
|
||||||
matches.add(new MatchInfo(matcher.start(), matcher.end() - matcher.start()));
|
|
||||||
}
|
|
||||||
return matches;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void writeString(String text, List<TextPosition> textPositions) {
|
protected void writeString(String text, List<TextPosition> textPositions) {
|
||||||
for (MatchInfo match : findOccurrencesInText(searchText, text.toLowerCase())) {
|
pageTextBuilder.append(text);
|
||||||
int index = match.startIndex;
|
pageTextPositions.addAll(textPositions);
|
||||||
if (index + match.matchLength <= textPositions.size()) {
|
}
|
||||||
// Initial values based on the first character
|
|
||||||
TextPosition first = textPositions.get(index);
|
|
||||||
float minX = first.getX();
|
|
||||||
float minY = first.getY();
|
|
||||||
float maxX = first.getX() + first.getWidth();
|
|
||||||
float maxY = first.getY() + first.getHeight();
|
|
||||||
|
|
||||||
// Loop over the rest of the characters and adjust bounding box values
|
@Override
|
||||||
for (int i = index; i < index + match.matchLength; i++) {
|
protected void writeWordSeparator() {
|
||||||
TextPosition position = textPositions.get(i);
|
pageTextBuilder.append(getWordSeparator());
|
||||||
minX = Math.min(minX, position.getX());
|
pageTextPositions.add(null); // Placeholder for separator
|
||||||
minY = Math.min(minY, position.getY());
|
}
|
||||||
maxX = Math.max(maxX, position.getX() + position.getWidth());
|
|
||||||
maxY = Math.max(maxY, position.getY() + position.getHeight());
|
@Override
|
||||||
|
protected void writeLineSeparator() {
|
||||||
|
pageTextBuilder.append(getLineSeparator());
|
||||||
|
pageTextPositions.add(null); // Placeholder for separator
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void endPage(PDPage page) {
|
||||||
|
String text = pageTextBuilder.toString();
|
||||||
|
if (text.isEmpty() || this.searchTerm == null || this.searchTerm.isEmpty()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
String processedSearchTerm = this.searchTerm.trim();
|
||||||
|
String regex = this.useRegex ? processedSearchTerm : "\\Q" + processedSearchTerm + "\\E";
|
||||||
|
if (this.wholeWordSearch) {
|
||||||
|
regex = "\\b" + regex + "\\b";
|
||||||
|
}
|
||||||
|
|
||||||
|
Pattern pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
|
||||||
|
Matcher matcher = pattern.matcher(text);
|
||||||
|
|
||||||
|
while (matcher.find()) {
|
||||||
|
int matchStart = matcher.start();
|
||||||
|
int matchEnd = matcher.end();
|
||||||
|
|
||||||
|
float minX = Float.MAX_VALUE;
|
||||||
|
float minY = Float.MAX_VALUE;
|
||||||
|
float maxX = Float.MIN_VALUE;
|
||||||
|
float maxY = Float.MIN_VALUE;
|
||||||
|
boolean foundPosition = false;
|
||||||
|
|
||||||
|
for (int i = matchStart; i < matchEnd; i++) {
|
||||||
|
if (i >= pageTextPositions.size()) {
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
TextPosition pos = pageTextPositions.get(i);
|
||||||
|
if (pos != null) {
|
||||||
|
foundPosition = true;
|
||||||
|
minX = Math.min(minX, pos.getX());
|
||||||
|
maxX = Math.max(maxX, pos.getX() + pos.getWidth());
|
||||||
|
minY = Math.min(minY, pos.getY() - pos.getHeight());
|
||||||
|
maxY = Math.max(maxY, pos.getY());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
textOccurrences.add(
|
if (foundPosition) {
|
||||||
new PDFText(getCurrentPageNo() - 1, minX, minY, maxX, maxY, text));
|
foundTexts.add(
|
||||||
|
new PDFText(
|
||||||
|
this.getCurrentPageNo() - 1,
|
||||||
|
minX,
|
||||||
|
minY,
|
||||||
|
maxX,
|
||||||
|
maxY,
|
||||||
|
matcher.group()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<PDFText> getTextLocations(PDDocument document) throws Exception {
|
public List<PDFText> getFoundTexts() {
|
||||||
this.getText(document);
|
return foundTexts;
|
||||||
log.debug(
|
|
||||||
"Found "
|
|
||||||
+ textOccurrences.size()
|
|
||||||
+ " occurrences of '"
|
|
||||||
+ searchText
|
|
||||||
+ "' in the document.");
|
|
||||||
|
|
||||||
return textOccurrences;
|
|
||||||
}
|
|
||||||
|
|
||||||
private class MatchInfo {
|
|
||||||
int startIndex;
|
|
||||||
int matchLength;
|
|
||||||
|
|
||||||
MatchInfo(int startIndex, int matchLength) {
|
|
||||||
this.startIndex = startIndex;
|
|
||||||
this.matchLength = matchLength;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user