mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-08-02 13:48:15 +02:00
feat: auto-redact to support text removal on true PDFs
This commit is contained in:
parent
bbf5d5f6d4
commit
d7fb66bb79
@ -1,19 +1,33 @@
|
||||
package stirling.software.SPDF.controller.api.security;
|
||||
|
||||
import java.awt.*;
|
||||
import java.awt.Color;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.pdfbox.contentstream.operator.Operator;
|
||||
import org.apache.pdfbox.cos.COSArray;
|
||||
import org.apache.pdfbox.cos.COSBase;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.cos.COSString;
|
||||
import org.apache.pdfbox.pdfparser.PDFStreamParser;
|
||||
import org.apache.pdfbox.pdfwriter.ContentStreamWriter;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||
import org.apache.pdfbox.pdmodel.PDPageTree;
|
||||
import org.apache.pdfbox.pdmodel.PDResources;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.apache.pdfbox.pdmodel.common.PDStream;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.web.bind.WebDataBinder;
|
||||
import org.springframework.web.bind.annotation.InitBinder;
|
||||
@ -27,6 +41,8 @@ import io.github.pixee.security.Filenames;
|
||||
import io.swagger.v3.oas.annotations.Operation;
|
||||
import io.swagger.v3.oas.annotations.tags.Tag;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@ -48,6 +64,13 @@ import stirling.software.common.util.propertyeditor.StringToArrayListPropertyEdi
|
||||
@RequiredArgsConstructor
|
||||
public class RedactController {
|
||||
|
||||
private static final float DEFAULT_TEXT_PADDING_MULTIPLIER = 0.3f;
|
||||
private static final float PRECISION_THRESHOLD = 1e-3f;
|
||||
private static final int FONT_SCALE_FACTOR = 1000;
|
||||
|
||||
// Text showing operators
|
||||
private static final Set<String> TEXT_SHOWING_OPERATORS = Set.of("Tj", "TJ", "'", "\"");
|
||||
|
||||
private final CustomPDFDocumentFactory pdfDocumentFactory;
|
||||
|
||||
@InitBinder
|
||||
@ -65,17 +88,30 @@ public class RedactController {
|
||||
+ " Type:SISO")
|
||||
public ResponseEntity<byte[]> redactPDF(@ModelAttribute ManualRedactPdfRequest request)
|
||||
throws IOException {
|
||||
log.debug(
|
||||
"Starting manual redaction for file: {}",
|
||||
request.getFileInput().getOriginalFilename());
|
||||
|
||||
MultipartFile file = request.getFileInput();
|
||||
List<RedactionArea> redactionAreas = request.getRedactions();
|
||||
|
||||
log.debug(
|
||||
"Processing {} redaction areas",
|
||||
redactionAreas != null ? redactionAreas.size() : 0);
|
||||
|
||||
PDDocument document = pdfDocumentFactory.load(file);
|
||||
log.debug("Loaded PDF document with {} pages", document.getNumberOfPages());
|
||||
|
||||
PDPageTree allPages = document.getDocumentCatalog().getPages();
|
||||
|
||||
log.debug("Starting page redactions");
|
||||
redactPages(request, document, allPages);
|
||||
|
||||
log.debug("Starting area redactions");
|
||||
redactAreas(redactionAreas, document, allPages);
|
||||
|
||||
if (Boolean.TRUE.equals(request.getConvertPDFToImage())) {
|
||||
log.debug("Converting PDF to image format");
|
||||
PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document);
|
||||
document.close();
|
||||
document = convertedPdf;
|
||||
@ -86,6 +122,8 @@ public class RedactController {
|
||||
document.close();
|
||||
|
||||
byte[] pdfContent = baos.toByteArray();
|
||||
log.debug("Manual redaction completed. Output PDF size: {} bytes", pdfContent.length);
|
||||
|
||||
return WebResponseUtils.bytesToWebResponse(
|
||||
pdfContent,
|
||||
Filenames.toSimpleFileName(file.getOriginalFilename()).replaceFirst("[.][^.]+$", "")
|
||||
@ -95,17 +133,30 @@ public class RedactController {
|
||||
private void redactAreas(
|
||||
List<RedactionArea> redactionAreas, PDDocument document, PDPageTree allPages)
|
||||
throws IOException {
|
||||
log.debug("Processing redaction areas");
|
||||
|
||||
// Group redaction areas by page
|
||||
Map<Integer, List<RedactionArea>> redactionsByPage = new HashMap<>();
|
||||
|
||||
// Process and validate each redaction area
|
||||
for (RedactionArea redactionArea : redactionAreas) {
|
||||
log.debug(
|
||||
"Validating redaction area on page {}: x={}, y={}, width={}, height={}",
|
||||
redactionArea.getPage(),
|
||||
redactionArea.getX(),
|
||||
redactionArea.getY(),
|
||||
redactionArea.getWidth(),
|
||||
redactionArea.getHeight());
|
||||
|
||||
if (redactionArea.getPage() == null
|
||||
|| redactionArea.getPage() <= 0
|
||||
|| redactionArea.getHeight() == null
|
||||
|| redactionArea.getHeight() <= 0.0D
|
||||
|| redactionArea.getWidth() == null
|
||||
|| redactionArea.getWidth() <= 0.0D) continue;
|
||||
|| redactionArea.getWidth() <= 0.0D) {
|
||||
log.debug("Skipping invalid redaction area: {}", redactionArea);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Group by page number
|
||||
redactionsByPage
|
||||
@ -113,70 +164,151 @@ public class RedactController {
|
||||
.add(redactionArea);
|
||||
}
|
||||
|
||||
log.debug("Grouped redactions by page: {} pages affected", redactionsByPage.size());
|
||||
|
||||
// Process each page only once
|
||||
for (Map.Entry<Integer, List<RedactionArea>> entry : redactionsByPage.entrySet()) {
|
||||
Integer pageNumber = entry.getKey();
|
||||
List<RedactionArea> areasForPage = entry.getValue();
|
||||
|
||||
log.debug(
|
||||
"Processing page {} with {} redaction areas", pageNumber, areasForPage.size());
|
||||
|
||||
if (pageNumber > allPages.getCount()) {
|
||||
log.debug(
|
||||
"Skipping page {} - out of bounds (total pages: {})",
|
||||
pageNumber,
|
||||
allPages.getCount());
|
||||
continue; // Skip if page number is out of bounds
|
||||
}
|
||||
|
||||
PDPage page = allPages.get(pageNumber - 1);
|
||||
PDRectangle box = page.getBBox();
|
||||
|
||||
// Create only one content stream per page
|
||||
PDPageContentStream contentStream =
|
||||
// Create only one content stream per page to draw all redaction boxes
|
||||
try (PDPageContentStream contentStream =
|
||||
new PDPageContentStream(
|
||||
document, page, PDPageContentStream.AppendMode.APPEND, true, true);
|
||||
document, page, PDPageContentStream.AppendMode.APPEND, true, true)) {
|
||||
|
||||
// Process all redactions for this page
|
||||
for (RedactionArea redactionArea : areasForPage) {
|
||||
Color redactColor = decodeOrDefault(redactionArea.getColor(), Color.BLACK);
|
||||
contentStream.setNonStrokingColor(redactColor);
|
||||
// Process all redactions for this page
|
||||
for (RedactionArea redactionArea : areasForPage) {
|
||||
Color redactColor = decodeOrDefault(redactionArea.getColor());
|
||||
log.debug(
|
||||
"Applying redaction with color {} at ({}, {}) size {}x{}",
|
||||
redactColor,
|
||||
redactionArea.getX(),
|
||||
redactionArea.getY(),
|
||||
redactionArea.getWidth(),
|
||||
redactionArea.getHeight());
|
||||
|
||||
float x = redactionArea.getX().floatValue();
|
||||
float y = redactionArea.getY().floatValue();
|
||||
float width = redactionArea.getWidth().floatValue();
|
||||
float height = redactionArea.getHeight().floatValue();
|
||||
contentStream.setNonStrokingColor(redactColor);
|
||||
|
||||
contentStream.addRect(x, box.getHeight() - y - height, width, height);
|
||||
contentStream.fill();
|
||||
float x = redactionArea.getX().floatValue();
|
||||
float y = redactionArea.getY().floatValue();
|
||||
float width = redactionArea.getWidth().floatValue();
|
||||
float height = redactionArea.getHeight().floatValue();
|
||||
|
||||
// The y-coordinate needs to be transformed from a top-left origin to a
|
||||
// bottom-left origin.
|
||||
float pdfY = page.getBBox().getHeight() - y - height;
|
||||
|
||||
contentStream.addRect(x, pdfY, width, height);
|
||||
contentStream.fill();
|
||||
}
|
||||
}
|
||||
|
||||
contentStream.close();
|
||||
}
|
||||
|
||||
log.debug("Completed redaction areas processing");
|
||||
}
|
||||
|
||||
private void redactPages(
|
||||
ManualRedactPdfRequest request, PDDocument document, PDPageTree allPages)
|
||||
throws IOException {
|
||||
Color redactColor = decodeOrDefault(request.getPageRedactionColor(), Color.BLACK);
|
||||
log.debug("Starting page redactions");
|
||||
|
||||
Color redactColor = decodeOrDefault(request.getPageRedactionColor());
|
||||
List<Integer> pageNumbers = getPageNumbers(request, allPages.getCount());
|
||||
|
||||
log.debug("Redacting {} pages with color {}", pageNumbers.size(), redactColor);
|
||||
|
||||
for (Integer pageNumber : pageNumbers) {
|
||||
log.debug("Redacting entire page {}", pageNumber + 1);
|
||||
|
||||
PDPage page = allPages.get(pageNumber);
|
||||
|
||||
PDPageContentStream contentStream =
|
||||
try (PDPageContentStream contentStream =
|
||||
new PDPageContentStream(
|
||||
document, page, PDPageContentStream.AppendMode.APPEND, true, true);
|
||||
contentStream.setNonStrokingColor(redactColor);
|
||||
document, page, PDPageContentStream.AppendMode.APPEND, true, true)) {
|
||||
contentStream.setNonStrokingColor(redactColor);
|
||||
|
||||
PDRectangle box = page.getBBox();
|
||||
PDRectangle box = page.getBBox();
|
||||
log.debug(
|
||||
"Page {} dimensions: {}x{}",
|
||||
pageNumber + 1,
|
||||
box.getWidth(),
|
||||
box.getHeight());
|
||||
|
||||
contentStream.addRect(0, 0, box.getWidth(), box.getHeight());
|
||||
contentStream.fill();
|
||||
contentStream.close();
|
||||
contentStream.addRect(0, 0, box.getWidth(), box.getHeight());
|
||||
contentStream.fill();
|
||||
}
|
||||
}
|
||||
|
||||
log.debug("Completed page redactions");
|
||||
}
|
||||
|
||||
private Color decodeOrDefault(String hex, Color defaultColor) {
|
||||
try {
|
||||
if (hex != null && !hex.startsWith("#")) {
|
||||
hex = "#" + hex;
|
||||
private void redactFoundText(
|
||||
PDDocument document, List<PDFText> blocks, float customPadding, Color redactColor)
|
||||
throws IOException {
|
||||
log.debug(
|
||||
"Redacting {} text blocks with padding {} and color {}",
|
||||
blocks.size(),
|
||||
customPadding,
|
||||
redactColor);
|
||||
|
||||
var allPages = document.getDocumentCatalog().getPages();
|
||||
|
||||
for (PDFText block : blocks) {
|
||||
log.debug(
|
||||
"Redacting text block on page {}: '{}' at ({}, {}) to ({}, {})",
|
||||
block.getPageIndex() + 1,
|
||||
block.getText(),
|
||||
block.getX1(),
|
||||
block.getY1(),
|
||||
block.getX2(),
|
||||
block.getY2());
|
||||
|
||||
var page = allPages.get(block.getPageIndex());
|
||||
try (PDPageContentStream contentStream =
|
||||
new PDPageContentStream(
|
||||
document, page, PDPageContentStream.AppendMode.APPEND, true, true)) {
|
||||
contentStream.setNonStrokingColor(redactColor);
|
||||
float padding =
|
||||
(block.getY2() - block.getY1()) * DEFAULT_TEXT_PADDING_MULTIPLIER
|
||||
+ customPadding;
|
||||
PDRectangle pageBox = page.getBBox();
|
||||
contentStream.addRect(
|
||||
block.getX1(),
|
||||
pageBox.getHeight() - block.getY2() - padding,
|
||||
block.getX2() - block.getX1(),
|
||||
block.getY2() - block.getY1() + 2 * padding);
|
||||
contentStream.fill();
|
||||
}
|
||||
return Color.decode(hex);
|
||||
} catch (Exception e) {
|
||||
return defaultColor;
|
||||
}
|
||||
|
||||
log.debug("Completed text block redactions");
|
||||
}
|
||||
|
||||
private Color decodeOrDefault(String hex) {
|
||||
if (hex == null) {
|
||||
return Color.BLACK;
|
||||
}
|
||||
|
||||
String colorString = hex.startsWith("#") ? hex : "#" + hex;
|
||||
|
||||
try {
|
||||
return Color.decode(colorString);
|
||||
} catch (NumberFormatException e) {
|
||||
log.warn("Invalid color string '{}'. Using default color BLACK.", hex);
|
||||
return Color.BLACK;
|
||||
}
|
||||
}
|
||||
|
||||
@ -198,6 +330,10 @@ public class RedactController {
|
||||
+ " Input:PDF, Output:PDF, Type:SISO")
|
||||
public ResponseEntity<byte[]> redactPdf(@ModelAttribute RedactPdfRequest request)
|
||||
throws Exception {
|
||||
log.debug(
|
||||
"Starting auto-redaction for file: {}",
|
||||
request.getFileInput().getOriginalFilename());
|
||||
|
||||
MultipartFile file = request.getFileInput();
|
||||
String listOfTextString = request.getListOfText();
|
||||
boolean useRegex = Boolean.TRUE.equals(request.getUseRegex());
|
||||
@ -206,28 +342,80 @@ public class RedactController {
|
||||
float customPadding = request.getCustomPadding();
|
||||
boolean convertPDFToImage = Boolean.TRUE.equals(request.getConvertPDFToImage());
|
||||
|
||||
log.debug(
|
||||
"Auto-redaction parameters: useRegex={}, wholeWordSearch={}, customPadding={}, convertToImage={}",
|
||||
useRegex,
|
||||
wholeWordSearchBool,
|
||||
customPadding,
|
||||
convertPDFToImage);
|
||||
|
||||
String[] listOfText = listOfTextString.split("\n");
|
||||
log.debug("Searching for {} text patterns", listOfText.length);
|
||||
|
||||
PDDocument document = pdfDocumentFactory.load(file);
|
||||
log.debug("Loaded PDF document with {} pages", document.getNumberOfPages());
|
||||
|
||||
Color redactColor;
|
||||
try {
|
||||
if (!colorString.startsWith("#")) {
|
||||
if (colorString != null && !colorString.startsWith("#")) {
|
||||
colorString = "#" + colorString;
|
||||
}
|
||||
redactColor = Color.decode(colorString);
|
||||
log.debug("Using redaction color: {}", redactColor);
|
||||
} catch (NumberFormatException e) {
|
||||
log.warn("Invalid color string provided. Using default color BLACK for redaction.");
|
||||
redactColor = Color.BLACK;
|
||||
}
|
||||
|
||||
// Step 1: Find all text locations for all search terms
|
||||
log.debug("Step 1: Finding all text locations");
|
||||
Map<Integer, List<PDFText>> allFoundTextsByPage = new HashMap<>();
|
||||
Set<String> allSearchTerms = new HashSet<>();
|
||||
for (String text : listOfText) {
|
||||
text = text.trim();
|
||||
if (text.isEmpty()) continue;
|
||||
|
||||
log.debug("Searching for text pattern: '{}'", text);
|
||||
allSearchTerms.add(text);
|
||||
TextFinder textFinder = new TextFinder(text, useRegex, wholeWordSearchBool);
|
||||
List<PDFText> foundTexts = textFinder.getTextLocations(document);
|
||||
redactFoundText(document, foundTexts, customPadding, redactColor);
|
||||
textFinder.getText(document);
|
||||
List<PDFText> foundTexts = textFinder.getFoundTexts();
|
||||
|
||||
log.debug("Found {} instances of pattern '{}'", foundTexts.size(), text);
|
||||
|
||||
for (PDFText found : foundTexts) {
|
||||
allFoundTextsByPage
|
||||
.computeIfAbsent(found.getPageIndex(), k -> new ArrayList<>())
|
||||
.add(found);
|
||||
}
|
||||
}
|
||||
|
||||
log.debug("Total pages with found text: {}", allFoundTextsByPage.size());
|
||||
|
||||
// Step 2: Process each page
|
||||
log.debug("Step 2: Processing each page for text replacement");
|
||||
for (PDPage page : document.getPages()) {
|
||||
// Replace text content
|
||||
List<Object> filteredTokens =
|
||||
createTokensWithoutTargetText(
|
||||
page, allSearchTerms, useRegex, wholeWordSearchBool);
|
||||
writeFilteredContentStream(document, page, filteredTokens);
|
||||
}
|
||||
|
||||
// Draw redaction boxes for all found texts
|
||||
List<PDFText> allFoundTexts = new ArrayList<>();
|
||||
for (List<PDFText> pageTexts : allFoundTextsByPage.values()) {
|
||||
allFoundTexts.addAll(pageTexts);
|
||||
}
|
||||
|
||||
log.debug("Drawing redaction boxes for {} total found texts", allFoundTexts.size());
|
||||
|
||||
if (!allFoundTexts.isEmpty()) {
|
||||
redactFoundText(document, allFoundTexts, customPadding, redactColor);
|
||||
}
|
||||
|
||||
if (convertPDFToImage) {
|
||||
log.debug("Converting redacted PDF to image format");
|
||||
PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document);
|
||||
document.close();
|
||||
document = convertedPdf;
|
||||
@ -238,32 +426,465 @@ public class RedactController {
|
||||
document.close();
|
||||
|
||||
byte[] pdfContent = baos.toByteArray();
|
||||
log.debug("Auto-redaction completed. Output PDF size: {} bytes", pdfContent.length);
|
||||
|
||||
return WebResponseUtils.bytesToWebResponse(
|
||||
pdfContent,
|
||||
Filenames.toSimpleFileName(file.getOriginalFilename()).replaceFirst("[.][^.]+$", "")
|
||||
+ "_redacted.pdf");
|
||||
}
|
||||
|
||||
private void redactFoundText(
|
||||
PDDocument document, List<PDFText> blocks, float customPadding, Color redactColor)
|
||||
private List<Object> createTokensWithoutTargetText(
|
||||
PDPage page, Set<String> targetWords, boolean useRegex, boolean wholeWordSearch)
|
||||
throws IOException {
|
||||
var allPages = document.getDocumentCatalog().getPages();
|
||||
log.debug(
|
||||
"Creating tokens without target text for page, searching for {} words",
|
||||
targetWords.size());
|
||||
|
||||
for (PDFText block : blocks) {
|
||||
var page = allPages.get(block.getPageIndex());
|
||||
PDPageContentStream contentStream =
|
||||
new PDPageContentStream(
|
||||
document, page, PDPageContentStream.AppendMode.APPEND, true, true);
|
||||
contentStream.setNonStrokingColor(redactColor);
|
||||
float padding = (block.getY2() - block.getY1()) * 0.3f + customPadding;
|
||||
PDRectangle pageBox = page.getBBox();
|
||||
contentStream.addRect(
|
||||
block.getX1(),
|
||||
pageBox.getHeight() - block.getY1() - padding,
|
||||
block.getX2() - block.getX1(),
|
||||
block.getY2() - block.getY1() + 2 * padding);
|
||||
contentStream.fill();
|
||||
contentStream.close();
|
||||
PDFStreamParser parser = new PDFStreamParser(page);
|
||||
List<Object> tokens = new ArrayList<>();
|
||||
Object token;
|
||||
while ((token = parser.parseNextToken()) != null) {
|
||||
tokens.add(token);
|
||||
}
|
||||
|
||||
log.debug("Parsed {} tokens from page content stream", tokens.size());
|
||||
|
||||
List<TextSegment> textSegments = extractTextSegments(page, tokens);
|
||||
log.debug("Extracted {} text segments", textSegments.size());
|
||||
|
||||
String completeText = buildCompleteText(textSegments);
|
||||
log.debug("Built complete text of {} characters", completeText.length());
|
||||
|
||||
List<MatchRange> matches =
|
||||
findAllMatches(completeText, targetWords, useRegex, wholeWordSearch);
|
||||
log.debug("Found {} matches in complete text", matches.size());
|
||||
|
||||
return applyRedactionsToTokens(tokens, textSegments, matches);
|
||||
}
|
||||
|
||||
@Data
|
||||
private static class GraphicsState {
|
||||
private PDFont font = null;
|
||||
private float fontSize = 0;
|
||||
}
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
private static class TextSegment {
|
||||
private int tokenIndex;
|
||||
private String operatorName;
|
||||
private String text;
|
||||
private int startPos;
|
||||
private int endPos;
|
||||
private PDFont font;
|
||||
private float fontSize;
|
||||
}
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
private static class MatchRange {
|
||||
private int startPos;
|
||||
private int endPos;
|
||||
}
|
||||
|
||||
private List<TextSegment> extractTextSegments(PDPage page, List<Object> tokens)
|
||||
throws IOException {
|
||||
log.debug("Extracting text segments from {} tokens", tokens.size());
|
||||
|
||||
List<TextSegment> segments = new ArrayList<>();
|
||||
int currentTextPos = 0;
|
||||
GraphicsState graphicsState = new GraphicsState();
|
||||
PDResources resources = page.getResources();
|
||||
|
||||
for (int i = 0; i < tokens.size(); i++) {
|
||||
Object currentToken = tokens.get(i);
|
||||
|
||||
if (currentToken instanceof Operator op) {
|
||||
String opName = op.getName();
|
||||
|
||||
if ("Tf".equals(opName) && i >= 2) {
|
||||
try {
|
||||
COSName fontName = (COSName) tokens.get(i - 2);
|
||||
COSBase fontSizeBase = (COSBase) tokens.get(i - 1);
|
||||
if (fontSizeBase instanceof org.apache.pdfbox.cos.COSNumber cosNumber) {
|
||||
graphicsState.setFont(resources.getFont(fontName));
|
||||
graphicsState.setFontSize(cosNumber.floatValue());
|
||||
log.debug(
|
||||
"Updated font state: {} size {}",
|
||||
fontName.getName(),
|
||||
graphicsState.getFontSize());
|
||||
}
|
||||
} catch (ClassCastException | IOException e) {
|
||||
log.warn("Failed to update font state", e);
|
||||
}
|
||||
}
|
||||
|
||||
if (isTextShowingOperator(opName) && i > 0) {
|
||||
String textContent = extractTextFromToken(tokens.get(i - 1), opName);
|
||||
if (!textContent.isEmpty()) {
|
||||
log.debug(
|
||||
"Found text segment '{}' at position {} with operator {}",
|
||||
textContent,
|
||||
currentTextPos,
|
||||
opName);
|
||||
segments.add(
|
||||
new TextSegment(
|
||||
i - 1,
|
||||
opName,
|
||||
textContent,
|
||||
currentTextPos,
|
||||
currentTextPos + textContent.length(),
|
||||
graphicsState.font,
|
||||
graphicsState.fontSize));
|
||||
currentTextPos += textContent.length();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
log.debug("Extracted {} text segments from page", segments.size());
|
||||
return segments;
|
||||
}
|
||||
|
||||
private String buildCompleteText(List<TextSegment> segments) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (TextSegment segment : segments) {
|
||||
sb.append(segment.text);
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
private List<MatchRange> findAllMatches(
|
||||
String completeText,
|
||||
Set<String> targetWords,
|
||||
boolean useRegex,
|
||||
boolean wholeWordSearch) {
|
||||
log.debug(
|
||||
"Finding matches in text of {} characters for {} target words",
|
||||
completeText.length(),
|
||||
targetWords.size());
|
||||
|
||||
List<MatchRange> matches = new ArrayList<>();
|
||||
|
||||
for (String target : targetWords) {
|
||||
log.debug("Searching for pattern: '{}'", target);
|
||||
|
||||
String patternString = useRegex ? target : Pattern.quote(target);
|
||||
if (wholeWordSearch) {
|
||||
patternString = "\\b" + patternString + "\\b";
|
||||
}
|
||||
Pattern pattern = Pattern.compile(patternString, Pattern.CASE_INSENSITIVE);
|
||||
Matcher matcher = pattern.matcher(completeText);
|
||||
|
||||
int matchCount = 0;
|
||||
while (matcher.find()) {
|
||||
matches.add(new MatchRange(matcher.start(), matcher.end()));
|
||||
matchCount++;
|
||||
log.debug(
|
||||
"Found match for '{}' at positions {}-{}",
|
||||
target,
|
||||
matcher.start(),
|
||||
matcher.end());
|
||||
}
|
||||
|
||||
log.debug("Total matches for '{}': {}", target, matchCount);
|
||||
}
|
||||
|
||||
matches.sort((a, b) -> Integer.compare(a.startPos, b.startPos));
|
||||
log.debug("Found {} total matches across all patterns", matches.size());
|
||||
|
||||
return matches;
|
||||
}
|
||||
|
||||
private List<Object> applyRedactionsToTokens(
|
||||
List<Object> tokens, List<TextSegment> textSegments, List<MatchRange> matches) {
|
||||
log.debug(
|
||||
"Applying redactions to {} tokens with {} text segments and {} matches",
|
||||
tokens.size(),
|
||||
textSegments.size(),
|
||||
matches.size());
|
||||
|
||||
List<Object> newTokens = new ArrayList<>(tokens);
|
||||
|
||||
// Group matches by segment to pass to modification methods
|
||||
Map<Integer, List<MatchRange>> matchesBySegment = new HashMap<>();
|
||||
for (MatchRange match : matches) {
|
||||
for (int i = 0; i < textSegments.size(); i++) {
|
||||
TextSegment segment = textSegments.get(i);
|
||||
int overlapStart = Math.max(match.startPos, segment.startPos);
|
||||
int overlapEnd = Math.min(match.endPos, segment.endPos);
|
||||
if (overlapStart < overlapEnd) {
|
||||
matchesBySegment.computeIfAbsent(i, k -> new ArrayList<>()).add(match);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
log.debug("Grouped matches by segment: {} segments affected", matchesBySegment.size());
|
||||
|
||||
// Create a list of modification tasks
|
||||
List<ModificationTask> tasks = new ArrayList<>();
|
||||
for (Map.Entry<Integer, List<MatchRange>> entry : matchesBySegment.entrySet()) {
|
||||
int segmentIndex = entry.getKey();
|
||||
List<MatchRange> segmentMatches = entry.getValue();
|
||||
TextSegment segment = textSegments.get(segmentIndex);
|
||||
|
||||
log.debug(
|
||||
"Creating modification task for segment {} with {} matches",
|
||||
segmentIndex,
|
||||
segmentMatches.size());
|
||||
|
||||
if ("Tj".equals(segment.operatorName) || "'".equals(segment.operatorName)) {
|
||||
String newText = applyRedactionsToSegmentText(segment, segmentMatches);
|
||||
try {
|
||||
float adjustment = calculateWidthAdjustment(segment, segmentMatches);
|
||||
tasks.add(new ModificationTask(segment, newText, adjustment));
|
||||
} catch (IOException e) {
|
||||
log.warn("Failed to calculate width adjustment for redaction.", e);
|
||||
}
|
||||
} else if ("TJ".equals(segment.operatorName)) {
|
||||
tasks.add(new ModificationTask(segment, null, 0));
|
||||
}
|
||||
}
|
||||
|
||||
// Sort tasks by token index in descending order to avoid index shifting issues
|
||||
tasks.sort((a, b) -> Integer.compare(b.segment.tokenIndex, a.segment.tokenIndex));
|
||||
|
||||
log.debug("Applying {} modification tasks", tasks.size());
|
||||
|
||||
// Apply modifications
|
||||
for (ModificationTask task : tasks) {
|
||||
List<MatchRange> segmentMatches =
|
||||
matchesBySegment.getOrDefault(
|
||||
textSegments.indexOf(task.segment), Collections.emptyList());
|
||||
modifyTokenForRedaction(
|
||||
newTokens, task.segment, task.newText, task.adjustment, segmentMatches);
|
||||
}
|
||||
|
||||
log.debug("Completed applying redactions to tokens");
|
||||
return newTokens;
|
||||
}
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
private static class ModificationTask {
|
||||
private TextSegment segment;
|
||||
private String newText; // Only for Tj
|
||||
private float adjustment; // Only for Tj
|
||||
}
|
||||
|
||||
private String applyRedactionsToSegmentText(TextSegment segment, List<MatchRange> matches) {
|
||||
String text = segment.getText();
|
||||
StringBuilder result = new StringBuilder(text);
|
||||
|
||||
for (MatchRange match : matches) {
|
||||
int segmentStart = Math.max(0, match.getStartPos() - segment.getStartPos());
|
||||
int segmentEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos());
|
||||
|
||||
if (segmentStart >= 0 && segmentStart < text.length() && segmentEnd > segmentStart) {
|
||||
String placeholder = createPlaceholder(text.substring(segmentStart, segmentEnd));
|
||||
result.replace(segmentStart, segmentEnd, placeholder);
|
||||
}
|
||||
}
|
||||
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
private float calculateWidthAdjustment(TextSegment segment, List<MatchRange> matches)
|
||||
throws IOException {
|
||||
float totalOriginalWidth = 0;
|
||||
float totalPlaceholderWidth = 0;
|
||||
String text = segment.getText();
|
||||
|
||||
for (MatchRange match : matches) {
|
||||
int segmentStart = Math.max(0, match.getStartPos() - segment.getStartPos());
|
||||
int segmentEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos());
|
||||
|
||||
if (segmentStart >= 0 && segmentStart < text.length() && segmentEnd > segmentStart) {
|
||||
String originalPart = text.substring(segmentStart, segmentEnd);
|
||||
String placeholderPart = createPlaceholder(originalPart);
|
||||
|
||||
if (segment.getFont() != null) {
|
||||
totalOriginalWidth +=
|
||||
segment.getFont().getStringWidth(originalPart)
|
||||
/ FONT_SCALE_FACTOR
|
||||
* segment.getFontSize();
|
||||
totalPlaceholderWidth +=
|
||||
segment.getFont().getStringWidth(placeholderPart)
|
||||
/ FONT_SCALE_FACTOR
|
||||
* segment.getFontSize();
|
||||
}
|
||||
}
|
||||
}
|
||||
return totalOriginalWidth - totalPlaceholderWidth;
|
||||
}
|
||||
|
||||
private void modifyTokenForRedaction(
|
||||
List<Object> tokens,
|
||||
TextSegment segment,
|
||||
String newText,
|
||||
float adjustment,
|
||||
List<MatchRange> matches) {
|
||||
log.debug(
|
||||
"Modifying token at index {} for segment '{}' with operator {}",
|
||||
segment.getTokenIndex(),
|
||||
segment.getText(),
|
||||
segment.getOperatorName());
|
||||
|
||||
if (segment.getTokenIndex() < 0 || segment.getTokenIndex() >= tokens.size()) {
|
||||
log.debug(
|
||||
"Token index {} out of bounds (0-{})",
|
||||
segment.getTokenIndex(),
|
||||
tokens.size() - 1);
|
||||
return;
|
||||
}
|
||||
|
||||
Object token = tokens.get(segment.getTokenIndex());
|
||||
String operatorName = segment.getOperatorName();
|
||||
|
||||
try {
|
||||
if (("Tj".equals(operatorName) || "'".equals(operatorName))
|
||||
&& token instanceof COSString) {
|
||||
log.debug("Modifying Tj/quote operator with adjustment {}", adjustment);
|
||||
|
||||
if (Math.abs(adjustment) < PRECISION_THRESHOLD) {
|
||||
tokens.set(segment.getTokenIndex(), new COSString(newText));
|
||||
} else {
|
||||
COSArray newArray = new COSArray();
|
||||
newArray.add(new COSString(newText));
|
||||
if (segment.getFontSize() > 0) {
|
||||
float kerning = -FONT_SCALE_FACTOR * adjustment / segment.getFontSize();
|
||||
newArray.add(new org.apache.pdfbox.cos.COSFloat(kerning));
|
||||
log.debug("Applied kerning adjustment: {}", kerning);
|
||||
}
|
||||
tokens.set(segment.getTokenIndex(), newArray);
|
||||
|
||||
int operatorIndex = segment.getTokenIndex() + 1;
|
||||
if (operatorIndex < tokens.size()
|
||||
&& tokens.get(operatorIndex) instanceof Operator op
|
||||
&& op.getName().equals(operatorName)) {
|
||||
tokens.set(operatorIndex, Operator.getOperator("TJ"));
|
||||
log.debug("Changed operator from {} to TJ", operatorName);
|
||||
}
|
||||
}
|
||||
} else if ("TJ".equals(operatorName) && token instanceof COSArray) {
|
||||
log.debug("Modifying TJ operator array");
|
||||
COSArray newArray = createRedactedTJArray((COSArray) token, segment, matches);
|
||||
tokens.set(segment.getTokenIndex(), newArray);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
log.warn("Failed to modify token for redaction: {}", e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
private COSArray createRedactedTJArray(
|
||||
COSArray originalArray, TextSegment segment, List<MatchRange> matches)
|
||||
throws IOException {
|
||||
COSArray newArray = new COSArray();
|
||||
int textOffsetInSegment = 0;
|
||||
|
||||
for (COSBase element : originalArray) {
|
||||
if (element instanceof COSString cosString) {
|
||||
String originalText = cosString.getString();
|
||||
StringBuilder newText = new StringBuilder(originalText);
|
||||
boolean modified = false;
|
||||
|
||||
for (MatchRange match : matches) {
|
||||
int stringStartInPage = segment.getStartPos() + textOffsetInSegment;
|
||||
int stringEndInPage = stringStartInPage + originalText.length();
|
||||
|
||||
int overlapStart = Math.max(match.getStartPos(), stringStartInPage);
|
||||
int overlapEnd = Math.min(match.getEndPos(), stringEndInPage);
|
||||
|
||||
if (overlapStart < overlapEnd) {
|
||||
modified = true;
|
||||
int redactionStartInString = overlapStart - stringStartInPage;
|
||||
int redactionEndInString = overlapEnd - stringStartInPage;
|
||||
if (redactionStartInString >= 0
|
||||
&& redactionEndInString <= originalText.length()) {
|
||||
String placeholder =
|
||||
createPlaceholder(
|
||||
originalText.substring(
|
||||
redactionStartInString, redactionEndInString));
|
||||
newText.replace(
|
||||
redactionStartInString, redactionEndInString, placeholder);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
String modifiedString = newText.toString();
|
||||
newArray.add(new COSString(modifiedString));
|
||||
|
||||
if (modified && segment.getFont() != null && segment.getFontSize() > 0) {
|
||||
float originalWidth =
|
||||
segment.getFont().getStringWidth(originalText)
|
||||
/ FONT_SCALE_FACTOR
|
||||
* segment.getFontSize();
|
||||
float modifiedWidth =
|
||||
segment.getFont().getStringWidth(modifiedString)
|
||||
/ FONT_SCALE_FACTOR
|
||||
* segment.getFontSize();
|
||||
float adjustment = originalWidth - modifiedWidth;
|
||||
if (Math.abs(adjustment) > PRECISION_THRESHOLD) {
|
||||
float kerning = -FONT_SCALE_FACTOR * adjustment / segment.getFontSize();
|
||||
newArray.add(new org.apache.pdfbox.cos.COSFloat(kerning));
|
||||
}
|
||||
}
|
||||
|
||||
textOffsetInSegment += originalText.length();
|
||||
} else {
|
||||
newArray.add(element);
|
||||
}
|
||||
}
|
||||
return newArray;
|
||||
}
|
||||
|
||||
private String extractTextFromToken(Object token, String operatorName) {
|
||||
return switch (operatorName) {
|
||||
case "Tj", "'" -> {
|
||||
if (token instanceof COSString cosString) {
|
||||
yield cosString.getString();
|
||||
}
|
||||
yield "";
|
||||
}
|
||||
case "TJ" -> {
|
||||
if (token instanceof COSArray cosArray) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (COSBase element : cosArray) {
|
||||
if (element instanceof COSString cosString) {
|
||||
sb.append(cosString.getString());
|
||||
}
|
||||
}
|
||||
yield sb.toString();
|
||||
}
|
||||
yield "";
|
||||
}
|
||||
default -> "";
|
||||
};
|
||||
}
|
||||
|
||||
private String createPlaceholder(String originalWord) {
|
||||
if (originalWord == null || originalWord.isEmpty()) {
|
||||
return originalWord;
|
||||
}
|
||||
return "".repeat(originalWord.length());
|
||||
}
|
||||
|
||||
private void writeFilteredContentStream(PDDocument document, PDPage page, List<Object> tokens)
|
||||
throws IOException {
|
||||
log.debug("Writing filtered content stream with {} tokens", tokens.size());
|
||||
|
||||
PDStream newStream = new PDStream(document);
|
||||
try (var out = newStream.createOutputStream()) {
|
||||
ContentStreamWriter writer = new ContentStreamWriter(out);
|
||||
writer.writeTokens(tokens);
|
||||
}
|
||||
page.setContents(newStream);
|
||||
|
||||
log.debug("Successfully wrote filtered content stream");
|
||||
}
|
||||
|
||||
private boolean isTextShowingOperator(String opName) {
|
||||
return TEXT_SHOWING_OPERATORS.contains(opName);
|
||||
}
|
||||
}
|
||||
|
@ -6,102 +6,109 @@ import java.util.List;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import stirling.software.SPDF.model.PDFText;
|
||||
|
||||
@Slf4j
|
||||
public class TextFinder extends PDFTextStripper {
|
||||
|
||||
private final String searchText;
|
||||
private final String searchTerm;
|
||||
private final boolean useRegex;
|
||||
private final boolean wholeWordSearch;
|
||||
private final List<PDFText> textOccurrences = new ArrayList<>();
|
||||
private final List<PDFText> foundTexts = new ArrayList<>();
|
||||
|
||||
public TextFinder(String searchText, boolean useRegex, boolean wholeWordSearch)
|
||||
private final List<TextPosition> pageTextPositions = new ArrayList<>();
|
||||
private final StringBuilder pageTextBuilder = new StringBuilder();
|
||||
|
||||
public TextFinder(String searchTerm, boolean useRegex, boolean wholeWordSearch)
|
||||
throws IOException {
|
||||
this.searchText = searchText.toLowerCase();
|
||||
super();
|
||||
this.searchTerm = searchTerm;
|
||||
this.useRegex = useRegex;
|
||||
this.wholeWordSearch = wholeWordSearch;
|
||||
setSortByPosition(true);
|
||||
this.setWordSeparator(" ");
|
||||
}
|
||||
|
||||
private List<MatchInfo> findOccurrencesInText(String searchText, String content) {
|
||||
List<MatchInfo> matches = new ArrayList<>();
|
||||
|
||||
Pattern pattern;
|
||||
|
||||
if (useRegex) {
|
||||
// Use regex-based search
|
||||
pattern =
|
||||
wholeWordSearch
|
||||
? Pattern.compile("\\b" + searchText + "\\b")
|
||||
: Pattern.compile(searchText);
|
||||
} else {
|
||||
// Use normal text search
|
||||
pattern =
|
||||
wholeWordSearch
|
||||
? Pattern.compile("\\b" + Pattern.quote(searchText) + "\\b")
|
||||
: Pattern.compile(Pattern.quote(searchText));
|
||||
}
|
||||
|
||||
Matcher matcher = pattern.matcher(content);
|
||||
while (matcher.find()) {
|
||||
matches.add(new MatchInfo(matcher.start(), matcher.end() - matcher.start()));
|
||||
}
|
||||
return matches;
|
||||
@Override
|
||||
protected void startPage(PDPage page) {
|
||||
pageTextPositions.clear();
|
||||
pageTextBuilder.setLength(0);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void writeString(String text, List<TextPosition> textPositions) {
|
||||
for (MatchInfo match : findOccurrencesInText(searchText, text.toLowerCase())) {
|
||||
int index = match.startIndex;
|
||||
if (index + match.matchLength <= textPositions.size()) {
|
||||
// Initial values based on the first character
|
||||
TextPosition first = textPositions.get(index);
|
||||
float minX = first.getX();
|
||||
float minY = first.getY();
|
||||
float maxX = first.getX() + first.getWidth();
|
||||
float maxY = first.getY() + first.getHeight();
|
||||
pageTextBuilder.append(text);
|
||||
pageTextPositions.addAll(textPositions);
|
||||
}
|
||||
|
||||
// Loop over the rest of the characters and adjust bounding box values
|
||||
for (int i = index; i < index + match.matchLength; i++) {
|
||||
TextPosition position = textPositions.get(i);
|
||||
minX = Math.min(minX, position.getX());
|
||||
minY = Math.min(minY, position.getY());
|
||||
maxX = Math.max(maxX, position.getX() + position.getWidth());
|
||||
maxY = Math.max(maxY, position.getY() + position.getHeight());
|
||||
@Override
|
||||
protected void writeWordSeparator() {
|
||||
pageTextBuilder.append(getWordSeparator());
|
||||
pageTextPositions.add(null); // Placeholder for separator
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void writeLineSeparator() {
|
||||
pageTextBuilder.append(getLineSeparator());
|
||||
pageTextPositions.add(null); // Placeholder for separator
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void endPage(PDPage page) {
|
||||
String text = pageTextBuilder.toString();
|
||||
if (text.isEmpty() || this.searchTerm == null || this.searchTerm.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
String processedSearchTerm = this.searchTerm.trim();
|
||||
String regex = this.useRegex ? processedSearchTerm : "\\Q" + processedSearchTerm + "\\E";
|
||||
if (this.wholeWordSearch) {
|
||||
regex = "\\b" + regex + "\\b";
|
||||
}
|
||||
|
||||
Pattern pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
|
||||
Matcher matcher = pattern.matcher(text);
|
||||
|
||||
while (matcher.find()) {
|
||||
int matchStart = matcher.start();
|
||||
int matchEnd = matcher.end();
|
||||
|
||||
float minX = Float.MAX_VALUE;
|
||||
float minY = Float.MAX_VALUE;
|
||||
float maxX = Float.MIN_VALUE;
|
||||
float maxY = Float.MIN_VALUE;
|
||||
boolean foundPosition = false;
|
||||
|
||||
for (int i = matchStart; i < matchEnd; i++) {
|
||||
if (i >= pageTextPositions.size()) {
|
||||
continue;
|
||||
}
|
||||
TextPosition pos = pageTextPositions.get(i);
|
||||
if (pos != null) {
|
||||
foundPosition = true;
|
||||
minX = Math.min(minX, pos.getX());
|
||||
maxX = Math.max(maxX, pos.getX() + pos.getWidth());
|
||||
minY = Math.min(minY, pos.getY() - pos.getHeight());
|
||||
maxY = Math.max(maxY, pos.getY());
|
||||
}
|
||||
}
|
||||
|
||||
textOccurrences.add(
|
||||
new PDFText(getCurrentPageNo() - 1, minX, minY, maxX, maxY, text));
|
||||
if (foundPosition) {
|
||||
foundTexts.add(
|
||||
new PDFText(
|
||||
this.getCurrentPageNo() - 1,
|
||||
minX,
|
||||
minY,
|
||||
maxX,
|
||||
maxY,
|
||||
matcher.group()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public List<PDFText> getTextLocations(PDDocument document) throws Exception {
|
||||
this.getText(document);
|
||||
log.debug(
|
||||
"Found "
|
||||
+ textOccurrences.size()
|
||||
+ " occurrences of '"
|
||||
+ searchText
|
||||
+ "' in the document.");
|
||||
|
||||
return textOccurrences;
|
||||
}
|
||||
|
||||
private class MatchInfo {
|
||||
int startIndex;
|
||||
int matchLength;
|
||||
|
||||
MatchInfo(int startIndex, int matchLength) {
|
||||
this.startIndex = startIndex;
|
||||
this.matchLength = matchLength;
|
||||
}
|
||||
public List<PDFText> getFoundTexts() {
|
||||
return foundTexts;
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user