feat: auto-redact to support text removal on true PDFs

This commit is contained in:
Balázs Szücs 2025-07-12 11:03:33 +02:00
parent bbf5d5f6d4
commit d7fb66bb79
2 changed files with 753 additions and 125 deletions

View File

@ -1,19 +1,33 @@
package stirling.software.SPDF.controller.api.security;
import java.awt.*;
import java.awt.Color;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdfwriter.ContentStreamWriter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.WebDataBinder;
import org.springframework.web.bind.annotation.InitBinder;
@ -27,6 +41,8 @@ import io.github.pixee.security.Filenames;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@ -48,6 +64,13 @@ import stirling.software.common.util.propertyeditor.StringToArrayListPropertyEdi
@RequiredArgsConstructor
public class RedactController {
private static final float DEFAULT_TEXT_PADDING_MULTIPLIER = 0.3f;
private static final float PRECISION_THRESHOLD = 1e-3f;
private static final int FONT_SCALE_FACTOR = 1000;
// Text showing operators
private static final Set<String> TEXT_SHOWING_OPERATORS = Set.of("Tj", "TJ", "'", "\"");
private final CustomPDFDocumentFactory pdfDocumentFactory;
@InitBinder
@ -65,17 +88,30 @@ public class RedactController {
+ " Type:SISO")
public ResponseEntity<byte[]> redactPDF(@ModelAttribute ManualRedactPdfRequest request)
throws IOException {
log.debug(
"Starting manual redaction for file: {}",
request.getFileInput().getOriginalFilename());
MultipartFile file = request.getFileInput();
List<RedactionArea> redactionAreas = request.getRedactions();
log.debug(
"Processing {} redaction areas",
redactionAreas != null ? redactionAreas.size() : 0);
PDDocument document = pdfDocumentFactory.load(file);
log.debug("Loaded PDF document with {} pages", document.getNumberOfPages());
PDPageTree allPages = document.getDocumentCatalog().getPages();
log.debug("Starting page redactions");
redactPages(request, document, allPages);
log.debug("Starting area redactions");
redactAreas(redactionAreas, document, allPages);
if (Boolean.TRUE.equals(request.getConvertPDFToImage())) {
log.debug("Converting PDF to image format");
PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document);
document.close();
document = convertedPdf;
@ -86,6 +122,8 @@ public class RedactController {
document.close();
byte[] pdfContent = baos.toByteArray();
log.debug("Manual redaction completed. Output PDF size: {} bytes", pdfContent.length);
return WebResponseUtils.bytesToWebResponse(
pdfContent,
Filenames.toSimpleFileName(file.getOriginalFilename()).replaceFirst("[.][^.]+$", "")
@ -95,17 +133,30 @@ public class RedactController {
private void redactAreas(
List<RedactionArea> redactionAreas, PDDocument document, PDPageTree allPages)
throws IOException {
log.debug("Processing redaction areas");
// Group redaction areas by page
Map<Integer, List<RedactionArea>> redactionsByPage = new HashMap<>();
// Process and validate each redaction area
for (RedactionArea redactionArea : redactionAreas) {
log.debug(
"Validating redaction area on page {}: x={}, y={}, width={}, height={}",
redactionArea.getPage(),
redactionArea.getX(),
redactionArea.getY(),
redactionArea.getWidth(),
redactionArea.getHeight());
if (redactionArea.getPage() == null
|| redactionArea.getPage() <= 0
|| redactionArea.getHeight() == null
|| redactionArea.getHeight() <= 0.0D
|| redactionArea.getWidth() == null
|| redactionArea.getWidth() <= 0.0D) continue;
|| redactionArea.getWidth() <= 0.0D) {
log.debug("Skipping invalid redaction area: {}", redactionArea);
continue;
}
// Group by page number
redactionsByPage
@ -113,70 +164,151 @@ public class RedactController {
.add(redactionArea);
}
log.debug("Grouped redactions by page: {} pages affected", redactionsByPage.size());
// Process each page only once
for (Map.Entry<Integer, List<RedactionArea>> entry : redactionsByPage.entrySet()) {
Integer pageNumber = entry.getKey();
List<RedactionArea> areasForPage = entry.getValue();
log.debug(
"Processing page {} with {} redaction areas", pageNumber, areasForPage.size());
if (pageNumber > allPages.getCount()) {
log.debug(
"Skipping page {} - out of bounds (total pages: {})",
pageNumber,
allPages.getCount());
continue; // Skip if page number is out of bounds
}
PDPage page = allPages.get(pageNumber - 1);
PDRectangle box = page.getBBox();
// Create only one content stream per page
PDPageContentStream contentStream =
// Create only one content stream per page to draw all redaction boxes
try (PDPageContentStream contentStream =
new PDPageContentStream(
document, page, PDPageContentStream.AppendMode.APPEND, true, true);
document, page, PDPageContentStream.AppendMode.APPEND, true, true)) {
// Process all redactions for this page
for (RedactionArea redactionArea : areasForPage) {
Color redactColor = decodeOrDefault(redactionArea.getColor(), Color.BLACK);
contentStream.setNonStrokingColor(redactColor);
// Process all redactions for this page
for (RedactionArea redactionArea : areasForPage) {
Color redactColor = decodeOrDefault(redactionArea.getColor());
log.debug(
"Applying redaction with color {} at ({}, {}) size {}x{}",
redactColor,
redactionArea.getX(),
redactionArea.getY(),
redactionArea.getWidth(),
redactionArea.getHeight());
float x = redactionArea.getX().floatValue();
float y = redactionArea.getY().floatValue();
float width = redactionArea.getWidth().floatValue();
float height = redactionArea.getHeight().floatValue();
contentStream.setNonStrokingColor(redactColor);
contentStream.addRect(x, box.getHeight() - y - height, width, height);
contentStream.fill();
float x = redactionArea.getX().floatValue();
float y = redactionArea.getY().floatValue();
float width = redactionArea.getWidth().floatValue();
float height = redactionArea.getHeight().floatValue();
// The y-coordinate needs to be transformed from a top-left origin to a
// bottom-left origin.
float pdfY = page.getBBox().getHeight() - y - height;
contentStream.addRect(x, pdfY, width, height);
contentStream.fill();
}
}
contentStream.close();
}
log.debug("Completed redaction areas processing");
}
private void redactPages(
ManualRedactPdfRequest request, PDDocument document, PDPageTree allPages)
throws IOException {
Color redactColor = decodeOrDefault(request.getPageRedactionColor(), Color.BLACK);
log.debug("Starting page redactions");
Color redactColor = decodeOrDefault(request.getPageRedactionColor());
List<Integer> pageNumbers = getPageNumbers(request, allPages.getCount());
log.debug("Redacting {} pages with color {}", pageNumbers.size(), redactColor);
for (Integer pageNumber : pageNumbers) {
log.debug("Redacting entire page {}", pageNumber + 1);
PDPage page = allPages.get(pageNumber);
PDPageContentStream contentStream =
try (PDPageContentStream contentStream =
new PDPageContentStream(
document, page, PDPageContentStream.AppendMode.APPEND, true, true);
contentStream.setNonStrokingColor(redactColor);
document, page, PDPageContentStream.AppendMode.APPEND, true, true)) {
contentStream.setNonStrokingColor(redactColor);
PDRectangle box = page.getBBox();
PDRectangle box = page.getBBox();
log.debug(
"Page {} dimensions: {}x{}",
pageNumber + 1,
box.getWidth(),
box.getHeight());
contentStream.addRect(0, 0, box.getWidth(), box.getHeight());
contentStream.fill();
contentStream.close();
contentStream.addRect(0, 0, box.getWidth(), box.getHeight());
contentStream.fill();
}
}
log.debug("Completed page redactions");
}
private Color decodeOrDefault(String hex, Color defaultColor) {
try {
if (hex != null && !hex.startsWith("#")) {
hex = "#" + hex;
private void redactFoundText(
PDDocument document, List<PDFText> blocks, float customPadding, Color redactColor)
throws IOException {
log.debug(
"Redacting {} text blocks with padding {} and color {}",
blocks.size(),
customPadding,
redactColor);
var allPages = document.getDocumentCatalog().getPages();
for (PDFText block : blocks) {
log.debug(
"Redacting text block on page {}: '{}' at ({}, {}) to ({}, {})",
block.getPageIndex() + 1,
block.getText(),
block.getX1(),
block.getY1(),
block.getX2(),
block.getY2());
var page = allPages.get(block.getPageIndex());
try (PDPageContentStream contentStream =
new PDPageContentStream(
document, page, PDPageContentStream.AppendMode.APPEND, true, true)) {
contentStream.setNonStrokingColor(redactColor);
float padding =
(block.getY2() - block.getY1()) * DEFAULT_TEXT_PADDING_MULTIPLIER
+ customPadding;
PDRectangle pageBox = page.getBBox();
contentStream.addRect(
block.getX1(),
pageBox.getHeight() - block.getY2() - padding,
block.getX2() - block.getX1(),
block.getY2() - block.getY1() + 2 * padding);
contentStream.fill();
}
return Color.decode(hex);
} catch (Exception e) {
return defaultColor;
}
log.debug("Completed text block redactions");
}
private Color decodeOrDefault(String hex) {
if (hex == null) {
return Color.BLACK;
}
String colorString = hex.startsWith("#") ? hex : "#" + hex;
try {
return Color.decode(colorString);
} catch (NumberFormatException e) {
log.warn("Invalid color string '{}'. Using default color BLACK.", hex);
return Color.BLACK;
}
}
@ -198,6 +330,10 @@ public class RedactController {
+ " Input:PDF, Output:PDF, Type:SISO")
public ResponseEntity<byte[]> redactPdf(@ModelAttribute RedactPdfRequest request)
throws Exception {
log.debug(
"Starting auto-redaction for file: {}",
request.getFileInput().getOriginalFilename());
MultipartFile file = request.getFileInput();
String listOfTextString = request.getListOfText();
boolean useRegex = Boolean.TRUE.equals(request.getUseRegex());
@ -206,28 +342,80 @@ public class RedactController {
float customPadding = request.getCustomPadding();
boolean convertPDFToImage = Boolean.TRUE.equals(request.getConvertPDFToImage());
log.debug(
"Auto-redaction parameters: useRegex={}, wholeWordSearch={}, customPadding={}, convertToImage={}",
useRegex,
wholeWordSearchBool,
customPadding,
convertPDFToImage);
String[] listOfText = listOfTextString.split("\n");
log.debug("Searching for {} text patterns", listOfText.length);
PDDocument document = pdfDocumentFactory.load(file);
log.debug("Loaded PDF document with {} pages", document.getNumberOfPages());
Color redactColor;
try {
if (!colorString.startsWith("#")) {
if (colorString != null && !colorString.startsWith("#")) {
colorString = "#" + colorString;
}
redactColor = Color.decode(colorString);
log.debug("Using redaction color: {}", redactColor);
} catch (NumberFormatException e) {
log.warn("Invalid color string provided. Using default color BLACK for redaction.");
redactColor = Color.BLACK;
}
// Step 1: Find all text locations for all search terms
log.debug("Step 1: Finding all text locations");
Map<Integer, List<PDFText>> allFoundTextsByPage = new HashMap<>();
Set<String> allSearchTerms = new HashSet<>();
for (String text : listOfText) {
text = text.trim();
if (text.isEmpty()) continue;
log.debug("Searching for text pattern: '{}'", text);
allSearchTerms.add(text);
TextFinder textFinder = new TextFinder(text, useRegex, wholeWordSearchBool);
List<PDFText> foundTexts = textFinder.getTextLocations(document);
redactFoundText(document, foundTexts, customPadding, redactColor);
textFinder.getText(document);
List<PDFText> foundTexts = textFinder.getFoundTexts();
log.debug("Found {} instances of pattern '{}'", foundTexts.size(), text);
for (PDFText found : foundTexts) {
allFoundTextsByPage
.computeIfAbsent(found.getPageIndex(), k -> new ArrayList<>())
.add(found);
}
}
log.debug("Total pages with found text: {}", allFoundTextsByPage.size());
// Step 2: Process each page
log.debug("Step 2: Processing each page for text replacement");
for (PDPage page : document.getPages()) {
// Replace text content
List<Object> filteredTokens =
createTokensWithoutTargetText(
page, allSearchTerms, useRegex, wholeWordSearchBool);
writeFilteredContentStream(document, page, filteredTokens);
}
// Draw redaction boxes for all found texts
List<PDFText> allFoundTexts = new ArrayList<>();
for (List<PDFText> pageTexts : allFoundTextsByPage.values()) {
allFoundTexts.addAll(pageTexts);
}
log.debug("Drawing redaction boxes for {} total found texts", allFoundTexts.size());
if (!allFoundTexts.isEmpty()) {
redactFoundText(document, allFoundTexts, customPadding, redactColor);
}
if (convertPDFToImage) {
log.debug("Converting redacted PDF to image format");
PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document);
document.close();
document = convertedPdf;
@ -238,32 +426,465 @@ public class RedactController {
document.close();
byte[] pdfContent = baos.toByteArray();
log.debug("Auto-redaction completed. Output PDF size: {} bytes", pdfContent.length);
return WebResponseUtils.bytesToWebResponse(
pdfContent,
Filenames.toSimpleFileName(file.getOriginalFilename()).replaceFirst("[.][^.]+$", "")
+ "_redacted.pdf");
}
private void redactFoundText(
PDDocument document, List<PDFText> blocks, float customPadding, Color redactColor)
private List<Object> createTokensWithoutTargetText(
PDPage page, Set<String> targetWords, boolean useRegex, boolean wholeWordSearch)
throws IOException {
var allPages = document.getDocumentCatalog().getPages();
log.debug(
"Creating tokens without target text for page, searching for {} words",
targetWords.size());
for (PDFText block : blocks) {
var page = allPages.get(block.getPageIndex());
PDPageContentStream contentStream =
new PDPageContentStream(
document, page, PDPageContentStream.AppendMode.APPEND, true, true);
contentStream.setNonStrokingColor(redactColor);
float padding = (block.getY2() - block.getY1()) * 0.3f + customPadding;
PDRectangle pageBox = page.getBBox();
contentStream.addRect(
block.getX1(),
pageBox.getHeight() - block.getY1() - padding,
block.getX2() - block.getX1(),
block.getY2() - block.getY1() + 2 * padding);
contentStream.fill();
contentStream.close();
PDFStreamParser parser = new PDFStreamParser(page);
List<Object> tokens = new ArrayList<>();
Object token;
while ((token = parser.parseNextToken()) != null) {
tokens.add(token);
}
log.debug("Parsed {} tokens from page content stream", tokens.size());
List<TextSegment> textSegments = extractTextSegments(page, tokens);
log.debug("Extracted {} text segments", textSegments.size());
String completeText = buildCompleteText(textSegments);
log.debug("Built complete text of {} characters", completeText.length());
List<MatchRange> matches =
findAllMatches(completeText, targetWords, useRegex, wholeWordSearch);
log.debug("Found {} matches in complete text", matches.size());
return applyRedactionsToTokens(tokens, textSegments, matches);
}
@Data
private static class GraphicsState {
private PDFont font = null;
private float fontSize = 0;
}
@Data
@AllArgsConstructor
private static class TextSegment {
private int tokenIndex;
private String operatorName;
private String text;
private int startPos;
private int endPos;
private PDFont font;
private float fontSize;
}
@Data
@AllArgsConstructor
private static class MatchRange {
private int startPos;
private int endPos;
}
private List<TextSegment> extractTextSegments(PDPage page, List<Object> tokens)
throws IOException {
log.debug("Extracting text segments from {} tokens", tokens.size());
List<TextSegment> segments = new ArrayList<>();
int currentTextPos = 0;
GraphicsState graphicsState = new GraphicsState();
PDResources resources = page.getResources();
for (int i = 0; i < tokens.size(); i++) {
Object currentToken = tokens.get(i);
if (currentToken instanceof Operator op) {
String opName = op.getName();
if ("Tf".equals(opName) && i >= 2) {
try {
COSName fontName = (COSName) tokens.get(i - 2);
COSBase fontSizeBase = (COSBase) tokens.get(i - 1);
if (fontSizeBase instanceof org.apache.pdfbox.cos.COSNumber cosNumber) {
graphicsState.setFont(resources.getFont(fontName));
graphicsState.setFontSize(cosNumber.floatValue());
log.debug(
"Updated font state: {} size {}",
fontName.getName(),
graphicsState.getFontSize());
}
} catch (ClassCastException | IOException e) {
log.warn("Failed to update font state", e);
}
}
if (isTextShowingOperator(opName) && i > 0) {
String textContent = extractTextFromToken(tokens.get(i - 1), opName);
if (!textContent.isEmpty()) {
log.debug(
"Found text segment '{}' at position {} with operator {}",
textContent,
currentTextPos,
opName);
segments.add(
new TextSegment(
i - 1,
opName,
textContent,
currentTextPos,
currentTextPos + textContent.length(),
graphicsState.font,
graphicsState.fontSize));
currentTextPos += textContent.length();
}
}
}
}
log.debug("Extracted {} text segments from page", segments.size());
return segments;
}
private String buildCompleteText(List<TextSegment> segments) {
StringBuilder sb = new StringBuilder();
for (TextSegment segment : segments) {
sb.append(segment.text);
}
return sb.toString();
}
private List<MatchRange> findAllMatches(
String completeText,
Set<String> targetWords,
boolean useRegex,
boolean wholeWordSearch) {
log.debug(
"Finding matches in text of {} characters for {} target words",
completeText.length(),
targetWords.size());
List<MatchRange> matches = new ArrayList<>();
for (String target : targetWords) {
log.debug("Searching for pattern: '{}'", target);
String patternString = useRegex ? target : Pattern.quote(target);
if (wholeWordSearch) {
patternString = "\\b" + patternString + "\\b";
}
Pattern pattern = Pattern.compile(patternString, Pattern.CASE_INSENSITIVE);
Matcher matcher = pattern.matcher(completeText);
int matchCount = 0;
while (matcher.find()) {
matches.add(new MatchRange(matcher.start(), matcher.end()));
matchCount++;
log.debug(
"Found match for '{}' at positions {}-{}",
target,
matcher.start(),
matcher.end());
}
log.debug("Total matches for '{}': {}", target, matchCount);
}
matches.sort((a, b) -> Integer.compare(a.startPos, b.startPos));
log.debug("Found {} total matches across all patterns", matches.size());
return matches;
}
private List<Object> applyRedactionsToTokens(
List<Object> tokens, List<TextSegment> textSegments, List<MatchRange> matches) {
log.debug(
"Applying redactions to {} tokens with {} text segments and {} matches",
tokens.size(),
textSegments.size(),
matches.size());
List<Object> newTokens = new ArrayList<>(tokens);
// Group matches by segment to pass to modification methods
Map<Integer, List<MatchRange>> matchesBySegment = new HashMap<>();
for (MatchRange match : matches) {
for (int i = 0; i < textSegments.size(); i++) {
TextSegment segment = textSegments.get(i);
int overlapStart = Math.max(match.startPos, segment.startPos);
int overlapEnd = Math.min(match.endPos, segment.endPos);
if (overlapStart < overlapEnd) {
matchesBySegment.computeIfAbsent(i, k -> new ArrayList<>()).add(match);
}
}
}
log.debug("Grouped matches by segment: {} segments affected", matchesBySegment.size());
// Create a list of modification tasks
List<ModificationTask> tasks = new ArrayList<>();
for (Map.Entry<Integer, List<MatchRange>> entry : matchesBySegment.entrySet()) {
int segmentIndex = entry.getKey();
List<MatchRange> segmentMatches = entry.getValue();
TextSegment segment = textSegments.get(segmentIndex);
log.debug(
"Creating modification task for segment {} with {} matches",
segmentIndex,
segmentMatches.size());
if ("Tj".equals(segment.operatorName) || "'".equals(segment.operatorName)) {
String newText = applyRedactionsToSegmentText(segment, segmentMatches);
try {
float adjustment = calculateWidthAdjustment(segment, segmentMatches);
tasks.add(new ModificationTask(segment, newText, adjustment));
} catch (IOException e) {
log.warn("Failed to calculate width adjustment for redaction.", e);
}
} else if ("TJ".equals(segment.operatorName)) {
tasks.add(new ModificationTask(segment, null, 0));
}
}
// Sort tasks by token index in descending order to avoid index shifting issues
tasks.sort((a, b) -> Integer.compare(b.segment.tokenIndex, a.segment.tokenIndex));
log.debug("Applying {} modification tasks", tasks.size());
// Apply modifications
for (ModificationTask task : tasks) {
List<MatchRange> segmentMatches =
matchesBySegment.getOrDefault(
textSegments.indexOf(task.segment), Collections.emptyList());
modifyTokenForRedaction(
newTokens, task.segment, task.newText, task.adjustment, segmentMatches);
}
log.debug("Completed applying redactions to tokens");
return newTokens;
}
@Data
@AllArgsConstructor
private static class ModificationTask {
private TextSegment segment;
private String newText; // Only for Tj
private float adjustment; // Only for Tj
}
private String applyRedactionsToSegmentText(TextSegment segment, List<MatchRange> matches) {
String text = segment.getText();
StringBuilder result = new StringBuilder(text);
for (MatchRange match : matches) {
int segmentStart = Math.max(0, match.getStartPos() - segment.getStartPos());
int segmentEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos());
if (segmentStart >= 0 && segmentStart < text.length() && segmentEnd > segmentStart) {
String placeholder = createPlaceholder(text.substring(segmentStart, segmentEnd));
result.replace(segmentStart, segmentEnd, placeholder);
}
}
return result.toString();
}
private float calculateWidthAdjustment(TextSegment segment, List<MatchRange> matches)
throws IOException {
float totalOriginalWidth = 0;
float totalPlaceholderWidth = 0;
String text = segment.getText();
for (MatchRange match : matches) {
int segmentStart = Math.max(0, match.getStartPos() - segment.getStartPos());
int segmentEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos());
if (segmentStart >= 0 && segmentStart < text.length() && segmentEnd > segmentStart) {
String originalPart = text.substring(segmentStart, segmentEnd);
String placeholderPart = createPlaceholder(originalPart);
if (segment.getFont() != null) {
totalOriginalWidth +=
segment.getFont().getStringWidth(originalPart)
/ FONT_SCALE_FACTOR
* segment.getFontSize();
totalPlaceholderWidth +=
segment.getFont().getStringWidth(placeholderPart)
/ FONT_SCALE_FACTOR
* segment.getFontSize();
}
}
}
return totalOriginalWidth - totalPlaceholderWidth;
}
private void modifyTokenForRedaction(
List<Object> tokens,
TextSegment segment,
String newText,
float adjustment,
List<MatchRange> matches) {
log.debug(
"Modifying token at index {} for segment '{}' with operator {}",
segment.getTokenIndex(),
segment.getText(),
segment.getOperatorName());
if (segment.getTokenIndex() < 0 || segment.getTokenIndex() >= tokens.size()) {
log.debug(
"Token index {} out of bounds (0-{})",
segment.getTokenIndex(),
tokens.size() - 1);
return;
}
Object token = tokens.get(segment.getTokenIndex());
String operatorName = segment.getOperatorName();
try {
if (("Tj".equals(operatorName) || "'".equals(operatorName))
&& token instanceof COSString) {
log.debug("Modifying Tj/quote operator with adjustment {}", adjustment);
if (Math.abs(adjustment) < PRECISION_THRESHOLD) {
tokens.set(segment.getTokenIndex(), new COSString(newText));
} else {
COSArray newArray = new COSArray();
newArray.add(new COSString(newText));
if (segment.getFontSize() > 0) {
float kerning = -FONT_SCALE_FACTOR * adjustment / segment.getFontSize();
newArray.add(new org.apache.pdfbox.cos.COSFloat(kerning));
log.debug("Applied kerning adjustment: {}", kerning);
}
tokens.set(segment.getTokenIndex(), newArray);
int operatorIndex = segment.getTokenIndex() + 1;
if (operatorIndex < tokens.size()
&& tokens.get(operatorIndex) instanceof Operator op
&& op.getName().equals(operatorName)) {
tokens.set(operatorIndex, Operator.getOperator("TJ"));
log.debug("Changed operator from {} to TJ", operatorName);
}
}
} else if ("TJ".equals(operatorName) && token instanceof COSArray) {
log.debug("Modifying TJ operator array");
COSArray newArray = createRedactedTJArray((COSArray) token, segment, matches);
tokens.set(segment.getTokenIndex(), newArray);
}
} catch (IOException e) {
log.warn("Failed to modify token for redaction: {}", e.getMessage(), e);
}
}
private COSArray createRedactedTJArray(
COSArray originalArray, TextSegment segment, List<MatchRange> matches)
throws IOException {
COSArray newArray = new COSArray();
int textOffsetInSegment = 0;
for (COSBase element : originalArray) {
if (element instanceof COSString cosString) {
String originalText = cosString.getString();
StringBuilder newText = new StringBuilder(originalText);
boolean modified = false;
for (MatchRange match : matches) {
int stringStartInPage = segment.getStartPos() + textOffsetInSegment;
int stringEndInPage = stringStartInPage + originalText.length();
int overlapStart = Math.max(match.getStartPos(), stringStartInPage);
int overlapEnd = Math.min(match.getEndPos(), stringEndInPage);
if (overlapStart < overlapEnd) {
modified = true;
int redactionStartInString = overlapStart - stringStartInPage;
int redactionEndInString = overlapEnd - stringStartInPage;
if (redactionStartInString >= 0
&& redactionEndInString <= originalText.length()) {
String placeholder =
createPlaceholder(
originalText.substring(
redactionStartInString, redactionEndInString));
newText.replace(
redactionStartInString, redactionEndInString, placeholder);
}
}
}
String modifiedString = newText.toString();
newArray.add(new COSString(modifiedString));
if (modified && segment.getFont() != null && segment.getFontSize() > 0) {
float originalWidth =
segment.getFont().getStringWidth(originalText)
/ FONT_SCALE_FACTOR
* segment.getFontSize();
float modifiedWidth =
segment.getFont().getStringWidth(modifiedString)
/ FONT_SCALE_FACTOR
* segment.getFontSize();
float adjustment = originalWidth - modifiedWidth;
if (Math.abs(adjustment) > PRECISION_THRESHOLD) {
float kerning = -FONT_SCALE_FACTOR * adjustment / segment.getFontSize();
newArray.add(new org.apache.pdfbox.cos.COSFloat(kerning));
}
}
textOffsetInSegment += originalText.length();
} else {
newArray.add(element);
}
}
return newArray;
}
private String extractTextFromToken(Object token, String operatorName) {
return switch (operatorName) {
case "Tj", "'" -> {
if (token instanceof COSString cosString) {
yield cosString.getString();
}
yield "";
}
case "TJ" -> {
if (token instanceof COSArray cosArray) {
StringBuilder sb = new StringBuilder();
for (COSBase element : cosArray) {
if (element instanceof COSString cosString) {
sb.append(cosString.getString());
}
}
yield sb.toString();
}
yield "";
}
default -> "";
};
}
private String createPlaceholder(String originalWord) {
if (originalWord == null || originalWord.isEmpty()) {
return originalWord;
}
return "".repeat(originalWord.length());
}
private void writeFilteredContentStream(PDDocument document, PDPage page, List<Object> tokens)
throws IOException {
log.debug("Writing filtered content stream with {} tokens", tokens.size());
PDStream newStream = new PDStream(document);
try (var out = newStream.createOutputStream()) {
ContentStreamWriter writer = new ContentStreamWriter(out);
writer.writeTokens(tokens);
}
page.setContents(newStream);
log.debug("Successfully wrote filtered content stream");
}
private boolean isTextShowingOperator(String opName) {
return TEXT_SHOWING_OPERATORS.contains(opName);
}
}

View File

@ -6,102 +6,109 @@ import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.PDFText;
@Slf4j
public class TextFinder extends PDFTextStripper {
private final String searchText;
private final String searchTerm;
private final boolean useRegex;
private final boolean wholeWordSearch;
private final List<PDFText> textOccurrences = new ArrayList<>();
private final List<PDFText> foundTexts = new ArrayList<>();
public TextFinder(String searchText, boolean useRegex, boolean wholeWordSearch)
private final List<TextPosition> pageTextPositions = new ArrayList<>();
private final StringBuilder pageTextBuilder = new StringBuilder();
public TextFinder(String searchTerm, boolean useRegex, boolean wholeWordSearch)
throws IOException {
this.searchText = searchText.toLowerCase();
super();
this.searchTerm = searchTerm;
this.useRegex = useRegex;
this.wholeWordSearch = wholeWordSearch;
setSortByPosition(true);
this.setWordSeparator(" ");
}
private List<MatchInfo> findOccurrencesInText(String searchText, String content) {
List<MatchInfo> matches = new ArrayList<>();
Pattern pattern;
if (useRegex) {
// Use regex-based search
pattern =
wholeWordSearch
? Pattern.compile("\\b" + searchText + "\\b")
: Pattern.compile(searchText);
} else {
// Use normal text search
pattern =
wholeWordSearch
? Pattern.compile("\\b" + Pattern.quote(searchText) + "\\b")
: Pattern.compile(Pattern.quote(searchText));
}
Matcher matcher = pattern.matcher(content);
while (matcher.find()) {
matches.add(new MatchInfo(matcher.start(), matcher.end() - matcher.start()));
}
return matches;
@Override
protected void startPage(PDPage page) {
pageTextPositions.clear();
pageTextBuilder.setLength(0);
}
@Override
protected void writeString(String text, List<TextPosition> textPositions) {
for (MatchInfo match : findOccurrencesInText(searchText, text.toLowerCase())) {
int index = match.startIndex;
if (index + match.matchLength <= textPositions.size()) {
// Initial values based on the first character
TextPosition first = textPositions.get(index);
float minX = first.getX();
float minY = first.getY();
float maxX = first.getX() + first.getWidth();
float maxY = first.getY() + first.getHeight();
pageTextBuilder.append(text);
pageTextPositions.addAll(textPositions);
}
// Loop over the rest of the characters and adjust bounding box values
for (int i = index; i < index + match.matchLength; i++) {
TextPosition position = textPositions.get(i);
minX = Math.min(minX, position.getX());
minY = Math.min(minY, position.getY());
maxX = Math.max(maxX, position.getX() + position.getWidth());
maxY = Math.max(maxY, position.getY() + position.getHeight());
@Override
protected void writeWordSeparator() {
pageTextBuilder.append(getWordSeparator());
pageTextPositions.add(null); // Placeholder for separator
}
@Override
protected void writeLineSeparator() {
pageTextBuilder.append(getLineSeparator());
pageTextPositions.add(null); // Placeholder for separator
}
@Override
protected void endPage(PDPage page) {
String text = pageTextBuilder.toString();
if (text.isEmpty() || this.searchTerm == null || this.searchTerm.isEmpty()) {
return;
}
String processedSearchTerm = this.searchTerm.trim();
String regex = this.useRegex ? processedSearchTerm : "\\Q" + processedSearchTerm + "\\E";
if (this.wholeWordSearch) {
regex = "\\b" + regex + "\\b";
}
Pattern pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
Matcher matcher = pattern.matcher(text);
while (matcher.find()) {
int matchStart = matcher.start();
int matchEnd = matcher.end();
float minX = Float.MAX_VALUE;
float minY = Float.MAX_VALUE;
float maxX = Float.MIN_VALUE;
float maxY = Float.MIN_VALUE;
boolean foundPosition = false;
for (int i = matchStart; i < matchEnd; i++) {
if (i >= pageTextPositions.size()) {
continue;
}
TextPosition pos = pageTextPositions.get(i);
if (pos != null) {
foundPosition = true;
minX = Math.min(minX, pos.getX());
maxX = Math.max(maxX, pos.getX() + pos.getWidth());
minY = Math.min(minY, pos.getY() - pos.getHeight());
maxY = Math.max(maxY, pos.getY());
}
}
textOccurrences.add(
new PDFText(getCurrentPageNo() - 1, minX, minY, maxX, maxY, text));
if (foundPosition) {
foundTexts.add(
new PDFText(
this.getCurrentPageNo() - 1,
minX,
minY,
maxX,
maxY,
matcher.group()));
}
}
}
public List<PDFText> getTextLocations(PDDocument document) throws Exception {
this.getText(document);
log.debug(
"Found "
+ textOccurrences.size()
+ " occurrences of '"
+ searchText
+ "' in the document.");
return textOccurrences;
}
private class MatchInfo {
int startIndex;
int matchLength;
MatchInfo(int startIndex, int matchLength) {
this.startIndex = startIndex;
this.matchLength = matchLength;
}
public List<PDFText> getFoundTexts() {
return foundTexts;
}
}