feat: auto-redact to support text removal on true PDFs

This commit is contained in:
Balázs Szücs 2025-07-12 11:03:33 +02:00
parent bbf5d5f6d4
commit d7fb66bb79
2 changed files with 753 additions and 125 deletions

View File

@ -1,19 +1,33 @@
package stirling.software.SPDF.controller.api.security; package stirling.software.SPDF.controller.api.security;
import java.awt.*; import java.awt.Color;
import java.io.ByteArrayOutputStream; import java.io.ByteArrayOutputStream;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdfwriter.ContentStreamWriter;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream; import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.PDPageTree; import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.springframework.http.ResponseEntity; import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.WebDataBinder; import org.springframework.web.bind.WebDataBinder;
import org.springframework.web.bind.annotation.InitBinder; import org.springframework.web.bind.annotation.InitBinder;
@ -27,6 +41,8 @@ import io.github.pixee.security.Filenames;
import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag; import io.swagger.v3.oas.annotations.tags.Tag;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
@ -48,6 +64,13 @@ import stirling.software.common.util.propertyeditor.StringToArrayListPropertyEdi
@RequiredArgsConstructor @RequiredArgsConstructor
public class RedactController { public class RedactController {
private static final float DEFAULT_TEXT_PADDING_MULTIPLIER = 0.3f;
private static final float PRECISION_THRESHOLD = 1e-3f;
private static final int FONT_SCALE_FACTOR = 1000;
// Text showing operators
private static final Set<String> TEXT_SHOWING_OPERATORS = Set.of("Tj", "TJ", "'", "\"");
private final CustomPDFDocumentFactory pdfDocumentFactory; private final CustomPDFDocumentFactory pdfDocumentFactory;
@InitBinder @InitBinder
@ -65,17 +88,30 @@ public class RedactController {
+ " Type:SISO") + " Type:SISO")
public ResponseEntity<byte[]> redactPDF(@ModelAttribute ManualRedactPdfRequest request) public ResponseEntity<byte[]> redactPDF(@ModelAttribute ManualRedactPdfRequest request)
throws IOException { throws IOException {
log.debug(
"Starting manual redaction for file: {}",
request.getFileInput().getOriginalFilename());
MultipartFile file = request.getFileInput(); MultipartFile file = request.getFileInput();
List<RedactionArea> redactionAreas = request.getRedactions(); List<RedactionArea> redactionAreas = request.getRedactions();
log.debug(
"Processing {} redaction areas",
redactionAreas != null ? redactionAreas.size() : 0);
PDDocument document = pdfDocumentFactory.load(file); PDDocument document = pdfDocumentFactory.load(file);
log.debug("Loaded PDF document with {} pages", document.getNumberOfPages());
PDPageTree allPages = document.getDocumentCatalog().getPages(); PDPageTree allPages = document.getDocumentCatalog().getPages();
log.debug("Starting page redactions");
redactPages(request, document, allPages); redactPages(request, document, allPages);
log.debug("Starting area redactions");
redactAreas(redactionAreas, document, allPages); redactAreas(redactionAreas, document, allPages);
if (Boolean.TRUE.equals(request.getConvertPDFToImage())) { if (Boolean.TRUE.equals(request.getConvertPDFToImage())) {
log.debug("Converting PDF to image format");
PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document); PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document);
document.close(); document.close();
document = convertedPdf; document = convertedPdf;
@ -86,6 +122,8 @@ public class RedactController {
document.close(); document.close();
byte[] pdfContent = baos.toByteArray(); byte[] pdfContent = baos.toByteArray();
log.debug("Manual redaction completed. Output PDF size: {} bytes", pdfContent.length);
return WebResponseUtils.bytesToWebResponse( return WebResponseUtils.bytesToWebResponse(
pdfContent, pdfContent,
Filenames.toSimpleFileName(file.getOriginalFilename()).replaceFirst("[.][^.]+$", "") Filenames.toSimpleFileName(file.getOriginalFilename()).replaceFirst("[.][^.]+$", "")
@ -95,17 +133,30 @@ public class RedactController {
private void redactAreas( private void redactAreas(
List<RedactionArea> redactionAreas, PDDocument document, PDPageTree allPages) List<RedactionArea> redactionAreas, PDDocument document, PDPageTree allPages)
throws IOException { throws IOException {
log.debug("Processing redaction areas");
// Group redaction areas by page // Group redaction areas by page
Map<Integer, List<RedactionArea>> redactionsByPage = new HashMap<>(); Map<Integer, List<RedactionArea>> redactionsByPage = new HashMap<>();
// Process and validate each redaction area // Process and validate each redaction area
for (RedactionArea redactionArea : redactionAreas) { for (RedactionArea redactionArea : redactionAreas) {
log.debug(
"Validating redaction area on page {}: x={}, y={}, width={}, height={}",
redactionArea.getPage(),
redactionArea.getX(),
redactionArea.getY(),
redactionArea.getWidth(),
redactionArea.getHeight());
if (redactionArea.getPage() == null if (redactionArea.getPage() == null
|| redactionArea.getPage() <= 0 || redactionArea.getPage() <= 0
|| redactionArea.getHeight() == null || redactionArea.getHeight() == null
|| redactionArea.getHeight() <= 0.0D || redactionArea.getHeight() <= 0.0D
|| redactionArea.getWidth() == null || redactionArea.getWidth() == null
|| redactionArea.getWidth() <= 0.0D) continue; || redactionArea.getWidth() <= 0.0D) {
log.debug("Skipping invalid redaction area: {}", redactionArea);
continue;
}
// Group by page number // Group by page number
redactionsByPage redactionsByPage
@ -113,70 +164,151 @@ public class RedactController {
.add(redactionArea); .add(redactionArea);
} }
log.debug("Grouped redactions by page: {} pages affected", redactionsByPage.size());
// Process each page only once // Process each page only once
for (Map.Entry<Integer, List<RedactionArea>> entry : redactionsByPage.entrySet()) { for (Map.Entry<Integer, List<RedactionArea>> entry : redactionsByPage.entrySet()) {
Integer pageNumber = entry.getKey(); Integer pageNumber = entry.getKey();
List<RedactionArea> areasForPage = entry.getValue(); List<RedactionArea> areasForPage = entry.getValue();
log.debug(
"Processing page {} with {} redaction areas", pageNumber, areasForPage.size());
if (pageNumber > allPages.getCount()) { if (pageNumber > allPages.getCount()) {
log.debug(
"Skipping page {} - out of bounds (total pages: {})",
pageNumber,
allPages.getCount());
continue; // Skip if page number is out of bounds continue; // Skip if page number is out of bounds
} }
PDPage page = allPages.get(pageNumber - 1); PDPage page = allPages.get(pageNumber - 1);
PDRectangle box = page.getBBox();
// Create only one content stream per page // Create only one content stream per page to draw all redaction boxes
PDPageContentStream contentStream = try (PDPageContentStream contentStream =
new PDPageContentStream( new PDPageContentStream(
document, page, PDPageContentStream.AppendMode.APPEND, true, true); document, page, PDPageContentStream.AppendMode.APPEND, true, true)) {
// Process all redactions for this page // Process all redactions for this page
for (RedactionArea redactionArea : areasForPage) { for (RedactionArea redactionArea : areasForPage) {
Color redactColor = decodeOrDefault(redactionArea.getColor(), Color.BLACK); Color redactColor = decodeOrDefault(redactionArea.getColor());
contentStream.setNonStrokingColor(redactColor); log.debug(
"Applying redaction with color {} at ({}, {}) size {}x{}",
redactColor,
redactionArea.getX(),
redactionArea.getY(),
redactionArea.getWidth(),
redactionArea.getHeight());
float x = redactionArea.getX().floatValue(); contentStream.setNonStrokingColor(redactColor);
float y = redactionArea.getY().floatValue();
float width = redactionArea.getWidth().floatValue();
float height = redactionArea.getHeight().floatValue();
contentStream.addRect(x, box.getHeight() - y - height, width, height); float x = redactionArea.getX().floatValue();
contentStream.fill(); float y = redactionArea.getY().floatValue();
float width = redactionArea.getWidth().floatValue();
float height = redactionArea.getHeight().floatValue();
// The y-coordinate needs to be transformed from a top-left origin to a
// bottom-left origin.
float pdfY = page.getBBox().getHeight() - y - height;
contentStream.addRect(x, pdfY, width, height);
contentStream.fill();
}
} }
contentStream.close();
} }
log.debug("Completed redaction areas processing");
} }
private void redactPages( private void redactPages(
ManualRedactPdfRequest request, PDDocument document, PDPageTree allPages) ManualRedactPdfRequest request, PDDocument document, PDPageTree allPages)
throws IOException { throws IOException {
Color redactColor = decodeOrDefault(request.getPageRedactionColor(), Color.BLACK); log.debug("Starting page redactions");
Color redactColor = decodeOrDefault(request.getPageRedactionColor());
List<Integer> pageNumbers = getPageNumbers(request, allPages.getCount()); List<Integer> pageNumbers = getPageNumbers(request, allPages.getCount());
log.debug("Redacting {} pages with color {}", pageNumbers.size(), redactColor);
for (Integer pageNumber : pageNumbers) { for (Integer pageNumber : pageNumbers) {
log.debug("Redacting entire page {}", pageNumber + 1);
PDPage page = allPages.get(pageNumber); PDPage page = allPages.get(pageNumber);
PDPageContentStream contentStream = try (PDPageContentStream contentStream =
new PDPageContentStream( new PDPageContentStream(
document, page, PDPageContentStream.AppendMode.APPEND, true, true); document, page, PDPageContentStream.AppendMode.APPEND, true, true)) {
contentStream.setNonStrokingColor(redactColor); contentStream.setNonStrokingColor(redactColor);
PDRectangle box = page.getBBox(); PDRectangle box = page.getBBox();
log.debug(
"Page {} dimensions: {}x{}",
pageNumber + 1,
box.getWidth(),
box.getHeight());
contentStream.addRect(0, 0, box.getWidth(), box.getHeight()); contentStream.addRect(0, 0, box.getWidth(), box.getHeight());
contentStream.fill(); contentStream.fill();
contentStream.close(); }
} }
log.debug("Completed page redactions");
} }
private Color decodeOrDefault(String hex, Color defaultColor) { private void redactFoundText(
try { PDDocument document, List<PDFText> blocks, float customPadding, Color redactColor)
if (hex != null && !hex.startsWith("#")) { throws IOException {
hex = "#" + hex; log.debug(
"Redacting {} text blocks with padding {} and color {}",
blocks.size(),
customPadding,
redactColor);
var allPages = document.getDocumentCatalog().getPages();
for (PDFText block : blocks) {
log.debug(
"Redacting text block on page {}: '{}' at ({}, {}) to ({}, {})",
block.getPageIndex() + 1,
block.getText(),
block.getX1(),
block.getY1(),
block.getX2(),
block.getY2());
var page = allPages.get(block.getPageIndex());
try (PDPageContentStream contentStream =
new PDPageContentStream(
document, page, PDPageContentStream.AppendMode.APPEND, true, true)) {
contentStream.setNonStrokingColor(redactColor);
float padding =
(block.getY2() - block.getY1()) * DEFAULT_TEXT_PADDING_MULTIPLIER
+ customPadding;
PDRectangle pageBox = page.getBBox();
contentStream.addRect(
block.getX1(),
pageBox.getHeight() - block.getY2() - padding,
block.getX2() - block.getX1(),
block.getY2() - block.getY1() + 2 * padding);
contentStream.fill();
} }
return Color.decode(hex); }
} catch (Exception e) {
return defaultColor; log.debug("Completed text block redactions");
}
private Color decodeOrDefault(String hex) {
if (hex == null) {
return Color.BLACK;
}
String colorString = hex.startsWith("#") ? hex : "#" + hex;
try {
return Color.decode(colorString);
} catch (NumberFormatException e) {
log.warn("Invalid color string '{}'. Using default color BLACK.", hex);
return Color.BLACK;
} }
} }
@ -198,6 +330,10 @@ public class RedactController {
+ " Input:PDF, Output:PDF, Type:SISO") + " Input:PDF, Output:PDF, Type:SISO")
public ResponseEntity<byte[]> redactPdf(@ModelAttribute RedactPdfRequest request) public ResponseEntity<byte[]> redactPdf(@ModelAttribute RedactPdfRequest request)
throws Exception { throws Exception {
log.debug(
"Starting auto-redaction for file: {}",
request.getFileInput().getOriginalFilename());
MultipartFile file = request.getFileInput(); MultipartFile file = request.getFileInput();
String listOfTextString = request.getListOfText(); String listOfTextString = request.getListOfText();
boolean useRegex = Boolean.TRUE.equals(request.getUseRegex()); boolean useRegex = Boolean.TRUE.equals(request.getUseRegex());
@ -206,28 +342,80 @@ public class RedactController {
float customPadding = request.getCustomPadding(); float customPadding = request.getCustomPadding();
boolean convertPDFToImage = Boolean.TRUE.equals(request.getConvertPDFToImage()); boolean convertPDFToImage = Boolean.TRUE.equals(request.getConvertPDFToImage());
log.debug(
"Auto-redaction parameters: useRegex={}, wholeWordSearch={}, customPadding={}, convertToImage={}",
useRegex,
wholeWordSearchBool,
customPadding,
convertPDFToImage);
String[] listOfText = listOfTextString.split("\n"); String[] listOfText = listOfTextString.split("\n");
log.debug("Searching for {} text patterns", listOfText.length);
PDDocument document = pdfDocumentFactory.load(file); PDDocument document = pdfDocumentFactory.load(file);
log.debug("Loaded PDF document with {} pages", document.getNumberOfPages());
Color redactColor; Color redactColor;
try { try {
if (!colorString.startsWith("#")) { if (colorString != null && !colorString.startsWith("#")) {
colorString = "#" + colorString; colorString = "#" + colorString;
} }
redactColor = Color.decode(colorString); redactColor = Color.decode(colorString);
log.debug("Using redaction color: {}", redactColor);
} catch (NumberFormatException e) { } catch (NumberFormatException e) {
log.warn("Invalid color string provided. Using default color BLACK for redaction."); log.warn("Invalid color string provided. Using default color BLACK for redaction.");
redactColor = Color.BLACK; redactColor = Color.BLACK;
} }
// Step 1: Find all text locations for all search terms
log.debug("Step 1: Finding all text locations");
Map<Integer, List<PDFText>> allFoundTextsByPage = new HashMap<>();
Set<String> allSearchTerms = new HashSet<>();
for (String text : listOfText) { for (String text : listOfText) {
text = text.trim(); text = text.trim();
if (text.isEmpty()) continue;
log.debug("Searching for text pattern: '{}'", text);
allSearchTerms.add(text);
TextFinder textFinder = new TextFinder(text, useRegex, wholeWordSearchBool); TextFinder textFinder = new TextFinder(text, useRegex, wholeWordSearchBool);
List<PDFText> foundTexts = textFinder.getTextLocations(document); textFinder.getText(document);
redactFoundText(document, foundTexts, customPadding, redactColor); List<PDFText> foundTexts = textFinder.getFoundTexts();
log.debug("Found {} instances of pattern '{}'", foundTexts.size(), text);
for (PDFText found : foundTexts) {
allFoundTextsByPage
.computeIfAbsent(found.getPageIndex(), k -> new ArrayList<>())
.add(found);
}
}
log.debug("Total pages with found text: {}", allFoundTextsByPage.size());
// Step 2: Process each page
log.debug("Step 2: Processing each page for text replacement");
for (PDPage page : document.getPages()) {
// Replace text content
List<Object> filteredTokens =
createTokensWithoutTargetText(
page, allSearchTerms, useRegex, wholeWordSearchBool);
writeFilteredContentStream(document, page, filteredTokens);
}
// Draw redaction boxes for all found texts
List<PDFText> allFoundTexts = new ArrayList<>();
for (List<PDFText> pageTexts : allFoundTextsByPage.values()) {
allFoundTexts.addAll(pageTexts);
}
log.debug("Drawing redaction boxes for {} total found texts", allFoundTexts.size());
if (!allFoundTexts.isEmpty()) {
redactFoundText(document, allFoundTexts, customPadding, redactColor);
} }
if (convertPDFToImage) { if (convertPDFToImage) {
log.debug("Converting redacted PDF to image format");
PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document); PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document);
document.close(); document.close();
document = convertedPdf; document = convertedPdf;
@ -238,32 +426,465 @@ public class RedactController {
document.close(); document.close();
byte[] pdfContent = baos.toByteArray(); byte[] pdfContent = baos.toByteArray();
log.debug("Auto-redaction completed. Output PDF size: {} bytes", pdfContent.length);
return WebResponseUtils.bytesToWebResponse( return WebResponseUtils.bytesToWebResponse(
pdfContent, pdfContent,
Filenames.toSimpleFileName(file.getOriginalFilename()).replaceFirst("[.][^.]+$", "") Filenames.toSimpleFileName(file.getOriginalFilename()).replaceFirst("[.][^.]+$", "")
+ "_redacted.pdf"); + "_redacted.pdf");
} }
private void redactFoundText( private List<Object> createTokensWithoutTargetText(
PDDocument document, List<PDFText> blocks, float customPadding, Color redactColor) PDPage page, Set<String> targetWords, boolean useRegex, boolean wholeWordSearch)
throws IOException { throws IOException {
var allPages = document.getDocumentCatalog().getPages(); log.debug(
"Creating tokens without target text for page, searching for {} words",
targetWords.size());
for (PDFText block : blocks) { PDFStreamParser parser = new PDFStreamParser(page);
var page = allPages.get(block.getPageIndex()); List<Object> tokens = new ArrayList<>();
PDPageContentStream contentStream = Object token;
new PDPageContentStream( while ((token = parser.parseNextToken()) != null) {
document, page, PDPageContentStream.AppendMode.APPEND, true, true); tokens.add(token);
contentStream.setNonStrokingColor(redactColor); }
float padding = (block.getY2() - block.getY1()) * 0.3f + customPadding;
PDRectangle pageBox = page.getBBox(); log.debug("Parsed {} tokens from page content stream", tokens.size());
contentStream.addRect(
block.getX1(), List<TextSegment> textSegments = extractTextSegments(page, tokens);
pageBox.getHeight() - block.getY1() - padding, log.debug("Extracted {} text segments", textSegments.size());
block.getX2() - block.getX1(),
block.getY2() - block.getY1() + 2 * padding); String completeText = buildCompleteText(textSegments);
contentStream.fill(); log.debug("Built complete text of {} characters", completeText.length());
contentStream.close();
List<MatchRange> matches =
findAllMatches(completeText, targetWords, useRegex, wholeWordSearch);
log.debug("Found {} matches in complete text", matches.size());
return applyRedactionsToTokens(tokens, textSegments, matches);
}
// Mutable holder for the font selection in effect while a page's tokens are scanned.
// extractTextSegments updates it whenever it encounters a Tf operator.
@Data
private static class GraphicsState {
// Font resolved from the most recent Tf operator; null until one has been seen.
private PDFont font = null;
// Size operand of the most recent Tf operator; 0 until one has been seen.
private float fontSize = 0;
}
// One operand of a text-showing operator, together with its location in the
// concatenated page text that buildCompleteText produces.
@Data
@AllArgsConstructor
private static class TextSegment {
// Index, in the page token list, of the operand token (the token just before the operator).
private int tokenIndex;
// Name of the text-showing operator that consumes the operand (e.g. "Tj", "TJ", "'").
private String operatorName;
// Text extracted from the operand.
private String text;
// Offset of the first character of this segment within the concatenated page text.
private int startPos;
// Offset just past the last character of this segment (startPos + text.length()).
private int endPos;
// Font in effect when the segment was recorded; may be null if no Tf was resolved.
private PDFont font;
// Font size in effect when the segment was recorded; 0 if no Tf was seen.
private float fontSize;
}
// Half-open character range [startPos, endPos) of one search hit in the concatenated page text.
@Data
@AllArgsConstructor
private static class MatchRange {
// Offset of the first matched character (Matcher.start()).
private int startPos;
// Offset just past the last matched character (Matcher.end()).
private int endPos;
}
/**
 * Scans the parsed page tokens and records the operand of every text-showing operator as a
 * {@link TextSegment}, tracking the font and size selected by the preceding {@code Tf}.
 *
 * <p>Segment positions are offsets into the string obtained by concatenating all segment texts
 * in token order. Operands that yield no text are skipped and consume no offset.
 *
 * @param page page whose resources are used to resolve font names; resources may be absent
 * @param tokens tokens of the page content stream, in stream order
 * @return the text segments in the order they appear in {@code tokens}
 * @throws IOException kept for interface compatibility; font lookup failures are logged
 */
private List<TextSegment> extractTextSegments(PDPage page, List<Object> tokens)
        throws IOException {
    log.debug("Extracting text segments from {} tokens", tokens.size());

    List<TextSegment> segments = new ArrayList<>();
    int currentTextPos = 0;
    GraphicsState graphicsState = new GraphicsState();
    // A page without a resource dictionary yields null here; fonts then stay unresolved.
    PDResources resources = page.getResources();

    for (int i = 0; i < tokens.size(); i++) {
        if (!(tokens.get(i) instanceof Operator op)) {
            continue;
        }
        String opName = op.getName();

        if ("Tf".equals(opName) && i >= 2) {
            updateFontState(graphicsState, resources, tokens.get(i - 2), tokens.get(i - 1));
        }

        if (isTextShowingOperator(opName) && i > 0) {
            String textContent = extractTextFromToken(tokens.get(i - 1), opName);
            if (!textContent.isEmpty()) {
                log.debug(
                        "Found text segment '{}' at position {} with operator {}",
                        textContent,
                        currentTextPos,
                        opName);
                segments.add(
                        new TextSegment(
                                i - 1,
                                opName,
                                textContent,
                                currentTextPos,
                                currentTextPos + textContent.length(),
                                graphicsState.getFont(),
                                graphicsState.getFontSize()));
                currentTextPos += textContent.length();
            }
        }
    }

    log.debug("Extracted {} text segments from page", segments.size());
    return segments;
}

/**
 * Applies the two operands of a {@code Tf} operator to the tracked font state. Operands of an
 * unexpected type are reported and leave the state untouched instead of raising a cast error.
 */
private void updateFontState(
        GraphicsState state, PDResources resources, Object fontOperand, Object sizeOperand) {
    if (!(fontOperand instanceof COSName fontName)) {
        log.warn("Failed to update font state: Tf font operand is not a name");
        return;
    }
    if (!(sizeOperand instanceof org.apache.pdfbox.cos.COSNumber cosNumber)) {
        // Same as before: a non-numeric size operand is ignored silently.
        return;
    }
    try {
        // Resolve the font first so a lookup failure leaves font and size consistent.
        PDFont resolved = resources != null ? resources.getFont(fontName) : null;
        state.setFont(resolved);
        state.setFontSize(cosNumber.floatValue());
        log.debug("Updated font state: {} size {}", fontName.getName(), state.getFontSize());
    } catch (IOException e) {
        log.warn("Failed to update font state", e);
    }
}
/**
 * Concatenates the text of all segments, in list order, into a single string. Segment
 * start/end positions are offsets into this string.
 *
 * @param segments segments in content-stream order
 * @return the joined text, empty when there are no segments
 */
private String buildCompleteText(List<TextSegment> segments) {
    StringBuilder joined = new StringBuilder();
    segments.forEach(part -> joined.append(part.getText()));
    return joined.toString();
}
/**
 * Finds every case-insensitive occurrence of each target in the concatenated page text.
 *
 * @param completeText text produced by concatenating all text segments of a page
 * @param targetWords search terms; treated as regular expressions when {@code useRegex} is
 *     set, otherwise as literals
 * @param useRegex whether the targets are regular expressions
 * @param wholeWordSearch whether a hit must be delimited by word boundaries
 * @return all non-empty hits as half-open ranges, sorted by start offset
 * @throws java.util.regex.PatternSyntaxException if a regex target is malformed
 */
private List<MatchRange> findAllMatches(
        String completeText,
        Set<String> targetWords,
        boolean useRegex,
        boolean wholeWordSearch) {
    log.debug(
            "Finding matches in text of {} characters for {} target words",
            completeText.length(),
            targetWords.size());

    List<MatchRange> matches = new ArrayList<>();

    for (String target : targetWords) {
        log.debug("Searching for pattern: '{}'", target);

        String patternString = useRegex ? target : Pattern.quote(target);
        if (wholeWordSearch) {
            // Group the expression so the boundaries apply to the whole pattern; without the
            // group, an alternation such as "a|b" would parse as "\ba" or "b\b".
            patternString = "\\b(?:" + patternString + ")\\b";
        }
        Pattern pattern = Pattern.compile(patternString, Pattern.CASE_INSENSITIVE);
        Matcher matcher = pattern.matcher(completeText);

        int matchCount = 0;
        while (matcher.find()) {
            if (matcher.start() == matcher.end()) {
                // An empty hit (e.g. from "x*") covers no characters, so there is nothing
                // to redact.
                continue;
            }
            matches.add(new MatchRange(matcher.start(), matcher.end()));
            matchCount++;
            log.debug(
                    "Found match for '{}' at positions {}-{}",
                    target,
                    matcher.start(),
                    matcher.end());
        }
        log.debug("Total matches for '{}': {}", target, matchCount);
    }

    matches.sort((a, b) -> Integer.compare(a.getStartPos(), b.getStartPos()));
    log.debug("Found {} total matches across all patterns", matches.size());
    return matches;
}
/**
 * Produces a copy of the page tokens in which every text operand overlapped by a match has had
 * the matched characters replaced.
 *
 * <p>If the width of the removed text cannot be measured, the text is still removed and only
 * the spacing compensation is dropped. A redaction must never be skipped because of a font
 * metric problem.
 *
 * @param tokens original page tokens; not modified
 * @param textSegments segments extracted from {@code tokens}
 * @param matches hits expressed as offsets into the concatenated segment text
 * @return a new token list with the redactions applied
 */
private List<Object> applyRedactionsToTokens(
        List<Object> tokens, List<TextSegment> textSegments, List<MatchRange> matches) {
    log.debug(
            "Applying redactions to {} tokens with {} text segments and {} matches",
            tokens.size(),
            textSegments.size(),
            matches.size());

    List<Object> newTokens = new ArrayList<>(tokens);

    // Group matches by the index of every segment they overlap.
    Map<Integer, List<MatchRange>> matchesBySegment = new HashMap<>();
    for (MatchRange match : matches) {
        for (int i = 0; i < textSegments.size(); i++) {
            TextSegment segment = textSegments.get(i);
            int overlapStart = Math.max(match.getStartPos(), segment.getStartPos());
            int overlapEnd = Math.min(match.getEndPos(), segment.getEndPos());
            if (overlapStart < overlapEnd) {
                matchesBySegment.computeIfAbsent(i, k -> new ArrayList<>()).add(match);
            }
        }
    }
    log.debug("Grouped matches by segment: {} segments affected", matchesBySegment.size());

    // Build the modification tasks. Matches are also indexed by operand token index (unique
    // per segment) so the apply loop needs no linear search through the segment list.
    List<ModificationTask> tasks = new ArrayList<>();
    Map<Integer, List<MatchRange>> matchesByTokenIndex = new HashMap<>();
    for (Map.Entry<Integer, List<MatchRange>> entry : matchesBySegment.entrySet()) {
        int segmentIndex = entry.getKey();
        List<MatchRange> segmentMatches = entry.getValue();
        TextSegment segment = textSegments.get(segmentIndex);
        String operatorName = segment.getOperatorName();
        log.debug(
                "Creating modification task for segment {} with {} matches",
                segmentIndex,
                segmentMatches.size());

        if ("Tj".equals(operatorName) || "'".equals(operatorName)) {
            String newText = applyRedactionsToSegmentText(segment, segmentMatches);
            float adjustment = 0;
            try {
                adjustment = calculateWidthAdjustment(segment, segmentMatches);
            } catch (IOException | IllegalArgumentException e) {
                // Font metrics are unavailable (I/O error or a glyph the font cannot encode).
                // Continue with no spacing compensation rather than leaving the text intact.
                log.warn("Failed to calculate width adjustment for redaction.", e);
            }
            tasks.add(new ModificationTask(segment, newText, adjustment));
        } else if ("TJ".equals(operatorName)) {
            tasks.add(new ModificationTask(segment, null, 0));
        } else {
            continue;
        }
        matchesByTokenIndex.put(segment.getTokenIndex(), segmentMatches);
    }

    // Apply from the last token to the first so edits never disturb indices still to be used.
    tasks.sort(
            (a, b) ->
                    Integer.compare(
                            b.getSegment().getTokenIndex(), a.getSegment().getTokenIndex()));

    log.debug("Applying {} modification tasks", tasks.size());
    for (ModificationTask task : tasks) {
        List<MatchRange> segmentMatches =
                matchesByTokenIndex.getOrDefault(
                        task.getSegment().getTokenIndex(), Collections.emptyList());
        modifyTokenForRedaction(
                newTokens,
                task.getSegment(),
                task.getNewText(),
                task.getAdjustment(),
                segmentMatches);
    }

    log.debug("Completed applying redactions to tokens");
    return newTokens;
}
// A pending edit of one text operand, queued by applyRedactionsToTokens and applied later
// in descending token-index order.
@Data
@AllArgsConstructor
private static class ModificationTask {
// Segment whose operand token is to be rewritten.
private TextSegment segment;
private String newText; // Only for Tj
private float adjustment; // Only for Tj
}
/**
 * Returns the segment text with every character covered by a match replaced by a placeholder.
 *
 * <p>All match offsets are resolved against the ORIGINAL text before anything is rewritten.
 * Editing a buffer in place while still using original offsets corrupts the result as soon as
 * a placeholder differs in length from the text it replaces and the segment holds more than
 * one match.
 *
 * @param segment segment whose text is redacted
 * @param matches hits overlapping the segment, as offsets into the concatenated page text
 * @return the redacted text
 */
private String applyRedactionsToSegmentText(TextSegment segment, List<MatchRange> matches) {
    String text = segment.getText();

    // Mark redacted characters; overlapping or adjacent matches simply merge.
    boolean[] redacted = new boolean[text.length()];
    for (MatchRange match : matches) {
        int from = Math.max(0, match.getStartPos() - segment.getStartPos());
        int to = Math.min(text.length(), match.getEndPos() - segment.getStartPos());
        for (int i = from; i < to; i++) {
            redacted[i] = true;
        }
    }

    StringBuilder result = new StringBuilder(text.length());
    int pos = 0;
    while (pos < text.length()) {
        if (!redacted[pos]) {
            result.append(text.charAt(pos));
            pos++;
            continue;
        }
        int runEnd = pos;
        while (runEnd < text.length() && redacted[runEnd]) {
            runEnd++;
        }
        result.append(createPlaceholder(text.substring(pos, runEnd)));
        pos = runEnd;
    }
    return result.toString();
}
/**
 * Computes how much narrower the segment becomes once the matched characters are replaced by
 * their placeholders, as returned by the font's string-width metrics scaled by the font size.
 *
 * <p>Overlapping matches (for example the terms "John" and "John Smith") are merged first so
 * that each redacted character is measured exactly once.
 *
 * @param segment segment being redacted; its font may be null
 * @param matches hits overlapping the segment, as offsets into the concatenated page text
 * @return original width minus placeholder width; 0 when the segment has no font
 * @throws IOException if the font cannot provide width information
 */
private float calculateWidthAdjustment(TextSegment segment, List<MatchRange> matches)
        throws IOException {
    PDFont font = segment.getFont();
    if (font == null) {
        return 0;
    }
    String text = segment.getText();

    boolean[] redacted = new boolean[text.length()];
    for (MatchRange match : matches) {
        int from = Math.max(0, match.getStartPos() - segment.getStartPos());
        int to = Math.min(text.length(), match.getEndPos() - segment.getStartPos());
        for (int i = from; i < to; i++) {
            redacted[i] = true;
        }
    }

    float totalOriginalWidth = 0;
    float totalPlaceholderWidth = 0;
    int pos = 0;
    while (pos < text.length()) {
        if (!redacted[pos]) {
            pos++;
            continue;
        }
        int runEnd = pos;
        while (runEnd < text.length() && redacted[runEnd]) {
            runEnd++;
        }
        String originalPart = text.substring(pos, runEnd);
        String placeholderPart = createPlaceholder(originalPart);
        totalOriginalWidth +=
                font.getStringWidth(originalPart) / FONT_SCALE_FACTOR * segment.getFontSize();
        totalPlaceholderWidth +=
                font.getStringWidth(placeholderPart)
                        / FONT_SCALE_FACTOR
                        * segment.getFontSize();
        pos = runEnd;
    }
    return totalOriginalWidth - totalPlaceholderWidth;
}
private void modifyTokenForRedaction(
List<Object> tokens,
TextSegment segment,
String newText,
float adjustment,
List<MatchRange> matches) {
log.debug(
"Modifying token at index {} for segment '{}' with operator {}",
segment.getTokenIndex(),
segment.getText(),
segment.getOperatorName());
if (segment.getTokenIndex() < 0 || segment.getTokenIndex() >= tokens.size()) {
log.debug(
"Token index {} out of bounds (0-{})",
segment.getTokenIndex(),
tokens.size() - 1);
return;
}
Object token = tokens.get(segment.getTokenIndex());
String operatorName = segment.getOperatorName();
try {
if (("Tj".equals(operatorName) || "'".equals(operatorName))
&& token instanceof COSString) {
log.debug("Modifying Tj/quote operator with adjustment {}", adjustment);
if (Math.abs(adjustment) < PRECISION_THRESHOLD) {
tokens.set(segment.getTokenIndex(), new COSString(newText));
} else {
COSArray newArray = new COSArray();
newArray.add(new COSString(newText));
if (segment.getFontSize() > 0) {
float kerning = -FONT_SCALE_FACTOR * adjustment / segment.getFontSize();
newArray.add(new org.apache.pdfbox.cos.COSFloat(kerning));
log.debug("Applied kerning adjustment: {}", kerning);
}
tokens.set(segment.getTokenIndex(), newArray);
int operatorIndex = segment.getTokenIndex() + 1;
if (operatorIndex < tokens.size()
&& tokens.get(operatorIndex) instanceof Operator op
&& op.getName().equals(operatorName)) {
tokens.set(operatorIndex, Operator.getOperator("TJ"));
log.debug("Changed operator from {} to TJ", operatorName);
}
}
} else if ("TJ".equals(operatorName) && token instanceof COSArray) {
log.debug("Modifying TJ operator array");
COSArray newArray = createRedactedTJArray((COSArray) token, segment, matches);
tokens.set(segment.getTokenIndex(), newArray);
}
} catch (IOException e) {
log.warn("Failed to modify token for redaction: {}", e.getMessage(), e);
} }
} }
/**
 * Builds a replacement operand array for a {@code TJ} operator in which every string element
 * has its matched characters replaced, followed by a kerning number that compensates for the
 * removed width when that width can be measured.
 *
 * <p>Match offsets are resolved against each element's ORIGINAL text before rewriting, so
 * several matches inside one element cannot shift each other. Elements that no match touches
 * are copied through unchanged instead of being re-encoded from their decoded text. If font
 * metrics are unavailable the text is still removed and only the compensation is skipped.
 *
 * @param originalArray operand array of the {@code TJ} operator
 * @param segment segment describing the array's position in the concatenated page text
 * @param matches hits overlapping the segment, as offsets into the concatenated page text
 * @return the redacted array
 * @throws IOException declared for interface compatibility
 */
private COSArray createRedactedTJArray(
        COSArray originalArray, TextSegment segment, List<MatchRange> matches)
        throws IOException {
    COSArray newArray = new COSArray();
    int textOffsetInSegment = 0;

    for (COSBase element : originalArray) {
        if (!(element instanceof COSString cosString)) {
            newArray.add(element);
            continue;
        }

        String originalText = cosString.getString();
        int stringStartInPage = segment.getStartPos() + textOffsetInSegment;
        textOffsetInSegment += originalText.length();

        boolean[] redacted = new boolean[originalText.length()];
        boolean modified = false;
        for (MatchRange match : matches) {
            int from = Math.max(0, match.getStartPos() - stringStartInPage);
            int to = Math.min(originalText.length(), match.getEndPos() - stringStartInPage);
            for (int i = from; i < to; i++) {
                redacted[i] = true;
                modified = true;
            }
        }

        if (!modified) {
            // Untouched element: keep the original object so its bytes are preserved.
            newArray.add(element);
            continue;
        }

        StringBuilder newText = new StringBuilder(originalText.length());
        int pos = 0;
        while (pos < originalText.length()) {
            if (!redacted[pos]) {
                newText.append(originalText.charAt(pos));
                pos++;
                continue;
            }
            int runEnd = pos;
            while (runEnd < originalText.length() && redacted[runEnd]) {
                runEnd++;
            }
            newText.append(createPlaceholder(originalText.substring(pos, runEnd)));
            pos = runEnd;
        }
        String modifiedString = newText.toString();
        newArray.add(new COSString(modifiedString));

        if (segment.getFont() != null && segment.getFontSize() > 0) {
            try {
                float originalWidth =
                        segment.getFont().getStringWidth(originalText)
                                / FONT_SCALE_FACTOR
                                * segment.getFontSize();
                float modifiedWidth =
                        segment.getFont().getStringWidth(modifiedString)
                                / FONT_SCALE_FACTOR
                                * segment.getFontSize();
                float adjustment = originalWidth - modifiedWidth;
                if (Math.abs(adjustment) > PRECISION_THRESHOLD) {
                    float kerning = -FONT_SCALE_FACTOR * adjustment / segment.getFontSize();
                    newArray.add(new org.apache.pdfbox.cos.COSFloat(kerning));
                }
            } catch (IOException | IllegalArgumentException e) {
                // Metrics are unavailable for this text; the redaction itself must still
                // happen, so only the spacing compensation is skipped.
                log.warn("Failed to calculate width adjustment for redaction.", e);
            }
        }
    }
    return newArray;
}
/**
 * Returns the text carried by the operand of a text-showing operator.
 *
 * @param token operand token that precedes the operator
 * @param operatorName operator name; {@code Tj} and {@code '} take a string operand,
 *     {@code TJ} takes an array whose string elements are concatenated
 * @return the operand text, or an empty string for any other operator or operand type
 */
private String extractTextFromToken(Object token, String operatorName) {
    switch (operatorName) {
        case "Tj":
        case "'":
            return token instanceof COSString literal ? literal.getString() : "";
        case "TJ":
            if (!(token instanceof COSArray operands)) {
                return "";
            }
            // Numeric elements are spacing adjustments and carry no text.
            StringBuilder collected = new StringBuilder();
            for (COSBase part : operands) {
                if (part instanceof COSString piece) {
                    collected.append(piece.getString());
                }
            }
            return collected.toString();
        default:
            return "";
    }
}
/**
 * Produces the replacement for a redacted piece of text: the placeholder unit repeated once
 * per character of the original.
 *
 * @param originalWord text being redacted; may be null or empty
 * @return {@code originalWord} itself when it is null or empty, otherwise the placeholder
 */
private String createPlaceholder(String originalWord) {
    boolean nothingToMask = originalWord == null || originalWord.isEmpty();
    return nothingToMask ? originalWord : "".repeat(originalWord.length());
}
/**
 * Serialises the given tokens into a new stream and installs it as the page's contents.
 *
 * @param document document that owns the new stream
 * @param page page whose contents are replaced
 * @param tokens content-stream tokens to write
 * @throws IOException if the stream cannot be created or written
 */
private void writeFilteredContentStream(PDDocument document, PDPage page, List<Object> tokens)
        throws IOException {
    log.debug("Writing filtered content stream with {} tokens", tokens.size());

    PDStream replacement = new PDStream(document);
    try (var sink = replacement.createOutputStream()) {
        new ContentStreamWriter(sink).writeTokens(tokens);
    }
    page.setContents(replacement);

    log.debug("Successfully wrote filtered content stream");
}
/**
 * Reports whether {@code opName} is listed in {@code TEXT_SHOWING_OPERATORS}.
 *
 * @param opName the operator name to look up
 * @return {@code true} when the name is contained in {@code TEXT_SHOWING_OPERATORS}
 */
private boolean isTextShowingOperator(String opName) {
    final boolean showsText = TEXT_SHOWING_OPERATORS.contains(opName);
    return showsText;
}
} }

View File

@ -6,102 +6,109 @@ import java.util.List;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition; import org.apache.pdfbox.text.TextPosition;
import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.PDFText; import stirling.software.SPDF.model.PDFText;
@Slf4j
public class TextFinder extends PDFTextStripper { public class TextFinder extends PDFTextStripper {
private final String searchText; private final String searchTerm;
private final boolean useRegex; private final boolean useRegex;
private final boolean wholeWordSearch; private final boolean wholeWordSearch;
private final List<PDFText> textOccurrences = new ArrayList<>(); private final List<PDFText> foundTexts = new ArrayList<>();
public TextFinder(String searchText, boolean useRegex, boolean wholeWordSearch) private final List<TextPosition> pageTextPositions = new ArrayList<>();
private final StringBuilder pageTextBuilder = new StringBuilder();
public TextFinder(String searchTerm, boolean useRegex, boolean wholeWordSearch)
throws IOException { throws IOException {
this.searchText = searchText.toLowerCase(); super();
this.searchTerm = searchTerm;
this.useRegex = useRegex; this.useRegex = useRegex;
this.wholeWordSearch = wholeWordSearch; this.wholeWordSearch = wholeWordSearch;
setSortByPosition(true); this.setWordSeparator(" ");
} }
private List<MatchInfo> findOccurrencesInText(String searchText, String content) { @Override
List<MatchInfo> matches = new ArrayList<>(); protected void startPage(PDPage page) {
pageTextPositions.clear();
Pattern pattern; pageTextBuilder.setLength(0);
if (useRegex) {
// Use regex-based search
pattern =
wholeWordSearch
? Pattern.compile("\\b" + searchText + "\\b")
: Pattern.compile(searchText);
} else {
// Use normal text search
pattern =
wholeWordSearch
? Pattern.compile("\\b" + Pattern.quote(searchText) + "\\b")
: Pattern.compile(Pattern.quote(searchText));
}
Matcher matcher = pattern.matcher(content);
while (matcher.find()) {
matches.add(new MatchInfo(matcher.start(), matcher.end() - matcher.start()));
}
return matches;
} }
@Override @Override
protected void writeString(String text, List<TextPosition> textPositions) { protected void writeString(String text, List<TextPosition> textPositions) {
for (MatchInfo match : findOccurrencesInText(searchText, text.toLowerCase())) { pageTextBuilder.append(text);
int index = match.startIndex; pageTextPositions.addAll(textPositions);
if (index + match.matchLength <= textPositions.size()) { }
// Initial values based on the first character
TextPosition first = textPositions.get(index);
float minX = first.getX();
float minY = first.getY();
float maxX = first.getX() + first.getWidth();
float maxY = first.getY() + first.getHeight();
// Loop over the rest of the characters and adjust bounding box values @Override
for (int i = index; i < index + match.matchLength; i++) { protected void writeWordSeparator() {
TextPosition position = textPositions.get(i); pageTextBuilder.append(getWordSeparator());
minX = Math.min(minX, position.getX()); pageTextPositions.add(null); // Placeholder for separator
minY = Math.min(minY, position.getY()); }
maxX = Math.max(maxX, position.getX() + position.getWidth());
maxY = Math.max(maxY, position.getY() + position.getHeight()); @Override
protected void writeLineSeparator() {
pageTextBuilder.append(getLineSeparator());
pageTextPositions.add(null); // Placeholder for separator
}
@Override
protected void endPage(PDPage page) {
String text = pageTextBuilder.toString();
if (text.isEmpty() || this.searchTerm == null || this.searchTerm.isEmpty()) {
return;
}
String processedSearchTerm = this.searchTerm.trim();
String regex = this.useRegex ? processedSearchTerm : "\\Q" + processedSearchTerm + "\\E";
if (this.wholeWordSearch) {
regex = "\\b" + regex + "\\b";
}
Pattern pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
Matcher matcher = pattern.matcher(text);
while (matcher.find()) {
int matchStart = matcher.start();
int matchEnd = matcher.end();
float minX = Float.MAX_VALUE;
float minY = Float.MAX_VALUE;
float maxX = Float.MIN_VALUE;
float maxY = Float.MIN_VALUE;
boolean foundPosition = false;
for (int i = matchStart; i < matchEnd; i++) {
if (i >= pageTextPositions.size()) {
continue;
} }
TextPosition pos = pageTextPositions.get(i);
if (pos != null) {
foundPosition = true;
minX = Math.min(minX, pos.getX());
maxX = Math.max(maxX, pos.getX() + pos.getWidth());
minY = Math.min(minY, pos.getY() - pos.getHeight());
maxY = Math.max(maxY, pos.getY());
}
}
textOccurrences.add( if (foundPosition) {
new PDFText(getCurrentPageNo() - 1, minX, minY, maxX, maxY, text)); foundTexts.add(
new PDFText(
this.getCurrentPageNo() - 1,
minX,
minY,
maxX,
maxY,
matcher.group()));
} }
} }
} }
public List<PDFText> getTextLocations(PDDocument document) throws Exception { public List<PDFText> getFoundTexts() {
this.getText(document); return foundTexts;
log.debug(
"Found "
+ textOccurrences.size()
+ " occurrences of '"
+ searchText
+ "' in the document.");
return textOccurrences;
}
private class MatchInfo {
int startIndex;
int matchLength;
MatchInfo(int startIndex, int matchLength) {
this.startIndex = startIndex;
this.matchLength = matchLength;
}
} }
} }