From d7fb66bb7924cd91b6de1c1bf844122f4115b140 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20Sz=C3=BCcs?= Date: Sat, 12 Jul 2025 11:03:33 +0200 Subject: [PATCH] feat: auto-redact to support text removal on true PDFs --- .../api/security/RedactController.java | 729 ++++++++++++++++-- .../software/SPDF/pdf/TextFinder.java | 149 ++-- 2 files changed, 753 insertions(+), 125 deletions(-) diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java index 88d271cfb..b647ea511 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java @@ -1,19 +1,33 @@ package stirling.software.SPDF.controller.api.security; -import java.awt.*; +import java.awt.Color; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.apache.pdfbox.contentstream.operator.Operator; +import org.apache.pdfbox.cos.COSArray; +import org.apache.pdfbox.cos.COSBase; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.cos.COSString; +import org.apache.pdfbox.pdfparser.PDFStreamParser; +import org.apache.pdfbox.pdfwriter.ContentStreamWriter; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPageContentStream; import org.apache.pdfbox.pdmodel.PDPageTree; +import org.apache.pdfbox.pdmodel.PDResources; import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.pdmodel.common.PDStream; +import org.apache.pdfbox.pdmodel.font.PDFont; import org.springframework.http.ResponseEntity; import org.springframework.web.bind.WebDataBinder; import org.springframework.web.bind.annotation.InitBinder; @@ -27,6 +41,8 @@ import io.github.pixee.security.Filenames; import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.tags.Tag; +import lombok.AllArgsConstructor; +import lombok.Data; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; @@ -48,6 +64,13 @@ import stirling.software.common.util.propertyeditor.StringToArrayListPropertyEdi @RequiredArgsConstructor public class RedactController { + private static final float DEFAULT_TEXT_PADDING_MULTIPLIER = 0.3f; + private static final float PRECISION_THRESHOLD = 1e-3f; + private static final int FONT_SCALE_FACTOR = 1000; + + // Text showing operators + private static final Set TEXT_SHOWING_OPERATORS = Set.of("Tj", "TJ", "'", "\""); + private final CustomPDFDocumentFactory pdfDocumentFactory; @InitBinder @@ -65,17 +88,30 @@ public class RedactController { + " Type:SISO") public ResponseEntity redactPDF(@ModelAttribute ManualRedactPdfRequest request) throws IOException { + log.debug( + "Starting manual redaction for file: {}", + request.getFileInput().getOriginalFilename()); + MultipartFile file = request.getFileInput(); List redactionAreas = request.getRedactions(); + log.debug( + "Processing {} redaction areas", + redactionAreas != null ? redactionAreas.size() : 0); + PDDocument document = pdfDocumentFactory.load(file); + log.debug("Loaded PDF document with {} pages", document.getNumberOfPages()); PDPageTree allPages = document.getDocumentCatalog().getPages(); + log.debug("Starting page redactions"); redactPages(request, document, allPages); + + log.debug("Starting area redactions"); redactAreas(redactionAreas, document, allPages); if (Boolean.TRUE.equals(request.getConvertPDFToImage())) { + log.debug("Converting PDF to image format"); PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document); document.close(); document = convertedPdf; @@ -86,6 +122,8 @@ public class RedactController { document.close(); byte[] pdfContent = baos.toByteArray(); + log.debug("Manual redaction completed. Output PDF size: {} bytes", pdfContent.length); + return WebResponseUtils.bytesToWebResponse( pdfContent, Filenames.toSimpleFileName(file.getOriginalFilename()).replaceFirst("[.][^.]+$", "") @@ -95,17 +133,30 @@ public class RedactController { private void redactAreas( List redactionAreas, PDDocument document, PDPageTree allPages) throws IOException { + log.debug("Processing redaction areas"); + // Group redaction areas by page Map> redactionsByPage = new HashMap<>(); // Process and validate each redaction area for (RedactionArea redactionArea : redactionAreas) { + log.debug( + "Validating redaction area on page {}: x={}, y={}, width={}, height={}", + redactionArea.getPage(), + redactionArea.getX(), + redactionArea.getY(), + redactionArea.getWidth(), + redactionArea.getHeight()); + if (redactionArea.getPage() == null || redactionArea.getPage() <= 0 || redactionArea.getHeight() == null || redactionArea.getHeight() <= 0.0D || redactionArea.getWidth() == null - || redactionArea.getWidth() <= 0.0D) continue; + || redactionArea.getWidth() <= 0.0D) { + log.debug("Skipping invalid redaction area: {}", redactionArea); + continue; + } // Group by page number redactionsByPage @@ -113,70 +164,151 @@ public class RedactController { .add(redactionArea); } + log.debug("Grouped redactions by page: {} pages affected", redactionsByPage.size()); + // Process each page only once for (Map.Entry> entry : redactionsByPage.entrySet()) { Integer pageNumber = entry.getKey(); List areasForPage = entry.getValue(); + log.debug( + "Processing page {} with {} redaction areas", pageNumber, areasForPage.size()); + if (pageNumber > allPages.getCount()) { + log.debug( + "Skipping page {} - out of bounds (total pages: {})", + pageNumber, + allPages.getCount()); continue; // Skip if page number is out of bounds } PDPage page = allPages.get(pageNumber - 1); - PDRectangle box = page.getBBox(); - // Create only one content stream per page - PDPageContentStream contentStream = + // Create only one content stream per page to draw all redaction boxes + try (PDPageContentStream contentStream = new PDPageContentStream( - document, page, PDPageContentStream.AppendMode.APPEND, true, true); + document, page, PDPageContentStream.AppendMode.APPEND, true, true)) { - // Process all redactions for this page - for (RedactionArea redactionArea : areasForPage) { - Color redactColor = decodeOrDefault(redactionArea.getColor(), Color.BLACK); - contentStream.setNonStrokingColor(redactColor); + // Process all redactions for this page + for (RedactionArea redactionArea : areasForPage) { + Color redactColor = decodeOrDefault(redactionArea.getColor()); + log.debug( + "Applying redaction with color {} at ({}, {}) size {}x{}", + redactColor, + redactionArea.getX(), + redactionArea.getY(), + redactionArea.getWidth(), + redactionArea.getHeight()); - float x = redactionArea.getX().floatValue(); - float y = redactionArea.getY().floatValue(); - float width = redactionArea.getWidth().floatValue(); - float height = redactionArea.getHeight().floatValue(); + contentStream.setNonStrokingColor(redactColor); - contentStream.addRect(x, box.getHeight() - y - height, width, height); - contentStream.fill(); + float x = redactionArea.getX().floatValue(); + float y = redactionArea.getY().floatValue(); + float width = redactionArea.getWidth().floatValue(); + float height = redactionArea.getHeight().floatValue(); + + // The y-coordinate needs to be transformed from a top-left origin to a + // bottom-left origin. + float pdfY = page.getBBox().getHeight() - y - height; + + contentStream.addRect(x, pdfY, width, height); + contentStream.fill(); + } } - - contentStream.close(); } + + log.debug("Completed redaction areas processing"); } private void redactPages( ManualRedactPdfRequest request, PDDocument document, PDPageTree allPages) throws IOException { - Color redactColor = decodeOrDefault(request.getPageRedactionColor(), Color.BLACK); + log.debug("Starting page redactions"); + + Color redactColor = decodeOrDefault(request.getPageRedactionColor()); List pageNumbers = getPageNumbers(request, allPages.getCount()); + + log.debug("Redacting {} pages with color {}", pageNumbers.size(), redactColor); + for (Integer pageNumber : pageNumbers) { + log.debug("Redacting entire page {}", pageNumber + 1); + PDPage page = allPages.get(pageNumber); - PDPageContentStream contentStream = + try (PDPageContentStream contentStream = new PDPageContentStream( - document, page, PDPageContentStream.AppendMode.APPEND, true, true); - contentStream.setNonStrokingColor(redactColor); + document, page, PDPageContentStream.AppendMode.APPEND, true, true)) { + contentStream.setNonStrokingColor(redactColor); - PDRectangle box = page.getBBox(); + PDRectangle box = page.getBBox(); + log.debug( + "Page {} dimensions: {}x{}", + pageNumber + 1, + box.getWidth(), + box.getHeight()); - contentStream.addRect(0, 0, box.getWidth(), box.getHeight()); - contentStream.fill(); - contentStream.close(); + contentStream.addRect(0, 0, box.getWidth(), box.getHeight()); + contentStream.fill(); + } } + + log.debug("Completed page redactions"); } - private Color decodeOrDefault(String hex, Color defaultColor) { - try { - if (hex != null && !hex.startsWith("#")) { - hex = "#" + hex; + private void redactFoundText( + PDDocument document, List blocks, float customPadding, Color redactColor) + throws IOException { + log.debug( + "Redacting {} text blocks with padding {} and color {}", + blocks.size(), + customPadding, + redactColor); + + var allPages = document.getDocumentCatalog().getPages(); + + for (PDFText block : blocks) { + log.debug( + "Redacting text block on page {}: '{}' at ({}, {}) to ({}, {})", + block.getPageIndex() + 1, + block.getText(), + block.getX1(), + block.getY1(), + block.getX2(), + block.getY2()); + + var page = allPages.get(block.getPageIndex()); + try (PDPageContentStream contentStream = + new PDPageContentStream( + document, page, PDPageContentStream.AppendMode.APPEND, true, true)) { + contentStream.setNonStrokingColor(redactColor); + float padding = + (block.getY2() - block.getY1()) * DEFAULT_TEXT_PADDING_MULTIPLIER + + customPadding; + PDRectangle pageBox = page.getBBox(); + contentStream.addRect( + block.getX1(), + pageBox.getHeight() - block.getY2() - padding, + block.getX2() - block.getX1(), + block.getY2() - block.getY1() + 2 * padding); + contentStream.fill(); } - return Color.decode(hex); - } catch (Exception e) { - return defaultColor; + } + + log.debug("Completed text block redactions"); + } + + private Color decodeOrDefault(String hex) { + if (hex == null) { + return Color.BLACK; + } + + String colorString = hex.startsWith("#") ? hex : "#" + hex; + + try { + return Color.decode(colorString); + } catch (NumberFormatException e) { + log.warn("Invalid color string '{}'. Using default color BLACK.", hex); + return Color.BLACK; } } @@ -198,6 +330,10 @@ public class RedactController { + " Input:PDF, Output:PDF, Type:SISO") public ResponseEntity redactPdf(@ModelAttribute RedactPdfRequest request) throws Exception { + log.debug( + "Starting auto-redaction for file: {}", + request.getFileInput().getOriginalFilename()); + MultipartFile file = request.getFileInput(); String listOfTextString = request.getListOfText(); boolean useRegex = Boolean.TRUE.equals(request.getUseRegex()); @@ -206,28 +342,80 @@ public class RedactController { float customPadding = request.getCustomPadding(); boolean convertPDFToImage = Boolean.TRUE.equals(request.getConvertPDFToImage()); + log.debug( + "Auto-redaction parameters: useRegex={}, wholeWordSearch={}, customPadding={}, convertToImage={}", + useRegex, + wholeWordSearchBool, + customPadding, + convertPDFToImage); + String[] listOfText = listOfTextString.split("\n"); + log.debug("Searching for {} text patterns", listOfText.length); + PDDocument document = pdfDocumentFactory.load(file); + log.debug("Loaded PDF document with {} pages", document.getNumberOfPages()); Color redactColor; try { - if (!colorString.startsWith("#")) { + if (colorString != null && !colorString.startsWith("#")) { colorString = "#" + colorString; } redactColor = Color.decode(colorString); + log.debug("Using redaction color: {}", redactColor); } catch (NumberFormatException e) { log.warn("Invalid color string provided. Using default color BLACK for redaction."); redactColor = Color.BLACK; } + // Step 1: Find all text locations for all search terms + log.debug("Step 1: Finding all text locations"); + Map> allFoundTextsByPage = new HashMap<>(); + Set allSearchTerms = new HashSet<>(); for (String text : listOfText) { text = text.trim(); + if (text.isEmpty()) continue; + + log.debug("Searching for text pattern: '{}'", text); + allSearchTerms.add(text); TextFinder textFinder = new TextFinder(text, useRegex, wholeWordSearchBool); - List foundTexts = textFinder.getTextLocations(document); - redactFoundText(document, foundTexts, customPadding, redactColor); + textFinder.getText(document); + List foundTexts = textFinder.getFoundTexts(); + + log.debug("Found {} instances of pattern '{}'", foundTexts.size(), text); + + for (PDFText found : foundTexts) { + allFoundTextsByPage + .computeIfAbsent(found.getPageIndex(), k -> new ArrayList<>()) + .add(found); + } + } + + log.debug("Total pages with found text: {}", allFoundTextsByPage.size()); + + // Step 2: Process each page + log.debug("Step 2: Processing each page for text replacement"); + for (PDPage page : document.getPages()) { + // Replace text content + List filteredTokens = + createTokensWithoutTargetText( + page, allSearchTerms, useRegex, wholeWordSearchBool); + writeFilteredContentStream(document, page, filteredTokens); + } + + // Draw redaction boxes for all found texts + List allFoundTexts = new ArrayList<>(); + for (List pageTexts : allFoundTextsByPage.values()) { + allFoundTexts.addAll(pageTexts); + } + + log.debug("Drawing redaction boxes for {} total found texts", allFoundTexts.size()); + + if (!allFoundTexts.isEmpty()) { + redactFoundText(document, allFoundTexts, customPadding, redactColor); } if (convertPDFToImage) { + log.debug("Converting redacted PDF to image format"); PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document); document.close(); document = convertedPdf; @@ -238,32 +426,465 @@ public class RedactController { document.close(); byte[] pdfContent = baos.toByteArray(); + log.debug("Auto-redaction completed. Output PDF size: {} bytes", pdfContent.length); + return WebResponseUtils.bytesToWebResponse( pdfContent, Filenames.toSimpleFileName(file.getOriginalFilename()).replaceFirst("[.][^.]+$", "") + "_redacted.pdf"); } - private void redactFoundText( - PDDocument document, List blocks, float customPadding, Color redactColor) + private List createTokensWithoutTargetText( + PDPage page, Set targetWords, boolean useRegex, boolean wholeWordSearch) throws IOException { - var allPages = document.getDocumentCatalog().getPages(); + log.debug( + "Creating tokens without target text for page, searching for {} words", + targetWords.size()); - for (PDFText block : blocks) { - var page = allPages.get(block.getPageIndex()); - PDPageContentStream contentStream = - new PDPageContentStream( - document, page, PDPageContentStream.AppendMode.APPEND, true, true); - contentStream.setNonStrokingColor(redactColor); - float padding = (block.getY2() - block.getY1()) * 0.3f + customPadding; - PDRectangle pageBox = page.getBBox(); - contentStream.addRect( - block.getX1(), - pageBox.getHeight() - block.getY1() - padding, - block.getX2() - block.getX1(), - block.getY2() - block.getY1() + 2 * padding); - contentStream.fill(); - contentStream.close(); + PDFStreamParser parser = new PDFStreamParser(page); + List tokens = new ArrayList<>(); + Object token; + while ((token = parser.parseNextToken()) != null) { + tokens.add(token); + } + + log.debug("Parsed {} tokens from page content stream", tokens.size()); + + List textSegments = extractTextSegments(page, tokens); + log.debug("Extracted {} text segments", textSegments.size()); + + String completeText = buildCompleteText(textSegments); + log.debug("Built complete text of {} characters", completeText.length()); + + List matches = + findAllMatches(completeText, targetWords, useRegex, wholeWordSearch); + log.debug("Found {} matches in complete text", matches.size()); + + return applyRedactionsToTokens(tokens, textSegments, matches); + } + + @Data + private static class GraphicsState { + private PDFont font = null; + private float fontSize = 0; + } + + @Data + @AllArgsConstructor + private static class TextSegment { + private int tokenIndex; + private String operatorName; + private String text; + private int startPos; + private int endPos; + private PDFont font; + private float fontSize; + } + + @Data + @AllArgsConstructor + private static class MatchRange { + private int startPos; + private int endPos; + } + + private List extractTextSegments(PDPage page, List tokens) + throws IOException { + log.debug("Extracting text segments from {} tokens", tokens.size()); + + List segments = new ArrayList<>(); + int currentTextPos = 0; + GraphicsState graphicsState = new GraphicsState(); + PDResources resources = page.getResources(); + + for (int i = 0; i < tokens.size(); i++) { + Object currentToken = tokens.get(i); + + if (currentToken instanceof Operator op) { + String opName = op.getName(); + + if ("Tf".equals(opName) && i >= 2) { + try { + COSName fontName = (COSName) tokens.get(i - 2); + COSBase fontSizeBase = (COSBase) tokens.get(i - 1); + if (fontSizeBase instanceof org.apache.pdfbox.cos.COSNumber cosNumber) { + graphicsState.setFont(resources.getFont(fontName)); + graphicsState.setFontSize(cosNumber.floatValue()); + log.debug( + "Updated font state: {} size {}", + fontName.getName(), + graphicsState.getFontSize()); + } + } catch (ClassCastException | IOException e) { + log.warn("Failed to update font state", e); + } + } + + if (isTextShowingOperator(opName) && i > 0) { + String textContent = extractTextFromToken(tokens.get(i - 1), opName); + if (!textContent.isEmpty()) { + log.debug( + "Found text segment '{}' at position {} with operator {}", + textContent, + currentTextPos, + opName); + segments.add( + new TextSegment( + i - 1, + opName, + textContent, + currentTextPos, + currentTextPos + textContent.length(), + graphicsState.font, + graphicsState.fontSize)); + currentTextPos += textContent.length(); + } + } + } + } + + log.debug("Extracted {} text segments from page", segments.size()); + return segments; + } + + private String buildCompleteText(List segments) { + StringBuilder sb = new StringBuilder(); + for (TextSegment segment : segments) { + sb.append(segment.text); + } + return sb.toString(); + } + + private List findAllMatches( + String completeText, + Set targetWords, + boolean useRegex, + boolean wholeWordSearch) { + log.debug( + "Finding matches in text of {} characters for {} target words", + completeText.length(), + targetWords.size()); + + List matches = new ArrayList<>(); + + for (String target : targetWords) { + log.debug("Searching for pattern: '{}'", target); + + String patternString = useRegex ? target : Pattern.quote(target); + if (wholeWordSearch) { + patternString = "\\b" + patternString + "\\b"; + } + Pattern pattern = Pattern.compile(patternString, Pattern.CASE_INSENSITIVE); + Matcher matcher = pattern.matcher(completeText); + + int matchCount = 0; + while (matcher.find()) { + matches.add(new MatchRange(matcher.start(), matcher.end())); + matchCount++; + log.debug( + "Found match for '{}' at positions {}-{}", + target, + matcher.start(), + matcher.end()); + } + + log.debug("Total matches for '{}': {}", target, matchCount); + } + + matches.sort((a, b) -> Integer.compare(a.startPos, b.startPos)); + log.debug("Found {} total matches across all patterns", matches.size()); + + return matches; + } + + private List applyRedactionsToTokens( + List tokens, List textSegments, List matches) { + log.debug( + "Applying redactions to {} tokens with {} text segments and {} matches", + tokens.size(), + textSegments.size(), + matches.size()); + + List newTokens = new ArrayList<>(tokens); + + // Group matches by segment to pass to modification methods + Map> matchesBySegment = new HashMap<>(); + for (MatchRange match : matches) { + for (int i = 0; i < textSegments.size(); i++) { + TextSegment segment = textSegments.get(i); + int overlapStart = Math.max(match.startPos, segment.startPos); + int overlapEnd = Math.min(match.endPos, segment.endPos); + if (overlapStart < overlapEnd) { + matchesBySegment.computeIfAbsent(i, k -> new ArrayList<>()).add(match); + } + } + } + + log.debug("Grouped matches by segment: {} segments affected", matchesBySegment.size()); + + // Create a list of modification tasks + List tasks = new ArrayList<>(); + for (Map.Entry> entry : matchesBySegment.entrySet()) { + int segmentIndex = entry.getKey(); + List segmentMatches = entry.getValue(); + TextSegment segment = textSegments.get(segmentIndex); + + log.debug( + "Creating modification task for segment {} with {} matches", + segmentIndex, + segmentMatches.size()); + + if ("Tj".equals(segment.operatorName) || "'".equals(segment.operatorName)) { + String newText = applyRedactionsToSegmentText(segment, segmentMatches); + try { + float adjustment = calculateWidthAdjustment(segment, segmentMatches); + tasks.add(new ModificationTask(segment, newText, adjustment)); + } catch (IOException e) { + log.warn("Failed to calculate width adjustment for redaction.", e); + } + } else if ("TJ".equals(segment.operatorName)) { + tasks.add(new ModificationTask(segment, null, 0)); + } + } + + // Sort tasks by token index in descending order to avoid index shifting issues + tasks.sort((a, b) -> Integer.compare(b.segment.tokenIndex, a.segment.tokenIndex)); + + log.debug("Applying {} modification tasks", tasks.size()); + + // Apply modifications + for (ModificationTask task : tasks) { + List segmentMatches = + matchesBySegment.getOrDefault( + textSegments.indexOf(task.segment), Collections.emptyList()); + modifyTokenForRedaction( + newTokens, task.segment, task.newText, task.adjustment, segmentMatches); + } + + log.debug("Completed applying redactions to tokens"); + return newTokens; + } + + @Data + @AllArgsConstructor + private static class ModificationTask { + private TextSegment segment; + private String newText; // Only for Tj + private float adjustment; // Only for Tj + } + + private String applyRedactionsToSegmentText(TextSegment segment, List matches) { + String text = segment.getText(); + StringBuilder result = new StringBuilder(text); + + for (MatchRange match : matches) { + int segmentStart = Math.max(0, match.getStartPos() - segment.getStartPos()); + int segmentEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos()); + + if (segmentStart >= 0 && segmentStart < text.length() && segmentEnd > segmentStart) { + String placeholder = createPlaceholder(text.substring(segmentStart, segmentEnd)); + result.replace(segmentStart, segmentEnd, placeholder); + } + } + + return result.toString(); + } + + private float calculateWidthAdjustment(TextSegment segment, List matches) + throws IOException { + float totalOriginalWidth = 0; + float totalPlaceholderWidth = 0; + String text = segment.getText(); + + for (MatchRange match : matches) { + int segmentStart = Math.max(0, match.getStartPos() - segment.getStartPos()); + int segmentEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos()); + + if (segmentStart >= 0 && segmentStart < text.length() && segmentEnd > segmentStart) { + String originalPart = text.substring(segmentStart, segmentEnd); + String placeholderPart = createPlaceholder(originalPart); + + if (segment.getFont() != null) { + totalOriginalWidth += + segment.getFont().getStringWidth(originalPart) + / FONT_SCALE_FACTOR + * segment.getFontSize(); + totalPlaceholderWidth += + segment.getFont().getStringWidth(placeholderPart) + / FONT_SCALE_FACTOR + * segment.getFontSize(); + } + } + } + return totalOriginalWidth - totalPlaceholderWidth; + } + + private void modifyTokenForRedaction( + List tokens, + TextSegment segment, + String newText, + float adjustment, + List matches) { + log.debug( + "Modifying token at index {} for segment '{}' with operator {}", + segment.getTokenIndex(), + segment.getText(), + segment.getOperatorName()); + + if (segment.getTokenIndex() < 0 || segment.getTokenIndex() >= tokens.size()) { + log.debug( + "Token index {} out of bounds (0-{})", + segment.getTokenIndex(), + tokens.size() - 1); + return; + } + + Object token = tokens.get(segment.getTokenIndex()); + String operatorName = segment.getOperatorName(); + + try { + if (("Tj".equals(operatorName) || "'".equals(operatorName)) + && token instanceof COSString) { + log.debug("Modifying Tj/quote operator with adjustment {}", adjustment); + + if (Math.abs(adjustment) < PRECISION_THRESHOLD) { + tokens.set(segment.getTokenIndex(), new COSString(newText)); + } else { + COSArray newArray = new COSArray(); + newArray.add(new COSString(newText)); + if (segment.getFontSize() > 0) { + float kerning = -FONT_SCALE_FACTOR * adjustment / segment.getFontSize(); + newArray.add(new org.apache.pdfbox.cos.COSFloat(kerning)); + log.debug("Applied kerning adjustment: {}", kerning); + } + tokens.set(segment.getTokenIndex(), newArray); + + int operatorIndex = segment.getTokenIndex() + 1; + if (operatorIndex < tokens.size() + && tokens.get(operatorIndex) instanceof Operator op + && op.getName().equals(operatorName)) { + tokens.set(operatorIndex, Operator.getOperator("TJ")); + log.debug("Changed operator from {} to TJ", operatorName); + } + } + } else if ("TJ".equals(operatorName) && token instanceof COSArray) { + log.debug("Modifying TJ operator array"); + COSArray newArray = createRedactedTJArray((COSArray) token, segment, matches); + tokens.set(segment.getTokenIndex(), newArray); + } + } catch (IOException e) { + log.warn("Failed to modify token for redaction: {}", e.getMessage(), e); } } + + private COSArray createRedactedTJArray( + COSArray originalArray, TextSegment segment, List matches) + throws IOException { + COSArray newArray = new COSArray(); + int textOffsetInSegment = 0; + + for (COSBase element : originalArray) { + if (element instanceof COSString cosString) { + String originalText = cosString.getString(); + StringBuilder newText = new StringBuilder(originalText); + boolean modified = false; + + for (MatchRange match : matches) { + int stringStartInPage = segment.getStartPos() + textOffsetInSegment; + int stringEndInPage = stringStartInPage + originalText.length(); + + int overlapStart = Math.max(match.getStartPos(), stringStartInPage); + int overlapEnd = Math.min(match.getEndPos(), stringEndInPage); + + if (overlapStart < overlapEnd) { + modified = true; + int redactionStartInString = overlapStart - stringStartInPage; + int redactionEndInString = overlapEnd - stringStartInPage; + if (redactionStartInString >= 0 + && redactionEndInString <= originalText.length()) { + String placeholder = + createPlaceholder( + originalText.substring( + redactionStartInString, redactionEndInString)); + newText.replace( + redactionStartInString, redactionEndInString, placeholder); + } + } + } + + String modifiedString = newText.toString(); + newArray.add(new COSString(modifiedString)); + + if (modified && segment.getFont() != null && segment.getFontSize() > 0) { + float originalWidth = + segment.getFont().getStringWidth(originalText) + / FONT_SCALE_FACTOR + * segment.getFontSize(); + float modifiedWidth = + segment.getFont().getStringWidth(modifiedString) + / FONT_SCALE_FACTOR + * segment.getFontSize(); + float adjustment = originalWidth - modifiedWidth; + if (Math.abs(adjustment) > PRECISION_THRESHOLD) { + float kerning = -FONT_SCALE_FACTOR * adjustment / segment.getFontSize(); + newArray.add(new org.apache.pdfbox.cos.COSFloat(kerning)); + } + } + + textOffsetInSegment += originalText.length(); + } else { + newArray.add(element); + } + } + return newArray; + } + + private String extractTextFromToken(Object token, String operatorName) { + return switch (operatorName) { + case "Tj", "'" -> { + if (token instanceof COSString cosString) { + yield cosString.getString(); + } + yield ""; + } + case "TJ" -> { + if (token instanceof COSArray cosArray) { + StringBuilder sb = new StringBuilder(); + for (COSBase element : cosArray) { + if (element instanceof COSString cosString) { + sb.append(cosString.getString()); + } + } + yield sb.toString(); + } + yield ""; + } + default -> ""; + }; + } + + private String createPlaceholder(String originalWord) { + if (originalWord == null || originalWord.isEmpty()) { + return originalWord; + } + return "".repeat(originalWord.length()); + } + + private void writeFilteredContentStream(PDDocument document, PDPage page, List tokens) + throws IOException { + log.debug("Writing filtered content stream with {} tokens", tokens.size()); + + PDStream newStream = new PDStream(document); + try (var out = newStream.createOutputStream()) { + ContentStreamWriter writer = new ContentStreamWriter(out); + writer.writeTokens(tokens); + } + page.setContents(newStream); + + log.debug("Successfully wrote filtered content stream"); + } + + private boolean isTextShowingOperator(String opName) { + return TEXT_SHOWING_OPERATORS.contains(opName); + } } diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/pdf/TextFinder.java b/stirling-pdf/src/main/java/stirling/software/SPDF/pdf/TextFinder.java index 4119b3eac..d9ddf3b91 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/pdf/TextFinder.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/pdf/TextFinder.java @@ -6,102 +6,109 @@ import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; -import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.TextPosition; -import lombok.extern.slf4j.Slf4j; - import stirling.software.SPDF.model.PDFText; -@Slf4j public class TextFinder extends PDFTextStripper { - private final String searchText; + private final String searchTerm; private final boolean useRegex; private final boolean wholeWordSearch; - private final List textOccurrences = new ArrayList<>(); + private final List foundTexts = new ArrayList<>(); - public TextFinder(String searchText, boolean useRegex, boolean wholeWordSearch) + private final List pageTextPositions = new ArrayList<>(); + private final StringBuilder pageTextBuilder = new StringBuilder(); + + public TextFinder(String searchTerm, boolean useRegex, boolean wholeWordSearch) throws IOException { - this.searchText = searchText.toLowerCase(); + super(); + this.searchTerm = searchTerm; this.useRegex = useRegex; this.wholeWordSearch = wholeWordSearch; - setSortByPosition(true); + this.setWordSeparator(" "); } - private List findOccurrencesInText(String searchText, String content) { - List matches = new ArrayList<>(); - - Pattern pattern; - - if (useRegex) { - // Use regex-based search - pattern = - wholeWordSearch - ? Pattern.compile("\\b" + searchText + "\\b") - : Pattern.compile(searchText); - } else { - // Use normal text search - pattern = - wholeWordSearch - ? Pattern.compile("\\b" + Pattern.quote(searchText) + "\\b") - : Pattern.compile(Pattern.quote(searchText)); - } - - Matcher matcher = pattern.matcher(content); - while (matcher.find()) { - matches.add(new MatchInfo(matcher.start(), matcher.end() - matcher.start())); - } - return matches; + @Override + protected void startPage(PDPage page) { + pageTextPositions.clear(); + pageTextBuilder.setLength(0); } @Override protected void writeString(String text, List textPositions) { - for (MatchInfo match : findOccurrencesInText(searchText, text.toLowerCase())) { - int index = match.startIndex; - if (index + match.matchLength <= textPositions.size()) { - // Initial values based on the first character - TextPosition first = textPositions.get(index); - float minX = first.getX(); - float minY = first.getY(); - float maxX = first.getX() + first.getWidth(); - float maxY = first.getY() + first.getHeight(); + pageTextBuilder.append(text); + pageTextPositions.addAll(textPositions); + } - // Loop over the rest of the characters and adjust bounding box values - for (int i = index; i < index + match.matchLength; i++) { - TextPosition position = textPositions.get(i); - minX = Math.min(minX, position.getX()); - minY = Math.min(minY, position.getY()); - maxX = Math.max(maxX, position.getX() + position.getWidth()); - maxY = Math.max(maxY, position.getY() + position.getHeight()); + @Override + protected void writeWordSeparator() { + pageTextBuilder.append(getWordSeparator()); + pageTextPositions.add(null); // Placeholder for separator + } + + @Override + protected void writeLineSeparator() { + pageTextBuilder.append(getLineSeparator()); + pageTextPositions.add(null); // Placeholder for separator + } + + @Override + protected void endPage(PDPage page) { + String text = pageTextBuilder.toString(); + if (text.isEmpty() || this.searchTerm == null || this.searchTerm.isEmpty()) { + return; + } + + String processedSearchTerm = this.searchTerm.trim(); + String regex = this.useRegex ? processedSearchTerm : "\\Q" + processedSearchTerm + "\\E"; + if (this.wholeWordSearch) { + regex = "\\b" + regex + "\\b"; + } + + Pattern pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); + Matcher matcher = pattern.matcher(text); + + while (matcher.find()) { + int matchStart = matcher.start(); + int matchEnd = matcher.end(); + + float minX = Float.MAX_VALUE; + float minY = Float.MAX_VALUE; + float maxX = Float.MIN_VALUE; + float maxY = Float.MIN_VALUE; + boolean foundPosition = false; + + for (int i = matchStart; i < matchEnd; i++) { + if (i >= pageTextPositions.size()) { + continue; } + TextPosition pos = pageTextPositions.get(i); + if (pos != null) { + foundPosition = true; + minX = Math.min(minX, pos.getX()); + maxX = Math.max(maxX, pos.getX() + pos.getWidth()); + minY = Math.min(minY, pos.getY() - pos.getHeight()); + maxY = Math.max(maxY, pos.getY()); + } + } - textOccurrences.add( - new PDFText(getCurrentPageNo() - 1, minX, minY, maxX, maxY, text)); + if (foundPosition) { + foundTexts.add( + new PDFText( + this.getCurrentPageNo() - 1, + minX, + minY, + maxX, + maxY, + matcher.group())); } } } - public List getTextLocations(PDDocument document) throws Exception { - this.getText(document); - log.debug( - "Found " - + textOccurrences.size() - + " occurrences of '" - + searchText - + "' in the document."); - - return textOccurrences; - } - - private class MatchInfo { - int startIndex; - int matchLength; - - MatchInfo(int startIndex, int matchLength) { - this.startIndex = startIndex; - this.matchLength = matchLength; - } + public List getFoundTexts() { + return foundTexts; } }