feat: auto-redact to support text removal on true PDFs

2025-08-06 13:48:58 +02:00 · 2025-07-12 11:03:33 +02:00 · 2025-07-12 11:03:33 +02:00 · d7fb66bb79
commit d7fb66bb79
parent bbf5d5f6d4
2 changed files with 753 additions and 125 deletions
--- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java
+++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java
@ -1,19 +1,33 @@
 package stirling.software.SPDF.controller.api.security;
-import java.awt.*;
+import java.awt.Color;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import org.apache.pdfbox.contentstream.operator.Operator;
 import org.apache.pdfbox.cos.COSArray;
 import org.apache.pdfbox.cos.COSBase;
 import org.apache.pdfbox.cos.COSName;
 import org.apache.pdfbox.cos.COSString;
 import org.apache.pdfbox.pdfparser.PDFStreamParser;
 import org.apache.pdfbox.pdfwriter.ContentStreamWriter;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.PDPageContentStream;
 import org.apache.pdfbox.pdmodel.PDPageTree;
 import org.apache.pdfbox.pdmodel.PDResources;
 import org.apache.pdfbox.pdmodel.common.PDRectangle;
 import org.apache.pdfbox.pdmodel.common.PDStream;
 import org.apache.pdfbox.pdmodel.font.PDFont;
 import org.springframework.http.ResponseEntity;
 import org.springframework.web.bind.WebDataBinder;
 import org.springframework.web.bind.annotation.InitBinder;
@ -27,6 +41,8 @@ import io.github.pixee.security.Filenames;
 import io.swagger.v3.oas.annotations.Operation;
 import io.swagger.v3.oas.annotations.tags.Tag;
 import lombok.AllArgsConstructor;
 import lombok.Data;
 import lombok.RequiredArgsConstructor;
 import lombok.extern.slf4j.Slf4j;
@ -48,6 +64,13 @@ import stirling.software.common.util.propertyeditor.StringToArrayListPropertyEdi
@RequiredArgsConstructor
 public class RedactController {
    // Fraction of a found text block's height (y2 - y1) that is added as padding, on top of the
    // caller-supplied padding, when a redaction rectangle is drawn over that block.
    private static final float DEFAULT_TEXT_PADDING_MULTIPLIER = 0.3f;
    // Width adjustments whose absolute value is below this threshold are treated as zero when a
    // text token is rewritten (no kerning entry is emitted for them).
    private static final float PRECISION_THRESHOLD = 1e-3f;
    // Scale between font string-width units and font-size-scaled units:
    // width = getStringWidth(text) / FONT_SCALE_FACTOR * fontSize.
    private static final int FONT_SCALE_FACTOR = 1000;
    // Content-stream operator names that are considered text-showing operators.
    // NOTE(review): presumably consulted by isTextShowingOperator(...) - confirm.
    private static final Set<String> TEXT_SHOWING_OPERATORS = Set.of("Tj", "TJ", "'", "\"");
    // Factory used to load uploaded files into PDDocument instances.
    private final CustomPDFDocumentFactory pdfDocumentFactory;
    @InitBinder
@ -65,17 +88,30 @@ public class RedactController {
                            + " Type:SISO")
    public ResponseEntity<byte[]> redactPDF(@ModelAttribute ManualRedactPdfRequest request)
            throws IOException {
        log.debug(
                "Starting manual redaction for file: {}",
                request.getFileInput().getOriginalFilename());
        MultipartFile file = request.getFileInput();
        List<RedactionArea> redactionAreas = request.getRedactions();
        log.debug(
                "Processing {} redaction areas",
                redactionAreas != null ? redactionAreas.size() : 0);
        PDDocument document = pdfDocumentFactory.load(file);
        log.debug("Loaded PDF document with {} pages", document.getNumberOfPages());
        PDPageTree allPages = document.getDocumentCatalog().getPages();
        log.debug("Starting page redactions");
        redactPages(request, document, allPages);
        log.debug("Starting area redactions");
        redactAreas(redactionAreas, document, allPages);
        if (Boolean.TRUE.equals(request.getConvertPDFToImage())) {
            log.debug("Converting PDF to image format");
            PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document);
            document.close();
            document = convertedPdf;
@ -86,6 +122,8 @@ public class RedactController {
        document.close();
        byte[] pdfContent = baos.toByteArray();
        log.debug("Manual redaction completed. Output PDF size: {} bytes", pdfContent.length);
        return WebResponseUtils.bytesToWebResponse(
                pdfContent,
                Filenames.toSimpleFileName(file.getOriginalFilename()).replaceFirst("[.][^.]+$", "")
@ -95,17 +133,30 @@ public class RedactController {
    private void redactAreas(
            List<RedactionArea> redactionAreas, PDDocument document, PDPageTree allPages)
            throws IOException {
        log.debug("Processing redaction areas");
        // Group redaction areas by page
        Map<Integer, List<RedactionArea>> redactionsByPage = new HashMap<>();
        // Process and validate each redaction area
        for (RedactionArea redactionArea : redactionAreas) {
            log.debug(
                    "Validating redaction area on page {}: x={}, y={}, width={}, height={}",
                    redactionArea.getPage(),
                    redactionArea.getX(),
                    redactionArea.getY(),
                    redactionArea.getWidth(),
                    redactionArea.getHeight());
            if (redactionArea.getPage() == null
                    || redactionArea.getPage() <= 0
                    || redactionArea.getHeight() == null
                    || redactionArea.getHeight() <= 0.0D
                    || redactionArea.getWidth() == null
-                    || redactionArea.getWidth() <= 0.0D) continue;
+                    || redactionArea.getWidth() <= 0.0D) {
                log.debug("Skipping invalid redaction area: {}", redactionArea);
                continue;
            }
            // Group by page number
            redactionsByPage
@ -113,70 +164,151 @@ public class RedactController {
                    .add(redactionArea);
        }
        log.debug("Grouped redactions by page: {} pages affected", redactionsByPage.size());
        // Process each page only once
        for (Map.Entry<Integer, List<RedactionArea>> entry : redactionsByPage.entrySet()) {
            Integer pageNumber = entry.getKey();
            List<RedactionArea> areasForPage = entry.getValue();
            log.debug(
                    "Processing page {} with {} redaction areas", pageNumber, areasForPage.size());
            if (pageNumber > allPages.getCount()) {
                log.debug(
                        "Skipping page {} - out of bounds (total pages: {})",
                        pageNumber,
                        allPages.getCount());
                continue; // Skip if page number is out of bounds
            }
            PDPage page = allPages.get(pageNumber - 1);
            PDRectangle box = page.getBBox();
-            // Create only one content stream per page
+            // Create only one content stream per page to draw all redaction boxes
-            PDPageContentStream contentStream =
+            try (PDPageContentStream contentStream =
                    new PDPageContentStream(
-                            document, page, PDPageContentStream.AppendMode.APPEND, true, true);
+                            document, page, PDPageContentStream.AppendMode.APPEND, true, true)) {
-            // Process all redactions for this page
+                // Process all redactions for this page
-            for (RedactionArea redactionArea : areasForPage) {
+                for (RedactionArea redactionArea : areasForPage) {
-                Color redactColor = decodeOrDefault(redactionArea.getColor(), Color.BLACK);
+                    Color redactColor = decodeOrDefault(redactionArea.getColor());
-                contentStream.setNonStrokingColor(redactColor);
+                    log.debug(
                            "Applying redaction with color {} at ({}, {}) size {}x{}",
                            redactColor,
                            redactionArea.getX(),
                            redactionArea.getY(),
                            redactionArea.getWidth(),
                            redactionArea.getHeight());
-                float x = redactionArea.getX().floatValue();
+                    contentStream.setNonStrokingColor(redactColor);
                float y = redactionArea.getY().floatValue();
                float width = redactionArea.getWidth().floatValue();
                float height = redactionArea.getHeight().floatValue();
-                contentStream.addRect(x, box.getHeight() - y - height, width, height);
+                    float x = redactionArea.getX().floatValue();
-                contentStream.fill();
+                    float y = redactionArea.getY().floatValue();
                    float width = redactionArea.getWidth().floatValue();
                    float height = redactionArea.getHeight().floatValue();
                    // The y-coordinate needs to be transformed from a top-left origin to a
                    // bottom-left origin.
                    float pdfY = page.getBBox().getHeight() - y - height;
                    contentStream.addRect(x, pdfY, width, height);
                    contentStream.fill();
                }
            }
            contentStream.close();
        }
        log.debug("Completed redaction areas processing");
    }
    private void redactPages(
            ManualRedactPdfRequest request, PDDocument document, PDPageTree allPages)
            throws IOException {
-        Color redactColor = decodeOrDefault(request.getPageRedactionColor(), Color.BLACK);
+        log.debug("Starting page redactions");
        Color redactColor = decodeOrDefault(request.getPageRedactionColor());
        List<Integer> pageNumbers = getPageNumbers(request, allPages.getCount());
        log.debug("Redacting {} pages with color {}", pageNumbers.size(), redactColor);
        for (Integer pageNumber : pageNumbers) {
            log.debug("Redacting entire page {}", pageNumber + 1);
            PDPage page = allPages.get(pageNumber);
-            PDPageContentStream contentStream =
+            try (PDPageContentStream contentStream =
                    new PDPageContentStream(
-                            document, page, PDPageContentStream.AppendMode.APPEND, true, true);
+                            document, page, PDPageContentStream.AppendMode.APPEND, true, true)) {
-            contentStream.setNonStrokingColor(redactColor);
+                contentStream.setNonStrokingColor(redactColor);
-            PDRectangle box = page.getBBox();
+                PDRectangle box = page.getBBox();
                log.debug(
                        "Page {} dimensions: {}x{}",
                        pageNumber + 1,
                        box.getWidth(),
                        box.getHeight());
-            contentStream.addRect(0, 0, box.getWidth(), box.getHeight());
+                contentStream.addRect(0, 0, box.getWidth(), box.getHeight());
-            contentStream.fill();
+                contentStream.fill();
-            contentStream.close();
+            }
        }
        log.debug("Completed page redactions");
    }
-    private Color decodeOrDefault(String hex, Color defaultColor) {
+    private void redactFoundText(
-        try {
+            PDDocument document, List<PDFText> blocks, float customPadding, Color redactColor)
-            if (hex != null && !hex.startsWith("#")) {
+            throws IOException {
-                hex = "#" + hex;
+        log.debug(
                "Redacting {} text blocks with padding {} and color {}",
                blocks.size(),
                customPadding,
                redactColor);
        var allPages = document.getDocumentCatalog().getPages();
        for (PDFText block : blocks) {
            log.debug(
                    "Redacting text block on page {}: '{}' at ({}, {}) to ({}, {})",
                    block.getPageIndex() + 1,
                    block.getText(),
                    block.getX1(),
                    block.getY1(),
                    block.getX2(),
                    block.getY2());
            var page = allPages.get(block.getPageIndex());
            try (PDPageContentStream contentStream =
                    new PDPageContentStream(
                            document, page, PDPageContentStream.AppendMode.APPEND, true, true)) {
                contentStream.setNonStrokingColor(redactColor);
                float padding =
                        (block.getY2() - block.getY1()) * DEFAULT_TEXT_PADDING_MULTIPLIER
                                + customPadding;
                PDRectangle pageBox = page.getBBox();
                contentStream.addRect(
                        block.getX1(),
                        pageBox.getHeight() - block.getY2() - padding,
                        block.getX2() - block.getX1(),
                        block.getY2() - block.getY1() + 2 * padding);
                contentStream.fill();
            }
-            return Color.decode(hex);
+        }
-        } catch (Exception e) {
+
-            return defaultColor;
+        log.debug("Completed text block redactions");
    }
    /**
     * Decodes a hexadecimal colour string, accepted with or without a leading {@code #}.
     *
     * @param hex the colour string to decode; may be {@code null}
     * @return the decoded colour, or {@link Color#BLACK} when {@code hex} is {@code null} or is
     *     not a decodable number
     */
    private Color decodeOrDefault(String hex) {
        if (hex != null) {
            String normalized = hex;
            if (!normalized.startsWith("#")) {
                normalized = "#" + normalized;
            }
            try {
                return Color.decode(normalized);
            } catch (NumberFormatException ex) {
                // Log the value exactly as supplied by the caller, not the normalized form.
                log.warn("Invalid color string '{}'. Using default color BLACK.", hex);
            }
        }
        return Color.BLACK;
    }
@ -198,6 +330,10 @@ public class RedactController {
                            + " Input:PDF, Output:PDF, Type:SISO")
    public ResponseEntity<byte[]> redactPdf(@ModelAttribute RedactPdfRequest request)
            throws Exception {
        log.debug(
                "Starting auto-redaction for file: {}",
                request.getFileInput().getOriginalFilename());
        MultipartFile file = request.getFileInput();
        String listOfTextString = request.getListOfText();
        boolean useRegex = Boolean.TRUE.equals(request.getUseRegex());
@ -206,28 +342,80 @@ public class RedactController {
        float customPadding = request.getCustomPadding();
        boolean convertPDFToImage = Boolean.TRUE.equals(request.getConvertPDFToImage());
        log.debug(
                "Auto-redaction parameters: useRegex={}, wholeWordSearch={}, customPadding={}, convertToImage={}",
                useRegex,
                wholeWordSearchBool,
                customPadding,
                convertPDFToImage);
        String[] listOfText = listOfTextString.split("\n");
        log.debug("Searching for {} text patterns", listOfText.length);
        PDDocument document = pdfDocumentFactory.load(file);
        log.debug("Loaded PDF document with {} pages", document.getNumberOfPages());
        Color redactColor;
        try {
-            if (!colorString.startsWith("#")) {
+            if (colorString != null && !colorString.startsWith("#")) {
                colorString = "#" + colorString;
            }
            redactColor = Color.decode(colorString);
            log.debug("Using redaction color: {}", redactColor);
        } catch (NumberFormatException e) {
            log.warn("Invalid color string provided. Using default color BLACK for redaction.");
            redactColor = Color.BLACK;
        }
        // Step 1: Find all text locations for all search terms
        log.debug("Step 1: Finding all text locations");
        Map<Integer, List<PDFText>> allFoundTextsByPage = new HashMap<>();
        Set<String> allSearchTerms = new HashSet<>();
        for (String text : listOfText) {
            text = text.trim();
            if (text.isEmpty()) continue;
            log.debug("Searching for text pattern: '{}'", text);
            allSearchTerms.add(text);
            TextFinder textFinder = new TextFinder(text, useRegex, wholeWordSearchBool);
-            List<PDFText> foundTexts = textFinder.getTextLocations(document);
+            textFinder.getText(document);
-            redactFoundText(document, foundTexts, customPadding, redactColor);
+            List<PDFText> foundTexts = textFinder.getFoundTexts();
            log.debug("Found {} instances of pattern '{}'", foundTexts.size(), text);
            for (PDFText found : foundTexts) {
                allFoundTextsByPage
                        .computeIfAbsent(found.getPageIndex(), k -> new ArrayList<>())
                        .add(found);
            }
        }
        log.debug("Total pages with found text: {}", allFoundTextsByPage.size());
        // Step 2: Process each page
        log.debug("Step 2: Processing each page for text replacement");
        for (PDPage page : document.getPages()) {
            // Replace text content
            List<Object> filteredTokens =
                    createTokensWithoutTargetText(
                            page, allSearchTerms, useRegex, wholeWordSearchBool);
            writeFilteredContentStream(document, page, filteredTokens);
        }
        // Draw redaction boxes for all found texts
        List<PDFText> allFoundTexts = new ArrayList<>();
        for (List<PDFText> pageTexts : allFoundTextsByPage.values()) {
            allFoundTexts.addAll(pageTexts);
        }
        log.debug("Drawing redaction boxes for {} total found texts", allFoundTexts.size());
        if (!allFoundTexts.isEmpty()) {
            redactFoundText(document, allFoundTexts, customPadding, redactColor);
        }
        if (convertPDFToImage) {
            log.debug("Converting redacted PDF to image format");
            PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document);
            document.close();
            document = convertedPdf;
@ -238,32 +426,465 @@ public class RedactController {
        document.close();
        byte[] pdfContent = baos.toByteArray();
        log.debug("Auto-redaction completed. Output PDF size: {} bytes", pdfContent.length);
        return WebResponseUtils.bytesToWebResponse(
                pdfContent,
                Filenames.toSimpleFileName(file.getOriginalFilename()).replaceFirst("[.][^.]+$", "")
                        + "_redacted.pdf");
    }
-    private void redactFoundText(
+    private List<Object> createTokensWithoutTargetText(
-            PDDocument document, List<PDFText> blocks, float customPadding, Color redactColor)
+            PDPage page, Set<String> targetWords, boolean useRegex, boolean wholeWordSearch)
            throws IOException {
-        var allPages = document.getDocumentCatalog().getPages();
+        log.debug(
                "Creating tokens without target text for page, searching for {} words",
                targetWords.size());
-        for (PDFText block : blocks) {
+        PDFStreamParser parser = new PDFStreamParser(page);
-            var page = allPages.get(block.getPageIndex());
+        List<Object> tokens = new ArrayList<>();
-            PDPageContentStream contentStream =
+        Object token;
-                    new PDPageContentStream(
+        while ((token = parser.parseNextToken()) != null) {
-                            document, page, PDPageContentStream.AppendMode.APPEND, true, true);
+            tokens.add(token);
-            contentStream.setNonStrokingColor(redactColor);
+        }
-            float padding = (block.getY2() - block.getY1()) * 0.3f + customPadding;
+
-            PDRectangle pageBox = page.getBBox();
+        log.debug("Parsed {} tokens from page content stream", tokens.size());
-            contentStream.addRect(
+
-                    block.getX1(),
+        List<TextSegment> textSegments = extractTextSegments(page, tokens);
-                    pageBox.getHeight() - block.getY1() - padding,
+        log.debug("Extracted {} text segments", textSegments.size());
-                    block.getX2() - block.getX1(),
+
-                    block.getY2() - block.getY1() + 2 * padding);
+        String completeText = buildCompleteText(textSegments);
-            contentStream.fill();
+        log.debug("Built complete text of {} characters", completeText.length());
-            contentStream.close();
+
        List<MatchRange> matches =
                findAllMatches(completeText, targetWords, useRegex, wholeWordSearch);
        log.debug("Found {} matches in complete text", matches.size());
        return applyRedactionsToTokens(tokens, textSegments, matches);
    }
    /**
     * Mutable holder for the font and font size most recently selected while scanning a page's
     * content-stream tokens. Both values start at their Java defaults ({@code null} / {@code 0})
     * until a font-selection operator has been processed.
     */
    @Data
    private static class GraphicsState {
        private PDFont font;
        private float fontSize;
    }
    /**
     * One piece of text shown by a single text-showing operator, together with its position in
     * the concatenated text of the page and the font state that was active when it was found.
     *
     * <p>The field order defines the parameter order of the Lombok-generated all-args
     * constructor and must not be changed.
     */
    @Data
    @AllArgsConstructor
    private static class TextSegment {
        // Index, in the page's token list, of the operand token that carries the text
        // (the token immediately before the operator).
        private int tokenIndex;
        // Name of the text-showing operator that consumes the operand (e.g. "Tj", "TJ", "'").
        private String operatorName;
        // Text extracted from the operand token.
        private String text;
        // Offset of the first character of this segment in the concatenated page text.
        private int startPos;
        // Offset just past the last character of this segment (startPos + text.length()).
        private int endPos;
        // Font recorded in the graphics state when the segment was found; may be null.
        private PDFont font;
        // Font size recorded in the graphics state when the segment was found.
        private float fontSize;
    }
    /**
     * Character range of one search hit inside the concatenated page text.
     *
     * <p>The field order defines the parameter order of the Lombok-generated all-args
     * constructor and must not be changed.
     */
    @Data
    @AllArgsConstructor
    private static class MatchRange {
        // Offset of the first matched character (value of Matcher.start()).
        private int startPos;
        // Offset just past the last matched character (value of Matcher.end()).
        private int endPos;
    }
    /**
     * Scans the parsed content-stream tokens of a page and records one {@link TextSegment} for
     * every text-showing operator whose operand yields non-empty text.
     *
     * <p>Segments are positioned as if their texts were concatenated back to back with no
     * separator: each segment starts where the previous one ended. The font and font size set
     * by the most recent well-formed {@code Tf} operator are stored with each segment.
     *
     * @param page the page the tokens were parsed from; its resources are used to resolve fonts
     * @param tokens the parsed tokens of the page's content stream, in stream order
     * @return the text segments in the order they occur in {@code tokens}
     * @throws IOException declared for callers; font lookup failures are logged, not rethrown
     */
    private List<TextSegment> extractTextSegments(PDPage page, List<Object> tokens)
            throws IOException {
        log.debug("Extracting text segments from {} tokens", tokens.size());
        List<TextSegment> segments = new ArrayList<>();
        int currentTextPos = 0;
        GraphicsState graphicsState = new GraphicsState();
        // A page without a resource dictionary yields null here; fonts then stay unresolved.
        PDResources resources = page.getResources();
        for (int i = 0; i < tokens.size(); i++) {
            if (!(tokens.get(i) instanceof Operator op)) {
                continue;
            }
            String opName = op.getName();
            if ("Tf".equals(opName) && i >= 2) {
                // Tf takes two operands: the font resource name and the font size.
                if (tokens.get(i - 2) instanceof COSName fontName
                        && tokens.get(i - 1) instanceof org.apache.pdfbox.cos.COSNumber cosNumber) {
                    try {
                        // An unresolvable font is recorded as null so that no widths are
                        // computed with a stale font from an earlier Tf.
                        PDFont font = resources != null ? resources.getFont(fontName) : null;
                        graphicsState.setFont(font);
                        graphicsState.setFontSize(cosNumber.floatValue());
                        log.debug(
                                "Updated font state: {} size {}",
                                fontName.getName(),
                                graphicsState.getFontSize());
                    } catch (IOException e) {
                        log.warn("Failed to update font state", e);
                    }
                } else {
                    log.warn(
                            "Failed to update font state: unexpected Tf operands before token {}",
                            i);
                }
            }
            if (isTextShowingOperator(opName) && i > 0) {
                // The operand that carries the text is the token right before the operator.
                String textContent = extractTextFromToken(tokens.get(i - 1), opName);
                if (!textContent.isEmpty()) {
                    log.debug(
                            "Found text segment '{}' at position {} with operator {}",
                            textContent,
                            currentTextPos,
                            opName);
                    segments.add(
                            new TextSegment(
                                    i - 1,
                                    opName,
                                    textContent,
                                    currentTextPos,
                                    currentTextPos + textContent.length(),
                                    graphicsState.getFont(),
                                    graphicsState.getFontSize()));
                    currentTextPos += textContent.length();
                }
            }
        }
        log.debug("Extracted {} text segments from page", segments.size());
        return segments;
    }
    /**
     * Joins the text of all segments, in list order and without any separator, into one string.
     *
     * @param segments the segments whose text is concatenated
     * @return the concatenated text; empty when {@code segments} is empty
     */
    private String buildCompleteText(List<TextSegment> segments) {
        StringBuilder joined = new StringBuilder();
        segments.forEach(part -> joined.append(part.text));
        return joined.toString();
    }
    /**
     * Finds every occurrence of every target word in the given text, ignoring case.
     *
     * <p>Each target is used as a regular expression when {@code useRegex} is set and as a
     * quoted literal otherwise; with {@code wholeWordSearch} the expression is wrapped in
     * {@code \b} word boundaries.
     *
     * @param completeText the text to search
     * @param targetWords the search terms
     * @param useRegex whether the terms are regular expressions
     * @param wholeWordSearch whether matches must be delimited by word boundaries
     * @return all match ranges, sorted by ascending start position
     */
    private List<MatchRange> findAllMatches(
            String completeText,
            Set<String> targetWords,
            boolean useRegex,
            boolean wholeWordSearch) {
        log.debug(
                "Finding matches in text of {} characters for {} target words",
                completeText.length(),
                targetWords.size());
        List<MatchRange> found = new ArrayList<>();
        for (String target : targetWords) {
            log.debug("Searching for pattern: '{}'", target);
            String regex = useRegex ? target : Pattern.quote(target);
            if (wholeWordSearch) {
                regex = "\\b" + regex + "\\b";
            }
            Matcher hits =
                    Pattern.compile(regex, Pattern.CASE_INSENSITIVE).matcher(completeText);
            // The per-target count is derived from how much the shared list grows.
            int sizeBefore = found.size();
            while (hits.find()) {
                int from = hits.start();
                int to = hits.end();
                found.add(new MatchRange(from, to));
                log.debug("Found match for '{}' at positions {}-{}", target, from, to);
            }
            log.debug("Total matches for '{}': {}", target, found.size() - sizeBefore);
        }
        found.sort((left, right) -> Integer.compare(left.startPos, right.startPos));
        log.debug("Found {} total matches across all patterns", found.size());
        return found;
    }
    /**
     * Produces a copy of the page's token list in which the text covered by the given matches
     * has been replaced.
     *
     * <p>Matches are first assigned to every text segment they overlap. Each affected segment
     * then yields one modification task, and the tasks are applied from the highest token index
     * to the lowest so that a modification can never disturb the index of a task still pending.
     *
     * <p>If the width compensation for a segment cannot be computed, the segment is still
     * redacted, just without a kerning adjustment: leaving matched text in the stream would
     * defeat the redaction.
     *
     * @param tokens the original content-stream tokens; this list is not modified
     * @param textSegments the text segments extracted from {@code tokens}
     * @param matches the match ranges, expressed as offsets into the concatenated segment text
     * @return a new token list with the redactions applied
     */
    private List<Object> applyRedactionsToTokens(
            List<Object> tokens, List<TextSegment> textSegments, List<MatchRange> matches) {
        log.debug(
                "Applying redactions to {} tokens with {} text segments and {} matches",
                tokens.size(),
                textSegments.size(),
                matches.size());
        List<Object> newTokens = new ArrayList<>(tokens);
        // Group matches by the index of every segment they overlap
        Map<Integer, List<MatchRange>> matchesBySegment = new HashMap<>();
        for (MatchRange match : matches) {
            for (int i = 0; i < textSegments.size(); i++) {
                TextSegment segment = textSegments.get(i);
                int overlapStart = Math.max(match.startPos, segment.startPos);
                int overlapEnd = Math.min(match.endPos, segment.endPos);
                if (overlapStart < overlapEnd) {
                    matchesBySegment.computeIfAbsent(i, k -> new ArrayList<>()).add(match);
                }
            }
        }
        log.debug("Grouped matches by segment: {} segments affected", matchesBySegment.size());
        // Create a list of modification tasks; the matches of each task are remembered under the
        // segment's token index so they can be retrieved without searching the segment list.
        List<ModificationTask> tasks = new ArrayList<>();
        Map<Integer, List<MatchRange>> matchesByTokenIndex = new HashMap<>();
        for (Map.Entry<Integer, List<MatchRange>> entry : matchesBySegment.entrySet()) {
            int segmentIndex = entry.getKey();
            List<MatchRange> segmentMatches = entry.getValue();
            TextSegment segment = textSegments.get(segmentIndex);
            log.debug(
                    "Creating modification task for segment {} with {} matches",
                    segmentIndex,
                    segmentMatches.size());
            if ("Tj".equals(segment.operatorName) || "'".equals(segment.operatorName)) {
                String newText = applyRedactionsToSegmentText(segment, segmentMatches);
                float adjustment = 0f;
                try {
                    adjustment = calculateWidthAdjustment(segment, segmentMatches);
                } catch (IOException | IllegalArgumentException e) {
                    // Width lookup can fail for glyphs the font cannot encode. Keep the task and
                    // fall back to no compensation instead of leaving the text unredacted.
                    log.warn("Failed to calculate width adjustment for redaction.", e);
                }
                tasks.add(new ModificationTask(segment, newText, adjustment));
            } else if ("TJ".equals(segment.operatorName)) {
                tasks.add(new ModificationTask(segment, null, 0));
            } else {
                // NOTE(review): segments of any other operator (e.g. the '"' operator) get no
                // task here, so their text stays in the stream - confirm this is intended.
                continue;
            }
            matchesByTokenIndex.put(segment.tokenIndex, segmentMatches);
        }
        // Sort tasks by token index in descending order to avoid index shifting issues
        tasks.sort((a, b) -> Integer.compare(b.segment.tokenIndex, a.segment.tokenIndex));
        log.debug("Applying {} modification tasks", tasks.size());
        // Apply modifications
        for (ModificationTask task : tasks) {
            List<MatchRange> segmentMatches =
                    matchesByTokenIndex.getOrDefault(
                            task.segment.tokenIndex, Collections.emptyList());
            modifyTokenForRedaction(
                    newTokens, task.segment, task.newText, task.adjustment, segmentMatches);
        }
        log.debug("Completed applying redactions to tokens");
        return newTokens;
    }
    /**
     * A pending rewrite of the operand token of one text segment.
     *
     * <p>The field order defines the parameter order of the Lombok-generated all-args
     * constructor and must not be changed.
     */
    @Data
    @AllArgsConstructor
    private static class ModificationTask {
        // The segment whose operand token is to be rewritten.
        private TextSegment segment;
        // Replacement text for the operand; null for TJ segments, which are rebuilt from their
        // original array instead.
        private String newText; // Only for Tj
        // Width difference (original minus placeholder) to compensate for; 0 for TJ segments.
        private float adjustment; // Only for Tj
    }
    /**
     * Returns the segment's text with the part covered by each match replaced by the
     * placeholder produced for that part.
     *
     * <p>Match offsets refer to the concatenated page text; they are shifted by the segment's
     * start position and clipped to the segment before being applied. Matches are applied in
     * list order.
     *
     * @param segment the segment whose text is rewritten
     * @param matches the match ranges that overlap the segment
     * @return the rewritten text
     */
    private String applyRedactionsToSegmentText(TextSegment segment, List<MatchRange> matches) {
        final String original = segment.getText();
        final int length = original.length();
        final int base = segment.getStartPos();
        StringBuilder redacted = new StringBuilder(original);
        for (MatchRange range : matches) {
            int from = Math.max(0, range.getStartPos() - base);
            int to = Math.min(length, range.getEndPos() - base);
            if (from >= length || to <= from) {
                // The match does not cover any character of this segment.
                continue;
            }
            redacted.replace(from, to, createPlaceholder(original.substring(from, to)));
        }
        return redacted.toString();
    }
    /**
     * Computes how much narrower (positive) or wider (negative) the segment becomes when the
     * matched parts of its text are replaced by their placeholders.
     *
     * <p>Every character of the segment is measured at most once, even when several matches
     * overlap or coincide (for example two search terms that differ only in case). For matches
     * that do not overlap, the result is the same as measuring each match on its own.
     *
     * @param segment the segment being redacted; its font and font size are used for measuring
     * @param matches the match ranges overlapping the segment, as offsets into the page text
     * @return original width minus placeholder width, scaled by the segment's font size;
     *     {@code 0} when the segment has no font
     * @throws IOException if the font cannot measure one of the strings
     */
    private float calculateWidthAdjustment(TextSegment segment, List<MatchRange> matches)
            throws IOException {
        PDFont font = segment.getFont();
        if (font == null) {
            // Without a font nothing can be measured, so there is nothing to compensate.
            return 0;
        }
        float totalOriginalWidth = 0;
        float totalPlaceholderWidth = 0;
        String text = segment.getText();
        // Walk the matches left to right so that already measured characters can be skipped.
        List<MatchRange> ordered = new ArrayList<>(matches);
        ordered.sort((a, b) -> Integer.compare(a.getStartPos(), b.getStartPos()));
        int coveredUpTo = 0;
        for (MatchRange match : ordered) {
            // Clip to the segment and to the part not yet covered by an earlier match.
            int segmentStart = Math.max(coveredUpTo, match.getStartPos() - segment.getStartPos());
            int segmentEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos());
            if (segmentEnd <= segmentStart) {
                continue;
            }
            String originalPart = text.substring(segmentStart, segmentEnd);
            String placeholderPart = createPlaceholder(originalPart);
            totalOriginalWidth +=
                    font.getStringWidth(originalPart) / FONT_SCALE_FACTOR * segment.getFontSize();
            totalPlaceholderWidth +=
                    font.getStringWidth(placeholderPart)
                            / FONT_SCALE_FACTOR
                            * segment.getFontSize();
            coveredUpTo = segmentEnd;
        }
        return totalOriginalWidth - totalPlaceholderWidth;
    }
    /**
     * Rewrites, in place, the operand token of one text segment so that it shows the redacted
     * text.
     *
     * <ul>
     *   <li>{@code Tj} / {@code '} with a string operand: the string is replaced by
     *       {@code newText}. When the width adjustment is not negligible, the operand becomes a
     *       {@code TJ} array holding the new string followed by a compensating kerning value,
     *       and the operator is switched to {@code TJ}. Since {@code '} is defined as
     *       {@code T*} followed by {@code Tj}, a {@code T*} operator is inserted in front of
     *       the operand in that case so the move to the next line is kept.
     *   <li>{@code TJ} with an array operand: the array is rebuilt by
     *       {@code createRedactedTJArray}.
     * </ul>
     *
     * <p>Inserting {@code T*} shifts every token after the operand by one position, so callers
     * must apply modifications from the highest token index to the lowest.
     *
     * @param tokens the mutable token list to modify
     * @param segment the segment whose operand token is rewritten
     * @param newText the replacement text for {@code Tj} / {@code '} segments
     * @param adjustment width to compensate for (original minus placeholder width)
     * @param matches the match ranges overlapping the segment; used for {@code TJ} segments
     */
    private void modifyTokenForRedaction(
            List<Object> tokens,
            TextSegment segment,
            String newText,
            float adjustment,
            List<MatchRange> matches) {
        log.debug(
                "Modifying token at index {} for segment '{}' with operator {}",
                segment.getTokenIndex(),
                segment.getText(),
                segment.getOperatorName());
        int tokenIndex = segment.getTokenIndex();
        if (tokenIndex < 0 || tokenIndex >= tokens.size()) {
            log.debug("Token index {} out of bounds (0-{})", tokenIndex, tokens.size() - 1);
            return;
        }
        Object token = tokens.get(tokenIndex);
        String operatorName = segment.getOperatorName();
        try {
            if (("Tj".equals(operatorName) || "'".equals(operatorName))
                    && token instanceof COSString) {
                log.debug("Modifying Tj/quote operator with adjustment {}", adjustment);
                int operatorIndex = tokenIndex + 1;
                boolean operatorFollows =
                        operatorIndex < tokens.size()
                                && tokens.get(operatorIndex) instanceof Operator op
                                && op.getName().equals(operatorName);
                if (Math.abs(adjustment) < PRECISION_THRESHOLD || !operatorFollows) {
                    // Plain string swap. Also used when the operator cannot be switched to TJ:
                    // an array operand would be invalid for Tj / '.
                    tokens.set(tokenIndex, new COSString(newText));
                } else {
                    COSArray newArray = new COSArray();
                    newArray.add(new COSString(newText));
                    if (segment.getFontSize() > 0) {
                        // TJ numbers are in thousandths of text space and subtract from the
                        // advance, hence the negative sign.
                        float kerning = -FONT_SCALE_FACTOR * adjustment / segment.getFontSize();
                        newArray.add(new org.apache.pdfbox.cos.COSFloat(kerning));
                        log.debug("Applied kerning adjustment: {}", kerning);
                    }
                    tokens.set(tokenIndex, newArray);
                    tokens.set(operatorIndex, Operator.getOperator("TJ"));
                    log.debug("Changed operator from {} to TJ", operatorName);
                    if ("'".equals(operatorName)) {
                        // Keep the implicit line move of the ' operator.
                        tokens.add(tokenIndex, Operator.getOperator("T*"));
                        log.debug("Inserted T* to preserve the line move of the quote operator");
                    }
                }
            } else if ("TJ".equals(operatorName) && token instanceof COSArray) {
                log.debug("Modifying TJ operator array");
                COSArray newArray = createRedactedTJArray((COSArray) token, segment, matches);
                tokens.set(tokenIndex, newArray);
            }
        } catch (IOException e) {
            log.warn("Failed to modify token for redaction: {}", e.getMessage(), e);
        }
    }
    /**
     * Builds a redacted copy of a {@code TJ} operand array.
     *
     * <p>Every string element is checked against {@code matches}. The characters that fall inside
     * any match are replaced by {@code createPlaceholder(...)} of the covered run, and, when the
     * segment has a font and a positive font size, a compensating number is appended after the
     * string if its rendered width changed by more than {@code PRECISION_THRESHOLD}. Non-string
     * elements are copied through unchanged.
     *
     * <p>All match ranges are resolved against the <em>original</em> string before anything is
     * rewritten, so the result stays correct when the placeholder is shorter or longer than the
     * text it replaces and when several matches hit the same string.
     *
     * @param originalArray the {@code TJ} operand array to copy
     * @param segment supplies the segment's start position, font and font size
     * @param matches ranges to redact, in the same coordinate space as the segment start position
     * @return a new array; {@code originalArray} is not modified
     * @throws IOException if the font cannot measure one of the strings
     */
    private COSArray createRedactedTJArray(
            COSArray originalArray, TextSegment segment, List<MatchRange> matches)
            throws IOException {
        COSArray newArray = new COSArray();
        int textOffsetInSegment = 0;

        for (COSBase element : originalArray) {
            if (!(element instanceof COSString cosString)) {
                // Existing numeric adjustments and any other operands are kept as they are.
                newArray.add(element);
                continue;
            }

            String originalText = cosString.getString();
            int length = originalText.length();
            int stringStartInPage = segment.getStartPos() + textOffsetInSegment;
            int stringEndInPage = stringStartInPage + length;

            // Step 1: mark which characters of the ORIGINAL string are covered by any match.
            boolean[] redacted = new boolean[length];
            boolean modified = false;
            for (MatchRange match : matches) {
                int from = Math.max(match.getStartPos(), stringStartInPage) - stringStartInPage;
                int to = Math.min(match.getEndPos(), stringEndInPage) - stringStartInPage;
                if (from < to) {
                    modified = true;
                    for (int i = from; i < to; i++) {
                        redacted[i] = true;
                    }
                }
            }

            // Step 2: rebuild the string in one pass so indices never drift while replacing.
            String modifiedString = originalText;
            if (modified) {
                StringBuilder rebuilt = new StringBuilder(length);
                int i = 0;
                while (i < length) {
                    if (!redacted[i]) {
                        rebuilt.append(originalText.charAt(i));
                        i++;
                        continue;
                    }
                    int runEnd = i;
                    while (runEnd < length && redacted[runEnd]) {
                        runEnd++;
                    }
                    rebuilt.append(createPlaceholder(originalText.substring(i, runEnd)));
                    i = runEnd;
                }
                modifiedString = rebuilt.toString();
            }
            newArray.add(new COSString(modifiedString));

            // Step 3: compensate for the width difference so following text keeps its position.
            if (modified && segment.getFont() != null && segment.getFontSize() > 0) {
                float originalWidth =
                        segment.getFont().getStringWidth(originalText)
                                / FONT_SCALE_FACTOR
                                * segment.getFontSize();
                float modifiedWidth =
                        segment.getFont().getStringWidth(modifiedString)
                                / FONT_SCALE_FACTOR
                                * segment.getFontSize();
                float adjustment = originalWidth - modifiedWidth;
                if (Math.abs(adjustment) > PRECISION_THRESHOLD) {
                    float kerning = -FONT_SCALE_FACTOR * adjustment / segment.getFontSize();
                    newArray.add(new org.apache.pdfbox.cos.COSFloat(kerning));
                }
            }

            textOffsetInSegment += length;
        }
        return newArray;
    }
    /**
     * Returns the text carried by the operand of a text-showing operator.
     *
     * <p>{@code Tj} and {@code '} yield the operand's string when it is a {@code COSString};
     * {@code TJ} yields the concatenation of all {@code COSString} elements of a {@code COSArray}
     * operand, ignoring every other element. Any other operator name, or an operand of an
     * unexpected type, yields the empty string.
     *
     * @param token the operand token preceding the operator
     * @param operatorName the operator name; must not be {@code null}
     * @return the extracted text, never {@code null}
     */
    private String extractTextFromToken(Object token, String operatorName) {
        switch (operatorName) {
            case "Tj":
            case "'":
                {
                    return token instanceof COSString shown ? shown.getString() : "";
                }
            case "TJ":
                {
                    if (!(token instanceof COSArray operands)) {
                        return "";
                    }
                    StringBuilder joined = new StringBuilder();
                    for (COSBase operand : operands) {
                        // Only string pieces carry text; other elements are skipped.
                        if (operand instanceof COSString piece) {
                            joined.append(piece.getString());
                        }
                    }
                    return joined.toString();
                }
            default:
                return "";
        }
    }
    /**
     * Produces the replacement text for a redacted word.
     *
     * <p>A {@code null} or empty input is returned as-is. Otherwise the result is the placeholder
     * literal repeated once per character of {@code originalWord}.
     *
     * @param originalWord the text being redacted; may be {@code null}
     * @return the placeholder text, or {@code originalWord} itself when it is {@code null} or empty
     */
    private String createPlaceholder(String originalWord) {
        boolean nothingToReplace = originalWord == null || originalWord.isEmpty();
        return nothingToReplace ? originalWord : "".repeat(originalWord.length());
    }
    /**
     * Serialises {@code tokens} into a fresh stream and installs it as the page's contents.
     *
     * @param document document the new stream is created for
     * @param page page whose contents are replaced
     * @param tokens content-stream tokens to write
     * @throws IOException if creating or writing the stream fails
     */
    private void writeFilteredContentStream(PDDocument document, PDPage page, List<Object> tokens)
            throws IOException {
        log.debug("Writing filtered content stream with {} tokens", tokens.size());

        PDStream replacement = new PDStream(document);
        try (var sink = replacement.createOutputStream()) {
            new ContentStreamWriter(sink).writeTokens(tokens);
        }

        // The stream is only attached to the page once it has been written and closed.
        page.setContents(replacement);
        log.debug("Successfully wrote filtered content stream");
    }
    /**
     * Reports whether {@code opName} is contained in {@code TEXT_SHOWING_OPERATORS}.
     *
     * @param opName operator name to look up
     * @return {@code true} if the name is listed
     */
    private boolean isTextShowingOperator(String opName) {
        final boolean listed = TEXT_SHOWING_OPERATORS.contains(opName);
        return listed;
    }
 }
--- a/stirling-pdf/src/main/java/stirling/software/SPDF/pdf/TextFinder.java
+++ b/stirling-pdf/src/main/java/stirling/software/SPDF/pdf/TextFinder.java
@ -6,102 +6,109 @@ import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
-import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.text.PDFTextStripper;
 import org.apache.pdfbox.text.TextPosition;
 import lombok.extern.slf4j.Slf4j;
 import stirling.software.SPDF.model.PDFText;
@Slf4j
 public class TextFinder extends PDFTextStripper {
-    private final String searchText;
+    private final String searchTerm;
    private final boolean useRegex;
    private final boolean wholeWordSearch;
-    private final List<PDFText> textOccurrences = new ArrayList<>();
+    private final List<PDFText> foundTexts = new ArrayList<>();
-    public TextFinder(String searchText, boolean useRegex, boolean wholeWordSearch)
+    private final List<TextPosition> pageTextPositions = new ArrayList<>();
    private final StringBuilder pageTextBuilder = new StringBuilder();
    public TextFinder(String searchTerm, boolean useRegex, boolean wholeWordSearch)
            throws IOException {
-        this.searchText = searchText.toLowerCase();
+        super();
        this.searchTerm = searchTerm;
        this.useRegex = useRegex;
        this.wholeWordSearch = wholeWordSearch;
-        setSortByPosition(true);
+        this.setWordSeparator(" ");
    }
-    private List<MatchInfo> findOccurrencesInText(String searchText, String content) {
+    @Override
-        List<MatchInfo> matches = new ArrayList<>();
+    protected void startPage(PDPage page) {
-
+        pageTextPositions.clear();
-        Pattern pattern;
+        pageTextBuilder.setLength(0);
        if (useRegex) {
            // Use regex-based search
            pattern =
                    wholeWordSearch
                            ? Pattern.compile("\\b" + searchText + "\\b")
                            : Pattern.compile(searchText);
        } else {
            // Use normal text search
            pattern =
                    wholeWordSearch
                            ? Pattern.compile("\\b" + Pattern.quote(searchText) + "\\b")
                            : Pattern.compile(Pattern.quote(searchText));
        }
        Matcher matcher = pattern.matcher(content);
        while (matcher.find()) {
            matches.add(new MatchInfo(matcher.start(), matcher.end() - matcher.start()));
        }
        return matches;
    }
    @Override
    protected void writeString(String text, List<TextPosition> textPositions) {
-        for (MatchInfo match : findOccurrencesInText(searchText, text.toLowerCase())) {
+        pageTextBuilder.append(text);
-            int index = match.startIndex;
+        pageTextPositions.addAll(textPositions);
-            if (index + match.matchLength <= textPositions.size()) {
+    }
                // Initial values based on the first character
                TextPosition first = textPositions.get(index);
                float minX = first.getX();
                float minY = first.getY();
                float maxX = first.getX() + first.getWidth();
                float maxY = first.getY() + first.getHeight();
-                // Loop over the rest of the characters and adjust bounding box values
+    @Override
-                for (int i = index; i < index + match.matchLength; i++) {
+    protected void writeWordSeparator() {
-                    TextPosition position = textPositions.get(i);
+        pageTextBuilder.append(getWordSeparator());
-                    minX = Math.min(minX, position.getX());
+        pageTextPositions.add(null); // Placeholder for separator
-                    minY = Math.min(minY, position.getY());
+    }
-                    maxX = Math.max(maxX, position.getX() + position.getWidth());
+
-                    maxY = Math.max(maxY, position.getY() + position.getHeight());
+    @Override
    protected void writeLineSeparator() {
        pageTextBuilder.append(getLineSeparator());
        pageTextPositions.add(null); // Placeholder for separator
    }
    @Override
    protected void endPage(PDPage page) {
        String text = pageTextBuilder.toString();
        if (text.isEmpty() || this.searchTerm == null || this.searchTerm.isEmpty()) {
            return;
        }
        String processedSearchTerm = this.searchTerm.trim();
        String regex = this.useRegex ? processedSearchTerm : "\\Q" + processedSearchTerm + "\\E";
        if (this.wholeWordSearch) {
            regex = "\\b" + regex + "\\b";
        }
        Pattern pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
        Matcher matcher = pattern.matcher(text);
        while (matcher.find()) {
            int matchStart = matcher.start();
            int matchEnd = matcher.end();
            float minX = Float.MAX_VALUE;
            float minY = Float.MAX_VALUE;
            float maxX = Float.MIN_VALUE;
            float maxY = Float.MIN_VALUE;
            boolean foundPosition = false;
            for (int i = matchStart; i < matchEnd; i++) {
                if (i >= pageTextPositions.size()) {
                    continue;
                }
                TextPosition pos = pageTextPositions.get(i);
                if (pos != null) {
                    foundPosition = true;
                    minX = Math.min(minX, pos.getX());
                    maxX = Math.max(maxX, pos.getX() + pos.getWidth());
                    minY = Math.min(minY, pos.getY() - pos.getHeight());
                    maxY = Math.max(maxY, pos.getY());
                }
            }
-                textOccurrences.add(
+            if (foundPosition) {
-                        new PDFText(getCurrentPageNo() - 1, minX, minY, maxX, maxY, text));
+                foundTexts.add(
                        new PDFText(
                                this.getCurrentPageNo() - 1,
                                minX,
                                minY,
                                maxX,
                                maxY,
                                matcher.group()));
            }
        }
    }
-    public List<PDFText> getTextLocations(PDDocument document) throws Exception {
+    public List<PDFText> getFoundTexts() {
-        this.getText(document);
+        return foundTexts;
        log.debug(
                "Found "
                        + textOccurrences.size()
                        + " occurrences of '"
                        + searchText
                        + "' in the document.");
        return textOccurrences;
    }
    private class MatchInfo {
        int startIndex;
        int matchLength;
        MatchInfo(int startIndex, int matchLength) {
            this.startIndex = startIndex;
            this.matchLength = matchLength;
        }
    }
 }