refactor redaction services to improve resource management and streamline text processing

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
2025-09-08 17:51:20 +02:00 · 2025-08-25 19:53:19 +02:00 · 2025-08-25 19:53:19 +02:00 · 3ac7f0df4c
commit 3ac7f0df4c
parent f236505cae
6 changed files with 394 additions and 707 deletions
--- a/app/core/src/main/java/stirling/software/SPDF/service/AggressiveRedactionService.java
+++ b/app/core/src/main/java/stirling/software/SPDF/service/AggressiveRedactionService.java
@ -30,51 +30,39 @@ class AggressiveRedactionService implements RedactionModeStrategy {
        boolean useRegex = Boolean.TRUE.equals(request.getUseRegex());
        boolean wholeWord = Boolean.TRUE.equals(request.getWholeWordSearch());
-        PDDocument doc = null;
+        try (PDDocument doc = pdfDocumentFactory.load(request.getFileInput())) {
        PDDocument fb = null;
        try {
            doc = pdfDocumentFactory.load(request.getFileInput());
            Map<Integer, List<PDFText>> allFound =
                    RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord);
            if (allFound.isEmpty()) {
-                try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
+                return toByteArray(doc);
                    doc.save(baos);
                    return baos.toByteArray();
                }
            }
            helper.performTextReplacementAggressive(doc, allFound, listOfText, useRegex, wholeWord);
            Map<Integer, List<PDFText>> residual =
                    RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord);
            boolean residualExists = residual.values().stream().mapToInt(List::size).sum() > 0;
-            String effectiveColor =
+
                    (request.getRedactColor() == null || request.getRedactColor().isBlank())
                            ? "#000000"
                            : request.getRedactColor();
            if (residualExists) {
                // Use the new visual redaction with OCR restoration fallback
                return helper.performVisualRedactionWithOcrRestoration(
                        request, listOfText, useRegex, wholeWord);
            }
            return RedactionService.finalizeRedaction(
                    doc,
                    allFound,
                    request.getRedactColor(),
                    request.getCustomPadding(),
-                    request.getConvertPDFToImage(), /*text removal*/
+                    request.getConvertPDFToImage(),
                    true);
        } catch (Exception e) {
            throw new IOException("Aggressive redaction failed: " + e.getMessage(), e);
        } finally {
            if (doc != null)
                try {
                    doc.close();
                } catch (IOException ignore) {
                }
            if (fb != null)
                try {
                    fb.close();
                } catch (IOException ignore) {
        }
    }
    /**
     * Serializes the given document into an in-memory byte array.
     *
     * @param doc the document to save; this method does not close it
     * @return the bytes produced by {@code doc.save}
     * @throws IOException if saving the document fails
     */
    private byte[] toByteArray(PDDocument doc) throws IOException {
        try (ByteArrayOutputStream buffer = new ByteArrayOutputStream()) {
            doc.save(buffer);
            byte[] pdfBytes = buffer.toByteArray();
            return pdfBytes;
        }
    }
 }
--- a/app/core/src/main/java/stirling/software/SPDF/service/ModerateRedactionService.java
+++ b/app/core/src/main/java/stirling/software/SPDF/service/ModerateRedactionService.java
@ -29,49 +29,36 @@ class ModerateRedactionService implements RedactionModeStrategy {
        boolean useRegex = Boolean.TRUE.equals(request.getUseRegex());
        boolean wholeWord = Boolean.TRUE.equals(request.getWholeWordSearch());
-        PDDocument doc = null;
+        try (PDDocument doc = pdfDocumentFactory.load(request.getFileInput())) {
        PDDocument fallback = null;
        try {
            doc = pdfDocumentFactory.load(request.getFileInput());
            Map<Integer, List<PDFText>> allFound =
                    RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord);
            if (allFound.isEmpty()) {
-                try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
+                return toByteArray(doc);
                    doc.save(baos);
                    return baos.toByteArray();
                }
            }
            boolean fallbackToBoxOnly =
                    helper.performTextReplacement(doc, allFound, listOfText, useRegex, wholeWord);
            String effectiveColor =
                    (request.getRedactColor() == null || request.getRedactColor().isBlank())
                            ? "#000000"
                            : request.getRedactColor();
            if (fallbackToBoxOnly) {
                // Use the new visual redaction with OCR restoration fallback
                return helper.performVisualRedactionWithOcrRestoration(
                        request, listOfText, useRegex, wholeWord);
            }
            return RedactionService.finalizeRedaction(
                    doc,
                    allFound,
-                    effectiveColor,
+                    request.getRedactColor(),
                    request.getCustomPadding(),
                    request.getConvertPDFToImage(),
                    false);
        } catch (Exception e) {
            throw new IOException("Moderate redaction failed: " + e.getMessage(), e);
        } finally {
            if (doc != null)
                try {
                    doc.close();
                } catch (IOException ignore) {
                }
            if (fallback != null)
                try {
                    fallback.close();
                } catch (IOException ignore) {
        }
    }
    /**
     * Writes the document to a temporary in-memory stream and returns the resulting bytes.
     *
     * @param doc the document to save; ownership stays with the caller
     * @return the saved document content as a byte array
     * @throws IOException if {@code doc.save} fails
     */
    private byte[] toByteArray(PDDocument doc) throws IOException {
        try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
            doc.save(out);
            final byte[] saved = out.toByteArray();
            return saved;
        }
    }
 }
--- a/app/core/src/main/java/stirling/software/SPDF/service/RedactionService.java
+++ b/app/core/src/main/java/stirling/software/SPDF/service/RedactionService.java
@ -81,6 +81,7 @@ public class RedactionService {
    // Content-stream operator names treated as text-showing: Tj, TJ, ' and ".
    private static final Set<String> TEXT_SHOWING_OPERATORS = Set.of("Tj", "TJ", "'", "\"");
    // Shared empty COSString instance.
    // NOTE(review): presumably substituted where text is removed — confirm at the use sites.
    private static final COSString EMPTY_COS_STRING = new COSString("");
    // Upper bound on the number of sweeps performTextReplacementAggressive makes over the pages.
    private static final int MAX_SWEEPS = 3;
    // Precompiled once so the font check does not recompile the regex on every call. It is matched
    // against the lower-cased font name; names containing "hoepap", "temp" or "generated" make
    // that check return false.
    private static final Pattern PATTERN = Pattern.compile(".*(hoepap|temp|generated).*");
    // Set to true when performTextReplacementAggressive starts.
    // NOTE(review): this is per-call state held on the instance — confirm the service is never
    // used by concurrent requests.
    private boolean aggressiveMode = false;
    // Reassigned to a fresh map when performTextReplacementAggressive starts and again for each
    // page it processes. NOTE(review): keys look like page indices — confirm.
    private Map<Integer, List<AggressiveSegMatch>> aggressiveSegMatches = null;
    // Factory used to load PDDocument instances from the request's file input.
    private final CustomPDFDocumentFactory pdfDocumentFactory;
@ -266,26 +267,20 @@ public class RedactionService {
            boolean wholeWordSearch) {
        try {
            for (String term : targetWords) {
-                if (term == null || term.isBlank()) {
+                if (term == null || term.isBlank()) continue;
-                    continue;
+
                }
                TextFinder finder = new TextFinder(term, useRegex, wholeWordSearch);
                finder.setStartPage(pageIndex + 1);
                finder.setEndPage(pageIndex + 1);
                finder.getText(document);
-                List<PDFText> foundTexts = finder.getFoundTexts();
+                for (PDFText text : finder.getFoundTexts()) {
-                for (PDFText ft : foundTexts) {
+                    if (text.getPageIndex() == pageIndex) {
                    if (ft.getPageIndex() == pageIndex) {
                        return true;
                    }
                }
                if (!foundTexts.isEmpty()) {}
            }
            return false;
        } catch (Exception e) {
            return true;
        }
@ -297,18 +292,13 @@ public class RedactionService {
            boolean useRegex,
            boolean wholeWordSearch) {
        try {
-            int idx = -1;
+            for (int pageIndex = 0; pageIndex < document.getNumberOfPages(); pageIndex++) {
            final int numberOfPages = document.getNumberOfPages();
            for (int i = 0; i < numberOfPages; i++) {
                idx++;
                if (pageStillContainsTargets(
-                        document, idx, targetWords, useRegex, wholeWordSearch)) {
+                        document, pageIndex, targetWords, useRegex, wholeWordSearch)) {
                    return true;
                }
            }
            return false;
        } catch (Exception e) {
            return true;
        }
@ -352,12 +342,11 @@ public class RedactionService {
        for (List<PDFText> pageTexts : allFoundTextsByPage.values()) {
            allFoundTexts.addAll(pageTexts);
        }
-        if (!allFoundTexts.isEmpty()) {
+        if (!allFoundTexts.isEmpty() && !isTextRemovalMode) {
            if (!isTextRemovalMode) {
            Color redactColor = decodeOrDefault(colorString);
            redactFoundText(document, allFoundTexts, customPadding, redactColor);
        }
-        }
+
        if (Boolean.TRUE.equals(convertToImage)) {
            try (PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document)) {
                ByteArrayOutputStream baos = new ByteArrayOutputStream();
@ -597,18 +586,11 @@ public class RedactionService {
    private static boolean isTextSafeForRedaction(String text) {
        if (text == null || text.isEmpty()) return true;
-        for (int i = 0; i < text.length(); i++) {
+        for (char c : text.toCharArray()) {
-            char c = text.charAt(i);
+            if (c >= 65488 || (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r')) {
            int codePoint = c;
            if (codePoint >= 65488) {
                return false;
            }
            if (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r') {
                return false;
            }
        }
        return true;
    }
@ -657,56 +639,33 @@ public class RedactionService {
        return wipeAllSemanticTextInTokens(tokens, true);
    }
-    public byte[] performVisualRedactionWithOcrRestoration(
+    private static String normalizeTextForRedaction(String text) {
-            RedactPdfRequest request,
+        if (text == null) return null;
-            String[] listOfText,
+
-            boolean useRegex,
+        StringBuilder normalized = new StringBuilder();
-            boolean wholeWordSearch)
+        for (int i = 0; i < text.length(); i++) {
-            throws IOException {
+            char c = text.charAt(i);
-        PDDocument visualRedactedDoc = null;
+
-        try {
+            if (c >= 65488) {
-            visualRedactedDoc = pdfDocumentFactory.load(request.getFileInput());
+                normalized.append(' ');
-            Map<Integer, List<PDFText>> allFound =
+            } else if (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r') {
-                    findTextToRedact(visualRedactedDoc, listOfText, useRegex, wholeWordSearch);
+                normalized.append(' ');
-            String effectiveColor =
+            } else {
-                    (request.getRedactColor() == null || request.getRedactColor().isBlank())
+                normalized.append(c);
                            ? "#000000"
                            : request.getRedactColor();
            byte[] visualRedactedBytes =
                    finalizeRedaction(
                            visualRedactedDoc,
                            allFound,
                            effectiveColor,
                            request.getCustomPadding(),
                            true,
                            false);
            return performOcrRestoration(visualRedactedBytes, request);
        } catch (Exception e) {
            throw new IOException(
                    "Visual redaction with OCR restoration failed: " + e.getMessage(), e);
        } finally {
            if (visualRedactedDoc != null) {
                try {
                    visualRedactedDoc.close();
                } catch (IOException ignore) {
                }
            }
            }
        }
-    private byte[] performOcrRestoration(byte[] redactedPdfBytes, RedactPdfRequest request)
+        return normalized.toString();
            throws IOException, InterruptedException {
        try (TempFile tempInputFile = new TempFile(tempFileManager, ".pdf");
                TempFile tempOutputFile = new TempFile(tempFileManager, ".pdf")) {
            java.nio.file.Files.write(tempInputFile.getPath(), redactedPdfBytes);
            if (isOcrMyPdfAvailable()) {
                return processWithOcrMyPdfForRestoration(
                        tempInputFile.getPath(), tempOutputFile.getPath(), request);
            } else if (isTesseractAvailable()) {
                return processWithTesseractForRestoration(
                        tempInputFile.getPath(), tempOutputFile.getPath(), request);
    }
-            return redactedPdfBytes;
+
    /**
     * Probes whether the {@code ocrmypdf} command can be executed.
     *
     * <p>Runs {@code ocrmypdf --version} through the {@code OCR_MY_PDF} process executor and
     * treats a zero return code as "available". Any exception raised while running the probe is
     * reported as "not available" rather than propagated.
     *
     * @return {@code true} if the version probe finishes with return code 0, {@code false} if it
     *     returns a non-zero code or cannot be run
     */
    private static boolean isOcrMyPdfAvailable() {
        try {
            ProcessExecutorResult result =
                    ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
                            .runCommandWithOutputHandling(Arrays.asList("ocrmypdf", "--version"));
            return result.getRc() == 0;
        } catch (Exception e) {
            if (e instanceof InterruptedException) {
                // Restore the interrupt flag so callers can still observe the interruption;
                // the broad catch would otherwise consume it silently.
                Thread.currentThread().interrupt();
            }
            return false;
        }
    }
@ -780,37 +739,7 @@ public class RedactionService {
        }
    }
-    private static String normalizeTextForRedaction(String text) {
+    private static boolean isTesseractAvailable() {
        if (text == null) return null;
        StringBuilder normalized = new StringBuilder();
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            if ((int) c >= 65488) {
                normalized.append(' ');
            } else if (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r') {
                normalized.append(' ');
            } else {
                normalized.append(c);
            }
        }
        return normalized.toString();
    }
    /**
     * Probes whether the {@code ocrmypdf} command can be executed.
     *
     * <p>Runs {@code ocrmypdf --version} through the {@code OCR_MY_PDF} process executor and
     * treats a zero return code as "available". Any exception raised while running the probe is
     * reported as "not available" rather than propagated.
     *
     * @return {@code true} if the version probe finishes with return code 0, {@code false} if it
     *     returns a non-zero code or cannot be run
     */
    private boolean isOcrMyPdfAvailable() {
        try {
            ProcessExecutorResult result =
                    ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
                            .runCommandWithOutputHandling(Arrays.asList("ocrmypdf", "--version"));
            return result.getRc() == 0;
        } catch (Exception e) {
            if (e instanceof InterruptedException) {
                // Restore the interrupt flag so callers can still observe the interruption;
                // the broad catch would otherwise consume it silently.
                Thread.currentThread().interrupt();
            }
            return false;
        }
    }
    private boolean isTesseractAvailable() {
        try {
            ProcessExecutorResult result =
                    ProcessExecutor.getInstance(ProcessExecutor.Processes.TESSERACT)
@ -826,7 +755,7 @@ public class RedactionService {
            String fontName = font.getName();
            if (fontName == null
                    || isProperFontSubset(fontName)
-                    || fontName.toLowerCase().matches(".*(hoepap|temp|generated).*")) {
+                    || PATTERN.matcher(fontName.toLowerCase()).matches()) {
                return false;
            }
            return hasReliableWidthMetrics(font);
@ -835,6 +764,58 @@ public class RedactionService {
        }
    }
    /**
     * Replaces ISO control characters with the Unicode replacement character.
     *
     * <p>Line feed, tab and carriage return are kept as they are; every other character for which
     * {@link Character#isISOControl(char)} is true becomes {@code U+FFFD}. All remaining
     * characters are copied unchanged.
     *
     * @param text the text to sanitize; may be {@code null}
     * @return the sanitized text, or an empty string when {@code text} is {@code null}
     */
    private static String sanitizeText(String text) {
        if (text == null) {
            return "";
        }
        final int length = text.length();
        StringBuilder cleaned = new StringBuilder();
        for (int i = 0; i < length; i++) {
            char ch = text.charAt(i);
            boolean keptWhitespace = ch == '\n' || ch == '\t' || ch == '\r';
            if (Character.isISOControl(ch) && !keptWhitespace) {
                cleaned.append('\uFFFD');
            } else {
                cleaned.append(ch);
            }
        }
        return cleaned.toString();
    }
    /**
     * Runs OCRmyPDF on {@code inputPath} and returns the bytes it writes to {@code outputPath}.
     *
     * <p>The command forces OCR with the {@code sandwich} renderer, English language, no
     * optimization, and the deskew/clean options listed below. The {@code request} argument is
     * currently not consulted, so the language is always {@code eng}.
     *
     * @param inputPath path of the PDF to process
     * @param outputPath path OCRmyPDF writes its result to
     * @param request the originating redaction request (unused here)
     * @return the content of {@code outputPath} after a successful run
     * @throws IOException if OCRmyPDF exits with a non-zero return code or the output cannot be
     *     read
     * @throws InterruptedException if the process execution is interrupted
     */
    private static byte[] processWithOcrMyPdfForRestoration(
            java.nio.file.Path inputPath, java.nio.file.Path outputPath, RedactPdfRequest request)
            throws IOException, InterruptedException {
        final String source = inputPath.toString();
        final String destination = outputPath.toString();
        List<String> ocrCommand =
                Arrays.asList(
                        "ocrmypdf",
                        "--verbose",
                        "1",
                        "--output-type",
                        "pdf",
                        "--pdf-renderer",
                        "sandwich",
                        "--language",
                        "eng",
                        "--optimize",
                        "0",
                        "--jpeg-quality",
                        "100",
                        "--png-quality",
                        "9",
                        "--force-ocr",
                        "--deskew",
                        "--clean",
                        "--clean-final",
                        source,
                        destination);

        ProcessExecutorResult outcome =
                ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
                        .runCommandWithOutputHandling(ocrCommand);

        // Success path first: a zero return code means the output file is ready to be read.
        if (outcome.getRc() == 0) {
            return java.nio.file.Files.readAllBytes(outputPath);
        }
        throw new IOException(
                "OCRmyPDF restoration failed with return code: "
                        + outcome.getRc()
                        + ". Error: "
                        + outcome.getMessages());
    }
    private static String createSubsetFontPlaceholder(
            String originalWord, float targetWidth, PDFont font, float fontSize) {
        String result = createAlternativePlaceholder(originalWord, targetWidth, font, fontSize);
@ -843,77 +824,144 @@ public class RedactionService {
                : " ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1));
    }
-    public void performTextReplacementAggressive(
+    private static COSArray buildKerningAdjustedTJArray(
-            PDDocument document,
+            COSArray originalArray, COSArray redactedArray, TextSegment segment) {
-            Map<Integer, List<PDFText>> allFoundTextsByPage,
+        try {
-            String[] listOfText,
+            if (segment == null || segment.getFont() == null || segment.getFontSize() <= 0)
                return redactedArray;
            COSArray out = new COSArray();
            int size = redactedArray.size();
            for (int i = 0; i < size; i++) {
                COSBase redEl = redactedArray.get(i);
                COSBase origEl =
                        (originalArray != null && i < originalArray.size())
                                ? originalArray.get(i)
                                : null;
                out.add(redEl);
                if (redEl instanceof COSString redStr && origEl instanceof COSString origStr) {
                    String origText = getDecodedString(origStr, segment.getFont());
                    String modText = getDecodedString(redStr, segment.getFont());
                    float wOrig =
                            calculateSafeWidth(origText, segment.getFont(), segment.getFontSize());
                    float wMod =
                            calculateSafeWidth(modText, segment.getFont(), segment.getFontSize());
                    float adjustment = wOrig - wMod;
                    if (Math.abs(adjustment) > PRECISION_THRESHOLD) {
                        float kerning = (-adjustment / segment.getFontSize()) * FONT_SCALE_FACTOR;
                        if (i + 1 < size && redactedArray.get(i + 1) instanceof COSNumber num) {
                            i++;
                            float combined = num.floatValue() + kerning;
                            out.add(new COSFloat(combined));
                        } else {
                            out.add(new COSFloat(kerning));
                        }
                    }
                }
            }
            return out;
        } catch (Exception e) {
            return redactedArray;
        }
    }
    private static List<MatchRange> findMatchesInSegments(
            List<TextSegment> segments,
            Set<String> targetWords,
            boolean useRegex,
-            boolean wholeWordSearchBool) {
+            boolean wholeWordSearch) {
-        if (allFoundTextsByPage.isEmpty()) {
+        List<MatchRange> allMatches = new ArrayList<>();
-            return;
+        List<Pattern> patterns =
                TextFinderUtils.createOptimizedSearchPatterns(
                        targetWords, useRegex, wholeWordSearch);
        log.debug("Searching for {} patterns in {} segments", patterns.size(), segments.size());
        int totalMatchesFound = 0;
        for (int i = 0; i < segments.size(); i++) {
            TextSegment segment = segments.get(i);
            String segmentText = segment.getText();
            if (segmentText == null || segmentText.isEmpty()) {
                log.debug("Skipping empty segment {}", i);
                continue;
            }
-        Set<String> allSearchTerms =
+
-                Arrays.stream(listOfText)
+            log.debug("Processing segment {}: '{}'", i, segmentText);
-                        .map(String::trim)
+
-                        .filter(s -> !s.isEmpty())
+            if (segment.getFont() != null
-                        .collect(Collectors.toSet());
+                    && !TextEncodingHelper.isTextSegmentRemovable(segment.getFont(), segmentText)) {
-        this.aggressiveMode = true;
+                log.debug(
-        this.aggressiveSegMatches = new HashMap<>();
+                        "Skipping segment {} - font not removable: {}",
                        i,
                        segment.getFont().getName());
                continue;
            }
            int segmentMatches = 0;
            for (Pattern pattern : patterns) {
                try {
-            for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) {
+                    log.debug(
-                boolean anyResidual = false;
+                            "Matching pattern '{}' against segment text '{}'",
-                int pageIndex = -1;
+                            pattern.pattern(),
-                for (PDPage page : document.getPages()) {
+                            segmentText);
-                    pageIndex++;
+                    var matcher = pattern.matcher(segmentText);
-                    try {
+                    while (matcher.find()) {
-                        this.aggressiveSegMatches = new HashMap<>();
+                        int matchStart = matcher.start();
-                        List<Object> filtered =
+                        int matchEnd = matcher.end();
-                                createTokensWithoutTargetText(
+
-                                        document,
+                        log.debug(
-                                        page,
+                                "Found match in segment {}: positions {}-{}",
-                                        allSearchTerms,
+                                i,
-                                        useRegex,
+                                matchStart,
-                                        wholeWordSearchBool);
+                                matchEnd);
-                        writeFilteredContentStream(document, page, filtered);
+
-                        boolean residual =
+                        if (matchStart >= 0
-                                pageStillContainsTargets(
+                                && matchEnd <= segmentText.length()
-                                        document,
+                                && matchStart < matchEnd) {
-                                        pageIndex,
+                            String matchedText = segmentText.substring(matchStart, matchEnd);
-                                        allSearchTerms,
+                            log.debug("Matched text: '{}'", matchedText);
-                                        useRegex,
+
-                                        wholeWordSearchBool);
+                            allMatches.add(
-                        if (residual) {
+                                    new MatchRange(
-                            anyResidual = true;
+                                            segment.getStartPos() + matchStart,
-                            try {
+                                            segment.getStartPos() + matchEnd));
-                                var sem = wipeAllSemanticTextInTokens(filtered);
+                            segmentMatches++;
-                                filtered = sem.tokens;
+                            totalMatchesFound++;
                                PDResources res = page.getResources();
                                if (res != null) {
                                    wipeAllSemanticTextInProperties(res);
                                    wipeAllTextInXObjects(document, res);
                                    wipeAllTextInPatterns(document, res);
                                }
                                writeFilteredContentStream(document, page, filtered);
                            } catch (Exception ignored) {
                        }
                    }
-                    } catch (Exception ignored) {
+                } catch (Exception e) {
                    log.error("Error matching pattern in segment {}: {}", i, e.getMessage());
                }
            }
-                if (!anyResidual) {
+
-                    break;
+            if (segmentMatches > 0) {
-                }
+                log.info("Segment {} had {} matches", i, segmentMatches);
                if (!documentStillContainsTargets(
                        document, allSearchTerms, useRegex, wholeWordSearchBool)) {
                    break;
            }
        }
-        } finally {
+
-            this.aggressiveMode = false;
+        log.info("Total matches found across all segments: {}", totalMatchesFound);
-            this.aggressiveSegMatches = null;
+        allMatches.sort(Comparator.comparingInt(MatchRange::getStartPos));
        if (allMatches.isEmpty()) {
            log.warn("No matches found in segments. This might indicate:");
            log.warn("1. Text encoding issues preventing proper extraction");
            log.warn("2. Font compatibility issues");
            log.warn("3. Search terms not matching extracted text");
            log.warn("4. Whole word search filtering out matches");
            if (!segments.isEmpty()) {
                log.warn("Sample segment text: '{}'", segments.get(0).getText());
                log.warn("Target words: {}", targetWords);
                log.warn("Use regex: {}, Whole word search: {}", useRegex, wholeWordSearch);
            }
        }
        return allMatches;
    }
    private static float calculateCharacterSumWidth(PDFont font, String text) {
        float totalWidth = 0f;
        for (char c : text.toCharArray()) {
@ -1033,19 +1081,29 @@ public class RedactionService {
        }
    }
-    private static String sanitizeText(String text) {
+    public byte[] performVisualRedactionWithOcrRestoration(
-        if (text == null) return "";
+            RedactPdfRequest request,
-
+            String[] listOfText,
-        StringBuilder sanitized = new StringBuilder();
+            boolean useRegex,
-        for (char c : text.toCharArray()) {
+            boolean wholeWordSearch)
-            if (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r') {
+            throws IOException {
-                sanitized.append('\uFFFD');
+        try (PDDocument doc = pdfDocumentFactory.load(request.getFileInput())) {
-            } else {
+            Map<Integer, List<PDFText>> allFound =
-                sanitized.append(c);
+                    findTextToRedact(doc, listOfText, useRegex, wholeWordSearch);
            byte[] visualRedactedBytes =
                    finalizeRedaction(
                            doc,
                            allFound,
                            request.getRedactColor(),
                            request.getCustomPadding(),
                            true,
                            false);
            return performOcrRestoration(visualRedactedBytes, request);
        } catch (Exception e) {
            throw new IOException(
                    "Visual redaction with OCR restoration failed: " + e.getMessage(), e);
        }
    }
        return sanitized.toString();
    }
    private static WipeResult wipeAllSemanticTextInTokens(List<Object> tokens, boolean removeTU) {
        if (tokens == null || tokens.isEmpty()) {
@ -1064,43 +1122,21 @@ public class RedactionService {
        return res;
    }
-    private byte[] processWithOcrMyPdfForRestoration(
+    private byte[] performOcrRestoration(byte[] redactedPdfBytes, RedactPdfRequest request)
            java.nio.file.Path inputPath, java.nio.file.Path outputPath, RedactPdfRequest request)
            throws IOException, InterruptedException {
-        List<String> command =
+        try (TempFile tempInputFile = new TempFile(tempFileManager, ".pdf");
-                Arrays.asList(
+                TempFile tempOutputFile = new TempFile(tempFileManager, ".pdf")) {
-                        "ocrmypdf",
+            java.nio.file.Files.write(tempInputFile.getPath(), redactedPdfBytes);
-                        "--verbose",
+
-                        "1",
+            if (isOcrMyPdfAvailable()) {
-                        "--output-type",
+                return processWithOcrMyPdfForRestoration(
-                        "pdf",
+                        tempInputFile.getPath(), tempOutputFile.getPath(), request);
-                        "--pdf-renderer",
+            } else if (isTesseractAvailable()) {
-                        "sandwich",
+                return processWithTesseractForRestoration(
-                        "--language",
+                        tempInputFile.getPath(), tempOutputFile.getPath(), request);
-                        "eng",
+            }
-                        "--optimize",
+            return redactedPdfBytes;
                        "0",
                        "--jpeg-quality",
                        "100",
                        "--png-quality",
                        "9",
                        "--force-ocr",
                        "--deskew",
                        "--clean",
                        "--clean-final",
                        inputPath.toString(),
                        outputPath.toString());
        ProcessExecutorResult result =
                ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
                        .runCommandWithOutputHandling(command);
        if (result.getRc() != 0) {
            throw new IOException(
                    "OCRmyPDF restoration failed with return code: "
                            + result.getRc()
                            + ". Error: "
                            + result.getMessages());
        }
        return java.nio.file.Files.readAllBytes(outputPath);
    }
    private static boolean removeSemanticProperties(COSDictionary dict, boolean removeTU) {
@ -1427,59 +1463,62 @@ public class RedactionService {
        }
    }
-    private int getOriginalTokenCount(PDPage page) {
+    public void performTextReplacementAggressive(
            PDDocument document,
            Map<Integer, List<PDFText>> allFoundTextsByPage,
            String[] listOfText,
            boolean useRegex,
            boolean wholeWordSearchBool) {
        if (allFoundTextsByPage.isEmpty()) return;
        Set<String> allSearchTerms =
                Arrays.stream(listOfText)
                        .map(String::trim)
                        .filter(s -> !s.isEmpty())
                        .collect(Collectors.toSet());
        this.aggressiveMode = true;
        this.aggressiveSegMatches = new HashMap<>();
        try {
-            PDFStreamParser parser = new PDFStreamParser(page);
+            for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) {
-            int count = 0;
+                boolean anyResidual = false;
            while (parser.parseNextToken() != null) {
                count++;
            }
            return count;
        } catch (Exception e) {
            return 0;
        }
    }
-    private COSArray buildKerningAdjustedTJArray(
+                for (int pageIndex = 0; pageIndex < document.getNumberOfPages(); pageIndex++) {
-            COSArray originalArray, COSArray redactedArray, TextSegment segment) {
+                    PDPage page = document.getPages().get(pageIndex);
                    try {
-            if (segment == null || segment.getFont() == null || segment.getFontSize() <= 0)
+                        this.aggressiveSegMatches = new HashMap<>();
-                return redactedArray;
+                        List<Object> filtered =
                                createTokensWithoutTargetText(
                                        document,
                                        page,
                                        allSearchTerms,
                                        useRegex,
                                        wholeWordSearchBool);
                        writeFilteredContentStream(document, page, filtered);
-            COSArray out = new COSArray();
+                        if (pageStillContainsTargets(
-            int size = redactedArray.size();
+                                document,
-            for (int i = 0; i < size; i++) {
+                                pageIndex,
-                COSBase redEl = redactedArray.get(i);
+                                allSearchTerms,
-                COSBase origEl =
+                                useRegex,
-                        (originalArray != null && i < originalArray.size())
+                                wholeWordSearchBool)) {
-                                ? originalArray.get(i)
+                            anyResidual = true;
-                                : null;
+                            processResidualText(document, page, filtered);
                        }
                    } catch (Exception ignored) {
                    }
                }
-                out.add(redEl);
+                if (!anyResidual
-
+                        || !documentStillContainsTargets(
-                if (redEl instanceof COSString redStr && origEl instanceof COSString origStr) {
+                                document, allSearchTerms, useRegex, wholeWordSearchBool)) {
-                    String origText = getDecodedString(origStr, segment.getFont());
+                    break;
                    String modText = getDecodedString(redStr, segment.getFont());
                    float wOrig =
                            calculateSafeWidth(origText, segment.getFont(), segment.getFontSize());
                    float wMod =
                            calculateSafeWidth(modText, segment.getFont(), segment.getFontSize());
                    float adjustment = wOrig - wMod;
                    if (Math.abs(adjustment) > PRECISION_THRESHOLD) {
                        float kerning = (-adjustment / segment.getFontSize()) * FONT_SCALE_FACTOR;
                        if (i + 1 < size && redactedArray.get(i + 1) instanceof COSNumber num) {
                            i++;
                            float combined = num.floatValue() + kerning;
                            out.add(new COSFloat(combined));
                        } else {
                            out.add(new COSFloat(kerning));
                }
            }
-                }
+        } finally {
-            }
+            this.aggressiveMode = false;
-            return out;
+            this.aggressiveSegMatches = null;
        } catch (Exception e) {
            return redactedArray;
        }
    }
@ -1678,6 +1717,21 @@ public class RedactionService {
        return problematicRatio > 0.3;
    }
    /**
     * Best-effort cleanup pass for a page that still contains target text after the regular
     * token filtering.
     *
     * <p>Passes the supplied tokens through {@code wipeAllSemanticTextInTokens}; when the page has
     * resources, also runs {@code wipeAllSemanticTextInProperties}, {@code wipeAllTextInXObjects}
     * and {@code wipeAllTextInPatterns} on them; finally rewrites the page content stream with
     * the wiped tokens.
     *
     * <p>Failures never propagate to the caller, but they are logged, because a failure here means
     * the residual text may still be present on the page.
     *
     * @param document document that owns {@code page}
     * @param page page whose resources and content stream are rewritten
     * @param filtered tokens produced by the preceding filtering step for this page
     */
    private void processResidualText(PDDocument document, PDPage page, List<Object> filtered) {
        try {
            var wiped = wipeAllSemanticTextInTokens(filtered);
            // Keep the wiped tokens in their own local instead of overwriting the parameter.
            List<Object> wipedTokens = wiped.tokens;

            PDResources resources = page.getResources();
            if (resources != null) {
                wipeAllSemanticTextInProperties(resources);
                wipeAllTextInXObjects(document, resources);
                wipeAllTextInPatterns(document, resources);
            }

            writeFilteredContentStream(document, page, wipedTokens);
        } catch (Exception e) {
            // Deliberately best-effort: the caller keeps processing the remaining pages, but a
            // silent failure would hide that this page may not have been fully cleaned.
            log.warn("Residual text cleanup failed for page: {}", e.getMessage(), e);
        }
    }
    public boolean performTextReplacement(
            PDDocument document,
            Map<Integer, List<PDFText>> allFoundTextsByPage,
@ -1688,151 +1742,38 @@ public class RedactionService {
            log.info("No text found to redact");
            return false;
        }
-        try {
+
        Set<String> allSearchTerms =
                Arrays.stream(listOfText)
                        .map(String::trim)
                        .filter(s -> !s.isEmpty())
                        .collect(Collectors.toSet());
-            log.info(
+        log.info("Starting text replacement with {} search terms", allSearchTerms.size());
                    "Starting text replacement with {} search terms: {}",
                    allSearchTerms.size(),
                    allSearchTerms);
            log.info("Total pages in document: {}", document.getNumberOfPages());
            log.info("Initial text found on {} pages", allFoundTextsByPage.size());
            int initialTotalInstances =
                    allFoundTextsByPage.values().stream().mapToInt(List::size).sum();
            log.info("Total initial instances to redact: {}", initialTotalInstances);
            int finalSweepCount = 0;
        for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) {
-                finalSweepCount = sweep + 1;
+            processPages(document, allSearchTerms, useRegex, wholeWordSearchBool);
                log.info("=== Starting sweep {} of {} ===", sweep + 1, MAX_SWEEPS);
                int pagesProcessed = 0;
                int totalModifications = 0;
-                for (int pageIndex = 0; pageIndex < document.getNumberOfPages(); pageIndex++) {
+            if (!documentStillContainsTargets(
-                    PDPage page = document.getPages().get(pageIndex);
+                    document, allSearchTerms, useRegex, wholeWordSearchBool)) {
                    List<PDFText> pageFoundTexts =
                            allFoundTextsByPage.getOrDefault(pageIndex, List.of());
                    log.debug(
                            "Processing page {} - found {} instances",
                            pageIndex + 1,
                            pageFoundTexts.size());
                    List<Object> filtered =
                            createTokensWithoutTargetText(
                                    document, page, allSearchTerms, useRegex, wholeWordSearchBool);
                    writeFilteredContentStream(document, page, filtered);
                    int tokenDiff = Math.abs(filtered.size() - getOriginalTokenCount(page));
                    totalModifications += tokenDiff;
                    pagesProcessed++;
                    log.debug("Page {} - token modifications: {}", pageIndex + 1, tokenDiff);
                }
                log.info(
                        "Sweep {} completed - processed {} pages, total modifications: {}",
                        sweep + 1,
                        pagesProcessed,
                        totalModifications);
                boolean stillContainsTargets =
                        documentStillContainsTargets(
                                document, allSearchTerms, useRegex, wholeWordSearchBool);
                if (!stillContainsTargets) {
                log.info("SUCCESS: All targets removed after {} sweeps", sweep + 1);
                    break;
                } else {
                    log.warn(
                            "WARNING: Still contains targets after sweep {} - continuing...",
                            sweep + 1);
                }
            }
            boolean finalCheck = false;
            for (int verifyAttempt = 0; verifyAttempt < 3; verifyAttempt++) {
                log.info("Final verification attempt {} of 3", verifyAttempt + 1);
                finalCheck =
                        documentStillContainsTargets(
                                document, allSearchTerms, useRegex, wholeWordSearchBool);
                if (!finalCheck) {
                    log.info(
                            "Verification attempt {} passed - no targets found", verifyAttempt + 1);
                    break;
                } else {
                    log.warn("Verification attempt {} found remaining targets", verifyAttempt + 1);
                    if (verifyAttempt < 2) {
                        log.info("Performing additional cleanup sweep due to verification failure");
                        for (PDPage page : document.getPages()) {
                            List<Object> additionalFiltered =
                                    createTokensWithoutTargetText(
                                            document,
                                            page,
                                            allSearchTerms,
                                            useRegex,
                                            wholeWordSearchBool);
                            writeFilteredContentStream(document, page, additionalFiltered);
                        }
                    }
                }
            }
            if (finalCheck) {
                log.error(
                        "FAILURE: Document still contains targets after {} sweeps and {} verification attempts. Falling back to visual redaction with OCR restoration.",
                        MAX_SWEEPS,
                        3);
                log.error("Remaining search terms: {}", allSearchTerms);
                log.error("=== DETAILED FAILURE ANALYSIS ===");
                for (int pageIdx = 0; pageIdx < document.getNumberOfPages(); pageIdx++) {
                    for (String term : allSearchTerms) {
                        try {
                            TextFinder finder = new TextFinder(term, useRegex, wholeWordSearchBool);
                            finder.setStartPage(pageIdx + 1);
                            finder.setEndPage(pageIdx + 1);
                            finder.getText(document);
                            for (PDFText found : finder.getFoundTexts()) {
                                if (found.getPageIndex() == pageIdx) {
                                    log.error(
                                            "REMAINING: '{}' found on page {} at position ({}, {})",
                                            term,
                                            pageIdx + 1,
                                            found.getX1(),
                                            found.getY1());
                                }
                            }
                        } catch (Exception e) {
                            log.error(
                                    "Error during failure analysis for term '{}' on page {}: {}",
                                    term,
                                    pageIdx + 1,
                                    e.getMessage());
                        }
                    }
                }
                log.error("=== END FAILURE ANALYSIS ===");
                return true;
            } else {
                log.info(
                        "SUCCESS: All text redaction completed successfully after {} sweeps",
                        finalSweepCount);
                return false;
            }
        } catch (Exception e) {
            log.error("Exception during text replacement: {}", e.getMessage(), e);
            return true;
        }
        // Verification attempts
        for (int attempt = 0; attempt < 3; attempt++) {
            if (!documentStillContainsTargets(
                    document, allSearchTerms, useRegex, wholeWordSearchBool)) {
                return false;
            }
            if (attempt < 2) {
                processPages(document, allSearchTerms, useRegex, wholeWordSearchBool);
            }
        }
        log.error("FAILURE: Document still contains targets after {} sweeps", MAX_SWEEPS);
        return true;
    }
    private COSArray createRedactedTJArray(
@ -1917,99 +1858,21 @@ public class RedactionService {
        };
    }
-    private List<MatchRange> findMatchesInSegments(
+    private void processPages(
-            List<TextSegment> segments,
+            PDDocument document,
-            Set<String> targetWords,
+            Set<String> allSearchTerms,
            boolean useRegex,
-            boolean wholeWordSearch) {
+            boolean wholeWordSearchBool) {
-        List<MatchRange> allMatches = new ArrayList<>();
+        for (PDPage page : document.getPages()) {
        List<Pattern> patterns =
                TextFinderUtils.createOptimizedSearchPatterns(
                        targetWords, useRegex, wholeWordSearch);
        log.debug("Searching for {} patterns in {} segments", patterns.size(), segments.size());
        int totalMatchesFound = 0;
        for (int i = 0; i < segments.size(); i++) {
            TextSegment segment = segments.get(i);
            String segmentText = segment.getText();
            if (segmentText == null || segmentText.isEmpty()) {
                log.debug("Skipping empty segment {}", i);
                continue;
            }
            log.debug("Processing segment {}: '{}'", i, segmentText);
            if (segment.getFont() != null
                    && !TextEncodingHelper.isTextSegmentRemovable(segment.getFont(), segmentText)) {
                log.debug(
                        "Skipping segment {} - font not removable: {}",
                        i,
                        segment.getFont().getName());
                continue;
            }
            int segmentMatches = 0;
            for (Pattern pattern : patterns) {
            try {
-                    log.debug(
+                List<Object> filtered =
-                            "Matching pattern '{}' against segment text '{}'",
+                        createTokensWithoutTargetText(
-                            pattern.pattern(),
+                                document, page, allSearchTerms, useRegex, wholeWordSearchBool);
-                            segmentText);
+                writeFilteredContentStream(document, page, filtered);
                    var matcher = pattern.matcher(segmentText);
                    while (matcher.find()) {
                        int matchStart = matcher.start();
                        int matchEnd = matcher.end();
                        log.debug(
                                "Found match in segment {}: positions {}-{}",
                                i,
                                matchStart,
                                matchEnd);
                        if (matchStart >= 0
                                && matchEnd <= segmentText.length()
                                && matchStart < matchEnd) {
                            String matchedText = segmentText.substring(matchStart, matchEnd);
                            log.debug("Matched text: '{}'", matchedText);
                            allMatches.add(
                                    new MatchRange(
                                            segment.getStartPos() + matchStart,
                                            segment.getStartPos() + matchEnd));
                            segmentMatches++;
                            totalMatchesFound++;
                        }
                    }
            } catch (Exception e) {
-                    log.error("Error matching pattern in segment {}: {}", i, e.getMessage());
+                log.warn("Error processing page: {}", e.getMessage());
            }
        }
            if (segmentMatches > 0) {
                log.info("Segment {} had {} matches", i, segmentMatches);
            }
        }
        log.info("Total matches found across all segments: {}", totalMatchesFound);
        allMatches.sort(Comparator.comparingInt(MatchRange::getStartPos));
        if (allMatches.isEmpty()) {
            log.warn("No matches found in segments. This might indicate:");
            log.warn("1. Text encoding issues preventing proper extraction");
            log.warn("2. Font compatibility issues");
            log.warn("3. Search terms not matching extracted text");
            log.warn("4. Whole word search filtering out matches");
            if (!segments.isEmpty()) {
                log.warn("Sample segment text: '{}'", segments.get(0).getText());
                log.warn("Target words: {}", targetWords);
                log.warn("Use regex: {}, Whole word search: {}", useRegex, wholeWordSearch);
            }
        }
        return allMatches;
    }
    private String createSafeReplacement(String originalPart, TextSegment segment) {
@ -2962,9 +2825,9 @@ public class RedactionService {
    @Data
    public static class DecodedMapping {
-        public String text;
+        private String text;
-        public int[] charByteStart;
+        private int[] charByteStart;
-        public int[] charByteEnd;
+        private int[] charByteEnd;
    }
    @Data
--- a/app/core/src/main/java/stirling/software/SPDF/utils/text/TextEncodingHelper.java
+++ b/app/core/src/main/java/stirling/software/SPDF/utils/text/TextEncodingHelper.java
@ -5,10 +5,17 @@ import org.apache.pdfbox.pdmodel.font.PDFont;
 import lombok.experimental.UtilityClass;
 import lombok.extern.slf4j.Slf4j;
 import java.util.regex.Pattern;
@Slf4j
@UtilityClass
 public class TextEncodingHelper {
    private final Pattern PATTERN = Pattern.compile("^[A-Z]+$");
    private final Pattern REGEX = Pattern.compile("^[A-Z]{6}\\+.*");
    private final Pattern REGEXP = Pattern.compile("^[A-Z]{5}\\+.*");
    private final Pattern PATTERN1 = Pattern.compile("^[A-Z]{4}\\+.*");
    public boolean canEncodeCharacters(PDFont font, String text) {
        if (font == null || text == null) {
            return false;
@ -421,21 +428,21 @@ public class TextEncodingHelper {
            return false;
        }
-        if (fontName.matches("^[A-Z]{6}\\+.*")) {
+        if (REGEX.matcher(fontName).matches()) {
            return true;
        }
-        if (fontName.matches("^[A-Z]{5}\\+.*")) {
+        if (REGEXP.matcher(fontName).matches()) {
            return true;
        }
-        if (fontName.matches("^[A-Z]{4}\\+.*")) {
+        if (PATTERN1.matcher(fontName).matches()) {
            return true;
        }
        if (fontName.contains("+")) {
            String prefix = fontName.split("\\+")[0];
-            if (prefix.matches("^[A-Z]+$") && prefix.length() >= 4) {
+            if (PATTERN.matcher(prefix).matches() && prefix.length() >= 4) {
                return true;
            }
        }
@ -510,68 +517,4 @@ public class TextEncodingHelper {
        return false;
    }
    /**
     * Probes whether the font can encode at least one character.
     *
     * <p>A fixed list of common letters, digits, punctuation marks and short words is tried
     * first. If none of them encodes, every 100th UTF-16 code unit from 0 through 0xFFFF is
     * tried. A probe succeeds when {@code font.encode} returns a non-empty byte array; an
     * exception raised for a probe counts as a failed probe and the next one is tried.
     *
     * @param font font to probe; may be {@code null}
     * @return {@code true} as soon as one probe encodes to at least one byte; {@code false} when
     *     the font is {@code null} or no probe succeeds
     */
    public boolean canEncodeAnyCharacter(PDFont font) {
        if (font == null) {
            return false;
        }

        String[] probes = {
            "a", "A", "0", " ", ".", "!", "e", "i", "o", "u", "n", "t", "r", "s", "l", "1", "2",
            "3", "4", "5", "6", "7", "8", "9", ",", ".", ";", ":", "?", "!", "(", ")", "[", "]",
            "{", "}", "hello", "test", "sample", "abc", "123", "ABC"
        };
        for (int idx = 0; idx < probes.length; idx++) {
            if (yieldsEncodedBytes(font, probes[idx])) {
                return true;
            }
        }

        // Coarse sweep over the 16-bit range, sampling every 100th code unit.
        int codeUnit = 0;
        while (codeUnit <= 0xFFFF) {
            if (yieldsEncodedBytes(font, String.valueOf((char) codeUnit))) {
                return true;
            }
            codeUnit += 100;
        }
        return false;
    }

    /** Returns whether encoding {@code probe} with {@code font} gives at least one byte. */
    private boolean yieldsEncodedBytes(PDFont font, String probe) {
        try {
            return font.encode(probe).length > 0;
        } catch (Exception ignored) {
            // An unencodable probe is an expected outcome, not an error.
            return false;
        }
    }
    /**
     * Decides whether a font looks usable by trying three independent checks in order and
     * accepting the font as soon as one passes: a non-blank name (after {@code trim()}), {@code
     * canCalculateBasicWidths}, then {@code canEncodeAnyCharacter}.
     *
     * <p>An exception thrown by any check is treated as that check failing; the next check is
     * still attempted.
     *
     * @param font font to inspect; may be {@code null}
     * @return {@code true} if any check passes; {@code false} for a {@code null} font or when all
     *     three checks fail
     */
    public boolean isValidFont(PDFont font) {
        if (font == null) {
            return false;
        }

        boolean hasName;
        try {
            String fontName = font.getName();
            hasName = fontName != null && !fontName.trim().isEmpty();
        } catch (Exception ignored) {
            hasName = false;
        }
        if (hasName) {
            return true;
        }

        boolean widthsWork;
        try {
            widthsWork = canCalculateBasicWidths(font);
        } catch (Exception ignored) {
            widthsWork = false;
        }
        if (widthsWork) {
            return true;
        }

        try {
            return canEncodeAnyCharacter(font);
        } catch (Exception ignored) {
            return false;
        }
    }
 }
--- a/app/core/src/main/java/stirling/software/SPDF/utils/text/WidthCalculator.java
+++ b/app/core/src/main/java/stirling/software/SPDF/utils/text/WidthCalculator.java
@ -80,10 +80,6 @@ public class WidthCalculator {
                Float charWidth =
                        calculateSingleCharacterWidth(font, character, fontSize, codePoint);
                if (charWidth == null) {
                    return null;
                }
                totalWidth += charWidth;
                if (previousCodePoint != -1) {
                    totalWidth += calculateKerning(font, previousCodePoint, codePoint, fontSize);
@ -203,9 +199,6 @@ public class WidthCalculator {
                Float charWidth =
                        calculateGlyphWidthComprehensively(font, character, codePoint, fontSize);
                if (charWidth == null) {
                    return null;
                }
                totalWidth += charWidth;
                i += Character.charCount(codePoint);
@ -514,64 +507,4 @@ public class WidthCalculator {
        return false;
    }
    /**
     * Estimates a lower bound for the rendered width of {@code text}.
     *
     * <p>When {@code calculateAccurateWidth} yields a positive value, 80% of it is returned.
     * Otherwise (non-positive result or an exception) the estimate falls back to {@code
     * text.length() * fontSize * 0.3}.
     *
     * @param font font used for measuring
     * @param text text to measure
     * @param fontSize font size used for measuring
     * @return the lower-bound estimate, or 0 when {@code font} or {@code text} is {@code null},
     *     {@code text} is empty, or {@code fontSize <= 0}
     */
    public float calculateMinimumTextWidth(PDFont font, String text, float fontSize) {
        boolean unusableInput = font == null || text == null || text.isEmpty() || fontSize <= 0;
        if (unusableInput) {
            return 0f;
        }

        // Start from the heuristic and override it only if accurate measurement succeeds.
        float estimate = text.length() * fontSize * 0.3f;
        try {
            float accurate = calculateAccurateWidth(font, text, fontSize);
            if (accurate > 0) {
                estimate = accurate * 0.8f;
            }
        } catch (Exception ignored) {
            // keep the heuristic estimate
        }
        return estimate;
    }
    /**
     * Estimates an upper bound for the rendered width of {@code text}.
     *
     * <p>When {@code calculateAccurateWidth} yields a positive value, 120% of it is returned.
     * Otherwise (non-positive result or an exception) the estimate falls back to {@code
     * text.length() * fontSize * 1.0}.
     *
     * @param font font used for measuring
     * @param text text to measure
     * @param fontSize font size used for measuring
     * @return the upper-bound estimate, or 0 when {@code font} or {@code text} is {@code null},
     *     {@code text} is empty, or {@code fontSize <= 0}
     */
    public float calculateMaximumTextWidth(PDFont font, String text, float fontSize) {
        boolean unusableInput = font == null || text == null || text.isEmpty() || fontSize <= 0;
        if (unusableInput) {
            return 0f;
        }

        // Start from the heuristic and override it only if accurate measurement succeeds.
        float estimate = text.length() * fontSize * 1.0f;
        try {
            float accurate = calculateAccurateWidth(font, text, fontSize);
            if (accurate > 0) {
                estimate = accurate * 1.2f;
            }
        } catch (Exception ignored) {
            // keep the heuristic estimate
        }
        return estimate;
    }
    /**
     * Reports whether a width can be computed for {@code text} with {@code font}.
     *
     * <p>For non-empty text, {@code calculateDirectWidth} is tried at size 12 and, if it returns
     * {@code null} or throws, {@code calculateCharacterByCharacterWidth} is tried as well.
     *
     * <p>NOTE(review): the outcome of those attempts does not influence the result. Every call
     * with non-null arguments returns {@code true}; confirm whether that is intended.
     *
     * @param font font to measure with
     * @param text text to measure
     * @return {@code false} when {@code font} or {@code text} is {@code null}; {@code true}
     *     otherwise
     */
    public boolean canCalculateWidthForText(PDFont font, String text) {
        if (font == null || text == null) {
            return false;
        }

        if (!text.isEmpty()) {
            final float probeSize = 12f;

            boolean directSucceeded;
            try {
                Float directWidth = calculateDirectWidth(font, text, probeSize);
                directSucceeded = directWidth != null;
            } catch (Exception ignored) {
                directSucceeded = false;
            }

            if (!directSucceeded) {
                try {
                    // Still invoked as the fallback attempt; its result is not used.
                    calculateCharacterByCharacterWidth(font, text, probeSize);
                } catch (Exception ignored) {
                    // falls through to the unconditional result below
                }
            }
        }

        return true;
    }
 }
--- a/app/core/src/main/resources/templates/security/auto-redact.html
+++ b/app/core/src/main/resources/templates/security/auto-redact.html
@ -13,20 +13,7 @@
            color: #6c757d !important;
        }
-        .btn-primary:focus {
+        .btn-primary:focus, .form-check-input:focus, .form-control:focus, .form-select:focus {
            box-shadow: 0 0 0 0.2rem rgba(13, 110, 253, 0.25);
            outline: 2px solid #0d6efd;
            outline-offset: 2px;
        }
        .form-check-input:focus {
            box-shadow: 0 0 0 0.2rem rgba(13, 110, 253, 0.25);
            outline: 2px solid #0d6efd;
            outline-offset: 2px;
        }
        .form-control:focus, .form-select:focus {
            border-color: #0d6efd;
            box-shadow: 0 0 0 0.2rem rgba(13, 110, 253, 0.25);
            outline: 2px solid #0d6efd;
            outline-offset: 2px;
@ -36,20 +23,6 @@
            background-color: #0d6efd;
            border-color: #0d6efd;
        }
        .sr-only {
            position: absolute;
            width: 1px;
            height: 1px;
            padding: 0;
            margin: -1px;
            overflow: hidden;
            clip: rect(0, 0, 0, 0);
            white-space: nowrap;
            border: 0;
        }
    </style>
 </head>