Refactor redaction services and utilities for improved readability and maintainability

- Adjusted indentation and formatting across multiple files for consistency.
- Improved import ordering in utility classes for better organization.
- Enhanced the `performTextReplacementAggressive` method with multi-sweep logic to handle residual text more effectively.
- Added helper methods for verifying document text targets to streamline aggressive redaction.
- Simplified logic and formatting in `RedactionService` and related classes.

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
2025-09-08 17:51:20 +02:00 · 2025-08-20 22:45:08 +02:00 · 2025-08-20 22:45:08 +02:00 · 1fac74a3ca
commit 1fac74a3ca
parent 8f19369c58
10 changed files with 441 additions and 396 deletions
--- a/app/core/src/main/java/stirling/software/SPDF/service/RedactionService.java
+++ b/app/core/src/main/java/stirling/software/SPDF/service/RedactionService.java
@ -65,6 +65,7 @@ public class RedactionService {
    private static final int FONT_SCALE_FACTOR = 1000;
    private static final Set<String> TEXT_SHOWING_OPERATORS = Set.of("Tj", "TJ", "'", "\"");
    private static final COSString EMPTY_COS_STRING = new COSString("");
+    private static final int MAX_SWEEPS = 3;
    private static final ThreadLocal<Boolean> AGGRESSIVE_MODE =
            ThreadLocal.withInitial(() -> Boolean.FALSE);
    private static final ThreadLocal<Map<Integer, List<AggressiveSegMatch>>> AGGR_SEG_MATCHES =
@ -268,6 +269,26 @@ public class RedactionService {
        return false;
    }

+    /**
+     * Reports whether any page of the document still matches one of the target terms.
+     *
+     * <p>Pages are checked in index order by delegating to {@code pageStillContainsTargets};
+     * the scan stops at the first page that reports a match.
+     *
+     * @param document the document whose pages are scanned
+     * @param targetWords the search terms passed through to the per-page check
+     * @param useRegex passed through to the per-page check
+     * @param wholeWordSearch passed through to the per-page check
+     * @return {@code true} if a page reports remaining targets or if the scan throws;
+     *     {@code false} only when every page was checked and none reported a match
+     */
+    private static boolean documentStillContainsTargets(
+            PDDocument document,
+            Set<String> targetWords,
+            boolean useRegex,
+            boolean wholeWordSearch) {
+        try {
+            for (int pageIndex = 0; pageIndex < document.getNumberOfPages(); pageIndex++) {
+                boolean pageHasTargets =
+                        pageStillContainsTargets(
+                                document, pageIndex, targetWords, useRegex, wholeWordSearch);
+                if (pageHasTargets) {
+                    return true;
+                }
+            }
+            return false;
+        } catch (Exception ignored) {
+            // A failed scan cannot prove the document is clean, so report it as still
+            // containing targets rather than claiming success.
+            return true;
+        }
+    }
+
    public static Map<Integer, List<PDFText>> findTextToRedact(
            PDDocument document, String[] listOfText, boolean useRegex, boolean wholeWordSearch) {
        Map<Integer, List<PDFText>> allFoundTextsByPage = new HashMap<>();
@ -809,6 +830,8 @@ public class RedactionService {
                        .collect(Collectors.toSet());
        AGGRESSIVE_MODE.set(Boolean.TRUE);
        try {
+            for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) {
+                boolean anyResidual = false;
                int pageIndex = -1;
                for (PDPage page : document.getPages()) {
                    pageIndex++;
@ -816,7 +839,11 @@ public class RedactionService {
                        AGGR_SEG_MATCHES.remove();
                        List<Object> filtered =
                                createTokensWithoutTargetText(
-                            document, page, allSearchTerms, useRegex, wholeWordSearchBool);
+                                        document,
+                                        page,
+                                        allSearchTerms,
+                                        useRegex,
+                                        wholeWordSearchBool);
                        writeFilteredContentStream(document, page, filtered);
                        boolean residual =
                                pageStillContainsTargets(
@ -826,6 +853,7 @@ public class RedactionService {
                                        useRegex,
                                        wholeWordSearchBool);
                        if (residual) {
+                            anyResidual = true;
                            try {
                                var sem = wipeAllSemanticTextInTokens(filtered);
                                filtered = sem.tokens;
@ -842,6 +870,16 @@ public class RedactionService {
                    } catch (Exception ignored) {
                    }
                }
+                // If no residuals detected in this sweep, stop early
+                if (!anyResidual) {
+                    break;
+                }
+                // As a safety, if nothing left in the doc, stop
+                if (!documentStillContainsTargets(
+                        document, allSearchTerms, useRegex, wholeWordSearchBool)) {
+                    break;
+                }
+            }
        } finally {
            AGGRESSIVE_MODE.remove();
        }
@ -862,12 +900,19 @@ public class RedactionService {
                            .map(String::trim)
                            .filter(s -> !s.isEmpty())
                            .collect(Collectors.toSet());
+            for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) {
                for (PDPage page : document.getPages()) {
                    List<Object> filtered =
                            createTokensWithoutTargetText(
                                    document, page, allSearchTerms, useRegex, wholeWordSearchBool);
                    writeFilteredContentStream(document, page, filtered);
                }
+                // Stop early if nothing remains
+                if (!documentStillContainsTargets(
+                        document, allSearchTerms, useRegex, wholeWordSearchBool)) {
+                    break;
+                }
+            }
            return false;
        } catch (Exception e) {
            return true;
@ -1473,7 +1518,8 @@ public class RedactionService {
                                String originalPart =
                                        originalText.substring(
                                                redactionStartInString, redactionEndInString);
-                                if (!Boolean.TRUE.equals(AGGRESSIVE_MODE.get()) && segment.getFont() != null
+                                if (!Boolean.TRUE.equals(AGGRESSIVE_MODE.get())
+                                        && segment.getFont() != null
                                        && !TextEncodingHelper.isTextSegmentRemovable(
                                                segment.getFont(), originalPart)) {
                                    continue;
@ -1514,7 +1560,10 @@ public class RedactionService {
                    }
                    String modifiedString = newText.toString();
                    newArray.add(new COSString(modifiedString));
-                    if (!Boolean.TRUE.equals(AGGRESSIVE_MODE.get()) && modified && segment.getFont() != null && segment.getFontSize() > 0) {
+                    if (!Boolean.TRUE.equals(AGGRESSIVE_MODE.get())
+                            && modified
+                            && segment.getFont() != null
+                            && segment.getFontSize() > 0) {
                        try {
                            float originalWidth =
                                    safeGetStringWidth(segment.getFont(), originalText)
@ -1847,8 +1896,7 @@ public class RedactionService {
        private PDFont font = null;
        private float fontSize = 0;

-        public GraphicsState() {
-        }
+        public GraphicsState() {}
    }

    @Data
--- a/app/core/src/main/java/stirling/software/SPDF/utils/text/TextDecodingHelper.java
+++ b/app/core/src/main/java/stirling/software/SPDF/utils/text/TextDecodingHelper.java
@ -1,17 +1,17 @@
 package stirling.software.SPDF.utils.text;

-import lombok.experimental.UtilityClass;
-import org.apache.pdfbox.cos.COSString;
-import org.apache.pdfbox.pdmodel.font.PDFont;
-import org.apache.pdfbox.pdmodel.font.PDType0Font;
-
-import lombok.extern.slf4j.Slf4j;
-
 import java.nio.ByteBuffer;
 import java.nio.CharBuffer;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.StandardCharsets;

+import org.apache.pdfbox.cos.COSString;
+import org.apache.pdfbox.pdmodel.font.PDFont;
+import org.apache.pdfbox.pdmodel.font.PDType0Font;
+
+import lombok.experimental.UtilityClass;
+import lombok.extern.slf4j.Slf4j;
+
@Slf4j
@UtilityClass
 public class TextDecodingHelper {
@ -89,8 +89,7 @@ public class TextDecodingHelper {
        } catch (Exception ignored) {
        }

-        if (charStr == null
-            && font instanceof PDType0Font type0Font) {
+        if (charStr == null && font instanceof PDType0Font type0Font) {
            try {
                int cid = (bytes.length > 1) ? ((bytes[0] & 0xFF) << 8) | (bytes[1] & 0xFF) : code;
                charStr = type0Font.toUnicode(cid);
@ -143,8 +142,7 @@ public class TextDecodingHelper {
            try {
                if (bytes.length >= 2) {
                    ByteBuffer buffer = ByteBuffer.wrap(bytes);
-                    CharsetDecoder decoder =
-                        StandardCharsets.UTF_16BE.newDecoder();
+                    CharsetDecoder decoder = StandardCharsets.UTF_16BE.newDecoder();
                    CharBuffer charBuffer = decoder.decode(buffer);
                    return charBuffer.toString();
                }
--- a/app/core/src/main/java/stirling/software/SPDF/utils/text/TextEncodingHelper.java
+++ b/app/core/src/main/java/stirling/software/SPDF/utils/text/TextEncodingHelper.java
@ -2,12 +2,12 @@ package stirling.software.SPDF.utils.text;

 import java.io.IOException;

-import lombok.experimental.UtilityClass;
 import org.apache.pdfbox.pdmodel.font.PDFont;
 import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
 import org.apache.pdfbox.pdmodel.font.encoding.DictionaryEncoding;
 import org.apache.pdfbox.pdmodel.font.encoding.Encoding;

+import lombok.experimental.UtilityClass;
 import lombok.extern.slf4j.Slf4j;

@Slf4j
--- a/app/core/src/main/java/stirling/software/SPDF/utils/text/TextFinderUtils.java
+++ b/app/core/src/main/java/stirling/software/SPDF/utils/text/TextFinderUtils.java
@ -5,13 +5,13 @@ import java.util.List;
 import java.util.Set;
 import java.util.regex.Pattern;

-import lombok.experimental.UtilityClass;
 import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.PDResources;
-
-import lombok.extern.slf4j.Slf4j;
 import org.apache.pdfbox.pdmodel.font.PDFont;

+import lombok.experimental.UtilityClass;
+import lombok.extern.slf4j.Slf4j;
+
@Slf4j
@UtilityClass
 public class TextFinderUtils {
--- a/app/core/src/main/java/stirling/software/SPDF/utils/text/WidthCalculator.java
+++ b/app/core/src/main/java/stirling/software/SPDF/utils/text/WidthCalculator.java
@ -1,9 +1,9 @@
 package stirling.software.SPDF.utils.text;

-import lombok.experimental.UtilityClass;
 import org.apache.pdfbox.pdmodel.common.PDRectangle;
 import org.apache.pdfbox.pdmodel.font.PDFont;

+import lombok.experimental.UtilityClass;
 import lombok.extern.slf4j.Slf4j;

@Slf4j
@ -44,8 +44,7 @@ public class WidthCalculator {
        }
    }

-    private float calculateWidthWithCharacterIteration(
-        PDFont font, String text, float fontSize) {
+    private float calculateWidthWithCharacterIteration(PDFont font, String text, float fontSize) {
        try {
            float totalWidth = 0;