Refactor VisualRedactionService and add TextDecodingHelper class

- Updated VisualRedactionService to improve code readability by adjusting indentation and formatting.
- Introduced a new TextDecodingHelper class to enhance text decoding capabilities for PDF documents.
- Implemented methods for decoding characters with improved handling of various font types and encodings.
- Added fallback mechanisms for character mapping to ensure better text extraction from PDFs.

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
2025-09-08 17:51:20 +02:00 · 2025-08-20 22:35:33 +02:00 · 2025-08-20 22:35:33 +02:00 · f9d2d9bbe5
commit f9d2d9bbe5
parent a5a1a6218c
2 changed files with 242 additions and 117 deletions
--- a/app/core/src/main/java/stirling/software/SPDF/service/RedactionService.java
+++ b/app/core/src/main/java/stirling/software/SPDF/service/RedactionService.java
@ -46,6 +46,7 @@ import stirling.software.SPDF.model.PDFText;
 import stirling.software.SPDF.model.api.security.ManualRedactPdfRequest;
 import stirling.software.SPDF.model.api.security.RedactPdfRequest;
 import stirling.software.SPDF.pdf.TextFinder;
 import stirling.software.SPDF.utils.text.TextDecodingHelper;
 import stirling.software.SPDF.utils.text.TextEncodingHelper;
 import stirling.software.SPDF.utils.text.TextFinderUtils;
 import stirling.software.SPDF.utils.text.WidthCalculator;
@ -337,48 +338,7 @@ public class RedactionService {
        }
    }
-    private static String tryDecodeWithFontEnhanced(PDFont font, COSString cosString) {
+    // Local decoding helpers removed in favor of TextDecodingHelper
        // Best-effort decode of cosString's raw bytes with the given font.
        // Yields null when either argument is null and "" when the string has no bytes.
        try {
            if (font == null || cosString == null) {
                return null;
            }
            byte[] bytes = cosString.getBytes();
            if (bytes.length == 0) {
                return "";
            }
            // Keep the plain decode when it contains no '?', the placeholder tryDecodeWithFont
            // emits for codes it could not map. A genuine '?' in the text also fails this check
            // and sends the string through the per-byte path below.
            String basicDecoded = tryDecodeWithFont(font, cosString);
            if (basicDecoded != null && !basicDecoded.contains("?")) {
                return basicDecoded;
            }
            // Otherwise rebuild the text one byte at a time.
            StringBuilder out = new StringBuilder();
            for (byte aByte : bytes) {
                int code = aByte & 0xFF; // read the byte as an unsigned code (0-255)
                String charStr = null;
                try {
                    charStr = font.toUnicode(code);
                } catch (Exception ignored) {
                    // a failing lookup is handled like a missing mapping
                }
                // Fonts whose name contains '+' get the mapSubsetCharacter guess
                // (presumably '+' marks a subset-font tag prefix; TODO confirm).
                if (charStr == null && font.getName() != null && font.getName().contains("+")) {
                    charStr = mapSubsetCharacter(code);
                }
                // Codes that are still unmapped are dropped here rather than shown as '?'.
                out.append(charStr != null ? charStr : "");
            }
            return out.toString();
        } catch (Exception e) {
            // Any unexpected failure falls back to the plain decode.
            return tryDecodeWithFont(font, cosString);
        }
    }
    /**
     * Guesses a character for a code that the font itself could not map.
     *
     * <p>Codes 32-126 are returned as the character with the same value. Codes 160-255 are
     * shifted down by 128 first, so they land on 32-127. Every other code has no guess.
     *
     * @param code character code taken from the content stream
     * @return a one-character string, or {@code null} when the code is outside both ranges
     */
    private static String mapSubsetCharacter(int code) {
        boolean printableAscii = code >= 32 && code <= 126;
        boolean upperRange = code >= 160 && code <= 255;
        if (!printableAscii && !upperRange) {
            return null;
        }
        int mapped = printableAscii ? code : code - 128;
        return String.valueOf((char) mapped);
    }
    private static String normalizeForFuzzy(String s) {
        if (s == null) {
@ -632,71 +592,6 @@ public class RedactionService {
        return text.length() * 500f;
    }
    /**
     * Decodes the raw bytes of a PDF string operand through the font's Unicode mapping.
     *
     * <p>Two passes are made. The first treats every byte as its own code; if at least one byte
     * maps, that result is returned with {@code '?'} standing in for the bytes that did not. The
     * second pass runs only when no single byte mapped: it walks the bytes again and prefers a
     * big-endian two-byte code wherever the font maps one.
     *
     * @param font font used to look up each code
     * @param cosString string operand whose bytes are decoded
     * @return the decoded text, {@code ""} for an empty string, or {@code null} when an argument
     *     is {@code null}, nothing could be mapped, or an unexpected error occurred
     */
    private static String tryDecodeWithFont(PDFont font, COSString cosString) {
        if (font == null || cosString == null) {
            return null;
        }
        try {
            byte[] raw = cosString.getBytes();
            if (raw.length == 0) {
                return "";
            }

            // Pass 1: one byte per code.
            StringBuilder decoded = new StringBuilder();
            boolean mappedAny = false;
            for (int idx = 0; idx < raw.length; idx++) {
                String single = null;
                try {
                    single = font.toUnicode(raw[idx] & 0xFF);
                } catch (Exception ignored) {
                }
                if (single == null) {
                    decoded.append('?');
                    continue;
                }
                decoded.append(single);
                mappedAny = true;
            }
            if (mappedAny) {
                return decoded.toString();
            }

            // Pass 2: nothing mapped as single bytes, so try big-endian byte pairs.
            decoded.setLength(0);
            boolean pairMapped = false;
            int pos = 0;
            while (pos < raw.length) {
                int high = raw[pos] & 0xFF;
                String single = null;
                try {
                    single = font.toUnicode(high);
                } catch (Exception ignored) {
                }

                String pair = null;
                if (pos + 1 < raw.length) {
                    int wide = (high << 8) | (raw[pos + 1] & 0xFF);
                    try {
                        pair = font.toUnicode(wide);
                    } catch (Exception ignored) {
                    }
                }

                if (pair != null) {
                    decoded.append(pair);
                    pairMapped = true;
                    pos += 2;
                } else {
                    decoded.append(single != null ? single : "?");
                    pos += 1;
                }
            }
            return pairMapped ? decoded.toString() : null;
        } catch (Exception e) {
            return null;
        }
    }
    private static WipeResult wipeAllTextShowingOperators(List<Object> tokens) {
        List<Object> newTokens = new ArrayList<>(tokens);
        int modifications = 0;
@ -1062,7 +957,7 @@ public class RedactionService {
                        if (aggressive
                            && gs.font != null
                            && tokens.get(i - 1) instanceof COSString cs) {
-                            tryDecodeWithFontEnhanced(gs.font, cs);
+                            TextDecodingHelper.tryDecodeWithFontEnhanced(gs.font, cs);
                        }
                        segments.add(
                            new TextSegment(
@ -1175,12 +1070,12 @@ public class RedactionService {
                    || "'".equals(seg.getOperatorName())
                    || "\"".equals(seg.getOperatorName()))
                    && tok instanceof COSString cs) {
-                    decoded = tryDecodeWithFont(seg.getFont(), cs);
+                    decoded = TextDecodingHelper.tryDecodeWithFont(seg.getFont(), cs);
                } else if ("TJ".equals(seg.getOperatorName()) && tok instanceof COSArray arr) {
                    StringBuilder sb = new StringBuilder();
                    for (COSBase el : arr) {
                        if (el instanceof COSString s) {
-                            String d = tryDecodeWithFont(seg.getFont(), s);
+                            String d = TextDecodingHelper.tryDecodeWithFont(seg.getFont(), s);
                            sb.append(d != null ? d : s.getString());
                        }
                    }
@ -1272,12 +1167,12 @@ public class RedactionService {
                Object tok = tokens.get(seg.getTokenIndex());
                if (("Tj".equals(seg.getOperatorName()) || "'".equals(seg.getOperatorName()))
                    && tok instanceof COSString cs) {
-                    decoded = tryDecodeWithFont(seg.getFont(), cs);
+                    decoded = TextDecodingHelper.tryDecodeWithFont(seg.getFont(), cs);
                } else if ("TJ".equals(seg.getOperatorName()) && tok instanceof COSArray arr) {
                    StringBuilder sb = new StringBuilder();
                    for (COSBase el : arr) {
                        if (el instanceof COSString s) {
-                            String d = tryDecodeWithFont(seg.getFont(), s);
+                            String d = TextDecodingHelper.tryDecodeWithFont(seg.getFont(), s);
                            sb.append(d != null ? d : s.getString());
                        }
                    }
@ -1715,7 +1610,7 @@ public class RedactionService {
    }
    private int wipeAllTextInResources(PDDocument document, PDResources resources) {
-        int totalMods = 0;
+        int totalMods = 0; // aggregated but currently not returned to caller
        try {
            totalMods += wipeAllSemanticTextInProperties(resources);
            for (COSName xobjName : resources.getXObjectNames()) {
@ -1776,7 +1671,6 @@ public class RedactionService {
    }
    private void wipeAllTextInPatterns(PDDocument document, PDResources resources) {
        int totalMods = 0;
        try {
            for (COSName patName : resources.getPatternNames()) {
                try {
@ -1786,7 +1680,7 @@ public class RedactionService {
                        org.apache.pdfbox.pdmodel.graphics.pattern.PDTilingPattern tiling) {
                        PDResources patRes = tiling.getResources();
                        if (patRes != null) {
-                            totalMods += wipeAllTextInResources(document, patRes);
+                            wipeAllTextInResources(document, patRes);
                        }
                        PDFStreamParser parser = new PDFStreamParser(tiling);
                        List<Object> tokens = new ArrayList<>();
@ -1795,9 +1689,7 @@ public class RedactionService {
                            tokens.add(token);
                        }
                        WipeResult wrText = wipeAllTextShowingOperators(tokens);
                        totalMods += wrText.modifications;
                        WipeResult wrSem = wipeAllSemanticTextInTokens(wrText.tokens);
                        totalMods += wrSem.modifications;
                        if (wrText.modifications > 0 || wrSem.modifications > 0) {
                            writeRedactedContentToPattern(tiling, wrSem.tokens);
                        }
@ -1809,6 +1701,7 @@ public class RedactionService {
        }
    }
    @SuppressWarnings("unused")
    private int wipeAllTextInAnnotations(PDDocument document, PDPage page) {
        int totalMods = 0;
        try {
--- a/app/core/src/main/java/stirling/software/SPDF/utils/text/TextDecodingHelper.java
+++ b/app/core/src/main/java/stirling/software/SPDF/utils/text/TextDecodingHelper.java
@ -0,0 +1,232 @@
 package stirling.software.SPDF.utils.text;
 import org.apache.pdfbox.cos.COSString;
 import org.apache.pdfbox.pdmodel.font.PDFont;
 import org.apache.pdfbox.pdmodel.font.PDType0Font;
 import lombok.extern.slf4j.Slf4j;
/**
 * Static helpers that turn the raw bytes of a PDF string operand into text using the font that
 * shows it.
 *
 * <p>All methods are best-effort: they return {@code null} (or a string containing {@code '?'}
 * placeholders) instead of throwing when the font cannot map a code.
 */
@Slf4j
public final class TextDecodingHelper {

    private static final int ASCII_LOWER_BOUND = 32;
    private static final int ASCII_UPPER_BOUND = 126;
    private static final int EXTENDED_ASCII_LOWER_BOUND = 160;
    private static final int EXTENDED_ASCII_UPPER_BOUND = 255;

    /** Bytes at or above this value may be the first byte of a two-byte code. */
    private static final int HIGH_BYTE_LOWER_BOUND = 128;

    private TextDecodingHelper() {
        // static utility class; not meant to be instantiated
    }

    /**
     * Decodes a string operand, falling back to the heuristic per-character decoder when the
     * plain font decode is missing, blank, or contains {@code '?'} placeholders.
     *
     * <p>A literal question mark in the text also contains {@code '?'}, so such strings take the
     * heuristic path as well.
     *
     * @param font font used to look up each code; may be {@code null}
     * @param cosString string operand to decode; may be {@code null}
     * @return the decoded text, {@code ""} for an empty operand, or {@code null} when an argument
     *     is {@code null} or nothing could be decoded
     */
    public static String tryDecodeWithFontEnhanced(PDFont font, COSString cosString) {
        if (font == null || cosString == null) {
            return null;
        }

        try {
            byte[] bytes = cosString.getBytes();
            if (bytes.length == 0) {
                return "";
            }

            String basicDecoded = tryDecodeWithFont(font, cosString);
            if (basicDecoded != null
                    && !basicDecoded.contains("?")
                    && !basicDecoded.trim().isEmpty()) {
                return basicDecoded;
            }

            String enhanced = decodeCharactersEnhanced(font, bytes);
            // Prefer the heuristic result, but never lose a partial plain decode.
            return enhanced != null ? enhanced : basicDecoded;
        } catch (Exception e) {
            log.error("Decoding failed: {}", e.getMessage(), e);
            // tryDecodeWithFont swallows its own failures and returns null in that case.
            return tryDecodeWithFont(font, cosString);
        }
    }

    /**
     * Decodes {@code bytes} code by code using {@link #decodeSingleCharacter}.
     *
     * <p>Each byte is first tried as a one-byte code. If that fails and the byte is 128 or
     * above, it is combined big-endian with the following byte and tried as a two-byte code.
     * Codes that stay unmapped are written as {@code '?'}.
     *
     * @param font font used to look up each code
     * @param bytes raw bytes of the string operand
     * @return the decoded text, or {@code null} when an argument is {@code null} or not a single
     *     code could be mapped
     */
    public static String decodeCharactersEnhanced(PDFont font, byte[] bytes) {
        if (font == null || bytes == null) {
            return null;
        }

        StringBuilder out = new StringBuilder(bytes.length);
        boolean hasValidCharacters = false;
        int i = 0;
        while (i < bytes.length) {
            int code = bytes[i] & 0xFF;
            String charStr = decodeSingleCharacter(font, code, bytes);

            // Heuristic for multi-byte codes: a high byte that did not map on its own may be
            // the first half of a two-byte code.
            if (charStr == null && code >= HIGH_BYTE_LOWER_BOUND && i + 1 < bytes.length) {
                int combinedCode = (code << 8) | (bytes[i + 1] & 0xFF);
                String combined = decodeSingleCharacter(font, combinedCode, bytes);
                if (combined != null) {
                    out.append(combined);
                    hasValidCharacters = true;
                    i += 2; // both bytes were consumed
                    continue;
                }
            }

            if (charStr != null && !charStr.isEmpty()) {
                out.append(charStr);
                hasValidCharacters = true;
            } else {
                out.append('?');
            }
            i++;
        }
        return hasValidCharacters ? out.toString() : null;
    }

    /**
     * Maps one character code to text, trying progressively weaker sources: the font's own
     * Unicode mapping, then {@link #mapSubsetCharacter} for fonts whose name contains
     * {@code '+'}, then {@link #fallbackCharacterMapping}.
     *
     * @param font font used for the lookup
     * @param code character code to map
     * @param bytes raw bytes of the string the code was taken from; only forwarded to
     *     {@link #fallbackCharacterMapping}
     * @return the mapped text, or {@code null} when no source produced a mapping
     */
    public static String decodeSingleCharacter(PDFont font, int code, byte[] bytes) {
        // font.toUnicode dispatches to the concrete font class, so composite (Type0) fonts are
        // already covered by this single lookup; repeating it for them cannot change the result.
        String charStr = toUnicodeOrNull(font, code);

        if (charStr == null) {
            String fontName = font != null ? font.getName() : null;
            if (fontName != null && fontName.contains("+")) {
                charStr = mapSubsetCharacter(code);
            }
        }
        if (charStr == null) {
            charStr = fallbackCharacterMapping(code, bytes, font);
        }
        return charStr;
    }

    /**
     * Last-resort guess for a code that neither the font nor the subset mapping could resolve.
     *
     * <p>No guess is made for a {@link PDType0Font} unless the source string is exactly one byte
     * long. Otherwise codes 32-126 and 160-255 map to the character with the same value, and
     * any larger code up to 0xFFFF is read as one UTF-16 code unit unless it is a surrogate.
     * The guess always covers {@code code} alone, never the rest of {@code bytes}.
     *
     * @param code character code to map
     * @param bytes raw bytes of the string the code was taken from; only its length is used
     * @param font font the code belongs to; may be {@code null}
     * @return a one-character string, or {@code null} when no guess applies
     */
    public static String fallbackCharacterMapping(int code, byte[] bytes, PDFont font) {
        if (font instanceof PDType0Font && (bytes == null || bytes.length > 1)) {
            return null;
        }

        boolean printableAscii = code >= ASCII_LOWER_BOUND && code <= ASCII_UPPER_BOUND;
        boolean extendedAscii =
                code >= EXTENDED_ASCII_LOWER_BOUND && code <= EXTENDED_ASCII_UPPER_BOUND;
        if (printableAscii || extendedAscii) {
            return String.valueOf((char) code);
        }

        // UTF-16BE guess for two-byte codes. This range includes the CJK ideographs
        // (0x4E00-0x9FFF), so no separate font-name check is needed for them.
        if (code > EXTENDED_ASCII_UPPER_BOUND
                && code <= Character.MAX_VALUE
                && !Character.isSurrogate((char) code)) {
            return String.valueOf((char) code);
        }
        return null;
    }

    /**
     * Guesses a character for a code of a font whose name contains {@code '+'}.
     *
     * <p>Codes 32-126 map to the character with the same value; codes 160-255 are shifted down
     * by 128 first (so 255 becomes 127).
     *
     * @param code character code to map
     * @return a one-character string, or {@code null} when the code is outside both ranges
     */
    public static String mapSubsetCharacter(int code) {
        if (code >= ASCII_LOWER_BOUND && code <= ASCII_UPPER_BOUND) {
            return String.valueOf((char) code);
        }
        if (code >= EXTENDED_ASCII_LOWER_BOUND && code <= EXTENDED_ASCII_UPPER_BOUND) {
            return String.valueOf((char) (code - 128));
        }
        return null;
    }

    /**
     * Decodes a string operand using only the font's own Unicode mapping.
     *
     * <p>Every byte is first tried as a one-byte code. If at least one maps, that result is
     * returned with {@code '?'} for the unmapped bytes. Only when no byte maps are big-endian
     * two-byte codes tried.
     *
     * @param font font used to look up each code; may be {@code null}
     * @param cosString string operand to decode; may be {@code null}
     * @return the decoded text, {@code ""} for an empty operand, or {@code null} when an argument
     *     is {@code null}, nothing could be mapped, or an unexpected error occurred
     */
    public static String tryDecodeWithFont(PDFont font, COSString cosString) {
        if (font == null || cosString == null) {
            return null;
        }
        try {
            byte[] bytes = cosString.getBytes();
            if (bytes.length == 0) {
                return "";
            }
            String singleByte = decodeOneByteCodes(font, bytes);
            return singleByte != null ? singleByte : decodeTwoByteCodes(font, bytes);
        } catch (Exception e) {
            return null;
        }
    }

    /** Decodes every byte as its own code; returns null when not a single byte maps. */
    private static String decodeOneByteCodes(PDFont font, byte[] bytes) {
        StringBuilder out = new StringBuilder(bytes.length);
        boolean anyMapped = false;
        for (byte b : bytes) {
            String uni = toUnicodeOrNull(font, b & 0xFF);
            if (uni != null) {
                out.append(uni);
                anyMapped = true;
            } else {
                out.append('?');
            }
        }
        return anyMapped ? out.toString() : null;
    }

    /** Decodes preferring big-endian byte pairs; returns null when no pair maps. */
    private static String decodeTwoByteCodes(PDFont font, byte[] bytes) {
        StringBuilder out = new StringBuilder(bytes.length);
        boolean anyMapped = false;
        int i = 0;
        while (i < bytes.length) {
            int high = bytes[i] & 0xFF;
            if (i + 1 < bytes.length) {
                String pair = toUnicodeOrNull(font, (high << 8) | (bytes[i + 1] & 0xFF));
                if (pair != null) {
                    out.append(pair);
                    anyMapped = true;
                    i += 2;
                    continue;
                }
            }
            String single = toUnicodeOrNull(font, high);
            out.append(single != null ? single : "?");
            i++;
        }
        return anyMapped ? out.toString() : null;
    }

    /** Looks up a code in the font, treating any lookup failure as "no mapping". */
    private static String toUnicodeOrNull(PDFont font, int code) {
        try {
            return font.toUnicode(code);
        } catch (Exception ignored) {
            // Best effort: a null font or broken mapping data counts as an unmapped code.
            return null;
        }
    }
}