enhance text extraction with font support and improved error handling

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
2025-09-08 17:51:20 +02:00 · 2025-08-21 11:48:58 +02:00 · 2025-08-21 11:48:58 +02:00 · 0bbf1dd344
commit 0bbf1dd344
parent 5dc7358219
1 changed files with 304 additions and 51 deletions
--- a/app/core/src/main/java/stirling/software/SPDF/service/RedactionService.java
+++ b/app/core/src/main/java/stirling/software/SPDF/service/RedactionService.java
@ -999,7 +999,7 @@ public class RedactionService {
                    }
                }
                if (isTextShowingOperator(opName) && i > 0) {
-                    String textContent = extractTextFromToken(tokens.get(i - 1), opName);
+                    String textContent = extractTextFromToken(tokens.get(i - 1), opName, gs.font);
                    if (!textContent.isEmpty()) {
                        if (aggressive
                                && gs.font != null
@ -1045,7 +1045,7 @@ public class RedactionService {
                    }
                }
                if (isTextShowingOperator(opName) && i > 0) {
-                    String textContent = extractTextFromToken(tokens.get(i - 1), opName);
+                    String textContent = extractTextFromToken(tokens.get(i - 1), opName, gs.font);
                    if (!textContent.isEmpty()) {
                        segments.add(
                                new TextSegment(
@ -1752,65 +1752,318 @@ public class RedactionService {
    }

    private String extractTextFromToken(Object token, String operatorName) {
-        return switch (operatorName) {
-            case "Tj", "'", "\"" -> {
-                if (token instanceof COSString cosString) {
-                    yield cosString.getString();
-                }
-                yield "";
-            }
-            case "TJ" -> {
-                if (token instanceof COSArray cosArray) {
-                    StringBuilder sb = new StringBuilder();
-                    for (COSBase element : cosArray) {
-                        if (element instanceof COSString cosString) {
-                            sb.append(cosString.getString());
-                        }
-                    }
-                    yield sb.toString();
-                }
-                yield "";
-            }
-            default -> "";
-        };
+        return extractTextFromToken(token, operatorName, null);
    }

-    private WipeResult wipeAllSemanticTextInTokens(List<Object> tokens) {
-        List<Object> newTokens = new ArrayList<>(tokens);
-        int modifications = 0;
-        for (int i = 0; i < newTokens.size(); i++) {
-            Object t = newTokens.get(i);
-            if (t instanceof Operator op) {
-                String name = op.getName();
-                if ("BDC".equals(name) && i > 0) {
-                    Object maybeDict = newTokens.get(i - 1);
-                    if (maybeDict instanceof COSDictionary dict) {
-                        boolean changed = false;
-                        if (dict.containsKey(COSName.getPDFName("ActualText"))) {
-                            dict.removeItem(COSName.getPDFName("ActualText"));
-                            changed = true;
-                        }
-                        if (dict.containsKey(COSName.getPDFName("Alt"))) {
-                            dict.removeItem(COSName.getPDFName("Alt"));
-                            changed = true;
-                        }
-                        if (dict.containsKey(COSName.getPDFName("TU"))) {
-                            dict.removeItem(COSName.getPDFName("TU"));
-                            changed = true;
-                        }
-                        if (changed) {
-                            modifications++;
-                        }
-                    }
+    private String extractTextFromToken(Object token, String operatorName, PDFont currentFont) {
+        if (token == null || operatorName == null) {
+            return "";
+        }
+
+        try {
+            return switch (operatorName) {
+                case "Tj" -> handleTjOperator(token, currentFont);
+                case "'" -> handleSingleQuoteOperator(token, currentFont);
+                case "\"" -> handleDoubleQuoteOperator(token, currentFont);
+                case "TJ" -> handleTJOperator(token, currentFont);
+                default -> "";
+            };
+        } catch (Exception e) {
+            log.warn(
+                    "Failed to extract text from token for operator {}: {}",
+                    operatorName,
+                    e.getMessage());
+            return "";
+        }
+    }
+
+    private String handleTjOperator(Object token, PDFont font) {
+        if (token instanceof COSString cosString) {
+            return extractStringWithFallbacks(cosString, font);
+        }
+        return "";
+    }
+
+    private String handleSingleQuoteOperator(Object token, PDFont font) {
+        if (token instanceof COSString cosString) {
+            return "\n" + extractStringWithFallbacks(cosString, font);
+        }
+        return "\n";
+    }
+
+    private String handleDoubleQuoteOperator(Object token, PDFont font) {
+        if (token instanceof COSString cosString) {
+            return "\n" + extractStringWithFallbacks(cosString, font);
+        }
+        return "\n";
+    }
+
+    private String handleTJOperator(Object token, PDFont font) {
+        if (!(token instanceof COSArray cosArray)) {
+            return "";
+        }
+
+        StringBuilder textBuilder = new StringBuilder();
+
+        for (COSBase element : cosArray) {
+            if (element instanceof COSString cosString) {
+                String extractedText = extractStringWithFallbacks(cosString, font);
+                textBuilder.append(extractedText);
+
+            } else if (element instanceof COSNumber cosNumber) {
+                double displacement = cosNumber.floatValue();
+
+                if (displacement < -100.0) {
+                    textBuilder.append(" "); // Add space for significant gaps
                }
            }
        }
+
+        return textBuilder.toString();
+    }
+
+    private String extractStringWithFallbacks(COSString cosString, PDFont font) {
+        if (cosString == null) {
+            return "";
+        }
+
+        try {
+            String text = cosString.getString();
+
+            if (text != null && !text.trim().isEmpty() && !isGibberish(text)) {
+                return text;
+            }
+
+            // Fallback 1: Try enhanced font-based decoding if available
+            if (font != null) {
+                String fontBasedText = tryFontBasedExtraction(cosString, font);
+                if (fontBasedText != null && !isGibberish(fontBasedText)) {
+                    log.debug("Used font-based fallback extraction");
+                    return fontBasedText;
+                }
+            }
+
+            // Fallback 2: Try different encoding interpretations
+            String encodingFallback = tryEncodingFallbacks(cosString);
+            if (encodingFallback != null && !isGibberish(encodingFallback)) {
+                log.debug("Used encoding fallback extraction");
+                return encodingFallback;
+            }
+
+            // Fallback 3: Return original with sanitization
+            return sanitizeText(text != null ? text : "");
+
+        } catch (Exception e) {
+            log.debug("All extraction methods failed for COSString: {}", e.getMessage());
+            return "\uFFFD"; // Unicode replacement character
+        }
+    }
+
+    private String tryFontBasedExtraction(COSString cosString, PDFont font) {
+        try {
+            byte[] bytes = cosString.getBytes();
+            if (bytes.length == 0) return "";
+
+            StringBuilder result = new StringBuilder();
+            for (byte b : bytes) {
+                int code = b & 0xFF;
+                try {
+                    String unicode = font.toUnicode(code);
+                    if (unicode != null && !unicode.isEmpty()) {
+                        result.append(unicode);
+                    } else {
+                        result.append("\uFFFD");
+                    }
+                } catch (Exception e) {
+                    result.append("\uFFFD");
+                }
+            }
+            return result.toString();
+        } catch (Exception e) {
+            return null;
+        }
+    }
+
+    private String tryEncodingFallbacks(COSString cosString) {
+        try {
+            byte[] bytes = cosString.getBytes();
+            if (bytes.length == 0) return "";
+
+            String[] encodings = {"UTF-8", "UTF-16BE", "UTF-16LE", "ISO-8859-1", "Windows-1252"};
+
+            for (String encoding : encodings) {
+                try {
+                    if (bytes.length >= 2) {
+                        if ((bytes[0] & 0xFF) == 0xFE && (bytes[1] & 0xFF) == 0xFF) {
+                            return new String(bytes, 2, bytes.length - 2, "UTF-16BE");
+                        } else if ((bytes[0] & 0xFF) == 0xFF && (bytes[1] & 0xFF) == 0xFE) {
+                            return new String(bytes, 2, bytes.length - 2, "UTF-16LE");
+                        }
+                    }
+
+                    String decoded = new String(bytes, encoding);
+                    if (!isGibberish(decoded)) {
+                        return decoded;
+                    }
+                } catch (Exception ignored) {
+                }
+            }
+        } catch (Exception e) {
+        }
+        return null;
+    }
+
+    private boolean isGibberish(String text) {
+        if (text == null || text.trim().isEmpty()) {
+            return true;
+        }
+
+        int questionMarks = 0;
+        int replacementChars = 0;
+        int totalChars = text.length();
+
+        for (char c : text.toCharArray()) {
+            if (c == '?') questionMarks++;
+            if (c == '\uFFFD') replacementChars++;
+        }
+
+        double problematicRatio = (double) (questionMarks + replacementChars) / totalChars;
+        return problematicRatio > 0.3;
+    }
+
+    private String sanitizeText(String text) {
+        if (text == null) return "";
+
+        StringBuilder sanitized = new StringBuilder();
+        for (char c : text.toCharArray()) {
+            if (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r') {
+                sanitized.append('\uFFFD');
+            } else {
+                sanitized.append(c);
+            }
+        }
+        return sanitized.toString();
+    }
+
+    private WipeResult wipeAllSemanticTextInTokens(List<Object> tokens) {
+        return wipeAllSemanticTextInTokens(
+                tokens, true); // Default to removing TU for backward compatibility
+    }
+
+    private WipeResult wipeAllSemanticTextInTokens(List<Object> tokens, boolean removeTU) {
+        if (tokens == null || tokens.isEmpty()) {
+            log.warn("Empty or null token list; no modifications made");
+            WipeResult res = new WipeResult();
+            res.tokens = new ArrayList<>();
+            res.modifications = 0;
+            return res;
+        }
+
+        List<Object> newTokens = deepCopyTokens(tokens);
+        int modifications = processSemanticTokens(newTokens, removeTU);
+
        WipeResult res = new WipeResult();
        res.tokens = newTokens;
        res.modifications = modifications;
        return res;
    }

+    private int processSemanticTokens(List<Object> tokens, boolean removeTU) {
+        int modifications = 0;
+        java.util.Stack<Integer> markedContentStack =
+                new java.util.Stack<>(); // Track nesting for correctness
+
+        for (int i = 0; i < tokens.size(); i++) {
+            Object t = tokens.get(i);
+            if (t instanceof Operator op) {
+                String name = op.getName();
+
+                // Handle BDC (with dictionary) and BMC (without, for completeness)
+                if ("BDC".equals(name) || "BMC".equals(name)) {
+                    markedContentStack.push(i); // Track start for nesting validation
+
+                    if ("BDC".equals(name) && i > 0) {
+                        Object prev = tokens.get(i - 1);
+                        if (prev instanceof COSDictionary dict) {
+                            boolean changed = removeSemanticProperties(dict, removeTU);
+                            if (changed) {
+                                modifications++;
+                                log.debug(
+                                        "Removed semantic properties from dictionary at index {}",
+                                        i - 1);
+                            }
+                        } else {
+                            log.warn("BDC at index {} lacks preceding COSDictionary; skipping", i);
+                        }
+                    }
+                } else if ("EMC".equals(name)) {
+                    if (markedContentStack.isEmpty()) {
+                        log.warn(
+                                "Unmatched EMC at index {}; potential malformed content stream", i);
+                    } else {
+                        markedContentStack.pop(); // Validate pairing
+                    }
+                }
+            }
+        }
+
+        if (!markedContentStack.isEmpty()) {
+            log.warn(
+                    "Unmatched marked content starts: {} (potential nesting issues)",
+                    markedContentStack.size());
+        }
+
+        return modifications;
+    }
+
+    private boolean removeSemanticProperties(COSDictionary dict, boolean removeTU) {
+        boolean changed = false;
+        COSName actualText = COSName.getPDFName("ActualText");
+        COSName alt = COSName.getPDFName("Alt");
+        COSName tu = COSName.getPDFName("TU");
+
+        if (dict.containsKey(actualText)) {
+            dict.removeItem(actualText);
+            changed = true;
+        }
+        if (dict.containsKey(alt)) {
+            dict.removeItem(alt);
+            changed = true;
+        }
+        if (removeTU && dict.containsKey(tu)) {
+            dict.removeItem(tu);
+            changed = true;
+            log.info("Removed non-standard TU property (confirm if needed for your PDFs)");
+        }
+
+        return changed;
+    }
+
+    private List<Object> deepCopyTokens(List<Object> original) {
+        List<Object> copy = new ArrayList<>(original.size());
+        for (Object obj : original) {
+            if (obj instanceof COSDictionary dict) {
+                COSDictionary newDict = new COSDictionary();
+                for (COSName key : dict.keySet()) {
+                    newDict.setItem(key, dict.getDictionaryObject(key));
+                }
+                copy.add(newDict);
+            } else if (obj instanceof List<?> nestedList
+                    && !nestedList.isEmpty()
+                    && nestedList.get(0) instanceof Object) {
+                try {
+                    List<Object> objectList = (List<Object>) nestedList;
+                    copy.add(deepCopyTokens(objectList));
+                } catch (ClassCastException e) {
+                    copy.add(obj); // Fallback to shallow copy if cast fails
+                }
+            } else {
+                copy.add(obj); // Shallow copy for primitives/operators
+            }
+        }
+        return copy;
+    }
+
    private int wipeAllTextInResources(PDDocument document, PDResources resources) {
        int totalMods = 0; // aggregated but currently not returned to caller
        try {
@ -2018,7 +2271,7 @@ public class RedactionService {
                    }
                }
                if (isTextShowingOperator(opName) && i > 0) {
-                    String textContent = extractTextFromToken(tokens.get(i - 1), opName);
+                    String textContent = extractTextFromToken(tokens.get(i - 1), opName, gs.font);
                    if (!textContent.isEmpty()) {
                        segments.add(
                                new TextSegment(