From e64bbebfd595d79e70ec5dfcdb8db4caf4c9e555 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20Sz=C3=BCcs?= Date: Wed, 3 Sep 2025 00:16:30 +0200 Subject: [PATCH] improve code formatting and enhance readability in auto-redact.html, RedactionService, TextDecodingHelper, TextEncodingHelper, and TextFinder MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Balázs Szücs --- .../software/SPDF/pdf/TextFinder.java | 37 +- .../SPDF/service/RedactionService.java | 19 +- .../SPDF/utils/text/TextDecodingHelper.java | 20 +- .../SPDF/utils/text/TextEncodingHelper.java | 205 ++----- .../templates/security/auto-redact.html | 531 +++++++++--------- 5 files changed, 344 insertions(+), 468 deletions(-) diff --git a/app/core/src/main/java/stirling/software/SPDF/pdf/TextFinder.java b/app/core/src/main/java/stirling/software/SPDF/pdf/TextFinder.java index c99a2ade7..d25847434 100644 --- a/app/core/src/main/java/stirling/software/SPDF/pdf/TextFinder.java +++ b/app/core/src/main/java/stirling/software/SPDF/pdf/TextFinder.java @@ -3,8 +3,6 @@ package stirling.software.SPDF.pdf; import java.io.IOException; import java.util.ArrayList; import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.text.PDFTextStripper; @@ -71,26 +69,33 @@ public class TextFinder extends PDFTextStripper { super.endPage(page); return; } - String regex = this.useRegex ? processedSearchTerm : "\\Q" + processedSearchTerm + "\\E"; - if (this.wholeWordSearch) { - if (processedSearchTerm.length() == 1 - && Character.isDigit(processedSearchTerm.charAt(0))) { - regex = "(? patterns = + stirling.software.SPDF.utils.text.TextFinderUtils.createOptimizedSearchPatterns( + java.util.Collections.singleton(processedSearchTerm), + this.useRegex, + this.wholeWordSearch); + java.util.regex.Matcher matcher = null; + java.util.regex.Pattern activePattern = null; + for (java.util.regex.Pattern p : patterns) { + matcher = p.matcher(text); + if (matcher + .find()) { // prime by checking has at least one match; we will re-iterate below + activePattern = p; + break; } } - - Pattern pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); - Matcher matcher = pattern.matcher(text); + if (activePattern == null) { + super.endPage(page); + return; + } + matcher = activePattern.matcher(text); log.debug( - "Searching for '{}' in page {} with regex '{}' (wholeWord: {}, useRegex: {})", + "Searching for '{}' in page {} with pattern '{}' (wholeWord: {}, useRegex: {})", processedSearchTerm, getCurrentPageNo(), - regex, + activePattern, wholeWordSearch, useRegex); diff --git a/app/core/src/main/java/stirling/software/SPDF/service/RedactionService.java b/app/core/src/main/java/stirling/software/SPDF/service/RedactionService.java index a73a814b0..0c2879d28 100644 --- a/app/core/src/main/java/stirling/software/SPDF/service/RedactionService.java +++ b/app/core/src/main/java/stirling/software/SPDF/service/RedactionService.java @@ -86,7 +86,7 @@ public class RedactionService { private static final Set TEXT_SHOWING_OPERATORS = Set.of("Tj", "TJ", "'", "\""); private static final COSString EMPTY_COS_STRING = new COSString(""); private static final int MAX_SWEEPS = 3; - private static final Pattern PATTERN = Pattern.compile(".*(hoepap|temp|generated).*"); + private static final Pattern PATTERN = Pattern.compile(".*(placeholder|temp|generated).*"); private boolean aggressiveMode = false; private Map> aggressiveSegMatches = null; private final CustomPDFDocumentFactory pdfDocumentFactory; @@ -2409,11 +2409,9 @@ public class RedactionService { textSegments.indexOf(task.segment), Collections.emptyList()); if (task.segment.tokenIndex >= newTokens.size()) { - continue; } if (task.segment.getText() == null || task.segment.getText().isEmpty()) { - continue; } @@ -2430,19 +2428,22 @@ public class RedactionService { private static String extractStringWithFallbacks(COSString cosString, PDFont font) { if (cosString == null) return ""; - try { + // Prefer font-guided decoding for correctness + if (font != null) { + String enhanced = + TextDecodingHelper.decodeCharactersEnhanced(font, cosString.getBytes()); + if (enhanced != null && !isGibberish(enhanced)) return enhanced; + } + // Fallback to COSString raw string if it seems valid String text = cosString.getString(); if (!text.trim().isEmpty() && !isGibberish(text)) return text; - + // Fallback: try basic font-based extraction if (font != null) { String fontBasedText = tryFontBasedExtraction(cosString, font); if (fontBasedText != null && !isGibberish(fontBasedText)) return fontBasedText; } - - String encodingFallback = tryEncodingFallbacks(cosString); - if (encodingFallback != null && !isGibberish(encodingFallback)) return encodingFallback; - + // Last resort: sanitized raw return sanitizeText(text); } catch (Exception e) { return "\uFFFD"; diff --git a/app/core/src/main/java/stirling/software/SPDF/utils/text/TextDecodingHelper.java b/app/core/src/main/java/stirling/software/SPDF/utils/text/TextDecodingHelper.java index 00c3db099..ab7f862b9 100644 --- a/app/core/src/main/java/stirling/software/SPDF/utils/text/TextDecodingHelper.java +++ b/app/core/src/main/java/stirling/software/SPDF/utils/text/TextDecodingHelper.java @@ -123,9 +123,8 @@ public class TextDecodingHelper { } } catch (Exception ignored) { } - if (ch == null || !isPrintable(ch)) { - // Handle problematic character codes specifically - ch = "�"; + if (ch == null) { + return null; // fail fast if undecodable via font tables } out.append(ch); i += consumed; @@ -250,16 +249,8 @@ public class TextDecodingHelper { } public String handleProblematicCharacterCode(int code, PDFont font) { - if (code >= PROBLEMATIC_CODE_LOWER_BOUND && code <= PROBLEMATIC_CODE_UPPER_BOUND) { - int adjustedCode = code - PROBLEMATIC_CODE_LOWER_BOUND; - if (adjustedCode >= ASCII_LOWER_BOUND) { - return String.valueOf((char) adjustedCode); - } - if (font != null && font.getName() != null && font.getName().contains("+")) { - return mapSubsetCharacter(adjustedCode); - } - } - return "�"; + // For correctness, avoid speculative remapping. Return replacement char only when needed. + return "\uFFFD"; } public String mapSubsetCharacter(int code) { @@ -267,7 +258,8 @@ public class TextDecodingHelper { return String.valueOf((char) code); } if (code >= EXTENDED_ASCII_LOWER_BOUND && code <= EXTENDED_ASCII_UPPER_BOUND) { - return String.valueOf((char) (code - 128)); + // Do not alter code point arbitrarily; extended ASCII maps directly for correctness. + return String.valueOf((char) code); } return null; } diff --git a/app/core/src/main/java/stirling/software/SPDF/utils/text/TextEncodingHelper.java b/app/core/src/main/java/stirling/software/SPDF/utils/text/TextEncodingHelper.java index 1f96f65d3..37021d125 100644 --- a/app/core/src/main/java/stirling/software/SPDF/utils/text/TextEncodingHelper.java +++ b/app/core/src/main/java/stirling/software/SPDF/utils/text/TextEncodingHelper.java @@ -20,80 +20,44 @@ public class TextEncodingHelper { if (font == null || text == null) { return false; } - if (text.isEmpty()) { return true; } - - try { - byte[] encoded = font.encode(text); - if (encoded.length > 0) { - return true; + // Strict: every code point must be encodable by the font + for (int i = 0; i < text.length(); ) { + int cp = text.codePointAt(i); + String ch = new String(Character.toChars(cp)); + try { + byte[] encoded = font.encode(ch); + if (encoded == null || encoded.length == 0) { + return false; + } + } catch (Exception ex) { + return false; } - } catch (Exception e) { + i += Character.charCount(cp); } - - return validateAsCodePointArray(font, text); + return true; } private boolean validateAsCodePointArray(PDFont font, String text) { if (text == null || text.isEmpty()) { return true; } - - int totalCodePoints = 0; - int successfulCodePoints = 0; - for (int i = 0; i < text.length(); ) { int codePoint = text.codePointAt(i); String charStr = new String(Character.toChars(codePoint)); - totalCodePoints++; - try { byte[] charEncoded = font.encode(charStr); - if (charEncoded.length > 0) { - try { - float charWidth = font.getStringWidth(charStr); - if (charWidth >= 0) { - successfulCodePoints++; - } - } catch (Exception e) { - try { - if (canDecodeCharacter(font, charStr)) { - successfulCodePoints++; - } - } catch (Exception e2) { - } - } - } else { - try { - if (canDecodeCharacter(font, charStr)) { - successfulCodePoints++; - } - } catch (Exception e) { - } + if (charEncoded == null || charEncoded.length == 0) { + return false; } } catch (Exception e) { - try { - if (canDecodeCharacter(font, charStr)) { - successfulCodePoints++; - } - } catch (Exception e2) { - if (isBasicCharacter(codePoint)) { - successfulCodePoints++; - } - } + return false; } - i += Character.charCount(codePoint); } - - if (totalCodePoints == 0) { - return true; - } - - double successRate = (double) successfulCodePoints / totalCodePoints; - return successRate >= 0.1; + return true; } private boolean canDecodeCharacter(PDFont font, String charStr) { @@ -128,26 +92,17 @@ public class TextEncodingHelper { if (font == null || text == null) { return false; } - if (text.isEmpty()) { return true; } - - if (isSimpleCharacter(text)) { - try { - font.encode(text); - font.getStringWidth(text); - return true; - } catch (Exception e) { - try { - return canHandleText(font, text); - } catch (Exception e2) { - return false; - } - } + // Strict: removable only if we can encode every codepoint and measure width + if (!canEncodeCharacters(font, text)) return false; + try { + font.getStringWidth(text); + return true; + } catch (Exception e) { + return false; } - - return isTextFullyRemovable(font, text); } private boolean canHandleText(PDFont font, String text) { @@ -197,68 +152,14 @@ public class TextEncodingHelper { } public boolean isTextFullyRemovable(PDFont font, String text) { - if (font == null || text == null) { - return false; - } - - if (text.isEmpty()) { - return true; - } - + if (font == null || text == null) return false; + if (text.isEmpty()) return true; + if (!canEncodeCharacters(font, text)) return false; try { - if (!canEncodeCharacters(font, text)) { - return false; - } - - try { - float width = font.getStringWidth(text); - if (width < 0) { - return false; - } - } catch (Exception e) { - try { - if (!canCalculateTextWidth(font, text)) { - return false; - } - } catch (Exception e2) { - return false; - } - } - - try { - if (font.getFontDescriptor() == null) { - try { - return canHandleWithoutDescriptor(font, text); - } catch (Exception e) { - return false; - } - } - } catch (Exception e) { - try { - return canHandleWithoutDescriptor(font, text); - } catch (Exception e2) { - return false; - } - } - - try { - font.getFontDescriptor().getFontBoundingBox(); - } catch (Exception e) { - try { - return canHandleWithoutBoundingBox(font, text); - } catch (Exception e2) { - return false; - } - } - - return true; - + float width = font.getStringWidth(text); + return width >= 0; } catch (Exception e) { - try { - return canHandleText(font, text); - } catch (Exception e2) { - return false; - } + return false; } } @@ -381,45 +282,19 @@ public class TextEncodingHelper { } public boolean fontSupportsCharacter(PDFont font, String character) { - if (font == null || character == null) { - return false; - } - - if (character.isEmpty()) { - return true; - } - - try { - byte[] encoded = font.encode(character); - if (encoded.length > 0) { - try { - float width = font.getStringWidth(character); - if (width >= 0) { - return true; - } - } catch (Exception e) { - } - return true; - } - } catch (Exception e) { - } - - try { - if (canDecodeCharacter(font, character)) { - return true; - } - } catch (Exception e) { - } - + if (font == null || character == null) return false; + if (character.isEmpty()) return true; for (int i = 0; i < character.length(); ) { - int codePoint = character.codePointAt(i); - if (isBasicCharacter(codePoint)) { - i += Character.charCount(codePoint); - continue; + int cp = character.codePointAt(i); + String ch = new String(Character.toChars(cp)); + try { + byte[] encoded = font.encode(ch); + if (encoded == null || encoded.length == 0) return false; + } catch (Exception e) { + return false; } - return false; + i += Character.charCount(cp); } - return true; } diff --git a/app/core/src/main/resources/templates/security/auto-redact.html b/app/core/src/main/resources/templates/security/auto-redact.html index 7a84abe2b..4f4fd0680 100644 --- a/app/core/src/main/resources/templates/security/auto-redact.html +++ b/app/core/src/main/resources/templates/security/auto-redact.html @@ -3,321 +3,324 @@ xmlns:th="https://www.thymeleaf.org"> - - + .redaction-options-group small.form-text { + margin-left: 1.8rem; /* align with radio */ + } +
-
- -

-
-
-
-
- - - - -
-
-
+
+ +

+
+
+
+
+ + + + +
+ +
-
- - -
+
+ + +
-
-
- - -
-
- - -
-
+
+
+ + +
+
+ + +
+
-
- -
- - - Converts to image with visual redactions for maximum security. -
-
- - - Removes text completely, allowing the surrounding content to shift. This may change the document's original appearance. -
-
- - - Removes the underlying text and places a redaction box in its place, preserving the document's original layout. -
+
+ +
+ + + Converts to image with visual redactions for maximum security. +
+
+ + + Removes text completely, allowing the surrounding content to shift. This may change the document's original appearance. +
+
+ + + Removes the underlying text and places a redaction box in its place, preserving the document's original layout. +
+
-
- - - For maximum security, uses an image-based method to ensure text is unrecoverable. May slightly affect document quality. -
-
+ +
+
+ + +
+ For maximum security, uses an image-based method to ensure text is unrecoverable. May slightly affect document quality. +
-
+
-
- - -
+
+ + +
- + -
- - -
+
+ + +
-
+
-
- -
-
- -
- - - -
- -
-
+
+ Used when OCR restoration is needed
+ + + +
+ +
+
+
- +
+
\ No newline at end of file