improve code formatting and enhance readability in auto-redact.html, RedactionService, TextDecodingHelper, TextEncodingHelper, and TextFinder

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
Balázs Szücs 2025-09-03 00:16:30 +02:00
parent ebe17f4c93
commit e64bbebfd5
5 changed files with 344 additions and 468 deletions

View File

@ -3,8 +3,6 @@ package stirling.software.SPDF.pdf;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.PDFTextStripper;
@ -71,26 +69,33 @@ public class TextFinder extends PDFTextStripper {
super.endPage(page); super.endPage(page);
return; return;
} }
String regex = this.useRegex ? processedSearchTerm : "\\Q" + processedSearchTerm + "\\E"; // Build patterns using unified utility for consistency
if (this.wholeWordSearch) { List<java.util.regex.Pattern> patterns =
if (processedSearchTerm.length() == 1 stirling.software.SPDF.utils.text.TextFinderUtils.createOptimizedSearchPatterns(
&& Character.isDigit(processedSearchTerm.charAt(0))) { java.util.Collections.singleton(processedSearchTerm),
regex = "(?<![\\w])(?<!\\d[\\.,])" + regex + "(?![\\w])(?![\\.,]\\d)"; this.useRegex,
} else if (processedSearchTerm.length() == 1) { this.wholeWordSearch);
regex = "(?<![\\w])" + regex + "(?![\\w])"; java.util.regex.Matcher matcher = null;
} else { java.util.regex.Pattern activePattern = null;
regex = "\\b" + regex + "\\b"; for (java.util.regex.Pattern p : patterns) {
matcher = p.matcher(text);
if (matcher
.find()) { // probe for at least one match; a fresh matcher is created below to iterate all matches
activePattern = p;
break;
} }
} }
if (activePattern == null) {
Pattern pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); super.endPage(page);
Matcher matcher = pattern.matcher(text); return;
}
matcher = activePattern.matcher(text);
log.debug( log.debug(
"Searching for '{}' in page {} with regex '{}' (wholeWord: {}, useRegex: {})", "Searching for '{}' in page {} with pattern '{}' (wholeWord: {}, useRegex: {})",
processedSearchTerm, processedSearchTerm,
getCurrentPageNo(), getCurrentPageNo(),
regex, activePattern,
wholeWordSearch, wholeWordSearch,
useRegex); useRegex);

View File

@ -86,7 +86,7 @@ public class RedactionService {
private static final Set<String> TEXT_SHOWING_OPERATORS = Set.of("Tj", "TJ", "'", "\""); private static final Set<String> TEXT_SHOWING_OPERATORS = Set.of("Tj", "TJ", "'", "\"");
private static final COSString EMPTY_COS_STRING = new COSString(""); private static final COSString EMPTY_COS_STRING = new COSString("");
private static final int MAX_SWEEPS = 3; private static final int MAX_SWEEPS = 3;
private static final Pattern PATTERN = Pattern.compile(".*(hoepap|temp|generated).*"); private static final Pattern PATTERN = Pattern.compile(".*(placeholder|temp|generated).*");
private boolean aggressiveMode = false; private boolean aggressiveMode = false;
private Map<Integer, List<AggressiveSegMatch>> aggressiveSegMatches = null; private Map<Integer, List<AggressiveSegMatch>> aggressiveSegMatches = null;
private final CustomPDFDocumentFactory pdfDocumentFactory; private final CustomPDFDocumentFactory pdfDocumentFactory;
@ -2409,11 +2409,9 @@ public class RedactionService {
textSegments.indexOf(task.segment), Collections.emptyList()); textSegments.indexOf(task.segment), Collections.emptyList());
if (task.segment.tokenIndex >= newTokens.size()) { if (task.segment.tokenIndex >= newTokens.size()) {
continue; continue;
} }
if (task.segment.getText() == null || task.segment.getText().isEmpty()) { if (task.segment.getText() == null || task.segment.getText().isEmpty()) {
continue; continue;
} }
@ -2430,19 +2428,22 @@ public class RedactionService {
private static String extractStringWithFallbacks(COSString cosString, PDFont font) { private static String extractStringWithFallbacks(COSString cosString, PDFont font) {
if (cosString == null) return ""; if (cosString == null) return "";
try { try {
// Prefer font-guided decoding for correctness
if (font != null) {
String enhanced =
TextDecodingHelper.decodeCharactersEnhanced(font, cosString.getBytes());
if (enhanced != null && !isGibberish(enhanced)) return enhanced;
}
// Fallback to COSString raw string if it seems valid
String text = cosString.getString(); String text = cosString.getString();
if (!text.trim().isEmpty() && !isGibberish(text)) return text; if (!text.trim().isEmpty() && !isGibberish(text)) return text;
// Fallback: try basic font-based extraction
if (font != null) { if (font != null) {
String fontBasedText = tryFontBasedExtraction(cosString, font); String fontBasedText = tryFontBasedExtraction(cosString, font);
if (fontBasedText != null && !isGibberish(fontBasedText)) return fontBasedText; if (fontBasedText != null && !isGibberish(fontBasedText)) return fontBasedText;
} }
// Last resort: sanitized raw
String encodingFallback = tryEncodingFallbacks(cosString);
if (encodingFallback != null && !isGibberish(encodingFallback)) return encodingFallback;
return sanitizeText(text); return sanitizeText(text);
} catch (Exception e) { } catch (Exception e) {
return "\uFFFD"; return "\uFFFD";

View File

@ -123,9 +123,8 @@ public class TextDecodingHelper {
} }
} catch (Exception ignored) { } catch (Exception ignored) {
} }
if (ch == null || !isPrintable(ch)) { if (ch == null) {
// Handle problematic character codes specifically return null; // fail fast if undecodable via font tables
ch = "\uFFFD";
} }
out.append(ch); out.append(ch);
i += consumed; i += consumed;
@ -250,16 +249,8 @@ public class TextDecodingHelper {
} }
public String handleProblematicCharacterCode(int code, PDFont font) { public String handleProblematicCharacterCode(int code, PDFont font) {
if (code >= PROBLEMATIC_CODE_LOWER_BOUND && code <= PROBLEMATIC_CODE_UPPER_BOUND) { // For correctness, avoid speculative remapping. Return replacement char only when needed.
int adjustedCode = code - PROBLEMATIC_CODE_LOWER_BOUND; return "\uFFFD";
if (adjustedCode >= ASCII_LOWER_BOUND) {
return String.valueOf((char) adjustedCode);
}
if (font != null && font.getName() != null && font.getName().contains("+")) {
return mapSubsetCharacter(adjustedCode);
}
}
return "\uFFFD";
} }
public String mapSubsetCharacter(int code) { public String mapSubsetCharacter(int code) {
@ -267,7 +258,8 @@ public class TextDecodingHelper {
return String.valueOf((char) code); return String.valueOf((char) code);
} }
if (code >= EXTENDED_ASCII_LOWER_BOUND && code <= EXTENDED_ASCII_UPPER_BOUND) { if (code >= EXTENDED_ASCII_LOWER_BOUND && code <= EXTENDED_ASCII_UPPER_BOUND) {
return String.valueOf((char) (code - 128)); // Do not alter code point arbitrarily; extended ASCII maps directly for correctness.
return String.valueOf((char) code);
} }
return null; return null;
} }

View File

@ -20,82 +20,46 @@ public class TextEncodingHelper {
if (font == null || text == null) { if (font == null || text == null) {
return false; return false;
} }
if (text.isEmpty()) { if (text.isEmpty()) {
return true; return true;
} }
// Strict: every code point must be encodable by the font
for (int i = 0; i < text.length(); ) {
int cp = text.codePointAt(i);
String ch = new String(Character.toChars(cp));
try { try {
byte[] encoded = font.encode(text); byte[] encoded = font.encode(ch);
if (encoded.length > 0) { if (encoded == null || encoded.length == 0) {
return false;
}
} catch (Exception ex) {
return false;
}
i += Character.charCount(cp);
}
return true; return true;
} }
} catch (Exception e) {
}
return validateAsCodePointArray(font, text);
}
private boolean validateAsCodePointArray(PDFont font, String text) { private boolean validateAsCodePointArray(PDFont font, String text) {
if (text == null || text.isEmpty()) { if (text == null || text.isEmpty()) {
return true; return true;
} }
int totalCodePoints = 0;
int successfulCodePoints = 0;
for (int i = 0; i < text.length(); ) { for (int i = 0; i < text.length(); ) {
int codePoint = text.codePointAt(i); int codePoint = text.codePointAt(i);
String charStr = new String(Character.toChars(codePoint)); String charStr = new String(Character.toChars(codePoint));
totalCodePoints++;
try { try {
byte[] charEncoded = font.encode(charStr); byte[] charEncoded = font.encode(charStr);
if (charEncoded.length > 0) { if (charEncoded == null || charEncoded.length == 0) {
try { return false;
float charWidth = font.getStringWidth(charStr);
if (charWidth >= 0) {
successfulCodePoints++;
} }
} catch (Exception e) { } catch (Exception e) {
try { return false;
if (canDecodeCharacter(font, charStr)) {
successfulCodePoints++;
} }
} catch (Exception e2) {
}
}
} else {
try {
if (canDecodeCharacter(font, charStr)) {
successfulCodePoints++;
}
} catch (Exception e) {
}
}
} catch (Exception e) {
try {
if (canDecodeCharacter(font, charStr)) {
successfulCodePoints++;
}
} catch (Exception e2) {
if (isBasicCharacter(codePoint)) {
successfulCodePoints++;
}
}
}
i += Character.charCount(codePoint); i += Character.charCount(codePoint);
} }
if (totalCodePoints == 0) {
return true; return true;
} }
double successRate = (double) successfulCodePoints / totalCodePoints;
return successRate >= 0.1;
}
private boolean canDecodeCharacter(PDFont font, String charStr) { private boolean canDecodeCharacter(PDFont font, String charStr) {
if (font == null || charStr == null || charStr.isEmpty()) { if (font == null || charStr == null || charStr.isEmpty()) {
return false; return false;
@ -128,27 +92,18 @@ public class TextEncodingHelper {
if (font == null || text == null) { if (font == null || text == null) {
return false; return false;
} }
if (text.isEmpty()) { if (text.isEmpty()) {
return true; return true;
} }
// Strict: removable only if we can encode every codepoint and measure width
if (isSimpleCharacter(text)) { if (!canEncodeCharacters(font, text)) return false;
try { try {
font.encode(text);
font.getStringWidth(text); font.getStringWidth(text);
return true; return true;
} catch (Exception e) { } catch (Exception e) {
try {
return canHandleText(font, text);
} catch (Exception e2) {
return false; return false;
} }
} }
}
return isTextFullyRemovable(font, text);
}
private boolean canHandleText(PDFont font, String text) { private boolean canHandleText(PDFont font, String text) {
if (font == null || text == null) { if (font == null || text == null) {
@ -197,70 +152,16 @@ public class TextEncodingHelper {
} }
public boolean isTextFullyRemovable(PDFont font, String text) { public boolean isTextFullyRemovable(PDFont font, String text) {
if (font == null || text == null) { if (font == null || text == null) return false;
return false; if (text.isEmpty()) return true;
} if (!canEncodeCharacters(font, text)) return false;
if (text.isEmpty()) {
return true;
}
try {
if (!canEncodeCharacters(font, text)) {
return false;
}
try { try {
float width = font.getStringWidth(text); float width = font.getStringWidth(text);
if (width < 0) { return width >= 0;
return false;
}
} catch (Exception e) {
try {
if (!canCalculateTextWidth(font, text)) {
return false;
}
} catch (Exception e2) {
return false;
}
}
try {
if (font.getFontDescriptor() == null) {
try {
return canHandleWithoutDescriptor(font, text);
} catch (Exception e) { } catch (Exception e) {
return false; return false;
} }
} }
} catch (Exception e) {
try {
return canHandleWithoutDescriptor(font, text);
} catch (Exception e2) {
return false;
}
}
try {
font.getFontDescriptor().getFontBoundingBox();
} catch (Exception e) {
try {
return canHandleWithoutBoundingBox(font, text);
} catch (Exception e2) {
return false;
}
}
return true;
} catch (Exception e) {
try {
return canHandleText(font, text);
} catch (Exception e2) {
return false;
}
}
}
private boolean canCalculateTextWidth(PDFont font, String text) { private boolean canCalculateTextWidth(PDFont font, String text) {
if (font == null || text == null) { if (font == null || text == null) {
@ -381,45 +282,19 @@ public class TextEncodingHelper {
} }
public boolean fontSupportsCharacter(PDFont font, String character) { public boolean fontSupportsCharacter(PDFont font, String character) {
if (font == null || character == null) { if (font == null || character == null) return false;
return false; if (character.isEmpty()) return true;
}
if (character.isEmpty()) {
return true;
}
try {
byte[] encoded = font.encode(character);
if (encoded.length > 0) {
try {
float width = font.getStringWidth(character);
if (width >= 0) {
return true;
}
} catch (Exception e) {
}
return true;
}
} catch (Exception e) {
}
try {
if (canDecodeCharacter(font, character)) {
return true;
}
} catch (Exception e) {
}
for (int i = 0; i < character.length(); ) { for (int i = 0; i < character.length(); ) {
int codePoint = character.codePointAt(i); int cp = character.codePointAt(i);
if (isBasicCharacter(codePoint)) { String ch = new String(Character.toChars(cp));
i += Character.charCount(codePoint); try {
continue; byte[] encoded = font.encode(ch);
} if (encoded == null || encoded.length == 0) return false;
} catch (Exception e) {
return false; return false;
} }
i += Character.charCount(cp);
}
return true; return true;
} }

View File

@ -115,12 +115,15 @@
<label class="form-check-label" for="keepLayout" th:text="#{autoRedact.keepLayoutLabel}">Remove Text & Cover (Preserve Layout)</label> <label class="form-check-label" for="keepLayout" th:text="#{autoRedact.keepLayoutLabel}">Remove Text & Cover (Preserve Layout)</label>
<small class="form-text text-muted d-block mt-1" id="keep-desc" th:text="#{autoRedact.keepLayoutDescription}">Removes the underlying text and places a redaction box in its place, preserving the document's original layout.</small> <small class="form-text text-muted d-block mt-1" id="keep-desc" th:text="#{autoRedact.keepLayoutDescription}">Removes the underlying text and places a redaction box in its place, preserving the document's original layout.</small>
</div> </div>
<div class="form-check">
<input aria-describedby="guarantee-desc" class="form-check-input" id="guaranteeRedaction" name="convertPDFToImage" type="checkbox">
<label class="form-check-label" for="guaranteeRedaction" th:text="#{autoRedact.pdfImageLabel}">PDF image</label>
<small class="form-text text-muted d-block mt-1" id="guarantee-desc" th:text="#{autoRedact.pdfImageDescription}">For maximum security, uses an image-based method to ensure text is unrecoverable. May slightly affect document quality.</small>
</div> </div>
<!-- PDF Image checkbox - moved outside redaction-options-group to use plain styling -->
<div class="mb-3">
<div class="form-check">
<input aria-describedby="guarantee-desc" class="form-check-input" id="convertPDFToImage" name="convertPDFToImage" type="checkbox">
<label class="form-check-label" for="convertPDFToImage" th:text="#{autoRedact.convertPDFToImageLabel}">Convert PDF to PDF-Image</label>
</div>
<small class="form-text text-muted d-block mt-1" id="guarantee-desc" th:text="#{autoRedact.pdfImageDescription}">For maximum security, uses an image-based method to ensure text is unrecoverable. May slightly affect document quality.</small>
</div> </div>
<br> <br>
@ -154,8 +157,8 @@
<label class="form-label" for="languages">OCR Languages</label> <label class="form-label" for="languages">OCR Languages</label>
<div id="languages"> <div id="languages">
<div class="form-check" th:each="language, iterStat : ${languages}"> <div class="form-check" th:each="language, iterStat : ${languages}">
<input onchange="handleLangSelection()" required th:checked="${language == 'eng'}" th:id="${'language-' + language}" th:name="languages" th:value="${language}" type="checkbox" /> <input class="form-check-input" onchange="handleLangSelection()" required th:checked="${language == 'eng'}" th:id="${'language-' + language}" th:name="languages" th:value="${language}" type="checkbox" />
<label th:attr="data-lang-code=${language}, data-lang-name=#{'lang.' + language}" <label class="form-check-label" th:attr="data-lang-code=${language}, data-lang-name=#{'lang.' + language}"
th:for="${'language-' + language}" th:for="${'language-' + language}"
th:text="${language}"></label> th:text="${language}"></label>
</div> </div>
@ -273,7 +276,7 @@
document.addEventListener('DOMContentLoaded', function () { document.addEventListener('DOMContentLoaded', function () {
const redactionModeRadios = document.querySelectorAll('input[name="redactionMode"]'); const redactionModeRadios = document.querySelectorAll('input[name="redactionMode"]');
const aggressiveModeHidden = document.getElementById('aggressiveMode'); const aggressiveModeHidden = document.getElementById('aggressiveMode');
const guaranteeRedactionCheckbox = document.getElementById('guaranteeRedaction'); const convertPDFToImageCheckbox = document.getElementById('convertPDFToImage');
const defaultColor = document.getElementById('defaultColor'); const defaultColor = document.getElementById('defaultColor');
function updateMode() { function updateMode() {
@ -285,10 +288,10 @@
// Handle PDF image checkbox based on selection // Handle PDF image checkbox based on selection
if (selectedMode.value === 'visual') { if (selectedMode.value === 'visual') {
// Visual mode automatically enables PDF image for maximum security // Visual mode automatically enables PDF image for maximum security
guaranteeRedactionCheckbox.checked = true; convertPDFToImageCheckbox.checked = true;
} else { } else {
// Delete Text and Keep Layout modes disable PDF image // Delete Text and Keep Layout modes disable PDF image
guaranteeRedactionCheckbox.checked = false; convertPDFToImageCheckbox.checked = false;
} }
// Highlight selected card // Highlight selected card