cleanup, remove bloat

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
Balázs Szücs 2025-08-22 22:50:17 +02:00
parent eb0fbcdfa3
commit 48967f7061

View File

@ -3,6 +3,7 @@ package stirling.software.SPDF.service;
import java.awt.Color; import java.awt.Color;
import java.io.ByteArrayOutputStream; import java.io.ByteArrayOutputStream;
import java.io.IOException; import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collections; import java.util.Collections;
@ -708,8 +709,7 @@ public class RedactionService {
private static String tryFontBasedExtraction(COSString cosString, PDFont font) { private static String tryFontBasedExtraction(COSString cosString, PDFont font) {
try { try {
String decoded = TextDecodingHelper.tryDecodeWithFont(font, cosString); return TextDecodingHelper.tryDecodeWithFont(font, cosString);
return decoded;
} catch (Exception e) { } catch (Exception e) {
return null; return null;
} }
@ -888,56 +888,20 @@ public class RedactionService {
} }
} }
String createPlaceholderWithFont(String originalWord, PDFont font) { private static float calculateCharacterSumWidth(PDFont font, String text) {
if (originalWord == null || originalWord.isEmpty()) return " "; float totalWidth = 0f;
for (char c : text.toCharArray()) {
if (font != null && TextEncodingHelper.isFontSubset(font.getName())) {
try { try {
// Use helper to get accurate width at fontSize=1.0 totalWidth += font.getStringWidth(String.valueOf(c));
float originalWidth =
WidthCalculator.calculateAccurateWidth(font, originalWord, 1.0f);
String result =
createAlternativePlaceholder(originalWord, originalWidth, font, 1.0f);
return result != null ? result : " ".repeat(Math.max(1, originalWord.length()));
} catch (Exception e) { } catch (Exception e) {
return " ".repeat(Math.max(1, originalWord.length())); return -1f;
} }
} }
return totalWidth;
}
return " ".repeat(Math.max(1, originalWord.length())); private static boolean isValidTokenIndex(List<Object> tokens, int index) {
} return index >= 0 && index < tokens.size();
String createPlaceholderWithWidth(
String originalWord, float targetWidth, PDFont font, float fontSize) {
if (originalWord == null || originalWord.isEmpty()) return " ";
if (font == null || fontSize <= 0) return " ".repeat(Math.max(1, originalWord.length()));
if (!WidthCalculator.isWidthCalculationReliable(font))
return " ".repeat(Math.max(1, originalWord.length()));
if (TextEncodingHelper.isFontSubset(font.getName())) {
String result = createSubsetFontPlaceholder(originalWord, targetWidth, font, fontSize);
return result != null
? result
: " ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1));
}
try {
float spaceWidth = WidthCalculator.calculateAccurateWidth(font, " ", fontSize);
if (spaceWidth <= 0) {
return createAlternativePlaceholder(originalWord, targetWidth, font, fontSize);
}
int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth));
int maxSpaces =
Math.max(
originalWord.length() * 2, Math.round(targetWidth / spaceWidth * 1.5f));
return " ".repeat(Math.min(spaceCount, maxSpaces));
} catch (Exception e) {
String result = createAlternativePlaceholder(originalWord, targetWidth, font, fontSize);
return result != null
? result
: " ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1));
}
} }
private String createSubsetFontPlaceholder( private String createSubsetFontPlaceholder(
@ -1020,35 +984,16 @@ public class RedactionService {
} }
} }
private String createAlternativePlaceholder( private static boolean isValidTokenForOperator(Object token, String operatorName) {
String originalWord, float targetWidth, PDFont font, float fontSize) { if (token == null || operatorName == null) {
try { return false;
String[] alternatives = {" ", ".", "-", "_", "~", "°", "·"};
if (TextEncodingHelper.fontSupportsCharacter(font, " ")) {
float spaceWidth = WidthCalculator.calculateAccurateWidth(font, " ", fontSize);
if (spaceWidth > 0) {
int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth));
int maxSpaces = originalWord.length() * 2;
return " ".repeat(Math.min(spaceCount, maxSpaces));
}
}
for (String alt : alternatives) {
if (" ".equals(alt)) continue;
try {
if (!TextEncodingHelper.fontSupportsCharacter(font, alt)) continue;
float cw = WidthCalculator.calculateAccurateWidth(font, alt, fontSize);
if (cw > 0) {
int count = Math.max(1, Math.round(targetWidth / cw));
int max = originalWord.length() * 2;
return " ".repeat(Math.min(count, max));
}
} catch (Exception ignored) {
}
}
return " ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1));
} catch (Exception e) {
return " ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1));
} }
return switch (operatorName) {
case "Tj", "'", "\"" -> token instanceof COSString;
case "TJ" -> token instanceof COSArray;
default -> true;
};
} }
private List<TextSegment> extractTextSegments( private List<TextSegment> extractTextSegments(
@ -1257,199 +1202,25 @@ public class RedactionService {
} }
} }
private List<MatchRange> findAllMatchesAggressive( private static int getActualStringLength(COSString cosString, PDFont font) {
List<TextSegment> segments,
List<Object> tokens,
Set<String> targetWords,
boolean useRegex,
boolean wholeWordSearch) {
List<Pattern> patterns =
TextFinderUtils.createOptimizedSearchPatterns(
targetWords, useRegex, wholeWordSearch);
List<MatchRange> result = new ArrayList<>();
Map<Integer, List<AggressiveSegMatch>> perSegMatches = new HashMap<>();
try { try {
String completeText = buildCompleteText(segments); if (font == null) return cosString.getString().length();
if (!completeText.isEmpty()) { String decodedText = TextDecodingHelper.tryDecodeWithFont(font, cosString);
List<MatchRange> global = return decodedText != null ? decodedText.length() : cosString.getString().length();
findAllMatches(completeText, targetWords, useRegex, wholeWordSearch); } catch (Exception e) {
if (!global.isEmpty()) { return cosString.getString().length();
result.addAll(global);
} else if (!useRegex && !targetWords.isEmpty()) {
String lower = completeText.toLowerCase();
for (String word : targetWords) {
String w = word.toLowerCase();
int idx = lower.indexOf(w);
while (idx >= 0) {
result.add(new MatchRange(idx, idx + w.length()));
idx = lower.indexOf(w, idx + 1);
} }
} }
}
}
} catch (Exception ignored) {
}
List<String> decodedPerSegment = new ArrayList<>(segments.size()); private static float calculateSafeWidth(String text, PDFont font, float fontSize) {
List<Integer> decStarts = new ArrayList<>(segments.size());
List<Integer> decEnds = new ArrayList<>(segments.size());
int decCursor = 0;
for (TextSegment seg : segments) {
String decoded = null;
try { try {
Object tok = tokens.get(seg.tokenIndex); if (font != null && fontSize > 0) {
if (("Tj".equals(seg.operatorName) return WidthCalculator.calculateAccurateWidth(font, text, fontSize);
|| "'".equals(seg.operatorName)
|| "\"".equals(seg.operatorName))
&& tok instanceof COSString cs) {
decoded = TextDecodingHelper.tryDecodeWithFont(seg.font, cs);
} else if ("TJ".equals(seg.operatorName) && tok instanceof COSArray arr) {
StringBuilder sb = new StringBuilder();
for (COSBase el : arr) {
if (el instanceof COSString s) {
String d = TextDecodingHelper.tryDecodeWithFont(seg.font, s);
sb.append(d != null ? d : s.getString());
} }
} catch (Exception e) {
// Width calculation failed
} }
decoded = sb.toString(); return 0f;
}
} catch (Exception ignored) {
}
String basis = (decoded != null) ? decoded : seg.getText();
decodedPerSegment.add(basis);
decStarts.add(decCursor);
decCursor += basis.length();
decEnds.add(decCursor);
}
StringBuilder decodedCompleteSb = new StringBuilder();
for (String d : decodedPerSegment) {
decodedCompleteSb.append(d);
}
String decodedComplete = decodedCompleteSb.toString();
if (!decodedComplete.isEmpty()) {
List<Pattern> patternsDec =
TextFinderUtils.createOptimizedSearchPatterns(
targetWords, useRegex, wholeWordSearch);
for (Pattern p : patternsDec) {
try {
var m = p.matcher(decodedComplete);
while (m.find()) {
int gStart = m.start();
int gEnd = m.end();
for (int sIdx = 0; sIdx < segments.size(); sIdx++) {
int sStart = decStarts.get(sIdx);
int sEnd = decEnds.get(sIdx);
int ovStart = Math.max(gStart, sStart);
int ovEnd = Math.min(gEnd, sEnd);
if (ovStart < ovEnd) {
int localStart = ovStart - sStart;
int localEnd = ovEnd - sStart;
perSegMatches
.computeIfAbsent(sIdx, k -> new ArrayList<>())
.add(new AggressiveSegMatch(sIdx, localStart, localEnd));
TextSegment seg = segments.get(sIdx);
int mappedStart = seg.getStartPos();
int mappedEnd = Math.min(seg.getEndPos(), seg.getStartPos() + 1);
result.add(new MatchRange(mappedStart, mappedEnd));
}
}
}
} catch (Exception ignored) {
}
}
if (perSegMatches.isEmpty() && !useRegex && !targetWords.isEmpty()) {
String lower = decodedComplete.toLowerCase();
for (String word : targetWords) {
String w = word.toLowerCase();
int idx = lower.indexOf(w);
while (idx >= 0) {
int gStart = idx;
int gEnd = idx + w.length();
for (int sIdx = 0; sIdx < segments.size(); sIdx++) {
int sStart = decStarts.get(sIdx);
int sEnd = decEnds.get(sIdx);
int ovStart = Math.max(gStart, sStart);
int ovEnd = Math.min(gEnd, sEnd);
if (ovStart < ovEnd) {
int localStart = ovStart - sStart;
int localEnd = ovEnd - sStart;
perSegMatches
.computeIfAbsent(sIdx, k -> new ArrayList<>())
.add(new AggressiveSegMatch(sIdx, localStart, localEnd));
TextSegment seg = segments.get(sIdx);
int mappedStart = seg.getStartPos();
int mappedEnd = Math.min(seg.getEndPos(), seg.getStartPos() + 1);
result.add(new MatchRange(mappedStart, mappedEnd));
}
}
idx = lower.indexOf(w, idx + 1);
}
}
}
}
if (!perSegMatches.isEmpty()) {
this.aggressiveSegMatches = perSegMatches;
} else {
this.aggressiveSegMatches = null;
}
for (TextSegment seg : segments) {
String decoded = null;
try {
Object tok = tokens.get(seg.tokenIndex);
if (("Tj".equals(seg.operatorName) || "'".equals(seg.operatorName))
&& tok instanceof COSString cs) {
decoded = TextDecodingHelper.tryDecodeWithFont(seg.font, cs);
} else if ("TJ".equals(seg.operatorName) && tok instanceof COSArray arr) {
StringBuilder sb = new StringBuilder();
for (COSBase el : arr) {
if (el instanceof COSString s) {
String d = TextDecodingHelper.tryDecodeWithFont(seg.font, s);
sb.append(d != null ? d : s.getString());
}
}
decoded = sb.toString();
}
} catch (Exception ignored) {
}
String basis = (decoded != null && !decoded.isEmpty()) ? decoded : seg.getText();
boolean any = false;
for (Pattern p : patterns) {
try {
var m = p.matcher(basis);
while (m.find()) {
any = true;
result.add(new MatchRange(seg.getStartPos(), seg.getStartPos()));
}
} catch (Exception ignored) {
}
}
if (!any) {
NormalizedMap nm = buildNormalizedMap(seg.getText());
if (!nm.norm.isEmpty()) {
for (String word : targetWords) {
String normWord = normalizeForFuzzy(word);
if (normWord.isEmpty()) {
continue;
}
int idx = nm.norm.indexOf(normWord);
while (idx >= 0) {
int origStart = nm.map[idx];
int origEnd =
nm.map[Math.min(idx + normWord.length() - 1, nm.map.length - 1)]
+ 1;
result.add(
new MatchRange(
seg.getStartPos() + origStart,
seg.getStartPos() + origEnd));
idx = nm.norm.indexOf(normWord, idx + 1);
}
}
}
}
}
result.sort(Comparator.comparingInt(MatchRange::getStartPos));
return result;
} }
private List<MatchRange> findMatchesInSegments( private List<MatchRange> findMatchesInSegments(
@ -1642,67 +1413,43 @@ public class RedactionService {
} }
} }
private float calculateCharacterSumWidth(PDFont font, String text) { private static void addSpacingAdjustment(
float totalWidth = 0f; COSArray newArray, TextSegment segment, String originalText, String modifiedText) {
for (char c : text.toCharArray()) {
try { try {
totalWidth += font.getStringWidth(String.valueOf(c)); if (segment.getFont() == null || segment.getFontSize() <= 0) return;
float originalWidth =
calculateSafeWidth(originalText, segment.getFont(), segment.getFontSize());
float modifiedWidth =
calculateSafeWidth(modifiedText, segment.getFont(), segment.getFontSize());
float adjustment = originalWidth - modifiedWidth;
if (Math.abs(adjustment) > PRECISION_THRESHOLD) {
float kerning = (-adjustment / segment.getFontSize()) * FONT_SCALE_FACTOR * 1.10f;
if (Math.abs(kerning) < 1000) {
newArray.add(new COSFloat(kerning));
}
}
} catch (Exception e) { } catch (Exception e) {
return -1f; // Failed to add spacing adjustment
} }
} }
return totalWidth;
}
private WidthCalculationResult calculatePreciseWidthAdjustment( private static TokenModificationResult updateOperatorSafely(
TextSegment segment, List<MatchRange> matches, String text) { List<Object> tokens, int tokenIndex, String originalOperator) {
float totalOriginalWidth = 0f, totalPlaceholderWidth = 0f;
int processedMatches = 0;
List<String> warnings = new ArrayList<>();
for (MatchRange match : matches) {
try { try {
int segStart = Math.max(0, match.getStartPos() - segment.getStartPos()); int operatorIndex = tokenIndex + 1;
int segEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos()); if (isValidTokenIndex(tokens, operatorIndex)
&& tokens.get(operatorIndex) instanceof Operator op
if (segStart >= text.length() || segEnd <= segStart || segStart < 0) { && op.getName().equals(originalOperator)) {
warnings.add("Invalid bounds: " + segStart + "-" + segEnd); tokens.set(operatorIndex, Operator.getOperator("TJ"));
continue;
} }
return TokenModificationResult.success();
String originalPart = text.substring(segStart, segEnd);
if (originalPart.isEmpty()) continue;
WidthMeasurement originalMeasurement =
measureTextWidth(segment.getFont(), originalPart, segment.getFontSize());
if (!originalMeasurement.isValid()) {
warnings.add(
"Cannot measure: "
+ originalPart.substring(
0, Math.min(10, originalPart.length())));
continue;
}
String placeholderPart = createSafePlaceholder(originalPart, segment);
WidthMeasurement placeholderMeasurement =
measureTextWidth(segment.getFont(), placeholderPart, segment.getFontSize());
totalOriginalWidth += originalMeasurement.getWidth();
totalPlaceholderWidth +=
placeholderMeasurement.isValid()
? placeholderMeasurement.getWidth()
: originalMeasurement.getWidth();
processedMatches++;
} catch (Exception e) { } catch (Exception e) {
warnings.add("Error: " + e.getMessage()); return TokenModificationResult.success(); // Non-critical failure
} }
} }
return new WidthCalculationResult(
totalOriginalWidth - totalPlaceholderWidth, processedMatches, warnings);
}
private WidthMeasurement measureTextWidth(PDFont font, String text, float fontSize) { private WidthMeasurement measureTextWidth(PDFont font, String text, float fontSize) {
try { try {
float fontUnits = safeGetStringWidth(font, text); float fontUnits = safeGetStringWidth(font, text);
@ -1724,19 +1471,37 @@ public class RedactionService {
} }
} }
private String createSafePlaceholder(String originalText, TextSegment segment) { private static String tryEncodingFallbacks(COSString cosString) {
try { try {
return createPlaceholderWithWidth( byte[] bytes = cosString.getBytes();
originalText, if (bytes.length == 0) return "";
measureTextWidth(segment.getFont(), originalText, segment.getFontSize())
.getWidth(), String[] encodings = {"UTF-8", "UTF-16BE", "UTF-16LE", "ISO-8859-1", "Windows-1252"};
segment.getFont(),
segment.getFontSize()); for (String encoding : encodings) {
} catch (Exception e) { try {
return "".repeat(Math.max(1, originalText.length())); if (bytes.length >= 2) {
if ((bytes[0] & 0xFF) == 0xFE && (bytes[1] & 0xFF) == 0xFF) {
return new String(
bytes, 2, bytes.length - 2, StandardCharsets.UTF_16LE);
} else if ((bytes[0] & 0xFF) == 0xFF && (bytes[1] & 0xFF) == 0xFE) {
return new String(
bytes, 2, bytes.length - 2, StandardCharsets.UTF_16LE);
} }
} }
String decoded = new String(bytes, encoding);
if (!isGibberish(decoded)) {
return decoded;
}
} catch (Exception ignored) {
}
}
} catch (Exception e) {
}
return null;
}
private float applySafetyBounds( private float applySafetyBounds(
WidthCalculationResult result, TextSegment segment, String text) { WidthCalculationResult result, TextSegment segment, String text) {
if (result.processedMatches() == 0) return 0f; if (result.processedMatches() == 0) return 0f;
@ -1796,20 +1561,27 @@ public class RedactionService {
} }
} }
private boolean isValidTokenIndex(List<Object> tokens, int index) { private static boolean isGibberish(String text) {
return index >= 0 && index < tokens.size(); if (text == null || text.trim().isEmpty()) {
return true;
} }
private boolean isValidTokenForOperator(Object token, String operatorName) { int questionMarks = 0;
if (token == null || operatorName == null) { int replacementChars = 0;
return false; int totalChars = text.length();
for (char c : text.toCharArray()) {
if (c == '?') questionMarks++;
if (c == '\uFFFD') replacementChars++;
} }
return switch (operatorName) { double problematicRatio = (double) (questionMarks + replacementChars) / totalChars;
case "Tj", "'", "\"" -> token instanceof COSString; return problematicRatio > 0.3;
case "TJ" -> token instanceof COSArray; }
default -> true;
}; private static WipeResult wipeAllSemanticTextInTokens(List<Object> tokens) {
return wipeAllSemanticTextInTokens(
tokens, true); // Default to removing TU for backward compatibility
} }
private COSArray createRedactedTJArray( private COSArray createRedactedTJArray(
@ -1844,16 +1616,26 @@ public class RedactionService {
} }
} }
private int getActualStringLength(COSString cosString, PDFont font) { String createPlaceholderWithFont(String originalWord, PDFont font) {
if (originalWord == null || originalWord.isEmpty()) return " ";
final String repeat = " ".repeat(Math.max(1, originalWord.length()));
if (font != null && TextEncodingHelper.isFontSubset(font.getName())) {
try { try {
if (font == null) return cosString.getString().length(); // Use helper to get accurate width at fontSize=1.0
String decodedText = TextDecodingHelper.tryDecodeWithFont(font, cosString); float originalWidth =
return decodedText != null ? decodedText.length() : cosString.getString().length(); WidthCalculator.calculateAccurateWidth(font, originalWord, 1.0f);
String result =
createAlternativePlaceholder(originalWord, originalWidth, font, 1.0f);
return result != null ? result : repeat;
} catch (Exception e) { } catch (Exception e) {
return cosString.getString().length(); return repeat;
} }
} }
return repeat;
}
private TokenModificationResult performTokenModification( private TokenModificationResult performTokenModification(
List<Object> tokens, List<Object> tokens,
Object token, Object token,
@ -1913,15 +1695,33 @@ public class RedactionService {
} }
} }
private float calculateSafeWidth(String text, PDFont font, float fontSize) { String createPlaceholderWithWidth(
String originalWord, float targetWidth, PDFont font, float fontSize) {
if (originalWord == null || originalWord.isEmpty()) return " ";
if (font == null || fontSize <= 0) return " ".repeat(Math.max(1, originalWord.length()));
if (!WidthCalculator.isWidthCalculationReliable(font))
return " ".repeat(originalWord.length());
final String repeat = " ".repeat(Math.max(1, originalWord.length()));
if (TextEncodingHelper.isFontSubset(font.getName())) {
return createSubsetFontPlaceholder(originalWord, targetWidth, font, fontSize);
}
try { try {
if (font != null && fontSize > 0) { float spaceWidth = WidthCalculator.calculateAccurateWidth(font, " ", fontSize);
return WidthCalculator.calculateAccurateWidth(font, text, fontSize); if (spaceWidth <= 0) {
return createAlternativePlaceholder(originalWord, targetWidth, font, fontSize);
} }
int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth));
int maxSpaces =
Math.max(
originalWord.length() * 2, Math.round(targetWidth / spaceWidth * 1.5f));
return " ".repeat(Math.min(spaceCount, maxSpaces));
} catch (Exception e) { } catch (Exception e) {
// Width calculation failed String result = createAlternativePlaceholder(originalWord, targetWidth, font, fontSize);
return result != null ? result : repeat;
} }
return 0f;
} }
private TokenModificationResult convertToTJWithAdjustment( private TokenModificationResult convertToTJWithAdjustment(
@ -1949,42 +1749,208 @@ public class RedactionService {
} }
} }
private void addSpacingAdjustment( private String createAlternativePlaceholder(
COSArray newArray, TextSegment segment, String originalText, String modifiedText) { String originalWord, float targetWidth, PDFont font, float fontSize) {
final String repeat =
" ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1));
try { try {
if (segment.getFont() == null || segment.getFontSize() <= 0) return; String[] alternatives = {" ", ".", "-", "_", "~", "°", "·"};
if (TextEncodingHelper.fontSupportsCharacter(font, " ")) {
float originalWidth = float spaceWidth = WidthCalculator.calculateAccurateWidth(font, " ", fontSize);
calculateSafeWidth(originalText, segment.getFont(), segment.getFontSize()); if (spaceWidth > 0) {
float modifiedWidth = int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth));
calculateSafeWidth(modifiedText, segment.getFont(), segment.getFontSize()); int maxSpaces = originalWord.length() * 2;
float adjustment = originalWidth - modifiedWidth; return " ".repeat(Math.min(spaceCount, maxSpaces));
if (Math.abs(adjustment) > PRECISION_THRESHOLD) {
float kerning = (-adjustment / segment.getFontSize()) * FONT_SCALE_FACTOR * 1.10f;
if (Math.abs(kerning) < 1000) {
newArray.add(new COSFloat(kerning));
} }
} }
} catch (Exception e) { for (String alt : alternatives) {
// Failed to add spacing adjustment if (" ".equals(alt)) continue;
}
}
private String extractTextFromToken(Object token, String operatorName, PDFont currentFont) {
if (token == null || operatorName == null) return "";
try { try {
return switch (operatorName) { if (!TextEncodingHelper.fontSupportsCharacter(font, alt)) continue;
case "Tj" -> handleTjOperator(token, currentFont); float cw = WidthCalculator.calculateAccurateWidth(font, alt, fontSize);
case "'" -> handleQuotedOperator(token, currentFont); if (cw > 0) {
case "\"" -> handleQuotedOperator(token, currentFont); int count = Math.max(1, Math.round(targetWidth / cw));
case "TJ" -> handleTJOperator(token, currentFont); int max = originalWord.length() * 2;
default -> ""; return " ".repeat(Math.min(count, max));
};
} catch (Exception e) {
return "";
} }
} catch (Exception ignored) {
}
}
return repeat;
} catch (Exception e) {
return repeat;
}
}
private List<MatchRange> findAllMatchesAggressive(
List<TextSegment> segments,
List<Object> tokens,
Set<String> targetWords,
boolean useRegex,
boolean wholeWordSearch) {
List<Pattern> patterns =
TextFinderUtils.createOptimizedSearchPatterns(
targetWords, useRegex, wholeWordSearch);
List<MatchRange> result = new ArrayList<>();
Map<Integer, List<AggressiveSegMatch>> perSegMatches = new HashMap<>();
try {
String completeText = buildCompleteText(segments);
if (!completeText.isEmpty()) {
List<MatchRange> global =
findAllMatches(completeText, targetWords, useRegex, wholeWordSearch);
if (!global.isEmpty()) {
result.addAll(global);
} else if (!useRegex && !targetWords.isEmpty()) {
String lower = completeText.toLowerCase();
for (String word : targetWords) {
String w = word.toLowerCase();
int idx = lower.indexOf(w);
while (idx >= 0) {
result.add(new MatchRange(idx, idx + w.length()));
idx = lower.indexOf(w, idx + 1);
}
}
}
}
} catch (Exception ignored) {
}
List<String> decodedPerSegment = new ArrayList<>(segments.size());
List<Integer> decStarts = new ArrayList<>(segments.size());
List<Integer> decEnds = new ArrayList<>(segments.size());
int decCursor = 0;
for (TextSegment seg : segments) {
String decoded = null;
try {
Object tok = tokens.get(seg.tokenIndex);
if (("Tj".equals(seg.operatorName)
|| "'".equals(seg.operatorName)
|| "\"".equals(seg.operatorName))
&& tok instanceof COSString cs) {
decoded = TextDecodingHelper.tryDecodeWithFont(seg.font, cs);
} else if ("TJ".equals(seg.operatorName) && tok instanceof COSArray arr) {
StringBuilder sb = new StringBuilder();
for (COSBase el : arr) {
if (el instanceof COSString s) {
String d = TextDecodingHelper.tryDecodeWithFont(seg.font, s);
sb.append(d != null ? d : s.getString());
}
}
decoded = sb.toString();
}
} catch (Exception ignored) {
}
String basis = (decoded != null) ? decoded : seg.getText();
decodedPerSegment.add(basis);
decStarts.add(decCursor);
decCursor += basis.length();
decEnds.add(decCursor);
}
StringBuilder decodedCompleteSb = new StringBuilder();
for (String d : decodedPerSegment) {
decodedCompleteSb.append(d);
}
String decodedComplete = decodedCompleteSb.toString();
if (!decodedComplete.isEmpty()) {
List<Pattern> patternsDec =
TextFinderUtils.createOptimizedSearchPatterns(
targetWords, useRegex, wholeWordSearch);
for (Pattern p : patternsDec) {
try {
var m = p.matcher(decodedComplete);
while (m.find()) {
int gStart = m.start();
int gEnd = m.end();
mapStartToEnd(
segments, result, perSegMatches, decStarts, decEnds, gStart, gEnd);
}
} catch (Exception ignored) {
}
}
if (perSegMatches.isEmpty() && !useRegex && !targetWords.isEmpty()) {
String lower = decodedComplete.toLowerCase();
for (String word : targetWords) {
String w = word.toLowerCase();
int idx = lower.indexOf(w);
while (idx >= 0) {
int gStart = idx;
int gEnd = idx + w.length();
mapStartToEnd(
(List<TextSegment>) segments,
(List<MatchRange>) result,
(Map<Integer, List<AggressiveSegMatch>>) perSegMatches,
decStarts,
decEnds,
gStart,
gEnd);
idx = lower.indexOf(w, idx + 1);
}
}
}
}
if (!perSegMatches.isEmpty()) {
this.aggressiveSegMatches = perSegMatches;
} else {
this.aggressiveSegMatches = null;
}
for (TextSegment seg : segments) {
String decoded = null;
try {
Object tok = tokens.get(seg.tokenIndex);
if (("Tj".equals(seg.operatorName) || "'".equals(seg.operatorName))
&& tok instanceof COSString cs) {
decoded = TextDecodingHelper.tryDecodeWithFont(seg.font, cs);
} else if ("TJ".equals(seg.operatorName) && tok instanceof COSArray arr) {
StringBuilder sb = new StringBuilder();
for (COSBase el : arr) {
if (el instanceof COSString s) {
String d = TextDecodingHelper.tryDecodeWithFont(seg.font, s);
sb.append(d != null ? d : s.getString());
}
}
decoded = sb.toString();
}
} catch (Exception ignored) {
}
String basis = (decoded != null && !decoded.isEmpty()) ? decoded : seg.getText();
boolean any = false;
for (Pattern p : patterns) {
try {
var m = p.matcher(basis);
while (m.find()) {
any = true;
result.add(new MatchRange(seg.getStartPos(), seg.getStartPos()));
}
} catch (Exception ignored) {
}
}
if (!any) {
NormalizedMap nm = buildNormalizedMap(seg.getText());
if (!nm.norm.isEmpty()) {
for (String word : targetWords) {
String normWord = normalizeForFuzzy(word);
if (normWord.isEmpty()) {
continue;
}
int idx = nm.norm.indexOf(normWord);
while (idx >= 0) {
int origStart = nm.map[idx];
int origEnd =
nm.map[Math.min(idx + normWord.length() - 1, nm.map.length - 1)]
+ 1;
result.add(
new MatchRange(
seg.getStartPos() + origStart,
seg.getStartPos() + origEnd));
idx = nm.norm.indexOf(normWord, idx + 1);
}
}
}
}
}
result.sort(Comparator.comparingInt(MatchRange::getStartPos));
return result;
} }
private String handleTjOperator(Object token, PDFont font) { private String handleTjOperator(Object token, PDFont font) {
@ -2015,18 +1981,30 @@ public class RedactionService {
return textBuilder.toString(); return textBuilder.toString();
} }
private TokenModificationResult updateOperatorSafely( private void mapStartToEnd(
List<Object> tokens, int tokenIndex, String originalOperator) { List<TextSegment> segments,
try { List<MatchRange> result,
int operatorIndex = tokenIndex + 1; Map<Integer, List<AggressiveSegMatch>> perSegMatches,
if (isValidTokenIndex(tokens, operatorIndex) List<Integer> decStarts,
&& tokens.get(operatorIndex) instanceof Operator op List<Integer> decEnds,
&& op.getName().equals(originalOperator)) { int gStart,
tokens.set(operatorIndex, Operator.getOperator("TJ")); int gEnd) {
for (int sIdx = 0; sIdx < segments.size(); sIdx++) {
int sStart = decStarts.get(sIdx);
int sEnd = decEnds.get(sIdx);
int ovStart = Math.max(gStart, sStart);
int ovEnd = Math.min(gEnd, sEnd);
if (ovStart < ovEnd) {
int localStart = ovStart - sStart;
int localEnd = ovEnd - sStart;
perSegMatches
.computeIfAbsent(sIdx, k -> new ArrayList<>())
.add(new AggressiveSegMatch(sIdx, localStart, localEnd));
TextSegment seg = segments.get(sIdx);
int mappedStart = seg.getStartPos();
int mappedEnd = Math.min(seg.getEndPos(), seg.getStartPos() + 1);
result.add(new MatchRange(mappedStart, mappedEnd));
} }
return TokenModificationResult.success();
} catch (Exception e) {
return TokenModificationResult.success(); // Non-critical failure
} }
} }
@ -2048,51 +2026,65 @@ public class RedactionService {
} }
} }
private String tryEncodingFallbacks(COSString cosString) { private WidthCalculationResult calculatePreciseWidthAdjustment(
TextSegment segment, List<MatchRange> matches, String text) {
float totalOriginalWidth = 0f, totalPlaceholderWidth = 0f;
int processedMatches = 0;
List<String> warnings = new ArrayList<>();
for (MatchRange match : matches) {
try { try {
byte[] bytes = cosString.getBytes(); int segStart = Math.max(0, match.getStartPos() - segment.getStartPos());
if (bytes.length == 0) return ""; int segEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos());
String[] encodings = {"UTF-8", "UTF-16BE", "UTF-16LE", "ISO-8859-1", "Windows-1252"}; if (segStart >= text.length() || segEnd <= segStart || segStart < 0) {
warnings.add("Invalid bounds: " + segStart + "-" + segEnd);
for (String encoding : encodings) { continue;
try {
if (bytes.length >= 2) {
if ((bytes[0] & 0xFF) == 0xFE && (bytes[1] & 0xFF) == 0xFF) {
return new String(bytes, 2, bytes.length - 2, "UTF-16BE");
} else if ((bytes[0] & 0xFF) == 0xFF && (bytes[1] & 0xFF) == 0xFE) {
return new String(bytes, 2, bytes.length - 2, "UTF-16LE");
}
} }
String decoded = new String(bytes, encoding); String originalPart = text.substring(segStart, segEnd);
if (!isGibberish(decoded)) {
return decoded; WidthMeasurement originalMeasurement =
} measureTextWidth(segment.getFont(), originalPart, segment.getFontSize());
} catch (Exception ignored) { if (!originalMeasurement.valid()) {
} warnings.add(
"Cannot measure: "
+ originalPart.substring(
0, Math.min(10, originalPart.length())));
continue;
} }
String placeholderPart = createSafePlaceholder(originalPart, segment);
WidthMeasurement placeholderMeasurement =
measureTextWidth(segment.getFont(), placeholderPart, segment.getFontSize());
totalOriginalWidth += originalMeasurement.width();
totalPlaceholderWidth +=
placeholderMeasurement.valid()
? placeholderMeasurement.width()
: originalMeasurement.width();
processedMatches++;
} catch (Exception e) { } catch (Exception e) {
warnings.add("Error: " + e.getMessage());
} }
return null;
} }
private boolean isGibberish(String text) { return new WidthCalculationResult(
if (text == null || text.trim().isEmpty()) { totalOriginalWidth - totalPlaceholderWidth, processedMatches, warnings);
return true;
} }
int questionMarks = 0; private String createSafePlaceholder(String originalText, TextSegment segment) {
int replacementChars = 0; try {
int totalChars = text.length(); return createPlaceholderWithWidth(
originalText,
for (char c : text.toCharArray()) { measureTextWidth(segment.getFont(), originalText, segment.getFontSize())
if (c == '?') questionMarks++; .width(),
if (c == '\uFFFD') replacementChars++; segment.getFont(),
segment.getFontSize());
} catch (Exception e) {
return "".repeat(Math.max(1, originalText.length()));
} }
double problematicRatio = (double) (questionMarks + replacementChars) / totalChars;
return problematicRatio > 0.3;
} }
private boolean isValidTJArray(COSArray array) { private boolean isValidTJArray(COSArray array) {
@ -2105,9 +2097,19 @@ public class RedactionService {
return true; return true;
} }
private WipeResult wipeAllSemanticTextInTokens(List<Object> tokens) { private String extractTextFromToken(Object token, String operatorName, PDFont currentFont) {
return wipeAllSemanticTextInTokens( if (token == null || operatorName == null) return "";
tokens, true); // Default to removing TU for backward compatibility
try {
return switch (operatorName) {
case "Tj" -> handleTjOperator(token, currentFont);
case "'", "\"" -> handleQuotedOperator(token, currentFont);
case "TJ" -> handleTJOperator(token, currentFont);
default -> "";
};
} catch (Exception e) {
return "";
}
} }
private void processStringElement( private void processStringElement(
@ -2247,15 +2249,7 @@ public class RedactionService {
} }
// Helper classes // Helper classes
@Getter private record WidthMeasurement(float width, boolean valid) {
private static class WidthMeasurement {
private final float width;
private final boolean valid;
public WidthMeasurement(float width, boolean valid) {
this.width = width;
this.valid = valid;
}
public static WidthMeasurement invalid() { public static WidthMeasurement invalid() {
return new WidthMeasurement(0f, false); return new WidthMeasurement(0f, false);