cleanup, remove bloat

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
Balázs Szücs 2025-08-22 22:50:17 +02:00
parent eb0fbcdfa3
commit 48967f7061

View File

@ -3,6 +3,7 @@ package stirling.software.SPDF.service;
import java.awt.Color;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
@ -708,8 +709,7 @@ public class RedactionService {
private static String tryFontBasedExtraction(COSString cosString, PDFont font) {
try {
String decoded = TextDecodingHelper.tryDecodeWithFont(font, cosString);
return decoded;
return TextDecodingHelper.tryDecodeWithFont(font, cosString);
} catch (Exception e) {
return null;
}
@ -888,56 +888,20 @@ public class RedactionService {
}
}
String createPlaceholderWithFont(String originalWord, PDFont font) {
if (originalWord == null || originalWord.isEmpty()) return " ";
if (font != null && TextEncodingHelper.isFontSubset(font.getName())) {
private static float calculateCharacterSumWidth(PDFont font, String text) {
float totalWidth = 0f;
for (char c : text.toCharArray()) {
try {
// Use helper to get accurate width at fontSize=1.0
float originalWidth =
WidthCalculator.calculateAccurateWidth(font, originalWord, 1.0f);
String result =
createAlternativePlaceholder(originalWord, originalWidth, font, 1.0f);
return result != null ? result : " ".repeat(Math.max(1, originalWord.length()));
totalWidth += font.getStringWidth(String.valueOf(c));
} catch (Exception e) {
return " ".repeat(Math.max(1, originalWord.length()));
return -1f;
}
}
return totalWidth;
}
return " ".repeat(Math.max(1, originalWord.length()));
}
String createPlaceholderWithWidth(
String originalWord, float targetWidth, PDFont font, float fontSize) {
if (originalWord == null || originalWord.isEmpty()) return " ";
if (font == null || fontSize <= 0) return " ".repeat(Math.max(1, originalWord.length()));
if (!WidthCalculator.isWidthCalculationReliable(font))
return " ".repeat(Math.max(1, originalWord.length()));
if (TextEncodingHelper.isFontSubset(font.getName())) {
String result = createSubsetFontPlaceholder(originalWord, targetWidth, font, fontSize);
return result != null
? result
: " ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1));
}
try {
float spaceWidth = WidthCalculator.calculateAccurateWidth(font, " ", fontSize);
if (spaceWidth <= 0) {
return createAlternativePlaceholder(originalWord, targetWidth, font, fontSize);
}
int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth));
int maxSpaces =
Math.max(
originalWord.length() * 2, Math.round(targetWidth / spaceWidth * 1.5f));
return " ".repeat(Math.min(spaceCount, maxSpaces));
} catch (Exception e) {
String result = createAlternativePlaceholder(originalWord, targetWidth, font, fontSize);
return result != null
? result
: " ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1));
}
private static boolean isValidTokenIndex(List<Object> tokens, int index) {
return index >= 0 && index < tokens.size();
}
private String createSubsetFontPlaceholder(
@ -1020,35 +984,16 @@ public class RedactionService {
}
}
private String createAlternativePlaceholder(
String originalWord, float targetWidth, PDFont font, float fontSize) {
try {
String[] alternatives = {" ", ".", "-", "_", "~", "°", "·"};
if (TextEncodingHelper.fontSupportsCharacter(font, " ")) {
float spaceWidth = WidthCalculator.calculateAccurateWidth(font, " ", fontSize);
if (spaceWidth > 0) {
int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth));
int maxSpaces = originalWord.length() * 2;
return " ".repeat(Math.min(spaceCount, maxSpaces));
}
}
for (String alt : alternatives) {
if (" ".equals(alt)) continue;
try {
if (!TextEncodingHelper.fontSupportsCharacter(font, alt)) continue;
float cw = WidthCalculator.calculateAccurateWidth(font, alt, fontSize);
if (cw > 0) {
int count = Math.max(1, Math.round(targetWidth / cw));
int max = originalWord.length() * 2;
return " ".repeat(Math.min(count, max));
}
} catch (Exception ignored) {
}
}
return " ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1));
} catch (Exception e) {
return " ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1));
private static boolean isValidTokenForOperator(Object token, String operatorName) {
if (token == null || operatorName == null) {
return false;
}
return switch (operatorName) {
case "Tj", "'", "\"" -> token instanceof COSString;
case "TJ" -> token instanceof COSArray;
default -> true;
};
}
private List<TextSegment> extractTextSegments(
@ -1257,199 +1202,25 @@ public class RedactionService {
}
}
private List<MatchRange> findAllMatchesAggressive(
List<TextSegment> segments,
List<Object> tokens,
Set<String> targetWords,
boolean useRegex,
boolean wholeWordSearch) {
List<Pattern> patterns =
TextFinderUtils.createOptimizedSearchPatterns(
targetWords, useRegex, wholeWordSearch);
List<MatchRange> result = new ArrayList<>();
Map<Integer, List<AggressiveSegMatch>> perSegMatches = new HashMap<>();
private static int getActualStringLength(COSString cosString, PDFont font) {
try {
String completeText = buildCompleteText(segments);
if (!completeText.isEmpty()) {
List<MatchRange> global =
findAllMatches(completeText, targetWords, useRegex, wholeWordSearch);
if (!global.isEmpty()) {
result.addAll(global);
} else if (!useRegex && !targetWords.isEmpty()) {
String lower = completeText.toLowerCase();
for (String word : targetWords) {
String w = word.toLowerCase();
int idx = lower.indexOf(w);
while (idx >= 0) {
result.add(new MatchRange(idx, idx + w.length()));
idx = lower.indexOf(w, idx + 1);
if (font == null) return cosString.getString().length();
String decodedText = TextDecodingHelper.tryDecodeWithFont(font, cosString);
return decodedText != null ? decodedText.length() : cosString.getString().length();
} catch (Exception e) {
return cosString.getString().length();
}
}
}
}
} catch (Exception ignored) {
}
List<String> decodedPerSegment = new ArrayList<>(segments.size());
List<Integer> decStarts = new ArrayList<>(segments.size());
List<Integer> decEnds = new ArrayList<>(segments.size());
int decCursor = 0;
for (TextSegment seg : segments) {
String decoded = null;
private static float calculateSafeWidth(String text, PDFont font, float fontSize) {
try {
Object tok = tokens.get(seg.tokenIndex);
if (("Tj".equals(seg.operatorName)
|| "'".equals(seg.operatorName)
|| "\"".equals(seg.operatorName))
&& tok instanceof COSString cs) {
decoded = TextDecodingHelper.tryDecodeWithFont(seg.font, cs);
} else if ("TJ".equals(seg.operatorName) && tok instanceof COSArray arr) {
StringBuilder sb = new StringBuilder();
for (COSBase el : arr) {
if (el instanceof COSString s) {
String d = TextDecodingHelper.tryDecodeWithFont(seg.font, s);
sb.append(d != null ? d : s.getString());
if (font != null && fontSize > 0) {
return WidthCalculator.calculateAccurateWidth(font, text, fontSize);
}
} catch (Exception e) {
// Width calculation failed
}
decoded = sb.toString();
}
} catch (Exception ignored) {
}
String basis = (decoded != null) ? decoded : seg.getText();
decodedPerSegment.add(basis);
decStarts.add(decCursor);
decCursor += basis.length();
decEnds.add(decCursor);
}
StringBuilder decodedCompleteSb = new StringBuilder();
for (String d : decodedPerSegment) {
decodedCompleteSb.append(d);
}
String decodedComplete = decodedCompleteSb.toString();
if (!decodedComplete.isEmpty()) {
List<Pattern> patternsDec =
TextFinderUtils.createOptimizedSearchPatterns(
targetWords, useRegex, wholeWordSearch);
for (Pattern p : patternsDec) {
try {
var m = p.matcher(decodedComplete);
while (m.find()) {
int gStart = m.start();
int gEnd = m.end();
for (int sIdx = 0; sIdx < segments.size(); sIdx++) {
int sStart = decStarts.get(sIdx);
int sEnd = decEnds.get(sIdx);
int ovStart = Math.max(gStart, sStart);
int ovEnd = Math.min(gEnd, sEnd);
if (ovStart < ovEnd) {
int localStart = ovStart - sStart;
int localEnd = ovEnd - sStart;
perSegMatches
.computeIfAbsent(sIdx, k -> new ArrayList<>())
.add(new AggressiveSegMatch(sIdx, localStart, localEnd));
TextSegment seg = segments.get(sIdx);
int mappedStart = seg.getStartPos();
int mappedEnd = Math.min(seg.getEndPos(), seg.getStartPos() + 1);
result.add(new MatchRange(mappedStart, mappedEnd));
}
}
}
} catch (Exception ignored) {
}
}
if (perSegMatches.isEmpty() && !useRegex && !targetWords.isEmpty()) {
String lower = decodedComplete.toLowerCase();
for (String word : targetWords) {
String w = word.toLowerCase();
int idx = lower.indexOf(w);
while (idx >= 0) {
int gStart = idx;
int gEnd = idx + w.length();
for (int sIdx = 0; sIdx < segments.size(); sIdx++) {
int sStart = decStarts.get(sIdx);
int sEnd = decEnds.get(sIdx);
int ovStart = Math.max(gStart, sStart);
int ovEnd = Math.min(gEnd, sEnd);
if (ovStart < ovEnd) {
int localStart = ovStart - sStart;
int localEnd = ovEnd - sStart;
perSegMatches
.computeIfAbsent(sIdx, k -> new ArrayList<>())
.add(new AggressiveSegMatch(sIdx, localStart, localEnd));
TextSegment seg = segments.get(sIdx);
int mappedStart = seg.getStartPos();
int mappedEnd = Math.min(seg.getEndPos(), seg.getStartPos() + 1);
result.add(new MatchRange(mappedStart, mappedEnd));
}
}
idx = lower.indexOf(w, idx + 1);
}
}
}
}
if (!perSegMatches.isEmpty()) {
this.aggressiveSegMatches = perSegMatches;
} else {
this.aggressiveSegMatches = null;
}
for (TextSegment seg : segments) {
String decoded = null;
try {
Object tok = tokens.get(seg.tokenIndex);
if (("Tj".equals(seg.operatorName) || "'".equals(seg.operatorName))
&& tok instanceof COSString cs) {
decoded = TextDecodingHelper.tryDecodeWithFont(seg.font, cs);
} else if ("TJ".equals(seg.operatorName) && tok instanceof COSArray arr) {
StringBuilder sb = new StringBuilder();
for (COSBase el : arr) {
if (el instanceof COSString s) {
String d = TextDecodingHelper.tryDecodeWithFont(seg.font, s);
sb.append(d != null ? d : s.getString());
}
}
decoded = sb.toString();
}
} catch (Exception ignored) {
}
String basis = (decoded != null && !decoded.isEmpty()) ? decoded : seg.getText();
boolean any = false;
for (Pattern p : patterns) {
try {
var m = p.matcher(basis);
while (m.find()) {
any = true;
result.add(new MatchRange(seg.getStartPos(), seg.getStartPos()));
}
} catch (Exception ignored) {
}
}
if (!any) {
NormalizedMap nm = buildNormalizedMap(seg.getText());
if (!nm.norm.isEmpty()) {
for (String word : targetWords) {
String normWord = normalizeForFuzzy(word);
if (normWord.isEmpty()) {
continue;
}
int idx = nm.norm.indexOf(normWord);
while (idx >= 0) {
int origStart = nm.map[idx];
int origEnd =
nm.map[Math.min(idx + normWord.length() - 1, nm.map.length - 1)]
+ 1;
result.add(
new MatchRange(
seg.getStartPos() + origStart,
seg.getStartPos() + origEnd));
idx = nm.norm.indexOf(normWord, idx + 1);
}
}
}
}
}
result.sort(Comparator.comparingInt(MatchRange::getStartPos));
return result;
return 0f;
}
private List<MatchRange> findMatchesInSegments(
@ -1642,67 +1413,43 @@ public class RedactionService {
}
}
private float calculateCharacterSumWidth(PDFont font, String text) {
float totalWidth = 0f;
for (char c : text.toCharArray()) {
private static void addSpacingAdjustment(
COSArray newArray, TextSegment segment, String originalText, String modifiedText) {
try {
totalWidth += font.getStringWidth(String.valueOf(c));
if (segment.getFont() == null || segment.getFontSize() <= 0) return;
float originalWidth =
calculateSafeWidth(originalText, segment.getFont(), segment.getFontSize());
float modifiedWidth =
calculateSafeWidth(modifiedText, segment.getFont(), segment.getFontSize());
float adjustment = originalWidth - modifiedWidth;
if (Math.abs(adjustment) > PRECISION_THRESHOLD) {
float kerning = (-adjustment / segment.getFontSize()) * FONT_SCALE_FACTOR * 1.10f;
if (Math.abs(kerning) < 1000) {
newArray.add(new COSFloat(kerning));
}
}
} catch (Exception e) {
return -1f;
// Failed to add spacing adjustment
}
}
return totalWidth;
}
private WidthCalculationResult calculatePreciseWidthAdjustment(
TextSegment segment, List<MatchRange> matches, String text) {
float totalOriginalWidth = 0f, totalPlaceholderWidth = 0f;
int processedMatches = 0;
List<String> warnings = new ArrayList<>();
for (MatchRange match : matches) {
private static TokenModificationResult updateOperatorSafely(
List<Object> tokens, int tokenIndex, String originalOperator) {
try {
int segStart = Math.max(0, match.getStartPos() - segment.getStartPos());
int segEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos());
if (segStart >= text.length() || segEnd <= segStart || segStart < 0) {
warnings.add("Invalid bounds: " + segStart + "-" + segEnd);
continue;
int operatorIndex = tokenIndex + 1;
if (isValidTokenIndex(tokens, operatorIndex)
&& tokens.get(operatorIndex) instanceof Operator op
&& op.getName().equals(originalOperator)) {
tokens.set(operatorIndex, Operator.getOperator("TJ"));
}
String originalPart = text.substring(segStart, segEnd);
if (originalPart.isEmpty()) continue;
WidthMeasurement originalMeasurement =
measureTextWidth(segment.getFont(), originalPart, segment.getFontSize());
if (!originalMeasurement.isValid()) {
warnings.add(
"Cannot measure: "
+ originalPart.substring(
0, Math.min(10, originalPart.length())));
continue;
}
String placeholderPart = createSafePlaceholder(originalPart, segment);
WidthMeasurement placeholderMeasurement =
measureTextWidth(segment.getFont(), placeholderPart, segment.getFontSize());
totalOriginalWidth += originalMeasurement.getWidth();
totalPlaceholderWidth +=
placeholderMeasurement.isValid()
? placeholderMeasurement.getWidth()
: originalMeasurement.getWidth();
processedMatches++;
return TokenModificationResult.success();
} catch (Exception e) {
warnings.add("Error: " + e.getMessage());
return TokenModificationResult.success(); // Non-critical failure
}
}
return new WidthCalculationResult(
totalOriginalWidth - totalPlaceholderWidth, processedMatches, warnings);
}
private WidthMeasurement measureTextWidth(PDFont font, String text, float fontSize) {
try {
float fontUnits = safeGetStringWidth(font, text);
@ -1724,19 +1471,37 @@ public class RedactionService {
}
}
private String createSafePlaceholder(String originalText, TextSegment segment) {
private static String tryEncodingFallbacks(COSString cosString) {
try {
return createPlaceholderWithWidth(
originalText,
measureTextWidth(segment.getFont(), originalText, segment.getFontSize())
.getWidth(),
segment.getFont(),
segment.getFontSize());
} catch (Exception e) {
return "".repeat(Math.max(1, originalText.length()));
byte[] bytes = cosString.getBytes();
if (bytes.length == 0) return "";
String[] encodings = {"UTF-8", "UTF-16BE", "UTF-16LE", "ISO-8859-1", "Windows-1252"};
for (String encoding : encodings) {
try {
if (bytes.length >= 2) {
if ((bytes[0] & 0xFF) == 0xFE && (bytes[1] & 0xFF) == 0xFF) {
return new String(
bytes, 2, bytes.length - 2, StandardCharsets.UTF_16LE);
} else if ((bytes[0] & 0xFF) == 0xFF && (bytes[1] & 0xFF) == 0xFE) {
return new String(
bytes, 2, bytes.length - 2, StandardCharsets.UTF_16LE);
}
}
String decoded = new String(bytes, encoding);
if (!isGibberish(decoded)) {
return decoded;
}
} catch (Exception ignored) {
}
}
} catch (Exception e) {
}
return null;
}
private float applySafetyBounds(
WidthCalculationResult result, TextSegment segment, String text) {
if (result.processedMatches() == 0) return 0f;
@ -1796,20 +1561,27 @@ public class RedactionService {
}
}
private boolean isValidTokenIndex(List<Object> tokens, int index) {
return index >= 0 && index < tokens.size();
private static boolean isGibberish(String text) {
if (text == null || text.trim().isEmpty()) {
return true;
}
private boolean isValidTokenForOperator(Object token, String operatorName) {
if (token == null || operatorName == null) {
return false;
int questionMarks = 0;
int replacementChars = 0;
int totalChars = text.length();
for (char c : text.toCharArray()) {
if (c == '?') questionMarks++;
if (c == '\uFFFD') replacementChars++;
}
return switch (operatorName) {
case "Tj", "'", "\"" -> token instanceof COSString;
case "TJ" -> token instanceof COSArray;
default -> true;
};
double problematicRatio = (double) (questionMarks + replacementChars) / totalChars;
return problematicRatio > 0.3;
}
private static WipeResult wipeAllSemanticTextInTokens(List<Object> tokens) {
return wipeAllSemanticTextInTokens(
tokens, true); // Default to removing TU for backward compatibility
}
private COSArray createRedactedTJArray(
@ -1844,16 +1616,26 @@ public class RedactionService {
}
}
private int getActualStringLength(COSString cosString, PDFont font) {
String createPlaceholderWithFont(String originalWord, PDFont font) {
if (originalWord == null || originalWord.isEmpty()) return " ";
final String repeat = " ".repeat(Math.max(1, originalWord.length()));
if (font != null && TextEncodingHelper.isFontSubset(font.getName())) {
try {
if (font == null) return cosString.getString().length();
String decodedText = TextDecodingHelper.tryDecodeWithFont(font, cosString);
return decodedText != null ? decodedText.length() : cosString.getString().length();
// Use helper to get accurate width at fontSize=1.0
float originalWidth =
WidthCalculator.calculateAccurateWidth(font, originalWord, 1.0f);
String result =
createAlternativePlaceholder(originalWord, originalWidth, font, 1.0f);
return result != null ? result : repeat;
} catch (Exception e) {
return cosString.getString().length();
return repeat;
}
}
return repeat;
}
private TokenModificationResult performTokenModification(
List<Object> tokens,
Object token,
@ -1913,15 +1695,33 @@ public class RedactionService {
}
}
private float calculateSafeWidth(String text, PDFont font, float fontSize) {
String createPlaceholderWithWidth(
String originalWord, float targetWidth, PDFont font, float fontSize) {
if (originalWord == null || originalWord.isEmpty()) return " ";
if (font == null || fontSize <= 0) return " ".repeat(Math.max(1, originalWord.length()));
if (!WidthCalculator.isWidthCalculationReliable(font))
return " ".repeat(originalWord.length());
final String repeat = " ".repeat(Math.max(1, originalWord.length()));
if (TextEncodingHelper.isFontSubset(font.getName())) {
return createSubsetFontPlaceholder(originalWord, targetWidth, font, fontSize);
}
try {
if (font != null && fontSize > 0) {
return WidthCalculator.calculateAccurateWidth(font, text, fontSize);
float spaceWidth = WidthCalculator.calculateAccurateWidth(font, " ", fontSize);
if (spaceWidth <= 0) {
return createAlternativePlaceholder(originalWord, targetWidth, font, fontSize);
}
int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth));
int maxSpaces =
Math.max(
originalWord.length() * 2, Math.round(targetWidth / spaceWidth * 1.5f));
return " ".repeat(Math.min(spaceCount, maxSpaces));
} catch (Exception e) {
// Width calculation failed
String result = createAlternativePlaceholder(originalWord, targetWidth, font, fontSize);
return result != null ? result : repeat;
}
return 0f;
}
private TokenModificationResult convertToTJWithAdjustment(
@ -1949,42 +1749,208 @@ public class RedactionService {
}
}
private void addSpacingAdjustment(
COSArray newArray, TextSegment segment, String originalText, String modifiedText) {
private String createAlternativePlaceholder(
String originalWord, float targetWidth, PDFont font, float fontSize) {
final String repeat =
" ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1));
try {
if (segment.getFont() == null || segment.getFontSize() <= 0) return;
float originalWidth =
calculateSafeWidth(originalText, segment.getFont(), segment.getFontSize());
float modifiedWidth =
calculateSafeWidth(modifiedText, segment.getFont(), segment.getFontSize());
float adjustment = originalWidth - modifiedWidth;
if (Math.abs(adjustment) > PRECISION_THRESHOLD) {
float kerning = (-adjustment / segment.getFontSize()) * FONT_SCALE_FACTOR * 1.10f;
if (Math.abs(kerning) < 1000) {
newArray.add(new COSFloat(kerning));
String[] alternatives = {" ", ".", "-", "_", "~", "°", "·"};
if (TextEncodingHelper.fontSupportsCharacter(font, " ")) {
float spaceWidth = WidthCalculator.calculateAccurateWidth(font, " ", fontSize);
if (spaceWidth > 0) {
int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth));
int maxSpaces = originalWord.length() * 2;
return " ".repeat(Math.min(spaceCount, maxSpaces));
}
}
} catch (Exception e) {
// Failed to add spacing adjustment
}
}
private String extractTextFromToken(Object token, String operatorName, PDFont currentFont) {
if (token == null || operatorName == null) return "";
for (String alt : alternatives) {
if (" ".equals(alt)) continue;
try {
return switch (operatorName) {
case "Tj" -> handleTjOperator(token, currentFont);
case "'" -> handleQuotedOperator(token, currentFont);
case "\"" -> handleQuotedOperator(token, currentFont);
case "TJ" -> handleTJOperator(token, currentFont);
default -> "";
};
} catch (Exception e) {
return "";
if (!TextEncodingHelper.fontSupportsCharacter(font, alt)) continue;
float cw = WidthCalculator.calculateAccurateWidth(font, alt, fontSize);
if (cw > 0) {
int count = Math.max(1, Math.round(targetWidth / cw));
int max = originalWord.length() * 2;
return " ".repeat(Math.min(count, max));
}
} catch (Exception ignored) {
}
}
return repeat;
} catch (Exception e) {
return repeat;
}
}
private List<MatchRange> findAllMatchesAggressive(
List<TextSegment> segments,
List<Object> tokens,
Set<String> targetWords,
boolean useRegex,
boolean wholeWordSearch) {
List<Pattern> patterns =
TextFinderUtils.createOptimizedSearchPatterns(
targetWords, useRegex, wholeWordSearch);
List<MatchRange> result = new ArrayList<>();
Map<Integer, List<AggressiveSegMatch>> perSegMatches = new HashMap<>();
try {
String completeText = buildCompleteText(segments);
if (!completeText.isEmpty()) {
List<MatchRange> global =
findAllMatches(completeText, targetWords, useRegex, wholeWordSearch);
if (!global.isEmpty()) {
result.addAll(global);
} else if (!useRegex && !targetWords.isEmpty()) {
String lower = completeText.toLowerCase();
for (String word : targetWords) {
String w = word.toLowerCase();
int idx = lower.indexOf(w);
while (idx >= 0) {
result.add(new MatchRange(idx, idx + w.length()));
idx = lower.indexOf(w, idx + 1);
}
}
}
}
} catch (Exception ignored) {
}
List<String> decodedPerSegment = new ArrayList<>(segments.size());
List<Integer> decStarts = new ArrayList<>(segments.size());
List<Integer> decEnds = new ArrayList<>(segments.size());
int decCursor = 0;
for (TextSegment seg : segments) {
String decoded = null;
try {
Object tok = tokens.get(seg.tokenIndex);
if (("Tj".equals(seg.operatorName)
|| "'".equals(seg.operatorName)
|| "\"".equals(seg.operatorName))
&& tok instanceof COSString cs) {
decoded = TextDecodingHelper.tryDecodeWithFont(seg.font, cs);
} else if ("TJ".equals(seg.operatorName) && tok instanceof COSArray arr) {
StringBuilder sb = new StringBuilder();
for (COSBase el : arr) {
if (el instanceof COSString s) {
String d = TextDecodingHelper.tryDecodeWithFont(seg.font, s);
sb.append(d != null ? d : s.getString());
}
}
decoded = sb.toString();
}
} catch (Exception ignored) {
}
String basis = (decoded != null) ? decoded : seg.getText();
decodedPerSegment.add(basis);
decStarts.add(decCursor);
decCursor += basis.length();
decEnds.add(decCursor);
}
StringBuilder decodedCompleteSb = new StringBuilder();
for (String d : decodedPerSegment) {
decodedCompleteSb.append(d);
}
String decodedComplete = decodedCompleteSb.toString();
if (!decodedComplete.isEmpty()) {
List<Pattern> patternsDec =
TextFinderUtils.createOptimizedSearchPatterns(
targetWords, useRegex, wholeWordSearch);
for (Pattern p : patternsDec) {
try {
var m = p.matcher(decodedComplete);
while (m.find()) {
int gStart = m.start();
int gEnd = m.end();
mapStartToEnd(
segments, result, perSegMatches, decStarts, decEnds, gStart, gEnd);
}
} catch (Exception ignored) {
}
}
if (perSegMatches.isEmpty() && !useRegex && !targetWords.isEmpty()) {
String lower = decodedComplete.toLowerCase();
for (String word : targetWords) {
String w = word.toLowerCase();
int idx = lower.indexOf(w);
while (idx >= 0) {
int gStart = idx;
int gEnd = idx + w.length();
mapStartToEnd(
(List<TextSegment>) segments,
(List<MatchRange>) result,
(Map<Integer, List<AggressiveSegMatch>>) perSegMatches,
decStarts,
decEnds,
gStart,
gEnd);
idx = lower.indexOf(w, idx + 1);
}
}
}
}
if (!perSegMatches.isEmpty()) {
this.aggressiveSegMatches = perSegMatches;
} else {
this.aggressiveSegMatches = null;
}
for (TextSegment seg : segments) {
String decoded = null;
try {
Object tok = tokens.get(seg.tokenIndex);
if (("Tj".equals(seg.operatorName) || "'".equals(seg.operatorName))
&& tok instanceof COSString cs) {
decoded = TextDecodingHelper.tryDecodeWithFont(seg.font, cs);
} else if ("TJ".equals(seg.operatorName) && tok instanceof COSArray arr) {
StringBuilder sb = new StringBuilder();
for (COSBase el : arr) {
if (el instanceof COSString s) {
String d = TextDecodingHelper.tryDecodeWithFont(seg.font, s);
sb.append(d != null ? d : s.getString());
}
}
decoded = sb.toString();
}
} catch (Exception ignored) {
}
String basis = (decoded != null && !decoded.isEmpty()) ? decoded : seg.getText();
boolean any = false;
for (Pattern p : patterns) {
try {
var m = p.matcher(basis);
while (m.find()) {
any = true;
result.add(new MatchRange(seg.getStartPos(), seg.getStartPos()));
}
} catch (Exception ignored) {
}
}
if (!any) {
NormalizedMap nm = buildNormalizedMap(seg.getText());
if (!nm.norm.isEmpty()) {
for (String word : targetWords) {
String normWord = normalizeForFuzzy(word);
if (normWord.isEmpty()) {
continue;
}
int idx = nm.norm.indexOf(normWord);
while (idx >= 0) {
int origStart = nm.map[idx];
int origEnd =
nm.map[Math.min(idx + normWord.length() - 1, nm.map.length - 1)]
+ 1;
result.add(
new MatchRange(
seg.getStartPos() + origStart,
seg.getStartPos() + origEnd));
idx = nm.norm.indexOf(normWord, idx + 1);
}
}
}
}
}
result.sort(Comparator.comparingInt(MatchRange::getStartPos));
return result;
}
private String handleTjOperator(Object token, PDFont font) {
@ -2015,18 +1981,30 @@ public class RedactionService {
return textBuilder.toString();
}
private TokenModificationResult updateOperatorSafely(
List<Object> tokens, int tokenIndex, String originalOperator) {
try {
int operatorIndex = tokenIndex + 1;
if (isValidTokenIndex(tokens, operatorIndex)
&& tokens.get(operatorIndex) instanceof Operator op
&& op.getName().equals(originalOperator)) {
tokens.set(operatorIndex, Operator.getOperator("TJ"));
private void mapStartToEnd(
List<TextSegment> segments,
List<MatchRange> result,
Map<Integer, List<AggressiveSegMatch>> perSegMatches,
List<Integer> decStarts,
List<Integer> decEnds,
int gStart,
int gEnd) {
for (int sIdx = 0; sIdx < segments.size(); sIdx++) {
int sStart = decStarts.get(sIdx);
int sEnd = decEnds.get(sIdx);
int ovStart = Math.max(gStart, sStart);
int ovEnd = Math.min(gEnd, sEnd);
if (ovStart < ovEnd) {
int localStart = ovStart - sStart;
int localEnd = ovEnd - sStart;
perSegMatches
.computeIfAbsent(sIdx, k -> new ArrayList<>())
.add(new AggressiveSegMatch(sIdx, localStart, localEnd));
TextSegment seg = segments.get(sIdx);
int mappedStart = seg.getStartPos();
int mappedEnd = Math.min(seg.getEndPos(), seg.getStartPos() + 1);
result.add(new MatchRange(mappedStart, mappedEnd));
}
return TokenModificationResult.success();
} catch (Exception e) {
return TokenModificationResult.success(); // Non-critical failure
}
}
@ -2048,51 +2026,65 @@ public class RedactionService {
}
}
private String tryEncodingFallbacks(COSString cosString) {
private WidthCalculationResult calculatePreciseWidthAdjustment(
TextSegment segment, List<MatchRange> matches, String text) {
float totalOriginalWidth = 0f, totalPlaceholderWidth = 0f;
int processedMatches = 0;
List<String> warnings = new ArrayList<>();
for (MatchRange match : matches) {
try {
byte[] bytes = cosString.getBytes();
if (bytes.length == 0) return "";
int segStart = Math.max(0, match.getStartPos() - segment.getStartPos());
int segEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos());
String[] encodings = {"UTF-8", "UTF-16BE", "UTF-16LE", "ISO-8859-1", "Windows-1252"};
for (String encoding : encodings) {
try {
if (bytes.length >= 2) {
if ((bytes[0] & 0xFF) == 0xFE && (bytes[1] & 0xFF) == 0xFF) {
return new String(bytes, 2, bytes.length - 2, "UTF-16BE");
} else if ((bytes[0] & 0xFF) == 0xFF && (bytes[1] & 0xFF) == 0xFE) {
return new String(bytes, 2, bytes.length - 2, "UTF-16LE");
}
if (segStart >= text.length() || segEnd <= segStart || segStart < 0) {
warnings.add("Invalid bounds: " + segStart + "-" + segEnd);
continue;
}
String decoded = new String(bytes, encoding);
if (!isGibberish(decoded)) {
return decoded;
}
} catch (Exception ignored) {
}
String originalPart = text.substring(segStart, segEnd);
WidthMeasurement originalMeasurement =
measureTextWidth(segment.getFont(), originalPart, segment.getFontSize());
if (!originalMeasurement.valid()) {
warnings.add(
"Cannot measure: "
+ originalPart.substring(
0, Math.min(10, originalPart.length())));
continue;
}
String placeholderPart = createSafePlaceholder(originalPart, segment);
WidthMeasurement placeholderMeasurement =
measureTextWidth(segment.getFont(), placeholderPart, segment.getFontSize());
totalOriginalWidth += originalMeasurement.width();
totalPlaceholderWidth +=
placeholderMeasurement.valid()
? placeholderMeasurement.width()
: originalMeasurement.width();
processedMatches++;
} catch (Exception e) {
warnings.add("Error: " + e.getMessage());
}
return null;
}
private boolean isGibberish(String text) {
if (text == null || text.trim().isEmpty()) {
return true;
return new WidthCalculationResult(
totalOriginalWidth - totalPlaceholderWidth, processedMatches, warnings);
}
int questionMarks = 0;
int replacementChars = 0;
int totalChars = text.length();
for (char c : text.toCharArray()) {
if (c == '?') questionMarks++;
if (c == '\uFFFD') replacementChars++;
private String createSafePlaceholder(String originalText, TextSegment segment) {
try {
return createPlaceholderWithWidth(
originalText,
measureTextWidth(segment.getFont(), originalText, segment.getFontSize())
.width(),
segment.getFont(),
segment.getFontSize());
} catch (Exception e) {
return "".repeat(Math.max(1, originalText.length()));
}
double problematicRatio = (double) (questionMarks + replacementChars) / totalChars;
return problematicRatio > 0.3;
}
private boolean isValidTJArray(COSArray array) {
@ -2105,9 +2097,19 @@ public class RedactionService {
return true;
}
private WipeResult wipeAllSemanticTextInTokens(List<Object> tokens) {
return wipeAllSemanticTextInTokens(
tokens, true); // Default to removing TU for backward compatibility
private String extractTextFromToken(Object token, String operatorName, PDFont currentFont) {
if (token == null || operatorName == null) return "";
try {
return switch (operatorName) {
case "Tj" -> handleTjOperator(token, currentFont);
case "'", "\"" -> handleQuotedOperator(token, currentFont);
case "TJ" -> handleTJOperator(token, currentFont);
default -> "";
};
} catch (Exception e) {
return "";
}
}
private void processStringElement(
@ -2247,15 +2249,7 @@ public class RedactionService {
}
// Helper classes
@Getter
private static class WidthMeasurement {
private final float width;
private final boolean valid;
public WidthMeasurement(float width, boolean valid) {
this.width = width;
this.valid = valid;
}
private record WidthMeasurement(float width, boolean valid) {
public static WidthMeasurement invalid() {
return new WidthMeasurement(0f, false);