mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-09-08 17:51:20 +02:00
cleanup, remove bloat
Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
parent
eb0fbcdfa3
commit
48967f7061
@ -3,6 +3,7 @@ package stirling.software.SPDF.service;
|
|||||||
import java.awt.Color;
|
import java.awt.Color;
|
||||||
import java.io.ByteArrayOutputStream;
|
import java.io.ByteArrayOutputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
@ -708,8 +709,7 @@ public class RedactionService {
|
|||||||
|
|
||||||
private static String tryFontBasedExtraction(COSString cosString, PDFont font) {
|
private static String tryFontBasedExtraction(COSString cosString, PDFont font) {
|
||||||
try {
|
try {
|
||||||
String decoded = TextDecodingHelper.tryDecodeWithFont(font, cosString);
|
return TextDecodingHelper.tryDecodeWithFont(font, cosString);
|
||||||
return decoded;
|
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
@ -888,56 +888,20 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
String createPlaceholderWithFont(String originalWord, PDFont font) {
|
private static float calculateCharacterSumWidth(PDFont font, String text) {
|
||||||
if (originalWord == null || originalWord.isEmpty()) return " ";
|
float totalWidth = 0f;
|
||||||
|
for (char c : text.toCharArray()) {
|
||||||
if (font != null && TextEncodingHelper.isFontSubset(font.getName())) {
|
|
||||||
try {
|
try {
|
||||||
// Use helper to get accurate width at fontSize=1.0
|
totalWidth += font.getStringWidth(String.valueOf(c));
|
||||||
float originalWidth =
|
|
||||||
WidthCalculator.calculateAccurateWidth(font, originalWord, 1.0f);
|
|
||||||
String result =
|
|
||||||
createAlternativePlaceholder(originalWord, originalWidth, font, 1.0f);
|
|
||||||
return result != null ? result : " ".repeat(Math.max(1, originalWord.length()));
|
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
return " ".repeat(Math.max(1, originalWord.length()));
|
return -1f;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
return totalWidth;
|
||||||
|
}
|
||||||
|
|
||||||
return " ".repeat(Math.max(1, originalWord.length()));
|
private static boolean isValidTokenIndex(List<Object> tokens, int index) {
|
||||||
}
|
return index >= 0 && index < tokens.size();
|
||||||
|
|
||||||
String createPlaceholderWithWidth(
|
|
||||||
String originalWord, float targetWidth, PDFont font, float fontSize) {
|
|
||||||
if (originalWord == null || originalWord.isEmpty()) return " ";
|
|
||||||
if (font == null || fontSize <= 0) return " ".repeat(Math.max(1, originalWord.length()));
|
|
||||||
if (!WidthCalculator.isWidthCalculationReliable(font))
|
|
||||||
return " ".repeat(Math.max(1, originalWord.length()));
|
|
||||||
|
|
||||||
if (TextEncodingHelper.isFontSubset(font.getName())) {
|
|
||||||
String result = createSubsetFontPlaceholder(originalWord, targetWidth, font, fontSize);
|
|
||||||
return result != null
|
|
||||||
? result
|
|
||||||
: " ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1));
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
float spaceWidth = WidthCalculator.calculateAccurateWidth(font, " ", fontSize);
|
|
||||||
if (spaceWidth <= 0) {
|
|
||||||
return createAlternativePlaceholder(originalWord, targetWidth, font, fontSize);
|
|
||||||
}
|
|
||||||
|
|
||||||
int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth));
|
|
||||||
int maxSpaces =
|
|
||||||
Math.max(
|
|
||||||
originalWord.length() * 2, Math.round(targetWidth / spaceWidth * 1.5f));
|
|
||||||
return " ".repeat(Math.min(spaceCount, maxSpaces));
|
|
||||||
} catch (Exception e) {
|
|
||||||
String result = createAlternativePlaceholder(originalWord, targetWidth, font, fontSize);
|
|
||||||
return result != null
|
|
||||||
? result
|
|
||||||
: " ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private String createSubsetFontPlaceholder(
|
private String createSubsetFontPlaceholder(
|
||||||
@ -1020,35 +984,16 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private String createAlternativePlaceholder(
|
private static boolean isValidTokenForOperator(Object token, String operatorName) {
|
||||||
String originalWord, float targetWidth, PDFont font, float fontSize) {
|
if (token == null || operatorName == null) {
|
||||||
try {
|
return false;
|
||||||
String[] alternatives = {" ", ".", "-", "_", "~", "°", "·"};
|
|
||||||
if (TextEncodingHelper.fontSupportsCharacter(font, " ")) {
|
|
||||||
float spaceWidth = WidthCalculator.calculateAccurateWidth(font, " ", fontSize);
|
|
||||||
if (spaceWidth > 0) {
|
|
||||||
int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth));
|
|
||||||
int maxSpaces = originalWord.length() * 2;
|
|
||||||
return " ".repeat(Math.min(spaceCount, maxSpaces));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for (String alt : alternatives) {
|
|
||||||
if (" ".equals(alt)) continue;
|
|
||||||
try {
|
|
||||||
if (!TextEncodingHelper.fontSupportsCharacter(font, alt)) continue;
|
|
||||||
float cw = WidthCalculator.calculateAccurateWidth(font, alt, fontSize);
|
|
||||||
if (cw > 0) {
|
|
||||||
int count = Math.max(1, Math.round(targetWidth / cw));
|
|
||||||
int max = originalWord.length() * 2;
|
|
||||||
return " ".repeat(Math.min(count, max));
|
|
||||||
}
|
|
||||||
} catch (Exception ignored) {
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return " ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1));
|
|
||||||
} catch (Exception e) {
|
|
||||||
return " ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return switch (operatorName) {
|
||||||
|
case "Tj", "'", "\"" -> token instanceof COSString;
|
||||||
|
case "TJ" -> token instanceof COSArray;
|
||||||
|
default -> true;
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<TextSegment> extractTextSegments(
|
private List<TextSegment> extractTextSegments(
|
||||||
@ -1257,199 +1202,25 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<MatchRange> findAllMatchesAggressive(
|
private static int getActualStringLength(COSString cosString, PDFont font) {
|
||||||
List<TextSegment> segments,
|
|
||||||
List<Object> tokens,
|
|
||||||
Set<String> targetWords,
|
|
||||||
boolean useRegex,
|
|
||||||
boolean wholeWordSearch) {
|
|
||||||
List<Pattern> patterns =
|
|
||||||
TextFinderUtils.createOptimizedSearchPatterns(
|
|
||||||
targetWords, useRegex, wholeWordSearch);
|
|
||||||
List<MatchRange> result = new ArrayList<>();
|
|
||||||
Map<Integer, List<AggressiveSegMatch>> perSegMatches = new HashMap<>();
|
|
||||||
try {
|
try {
|
||||||
String completeText = buildCompleteText(segments);
|
if (font == null) return cosString.getString().length();
|
||||||
if (!completeText.isEmpty()) {
|
String decodedText = TextDecodingHelper.tryDecodeWithFont(font, cosString);
|
||||||
List<MatchRange> global =
|
return decodedText != null ? decodedText.length() : cosString.getString().length();
|
||||||
findAllMatches(completeText, targetWords, useRegex, wholeWordSearch);
|
} catch (Exception e) {
|
||||||
if (!global.isEmpty()) {
|
return cosString.getString().length();
|
||||||
result.addAll(global);
|
|
||||||
} else if (!useRegex && !targetWords.isEmpty()) {
|
|
||||||
String lower = completeText.toLowerCase();
|
|
||||||
for (String word : targetWords) {
|
|
||||||
String w = word.toLowerCase();
|
|
||||||
int idx = lower.indexOf(w);
|
|
||||||
while (idx >= 0) {
|
|
||||||
result.add(new MatchRange(idx, idx + w.length()));
|
|
||||||
idx = lower.indexOf(w, idx + 1);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (Exception ignored) {
|
|
||||||
}
|
|
||||||
|
|
||||||
List<String> decodedPerSegment = new ArrayList<>(segments.size());
|
private static float calculateSafeWidth(String text, PDFont font, float fontSize) {
|
||||||
List<Integer> decStarts = new ArrayList<>(segments.size());
|
|
||||||
List<Integer> decEnds = new ArrayList<>(segments.size());
|
|
||||||
int decCursor = 0;
|
|
||||||
for (TextSegment seg : segments) {
|
|
||||||
String decoded = null;
|
|
||||||
try {
|
try {
|
||||||
Object tok = tokens.get(seg.tokenIndex);
|
if (font != null && fontSize > 0) {
|
||||||
if (("Tj".equals(seg.operatorName)
|
return WidthCalculator.calculateAccurateWidth(font, text, fontSize);
|
||||||
|| "'".equals(seg.operatorName)
|
|
||||||
|| "\"".equals(seg.operatorName))
|
|
||||||
&& tok instanceof COSString cs) {
|
|
||||||
decoded = TextDecodingHelper.tryDecodeWithFont(seg.font, cs);
|
|
||||||
} else if ("TJ".equals(seg.operatorName) && tok instanceof COSArray arr) {
|
|
||||||
StringBuilder sb = new StringBuilder();
|
|
||||||
for (COSBase el : arr) {
|
|
||||||
if (el instanceof COSString s) {
|
|
||||||
String d = TextDecodingHelper.tryDecodeWithFont(seg.font, s);
|
|
||||||
sb.append(d != null ? d : s.getString());
|
|
||||||
}
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
// Width calculation failed
|
||||||
}
|
}
|
||||||
decoded = sb.toString();
|
return 0f;
|
||||||
}
|
|
||||||
} catch (Exception ignored) {
|
|
||||||
}
|
|
||||||
String basis = (decoded != null) ? decoded : seg.getText();
|
|
||||||
decodedPerSegment.add(basis);
|
|
||||||
decStarts.add(decCursor);
|
|
||||||
decCursor += basis.length();
|
|
||||||
decEnds.add(decCursor);
|
|
||||||
}
|
|
||||||
StringBuilder decodedCompleteSb = new StringBuilder();
|
|
||||||
for (String d : decodedPerSegment) {
|
|
||||||
decodedCompleteSb.append(d);
|
|
||||||
}
|
|
||||||
String decodedComplete = decodedCompleteSb.toString();
|
|
||||||
if (!decodedComplete.isEmpty()) {
|
|
||||||
List<Pattern> patternsDec =
|
|
||||||
TextFinderUtils.createOptimizedSearchPatterns(
|
|
||||||
targetWords, useRegex, wholeWordSearch);
|
|
||||||
for (Pattern p : patternsDec) {
|
|
||||||
try {
|
|
||||||
var m = p.matcher(decodedComplete);
|
|
||||||
while (m.find()) {
|
|
||||||
int gStart = m.start();
|
|
||||||
int gEnd = m.end();
|
|
||||||
for (int sIdx = 0; sIdx < segments.size(); sIdx++) {
|
|
||||||
int sStart = decStarts.get(sIdx);
|
|
||||||
int sEnd = decEnds.get(sIdx);
|
|
||||||
int ovStart = Math.max(gStart, sStart);
|
|
||||||
int ovEnd = Math.min(gEnd, sEnd);
|
|
||||||
if (ovStart < ovEnd) {
|
|
||||||
int localStart = ovStart - sStart;
|
|
||||||
int localEnd = ovEnd - sStart;
|
|
||||||
perSegMatches
|
|
||||||
.computeIfAbsent(sIdx, k -> new ArrayList<>())
|
|
||||||
.add(new AggressiveSegMatch(sIdx, localStart, localEnd));
|
|
||||||
TextSegment seg = segments.get(sIdx);
|
|
||||||
int mappedStart = seg.getStartPos();
|
|
||||||
int mappedEnd = Math.min(seg.getEndPos(), seg.getStartPos() + 1);
|
|
||||||
result.add(new MatchRange(mappedStart, mappedEnd));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (Exception ignored) {
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (perSegMatches.isEmpty() && !useRegex && !targetWords.isEmpty()) {
|
|
||||||
String lower = decodedComplete.toLowerCase();
|
|
||||||
for (String word : targetWords) {
|
|
||||||
String w = word.toLowerCase();
|
|
||||||
int idx = lower.indexOf(w);
|
|
||||||
while (idx >= 0) {
|
|
||||||
int gStart = idx;
|
|
||||||
int gEnd = idx + w.length();
|
|
||||||
for (int sIdx = 0; sIdx < segments.size(); sIdx++) {
|
|
||||||
int sStart = decStarts.get(sIdx);
|
|
||||||
int sEnd = decEnds.get(sIdx);
|
|
||||||
int ovStart = Math.max(gStart, sStart);
|
|
||||||
int ovEnd = Math.min(gEnd, sEnd);
|
|
||||||
if (ovStart < ovEnd) {
|
|
||||||
int localStart = ovStart - sStart;
|
|
||||||
int localEnd = ovEnd - sStart;
|
|
||||||
perSegMatches
|
|
||||||
.computeIfAbsent(sIdx, k -> new ArrayList<>())
|
|
||||||
.add(new AggressiveSegMatch(sIdx, localStart, localEnd));
|
|
||||||
TextSegment seg = segments.get(sIdx);
|
|
||||||
int mappedStart = seg.getStartPos();
|
|
||||||
int mappedEnd = Math.min(seg.getEndPos(), seg.getStartPos() + 1);
|
|
||||||
result.add(new MatchRange(mappedStart, mappedEnd));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
idx = lower.indexOf(w, idx + 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!perSegMatches.isEmpty()) {
|
|
||||||
this.aggressiveSegMatches = perSegMatches;
|
|
||||||
} else {
|
|
||||||
this.aggressiveSegMatches = null;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (TextSegment seg : segments) {
|
|
||||||
String decoded = null;
|
|
||||||
try {
|
|
||||||
Object tok = tokens.get(seg.tokenIndex);
|
|
||||||
if (("Tj".equals(seg.operatorName) || "'".equals(seg.operatorName))
|
|
||||||
&& tok instanceof COSString cs) {
|
|
||||||
decoded = TextDecodingHelper.tryDecodeWithFont(seg.font, cs);
|
|
||||||
} else if ("TJ".equals(seg.operatorName) && tok instanceof COSArray arr) {
|
|
||||||
StringBuilder sb = new StringBuilder();
|
|
||||||
for (COSBase el : arr) {
|
|
||||||
if (el instanceof COSString s) {
|
|
||||||
String d = TextDecodingHelper.tryDecodeWithFont(seg.font, s);
|
|
||||||
sb.append(d != null ? d : s.getString());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
decoded = sb.toString();
|
|
||||||
}
|
|
||||||
} catch (Exception ignored) {
|
|
||||||
}
|
|
||||||
String basis = (decoded != null && !decoded.isEmpty()) ? decoded : seg.getText();
|
|
||||||
boolean any = false;
|
|
||||||
for (Pattern p : patterns) {
|
|
||||||
try {
|
|
||||||
var m = p.matcher(basis);
|
|
||||||
while (m.find()) {
|
|
||||||
any = true;
|
|
||||||
result.add(new MatchRange(seg.getStartPos(), seg.getStartPos()));
|
|
||||||
}
|
|
||||||
} catch (Exception ignored) {
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!any) {
|
|
||||||
NormalizedMap nm = buildNormalizedMap(seg.getText());
|
|
||||||
if (!nm.norm.isEmpty()) {
|
|
||||||
for (String word : targetWords) {
|
|
||||||
String normWord = normalizeForFuzzy(word);
|
|
||||||
if (normWord.isEmpty()) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
int idx = nm.norm.indexOf(normWord);
|
|
||||||
while (idx >= 0) {
|
|
||||||
int origStart = nm.map[idx];
|
|
||||||
int origEnd =
|
|
||||||
nm.map[Math.min(idx + normWord.length() - 1, nm.map.length - 1)]
|
|
||||||
+ 1;
|
|
||||||
result.add(
|
|
||||||
new MatchRange(
|
|
||||||
seg.getStartPos() + origStart,
|
|
||||||
seg.getStartPos() + origEnd));
|
|
||||||
idx = nm.norm.indexOf(normWord, idx + 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
result.sort(Comparator.comparingInt(MatchRange::getStartPos));
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<MatchRange> findMatchesInSegments(
|
private List<MatchRange> findMatchesInSegments(
|
||||||
@ -1642,67 +1413,43 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private float calculateCharacterSumWidth(PDFont font, String text) {
|
private static void addSpacingAdjustment(
|
||||||
float totalWidth = 0f;
|
COSArray newArray, TextSegment segment, String originalText, String modifiedText) {
|
||||||
for (char c : text.toCharArray()) {
|
|
||||||
try {
|
try {
|
||||||
totalWidth += font.getStringWidth(String.valueOf(c));
|
if (segment.getFont() == null || segment.getFontSize() <= 0) return;
|
||||||
|
|
||||||
|
float originalWidth =
|
||||||
|
calculateSafeWidth(originalText, segment.getFont(), segment.getFontSize());
|
||||||
|
float modifiedWidth =
|
||||||
|
calculateSafeWidth(modifiedText, segment.getFont(), segment.getFontSize());
|
||||||
|
float adjustment = originalWidth - modifiedWidth;
|
||||||
|
|
||||||
|
if (Math.abs(adjustment) > PRECISION_THRESHOLD) {
|
||||||
|
float kerning = (-adjustment / segment.getFontSize()) * FONT_SCALE_FACTOR * 1.10f;
|
||||||
|
if (Math.abs(kerning) < 1000) {
|
||||||
|
newArray.add(new COSFloat(kerning));
|
||||||
|
}
|
||||||
|
}
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
return -1f;
|
// Failed to add spacing adjustment
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return totalWidth;
|
|
||||||
}
|
|
||||||
|
|
||||||
private WidthCalculationResult calculatePreciseWidthAdjustment(
|
private static TokenModificationResult updateOperatorSafely(
|
||||||
TextSegment segment, List<MatchRange> matches, String text) {
|
List<Object> tokens, int tokenIndex, String originalOperator) {
|
||||||
float totalOriginalWidth = 0f, totalPlaceholderWidth = 0f;
|
|
||||||
int processedMatches = 0;
|
|
||||||
List<String> warnings = new ArrayList<>();
|
|
||||||
|
|
||||||
for (MatchRange match : matches) {
|
|
||||||
try {
|
try {
|
||||||
int segStart = Math.max(0, match.getStartPos() - segment.getStartPos());
|
int operatorIndex = tokenIndex + 1;
|
||||||
int segEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos());
|
if (isValidTokenIndex(tokens, operatorIndex)
|
||||||
|
&& tokens.get(operatorIndex) instanceof Operator op
|
||||||
if (segStart >= text.length() || segEnd <= segStart || segStart < 0) {
|
&& op.getName().equals(originalOperator)) {
|
||||||
warnings.add("Invalid bounds: " + segStart + "-" + segEnd);
|
tokens.set(operatorIndex, Operator.getOperator("TJ"));
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
|
return TokenModificationResult.success();
|
||||||
String originalPart = text.substring(segStart, segEnd);
|
|
||||||
if (originalPart.isEmpty()) continue;
|
|
||||||
|
|
||||||
WidthMeasurement originalMeasurement =
|
|
||||||
measureTextWidth(segment.getFont(), originalPart, segment.getFontSize());
|
|
||||||
if (!originalMeasurement.isValid()) {
|
|
||||||
warnings.add(
|
|
||||||
"Cannot measure: "
|
|
||||||
+ originalPart.substring(
|
|
||||||
0, Math.min(10, originalPart.length())));
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
String placeholderPart = createSafePlaceholder(originalPart, segment);
|
|
||||||
WidthMeasurement placeholderMeasurement =
|
|
||||||
measureTextWidth(segment.getFont(), placeholderPart, segment.getFontSize());
|
|
||||||
|
|
||||||
totalOriginalWidth += originalMeasurement.getWidth();
|
|
||||||
totalPlaceholderWidth +=
|
|
||||||
placeholderMeasurement.isValid()
|
|
||||||
? placeholderMeasurement.getWidth()
|
|
||||||
: originalMeasurement.getWidth();
|
|
||||||
processedMatches++;
|
|
||||||
|
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
warnings.add("Error: " + e.getMessage());
|
return TokenModificationResult.success(); // Non-critical failure
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return new WidthCalculationResult(
|
|
||||||
totalOriginalWidth - totalPlaceholderWidth, processedMatches, warnings);
|
|
||||||
}
|
|
||||||
|
|
||||||
private WidthMeasurement measureTextWidth(PDFont font, String text, float fontSize) {
|
private WidthMeasurement measureTextWidth(PDFont font, String text, float fontSize) {
|
||||||
try {
|
try {
|
||||||
float fontUnits = safeGetStringWidth(font, text);
|
float fontUnits = safeGetStringWidth(font, text);
|
||||||
@ -1724,19 +1471,37 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private String createSafePlaceholder(String originalText, TextSegment segment) {
|
private static String tryEncodingFallbacks(COSString cosString) {
|
||||||
try {
|
try {
|
||||||
return createPlaceholderWithWidth(
|
byte[] bytes = cosString.getBytes();
|
||||||
originalText,
|
if (bytes.length == 0) return "";
|
||||||
measureTextWidth(segment.getFont(), originalText, segment.getFontSize())
|
|
||||||
.getWidth(),
|
String[] encodings = {"UTF-8", "UTF-16BE", "UTF-16LE", "ISO-8859-1", "Windows-1252"};
|
||||||
segment.getFont(),
|
|
||||||
segment.getFontSize());
|
for (String encoding : encodings) {
|
||||||
} catch (Exception e) {
|
try {
|
||||||
return "█".repeat(Math.max(1, originalText.length()));
|
if (bytes.length >= 2) {
|
||||||
|
if ((bytes[0] & 0xFF) == 0xFE && (bytes[1] & 0xFF) == 0xFF) {
|
||||||
|
return new String(
|
||||||
|
bytes, 2, bytes.length - 2, StandardCharsets.UTF_16LE);
|
||||||
|
} else if ((bytes[0] & 0xFF) == 0xFF && (bytes[1] & 0xFF) == 0xFE) {
|
||||||
|
return new String(
|
||||||
|
bytes, 2, bytes.length - 2, StandardCharsets.UTF_16LE);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
String decoded = new String(bytes, encoding);
|
||||||
|
if (!isGibberish(decoded)) {
|
||||||
|
return decoded;
|
||||||
|
}
|
||||||
|
} catch (Exception ignored) {
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
private float applySafetyBounds(
|
private float applySafetyBounds(
|
||||||
WidthCalculationResult result, TextSegment segment, String text) {
|
WidthCalculationResult result, TextSegment segment, String text) {
|
||||||
if (result.processedMatches() == 0) return 0f;
|
if (result.processedMatches() == 0) return 0f;
|
||||||
@ -1796,20 +1561,27 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean isValidTokenIndex(List<Object> tokens, int index) {
|
private static boolean isGibberish(String text) {
|
||||||
return index >= 0 && index < tokens.size();
|
if (text == null || text.trim().isEmpty()) {
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean isValidTokenForOperator(Object token, String operatorName) {
|
int questionMarks = 0;
|
||||||
if (token == null || operatorName == null) {
|
int replacementChars = 0;
|
||||||
return false;
|
int totalChars = text.length();
|
||||||
|
|
||||||
|
for (char c : text.toCharArray()) {
|
||||||
|
if (c == '?') questionMarks++;
|
||||||
|
if (c == '\uFFFD') replacementChars++;
|
||||||
}
|
}
|
||||||
|
|
||||||
return switch (operatorName) {
|
double problematicRatio = (double) (questionMarks + replacementChars) / totalChars;
|
||||||
case "Tj", "'", "\"" -> token instanceof COSString;
|
return problematicRatio > 0.3;
|
||||||
case "TJ" -> token instanceof COSArray;
|
}
|
||||||
default -> true;
|
|
||||||
};
|
private static WipeResult wipeAllSemanticTextInTokens(List<Object> tokens) {
|
||||||
|
return wipeAllSemanticTextInTokens(
|
||||||
|
tokens, true); // Default to removing TU for backward compatibility
|
||||||
}
|
}
|
||||||
|
|
||||||
private COSArray createRedactedTJArray(
|
private COSArray createRedactedTJArray(
|
||||||
@ -1844,16 +1616,26 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private int getActualStringLength(COSString cosString, PDFont font) {
|
String createPlaceholderWithFont(String originalWord, PDFont font) {
|
||||||
|
if (originalWord == null || originalWord.isEmpty()) return " ";
|
||||||
|
|
||||||
|
final String repeat = " ".repeat(Math.max(1, originalWord.length()));
|
||||||
|
if (font != null && TextEncodingHelper.isFontSubset(font.getName())) {
|
||||||
try {
|
try {
|
||||||
if (font == null) return cosString.getString().length();
|
// Use helper to get accurate width at fontSize=1.0
|
||||||
String decodedText = TextDecodingHelper.tryDecodeWithFont(font, cosString);
|
float originalWidth =
|
||||||
return decodedText != null ? decodedText.length() : cosString.getString().length();
|
WidthCalculator.calculateAccurateWidth(font, originalWord, 1.0f);
|
||||||
|
String result =
|
||||||
|
createAlternativePlaceholder(originalWord, originalWidth, font, 1.0f);
|
||||||
|
return result != null ? result : repeat;
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
return cosString.getString().length();
|
return repeat;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return repeat;
|
||||||
|
}
|
||||||
|
|
||||||
private TokenModificationResult performTokenModification(
|
private TokenModificationResult performTokenModification(
|
||||||
List<Object> tokens,
|
List<Object> tokens,
|
||||||
Object token,
|
Object token,
|
||||||
@ -1913,15 +1695,33 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private float calculateSafeWidth(String text, PDFont font, float fontSize) {
|
String createPlaceholderWithWidth(
|
||||||
|
String originalWord, float targetWidth, PDFont font, float fontSize) {
|
||||||
|
if (originalWord == null || originalWord.isEmpty()) return " ";
|
||||||
|
if (font == null || fontSize <= 0) return " ".repeat(Math.max(1, originalWord.length()));
|
||||||
|
if (!WidthCalculator.isWidthCalculationReliable(font))
|
||||||
|
return " ".repeat(originalWord.length());
|
||||||
|
|
||||||
|
final String repeat = " ".repeat(Math.max(1, originalWord.length()));
|
||||||
|
if (TextEncodingHelper.isFontSubset(font.getName())) {
|
||||||
|
return createSubsetFontPlaceholder(originalWord, targetWidth, font, fontSize);
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
if (font != null && fontSize > 0) {
|
float spaceWidth = WidthCalculator.calculateAccurateWidth(font, " ", fontSize);
|
||||||
return WidthCalculator.calculateAccurateWidth(font, text, fontSize);
|
if (spaceWidth <= 0) {
|
||||||
|
return createAlternativePlaceholder(originalWord, targetWidth, font, fontSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth));
|
||||||
|
int maxSpaces =
|
||||||
|
Math.max(
|
||||||
|
originalWord.length() * 2, Math.round(targetWidth / spaceWidth * 1.5f));
|
||||||
|
return " ".repeat(Math.min(spaceCount, maxSpaces));
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
// Width calculation failed
|
String result = createAlternativePlaceholder(originalWord, targetWidth, font, fontSize);
|
||||||
|
return result != null ? result : repeat;
|
||||||
}
|
}
|
||||||
return 0f;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private TokenModificationResult convertToTJWithAdjustment(
|
private TokenModificationResult convertToTJWithAdjustment(
|
||||||
@ -1949,42 +1749,208 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void addSpacingAdjustment(
|
private String createAlternativePlaceholder(
|
||||||
COSArray newArray, TextSegment segment, String originalText, String modifiedText) {
|
String originalWord, float targetWidth, PDFont font, float fontSize) {
|
||||||
|
final String repeat =
|
||||||
|
" ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1));
|
||||||
try {
|
try {
|
||||||
if (segment.getFont() == null || segment.getFontSize() <= 0) return;
|
String[] alternatives = {" ", ".", "-", "_", "~", "°", "·"};
|
||||||
|
if (TextEncodingHelper.fontSupportsCharacter(font, " ")) {
|
||||||
float originalWidth =
|
float spaceWidth = WidthCalculator.calculateAccurateWidth(font, " ", fontSize);
|
||||||
calculateSafeWidth(originalText, segment.getFont(), segment.getFontSize());
|
if (spaceWidth > 0) {
|
||||||
float modifiedWidth =
|
int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth));
|
||||||
calculateSafeWidth(modifiedText, segment.getFont(), segment.getFontSize());
|
int maxSpaces = originalWord.length() * 2;
|
||||||
float adjustment = originalWidth - modifiedWidth;
|
return " ".repeat(Math.min(spaceCount, maxSpaces));
|
||||||
|
|
||||||
if (Math.abs(adjustment) > PRECISION_THRESHOLD) {
|
|
||||||
float kerning = (-adjustment / segment.getFontSize()) * FONT_SCALE_FACTOR * 1.10f;
|
|
||||||
if (Math.abs(kerning) < 1000) {
|
|
||||||
newArray.add(new COSFloat(kerning));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (Exception e) {
|
for (String alt : alternatives) {
|
||||||
// Failed to add spacing adjustment
|
if (" ".equals(alt)) continue;
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private String extractTextFromToken(Object token, String operatorName, PDFont currentFont) {
|
|
||||||
if (token == null || operatorName == null) return "";
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
return switch (operatorName) {
|
if (!TextEncodingHelper.fontSupportsCharacter(font, alt)) continue;
|
||||||
case "Tj" -> handleTjOperator(token, currentFont);
|
float cw = WidthCalculator.calculateAccurateWidth(font, alt, fontSize);
|
||||||
case "'" -> handleQuotedOperator(token, currentFont);
|
if (cw > 0) {
|
||||||
case "\"" -> handleQuotedOperator(token, currentFont);
|
int count = Math.max(1, Math.round(targetWidth / cw));
|
||||||
case "TJ" -> handleTJOperator(token, currentFont);
|
int max = originalWord.length() * 2;
|
||||||
default -> "";
|
return " ".repeat(Math.min(count, max));
|
||||||
};
|
|
||||||
} catch (Exception e) {
|
|
||||||
return "";
|
|
||||||
}
|
}
|
||||||
|
} catch (Exception ignored) {
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return repeat;
|
||||||
|
} catch (Exception e) {
|
||||||
|
return repeat;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<MatchRange> findAllMatchesAggressive(
|
||||||
|
List<TextSegment> segments,
|
||||||
|
List<Object> tokens,
|
||||||
|
Set<String> targetWords,
|
||||||
|
boolean useRegex,
|
||||||
|
boolean wholeWordSearch) {
|
||||||
|
List<Pattern> patterns =
|
||||||
|
TextFinderUtils.createOptimizedSearchPatterns(
|
||||||
|
targetWords, useRegex, wholeWordSearch);
|
||||||
|
List<MatchRange> result = new ArrayList<>();
|
||||||
|
Map<Integer, List<AggressiveSegMatch>> perSegMatches = new HashMap<>();
|
||||||
|
try {
|
||||||
|
String completeText = buildCompleteText(segments);
|
||||||
|
if (!completeText.isEmpty()) {
|
||||||
|
List<MatchRange> global =
|
||||||
|
findAllMatches(completeText, targetWords, useRegex, wholeWordSearch);
|
||||||
|
if (!global.isEmpty()) {
|
||||||
|
result.addAll(global);
|
||||||
|
} else if (!useRegex && !targetWords.isEmpty()) {
|
||||||
|
String lower = completeText.toLowerCase();
|
||||||
|
for (String word : targetWords) {
|
||||||
|
String w = word.toLowerCase();
|
||||||
|
int idx = lower.indexOf(w);
|
||||||
|
while (idx >= 0) {
|
||||||
|
result.add(new MatchRange(idx, idx + w.length()));
|
||||||
|
idx = lower.indexOf(w, idx + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (Exception ignored) {
|
||||||
|
}
|
||||||
|
|
||||||
|
List<String> decodedPerSegment = new ArrayList<>(segments.size());
|
||||||
|
List<Integer> decStarts = new ArrayList<>(segments.size());
|
||||||
|
List<Integer> decEnds = new ArrayList<>(segments.size());
|
||||||
|
int decCursor = 0;
|
||||||
|
for (TextSegment seg : segments) {
|
||||||
|
String decoded = null;
|
||||||
|
try {
|
||||||
|
Object tok = tokens.get(seg.tokenIndex);
|
||||||
|
if (("Tj".equals(seg.operatorName)
|
||||||
|
|| "'".equals(seg.operatorName)
|
||||||
|
|| "\"".equals(seg.operatorName))
|
||||||
|
&& tok instanceof COSString cs) {
|
||||||
|
decoded = TextDecodingHelper.tryDecodeWithFont(seg.font, cs);
|
||||||
|
} else if ("TJ".equals(seg.operatorName) && tok instanceof COSArray arr) {
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
for (COSBase el : arr) {
|
||||||
|
if (el instanceof COSString s) {
|
||||||
|
String d = TextDecodingHelper.tryDecodeWithFont(seg.font, s);
|
||||||
|
sb.append(d != null ? d : s.getString());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
decoded = sb.toString();
|
||||||
|
}
|
||||||
|
} catch (Exception ignored) {
|
||||||
|
}
|
||||||
|
String basis = (decoded != null) ? decoded : seg.getText();
|
||||||
|
decodedPerSegment.add(basis);
|
||||||
|
decStarts.add(decCursor);
|
||||||
|
decCursor += basis.length();
|
||||||
|
decEnds.add(decCursor);
|
||||||
|
}
|
||||||
|
StringBuilder decodedCompleteSb = new StringBuilder();
|
||||||
|
for (String d : decodedPerSegment) {
|
||||||
|
decodedCompleteSb.append(d);
|
||||||
|
}
|
||||||
|
String decodedComplete = decodedCompleteSb.toString();
|
||||||
|
if (!decodedComplete.isEmpty()) {
|
||||||
|
List<Pattern> patternsDec =
|
||||||
|
TextFinderUtils.createOptimizedSearchPatterns(
|
||||||
|
targetWords, useRegex, wholeWordSearch);
|
||||||
|
for (Pattern p : patternsDec) {
|
||||||
|
try {
|
||||||
|
var m = p.matcher(decodedComplete);
|
||||||
|
while (m.find()) {
|
||||||
|
int gStart = m.start();
|
||||||
|
int gEnd = m.end();
|
||||||
|
mapStartToEnd(
|
||||||
|
segments, result, perSegMatches, decStarts, decEnds, gStart, gEnd);
|
||||||
|
}
|
||||||
|
} catch (Exception ignored) {
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (perSegMatches.isEmpty() && !useRegex && !targetWords.isEmpty()) {
|
||||||
|
String lower = decodedComplete.toLowerCase();
|
||||||
|
for (String word : targetWords) {
|
||||||
|
String w = word.toLowerCase();
|
||||||
|
int idx = lower.indexOf(w);
|
||||||
|
while (idx >= 0) {
|
||||||
|
int gStart = idx;
|
||||||
|
int gEnd = idx + w.length();
|
||||||
|
mapStartToEnd(
|
||||||
|
(List<TextSegment>) segments,
|
||||||
|
(List<MatchRange>) result,
|
||||||
|
(Map<Integer, List<AggressiveSegMatch>>) perSegMatches,
|
||||||
|
decStarts,
|
||||||
|
decEnds,
|
||||||
|
gStart,
|
||||||
|
gEnd);
|
||||||
|
idx = lower.indexOf(w, idx + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!perSegMatches.isEmpty()) {
|
||||||
|
this.aggressiveSegMatches = perSegMatches;
|
||||||
|
} else {
|
||||||
|
this.aggressiveSegMatches = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (TextSegment seg : segments) {
|
||||||
|
String decoded = null;
|
||||||
|
try {
|
||||||
|
Object tok = tokens.get(seg.tokenIndex);
|
||||||
|
if (("Tj".equals(seg.operatorName) || "'".equals(seg.operatorName))
|
||||||
|
&& tok instanceof COSString cs) {
|
||||||
|
decoded = TextDecodingHelper.tryDecodeWithFont(seg.font, cs);
|
||||||
|
} else if ("TJ".equals(seg.operatorName) && tok instanceof COSArray arr) {
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
for (COSBase el : arr) {
|
||||||
|
if (el instanceof COSString s) {
|
||||||
|
String d = TextDecodingHelper.tryDecodeWithFont(seg.font, s);
|
||||||
|
sb.append(d != null ? d : s.getString());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
decoded = sb.toString();
|
||||||
|
}
|
||||||
|
} catch (Exception ignored) {
|
||||||
|
}
|
||||||
|
String basis = (decoded != null && !decoded.isEmpty()) ? decoded : seg.getText();
|
||||||
|
boolean any = false;
|
||||||
|
for (Pattern p : patterns) {
|
||||||
|
try {
|
||||||
|
var m = p.matcher(basis);
|
||||||
|
while (m.find()) {
|
||||||
|
any = true;
|
||||||
|
result.add(new MatchRange(seg.getStartPos(), seg.getStartPos()));
|
||||||
|
}
|
||||||
|
} catch (Exception ignored) {
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!any) {
|
||||||
|
NormalizedMap nm = buildNormalizedMap(seg.getText());
|
||||||
|
if (!nm.norm.isEmpty()) {
|
||||||
|
for (String word : targetWords) {
|
||||||
|
String normWord = normalizeForFuzzy(word);
|
||||||
|
if (normWord.isEmpty()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
int idx = nm.norm.indexOf(normWord);
|
||||||
|
while (idx >= 0) {
|
||||||
|
int origStart = nm.map[idx];
|
||||||
|
int origEnd =
|
||||||
|
nm.map[Math.min(idx + normWord.length() - 1, nm.map.length - 1)]
|
||||||
|
+ 1;
|
||||||
|
result.add(
|
||||||
|
new MatchRange(
|
||||||
|
seg.getStartPos() + origStart,
|
||||||
|
seg.getStartPos() + origEnd));
|
||||||
|
idx = nm.norm.indexOf(normWord, idx + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
result.sort(Comparator.comparingInt(MatchRange::getStartPos));
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
private String handleTjOperator(Object token, PDFont font) {
|
private String handleTjOperator(Object token, PDFont font) {
|
||||||
@ -2015,18 +1981,30 @@ public class RedactionService {
|
|||||||
return textBuilder.toString();
|
return textBuilder.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
private TokenModificationResult updateOperatorSafely(
|
private void mapStartToEnd(
|
||||||
List<Object> tokens, int tokenIndex, String originalOperator) {
|
List<TextSegment> segments,
|
||||||
try {
|
List<MatchRange> result,
|
||||||
int operatorIndex = tokenIndex + 1;
|
Map<Integer, List<AggressiveSegMatch>> perSegMatches,
|
||||||
if (isValidTokenIndex(tokens, operatorIndex)
|
List<Integer> decStarts,
|
||||||
&& tokens.get(operatorIndex) instanceof Operator op
|
List<Integer> decEnds,
|
||||||
&& op.getName().equals(originalOperator)) {
|
int gStart,
|
||||||
tokens.set(operatorIndex, Operator.getOperator("TJ"));
|
int gEnd) {
|
||||||
|
for (int sIdx = 0; sIdx < segments.size(); sIdx++) {
|
||||||
|
int sStart = decStarts.get(sIdx);
|
||||||
|
int sEnd = decEnds.get(sIdx);
|
||||||
|
int ovStart = Math.max(gStart, sStart);
|
||||||
|
int ovEnd = Math.min(gEnd, sEnd);
|
||||||
|
if (ovStart < ovEnd) {
|
||||||
|
int localStart = ovStart - sStart;
|
||||||
|
int localEnd = ovEnd - sStart;
|
||||||
|
perSegMatches
|
||||||
|
.computeIfAbsent(sIdx, k -> new ArrayList<>())
|
||||||
|
.add(new AggressiveSegMatch(sIdx, localStart, localEnd));
|
||||||
|
TextSegment seg = segments.get(sIdx);
|
||||||
|
int mappedStart = seg.getStartPos();
|
||||||
|
int mappedEnd = Math.min(seg.getEndPos(), seg.getStartPos() + 1);
|
||||||
|
result.add(new MatchRange(mappedStart, mappedEnd));
|
||||||
}
|
}
|
||||||
return TokenModificationResult.success();
|
|
||||||
} catch (Exception e) {
|
|
||||||
return TokenModificationResult.success(); // Non-critical failure
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2048,51 +2026,65 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private String tryEncodingFallbacks(COSString cosString) {
|
private WidthCalculationResult calculatePreciseWidthAdjustment(
|
||||||
|
TextSegment segment, List<MatchRange> matches, String text) {
|
||||||
|
float totalOriginalWidth = 0f, totalPlaceholderWidth = 0f;
|
||||||
|
int processedMatches = 0;
|
||||||
|
List<String> warnings = new ArrayList<>();
|
||||||
|
|
||||||
|
for (MatchRange match : matches) {
|
||||||
try {
|
try {
|
||||||
byte[] bytes = cosString.getBytes();
|
int segStart = Math.max(0, match.getStartPos() - segment.getStartPos());
|
||||||
if (bytes.length == 0) return "";
|
int segEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos());
|
||||||
|
|
||||||
String[] encodings = {"UTF-8", "UTF-16BE", "UTF-16LE", "ISO-8859-1", "Windows-1252"};
|
if (segStart >= text.length() || segEnd <= segStart || segStart < 0) {
|
||||||
|
warnings.add("Invalid bounds: " + segStart + "-" + segEnd);
|
||||||
for (String encoding : encodings) {
|
continue;
|
||||||
try {
|
|
||||||
if (bytes.length >= 2) {
|
|
||||||
if ((bytes[0] & 0xFF) == 0xFE && (bytes[1] & 0xFF) == 0xFF) {
|
|
||||||
return new String(bytes, 2, bytes.length - 2, "UTF-16BE");
|
|
||||||
} else if ((bytes[0] & 0xFF) == 0xFF && (bytes[1] & 0xFF) == 0xFE) {
|
|
||||||
return new String(bytes, 2, bytes.length - 2, "UTF-16LE");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
String decoded = new String(bytes, encoding);
|
String originalPart = text.substring(segStart, segEnd);
|
||||||
if (!isGibberish(decoded)) {
|
|
||||||
return decoded;
|
WidthMeasurement originalMeasurement =
|
||||||
}
|
measureTextWidth(segment.getFont(), originalPart, segment.getFontSize());
|
||||||
} catch (Exception ignored) {
|
if (!originalMeasurement.valid()) {
|
||||||
}
|
warnings.add(
|
||||||
|
"Cannot measure: "
|
||||||
|
+ originalPart.substring(
|
||||||
|
0, Math.min(10, originalPart.length())));
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
String placeholderPart = createSafePlaceholder(originalPart, segment);
|
||||||
|
WidthMeasurement placeholderMeasurement =
|
||||||
|
measureTextWidth(segment.getFont(), placeholderPart, segment.getFontSize());
|
||||||
|
|
||||||
|
totalOriginalWidth += originalMeasurement.width();
|
||||||
|
totalPlaceholderWidth +=
|
||||||
|
placeholderMeasurement.valid()
|
||||||
|
? placeholderMeasurement.width()
|
||||||
|
: originalMeasurement.width();
|
||||||
|
processedMatches++;
|
||||||
|
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
|
warnings.add("Error: " + e.getMessage());
|
||||||
}
|
}
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean isGibberish(String text) {
|
return new WidthCalculationResult(
|
||||||
if (text == null || text.trim().isEmpty()) {
|
totalOriginalWidth - totalPlaceholderWidth, processedMatches, warnings);
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int questionMarks = 0;
|
private String createSafePlaceholder(String originalText, TextSegment segment) {
|
||||||
int replacementChars = 0;
|
try {
|
||||||
int totalChars = text.length();
|
return createPlaceholderWithWidth(
|
||||||
|
originalText,
|
||||||
for (char c : text.toCharArray()) {
|
measureTextWidth(segment.getFont(), originalText, segment.getFontSize())
|
||||||
if (c == '?') questionMarks++;
|
.width(),
|
||||||
if (c == '\uFFFD') replacementChars++;
|
segment.getFont(),
|
||||||
|
segment.getFontSize());
|
||||||
|
} catch (Exception e) {
|
||||||
|
return "█".repeat(Math.max(1, originalText.length()));
|
||||||
}
|
}
|
||||||
|
|
||||||
double problematicRatio = (double) (questionMarks + replacementChars) / totalChars;
|
|
||||||
return problematicRatio > 0.3;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean isValidTJArray(COSArray array) {
|
private boolean isValidTJArray(COSArray array) {
|
||||||
@ -2105,9 +2097,19 @@ public class RedactionService {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
private WipeResult wipeAllSemanticTextInTokens(List<Object> tokens) {
|
private String extractTextFromToken(Object token, String operatorName, PDFont currentFont) {
|
||||||
return wipeAllSemanticTextInTokens(
|
if (token == null || operatorName == null) return "";
|
||||||
tokens, true); // Default to removing TU for backward compatibility
|
|
||||||
|
try {
|
||||||
|
return switch (operatorName) {
|
||||||
|
case "Tj" -> handleTjOperator(token, currentFont);
|
||||||
|
case "'", "\"" -> handleQuotedOperator(token, currentFont);
|
||||||
|
case "TJ" -> handleTJOperator(token, currentFont);
|
||||||
|
default -> "";
|
||||||
|
};
|
||||||
|
} catch (Exception e) {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void processStringElement(
|
private void processStringElement(
|
||||||
@ -2247,15 +2249,7 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Helper classes
|
// Helper classes
|
||||||
@Getter
|
private record WidthMeasurement(float width, boolean valid) {
|
||||||
private static class WidthMeasurement {
|
|
||||||
private final float width;
|
|
||||||
private final boolean valid;
|
|
||||||
|
|
||||||
public WidthMeasurement(float width, boolean valid) {
|
|
||||||
this.width = width;
|
|
||||||
this.valid = valid;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static WidthMeasurement invalid() {
|
public static WidthMeasurement invalid() {
|
||||||
return new WidthMeasurement(0f, false);
|
return new WidthMeasurement(0f, false);
|
||||||
|
Loading…
Reference in New Issue
Block a user