refactor(redaction): replace ThreadLocal with instance variables for aggressive mode handling

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
Balázs Szücs 2025-08-22 18:14:01 +02:00
parent 162b19f4ad
commit 8c6aa246a7

View File

@ -69,10 +69,8 @@ public class RedactionService {
private static final Set<String> TEXT_SHOWING_OPERATORS = Set.of("Tj", "TJ", "'", "\""); private static final Set<String> TEXT_SHOWING_OPERATORS = Set.of("Tj", "TJ", "'", "\"");
private static final COSString EMPTY_COS_STRING = new COSString(""); private static final COSString EMPTY_COS_STRING = new COSString("");
private static final int MAX_SWEEPS = 3; private static final int MAX_SWEEPS = 3;
private static final ThreadLocal<Boolean> AGGRESSIVE_MODE = private boolean aggressiveMode = false;
ThreadLocal.withInitial(() -> Boolean.FALSE); private Map<Integer, List<AggressiveSegMatch>> aggressiveSegMatches = null;
private static final ThreadLocal<Map<Integer, List<AggressiveSegMatch>>> AGGR_SEG_MATCHES =
new ThreadLocal<>();
private final CustomPDFDocumentFactory pdfDocumentFactory; private final CustomPDFDocumentFactory pdfDocumentFactory;
private static void redactAreas( private static void redactAreas(
@ -473,36 +471,12 @@ public class RedactionService {
return map; return map;
} }
private static COSString redactCosStringByDecodedRanges( private static void performFallbackModification(
PDFont font, COSString cosString, List<AggressiveSegMatch> decRanges) { List<Object> tokens, int tokenIndex, String newText) {
try { try {
byte[] bytes = cosString.getBytes(); tokens.set(tokenIndex, newText.isEmpty() ? EMPTY_COS_STRING : new COSString(newText));
DecodedMapping dm = buildDecodeMapping(font, bytes);
if (dm.text.isEmpty() || dm.charByteStart.length == 0) {
return cosString;
}
boolean[] delete = new boolean[bytes.length];
for (AggressiveSegMatch r : decRanges) {
int ds = Math.max(0, Math.min(r.decodedStart, dm.charByteStart.length));
int de = Math.max(ds, Math.min(r.decodedEnd, dm.charByteStart.length));
if (ds >= de) {
continue;
}
int byteStart = dm.charByteStart[ds];
int byteEnd = dm.charByteEnd[de - 1];
for (int bi = Math.max(0, byteStart); bi < Math.min(bytes.length, byteEnd); bi++) {
delete[bi] = true;
}
}
ByteArrayOutputStream baos = new ByteArrayOutputStream(bytes.length);
for (int bi = 0; bi < bytes.length; bi++) {
if (!delete[bi]) {
baos.write(bytes[bi]);
}
}
return new COSString(baos.toByteArray());
} catch (Exception e) { } catch (Exception e) {
return Boolean.TRUE.equals(AGGRESSIVE_MODE.get()) ? EMPTY_COS_STRING : cosString; performEmergencyFallback(tokens, tokenIndex);
} }
} }
@ -817,6 +791,81 @@ public class RedactionService {
} }
} }
/**
 * Overwrites the content stream of a tiling pattern with the supplied tokens.
 *
 * @param pattern the tiling pattern whose content stream is rewritten
 * @param redactedTokens the content-stream tokens to serialise into the pattern
 * @throws IOException propagated from opening the output stream or writing the tokens
 */
private static void writeRedactedContentToPattern(
        PDTilingPattern pattern, List<Object> redactedTokens) throws IOException {
    // try-with-resources closes the stream even when token serialisation fails.
    try (var patternOut = pattern.getContentStream().createOutputStream()) {
        var tokenWriter = new ContentStreamWriter(patternOut);
        tokenWriter.writeTokens(redactedTokens);
    }
}
/**
 * Rewrites every page's content stream with the search terms filtered out, repeating the
 * whole-document pass at most {@code MAX_SWEEPS} times.
 *
 * <p>After each pass the document is re-checked with {@code documentStillContainsTargets};
 * the loop ends as soon as that check reports no remaining targets.
 *
 * @param document the document whose pages are rewritten in place
 * @param allFoundTextsByPage previously located text per page; only tested for emptiness here
 * @param listOfText raw search terms; each one is trimmed and empty results are discarded
 * @param useRegex forwarded unchanged to the token-filtering and re-check helpers
 * @param wholeWordSearchBool forwarded unchanged to the token-filtering and re-check helpers
 * @return {@code true} only when an exception interrupted processing; {@code false} when
 *     there was nothing to process or the sweeps ran to completion
 */
public boolean performTextReplacement(
        PDDocument document,
        Map<Integer, List<PDFText>> allFoundTextsByPage,
        String[] listOfText,
        boolean useRegex,
        boolean wholeWordSearchBool) {
    if (allFoundTextsByPage.isEmpty()) {
        return false;
    }

    try {
        Set<String> terms =
                Arrays.stream(listOfText)
                        .map(String::trim)
                        .filter(term -> !term.isEmpty())
                        .collect(Collectors.toSet());

        boolean targetsRemain = true;
        int pass = 0;
        while (targetsRemain && pass < MAX_SWEEPS) {
            for (PDPage page : document.getPages()) {
                writeFilteredContentStream(
                        document,
                        page,
                        createTokensWithoutTargetText(
                                document, page, terms, useRegex, wholeWordSearchBool));
            }
            // A clean re-check ends the sweeps before MAX_SWEEPS is reached.
            targetsRemain =
                    documentStillContainsTargets(
                            document, terms, useRegex, wholeWordSearchBool);
            pass++;
        }
    } catch (Exception e) {
        // NOTE(review): the failure is reported only through the return value; callers
        // presumably treat true as "replacement failed" - confirm against the call site.
        return true;
    }
    return false;
}
/**
 * Builds a new {@code COSString} from {@code cosString} with the bytes backing the given
 * decoded-character ranges removed.
 *
 * <p>Each range is clamped to the number of decoded characters, translated to a byte span
 * through the decode mapping, and every byte inside that span is dropped from the result.
 *
 * @param font font handed to {@code buildDecodeMapping} to decode the string bytes
 * @param cosString the string to redact
 * @param decRanges decoded-character ranges ({@code decodedStart} inclusive, {@code
 *     decodedEnd} exclusive) to remove
 * @return the original string when the mapping is empty; otherwise a new string without the
 *     marked bytes. If any exception occurs, {@code EMPTY_COS_STRING} in aggressive mode
 *     and the original string otherwise
 */
private COSString redactCosStringByDecodedRanges(
        PDFont font, COSString cosString, List<AggressiveSegMatch> decRanges) {
    try {
        final byte[] raw = cosString.getBytes();
        final DecodedMapping mapping = buildDecodeMapping(font, raw);
        if (mapping.text.isEmpty() || mapping.charByteStart.length == 0) {
            return cosString;
        }

        final int charCount = mapping.charByteStart.length;
        final boolean[] dropped = new boolean[raw.length];
        for (AggressiveSegMatch range : decRanges) {
            // Clamp the decoded range into [0, charCount] and keep start <= end.
            int from = Math.max(0, Math.min(range.decodedStart, charCount));
            int to = Math.max(from, Math.min(range.decodedEnd, charCount));
            if (from < to) {
                int firstByte = Math.max(0, mapping.charByteStart[from]);
                int endByte = Math.min(raw.length, mapping.charByteEnd[to - 1]);
                for (int i = firstByte; i < endByte; i++) {
                    dropped[i] = true;
                }
            }
        }

        ByteArrayOutputStream kept = new ByteArrayOutputStream(raw.length);
        for (int i = 0; i < raw.length; i++) {
            if (dropped[i]) {
                continue;
            }
            kept.write(raw[i]);
        }
        return new COSString(kept.toByteArray());
    } catch (Exception e) {
        // On failure, aggressive mode blanks the string; otherwise it is left unchanged.
        if (this.aggressiveMode) {
            return EMPTY_COS_STRING;
        }
        return cosString;
    }
}
public void performTextReplacementAggressive( public void performTextReplacementAggressive(
PDDocument document, PDDocument document,
Map<Integer, List<PDFText>> allFoundTextsByPage, Map<Integer, List<PDFText>> allFoundTextsByPage,
@ -831,7 +880,8 @@ public class RedactionService {
.map(String::trim) .map(String::trim)
.filter(s -> !s.isEmpty()) .filter(s -> !s.isEmpty())
.collect(Collectors.toSet()); .collect(Collectors.toSet());
AGGRESSIVE_MODE.set(Boolean.TRUE); this.aggressiveMode = true;
this.aggressiveSegMatches = new HashMap<>();
try { try {
for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) { for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) {
boolean anyResidual = false; boolean anyResidual = false;
@ -839,7 +889,7 @@ public class RedactionService {
for (PDPage page : document.getPages()) { for (PDPage page : document.getPages()) {
pageIndex++; pageIndex++;
try { try {
AGGR_SEG_MATCHES.remove(); this.aggressiveSegMatches = new HashMap<>();
List<Object> filtered = List<Object> filtered =
createTokensWithoutTargetText( createTokensWithoutTargetText(
document, document,
@ -884,88 +934,8 @@ public class RedactionService {
} }
} }
} finally { } finally {
AGGRESSIVE_MODE.remove(); this.aggressiveMode = false;
} this.aggressiveSegMatches = null;
}
/**
 * Filters the search terms out of every page's content stream, sweeping the whole document
 * up to {@code MAX_SWEEPS} times.
 *
 * <p>Each sweep rewrites all pages and then asks {@code documentStillContainsTargets}
 * whether anything is left; a negative answer ends the sweeps early.
 *
 * @param document the document whose pages are rewritten in place
 * @param allFoundTextsByPage previously located text per page; only tested for emptiness here
 * @param listOfText raw search terms; each one is trimmed and empty results are discarded
 * @param useRegex forwarded unchanged to the token-filtering and re-check helpers
 * @param wholeWordSearchBool forwarded unchanged to the token-filtering and re-check helpers
 * @return {@code true} only when an exception interrupted processing; {@code false} when
 *     there was nothing to process or the sweeps ran to completion
 */
public boolean performTextReplacement(
        PDDocument document,
        Map<Integer, List<PDFText>> allFoundTextsByPage,
        String[] listOfText,
        boolean useRegex,
        boolean wholeWordSearchBool) {
    if (allFoundTextsByPage.isEmpty()) {
        return false;
    }

    try {
        Set<String> searchTerms =
                Arrays.stream(listOfText)
                        .map(String::trim)
                        .filter(candidate -> !candidate.isEmpty())
                        .collect(Collectors.toSet());

        boolean clean = false;
        for (int pass = 0; pass < MAX_SWEEPS && !clean; pass++) {
            for (PDPage page : document.getPages()) {
                List<Object> remainingTokens =
                        createTokensWithoutTargetText(
                                document, page, searchTerms, useRegex, wholeWordSearchBool);
                writeFilteredContentStream(document, page, remainingTokens);
            }
            // Once no target is left, the loop condition stops further sweeps.
            clean =
                    !documentStillContainsTargets(
                            document, searchTerms, useRegex, wholeWordSearchBool);
        }
        return false;
    } catch (Exception e) {
        // NOTE(review): the exception is not logged; the boolean result is the only
        // signal. Callers presumably treat true as failure - confirm at the call site.
        return true;
    }
}
/**
 * Parses the page's content stream into tokens, redacts XObjects reachable from the page
 * resources, and returns the page tokens after redactions have been applied.
 *
 * <p>Match detection depends on the thread-local {@code AGGRESSIVE_MODE} flag: when it is
 * set, matches come from {@code findAllMatchesAggressive}; otherwise they come from {@code
 * findAllMatches} over the concatenated segment text.
 *
 * @param document the document the page belongs to; forwarded to XObject processing
 * @param page the page whose content stream is parsed
 * @param targetWords the terms to search for
 * @param useRegex forwarded unchanged to the match finders
 * @param wholeWordSearch forwarded unchanged to the match finders
 * @return the token list produced by {@code applyRedactionsToTokens}
 * @throws IOException propagated from parsing the content stream or from the helpers
 */
List<Object> createTokensWithoutTargetText(
        PDDocument document,
        PDPage page,
        Set<String> targetWords,
        boolean useRegex,
        boolean wholeWordSearch)
        throws IOException {
    PDFStreamParser parser = new PDFStreamParser(page);
    List<Object> tokens = new ArrayList<>();
    // parseNextToken() yields null once the stream is exhausted.
    for (Object token = parser.parseNextToken();
            token != null;
            token = parser.parseNextToken()) {
        tokens.add(token);
    }

    // XObjects are processed before the page's own text is examined.
    PDResources resources = page.getResources();
    if (resources != null) {
        processPageXObjects(document, resources, targetWords, useRegex, wholeWordSearch);
    }

    List<TextSegment> textSegments =
            extractTextSegments(page, tokens, Boolean.TRUE.equals(AGGRESSIVE_MODE.get()));
    String completeText = buildCompleteText(textSegments);

    List<MatchRange> matches;
    if (Boolean.TRUE.equals(AGGRESSIVE_MODE.get())) {
        matches =
                findAllMatchesAggressive(
                        textSegments, tokens, targetWords, useRegex, wholeWordSearch);
    } else {
        matches = findAllMatches(completeText, targetWords, useRegex, wholeWordSearch);
    }
    return applyRedactionsToTokens(tokens, textSegments, matches);
}
private void processPageXObjects(
PDDocument document,
PDResources resources,
Set<String> targetWords,
boolean useRegex,
boolean wholeWordSearch) {
for (COSName xobjName : resources.getXObjectNames()) {
try {
PDXObject xobj = resources.getXObject(xobjName);
if (xobj instanceof PDFormXObject formXObj) {
processFormXObject(document, formXObj, targetWords, useRegex, wholeWordSearch);
}
} catch (Exception ignored) {
}
} }
} }
@ -1073,12 +1043,39 @@ public class RedactionService {
return sb.toString(); return sb.toString();
} }
private static void performFallbackModification(List<Object> tokens, int tokenIndex, String newText) { List<Object> createTokensWithoutTargetText(
try { PDDocument document,
tokens.set(tokenIndex, newText.isEmpty() ? EMPTY_COS_STRING : new COSString(newText)); PDPage page,
} catch (Exception e) { Set<String> targetWords,
performEmergencyFallback(tokens, tokenIndex); boolean useRegex,
boolean wholeWordSearch)
throws IOException {
PDFStreamParser parser = new PDFStreamParser(page);
List<Object> tokens = new ArrayList<>();
Object tk;
while (true) {
final Object parsedNextToken = parser.parseNextToken();
if ((tk = parsedNextToken) == null) break;
tokens.add(tk);
} }
PDResources resources = page.getResources();
if (resources != null) {
processPageXObjects(
document,
resources,
targetWords,
useRegex,
wholeWordSearch,
this.aggressiveMode);
}
List<TextSegment> textSegments = extractTextSegments(page, tokens, this.aggressiveMode);
String completeText = buildCompleteText(textSegments);
List<MatchRange> matches =
this.aggressiveMode
? findAllMatchesAggressive(
textSegments, tokens, targetWords, useRegex, wholeWordSearch)
: findAllMatches(completeText, targetWords, useRegex, wholeWordSearch);
return applyRedactionsToTokens(tokens, textSegments, matches);
} }
private static void performEmergencyFallback(List<Object> tokens, int tokenIndex) { private static void performEmergencyFallback(List<Object> tokens, int tokenIndex) {
@ -1089,50 +1086,23 @@ public class RedactionService {
} }
} }
private String applyRedactionsToSegmentText(TextSegment segment, List<MatchRange> matches) { private void processPageXObjects(
String text = segment.getText(); PDDocument document,
if (!Boolean.TRUE.equals(AGGRESSIVE_MODE.get()) PDResources resources,
&& segment.getFont() != null Set<String> targetWords,
&& !TextEncodingHelper.isTextSegmentRemovable(segment.getFont(), text)) { boolean useRegex,
return text; boolean wholeWordSearch,
} boolean aggressive) {
for (COSName xobjName : resources.getXObjectNames()) {
StringBuilder result = new StringBuilder(text); try {
for (MatchRange match : matches) { PDXObject xobj = resources.getXObject(xobjName);
int segmentStart = Math.max(0, match.getStartPos() - segment.getStartPos()); if (xobj instanceof PDFormXObject formXObj) {
int segmentEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos()); processFormXObject(
if (segmentStart < text.length() && segmentEnd > segmentStart) { document, formXObj, targetWords, useRegex, wholeWordSearch, aggressive);
String originalPart = text.substring(segmentStart, segmentEnd);
if (!Boolean.TRUE.equals(AGGRESSIVE_MODE.get())
&& segment.getFont() != null
&& !TextEncodingHelper.isTextSegmentRemovable(
segment.getFont(), originalPart)) {
continue;
}
if (Boolean.TRUE.equals(AGGRESSIVE_MODE.get())) {
result.replace(segmentStart, segmentEnd, "");
} else {
float originalWidth = 0;
if (segment.getFont() != null && segment.getFontSize() > 0) {
originalWidth =
safeGetStringWidth(segment.getFont(), originalPart)
/ FONT_SCALE_FACTOR
* segment.getFontSize();
}
String placeholder =
(originalWidth > 0)
? createPlaceholderWithWidth(
originalPart,
originalWidth,
segment.getFont(),
segment.getFontSize())
: createPlaceholderWithFont(originalPart, segment.getFont());
result.replace(segmentStart, segmentEnd, placeholder);
} }
} catch (Exception ignored) {
} }
} }
return result.toString();
} }
private float safeGetStringWidth(PDFont font, String text) { private float safeGetStringWidth(PDFont font, String text) {
@ -1358,12 +1328,50 @@ public class RedactionService {
return copy; return copy;
} }
private static void writeRedactedContentToPattern(PDTilingPattern pattern, List<Object> redactedTokens) private String applyRedactionsToSegmentText(TextSegment segment, List<MatchRange> matches) {
throws IOException { String text = segment.getText();
var contentStream = pattern.getContentStream(); if (!this.aggressiveMode
try (var out = contentStream.createOutputStream()) { && segment.getFont() != null
new ContentStreamWriter(out).writeTokens(redactedTokens); && !TextEncodingHelper.isTextSegmentRemovable(segment.getFont(), text)) {
return text;
} }
StringBuilder result = new StringBuilder(text);
for (MatchRange match : matches) {
int segmentStart = Math.max(0, match.getStartPos() - segment.getStartPos());
int segmentEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos());
if (segmentStart < text.length() && segmentEnd > segmentStart) {
String originalPart = text.substring(segmentStart, segmentEnd);
if (!this.aggressiveMode
&& segment.getFont() != null
&& !TextEncodingHelper.isTextSegmentRemovable(
segment.getFont(), originalPart)) {
continue;
}
if (this.aggressiveMode) {
result.replace(segmentStart, segmentEnd, "");
} else {
float originalWidth = 0;
if (segment.getFont() != null && segment.getFontSize() > 0) {
originalWidth =
safeGetStringWidth(segment.getFont(), originalPart)
/ FONT_SCALE_FACTOR
* segment.getFontSize();
}
String placeholder =
(originalWidth > 0)
? createPlaceholderWithWidth(
originalPart,
originalWidth,
segment.getFont(),
segment.getFontSize())
: createPlaceholderWithFont(originalPart, segment.getFont());
result.replace(segmentStart, segmentEnd, placeholder);
}
}
}
return result.toString();
} }
private List<MatchRange> findAllMatchesAggressive( private List<MatchRange> findAllMatchesAggressive(
@ -1497,9 +1505,9 @@ public class RedactionService {
} }
} }
if (!perSegMatches.isEmpty()) { if (!perSegMatches.isEmpty()) {
AGGR_SEG_MATCHES.set(perSegMatches); this.aggressiveSegMatches = perSegMatches;
} else { } else {
AGGR_SEG_MATCHES.remove(); this.aggressiveSegMatches = null;
} }
for (TextSegment seg : segments) { for (TextSegment seg : segments) {
@ -1564,8 +1572,8 @@ public class RedactionService {
private List<Object> applyRedactionsToTokens( private List<Object> applyRedactionsToTokens(
List<Object> tokens, List<TextSegment> textSegments, List<MatchRange> matches) { List<Object> tokens, List<TextSegment> textSegments, List<MatchRange> matches) {
List<Object> newTokens = new ArrayList<>(tokens); List<Object> newTokens = new ArrayList<>(tokens);
if (Boolean.TRUE.equals(AGGRESSIVE_MODE.get())) { if (this.aggressiveMode) {
Map<Integer, List<AggressiveSegMatch>> perSeg = AGGR_SEG_MATCHES.get(); Map<Integer, List<AggressiveSegMatch>> perSeg = this.aggressiveSegMatches;
if (perSeg != null && !perSeg.isEmpty()) { if (perSeg != null && !perSeg.isEmpty()) {
List<Integer> segIndices = new ArrayList<>(perSeg.keySet()); List<Integer> segIndices = new ArrayList<>(perSeg.keySet());
segIndices.sort( segIndices.sort(
@ -1887,7 +1895,7 @@ public class RedactionService {
"Processing redaction: segment={}, matches={}, aggressive={}", "Processing redaction: segment={}, matches={}, aggressive={}",
segment, segment,
matches.size(), matches.size(),
AGGRESSIVE_MODE.get()); this.aggressiveMode);
try { try {
COSArray newArray = new COSArray(); COSArray newArray = new COSArray();
@ -2233,7 +2241,7 @@ public class RedactionService {
String originalText = getDecodedString(cosString, segment.getFont()); String originalText = getDecodedString(cosString, segment.getFont());
if (!Boolean.TRUE.equals(AGGRESSIVE_MODE.get()) if (!this.aggressiveMode
&& segment.getFont() != null && segment.getFont() != null
&& !TextEncodingHelper.isTextSegmentRemovable(segment.getFont(), originalText)) { && !TextEncodingHelper.isTextSegmentRemovable(segment.getFont(), originalText)) {
newArray.add(cosString); // Keep original COSString to preserve encoding newArray.add(cosString); // Keep original COSString to preserve encoding
@ -2245,9 +2253,7 @@ public class RedactionService {
// Sort matches by start position to process them in order // Sort matches by start position to process them in order
List<MatchRange> sortedMatches = List<MatchRange> sortedMatches =
matches.stream() matches.stream().sorted(Comparator.comparingInt(MatchRange::getStartPos)).toList();
.sorted(Comparator.comparingInt(MatchRange::getStartPos))
.toList();
int cumulativeOffset = 0; // Track cumulative text changes int cumulativeOffset = 0; // Track cumulative text changes
@ -2265,14 +2271,15 @@ public class RedactionService {
newText.length(), newText.length(),
overlapEnd - stringStartInPage - cumulativeOffset); overlapEnd - stringStartInPage - cumulativeOffset);
if (redactionEndInString <= newText.length() && redactionStartInString < redactionEndInString) { if (redactionEndInString <= newText.length()
&& redactionStartInString < redactionEndInString) {
String originalPart = String originalPart =
originalText.substring( originalText.substring(
overlapStart - stringStartInPage, overlapStart - stringStartInPage,
overlapEnd - stringStartInPage); overlapEnd - stringStartInPage);
if (!Boolean.TRUE.equals(AGGRESSIVE_MODE.get()) if (!this.aggressiveMode
&& segment.getFont() != null && segment.getFont() != null
&& !TextEncodingHelper.isTextSegmentRemovable( && !TextEncodingHelper.isTextSegmentRemovable(
segment.getFont(), originalPart)) { segment.getFont(), originalPart)) {
@ -2282,7 +2289,7 @@ public class RedactionService {
modified = true; modified = true;
String replacement = ""; String replacement = "";
if (!Boolean.TRUE.equals(AGGRESSIVE_MODE.get())) { if (!this.aggressiveMode) {
replacement = createSafeReplacement(originalPart, segment); replacement = createSafeReplacement(originalPart, segment);
} }
@ -2297,7 +2304,7 @@ public class RedactionService {
COSString newCosString = createCompatibleCOSString(modifiedString, cosString); COSString newCosString = createCompatibleCOSString(modifiedString, cosString);
newArray.add(newCosString); newArray.add(newCosString);
if (modified && !Boolean.TRUE.equals(AGGRESSIVE_MODE.get())) { if (modified && !this.aggressiveMode) {
addSpacingAdjustment(newArray, segment, originalText, modifiedString); addSpacingAdjustment(newArray, segment, originalText, modifiedString);
} }
} }
@ -2505,7 +2512,8 @@ public class RedactionService {
PDFormXObject formXObject, PDFormXObject formXObject,
Set<String> targetWords, Set<String> targetWords,
boolean useRegex, boolean useRegex,
boolean wholeWordSearch) { boolean wholeWordSearch,
boolean aggressive) {
try { try {
PDResources xobjResources = formXObject.getResources(); PDResources xobjResources = formXObject.getResources();
if (xobjResources == null) { if (xobjResources == null) {
@ -2515,7 +2523,12 @@ public class RedactionService {
PDXObject nestedXObj = xobjResources.getXObject(xobjName); PDXObject nestedXObj = xobjResources.getXObject(xobjName);
if (nestedXObj instanceof PDFormXObject nestedFormXObj) { if (nestedXObj instanceof PDFormXObject nestedFormXObj) {
processFormXObject( processFormXObject(
document, nestedFormXObj, targetWords, useRegex, wholeWordSearch); document,
nestedFormXObj,
targetWords,
useRegex,
wholeWordSearch,
aggressive);
} }
} }
PDFStreamParser parser = new PDFStreamParser(formXObject); PDFStreamParser parser = new PDFStreamParser(formXObject);
@ -2527,7 +2540,7 @@ public class RedactionService {
List<TextSegment> textSegments = extractTextSegmentsFromXObject(xobjResources, tokens); List<TextSegment> textSegments = extractTextSegmentsFromXObject(xobjResources, tokens);
String completeText = buildCompleteText(textSegments); String completeText = buildCompleteText(textSegments);
List<MatchRange> matches = List<MatchRange> matches =
Boolean.TRUE.equals(AGGRESSIVE_MODE.get()) aggressive
? findAllMatchesAggressive( ? findAllMatchesAggressive(
textSegments, tokens, targetWords, useRegex, wholeWordSearch) textSegments, tokens, targetWords, useRegex, wholeWordSearch)
: findAllMatches(completeText, targetWords, useRegex, wholeWordSearch); : findAllMatches(completeText, targetWords, useRegex, wholeWordSearch);
@ -2535,7 +2548,7 @@ public class RedactionService {
List<Object> redactedTokens = List<Object> redactedTokens =
applyRedactionsToTokens(tokens, textSegments, matches); applyRedactionsToTokens(tokens, textSegments, matches);
writeRedactedContentToXObject(document, formXObject, redactedTokens); writeRedactedContentToXObject(document, formXObject, redactedTokens);
} else if (Boolean.TRUE.equals(AGGRESSIVE_MODE.get()) && !completeText.isEmpty()) { } else if (aggressive && !completeText.isEmpty()) {
WipeResult wr = wipeAllTextShowingOperators(tokens); WipeResult wr = wipeAllTextShowingOperators(tokens);
writeRedactedContentToXObject(document, formXObject, wr.tokens); writeRedactedContentToXObject(document, formXObject, wr.tokens);
} }