mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-09-08 17:51:20 +02:00
refactor(redaction): replace ThreadLocal with instance variables for aggressive mode handling
Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
parent
162b19f4ad
commit
8c6aa246a7
@ -69,10 +69,8 @@ public class RedactionService {
|
|||||||
private static final Set<String> TEXT_SHOWING_OPERATORS = Set.of("Tj", "TJ", "'", "\"");
|
private static final Set<String> TEXT_SHOWING_OPERATORS = Set.of("Tj", "TJ", "'", "\"");
|
||||||
private static final COSString EMPTY_COS_STRING = new COSString("");
|
private static final COSString EMPTY_COS_STRING = new COSString("");
|
||||||
private static final int MAX_SWEEPS = 3;
|
private static final int MAX_SWEEPS = 3;
|
||||||
private static final ThreadLocal<Boolean> AGGRESSIVE_MODE =
|
private boolean aggressiveMode = false;
|
||||||
ThreadLocal.withInitial(() -> Boolean.FALSE);
|
private Map<Integer, List<AggressiveSegMatch>> aggressiveSegMatches = null;
|
||||||
private static final ThreadLocal<Map<Integer, List<AggressiveSegMatch>>> AGGR_SEG_MATCHES =
|
|
||||||
new ThreadLocal<>();
|
|
||||||
private final CustomPDFDocumentFactory pdfDocumentFactory;
|
private final CustomPDFDocumentFactory pdfDocumentFactory;
|
||||||
|
|
||||||
private static void redactAreas(
|
private static void redactAreas(
|
||||||
@ -473,36 +471,12 @@ public class RedactionService {
|
|||||||
return map;
|
return map;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static COSString redactCosStringByDecodedRanges(
|
private static void performFallbackModification(
|
||||||
PDFont font, COSString cosString, List<AggressiveSegMatch> decRanges) {
|
List<Object> tokens, int tokenIndex, String newText) {
|
||||||
try {
|
try {
|
||||||
byte[] bytes = cosString.getBytes();
|
tokens.set(tokenIndex, newText.isEmpty() ? EMPTY_COS_STRING : new COSString(newText));
|
||||||
DecodedMapping dm = buildDecodeMapping(font, bytes);
|
|
||||||
if (dm.text.isEmpty() || dm.charByteStart.length == 0) {
|
|
||||||
return cosString;
|
|
||||||
}
|
|
||||||
boolean[] delete = new boolean[bytes.length];
|
|
||||||
for (AggressiveSegMatch r : decRanges) {
|
|
||||||
int ds = Math.max(0, Math.min(r.decodedStart, dm.charByteStart.length));
|
|
||||||
int de = Math.max(ds, Math.min(r.decodedEnd, dm.charByteStart.length));
|
|
||||||
if (ds >= de) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
int byteStart = dm.charByteStart[ds];
|
|
||||||
int byteEnd = dm.charByteEnd[de - 1];
|
|
||||||
for (int bi = Math.max(0, byteStart); bi < Math.min(bytes.length, byteEnd); bi++) {
|
|
||||||
delete[bi] = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
ByteArrayOutputStream baos = new ByteArrayOutputStream(bytes.length);
|
|
||||||
for (int bi = 0; bi < bytes.length; bi++) {
|
|
||||||
if (!delete[bi]) {
|
|
||||||
baos.write(bytes[bi]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return new COSString(baos.toByteArray());
|
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
return Boolean.TRUE.equals(AGGRESSIVE_MODE.get()) ? EMPTY_COS_STRING : cosString;
|
performEmergencyFallback(tokens, tokenIndex);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -817,6 +791,81 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static void writeRedactedContentToPattern(
|
||||||
|
PDTilingPattern pattern, List<Object> redactedTokens) throws IOException {
|
||||||
|
var contentStream = pattern.getContentStream();
|
||||||
|
try (var out = contentStream.createOutputStream()) {
|
||||||
|
new ContentStreamWriter(out).writeTokens(redactedTokens);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean performTextReplacement(
|
||||||
|
PDDocument document,
|
||||||
|
Map<Integer, List<PDFText>> allFoundTextsByPage,
|
||||||
|
String[] listOfText,
|
||||||
|
boolean useRegex,
|
||||||
|
boolean wholeWordSearchBool) {
|
||||||
|
if (allFoundTextsByPage.isEmpty()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
Set<String> allSearchTerms =
|
||||||
|
Arrays.stream(listOfText)
|
||||||
|
.map(String::trim)
|
||||||
|
.filter(s -> !s.isEmpty())
|
||||||
|
.collect(Collectors.toSet());
|
||||||
|
for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) {
|
||||||
|
for (PDPage page : document.getPages()) {
|
||||||
|
List<Object> filtered =
|
||||||
|
createTokensWithoutTargetText(
|
||||||
|
document, page, allSearchTerms, useRegex, wholeWordSearchBool);
|
||||||
|
writeFilteredContentStream(document, page, filtered);
|
||||||
|
}
|
||||||
|
// Stop early if nothing remains
|
||||||
|
if (!documentStillContainsTargets(
|
||||||
|
document, allSearchTerms, useRegex, wholeWordSearchBool)) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
} catch (Exception e) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private COSString redactCosStringByDecodedRanges(
|
||||||
|
PDFont font, COSString cosString, List<AggressiveSegMatch> decRanges) {
|
||||||
|
try {
|
||||||
|
byte[] bytes = cosString.getBytes();
|
||||||
|
DecodedMapping dm = buildDecodeMapping(font, bytes);
|
||||||
|
if (dm.text.isEmpty() || dm.charByteStart.length == 0) {
|
||||||
|
return cosString;
|
||||||
|
}
|
||||||
|
boolean[] delete = new boolean[bytes.length];
|
||||||
|
for (AggressiveSegMatch r : decRanges) {
|
||||||
|
int ds = Math.max(0, Math.min(r.decodedStart, dm.charByteStart.length));
|
||||||
|
int de = Math.max(ds, Math.min(r.decodedEnd, dm.charByteStart.length));
|
||||||
|
if (ds >= de) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
int byteStart = dm.charByteStart[ds];
|
||||||
|
int byteEnd = dm.charByteEnd[de - 1];
|
||||||
|
for (int bi = Math.max(0, byteStart); bi < Math.min(bytes.length, byteEnd); bi++) {
|
||||||
|
delete[bi] = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ByteArrayOutputStream baos = new ByteArrayOutputStream(bytes.length);
|
||||||
|
for (int bi = 0; bi < bytes.length; bi++) {
|
||||||
|
if (!delete[bi]) {
|
||||||
|
baos.write(bytes[bi]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return new COSString(baos.toByteArray());
|
||||||
|
} catch (Exception e) {
|
||||||
|
return this.aggressiveMode ? EMPTY_COS_STRING : cosString;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public void performTextReplacementAggressive(
|
public void performTextReplacementAggressive(
|
||||||
PDDocument document,
|
PDDocument document,
|
||||||
Map<Integer, List<PDFText>> allFoundTextsByPage,
|
Map<Integer, List<PDFText>> allFoundTextsByPage,
|
||||||
@ -831,7 +880,8 @@ public class RedactionService {
|
|||||||
.map(String::trim)
|
.map(String::trim)
|
||||||
.filter(s -> !s.isEmpty())
|
.filter(s -> !s.isEmpty())
|
||||||
.collect(Collectors.toSet());
|
.collect(Collectors.toSet());
|
||||||
AGGRESSIVE_MODE.set(Boolean.TRUE);
|
this.aggressiveMode = true;
|
||||||
|
this.aggressiveSegMatches = new HashMap<>();
|
||||||
try {
|
try {
|
||||||
for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) {
|
for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) {
|
||||||
boolean anyResidual = false;
|
boolean anyResidual = false;
|
||||||
@ -839,7 +889,7 @@ public class RedactionService {
|
|||||||
for (PDPage page : document.getPages()) {
|
for (PDPage page : document.getPages()) {
|
||||||
pageIndex++;
|
pageIndex++;
|
||||||
try {
|
try {
|
||||||
AGGR_SEG_MATCHES.remove();
|
this.aggressiveSegMatches = new HashMap<>();
|
||||||
List<Object> filtered =
|
List<Object> filtered =
|
||||||
createTokensWithoutTargetText(
|
createTokensWithoutTargetText(
|
||||||
document,
|
document,
|
||||||
@ -884,88 +934,8 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
} finally {
|
} finally {
|
||||||
AGGRESSIVE_MODE.remove();
|
this.aggressiveMode = false;
|
||||||
}
|
this.aggressiveSegMatches = null;
|
||||||
}
|
|
||||||
|
|
||||||
public boolean performTextReplacement(
|
|
||||||
PDDocument document,
|
|
||||||
Map<Integer, List<PDFText>> allFoundTextsByPage,
|
|
||||||
String[] listOfText,
|
|
||||||
boolean useRegex,
|
|
||||||
boolean wholeWordSearchBool) {
|
|
||||||
if (allFoundTextsByPage.isEmpty()) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
try {
|
|
||||||
Set<String> allSearchTerms =
|
|
||||||
Arrays.stream(listOfText)
|
|
||||||
.map(String::trim)
|
|
||||||
.filter(s -> !s.isEmpty())
|
|
||||||
.collect(Collectors.toSet());
|
|
||||||
for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) {
|
|
||||||
for (PDPage page : document.getPages()) {
|
|
||||||
List<Object> filtered =
|
|
||||||
createTokensWithoutTargetText(
|
|
||||||
document, page, allSearchTerms, useRegex, wholeWordSearchBool);
|
|
||||||
writeFilteredContentStream(document, page, filtered);
|
|
||||||
}
|
|
||||||
// Stop early if nothing remains
|
|
||||||
if (!documentStillContainsTargets(
|
|
||||||
document, allSearchTerms, useRegex, wholeWordSearchBool)) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
} catch (Exception e) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
List<Object> createTokensWithoutTargetText(
|
|
||||||
PDDocument document,
|
|
||||||
PDPage page,
|
|
||||||
Set<String> targetWords,
|
|
||||||
boolean useRegex,
|
|
||||||
boolean wholeWordSearch)
|
|
||||||
throws IOException {
|
|
||||||
PDFStreamParser parser = new PDFStreamParser(page);
|
|
||||||
List<Object> tokens = new ArrayList<>();
|
|
||||||
Object tk;
|
|
||||||
while (true) {
|
|
||||||
final Object parsedNextToken = parser.parseNextToken();
|
|
||||||
if ((tk = parsedNextToken) == null) break;
|
|
||||||
tokens.add(tk);
|
|
||||||
}
|
|
||||||
PDResources resources = page.getResources();
|
|
||||||
if (resources != null) {
|
|
||||||
processPageXObjects(document, resources, targetWords, useRegex, wholeWordSearch);
|
|
||||||
}
|
|
||||||
List<TextSegment> textSegments =
|
|
||||||
extractTextSegments(page, tokens, Boolean.TRUE.equals(AGGRESSIVE_MODE.get()));
|
|
||||||
String completeText = buildCompleteText(textSegments);
|
|
||||||
List<MatchRange> matches =
|
|
||||||
Boolean.TRUE.equals(AGGRESSIVE_MODE.get())
|
|
||||||
? findAllMatchesAggressive(
|
|
||||||
textSegments, tokens, targetWords, useRegex, wholeWordSearch)
|
|
||||||
: findAllMatches(completeText, targetWords, useRegex, wholeWordSearch);
|
|
||||||
return applyRedactionsToTokens(tokens, textSegments, matches);
|
|
||||||
}
|
|
||||||
|
|
||||||
private void processPageXObjects(
|
|
||||||
PDDocument document,
|
|
||||||
PDResources resources,
|
|
||||||
Set<String> targetWords,
|
|
||||||
boolean useRegex,
|
|
||||||
boolean wholeWordSearch) {
|
|
||||||
for (COSName xobjName : resources.getXObjectNames()) {
|
|
||||||
try {
|
|
||||||
PDXObject xobj = resources.getXObject(xobjName);
|
|
||||||
if (xobj instanceof PDFormXObject formXObj) {
|
|
||||||
processFormXObject(document, formXObj, targetWords, useRegex, wholeWordSearch);
|
|
||||||
}
|
|
||||||
} catch (Exception ignored) {
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1073,12 +1043,39 @@ public class RedactionService {
|
|||||||
return sb.toString();
|
return sb.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void performFallbackModification(List<Object> tokens, int tokenIndex, String newText) {
|
List<Object> createTokensWithoutTargetText(
|
||||||
try {
|
PDDocument document,
|
||||||
tokens.set(tokenIndex, newText.isEmpty() ? EMPTY_COS_STRING : new COSString(newText));
|
PDPage page,
|
||||||
} catch (Exception e) {
|
Set<String> targetWords,
|
||||||
performEmergencyFallback(tokens, tokenIndex);
|
boolean useRegex,
|
||||||
|
boolean wholeWordSearch)
|
||||||
|
throws IOException {
|
||||||
|
PDFStreamParser parser = new PDFStreamParser(page);
|
||||||
|
List<Object> tokens = new ArrayList<>();
|
||||||
|
Object tk;
|
||||||
|
while (true) {
|
||||||
|
final Object parsedNextToken = parser.parseNextToken();
|
||||||
|
if ((tk = parsedNextToken) == null) break;
|
||||||
|
tokens.add(tk);
|
||||||
}
|
}
|
||||||
|
PDResources resources = page.getResources();
|
||||||
|
if (resources != null) {
|
||||||
|
processPageXObjects(
|
||||||
|
document,
|
||||||
|
resources,
|
||||||
|
targetWords,
|
||||||
|
useRegex,
|
||||||
|
wholeWordSearch,
|
||||||
|
this.aggressiveMode);
|
||||||
|
}
|
||||||
|
List<TextSegment> textSegments = extractTextSegments(page, tokens, this.aggressiveMode);
|
||||||
|
String completeText = buildCompleteText(textSegments);
|
||||||
|
List<MatchRange> matches =
|
||||||
|
this.aggressiveMode
|
||||||
|
? findAllMatchesAggressive(
|
||||||
|
textSegments, tokens, targetWords, useRegex, wholeWordSearch)
|
||||||
|
: findAllMatches(completeText, targetWords, useRegex, wholeWordSearch);
|
||||||
|
return applyRedactionsToTokens(tokens, textSegments, matches);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void performEmergencyFallback(List<Object> tokens, int tokenIndex) {
|
private static void performEmergencyFallback(List<Object> tokens, int tokenIndex) {
|
||||||
@ -1089,50 +1086,23 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private String applyRedactionsToSegmentText(TextSegment segment, List<MatchRange> matches) {
|
private void processPageXObjects(
|
||||||
String text = segment.getText();
|
PDDocument document,
|
||||||
if (!Boolean.TRUE.equals(AGGRESSIVE_MODE.get())
|
PDResources resources,
|
||||||
&& segment.getFont() != null
|
Set<String> targetWords,
|
||||||
&& !TextEncodingHelper.isTextSegmentRemovable(segment.getFont(), text)) {
|
boolean useRegex,
|
||||||
return text;
|
boolean wholeWordSearch,
|
||||||
}
|
boolean aggressive) {
|
||||||
|
for (COSName xobjName : resources.getXObjectNames()) {
|
||||||
StringBuilder result = new StringBuilder(text);
|
try {
|
||||||
for (MatchRange match : matches) {
|
PDXObject xobj = resources.getXObject(xobjName);
|
||||||
int segmentStart = Math.max(0, match.getStartPos() - segment.getStartPos());
|
if (xobj instanceof PDFormXObject formXObj) {
|
||||||
int segmentEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos());
|
processFormXObject(
|
||||||
if (segmentStart < text.length() && segmentEnd > segmentStart) {
|
document, formXObj, targetWords, useRegex, wholeWordSearch, aggressive);
|
||||||
String originalPart = text.substring(segmentStart, segmentEnd);
|
|
||||||
if (!Boolean.TRUE.equals(AGGRESSIVE_MODE.get())
|
|
||||||
&& segment.getFont() != null
|
|
||||||
&& !TextEncodingHelper.isTextSegmentRemovable(
|
|
||||||
segment.getFont(), originalPart)) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (Boolean.TRUE.equals(AGGRESSIVE_MODE.get())) {
|
|
||||||
result.replace(segmentStart, segmentEnd, "");
|
|
||||||
} else {
|
|
||||||
float originalWidth = 0;
|
|
||||||
if (segment.getFont() != null && segment.getFontSize() > 0) {
|
|
||||||
originalWidth =
|
|
||||||
safeGetStringWidth(segment.getFont(), originalPart)
|
|
||||||
/ FONT_SCALE_FACTOR
|
|
||||||
* segment.getFontSize();
|
|
||||||
}
|
|
||||||
String placeholder =
|
|
||||||
(originalWidth > 0)
|
|
||||||
? createPlaceholderWithWidth(
|
|
||||||
originalPart,
|
|
||||||
originalWidth,
|
|
||||||
segment.getFont(),
|
|
||||||
segment.getFontSize())
|
|
||||||
: createPlaceholderWithFont(originalPart, segment.getFont());
|
|
||||||
result.replace(segmentStart, segmentEnd, placeholder);
|
|
||||||
}
|
}
|
||||||
|
} catch (Exception ignored) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return result.toString();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private float safeGetStringWidth(PDFont font, String text) {
|
private float safeGetStringWidth(PDFont font, String text) {
|
||||||
@ -1358,12 +1328,50 @@ public class RedactionService {
|
|||||||
return copy;
|
return copy;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void writeRedactedContentToPattern(PDTilingPattern pattern, List<Object> redactedTokens)
|
private String applyRedactionsToSegmentText(TextSegment segment, List<MatchRange> matches) {
|
||||||
throws IOException {
|
String text = segment.getText();
|
||||||
var contentStream = pattern.getContentStream();
|
if (!this.aggressiveMode
|
||||||
try (var out = contentStream.createOutputStream()) {
|
&& segment.getFont() != null
|
||||||
new ContentStreamWriter(out).writeTokens(redactedTokens);
|
&& !TextEncodingHelper.isTextSegmentRemovable(segment.getFont(), text)) {
|
||||||
|
return text;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
StringBuilder result = new StringBuilder(text);
|
||||||
|
for (MatchRange match : matches) {
|
||||||
|
int segmentStart = Math.max(0, match.getStartPos() - segment.getStartPos());
|
||||||
|
int segmentEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos());
|
||||||
|
if (segmentStart < text.length() && segmentEnd > segmentStart) {
|
||||||
|
String originalPart = text.substring(segmentStart, segmentEnd);
|
||||||
|
if (!this.aggressiveMode
|
||||||
|
&& segment.getFont() != null
|
||||||
|
&& !TextEncodingHelper.isTextSegmentRemovable(
|
||||||
|
segment.getFont(), originalPart)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (this.aggressiveMode) {
|
||||||
|
result.replace(segmentStart, segmentEnd, "");
|
||||||
|
} else {
|
||||||
|
float originalWidth = 0;
|
||||||
|
if (segment.getFont() != null && segment.getFontSize() > 0) {
|
||||||
|
originalWidth =
|
||||||
|
safeGetStringWidth(segment.getFont(), originalPart)
|
||||||
|
/ FONT_SCALE_FACTOR
|
||||||
|
* segment.getFontSize();
|
||||||
|
}
|
||||||
|
String placeholder =
|
||||||
|
(originalWidth > 0)
|
||||||
|
? createPlaceholderWithWidth(
|
||||||
|
originalPart,
|
||||||
|
originalWidth,
|
||||||
|
segment.getFont(),
|
||||||
|
segment.getFontSize())
|
||||||
|
: createPlaceholderWithFont(originalPart, segment.getFont());
|
||||||
|
result.replace(segmentStart, segmentEnd, placeholder);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<MatchRange> findAllMatchesAggressive(
|
private List<MatchRange> findAllMatchesAggressive(
|
||||||
@ -1497,9 +1505,9 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!perSegMatches.isEmpty()) {
|
if (!perSegMatches.isEmpty()) {
|
||||||
AGGR_SEG_MATCHES.set(perSegMatches);
|
this.aggressiveSegMatches = perSegMatches;
|
||||||
} else {
|
} else {
|
||||||
AGGR_SEG_MATCHES.remove();
|
this.aggressiveSegMatches = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (TextSegment seg : segments) {
|
for (TextSegment seg : segments) {
|
||||||
@ -1564,8 +1572,8 @@ public class RedactionService {
|
|||||||
private List<Object> applyRedactionsToTokens(
|
private List<Object> applyRedactionsToTokens(
|
||||||
List<Object> tokens, List<TextSegment> textSegments, List<MatchRange> matches) {
|
List<Object> tokens, List<TextSegment> textSegments, List<MatchRange> matches) {
|
||||||
List<Object> newTokens = new ArrayList<>(tokens);
|
List<Object> newTokens = new ArrayList<>(tokens);
|
||||||
if (Boolean.TRUE.equals(AGGRESSIVE_MODE.get())) {
|
if (this.aggressiveMode) {
|
||||||
Map<Integer, List<AggressiveSegMatch>> perSeg = AGGR_SEG_MATCHES.get();
|
Map<Integer, List<AggressiveSegMatch>> perSeg = this.aggressiveSegMatches;
|
||||||
if (perSeg != null && !perSeg.isEmpty()) {
|
if (perSeg != null && !perSeg.isEmpty()) {
|
||||||
List<Integer> segIndices = new ArrayList<>(perSeg.keySet());
|
List<Integer> segIndices = new ArrayList<>(perSeg.keySet());
|
||||||
segIndices.sort(
|
segIndices.sort(
|
||||||
@ -1887,7 +1895,7 @@ public class RedactionService {
|
|||||||
"Processing redaction: segment={}, matches={}, aggressive={}",
|
"Processing redaction: segment={}, matches={}, aggressive={}",
|
||||||
segment,
|
segment,
|
||||||
matches.size(),
|
matches.size(),
|
||||||
AGGRESSIVE_MODE.get());
|
this.aggressiveMode);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
COSArray newArray = new COSArray();
|
COSArray newArray = new COSArray();
|
||||||
@ -2233,7 +2241,7 @@ public class RedactionService {
|
|||||||
|
|
||||||
String originalText = getDecodedString(cosString, segment.getFont());
|
String originalText = getDecodedString(cosString, segment.getFont());
|
||||||
|
|
||||||
if (!Boolean.TRUE.equals(AGGRESSIVE_MODE.get())
|
if (!this.aggressiveMode
|
||||||
&& segment.getFont() != null
|
&& segment.getFont() != null
|
||||||
&& !TextEncodingHelper.isTextSegmentRemovable(segment.getFont(), originalText)) {
|
&& !TextEncodingHelper.isTextSegmentRemovable(segment.getFont(), originalText)) {
|
||||||
newArray.add(cosString); // Keep original COSString to preserve encoding
|
newArray.add(cosString); // Keep original COSString to preserve encoding
|
||||||
@ -2245,9 +2253,7 @@ public class RedactionService {
|
|||||||
|
|
||||||
// Sort matches by start position to process them in order
|
// Sort matches by start position to process them in order
|
||||||
List<MatchRange> sortedMatches =
|
List<MatchRange> sortedMatches =
|
||||||
matches.stream()
|
matches.stream().sorted(Comparator.comparingInt(MatchRange::getStartPos)).toList();
|
||||||
.sorted(Comparator.comparingInt(MatchRange::getStartPos))
|
|
||||||
.toList();
|
|
||||||
|
|
||||||
int cumulativeOffset = 0; // Track cumulative text changes
|
int cumulativeOffset = 0; // Track cumulative text changes
|
||||||
|
|
||||||
@ -2265,14 +2271,15 @@ public class RedactionService {
|
|||||||
newText.length(),
|
newText.length(),
|
||||||
overlapEnd - stringStartInPage - cumulativeOffset);
|
overlapEnd - stringStartInPage - cumulativeOffset);
|
||||||
|
|
||||||
if (redactionEndInString <= newText.length() && redactionStartInString < redactionEndInString) {
|
if (redactionEndInString <= newText.length()
|
||||||
|
&& redactionStartInString < redactionEndInString) {
|
||||||
|
|
||||||
String originalPart =
|
String originalPart =
|
||||||
originalText.substring(
|
originalText.substring(
|
||||||
overlapStart - stringStartInPage,
|
overlapStart - stringStartInPage,
|
||||||
overlapEnd - stringStartInPage);
|
overlapEnd - stringStartInPage);
|
||||||
|
|
||||||
if (!Boolean.TRUE.equals(AGGRESSIVE_MODE.get())
|
if (!this.aggressiveMode
|
||||||
&& segment.getFont() != null
|
&& segment.getFont() != null
|
||||||
&& !TextEncodingHelper.isTextSegmentRemovable(
|
&& !TextEncodingHelper.isTextSegmentRemovable(
|
||||||
segment.getFont(), originalPart)) {
|
segment.getFont(), originalPart)) {
|
||||||
@ -2282,7 +2289,7 @@ public class RedactionService {
|
|||||||
modified = true;
|
modified = true;
|
||||||
String replacement = "";
|
String replacement = "";
|
||||||
|
|
||||||
if (!Boolean.TRUE.equals(AGGRESSIVE_MODE.get())) {
|
if (!this.aggressiveMode) {
|
||||||
replacement = createSafeReplacement(originalPart, segment);
|
replacement = createSafeReplacement(originalPart, segment);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2297,7 +2304,7 @@ public class RedactionService {
|
|||||||
COSString newCosString = createCompatibleCOSString(modifiedString, cosString);
|
COSString newCosString = createCompatibleCOSString(modifiedString, cosString);
|
||||||
newArray.add(newCosString);
|
newArray.add(newCosString);
|
||||||
|
|
||||||
if (modified && !Boolean.TRUE.equals(AGGRESSIVE_MODE.get())) {
|
if (modified && !this.aggressiveMode) {
|
||||||
addSpacingAdjustment(newArray, segment, originalText, modifiedString);
|
addSpacingAdjustment(newArray, segment, originalText, modifiedString);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -2505,7 +2512,8 @@ public class RedactionService {
|
|||||||
PDFormXObject formXObject,
|
PDFormXObject formXObject,
|
||||||
Set<String> targetWords,
|
Set<String> targetWords,
|
||||||
boolean useRegex,
|
boolean useRegex,
|
||||||
boolean wholeWordSearch) {
|
boolean wholeWordSearch,
|
||||||
|
boolean aggressive) {
|
||||||
try {
|
try {
|
||||||
PDResources xobjResources = formXObject.getResources();
|
PDResources xobjResources = formXObject.getResources();
|
||||||
if (xobjResources == null) {
|
if (xobjResources == null) {
|
||||||
@ -2515,7 +2523,12 @@ public class RedactionService {
|
|||||||
PDXObject nestedXObj = xobjResources.getXObject(xobjName);
|
PDXObject nestedXObj = xobjResources.getXObject(xobjName);
|
||||||
if (nestedXObj instanceof PDFormXObject nestedFormXObj) {
|
if (nestedXObj instanceof PDFormXObject nestedFormXObj) {
|
||||||
processFormXObject(
|
processFormXObject(
|
||||||
document, nestedFormXObj, targetWords, useRegex, wholeWordSearch);
|
document,
|
||||||
|
nestedFormXObj,
|
||||||
|
targetWords,
|
||||||
|
useRegex,
|
||||||
|
wholeWordSearch,
|
||||||
|
aggressive);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
PDFStreamParser parser = new PDFStreamParser(formXObject);
|
PDFStreamParser parser = new PDFStreamParser(formXObject);
|
||||||
@ -2527,7 +2540,7 @@ public class RedactionService {
|
|||||||
List<TextSegment> textSegments = extractTextSegmentsFromXObject(xobjResources, tokens);
|
List<TextSegment> textSegments = extractTextSegmentsFromXObject(xobjResources, tokens);
|
||||||
String completeText = buildCompleteText(textSegments);
|
String completeText = buildCompleteText(textSegments);
|
||||||
List<MatchRange> matches =
|
List<MatchRange> matches =
|
||||||
Boolean.TRUE.equals(AGGRESSIVE_MODE.get())
|
aggressive
|
||||||
? findAllMatchesAggressive(
|
? findAllMatchesAggressive(
|
||||||
textSegments, tokens, targetWords, useRegex, wholeWordSearch)
|
textSegments, tokens, targetWords, useRegex, wholeWordSearch)
|
||||||
: findAllMatches(completeText, targetWords, useRegex, wholeWordSearch);
|
: findAllMatches(completeText, targetWords, useRegex, wholeWordSearch);
|
||||||
@ -2535,7 +2548,7 @@ public class RedactionService {
|
|||||||
List<Object> redactedTokens =
|
List<Object> redactedTokens =
|
||||||
applyRedactionsToTokens(tokens, textSegments, matches);
|
applyRedactionsToTokens(tokens, textSegments, matches);
|
||||||
writeRedactedContentToXObject(document, formXObject, redactedTokens);
|
writeRedactedContentToXObject(document, formXObject, redactedTokens);
|
||||||
} else if (Boolean.TRUE.equals(AGGRESSIVE_MODE.get()) && !completeText.isEmpty()) {
|
} else if (aggressive && !completeText.isEmpty()) {
|
||||||
WipeResult wr = wipeAllTextShowingOperators(tokens);
|
WipeResult wr = wipeAllTextShowingOperators(tokens);
|
||||||
writeRedactedContentToXObject(document, formXObject, wr.tokens);
|
writeRedactedContentToXObject(document, formXObject, wr.tokens);
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user