Refactor redaction services and utilities for improved readability and maintainability

- Adjusted indentation and formatting across multiple files for consistency.
- Improved imports ordering in utility classes for better organization.
- Enhanced `performTextReplacementAggressive` method with multi-sweep logic to handle residual text more effectively.
- Added helper methods (e.g. `documentStillContainsTargets`) that check whether any target text remains in the document, so redaction sweeps can stop early.
- Simplified logic and formatting in `RedactionService` and related classes.

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
Balázs Szücs 2025-08-20 22:45:08 +02:00
parent 8f19369c58
commit 1fac74a3ca
10 changed files with 441 additions and 396 deletions

View File

@ -65,6 +65,7 @@ public class RedactionService {
private static final int FONT_SCALE_FACTOR = 1000; private static final int FONT_SCALE_FACTOR = 1000;
private static final Set<String> TEXT_SHOWING_OPERATORS = Set.of("Tj", "TJ", "'", "\""); private static final Set<String> TEXT_SHOWING_OPERATORS = Set.of("Tj", "TJ", "'", "\"");
private static final COSString EMPTY_COS_STRING = new COSString(""); private static final COSString EMPTY_COS_STRING = new COSString("");
private static final int MAX_SWEEPS = 3;
private static final ThreadLocal<Boolean> AGGRESSIVE_MODE = private static final ThreadLocal<Boolean> AGGRESSIVE_MODE =
ThreadLocal.withInitial(() -> Boolean.FALSE); ThreadLocal.withInitial(() -> Boolean.FALSE);
private static final ThreadLocal<Map<Integer, List<AggressiveSegMatch>>> AGGR_SEG_MATCHES = private static final ThreadLocal<Map<Integer, List<AggressiveSegMatch>>> AGGR_SEG_MATCHES =
@ -268,6 +269,26 @@ public class RedactionService {
return false; return false;
} }
/**
 * Reports whether any page of the document still contains one of the target words.
 *
 * <p>Pages are checked in index order through {@code pageStillContainsTargets}, and the scan
 * stops at the first page that reports a match.
 *
 * @param document the document whose pages are inspected
 * @param targetWords the terms to look for
 * @param useRegex forwarded unchanged to the per-page check
 * @param wholeWordSearch forwarded unchanged to the per-page check
 * @return {@code true} if some page reports remaining targets, or if the check itself throws;
 *     {@code false} when every page was checked without a match
 */
private static boolean documentStillContainsTargets(
        PDDocument document,
        Set<String> targetWords,
        boolean useRegex,
        boolean wholeWordSearch) {
    try {
        int pageIndex = 0;
        // The page count is re-read on every pass, exactly as a for-loop condition would.
        while (pageIndex < document.getNumberOfPages()) {
            boolean pageHasTargets =
                    pageStillContainsTargets(
                            document, pageIndex, targetWords, useRegex, wholeWordSearch);
            if (pageHasTargets) {
                return true;
            }
            pageIndex++;
        }
        return false;
    } catch (Exception ignored) {
        // A failed check is reported as "targets may remain" rather than as a clean document.
        return true;
    }
}
public static Map<Integer, List<PDFText>> findTextToRedact( public static Map<Integer, List<PDFText>> findTextToRedact(
PDDocument document, String[] listOfText, boolean useRegex, boolean wholeWordSearch) { PDDocument document, String[] listOfText, boolean useRegex, boolean wholeWordSearch) {
Map<Integer, List<PDFText>> allFoundTextsByPage = new HashMap<>(); Map<Integer, List<PDFText>> allFoundTextsByPage = new HashMap<>();
@ -809,6 +830,8 @@ public class RedactionService {
.collect(Collectors.toSet()); .collect(Collectors.toSet());
AGGRESSIVE_MODE.set(Boolean.TRUE); AGGRESSIVE_MODE.set(Boolean.TRUE);
try { try {
for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) {
boolean anyResidual = false;
int pageIndex = -1; int pageIndex = -1;
for (PDPage page : document.getPages()) { for (PDPage page : document.getPages()) {
pageIndex++; pageIndex++;
@ -816,7 +839,11 @@ public class RedactionService {
AGGR_SEG_MATCHES.remove(); AGGR_SEG_MATCHES.remove();
List<Object> filtered = List<Object> filtered =
createTokensWithoutTargetText( createTokensWithoutTargetText(
document, page, allSearchTerms, useRegex, wholeWordSearchBool); document,
page,
allSearchTerms,
useRegex,
wholeWordSearchBool);
writeFilteredContentStream(document, page, filtered); writeFilteredContentStream(document, page, filtered);
boolean residual = boolean residual =
pageStillContainsTargets( pageStillContainsTargets(
@ -826,6 +853,7 @@ public class RedactionService {
useRegex, useRegex,
wholeWordSearchBool); wholeWordSearchBool);
if (residual) { if (residual) {
anyResidual = true;
try { try {
var sem = wipeAllSemanticTextInTokens(filtered); var sem = wipeAllSemanticTextInTokens(filtered);
filtered = sem.tokens; filtered = sem.tokens;
@ -842,6 +870,16 @@ public class RedactionService {
} catch (Exception ignored) { } catch (Exception ignored) {
} }
} }
// If no residuals detected in this sweep, stop early
if (!anyResidual) {
break;
}
// As a safety check, stop if no targets remain anywhere in the document
if (!documentStillContainsTargets(
document, allSearchTerms, useRegex, wholeWordSearchBool)) {
break;
}
}
} finally { } finally {
AGGRESSIVE_MODE.remove(); AGGRESSIVE_MODE.remove();
} }
@ -862,12 +900,19 @@ public class RedactionService {
.map(String::trim) .map(String::trim)
.filter(s -> !s.isEmpty()) .filter(s -> !s.isEmpty())
.collect(Collectors.toSet()); .collect(Collectors.toSet());
for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) {
for (PDPage page : document.getPages()) { for (PDPage page : document.getPages()) {
List<Object> filtered = List<Object> filtered =
createTokensWithoutTargetText( createTokensWithoutTargetText(
document, page, allSearchTerms, useRegex, wholeWordSearchBool); document, page, allSearchTerms, useRegex, wholeWordSearchBool);
writeFilteredContentStream(document, page, filtered); writeFilteredContentStream(document, page, filtered);
} }
// Stop early if nothing remains
if (!documentStillContainsTargets(
document, allSearchTerms, useRegex, wholeWordSearchBool)) {
break;
}
}
return false; return false;
} catch (Exception e) { } catch (Exception e) {
return true; return true;
@ -1473,7 +1518,8 @@ public class RedactionService {
String originalPart = String originalPart =
originalText.substring( originalText.substring(
redactionStartInString, redactionEndInString); redactionStartInString, redactionEndInString);
if (!Boolean.TRUE.equals(AGGRESSIVE_MODE.get()) && segment.getFont() != null if (!Boolean.TRUE.equals(AGGRESSIVE_MODE.get())
&& segment.getFont() != null
&& !TextEncodingHelper.isTextSegmentRemovable( && !TextEncodingHelper.isTextSegmentRemovable(
segment.getFont(), originalPart)) { segment.getFont(), originalPart)) {
continue; continue;
@ -1514,7 +1560,10 @@ public class RedactionService {
} }
String modifiedString = newText.toString(); String modifiedString = newText.toString();
newArray.add(new COSString(modifiedString)); newArray.add(new COSString(modifiedString));
if (!Boolean.TRUE.equals(AGGRESSIVE_MODE.get()) && modified && segment.getFont() != null && segment.getFontSize() > 0) { if (!Boolean.TRUE.equals(AGGRESSIVE_MODE.get())
&& modified
&& segment.getFont() != null
&& segment.getFontSize() > 0) {
try { try {
float originalWidth = float originalWidth =
safeGetStringWidth(segment.getFont(), originalText) safeGetStringWidth(segment.getFont(), originalText)
@ -1847,8 +1896,7 @@ public class RedactionService {
private PDFont font = null; private PDFont font = null;
private float fontSize = 0; private float fontSize = 0;
public GraphicsState() { public GraphicsState() {}
}
} }
@Data @Data

View File

@ -1,17 +1,17 @@
package stirling.software.SPDF.utils.text; package stirling.software.SPDF.utils.text;
import lombok.experimental.UtilityClass;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType0Font;
import lombok.extern.slf4j.Slf4j;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
import java.nio.CharBuffer; import java.nio.CharBuffer;
import java.nio.charset.CharsetDecoder; import java.nio.charset.CharsetDecoder;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType0Font;
import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j;
@Slf4j @Slf4j
@UtilityClass @UtilityClass
public class TextDecodingHelper { public class TextDecodingHelper {
@ -89,8 +89,7 @@ public class TextDecodingHelper {
} catch (Exception ignored) { } catch (Exception ignored) {
} }
if (charStr == null if (charStr == null && font instanceof PDType0Font type0Font) {
&& font instanceof PDType0Font type0Font) {
try { try {
int cid = (bytes.length > 1) ? ((bytes[0] & 0xFF) << 8) | (bytes[1] & 0xFF) : code; int cid = (bytes.length > 1) ? ((bytes[0] & 0xFF) << 8) | (bytes[1] & 0xFF) : code;
charStr = type0Font.toUnicode(cid); charStr = type0Font.toUnicode(cid);
@ -143,8 +142,7 @@ public class TextDecodingHelper {
try { try {
if (bytes.length >= 2) { if (bytes.length >= 2) {
ByteBuffer buffer = ByteBuffer.wrap(bytes); ByteBuffer buffer = ByteBuffer.wrap(bytes);
CharsetDecoder decoder = CharsetDecoder decoder = StandardCharsets.UTF_16BE.newDecoder();
StandardCharsets.UTF_16BE.newDecoder();
CharBuffer charBuffer = decoder.decode(buffer); CharBuffer charBuffer = decoder.decode(buffer);
return charBuffer.toString(); return charBuffer.toString();
} }

View File

@ -2,12 +2,12 @@ package stirling.software.SPDF.utils.text;
import java.io.IOException; import java.io.IOException;
import lombok.experimental.UtilityClass;
import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDSimpleFont; import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
import org.apache.pdfbox.pdmodel.font.encoding.DictionaryEncoding; import org.apache.pdfbox.pdmodel.font.encoding.DictionaryEncoding;
import org.apache.pdfbox.pdmodel.font.encoding.Encoding; import org.apache.pdfbox.pdmodel.font.encoding.Encoding;
import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
@Slf4j @Slf4j

View File

@ -5,13 +5,13 @@ import java.util.List;
import java.util.Set; import java.util.Set;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import lombok.experimental.UtilityClass;
import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources; import org.apache.pdfbox.pdmodel.PDResources;
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.pdmodel.font.PDFont;
import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j;
@Slf4j @Slf4j
@UtilityClass @UtilityClass
public class TextFinderUtils { public class TextFinderUtils {

View File

@ -1,9 +1,9 @@
package stirling.software.SPDF.utils.text; package stirling.software.SPDF.utils.text;
import lombok.experimental.UtilityClass;
import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.pdmodel.font.PDFont;
import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
@Slf4j @Slf4j
@ -44,8 +44,7 @@ public class WidthCalculator {
} }
} }
private float calculateWidthWithCharacterIteration( private float calculateWidthWithCharacterIteration(PDFont font, String text, float fontSize) {
PDFont font, String text, float fontSize) {
try { try {
float totalWidth = 0; float totalWidth = 0;