refactor redaction services to improve resource management and streamline text processing

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
Balázs Szücs 2025-08-25 19:53:19 +02:00
parent f236505cae
commit 3ac7f0df4c
6 changed files with 394 additions and 707 deletions

View File

@@ -30,51 +30,39 @@ class AggressiveRedactionService implements RedactionModeStrategy {
boolean useRegex = Boolean.TRUE.equals(request.getUseRegex());
boolean wholeWord = Boolean.TRUE.equals(request.getWholeWordSearch());
PDDocument doc = null;
PDDocument fb = null;
try {
doc = pdfDocumentFactory.load(request.getFileInput());
try (PDDocument doc = pdfDocumentFactory.load(request.getFileInput())) {
Map<Integer, List<PDFText>> allFound =
RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord);
if (allFound.isEmpty()) {
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
doc.save(baos);
return baos.toByteArray();
}
return toByteArray(doc);
}
helper.performTextReplacementAggressive(doc, allFound, listOfText, useRegex, wholeWord);
Map<Integer, List<PDFText>> residual =
RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord);
boolean residualExists = residual.values().stream().mapToInt(List::size).sum() > 0;
String effectiveColor =
(request.getRedactColor() == null || request.getRedactColor().isBlank())
? "#000000"
: request.getRedactColor();
if (residualExists) {
// Use the new visual redaction with OCR restoration fallback
return helper.performVisualRedactionWithOcrRestoration(
request, listOfText, useRegex, wholeWord);
}
return RedactionService.finalizeRedaction(
doc,
allFound,
request.getRedactColor(),
request.getCustomPadding(),
request.getConvertPDFToImage(), /*text removal*/
request.getConvertPDFToImage(),
true);
} catch (Exception e) {
throw new IOException("Aggressive redaction failed: " + e.getMessage(), e);
} finally {
if (doc != null)
try {
doc.close();
} catch (IOException ignore) {
}
if (fb != null)
try {
fb.close();
} catch (IOException ignore) {
}
}
private byte[] toByteArray(PDDocument doc) throws IOException {
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
doc.save(baos);
return baos.toByteArray();
}
}
}

View File

@@ -29,49 +29,36 @@ class ModerateRedactionService implements RedactionModeStrategy {
boolean useRegex = Boolean.TRUE.equals(request.getUseRegex());
boolean wholeWord = Boolean.TRUE.equals(request.getWholeWordSearch());
PDDocument doc = null;
PDDocument fallback = null;
try {
doc = pdfDocumentFactory.load(request.getFileInput());
try (PDDocument doc = pdfDocumentFactory.load(request.getFileInput())) {
Map<Integer, List<PDFText>> allFound =
RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord);
if (allFound.isEmpty()) {
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
doc.save(baos);
return baos.toByteArray();
}
return toByteArray(doc);
}
boolean fallbackToBoxOnly =
helper.performTextReplacement(doc, allFound, listOfText, useRegex, wholeWord);
String effectiveColor =
(request.getRedactColor() == null || request.getRedactColor().isBlank())
? "#000000"
: request.getRedactColor();
if (fallbackToBoxOnly) {
// Use the new visual redaction with OCR restoration fallback
return helper.performVisualRedactionWithOcrRestoration(
request, listOfText, useRegex, wholeWord);
}
return RedactionService.finalizeRedaction(
doc,
allFound,
effectiveColor,
request.getRedactColor(),
request.getCustomPadding(),
request.getConvertPDFToImage(),
false);
} catch (Exception e) {
throw new IOException("Moderate redaction failed: " + e.getMessage(), e);
} finally {
if (doc != null)
try {
doc.close();
} catch (IOException ignore) {
}
if (fallback != null)
try {
fallback.close();
} catch (IOException ignore) {
}
}
private byte[] toByteArray(PDDocument doc) throws IOException {
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
doc.save(baos);
return baos.toByteArray();
}
}
}

View File

@@ -81,6 +81,7 @@ public class RedactionService {
private static final Set<String> TEXT_SHOWING_OPERATORS = Set.of("Tj", "TJ", "'", "\"");
private static final COSString EMPTY_COS_STRING = new COSString("");
private static final int MAX_SWEEPS = 3;
private static final Pattern PATTERN = Pattern.compile(".*(hoepap|temp|generated).*");
private boolean aggressiveMode = false;
private Map<Integer, List<AggressiveSegMatch>> aggressiveSegMatches = null;
private final CustomPDFDocumentFactory pdfDocumentFactory;
@@ -266,26 +267,20 @@ public class RedactionService {
boolean wholeWordSearch) {
try {
for (String term : targetWords) {
if (term == null || term.isBlank()) {
continue;
}
if (term == null || term.isBlank()) continue;
TextFinder finder = new TextFinder(term, useRegex, wholeWordSearch);
finder.setStartPage(pageIndex + 1);
finder.setEndPage(pageIndex + 1);
finder.getText(document);
List<PDFText> foundTexts = finder.getFoundTexts();
for (PDFText ft : foundTexts) {
if (ft.getPageIndex() == pageIndex) {
for (PDFText text : finder.getFoundTexts()) {
if (text.getPageIndex() == pageIndex) {
return true;
}
}
if (!foundTexts.isEmpty()) {}
}
return false;
} catch (Exception e) {
return true;
}
@@ -297,18 +292,13 @@ public class RedactionService {
boolean useRegex,
boolean wholeWordSearch) {
try {
int idx = -1;
final int numberOfPages = document.getNumberOfPages();
for (int i = 0; i < numberOfPages; i++) {
idx++;
for (int pageIndex = 0; pageIndex < document.getNumberOfPages(); pageIndex++) {
if (pageStillContainsTargets(
document, idx, targetWords, useRegex, wholeWordSearch)) {
document, pageIndex, targetWords, useRegex, wholeWordSearch)) {
return true;
}
}
return false;
} catch (Exception e) {
return true;
}
@@ -352,12 +342,11 @@ public class RedactionService {
for (List<PDFText> pageTexts : allFoundTextsByPage.values()) {
allFoundTexts.addAll(pageTexts);
}
if (!allFoundTexts.isEmpty()) {
if (!isTextRemovalMode) {
if (!allFoundTexts.isEmpty() && !isTextRemovalMode) {
Color redactColor = decodeOrDefault(colorString);
redactFoundText(document, allFoundTexts, customPadding, redactColor);
}
}
if (Boolean.TRUE.equals(convertToImage)) {
try (PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document)) {
ByteArrayOutputStream baos = new ByteArrayOutputStream();
@@ -597,18 +586,11 @@ public class RedactionService {
private static boolean isTextSafeForRedaction(String text) {
if (text == null || text.isEmpty()) return true;
for (int i = 0; i < text.length(); i++) {
char c = text.charAt(i);
int codePoint = c;
if (codePoint >= 65488) {
return false;
}
if (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r') {
for (char c : text.toCharArray()) {
if (c >= 65488 || (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r')) {
return false;
}
}
return true;
}
@@ -657,56 +639,33 @@ public class RedactionService {
return wipeAllSemanticTextInTokens(tokens, true);
}
public byte[] performVisualRedactionWithOcrRestoration(
RedactPdfRequest request,
String[] listOfText,
boolean useRegex,
boolean wholeWordSearch)
throws IOException {
PDDocument visualRedactedDoc = null;
try {
visualRedactedDoc = pdfDocumentFactory.load(request.getFileInput());
Map<Integer, List<PDFText>> allFound =
findTextToRedact(visualRedactedDoc, listOfText, useRegex, wholeWordSearch);
String effectiveColor =
(request.getRedactColor() == null || request.getRedactColor().isBlank())
? "#000000"
: request.getRedactColor();
byte[] visualRedactedBytes =
finalizeRedaction(
visualRedactedDoc,
allFound,
effectiveColor,
request.getCustomPadding(),
true,
false);
return performOcrRestoration(visualRedactedBytes, request);
} catch (Exception e) {
throw new IOException(
"Visual redaction with OCR restoration failed: " + e.getMessage(), e);
} finally {
if (visualRedactedDoc != null) {
try {
visualRedactedDoc.close();
} catch (IOException ignore) {
}
}
private static String normalizeTextForRedaction(String text) {
if (text == null) return null;
StringBuilder normalized = new StringBuilder();
for (int i = 0; i < text.length(); i++) {
char c = text.charAt(i);
if (c >= 65488) {
normalized.append(' ');
} else if (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r') {
normalized.append(' ');
} else {
normalized.append(c);
}
}
private byte[] performOcrRestoration(byte[] redactedPdfBytes, RedactPdfRequest request)
throws IOException, InterruptedException {
try (TempFile tempInputFile = new TempFile(tempFileManager, ".pdf");
TempFile tempOutputFile = new TempFile(tempFileManager, ".pdf")) {
java.nio.file.Files.write(tempInputFile.getPath(), redactedPdfBytes);
if (isOcrMyPdfAvailable()) {
return processWithOcrMyPdfForRestoration(
tempInputFile.getPath(), tempOutputFile.getPath(), request);
} else if (isTesseractAvailable()) {
return processWithTesseractForRestoration(
tempInputFile.getPath(), tempOutputFile.getPath(), request);
return normalized.toString();
}
return redactedPdfBytes;
private static boolean isOcrMyPdfAvailable() {
try {
ProcessExecutorResult result =
ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
.runCommandWithOutputHandling(Arrays.asList("ocrmypdf", "--version"));
return result.getRc() == 0;
} catch (Exception e) {
return false;
}
}
@@ -780,37 +739,7 @@ public class RedactionService {
}
}
private static String normalizeTextForRedaction(String text) {
if (text == null) return null;
StringBuilder normalized = new StringBuilder();
for (int i = 0; i < text.length(); i++) {
char c = text.charAt(i);
if ((int) c >= 65488) {
normalized.append(' ');
} else if (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r') {
normalized.append(' ');
} else {
normalized.append(c);
}
}
return normalized.toString();
}
private boolean isOcrMyPdfAvailable() {
try {
ProcessExecutorResult result =
ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
.runCommandWithOutputHandling(Arrays.asList("ocrmypdf", "--version"));
return result.getRc() == 0;
} catch (Exception e) {
return false;
}
}
private boolean isTesseractAvailable() {
private static boolean isTesseractAvailable() {
try {
ProcessExecutorResult result =
ProcessExecutor.getInstance(ProcessExecutor.Processes.TESSERACT)
@@ -826,7 +755,7 @@ public class RedactionService {
String fontName = font.getName();
if (fontName == null
|| isProperFontSubset(fontName)
|| fontName.toLowerCase().matches(".*(hoepap|temp|generated).*")) {
|| PATTERN.matcher(fontName.toLowerCase()).matches()) {
return false;
}
return hasReliableWidthMetrics(font);
@@ -835,6 +764,58 @@ public class RedactionService {
}
}
private static String sanitizeText(String text) {
if (text == null) return "";
StringBuilder sanitized = new StringBuilder();
for (char c : text.toCharArray()) {
sanitized.append(
(Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r')
? '\uFFFD'
: c);
}
return sanitized.toString();
}
private static byte[] processWithOcrMyPdfForRestoration(
java.nio.file.Path inputPath, java.nio.file.Path outputPath, RedactPdfRequest request)
throws IOException, InterruptedException {
List<String> command =
Arrays.asList(
"ocrmypdf",
"--verbose",
"1",
"--output-type",
"pdf",
"--pdf-renderer",
"sandwich",
"--language",
"eng",
"--optimize",
"0",
"--jpeg-quality",
"100",
"--png-quality",
"9",
"--force-ocr",
"--deskew",
"--clean",
"--clean-final",
inputPath.toString(),
outputPath.toString());
ProcessExecutorResult result =
ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
.runCommandWithOutputHandling(command);
if (result.getRc() != 0) {
throw new IOException(
"OCRmyPDF restoration failed with return code: "
+ result.getRc()
+ ". Error: "
+ result.getMessages());
}
return java.nio.file.Files.readAllBytes(outputPath);
}
private static String createSubsetFontPlaceholder(
String originalWord, float targetWidth, PDFont font, float fontSize) {
String result = createAlternativePlaceholder(originalWord, targetWidth, font, fontSize);
@@ -843,77 +824,144 @@ public class RedactionService {
: " ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1));
}
public void performTextReplacementAggressive(
PDDocument document,
Map<Integer, List<PDFText>> allFoundTextsByPage,
String[] listOfText,
private static COSArray buildKerningAdjustedTJArray(
COSArray originalArray, COSArray redactedArray, TextSegment segment) {
try {
if (segment == null || segment.getFont() == null || segment.getFontSize() <= 0)
return redactedArray;
COSArray out = new COSArray();
int size = redactedArray.size();
for (int i = 0; i < size; i++) {
COSBase redEl = redactedArray.get(i);
COSBase origEl =
(originalArray != null && i < originalArray.size())
? originalArray.get(i)
: null;
out.add(redEl);
if (redEl instanceof COSString redStr && origEl instanceof COSString origStr) {
String origText = getDecodedString(origStr, segment.getFont());
String modText = getDecodedString(redStr, segment.getFont());
float wOrig =
calculateSafeWidth(origText, segment.getFont(), segment.getFontSize());
float wMod =
calculateSafeWidth(modText, segment.getFont(), segment.getFontSize());
float adjustment = wOrig - wMod;
if (Math.abs(adjustment) > PRECISION_THRESHOLD) {
float kerning = (-adjustment / segment.getFontSize()) * FONT_SCALE_FACTOR;
if (i + 1 < size && redactedArray.get(i + 1) instanceof COSNumber num) {
i++;
float combined = num.floatValue() + kerning;
out.add(new COSFloat(combined));
} else {
out.add(new COSFloat(kerning));
}
}
}
}
return out;
} catch (Exception e) {
return redactedArray;
}
}
private static List<MatchRange> findMatchesInSegments(
List<TextSegment> segments,
Set<String> targetWords,
boolean useRegex,
boolean wholeWordSearchBool) {
if (allFoundTextsByPage.isEmpty()) {
return;
boolean wholeWordSearch) {
List<MatchRange> allMatches = new ArrayList<>();
List<Pattern> patterns =
TextFinderUtils.createOptimizedSearchPatterns(
targetWords, useRegex, wholeWordSearch);
log.debug("Searching for {} patterns in {} segments", patterns.size(), segments.size());
int totalMatchesFound = 0;
for (int i = 0; i < segments.size(); i++) {
TextSegment segment = segments.get(i);
String segmentText = segment.getText();
if (segmentText == null || segmentText.isEmpty()) {
log.debug("Skipping empty segment {}", i);
continue;
}
Set<String> allSearchTerms =
Arrays.stream(listOfText)
.map(String::trim)
.filter(s -> !s.isEmpty())
.collect(Collectors.toSet());
this.aggressiveMode = true;
this.aggressiveSegMatches = new HashMap<>();
log.debug("Processing segment {}: '{}'", i, segmentText);
if (segment.getFont() != null
&& !TextEncodingHelper.isTextSegmentRemovable(segment.getFont(), segmentText)) {
log.debug(
"Skipping segment {} - font not removable: {}",
i,
segment.getFont().getName());
continue;
}
int segmentMatches = 0;
for (Pattern pattern : patterns) {
try {
for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) {
boolean anyResidual = false;
int pageIndex = -1;
for (PDPage page : document.getPages()) {
pageIndex++;
try {
this.aggressiveSegMatches = new HashMap<>();
List<Object> filtered =
createTokensWithoutTargetText(
document,
page,
allSearchTerms,
useRegex,
wholeWordSearchBool);
writeFilteredContentStream(document, page, filtered);
boolean residual =
pageStillContainsTargets(
document,
pageIndex,
allSearchTerms,
useRegex,
wholeWordSearchBool);
if (residual) {
anyResidual = true;
try {
var sem = wipeAllSemanticTextInTokens(filtered);
filtered = sem.tokens;
PDResources res = page.getResources();
if (res != null) {
wipeAllSemanticTextInProperties(res);
wipeAllTextInXObjects(document, res);
wipeAllTextInPatterns(document, res);
}
writeFilteredContentStream(document, page, filtered);
} catch (Exception ignored) {
log.debug(
"Matching pattern '{}' against segment text '{}'",
pattern.pattern(),
segmentText);
var matcher = pattern.matcher(segmentText);
while (matcher.find()) {
int matchStart = matcher.start();
int matchEnd = matcher.end();
log.debug(
"Found match in segment {}: positions {}-{}",
i,
matchStart,
matchEnd);
if (matchStart >= 0
&& matchEnd <= segmentText.length()
&& matchStart < matchEnd) {
String matchedText = segmentText.substring(matchStart, matchEnd);
log.debug("Matched text: '{}'", matchedText);
allMatches.add(
new MatchRange(
segment.getStartPos() + matchStart,
segment.getStartPos() + matchEnd));
segmentMatches++;
totalMatchesFound++;
}
}
} catch (Exception ignored) {
} catch (Exception e) {
log.error("Error matching pattern in segment {}: {}", i, e.getMessage());
}
}
if (!anyResidual) {
break;
}
if (!documentStillContainsTargets(
document, allSearchTerms, useRegex, wholeWordSearchBool)) {
break;
if (segmentMatches > 0) {
log.info("Segment {} had {} matches", i, segmentMatches);
}
}
} finally {
this.aggressiveMode = false;
this.aggressiveSegMatches = null;
log.info("Total matches found across all segments: {}", totalMatchesFound);
allMatches.sort(Comparator.comparingInt(MatchRange::getStartPos));
if (allMatches.isEmpty()) {
log.warn("No matches found in segments. This might indicate:");
log.warn("1. Text encoding issues preventing proper extraction");
log.warn("2. Font compatibility issues");
log.warn("3. Search terms not matching extracted text");
log.warn("4. Whole word search filtering out matches");
if (!segments.isEmpty()) {
log.warn("Sample segment text: '{}'", segments.get(0).getText());
log.warn("Target words: {}", targetWords);
log.warn("Use regex: {}, Whole word search: {}", useRegex, wholeWordSearch);
}
}
return allMatches;
}
private static float calculateCharacterSumWidth(PDFont font, String text) {
float totalWidth = 0f;
for (char c : text.toCharArray()) {
@@ -1033,19 +1081,29 @@ public class RedactionService {
}
}
private static String sanitizeText(String text) {
if (text == null) return "";
StringBuilder sanitized = new StringBuilder();
for (char c : text.toCharArray()) {
if (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r') {
sanitized.append('\uFFFD');
} else {
sanitized.append(c);
public byte[] performVisualRedactionWithOcrRestoration(
RedactPdfRequest request,
String[] listOfText,
boolean useRegex,
boolean wholeWordSearch)
throws IOException {
try (PDDocument doc = pdfDocumentFactory.load(request.getFileInput())) {
Map<Integer, List<PDFText>> allFound =
findTextToRedact(doc, listOfText, useRegex, wholeWordSearch);
byte[] visualRedactedBytes =
finalizeRedaction(
doc,
allFound,
request.getRedactColor(),
request.getCustomPadding(),
true,
false);
return performOcrRestoration(visualRedactedBytes, request);
} catch (Exception e) {
throw new IOException(
"Visual redaction with OCR restoration failed: " + e.getMessage(), e);
}
}
return sanitized.toString();
}
private static WipeResult wipeAllSemanticTextInTokens(List<Object> tokens, boolean removeTU) {
if (tokens == null || tokens.isEmpty()) {
@@ -1064,43 +1122,21 @@ public class RedactionService {
return res;
}
private byte[] processWithOcrMyPdfForRestoration(
java.nio.file.Path inputPath, java.nio.file.Path outputPath, RedactPdfRequest request)
private byte[] performOcrRestoration(byte[] redactedPdfBytes, RedactPdfRequest request)
throws IOException, InterruptedException {
List<String> command =
Arrays.asList(
"ocrmypdf",
"--verbose",
"1",
"--output-type",
"pdf",
"--pdf-renderer",
"sandwich",
"--language",
"eng",
"--optimize",
"0",
"--jpeg-quality",
"100",
"--png-quality",
"9",
"--force-ocr",
"--deskew",
"--clean",
"--clean-final",
inputPath.toString(),
outputPath.toString());
ProcessExecutorResult result =
ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
.runCommandWithOutputHandling(command);
if (result.getRc() != 0) {
throw new IOException(
"OCRmyPDF restoration failed with return code: "
+ result.getRc()
+ ". Error: "
+ result.getMessages());
try (TempFile tempInputFile = new TempFile(tempFileManager, ".pdf");
TempFile tempOutputFile = new TempFile(tempFileManager, ".pdf")) {
java.nio.file.Files.write(tempInputFile.getPath(), redactedPdfBytes);
if (isOcrMyPdfAvailable()) {
return processWithOcrMyPdfForRestoration(
tempInputFile.getPath(), tempOutputFile.getPath(), request);
} else if (isTesseractAvailable()) {
return processWithTesseractForRestoration(
tempInputFile.getPath(), tempOutputFile.getPath(), request);
}
return redactedPdfBytes;
}
return java.nio.file.Files.readAllBytes(outputPath);
}
private static boolean removeSemanticProperties(COSDictionary dict, boolean removeTU) {
@@ -1427,59 +1463,62 @@ public class RedactionService {
}
}
private int getOriginalTokenCount(PDPage page) {
public void performTextReplacementAggressive(
PDDocument document,
Map<Integer, List<PDFText>> allFoundTextsByPage,
String[] listOfText,
boolean useRegex,
boolean wholeWordSearchBool) {
if (allFoundTextsByPage.isEmpty()) return;
Set<String> allSearchTerms =
Arrays.stream(listOfText)
.map(String::trim)
.filter(s -> !s.isEmpty())
.collect(Collectors.toSet());
this.aggressiveMode = true;
this.aggressiveSegMatches = new HashMap<>();
try {
PDFStreamParser parser = new PDFStreamParser(page);
int count = 0;
while (parser.parseNextToken() != null) {
count++;
}
return count;
} catch (Exception e) {
return 0;
}
}
for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) {
boolean anyResidual = false;
private COSArray buildKerningAdjustedTJArray(
COSArray originalArray, COSArray redactedArray, TextSegment segment) {
for (int pageIndex = 0; pageIndex < document.getNumberOfPages(); pageIndex++) {
PDPage page = document.getPages().get(pageIndex);
try {
if (segment == null || segment.getFont() == null || segment.getFontSize() <= 0)
return redactedArray;
this.aggressiveSegMatches = new HashMap<>();
List<Object> filtered =
createTokensWithoutTargetText(
document,
page,
allSearchTerms,
useRegex,
wholeWordSearchBool);
writeFilteredContentStream(document, page, filtered);
COSArray out = new COSArray();
int size = redactedArray.size();
for (int i = 0; i < size; i++) {
COSBase redEl = redactedArray.get(i);
COSBase origEl =
(originalArray != null && i < originalArray.size())
? originalArray.get(i)
: null;
if (pageStillContainsTargets(
document,
pageIndex,
allSearchTerms,
useRegex,
wholeWordSearchBool)) {
anyResidual = true;
processResidualText(document, page, filtered);
}
} catch (Exception ignored) {
}
}
out.add(redEl);
if (redEl instanceof COSString redStr && origEl instanceof COSString origStr) {
String origText = getDecodedString(origStr, segment.getFont());
String modText = getDecodedString(redStr, segment.getFont());
float wOrig =
calculateSafeWidth(origText, segment.getFont(), segment.getFontSize());
float wMod =
calculateSafeWidth(modText, segment.getFont(), segment.getFontSize());
float adjustment = wOrig - wMod;
if (Math.abs(adjustment) > PRECISION_THRESHOLD) {
float kerning = (-adjustment / segment.getFontSize()) * FONT_SCALE_FACTOR;
if (i + 1 < size && redactedArray.get(i + 1) instanceof COSNumber num) {
i++;
float combined = num.floatValue() + kerning;
out.add(new COSFloat(combined));
} else {
out.add(new COSFloat(kerning));
if (!anyResidual
|| !documentStillContainsTargets(
document, allSearchTerms, useRegex, wholeWordSearchBool)) {
break;
}
}
}
}
return out;
} catch (Exception e) {
return redactedArray;
} finally {
this.aggressiveMode = false;
this.aggressiveSegMatches = null;
}
}
@@ -1678,6 +1717,21 @@ public class RedactionService {
return problematicRatio > 0.3;
}
private void processResidualText(PDDocument document, PDPage page, List<Object> filtered) {
try {
var sem = wipeAllSemanticTextInTokens(filtered);
filtered = sem.tokens;
PDResources res = page.getResources();
if (res != null) {
wipeAllSemanticTextInProperties(res);
wipeAllTextInXObjects(document, res);
wipeAllTextInPatterns(document, res);
}
writeFilteredContentStream(document, page, filtered);
} catch (Exception ignored) {
}
}
public boolean performTextReplacement(
PDDocument document,
Map<Integer, List<PDFText>> allFoundTextsByPage,
@@ -1688,151 +1742,38 @@ public class RedactionService {
log.info("No text found to redact");
return false;
}
try {
Set<String> allSearchTerms =
Arrays.stream(listOfText)
.map(String::trim)
.filter(s -> !s.isEmpty())
.collect(Collectors.toSet());
log.info(
"Starting text replacement with {} search terms: {}",
allSearchTerms.size(),
allSearchTerms);
log.info("Total pages in document: {}", document.getNumberOfPages());
log.info("Initial text found on {} pages", allFoundTextsByPage.size());
log.info("Starting text replacement with {} search terms", allSearchTerms.size());
int initialTotalInstances =
allFoundTextsByPage.values().stream().mapToInt(List::size).sum();
log.info("Total initial instances to redact: {}", initialTotalInstances);
int finalSweepCount = 0;
for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) {
finalSweepCount = sweep + 1;
log.info("=== Starting sweep {} of {} ===", sweep + 1, MAX_SWEEPS);
int pagesProcessed = 0;
int totalModifications = 0;
processPages(document, allSearchTerms, useRegex, wholeWordSearchBool);
for (int pageIndex = 0; pageIndex < document.getNumberOfPages(); pageIndex++) {
PDPage page = document.getPages().get(pageIndex);
List<PDFText> pageFoundTexts =
allFoundTextsByPage.getOrDefault(pageIndex, List.of());
log.debug(
"Processing page {} - found {} instances",
pageIndex + 1,
pageFoundTexts.size());
List<Object> filtered =
createTokensWithoutTargetText(
document, page, allSearchTerms, useRegex, wholeWordSearchBool);
writeFilteredContentStream(document, page, filtered);
int tokenDiff = Math.abs(filtered.size() - getOriginalTokenCount(page));
totalModifications += tokenDiff;
pagesProcessed++;
log.debug("Page {} - token modifications: {}", pageIndex + 1, tokenDiff);
}
log.info(
"Sweep {} completed - processed {} pages, total modifications: {}",
sweep + 1,
pagesProcessed,
totalModifications);
boolean stillContainsTargets =
documentStillContainsTargets(
document, allSearchTerms, useRegex, wholeWordSearchBool);
if (!stillContainsTargets) {
if (!documentStillContainsTargets(
document, allSearchTerms, useRegex, wholeWordSearchBool)) {
log.info("SUCCESS: All targets removed after {} sweeps", sweep + 1);
break;
} else {
log.warn(
"WARNING: Still contains targets after sweep {} - continuing...",
sweep + 1);
}
}
boolean finalCheck = false;
for (int verifyAttempt = 0; verifyAttempt < 3; verifyAttempt++) {
log.info("Final verification attempt {} of 3", verifyAttempt + 1);
finalCheck =
documentStillContainsTargets(
document, allSearchTerms, useRegex, wholeWordSearchBool);
if (!finalCheck) {
log.info(
"Verification attempt {} passed - no targets found", verifyAttempt + 1);
break;
} else {
log.warn("Verification attempt {} found remaining targets", verifyAttempt + 1);
if (verifyAttempt < 2) {
log.info("Performing additional cleanup sweep due to verification failure");
for (PDPage page : document.getPages()) {
List<Object> additionalFiltered =
createTokensWithoutTargetText(
document,
page,
allSearchTerms,
useRegex,
wholeWordSearchBool);
writeFilteredContentStream(document, page, additionalFiltered);
}
}
}
}
if (finalCheck) {
log.error(
"FAILURE: Document still contains targets after {} sweeps and {} verification attempts. Falling back to visual redaction with OCR restoration.",
MAX_SWEEPS,
3);
log.error("Remaining search terms: {}", allSearchTerms);
log.error("=== DETAILED FAILURE ANALYSIS ===");
for (int pageIdx = 0; pageIdx < document.getNumberOfPages(); pageIdx++) {
for (String term : allSearchTerms) {
try {
TextFinder finder = new TextFinder(term, useRegex, wholeWordSearchBool);
finder.setStartPage(pageIdx + 1);
finder.setEndPage(pageIdx + 1);
finder.getText(document);
for (PDFText found : finder.getFoundTexts()) {
if (found.getPageIndex() == pageIdx) {
log.error(
"REMAINING: '{}' found on page {} at position ({}, {})",
term,
pageIdx + 1,
found.getX1(),
found.getY1());
}
}
} catch (Exception e) {
log.error(
"Error during failure analysis for term '{}' on page {}: {}",
term,
pageIdx + 1,
e.getMessage());
}
}
}
log.error("=== END FAILURE ANALYSIS ===");
return true;
} else {
log.info(
"SUCCESS: All text redaction completed successfully after {} sweeps",
finalSweepCount);
return false;
}
} catch (Exception e) {
log.error("Exception during text replacement: {}", e.getMessage(), e);
return true;
}
// Verification attempts
for (int attempt = 0; attempt < 3; attempt++) {
if (!documentStillContainsTargets(
document, allSearchTerms, useRegex, wholeWordSearchBool)) {
return false;
}
if (attempt < 2) {
processPages(document, allSearchTerms, useRegex, wholeWordSearchBool);
}
}
log.error("FAILURE: Document still contains targets after {} sweeps", MAX_SWEEPS);
return true;
}
private COSArray createRedactedTJArray(
@@ -1917,99 +1858,21 @@ public class RedactionService {
};
}
private List<MatchRange> findMatchesInSegments(
List<TextSegment> segments,
Set<String> targetWords,
private void processPages(
PDDocument document,
Set<String> allSearchTerms,
boolean useRegex,
boolean wholeWordSearch) {
List<MatchRange> allMatches = new ArrayList<>();
List<Pattern> patterns =
TextFinderUtils.createOptimizedSearchPatterns(
targetWords, useRegex, wholeWordSearch);
log.debug("Searching for {} patterns in {} segments", patterns.size(), segments.size());
int totalMatchesFound = 0;
for (int i = 0; i < segments.size(); i++) {
TextSegment segment = segments.get(i);
String segmentText = segment.getText();
if (segmentText == null || segmentText.isEmpty()) {
log.debug("Skipping empty segment {}", i);
continue;
}
log.debug("Processing segment {}: '{}'", i, segmentText);
if (segment.getFont() != null
&& !TextEncodingHelper.isTextSegmentRemovable(segment.getFont(), segmentText)) {
log.debug(
"Skipping segment {} - font not removable: {}",
i,
segment.getFont().getName());
continue;
}
int segmentMatches = 0;
for (Pattern pattern : patterns) {
boolean wholeWordSearchBool) {
for (PDPage page : document.getPages()) {
try {
log.debug(
"Matching pattern '{}' against segment text '{}'",
pattern.pattern(),
segmentText);
var matcher = pattern.matcher(segmentText);
while (matcher.find()) {
int matchStart = matcher.start();
int matchEnd = matcher.end();
log.debug(
"Found match in segment {}: positions {}-{}",
i,
matchStart,
matchEnd);
if (matchStart >= 0
&& matchEnd <= segmentText.length()
&& matchStart < matchEnd) {
String matchedText = segmentText.substring(matchStart, matchEnd);
log.debug("Matched text: '{}'", matchedText);
allMatches.add(
new MatchRange(
segment.getStartPos() + matchStart,
segment.getStartPos() + matchEnd));
segmentMatches++;
totalMatchesFound++;
}
}
List<Object> filtered =
createTokensWithoutTargetText(
document, page, allSearchTerms, useRegex, wholeWordSearchBool);
writeFilteredContentStream(document, page, filtered);
} catch (Exception e) {
log.error("Error matching pattern in segment {}: {}", i, e.getMessage());
log.warn("Error processing page: {}", e.getMessage());
}
}
if (segmentMatches > 0) {
log.info("Segment {} had {} matches", i, segmentMatches);
}
}
log.info("Total matches found across all segments: {}", totalMatchesFound);
allMatches.sort(Comparator.comparingInt(MatchRange::getStartPos));
if (allMatches.isEmpty()) {
log.warn("No matches found in segments. This might indicate:");
log.warn("1. Text encoding issues preventing proper extraction");
log.warn("2. Font compatibility issues");
log.warn("3. Search terms not matching extracted text");
log.warn("4. Whole word search filtering out matches");
if (!segments.isEmpty()) {
log.warn("Sample segment text: '{}'", segments.get(0).getText());
log.warn("Target words: {}", targetWords);
log.warn("Use regex: {}, Whole word search: {}", useRegex, wholeWordSearch);
}
}
return allMatches;
}
private String createSafeReplacement(String originalPart, TextSegment segment) {
@@ -2962,9 +2825,9 @@ public class RedactionService {
@Data
public static class DecodedMapping {
public String text;
public int[] charByteStart;
public int[] charByteEnd;
private String text;
private int[] charByteStart;
private int[] charByteEnd;
}
@Data

View File

@@ -5,10 +5,17 @@ import org.apache.pdfbox.pdmodel.font.PDFont;
import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j;
import java.util.regex.Pattern;
@Slf4j
@UtilityClass
public class TextEncodingHelper {
private final Pattern PATTERN = Pattern.compile("^[A-Z]+$");
private final Pattern REGEX = Pattern.compile("^[A-Z]{6}\\+.*");
private final Pattern REGEXP = Pattern.compile("^[A-Z]{5}\\+.*");
private final Pattern PATTERN1 = Pattern.compile("^[A-Z]{4}\\+.*");
public boolean canEncodeCharacters(PDFont font, String text) {
if (font == null || text == null) {
return false;
@ -421,21 +428,21 @@ public class TextEncodingHelper {
return false;
}
if (fontName.matches("^[A-Z]{6}\\+.*")) {
if (REGEX.matcher(fontName).matches()) {
return true;
}
if (fontName.matches("^[A-Z]{5}\\+.*")) {
if (REGEXP.matcher(fontName).matches()) {
return true;
}
if (fontName.matches("^[A-Z]{4}\\+.*")) {
if (PATTERN1.matcher(fontName).matches()) {
return true;
}
if (fontName.contains("+")) {
String prefix = fontName.split("\\+")[0];
if (prefix.matches("^[A-Z]+$") && prefix.length() >= 4) {
if (PATTERN.matcher(prefix).matches() && prefix.length() >= 4) {
return true;
}
}
@ -510,68 +517,4 @@ public class TextEncodingHelper {
return false;
}
/**
 * Heuristically determines whether {@code font} can encode at least one character.
 *
 * <p>First probes a set of common ASCII characters and short words; if none of
 * those encode, falls back to sampling the Basic Multilingual Plane every 100
 * code points. Encoding failures are expected for unsupported glyphs and are
 * deliberately ignored.
 *
 * @param font the font to probe; may be {@code null}
 * @return {@code true} if any probe encodes to a non-empty byte sequence
 */
public boolean canEncodeAnyCharacter(PDFont font) {
    if (font == null) {
        return false;
    }
    // Common letters, digits, punctuation and short words: if any of these
    // encodes, the font is usable for typical text.
    String[] probes = {
        "a", "A", "0", " ", ".", "!", "e", "i", "o", "u", "n", "t", "r", "s", "l",
        "1", "2", "3", "4", "5", "6", "7", "8", "9", ",", ";", ":", "?", "(", ")",
        "[", "]", "{", "}", "hello", "test", "sample", "abc", "123", "ABC"
    };
    for (String probe : probes) {
        try {
            if (font.encode(probe).length > 0) {
                return true;
            }
        } catch (Exception ignored) {
            // This probe is not encodable in this font; try the next one.
        }
    }
    // Fallback: coarse sampling of the BMP code-point space.
    for (int code = 0; code <= 0xFFFF; code += 100) {
        char c = (char) code;
        if (Character.isSurrogate(c)) {
            // A lone surrogate is not a valid string; encode() would always fail.
            continue;
        }
        try {
            if (font.encode(String.valueOf(c)).length > 0) {
                return true;
            }
        } catch (Exception ignored) {
            // Not encodable; keep sampling.
        }
    }
    return false;
}
/**
 * Best-effort validity check for a font.
 *
 * <p>A font is considered valid when any of the following succeeds, tried in
 * order: it reports a non-blank name, basic width metrics can be calculated
 * for it, or it can encode at least one character. Each probe may throw on a
 * malformed font; failures simply fall through to the next probe.
 *
 * @param font the font to check; may be {@code null}
 * @return {@code true} if any validity probe succeeds
 */
public boolean isValidFont(PDFont font) {
    if (font == null) {
        return false;
    }
    try {
        String name = font.getName();
        if (name != null && !name.trim().isEmpty()) {
            return true;
        }
    } catch (Exception ignored) {
        // getName() can fail on corrupt font dictionaries; try width metrics next.
    }
    try {
        if (canCalculateBasicWidths(font)) {
            return true;
        }
    } catch (Exception ignored) {
        // Width calculation failed; try the encoding probe next.
    }
    try {
        if (canEncodeAnyCharacter(font)) {
            return true;
        }
    } catch (Exception ignored) {
        // All probes failed; treat the font as invalid.
    }
    return false;
}
}

View File

@ -80,10 +80,6 @@ public class WidthCalculator {
Float charWidth =
calculateSingleCharacterWidth(font, character, fontSize, codePoint);
if (charWidth == null) {
return null;
}
totalWidth += charWidth;
if (previousCodePoint != -1) {
totalWidth += calculateKerning(font, previousCodePoint, codePoint, fontSize);
@ -203,9 +199,6 @@ public class WidthCalculator {
Float charWidth =
calculateGlyphWidthComprehensively(font, character, codePoint, fontSize);
if (charWidth == null) {
return null;
}
totalWidth += charWidth;
i += Character.charCount(codePoint);
@ -514,64 +507,4 @@ public class WidthCalculator {
return false;
}
/**
 * Estimates a lower bound for the rendered width of {@code text} at
 * {@code fontSize}.
 *
 * <p>Returns 80% of the accurately measured width when measurement succeeds
 * and yields a positive value; otherwise falls back to a rough heuristic of
 * {@code 0.3 * fontSize} per character.
 *
 * @param font the font used for measurement; may be {@code null}
 * @param text the text to measure; may be {@code null} or empty
 * @param fontSize the font size in points; must be positive
 * @return the estimated minimum width, or 0 for invalid arguments
 */
public float calculateMinimumTextWidth(PDFont font, String text, float fontSize) {
    if (font == null || text == null || text.isEmpty() || fontSize <= 0) {
        return 0;
    }
    try {
        float accurate = calculateAccurateWidth(font, text, fontSize);
        if (accurate > 0) {
            // 20% safety margin below the measured width.
            return accurate * 0.8f;
        }
    } catch (Exception ignored) {
        // Accurate measurement failed; use the per-character heuristic below.
    }
    return text.length() * fontSize * 0.3f;
}
/**
 * Estimates an upper bound for the rendered width of {@code text} at
 * {@code fontSize}.
 *
 * <p>Returns 120% of the accurately measured width when measurement succeeds
 * and yields a positive value; otherwise falls back to a rough heuristic of
 * {@code fontSize} per character.
 *
 * @param font the font used for measurement; may be {@code null}
 * @param text the text to measure; may be {@code null} or empty
 * @param fontSize the font size in points; must be positive
 * @return the estimated maximum width, or 0 for invalid arguments
 */
public float calculateMaximumTextWidth(PDFont font, String text, float fontSize) {
    if (font == null || text == null || text.isEmpty() || fontSize <= 0) {
        return 0;
    }
    try {
        float accurate = calculateAccurateWidth(font, text, fontSize);
        if (accurate > 0) {
            // 20% safety margin above the measured width.
            return accurate * 1.2f;
        }
    } catch (Exception ignored) {
        // Accurate measurement failed; use the per-character heuristic below.
    }
    return text.length() * fontSize * 1.0f;
}
/**
 * Reports whether a width can be computed for {@code text} in {@code font}.
 *
 * <p>NOTE(review): as written, this returns {@code true} for every non-null
 * input — both probes return {@code true} on success AND the final fallthrough
 * also returns {@code true}, so the probe results never change the outcome.
 * Confirm whether the last statement was meant to be {@code return false}.
 *
 * @param font the font to probe; {@code null} yields {@code false}
 * @param text the text to measure; {@code null} yields {@code false}
 * @return {@code false} only when {@code font} or {@code text} is {@code null}
 */
public boolean canCalculateWidthForText(PDFont font, String text) {
    if (font == null || text == null) {
        return false;
    }
    if (text.isEmpty()) {
        // Empty text trivially has width 0.
        return true;
    }
    try {
        // Probe 1: direct width lookup at an arbitrary reference size (12pt).
        Float width = calculateDirectWidth(font, text, 12f);
        if (width != null) {
            return true;
        }
    } catch (Exception e) {
        // Swallowed deliberately: fall through to the per-character probe.
    }
    try {
        // Probe 2: character-by-character width accumulation at 12pt.
        Float width = calculateCharacterByCharacterWidth(font, text, 12f);
        if (width != null) {
            return true;
        }
    } catch (Exception e) {
        // Swallowed deliberately: optimistic default below.
    }
    // Optimistic default: assume the width is computable even if both probes failed.
    return true;
}
}

View File

@ -13,20 +13,7 @@
color: #6c757d !important;
}
.btn-primary:focus {
box-shadow: 0 0 0 0.2rem rgba(13, 110, 253, 0.25);
outline: 2px solid #0d6efd;
outline-offset: 2px;
}
.form-check-input:focus {
box-shadow: 0 0 0 0.2rem rgba(13, 110, 253, 0.25);
outline: 2px solid #0d6efd;
outline-offset: 2px;
}
.form-control:focus, .form-select:focus {
border-color: #0d6efd;
.btn-primary:focus, .form-check-input:focus, .form-control:focus, .form-select:focus {
box-shadow: 0 0 0 0.2rem rgba(13, 110, 253, 0.25);
outline: 2px solid #0d6efd;
outline-offset: 2px;
@ -36,20 +23,6 @@
background-color: #0d6efd;
border-color: #0d6efd;
}
.sr-only {
position: absolute;
width: 1px;
height: 1px;
padding: 0;
margin: -1px;
overflow: hidden;
clip: rect(0, 0, 0, 0);
white-space: nowrap;
border: 0;
}
</style>
</head>