refactor redaction services to improve resource management and streamline text processing

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
Balázs Szücs 2025-08-25 19:53:19 +02:00
parent f236505cae
commit 3ac7f0df4c
6 changed files with 394 additions and 707 deletions

View File

@ -30,51 +30,39 @@ class AggressiveRedactionService implements RedactionModeStrategy {
boolean useRegex = Boolean.TRUE.equals(request.getUseRegex()); boolean useRegex = Boolean.TRUE.equals(request.getUseRegex());
boolean wholeWord = Boolean.TRUE.equals(request.getWholeWordSearch()); boolean wholeWord = Boolean.TRUE.equals(request.getWholeWordSearch());
PDDocument doc = null; try (PDDocument doc = pdfDocumentFactory.load(request.getFileInput())) {
PDDocument fb = null;
try {
doc = pdfDocumentFactory.load(request.getFileInput());
Map<Integer, List<PDFText>> allFound = Map<Integer, List<PDFText>> allFound =
RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord); RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord);
if (allFound.isEmpty()) { if (allFound.isEmpty()) {
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { return toByteArray(doc);
doc.save(baos);
return baos.toByteArray();
}
} }
helper.performTextReplacementAggressive(doc, allFound, listOfText, useRegex, wholeWord); helper.performTextReplacementAggressive(doc, allFound, listOfText, useRegex, wholeWord);
Map<Integer, List<PDFText>> residual = Map<Integer, List<PDFText>> residual =
RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord); RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord);
boolean residualExists = residual.values().stream().mapToInt(List::size).sum() > 0; boolean residualExists = residual.values().stream().mapToInt(List::size).sum() > 0;
String effectiveColor =
(request.getRedactColor() == null || request.getRedactColor().isBlank())
? "#000000"
: request.getRedactColor();
if (residualExists) { if (residualExists) {
// Use the new visual redaction with OCR restoration fallback
return helper.performVisualRedactionWithOcrRestoration( return helper.performVisualRedactionWithOcrRestoration(
request, listOfText, useRegex, wholeWord); request, listOfText, useRegex, wholeWord);
} }
return RedactionService.finalizeRedaction( return RedactionService.finalizeRedaction(
doc, doc,
allFound, allFound,
request.getRedactColor(), request.getRedactColor(),
request.getCustomPadding(), request.getCustomPadding(),
request.getConvertPDFToImage(), /*text removal*/ request.getConvertPDFToImage(),
true); true);
} catch (Exception e) { } catch (Exception e) {
throw new IOException("Aggressive redaction failed: " + e.getMessage(), e); throw new IOException("Aggressive redaction failed: " + e.getMessage(), e);
} finally {
if (doc != null)
try {
doc.close();
} catch (IOException ignore) {
}
if (fb != null)
try {
fb.close();
} catch (IOException ignore) {
} }
} }
private byte[] toByteArray(PDDocument doc) throws IOException {
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
doc.save(baos);
return baos.toByteArray();
}
} }
} }

View File

@ -29,49 +29,36 @@ class ModerateRedactionService implements RedactionModeStrategy {
boolean useRegex = Boolean.TRUE.equals(request.getUseRegex()); boolean useRegex = Boolean.TRUE.equals(request.getUseRegex());
boolean wholeWord = Boolean.TRUE.equals(request.getWholeWordSearch()); boolean wholeWord = Boolean.TRUE.equals(request.getWholeWordSearch());
PDDocument doc = null; try (PDDocument doc = pdfDocumentFactory.load(request.getFileInput())) {
PDDocument fallback = null;
try {
doc = pdfDocumentFactory.load(request.getFileInput());
Map<Integer, List<PDFText>> allFound = Map<Integer, List<PDFText>> allFound =
RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord); RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord);
if (allFound.isEmpty()) { if (allFound.isEmpty()) {
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { return toByteArray(doc);
doc.save(baos);
return baos.toByteArray();
}
} }
boolean fallbackToBoxOnly = boolean fallbackToBoxOnly =
helper.performTextReplacement(doc, allFound, listOfText, useRegex, wholeWord); helper.performTextReplacement(doc, allFound, listOfText, useRegex, wholeWord);
String effectiveColor =
(request.getRedactColor() == null || request.getRedactColor().isBlank())
? "#000000"
: request.getRedactColor();
if (fallbackToBoxOnly) { if (fallbackToBoxOnly) {
// Use the new visual redaction with OCR restoration fallback
return helper.performVisualRedactionWithOcrRestoration( return helper.performVisualRedactionWithOcrRestoration(
request, listOfText, useRegex, wholeWord); request, listOfText, useRegex, wholeWord);
} }
return RedactionService.finalizeRedaction( return RedactionService.finalizeRedaction(
doc, doc,
allFound, allFound,
effectiveColor, request.getRedactColor(),
request.getCustomPadding(), request.getCustomPadding(),
request.getConvertPDFToImage(), request.getConvertPDFToImage(),
false); false);
} catch (Exception e) { } catch (Exception e) {
throw new IOException("Moderate redaction failed: " + e.getMessage(), e); throw new IOException("Moderate redaction failed: " + e.getMessage(), e);
} finally {
if (doc != null)
try {
doc.close();
} catch (IOException ignore) {
}
if (fallback != null)
try {
fallback.close();
} catch (IOException ignore) {
} }
} }
private byte[] toByteArray(PDDocument doc) throws IOException {
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
doc.save(baos);
return baos.toByteArray();
}
} }
} }

View File

@ -81,6 +81,7 @@ public class RedactionService {
private static final Set<String> TEXT_SHOWING_OPERATORS = Set.of("Tj", "TJ", "'", "\""); private static final Set<String> TEXT_SHOWING_OPERATORS = Set.of("Tj", "TJ", "'", "\"");
private static final COSString EMPTY_COS_STRING = new COSString(""); private static final COSString EMPTY_COS_STRING = new COSString("");
private static final int MAX_SWEEPS = 3; private static final int MAX_SWEEPS = 3;
private static final Pattern PATTERN = Pattern.compile(".*(hoepap|temp|generated).*");
private boolean aggressiveMode = false; private boolean aggressiveMode = false;
private Map<Integer, List<AggressiveSegMatch>> aggressiveSegMatches = null; private Map<Integer, List<AggressiveSegMatch>> aggressiveSegMatches = null;
private final CustomPDFDocumentFactory pdfDocumentFactory; private final CustomPDFDocumentFactory pdfDocumentFactory;
@ -266,26 +267,20 @@ public class RedactionService {
boolean wholeWordSearch) { boolean wholeWordSearch) {
try { try {
for (String term : targetWords) { for (String term : targetWords) {
if (term == null || term.isBlank()) { if (term == null || term.isBlank()) continue;
continue;
}
TextFinder finder = new TextFinder(term, useRegex, wholeWordSearch); TextFinder finder = new TextFinder(term, useRegex, wholeWordSearch);
finder.setStartPage(pageIndex + 1); finder.setStartPage(pageIndex + 1);
finder.setEndPage(pageIndex + 1); finder.setEndPage(pageIndex + 1);
finder.getText(document); finder.getText(document);
List<PDFText> foundTexts = finder.getFoundTexts(); for (PDFText text : finder.getFoundTexts()) {
for (PDFText ft : foundTexts) { if (text.getPageIndex() == pageIndex) {
if (ft.getPageIndex() == pageIndex) {
return true; return true;
} }
} }
if (!foundTexts.isEmpty()) {}
} }
return false; return false;
} catch (Exception e) { } catch (Exception e) {
return true; return true;
} }
@ -297,18 +292,13 @@ public class RedactionService {
boolean useRegex, boolean useRegex,
boolean wholeWordSearch) { boolean wholeWordSearch) {
try { try {
int idx = -1; for (int pageIndex = 0; pageIndex < document.getNumberOfPages(); pageIndex++) {
final int numberOfPages = document.getNumberOfPages();
for (int i = 0; i < numberOfPages; i++) {
idx++;
if (pageStillContainsTargets( if (pageStillContainsTargets(
document, idx, targetWords, useRegex, wholeWordSearch)) { document, pageIndex, targetWords, useRegex, wholeWordSearch)) {
return true; return true;
} }
} }
return false; return false;
} catch (Exception e) { } catch (Exception e) {
return true; return true;
} }
@ -352,12 +342,11 @@ public class RedactionService {
for (List<PDFText> pageTexts : allFoundTextsByPage.values()) { for (List<PDFText> pageTexts : allFoundTextsByPage.values()) {
allFoundTexts.addAll(pageTexts); allFoundTexts.addAll(pageTexts);
} }
if (!allFoundTexts.isEmpty()) { if (!allFoundTexts.isEmpty() && !isTextRemovalMode) {
if (!isTextRemovalMode) {
Color redactColor = decodeOrDefault(colorString); Color redactColor = decodeOrDefault(colorString);
redactFoundText(document, allFoundTexts, customPadding, redactColor); redactFoundText(document, allFoundTexts, customPadding, redactColor);
} }
}
if (Boolean.TRUE.equals(convertToImage)) { if (Boolean.TRUE.equals(convertToImage)) {
try (PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document)) { try (PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document)) {
ByteArrayOutputStream baos = new ByteArrayOutputStream(); ByteArrayOutputStream baos = new ByteArrayOutputStream();
@ -597,18 +586,11 @@ public class RedactionService {
private static boolean isTextSafeForRedaction(String text) { private static boolean isTextSafeForRedaction(String text) {
if (text == null || text.isEmpty()) return true; if (text == null || text.isEmpty()) return true;
for (int i = 0; i < text.length(); i++) { for (char c : text.toCharArray()) {
char c = text.charAt(i); if (c >= 65488 || (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r')) {
int codePoint = c;
if (codePoint >= 65488) {
return false;
}
if (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r') {
return false; return false;
} }
} }
return true; return true;
} }
@ -657,56 +639,33 @@ public class RedactionService {
return wipeAllSemanticTextInTokens(tokens, true); return wipeAllSemanticTextInTokens(tokens, true);
} }
public byte[] performVisualRedactionWithOcrRestoration( private static String normalizeTextForRedaction(String text) {
RedactPdfRequest request, if (text == null) return null;
String[] listOfText,
boolean useRegex, StringBuilder normalized = new StringBuilder();
boolean wholeWordSearch) for (int i = 0; i < text.length(); i++) {
throws IOException { char c = text.charAt(i);
PDDocument visualRedactedDoc = null;
try { if (c >= 65488) {
visualRedactedDoc = pdfDocumentFactory.load(request.getFileInput()); normalized.append(' ');
Map<Integer, List<PDFText>> allFound = } else if (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r') {
findTextToRedact(visualRedactedDoc, listOfText, useRegex, wholeWordSearch); normalized.append(' ');
String effectiveColor = } else {
(request.getRedactColor() == null || request.getRedactColor().isBlank()) normalized.append(c);
? "#000000"
: request.getRedactColor();
byte[] visualRedactedBytes =
finalizeRedaction(
visualRedactedDoc,
allFound,
effectiveColor,
request.getCustomPadding(),
true,
false);
return performOcrRestoration(visualRedactedBytes, request);
} catch (Exception e) {
throw new IOException(
"Visual redaction with OCR restoration failed: " + e.getMessage(), e);
} finally {
if (visualRedactedDoc != null) {
try {
visualRedactedDoc.close();
} catch (IOException ignore) {
}
}
} }
} }
private byte[] performOcrRestoration(byte[] redactedPdfBytes, RedactPdfRequest request) return normalized.toString();
throws IOException, InterruptedException {
try (TempFile tempInputFile = new TempFile(tempFileManager, ".pdf");
TempFile tempOutputFile = new TempFile(tempFileManager, ".pdf")) {
java.nio.file.Files.write(tempInputFile.getPath(), redactedPdfBytes);
if (isOcrMyPdfAvailable()) {
return processWithOcrMyPdfForRestoration(
tempInputFile.getPath(), tempOutputFile.getPath(), request);
} else if (isTesseractAvailable()) {
return processWithTesseractForRestoration(
tempInputFile.getPath(), tempOutputFile.getPath(), request);
} }
return redactedPdfBytes;
private static boolean isOcrMyPdfAvailable() {
try {
ProcessExecutorResult result =
ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
.runCommandWithOutputHandling(Arrays.asList("ocrmypdf", "--version"));
return result.getRc() == 0;
} catch (Exception e) {
return false;
} }
} }
@ -780,37 +739,7 @@ public class RedactionService {
} }
} }
private static String normalizeTextForRedaction(String text) { private static boolean isTesseractAvailable() {
if (text == null) return null;
StringBuilder normalized = new StringBuilder();
for (int i = 0; i < text.length(); i++) {
char c = text.charAt(i);
if ((int) c >= 65488) {
normalized.append(' ');
} else if (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r') {
normalized.append(' ');
} else {
normalized.append(c);
}
}
return normalized.toString();
}
private boolean isOcrMyPdfAvailable() {
try {
ProcessExecutorResult result =
ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
.runCommandWithOutputHandling(Arrays.asList("ocrmypdf", "--version"));
return result.getRc() == 0;
} catch (Exception e) {
return false;
}
}
private boolean isTesseractAvailable() {
try { try {
ProcessExecutorResult result = ProcessExecutorResult result =
ProcessExecutor.getInstance(ProcessExecutor.Processes.TESSERACT) ProcessExecutor.getInstance(ProcessExecutor.Processes.TESSERACT)
@ -826,7 +755,7 @@ public class RedactionService {
String fontName = font.getName(); String fontName = font.getName();
if (fontName == null if (fontName == null
|| isProperFontSubset(fontName) || isProperFontSubset(fontName)
|| fontName.toLowerCase().matches(".*(hoepap|temp|generated).*")) { || PATTERN.matcher(fontName.toLowerCase()).matches()) {
return false; return false;
} }
return hasReliableWidthMetrics(font); return hasReliableWidthMetrics(font);
@ -835,6 +764,58 @@ public class RedactionService {
} }
} }
/**
 * Returns a copy of {@code text} in which every ISO control character other than line feed,
 * tab and carriage return is replaced by U+FFFD.
 *
 * @param text the text to sanitize; may be {@code null}
 * @return the sanitized text, or an empty string when {@code text} is {@code null}
 */
private static String sanitizeText(String text) {
    if (text == null) {
        return "";
    }

    StringBuilder cleaned = new StringBuilder();
    for (int pos = 0; pos < text.length(); pos++) {
        char current = text.charAt(pos);
        // Line feed, tab and carriage return are the only control characters kept as-is.
        boolean keptWhitespace = current == '\n' || current == '\t' || current == '\r';
        if (Character.isISOControl(current) && !keptWhitespace) {
            cleaned.append('\uFFFD');
        } else {
            cleaned.append(current);
        }
    }
    return cleaned.toString();
}
/**
 * Runs the {@code ocrmypdf} command line tool on {@code inputPath}, writing to
 * {@code outputPath}, and returns the bytes of the produced file.
 *
 * @param inputPath path of the PDF handed to ocrmypdf
 * @param outputPath path ocrmypdf is told to write; read back after a successful run
 * @param request the originating request (NOTE(review): not read by this method)
 * @return the full contents of {@code outputPath}
 * @throws IOException if ocrmypdf exits with a non-zero return code or the output file cannot
 *     be read
 * @throws InterruptedException declared for the process execution call
 */
private static byte[] processWithOcrMyPdfForRestoration(
        java.nio.file.Path inputPath, java.nio.file.Path outputPath, RedactPdfRequest request)
        throws IOException, InterruptedException {
    // Option/value pairs are kept on one line each; order matches the original invocation.
    List<String> ocrCommand =
            Arrays.asList(
                    "ocrmypdf",
                    "--verbose", "1",
                    "--output-type", "pdf",
                    "--pdf-renderer", "sandwich",
                    "--language", "eng",
                    "--optimize", "0",
                    "--jpeg-quality", "100",
                    "--png-quality", "9",
                    "--force-ocr",
                    "--deskew",
                    "--clean",
                    "--clean-final",
                    inputPath.toString(),
                    outputPath.toString());

    ProcessExecutorResult outcome =
            ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
                    .runCommandWithOutputHandling(ocrCommand);

    if (outcome.getRc() == 0) {
        return java.nio.file.Files.readAllBytes(outputPath);
    }

    throw new IOException(
            "OCRmyPDF restoration failed with return code: "
                    + outcome.getRc()
                    + ". Error: "
                    + outcome.getMessages());
}
private static String createSubsetFontPlaceholder( private static String createSubsetFontPlaceholder(
String originalWord, float targetWidth, PDFont font, float fontSize) { String originalWord, float targetWidth, PDFont font, float fontSize) {
String result = createAlternativePlaceholder(originalWord, targetWidth, font, fontSize); String result = createAlternativePlaceholder(originalWord, targetWidth, font, fontSize);
@ -843,77 +824,144 @@ public class RedactionService {
: " ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1)); : " ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1));
} }
public void performTextReplacementAggressive( private static COSArray buildKerningAdjustedTJArray(
PDDocument document, COSArray originalArray, COSArray redactedArray, TextSegment segment) {
Map<Integer, List<PDFText>> allFoundTextsByPage, try {
String[] listOfText, if (segment == null || segment.getFont() == null || segment.getFontSize() <= 0)
return redactedArray;
COSArray out = new COSArray();
int size = redactedArray.size();
for (int i = 0; i < size; i++) {
COSBase redEl = redactedArray.get(i);
COSBase origEl =
(originalArray != null && i < originalArray.size())
? originalArray.get(i)
: null;
out.add(redEl);
if (redEl instanceof COSString redStr && origEl instanceof COSString origStr) {
String origText = getDecodedString(origStr, segment.getFont());
String modText = getDecodedString(redStr, segment.getFont());
float wOrig =
calculateSafeWidth(origText, segment.getFont(), segment.getFontSize());
float wMod =
calculateSafeWidth(modText, segment.getFont(), segment.getFontSize());
float adjustment = wOrig - wMod;
if (Math.abs(adjustment) > PRECISION_THRESHOLD) {
float kerning = (-adjustment / segment.getFontSize()) * FONT_SCALE_FACTOR;
if (i + 1 < size && redactedArray.get(i + 1) instanceof COSNumber num) {
i++;
float combined = num.floatValue() + kerning;
out.add(new COSFloat(combined));
} else {
out.add(new COSFloat(kerning));
}
}
}
}
return out;
} catch (Exception e) {
return redactedArray;
}
}
private static List<MatchRange> findMatchesInSegments(
List<TextSegment> segments,
Set<String> targetWords,
boolean useRegex, boolean useRegex,
boolean wholeWordSearchBool) { boolean wholeWordSearch) {
if (allFoundTextsByPage.isEmpty()) { List<MatchRange> allMatches = new ArrayList<>();
return; List<Pattern> patterns =
TextFinderUtils.createOptimizedSearchPatterns(
targetWords, useRegex, wholeWordSearch);
log.debug("Searching for {} patterns in {} segments", patterns.size(), segments.size());
int totalMatchesFound = 0;
for (int i = 0; i < segments.size(); i++) {
TextSegment segment = segments.get(i);
String segmentText = segment.getText();
if (segmentText == null || segmentText.isEmpty()) {
log.debug("Skipping empty segment {}", i);
continue;
} }
Set<String> allSearchTerms =
Arrays.stream(listOfText) log.debug("Processing segment {}: '{}'", i, segmentText);
.map(String::trim)
.filter(s -> !s.isEmpty()) if (segment.getFont() != null
.collect(Collectors.toSet()); && !TextEncodingHelper.isTextSegmentRemovable(segment.getFont(), segmentText)) {
this.aggressiveMode = true; log.debug(
this.aggressiveSegMatches = new HashMap<>(); "Skipping segment {} - font not removable: {}",
i,
segment.getFont().getName());
continue;
}
int segmentMatches = 0;
for (Pattern pattern : patterns) {
try { try {
for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) { log.debug(
boolean anyResidual = false; "Matching pattern '{}' against segment text '{}'",
int pageIndex = -1; pattern.pattern(),
for (PDPage page : document.getPages()) { segmentText);
pageIndex++; var matcher = pattern.matcher(segmentText);
try { while (matcher.find()) {
this.aggressiveSegMatches = new HashMap<>(); int matchStart = matcher.start();
List<Object> filtered = int matchEnd = matcher.end();
createTokensWithoutTargetText(
document, log.debug(
page, "Found match in segment {}: positions {}-{}",
allSearchTerms, i,
useRegex, matchStart,
wholeWordSearchBool); matchEnd);
writeFilteredContentStream(document, page, filtered);
boolean residual = if (matchStart >= 0
pageStillContainsTargets( && matchEnd <= segmentText.length()
document, && matchStart < matchEnd) {
pageIndex, String matchedText = segmentText.substring(matchStart, matchEnd);
allSearchTerms, log.debug("Matched text: '{}'", matchedText);
useRegex,
wholeWordSearchBool); allMatches.add(
if (residual) { new MatchRange(
anyResidual = true; segment.getStartPos() + matchStart,
try { segment.getStartPos() + matchEnd));
var sem = wipeAllSemanticTextInTokens(filtered); segmentMatches++;
filtered = sem.tokens; totalMatchesFound++;
PDResources res = page.getResources();
if (res != null) {
wipeAllSemanticTextInProperties(res);
wipeAllTextInXObjects(document, res);
wipeAllTextInPatterns(document, res);
}
writeFilteredContentStream(document, page, filtered);
} catch (Exception ignored) {
} }
} }
} catch (Exception ignored) { } catch (Exception e) {
log.error("Error matching pattern in segment {}: {}", i, e.getMessage());
} }
} }
if (!anyResidual) {
break; if (segmentMatches > 0) {
} log.info("Segment {} had {} matches", i, segmentMatches);
if (!documentStillContainsTargets(
document, allSearchTerms, useRegex, wholeWordSearchBool)) {
break;
} }
} }
} finally {
this.aggressiveMode = false; log.info("Total matches found across all segments: {}", totalMatchesFound);
this.aggressiveSegMatches = null; allMatches.sort(Comparator.comparingInt(MatchRange::getStartPos));
if (allMatches.isEmpty()) {
log.warn("No matches found in segments. This might indicate:");
log.warn("1. Text encoding issues preventing proper extraction");
log.warn("2. Font compatibility issues");
log.warn("3. Search terms not matching extracted text");
log.warn("4. Whole word search filtering out matches");
if (!segments.isEmpty()) {
log.warn("Sample segment text: '{}'", segments.get(0).getText());
log.warn("Target words: {}", targetWords);
log.warn("Use regex: {}, Whole word search: {}", useRegex, wholeWordSearch);
} }
} }
return allMatches;
}
private static float calculateCharacterSumWidth(PDFont font, String text) { private static float calculateCharacterSumWidth(PDFont font, String text) {
float totalWidth = 0f; float totalWidth = 0f;
for (char c : text.toCharArray()) { for (char c : text.toCharArray()) {
@ -1033,19 +1081,29 @@ public class RedactionService {
} }
} }
private static String sanitizeText(String text) { public byte[] performVisualRedactionWithOcrRestoration(
if (text == null) return ""; RedactPdfRequest request,
String[] listOfText,
StringBuilder sanitized = new StringBuilder(); boolean useRegex,
for (char c : text.toCharArray()) { boolean wholeWordSearch)
if (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r') { throws IOException {
sanitized.append('\uFFFD'); try (PDDocument doc = pdfDocumentFactory.load(request.getFileInput())) {
} else { Map<Integer, List<PDFText>> allFound =
sanitized.append(c); findTextToRedact(doc, listOfText, useRegex, wholeWordSearch);
byte[] visualRedactedBytes =
finalizeRedaction(
doc,
allFound,
request.getRedactColor(),
request.getCustomPadding(),
true,
false);
return performOcrRestoration(visualRedactedBytes, request);
} catch (Exception e) {
throw new IOException(
"Visual redaction with OCR restoration failed: " + e.getMessage(), e);
} }
} }
return sanitized.toString();
}
private static WipeResult wipeAllSemanticTextInTokens(List<Object> tokens, boolean removeTU) { private static WipeResult wipeAllSemanticTextInTokens(List<Object> tokens, boolean removeTU) {
if (tokens == null || tokens.isEmpty()) { if (tokens == null || tokens.isEmpty()) {
@ -1064,43 +1122,21 @@ public class RedactionService {
return res; return res;
} }
private byte[] processWithOcrMyPdfForRestoration( private byte[] performOcrRestoration(byte[] redactedPdfBytes, RedactPdfRequest request)
java.nio.file.Path inputPath, java.nio.file.Path outputPath, RedactPdfRequest request)
throws IOException, InterruptedException { throws IOException, InterruptedException {
List<String> command = try (TempFile tempInputFile = new TempFile(tempFileManager, ".pdf");
Arrays.asList( TempFile tempOutputFile = new TempFile(tempFileManager, ".pdf")) {
"ocrmypdf", java.nio.file.Files.write(tempInputFile.getPath(), redactedPdfBytes);
"--verbose",
"1", if (isOcrMyPdfAvailable()) {
"--output-type", return processWithOcrMyPdfForRestoration(
"pdf", tempInputFile.getPath(), tempOutputFile.getPath(), request);
"--pdf-renderer", } else if (isTesseractAvailable()) {
"sandwich", return processWithTesseractForRestoration(
"--language", tempInputFile.getPath(), tempOutputFile.getPath(), request);
"eng", }
"--optimize", return redactedPdfBytes;
"0",
"--jpeg-quality",
"100",
"--png-quality",
"9",
"--force-ocr",
"--deskew",
"--clean",
"--clean-final",
inputPath.toString(),
outputPath.toString());
ProcessExecutorResult result =
ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
.runCommandWithOutputHandling(command);
if (result.getRc() != 0) {
throw new IOException(
"OCRmyPDF restoration failed with return code: "
+ result.getRc()
+ ". Error: "
+ result.getMessages());
} }
return java.nio.file.Files.readAllBytes(outputPath);
} }
private static boolean removeSemanticProperties(COSDictionary dict, boolean removeTU) { private static boolean removeSemanticProperties(COSDictionary dict, boolean removeTU) {
@ -1427,59 +1463,62 @@ public class RedactionService {
} }
} }
private int getOriginalTokenCount(PDPage page) { public void performTextReplacementAggressive(
PDDocument document,
Map<Integer, List<PDFText>> allFoundTextsByPage,
String[] listOfText,
boolean useRegex,
boolean wholeWordSearchBool) {
if (allFoundTextsByPage.isEmpty()) return;
Set<String> allSearchTerms =
Arrays.stream(listOfText)
.map(String::trim)
.filter(s -> !s.isEmpty())
.collect(Collectors.toSet());
this.aggressiveMode = true;
this.aggressiveSegMatches = new HashMap<>();
try { try {
PDFStreamParser parser = new PDFStreamParser(page); for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) {
int count = 0; boolean anyResidual = false;
while (parser.parseNextToken() != null) {
count++;
}
return count;
} catch (Exception e) {
return 0;
}
}
private COSArray buildKerningAdjustedTJArray( for (int pageIndex = 0; pageIndex < document.getNumberOfPages(); pageIndex++) {
COSArray originalArray, COSArray redactedArray, TextSegment segment) { PDPage page = document.getPages().get(pageIndex);
try { try {
if (segment == null || segment.getFont() == null || segment.getFontSize() <= 0) this.aggressiveSegMatches = new HashMap<>();
return redactedArray; List<Object> filtered =
createTokensWithoutTargetText(
document,
page,
allSearchTerms,
useRegex,
wholeWordSearchBool);
writeFilteredContentStream(document, page, filtered);
COSArray out = new COSArray(); if (pageStillContainsTargets(
int size = redactedArray.size(); document,
for (int i = 0; i < size; i++) { pageIndex,
COSBase redEl = redactedArray.get(i); allSearchTerms,
COSBase origEl = useRegex,
(originalArray != null && i < originalArray.size()) wholeWordSearchBool)) {
? originalArray.get(i) anyResidual = true;
: null; processResidualText(document, page, filtered);
}
} catch (Exception ignored) {
}
}
out.add(redEl); if (!anyResidual
|| !documentStillContainsTargets(
if (redEl instanceof COSString redStr && origEl instanceof COSString origStr) { document, allSearchTerms, useRegex, wholeWordSearchBool)) {
String origText = getDecodedString(origStr, segment.getFont()); break;
String modText = getDecodedString(redStr, segment.getFont());
float wOrig =
calculateSafeWidth(origText, segment.getFont(), segment.getFontSize());
float wMod =
calculateSafeWidth(modText, segment.getFont(), segment.getFontSize());
float adjustment = wOrig - wMod;
if (Math.abs(adjustment) > PRECISION_THRESHOLD) {
float kerning = (-adjustment / segment.getFontSize()) * FONT_SCALE_FACTOR;
if (i + 1 < size && redactedArray.get(i + 1) instanceof COSNumber num) {
i++;
float combined = num.floatValue() + kerning;
out.add(new COSFloat(combined));
} else {
out.add(new COSFloat(kerning));
} }
} }
} } finally {
} this.aggressiveMode = false;
return out; this.aggressiveSegMatches = null;
} catch (Exception e) {
return redactedArray;
} }
} }
@ -1678,6 +1717,21 @@ public class RedactionService {
return problematicRatio > 0.3; return problematicRatio > 0.3;
} }
/**
 * Best-effort second pass for a page that still contains target text after filtering.
 *
 * <p>Runs {@code wipeAllSemanticTextInTokens} over the already filtered tokens, wipes the
 * page resources (properties, XObjects, patterns) when the page has any, and rewrites the
 * page content stream with the resulting tokens.
 *
 * <p>Failures never propagate to the caller, but they are logged so that an incomplete wipe
 * is visible instead of being silently discarded.
 *
 * @param document the document that owns {@code page}
 * @param page the page whose content stream and resources are wiped
 * @param filtered the token list previously produced for this page
 */
private void processResidualText(PDDocument document, PDPage page, List<Object> filtered) {
    try {
        List<Object> wipedTokens = wipeAllSemanticTextInTokens(filtered).tokens;

        PDResources res = page.getResources();
        if (res != null) {
            wipeAllSemanticTextInProperties(res);
            wipeAllTextInXObjects(document, res);
            wipeAllTextInPatterns(document, res);
        }

        writeFilteredContentStream(document, page, wipedTokens);
    } catch (Exception e) {
        // Still best-effort: do not abort the sweep, but leave a trace of the failure.
        log.warn("Residual text wipe failed for a page: {}", e.getMessage(), e);
    }
}
public boolean performTextReplacement( public boolean performTextReplacement(
PDDocument document, PDDocument document,
Map<Integer, List<PDFText>> allFoundTextsByPage, Map<Integer, List<PDFText>> allFoundTextsByPage,
@ -1688,151 +1742,38 @@ public class RedactionService {
log.info("No text found to redact"); log.info("No text found to redact");
return false; return false;
} }
try {
Set<String> allSearchTerms = Set<String> allSearchTerms =
Arrays.stream(listOfText) Arrays.stream(listOfText)
.map(String::trim) .map(String::trim)
.filter(s -> !s.isEmpty()) .filter(s -> !s.isEmpty())
.collect(Collectors.toSet()); .collect(Collectors.toSet());
log.info( log.info("Starting text replacement with {} search terms", allSearchTerms.size());
"Starting text replacement with {} search terms: {}",
allSearchTerms.size(),
allSearchTerms);
log.info("Total pages in document: {}", document.getNumberOfPages());
log.info("Initial text found on {} pages", allFoundTextsByPage.size());
int initialTotalInstances =
allFoundTextsByPage.values().stream().mapToInt(List::size).sum();
log.info("Total initial instances to redact: {}", initialTotalInstances);
int finalSweepCount = 0;
for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) { for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) {
finalSweepCount = sweep + 1; processPages(document, allSearchTerms, useRegex, wholeWordSearchBool);
log.info("=== Starting sweep {} of {} ===", sweep + 1, MAX_SWEEPS);
int pagesProcessed = 0;
int totalModifications = 0;
for (int pageIndex = 0; pageIndex < document.getNumberOfPages(); pageIndex++) { if (!documentStillContainsTargets(
PDPage page = document.getPages().get(pageIndex); document, allSearchTerms, useRegex, wholeWordSearchBool)) {
List<PDFText> pageFoundTexts =
allFoundTextsByPage.getOrDefault(pageIndex, List.of());
log.debug(
"Processing page {} - found {} instances",
pageIndex + 1,
pageFoundTexts.size());
List<Object> filtered =
createTokensWithoutTargetText(
document, page, allSearchTerms, useRegex, wholeWordSearchBool);
writeFilteredContentStream(document, page, filtered);
int tokenDiff = Math.abs(filtered.size() - getOriginalTokenCount(page));
totalModifications += tokenDiff;
pagesProcessed++;
log.debug("Page {} - token modifications: {}", pageIndex + 1, tokenDiff);
}
log.info(
"Sweep {} completed - processed {} pages, total modifications: {}",
sweep + 1,
pagesProcessed,
totalModifications);
boolean stillContainsTargets =
documentStillContainsTargets(
document, allSearchTerms, useRegex, wholeWordSearchBool);
if (!stillContainsTargets) {
log.info("SUCCESS: All targets removed after {} sweeps", sweep + 1); log.info("SUCCESS: All targets removed after {} sweeps", sweep + 1);
break;
} else {
log.warn(
"WARNING: Still contains targets after sweep {} - continuing...",
sweep + 1);
}
}
boolean finalCheck = false;
for (int verifyAttempt = 0; verifyAttempt < 3; verifyAttempt++) {
log.info("Final verification attempt {} of 3", verifyAttempt + 1);
finalCheck =
documentStillContainsTargets(
document, allSearchTerms, useRegex, wholeWordSearchBool);
if (!finalCheck) {
log.info(
"Verification attempt {} passed - no targets found", verifyAttempt + 1);
break;
} else {
log.warn("Verification attempt {} found remaining targets", verifyAttempt + 1);
if (verifyAttempt < 2) {
log.info("Performing additional cleanup sweep due to verification failure");
for (PDPage page : document.getPages()) {
List<Object> additionalFiltered =
createTokensWithoutTargetText(
document,
page,
allSearchTerms,
useRegex,
wholeWordSearchBool);
writeFilteredContentStream(document, page, additionalFiltered);
}
}
}
}
if (finalCheck) {
log.error(
"FAILURE: Document still contains targets after {} sweeps and {} verification attempts. Falling back to visual redaction with OCR restoration.",
MAX_SWEEPS,
3);
log.error("Remaining search terms: {}", allSearchTerms);
log.error("=== DETAILED FAILURE ANALYSIS ===");
for (int pageIdx = 0; pageIdx < document.getNumberOfPages(); pageIdx++) {
for (String term : allSearchTerms) {
try {
TextFinder finder = new TextFinder(term, useRegex, wholeWordSearchBool);
finder.setStartPage(pageIdx + 1);
finder.setEndPage(pageIdx + 1);
finder.getText(document);
for (PDFText found : finder.getFoundTexts()) {
if (found.getPageIndex() == pageIdx) {
log.error(
"REMAINING: '{}' found on page {} at position ({}, {})",
term,
pageIdx + 1,
found.getX1(),
found.getY1());
}
}
} catch (Exception e) {
log.error(
"Error during failure analysis for term '{}' on page {}: {}",
term,
pageIdx + 1,
e.getMessage());
}
}
}
log.error("=== END FAILURE ANALYSIS ===");
return true;
} else {
log.info(
"SUCCESS: All text redaction completed successfully after {} sweeps",
finalSweepCount);
return false; return false;
} }
} catch (Exception e) {
log.error("Exception during text replacement: {}", e.getMessage(), e);
return true;
} }
// Verification attempts
for (int attempt = 0; attempt < 3; attempt++) {
if (!documentStillContainsTargets(
document, allSearchTerms, useRegex, wholeWordSearchBool)) {
return false;
}
if (attempt < 2) {
processPages(document, allSearchTerms, useRegex, wholeWordSearchBool);
}
}
log.error("FAILURE: Document still contains targets after {} sweeps", MAX_SWEEPS);
return true;
} }
private COSArray createRedactedTJArray( private COSArray createRedactedTJArray(
@ -1917,99 +1858,21 @@ public class RedactionService {
}; };
} }
private List<MatchRange> findMatchesInSegments( private void processPages(
List<TextSegment> segments, PDDocument document,
Set<String> targetWords, Set<String> allSearchTerms,
boolean useRegex, boolean useRegex,
boolean wholeWordSearch) { boolean wholeWordSearchBool) {
List<MatchRange> allMatches = new ArrayList<>(); for (PDPage page : document.getPages()) {
List<Pattern> patterns =
TextFinderUtils.createOptimizedSearchPatterns(
targetWords, useRegex, wholeWordSearch);
log.debug("Searching for {} patterns in {} segments", patterns.size(), segments.size());
int totalMatchesFound = 0;
for (int i = 0; i < segments.size(); i++) {
TextSegment segment = segments.get(i);
String segmentText = segment.getText();
if (segmentText == null || segmentText.isEmpty()) {
log.debug("Skipping empty segment {}", i);
continue;
}
log.debug("Processing segment {}: '{}'", i, segmentText);
if (segment.getFont() != null
&& !TextEncodingHelper.isTextSegmentRemovable(segment.getFont(), segmentText)) {
log.debug(
"Skipping segment {} - font not removable: {}",
i,
segment.getFont().getName());
continue;
}
int segmentMatches = 0;
for (Pattern pattern : patterns) {
try { try {
log.debug( List<Object> filtered =
"Matching pattern '{}' against segment text '{}'", createTokensWithoutTargetText(
pattern.pattern(), document, page, allSearchTerms, useRegex, wholeWordSearchBool);
segmentText); writeFilteredContentStream(document, page, filtered);
var matcher = pattern.matcher(segmentText);
while (matcher.find()) {
int matchStart = matcher.start();
int matchEnd = matcher.end();
log.debug(
"Found match in segment {}: positions {}-{}",
i,
matchStart,
matchEnd);
if (matchStart >= 0
&& matchEnd <= segmentText.length()
&& matchStart < matchEnd) {
String matchedText = segmentText.substring(matchStart, matchEnd);
log.debug("Matched text: '{}'", matchedText);
allMatches.add(
new MatchRange(
segment.getStartPos() + matchStart,
segment.getStartPos() + matchEnd));
segmentMatches++;
totalMatchesFound++;
}
}
} catch (Exception e) { } catch (Exception e) {
log.error("Error matching pattern in segment {}: {}", i, e.getMessage()); log.warn("Error processing page: {}", e.getMessage());
} }
} }
if (segmentMatches > 0) {
log.info("Segment {} had {} matches", i, segmentMatches);
}
}
log.info("Total matches found across all segments: {}", totalMatchesFound);
allMatches.sort(Comparator.comparingInt(MatchRange::getStartPos));
if (allMatches.isEmpty()) {
log.warn("No matches found in segments. This might indicate:");
log.warn("1. Text encoding issues preventing proper extraction");
log.warn("2. Font compatibility issues");
log.warn("3. Search terms not matching extracted text");
log.warn("4. Whole word search filtering out matches");
if (!segments.isEmpty()) {
log.warn("Sample segment text: '{}'", segments.get(0).getText());
log.warn("Target words: {}", targetWords);
log.warn("Use regex: {}, Whole word search: {}", useRegex, wholeWordSearch);
}
}
return allMatches;
} }
private String createSafeReplacement(String originalPart, TextSegment segment) { private String createSafeReplacement(String originalPart, TextSegment segment) {
@ -2962,9 +2825,9 @@ public class RedactionService {
@Data @Data
public static class DecodedMapping { public static class DecodedMapping {
public String text; private String text;
public int[] charByteStart; private int[] charByteStart;
public int[] charByteEnd; private int[] charByteEnd;
} }
@Data @Data

View File

@ -5,10 +5,17 @@ import org.apache.pdfbox.pdmodel.font.PDFont;
import lombok.experimental.UtilityClass; import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import java.util.regex.Pattern;
@Slf4j @Slf4j
@UtilityClass @UtilityClass
public class TextEncodingHelper { public class TextEncodingHelper {
private final Pattern PATTERN = Pattern.compile("^[A-Z]+$");
private final Pattern REGEX = Pattern.compile("^[A-Z]{6}\\+.*");
private final Pattern REGEXP = Pattern.compile("^[A-Z]{5}\\+.*");
private final Pattern PATTERN1 = Pattern.compile("^[A-Z]{4}\\+.*");
public boolean canEncodeCharacters(PDFont font, String text) { public boolean canEncodeCharacters(PDFont font, String text) {
if (font == null || text == null) { if (font == null || text == null) {
return false; return false;
@ -421,21 +428,21 @@ public class TextEncodingHelper {
return false; return false;
} }
if (fontName.matches("^[A-Z]{6}\\+.*")) { if (REGEX.matcher(fontName).matches()) {
return true; return true;
} }
if (fontName.matches("^[A-Z]{5}\\+.*")) { if (REGEXP.matcher(fontName).matches()) {
return true; return true;
} }
if (fontName.matches("^[A-Z]{4}\\+.*")) { if (PATTERN1.matcher(fontName).matches()) {
return true; return true;
} }
if (fontName.contains("+")) { if (fontName.contains("+")) {
String prefix = fontName.split("\\+")[0]; String prefix = fontName.split("\\+")[0];
if (prefix.matches("^[A-Z]+$") && prefix.length() >= 4) { if (PATTERN.matcher(prefix).matches() && prefix.length() >= 4) {
return true; return true;
} }
} }
@ -510,68 +517,4 @@ public class TextEncodingHelper {
return false; return false;
} }
/**
 * Probes whether the given font can encode at least one character.
 *
 * <p>A fixed list of common characters and short words is tried first; after that, every 100th
 * UTF-16 code unit from 0 up to 0xFFFF is tried as a one-character string. The method stops at
 * the first probe for which {@code font.encode} returns a non-empty byte array.
 *
 * @param font the font to probe; may be {@code null}
 * @return {@code true} as soon as one probe yields a non-empty encoding; {@code false} when
 *     {@code font} is {@code null} or no probe produces any bytes
 */
public boolean canEncodeAnyCharacter(PDFont font) {
    if (font == null) {
        return false;
    }

    // Every probe is listed exactly once: re-running a probe that already failed cannot change
    // the outcome (assuming encode is deterministic for a given font and input).
    String[] probes = {
        "a", "A", "0", " ", ".", "!", "e", "i", "o", "u", "n", "t", "r", "s", "l", "1", "2",
        "3", "4", "5", "6", "7", "8", "9", ",", ";", ":", "?", "(", ")", "[", "]",
        "{", "}", "hello", "test", "sample", "abc", "123", "ABC"
    };

    for (String probe : probes) {
        try {
            byte[] encoded = font.encode(probe);
            if (encoded.length > 0) {
                return true;
            }
        } catch (Exception ignored) {
            // Best-effort probe: a failure only means this probe does not count; keep trying.
        }
    }

    // Coarse sweep over the 16-bit code unit range, sampling every 100th value.
    for (int code = 0; code <= 0xFFFF; code += 100) {
        try {
            String probe = String.valueOf((char) code);
            byte[] encoded = font.encode(probe);
            if (encoded.length > 0) {
                return true;
            }
        } catch (Exception ignored) {
            // Best-effort probe: unencodable samples are expected and simply skipped.
        }
    }

    return false;
}
/**
 * Decides whether a font looks usable by running up to three independent checks in order:
 * a non-blank name, {@code canCalculateBasicWidths}, then {@code canEncodeAnyCharacter}.
 *
 * <p>Each check is best-effort: an exception thrown by one check is ignored and the next check
 * is attempted.
 *
 * @param font the font to inspect; may be {@code null}
 * @return {@code true} if any check succeeds; {@code false} for a {@code null} font or when
 *     every check fails or throws
 */
public boolean isValidFont(PDFont font) {
    if (font == null) {
        return false;
    }

    // Check 1: the font reports a name that is not blank after trimming.
    boolean hasName = false;
    try {
        String fontName = font.getName();
        hasName = fontName != null && !fontName.trim().isEmpty();
    } catch (Exception ignored) {
        // Fall through to the next check.
    }
    if (hasName) {
        return true;
    }

    // Check 2: basic width calculation is possible.
    boolean hasWidths = false;
    try {
        hasWidths = canCalculateBasicWidths(font);
    } catch (Exception ignored) {
        // Fall through to the next check.
    }
    if (hasWidths) {
        return true;
    }

    // Check 3: at least one character can be encoded.
    boolean encodesSomething = false;
    try {
        encodesSomething = canEncodeAnyCharacter(font);
    } catch (Exception ignored) {
        // No further checks remain; the font is reported as not valid.
    }
    return encodesSomething;
}
} }

View File

@ -80,10 +80,6 @@ public class WidthCalculator {
Float charWidth = Float charWidth =
calculateSingleCharacterWidth(font, character, fontSize, codePoint); calculateSingleCharacterWidth(font, character, fontSize, codePoint);
if (charWidth == null) {
return null;
}
totalWidth += charWidth; totalWidth += charWidth;
if (previousCodePoint != -1) { if (previousCodePoint != -1) {
totalWidth += calculateKerning(font, previousCodePoint, codePoint, fontSize); totalWidth += calculateKerning(font, previousCodePoint, codePoint, fontSize);
@ -203,9 +199,6 @@ public class WidthCalculator {
Float charWidth = Float charWidth =
calculateGlyphWidthComprehensively(font, character, codePoint, fontSize); calculateGlyphWidthComprehensively(font, character, codePoint, fontSize);
if (charWidth == null) {
return null;
}
totalWidth += charWidth; totalWidth += charWidth;
i += Character.charCount(codePoint); i += Character.charCount(codePoint);
@ -514,64 +507,4 @@ public class WidthCalculator {
return false; return false;
} }
/**
 * Estimates a lower bound for the rendered width of {@code text}.
 *
 * <p>When {@code calculateAccurateWidth} yields a positive value, 80% of it is returned.
 * Otherwise (non-positive result or an exception) the estimate is
 * {@code text.length() * fontSize * 0.3}.
 *
 * @param font the font used for measuring
 * @param text the text to measure
 * @param fontSize the font size
 * @return the lower-bound width, or {@code 0} when {@code font} or {@code text} is
 *     {@code null}, {@code text} is empty, or {@code fontSize <= 0}
 */
public float calculateMinimumTextWidth(PDFont font, String text, float fontSize) {
    if (font == null || text == null) {
        return 0;
    }
    if (text.isEmpty() || fontSize <= 0) {
        return 0;
    }

    float measured;
    try {
        measured = calculateAccurateWidth(font, text, fontSize);
    } catch (Exception ignored) {
        // A failed measurement is treated like "no usable width" and uses the heuristic below.
        measured = 0;
    }

    return measured > 0 ? measured * 0.8f : text.length() * fontSize * 0.3f;
}
/**
 * Estimates an upper bound for the rendered width of {@code text}.
 *
 * <p>When {@code calculateAccurateWidth} yields a positive value, 120% of it is returned.
 * Otherwise (non-positive result or an exception) the estimate is
 * {@code text.length() * fontSize * 1.0}.
 *
 * @param font the font used for measuring
 * @param text the text to measure
 * @param fontSize the font size
 * @return the upper-bound width, or {@code 0} when {@code font} or {@code text} is
 *     {@code null}, {@code text} is empty, or {@code fontSize <= 0}
 */
public float calculateMaximumTextWidth(PDFont font, String text, float fontSize) {
    if (font == null || text == null) {
        return 0;
    }
    if (text.isEmpty() || fontSize <= 0) {
        return 0;
    }

    float measured;
    try {
        measured = calculateAccurateWidth(font, text, fontSize);
    } catch (Exception ignored) {
        // A failed measurement is treated like "no usable width" and uses the heuristic below.
        measured = 0;
    }

    return measured > 0 ? measured * 1.2f : text.length() * fontSize * 1.0f;
}
/**
 * Reports whether a width can be obtained for {@code text} with {@code font}.
 *
 * <p>For non-empty text the method tries {@code calculateDirectWidth} and, if that gives no
 * result or throws, {@code calculateCharacterByCharacterWidth}, both at a probe size of 12.
 *
 * <p>NOTE(review): every path with non-null arguments ends in {@code true}, so the two
 * attempts never influence the returned value — confirm whether a final {@code false} was
 * intended.
 *
 * @param font the font to measure with
 * @param text the text to measure
 * @return {@code false} only when {@code font} or {@code text} is {@code null}; otherwise
 *     {@code true}
 */
public boolean canCalculateWidthForText(PDFont font, String text) {
    boolean missingInput = font == null || text == null;
    if (missingInput) {
        return false;
    }

    if (!text.isEmpty()) {
        // Attempt 1: whole-string measurement.
        try {
            Float direct = calculateDirectWidth(font, text, 12f);
            if (direct != null) {
                return true;
            }
        } catch (Exception ignored) {
            // Fall through to the per-character attempt.
        }

        // Attempt 2: per-character measurement.
        try {
            Float perCharacter = calculateCharacterByCharacterWidth(font, text, 12f);
            if (perCharacter != null) {
                return true;
            }
        } catch (Exception ignored) {
            // Fall through to the unconditional result below.
        }
    }

    return true;
}
} }

View File

@ -13,20 +13,7 @@
color: #6c757d !important; color: #6c757d !important;
} }
.btn-primary:focus { .btn-primary:focus, .form-check-input:focus, .form-control:focus, .form-select:focus {
box-shadow: 0 0 0 0.2rem rgba(13, 110, 253, 0.25);
outline: 2px solid #0d6efd;
outline-offset: 2px;
}
/* Focus indicator for .form-check-input: a translucent blue shadow ring combined with a
   solid 2px blue outline drawn 2px away from the element's edge. */
.form-check-input:focus {
box-shadow: 0 0 0 0.2rem rgba(13, 110, 253, 0.25);
outline: 2px solid #0d6efd;
outline-offset: 2px;
}
.form-control:focus, .form-select:focus {
border-color: #0d6efd;
box-shadow: 0 0 0 0.2rem rgba(13, 110, 253, 0.25); box-shadow: 0 0 0 0.2rem rgba(13, 110, 253, 0.25);
outline: 2px solid #0d6efd; outline: 2px solid #0d6efd;
outline-offset: 2px; outline-offset: 2px;
@ -36,20 +23,6 @@
background-color: #0d6efd; background-color: #0d6efd;
border-color: #0d6efd; border-color: #0d6efd;
} }
/* Collapses the element to a clipped, absolutely positioned 1px box with no padding or border,
   so it takes up no visible space while staying in the document.
   NOTE(review): presumably meant for screen-reader-only text, judging by the class name — confirm. */
.sr-only {
position: absolute;
width: 1px;
height: 1px;
padding: 0;
margin: -1px;
overflow: hidden;
clip: rect(0, 0, 0, 0);
white-space: nowrap;
border: 0;
}
</style> </style>
</head> </head>