mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-09-08 17:51:20 +02:00
refactor redaction services to improve resource management and streamline text processing
Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
parent
f236505cae
commit
3ac7f0df4c
@ -30,51 +30,39 @@ class AggressiveRedactionService implements RedactionModeStrategy {
|
|||||||
boolean useRegex = Boolean.TRUE.equals(request.getUseRegex());
|
boolean useRegex = Boolean.TRUE.equals(request.getUseRegex());
|
||||||
boolean wholeWord = Boolean.TRUE.equals(request.getWholeWordSearch());
|
boolean wholeWord = Boolean.TRUE.equals(request.getWholeWordSearch());
|
||||||
|
|
||||||
PDDocument doc = null;
|
try (PDDocument doc = pdfDocumentFactory.load(request.getFileInput())) {
|
||||||
PDDocument fb = null;
|
|
||||||
try {
|
|
||||||
doc = pdfDocumentFactory.load(request.getFileInput());
|
|
||||||
Map<Integer, List<PDFText>> allFound =
|
Map<Integer, List<PDFText>> allFound =
|
||||||
RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord);
|
RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord);
|
||||||
if (allFound.isEmpty()) {
|
if (allFound.isEmpty()) {
|
||||||
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
|
return toByteArray(doc);
|
||||||
doc.save(baos);
|
|
||||||
return baos.toByteArray();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
helper.performTextReplacementAggressive(doc, allFound, listOfText, useRegex, wholeWord);
|
helper.performTextReplacementAggressive(doc, allFound, listOfText, useRegex, wholeWord);
|
||||||
Map<Integer, List<PDFText>> residual =
|
Map<Integer, List<PDFText>> residual =
|
||||||
RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord);
|
RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord);
|
||||||
boolean residualExists = residual.values().stream().mapToInt(List::size).sum() > 0;
|
boolean residualExists = residual.values().stream().mapToInt(List::size).sum() > 0;
|
||||||
String effectiveColor =
|
|
||||||
(request.getRedactColor() == null || request.getRedactColor().isBlank())
|
|
||||||
? "#000000"
|
|
||||||
: request.getRedactColor();
|
|
||||||
if (residualExists) {
|
if (residualExists) {
|
||||||
// Use the new visual redaction with OCR restoration fallback
|
|
||||||
return helper.performVisualRedactionWithOcrRestoration(
|
return helper.performVisualRedactionWithOcrRestoration(
|
||||||
request, listOfText, useRegex, wholeWord);
|
request, listOfText, useRegex, wholeWord);
|
||||||
}
|
}
|
||||||
|
|
||||||
return RedactionService.finalizeRedaction(
|
return RedactionService.finalizeRedaction(
|
||||||
doc,
|
doc,
|
||||||
allFound,
|
allFound,
|
||||||
request.getRedactColor(),
|
request.getRedactColor(),
|
||||||
request.getCustomPadding(),
|
request.getCustomPadding(),
|
||||||
request.getConvertPDFToImage(), /*text removal*/
|
request.getConvertPDFToImage(),
|
||||||
true);
|
true);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
throw new IOException("Aggressive redaction failed: " + e.getMessage(), e);
|
throw new IOException("Aggressive redaction failed: " + e.getMessage(), e);
|
||||||
} finally {
|
|
||||||
if (doc != null)
|
|
||||||
try {
|
|
||||||
doc.close();
|
|
||||||
} catch (IOException ignore) {
|
|
||||||
}
|
|
||||||
if (fb != null)
|
|
||||||
try {
|
|
||||||
fb.close();
|
|
||||||
} catch (IOException ignore) {
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private byte[] toByteArray(PDDocument doc) throws IOException {
|
||||||
|
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
|
||||||
|
doc.save(baos);
|
||||||
|
return baos.toByteArray();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -29,49 +29,36 @@ class ModerateRedactionService implements RedactionModeStrategy {
|
|||||||
boolean useRegex = Boolean.TRUE.equals(request.getUseRegex());
|
boolean useRegex = Boolean.TRUE.equals(request.getUseRegex());
|
||||||
boolean wholeWord = Boolean.TRUE.equals(request.getWholeWordSearch());
|
boolean wholeWord = Boolean.TRUE.equals(request.getWholeWordSearch());
|
||||||
|
|
||||||
PDDocument doc = null;
|
try (PDDocument doc = pdfDocumentFactory.load(request.getFileInput())) {
|
||||||
PDDocument fallback = null;
|
|
||||||
try {
|
|
||||||
doc = pdfDocumentFactory.load(request.getFileInput());
|
|
||||||
Map<Integer, List<PDFText>> allFound =
|
Map<Integer, List<PDFText>> allFound =
|
||||||
RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord);
|
RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord);
|
||||||
if (allFound.isEmpty()) {
|
if (allFound.isEmpty()) {
|
||||||
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
|
return toByteArray(doc);
|
||||||
doc.save(baos);
|
|
||||||
return baos.toByteArray();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
boolean fallbackToBoxOnly =
|
boolean fallbackToBoxOnly =
|
||||||
helper.performTextReplacement(doc, allFound, listOfText, useRegex, wholeWord);
|
helper.performTextReplacement(doc, allFound, listOfText, useRegex, wholeWord);
|
||||||
String effectiveColor =
|
|
||||||
(request.getRedactColor() == null || request.getRedactColor().isBlank())
|
|
||||||
? "#000000"
|
|
||||||
: request.getRedactColor();
|
|
||||||
if (fallbackToBoxOnly) {
|
if (fallbackToBoxOnly) {
|
||||||
// Use the new visual redaction with OCR restoration fallback
|
|
||||||
return helper.performVisualRedactionWithOcrRestoration(
|
return helper.performVisualRedactionWithOcrRestoration(
|
||||||
request, listOfText, useRegex, wholeWord);
|
request, listOfText, useRegex, wholeWord);
|
||||||
}
|
}
|
||||||
|
|
||||||
return RedactionService.finalizeRedaction(
|
return RedactionService.finalizeRedaction(
|
||||||
doc,
|
doc,
|
||||||
allFound,
|
allFound,
|
||||||
effectiveColor,
|
request.getRedactColor(),
|
||||||
request.getCustomPadding(),
|
request.getCustomPadding(),
|
||||||
request.getConvertPDFToImage(),
|
request.getConvertPDFToImage(),
|
||||||
false);
|
false);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
throw new IOException("Moderate redaction failed: " + e.getMessage(), e);
|
throw new IOException("Moderate redaction failed: " + e.getMessage(), e);
|
||||||
} finally {
|
|
||||||
if (doc != null)
|
|
||||||
try {
|
|
||||||
doc.close();
|
|
||||||
} catch (IOException ignore) {
|
|
||||||
}
|
|
||||||
if (fallback != null)
|
|
||||||
try {
|
|
||||||
fallback.close();
|
|
||||||
} catch (IOException ignore) {
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private byte[] toByteArray(PDDocument doc) throws IOException {
|
||||||
|
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
|
||||||
|
doc.save(baos);
|
||||||
|
return baos.toByteArray();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -81,6 +81,7 @@ public class RedactionService {
|
|||||||
private static final Set<String> TEXT_SHOWING_OPERATORS = Set.of("Tj", "TJ", "'", "\"");
|
private static final Set<String> TEXT_SHOWING_OPERATORS = Set.of("Tj", "TJ", "'", "\"");
|
||||||
private static final COSString EMPTY_COS_STRING = new COSString("");
|
private static final COSString EMPTY_COS_STRING = new COSString("");
|
||||||
private static final int MAX_SWEEPS = 3;
|
private static final int MAX_SWEEPS = 3;
|
||||||
|
private static final Pattern PATTERN = Pattern.compile(".*(hoepap|temp|generated).*");
|
||||||
private boolean aggressiveMode = false;
|
private boolean aggressiveMode = false;
|
||||||
private Map<Integer, List<AggressiveSegMatch>> aggressiveSegMatches = null;
|
private Map<Integer, List<AggressiveSegMatch>> aggressiveSegMatches = null;
|
||||||
private final CustomPDFDocumentFactory pdfDocumentFactory;
|
private final CustomPDFDocumentFactory pdfDocumentFactory;
|
||||||
@ -266,26 +267,20 @@ public class RedactionService {
|
|||||||
boolean wholeWordSearch) {
|
boolean wholeWordSearch) {
|
||||||
try {
|
try {
|
||||||
for (String term : targetWords) {
|
for (String term : targetWords) {
|
||||||
if (term == null || term.isBlank()) {
|
if (term == null || term.isBlank()) continue;
|
||||||
continue;
|
|
||||||
}
|
|
||||||
TextFinder finder = new TextFinder(term, useRegex, wholeWordSearch);
|
TextFinder finder = new TextFinder(term, useRegex, wholeWordSearch);
|
||||||
finder.setStartPage(pageIndex + 1);
|
finder.setStartPage(pageIndex + 1);
|
||||||
finder.setEndPage(pageIndex + 1);
|
finder.setEndPage(pageIndex + 1);
|
||||||
finder.getText(document);
|
finder.getText(document);
|
||||||
|
|
||||||
List<PDFText> foundTexts = finder.getFoundTexts();
|
for (PDFText text : finder.getFoundTexts()) {
|
||||||
for (PDFText ft : foundTexts) {
|
if (text.getPageIndex() == pageIndex) {
|
||||||
if (ft.getPageIndex() == pageIndex) {
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!foundTexts.isEmpty()) {}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -297,18 +292,13 @@ public class RedactionService {
|
|||||||
boolean useRegex,
|
boolean useRegex,
|
||||||
boolean wholeWordSearch) {
|
boolean wholeWordSearch) {
|
||||||
try {
|
try {
|
||||||
int idx = -1;
|
for (int pageIndex = 0; pageIndex < document.getNumberOfPages(); pageIndex++) {
|
||||||
final int numberOfPages = document.getNumberOfPages();
|
|
||||||
for (int i = 0; i < numberOfPages; i++) {
|
|
||||||
idx++;
|
|
||||||
if (pageStillContainsTargets(
|
if (pageStillContainsTargets(
|
||||||
document, idx, targetWords, useRegex, wholeWordSearch)) {
|
document, pageIndex, targetWords, useRegex, wholeWordSearch)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -352,12 +342,11 @@ public class RedactionService {
|
|||||||
for (List<PDFText> pageTexts : allFoundTextsByPage.values()) {
|
for (List<PDFText> pageTexts : allFoundTextsByPage.values()) {
|
||||||
allFoundTexts.addAll(pageTexts);
|
allFoundTexts.addAll(pageTexts);
|
||||||
}
|
}
|
||||||
if (!allFoundTexts.isEmpty()) {
|
if (!allFoundTexts.isEmpty() && !isTextRemovalMode) {
|
||||||
if (!isTextRemovalMode) {
|
|
||||||
Color redactColor = decodeOrDefault(colorString);
|
Color redactColor = decodeOrDefault(colorString);
|
||||||
redactFoundText(document, allFoundTexts, customPadding, redactColor);
|
redactFoundText(document, allFoundTexts, customPadding, redactColor);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
if (Boolean.TRUE.equals(convertToImage)) {
|
if (Boolean.TRUE.equals(convertToImage)) {
|
||||||
try (PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document)) {
|
try (PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document)) {
|
||||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||||
@ -597,18 +586,11 @@ public class RedactionService {
|
|||||||
private static boolean isTextSafeForRedaction(String text) {
|
private static boolean isTextSafeForRedaction(String text) {
|
||||||
if (text == null || text.isEmpty()) return true;
|
if (text == null || text.isEmpty()) return true;
|
||||||
|
|
||||||
for (int i = 0; i < text.length(); i++) {
|
for (char c : text.toCharArray()) {
|
||||||
char c = text.charAt(i);
|
if (c >= 65488 || (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r')) {
|
||||||
int codePoint = c;
|
|
||||||
|
|
||||||
if (codePoint >= 65488) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r') {
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -657,56 +639,33 @@ public class RedactionService {
|
|||||||
return wipeAllSemanticTextInTokens(tokens, true);
|
return wipeAllSemanticTextInTokens(tokens, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
public byte[] performVisualRedactionWithOcrRestoration(
|
private static String normalizeTextForRedaction(String text) {
|
||||||
RedactPdfRequest request,
|
if (text == null) return null;
|
||||||
String[] listOfText,
|
|
||||||
boolean useRegex,
|
StringBuilder normalized = new StringBuilder();
|
||||||
boolean wholeWordSearch)
|
for (int i = 0; i < text.length(); i++) {
|
||||||
throws IOException {
|
char c = text.charAt(i);
|
||||||
PDDocument visualRedactedDoc = null;
|
|
||||||
try {
|
if (c >= 65488) {
|
||||||
visualRedactedDoc = pdfDocumentFactory.load(request.getFileInput());
|
normalized.append(' ');
|
||||||
Map<Integer, List<PDFText>> allFound =
|
} else if (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r') {
|
||||||
findTextToRedact(visualRedactedDoc, listOfText, useRegex, wholeWordSearch);
|
normalized.append(' ');
|
||||||
String effectiveColor =
|
} else {
|
||||||
(request.getRedactColor() == null || request.getRedactColor().isBlank())
|
normalized.append(c);
|
||||||
? "#000000"
|
|
||||||
: request.getRedactColor();
|
|
||||||
byte[] visualRedactedBytes =
|
|
||||||
finalizeRedaction(
|
|
||||||
visualRedactedDoc,
|
|
||||||
allFound,
|
|
||||||
effectiveColor,
|
|
||||||
request.getCustomPadding(),
|
|
||||||
true,
|
|
||||||
false);
|
|
||||||
return performOcrRestoration(visualRedactedBytes, request);
|
|
||||||
} catch (Exception e) {
|
|
||||||
throw new IOException(
|
|
||||||
"Visual redaction with OCR restoration failed: " + e.getMessage(), e);
|
|
||||||
} finally {
|
|
||||||
if (visualRedactedDoc != null) {
|
|
||||||
try {
|
|
||||||
visualRedactedDoc.close();
|
|
||||||
} catch (IOException ignore) {
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private byte[] performOcrRestoration(byte[] redactedPdfBytes, RedactPdfRequest request)
|
return normalized.toString();
|
||||||
throws IOException, InterruptedException {
|
|
||||||
try (TempFile tempInputFile = new TempFile(tempFileManager, ".pdf");
|
|
||||||
TempFile tempOutputFile = new TempFile(tempFileManager, ".pdf")) {
|
|
||||||
java.nio.file.Files.write(tempInputFile.getPath(), redactedPdfBytes);
|
|
||||||
if (isOcrMyPdfAvailable()) {
|
|
||||||
return processWithOcrMyPdfForRestoration(
|
|
||||||
tempInputFile.getPath(), tempOutputFile.getPath(), request);
|
|
||||||
} else if (isTesseractAvailable()) {
|
|
||||||
return processWithTesseractForRestoration(
|
|
||||||
tempInputFile.getPath(), tempOutputFile.getPath(), request);
|
|
||||||
}
|
}
|
||||||
return redactedPdfBytes;
|
|
||||||
|
private static boolean isOcrMyPdfAvailable() {
|
||||||
|
try {
|
||||||
|
ProcessExecutorResult result =
|
||||||
|
ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
|
||||||
|
.runCommandWithOutputHandling(Arrays.asList("ocrmypdf", "--version"));
|
||||||
|
return result.getRc() == 0;
|
||||||
|
} catch (Exception e) {
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -780,37 +739,7 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String normalizeTextForRedaction(String text) {
|
private static boolean isTesseractAvailable() {
|
||||||
if (text == null) return null;
|
|
||||||
|
|
||||||
StringBuilder normalized = new StringBuilder();
|
|
||||||
for (int i = 0; i < text.length(); i++) {
|
|
||||||
char c = text.charAt(i);
|
|
||||||
|
|
||||||
if ((int) c >= 65488) {
|
|
||||||
normalized.append(' ');
|
|
||||||
} else if (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r') {
|
|
||||||
normalized.append(' ');
|
|
||||||
} else {
|
|
||||||
normalized.append(c);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return normalized.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean isOcrMyPdfAvailable() {
|
|
||||||
try {
|
|
||||||
ProcessExecutorResult result =
|
|
||||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
|
|
||||||
.runCommandWithOutputHandling(Arrays.asList("ocrmypdf", "--version"));
|
|
||||||
return result.getRc() == 0;
|
|
||||||
} catch (Exception e) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean isTesseractAvailable() {
|
|
||||||
try {
|
try {
|
||||||
ProcessExecutorResult result =
|
ProcessExecutorResult result =
|
||||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.TESSERACT)
|
ProcessExecutor.getInstance(ProcessExecutor.Processes.TESSERACT)
|
||||||
@ -826,7 +755,7 @@ public class RedactionService {
|
|||||||
String fontName = font.getName();
|
String fontName = font.getName();
|
||||||
if (fontName == null
|
if (fontName == null
|
||||||
|| isProperFontSubset(fontName)
|
|| isProperFontSubset(fontName)
|
||||||
|| fontName.toLowerCase().matches(".*(hoepap|temp|generated).*")) {
|
|| PATTERN.matcher(fontName.toLowerCase()).matches()) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return hasReliableWidthMetrics(font);
|
return hasReliableWidthMetrics(font);
|
||||||
@ -835,6 +764,58 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static String sanitizeText(String text) {
|
||||||
|
if (text == null) return "";
|
||||||
|
|
||||||
|
StringBuilder sanitized = new StringBuilder();
|
||||||
|
for (char c : text.toCharArray()) {
|
||||||
|
sanitized.append(
|
||||||
|
(Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r')
|
||||||
|
? '\uFFFD'
|
||||||
|
: c);
|
||||||
|
}
|
||||||
|
return sanitized.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
private static byte[] processWithOcrMyPdfForRestoration(
|
||||||
|
java.nio.file.Path inputPath, java.nio.file.Path outputPath, RedactPdfRequest request)
|
||||||
|
throws IOException, InterruptedException {
|
||||||
|
List<String> command =
|
||||||
|
Arrays.asList(
|
||||||
|
"ocrmypdf",
|
||||||
|
"--verbose",
|
||||||
|
"1",
|
||||||
|
"--output-type",
|
||||||
|
"pdf",
|
||||||
|
"--pdf-renderer",
|
||||||
|
"sandwich",
|
||||||
|
"--language",
|
||||||
|
"eng",
|
||||||
|
"--optimize",
|
||||||
|
"0",
|
||||||
|
"--jpeg-quality",
|
||||||
|
"100",
|
||||||
|
"--png-quality",
|
||||||
|
"9",
|
||||||
|
"--force-ocr",
|
||||||
|
"--deskew",
|
||||||
|
"--clean",
|
||||||
|
"--clean-final",
|
||||||
|
inputPath.toString(),
|
||||||
|
outputPath.toString());
|
||||||
|
ProcessExecutorResult result =
|
||||||
|
ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
|
||||||
|
.runCommandWithOutputHandling(command);
|
||||||
|
if (result.getRc() != 0) {
|
||||||
|
throw new IOException(
|
||||||
|
"OCRmyPDF restoration failed with return code: "
|
||||||
|
+ result.getRc()
|
||||||
|
+ ". Error: "
|
||||||
|
+ result.getMessages());
|
||||||
|
}
|
||||||
|
return java.nio.file.Files.readAllBytes(outputPath);
|
||||||
|
}
|
||||||
|
|
||||||
private static String createSubsetFontPlaceholder(
|
private static String createSubsetFontPlaceholder(
|
||||||
String originalWord, float targetWidth, PDFont font, float fontSize) {
|
String originalWord, float targetWidth, PDFont font, float fontSize) {
|
||||||
String result = createAlternativePlaceholder(originalWord, targetWidth, font, fontSize);
|
String result = createAlternativePlaceholder(originalWord, targetWidth, font, fontSize);
|
||||||
@ -843,77 +824,144 @@ public class RedactionService {
|
|||||||
: " ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1));
|
: " ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1));
|
||||||
}
|
}
|
||||||
|
|
||||||
public void performTextReplacementAggressive(
|
private static COSArray buildKerningAdjustedTJArray(
|
||||||
PDDocument document,
|
COSArray originalArray, COSArray redactedArray, TextSegment segment) {
|
||||||
Map<Integer, List<PDFText>> allFoundTextsByPage,
|
try {
|
||||||
String[] listOfText,
|
if (segment == null || segment.getFont() == null || segment.getFontSize() <= 0)
|
||||||
|
return redactedArray;
|
||||||
|
|
||||||
|
COSArray out = new COSArray();
|
||||||
|
int size = redactedArray.size();
|
||||||
|
for (int i = 0; i < size; i++) {
|
||||||
|
COSBase redEl = redactedArray.get(i);
|
||||||
|
COSBase origEl =
|
||||||
|
(originalArray != null && i < originalArray.size())
|
||||||
|
? originalArray.get(i)
|
||||||
|
: null;
|
||||||
|
|
||||||
|
out.add(redEl);
|
||||||
|
|
||||||
|
if (redEl instanceof COSString redStr && origEl instanceof COSString origStr) {
|
||||||
|
String origText = getDecodedString(origStr, segment.getFont());
|
||||||
|
String modText = getDecodedString(redStr, segment.getFont());
|
||||||
|
float wOrig =
|
||||||
|
calculateSafeWidth(origText, segment.getFont(), segment.getFontSize());
|
||||||
|
float wMod =
|
||||||
|
calculateSafeWidth(modText, segment.getFont(), segment.getFontSize());
|
||||||
|
float adjustment = wOrig - wMod;
|
||||||
|
if (Math.abs(adjustment) > PRECISION_THRESHOLD) {
|
||||||
|
float kerning = (-adjustment / segment.getFontSize()) * FONT_SCALE_FACTOR;
|
||||||
|
if (i + 1 < size && redactedArray.get(i + 1) instanceof COSNumber num) {
|
||||||
|
i++;
|
||||||
|
float combined = num.floatValue() + kerning;
|
||||||
|
out.add(new COSFloat(combined));
|
||||||
|
} else {
|
||||||
|
out.add(new COSFloat(kerning));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out;
|
||||||
|
} catch (Exception e) {
|
||||||
|
return redactedArray;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static List<MatchRange> findMatchesInSegments(
|
||||||
|
List<TextSegment> segments,
|
||||||
|
Set<String> targetWords,
|
||||||
boolean useRegex,
|
boolean useRegex,
|
||||||
boolean wholeWordSearchBool) {
|
boolean wholeWordSearch) {
|
||||||
if (allFoundTextsByPage.isEmpty()) {
|
List<MatchRange> allMatches = new ArrayList<>();
|
||||||
return;
|
List<Pattern> patterns =
|
||||||
|
TextFinderUtils.createOptimizedSearchPatterns(
|
||||||
|
targetWords, useRegex, wholeWordSearch);
|
||||||
|
|
||||||
|
log.debug("Searching for {} patterns in {} segments", patterns.size(), segments.size());
|
||||||
|
|
||||||
|
int totalMatchesFound = 0;
|
||||||
|
|
||||||
|
for (int i = 0; i < segments.size(); i++) {
|
||||||
|
TextSegment segment = segments.get(i);
|
||||||
|
String segmentText = segment.getText();
|
||||||
|
if (segmentText == null || segmentText.isEmpty()) {
|
||||||
|
log.debug("Skipping empty segment {}", i);
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
Set<String> allSearchTerms =
|
|
||||||
Arrays.stream(listOfText)
|
log.debug("Processing segment {}: '{}'", i, segmentText);
|
||||||
.map(String::trim)
|
|
||||||
.filter(s -> !s.isEmpty())
|
if (segment.getFont() != null
|
||||||
.collect(Collectors.toSet());
|
&& !TextEncodingHelper.isTextSegmentRemovable(segment.getFont(), segmentText)) {
|
||||||
this.aggressiveMode = true;
|
log.debug(
|
||||||
this.aggressiveSegMatches = new HashMap<>();
|
"Skipping segment {} - font not removable: {}",
|
||||||
|
i,
|
||||||
|
segment.getFont().getName());
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
int segmentMatches = 0;
|
||||||
|
for (Pattern pattern : patterns) {
|
||||||
try {
|
try {
|
||||||
for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) {
|
log.debug(
|
||||||
boolean anyResidual = false;
|
"Matching pattern '{}' against segment text '{}'",
|
||||||
int pageIndex = -1;
|
pattern.pattern(),
|
||||||
for (PDPage page : document.getPages()) {
|
segmentText);
|
||||||
pageIndex++;
|
var matcher = pattern.matcher(segmentText);
|
||||||
try {
|
while (matcher.find()) {
|
||||||
this.aggressiveSegMatches = new HashMap<>();
|
int matchStart = matcher.start();
|
||||||
List<Object> filtered =
|
int matchEnd = matcher.end();
|
||||||
createTokensWithoutTargetText(
|
|
||||||
document,
|
log.debug(
|
||||||
page,
|
"Found match in segment {}: positions {}-{}",
|
||||||
allSearchTerms,
|
i,
|
||||||
useRegex,
|
matchStart,
|
||||||
wholeWordSearchBool);
|
matchEnd);
|
||||||
writeFilteredContentStream(document, page, filtered);
|
|
||||||
boolean residual =
|
if (matchStart >= 0
|
||||||
pageStillContainsTargets(
|
&& matchEnd <= segmentText.length()
|
||||||
document,
|
&& matchStart < matchEnd) {
|
||||||
pageIndex,
|
String matchedText = segmentText.substring(matchStart, matchEnd);
|
||||||
allSearchTerms,
|
log.debug("Matched text: '{}'", matchedText);
|
||||||
useRegex,
|
|
||||||
wholeWordSearchBool);
|
allMatches.add(
|
||||||
if (residual) {
|
new MatchRange(
|
||||||
anyResidual = true;
|
segment.getStartPos() + matchStart,
|
||||||
try {
|
segment.getStartPos() + matchEnd));
|
||||||
var sem = wipeAllSemanticTextInTokens(filtered);
|
segmentMatches++;
|
||||||
filtered = sem.tokens;
|
totalMatchesFound++;
|
||||||
PDResources res = page.getResources();
|
|
||||||
if (res != null) {
|
|
||||||
wipeAllSemanticTextInProperties(res);
|
|
||||||
wipeAllTextInXObjects(document, res);
|
|
||||||
wipeAllTextInPatterns(document, res);
|
|
||||||
}
|
|
||||||
writeFilteredContentStream(document, page, filtered);
|
|
||||||
} catch (Exception ignored) {
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (Exception ignored) {
|
} catch (Exception e) {
|
||||||
|
log.error("Error matching pattern in segment {}: {}", i, e.getMessage());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!anyResidual) {
|
|
||||||
break;
|
if (segmentMatches > 0) {
|
||||||
}
|
log.info("Segment {} had {} matches", i, segmentMatches);
|
||||||
if (!documentStillContainsTargets(
|
|
||||||
document, allSearchTerms, useRegex, wholeWordSearchBool)) {
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} finally {
|
|
||||||
this.aggressiveMode = false;
|
log.info("Total matches found across all segments: {}", totalMatchesFound);
|
||||||
this.aggressiveSegMatches = null;
|
allMatches.sort(Comparator.comparingInt(MatchRange::getStartPos));
|
||||||
|
|
||||||
|
if (allMatches.isEmpty()) {
|
||||||
|
log.warn("No matches found in segments. This might indicate:");
|
||||||
|
log.warn("1. Text encoding issues preventing proper extraction");
|
||||||
|
log.warn("2. Font compatibility issues");
|
||||||
|
log.warn("3. Search terms not matching extracted text");
|
||||||
|
log.warn("4. Whole word search filtering out matches");
|
||||||
|
|
||||||
|
if (!segments.isEmpty()) {
|
||||||
|
log.warn("Sample segment text: '{}'", segments.get(0).getText());
|
||||||
|
log.warn("Target words: {}", targetWords);
|
||||||
|
log.warn("Use regex: {}, Whole word search: {}", useRegex, wholeWordSearch);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return allMatches;
|
||||||
|
}
|
||||||
|
|
||||||
private static float calculateCharacterSumWidth(PDFont font, String text) {
|
private static float calculateCharacterSumWidth(PDFont font, String text) {
|
||||||
float totalWidth = 0f;
|
float totalWidth = 0f;
|
||||||
for (char c : text.toCharArray()) {
|
for (char c : text.toCharArray()) {
|
||||||
@ -1033,19 +1081,29 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String sanitizeText(String text) {
|
public byte[] performVisualRedactionWithOcrRestoration(
|
||||||
if (text == null) return "";
|
RedactPdfRequest request,
|
||||||
|
String[] listOfText,
|
||||||
StringBuilder sanitized = new StringBuilder();
|
boolean useRegex,
|
||||||
for (char c : text.toCharArray()) {
|
boolean wholeWordSearch)
|
||||||
if (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r') {
|
throws IOException {
|
||||||
sanitized.append('\uFFFD');
|
try (PDDocument doc = pdfDocumentFactory.load(request.getFileInput())) {
|
||||||
} else {
|
Map<Integer, List<PDFText>> allFound =
|
||||||
sanitized.append(c);
|
findTextToRedact(doc, listOfText, useRegex, wholeWordSearch);
|
||||||
|
byte[] visualRedactedBytes =
|
||||||
|
finalizeRedaction(
|
||||||
|
doc,
|
||||||
|
allFound,
|
||||||
|
request.getRedactColor(),
|
||||||
|
request.getCustomPadding(),
|
||||||
|
true,
|
||||||
|
false);
|
||||||
|
return performOcrRestoration(visualRedactedBytes, request);
|
||||||
|
} catch (Exception e) {
|
||||||
|
throw new IOException(
|
||||||
|
"Visual redaction with OCR restoration failed: " + e.getMessage(), e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return sanitized.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
private static WipeResult wipeAllSemanticTextInTokens(List<Object> tokens, boolean removeTU) {
|
private static WipeResult wipeAllSemanticTextInTokens(List<Object> tokens, boolean removeTU) {
|
||||||
if (tokens == null || tokens.isEmpty()) {
|
if (tokens == null || tokens.isEmpty()) {
|
||||||
@ -1064,43 +1122,21 @@ public class RedactionService {
|
|||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
private byte[] processWithOcrMyPdfForRestoration(
|
private byte[] performOcrRestoration(byte[] redactedPdfBytes, RedactPdfRequest request)
|
||||||
java.nio.file.Path inputPath, java.nio.file.Path outputPath, RedactPdfRequest request)
|
|
||||||
throws IOException, InterruptedException {
|
throws IOException, InterruptedException {
|
||||||
List<String> command =
|
try (TempFile tempInputFile = new TempFile(tempFileManager, ".pdf");
|
||||||
Arrays.asList(
|
TempFile tempOutputFile = new TempFile(tempFileManager, ".pdf")) {
|
||||||
"ocrmypdf",
|
java.nio.file.Files.write(tempInputFile.getPath(), redactedPdfBytes);
|
||||||
"--verbose",
|
|
||||||
"1",
|
if (isOcrMyPdfAvailable()) {
|
||||||
"--output-type",
|
return processWithOcrMyPdfForRestoration(
|
||||||
"pdf",
|
tempInputFile.getPath(), tempOutputFile.getPath(), request);
|
||||||
"--pdf-renderer",
|
} else if (isTesseractAvailable()) {
|
||||||
"sandwich",
|
return processWithTesseractForRestoration(
|
||||||
"--language",
|
tempInputFile.getPath(), tempOutputFile.getPath(), request);
|
||||||
"eng",
|
}
|
||||||
"--optimize",
|
return redactedPdfBytes;
|
||||||
"0",
|
|
||||||
"--jpeg-quality",
|
|
||||||
"100",
|
|
||||||
"--png-quality",
|
|
||||||
"9",
|
|
||||||
"--force-ocr",
|
|
||||||
"--deskew",
|
|
||||||
"--clean",
|
|
||||||
"--clean-final",
|
|
||||||
inputPath.toString(),
|
|
||||||
outputPath.toString());
|
|
||||||
ProcessExecutorResult result =
|
|
||||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
|
|
||||||
.runCommandWithOutputHandling(command);
|
|
||||||
if (result.getRc() != 0) {
|
|
||||||
throw new IOException(
|
|
||||||
"OCRmyPDF restoration failed with return code: "
|
|
||||||
+ result.getRc()
|
|
||||||
+ ". Error: "
|
|
||||||
+ result.getMessages());
|
|
||||||
}
|
}
|
||||||
return java.nio.file.Files.readAllBytes(outputPath);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static boolean removeSemanticProperties(COSDictionary dict, boolean removeTU) {
|
private static boolean removeSemanticProperties(COSDictionary dict, boolean removeTU) {
|
||||||
@ -1427,59 +1463,62 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private int getOriginalTokenCount(PDPage page) {
|
public void performTextReplacementAggressive(
|
||||||
|
PDDocument document,
|
||||||
|
Map<Integer, List<PDFText>> allFoundTextsByPage,
|
||||||
|
String[] listOfText,
|
||||||
|
boolean useRegex,
|
||||||
|
boolean wholeWordSearchBool) {
|
||||||
|
if (allFoundTextsByPage.isEmpty()) return;
|
||||||
|
|
||||||
|
Set<String> allSearchTerms =
|
||||||
|
Arrays.stream(listOfText)
|
||||||
|
.map(String::trim)
|
||||||
|
.filter(s -> !s.isEmpty())
|
||||||
|
.collect(Collectors.toSet());
|
||||||
|
|
||||||
|
this.aggressiveMode = true;
|
||||||
|
this.aggressiveSegMatches = new HashMap<>();
|
||||||
|
|
||||||
try {
|
try {
|
||||||
PDFStreamParser parser = new PDFStreamParser(page);
|
for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) {
|
||||||
int count = 0;
|
boolean anyResidual = false;
|
||||||
while (parser.parseNextToken() != null) {
|
|
||||||
count++;
|
|
||||||
}
|
|
||||||
return count;
|
|
||||||
} catch (Exception e) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private COSArray buildKerningAdjustedTJArray(
|
for (int pageIndex = 0; pageIndex < document.getNumberOfPages(); pageIndex++) {
|
||||||
COSArray originalArray, COSArray redactedArray, TextSegment segment) {
|
PDPage page = document.getPages().get(pageIndex);
|
||||||
try {
|
try {
|
||||||
if (segment == null || segment.getFont() == null || segment.getFontSize() <= 0)
|
this.aggressiveSegMatches = new HashMap<>();
|
||||||
return redactedArray;
|
List<Object> filtered =
|
||||||
|
createTokensWithoutTargetText(
|
||||||
|
document,
|
||||||
|
page,
|
||||||
|
allSearchTerms,
|
||||||
|
useRegex,
|
||||||
|
wholeWordSearchBool);
|
||||||
|
writeFilteredContentStream(document, page, filtered);
|
||||||
|
|
||||||
COSArray out = new COSArray();
|
if (pageStillContainsTargets(
|
||||||
int size = redactedArray.size();
|
document,
|
||||||
for (int i = 0; i < size; i++) {
|
pageIndex,
|
||||||
COSBase redEl = redactedArray.get(i);
|
allSearchTerms,
|
||||||
COSBase origEl =
|
useRegex,
|
||||||
(originalArray != null && i < originalArray.size())
|
wholeWordSearchBool)) {
|
||||||
? originalArray.get(i)
|
anyResidual = true;
|
||||||
: null;
|
processResidualText(document, page, filtered);
|
||||||
|
}
|
||||||
|
} catch (Exception ignored) {
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
out.add(redEl);
|
if (!anyResidual
|
||||||
|
|| !documentStillContainsTargets(
|
||||||
if (redEl instanceof COSString redStr && origEl instanceof COSString origStr) {
|
document, allSearchTerms, useRegex, wholeWordSearchBool)) {
|
||||||
String origText = getDecodedString(origStr, segment.getFont());
|
break;
|
||||||
String modText = getDecodedString(redStr, segment.getFont());
|
|
||||||
float wOrig =
|
|
||||||
calculateSafeWidth(origText, segment.getFont(), segment.getFontSize());
|
|
||||||
float wMod =
|
|
||||||
calculateSafeWidth(modText, segment.getFont(), segment.getFontSize());
|
|
||||||
float adjustment = wOrig - wMod;
|
|
||||||
if (Math.abs(adjustment) > PRECISION_THRESHOLD) {
|
|
||||||
float kerning = (-adjustment / segment.getFontSize()) * FONT_SCALE_FACTOR;
|
|
||||||
if (i + 1 < size && redactedArray.get(i + 1) instanceof COSNumber num) {
|
|
||||||
i++;
|
|
||||||
float combined = num.floatValue() + kerning;
|
|
||||||
out.add(new COSFloat(combined));
|
|
||||||
} else {
|
|
||||||
out.add(new COSFloat(kerning));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
} finally {
|
||||||
}
|
this.aggressiveMode = false;
|
||||||
return out;
|
this.aggressiveSegMatches = null;
|
||||||
} catch (Exception e) {
|
|
||||||
return redactedArray;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1678,6 +1717,21 @@ public class RedactionService {
|
|||||||
return problematicRatio > 0.3;
|
return problematicRatio > 0.3;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void processResidualText(PDDocument document, PDPage page, List<Object> filtered) {
|
||||||
|
try {
|
||||||
|
var sem = wipeAllSemanticTextInTokens(filtered);
|
||||||
|
filtered = sem.tokens;
|
||||||
|
PDResources res = page.getResources();
|
||||||
|
if (res != null) {
|
||||||
|
wipeAllSemanticTextInProperties(res);
|
||||||
|
wipeAllTextInXObjects(document, res);
|
||||||
|
wipeAllTextInPatterns(document, res);
|
||||||
|
}
|
||||||
|
writeFilteredContentStream(document, page, filtered);
|
||||||
|
} catch (Exception ignored) {
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public boolean performTextReplacement(
|
public boolean performTextReplacement(
|
||||||
PDDocument document,
|
PDDocument document,
|
||||||
Map<Integer, List<PDFText>> allFoundTextsByPage,
|
Map<Integer, List<PDFText>> allFoundTextsByPage,
|
||||||
@ -1688,151 +1742,38 @@ public class RedactionService {
|
|||||||
log.info("No text found to redact");
|
log.info("No text found to redact");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
try {
|
|
||||||
Set<String> allSearchTerms =
|
Set<String> allSearchTerms =
|
||||||
Arrays.stream(listOfText)
|
Arrays.stream(listOfText)
|
||||||
.map(String::trim)
|
.map(String::trim)
|
||||||
.filter(s -> !s.isEmpty())
|
.filter(s -> !s.isEmpty())
|
||||||
.collect(Collectors.toSet());
|
.collect(Collectors.toSet());
|
||||||
|
|
||||||
log.info(
|
log.info("Starting text replacement with {} search terms", allSearchTerms.size());
|
||||||
"Starting text replacement with {} search terms: {}",
|
|
||||||
allSearchTerms.size(),
|
|
||||||
allSearchTerms);
|
|
||||||
log.info("Total pages in document: {}", document.getNumberOfPages());
|
|
||||||
log.info("Initial text found on {} pages", allFoundTextsByPage.size());
|
|
||||||
|
|
||||||
int initialTotalInstances =
|
|
||||||
allFoundTextsByPage.values().stream().mapToInt(List::size).sum();
|
|
||||||
log.info("Total initial instances to redact: {}", initialTotalInstances);
|
|
||||||
|
|
||||||
int finalSweepCount = 0;
|
|
||||||
for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) {
|
for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) {
|
||||||
finalSweepCount = sweep + 1;
|
processPages(document, allSearchTerms, useRegex, wholeWordSearchBool);
|
||||||
log.info("=== Starting sweep {} of {} ===", sweep + 1, MAX_SWEEPS);
|
|
||||||
int pagesProcessed = 0;
|
|
||||||
int totalModifications = 0;
|
|
||||||
|
|
||||||
for (int pageIndex = 0; pageIndex < document.getNumberOfPages(); pageIndex++) {
|
if (!documentStillContainsTargets(
|
||||||
PDPage page = document.getPages().get(pageIndex);
|
document, allSearchTerms, useRegex, wholeWordSearchBool)) {
|
||||||
List<PDFText> pageFoundTexts =
|
|
||||||
allFoundTextsByPage.getOrDefault(pageIndex, List.of());
|
|
||||||
|
|
||||||
log.debug(
|
|
||||||
"Processing page {} - found {} instances",
|
|
||||||
pageIndex + 1,
|
|
||||||
pageFoundTexts.size());
|
|
||||||
|
|
||||||
List<Object> filtered =
|
|
||||||
createTokensWithoutTargetText(
|
|
||||||
document, page, allSearchTerms, useRegex, wholeWordSearchBool);
|
|
||||||
writeFilteredContentStream(document, page, filtered);
|
|
||||||
|
|
||||||
int tokenDiff = Math.abs(filtered.size() - getOriginalTokenCount(page));
|
|
||||||
totalModifications += tokenDiff;
|
|
||||||
pagesProcessed++;
|
|
||||||
|
|
||||||
log.debug("Page {} - token modifications: {}", pageIndex + 1, tokenDiff);
|
|
||||||
}
|
|
||||||
|
|
||||||
log.info(
|
|
||||||
"Sweep {} completed - processed {} pages, total modifications: {}",
|
|
||||||
sweep + 1,
|
|
||||||
pagesProcessed,
|
|
||||||
totalModifications);
|
|
||||||
|
|
||||||
boolean stillContainsTargets =
|
|
||||||
documentStillContainsTargets(
|
|
||||||
document, allSearchTerms, useRegex, wholeWordSearchBool);
|
|
||||||
|
|
||||||
if (!stillContainsTargets) {
|
|
||||||
log.info("SUCCESS: All targets removed after {} sweeps", sweep + 1);
|
log.info("SUCCESS: All targets removed after {} sweeps", sweep + 1);
|
||||||
break;
|
|
||||||
} else {
|
|
||||||
log.warn(
|
|
||||||
"WARNING: Still contains targets after sweep {} - continuing...",
|
|
||||||
sweep + 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
boolean finalCheck = false;
|
|
||||||
for (int verifyAttempt = 0; verifyAttempt < 3; verifyAttempt++) {
|
|
||||||
log.info("Final verification attempt {} of 3", verifyAttempt + 1);
|
|
||||||
finalCheck =
|
|
||||||
documentStillContainsTargets(
|
|
||||||
document, allSearchTerms, useRegex, wholeWordSearchBool);
|
|
||||||
|
|
||||||
if (!finalCheck) {
|
|
||||||
log.info(
|
|
||||||
"Verification attempt {} passed - no targets found", verifyAttempt + 1);
|
|
||||||
break;
|
|
||||||
} else {
|
|
||||||
log.warn("Verification attempt {} found remaining targets", verifyAttempt + 1);
|
|
||||||
if (verifyAttempt < 2) {
|
|
||||||
log.info("Performing additional cleanup sweep due to verification failure");
|
|
||||||
for (PDPage page : document.getPages()) {
|
|
||||||
List<Object> additionalFiltered =
|
|
||||||
createTokensWithoutTargetText(
|
|
||||||
document,
|
|
||||||
page,
|
|
||||||
allSearchTerms,
|
|
||||||
useRegex,
|
|
||||||
wholeWordSearchBool);
|
|
||||||
writeFilteredContentStream(document, page, additionalFiltered);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (finalCheck) {
|
|
||||||
log.error(
|
|
||||||
"FAILURE: Document still contains targets after {} sweeps and {} verification attempts. Falling back to visual redaction with OCR restoration.",
|
|
||||||
MAX_SWEEPS,
|
|
||||||
3);
|
|
||||||
log.error("Remaining search terms: {}", allSearchTerms);
|
|
||||||
|
|
||||||
log.error("=== DETAILED FAILURE ANALYSIS ===");
|
|
||||||
for (int pageIdx = 0; pageIdx < document.getNumberOfPages(); pageIdx++) {
|
|
||||||
for (String term : allSearchTerms) {
|
|
||||||
try {
|
|
||||||
TextFinder finder = new TextFinder(term, useRegex, wholeWordSearchBool);
|
|
||||||
finder.setStartPage(pageIdx + 1);
|
|
||||||
finder.setEndPage(pageIdx + 1);
|
|
||||||
finder.getText(document);
|
|
||||||
|
|
||||||
for (PDFText found : finder.getFoundTexts()) {
|
|
||||||
if (found.getPageIndex() == pageIdx) {
|
|
||||||
log.error(
|
|
||||||
"REMAINING: '{}' found on page {} at position ({}, {})",
|
|
||||||
term,
|
|
||||||
pageIdx + 1,
|
|
||||||
found.getX1(),
|
|
||||||
found.getY1());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (Exception e) {
|
|
||||||
log.error(
|
|
||||||
"Error during failure analysis for term '{}' on page {}: {}",
|
|
||||||
term,
|
|
||||||
pageIdx + 1,
|
|
||||||
e.getMessage());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
log.error("=== END FAILURE ANALYSIS ===");
|
|
||||||
|
|
||||||
return true;
|
|
||||||
} else {
|
|
||||||
log.info(
|
|
||||||
"SUCCESS: All text redaction completed successfully after {} sweeps",
|
|
||||||
finalSweepCount);
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
} catch (Exception e) {
|
|
||||||
log.error("Exception during text replacement: {}", e.getMessage(), e);
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Verification attempts
|
||||||
|
for (int attempt = 0; attempt < 3; attempt++) {
|
||||||
|
if (!documentStillContainsTargets(
|
||||||
|
document, allSearchTerms, useRegex, wholeWordSearchBool)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (attempt < 2) {
|
||||||
|
processPages(document, allSearchTerms, useRegex, wholeWordSearchBool);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
log.error("FAILURE: Document still contains targets after {} sweeps", MAX_SWEEPS);
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
private COSArray createRedactedTJArray(
|
private COSArray createRedactedTJArray(
|
||||||
@ -1917,99 +1858,21 @@ public class RedactionService {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<MatchRange> findMatchesInSegments(
|
private void processPages(
|
||||||
List<TextSegment> segments,
|
PDDocument document,
|
||||||
Set<String> targetWords,
|
Set<String> allSearchTerms,
|
||||||
boolean useRegex,
|
boolean useRegex,
|
||||||
boolean wholeWordSearch) {
|
boolean wholeWordSearchBool) {
|
||||||
List<MatchRange> allMatches = new ArrayList<>();
|
for (PDPage page : document.getPages()) {
|
||||||
List<Pattern> patterns =
|
|
||||||
TextFinderUtils.createOptimizedSearchPatterns(
|
|
||||||
targetWords, useRegex, wholeWordSearch);
|
|
||||||
|
|
||||||
log.debug("Searching for {} patterns in {} segments", patterns.size(), segments.size());
|
|
||||||
|
|
||||||
int totalMatchesFound = 0;
|
|
||||||
|
|
||||||
for (int i = 0; i < segments.size(); i++) {
|
|
||||||
TextSegment segment = segments.get(i);
|
|
||||||
String segmentText = segment.getText();
|
|
||||||
if (segmentText == null || segmentText.isEmpty()) {
|
|
||||||
log.debug("Skipping empty segment {}", i);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
log.debug("Processing segment {}: '{}'", i, segmentText);
|
|
||||||
|
|
||||||
if (segment.getFont() != null
|
|
||||||
&& !TextEncodingHelper.isTextSegmentRemovable(segment.getFont(), segmentText)) {
|
|
||||||
log.debug(
|
|
||||||
"Skipping segment {} - font not removable: {}",
|
|
||||||
i,
|
|
||||||
segment.getFont().getName());
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
int segmentMatches = 0;
|
|
||||||
for (Pattern pattern : patterns) {
|
|
||||||
try {
|
try {
|
||||||
log.debug(
|
List<Object> filtered =
|
||||||
"Matching pattern '{}' against segment text '{}'",
|
createTokensWithoutTargetText(
|
||||||
pattern.pattern(),
|
document, page, allSearchTerms, useRegex, wholeWordSearchBool);
|
||||||
segmentText);
|
writeFilteredContentStream(document, page, filtered);
|
||||||
var matcher = pattern.matcher(segmentText);
|
|
||||||
while (matcher.find()) {
|
|
||||||
int matchStart = matcher.start();
|
|
||||||
int matchEnd = matcher.end();
|
|
||||||
|
|
||||||
log.debug(
|
|
||||||
"Found match in segment {}: positions {}-{}",
|
|
||||||
i,
|
|
||||||
matchStart,
|
|
||||||
matchEnd);
|
|
||||||
|
|
||||||
if (matchStart >= 0
|
|
||||||
&& matchEnd <= segmentText.length()
|
|
||||||
&& matchStart < matchEnd) {
|
|
||||||
String matchedText = segmentText.substring(matchStart, matchEnd);
|
|
||||||
log.debug("Matched text: '{}'", matchedText);
|
|
||||||
|
|
||||||
allMatches.add(
|
|
||||||
new MatchRange(
|
|
||||||
segment.getStartPos() + matchStart,
|
|
||||||
segment.getStartPos() + matchEnd));
|
|
||||||
segmentMatches++;
|
|
||||||
totalMatchesFound++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
log.error("Error matching pattern in segment {}: {}", i, e.getMessage());
|
log.warn("Error processing page: {}", e.getMessage());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (segmentMatches > 0) {
|
|
||||||
log.info("Segment {} had {} matches", i, segmentMatches);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
log.info("Total matches found across all segments: {}", totalMatchesFound);
|
|
||||||
allMatches.sort(Comparator.comparingInt(MatchRange::getStartPos));
|
|
||||||
|
|
||||||
if (allMatches.isEmpty()) {
|
|
||||||
log.warn("No matches found in segments. This might indicate:");
|
|
||||||
log.warn("1. Text encoding issues preventing proper extraction");
|
|
||||||
log.warn("2. Font compatibility issues");
|
|
||||||
log.warn("3. Search terms not matching extracted text");
|
|
||||||
log.warn("4. Whole word search filtering out matches");
|
|
||||||
|
|
||||||
if (!segments.isEmpty()) {
|
|
||||||
log.warn("Sample segment text: '{}'", segments.get(0).getText());
|
|
||||||
log.warn("Target words: {}", targetWords);
|
|
||||||
log.warn("Use regex: {}, Whole word search: {}", useRegex, wholeWordSearch);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return allMatches;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private String createSafeReplacement(String originalPart, TextSegment segment) {
|
private String createSafeReplacement(String originalPart, TextSegment segment) {
|
||||||
@ -2962,9 +2825,9 @@ public class RedactionService {
|
|||||||
|
|
||||||
@Data
|
@Data
|
||||||
public static class DecodedMapping {
|
public static class DecodedMapping {
|
||||||
public String text;
|
private String text;
|
||||||
public int[] charByteStart;
|
private int[] charByteStart;
|
||||||
public int[] charByteEnd;
|
private int[] charByteEnd;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Data
|
@Data
|
||||||
|
@ -5,10 +5,17 @@ import org.apache.pdfbox.pdmodel.font.PDFont;
|
|||||||
import lombok.experimental.UtilityClass;
|
import lombok.experimental.UtilityClass;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
@Slf4j
|
@Slf4j
|
||||||
@UtilityClass
|
@UtilityClass
|
||||||
public class TextEncodingHelper {
|
public class TextEncodingHelper {
|
||||||
|
|
||||||
|
private final Pattern PATTERN = Pattern.compile("^[A-Z]+$");
|
||||||
|
private final Pattern REGEX = Pattern.compile("^[A-Z]{6}\\+.*");
|
||||||
|
private final Pattern REGEXP = Pattern.compile("^[A-Z]{5}\\+.*");
|
||||||
|
private final Pattern PATTERN1 = Pattern.compile("^[A-Z]{4}\\+.*");
|
||||||
|
|
||||||
public boolean canEncodeCharacters(PDFont font, String text) {
|
public boolean canEncodeCharacters(PDFont font, String text) {
|
||||||
if (font == null || text == null) {
|
if (font == null || text == null) {
|
||||||
return false;
|
return false;
|
||||||
@ -421,21 +428,21 @@ public class TextEncodingHelper {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (fontName.matches("^[A-Z]{6}\\+.*")) {
|
if (REGEX.matcher(fontName).matches()) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (fontName.matches("^[A-Z]{5}\\+.*")) {
|
if (REGEXP.matcher(fontName).matches()) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (fontName.matches("^[A-Z]{4}\\+.*")) {
|
if (PATTERN1.matcher(fontName).matches()) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (fontName.contains("+")) {
|
if (fontName.contains("+")) {
|
||||||
String prefix = fontName.split("\\+")[0];
|
String prefix = fontName.split("\\+")[0];
|
||||||
if (prefix.matches("^[A-Z]+$") && prefix.length() >= 4) {
|
if (PATTERN.matcher(prefix).matches() && prefix.length() >= 4) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -510,68 +517,4 @@ public class TextEncodingHelper {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean canEncodeAnyCharacter(PDFont font) {
|
|
||||||
if (font == null) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
String[] testStrings = {
|
|
||||||
"a", "A", "0", " ", ".", "!", "e", "i", "o", "u", "n", "t", "r", "s", "l", "1", "2",
|
|
||||||
"3", "4", "5", "6", "7", "8", "9", ",", ".", ";", ":", "?", "!", "(", ")", "[", "]",
|
|
||||||
"{", "}", "hello", "test", "sample", "abc", "123", "ABC"
|
|
||||||
};
|
|
||||||
|
|
||||||
for (String testStr : testStrings) {
|
|
||||||
try {
|
|
||||||
byte[] encoded = font.encode(testStr);
|
|
||||||
if (encoded.length > 0) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
} catch (Exception e) {
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int code = 0; code <= 0xFFFF; code += 100) {
|
|
||||||
try {
|
|
||||||
String testStr = String.valueOf((char) code);
|
|
||||||
byte[] encoded = font.encode(testStr);
|
|
||||||
if (encoded.length > 0) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
} catch (Exception e) {
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean isValidFont(PDFont font) {
|
|
||||||
if (font == null) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
String name = font.getName();
|
|
||||||
if (name != null && !name.trim().isEmpty()) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
} catch (Exception e) {
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
if (canCalculateBasicWidths(font)) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
} catch (Exception e) {
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
if (canEncodeAnyCharacter(font)) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
} catch (Exception e) {
|
|
||||||
}
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -80,10 +80,6 @@ public class WidthCalculator {
|
|||||||
Float charWidth =
|
Float charWidth =
|
||||||
calculateSingleCharacterWidth(font, character, fontSize, codePoint);
|
calculateSingleCharacterWidth(font, character, fontSize, codePoint);
|
||||||
|
|
||||||
if (charWidth == null) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
totalWidth += charWidth;
|
totalWidth += charWidth;
|
||||||
if (previousCodePoint != -1) {
|
if (previousCodePoint != -1) {
|
||||||
totalWidth += calculateKerning(font, previousCodePoint, codePoint, fontSize);
|
totalWidth += calculateKerning(font, previousCodePoint, codePoint, fontSize);
|
||||||
@ -203,9 +199,6 @@ public class WidthCalculator {
|
|||||||
|
|
||||||
Float charWidth =
|
Float charWidth =
|
||||||
calculateGlyphWidthComprehensively(font, character, codePoint, fontSize);
|
calculateGlyphWidthComprehensively(font, character, codePoint, fontSize);
|
||||||
if (charWidth == null) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
totalWidth += charWidth;
|
totalWidth += charWidth;
|
||||||
i += Character.charCount(codePoint);
|
i += Character.charCount(codePoint);
|
||||||
@ -514,64 +507,4 @@ public class WidthCalculator {
|
|||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
public float calculateMinimumTextWidth(PDFont font, String text, float fontSize) {
|
|
||||||
if (font == null || text == null || text.isEmpty() || fontSize <= 0) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
float minWidth = calculateAccurateWidth(font, text, fontSize);
|
|
||||||
if (minWidth > 0) {
|
|
||||||
return minWidth * 0.8f;
|
|
||||||
}
|
|
||||||
} catch (Exception e) {
|
|
||||||
}
|
|
||||||
|
|
||||||
return text.length() * fontSize * 0.3f;
|
|
||||||
}
|
|
||||||
|
|
||||||
public float calculateMaximumTextWidth(PDFont font, String text, float fontSize) {
|
|
||||||
if (font == null || text == null || text.isEmpty() || fontSize <= 0) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
float maxWidth = calculateAccurateWidth(font, text, fontSize);
|
|
||||||
if (maxWidth > 0) {
|
|
||||||
return maxWidth * 1.2f;
|
|
||||||
}
|
|
||||||
} catch (Exception e) {
|
|
||||||
}
|
|
||||||
|
|
||||||
return text.length() * fontSize * 1.0f;
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean canCalculateWidthForText(PDFont font, String text) {
|
|
||||||
if (font == null || text == null) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (text.isEmpty()) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
Float width = calculateDirectWidth(font, text, 12f);
|
|
||||||
if (width != null) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
} catch (Exception e) {
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
Float width = calculateCharacterByCharacterWidth(font, text, 12f);
|
|
||||||
if (width != null) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
} catch (Exception e) {
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -13,20 +13,7 @@
|
|||||||
color: #6c757d !important;
|
color: #6c757d !important;
|
||||||
}
|
}
|
||||||
|
|
||||||
.btn-primary:focus {
|
.btn-primary:focus, .form-check-input:focus, .form-control:focus, .form-select:focus {
|
||||||
box-shadow: 0 0 0 0.2rem rgba(13, 110, 253, 0.25);
|
|
||||||
outline: 2px solid #0d6efd;
|
|
||||||
outline-offset: 2px;
|
|
||||||
}
|
|
||||||
|
|
||||||
.form-check-input:focus {
|
|
||||||
box-shadow: 0 0 0 0.2rem rgba(13, 110, 253, 0.25);
|
|
||||||
outline: 2px solid #0d6efd;
|
|
||||||
outline-offset: 2px;
|
|
||||||
}
|
|
||||||
|
|
||||||
.form-control:focus, .form-select:focus {
|
|
||||||
border-color: #0d6efd;
|
|
||||||
box-shadow: 0 0 0 0.2rem rgba(13, 110, 253, 0.25);
|
box-shadow: 0 0 0 0.2rem rgba(13, 110, 253, 0.25);
|
||||||
outline: 2px solid #0d6efd;
|
outline: 2px solid #0d6efd;
|
||||||
outline-offset: 2px;
|
outline-offset: 2px;
|
||||||
@ -36,20 +23,6 @@
|
|||||||
background-color: #0d6efd;
|
background-color: #0d6efd;
|
||||||
border-color: #0d6efd;
|
border-color: #0d6efd;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
.sr-only {
|
|
||||||
position: absolute;
|
|
||||||
width: 1px;
|
|
||||||
height: 1px;
|
|
||||||
padding: 0;
|
|
||||||
margin: -1px;
|
|
||||||
overflow: hidden;
|
|
||||||
clip: rect(0, 0, 0, 0);
|
|
||||||
white-space: nowrap;
|
|
||||||
border: 0;
|
|
||||||
}
|
|
||||||
</style>
|
</style>
|
||||||
</head>
|
</head>
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user