diff --git a/app/core/src/main/java/stirling/software/SPDF/service/AggressiveRedactionService.java b/app/core/src/main/java/stirling/software/SPDF/service/AggressiveRedactionService.java index 3cca0dfde..bbb549389 100644 --- a/app/core/src/main/java/stirling/software/SPDF/service/AggressiveRedactionService.java +++ b/app/core/src/main/java/stirling/software/SPDF/service/AggressiveRedactionService.java @@ -30,51 +30,39 @@ class AggressiveRedactionService implements RedactionModeStrategy { boolean useRegex = Boolean.TRUE.equals(request.getUseRegex()); boolean wholeWord = Boolean.TRUE.equals(request.getWholeWordSearch()); - PDDocument doc = null; - PDDocument fb = null; - try { - doc = pdfDocumentFactory.load(request.getFileInput()); + try (PDDocument doc = pdfDocumentFactory.load(request.getFileInput())) { Map> allFound = RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord); if (allFound.isEmpty()) { - try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { - doc.save(baos); - return baos.toByteArray(); - } + return toByteArray(doc); } + helper.performTextReplacementAggressive(doc, allFound, listOfText, useRegex, wholeWord); Map> residual = RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord); boolean residualExists = residual.values().stream().mapToInt(List::size).sum() > 0; - String effectiveColor = - (request.getRedactColor() == null || request.getRedactColor().isBlank()) - ? "#000000" - : request.getRedactColor(); + if (residualExists) { - // Use the new visual redaction with OCR restoration fallback return helper.performVisualRedactionWithOcrRestoration( request, listOfText, useRegex, wholeWord); } + return RedactionService.finalizeRedaction( doc, allFound, request.getRedactColor(), request.getCustomPadding(), - request.getConvertPDFToImage(), /*text removal*/ + request.getConvertPDFToImage(), true); } catch (Exception e) { throw new IOException("Aggressive redaction failed: " + e.getMessage(), e); - } finally { - if (doc != null) - try { - doc.close(); - } catch (IOException ignore) { - } - if (fb != null) - try { - fb.close(); - } catch (IOException ignore) { - } + } + } + + private byte[] toByteArray(PDDocument doc) throws IOException { + try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + doc.save(baos); + return baos.toByteArray(); } } } diff --git a/app/core/src/main/java/stirling/software/SPDF/service/ModerateRedactionService.java b/app/core/src/main/java/stirling/software/SPDF/service/ModerateRedactionService.java index 35e1d6907..7f53124a0 100644 --- a/app/core/src/main/java/stirling/software/SPDF/service/ModerateRedactionService.java +++ b/app/core/src/main/java/stirling/software/SPDF/service/ModerateRedactionService.java @@ -29,49 +29,36 @@ class ModerateRedactionService implements RedactionModeStrategy { boolean useRegex = Boolean.TRUE.equals(request.getUseRegex()); boolean wholeWord = Boolean.TRUE.equals(request.getWholeWordSearch()); - PDDocument doc = null; - PDDocument fallback = null; - try { - doc = pdfDocumentFactory.load(request.getFileInput()); + try (PDDocument doc = pdfDocumentFactory.load(request.getFileInput())) { Map> allFound = RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord); if (allFound.isEmpty()) { - try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { - doc.save(baos); - return baos.toByteArray(); - } + return toByteArray(doc); } + boolean fallbackToBoxOnly = helper.performTextReplacement(doc, allFound, listOfText, useRegex, wholeWord); - String effectiveColor = - (request.getRedactColor() == null || request.getRedactColor().isBlank()) - ? "#000000" - : request.getRedactColor(); if (fallbackToBoxOnly) { - // Use the new visual redaction with OCR restoration fallback return helper.performVisualRedactionWithOcrRestoration( request, listOfText, useRegex, wholeWord); } + return RedactionService.finalizeRedaction( doc, allFound, - effectiveColor, + request.getRedactColor(), request.getCustomPadding(), request.getConvertPDFToImage(), false); } catch (Exception e) { throw new IOException("Moderate redaction failed: " + e.getMessage(), e); - } finally { - if (doc != null) - try { - doc.close(); - } catch (IOException ignore) { - } - if (fallback != null) - try { - fallback.close(); - } catch (IOException ignore) { - } + } + } + + private byte[] toByteArray(PDDocument doc) throws IOException { + try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + doc.save(baos); + return baos.toByteArray(); } } } diff --git a/app/core/src/main/java/stirling/software/SPDF/service/RedactionService.java b/app/core/src/main/java/stirling/software/SPDF/service/RedactionService.java index a7dbb73c5..e01b40404 100644 --- a/app/core/src/main/java/stirling/software/SPDF/service/RedactionService.java +++ b/app/core/src/main/java/stirling/software/SPDF/service/RedactionService.java @@ -81,6 +81,7 @@ public class RedactionService { private static final Set TEXT_SHOWING_OPERATORS = Set.of("Tj", "TJ", "'", "\""); private static final COSString EMPTY_COS_STRING = new COSString(""); private static final int MAX_SWEEPS = 3; + private static final Pattern PATTERN = Pattern.compile(".*(hoepap|temp|generated).*"); private boolean aggressiveMode = false; private Map> aggressiveSegMatches = null; private final CustomPDFDocumentFactory pdfDocumentFactory; @@ -266,26 +267,20 @@ public class RedactionService { boolean wholeWordSearch) { try { for (String term : targetWords) { - if (term == null || term.isBlank()) { - continue; - } + if (term == null || term.isBlank()) continue; + TextFinder finder = new TextFinder(term, useRegex, wholeWordSearch); finder.setStartPage(pageIndex + 1); finder.setEndPage(pageIndex + 1); finder.getText(document); - List foundTexts = finder.getFoundTexts(); - for (PDFText ft : foundTexts) { - if (ft.getPageIndex() == pageIndex) { + for (PDFText text : finder.getFoundTexts()) { + if (text.getPageIndex() == pageIndex) { return true; } } - - if (!foundTexts.isEmpty()) {} } - return false; - } catch (Exception e) { return true; } @@ -297,18 +292,13 @@ public class RedactionService { boolean useRegex, boolean wholeWordSearch) { try { - int idx = -1; - final int numberOfPages = document.getNumberOfPages(); - for (int i = 0; i < numberOfPages; i++) { - idx++; + for (int pageIndex = 0; pageIndex < document.getNumberOfPages(); pageIndex++) { if (pageStillContainsTargets( - document, idx, targetWords, useRegex, wholeWordSearch)) { + document, pageIndex, targetWords, useRegex, wholeWordSearch)) { return true; } } - return false; - } catch (Exception e) { return true; } @@ -352,12 +342,11 @@ public class RedactionService { for (List pageTexts : allFoundTextsByPage.values()) { allFoundTexts.addAll(pageTexts); } - if (!allFoundTexts.isEmpty()) { - if (!isTextRemovalMode) { - Color redactColor = decodeOrDefault(colorString); - redactFoundText(document, allFoundTexts, customPadding, redactColor); - } + if (!allFoundTexts.isEmpty() && !isTextRemovalMode) { + Color redactColor = decodeOrDefault(colorString); + redactFoundText(document, allFoundTexts, customPadding, redactColor); } + if (Boolean.TRUE.equals(convertToImage)) { try (PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document)) { ByteArrayOutputStream baos = new ByteArrayOutputStream(); @@ -597,18 +586,11 @@ public class RedactionService { private static boolean isTextSafeForRedaction(String text) { if (text == null || text.isEmpty()) return true; - for (int i = 0; i < text.length(); i++) { - char c = text.charAt(i); - int codePoint = c; - - if (codePoint >= 65488) { - return false; - } - if (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r') { + for (char c : text.toCharArray()) { + if (c >= 65488 || (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r')) { return false; } } - return true; } @@ -657,56 +639,33 @@ public class RedactionService { return wipeAllSemanticTextInTokens(tokens, true); } - public byte[] performVisualRedactionWithOcrRestoration( - RedactPdfRequest request, - String[] listOfText, - boolean useRegex, - boolean wholeWordSearch) - throws IOException { - PDDocument visualRedactedDoc = null; - try { - visualRedactedDoc = pdfDocumentFactory.load(request.getFileInput()); - Map> allFound = - findTextToRedact(visualRedactedDoc, listOfText, useRegex, wholeWordSearch); - String effectiveColor = - (request.getRedactColor() == null || request.getRedactColor().isBlank()) - ? "#000000" - : request.getRedactColor(); - byte[] visualRedactedBytes = - finalizeRedaction( - visualRedactedDoc, - allFound, - effectiveColor, - request.getCustomPadding(), - true, - false); - return performOcrRestoration(visualRedactedBytes, request); - } catch (Exception e) { - throw new IOException( - "Visual redaction with OCR restoration failed: " + e.getMessage(), e); - } finally { - if (visualRedactedDoc != null) { - try { - visualRedactedDoc.close(); - } catch (IOException ignore) { - } + private static String normalizeTextForRedaction(String text) { + if (text == null) return null; + + StringBuilder normalized = new StringBuilder(); + for (int i = 0; i < text.length(); i++) { + char c = text.charAt(i); + + if (c >= 65488) { + normalized.append(' '); + } else if (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r') { + normalized.append(' '); + } else { + normalized.append(c); } } + + return normalized.toString(); } - private byte[] performOcrRestoration(byte[] redactedPdfBytes, RedactPdfRequest request) - throws IOException, InterruptedException { - try (TempFile tempInputFile = new TempFile(tempFileManager, ".pdf"); - TempFile tempOutputFile = new TempFile(tempFileManager, ".pdf")) { - java.nio.file.Files.write(tempInputFile.getPath(), redactedPdfBytes); - if (isOcrMyPdfAvailable()) { - return processWithOcrMyPdfForRestoration( - tempInputFile.getPath(), tempOutputFile.getPath(), request); - } else if (isTesseractAvailable()) { - return processWithTesseractForRestoration( - tempInputFile.getPath(), tempOutputFile.getPath(), request); - } - return redactedPdfBytes; + private static boolean isOcrMyPdfAvailable() { + try { + ProcessExecutorResult result = + ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF) + .runCommandWithOutputHandling(Arrays.asList("ocrmypdf", "--version")); + return result.getRc() == 0; + } catch (Exception e) { + return false; } } @@ -780,37 +739,7 @@ public class RedactionService { } } - private static String normalizeTextForRedaction(String text) { - if (text == null) return null; - - StringBuilder normalized = new StringBuilder(); - for (int i = 0; i < text.length(); i++) { - char c = text.charAt(i); - - if ((int) c >= 65488) { - normalized.append(' '); - } else if (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r') { - normalized.append(' '); - } else { - normalized.append(c); - } - } - - return normalized.toString(); - } - - private boolean isOcrMyPdfAvailable() { - try { - ProcessExecutorResult result = - ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF) - .runCommandWithOutputHandling(Arrays.asList("ocrmypdf", "--version")); - return result.getRc() == 0; - } catch (Exception e) { - return false; - } - } - - private boolean isTesseractAvailable() { + private static boolean isTesseractAvailable() { try { ProcessExecutorResult result = ProcessExecutor.getInstance(ProcessExecutor.Processes.TESSERACT) @@ -826,7 +755,7 @@ public class RedactionService { String fontName = font.getName(); if (fontName == null || isProperFontSubset(fontName) - || fontName.toLowerCase().matches(".*(hoepap|temp|generated).*")) { + || PATTERN.matcher(fontName.toLowerCase()).matches()) { return false; } return hasReliableWidthMetrics(font); @@ -835,6 +764,58 @@ public class RedactionService { } } + private static String sanitizeText(String text) { + if (text == null) return ""; + + StringBuilder sanitized = new StringBuilder(); + for (char c : text.toCharArray()) { + sanitized.append( + (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r') + ? '\uFFFD' + : c); + } + return sanitized.toString(); + } + + private static byte[] processWithOcrMyPdfForRestoration( + java.nio.file.Path inputPath, java.nio.file.Path outputPath, RedactPdfRequest request) + throws IOException, InterruptedException { + List command = + Arrays.asList( + "ocrmypdf", + "--verbose", + "1", + "--output-type", + "pdf", + "--pdf-renderer", + "sandwich", + "--language", + "eng", + "--optimize", + "0", + "--jpeg-quality", + "100", + "--png-quality", + "9", + "--force-ocr", + "--deskew", + "--clean", + "--clean-final", + inputPath.toString(), + outputPath.toString()); + ProcessExecutorResult result = + ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF) + .runCommandWithOutputHandling(command); + if (result.getRc() != 0) { + throw new IOException( + "OCRmyPDF restoration failed with return code: " + + result.getRc() + + ". Error: " + + result.getMessages()); + } + return java.nio.file.Files.readAllBytes(outputPath); + } + private static String createSubsetFontPlaceholder( String originalWord, float targetWidth, PDFont font, float fontSize) { String result = createAlternativePlaceholder(originalWord, targetWidth, font, fontSize); @@ -843,75 +824,142 @@ public class RedactionService { : " ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1)); } - public void performTextReplacementAggressive( - PDDocument document, - Map> allFoundTextsByPage, - String[] listOfText, - boolean useRegex, - boolean wholeWordSearchBool) { - if (allFoundTextsByPage.isEmpty()) { - return; - } - Set allSearchTerms = - Arrays.stream(listOfText) - .map(String::trim) - .filter(s -> !s.isEmpty()) - .collect(Collectors.toSet()); - this.aggressiveMode = true; - this.aggressiveSegMatches = new HashMap<>(); + private static COSArray buildKerningAdjustedTJArray( + COSArray originalArray, COSArray redactedArray, TextSegment segment) { try { - for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) { - boolean anyResidual = false; - int pageIndex = -1; - for (PDPage page : document.getPages()) { - pageIndex++; - try { - this.aggressiveSegMatches = new HashMap<>(); - List filtered = - createTokensWithoutTargetText( - document, - page, - allSearchTerms, - useRegex, - wholeWordSearchBool); - writeFilteredContentStream(document, page, filtered); - boolean residual = - pageStillContainsTargets( - document, - pageIndex, - allSearchTerms, - useRegex, - wholeWordSearchBool); - if (residual) { - anyResidual = true; - try { - var sem = wipeAllSemanticTextInTokens(filtered); - filtered = sem.tokens; - PDResources res = page.getResources(); - if (res != null) { - wipeAllSemanticTextInProperties(res); - wipeAllTextInXObjects(document, res); - wipeAllTextInPatterns(document, res); - } - writeFilteredContentStream(document, page, filtered); - } catch (Exception ignored) { - } + if (segment == null || segment.getFont() == null || segment.getFontSize() <= 0) + return redactedArray; + + COSArray out = new COSArray(); + int size = redactedArray.size(); + for (int i = 0; i < size; i++) { + COSBase redEl = redactedArray.get(i); + COSBase origEl = + (originalArray != null && i < originalArray.size()) + ? originalArray.get(i) + : null; + + out.add(redEl); + + if (redEl instanceof COSString redStr && origEl instanceof COSString origStr) { + String origText = getDecodedString(origStr, segment.getFont()); + String modText = getDecodedString(redStr, segment.getFont()); + float wOrig = + calculateSafeWidth(origText, segment.getFont(), segment.getFontSize()); + float wMod = + calculateSafeWidth(modText, segment.getFont(), segment.getFontSize()); + float adjustment = wOrig - wMod; + if (Math.abs(adjustment) > PRECISION_THRESHOLD) { + float kerning = (-adjustment / segment.getFontSize()) * FONT_SCALE_FACTOR; + if (i + 1 < size && redactedArray.get(i + 1) instanceof COSNumber num) { + i++; + float combined = num.floatValue() + kerning; + out.add(new COSFloat(combined)); + } else { + out.add(new COSFloat(kerning)); } - } catch (Exception ignored) { } } - if (!anyResidual) { - break; - } - if (!documentStillContainsTargets( - document, allSearchTerms, useRegex, wholeWordSearchBool)) { - break; + } + return out; + } catch (Exception e) { + return redactedArray; + } + } + + private static List findMatchesInSegments( + List segments, + Set targetWords, + boolean useRegex, + boolean wholeWordSearch) { + List allMatches = new ArrayList<>(); + List patterns = + TextFinderUtils.createOptimizedSearchPatterns( + targetWords, useRegex, wholeWordSearch); + + log.debug("Searching for {} patterns in {} segments", patterns.size(), segments.size()); + + int totalMatchesFound = 0; + + for (int i = 0; i < segments.size(); i++) { + TextSegment segment = segments.get(i); + String segmentText = segment.getText(); + if (segmentText == null || segmentText.isEmpty()) { + log.debug("Skipping empty segment {}", i); + continue; + } + + log.debug("Processing segment {}: '{}'", i, segmentText); + + if (segment.getFont() != null + && !TextEncodingHelper.isTextSegmentRemovable(segment.getFont(), segmentText)) { + log.debug( + "Skipping segment {} - font not removable: {}", + i, + segment.getFont().getName()); + continue; + } + + int segmentMatches = 0; + for (Pattern pattern : patterns) { + try { + log.debug( + "Matching pattern '{}' against segment text '{}'", + pattern.pattern(), + segmentText); + var matcher = pattern.matcher(segmentText); + while (matcher.find()) { + int matchStart = matcher.start(); + int matchEnd = matcher.end(); + + log.debug( + "Found match in segment {}: positions {}-{}", + i, + matchStart, + matchEnd); + + if (matchStart >= 0 + && matchEnd <= segmentText.length() + && matchStart < matchEnd) { + String matchedText = segmentText.substring(matchStart, matchEnd); + log.debug("Matched text: '{}'", matchedText); + + allMatches.add( + new MatchRange( + segment.getStartPos() + matchStart, + segment.getStartPos() + matchEnd)); + segmentMatches++; + totalMatchesFound++; + } + } + } catch (Exception e) { + log.error("Error matching pattern in segment {}: {}", i, e.getMessage()); } } - } finally { - this.aggressiveMode = false; - this.aggressiveSegMatches = null; + + if (segmentMatches > 0) { + log.info("Segment {} had {} matches", i, segmentMatches); + } } + + log.info("Total matches found across all segments: {}", totalMatchesFound); + allMatches.sort(Comparator.comparingInt(MatchRange::getStartPos)); + + if (allMatches.isEmpty()) { + log.warn("No matches found in segments. This might indicate:"); + log.warn("1. Text encoding issues preventing proper extraction"); + log.warn("2. Font compatibility issues"); + log.warn("3. Search terms not matching extracted text"); + log.warn("4. Whole word search filtering out matches"); + + if (!segments.isEmpty()) { + log.warn("Sample segment text: '{}'", segments.get(0).getText()); + log.warn("Target words: {}", targetWords); + log.warn("Use regex: {}, Whole word search: {}", useRegex, wholeWordSearch); + } + } + + return allMatches; } private static float calculateCharacterSumWidth(PDFont font, String text) { @@ -1033,18 +1081,28 @@ public class RedactionService { } } - private static String sanitizeText(String text) { - if (text == null) return ""; - - StringBuilder sanitized = new StringBuilder(); - for (char c : text.toCharArray()) { - if (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r') { - sanitized.append('\uFFFD'); - } else { - sanitized.append(c); - } + public byte[] performVisualRedactionWithOcrRestoration( + RedactPdfRequest request, + String[] listOfText, + boolean useRegex, + boolean wholeWordSearch) + throws IOException { + try (PDDocument doc = pdfDocumentFactory.load(request.getFileInput())) { + Map> allFound = + findTextToRedact(doc, listOfText, useRegex, wholeWordSearch); + byte[] visualRedactedBytes = + finalizeRedaction( + doc, + allFound, + request.getRedactColor(), + request.getCustomPadding(), + true, + false); + return performOcrRestoration(visualRedactedBytes, request); + } catch (Exception e) { + throw new IOException( + "Visual redaction with OCR restoration failed: " + e.getMessage(), e); } - return sanitized.toString(); } private static WipeResult wipeAllSemanticTextInTokens(List tokens, boolean removeTU) { @@ -1064,43 +1122,21 @@ public class RedactionService { return res; } - private byte[] processWithOcrMyPdfForRestoration( - java.nio.file.Path inputPath, java.nio.file.Path outputPath, RedactPdfRequest request) + private byte[] performOcrRestoration(byte[] redactedPdfBytes, RedactPdfRequest request) throws IOException, InterruptedException { - List command = - Arrays.asList( - "ocrmypdf", - "--verbose", - "1", - "--output-type", - "pdf", - "--pdf-renderer", - "sandwich", - "--language", - "eng", - "--optimize", - "0", - "--jpeg-quality", - "100", - "--png-quality", - "9", - "--force-ocr", - "--deskew", - "--clean", - "--clean-final", - inputPath.toString(), - outputPath.toString()); - ProcessExecutorResult result = - ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF) - .runCommandWithOutputHandling(command); - if (result.getRc() != 0) { - throw new IOException( - "OCRmyPDF restoration failed with return code: " - + result.getRc() - + ". Error: " - + result.getMessages()); + try (TempFile tempInputFile = new TempFile(tempFileManager, ".pdf"); + TempFile tempOutputFile = new TempFile(tempFileManager, ".pdf")) { + java.nio.file.Files.write(tempInputFile.getPath(), redactedPdfBytes); + + if (isOcrMyPdfAvailable()) { + return processWithOcrMyPdfForRestoration( + tempInputFile.getPath(), tempOutputFile.getPath(), request); + } else if (isTesseractAvailable()) { + return processWithTesseractForRestoration( + tempInputFile.getPath(), tempOutputFile.getPath(), request); + } + return redactedPdfBytes; } - return java.nio.file.Files.readAllBytes(outputPath); } private static boolean removeSemanticProperties(COSDictionary dict, boolean removeTU) { @@ -1427,59 +1463,62 @@ public class RedactionService { } } - private int getOriginalTokenCount(PDPage page) { + public void performTextReplacementAggressive( + PDDocument document, + Map> allFoundTextsByPage, + String[] listOfText, + boolean useRegex, + boolean wholeWordSearchBool) { + if (allFoundTextsByPage.isEmpty()) return; + + Set allSearchTerms = + Arrays.stream(listOfText) + .map(String::trim) + .filter(s -> !s.isEmpty()) + .collect(Collectors.toSet()); + + this.aggressiveMode = true; + this.aggressiveSegMatches = new HashMap<>(); + try { - PDFStreamParser parser = new PDFStreamParser(page); - int count = 0; - while (parser.parseNextToken() != null) { - count++; - } - return count; - } catch (Exception e) { - return 0; - } - } + for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) { + boolean anyResidual = false; - private COSArray buildKerningAdjustedTJArray( - COSArray originalArray, COSArray redactedArray, TextSegment segment) { - try { - if (segment == null || segment.getFont() == null || segment.getFontSize() <= 0) - return redactedArray; + for (int pageIndex = 0; pageIndex < document.getNumberOfPages(); pageIndex++) { + PDPage page = document.getPages().get(pageIndex); + try { + this.aggressiveSegMatches = new HashMap<>(); + List filtered = + createTokensWithoutTargetText( + document, + page, + allSearchTerms, + useRegex, + wholeWordSearchBool); + writeFilteredContentStream(document, page, filtered); - COSArray out = new COSArray(); - int size = redactedArray.size(); - for (int i = 0; i < size; i++) { - COSBase redEl = redactedArray.get(i); - COSBase origEl = - (originalArray != null && i < originalArray.size()) - ? originalArray.get(i) - : null; - - out.add(redEl); - - if (redEl instanceof COSString redStr && origEl instanceof COSString origStr) { - String origText = getDecodedString(origStr, segment.getFont()); - String modText = getDecodedString(redStr, segment.getFont()); - float wOrig = - calculateSafeWidth(origText, segment.getFont(), segment.getFontSize()); - float wMod = - calculateSafeWidth(modText, segment.getFont(), segment.getFontSize()); - float adjustment = wOrig - wMod; - if (Math.abs(adjustment) > PRECISION_THRESHOLD) { - float kerning = (-adjustment / segment.getFontSize()) * FONT_SCALE_FACTOR; - if (i + 1 < size && redactedArray.get(i + 1) instanceof COSNumber num) { - i++; - float combined = num.floatValue() + kerning; - out.add(new COSFloat(combined)); - } else { - out.add(new COSFloat(kerning)); + if (pageStillContainsTargets( + document, + pageIndex, + allSearchTerms, + useRegex, + wholeWordSearchBool)) { + anyResidual = true; + processResidualText(document, page, filtered); } + } catch (Exception ignored) { } } + + if (!anyResidual + || !documentStillContainsTargets( + document, allSearchTerms, useRegex, wholeWordSearchBool)) { + break; + } } - return out; - } catch (Exception e) { - return redactedArray; + } finally { + this.aggressiveMode = false; + this.aggressiveSegMatches = null; } } @@ -1678,6 +1717,21 @@ public class RedactionService { return problematicRatio > 0.3; } + private void processResidualText(PDDocument document, PDPage page, List filtered) { + try { + var sem = wipeAllSemanticTextInTokens(filtered); + filtered = sem.tokens; + PDResources res = page.getResources(); + if (res != null) { + wipeAllSemanticTextInProperties(res); + wipeAllTextInXObjects(document, res); + wipeAllTextInPatterns(document, res); + } + writeFilteredContentStream(document, page, filtered); + } catch (Exception ignored) { + } + } + public boolean performTextReplacement( PDDocument document, Map> allFoundTextsByPage, @@ -1688,151 +1742,38 @@ public class RedactionService { log.info("No text found to redact"); return false; } - try { - Set allSearchTerms = - Arrays.stream(listOfText) - .map(String::trim) - .filter(s -> !s.isEmpty()) - .collect(Collectors.toSet()); - log.info( - "Starting text replacement with {} search terms: {}", - allSearchTerms.size(), - allSearchTerms); - log.info("Total pages in document: {}", document.getNumberOfPages()); - log.info("Initial text found on {} pages", allFoundTextsByPage.size()); + Set allSearchTerms = + Arrays.stream(listOfText) + .map(String::trim) + .filter(s -> !s.isEmpty()) + .collect(Collectors.toSet()); - int initialTotalInstances = - allFoundTextsByPage.values().stream().mapToInt(List::size).sum(); - log.info("Total initial instances to redact: {}", initialTotalInstances); + log.info("Starting text replacement with {} search terms", allSearchTerms.size()); - int finalSweepCount = 0; - for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) { - finalSweepCount = sweep + 1; - log.info("=== Starting sweep {} of {} ===", sweep + 1, MAX_SWEEPS); - int pagesProcessed = 0; - int totalModifications = 0; + for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) { + processPages(document, allSearchTerms, useRegex, wholeWordSearchBool); - for (int pageIndex = 0; pageIndex < document.getNumberOfPages(); pageIndex++) { - PDPage page = document.getPages().get(pageIndex); - List pageFoundTexts = - allFoundTextsByPage.getOrDefault(pageIndex, List.of()); - - log.debug( - "Processing page {} - found {} instances", - pageIndex + 1, - pageFoundTexts.size()); - - List filtered = - createTokensWithoutTargetText( - document, page, allSearchTerms, useRegex, wholeWordSearchBool); - writeFilteredContentStream(document, page, filtered); - - int tokenDiff = Math.abs(filtered.size() - getOriginalTokenCount(page)); - totalModifications += tokenDiff; - pagesProcessed++; - - log.debug("Page {} - token modifications: {}", pageIndex + 1, tokenDiff); - } - - log.info( - "Sweep {} completed - processed {} pages, total modifications: {}", - sweep + 1, - pagesProcessed, - totalModifications); - - boolean stillContainsTargets = - documentStillContainsTargets( - document, allSearchTerms, useRegex, wholeWordSearchBool); - - if (!stillContainsTargets) { - log.info("SUCCESS: All targets removed after {} sweeps", sweep + 1); - break; - } else { - log.warn( - "WARNING: Still contains targets after sweep {} - continuing...", - sweep + 1); - } - } - - boolean finalCheck = false; - for (int verifyAttempt = 0; verifyAttempt < 3; verifyAttempt++) { - log.info("Final verification attempt {} of 3", verifyAttempt + 1); - finalCheck = - documentStillContainsTargets( - document, allSearchTerms, useRegex, wholeWordSearchBool); - - if (!finalCheck) { - log.info( - "Verification attempt {} passed - no targets found", verifyAttempt + 1); - break; - } else { - log.warn("Verification attempt {} found remaining targets", verifyAttempt + 1); - if (verifyAttempt < 2) { - log.info("Performing additional cleanup sweep due to verification failure"); - for (PDPage page : document.getPages()) { - List additionalFiltered = - createTokensWithoutTargetText( - document, - page, - allSearchTerms, - useRegex, - wholeWordSearchBool); - writeFilteredContentStream(document, page, additionalFiltered); - } - } - } - } - - if (finalCheck) { - log.error( - "FAILURE: Document still contains targets after {} sweeps and {} verification attempts. Falling back to visual redaction with OCR restoration.", - MAX_SWEEPS, - 3); - log.error("Remaining search terms: {}", allSearchTerms); - - log.error("=== DETAILED FAILURE ANALYSIS ==="); - for (int pageIdx = 0; pageIdx < document.getNumberOfPages(); pageIdx++) { - for (String term : allSearchTerms) { - try { - TextFinder finder = new TextFinder(term, useRegex, wholeWordSearchBool); - finder.setStartPage(pageIdx + 1); - finder.setEndPage(pageIdx + 1); - finder.getText(document); - - for (PDFText found : finder.getFoundTexts()) { - if (found.getPageIndex() == pageIdx) { - log.error( - "REMAINING: '{}' found on page {} at position ({}, {})", - term, - pageIdx + 1, - found.getX1(), - found.getY1()); - } - } - } catch (Exception e) { - log.error( - "Error during failure analysis for term '{}' on page {}: {}", - term, - pageIdx + 1, - e.getMessage()); - } - } - } - log.error("=== END FAILURE ANALYSIS ==="); - - return true; - } else { - log.info( - "SUCCESS: All text redaction completed successfully after {} sweeps", - finalSweepCount); + if (!documentStillContainsTargets( + document, allSearchTerms, useRegex, wholeWordSearchBool)) { + log.info("SUCCESS: All targets removed after {} sweeps", sweep + 1); return false; } - - } catch (Exception e) { - log.error("Exception during text replacement: {}", e.getMessage(), e); - return true; } + + // Verification attempts + for (int attempt = 0; attempt < 3; attempt++) { + if (!documentStillContainsTargets( + document, allSearchTerms, useRegex, wholeWordSearchBool)) { + return false; + } + if (attempt < 2) { + processPages(document, allSearchTerms, useRegex, wholeWordSearchBool); + } + } + + log.error("FAILURE: Document still contains targets after {} sweeps", MAX_SWEEPS); + return true; } private COSArray createRedactedTJArray( @@ -1917,99 +1858,21 @@ public class RedactionService { }; } - private List findMatchesInSegments( - List segments, - Set targetWords, + private void processPages( + PDDocument document, + Set allSearchTerms, boolean useRegex, - boolean wholeWordSearch) { - List allMatches = new ArrayList<>(); - List patterns = - TextFinderUtils.createOptimizedSearchPatterns( - targetWords, useRegex, wholeWordSearch); - - log.debug("Searching for {} patterns in {} segments", patterns.size(), segments.size()); - - int totalMatchesFound = 0; - - for (int i = 0; i < segments.size(); i++) { - TextSegment segment = segments.get(i); - String segmentText = segment.getText(); - if (segmentText == null || segmentText.isEmpty()) { - log.debug("Skipping empty segment {}", i); - continue; - } - - log.debug("Processing segment {}: '{}'", i, segmentText); - - if (segment.getFont() != null - && !TextEncodingHelper.isTextSegmentRemovable(segment.getFont(), segmentText)) { - log.debug( - "Skipping segment {} - font not removable: {}", - i, - segment.getFont().getName()); - continue; - } - - int segmentMatches = 0; - for (Pattern pattern : patterns) { - try { - log.debug( - "Matching pattern '{}' against segment text '{}'", - pattern.pattern(), - segmentText); - var matcher = pattern.matcher(segmentText); - while (matcher.find()) { - int matchStart = matcher.start(); - int matchEnd = matcher.end(); - - log.debug( - "Found match in segment {}: positions {}-{}", - i, - matchStart, - matchEnd); - - if (matchStart >= 0 - && matchEnd <= segmentText.length() - && matchStart < matchEnd) { - String matchedText = segmentText.substring(matchStart, matchEnd); - log.debug("Matched text: '{}'", matchedText); - - allMatches.add( - new MatchRange( - segment.getStartPos() + matchStart, - segment.getStartPos() + matchEnd)); - segmentMatches++; - totalMatchesFound++; - } - } - } catch (Exception e) { - log.error("Error matching pattern in segment {}: {}", i, e.getMessage()); - } - } - - if (segmentMatches > 0) { - log.info("Segment {} had {} matches", i, segmentMatches); + boolean wholeWordSearchBool) { + for (PDPage page : document.getPages()) { + try { + List filtered = + createTokensWithoutTargetText( + document, page, allSearchTerms, useRegex, wholeWordSearchBool); + writeFilteredContentStream(document, page, filtered); + } catch (Exception e) { + log.warn("Error processing page: {}", e.getMessage()); } } - - log.info("Total matches found across all segments: {}", totalMatchesFound); - allMatches.sort(Comparator.comparingInt(MatchRange::getStartPos)); - - if (allMatches.isEmpty()) { - log.warn("No matches found in segments. This might indicate:"); - log.warn("1. Text encoding issues preventing proper extraction"); - log.warn("2. Font compatibility issues"); - log.warn("3. Search terms not matching extracted text"); - log.warn("4. Whole word search filtering out matches"); - - if (!segments.isEmpty()) { - log.warn("Sample segment text: '{}'", segments.get(0).getText()); - log.warn("Target words: {}", targetWords); - log.warn("Use regex: {}, Whole word search: {}", useRegex, wholeWordSearch); - } - } - - return allMatches; } private String createSafeReplacement(String originalPart, TextSegment segment) { @@ -2962,9 +2825,9 @@ public class RedactionService { @Data public static class DecodedMapping { - public String text; - public int[] charByteStart; - public int[] charByteEnd; + private String text; + private int[] charByteStart; + private int[] charByteEnd; } @Data diff --git a/app/core/src/main/java/stirling/software/SPDF/utils/text/TextEncodingHelper.java b/app/core/src/main/java/stirling/software/SPDF/utils/text/TextEncodingHelper.java index 748d1179f..68625807b 100644 --- a/app/core/src/main/java/stirling/software/SPDF/utils/text/TextEncodingHelper.java +++ b/app/core/src/main/java/stirling/software/SPDF/utils/text/TextEncodingHelper.java @@ -5,10 +5,17 @@ import org.apache.pdfbox.pdmodel.font.PDFont; import lombok.experimental.UtilityClass; import lombok.extern.slf4j.Slf4j; +import java.util.regex.Pattern; + @Slf4j @UtilityClass public class TextEncodingHelper { + private final Pattern PATTERN = Pattern.compile("^[A-Z]+$"); + private final Pattern REGEX = Pattern.compile("^[A-Z]{6}\\+.*"); + private final Pattern REGEXP = Pattern.compile("^[A-Z]{5}\\+.*"); + private final Pattern PATTERN1 = Pattern.compile("^[A-Z]{4}\\+.*"); + public boolean canEncodeCharacters(PDFont font, String text) { if (font == null || text == null) { return false; @@ -421,21 +428,21 @@ public class TextEncodingHelper { return false; } - if (fontName.matches("^[A-Z]{6}\\+.*")) { + if (REGEX.matcher(fontName).matches()) { return true; } - if (fontName.matches("^[A-Z]{5}\\+.*")) { + if (REGEXP.matcher(fontName).matches()) { return true; } - if (fontName.matches("^[A-Z]{4}\\+.*")) { + if (PATTERN1.matcher(fontName).matches()) { return true; } if (fontName.contains("+")) { String prefix = fontName.split("\\+")[0]; - if (prefix.matches("^[A-Z]+$") && prefix.length() >= 4) { + if (PATTERN.matcher(prefix).matches() && prefix.length() >= 4) { return true; } } @@ -510,68 +517,4 @@ public class TextEncodingHelper { return false; } - public boolean canEncodeAnyCharacter(PDFont font) { - if (font == null) { - return false; - } - - String[] testStrings = { - "a", "A", "0", " ", ".", "!", "e", "i", "o", "u", "n", "t", "r", "s", "l", "1", "2", - "3", "4", "5", "6", "7", "8", "9", ",", ".", ";", ":", "?", "!", "(", ")", "[", "]", - "{", "}", "hello", "test", "sample", "abc", "123", "ABC" - }; - - for (String testStr : testStrings) { - try { - byte[] encoded = font.encode(testStr); - if (encoded.length > 0) { - return true; - } - } catch (Exception e) { - } - } - - for (int code = 0; code <= 0xFFFF; code += 100) { - try { - String testStr = String.valueOf((char) code); - byte[] encoded = font.encode(testStr); - if (encoded.length > 0) { - return true; - } - } catch (Exception e) { - } - } - - return false; - } - - public boolean isValidFont(PDFont font) { - if (font == null) { - return false; - } - - try { - String name = font.getName(); - if (name != null && !name.trim().isEmpty()) { - return true; - } - } catch (Exception e) { - } - - try { - if (canCalculateBasicWidths(font)) { - return true; - } - } catch (Exception e) { - } - - try { - if (canEncodeAnyCharacter(font)) { - return true; - } - } catch (Exception e) { - } - - return false; - } } diff --git a/app/core/src/main/java/stirling/software/SPDF/utils/text/WidthCalculator.java b/app/core/src/main/java/stirling/software/SPDF/utils/text/WidthCalculator.java index e465f225a..a9b4eaf32 100644 --- a/app/core/src/main/java/stirling/software/SPDF/utils/text/WidthCalculator.java +++ b/app/core/src/main/java/stirling/software/SPDF/utils/text/WidthCalculator.java @@ -80,10 +80,6 @@ public class WidthCalculator { Float charWidth = calculateSingleCharacterWidth(font, character, fontSize, codePoint); - if (charWidth == null) { - return null; - } - totalWidth += charWidth; if (previousCodePoint != -1) { totalWidth += calculateKerning(font, previousCodePoint, codePoint, fontSize); @@ -203,9 +199,6 @@ public class WidthCalculator { Float charWidth = calculateGlyphWidthComprehensively(font, character, codePoint, fontSize); - if (charWidth == null) { - return null; - } totalWidth += charWidth; i += Character.charCount(codePoint); @@ -514,64 +507,4 @@ public class WidthCalculator { return false; } - - public float calculateMinimumTextWidth(PDFont font, String text, float fontSize) { - if (font == null || text == null || text.isEmpty() || fontSize <= 0) { - return 0; - } - - try { - float minWidth = calculateAccurateWidth(font, text, fontSize); - if (minWidth > 0) { - return minWidth * 0.8f; - } - } catch (Exception e) { - } - - return text.length() * fontSize * 0.3f; - } - - public float calculateMaximumTextWidth(PDFont font, String text, float fontSize) { - if (font == null || text == null || text.isEmpty() || fontSize <= 0) { - return 0; - } - - try { - float maxWidth = calculateAccurateWidth(font, text, fontSize); - if (maxWidth > 0) { - return maxWidth * 1.2f; - } - } catch (Exception e) { - } - - return text.length() * fontSize * 1.0f; - } - - public boolean canCalculateWidthForText(PDFont font, String text) { - if (font == null || text == null) { - return false; - } - - if (text.isEmpty()) { - return true; - } - - try { - Float width = calculateDirectWidth(font, text, 12f); - if (width != null) { - return true; - } - } catch (Exception e) { - } - - try { - Float width = calculateCharacterByCharacterWidth(font, text, 12f); - if (width != null) { - return true; - } - } catch (Exception e) { - } - - return true; - } } diff --git a/app/core/src/main/resources/templates/security/auto-redact.html b/app/core/src/main/resources/templates/security/auto-redact.html index ae8df9f59..7c0a5b626 100644 --- a/app/core/src/main/resources/templates/security/auto-redact.html +++ b/app/core/src/main/resources/templates/security/auto-redact.html @@ -13,20 +13,7 @@ color: #6c757d !important; } - .btn-primary:focus { - box-shadow: 0 0 0 0.2rem rgba(13, 110, 253, 0.25); - outline: 2px solid #0d6efd; - outline-offset: 2px; - } - - .form-check-input:focus { - box-shadow: 0 0 0 0.2rem rgba(13, 110, 253, 0.25); - outline: 2px solid #0d6efd; - outline-offset: 2px; - } - - .form-control:focus, .form-select:focus { - border-color: #0d6efd; + .btn-primary:focus, .form-check-input:focus, .form-control:focus, .form-select:focus { box-shadow: 0 0 0 0.2rem rgba(13, 110, 253, 0.25); outline: 2px solid #0d6efd; outline-offset: 2px; @@ -36,20 +23,6 @@ background-color: #0d6efd; border-color: #0d6efd; } - - - - .sr-only { - position: absolute; - width: 1px; - height: 1px; - padding: 0; - margin: -1px; - overflow: hidden; - clip: rect(0, 0, 0, 0); - white-space: nowrap; - border: 0; - }