diff --git a/app/core/src/main/java/stirling/software/SPDF/controller/web/SecurityWebController.java b/app/core/src/main/java/stirling/software/SPDF/controller/web/SecurityWebController.java index eb7245e5a..17d99b948 100644 --- a/app/core/src/main/java/stirling/software/SPDF/controller/web/SecurityWebController.java +++ b/app/core/src/main/java/stirling/software/SPDF/controller/web/SecurityWebController.java @@ -1,5 +1,10 @@ package stirling.software.SPDF.controller.web; +import java.io.File; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + import org.springframework.stereotype.Controller; import org.springframework.ui.Model; import org.springframework.web.bind.annotation.GetMapping; @@ -7,14 +12,36 @@ import org.springframework.web.bind.annotation.GetMapping; import io.swagger.v3.oas.annotations.Hidden; import io.swagger.v3.oas.annotations.tags.Tag; +import lombok.RequiredArgsConstructor; + +import stirling.software.common.model.ApplicationProperties; + @Controller @Tag(name = "Security", description = "Security APIs") +@RequiredArgsConstructor public class SecurityWebController { + private final ApplicationProperties applicationProperties; + + private List getAvailableTesseractLanguages() { + String tessdataDir = applicationProperties.getSystem().getTessdataDir(); + File[] files = new File(tessdataDir).listFiles(); + if (files == null) { + return Collections.emptyList(); + } + return Arrays.stream(files) + .filter(file -> file.getName().endsWith(".traineddata")) + .map(file -> file.getName().replace(".traineddata", "")) + .filter(lang -> !"osd".equalsIgnoreCase(lang)) + .sorted() + .toList(); + } + @GetMapping("/auto-redact") @Hidden public String autoRedactForm(Model model) { model.addAttribute("currentPage", "auto-redact"); + model.addAttribute("languages", getAvailableTesseractLanguages()); return "security/auto-redact"; } diff --git a/app/core/src/main/java/stirling/software/SPDF/model/api/security/RedactPdfRequest.java b/app/core/src/main/java/stirling/software/SPDF/model/api/security/RedactPdfRequest.java index 6fe768f5d..ad707567e 100644 --- a/app/core/src/main/java/stirling/software/SPDF/model/api/security/RedactPdfRequest.java +++ b/app/core/src/main/java/stirling/software/SPDF/model/api/security/RedactPdfRequest.java @@ -1,5 +1,7 @@ package stirling.software.SPDF.model.api.security; +import java.util.List; + import io.swagger.v3.oas.annotations.media.Schema; import lombok.Data; @@ -53,4 +55,10 @@ public class RedactPdfRequest extends PDFFile { allowableValues = {"moderate", "visual", "aggressive"}, requiredMode = Schema.RequiredMode.NOT_REQUIRED) private String redactionMode; + + @Schema( + description = + "List of OCR languages to use for restoration when needed (Tesseract codes like 'eng', 'deu')", + requiredMode = Schema.RequiredMode.NOT_REQUIRED) + private List languages; } diff --git a/app/core/src/main/java/stirling/software/SPDF/service/RedactionService.java b/app/core/src/main/java/stirling/software/SPDF/service/RedactionService.java index e01b40404..8e4a6f470 100644 --- a/app/core/src/main/java/stirling/software/SPDF/service/RedactionService.java +++ b/app/core/src/main/java/stirling/software/SPDF/service/RedactionService.java @@ -12,6 +12,7 @@ import java.util.Collections; import java.util.Comparator; import java.util.Deque; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; @@ -87,6 +88,140 @@ public class RedactionService { private final CustomPDFDocumentFactory pdfDocumentFactory; private final TempFileManager tempFileManager; + private static List parseAllTokens(PDFStreamParser parser) throws IOException { + List tokens = new ArrayList<>(); + Object token; + while ((token = parser.parseNextToken()) != null) { + tokens.add(token); + } + return tokens; + } + + private static String buildLanguageOption(RedactPdfRequest request) { + List langs = (request != null) ? request.getLanguages() : null; + return (langs == null || langs.isEmpty()) ? "eng" : String.join("+", langs); + } + + private static byte[] processWithOcrMyPdfForRestoration( + java.nio.file.Path inputPath, java.nio.file.Path outputPath, RedactPdfRequest request) + throws IOException, InterruptedException { + String languageOption = buildLanguageOption(request); + List command = + Arrays.asList( + "ocrmypdf", + "--verbose", + "1", + "--output-type", + "pdf", + "--pdf-renderer", + "sandwich", + "--language", + languageOption, + "--optimize", + "0", + "--jpeg-quality", + "100", + "--png-quality", + "9", + "--force-ocr", + "--deskew", + "--clean", + "--clean-final", + inputPath.toString(), + outputPath.toString()); + ProcessExecutorResult result = + ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF) + .runCommandWithOutputHandling(command); + if (result.getRc() != 0) { + throw new IOException( + "OCRmyPDF restoration failed with return code: " + + result.getRc() + + ". Error: " + + result.getMessages()); + } + return java.nio.file.Files.readAllBytes(outputPath); + } + + private static String createEnhancedSubsetPlaceholder( + String originalWord, float targetWidth, PDFont font, float fontSize) { + if (originalWord == null || originalWord.isEmpty()) { + return " "; + } + + try { + GlyphCoverageProbe probe = new GlyphCoverageProbe(font); + + float embeddedWidth = 0f; + for (int i = 0; i < originalWord.length(); ) { + int codePoint = originalWord.codePointAt(i); + embeddedWidth += + probe.getWidthWithFallback( + codePoint, FallbackStrategy.EMBED_WIDTH, fontSize); + i += Character.charCount(codePoint); + } + + if (embeddedWidth > 0 && Math.abs(embeddedWidth - targetWidth) < targetWidth * 0.5f) { + float spaceWidth = + probe.getWidthWithFallback(' ', FallbackStrategy.EMBED_WIDTH, fontSize); + + if (spaceWidth > 0) { + int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth)); + int maxSpaces = Math.max(originalWord.length() * 3, 20); + return " ".repeat(Math.min(spaceCount, maxSpaces)); + } + } + + return createAlternativePlaceholder(originalWord, targetWidth, font, fontSize); + + } catch (Exception e) { + return " ".repeat(Math.max(1, originalWord.length())); + } + } + + static String createPlaceholderWithWidth( + String originalWord, float targetWidth, PDFont font, float fontSize) { + if (originalWord == null || originalWord.isEmpty()) return " "; + if (font == null || fontSize <= 0) return " ".repeat(originalWord.length()); + + // Enhanced font subset handling + if (TextEncodingHelper.isFontSubset(font.getName())) { + return createEnhancedSubsetPlaceholder(originalWord, targetWidth, font, fontSize); + } + + if (!WidthCalculator.isWidthCalculationReliable(font)) + return " ".repeat(originalWord.length()); + + final String repeat = " ".repeat(Math.max(1, originalWord.length())); + + try { + float spaceWidth = WidthCalculator.calculateAccurateWidth(font, " ", fontSize); + if (spaceWidth <= 0) { + return createAlternativePlaceholder(originalWord, targetWidth, font, fontSize); + } + + int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth)); + int maxSpaces = + Math.max( + originalWord.length() * 2, Math.round(targetWidth / spaceWidth * 1.5f)); + return " ".repeat(Math.min(spaceCount, maxSpaces)); + } catch (Exception e) { + String result = createAlternativePlaceholder(originalWord, targetWidth, font, fontSize); + return result != null ? result : repeat; + } + } + + public static Set createPrivacyScrubOptions() { + return Set.of( + ScrubOption.REMOVE_ACTUALTEXT, + ScrubOption.REMOVE_ALT, + ScrubOption.REMOVE_TU, + ScrubOption.NORMALIZE_WHITESPACE); + } + + public static Set createBasicScrubOptions() { + return Set.of(ScrubOption.NORMALIZE_WHITESPACE); + } + private static void redactAreas( List redactionAreas, PDDocument document, PDPageTree allPages) throws IOException { @@ -583,6 +718,67 @@ public class RedactionService { return strategy.redact(request); } + /** + * Enhanced redaction with semantic scrubbing Integrates the PDFBox enhancement plan for both + * text redaction and metadata cleanup + */ + public byte[] redactPdfWithSemanticScrubbing( + RedactPdfRequest request, Set scrubOptions) throws IOException { + + String mode = request.getRedactionMode(); + if (mode == null || mode.isBlank()) { + mode = "moderate"; + } + + // Perform standard redaction first + RedactionModeStrategy strategy = + switch (mode.toLowerCase()) { + case "visual" -> new VisualRedactionService(pdfDocumentFactory, this); + case "aggressive" -> new AggressiveRedactionService(pdfDocumentFactory, this); + default -> new ModerateRedactionService(pdfDocumentFactory, this); + }; + + byte[] redactedBytes = strategy.redact(request); + + // Apply semantic scrubbing to the redacted document + if (scrubOptions != null && !scrubOptions.isEmpty()) { + try (PDDocument document = pdfDocumentFactory.load(redactedBytes)) { + DefaultSemanticScrubber scrubber = new DefaultSemanticScrubber(); + scrubber.scrub(document, scrubOptions); + + // Save the scrubbed document + try (ByteArrayOutputStream output = new ByteArrayOutputStream()) { + document.save(output); + return output.toByteArray(); + } + } catch (Exception e) { + log.warn( + "Semantic scrubbing failed, returning redacted document without scrubbing", + e); + return redactedBytes; + } + } + + return redactedBytes; + } + + public byte[] applySemanticScrubbing(MultipartFile file, Set scrubOptions) + throws IOException { + if (scrubOptions == null || scrubOptions.isEmpty()) { + return file.getBytes(); // No scrubbing requested + } + + try (PDDocument document = pdfDocumentFactory.load(file)) { + DefaultSemanticScrubber scrubber = new DefaultSemanticScrubber(); + scrubber.scrub(document, scrubOptions); + + try (ByteArrayOutputStream output = new ByteArrayOutputStream()) { + document.save(output); + return output.toByteArray(); + } + } + } + private static boolean isTextSafeForRedaction(String text) { if (text == null || text.isEmpty()) return true; @@ -777,51 +973,175 @@ public class RedactionService { return sanitized.toString(); } - private static byte[] processWithOcrMyPdfForRestoration( + private byte[] processWithTesseractForRestoration( java.nio.file.Path inputPath, java.nio.file.Path outputPath, RedactPdfRequest request) throws IOException, InterruptedException { - List command = - Arrays.asList( - "ocrmypdf", - "--verbose", - "1", - "--output-type", - "pdf", - "--pdf-renderer", - "sandwich", - "--language", - "eng", - "--optimize", - "0", - "--jpeg-quality", - "100", - "--png-quality", - "9", - "--force-ocr", - "--deskew", - "--clean", - "--clean-final", - inputPath.toString(), - outputPath.toString()); - ProcessExecutorResult result = - ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF) - .runCommandWithOutputHandling(command); - if (result.getRc() != 0) { - throw new IOException( - "OCRmyPDF restoration failed with return code: " - + result.getRc() - + ". Error: " - + result.getMessages()); + try (TempDirectory tempDir = new TempDirectory(tempFileManager)) { + java.io.File tempOutputDir = new java.io.File(tempDir.getPath().toFile(), "output"); + java.io.File tempImagesDir = new java.io.File(tempDir.getPath().toFile(), "images"); + java.io.File finalOutputFile = + new java.io.File(tempDir.getPath().toFile(), "final_output.pdf"); + tempOutputDir.mkdirs(); + tempImagesDir.mkdirs(); + try (PDDocument document = pdfDocumentFactory.load(inputPath.toFile())) { + PDFRenderer pdfRenderer = new PDFRenderer(document); + int pageCount = document.getNumberOfPages(); + PDFMergerUtility merger = new PDFMergerUtility(); + merger.setDestinationFileName(finalOutputFile.toString()); + for (int pageNum = 0; pageNum < pageCount; pageNum++) { + BufferedImage image = pdfRenderer.renderImageWithDPI(pageNum, 600); + java.io.File imagePath = + new java.io.File(tempImagesDir, "page_" + pageNum + ".png"); + ImageIO.write(image, "png", imagePath); + List command = + new ArrayList<>( + Arrays.asList( + "tesseract", + imagePath.toString(), + new java.io.File(tempOutputDir, "page_" + pageNum) + .toString(), + "-l", + buildLanguageOption(request), + "--dpi", + "600", + "--psm", + "1", + "pdf")); + ProcessExecutorResult result = + ProcessExecutor.getInstance(ProcessExecutor.Processes.TESSERACT) + .runCommandWithOutputHandling(command); + if (result.getRc() != 0) { + throw new IOException( + "Tesseract restoration failed with return code: " + result.getRc()); + } + java.io.File pageOutputPath = + new java.io.File(tempOutputDir, "page_" + pageNum + ".pdf"); + merger.addSource(pageOutputPath); + } + merger.mergeDocuments(null); + java.nio.file.Files.copy( + finalOutputFile.toPath(), + outputPath, + java.nio.file.StandardCopyOption.REPLACE_EXISTING); + } + return java.nio.file.Files.readAllBytes(outputPath); } - return java.nio.file.Files.readAllBytes(outputPath); } - private static String createSubsetFontPlaceholder( - String originalWord, float targetWidth, PDFont font, float fontSize) { - String result = createAlternativePlaceholder(originalWord, targetWidth, font, fontSize); - return result != null - ? result - : " ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1)); + List createTokensWithoutTargetText( + PDDocument document, + PDPage page, + Set targetWords, + boolean useRegex, + boolean wholeWordSearch) + throws IOException { + log.debug("Processing page with {} target words: {}", targetWords.size(), targetWords); + + PDFStreamParser parser = new PDFStreamParser(page); + List tokens = parseAllTokens(parser); + int tokenCount = tokens.size(); + + log.debug("Parsed {} tokens from page content stream", tokenCount); + + if (tokenCount == 0 && !targetWords.isEmpty()) { + log.warn( + "No tokens parsed from page content stream - this might indicate encoding issues"); + log.warn("Attempting alternative verification for target words: {}", targetWords); + + try { + TextFinder directFinder = new TextFinder("", false, false); + directFinder.setStartPage(document.getPages().indexOf(page) + 1); + directFinder.setEndPage(document.getPages().indexOf(page) + 1); + directFinder.getText(document); + + StringBuilder pageText = new StringBuilder(); + for (PDFText pdfText : directFinder.getFoundTexts()) { + if (pdfText.getText() != null) { + pageText.append(pdfText.getText()).append(" "); + } + } + + String extractedText = pageText.toString().trim(); + log.debug("Alternative text extraction found: '{}'", extractedText); + + for (String word : targetWords) { + if (extractedText.toLowerCase().contains(word.toLowerCase())) { + log.warn("Found target word '{}' via alternative extraction method", word); + } + } + + } catch (Exception e) { + log.error("Alternative text extraction failed: {}", e.getMessage()); + } + } + + PDResources resources = page.getResources(); + if (resources != null) { + log.debug("Processing XObjects for page"); + processPageXObjects( + document, + resources, + targetWords, + useRegex, + wholeWordSearch, + this.aggressiveMode); + } + + List textSegments = + extractTextSegmentsFromTokens(page.getResources(), tokens, this.aggressiveMode); + log.debug("Extracted {} text segments from tokens", textSegments.size()); + + if (!textSegments.isEmpty()) { + StringBuilder allText = new StringBuilder(); + boolean hasProblematicChars = false; + + for (TextSegment seg : textSegments) { + if (seg.getText() != null && !seg.getText().trim().isEmpty()) { + String segmentText = seg.getText(); + if (!isTextSafeForRedaction(segmentText)) { + hasProblematicChars = true; + segmentText = normalizeTextForRedaction(segmentText); + log.debug( + "Normalized problematic text in segment: original contained encoding issues"); + } + allText.append(segmentText).append(" "); + } + } + + String completeText = allText.toString().trim(); + if (!completeText.isEmpty()) { + log.debug("Complete extracted text: '{}'", completeText); + if (hasProblematicChars) { + log.info("Applied character normalization to handle encoding issues"); + } + } + } + + List matches; + if (this.aggressiveMode) { + log.debug("Using aggressive mode for matching"); + matches = + findAllMatchesAggressive( + textSegments, tokens, targetWords, useRegex, wholeWordSearch); + } else { + log.debug("Using moderate mode for matching"); + matches = findMatchesInSegments(textSegments, targetWords, useRegex, wholeWordSearch); + } + + log.info("Found {} matches to redact", matches.size()); + if (!matches.isEmpty()) { + log.debug("Match ranges: {}", matches); + } + + List resultTokens = applyRedactionsToTokens(tokens, textSegments, matches); + int modifications = tokens.size() - resultTokens.size(); + log.debug( + "Applied redactions - original tokens: {}, result tokens: {}, modifications: {}", + tokens.size(), + resultTokens.size(), + modifications); + + return resultTokens; } private static COSArray buildKerningAdjustedTJArray( @@ -1059,16 +1379,6 @@ public class RedactionService { }; } - private List extractTextSegments( - PDPage page, List tokens, boolean aggressive) { - return extractTextSegmentsEnhanced(page, tokens, aggressive); - } - - private List extractTextSegmentsEnhanced( - PDPage page, List tokens, boolean aggressive) { - return extractTextSegmentsFromTokens(page.getResources(), tokens, aggressive); - } - private static boolean hasReliableWidthMetrics(PDFont font) { try { String testString = "AbCdEf123"; @@ -1160,33 +1470,26 @@ public class RedactionService { return changed; } - static String createPlaceholderWithWidth( - String originalWord, float targetWidth, PDFont font, float fontSize) { - if (originalWord == null || originalWord.isEmpty()) return " "; - if (font == null || fontSize <= 0) return " ".repeat(originalWord.length()); - if (!WidthCalculator.isWidthCalculationReliable(font)) - return " ".repeat(originalWord.length()); - - final String repeat = " ".repeat(Math.max(1, originalWord.length())); - if (TextEncodingHelper.isFontSubset(font.getName())) { - return createSubsetFontPlaceholder(originalWord, targetWidth, font, fontSize); - } - + private int wipeAllTextInFormXObject(PDDocument document, PDFormXObject formXObject) + throws IOException { + int modifications = 0; try { - float spaceWidth = WidthCalculator.calculateAccurateWidth(font, " ", fontSize); - if (spaceWidth <= 0) { - return createAlternativePlaceholder(originalWord, targetWidth, font, fontSize); + PDResources res = formXObject.getResources(); + if (res != null) { + modifications += wipeAllTextInResources(document, res); } - - int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth)); - int maxSpaces = - Math.max( - originalWord.length() * 2, Math.round(targetWidth / spaceWidth * 1.5f)); - return " ".repeat(Math.min(spaceCount, maxSpaces)); - } catch (Exception e) { - String result = createAlternativePlaceholder(originalWord, targetWidth, font, fontSize); - return result != null ? result : repeat; + PDFStreamParser parser = new PDFStreamParser(formXObject); + List tokens = parseAllTokens(parser); + WipeResult wrText = wipeAllTextShowingOperators(tokens); + modifications += wrText.modifications; + WipeResult wrSem = wipeAllSemanticTextInTokens(wrText.tokens); + modifications += wrSem.modifications; + if (wrText.modifications > 0 || wrSem.modifications > 0) { + writeRedactedContentToXObject(document, formXObject, wrSem.tokens); + } + } catch (Exception ignored) { } + return modifications; } private String applyRedactionsToSegmentText(TextSegment segment, List matches) { @@ -1335,56 +1638,30 @@ public class RedactionService { } } - private byte[] processWithTesseractForRestoration( - java.nio.file.Path inputPath, java.nio.file.Path outputPath, RedactPdfRequest request) - throws IOException, InterruptedException { - try (TempDirectory tempDir = new TempDirectory(tempFileManager)) { - java.io.File tempOutputDir = new java.io.File(tempDir.getPath().toFile(), "output"); - java.io.File tempImagesDir = new java.io.File(tempDir.getPath().toFile(), "images"); - java.io.File finalOutputFile = - new java.io.File(tempDir.getPath().toFile(), "final_output.pdf"); - tempOutputDir.mkdirs(); - tempImagesDir.mkdirs(); - try (PDDocument document = pdfDocumentFactory.load(inputPath.toFile())) { - PDFRenderer pdfRenderer = new PDFRenderer(document); - int pageCount = document.getNumberOfPages(); - PDFMergerUtility merger = new PDFMergerUtility(); - merger.setDestinationFileName(finalOutputFile.toString()); - for (int pageNum = 0; pageNum < pageCount; pageNum++) { - BufferedImage image = pdfRenderer.renderImageWithDPI(pageNum, 600); - java.io.File imagePath = - new java.io.File(tempImagesDir, "page_" + pageNum + ".png"); - ImageIO.write(image, "png", imagePath); - List command = - Arrays.asList( - "tesseract", - imagePath.toString(), - new java.io.File(tempOutputDir, "page_" + pageNum).toString(), - "-l", - "eng", - "--dpi", - "600", - "--psm", - "1", - "pdf"); - ProcessExecutorResult result = - ProcessExecutor.getInstance(ProcessExecutor.Processes.TESSERACT) - .runCommandWithOutputHandling(command); - if (result.getRc() != 0) { - throw new IOException( - "Tesseract restoration failed with return code: " + result.getRc()); + private void wipeAllTextInPatterns(PDDocument document, PDResources resources) { + try { + for (COSName patName : resources.getPatternNames()) { + try { + var pattern = resources.getPattern(patName); + if (pattern + instanceof + org.apache.pdfbox.pdmodel.graphics.pattern.PDTilingPattern tiling) { + PDResources patRes = tiling.getResources(); + if (patRes != null) { + wipeAllTextInResources(document, patRes); + } + PDFStreamParser parser = new PDFStreamParser(tiling); + List tokens = parseAllTokens(parser); + WipeResult wrText = wipeAllTextShowingOperators(tokens); + WipeResult wrSem = wipeAllSemanticTextInTokens(wrText.tokens); + if (wrText.modifications > 0 || wrSem.modifications > 0) { + writeRedactedContentToPattern(tiling, wrSem.tokens); + } } - java.io.File pageOutputPath = - new java.io.File(tempOutputDir, "page_" + pageNum + ".pdf"); - merger.addSource(pageOutputPath); + } catch (Exception ignored) { } - merger.mergeDocuments(null); - java.nio.file.Files.copy( - finalOutputFile.toPath(), - outputPath, - java.nio.file.StandardCopyOption.REPLACE_EXISTING); } - return java.nio.file.Files.readAllBytes(outputPath); + } catch (Exception ignored) { } } @@ -1577,126 +1854,49 @@ public class RedactionService { return baseLimit; } - List createTokensWithoutTargetText( + private void processFormXObject( PDDocument document, - PDPage page, + PDFormXObject formXObject, Set targetWords, boolean useRegex, - boolean wholeWordSearch) - throws IOException { - log.debug("Processing page with {} target words: {}", targetWords.size(), targetWords); - - PDFStreamParser parser = new PDFStreamParser(page); - List tokens = new ArrayList<>(); - Object tk; - int tokenCount = 0; - while (true) { - final Object parsedNextToken = parser.parseNextToken(); - if ((tk = parsedNextToken) == null) break; - tokens.add(tk); - tokenCount++; - } - - log.debug("Parsed {} tokens from page content stream", tokenCount); - - if (tokenCount == 0 && !targetWords.isEmpty()) { - log.warn( - "No tokens parsed from page content stream - this might indicate encoding issues"); - log.warn("Attempting alternative verification for target words: {}", targetWords); - - try { - TextFinder directFinder = new TextFinder("", false, false); - directFinder.setStartPage(document.getPages().indexOf(page) + 1); - directFinder.setEndPage(document.getPages().indexOf(page) + 1); - directFinder.getText(document); - - StringBuilder pageText = new StringBuilder(); - for (PDFText pdfText : directFinder.getFoundTexts()) { - if (pdfText.getText() != null) { - pageText.append(pdfText.getText()).append(" "); - } - } - - String extractedText = pageText.toString().trim(); - log.debug("Alternative text extraction found: '{}'", extractedText); - - for (String word : targetWords) { - if (extractedText.toLowerCase().contains(word.toLowerCase())) { - log.warn("Found target word '{}' via alternative extraction method", word); - } - } - - } catch (Exception e) { - log.error("Alternative text extraction failed: {}", e.getMessage()); + boolean wholeWordSearch, + boolean aggressive) { + try { + PDResources xobjResources = formXObject.getResources(); + if (xobjResources == null) { + return; } - } - - PDResources resources = page.getResources(); - if (resources != null) { - log.debug("Processing XObjects for page"); - processPageXObjects( - document, - resources, - targetWords, - useRegex, - wholeWordSearch, - this.aggressiveMode); - } - - List textSegments = extractTextSegments(page, tokens, this.aggressiveMode); - log.debug("Extracted {} text segments from tokens", textSegments.size()); - - if (!textSegments.isEmpty()) { - StringBuilder allText = new StringBuilder(); - boolean hasProblematicChars = false; - - for (TextSegment seg : textSegments) { - if (seg.getText() != null && !seg.getText().trim().isEmpty()) { - String segmentText = seg.getText(); - if (!isTextSafeForRedaction(segmentText)) { - hasProblematicChars = true; - segmentText = normalizeTextForRedaction(segmentText); - log.debug( - "Normalized problematic text in segment: original contained encoding issues"); - } - allText.append(segmentText).append(" "); + for (COSName xobjName : xobjResources.getXObjectNames()) { + PDXObject nestedXObj = xobjResources.getXObject(xobjName); + if (nestedXObj instanceof PDFormXObject nestedFormXObj) { + processFormXObject( + document, + nestedFormXObj, + targetWords, + useRegex, + wholeWordSearch, + aggressive); } } - - String completeText = allText.toString().trim(); - if (!completeText.isEmpty()) { - log.debug("Complete extracted text: '{}'", completeText); - if (hasProblematicChars) { - log.info("Applied character normalization to handle encoding issues"); - } + PDFStreamParser parser = new PDFStreamParser(formXObject); + List tokens = parseAllTokens(parser); + List textSegments = extractTextSegmentsFromXObject(xobjResources, tokens); + String completeText = buildCompleteText(textSegments); + List matches = + aggressive + ? findAllMatchesAggressive( + textSegments, tokens, targetWords, useRegex, wholeWordSearch) + : findAllMatches(completeText, targetWords, useRegex, wholeWordSearch); + if (!matches.isEmpty()) { + List redactedTokens = + applyRedactionsToTokens(tokens, textSegments, matches); + writeRedactedContentToXObject(document, formXObject, redactedTokens); + } else if (aggressive && !completeText.isEmpty()) { + WipeResult wr = wipeAllTextShowingOperators(tokens); + writeRedactedContentToXObject(document, formXObject, wr.tokens); } + } catch (Exception ignored) { } - - List matches; - if (this.aggressiveMode) { - log.debug("Using aggressive mode for matching"); - matches = - findAllMatchesAggressive( - textSegments, tokens, targetWords, useRegex, wholeWordSearch); - } else { - log.debug("Using moderate mode for matching"); - matches = findMatchesInSegments(textSegments, targetWords, useRegex, wholeWordSearch); - } - - log.info("Found {} matches to redact", matches.size()); - if (!matches.isEmpty()) { - log.debug("Match ranges: {}", matches); - } - - List resultTokens = applyRedactionsToTokens(tokens, textSegments, matches); - int modifications = tokens.size() - resultTokens.size(); - log.debug( - "Applied redactions - original tokens: {}, result tokens: {}, modifications: {}", - tokens.size(), - resultTokens.size(), - modifications); - - return resultTokens; } private static boolean isGibberish(String text) { @@ -2649,60 +2849,46 @@ public class RedactionService { } } - private int wipeAllTextInFormXObject(PDDocument document, PDFormXObject formXObject) + public byte[] performEnhancedRedaction( + RedactPdfRequest request, + String[] targetText, + Set scrubOptions, + FallbackStrategy fontStrategy) throws IOException { - int modifications = 0; - try { - PDResources res = formXObject.getResources(); - if (res != null) { - modifications += wipeAllTextInResources(document, res); - } - PDFStreamParser parser = new PDFStreamParser(formXObject); - List tokens = new ArrayList<>(); - Object token; - while ((token = parser.parseNextToken()) != null) { - tokens.add(token); - } - WipeResult wrText = wipeAllTextShowingOperators(tokens); - modifications += wrText.modifications; - WipeResult wrSem = wipeAllSemanticTextInTokens(wrText.tokens); - modifications += wrSem.modifications; - if (wrText.modifications > 0 || wrSem.modifications > 0) { - writeRedactedContentToXObject(document, formXObject, wrSem.tokens); - } - } catch (Exception ignored) { - } - return modifications; + + log.info( + "Starting enhanced redaction with {} targets and {} scrub options", + targetText.length, + scrubOptions.size()); + + byte[] result = redactPdfWithSemanticScrubbing(request, scrubOptions); + + log.info("Enhanced redaction completed successfully"); + return result; } - private void wipeAllTextInPatterns(PDDocument document, PDResources resources) { + public boolean validateFontCoverage(PDFont font, String text) { + if (font == null || text == null || text.isEmpty()) { + return false; + } + try { - for (COSName patName : resources.getPatternNames()) { - try { - var pattern = resources.getPattern(patName); - if (pattern - instanceof - org.apache.pdfbox.pdmodel.graphics.pattern.PDTilingPattern tiling) { - PDResources patRes = tiling.getResources(); - if (patRes != null) { - wipeAllTextInResources(document, patRes); - } - PDFStreamParser parser = new PDFStreamParser(tiling); - List tokens = new ArrayList<>(); - Object token; - while ((token = parser.parseNextToken()) != null) { - tokens.add(token); - } - WipeResult wrText = wipeAllTextShowingOperators(tokens); - WipeResult wrSem = wipeAllSemanticTextInTokens(wrText.tokens); - if (wrText.modifications > 0 || wrSem.modifications > 0) { - writeRedactedContentToPattern(tiling, wrSem.tokens); - } - } - } catch (Exception ignored) { + GlyphCoverageProbe probe = new GlyphCoverageProbe(font); + + for (int i = 0; i < text.length(); ) { + int codePoint = text.codePointAt(i); + if (!probe.hasGlyph(codePoint)) { + log.debug( + "Font {} missing glyph for code point: {}", font.getName(), codePoint); + return false; } + i += Character.charCount(codePoint); } - } catch (Exception ignored) { + + return true; + } catch (Exception e) { + log.debug("Error validating font coverage", e); + return false; } } @@ -2716,53 +2902,10 @@ public class RedactionService { } } - private void processFormXObject( - PDDocument document, - PDFormXObject formXObject, - Set targetWords, - boolean useRegex, - boolean wholeWordSearch, - boolean aggressive) { - try { - PDResources xobjResources = formXObject.getResources(); - if (xobjResources == null) { - return; - } - for (COSName xobjName : xobjResources.getXObjectNames()) { - PDXObject nestedXObj = xobjResources.getXObject(xobjName); - if (nestedXObj instanceof PDFormXObject nestedFormXObj) { - processFormXObject( - document, - nestedFormXObj, - targetWords, - useRegex, - wholeWordSearch, - aggressive); - } - } - PDFStreamParser parser = new PDFStreamParser(formXObject); - List tokens = new ArrayList<>(); - Object token; - while ((token = parser.parseNextToken()) != null) { - tokens.add(token); - } - List textSegments = extractTextSegmentsFromXObject(xobjResources, tokens); - String completeText = buildCompleteText(textSegments); - List matches = - aggressive - ? findAllMatchesAggressive( - textSegments, tokens, targetWords, useRegex, wholeWordSearch) - : findAllMatches(completeText, targetWords, useRegex, wholeWordSearch); - if (!matches.isEmpty()) { - List redactedTokens = - applyRedactionsToTokens(tokens, textSegments, matches); - writeRedactedContentToXObject(document, formXObject, redactedTokens); - } else if (aggressive && !completeText.isEmpty()) { - WipeResult wr = wipeAllTextShowingOperators(tokens); - writeRedactedContentToXObject(document, formXObject, wr.tokens); - } - } catch (Exception ignored) { - } + public enum FallbackStrategy { + EMBED_WIDTH, + AVERAGE_WIDTH, + LEGACY_SUM } private static class TokenModificationResult { @@ -2843,4 +2986,239 @@ public class RedactionService { List tokens; int modifications; } + + public enum ScrubOption { + REMOVE_ACTUALTEXT, + REMOVE_ALT, + REMOVE_TU, + NORMALIZE_WHITESPACE + } + + public interface SemanticScrubber { + void scrub(PDDocument document, Set options); + } + + private static class GlyphCoverageProbe { + private final PDFont font; + private final Set availableGlyphs; + + public GlyphCoverageProbe(PDFont font) { + this.font = font; + this.availableGlyphs = buildGlyphCoverage(font); + } + + private Set buildGlyphCoverage(PDFont font) { + Set coverage = new HashSet<>(); + if (font == null) return coverage; + + try { + if (font instanceof org.apache.pdfbox.pdmodel.font.PDType0Font) { + for (int cid = 0; cid < 65536; cid++) { + try { + String unicode = font.toUnicode(cid); + if (unicode != null && !unicode.isEmpty()) { + coverage.add(cid); + } + } catch (Exception e) { + // Glyph not available + } + } + } + } catch (Exception e) { + log.debug("Could not build glyph coverage for font: {}", font.getName(), e); + } + return coverage; + } + + public boolean hasGlyph(int codePoint) { + if (font == null) return false; + try { + if (availableGlyphs.contains(codePoint)) { + return true; + } + String testChar = new String(Character.toChars(codePoint)); + byte[] encoded = font.encode(testChar); + return encoded.length > 0; + } catch (Exception e) { + return false; + } + } + + public float getWidthWithFallback( + int codePoint, FallbackStrategy strategy, float fontSize) { + if (hasGlyph(codePoint)) { + try { + String charStr = new String(Character.toChars(codePoint)); + return font.getStringWidth(charStr) / FONT_SCALE_FACTOR * fontSize; + } catch (Exception e) { + // Fall through + } + } + return switch (strategy) { + case EMBED_WIDTH -> getEmbeddedProgramWidth(codePoint, fontSize); + case AVERAGE_WIDTH -> getAverageFontWidth(fontSize); + case LEGACY_SUM -> getLegacySumFallback(codePoint, fontSize); + }; + } + + private float getEmbeddedProgramWidth(int codePoint, float fontSize) { + try { + if (font.getFontDescriptor() != null) { + float avgWidth = font.getFontDescriptor().getAverageWidth(); + if (avgWidth > 0) { + return avgWidth / FONT_SCALE_FACTOR * fontSize; + } + } + return getAverageFontWidth(fontSize); + } catch (Exception e) { + return getAverageFontWidth(fontSize); + } + } + + private float getAverageFontWidth(float fontSize) { + try { + String[] testChars = {"a", "e", "i", "o", "u", "n", "r", "t", "s"}; + float totalWidth = 0; + int validChars = 0; + + for (String ch : testChars) { + try { + float width = font.getStringWidth(ch); + if (width > 0) { + totalWidth += width; + validChars++; + } + } catch (Exception e) { + // Skip + } + } + + if (validChars > 0) { + return (totalWidth / validChars) / FONT_SCALE_FACTOR * fontSize; + } + + try { + float spaceWidth = font.getStringWidth(" "); + return spaceWidth / FONT_SCALE_FACTOR * fontSize; + } catch (Exception e) { + return fontSize * 0.5f; + } + } catch (Exception e) { + return fontSize * 0.5f; + } + } + + private float getLegacySumFallback(int codePoint, float fontSize) { + return fontSize * 0.6f; + } + } + + public static class DefaultSemanticScrubber implements SemanticScrubber { + + @Override + public void scrub(PDDocument document, Set options) { + if (document == null || options == null || options.isEmpty()) { + return; + } + + log.info("Starting semantic scrub with options: {}", options); + + try { + scrubStructureTree(document, options); + + if (options.contains(ScrubOption.REMOVE_ACTUALTEXT) + || options.contains(ScrubOption.REMOVE_ALT) + || options.contains(ScrubOption.REMOVE_TU)) { + scrubAnnotations(document, options); + } + + log.info("Semantic scrub completed successfully"); + } catch (Exception e) { + log.error("Error during semantic scrub", e); + } + } + + private void scrubStructureTree(PDDocument document, Set options) { + try { + COSDictionary catalog = document.getDocumentCatalog().getCOSObject(); + COSBase structTreeRoot = catalog.getDictionaryObject(COSName.STRUCT_TREE_ROOT); + + if (structTreeRoot instanceof COSDictionary structRoot) { + scrubStructureElement(structRoot, options); + } + } catch (Exception e) { + log.debug("Could not scrub structure tree", e); + } + } + + private void scrubStructureElement(COSDictionary element, Set options) { + if (element == null) return; + + if (options.contains(ScrubOption.REMOVE_ACTUALTEXT)) { + element.removeItem(COSName.ACTUAL_TEXT); + } + if (options.contains(ScrubOption.REMOVE_ALT)) { + element.removeItem(COSName.ALT); + } + if (options.contains(ScrubOption.REMOVE_TU)) { + element.removeItem(COSName.TU); + } + + if (options.contains(ScrubOption.NORMALIZE_WHITESPACE)) { + normalizeWhitespaceInElement(element); + } + + COSBase kids = element.getDictionaryObject(COSName.K); + if (kids instanceof COSArray kidsArray) { + for (COSBase kid : kidsArray) { + if (kid instanceof COSDictionary kidDict) { + scrubStructureElement(kidDict, options); + } + } + } else if (kids instanceof COSDictionary kidDict) { + scrubStructureElement(kidDict, options); + } + } + + private void normalizeWhitespaceInElement(COSDictionary element) { + for (COSName key : List.of(COSName.ACTUAL_TEXT, COSName.ALT, COSName.TU)) { + COSBase value = element.getDictionaryObject(key); + if (value instanceof COSString cosString) { + String text = cosString.getString(); + if (text != null) { + String normalized = text.replaceAll("\\s+", " ").trim(); + if (normalized.length() > 256) { + normalized = normalized.substring(0, 256); + } + element.setString(key, normalized); + } + } + } + } + + private void scrubAnnotations(PDDocument document, Set options) { + try { + for (org.apache.pdfbox.pdmodel.PDPage page : document.getPages()) { + for (org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation annotation : + page.getAnnotations()) { + COSDictionary annotDict = annotation.getCOSObject(); + + if (options.contains(ScrubOption.REMOVE_ACTUALTEXT)) { + annotDict.removeItem(COSName.ACTUAL_TEXT); + } + + if (options.contains(ScrubOption.REMOVE_ALT)) { + annotDict.removeItem(COSName.ALT); + } + + if (options.contains(ScrubOption.REMOVE_TU)) { + annotDict.removeItem(COSName.TU); + } + } + } + } catch (Exception e) { + log.debug("Could not scrub annotations", e); + } + } + } } diff --git a/app/core/src/main/java/stirling/software/SPDF/utils/text/TextEncodingHelper.java b/app/core/src/main/java/stirling/software/SPDF/utils/text/TextEncodingHelper.java index 68625807b..1f96f65d3 100644 --- a/app/core/src/main/java/stirling/software/SPDF/utils/text/TextEncodingHelper.java +++ b/app/core/src/main/java/stirling/software/SPDF/utils/text/TextEncodingHelper.java @@ -1,12 +1,12 @@ package stirling.software.SPDF.utils.text; +import java.util.regex.Pattern; + import org.apache.pdfbox.pdmodel.font.PDFont; import lombok.experimental.UtilityClass; import lombok.extern.slf4j.Slf4j; -import java.util.regex.Pattern; - @Slf4j @UtilityClass public class TextEncodingHelper { @@ -516,5 +516,4 @@ public class TextEncodingHelper { return false; } - } diff --git a/app/core/src/main/resources/messages_en_GB.properties b/app/core/src/main/resources/messages_en_GB.properties index 599dd0989..4a95aa8d9 100644 --- a/app/core/src/main/resources/messages_en_GB.properties +++ b/app/core/src/main/resources/messages_en_GB.properties @@ -921,6 +921,15 @@ autoRedact.wholeWordSearchLabel=Whole Word Search autoRedact.customPaddingLabel=Custom Extra Padding autoRedact.convertPDFToImageLabel=Convert PDF to PDF-Image (Used to remove text behind the box) autoRedact.submitButton=Submit +autoRedact.pdfImageLabel=PDF Image +autoRedact.redactionStyleLabel=Redaction Style +autoRedact.pdfImageDescription=For maximum security, uses an image-based method to ensure text is unrecoverable. May slightly affect document quality. +autoRedact.visualRedactionLabel=Visual +autoRedact.visualRedactionDescription=Converts to image with visual redactions for maximum security. +autoRedact.deleteTextLabel=Delete Text +autoRedact.deleteTextDescription=Removes the text completely. This may alter the original layout or leave a gap. +autoRedact.keepLayoutLabel=Keep Layout +autoRedact.keepLayoutDescription=Covers text with a redaction box, preserving the page's original design. #redact redact.title=Manual Redaction diff --git a/app/core/src/main/resources/templates/security/auto-redact.html b/app/core/src/main/resources/templates/security/auto-redact.html index 7c0a5b626..6a0061c4d 100644 --- a/app/core/src/main/resources/templates/security/auto-redact.html +++ b/app/core/src/main/resources/templates/security/auto-redact.html @@ -23,6 +23,15 @@ background-color: #0d6efd; border-color: #0d6efd; } + + /* OCR language list styling */ + #languages { + max-height: 400px; + overflow-y: auto; + border: 1px solid var(--md-sys-color-surface-3); + border-radius: 5px; + padding: 10px; + } @@ -62,27 +71,27 @@
- +
- - Converts to image with visual redactions for maximum security. + + Converts to image with visual redactions for maximum security.
- - Removes the text completely. This may alter the original layout or leave a gap. + + Removes the text completely. This may alter the original layout or leave a gap.
- - Covers text with a redaction box, preserving the page's original design. + + Covers text with a redaction box, preserving the page's original design.
- - For maximum security, uses an image-based method to ensure text is unrecoverable. May slightly affect document quality. + + For maximum security, uses an image-based method to ensure text is unrecoverable. May slightly affect document quality.
@@ -113,22 +122,14 @@
-
- - +
+ +
+
+ + +
+
Used when OCR restoration is needed
@@ -144,7 +145,7 @@
-