diff --git a/app/core/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java b/app/core/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java index 51d5e5a53..900770ef9 100644 --- a/app/core/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java +++ b/app/core/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java @@ -1,39 +1,9 @@ package stirling.software.SPDF.controller.api.security; -import java.awt.Color; -import java.io.ByteArrayOutputStream; import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashMap; import java.util.List; -import java.util.Map; import java.util.Objects; -import java.util.Set; -import java.util.regex.Pattern; -import java.util.stream.Collectors; -import org.apache.pdfbox.contentstream.operator.Operator; -import org.apache.pdfbox.cos.COSArray; -import org.apache.pdfbox.cos.COSBase; -import org.apache.pdfbox.cos.COSFloat; -import org.apache.pdfbox.cos.COSName; -import org.apache.pdfbox.cos.COSNumber; -import org.apache.pdfbox.cos.COSString; -import org.apache.pdfbox.pdfparser.PDFStreamParser; -import org.apache.pdfbox.pdfwriter.ContentStreamWriter; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.PDPage; -import org.apache.pdfbox.pdmodel.PDPageContentStream; -import org.apache.pdfbox.pdmodel.PDPageTree; -import org.apache.pdfbox.pdmodel.PDResources; -import org.apache.pdfbox.pdmodel.common.PDRectangle; -import org.apache.pdfbox.pdmodel.common.PDStream; -import org.apache.pdfbox.pdmodel.font.PDFont; -import org.apache.pdfbox.pdmodel.graphics.PDXObject; -import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject; import org.springframework.http.ResponseEntity; import org.springframework.web.bind.WebDataBinder; import org.springframework.web.bind.annotation.InitBinder; @@ -41,51 +11,26 @@ import org.springframework.web.bind.annotation.ModelAttribute; import org.springframework.web.bind.annotation.PostMapping; import org.springframework.web.bind.annotation.RequestMapping; import org.springframework.web.bind.annotation.RestController; -import org.springframework.web.multipart.MultipartFile; import io.github.pixee.security.Filenames; import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.tags.Tag; -import lombok.AllArgsConstructor; -import lombok.Data; import lombok.RequiredArgsConstructor; -import lombok.extern.slf4j.Slf4j; -import stirling.software.SPDF.model.PDFText; import stirling.software.SPDF.model.api.security.ManualRedactPdfRequest; import stirling.software.SPDF.model.api.security.RedactPdfRequest; -import stirling.software.SPDF.pdf.TextFinder; -import stirling.software.SPDF.utils.text.TextEncodingHelper; -import stirling.software.SPDF.utils.text.TextFinderUtils; -import stirling.software.SPDF.utils.text.WidthCalculator; -import stirling.software.common.model.api.security.RedactionArea; -import stirling.software.common.service.CustomPDFDocumentFactory; -import stirling.software.common.util.GeneralUtils; -import stirling.software.common.util.PdfUtils; +import stirling.software.SPDF.service.RedactionService; import stirling.software.common.util.WebResponseUtils; import stirling.software.common.util.propertyeditor.StringToArrayListPropertyEditor; @RestController @RequestMapping("/api/v1/security") -@Slf4j @Tag(name = "Security", description = "Security APIs") @RequiredArgsConstructor 
public class RedactController { - private static final float DEFAULT_TEXT_PADDING_MULTIPLIER = 0.6f; - private static final float PRECISION_THRESHOLD = 1e-3f; - private static final int FONT_SCALE_FACTOR = 1000; - - // Redaction box width reduction factor (10% reduction) - private static final float REDACTION_WIDTH_REDUCTION_FACTOR = 0.9f; - - // Text showing operators - private static final Set TEXT_SHOWING_OPERATORS = Set.of("Tj", "TJ", "'", "\""); - - private static final COSString EMPTY_COS_STRING = new COSString(""); - - private final CustomPDFDocumentFactory pdfDocumentFactory; + private final RedactionService redactionService; private String removeFileExtension(String filename) { return filename.replaceFirst("[.][^.]+$", ""); @@ -106,392 +51,14 @@ public class RedactController { + "Input:PDF Output:PDF Type:SISO") public ResponseEntity redactPDF(@ModelAttribute ManualRedactPdfRequest request) throws IOException { - - MultipartFile file = request.getFileInput(); - List redactionAreas = request.getRedactions(); - - try (PDDocument document = pdfDocumentFactory.load(file)) { - PDPageTree allPages = document.getDocumentCatalog().getPages(); - - redactPages(request, document, allPages); - - redactAreas(redactionAreas, document, allPages); - - if (Boolean.TRUE.equals(request.getConvertPDFToImage())) { - try (PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document)) { - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - convertedPdf.save(baos); - byte[] pdfContent = baos.toByteArray(); - - return WebResponseUtils.bytesToWebResponse( - pdfContent, - removeFileExtension( - Objects.requireNonNull( - Filenames.toSimpleFileName( - file.getOriginalFilename()))) - + "_redacted.pdf"); - } - } - - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - document.save(baos); - byte[] pdfContent = baos.toByteArray(); - - return WebResponseUtils.bytesToWebResponse( - pdfContent, - removeFileExtension( - Objects.requireNonNull( - Filenames.toSimpleFileName(file.getOriginalFilename()))) - + "_redacted.pdf"); - } - } - - private void redactAreas( - List redactionAreas, PDDocument document, PDPageTree allPages) - throws IOException { - - if (redactionAreas == null || redactionAreas.isEmpty()) { - return; - } - - Map> redactionsByPage = new HashMap<>(); - - for (RedactionArea redactionArea : redactionAreas) { - - if (redactionArea.getPage() == null - || redactionArea.getPage() <= 0 - || redactionArea.getHeight() == null - || redactionArea.getHeight() <= 0.0D - || redactionArea.getWidth() == null - || redactionArea.getWidth() <= 0.0D) { - continue; - } - - redactionsByPage - .computeIfAbsent(redactionArea.getPage(), k -> new ArrayList<>()) - .add(redactionArea); - } - - for (Map.Entry> entry : redactionsByPage.entrySet()) { - Integer pageNumber = entry.getKey(); - List areasForPage = entry.getValue(); - - if (pageNumber > allPages.getCount()) { - continue; // Skip if the page number is out of bounds - } - - PDPage page = allPages.get(pageNumber - 1); - - try (PDPageContentStream contentStream = - new PDPageContentStream( - document, page, PDPageContentStream.AppendMode.APPEND, true, true)) { - - contentStream.saveGraphicsState(); - for (RedactionArea redactionArea : areasForPage) { - Color redactColor = decodeOrDefault(redactionArea.getColor()); - - contentStream.setNonStrokingColor(redactColor); - - float x = redactionArea.getX().floatValue(); - float y = redactionArea.getY().floatValue(); - float width = redactionArea.getWidth().floatValue(); - float height = 
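/*
 * Note on the coordinate flip computed just below: redaction areas appear to arrive
 * with y measured down from the top of the page, while PDF user space puts the origin
 * at the bottom-left. The box is therefore drawn at pdfY = pageHeight - y - height.
 * Illustrative numbers (assumed, not from the diff): pageHeight = 842, y = 100,
 * height = 20 gives pdfY = 842 - 100 - 20 = 722, so the bottom edge of the box sits
 * 722 units above the bottom of the page.
 */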
redactionArea.getHeight().floatValue(); - - float pdfY = page.getBBox().getHeight() - y - height; - - contentStream.addRect(x, pdfY, width, height); - contentStream.fill(); - } - contentStream.restoreGraphicsState(); - } - } - } - - private void redactPages( - ManualRedactPdfRequest request, PDDocument document, PDPageTree allPages) - throws IOException { - - Color redactColor = decodeOrDefault(request.getPageRedactionColor()); - List pageNumbers = getPageNumbers(request, allPages.getCount()); - - for (Integer pageNumber : pageNumbers) { - - PDPage page = allPages.get(pageNumber); - - try (PDPageContentStream contentStream = - new PDPageContentStream( - document, page, PDPageContentStream.AppendMode.APPEND, true, true)) { - contentStream.setNonStrokingColor(redactColor); - - PDRectangle box = page.getBBox(); - - contentStream.addRect(0, 0, box.getWidth(), box.getHeight()); - contentStream.fill(); - } - } - } - - private void redactFoundText( - PDDocument document, - List blocks, - float customPadding, - Color redactColor, - boolean isTextRemovalMode) - throws IOException { - - var allPages = document.getDocumentCatalog().getPages(); - - Map> blocksByPage = new HashMap<>(); - for (PDFText block : blocks) { - blocksByPage.computeIfAbsent(block.getPageIndex(), k -> new ArrayList<>()).add(block); - } - - for (Map.Entry> entry : blocksByPage.entrySet()) { - Integer pageIndex = entry.getKey(); - List pageBlocks = entry.getValue(); - - if (pageIndex >= allPages.getCount()) { - continue; // Skip if page index is out of bounds - } - - var page = allPages.get(pageIndex); - try (PDPageContentStream contentStream = - new PDPageContentStream( - document, page, PDPageContentStream.AppendMode.APPEND, true, true)) { - - contentStream.saveGraphicsState(); - - try { - contentStream.setNonStrokingColor(redactColor); - PDRectangle pageBox = page.getBBox(); - - for (PDFText block : pageBlocks) { - float padding = - (block.getY2() - block.getY1()) * DEFAULT_TEXT_PADDING_MULTIPLIER - + customPadding; - - float originalWidth = block.getX2() - block.getX1(); - float boxWidth; - float boxX; - - // Only apply width reduction when text is actually being removed - if (isTextRemovalMode) { - // Calculate reduced width and center the box - boxWidth = - originalWidth - * REDACTION_WIDTH_REDUCTION_FACTOR; // 10% reduction - float widthReduction = originalWidth - boxWidth; - boxX = block.getX1() + (widthReduction / 2); // Center the reduced box - } else { - // Use original width for box-only redaction - boxWidth = originalWidth; - boxX = block.getX1(); - } - - contentStream.addRect( - boxX, - pageBox.getHeight() - block.getY2() - padding, - boxWidth, - block.getY2() - block.getY1() + 2 * padding); - } - - contentStream.fill(); - - } finally { - contentStream.restoreGraphicsState(); - } - } - } - } - - String createPlaceholderWithFont(String originalWord, PDFont font) { - if (originalWord == null || originalWord.isEmpty()) { - return originalWord; - } - - if (font != null && TextEncodingHelper.isFontSubset(font.getName())) { - try { - float originalWidth = safeGetStringWidth(font, originalWord) / FONT_SCALE_FACTOR; - return createAlternativePlaceholder(originalWord, originalWidth, font, 1.0f); - } catch (Exception e) { - log.debug( - "Subset font placeholder creation failed for {}: {}", - font.getName(), - e.getMessage()); - return ""; - } - } - - return " ".repeat(originalWord.length()); - } - - /** - * Enhanced placeholder creation using advanced width calculation. 
Incorporates font validation - * and sophisticated fallback strategies. - */ - String createPlaceholderWithWidth( - String originalWord, float targetWidth, PDFont font, float fontSize) { - if (originalWord == null || originalWord.isEmpty()) { - return originalWord; - } - - if (font == null || fontSize <= 0) { - return " ".repeat(originalWord.length()); - } - - try { - // Check font reliability before proceeding - if (!WidthCalculator.isWidthCalculationReliable(font)) { - log.debug( - "Font {} unreliable for width calculation, using simple placeholder", - font.getName()); - return " ".repeat(originalWord.length()); - } - - // Use enhanced subset font detection - if (TextEncodingHelper.isFontSubset(font.getName())) { - return createSubsetFontPlaceholder(originalWord, targetWidth, font, fontSize); - } - - // Enhanced space width calculation - float spaceWidth = WidthCalculator.calculateAccurateWidth(font, " ", fontSize); - - if (spaceWidth <= 0) { - return createAlternativePlaceholder(originalWord, targetWidth, font, fontSize); - } - - int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth)); - - // More conservative space limit based on original word characteristics - int maxSpaces = - Math.max( - originalWord.length() * 2, Math.round(targetWidth / spaceWidth * 1.5f)); - spaceCount = Math.min(spaceCount, maxSpaces); - - return " ".repeat(spaceCount); - - } catch (Exception e) { - log.debug("Enhanced placeholder creation failed: {}", e.getMessage()); - return createAlternativePlaceholder(originalWord, targetWidth, font, fontSize); - } - } - - private String createSubsetFontPlaceholder( - String originalWord, float targetWidth, PDFont font, float fontSize) { - try { - log.debug("Subset font {} - trying to find replacement characters", font.getName()); - String result = createAlternativePlaceholder(originalWord, targetWidth, font, fontSize); - - if (result.isEmpty()) { - log.debug( - "Subset font {} has no suitable replacement characters, using empty string", - font.getName()); - } - - return result; - - } catch (Exception e) { - log.debug("Subset font placeholder creation failed: {}", e.getMessage()); - return ""; - } - } - - private String createAlternativePlaceholder( - String originalWord, float targetWidth, PDFont font, float fontSize) { - try { - String[] alternatives = {" ", ".", "-", "_", "~", "°", "·"}; - - if (TextEncodingHelper.fontSupportsCharacter(font, " ")) { - float spaceWidth = safeGetStringWidth(font, " ") / FONT_SCALE_FACTOR * fontSize; - if (spaceWidth > 0) { - int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth)); - int maxSpaces = originalWord.length() * 2; - spaceCount = Math.min(spaceCount, maxSpaces); - log.debug("Using spaces for font {}", font.getName()); - return " ".repeat(spaceCount); - } - } - - for (String altChar : alternatives) { - if (" ".equals(altChar)) continue; // Already tried spaces - - try { - if (!TextEncodingHelper.fontSupportsCharacter(font, altChar)) { - continue; - } - - float charWidth = - safeGetStringWidth(font, altChar) / FONT_SCALE_FACTOR * fontSize; - if (charWidth > 0) { - int charCount = Math.max(1, Math.round(targetWidth / charWidth)); - int maxChars = originalWord.length() * 2; - charCount = Math.min(charCount, maxChars); - log.debug( - "Using character '{}' for width calculation but spaces for placeholder in font {}", - altChar, - font.getName()); - - return " ".repeat(charCount); - } - } catch (Exception e) { - } - } - - log.debug( - "All placeholder alternatives failed for font {}, using empty string", - 
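/*
 * Worked example of the space-count math in createPlaceholderWithWidth (values assumed
 * for illustration): targetWidth = 50 pt and spaceWidth = 2.5 pt give
 * spaceCount = max(1, round(50 / 2.5)) = 20. For an 8-character original word the cap is
 * maxSpaces = max(8 * 2, round(50 / 2.5 * 1.5)) = max(16, 30) = 30, so the 20 spaces are
 * kept. The cap only bites when the measured space width is implausibly small relative
 * to the original word.
 */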
font.getName()); - return ""; - - } catch (Exception e) { - log.debug("Alternative placeholder creation failed: {}", e.getMessage()); - return ""; - } - } - - void writeFilteredContentStream(PDDocument document, PDPage page, List tokens) - throws IOException { - - PDStream newStream = new PDStream(document); - - try { - try (var out = newStream.createOutputStream()) { - ContentStreamWriter writer = new ContentStreamWriter(out); - writer.writeTokens(tokens); - } - - page.setContents(newStream); - - } catch (IOException e) { - throw new IOException("Failed to write filtered content stream to page", e); - } - } - - Color decodeOrDefault(String hex) { - if (hex == null) { - return Color.BLACK; - } - - String colorString = hex.startsWith("#") ? hex : "#" + hex; - - try { - return Color.decode(colorString); - } catch (NumberFormatException e) { - return Color.BLACK; - } - } - - boolean isTextShowingOperator(String opName) { - return TEXT_SHOWING_OPERATORS.contains(opName); - } - - private List getPageNumbers(ManualRedactPdfRequest request, int pagesCount) { - String pageNumbersInput = request.getPageNumbers(); - String[] parsedPageNumbers = - pageNumbersInput != null ? pageNumbersInput.split(",") : new String[0]; - List pageNumbers = - GeneralUtils.parsePageList(parsedPageNumbers, pagesCount, false); - Collections.sort(pageNumbers); - return pageNumbers; + byte[] pdfContent = redactionService.redactPDF(request); + return WebResponseUtils.bytesToWebResponse( + pdfContent, + removeFileExtension( + Objects.requireNonNull( + Filenames.toSimpleFileName( + request.getFileInput().getOriginalFilename()))) + + "_redacted.pdf"); } @PostMapping(value = "/auto-redact", consumes = "multipart/form-data") @@ -501,1189 +68,15 @@ public class RedactController { "This endpoint automatically redacts text from a PDF file based on specified patterns. " + "Users can provide text patterns to redact, with options for regex and whole word matching. 
" + "Input:PDF Output:PDF Type:SISO") - public ResponseEntity redactPdf(@ModelAttribute RedactPdfRequest request) { - String[] listOfText = request.getListOfText().split("\n"); - boolean useRegex = Boolean.TRUE.equals(request.getUseRegex()); - boolean wholeWordSearchBool = Boolean.TRUE.equals(request.getWholeWordSearch()); - - if (listOfText.length == 0 || (listOfText.length == 1 && listOfText[0].trim().isEmpty())) { - throw new IllegalArgumentException("No text patterns provided for redaction"); - } - - PDDocument document = null; - PDDocument fallbackDocument = null; - - try { - if (request.getFileInput() == null) { - log.error("File input is null"); - throw new IllegalArgumentException("File input cannot be null"); - } - - document = pdfDocumentFactory.load(request.getFileInput()); - - if (document == null) { - log.error("Failed to load PDF document"); - throw new IllegalArgumentException("Failed to load PDF document"); - } - - Map> allFoundTextsByPage = - findTextToRedact(document, listOfText, useRegex, wholeWordSearchBool); - - int totalMatches = allFoundTextsByPage.values().stream().mapToInt(List::size).sum(); - log.info( - "Redaction scan: {} occurrences across {} pages (patterns={}, regex={}, wholeWord={})", - totalMatches, - allFoundTextsByPage.size(), - listOfText.length, - useRegex, - wholeWordSearchBool); - - if (allFoundTextsByPage.isEmpty()) { - log.info("No text found matching redaction patterns"); - byte[] originalContent; - try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { - document.save(baos); - originalContent = baos.toByteArray(); - } - - return WebResponseUtils.bytesToWebResponse( - originalContent, - removeFileExtension( - Objects.requireNonNull( - Filenames.toSimpleFileName( - request.getFileInput() - .getOriginalFilename()))) - + "_redacted.pdf"); - } - - boolean fallbackToBoxOnlyMode; - try { - fallbackToBoxOnlyMode = - performTextReplacement( - document, - allFoundTextsByPage, - listOfText, - useRegex, - wholeWordSearchBool); - } catch (Exception e) { - log.warn( - "Text replacement redaction failed, falling back to box-only mode: {}", - e.getMessage()); - fallbackToBoxOnlyMode = true; - } - - if (fallbackToBoxOnlyMode) { - log.warn( - "Font compatibility issues detected. 
Using box-only redaction mode for better reliability."); - - fallbackDocument = pdfDocumentFactory.load(request.getFileInput()); - - allFoundTextsByPage = - findTextToRedact( - fallbackDocument, listOfText, useRegex, wholeWordSearchBool); - - byte[] pdfContent = - finalizeRedaction( - fallbackDocument, - allFoundTextsByPage, - request.getRedactColor(), - request.getCustomPadding(), - request.getConvertPDFToImage(), - false); // Box-only mode, use original box sizes - - return WebResponseUtils.bytesToWebResponse( - pdfContent, - removeFileExtension( - Objects.requireNonNull( - Filenames.toSimpleFileName( - request.getFileInput() - .getOriginalFilename()))) - + "_redacted.pdf"); - } - - byte[] pdfContent = - finalizeRedaction( - document, - allFoundTextsByPage, - request.getRedactColor(), - request.getCustomPadding(), - request.getConvertPDFToImage(), - true); // Text removal mode, use reduced box sizes - - return WebResponseUtils.bytesToWebResponse( - pdfContent, - removeFileExtension( - Objects.requireNonNull( - Filenames.toSimpleFileName( - request.getFileInput().getOriginalFilename()))) - + "_redacted.pdf"); - - } catch (Exception e) { - log.error("Redaction operation failed: {}", e.getMessage(), e); - throw new RuntimeException("Failed to perform PDF redaction: " + e.getMessage(), e); - - } finally { - if (document != null) { - try { - if (fallbackDocument == null) { - document.close(); - } - } catch (IOException e) { - log.warn("Failed to close main document: {}", e.getMessage()); - } - } - - if (fallbackDocument != null) { - try { - fallbackDocument.close(); - } catch (IOException e) { - log.warn("Failed to close fallback document: {}", e.getMessage()); - } - } - } - } - - private Map> findTextToRedact( - PDDocument document, String[] listOfText, boolean useRegex, boolean wholeWordSearch) { - Map> allFoundTextsByPage = new HashMap<>(); - - for (String text : listOfText) { - text = text.trim(); - if (text.isEmpty()) continue; - - log.debug( - "Searching for text: '{}' (regex: {}, wholeWord: {})", - text, - useRegex, - wholeWordSearch); - - try { - TextFinder textFinder = new TextFinder(text, useRegex, wholeWordSearch); - textFinder.getText(document); - - List foundTexts = textFinder.getFoundTexts(); - log.debug("TextFinder found {} instances of '{}'", foundTexts.size(), text); - - for (PDFText found : foundTexts) { - allFoundTextsByPage - .computeIfAbsent(found.getPageIndex(), k -> new ArrayList<>()) - .add(found); - log.debug( - "Added match on page {} at ({},{},{},{}): '{}'", - found.getPageIndex(), - found.getX1(), - found.getY1(), - found.getX2(), - found.getY2(), - found.getText()); - } - } catch (Exception e) { - log.error("Error processing search term '{}': {}", text, e.getMessage()); - } - } - - return allFoundTextsByPage; - } - - private boolean performTextReplacement( - PDDocument document, - Map> allFoundTextsByPage, - String[] listOfText, - boolean useRegex, - boolean wholeWordSearchBool) { - if (allFoundTextsByPage.isEmpty()) { - return false; - } - - if (detectCustomEncodingFonts(document)) { - log.warn( - "Custom encoded fonts detected (non-standard encodings / DictionaryEncoding / damaged fonts). " - + "Text replacement is unreliable for these fonts. 
Falling back to box-only redaction mode."); - return true; // signal caller to fall back - } - - try { - Set allSearchTerms = - Arrays.stream(listOfText) - .map(String::trim) - .filter(s -> !s.isEmpty()) - .collect(Collectors.toSet()); - - int pageCount = 0; - for (PDPage page : document.getPages()) { - pageCount++; - List filteredTokens = - createTokensWithoutTargetText( - document, page, allSearchTerms, useRegex, wholeWordSearchBool); - writeFilteredContentStream(document, page, filteredTokens); - } - log.info("Successfully performed text replacement redaction on {} pages.", pageCount); - return false; - } catch (Exception e) { - log.error( - "Text replacement redaction failed due to font or encoding issues. " - + "Will fall back to box-only redaction mode. Error: {}", - e.getMessage()); - return true; - } - } - - private byte[] finalizeRedaction( - PDDocument document, - Map> allFoundTextsByPage, - String colorString, - float customPadding, - Boolean convertToImage, - boolean isTextRemovalMode) - throws IOException { - - List allFoundTexts = new ArrayList<>(); - for (List pageTexts : allFoundTextsByPage.values()) { - allFoundTexts.addAll(pageTexts); - } - - if (!allFoundTexts.isEmpty()) { - Color redactColor = decodeOrDefault(colorString); - - redactFoundText(document, allFoundTexts, customPadding, redactColor, isTextRemovalMode); - - cleanDocumentMetadata(document); - } - - if (Boolean.TRUE.equals(convertToImage)) { - try (PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document)) { - cleanDocumentMetadata(convertedPdf); - - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - convertedPdf.save(baos); - byte[] out = baos.toByteArray(); - - log.info( - "Redaction finalized (image mode): {} pages ➜ {} KB", - convertedPdf.getNumberOfPages(), - out.length / 1024); - - return out; - } - } - - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - document.save(baos); - byte[] out = baos.toByteArray(); - - log.info( - "Redaction finalized: {} pages ➜ {} KB", - document.getNumberOfPages(), - out.length / 1024); - - return out; - } - - private void cleanDocumentMetadata(PDDocument document) { - try { - var documentInfo = document.getDocumentInformation(); - if (documentInfo != null) { - documentInfo.setAuthor(null); - documentInfo.setSubject(null); - documentInfo.setKeywords(null); - - documentInfo.setModificationDate(java.util.Calendar.getInstance()); - - log.debug("Cleaned document metadata for security"); - } - - if (document.getDocumentCatalog() != null) { - try { - document.getDocumentCatalog().setMetadata(null); - } catch (Exception e) { - log.debug("Could not clear XMP metadata: {}", e.getMessage()); - } - } - - } catch (Exception e) { - log.warn("Failed to clean document metadata: {}", e.getMessage()); - } - } - - List createTokensWithoutTargetText( - PDDocument document, - PDPage page, - Set targetWords, - boolean useRegex, - boolean wholeWordSearch) - throws IOException { - - PDFStreamParser parser = new PDFStreamParser(page); - List tokens = new ArrayList<>(); - Object token; - while ((token = parser.parseNextToken()) != null) { - tokens.add(token); - } - - PDResources resources = page.getResources(); - if (resources != null) { - processPageXObjects(document, resources, targetWords, useRegex, wholeWordSearch); - } - - List textSegments = extractTextSegments(page, tokens); - - String completeText = buildCompleteText(textSegments); - - List matches = - findAllMatches(completeText, targetWords, useRegex, wholeWordSearch); - - return applyRedactionsToTokens(tokens, 
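/*
 * Pipeline summary for createTokensWithoutTargetText (as implemented above):
 *   1. Parse the page content stream into raw tokens with PDFStreamParser.
 *   2. Recurse into Form XObjects found in the page resources (processPageXObjects).
 *   3. Extract a TextSegment for each text-showing operator (extractTextSegments).
 *   4. Concatenate the segment texts into one page-wide string (buildCompleteText).
 *   5. Run the search patterns over that string (findAllMatches).
 *   6. Rewrite the affected tokens (applyRedactionsToTokens) and return the new token list.
 */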
textSegments, matches); - } - - private void processPageXObjects( - PDDocument document, - PDResources resources, - Set targetWords, - boolean useRegex, - boolean wholeWordSearch) { - - for (COSName xobjName : resources.getXObjectNames()) { - try { - PDXObject xobj = resources.getXObject(xobjName); - if (xobj instanceof PDFormXObject formXObj) { - processFormXObject(document, formXObj, targetWords, useRegex, wholeWordSearch); - log.debug("Processed Form XObject: {}", xobjName.getName()); - } - } catch (Exception e) { - log.warn("Failed to process XObject {}: {}", xobjName.getName(), e.getMessage()); - } - } - } - - @Data - private static class GraphicsState { - private PDFont font = null; - private float fontSize = 0; - } - - @Data - @AllArgsConstructor - private static class TextSegment { - private int tokenIndex; - private String operatorName; - private String text; - private int startPos; - private int endPos; - private PDFont font; - private float fontSize; - } - - @Data - @AllArgsConstructor - private static class MatchRange { - private int startPos; - private int endPos; - } - - private List extractTextSegments(PDPage page, List tokens) { - - List segments = new ArrayList<>(); - int currentTextPos = 0; - GraphicsState graphicsState = new GraphicsState(); - PDResources resources = page.getResources(); - - for (int i = 0; i < tokens.size(); i++) { - Object currentToken = tokens.get(i); - - if (currentToken instanceof Operator op) { - String opName = op.getName(); - - if ("Tf".equals(opName) && i >= 2) { - try { - COSName fontName = (COSName) tokens.get(i - 2); - COSBase fontSizeBase = (COSBase) tokens.get(i - 1); - if (fontSizeBase instanceof COSNumber cosNumber) { - graphicsState.setFont(resources.getFont(fontName)); - graphicsState.setFontSize(cosNumber.floatValue()); - } - } catch (ClassCastException | IOException e) { - log.debug( - "Failed to extract font and font size from Tf operator: {}", - e.getMessage()); - } - } - - currentTextPos = - getCurrentTextPos( - tokens, segments, currentTextPos, graphicsState, i, opName); - } - } - - return segments; - } - - private String buildCompleteText(List segments) { - StringBuilder sb = new StringBuilder(); - for (TextSegment segment : segments) { - sb.append(segment.text); - } - return sb.toString(); - } - - private List findAllMatches( - String completeText, - Set targetWords, - boolean useRegex, - boolean wholeWordSearch) { - - // Use the new utility for creating optimized patterns - List patterns = - TextFinderUtils.createOptimizedSearchPatterns( - targetWords, useRegex, wholeWordSearch); - - return patterns.stream() - .flatMap( - pattern -> { - try { - return pattern.matcher(completeText).results(); - } catch (Exception e) { - log.debug( - "Pattern matching failed for pattern {}: {}", - pattern.pattern(), - e.getMessage()); - return java.util.stream.Stream.empty(); - } - }) - .map(matchResult -> new MatchRange(matchResult.start(), matchResult.end())) - .sorted(Comparator.comparingInt(MatchRange::getStartPos)) - .collect(Collectors.toList()); - } - - private List applyRedactionsToTokens( - List tokens, List textSegments, List matches) { - - long startTime = System.currentTimeMillis(); - - try { - List newTokens = new ArrayList<>(tokens); - - Map> matchesBySegment = new HashMap<>(); - for (MatchRange match : matches) { - for (int i = 0; i < textSegments.size(); i++) { - TextSegment segment = textSegments.get(i); - int overlapStart = Math.max(match.startPos, segment.startPos); - int overlapEnd = Math.min(match.endPos, segment.endPos); - 
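/*
 * How a page-wide match is mapped onto an individual segment (illustrative numbers):
 * a segment covering positions [120, 150) of the concatenated page text and a match
 * spanning [130, 160) give overlapStart = max(130, 120) = 130 and
 * overlapEnd = min(160, 150) = 150; since 130 < 150 the segment is affected. Inside the
 * segment's own string the replacement range becomes [130 - 120, 150 - 120) = [10, 30),
 * which is what applyRedactionsToSegmentText later clamps and rewrites.
 */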
if (overlapStart < overlapEnd) { - matchesBySegment.computeIfAbsent(i, k -> new ArrayList<>()).add(match); - } - } - } - - List tasks = new ArrayList<>(); - for (Map.Entry> entry : matchesBySegment.entrySet()) { - int segmentIndex = entry.getKey(); - List segmentMatches = entry.getValue(); - TextSegment segment = textSegments.get(segmentIndex); - - if ("Tj".equals(segment.operatorName) || "'".equals(segment.operatorName)) { - String newText = applyRedactionsToSegmentText(segment, segmentMatches); - try { - float adjustment = calculateWidthAdjustment(segment, segmentMatches); - tasks.add(new ModificationTask(segment, newText, adjustment)); - } catch (Exception e) { - log.debug( - "Width adjustment calculation failed for segment: {}", - e.getMessage()); - } - } else if ("TJ".equals(segment.operatorName)) { - tasks.add(new ModificationTask(segment, null, 0)); - } - } - - tasks.sort((a, b) -> Integer.compare(b.segment.tokenIndex, a.segment.tokenIndex)); - - for (ModificationTask task : tasks) { - List segmentMatches = - matchesBySegment.getOrDefault( - textSegments.indexOf(task.segment), Collections.emptyList()); - modifyTokenForRedaction( - newTokens, task.segment, task.newText, task.adjustment, segmentMatches); - } - - return newTokens; - - } finally { - long processingTime = System.currentTimeMillis() - startTime; - log.debug( - "Token redaction processing completed in {} ms for {} matches", - processingTime, - matches.size()); - } - } - - @Data - @AllArgsConstructor - private static class ModificationTask { - private TextSegment segment; - private String newText; // Only for Tj - private float adjustment; // Only for Tj - } - - private String applyRedactionsToSegmentText(TextSegment segment, List matches) { - String text = segment.getText(); - - if (segment.getFont() != null - && !TextEncodingHelper.isTextSegmentRemovable(segment.getFont(), text)) { - log.debug( - "Skipping text segment '{}' - font {} cannot process this text reliably", - text, - segment.getFont().getName()); - return text; // Return original text unchanged - } - - StringBuilder result = new StringBuilder(text); - - for (MatchRange match : matches) { - int segmentStart = Math.max(0, match.getStartPos() - segment.getStartPos()); - int segmentEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos()); - - if (segmentStart < text.length() && segmentEnd > segmentStart) { - String originalPart = text.substring(segmentStart, segmentEnd); - - if (segment.getFont() != null - && !TextEncodingHelper.isTextSegmentRemovable( - segment.getFont(), originalPart)) { - log.debug( - "Skipping text part '{}' within segment - cannot be processed reliably", - originalPart); - continue; // Skip this match, process others - } - - float originalWidth = 0; - if (segment.getFont() != null && segment.getFontSize() > 0) { - try { - originalWidth = - safeGetStringWidth(segment.getFont(), originalPart) - / FONT_SCALE_FACTOR - * segment.getFontSize(); - } catch (Exception e) { - log.debug( - "Failed to calculate original width for placeholder: {}", - e.getMessage()); - } - } - - String placeholder = - (originalWidth > 0) - ? 
createPlaceholderWithWidth( - originalPart, - originalWidth, - segment.getFont(), - segment.getFontSize()) - : createPlaceholderWithFont(originalPart, segment.getFont()); - - result.replace(segmentStart, segmentEnd, placeholder); - } - } - - return result.toString(); - } - - private float safeGetStringWidth(PDFont font, String text) { - if (font == null || text == null || text.isEmpty()) { - return 0; - } - - if (!WidthCalculator.isWidthCalculationReliable(font)) { - log.debug( - "Font {} flagged as unreliable for width calculation, using fallback", - font.getName()); - return calculateConservativeWidth(font, text); - } - - if (!TextEncodingHelper.canEncodeCharacters(font, text)) { - log.debug( - "Text cannot be encoded by font {}, using character-based fallback", - font.getName()); - return calculateCharacterBasedWidth(font, text); - } - - try { - float width = font.getStringWidth(text); - log.debug("Direct width calculation successful for '{}': {}", text, width); - return width; - - } catch (Exception e) { - log.debug( - "Direct width calculation failed for font {}: {}", - font.getName(), - e.getMessage()); - return calculateFallbackWidth(font, text); - } - } - - private float calculateCharacterBasedWidth(PDFont font, String text) { - try { - float totalWidth = 0; - for (int i = 0; i < text.length(); i++) { - String character = text.substring(i, i + 1); - try { - // Validate character encoding first - if (!TextEncodingHelper.fontSupportsCharacter(font, character)) { - totalWidth += font.getAverageFontWidth(); - continue; - } - - byte[] encoded = font.encode(character); - if (encoded.length > 0) { - int glyphCode = encoded[0] & 0xFF; - float glyphWidth = font.getWidth(glyphCode); - - // Try alternative width methods if primary fails - if (glyphWidth == 0) { - try { - glyphWidth = font.getWidthFromFont(glyphCode); - } catch (Exception e2) { - glyphWidth = font.getAverageFontWidth(); - } - } - - totalWidth += glyphWidth; - } else { - totalWidth += font.getAverageFontWidth(); - } - } catch (Exception e2) { - // Character processing failed, use average width - totalWidth += font.getAverageFontWidth(); - } - } - - log.debug("Character-based width calculation: {}", totalWidth); - return totalWidth; - - } catch (Exception e) { - log.debug("Character-based width calculation failed: {}", e.getMessage()); - return calculateConservativeWidth(font, text); - } - } - - private float calculateFallbackWidth(PDFont font, String text) { - try { - // Method 1: Font bounding box approach - if (font.getFontDescriptor() != null - && font.getFontDescriptor().getFontBoundingBox() != null) { - - PDRectangle bbox = font.getFontDescriptor().getFontBoundingBox(); - float avgCharWidth = bbox.getWidth() * 0.6f; // Conservative estimate - float fallbackWidth = text.length() * avgCharWidth; - - log.debug("Bounding box fallback width: {}", fallbackWidth); - return fallbackWidth; - } - - // Method 2: Average font width - try { - float avgWidth = font.getAverageFontWidth(); - if (avgWidth > 0) { - float fallbackWidth = text.length() * avgWidth; - log.debug("Average width fallback: {}", fallbackWidth); - return fallbackWidth; - } - } catch (Exception e2) { - log.debug("Average font width calculation failed: {}", e2.getMessage()); - } - - // Method 3: Conservative estimate based on font metrics - return calculateConservativeWidth(font, text); - - } catch (Exception e) { - log.debug("Fallback width calculation failed: {}", e.getMessage()); - return calculateConservativeWidth(font, text); - } - } - - private float 
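/*
 * Width lookup order used by safeGetStringWidth (summarized from the code above):
 *   1. Font flagged unreliable by WidthCalculator -> conservative estimate.
 *   2. Text not encodable by the font             -> per-character widths, falling back to
 *                                                    the font's average width per glyph.
 *   3. Otherwise                                  -> font.getStringWidth(text).
 *   4. On exception                               -> bounding-box estimate (0.6 * bbox width
 *                                                    per character), then average width,
 *                                                    then the conservative estimate.
 * All of these return glyph-space units (thousandths of text space); callers divide by
 * FONT_SCALE_FACTOR and multiply by the font size. Example with assumed values: the
 * conservative path for a 7-character string returns 7 * 500 = 3500 units, roughly
 * 3500 / 1000 * 12 = 42 pt at a 12 pt font size.
 */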
calculateConservativeWidth(PDFont font, String text) { - float conservativeWidth = text.length() * 500f; - - log.debug( - "Conservative width estimate for font {} text '{}': {}", - font.getName(), - text, - conservativeWidth); - return conservativeWidth; - } - - private float calculateWidthAdjustment(TextSegment segment, List matches) { - try { - if (segment.getFont() == null || segment.getFontSize() <= 0) { - return 0; - } - - String fontName = segment.getFont().getName(); - if (fontName != null - && (fontName.contains("HOEPAP") || TextEncodingHelper.isFontSubset(fontName))) { - log.debug("Skipping width adjustment for problematic/subset font: {}", fontName); - return 0; - } - - float totalOriginal = 0; - float totalPlaceholder = 0; - - String text = segment.getText(); - - for (MatchRange match : matches) { - int segStart = Math.max(0, match.getStartPos() - segment.getStartPos()); - int segEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos()); - - if (segStart < text.length() && segEnd > segStart) { - String originalPart = text.substring(segStart, segEnd); - - float originalWidth = - safeGetStringWidth(segment.getFont(), originalPart) - / FONT_SCALE_FACTOR - * segment.getFontSize(); - - String placeholderPart = - createPlaceholderWithWidth( - originalPart, - originalWidth, - segment.getFont(), - segment.getFontSize()); - - float origUnits = safeGetStringWidth(segment.getFont(), originalPart); - float placeUnits = safeGetStringWidth(segment.getFont(), placeholderPart); - - float orig = (origUnits / FONT_SCALE_FACTOR) * segment.getFontSize(); - float place = (placeUnits / FONT_SCALE_FACTOR) * segment.getFontSize(); - - totalOriginal += orig; - totalPlaceholder += place; - } - } - - float adjustment = totalOriginal - totalPlaceholder; - - float maxReasonableAdjustment = - Math.max( - segment.getText().length() * segment.getFontSize() * 2, - totalOriginal * 1.5f // Allow up to 50% more than original width - ); - - if (Math.abs(adjustment) > maxReasonableAdjustment) { - log.debug( - "Width adjustment {} seems unreasonable for text length {}, capping to 0", - adjustment, - segment.getText().length()); - return 0; - } - - return adjustment; - } catch (Exception ex) { - log.debug("Width adjustment failed: {}", ex.getMessage()); - return 0; - } - } - - private void modifyTokenForRedaction( - List tokens, - TextSegment segment, - String newText, - float adjustment, - List matches) { - - if (segment.getTokenIndex() < 0 || segment.getTokenIndex() >= tokens.size()) { - return; - } - - Object token = tokens.get(segment.getTokenIndex()); - String operatorName = segment.getOperatorName(); - - try { - if (("Tj".equals(operatorName) || "'".equals(operatorName)) - && token instanceof COSString) { - - if (Math.abs(adjustment) < PRECISION_THRESHOLD) { - if (newText.isEmpty()) { - tokens.set(segment.getTokenIndex(), EMPTY_COS_STRING); - } else { - tokens.set(segment.getTokenIndex(), new COSString(newText)); - } - } else { - COSArray newArray = new COSArray(); - newArray.add(new COSString(newText)); - if (segment.getFontSize() > 0) { - - float kerning = (-adjustment / segment.getFontSize()) * FONT_SCALE_FACTOR; - - newArray.add(new COSFloat(kerning)); - } - tokens.set(segment.getTokenIndex(), newArray); - - int operatorIndex = segment.getTokenIndex() + 1; - if (operatorIndex < tokens.size() - && tokens.get(operatorIndex) instanceof Operator op - && op.getName().equals(operatorName)) { - tokens.set(operatorIndex, Operator.getOperator("TJ")); - } - } - } else if ("TJ".equals(operatorName) && 
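/*
 * Sanity cap in calculateWidthAdjustment, with assumed numbers: for a 10-character
 * segment at 12 pt, the cap is max(10 * 12 * 2, totalOriginal * 1.5). With
 * totalOriginal = 60 pt that is max(240, 90) = 240 pt, so an adjustment of 12 pt is
 * accepted, while anything beyond 240 pt is treated as a miscalculation and zeroed out.
 */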
token instanceof COSArray) { - COSArray newArray = createRedactedTJArray((COSArray) token, segment, matches); - tokens.set(segment.getTokenIndex(), newArray); - } - } catch (Exception e) { - log.debug( - "Token modification failed for segment at index {}: {}", - segment.getTokenIndex(), - e.getMessage()); - } - } - - private COSArray createRedactedTJArray( - COSArray originalArray, TextSegment segment, List matches) { - try { - COSArray newArray = new COSArray(); - int textOffsetInSegment = 0; - - for (COSBase element : originalArray) { - if (element instanceof COSString cosString) { - String originalText = cosString.getString(); - - if (segment.getFont() != null - && !TextEncodingHelper.isTextSegmentRemovable( - segment.getFont(), originalText)) { - log.debug( - "Skipping TJ text part '{}' - cannot be processed reliably with font {}", - originalText, - segment.getFont().getName()); - newArray.add(element); // Keep original unchanged - textOffsetInSegment += originalText.length(); - continue; - } - - StringBuilder newText = new StringBuilder(originalText); - boolean modified = false; - - for (MatchRange match : matches) { - int stringStartInPage = segment.getStartPos() + textOffsetInSegment; - int stringEndInPage = stringStartInPage + originalText.length(); - - int overlapStart = Math.max(match.getStartPos(), stringStartInPage); - int overlapEnd = Math.min(match.getEndPos(), stringEndInPage); - - if (overlapStart < overlapEnd) { - int redactionStartInString = overlapStart - stringStartInPage; - int redactionEndInString = overlapEnd - stringStartInPage; - if (redactionStartInString >= 0 - && redactionEndInString <= originalText.length()) { - String originalPart = - originalText.substring( - redactionStartInString, redactionEndInString); - - if (segment.getFont() != null - && !TextEncodingHelper.isTextSegmentRemovable( - segment.getFont(), originalPart)) { - log.debug( - "Skipping TJ text part '{}' - cannot be redacted reliably", - originalPart); - continue; // Skip this redaction, keep original text - } - - modified = true; - float originalWidth = 0; - if (segment.getFont() != null && segment.getFontSize() > 0) { - try { - originalWidth = - safeGetStringWidth(segment.getFont(), originalPart) - / FONT_SCALE_FACTOR - * segment.getFontSize(); - } catch (Exception e) { - log.debug( - "Failed to calculate original width for TJ placeholder: {}", - e.getMessage()); - } - } - - String placeholder = - (originalWidth > 0) - ? 
createPlaceholderWithWidth( - originalPart, - originalWidth, - segment.getFont(), - segment.getFontSize()) - : createPlaceholderWithFont( - originalPart, segment.getFont()); - - newText.replace( - redactionStartInString, redactionEndInString, placeholder); - } - } - } - - String modifiedString = newText.toString(); - newArray.add(new COSString(modifiedString)); - - if (modified && segment.getFont() != null && segment.getFontSize() > 0) { - try { - float originalWidth = - safeGetStringWidth(segment.getFont(), originalText) - / FONT_SCALE_FACTOR - * segment.getFontSize(); - float modifiedWidth = - safeGetStringWidth(segment.getFont(), modifiedString) - / FONT_SCALE_FACTOR - * segment.getFontSize(); - float adjustment = originalWidth - modifiedWidth; - if (Math.abs(adjustment) > PRECISION_THRESHOLD) { - float kerning = - (-adjustment / segment.getFontSize()) - * FONT_SCALE_FACTOR - * 1.10f; - - newArray.add(new COSFloat(kerning)); - } - } catch (Exception e) { - log.debug( - "Width adjustment calculation failed for segment: {}", - e.getMessage()); - } - } - - textOffsetInSegment += originalText.length(); - } else { - newArray.add(element); - } - } - return newArray; - } catch (Exception e) { - return originalArray; - } - } - - private String extractTextFromToken(Object token, String operatorName) { - return switch (operatorName) { - case "Tj", "'" -> { - if (token instanceof COSString cosString) { - yield cosString.getString(); - } - yield ""; - } - case "TJ" -> { - if (token instanceof COSArray cosArray) { - StringBuilder sb = new StringBuilder(); - for (COSBase element : cosArray) { - if (element instanceof COSString cosString) { - sb.append(cosString.getString()); - } - } - yield sb.toString(); - } - yield ""; - } - default -> ""; - }; - } - - private boolean detectCustomEncodingFonts(PDDocument document) { - try { - var documentCatalog = document.getDocumentCatalog(); - if (documentCatalog == null) { - return false; - } - - int totalFonts = 0; - int customEncodedFonts = 0; - int subsetFonts = 0; - int unreliableFonts = 0; - - for (PDPage page : document.getPages()) { - if (TextFinderUtils.hasProblematicFonts(page)) { - log.debug("Page contains fonts flagged as problematic by TextFinderUtils"); - } - - PDResources resources = page.getResources(); - if (resources == null) { - continue; - } - - for (COSName fontName : resources.getFontNames()) { - try { - PDFont font = resources.getFont(fontName); - if (font != null) { - totalFonts++; - - // Enhanced analysis using helper classes - boolean isSubset = TextEncodingHelper.isFontSubset(font.getName()); - boolean hasCustomEncoding = TextEncodingHelper.hasCustomEncoding(font); - boolean isReliable = WidthCalculator.isWidthCalculationReliable(font); - boolean canCalculateWidths = - TextEncodingHelper.canCalculateBasicWidths(font); - - if (isSubset) { - subsetFonts++; - } - - if (hasCustomEncoding) { - customEncodedFonts++; - log.debug("Font {} has custom encoding", font.getName()); - } - - if (!isReliable || !canCalculateWidths) { - unreliableFonts++; - log.debug( - "Font {} flagged as unreliable: reliable={}, canCalculateWidths={}", - font.getName(), - isReliable, - canCalculateWidths); - } - - if (!TextFinderUtils.validateFontReliability(font)) { - log.debug( - "Font {} failed comprehensive reliability check", - font.getName()); - } - } - } catch (Exception e) { - log.debug( - "Font loading/analysis failed for {}: {}", - fontName.getName(), - e.getMessage()); - customEncodedFonts++; - unreliableFonts++; - totalFonts++; - } - } - } - - log.info( - 
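/*
 * Kerning compensation in the rebuilt TJ array, with assumed numbers: numbers in a TJ
 * array are thousandths of text space and are subtracted from the glyph displacement,
 * so a negative value pushes the following text to the right. With fontSize = 12,
 * originalWidth = 30 pt and a placeholder measuring 18 pt, adjustment = 12 pt and
 * kerning = (-12 / 12) * 1000 * 1.10 = -1100, which advances the cursor by
 * 1100 / 1000 * 12 = 13.2 pt (the 10% overshoot comes from the 1.10f factor above).
 */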
"Enhanced font analysis: {}/{} custom encoding, {}/{} subset, {}/{} unreliable fonts", - customEncodedFonts, - totalFonts, - subsetFonts, - totalFonts, - unreliableFonts, - totalFonts); - - // Consider document problematic if we have custom encodings or unreliable fonts - return customEncodedFonts > 0 || unreliableFonts > 0; - - } catch (Exception e) { - log.warn("Enhanced font detection analysis failed: {}", e.getMessage()); - return true; // Assume problematic if analysis fails - } - } - - private void processFormXObject( - PDDocument document, - PDFormXObject formXObject, - Set targetWords, - boolean useRegex, - boolean wholeWordSearch) { - - try { - PDResources xobjResources = formXObject.getResources(); - if (xobjResources == null) { - return; - } - - for (COSName xobjName : xobjResources.getXObjectNames()) { - PDXObject nestedXObj = xobjResources.getXObject(xobjName); - if (nestedXObj instanceof PDFormXObject nestedFormXObj) { - processFormXObject( - document, nestedFormXObj, targetWords, useRegex, wholeWordSearch); - } - } - - PDFStreamParser parser = new PDFStreamParser(formXObject); - List tokens = new ArrayList<>(); - Object token; - while ((token = parser.parseNextToken()) != null) { - tokens.add(token); - } - - List textSegments = extractTextSegmentsFromXObject(xobjResources, tokens); - String completeText = buildCompleteText(textSegments); - - List matches = - findAllMatches(completeText, targetWords, useRegex, wholeWordSearch); - - if (!matches.isEmpty()) { - List redactedTokens = - applyRedactionsToTokens(tokens, textSegments, matches); - writeRedactedContentToXObject(document, formXObject, redactedTokens); - log.debug("Processed {} redactions in Form XObject", matches.size()); - } - - } catch (Exception e) { - log.warn("Failed to process Form XObject: {}", e.getMessage()); - } - } - - private List extractTextSegmentsFromXObject( - PDResources resources, List tokens) { - List segments = new ArrayList<>(); - int currentTextPos = 0; - GraphicsState graphicsState = new GraphicsState(); - - for (int i = 0; i < tokens.size(); i++) { - Object currentToken = tokens.get(i); - - if (currentToken instanceof Operator op) { - String opName = op.getName(); - - if ("Tf".equals(opName) && i >= 2) { - try { - COSName fontName = (COSName) tokens.get(i - 2); - COSBase fontSizeBase = (COSBase) tokens.get(i - 1); - if (fontSizeBase instanceof COSNumber cosNumber) { - graphicsState.setFont(resources.getFont(fontName)); - graphicsState.setFontSize(cosNumber.floatValue()); - } - } catch (ClassCastException | IOException e) { - log.debug("Font extraction failed in XObject: {}", e.getMessage()); - } - } - - currentTextPos = - getCurrentTextPos( - tokens, segments, currentTextPos, graphicsState, i, opName); - } - } - - return segments; - } - - private int getCurrentTextPos( - List tokens, - List segments, - int currentTextPos, - GraphicsState graphicsState, - int i, - String opName) { - if (isTextShowingOperator(opName) && i > 0) { - String textContent = extractTextFromToken(tokens.get(i - 1), opName); - if (!textContent.isEmpty()) { - segments.add( - new TextSegment( - i - 1, - opName, - textContent, - currentTextPos, - currentTextPos + textContent.length(), - graphicsState.font, - graphicsState.fontSize)); - currentTextPos += textContent.length(); - } - } - return currentTextPos; - } - - private void writeRedactedContentToXObject( - PDDocument document, PDFormXObject formXObject, List redactedTokens) - throws IOException { - - PDStream newStream = new PDStream(document); - - try (var out = 
newStream.createOutputStream()) { - ContentStreamWriter writer = new ContentStreamWriter(out); - writer.writeTokens(redactedTokens); - } - - formXObject.getCOSObject().removeItem(COSName.CONTENTS); - formXObject.getCOSObject().setItem(COSName.CONTENTS, newStream.getCOSObject()); + public ResponseEntity redactPdf(@ModelAttribute RedactPdfRequest request) + throws IOException { + byte[] pdfContent = redactionService.redactPdf(request); + return WebResponseUtils.bytesToWebResponse( + pdfContent, + removeFileExtension( + Objects.requireNonNull( + Filenames.toSimpleFileName( + request.getFileInput().getOriginalFilename()))) + + "_redacted.pdf"); } } diff --git a/app/core/src/main/java/stirling/software/SPDF/model/api/security/RedactPdfRequest.java b/app/core/src/main/java/stirling/software/SPDF/model/api/security/RedactPdfRequest.java index 279a41a27..3cd42e410 100644 --- a/app/core/src/main/java/stirling/software/SPDF/model/api/security/RedactPdfRequest.java +++ b/app/core/src/main/java/stirling/software/SPDF/model/api/security/RedactPdfRequest.java @@ -46,4 +46,11 @@ public class RedactPdfRequest extends PDFFile { defaultValue = "false", requiredMode = Schema.RequiredMode.REQUIRED) private Boolean convertPDFToImage; + + @Schema( + description = "Redaction mode: moderate, visual, or aggressive", + defaultValue = "moderate", + allowableValues = {"moderate", "visual", "aggressive"}, + requiredMode = Schema.RequiredMode.NOT_REQUIRED) + private String redactionMode; } diff --git a/app/core/src/main/java/stirling/software/SPDF/service/AggressiveRedactionService.java b/app/core/src/main/java/stirling/software/SPDF/service/AggressiveRedactionService.java new file mode 100644 index 000000000..f438c49e4 --- /dev/null +++ b/app/core/src/main/java/stirling/software/SPDF/service/AggressiveRedactionService.java @@ -0,0 +1,85 @@ +package stirling.software.SPDF.service; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.List; +import java.util.Map; + +import org.apache.pdfbox.pdmodel.PDDocument; + +import stirling.software.SPDF.model.PDFText; +import stirling.software.SPDF.model.api.security.RedactPdfRequest; +import stirling.software.common.service.CustomPDFDocumentFactory; + +class AggressiveRedactionService implements RedactionModeStrategy { + + private final CustomPDFDocumentFactory pdfDocumentFactory; + private final RedactionService helper; + + AggressiveRedactionService( + CustomPDFDocumentFactory pdfDocumentFactory, RedactionService helper) { + this.pdfDocumentFactory = pdfDocumentFactory; + this.helper = helper; + } + + @Override + public byte[] redact(RedactPdfRequest request) throws IOException { + String[] listOfText = request.getListOfText().split("\n"); + boolean useRegex = Boolean.TRUE.equals(request.getUseRegex()); + boolean wholeWord = Boolean.TRUE.equals(request.getWholeWordSearch()); + + PDDocument doc = null; + PDDocument fb = null; + try { + doc = pdfDocumentFactory.load(request.getFileInput()); + Map> allFound = + RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord); + if (allFound.isEmpty()) { + try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + doc.save(baos); + return baos.toByteArray(); + } + } + helper.performTextReplacementAggressive(doc, allFound, listOfText, useRegex, wholeWord); + Map> residual = + RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord); + boolean residualExists = residual.values().stream().mapToInt(List::size).sum() > 0; + String effectiveColor = + (request.getRedactColor() 
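/*
 * Flow of the aggressive mode shown here: the document is scanned, text replacement is
 * attempted in place (performTextReplacementAggressive), and the same scan is then run
 * again. If any occurrences survive, the original upload is reloaded and handed to
 * finalizeRedaction in box-only mode with the convert-to-image flag forced on and a
 * default color of #000000 when none was supplied; otherwise the text-removed document
 * is finalized using the options from the request.
 */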
== null || request.getRedactColor().isBlank()) + ? "#000000" + : request.getRedactColor(); + if (residualExists) { + fb = pdfDocumentFactory.load(request.getFileInput()); + Map> fbFound = + RedactionService.findTextToRedact(fb, listOfText, useRegex, wholeWord); + return RedactionService.finalizeRedaction( + fb, + fbFound, + effectiveColor, + request.getCustomPadding(), /*force*/ + true, + false); + } + return RedactionService.finalizeRedaction( + doc, + allFound, + request.getRedactColor(), + request.getCustomPadding(), + request.getConvertPDFToImage(), /*text removal*/ + true); + } catch (Exception e) { + throw new IOException("Aggressive redaction failed: " + e.getMessage(), e); + } finally { + if (doc != null) + try { + doc.close(); + } catch (IOException ignore) { + } + if (fb != null) + try { + fb.close(); + } catch (IOException ignore) { + } + } + } +} diff --git a/app/core/src/main/java/stirling/software/SPDF/service/ModerateRedactionService.java b/app/core/src/main/java/stirling/software/SPDF/service/ModerateRedactionService.java new file mode 100644 index 000000000..cf1d0d0a1 --- /dev/null +++ b/app/core/src/main/java/stirling/software/SPDF/service/ModerateRedactionService.java @@ -0,0 +1,83 @@ +package stirling.software.SPDF.service; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.List; +import java.util.Map; + +import org.apache.pdfbox.pdmodel.PDDocument; + +import stirling.software.SPDF.model.PDFText; +import stirling.software.SPDF.model.api.security.RedactPdfRequest; +import stirling.software.common.service.CustomPDFDocumentFactory; + +class ModerateRedactionService implements RedactionModeStrategy { + + private final CustomPDFDocumentFactory pdfDocumentFactory; + private final RedactionService helper; + + ModerateRedactionService(CustomPDFDocumentFactory pdfDocumentFactory, RedactionService helper) { + this.pdfDocumentFactory = pdfDocumentFactory; + this.helper = helper; + } + + @Override + public byte[] redact(RedactPdfRequest request) throws IOException { + String[] listOfText = request.getListOfText().split("\n"); + boolean useRegex = Boolean.TRUE.equals(request.getUseRegex()); + boolean wholeWord = Boolean.TRUE.equals(request.getWholeWordSearch()); + + PDDocument doc = null; + PDDocument fallback = null; + try { + doc = pdfDocumentFactory.load(request.getFileInput()); + Map> allFound = + RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord); + if (allFound.isEmpty()) { + try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + doc.save(baos); + return baos.toByteArray(); + } + } + boolean fallbackToBoxOnly = + helper.performTextReplacement(doc, allFound, listOfText, useRegex, wholeWord); + String effectiveColor = + (request.getRedactColor() == null || request.getRedactColor().isBlank()) + ? 
"#000000" + : request.getRedactColor(); + if (fallbackToBoxOnly) { + fallback = pdfDocumentFactory.load(request.getFileInput()); + allFound = + RedactionService.findTextToRedact( + fallback, listOfText, useRegex, wholeWord); + return RedactionService.finalizeRedaction( + fallback, + allFound, + effectiveColor, + request.getCustomPadding(), + request.getConvertPDFToImage(), + false); + } + return RedactionService.finalizeRedaction( + doc, + allFound, + effectiveColor, + request.getCustomPadding(), + request.getConvertPDFToImage(), + false); + } catch (Exception e) { + throw new IOException("Moderate redaction failed: " + e.getMessage(), e); + } finally { + if (doc != null) + try { + doc.close(); + } catch (IOException ignore) { + } + if (fallback != null) + try { + fallback.close(); + } catch (IOException ignore) { + } + } + } +} diff --git a/app/core/src/main/java/stirling/software/SPDF/service/RedactionModeStrategy.java b/app/core/src/main/java/stirling/software/SPDF/service/RedactionModeStrategy.java new file mode 100644 index 000000000..3c663867e --- /dev/null +++ b/app/core/src/main/java/stirling/software/SPDF/service/RedactionModeStrategy.java @@ -0,0 +1,9 @@ +package stirling.software.SPDF.service; + +import java.io.IOException; + +import stirling.software.SPDF.model.api.security.RedactPdfRequest; + +public interface RedactionModeStrategy { + byte[] redact(RedactPdfRequest request) throws IOException; +} diff --git a/app/core/src/main/java/stirling/software/SPDF/service/RedactionService.java b/app/core/src/main/java/stirling/software/SPDF/service/RedactionService.java new file mode 100644 index 000000000..d464298fc --- /dev/null +++ b/app/core/src/main/java/stirling/software/SPDF/service/RedactionService.java @@ -0,0 +1,2008 @@ +package stirling.software.SPDF.service; + +import java.awt.Color; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +import org.apache.pdfbox.contentstream.operator.Operator; +import org.apache.pdfbox.cos.COSArray; +import org.apache.pdfbox.cos.COSBase; +import org.apache.pdfbox.cos.COSDictionary; +import org.apache.pdfbox.cos.COSFloat; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.cos.COSNumber; +import org.apache.pdfbox.cos.COSString; +import org.apache.pdfbox.pdfparser.PDFStreamParser; +import org.apache.pdfbox.pdfwriter.ContentStreamWriter; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDPageContentStream; +import org.apache.pdfbox.pdmodel.PDPageTree; +import org.apache.pdfbox.pdmodel.PDResources; +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.pdmodel.common.PDStream; +import org.apache.pdfbox.pdmodel.font.PDFont; +import org.apache.pdfbox.pdmodel.font.PDType0Font; +import org.apache.pdfbox.pdmodel.graphics.PDXObject; +import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject; +import org.springframework.stereotype.Service; +import org.springframework.web.multipart.MultipartFile; + +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.RequiredArgsConstructor; + +import stirling.software.SPDF.model.PDFText; +import stirling.software.SPDF.model.api.security.ManualRedactPdfRequest; +import 
stirling.software.SPDF.model.api.security.RedactPdfRequest; +import stirling.software.SPDF.pdf.TextFinder; +import stirling.software.SPDF.utils.text.TextEncodingHelper; +import stirling.software.SPDF.utils.text.TextFinderUtils; +import stirling.software.SPDF.utils.text.WidthCalculator; +import stirling.software.common.model.api.security.RedactionArea; +import stirling.software.common.service.CustomPDFDocumentFactory; +import stirling.software.common.util.PdfUtils; + +@Service +@RequiredArgsConstructor +public class RedactionService { + + private static final Pattern FUZZY_STRIP = Pattern.compile("[^a-z0-9]+"); + private static final Pattern PAGE_SPLIT = Pattern.compile("[,\\s]+"); + private static final float DEFAULT_TEXT_PADDING_MULTIPLIER = 0.6f; + private static final float PRECISION_THRESHOLD = 1e-3f; + private static final int FONT_SCALE_FACTOR = 1000; + private static final Set<String> TEXT_SHOWING_OPERATORS = Set.of("Tj", "TJ", "'", "\""); + private static final COSString EMPTY_COS_STRING = new COSString(""); + private static final ThreadLocal<Boolean> AGGRESSIVE_MODE = + ThreadLocal.withInitial(() -> Boolean.FALSE); + private static final ThreadLocal<Map<Integer, List<AggressiveSegMatch>>> AGGR_SEG_MATCHES = + new ThreadLocal<>(); + private final CustomPDFDocumentFactory pdfDocumentFactory; + + private static void redactAreas( + List<RedactionArea> redactionAreas, PDDocument document, PDPageTree allPages) + throws IOException { + if (redactionAreas == null || redactionAreas.isEmpty()) { + return; + } + + Map<Integer, List<RedactionArea>> redactionsByPage = new HashMap<>(); + for (RedactionArea redactionArea : redactionAreas) { + if (redactionArea.getPage() == null + || redactionArea.getPage() <= 0 + || redactionArea.getHeight() == null + || redactionArea.getHeight() <= 0.0D + || redactionArea.getWidth() == null + || redactionArea.getWidth() <= 0.0D) { + continue; + } + redactionsByPage + .computeIfAbsent(redactionArea.getPage(), k -> new ArrayList<>()) + .add(redactionArea); + } + + for (Map.Entry<Integer, List<RedactionArea>> entry : redactionsByPage.entrySet()) { + Integer pageNumber = entry.getKey(); + List<RedactionArea> areasForPage = entry.getValue(); + if (pageNumber > allPages.getCount()) { + continue; + } + PDPage page = allPages.get(pageNumber - 1); + + try (PDPageContentStream contentStream = + new PDPageContentStream( + document, page, PDPageContentStream.AppendMode.APPEND, true, true)) { + contentStream.saveGraphicsState(); + for (RedactionArea redactionArea : areasForPage) { + Color redactColor = decodeOrDefault(redactionArea.getColor()); + contentStream.setNonStrokingColor(redactColor); + float x = redactionArea.getX().floatValue(); + float y = redactionArea.getY().floatValue(); + float width = redactionArea.getWidth().floatValue(); + float height = redactionArea.getHeight().floatValue(); + float pdfY = page.getBBox().getHeight() - y - height; + contentStream.addRect(x, pdfY, width, height); + contentStream.fill(); + } + contentStream.restoreGraphicsState(); + } + } + } + + private static void redactPages( + ManualRedactPdfRequest request, PDDocument document, PDPageTree allPages) + throws IOException { + Color redactColor = decodeOrDefault(request.getPageRedactionColor()); + List<Integer> pageNumbers = getPageNumbers(request, allPages.getCount()); + + for (Integer pageNumber : pageNumbers) { + PDPage page = allPages.get(pageNumber); + try (PDPageContentStream contentStream = + new PDPageContentStream( + document, page, PDPageContentStream.AppendMode.APPEND, true, true)) { + contentStream.setNonStrokingColor(redactColor); + PDRectangle box = page.getBBox(); + contentStream.addRect(0, 0, box.getWidth(),
box.getHeight()); + contentStream.fill(); + } + } + } + + private static Color decodeOrDefault(String hex) { + if (hex == null) { + return Color.BLACK; + } + String colorString = (!hex.isEmpty() && hex.charAt(0) == '#') ? hex : "#" + hex; + try { + return Color.decode(colorString); + } catch (NumberFormatException e) { + return Color.BLACK; + } + } + + private static List getPageNumbers(ManualRedactPdfRequest request, int pagesCount) { + String pageNumbersInput = request.getPageNumbers(); + String[] parts = + (pageNumbersInput != null) ? PAGE_SPLIT.split(pageNumbersInput) : new String[0]; + List pageNumbers = new ArrayList<>(); + if (parts.length == 0 || parts[0].isEmpty()) { + return pageNumbers; + } + for (String token : parts) { + if (token.contains("-")) { + String[] range = token.split("-"); + if (range.length == 2) { + int start = Integer.parseInt(range[0]); + int end = Integer.parseInt(range[1]); + if (start > 0 && end > 0 && start <= end) { + for (int i = start; i <= end; i++) { + if (i <= pagesCount) { + pageNumbers.add(i - 1); + } + } + } + } + } else { + try { + int num = Integer.parseInt(token); + if (num > 0 && num <= pagesCount) { + pageNumbers.add(num - 1); + } + } catch (NumberFormatException ignored) { + } + } + } + return pageNumbers; + } + + private static void redactFoundText( + PDDocument document, List blocks, float customPadding, Color redactColor) + throws IOException { + var allPages = document.getDocumentCatalog().getPages(); + Map> blocksByPage = new HashMap<>(); + for (PDFText block : blocks) { + blocksByPage.computeIfAbsent(block.getPageIndex(), k -> new ArrayList<>()).add(block); + } + for (Map.Entry> entry : blocksByPage.entrySet()) { + Integer pageIndex = entry.getKey(); + if (pageIndex >= allPages.getCount()) { + continue; + } + PDPage page = allPages.get(pageIndex); + List pageBlocks = entry.getValue(); + try (PDPageContentStream cs = + new PDPageContentStream( + document, page, PDPageContentStream.AppendMode.APPEND, true, true)) { + cs.saveGraphicsState(); + try { + cs.setNonStrokingColor(redactColor); + PDRectangle pageBox = page.getBBox(); + for (PDFText b : pageBlocks) { + float padding = + (b.getY2() - b.getY1()) * DEFAULT_TEXT_PADDING_MULTIPLIER + + customPadding; + float width = b.getX2() - b.getX1(); + cs.addRect( + b.getX1(), + pageBox.getHeight() - b.getY2() - padding, + width, + b.getY2() - b.getY1() + 2 * padding); + } + cs.fill(); + } finally { + cs.restoreGraphicsState(); + } + } + } + } + + static void writeFilteredContentStream(PDDocument document, PDPage page, List tokens) + throws IOException { + PDStream newStream = new PDStream(document); + try (var out = newStream.createOutputStream()) { + new ContentStreamWriter(out).writeTokens(tokens); + } + page.setContents(newStream); + } + + static boolean isTextShowingOperator(String opName) { + return TEXT_SHOWING_OPERATORS.contains(opName); + } + + private static boolean pageStillContainsTargets( + PDDocument document, + int pageIndex, + Set targetWords, + boolean useRegex, + boolean wholeWordSearch) { + try { + for (String term : targetWords) { + if (term == null || term.isBlank()) { + continue; + } + TextFinder finder = new TextFinder(term, useRegex, wholeWordSearch); + finder.setStartPage(pageIndex + 1); + finder.setEndPage(pageIndex + 1); + finder.getText(document); + for (PDFText ft : finder.getFoundTexts()) { + if (ft.getPageIndex() == pageIndex) { + return true; + } + } + } + } catch (Exception e) { + return true; + } + return false; + } + + public static Map> findTextToRedact( + 
PDDocument document, String[] listOfText, boolean useRegex, boolean wholeWordSearch) { + Map> allFoundTextsByPage = new HashMap<>(); + for (String text : listOfText) { + String t = text.trim(); + if (t.isEmpty()) { + continue; + } + try { + TextFinder finder = new TextFinder(t, useRegex, wholeWordSearch); + finder.getText(document); + for (PDFText found : finder.getFoundTexts()) { + allFoundTextsByPage + .computeIfAbsent(found.getPageIndex(), k -> new ArrayList<>()) + .add(found); + } + } catch (Exception ignored) { + } + } + return allFoundTextsByPage; + } + + public static byte[] finalizeRedaction( + PDDocument document, + Map> allFoundTextsByPage, + String colorString, + float customPadding, + Boolean convertToImage, + boolean isTextRemovalMode) + throws IOException { + List allFoundTexts = new ArrayList<>(); + for (List pageTexts : allFoundTextsByPage.values()) { + allFoundTexts.addAll(pageTexts); + } + if (!allFoundTexts.isEmpty()) { + if (!isTextRemovalMode) { + Color redactColor = decodeOrDefault(colorString); + redactFoundText(document, allFoundTexts, customPadding, redactColor); + } + cleanDocumentMetadata(document); + } + if (Boolean.TRUE.equals(convertToImage)) { + try (PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document)) { + cleanDocumentMetadata(convertedPdf); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + convertedPdf.save(baos); + return baos.toByteArray(); + } + } + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + document.save(baos); + return baos.toByteArray(); + } + + private static void cleanDocumentMetadata(PDDocument document) { + try { + var info = document.getDocumentInformation(); + if (info != null) { + info.setAuthor(null); + info.setSubject(null); + info.setKeywords(null); + info.setModificationDate(java.util.Calendar.getInstance()); + } + if (document.getDocumentCatalog() != null) { + document.getDocumentCatalog().setMetadata(null); + } + } catch (Exception ignored) { + } + } + + private static String tryDecodeWithFontEnhanced(PDFont font, COSString cosString) { + try { + if (font == null || cosString == null) { + return null; + } + byte[] bytes = cosString.getBytes(); + if (bytes.length == 0) { + return ""; + } + String basicDecoded = tryDecodeWithFont(font, cosString); + if (basicDecoded != null && !basicDecoded.contains("?")) { + return basicDecoded; + } + StringBuilder out = new StringBuilder(); + for (byte aByte : bytes) { + int code = aByte & 0xFF; + String charStr = null; + try { + charStr = font.toUnicode(code); + } catch (Exception ignored) { + } + if (charStr == null && font.getName() != null && font.getName().contains("+")) { + charStr = mapSubsetCharacter(code); + } + + out.append(charStr != null ? 
charStr : ""); + } + return out.toString(); + } catch (Exception e) { + return tryDecodeWithFont(font, cosString); + } + } + + private static String mapSubsetCharacter(int code) { + if (code >= 32 && code <= 126) { + return String.valueOf((char) code); + } + if (code >= 160 && code <= 255) { + return String.valueOf((char) (code - 128)); + } + return null; + } + + private static String normalizeForFuzzy(String s) { + if (s == null) { + return ""; + } + String lower = s.toLowerCase(); + return FUZZY_STRIP.matcher(lower).replaceAll(""); + } + + private static NormalizedMap buildNormalizedMap(String original) { + NormalizedMap nm = new NormalizedMap(); + if (original == null) { + nm.norm = ""; + nm.map = new int[0]; + return nm; + } + StringBuilder norm = new StringBuilder(); + List mapping = new ArrayList<>(); + for (int i = 0; i < original.length(); i++) { + char c = Character.toLowerCase(original.charAt(i)); + if ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) { + norm.append(c); + mapping.add(i); + } + } + nm.norm = norm.toString(); + nm.map = mapping.stream().mapToInt(Integer::intValue).toArray(); + return nm; + } + + private static List findAllMatches( + String completeText, + Set targetWords, + boolean useRegex, + boolean wholeWordSearch) { + List patterns = + TextFinderUtils.createOptimizedSearchPatterns( + targetWords, useRegex, wholeWordSearch); + return patterns.stream() + .flatMap( + pattern -> { + try { + return pattern.matcher(completeText).results(); + } catch (Exception e) { + return java.util.stream.Stream.empty(); + } + }) + .map(mr -> new MatchRange(mr.start(), mr.end())) + .sorted(Comparator.comparingInt(MatchRange::getStartPos)) + .collect(Collectors.toList()); + } + + private static DecodedMapping buildDecodeMapping(PDFont font, byte[] bytes) { + DecodedMapping map = new DecodedMapping(); + if (font == null || bytes == null) { + map.text = ""; + map.charByteStart = new int[0]; + map.charByteEnd = new int[0]; + return map; + } + StringBuilder sb = new StringBuilder(); + List starts = new ArrayList<>(); + List ends = new ArrayList<>(); + int i = 0; + boolean isType0 = font instanceof PDType0Font; + while (i < bytes.length) { + int b1 = bytes[i] & 0xFF; + String u = null; + int consumed = 1; + try { + if (isType0 && i + 1 < bytes.length) { + int b2 = bytes[i + 1] & 0xFF; + int code = (b1 << 8) | b2; + String u2 = null; + try { + u2 = font.toUnicode(code); + } catch (Exception ignored) { + } + if (u2 != null) { + u = u2; + consumed = 2; + } + } + if (u == null) { + try { + u = font.toUnicode(b1); + } catch (Exception ignored) { + } + if (u == null) { + u = "?"; + } + } + } catch (Exception e) { + u = "?"; + } + int start = i; + int end = i + consumed; + for (int k = 0; k < u.length(); k++) { + sb.append(u.charAt(k)); + starts.add(start); + ends.add(end); + } + i += consumed; + } + map.text = sb.toString(); + map.charByteStart = starts.stream().mapToInt(Integer::intValue).toArray(); + map.charByteEnd = ends.stream().mapToInt(Integer::intValue).toArray(); + return map; + } + + private static COSString redactCosStringByDecodedRanges( + PDFont font, COSString cosString, List decRanges) { + try { + byte[] bytes = cosString.getBytes(); + DecodedMapping dm = buildDecodeMapping(font, bytes); + if (dm.text.isEmpty() || dm.charByteStart.length == 0) { + return cosString; + } + boolean[] delete = new boolean[bytes.length]; + for (AggressiveSegMatch r : decRanges) { + int ds = Math.max(0, Math.min(r.decodedStart, dm.charByteStart.length)); + int de = Math.max(ds, 
Math.min(r.decodedEnd, dm.charByteStart.length)); + if (ds >= de) { + continue; + } + int byteStart = dm.charByteStart[ds]; + int byteEnd = dm.charByteEnd[de - 1]; + for (int bi = Math.max(0, byteStart); bi < Math.min(bytes.length, byteEnd); bi++) { + delete[bi] = true; + } + } + ByteArrayOutputStream baos = new ByteArrayOutputStream(bytes.length); + for (int bi = 0; bi < bytes.length; bi++) { + if (!delete[bi]) { + baos.write(bytes[bi]); + } + } + return new COSString(baos.toByteArray()); + } catch (Exception e) { + return Boolean.TRUE.equals(AGGRESSIVE_MODE.get()) ? EMPTY_COS_STRING : cosString; + } + } + + private static COSArray redactTJArrayByDecodedRanges( + PDFont font, COSArray originalArray, List decRanges) { + try { + COSArray newArray = new COSArray(); + int decodedCursor = 0; + for (COSBase element : originalArray) { + if (element instanceof COSString cosString) { + byte[] bytes = cosString.getBytes(); + DecodedMapping dm = buildDecodeMapping(font, bytes); + int decodedLen = dm.text.length(); + if (decodedLen == 0 || dm.charByteStart.length == 0) { + newArray.add(element); + continue; + } + boolean[] delete = new boolean[bytes.length]; + for (AggressiveSegMatch r : decRanges) { + int gStart = r.decodedStart; + int gEnd = r.decodedEnd; + int ovStart = Math.max(gStart, decodedCursor); + int ovEnd = Math.min(gEnd, decodedCursor + decodedLen); + if (ovStart < ovEnd) { + int localStart = ovStart - decodedCursor; + int localEnd = ovEnd - decodedCursor; + int byteStart = dm.charByteStart[localStart]; + int byteEnd = dm.charByteEnd[localEnd - 1]; + for (int bi = Math.max(0, byteStart); + bi < Math.min(bytes.length, byteEnd); + bi++) { + delete[bi] = true; + } + } + } + ByteArrayOutputStream baos = new ByteArrayOutputStream(bytes.length); + for (int bi = 0; bi < bytes.length; bi++) { + if (!delete[bi]) { + baos.write(bytes[bi]); + } + } + newArray.add(new COSString(baos.toByteArray())); + decodedCursor += decodedLen; + } else { + newArray.add(element); + } + } + return newArray; + } catch (Exception e) { + return originalArray; + } + } + + private static float calculateCharacterBasedWidth(PDFont font, String text) { + try { + float totalWidth = 0; + for (int i = 0; i < text.length(); i++) { + String ch = text.substring(i, i + 1); + try { + if (!TextEncodingHelper.fontSupportsCharacter(font, ch)) { + totalWidth += font.getAverageFontWidth(); + continue; + } + byte[] encoded = font.encode(ch); + if (encoded.length > 0) { + int glyphCode = encoded[0] & 0xFF; + float glyphWidth = font.getWidth(glyphCode); + if (glyphWidth == 0) { + try { + glyphWidth = font.getWidthFromFont(glyphCode); + } catch (Exception e2) { + glyphWidth = font.getAverageFontWidth(); + } + } + totalWidth += glyphWidth; + } else { + totalWidth += font.getAverageFontWidth(); + } + } catch (Exception e2) { + totalWidth += font.getAverageFontWidth(); + } + } + return totalWidth; + } catch (Exception e) { + return calculateConservativeWidth(font, text); + } + } + + private static float calculateFallbackWidth(PDFont font, String text) { + try { + if (font.getFontDescriptor() != null + && font.getFontDescriptor().getFontBoundingBox() != null) { + PDRectangle bbox = font.getFontDescriptor().getFontBoundingBox(); + float avgCharWidth = bbox.getWidth() * 0.6f; + return text.length() * avgCharWidth; + } + try { + float avgWidth = font.getAverageFontWidth(); + if (avgWidth > 0) { + return text.length() * avgWidth; + } + } catch (Exception ignored) { + } + return calculateConservativeWidth(font, text); + } catch (Exception e) { + 
return calculateConservativeWidth(font, text); + } + } + + private static float calculateConservativeWidth(PDFont font, String text) { + return text.length() * 500f; + } + + private static String tryDecodeWithFont(PDFont font, COSString cosString) { + try { + if (font == null || cosString == null) { + return null; + } + byte[] bytes = cosString.getBytes(); + if (bytes.length == 0) { + return ""; + } + boolean anyMapped = false; + StringBuilder out = new StringBuilder(); + for (byte b : bytes) { + int code = b & 0xFF; + String uni = null; + try { + uni = font.toUnicode(code); + } catch (Exception ignored) { + } + if (uni != null) { + out.append(uni); + anyMapped = true; + } else { + out.append('?'); + } + } + if (anyMapped) { + return out.toString(); + } + out.setLength(0); + anyMapped = false; + for (int i = 0; i < bytes.length; ) { + int b1 = bytes[i] & 0xFF; + String u1 = null; + try { + u1 = font.toUnicode(b1); + } catch (Exception ignored) { + } + if (i + 1 < bytes.length) { + int b2 = bytes[i + 1] & 0xFF; + int code = (b1 << 8) | b2; + String u2 = null; + try { + u2 = font.toUnicode(code); + } catch (Exception ignored) { + } + if (u2 != null) { + out.append(u2); + i += 2; + anyMapped = true; + continue; + } + } + if (u1 != null) { + out.append(u1); + } else { + out.append('?'); + } + i += 1; + } + return anyMapped ? out.toString() : null; + } catch (Exception e) { + return null; + } + } + + private static WipeResult wipeAllTextShowingOperators(List tokens) { + List newTokens = new ArrayList<>(tokens); + int modifications = 0; + for (int i = 0; i < newTokens.size(); i++) { + Object t = newTokens.get(i); + if (t instanceof Operator op) { + String name = op.getName(); + if (("Tj".equals(name) || "'".equals(name) || "\"".equals(name)) + && i > 0 + && newTokens.get(i - 1) instanceof COSString) { + newTokens.set(i - 1, EMPTY_COS_STRING); + modifications++; + } else if ("TJ".equals(name) && i > 0 && newTokens.get(i - 1) instanceof COSArray) { + COSArray arr = (COSArray) newTokens.get(i - 1); + COSArray newArr = new COSArray(); + for (int j = 0; j < arr.size(); j++) { + COSBase el = arr.get(j); + if (el instanceof COSString) { + newArr.add(EMPTY_COS_STRING); + modifications++; + } else { + newArr.add(el); + } + } + newTokens.set(i - 1, newArr); + } + } + } + WipeResult res = new WipeResult(); + res.tokens = newTokens; + res.modifications = modifications; + return res; + } + + private static int wipeAllSemanticTextInProperties(PDResources resources) { + int modifications = 0; + if (resources == null) { + return 0; + } + var cosRes = resources.getCOSObject(); + var propsObj = cosRes.getDictionaryObject(COSName.PROPERTIES); + if (propsObj instanceof COSDictionary propsDict) { + for (COSName key : new ArrayList<>(propsDict.keySet())) { + var val = propsDict.getDictionaryObject(key); + if (val instanceof COSDictionary dict) { + boolean changed = false; + if (dict.containsKey(COSName.getPDFName("ActualText"))) { + dict.removeItem(COSName.getPDFName("ActualText")); + changed = true; + } + if (dict.containsKey(COSName.getPDFName("Alt"))) { + dict.removeItem(COSName.getPDFName("Alt")); + changed = true; + } + if (dict.containsKey(COSName.getPDFName("TU"))) { + dict.removeItem(COSName.getPDFName("TU")); + changed = true; + } + if (changed) { + modifications++; + } + } + } + } + return modifications; + } + + private static void writeRedactedContentToXObject( + PDDocument document, PDFormXObject formXObject, List redactedTokens) + throws IOException { + var cosStream = formXObject.getCOSObject(); + try 
(var out = cosStream.createOutputStream()) { + new ContentStreamWriter(out).writeTokens(redactedTokens); + } + } + + public byte[] redactPDF(ManualRedactPdfRequest request) throws IOException { + MultipartFile file = request.getFileInput(); + + try (PDDocument document = pdfDocumentFactory.load(file)) { + PDPageTree allPages = document.getDocumentCatalog().getPages(); + + redactPages(request, document, allPages); + redactAreas(request.getRedactions(), document, allPages); + + if (Boolean.TRUE.equals(request.getConvertPDFToImage())) { + try (PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document)) { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + convertedPdf.save(baos); + return baos.toByteArray(); + } + } + + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + document.save(baos); + return baos.toByteArray(); + } + } + + public byte[] redactPdf(RedactPdfRequest request) throws IOException { + String mode = request.getRedactionMode(); + if (mode == null || mode.isBlank()) { + mode = "moderate"; + } + RedactionModeStrategy strategy = + switch (mode.toLowerCase()) { + case "visual" -> new VisualRedactionService(pdfDocumentFactory, this); + case "aggressive" -> new AggressiveRedactionService(pdfDocumentFactory, this); + default -> new ModerateRedactionService(pdfDocumentFactory, this); + }; + return strategy.redact(request); + } + + String createPlaceholderWithFont(String originalWord, PDFont font) { + if (originalWord == null || originalWord.isEmpty()) { + return originalWord; + } + if (font != null && TextEncodingHelper.isFontSubset(font.getName())) { + try { + float originalWidth = safeGetStringWidth(font, originalWord) / FONT_SCALE_FACTOR; + return createAlternativePlaceholder(originalWord, originalWidth, font, 1.0f); + } catch (Exception e) { + return ""; + } + } + return " ".repeat(originalWord.length()); + } + + String createPlaceholderWithWidth( + String originalWord, float targetWidth, PDFont font, float fontSize) { + if (originalWord == null || originalWord.isEmpty()) { + return originalWord; + } + if (font == null || fontSize <= 0) { + return " ".repeat(originalWord.length()); + } + try { + if (!WidthCalculator.isWidthCalculationReliable(font)) { + return " ".repeat(originalWord.length()); + } + if (TextEncodingHelper.isFontSubset(font.getName())) { + return createSubsetFontPlaceholder(originalWord, targetWidth, font, fontSize); + } + float spaceWidth = WidthCalculator.calculateAccurateWidth(font, " ", fontSize); + if (spaceWidth <= 0) { + return createAlternativePlaceholder(originalWord, targetWidth, font, fontSize); + } + int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth)); + int maxSpaces = + Math.max( + originalWord.length() * 2, Math.round(targetWidth / spaceWidth * 1.5f)); + return " ".repeat(Math.min(spaceCount, maxSpaces)); + } catch (Exception e) { + return createAlternativePlaceholder(originalWord, targetWidth, font, fontSize); + } + } + + private String createSubsetFontPlaceholder( + String originalWord, float targetWidth, PDFont font, float fontSize) { + try { + return createAlternativePlaceholder(originalWord, targetWidth, font, fontSize); + } catch (Exception e) { + return ""; + } + } + + private String createAlternativePlaceholder( + String originalWord, float targetWidth, PDFont font, float fontSize) { + try { + String[] alternatives = {" ", ".", "-", "_", "~", "°", "·"}; + if (TextEncodingHelper.fontSupportsCharacter(font, " ")) { + float spaceWidth = safeGetStringWidth(font, " ") / FONT_SCALE_FACTOR * fontSize; + if 
(spaceWidth > 0) { + int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth)); + int maxSpaces = originalWord.length() * 2; + return " ".repeat(Math.min(spaceCount, maxSpaces)); + } + } + for (String alt : alternatives) { + if (" ".equals(alt)) { + continue; + } + try { + if (!TextEncodingHelper.fontSupportsCharacter(font, alt)) { + continue; + } + float cw = safeGetStringWidth(font, alt) / FONT_SCALE_FACTOR * fontSize; + if (cw > 0) { + int count = Math.max(1, Math.round(targetWidth / cw)); + int max = originalWord.length() * 2; + return " ".repeat(Math.min(count, max)); + } + } catch (Exception ignored) { + } + } + return ""; + } catch (Exception e) { + return ""; + } + } + + public void performTextReplacementAggressive( + PDDocument document, + Map> allFoundTextsByPage, + String[] listOfText, + boolean useRegex, + boolean wholeWordSearchBool) { + if (allFoundTextsByPage.isEmpty()) { + return; + } + Set allSearchTerms = + Arrays.stream(listOfText) + .map(String::trim) + .filter(s -> !s.isEmpty()) + .collect(Collectors.toSet()); + AGGRESSIVE_MODE.set(Boolean.TRUE); + try { + int pageIndex = -1; + for (PDPage page : document.getPages()) { + pageIndex++; + try { + AGGR_SEG_MATCHES.remove(); + List filtered = + createTokensWithoutTargetText( + document, page, allSearchTerms, useRegex, wholeWordSearchBool); + writeFilteredContentStream(document, page, filtered); + boolean residual = + pageStillContainsTargets( + document, + pageIndex, + allSearchTerms, + useRegex, + wholeWordSearchBool); + if (residual) { + try { + var sem = wipeAllSemanticTextInTokens(filtered); + filtered = sem.tokens; + PDResources res = page.getResources(); + if (res != null) { + wipeAllSemanticTextInProperties(res); + wipeAllTextInXObjects(document, res); + wipeAllTextInPatterns(document, res); + } + writeFilteredContentStream(document, page, filtered); + } catch (Exception ignored) { + } + } + } catch (Exception ignored) { + } + } + } finally { + AGGRESSIVE_MODE.remove(); + } + } + + public boolean performTextReplacement( + PDDocument document, + Map> allFoundTextsByPage, + String[] listOfText, + boolean useRegex, + boolean wholeWordSearchBool) { + if (allFoundTextsByPage.isEmpty()) { + return false; + } + try { + Set allSearchTerms = + Arrays.stream(listOfText) + .map(String::trim) + .filter(s -> !s.isEmpty()) + .collect(Collectors.toSet()); + for (PDPage page : document.getPages()) { + List filtered = + createTokensWithoutTargetText( + document, page, allSearchTerms, useRegex, wholeWordSearchBool); + writeFilteredContentStream(document, page, filtered); + } + return false; + } catch (Exception e) { + return true; + } + } + + List createTokensWithoutTargetText( + PDDocument document, + PDPage page, + Set targetWords, + boolean useRegex, + boolean wholeWordSearch) + throws IOException { + PDFStreamParser parser = new PDFStreamParser(page); + List tokens = new ArrayList<>(); + Object tk; + while (true) { + final Object parsedNextToken = parser.parseNextToken(); + if ((tk = parsedNextToken) == null) break; + tokens.add(tk); + } + PDResources resources = page.getResources(); + if (resources != null) { + processPageXObjects(document, resources, targetWords, useRegex, wholeWordSearch); + } + List textSegments = + extractTextSegments(page, tokens, Boolean.TRUE.equals(AGGRESSIVE_MODE.get())); + String completeText = buildCompleteText(textSegments); + List matches = + Boolean.TRUE.equals(AGGRESSIVE_MODE.get()) + ? 
findAllMatchesAggressive( + textSegments, tokens, targetWords, useRegex, wholeWordSearch) + : findAllMatches(completeText, targetWords, useRegex, wholeWordSearch); + return applyRedactionsToTokens(tokens, textSegments, matches); + } + + private void processPageXObjects( + PDDocument document, + PDResources resources, + Set targetWords, + boolean useRegex, + boolean wholeWordSearch) { + for (COSName xobjName : resources.getXObjectNames()) { + try { + PDXObject xobj = resources.getXObject(xobjName); + if (xobj instanceof PDFormXObject formXObj) { + processFormXObject(document, formXObj, targetWords, useRegex, wholeWordSearch); + } + } catch (Exception ignored) { + } + } + } + + private List extractTextSegments( + PDPage page, List tokens, boolean aggressive) { + List segments = extractTextSegmentsEnhanced(page, tokens, aggressive); + if (segments.isEmpty()) { + segments = extractTextSegmentsFallback(page, tokens, aggressive); + } + return segments; + } + + private List extractTextSegmentsEnhanced( + PDPage page, List tokens, boolean aggressive) { + List segments = new ArrayList<>(); + int currentTextPos = 0; + GraphicsState gs = new GraphicsState(); + PDResources resources = page.getResources(); + for (int i = 0; i < tokens.size(); i++) { + Object currentToken = tokens.get(i); + if (currentToken instanceof Operator op) { + String opName = op.getName(); + if ("Tf".equals(opName) && i >= 2) { + try { + COSName fontName = (COSName) tokens.get(i - 2); + COSBase fontSizeBase = (COSBase) tokens.get(i - 1); + if (fontSizeBase instanceof COSNumber cosNumber) { + gs.setFont(resources.getFont(fontName)); + gs.setFontSize(cosNumber.floatValue()); + } + } catch (ClassCastException | IOException ignored) { + } + } + if (isTextShowingOperator(opName) && i > 0) { + String textContent = extractTextFromToken(tokens.get(i - 1), opName); + if (!textContent.isEmpty()) { + if (aggressive + && gs.font != null + && tokens.get(i - 1) instanceof COSString cs) { + tryDecodeWithFontEnhanced(gs.font, cs); + } + segments.add( + new TextSegment( + i - 1, + opName, + textContent, + currentTextPos, + currentTextPos + textContent.length(), + gs.font, + gs.fontSize)); + currentTextPos += textContent.length(); + } + } + } + } + return segments; + } + + private List extractTextSegmentsFallback( + PDPage page, List tokens, boolean aggressive) { + List segments = new ArrayList<>(); + int currentTextPos = 0; + GraphicsState gs = new GraphicsState(); + PDResources resources = page.getResources(); + for (int i = 0; i < tokens.size(); i++) { + Object currentToken = tokens.get(i); + if (currentToken instanceof Operator op) { + String opName = op.getName(); + if ("Tf".equals(opName) && i >= 2) { + try { + COSName fontName = (COSName) tokens.get(i - 2); + COSBase fontSizeBase = (COSBase) tokens.get(i - 1); + if (fontSizeBase instanceof COSNumber cosNumber) { + gs.setFont(resources.getFont(fontName)); + gs.setFontSize(cosNumber.floatValue()); + } + } catch (Exception ignored) { + } + } + if (isTextShowingOperator(opName) && i > 0) { + String textContent = extractTextFromToken(tokens.get(i - 1), opName); + if (!textContent.isEmpty()) { + segments.add( + new TextSegment( + i - 1, + opName, + textContent, + currentTextPos, + currentTextPos + textContent.length(), + gs.font, + gs.fontSize)); + currentTextPos += textContent.length(); + } + } + } + } + return segments; + } + + private String buildCompleteText(List segments) { + StringBuilder sb = new StringBuilder(); + for (TextSegment segment : segments) { + sb.append(segment.text); + } + 
return sb.toString(); + } + + private List findAllMatchesAggressive( + List segments, + List tokens, + Set targetWords, + boolean useRegex, + boolean wholeWordSearch) { + List patterns = + TextFinderUtils.createOptimizedSearchPatterns( + targetWords, useRegex, wholeWordSearch); + List result = new ArrayList<>(); + Map> perSegMatches = new HashMap<>(); + try { + String completeText = buildCompleteText(segments); + if (!completeText.isEmpty()) { + List global = + findAllMatches(completeText, targetWords, useRegex, wholeWordSearch); + if (!global.isEmpty()) { + result.addAll(global); + } else if (!useRegex && !targetWords.isEmpty()) { + String lower = completeText.toLowerCase(); + for (String word : targetWords) { + String w = word.toLowerCase(); + int idx = lower.indexOf(w); + while (idx >= 0) { + result.add(new MatchRange(idx, idx + w.length())); + idx = lower.indexOf(w, idx + 1); + } + } + } + } + } catch (Exception ignored) { + } + + List decodedPerSegment = new ArrayList<>(segments.size()); + List decStarts = new ArrayList<>(segments.size()); + List decEnds = new ArrayList<>(segments.size()); + int decCursor = 0; + for (TextSegment seg : segments) { + String decoded = null; + try { + Object tok = tokens.get(seg.getTokenIndex()); + if (("Tj".equals(seg.getOperatorName()) + || "'".equals(seg.getOperatorName()) + || "\"".equals(seg.getOperatorName())) + && tok instanceof COSString cs) { + decoded = tryDecodeWithFont(seg.getFont(), cs); + } else if ("TJ".equals(seg.getOperatorName()) && tok instanceof COSArray arr) { + StringBuilder sb = new StringBuilder(); + for (COSBase el : arr) { + if (el instanceof COSString s) { + String d = tryDecodeWithFont(seg.getFont(), s); + sb.append(d != null ? d : s.getString()); + } + } + decoded = sb.toString(); + } + } catch (Exception ignored) { + } + String basis = (decoded != null) ? 
decoded : seg.getText(); + decodedPerSegment.add(basis); + decStarts.add(decCursor); + decCursor += basis.length(); + decEnds.add(decCursor); + } + StringBuilder decodedCompleteSb = new StringBuilder(); + for (String d : decodedPerSegment) { + decodedCompleteSb.append(d); + } + String decodedComplete = decodedCompleteSb.toString(); + if (!decodedComplete.isEmpty()) { + List patternsDec = + TextFinderUtils.createOptimizedSearchPatterns( + targetWords, useRegex, wholeWordSearch); + for (Pattern p : patternsDec) { + try { + var m = p.matcher(decodedComplete); + while (m.find()) { + int gStart = m.start(); + int gEnd = m.end(); + for (int sIdx = 0; sIdx < segments.size(); sIdx++) { + int sStart = decStarts.get(sIdx); + int sEnd = decEnds.get(sIdx); + int ovStart = Math.max(gStart, sStart); + int ovEnd = Math.min(gEnd, sEnd); + if (ovStart < ovEnd) { + int localStart = ovStart - sStart; + int localEnd = ovEnd - sStart; + perSegMatches + .computeIfAbsent(sIdx, k -> new ArrayList<>()) + .add(new AggressiveSegMatch(sIdx, localStart, localEnd)); + TextSegment seg = segments.get(sIdx); + int mappedStart = seg.getStartPos(); + int mappedEnd = Math.min(seg.getEndPos(), seg.getStartPos() + 1); + result.add(new MatchRange(mappedStart, mappedEnd)); + } + } + } + } catch (Exception ignored) { + } + } + if (perSegMatches.isEmpty() && !useRegex && !targetWords.isEmpty()) { + String lower = decodedComplete.toLowerCase(); + for (String word : targetWords) { + String w = word.toLowerCase(); + int idx = lower.indexOf(w); + while (idx >= 0) { + int gStart = idx; + int gEnd = idx + w.length(); + for (int sIdx = 0; sIdx < segments.size(); sIdx++) { + int sStart = decStarts.get(sIdx); + int sEnd = decEnds.get(sIdx); + int ovStart = Math.max(gStart, sStart); + int ovEnd = Math.min(gEnd, sEnd); + if (ovStart < ovEnd) { + int localStart = ovStart - sStart; + int localEnd = ovEnd - sStart; + perSegMatches + .computeIfAbsent(sIdx, k -> new ArrayList<>()) + .add(new AggressiveSegMatch(sIdx, localStart, localEnd)); + TextSegment seg = segments.get(sIdx); + int mappedStart = seg.getStartPos(); + int mappedEnd = Math.min(seg.getEndPos(), seg.getStartPos() + 1); + result.add(new MatchRange(mappedStart, mappedEnd)); + } + } + idx = lower.indexOf(w, idx + 1); + } + } + } + } + if (!perSegMatches.isEmpty()) { + AGGR_SEG_MATCHES.set(perSegMatches); + } else { + AGGR_SEG_MATCHES.remove(); + } + + for (TextSegment seg : segments) { + String decoded = null; + try { + Object tok = tokens.get(seg.getTokenIndex()); + if (("Tj".equals(seg.getOperatorName()) || "'".equals(seg.getOperatorName())) + && tok instanceof COSString cs) { + decoded = tryDecodeWithFont(seg.getFont(), cs); + } else if ("TJ".equals(seg.getOperatorName()) && tok instanceof COSArray arr) { + StringBuilder sb = new StringBuilder(); + for (COSBase el : arr) { + if (el instanceof COSString s) { + String d = tryDecodeWithFont(seg.getFont(), s); + sb.append(d != null ? d : s.getString()); + } + } + decoded = sb.toString(); + } + } catch (Exception ignored) { + } + String basis = (decoded != null && !decoded.isEmpty()) ? 
decoded : seg.getText(); + boolean any = false; + for (Pattern p : patterns) { + try { + var m = p.matcher(basis); + while (m.find()) { + any = true; + result.add(new MatchRange(seg.getStartPos(), seg.getStartPos())); + } + } catch (Exception ignored) { + } + } + if (!any) { + NormalizedMap nm = buildNormalizedMap(seg.getText()); + if (!nm.norm.isEmpty()) { + for (String word : targetWords) { + String normWord = normalizeForFuzzy(word); + if (normWord.isEmpty()) { + continue; + } + int idx = nm.norm.indexOf(normWord); + while (idx >= 0) { + int origStart = nm.map[idx]; + int origEnd = + nm.map[Math.min(idx + normWord.length() - 1, nm.map.length - 1)] + + 1; + result.add( + new MatchRange( + seg.getStartPos() + origStart, + seg.getStartPos() + origEnd)); + idx = nm.norm.indexOf(normWord, idx + 1); + } + } + } + } + } + result.sort(Comparator.comparingInt(MatchRange::getStartPos)); + return result; + } + + private List applyRedactionsToTokens( + List tokens, List textSegments, List matches) { + List newTokens = new ArrayList<>(tokens); + if (Boolean.TRUE.equals(AGGRESSIVE_MODE.get())) { + Map> perSeg = AGGR_SEG_MATCHES.get(); + if (perSeg != null && !perSeg.isEmpty()) { + List segIndices = new ArrayList<>(perSeg.keySet()); + segIndices.sort( + (a, b) -> + Integer.compare( + textSegments.get(b).getTokenIndex(), + textSegments.get(a).getTokenIndex())); + for (Integer segIndex : segIndices) { + TextSegment segment = textSegments.get(segIndex); + List segMatches = perSeg.getOrDefault(segIndex, List.of()); + if (segMatches.isEmpty()) { + continue; + } + Object token = newTokens.get(segment.getTokenIndex()); + String opName = segment.getOperatorName(); + if (("Tj".equals(opName) || "'".equals(opName) || "\"".equals(opName)) + && token instanceof COSString cs) { + COSString redacted = + redactCosStringByDecodedRanges(segment.getFont(), cs, segMatches); + newTokens.set(segment.getTokenIndex(), redacted); + } else if ("TJ".equals(opName) && token instanceof COSArray arr) { + COSArray redacted = + redactTJArrayByDecodedRanges(segment.getFont(), arr, segMatches); + newTokens.set(segment.getTokenIndex(), redacted); + } + } + return newTokens; + } + } + Map> matchesBySegment = new HashMap<>(); + for (MatchRange match : matches) { + for (int i = 0; i < textSegments.size(); i++) { + TextSegment segment = textSegments.get(i); + int overlapStart = Math.max(match.startPos, segment.startPos); + int overlapEnd = Math.min(match.endPos, segment.endPos); + if (overlapStart < overlapEnd) { + matchesBySegment.computeIfAbsent(i, k -> new ArrayList<>()).add(match); + } + } + } + List tasks = new ArrayList<>(); + for (Map.Entry> entry : matchesBySegment.entrySet()) { + int segmentIndex = entry.getKey(); + List segmentMatches = entry.getValue(); + TextSegment segment = textSegments.get(segmentIndex); + if ("Tj".equals(segment.operatorName) || "'".equals(segment.operatorName)) { + String newText = applyRedactionsToSegmentText(segment, segmentMatches); + float adjustment = 0; + adjustment = calculateWidthAdjustment(segment, segmentMatches); + tasks.add(new ModificationTask(segment, newText, adjustment)); + } else if ("TJ".equals(segment.operatorName)) { + tasks.add(new ModificationTask(segment, null, 0)); + } + } + tasks.sort((a, b) -> Integer.compare(b.segment.tokenIndex, a.segment.tokenIndex)); + for (ModificationTask task : tasks) { + List segmentMatches = + matchesBySegment.getOrDefault( + textSegments.indexOf(task.segment), Collections.emptyList()); + modifyTokenForRedaction( + newTokens, task.segment, task.newText, 
task.adjustment, segmentMatches); + } + return newTokens; + } + + private String applyRedactionsToSegmentText(TextSegment segment, List matches) { + String text = segment.getText(); + if (!Boolean.TRUE.equals(AGGRESSIVE_MODE.get()) + && segment.getFont() != null + && !TextEncodingHelper.isTextSegmentRemovable(segment.getFont(), text)) { + return text; + } + + StringBuilder result = new StringBuilder(text); + for (MatchRange match : matches) { + int segmentStart = Math.max(0, match.getStartPos() - segment.getStartPos()); + int segmentEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos()); + if (segmentStart < text.length() && segmentEnd > segmentStart) { + String originalPart = text.substring(segmentStart, segmentEnd); + if (!Boolean.TRUE.equals(AGGRESSIVE_MODE.get()) + && segment.getFont() != null + && !TextEncodingHelper.isTextSegmentRemovable( + segment.getFont(), originalPart)) { + continue; + } + + if (Boolean.TRUE.equals(AGGRESSIVE_MODE.get())) { + result.replace(segmentStart, segmentEnd, ""); + } else { + float originalWidth = 0; + if (segment.getFont() != null && segment.getFontSize() > 0) { + originalWidth = + safeGetStringWidth(segment.getFont(), originalPart) + / FONT_SCALE_FACTOR + * segment.getFontSize(); + } + String placeholder = + (originalWidth > 0) + ? createPlaceholderWithWidth( + originalPart, + originalWidth, + segment.getFont(), + segment.getFontSize()) + : createPlaceholderWithFont(originalPart, segment.getFont()); + result.replace(segmentStart, segmentEnd, placeholder); + } + } + } + return result.toString(); + } + + private float safeGetStringWidth(PDFont font, String text) { + if (font == null || text == null || text.isEmpty()) { + return 0; + } + if (!WidthCalculator.isWidthCalculationReliable(font)) { + return calculateConservativeWidth(font, text); + } + if (!TextEncodingHelper.canEncodeCharacters(font, text)) { + return calculateCharacterBasedWidth(font, text); + } + try { + return font.getStringWidth(text); + } catch (Exception e) { + return calculateFallbackWidth(font, text); + } + } + + private float calculateWidthAdjustment(TextSegment segment, List matches) { + try { + if (segment.getFont() == null || segment.getFontSize() <= 0) { + return 0; + } + String fontName = segment.getFont().getName(); + if (fontName != null + && (fontName.contains("HOEPAP") || TextEncodingHelper.isFontSubset(fontName))) { + return 0; + } + float totalOriginal = 0; + float totalPlaceholder = 0; + String text = segment.getText(); + for (MatchRange match : matches) { + int segStart = Math.max(0, match.getStartPos() - segment.getStartPos()); + int segEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos()); + if (segStart < text.length() && segEnd > segStart) { + String originalPart = text.substring(segStart, segEnd); + float originalWidth = + safeGetStringWidth(segment.getFont(), originalPart) + / FONT_SCALE_FACTOR + * segment.getFontSize(); + String placeholderPart = + createPlaceholderWithWidth( + originalPart, + originalWidth, + segment.getFont(), + segment.getFontSize()); + float origUnits = safeGetStringWidth(segment.getFont(), originalPart); + float placeUnits = safeGetStringWidth(segment.getFont(), placeholderPart); + float orig = (origUnits / FONT_SCALE_FACTOR) * segment.getFontSize(); + float place = (placeUnits / FONT_SCALE_FACTOR) * segment.getFontSize(); + totalOriginal += orig; + totalPlaceholder += place; + } + } + float adjustment = totalOriginal - totalPlaceholder; + float maxReasonable = + Math.max( + segment.getText().length() * 
segment.getFontSize() * 2, + totalOriginal * 1.5f); + return (Math.abs(adjustment) > maxReasonable) ? 0 : adjustment; + } catch (Exception ex) { + return 0; + } + } + + private void modifyTokenForRedaction( + List tokens, + TextSegment segment, + String newText, + float adjustment, + List matches) { + if (segment.getTokenIndex() < 0 || segment.getTokenIndex() >= tokens.size()) { + return; + } + Object token = tokens.get(segment.getTokenIndex()); + String operatorName = segment.getOperatorName(); + try { + if (("Tj".equals(operatorName) || "'".equals(operatorName) || "\"".equals(operatorName)) + && token instanceof COSString) { + if (Math.abs(adjustment) < PRECISION_THRESHOLD) { + tokens.set( + segment.getTokenIndex(), + newText.isEmpty() ? EMPTY_COS_STRING : new COSString(newText)); + } else { + COSArray newArray = new COSArray(); + newArray.add(new COSString(newText)); + if (segment.getFontSize() > 0) { + float kerning = (-adjustment / segment.getFontSize()) * FONT_SCALE_FACTOR; + newArray.add(new COSFloat(kerning)); + } + tokens.set(segment.getTokenIndex(), newArray); + int operatorIndex = segment.getTokenIndex() + 1; + if (operatorIndex < tokens.size() + && tokens.get(operatorIndex) instanceof Operator op + && op.getName().equals(operatorName)) { + tokens.set(operatorIndex, Operator.getOperator("TJ")); + } + } + } else if ("TJ".equals(operatorName) && token instanceof COSArray) { + COSArray newArray = createRedactedTJArray((COSArray) token, segment, matches); + tokens.set(segment.getTokenIndex(), newArray); + } + } catch (Exception ignored) { + } + } + + private COSArray createRedactedTJArray( + COSArray originalArray, TextSegment segment, List matches) { + try { + COSArray newArray = new COSArray(); + int textOffsetInSegment = 0; + for (COSBase element : originalArray) { + if (element instanceof COSString cosString) { + String originalText = cosString.getString(); + if (!Boolean.TRUE.equals(AGGRESSIVE_MODE.get()) + && segment.getFont() != null + && !TextEncodingHelper.isTextSegmentRemovable( + segment.getFont(), originalText)) { + newArray.add(element); + textOffsetInSegment += originalText.length(); + continue; + } + + StringBuilder newText = new StringBuilder(originalText); + boolean modified = false; + for (MatchRange match : matches) { + int stringStartInPage = segment.getStartPos() + textOffsetInSegment; + int stringEndInPage = stringStartInPage + originalText.length(); + int overlapStart = Math.max(match.getStartPos(), stringStartInPage); + int overlapEnd = Math.min(match.getEndPos(), stringEndInPage); + if (overlapStart < overlapEnd) { + int redactionStartInString = overlapStart - stringStartInPage; + int redactionEndInString = overlapEnd - stringStartInPage; + if (redactionStartInString >= 0 + && redactionEndInString <= originalText.length()) { + String originalPart = + originalText.substring( + redactionStartInString, redactionEndInString); + if (!Boolean.TRUE.equals(AGGRESSIVE_MODE.get())) { + if (segment.getFont() != null + && !TextEncodingHelper.isTextSegmentRemovable( + segment.getFont(), originalPart)) { + continue; + } + } + modified = true; + if (Boolean.TRUE.equals(AGGRESSIVE_MODE.get())) { + newText.replace( + redactionStartInString, redactionEndInString, ""); + } else { + float originalWidth = 0; + if (segment.getFont() != null && segment.getFontSize() > 0) { + try { + originalWidth = + safeGetStringWidth( + segment.getFont(), originalPart) + / FONT_SCALE_FACTOR + * segment.getFontSize(); + } catch (Exception ignored) { + } + } + String placeholder = + (originalWidth 
> 0) + ? createPlaceholderWithWidth( + originalPart, + originalWidth, + segment.getFont(), + segment.getFontSize()) + : createPlaceholderWithFont( + originalPart, segment.getFont()); + newText.replace( + redactionStartInString, + redactionEndInString, + placeholder); + } + } + } + } + String modifiedString = newText.toString(); + newArray.add(new COSString(modifiedString)); + if (!Boolean.TRUE.equals(AGGRESSIVE_MODE.get())) { + if (modified && segment.getFont() != null && segment.getFontSize() > 0) { + try { + float originalWidth = + safeGetStringWidth(segment.getFont(), originalText) + / FONT_SCALE_FACTOR + * segment.getFontSize(); + float modifiedWidth = + safeGetStringWidth(segment.getFont(), modifiedString) + / FONT_SCALE_FACTOR + * segment.getFontSize(); + float adjustment = originalWidth - modifiedWidth; + if (Math.abs(adjustment) > PRECISION_THRESHOLD) { + float kerning = + (-adjustment / segment.getFontSize()) + * FONT_SCALE_FACTOR + * 1.10f; + newArray.add(new COSFloat(kerning)); + } + } catch (Exception ignored) { + } + } + } + textOffsetInSegment += originalText.length(); + } else { + newArray.add(element); + } + } + return newArray; + } catch (Exception e) { + return originalArray; + } + } + + private String extractTextFromToken(Object token, String operatorName) { + return switch (operatorName) { + case "Tj", "'", "\"" -> { + if (token instanceof COSString cosString) { + yield cosString.getString(); + } + yield ""; + } + case "TJ" -> { + if (token instanceof COSArray cosArray) { + StringBuilder sb = new StringBuilder(); + for (COSBase element : cosArray) { + if (element instanceof COSString cosString) { + sb.append(cosString.getString()); + } + } + yield sb.toString(); + } + yield ""; + } + default -> ""; + }; + } + + private WipeResult wipeAllSemanticTextInTokens(List tokens) { + List newTokens = new ArrayList<>(tokens); + int modifications = 0; + for (int i = 0; i < newTokens.size(); i++) { + Object t = newTokens.get(i); + if (t instanceof Operator op) { + String name = op.getName(); + if ("BDC".equals(name) && i > 0) { + Object maybeDict = newTokens.get(i - 1); + if (maybeDict instanceof COSDictionary dict) { + boolean changed = false; + if (dict.containsKey(COSName.getPDFName("ActualText"))) { + dict.removeItem(COSName.getPDFName("ActualText")); + changed = true; + } + if (dict.containsKey(COSName.getPDFName("Alt"))) { + dict.removeItem(COSName.getPDFName("Alt")); + changed = true; + } + if (dict.containsKey(COSName.getPDFName("TU"))) { + dict.removeItem(COSName.getPDFName("TU")); + changed = true; + } + if (changed) { + modifications++; + } + } + } + } + } + WipeResult res = new WipeResult(); + res.tokens = newTokens; + res.modifications = modifications; + return res; + } + + private int wipeAllTextInResources(PDDocument document, PDResources resources) { + int totalMods = 0; + try { + totalMods += wipeAllSemanticTextInProperties(resources); + for (COSName xobjName : resources.getXObjectNames()) { + try { + PDXObject xobj = resources.getXObject(xobjName); + if (xobj instanceof PDFormXObject form) { + totalMods += wipeAllTextInFormXObject(document, form); + } + } catch (Exception ignored) { + } + } + } catch (Exception ignored) { + } + return totalMods; + } + + private int wipeAllTextInXObjects(PDDocument document, PDResources resources) { + int modifications = 0; + try { + for (COSName xobjName : resources.getXObjectNames()) { + try { + PDXObject xobj = resources.getXObject(xobjName); + if (xobj instanceof PDFormXObject form) { + modifications += 
wipeAllTextInFormXObject(document, form); + } + } catch (Exception ignored) { + } + } + } catch (Exception ignored) { + } + return modifications; + } + + private int wipeAllTextInFormXObject(PDDocument document, PDFormXObject formXObject) + throws IOException { + int modifications = 0; + try { + PDResources res = formXObject.getResources(); + if (res != null) { + modifications += wipeAllTextInResources(document, res); + } + PDFStreamParser parser = new PDFStreamParser(formXObject); + List tokens = new ArrayList<>(); + Object token; + while ((token = parser.parseNextToken()) != null) { + tokens.add(token); + } + WipeResult wrText = wipeAllTextShowingOperators(tokens); + modifications += wrText.modifications; + WipeResult wrSem = wipeAllSemanticTextInTokens(wrText.tokens); + modifications += wrSem.modifications; + if (wrText.modifications > 0 || wrSem.modifications > 0) { + writeRedactedContentToXObject(document, formXObject, wrSem.tokens); + } + } catch (Exception ignored) { + } + return modifications; + } + + private void wipeAllTextInPatterns(PDDocument document, PDResources resources) { + int totalMods = 0; + try { + for (COSName patName : resources.getPatternNames()) { + try { + var pattern = resources.getPattern(patName); + if (pattern + instanceof + org.apache.pdfbox.pdmodel.graphics.pattern.PDTilingPattern tiling) { + PDResources patRes = tiling.getResources(); + if (patRes != null) { + totalMods += wipeAllTextInResources(document, patRes); + } + PDFStreamParser parser = new PDFStreamParser(tiling); + List tokens = new ArrayList<>(); + Object token; + while ((token = parser.parseNextToken()) != null) { + tokens.add(token); + } + WipeResult wrText = wipeAllTextShowingOperators(tokens); + totalMods += wrText.modifications; + WipeResult wrSem = wipeAllSemanticTextInTokens(wrText.tokens); + totalMods += wrSem.modifications; + if (wrText.modifications > 0 || wrSem.modifications > 0) { + writeRedactedContentToPattern(tiling, wrSem.tokens); + } + } + } catch (Exception ignored) { + } + } + } catch (Exception ignored) { + } + } + + private int wipeAllTextInAnnotations(PDDocument document, PDPage page) { + int totalMods = 0; + try { + var annotations = page.getAnnotations(); + if (annotations == null || annotations.isEmpty()) { + return 0; + } + for (var annot : annotations) { + try { + var ap = annot.getAppearance(); + if (ap == null) { + continue; + } + var normal = ap.getNormalAppearance(); + if (normal == null) { + continue; + } + if (normal.isStream()) { + var stream = normal.getAppearanceStream(); + if (stream != null) { + totalMods += wipeAllTextInFormXObject(document, stream); + } + } else if (normal.isSubDictionary()) { + var map = normal.getSubDictionary(); + if (map != null) { + for (var entry : map.values()) { + if (entry != null) { + totalMods += wipeAllTextInFormXObject(document, entry); + } + } + } + } + } catch (Exception ignored) { + } + } + } catch (Exception ignored) { + } + return totalMods; + } + + private void writeRedactedContentToPattern( + org.apache.pdfbox.pdmodel.graphics.pattern.PDTilingPattern pattern, + List redactedTokens) + throws IOException { + var contentStream = pattern.getContentStream(); + try (var out = contentStream.createOutputStream()) { + new ContentStreamWriter(out).writeTokens(redactedTokens); + } + } + + private void processFormXObject( + PDDocument document, + PDFormXObject formXObject, + Set targetWords, + boolean useRegex, + boolean wholeWordSearch) { + try { + PDResources xobjResources = formXObject.getResources(); + if (xobjResources == null) 
{ + return; + } + for (COSName xobjName : xobjResources.getXObjectNames()) { + PDXObject nestedXObj = xobjResources.getXObject(xobjName); + if (nestedXObj instanceof PDFormXObject nestedFormXObj) { + processFormXObject( + document, nestedFormXObj, targetWords, useRegex, wholeWordSearch); + } + } + PDFStreamParser parser = new PDFStreamParser(formXObject); + List tokens = new ArrayList<>(); + Object token; + while ((token = parser.parseNextToken()) != null) { + tokens.add(token); + } + List textSegments = extractTextSegmentsFromXObject(xobjResources, tokens); + String completeText = buildCompleteText(textSegments); + List matches = + Boolean.TRUE.equals(AGGRESSIVE_MODE.get()) + ? findAllMatchesAggressive( + textSegments, tokens, targetWords, useRegex, wholeWordSearch) + : findAllMatches(completeText, targetWords, useRegex, wholeWordSearch); + if (!matches.isEmpty()) { + List redactedTokens = + applyRedactionsToTokens(tokens, textSegments, matches); + writeRedactedContentToXObject(document, formXObject, redactedTokens); + } else if (Boolean.TRUE.equals(AGGRESSIVE_MODE.get()) && !completeText.isEmpty()) { + WipeResult wr = wipeAllTextShowingOperators(tokens); + writeRedactedContentToXObject(document, formXObject, wr.tokens); + } + } catch (Exception ignored) { + } + } + + private List extractTextSegmentsFromXObject( + PDResources resources, List tokens) { + List segments = new ArrayList<>(); + int currentTextPos = 0; + GraphicsState gs = new GraphicsState(); + for (int i = 0; i < tokens.size(); i++) { + Object currentToken = tokens.get(i); + if (currentToken instanceof Operator op) { + String opName = op.getName(); + if ("Tf".equals(opName) && i >= 2) { + try { + COSName fontName = (COSName) tokens.get(i - 2); + COSBase fontSizeBase = (COSBase) tokens.get(i - 1); + if (fontSizeBase instanceof COSNumber cosNumber) { + gs.setFont(resources.getFont(fontName)); + gs.setFontSize(cosNumber.floatValue()); + } + } catch (ClassCastException | IOException ignored) { + } + } + if (isTextShowingOperator(opName) && i > 0) { + String textContent = extractTextFromToken(tokens.get(i - 1), opName); + if (!textContent.isEmpty()) { + segments.add( + new TextSegment( + i - 1, + opName, + textContent, + currentTextPos, + currentTextPos + textContent.length(), + gs.font, + gs.fontSize)); + currentTextPos += textContent.length(); + } + } + } + } + return segments; + } + + @Data + @AllArgsConstructor + private static class AggressiveSegMatch { + private int segmentIndex; + private int decodedStart; + private int decodedEnd; + } + + @Data + @AllArgsConstructor + private static class GraphicsState { + private PDFont font = null; + private float fontSize = 0; + + public GraphicsState() { + } + } + + @Data + @AllArgsConstructor + private static class TextSegment { + private int tokenIndex; + private String operatorName; + private String text; + private int startPos; + private int endPos; + private PDFont font; + private float fontSize; + } + + @Data + @AllArgsConstructor + private static class MatchRange { + private int startPos; + private int endPos; + } + + @Data + private static class NormalizedMap { + String norm; + int[] map; + } + + @Data + private static class DecodedMapping { + String text; + int[] charByteStart; + int[] charByteEnd; + } + + @Data + @AllArgsConstructor + private static class ModificationTask { + private TextSegment segment; + private String newText; + private float adjustment; + } + + @Data + private static class WipeResult { + List tokens; + int modifications; + } +} diff --git 
a/app/core/src/main/java/stirling/software/SPDF/service/VisualRedactionService.java b/app/core/src/main/java/stirling/software/SPDF/service/VisualRedactionService.java new file mode 100644 index 000000000..c85410a7f --- /dev/null +++ b/app/core/src/main/java/stirling/software/SPDF/service/VisualRedactionService.java @@ -0,0 +1,50 @@ +package stirling.software.SPDF.service; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.List; +import java.util.Map; + +import org.apache.pdfbox.pdmodel.PDDocument; + +import stirling.software.SPDF.model.PDFText; +import stirling.software.SPDF.model.api.security.RedactPdfRequest; +import stirling.software.common.service.CustomPDFDocumentFactory; + +class VisualRedactionService implements RedactionModeStrategy { + + private final CustomPDFDocumentFactory pdfDocumentFactory; + + VisualRedactionService(CustomPDFDocumentFactory pdfDocumentFactory, RedactionService helper) { + this.pdfDocumentFactory = pdfDocumentFactory; + } + + @Override + public byte[] redact(RedactPdfRequest request) throws IOException { + String[] listOfText = request.getListOfText().split("\n"); + boolean useRegex = Boolean.TRUE.equals(request.getUseRegex()); + boolean wholeWord = Boolean.TRUE.equals(request.getWholeWordSearch()); + + try (PDDocument document = pdfDocumentFactory.load(request.getFileInput())) { + Map> allFound = + RedactionService.findTextToRedact(document, listOfText, useRegex, wholeWord); + if (allFound.isEmpty()) { + try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + document.save(baos); + return baos.toByteArray(); + } + } + String effectiveColor = + (request.getRedactColor() == null || request.getRedactColor().isBlank()) + ? "#000000" + : request.getRedactColor(); + return RedactionService.finalizeRedaction( + document, + allFound, + effectiveColor, + request.getCustomPadding(), + request.getConvertPDFToImage(), + false); + } + } +} diff --git a/app/core/src/main/resources/templates/security/auto-redact.html b/app/core/src/main/resources/templates/security/auto-redact.html index 2188540a9..fe6722fa7 100644 --- a/app/core/src/main/resources/templates/security/auto-redact.html +++ b/app/core/src/main/resources/templates/security/auto-redact.html @@ -20,7 +20,7 @@ -
+
@@ -53,13 +53,42 @@
@@ -82,6 +111,21 @@
+
+ + +
+ + + +
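Usage sketch (illustrative, not part of the diff above): RedactionService.redactPdf(...) dispatches on RedactPdfRequest.getRedactionMode() to the visual, moderate (default), or aggressive strategy. The snippet below is a minimal example of driving that dispatch from calling code; the setter names are assumed to mirror the getters used in the diff, and the RedactionService instance plus the uploaded MultipartFile are assumed to come from the surrounding Spring context.

    // Illustrative sketch only: setters are assumed to mirror the request getters used above.
    byte[] redactAggressively(
            stirling.software.SPDF.service.RedactionService redactionService,
            org.springframework.web.multipart.MultipartFile uploadedPdf)
            throws java.io.IOException {
        RedactPdfRequest request = new RedactPdfRequest();
        request.setFileInput(uploadedPdf);                 // source PDF
        request.setListOfText("Confidential\nJohn Doe");   // one search term per line
        request.setUseRegex(false);
        request.setWholeWordSearch(true);
        request.setRedactionMode("aggressive");            // "visual", "moderate" (default) or "aggressive"
        request.setRedactColor("#000000");                 // null/blank falls back to black
        request.setCustomPadding(2.0f);
        request.setConvertPDFToImage(false);
        return redactionService.redactPdf(request);        // bytes of the redacted PDF
    }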