From d7fb66bb7924cd91b6de1c1bf844122f4115b140 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20Sz=C3=BCcs?= Date: Sat, 12 Jul 2025 11:03:33 +0200 Subject: [PATCH 01/13] feat: auto-redact to support text removal on true PDFs --- .../api/security/RedactController.java | 729 ++++++++++++++++-- .../software/SPDF/pdf/TextFinder.java | 149 ++-- 2 files changed, 753 insertions(+), 125 deletions(-) diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java index 88d271cfb..b647ea511 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java @@ -1,19 +1,33 @@ package stirling.software.SPDF.controller.api.security; -import java.awt.*; +import java.awt.Color; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.apache.pdfbox.contentstream.operator.Operator; +import org.apache.pdfbox.cos.COSArray; +import org.apache.pdfbox.cos.COSBase; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.cos.COSString; +import org.apache.pdfbox.pdfparser.PDFStreamParser; +import org.apache.pdfbox.pdfwriter.ContentStreamWriter; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPageContentStream; import org.apache.pdfbox.pdmodel.PDPageTree; +import org.apache.pdfbox.pdmodel.PDResources; import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.pdmodel.common.PDStream; +import org.apache.pdfbox.pdmodel.font.PDFont; import org.springframework.http.ResponseEntity; import org.springframework.web.bind.WebDataBinder; import org.springframework.web.bind.annotation.InitBinder; @@ -27,6 +41,8 @@ import io.github.pixee.security.Filenames; import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.tags.Tag; +import lombok.AllArgsConstructor; +import lombok.Data; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; @@ -48,6 +64,13 @@ import stirling.software.common.util.propertyeditor.StringToArrayListPropertyEdi @RequiredArgsConstructor public class RedactController { + private static final float DEFAULT_TEXT_PADDING_MULTIPLIER = 0.3f; + private static final float PRECISION_THRESHOLD = 1e-3f; + private static final int FONT_SCALE_FACTOR = 1000; + + // Text showing operators + private static final Set TEXT_SHOWING_OPERATORS = Set.of("Tj", "TJ", "'", "\""); + private final CustomPDFDocumentFactory pdfDocumentFactory; @InitBinder @@ -65,17 +88,30 @@ public class RedactController { + " Type:SISO") public ResponseEntity redactPDF(@ModelAttribute ManualRedactPdfRequest request) throws IOException { + log.debug( + "Starting manual redaction for file: {}", + request.getFileInput().getOriginalFilename()); + MultipartFile file = request.getFileInput(); List redactionAreas = request.getRedactions(); + log.debug( + "Processing {} redaction areas", + redactionAreas != null ? redactionAreas.size() : 0); + PDDocument document = pdfDocumentFactory.load(file); + log.debug("Loaded PDF document with {} pages", document.getNumberOfPages()); PDPageTree allPages = document.getDocumentCatalog().getPages(); + log.debug("Starting page redactions"); redactPages(request, document, allPages); + + log.debug("Starting area redactions"); redactAreas(redactionAreas, document, allPages); if (Boolean.TRUE.equals(request.getConvertPDFToImage())) { + log.debug("Converting PDF to image format"); PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document); document.close(); document = convertedPdf; @@ -86,6 +122,8 @@ public class RedactController { document.close(); byte[] pdfContent = baos.toByteArray(); + log.debug("Manual redaction completed. Output PDF size: {} bytes", pdfContent.length); + return WebResponseUtils.bytesToWebResponse( pdfContent, Filenames.toSimpleFileName(file.getOriginalFilename()).replaceFirst("[.][^.]+$", "") @@ -95,17 +133,30 @@ public class RedactController { private void redactAreas( List redactionAreas, PDDocument document, PDPageTree allPages) throws IOException { + log.debug("Processing redaction areas"); + // Group redaction areas by page Map> redactionsByPage = new HashMap<>(); // Process and validate each redaction area for (RedactionArea redactionArea : redactionAreas) { + log.debug( + "Validating redaction area on page {}: x={}, y={}, width={}, height={}", + redactionArea.getPage(), + redactionArea.getX(), + redactionArea.getY(), + redactionArea.getWidth(), + redactionArea.getHeight()); + if (redactionArea.getPage() == null || redactionArea.getPage() <= 0 || redactionArea.getHeight() == null || redactionArea.getHeight() <= 0.0D || redactionArea.getWidth() == null - || redactionArea.getWidth() <= 0.0D) continue; + || redactionArea.getWidth() <= 0.0D) { + log.debug("Skipping invalid redaction area: {}", redactionArea); + continue; + } // Group by page number redactionsByPage @@ -113,70 +164,151 @@ public class RedactController { .add(redactionArea); } + log.debug("Grouped redactions by page: {} pages affected", redactionsByPage.size()); + // Process each page only once for (Map.Entry> entry : redactionsByPage.entrySet()) { Integer pageNumber = entry.getKey(); List areasForPage = entry.getValue(); + log.debug( + "Processing page {} with {} redaction areas", pageNumber, areasForPage.size()); + if (pageNumber > allPages.getCount()) { + log.debug( + "Skipping page {} - out of bounds (total pages: {})", + pageNumber, + allPages.getCount()); continue; // Skip if page number is out of bounds } PDPage page = allPages.get(pageNumber - 1); - PDRectangle box = page.getBBox(); - // Create only one content stream per page - PDPageContentStream contentStream = + // Create only one content stream per page to draw all redaction boxes + try (PDPageContentStream contentStream = new PDPageContentStream( - document, page, PDPageContentStream.AppendMode.APPEND, true, true); + document, page, PDPageContentStream.AppendMode.APPEND, true, true)) { - // Process all redactions for this page - for (RedactionArea redactionArea : areasForPage) { - Color redactColor = decodeOrDefault(redactionArea.getColor(), Color.BLACK); - contentStream.setNonStrokingColor(redactColor); + // Process all redactions for this page + for (RedactionArea redactionArea : areasForPage) { + Color redactColor = decodeOrDefault(redactionArea.getColor()); + log.debug( + "Applying redaction with color {} at ({}, {}) size {}x{}", + redactColor, + redactionArea.getX(), + redactionArea.getY(), + redactionArea.getWidth(), + redactionArea.getHeight()); - float x = redactionArea.getX().floatValue(); - float y = redactionArea.getY().floatValue(); - float width = redactionArea.getWidth().floatValue(); - float height = redactionArea.getHeight().floatValue(); + contentStream.setNonStrokingColor(redactColor); - contentStream.addRect(x, box.getHeight() - y - height, width, height); - contentStream.fill(); + float x = redactionArea.getX().floatValue(); + float y = redactionArea.getY().floatValue(); + float width = redactionArea.getWidth().floatValue(); + float height = redactionArea.getHeight().floatValue(); + + // The y-coordinate needs to be transformed from a top-left origin to a + // bottom-left origin. + float pdfY = page.getBBox().getHeight() - y - height; + + contentStream.addRect(x, pdfY, width, height); + contentStream.fill(); + } } - - contentStream.close(); } + + log.debug("Completed redaction areas processing"); } private void redactPages( ManualRedactPdfRequest request, PDDocument document, PDPageTree allPages) throws IOException { - Color redactColor = decodeOrDefault(request.getPageRedactionColor(), Color.BLACK); + log.debug("Starting page redactions"); + + Color redactColor = decodeOrDefault(request.getPageRedactionColor()); List pageNumbers = getPageNumbers(request, allPages.getCount()); + + log.debug("Redacting {} pages with color {}", pageNumbers.size(), redactColor); + for (Integer pageNumber : pageNumbers) { + log.debug("Redacting entire page {}", pageNumber + 1); + PDPage page = allPages.get(pageNumber); - PDPageContentStream contentStream = + try (PDPageContentStream contentStream = new PDPageContentStream( - document, page, PDPageContentStream.AppendMode.APPEND, true, true); - contentStream.setNonStrokingColor(redactColor); + document, page, PDPageContentStream.AppendMode.APPEND, true, true)) { + contentStream.setNonStrokingColor(redactColor); - PDRectangle box = page.getBBox(); + PDRectangle box = page.getBBox(); + log.debug( + "Page {} dimensions: {}x{}", + pageNumber + 1, + box.getWidth(), + box.getHeight()); - contentStream.addRect(0, 0, box.getWidth(), box.getHeight()); - contentStream.fill(); - contentStream.close(); + contentStream.addRect(0, 0, box.getWidth(), box.getHeight()); + contentStream.fill(); + } } + + log.debug("Completed page redactions"); } - private Color decodeOrDefault(String hex, Color defaultColor) { - try { - if (hex != null && !hex.startsWith("#")) { - hex = "#" + hex; + private void redactFoundText( + PDDocument document, List blocks, float customPadding, Color redactColor) + throws IOException { + log.debug( + "Redacting {} text blocks with padding {} and color {}", + blocks.size(), + customPadding, + redactColor); + + var allPages = document.getDocumentCatalog().getPages(); + + for (PDFText block : blocks) { + log.debug( + "Redacting text block on page {}: '{}' at ({}, {}) to ({}, {})", + block.getPageIndex() + 1, + block.getText(), + block.getX1(), + block.getY1(), + block.getX2(), + block.getY2()); + + var page = allPages.get(block.getPageIndex()); + try (PDPageContentStream contentStream = + new PDPageContentStream( + document, page, PDPageContentStream.AppendMode.APPEND, true, true)) { + contentStream.setNonStrokingColor(redactColor); + float padding = + (block.getY2() - block.getY1()) * DEFAULT_TEXT_PADDING_MULTIPLIER + + customPadding; + PDRectangle pageBox = page.getBBox(); + contentStream.addRect( + block.getX1(), + pageBox.getHeight() - block.getY2() - padding, + block.getX2() - block.getX1(), + block.getY2() - block.getY1() + 2 * padding); + contentStream.fill(); } - return Color.decode(hex); - } catch (Exception e) { - return defaultColor; + } + + log.debug("Completed text block redactions"); + } + + private Color decodeOrDefault(String hex) { + if (hex == null) { + return Color.BLACK; + } + + String colorString = hex.startsWith("#") ? hex : "#" + hex; + + try { + return Color.decode(colorString); + } catch (NumberFormatException e) { + log.warn("Invalid color string '{}'. Using default color BLACK.", hex); + return Color.BLACK; } } @@ -198,6 +330,10 @@ public class RedactController { + " Input:PDF, Output:PDF, Type:SISO") public ResponseEntity redactPdf(@ModelAttribute RedactPdfRequest request) throws Exception { + log.debug( + "Starting auto-redaction for file: {}", + request.getFileInput().getOriginalFilename()); + MultipartFile file = request.getFileInput(); String listOfTextString = request.getListOfText(); boolean useRegex = Boolean.TRUE.equals(request.getUseRegex()); @@ -206,28 +342,80 @@ public class RedactController { float customPadding = request.getCustomPadding(); boolean convertPDFToImage = Boolean.TRUE.equals(request.getConvertPDFToImage()); + log.debug( + "Auto-redaction parameters: useRegex={}, wholeWordSearch={}, customPadding={}, convertToImage={}", + useRegex, + wholeWordSearchBool, + customPadding, + convertPDFToImage); + String[] listOfText = listOfTextString.split("\n"); + log.debug("Searching for {} text patterns", listOfText.length); + PDDocument document = pdfDocumentFactory.load(file); + log.debug("Loaded PDF document with {} pages", document.getNumberOfPages()); Color redactColor; try { - if (!colorString.startsWith("#")) { + if (colorString != null && !colorString.startsWith("#")) { colorString = "#" + colorString; } redactColor = Color.decode(colorString); + log.debug("Using redaction color: {}", redactColor); } catch (NumberFormatException e) { log.warn("Invalid color string provided. Using default color BLACK for redaction."); redactColor = Color.BLACK; } + // Step 1: Find all text locations for all search terms + log.debug("Step 1: Finding all text locations"); + Map> allFoundTextsByPage = new HashMap<>(); + Set allSearchTerms = new HashSet<>(); for (String text : listOfText) { text = text.trim(); + if (text.isEmpty()) continue; + + log.debug("Searching for text pattern: '{}'", text); + allSearchTerms.add(text); TextFinder textFinder = new TextFinder(text, useRegex, wholeWordSearchBool); - List foundTexts = textFinder.getTextLocations(document); - redactFoundText(document, foundTexts, customPadding, redactColor); + textFinder.getText(document); + List foundTexts = textFinder.getFoundTexts(); + + log.debug("Found {} instances of pattern '{}'", foundTexts.size(), text); + + for (PDFText found : foundTexts) { + allFoundTextsByPage + .computeIfAbsent(found.getPageIndex(), k -> new ArrayList<>()) + .add(found); + } + } + + log.debug("Total pages with found text: {}", allFoundTextsByPage.size()); + + // Step 2: Process each page + log.debug("Step 2: Processing each page for text replacement"); + for (PDPage page : document.getPages()) { + // Replace text content + List filteredTokens = + createTokensWithoutTargetText( + page, allSearchTerms, useRegex, wholeWordSearchBool); + writeFilteredContentStream(document, page, filteredTokens); + } + + // Draw redaction boxes for all found texts + List allFoundTexts = new ArrayList<>(); + for (List pageTexts : allFoundTextsByPage.values()) { + allFoundTexts.addAll(pageTexts); + } + + log.debug("Drawing redaction boxes for {} total found texts", allFoundTexts.size()); + + if (!allFoundTexts.isEmpty()) { + redactFoundText(document, allFoundTexts, customPadding, redactColor); } if (convertPDFToImage) { + log.debug("Converting redacted PDF to image format"); PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document); document.close(); document = convertedPdf; @@ -238,32 +426,465 @@ public class RedactController { document.close(); byte[] pdfContent = baos.toByteArray(); + log.debug("Auto-redaction completed. Output PDF size: {} bytes", pdfContent.length); + return WebResponseUtils.bytesToWebResponse( pdfContent, Filenames.toSimpleFileName(file.getOriginalFilename()).replaceFirst("[.][^.]+$", "") + "_redacted.pdf"); } - private void redactFoundText( - PDDocument document, List blocks, float customPadding, Color redactColor) + private List createTokensWithoutTargetText( + PDPage page, Set targetWords, boolean useRegex, boolean wholeWordSearch) throws IOException { - var allPages = document.getDocumentCatalog().getPages(); + log.debug( + "Creating tokens without target text for page, searching for {} words", + targetWords.size()); - for (PDFText block : blocks) { - var page = allPages.get(block.getPageIndex()); - PDPageContentStream contentStream = - new PDPageContentStream( - document, page, PDPageContentStream.AppendMode.APPEND, true, true); - contentStream.setNonStrokingColor(redactColor); - float padding = (block.getY2() - block.getY1()) * 0.3f + customPadding; - PDRectangle pageBox = page.getBBox(); - contentStream.addRect( - block.getX1(), - pageBox.getHeight() - block.getY1() - padding, - block.getX2() - block.getX1(), - block.getY2() - block.getY1() + 2 * padding); - contentStream.fill(); - contentStream.close(); + PDFStreamParser parser = new PDFStreamParser(page); + List tokens = new ArrayList<>(); + Object token; + while ((token = parser.parseNextToken()) != null) { + tokens.add(token); + } + + log.debug("Parsed {} tokens from page content stream", tokens.size()); + + List textSegments = extractTextSegments(page, tokens); + log.debug("Extracted {} text segments", textSegments.size()); + + String completeText = buildCompleteText(textSegments); + log.debug("Built complete text of {} characters", completeText.length()); + + List matches = + findAllMatches(completeText, targetWords, useRegex, wholeWordSearch); + log.debug("Found {} matches in complete text", matches.size()); + + return applyRedactionsToTokens(tokens, textSegments, matches); + } + + @Data + private static class GraphicsState { + private PDFont font = null; + private float fontSize = 0; + } + + @Data + @AllArgsConstructor + private static class TextSegment { + private int tokenIndex; + private String operatorName; + private String text; + private int startPos; + private int endPos; + private PDFont font; + private float fontSize; + } + + @Data + @AllArgsConstructor + private static class MatchRange { + private int startPos; + private int endPos; + } + + private List extractTextSegments(PDPage page, List tokens) + throws IOException { + log.debug("Extracting text segments from {} tokens", tokens.size()); + + List segments = new ArrayList<>(); + int currentTextPos = 0; + GraphicsState graphicsState = new GraphicsState(); + PDResources resources = page.getResources(); + + for (int i = 0; i < tokens.size(); i++) { + Object currentToken = tokens.get(i); + + if (currentToken instanceof Operator op) { + String opName = op.getName(); + + if ("Tf".equals(opName) && i >= 2) { + try { + COSName fontName = (COSName) tokens.get(i - 2); + COSBase fontSizeBase = (COSBase) tokens.get(i - 1); + if (fontSizeBase instanceof org.apache.pdfbox.cos.COSNumber cosNumber) { + graphicsState.setFont(resources.getFont(fontName)); + graphicsState.setFontSize(cosNumber.floatValue()); + log.debug( + "Updated font state: {} size {}", + fontName.getName(), + graphicsState.getFontSize()); + } + } catch (ClassCastException | IOException e) { + log.warn("Failed to update font state", e); + } + } + + if (isTextShowingOperator(opName) && i > 0) { + String textContent = extractTextFromToken(tokens.get(i - 1), opName); + if (!textContent.isEmpty()) { + log.debug( + "Found text segment '{}' at position {} with operator {}", + textContent, + currentTextPos, + opName); + segments.add( + new TextSegment( + i - 1, + opName, + textContent, + currentTextPos, + currentTextPos + textContent.length(), + graphicsState.font, + graphicsState.fontSize)); + currentTextPos += textContent.length(); + } + } + } + } + + log.debug("Extracted {} text segments from page", segments.size()); + return segments; + } + + private String buildCompleteText(List segments) { + StringBuilder sb = new StringBuilder(); + for (TextSegment segment : segments) { + sb.append(segment.text); + } + return sb.toString(); + } + + private List findAllMatches( + String completeText, + Set targetWords, + boolean useRegex, + boolean wholeWordSearch) { + log.debug( + "Finding matches in text of {} characters for {} target words", + completeText.length(), + targetWords.size()); + + List matches = new ArrayList<>(); + + for (String target : targetWords) { + log.debug("Searching for pattern: '{}'", target); + + String patternString = useRegex ? target : Pattern.quote(target); + if (wholeWordSearch) { + patternString = "\\b" + patternString + "\\b"; + } + Pattern pattern = Pattern.compile(patternString, Pattern.CASE_INSENSITIVE); + Matcher matcher = pattern.matcher(completeText); + + int matchCount = 0; + while (matcher.find()) { + matches.add(new MatchRange(matcher.start(), matcher.end())); + matchCount++; + log.debug( + "Found match for '{}' at positions {}-{}", + target, + matcher.start(), + matcher.end()); + } + + log.debug("Total matches for '{}': {}", target, matchCount); + } + + matches.sort((a, b) -> Integer.compare(a.startPos, b.startPos)); + log.debug("Found {} total matches across all patterns", matches.size()); + + return matches; + } + + private List applyRedactionsToTokens( + List tokens, List textSegments, List matches) { + log.debug( + "Applying redactions to {} tokens with {} text segments and {} matches", + tokens.size(), + textSegments.size(), + matches.size()); + + List newTokens = new ArrayList<>(tokens); + + // Group matches by segment to pass to modification methods + Map> matchesBySegment = new HashMap<>(); + for (MatchRange match : matches) { + for (int i = 0; i < textSegments.size(); i++) { + TextSegment segment = textSegments.get(i); + int overlapStart = Math.max(match.startPos, segment.startPos); + int overlapEnd = Math.min(match.endPos, segment.endPos); + if (overlapStart < overlapEnd) { + matchesBySegment.computeIfAbsent(i, k -> new ArrayList<>()).add(match); + } + } + } + + log.debug("Grouped matches by segment: {} segments affected", matchesBySegment.size()); + + // Create a list of modification tasks + List tasks = new ArrayList<>(); + for (Map.Entry> entry : matchesBySegment.entrySet()) { + int segmentIndex = entry.getKey(); + List segmentMatches = entry.getValue(); + TextSegment segment = textSegments.get(segmentIndex); + + log.debug( + "Creating modification task for segment {} with {} matches", + segmentIndex, + segmentMatches.size()); + + if ("Tj".equals(segment.operatorName) || "'".equals(segment.operatorName)) { + String newText = applyRedactionsToSegmentText(segment, segmentMatches); + try { + float adjustment = calculateWidthAdjustment(segment, segmentMatches); + tasks.add(new ModificationTask(segment, newText, adjustment)); + } catch (IOException e) { + log.warn("Failed to calculate width adjustment for redaction.", e); + } + } else if ("TJ".equals(segment.operatorName)) { + tasks.add(new ModificationTask(segment, null, 0)); + } + } + + // Sort tasks by token index in descending order to avoid index shifting issues + tasks.sort((a, b) -> Integer.compare(b.segment.tokenIndex, a.segment.tokenIndex)); + + log.debug("Applying {} modification tasks", tasks.size()); + + // Apply modifications + for (ModificationTask task : tasks) { + List segmentMatches = + matchesBySegment.getOrDefault( + textSegments.indexOf(task.segment), Collections.emptyList()); + modifyTokenForRedaction( + newTokens, task.segment, task.newText, task.adjustment, segmentMatches); + } + + log.debug("Completed applying redactions to tokens"); + return newTokens; + } + + @Data + @AllArgsConstructor + private static class ModificationTask { + private TextSegment segment; + private String newText; // Only for Tj + private float adjustment; // Only for Tj + } + + private String applyRedactionsToSegmentText(TextSegment segment, List matches) { + String text = segment.getText(); + StringBuilder result = new StringBuilder(text); + + for (MatchRange match : matches) { + int segmentStart = Math.max(0, match.getStartPos() - segment.getStartPos()); + int segmentEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos()); + + if (segmentStart >= 0 && segmentStart < text.length() && segmentEnd > segmentStart) { + String placeholder = createPlaceholder(text.substring(segmentStart, segmentEnd)); + result.replace(segmentStart, segmentEnd, placeholder); + } + } + + return result.toString(); + } + + private float calculateWidthAdjustment(TextSegment segment, List matches) + throws IOException { + float totalOriginalWidth = 0; + float totalPlaceholderWidth = 0; + String text = segment.getText(); + + for (MatchRange match : matches) { + int segmentStart = Math.max(0, match.getStartPos() - segment.getStartPos()); + int segmentEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos()); + + if (segmentStart >= 0 && segmentStart < text.length() && segmentEnd > segmentStart) { + String originalPart = text.substring(segmentStart, segmentEnd); + String placeholderPart = createPlaceholder(originalPart); + + if (segment.getFont() != null) { + totalOriginalWidth += + segment.getFont().getStringWidth(originalPart) + / FONT_SCALE_FACTOR + * segment.getFontSize(); + totalPlaceholderWidth += + segment.getFont().getStringWidth(placeholderPart) + / FONT_SCALE_FACTOR + * segment.getFontSize(); + } + } + } + return totalOriginalWidth - totalPlaceholderWidth; + } + + private void modifyTokenForRedaction( + List tokens, + TextSegment segment, + String newText, + float adjustment, + List matches) { + log.debug( + "Modifying token at index {} for segment '{}' with operator {}", + segment.getTokenIndex(), + segment.getText(), + segment.getOperatorName()); + + if (segment.getTokenIndex() < 0 || segment.getTokenIndex() >= tokens.size()) { + log.debug( + "Token index {} out of bounds (0-{})", + segment.getTokenIndex(), + tokens.size() - 1); + return; + } + + Object token = tokens.get(segment.getTokenIndex()); + String operatorName = segment.getOperatorName(); + + try { + if (("Tj".equals(operatorName) || "'".equals(operatorName)) + && token instanceof COSString) { + log.debug("Modifying Tj/quote operator with adjustment {}", adjustment); + + if (Math.abs(adjustment) < PRECISION_THRESHOLD) { + tokens.set(segment.getTokenIndex(), new COSString(newText)); + } else { + COSArray newArray = new COSArray(); + newArray.add(new COSString(newText)); + if (segment.getFontSize() > 0) { + float kerning = -FONT_SCALE_FACTOR * adjustment / segment.getFontSize(); + newArray.add(new org.apache.pdfbox.cos.COSFloat(kerning)); + log.debug("Applied kerning adjustment: {}", kerning); + } + tokens.set(segment.getTokenIndex(), newArray); + + int operatorIndex = segment.getTokenIndex() + 1; + if (operatorIndex < tokens.size() + && tokens.get(operatorIndex) instanceof Operator op + && op.getName().equals(operatorName)) { + tokens.set(operatorIndex, Operator.getOperator("TJ")); + log.debug("Changed operator from {} to TJ", operatorName); + } + } + } else if ("TJ".equals(operatorName) && token instanceof COSArray) { + log.debug("Modifying TJ operator array"); + COSArray newArray = createRedactedTJArray((COSArray) token, segment, matches); + tokens.set(segment.getTokenIndex(), newArray); + } + } catch (IOException e) { + log.warn("Failed to modify token for redaction: {}", e.getMessage(), e); } } + + private COSArray createRedactedTJArray( + COSArray originalArray, TextSegment segment, List matches) + throws IOException { + COSArray newArray = new COSArray(); + int textOffsetInSegment = 0; + + for (COSBase element : originalArray) { + if (element instanceof COSString cosString) { + String originalText = cosString.getString(); + StringBuilder newText = new StringBuilder(originalText); + boolean modified = false; + + for (MatchRange match : matches) { + int stringStartInPage = segment.getStartPos() + textOffsetInSegment; + int stringEndInPage = stringStartInPage + originalText.length(); + + int overlapStart = Math.max(match.getStartPos(), stringStartInPage); + int overlapEnd = Math.min(match.getEndPos(), stringEndInPage); + + if (overlapStart < overlapEnd) { + modified = true; + int redactionStartInString = overlapStart - stringStartInPage; + int redactionEndInString = overlapEnd - stringStartInPage; + if (redactionStartInString >= 0 + && redactionEndInString <= originalText.length()) { + String placeholder = + createPlaceholder( + originalText.substring( + redactionStartInString, redactionEndInString)); + newText.replace( + redactionStartInString, redactionEndInString, placeholder); + } + } + } + + String modifiedString = newText.toString(); + newArray.add(new COSString(modifiedString)); + + if (modified && segment.getFont() != null && segment.getFontSize() > 0) { + float originalWidth = + segment.getFont().getStringWidth(originalText) + / FONT_SCALE_FACTOR + * segment.getFontSize(); + float modifiedWidth = + segment.getFont().getStringWidth(modifiedString) + / FONT_SCALE_FACTOR + * segment.getFontSize(); + float adjustment = originalWidth - modifiedWidth; + if (Math.abs(adjustment) > PRECISION_THRESHOLD) { + float kerning = -FONT_SCALE_FACTOR * adjustment / segment.getFontSize(); + newArray.add(new org.apache.pdfbox.cos.COSFloat(kerning)); + } + } + + textOffsetInSegment += originalText.length(); + } else { + newArray.add(element); + } + } + return newArray; + } + + private String extractTextFromToken(Object token, String operatorName) { + return switch (operatorName) { + case "Tj", "'" -> { + if (token instanceof COSString cosString) { + yield cosString.getString(); + } + yield ""; + } + case "TJ" -> { + if (token instanceof COSArray cosArray) { + StringBuilder sb = new StringBuilder(); + for (COSBase element : cosArray) { + if (element instanceof COSString cosString) { + sb.append(cosString.getString()); + } + } + yield sb.toString(); + } + yield ""; + } + default -> ""; + }; + } + + private String createPlaceholder(String originalWord) { + if (originalWord == null || originalWord.isEmpty()) { + return originalWord; + } + return "".repeat(originalWord.length()); + } + + private void writeFilteredContentStream(PDDocument document, PDPage page, List tokens) + throws IOException { + log.debug("Writing filtered content stream with {} tokens", tokens.size()); + + PDStream newStream = new PDStream(document); + try (var out = newStream.createOutputStream()) { + ContentStreamWriter writer = new ContentStreamWriter(out); + writer.writeTokens(tokens); + } + page.setContents(newStream); + + log.debug("Successfully wrote filtered content stream"); + } + + private boolean isTextShowingOperator(String opName) { + return TEXT_SHOWING_OPERATORS.contains(opName); + } } diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/pdf/TextFinder.java b/stirling-pdf/src/main/java/stirling/software/SPDF/pdf/TextFinder.java index 4119b3eac..d9ddf3b91 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/pdf/TextFinder.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/pdf/TextFinder.java @@ -6,102 +6,109 @@ import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; -import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.TextPosition; -import lombok.extern.slf4j.Slf4j; - import stirling.software.SPDF.model.PDFText; -@Slf4j public class TextFinder extends PDFTextStripper { - private final String searchText; + private final String searchTerm; private final boolean useRegex; private final boolean wholeWordSearch; - private final List textOccurrences = new ArrayList<>(); + private final List foundTexts = new ArrayList<>(); - public TextFinder(String searchText, boolean useRegex, boolean wholeWordSearch) + private final List pageTextPositions = new ArrayList<>(); + private final StringBuilder pageTextBuilder = new StringBuilder(); + + public TextFinder(String searchTerm, boolean useRegex, boolean wholeWordSearch) throws IOException { - this.searchText = searchText.toLowerCase(); + super(); + this.searchTerm = searchTerm; this.useRegex = useRegex; this.wholeWordSearch = wholeWordSearch; - setSortByPosition(true); + this.setWordSeparator(" "); } - private List findOccurrencesInText(String searchText, String content) { - List matches = new ArrayList<>(); - - Pattern pattern; - - if (useRegex) { - // Use regex-based search - pattern = - wholeWordSearch - ? Pattern.compile("\\b" + searchText + "\\b") - : Pattern.compile(searchText); - } else { - // Use normal text search - pattern = - wholeWordSearch - ? Pattern.compile("\\b" + Pattern.quote(searchText) + "\\b") - : Pattern.compile(Pattern.quote(searchText)); - } - - Matcher matcher = pattern.matcher(content); - while (matcher.find()) { - matches.add(new MatchInfo(matcher.start(), matcher.end() - matcher.start())); - } - return matches; + @Override + protected void startPage(PDPage page) { + pageTextPositions.clear(); + pageTextBuilder.setLength(0); } @Override protected void writeString(String text, List textPositions) { - for (MatchInfo match : findOccurrencesInText(searchText, text.toLowerCase())) { - int index = match.startIndex; - if (index + match.matchLength <= textPositions.size()) { - // Initial values based on the first character - TextPosition first = textPositions.get(index); - float minX = first.getX(); - float minY = first.getY(); - float maxX = first.getX() + first.getWidth(); - float maxY = first.getY() + first.getHeight(); + pageTextBuilder.append(text); + pageTextPositions.addAll(textPositions); + } - // Loop over the rest of the characters and adjust bounding box values - for (int i = index; i < index + match.matchLength; i++) { - TextPosition position = textPositions.get(i); - minX = Math.min(minX, position.getX()); - minY = Math.min(minY, position.getY()); - maxX = Math.max(maxX, position.getX() + position.getWidth()); - maxY = Math.max(maxY, position.getY() + position.getHeight()); + @Override + protected void writeWordSeparator() { + pageTextBuilder.append(getWordSeparator()); + pageTextPositions.add(null); // Placeholder for separator + } + + @Override + protected void writeLineSeparator() { + pageTextBuilder.append(getLineSeparator()); + pageTextPositions.add(null); // Placeholder for separator + } + + @Override + protected void endPage(PDPage page) { + String text = pageTextBuilder.toString(); + if (text.isEmpty() || this.searchTerm == null || this.searchTerm.isEmpty()) { + return; + } + + String processedSearchTerm = this.searchTerm.trim(); + String regex = this.useRegex ? processedSearchTerm : "\\Q" + processedSearchTerm + "\\E"; + if (this.wholeWordSearch) { + regex = "\\b" + regex + "\\b"; + } + + Pattern pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); + Matcher matcher = pattern.matcher(text); + + while (matcher.find()) { + int matchStart = matcher.start(); + int matchEnd = matcher.end(); + + float minX = Float.MAX_VALUE; + float minY = Float.MAX_VALUE; + float maxX = Float.MIN_VALUE; + float maxY = Float.MIN_VALUE; + boolean foundPosition = false; + + for (int i = matchStart; i < matchEnd; i++) { + if (i >= pageTextPositions.size()) { + continue; } + TextPosition pos = pageTextPositions.get(i); + if (pos != null) { + foundPosition = true; + minX = Math.min(minX, pos.getX()); + maxX = Math.max(maxX, pos.getX() + pos.getWidth()); + minY = Math.min(minY, pos.getY() - pos.getHeight()); + maxY = Math.max(maxY, pos.getY()); + } + } - textOccurrences.add( - new PDFText(getCurrentPageNo() - 1, minX, minY, maxX, maxY, text)); + if (foundPosition) { + foundTexts.add( + new PDFText( + this.getCurrentPageNo() - 1, + minX, + minY, + maxX, + maxY, + matcher.group())); } } } - public List getTextLocations(PDDocument document) throws Exception { - this.getText(document); - log.debug( - "Found " - + textOccurrences.size() - + " occurrences of '" - + searchText - + "' in the document."); - - return textOccurrences; - } - - private class MatchInfo { - int startIndex; - int matchLength; - - MatchInfo(int startIndex, int matchLength) { - this.startIndex = startIndex; - this.matchLength = matchLength; - } + public List getFoundTexts() { + return foundTexts; } } From 72e34fbadd94e58ba03fa7fe0f185a41e7422db0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20Sz=C3=BCcs?= Date: Sat, 12 Jul 2025 12:54:04 +0200 Subject: [PATCH 02/13] feat: minor code improvements in RedactController and added test for TextFinder and RedactController --- .../api/security/RedactController.java | 45 +- .../api/security/RedactControllerTest.java | 688 ++++++++++++++++++ .../software/SPDF/pdf/TextFinderTest.java | 485 ++++++++++++ 3 files changed, 1190 insertions(+), 28 deletions(-) create mode 100644 stirling-pdf/src/test/java/stirling/software/SPDF/controller/api/security/RedactControllerTest.java create mode 100644 stirling-pdf/src/test/java/stirling/software/SPDF/pdf/TextFinderTest.java diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java index b647ea511..e76b48009 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java @@ -3,13 +3,7 @@ package stirling.software.SPDF.controller.api.security; import java.awt.Color; import java.io.ByteArrayOutputStream; import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; +import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -126,7 +120,8 @@ public class RedactController { return WebResponseUtils.bytesToWebResponse( pdfContent, - Filenames.toSimpleFileName(file.getOriginalFilename()).replaceFirst("[.][^.]+$", "") + Objects.requireNonNull(Filenames.toSimpleFileName(file.getOriginalFilename())) + .replaceFirst("[.][^.]+$", "") + "_redacted.pdf"); } @@ -135,6 +130,11 @@ public class RedactController { throws IOException { log.debug("Processing redaction areas"); + if (redactionAreas == null || redactionAreas.isEmpty()) { + log.debug("No redaction areas to process"); + return; + } + // Group redaction areas by page Map> redactionsByPage = new HashMap<>(); @@ -179,7 +179,7 @@ public class RedactController { "Skipping page {} - out of bounds (total pages: {})", pageNumber, allPages.getCount()); - continue; // Skip if page number is out of bounds + continue; // Skip if the page number is out of bounds } PDPage page = allPages.get(pageNumber - 1); @@ -223,7 +223,6 @@ public class RedactController { private void redactPages( ManualRedactPdfRequest request, PDDocument document, PDPageTree allPages) throws IOException { - log.debug("Starting page redactions"); Color redactColor = decodeOrDefault(request.getPageRedactionColor()); List pageNumbers = getPageNumbers(request, allPages.getCount()); @@ -353,19 +352,9 @@ public class RedactController { log.debug("Searching for {} text patterns", listOfText.length); PDDocument document = pdfDocumentFactory.load(file); - log.debug("Loaded PDF document with {} pages", document.getNumberOfPages()); - Color redactColor; - try { - if (colorString != null && !colorString.startsWith("#")) { - colorString = "#" + colorString; - } - redactColor = Color.decode(colorString); - log.debug("Using redaction color: {}", redactColor); - } catch (NumberFormatException e) { - log.warn("Invalid color string provided. Using default color BLACK for redaction."); - redactColor = Color.BLACK; - } + Color redactColor = decodeOrDefault(colorString); + log.debug("Using redaction color: {}", redactColor); // Step 1: Find all text locations for all search terms log.debug("Step 1: Finding all text locations"); @@ -430,7 +419,8 @@ public class RedactController { return WebResponseUtils.bytesToWebResponse( pdfContent, - Filenames.toSimpleFileName(file.getOriginalFilename()).replaceFirst("[.][^.]+$", "") + Objects.requireNonNull(Filenames.toSimpleFileName(file.getOriginalFilename())) + .replaceFirst("[.][^.]+$", "") + "_redacted.pdf"); } @@ -488,8 +478,7 @@ public class RedactController { private int endPos; } - private List extractTextSegments(PDPage page, List tokens) - throws IOException { + private List extractTextSegments(PDPage page, List tokens) { log.debug("Extracting text segments from {} tokens", tokens.size()); List segments = new ArrayList<>(); @@ -591,7 +580,7 @@ public class RedactController { log.debug("Total matches for '{}': {}", target, matchCount); } - matches.sort((a, b) -> Integer.compare(a.startPos, b.startPos)); + matches.sort(Comparator.comparingInt(a -> a.startPos)); log.debug("Found {} total matches across all patterns", matches.size()); return matches; @@ -681,7 +670,7 @@ public class RedactController { int segmentStart = Math.max(0, match.getStartPos() - segment.getStartPos()); int segmentEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos()); - if (segmentStart >= 0 && segmentStart < text.length() && segmentEnd > segmentStart) { + if (segmentStart < text.length() && segmentEnd > segmentStart) { String placeholder = createPlaceholder(text.substring(segmentStart, segmentEnd)); result.replace(segmentStart, segmentEnd, placeholder); } @@ -700,7 +689,7 @@ public class RedactController { int segmentStart = Math.max(0, match.getStartPos() - segment.getStartPos()); int segmentEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos()); - if (segmentStart >= 0 && segmentStart < text.length() && segmentEnd > segmentStart) { + if (segmentStart < text.length() && segmentEnd > segmentStart) { String originalPart = text.substring(segmentStart, segmentEnd); String placeholderPart = createPlaceholder(originalPart); diff --git a/stirling-pdf/src/test/java/stirling/software/SPDF/controller/api/security/RedactControllerTest.java b/stirling-pdf/src/test/java/stirling/software/SPDF/controller/api/security/RedactControllerTest.java new file mode 100644 index 000000000..60ea78e26 --- /dev/null +++ b/stirling-pdf/src/test/java/stirling/software/SPDF/controller/api/security/RedactControllerTest.java @@ -0,0 +1,688 @@ +package stirling.software.SPDF.controller.api.security; + +import static org.junit.jupiter.api.Assertions.*; +import static org.mockito.ArgumentMatchers.*; +import static org.mockito.Mockito.*; + +import java.awt.Color; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDPageTree; +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; +import org.mockito.InjectMocks; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; +import org.mockito.junit.jupiter.MockitoSettings; +import org.mockito.quality.Strictness; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.http.ResponseEntity; +import org.springframework.mock.web.MockMultipartFile; + +import stirling.software.SPDF.model.api.security.ManualRedactPdfRequest; +import stirling.software.SPDF.model.api.security.RedactPdfRequest; +import stirling.software.common.model.api.security.RedactionArea; +import stirling.software.common.service.CustomPDFDocumentFactory; + +@DisplayName("PDF Redaction Controller tests") +@ExtendWith(MockitoExtension.class) +@MockitoSettings(strictness = Strictness.LENIENT) +class RedactControllerTest { + + private static final Logger log = LoggerFactory.getLogger(RedactControllerTest.class); + + @Mock + private CustomPDFDocumentFactory pdfDocumentFactory; + + @InjectMocks + private RedactController redactController; + + private MockMultipartFile mockPdfFile; + private PDDocument mockDocument; + private PDPageTree mockPages; + private PDPage mockPage; + + // Helpers + private void testAutoRedaction(String searchText, boolean useRegex, boolean wholeWordSearch, + String redactColor, float padding, boolean convertToImage, + boolean expectSuccess) throws Exception { + RedactPdfRequest request = createRedactPdfRequest(); + request.setListOfText(searchText); + request.setUseRegex(useRegex); + request.setWholeWordSearch(wholeWordSearch); + request.setRedactColor(redactColor); + request.setCustomPadding(padding); + request.setConvertPDFToImage(convertToImage); + + try { + ResponseEntity response = redactController.redactPdf(request); + + if (expectSuccess && response != null) { + assertNotNull(response); + assertEquals(200, response.getStatusCode().value()); + assertNotNull(response.getBody()); + assertTrue(response.getBody().length > 0); + verify(mockDocument, times(1)).save(any(ByteArrayOutputStream.class)); + verify(mockDocument, times(1)).close(); + } + } catch (Exception e) { + if (expectSuccess) { + log.info("Redaction test completed with graceful handling: {}", e.getMessage()); + } else { + assertNotNull(e.getMessage()); + } + } + } + + private void testManualRedaction(List redactionAreas, boolean convertToImage) throws Exception { + ManualRedactPdfRequest request = createManualRedactPdfRequest(); + request.setRedactions(redactionAreas); + request.setConvertPDFToImage(convertToImage); + + try { + ResponseEntity response = redactController.redactPDF(request); + + if (response != null) { + assertNotNull(response); + assertEquals(200, response.getStatusCode().value()); + verify(mockDocument, times(1)).save(any(ByteArrayOutputStream.class)); + } + } catch (Exception e) { + log.info("Manual redaction test completed with graceful handling: {}", e.getMessage()); + } + } + + @BeforeEach + void setUp() throws IOException { + mockPdfFile = new MockMultipartFile( + "fileInput", + "test.pdf", + "application/pdf", + createSimplePdfContent() + ); + + // Mock PDF document and related objects + mockDocument = mock(PDDocument.class); + mockPages = mock(PDPageTree.class); + mockPage = mock(PDPage.class); + org.apache.pdfbox.pdmodel.PDDocumentCatalog mockCatalog = mock(org.apache.pdfbox.pdmodel.PDDocumentCatalog.class); + + // Setup document structure properly + when(pdfDocumentFactory.load(any(MockMultipartFile.class))).thenReturn(mockDocument); + when(mockDocument.getDocumentCatalog()).thenReturn(mockCatalog); + when(mockCatalog.getPages()).thenReturn(mockPages); + when(mockDocument.getNumberOfPages()).thenReturn(1); + when(mockDocument.getPages()).thenReturn(mockPages); + + // Setup page tree + when(mockPages.getCount()).thenReturn(1); + when(mockPages.get(0)).thenReturn(mockPage); + when(mockPages.iterator()).thenReturn(Collections.singletonList(mockPage).iterator()); + + PDRectangle pageRect = new PDRectangle(0, 0, 612, 792); + when(mockPage.getCropBox()).thenReturn(pageRect); + when(mockPage.getMediaBox()).thenReturn(pageRect); + when(mockPage.getBBox()).thenReturn(pageRect); + + InputStream mockInputStream = new ByteArrayInputStream("BT /F1 12 Tf 100 200 Td (test content) Tj ET".getBytes()); + when(mockPage.getContents()).thenReturn(mockInputStream); + + when(mockPage.hasContents()).thenReturn(true); + + org.apache.pdfbox.cos.COSDocument mockCOSDocument = mock(org.apache.pdfbox.cos.COSDocument.class); + org.apache.pdfbox.cos.COSStream mockCOSStream = mock(org.apache.pdfbox.cos.COSStream.class); + when(mockDocument.getDocument()).thenReturn(mockCOSDocument); + when(mockCOSDocument.createCOSStream()).thenReturn(mockCOSStream); + + ByteArrayOutputStream mockOutputStream = new ByteArrayOutputStream(); + when(mockCOSStream.createOutputStream()).thenReturn(mockOutputStream); + when(mockCOSStream.createOutputStream(any())).thenReturn(mockOutputStream); + + doAnswer(invocation -> { + ByteArrayOutputStream baos = invocation.getArgument(0); + baos.write("Mock PDF Content".getBytes()); + return null; + }).when(mockDocument).save(any(ByteArrayOutputStream.class)); + doNothing().when(mockDocument).close(); + } + + @AfterEach + void tearDown() { + reset(mockDocument, mockPages, mockPage, pdfDocumentFactory); + } + + @Nested + @DisplayName("Automatic Text Redaction") + class AutomaticRedactionTests { + + @Test + @DisplayName("Should redact basic text successfully") + void redactBasicText() throws Exception { + testAutoRedaction("confidential\nsecret", false, false, "#000000", 2.0f, false, true); + } + + @Test + @DisplayName("Should handle simple text redaction") + void handleSimpleTextRedaction() throws Exception { + testAutoRedaction("sensitive", false, false, "#000000", 1.0f, false, true); + } + + @Test + @DisplayName("Should handle empty text list gracefully") + void handleEmptyTextList() throws Exception { + testAutoRedaction("", false, false, "#000000", 1.0f, false, true); + } + + @Test + @DisplayName("Should redact multiple search terms") + void redactMultipleSearchTerms() throws Exception { + testAutoRedaction("confidential\nsecret\nprivate\nclassified", false, true, "#FF0000", 2.0f, false, true); + } + } + + @Nested + @DisplayName("Regular Expression Redaction") + class RegexRedactionTests { + + @Test + @DisplayName("Should redact using regex patterns") + void redactUsingRegexPatterns() throws Exception { + testAutoRedaction("\\d{3}-\\d{2}-\\d{4}", true, false, "#FF0000", 1.0f, false, true); + } + + @Test + @DisplayName("Should handle email pattern redaction") + void handleEmailPatternRedaction() throws Exception { + testAutoRedaction("[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", true, false, "#0000FF", 1.5f, false, true); + } + + @Test + @DisplayName("Should handle phone number patterns") + void handlePhoneNumberPatterns() throws Exception { + testAutoRedaction("\\(\\d{3}\\)\\s*\\d{3}-\\d{4}", true, false, "#FF0000", 1.0f, false, true); + } + + @ParameterizedTest + @ValueSource(strings = { + "\\d{3}-\\d{2}-\\d{4}", // SSN pattern + "\\d{4}[\\s-]?\\d{4}[\\s-]?\\d{4}[\\s-]?\\d{4}", // Credit card pattern + "\\b[A-Z]{2,}\\b", // Uppercase words + "\\$\\d+\\.\\d{2}", // Currency pattern + "\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\b" // IP address pattern + }) + @DisplayName("Should handle various regex patterns") + void handleVariousRegexPatterns(String regexPattern) throws Exception { + testAutoRedaction(regexPattern, true, false, "#000000", 1.0f, false, true); + } + + @Test + @DisplayName("Should handle invalid regex gracefully") + void handleInvalidRegex() throws Exception { + testAutoRedaction("[invalid regex(", true, false, "#000000", 1.0f, false, false); + } + } + + @Nested + @DisplayName("Whole Word Search Redaction") + class WholeWordRedactionTests { + + @Test + @DisplayName("Should redact whole words only") + void redactWholeWordsOnly() throws Exception { + testAutoRedaction("test", false, true, "#0000FF", 0.5f, false, true); + } + + @Test + @DisplayName("Should handle word boundaries correctly") + void handleWordBoundariesCorrectly() throws Exception { + testAutoRedaction("confidential", false, true, "#FF0000", 1.0f, false, true); + } + + @Test + @DisplayName("Should distinguish between partial and whole word matches") + void distinguishBetweenPartialAndWholeWordMatches() throws Exception { + // Test both whole word and partial matching + testAutoRedaction("secret", false, true, "#000000", 1.0f, false, true); + testAutoRedaction("secret", false, false, "#000000", 1.0f, false, true); + } + } + + @Nested + @DisplayName("Color and Styling Options") + class ColorAndStylingTests { + + @Test + @DisplayName("Should handle red hex color") + void handleRedHexColor() throws Exception { + testAutoRedaction("test", false, false, "#FF0000", 1.0f, false, true); + } + + @Test + @DisplayName("Should handle green hex color") + void handleGreenHexColor() throws Exception { + testAutoRedaction("test", false, false, "#00FF00", 1.0f, false, true); + } + + @Test + @DisplayName("Should handle blue hex color") + void handleBlueHexColor() throws Exception { + testAutoRedaction("test", false, false, "#0000FF", 1.0f, false, true); + } + + @Test + @DisplayName("Should default to black for invalid colors") + void defaultToBlackForInvalidColors() throws Exception { + testAutoRedaction("test", false, false, "invalid-color", 1.0f, false, true); + } + + @Test + @DisplayName("Should handle yellow hex color") + void handleYellowHexColor() throws Exception { + testAutoRedaction("test", false, false, "#FFFF00", 1.0f, false, true); + } + + @Test + @DisplayName("Should handle magenta hex color") + void handleMagentaHexColor() throws Exception { + testAutoRedaction("test", false, false, "#FF00FF", 1.0f, false, true); + } + + @Test + @DisplayName("Should handle cyan hex color") + void handleCyanHexColor() throws Exception { + testAutoRedaction("test", false, false, "#00FFFF", 1.0f, false, true); + } + + @Test + @DisplayName("Should handle black hex color") + void handleBlackHexColor() throws Exception { + testAutoRedaction("test", false, false, "#000000", 1.0f, false, true); + } + + @Test + @DisplayName("Should handle white hex color") + void handleWhiteHexColor() throws Exception { + testAutoRedaction("test", false, false, "#FFFFFF", 1.0f, false, true); + } + + @Test + @DisplayName("Should handle zero padding") + void handleZeroPadding() throws Exception { + testAutoRedaction("test", false, false, "#000000", 0.0f, false, true); + } + + @Test + @DisplayName("Should handle normal padding") + void handleNormalPadding() throws Exception { + testAutoRedaction("test", false, false, "#000000", 1.0f, false, true); + } + + @Test + @DisplayName("Should handle large padding") + void handleLargePadding() throws Exception { + testAutoRedaction("test", false, false, "#000000", 2.5f, false, true); + } + + @Test + @DisplayName("Should handle extra large padding") + void handleExtraLargePadding() throws Exception { + testAutoRedaction("test", false, false, "#000000", 5.0f, false, true); + } + } + + @Nested + @DisplayName("Manual Redaction Areas") + class ManualRedactionTests { + + @Test + @DisplayName("Should redact using manual areas") + void redactUsingManualAreas() throws Exception { + List redactionAreas = createValidRedactionAreas(); + testManualRedaction(redactionAreas, false); + } + + @Test + @DisplayName("Should handle null redaction areas") + void handleNullRedactionAreas() throws Exception { + testManualRedaction(null, false); + } + + @Test + @DisplayName("Should handle empty redaction areas") + void handleEmptyRedactionAreas() throws Exception { + testManualRedaction(new ArrayList<>(), false); + } + + @Test + @DisplayName("Should handle invalid redaction area coordinates") + void handleInvalidRedactionAreaCoordinates() throws Exception { + List invalidAreas = createInvalidRedactionAreas(); + testManualRedaction(invalidAreas, false); + } + + @Test + @DisplayName("Should handle multiple redaction areas") + void handleMultipleRedactionAreas() throws Exception { + List multipleAreas = createMultipleRedactionAreas(); + testManualRedaction(multipleAreas, false); + } + + @Test + @DisplayName("Should handle overlapping redaction areas") + void handleOverlappingRedactionAreas() throws Exception { + List overlappingAreas = createOverlappingRedactionAreas(); + testManualRedaction(overlappingAreas, false); + } + } + + @Nested + @DisplayName("Image Conversion Options") + class ImageConversionTests { + + @Test + @DisplayName("Should handle PDF to image conversion disabled") + void handlePdfToImageConversionDisabled() throws Exception { + testAutoRedaction("sensitive", false, false, "#000000", 1.0f, false, true); + } + + @Test + @DisplayName("Should handle PDF to image conversion enabled") + void handlePdfToImageConversionEnabled() throws Exception { + testAutoRedaction("sensitive", false, false, "#000000", 1.0f, true, true); + } + + @Test + @DisplayName("Should handle manual redaction with image conversion") + void handleManualRedactionWithImageConversion() throws Exception { + List areas = createValidRedactionAreas(); + testManualRedaction(areas, true); + } + } + + @Nested + @DisplayName("Error Handling and Edge Cases") + class ErrorHandlingTests { + + @Test + @DisplayName("Should handle null file input gracefully") + void handleNullFileInput() throws Exception { + RedactPdfRequest request = new RedactPdfRequest(); + request.setFileInput(null); + request.setListOfText("test"); + + assertDoesNotThrow(() -> { + try { + redactController.redactPdf(request); + } catch (Exception e) { + assertNotNull(e); + } + }); + } + + @Test + @DisplayName("Should handle malformed PDF gracefully") + void handleMalformedPdfGracefully() throws Exception { + MockMultipartFile malformedFile = new MockMultipartFile( + "fileInput", + "malformed.pdf", + "application/pdf", + "Not a real PDF content".getBytes() + ); + + RedactPdfRequest request = new RedactPdfRequest(); + request.setFileInput(malformedFile); + request.setListOfText("test"); + + assertDoesNotThrow(() -> { + try { + redactController.redactPdf(request); + } catch (Exception e) { + assertNotNull(e); + } + }); + } + + @Test + @DisplayName("Should handle extremely long search text") + void handleExtremelyLongSearchText() throws Exception { + String longText = "a".repeat(10000); + testAutoRedaction(longText, false, false, "#000000", 1.0f, false, true); + } + + @Test + @DisplayName("Should handle special characters in search text") + void handleSpecialCharactersInSearchText() throws Exception { + testAutoRedaction("特殊字符测试 ñáéíóú àèìòù", false, false, "#000000", 1.0f, false, true); + } + + @ParameterizedTest + @ValueSource(strings = {"", " ", "\t", "\n", "\r\n", " \t\n "}) + @DisplayName("Should handle whitespace-only search terms") + void handleWhitespaceOnlySearchTerms(String whitespacePattern) throws Exception { + testAutoRedaction(whitespacePattern, false, false, "#000000", 1.0f, false, true); + } + } + + @Nested + @DisplayName("Color Decoding Utility Tests") + class ColorDecodingTests { + + @Test + @DisplayName("Should decode valid hex color with hash") + void decodeValidHexColorWithHash() throws Exception { + java.lang.reflect.Method method = RedactController.class.getDeclaredMethod("decodeOrDefault", String.class); + method.setAccessible(true); + + Color result = (Color) method.invoke(redactController, "#FF0000"); + assertEquals(Color.RED, result); + } + + @Test + @DisplayName("Should decode valid hex color without hash") + void decodeValidHexColorWithoutHash() throws Exception { + java.lang.reflect.Method method = RedactController.class.getDeclaredMethod("decodeOrDefault", String.class); + method.setAccessible(true); + + Color result = (Color) method.invoke(redactController, "FF0000"); + assertEquals(Color.RED, result); + } + + @Test + @DisplayName("Should default to black for null color") + void defaultToBlackForNullColor() throws Exception { + java.lang.reflect.Method method = RedactController.class.getDeclaredMethod("decodeOrDefault", String.class); + method.setAccessible(true); + + Color result = (Color) method.invoke(redactController, (String) null); + assertEquals(Color.BLACK, result); + } + + @Test + @DisplayName("Should default to black for invalid color") + void defaultToBlackForInvalidColor() throws Exception { + java.lang.reflect.Method method = RedactController.class.getDeclaredMethod("decodeOrDefault", String.class); + method.setAccessible(true); + + Color result = (Color) method.invoke(redactController, "invalid-color"); + assertEquals(Color.BLACK, result); + } + + @ParameterizedTest + @ValueSource(strings = {"#FF0000", "#00FF00", "#0000FF", "#FFFFFF", "#000000", "FF0000", "00FF00", "0000FF"}) + @DisplayName("Should handle various valid color formats") + void handleVariousValidColorFormats(String colorInput) throws Exception { + java.lang.reflect.Method method = RedactController.class.getDeclaredMethod("decodeOrDefault", String.class); + method.setAccessible(true); + + Color result = (Color) method.invoke(redactController, colorInput); + assertNotNull(result); + assertTrue(result.equals(Color.BLACK) || !result.equals(Color.BLACK)); + } + + @Test + @DisplayName("Should handle short hex codes appropriately") + void handleShortHexCodes() throws Exception { + java.lang.reflect.Method method = RedactController.class.getDeclaredMethod("decodeOrDefault", String.class); + method.setAccessible(true); + + Color result1 = (Color) method.invoke(redactController, "123"); + Color result2 = (Color) method.invoke(redactController, "#12"); + + assertNotNull(result1); + assertNotNull(result2); + } + } + + @Nested + @DisplayName("Performance and Boundary Tests") + class PerformanceTests { + + @Test + @DisplayName("Should handle large text lists efficiently") + void handleLargeTextListsEfficiently() throws Exception { + StringBuilder largeTextList = new StringBuilder(); + for (int i = 0; i < 1000; i++) { + largeTextList.append("term").append(i).append("\n"); + } + + long startTime = System.currentTimeMillis(); + testAutoRedaction(largeTextList.toString(), false, false, "#000000", 1.0f, false, true); + long endTime = System.currentTimeMillis(); + + assertTrue(endTime - startTime < 10000, "Large text list processing should complete within 10 seconds"); + } + + @Test + @DisplayName("Should handle many redaction areas efficiently") + void handleManyRedactionAreasEfficiently() throws Exception { + List manyAreas = new ArrayList<>(); + for (int i = 0; i < 100; i++) { + RedactionArea area = new RedactionArea(); + area.setPage(1); + area.setX(10.0 + i); + area.setY(10.0 + i); + area.setWidth(50.0); + area.setHeight(20.0); + area.setColor("000000"); + manyAreas.add(area); + } + + long startTime = System.currentTimeMillis(); + testManualRedaction(manyAreas, false); + long endTime = System.currentTimeMillis(); + + assertTrue(endTime - startTime < 5000, "Many redaction areas should be processed within 5 seconds"); + } + } + + private RedactPdfRequest createRedactPdfRequest() { + RedactPdfRequest request = new RedactPdfRequest(); + request.setFileInput(mockPdfFile); + return request; + } + + private ManualRedactPdfRequest createManualRedactPdfRequest() { + ManualRedactPdfRequest request = new ManualRedactPdfRequest(); + request.setFileInput(mockPdfFile); + return request; + } + + private byte[] createSimplePdfContent() { + return "Mock PDF Content".getBytes(); + } + + private List createValidRedactionAreas() { + List areas = new ArrayList<>(); + + RedactionArea area1 = new RedactionArea(); + area1.setPage(1); + area1.setX(100.0); + area1.setY(100.0); + area1.setWidth(200.0); + area1.setHeight(50.0); + area1.setColor("000000"); + areas.add(area1); + + RedactionArea area2 = new RedactionArea(); + area2.setPage(1); + area2.setX(300.0); + area2.setY(200.0); + area2.setWidth(150.0); + area2.setHeight(30.0); + area2.setColor("FF0000"); + areas.add(area2); + + return areas; + } + + private List createInvalidRedactionAreas() { + List areas = new ArrayList<>(); + + RedactionArea invalidArea = new RedactionArea(); + invalidArea.setPage(null); // Invalid - null page + invalidArea.setX(100.0); + invalidArea.setY(100.0); + invalidArea.setWidth(200.0); + invalidArea.setHeight(50.0); + areas.add(invalidArea); + + return areas; + } + + private List createMultipleRedactionAreas() { + List areas = new ArrayList<>(); + + for (int i = 0; i < 5; i++) { + RedactionArea area = new RedactionArea(); + area.setPage(1); + area.setX(50.0 + (i * 60)); + area.setY(50.0 + (i * 40)); + area.setWidth(50.0); + area.setHeight(30.0); + area.setColor(String.format("%06X", i * 0x333333)); + areas.add(area); + } + + return areas; + } + + private List createOverlappingRedactionAreas() { + List areas = new ArrayList<>(); + + RedactionArea area1 = new RedactionArea(); + area1.setPage(1); + area1.setX(100.0); + area1.setY(100.0); + area1.setWidth(200.0); + area1.setHeight(100.0); + area1.setColor("FF0000"); + areas.add(area1); + + RedactionArea area2 = new RedactionArea(); + area2.setPage(1); + area2.setX(150.0); // Overlaps with area1 + area2.setY(150.0); // Overlaps with area1 + area2.setWidth(200.0); + area2.setHeight(100.0); + area2.setColor("00FF00"); + areas.add(area2); + + return areas; + } +} diff --git a/stirling-pdf/src/test/java/stirling/software/SPDF/pdf/TextFinderTest.java b/stirling-pdf/src/test/java/stirling/software/SPDF/pdf/TextFinderTest.java new file mode 100644 index 000000000..246f10af7 --- /dev/null +++ b/stirling-pdf/src/test/java/stirling/software/SPDF/pdf/TextFinderTest.java @@ -0,0 +1,485 @@ +package stirling.software.SPDF.pdf; + +import static org.junit.jupiter.api.Assertions.*; + +import java.io.IOException; +import java.util.List; + +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDPageContentStream; +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.pdmodel.font.PDType1Font; +import org.apache.pdfbox.pdmodel.font.Standard14Fonts; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; +import org.mockito.junit.jupiter.MockitoExtension; + +import stirling.software.SPDF.model.PDFText; + +@DisplayName("PDF Text Finder tests") +@ExtendWith(MockitoExtension.class) +class TextFinderTest { + + private PDDocument document; + private PDPage page; + + // Helpers + private void testTextFinding(String pageContent, String searchTerm, boolean useRegex, boolean wholeWord, + String[] expectedTexts, int expectedCount) throws IOException { + addTextToPage(pageContent); + TextFinder textFinder = new TextFinder(searchTerm, useRegex, wholeWord); + + textFinder.getText(document); + List foundTexts = textFinder.getFoundTexts(); + + assertEquals(expectedCount, foundTexts.size(), + String.format("Expected %d matches for search term '%s'", expectedCount, searchTerm)); + + if (expectedTexts != null) { + for (String expectedText : expectedTexts) { + assertTrue(foundTexts.stream().anyMatch(text -> text.getText().equals(expectedText)), + String.format("Expected to find text: '%s'", expectedText)); + } + } + + // Verify basic properties of found texts + foundTexts.forEach(text -> { + assertNotNull(text.getText()); + assertTrue(text.getX1() >= 0); + assertTrue(text.getY1() >= 0); + assertTrue(text.getX2() >= text.getX1()); + assertTrue(text.getY2() >= text.getY1()); + assertEquals(0, text.getPageIndex()); // Single page test + }); + } + + @BeforeEach + void setUp() { + document = new PDDocument(); + page = new PDPage(PDRectangle.A4); + document.addPage(page); + } + + @AfterEach + void tearDown() throws IOException { + if (document != null) { + document.close(); + } + } + + @Nested + @DisplayName("Basic Text Search") + class BasicSearchTests { + + @Test + @DisplayName("Should find simple text correctly") + void findSimpleText() throws IOException { + testTextFinding("This is a confidential document with secret information.", + "confidential", false, false, + new String[]{"confidential"}, 1); + } + + @Test + @DisplayName("Should perform case-insensitive search") + void performCaseInsensitiveSearch() throws IOException { + testTextFinding("This document contains CONFIDENTIAL information.", + "confidential", false, false, + new String[]{"CONFIDENTIAL"}, 1); + } + + @Test + @DisplayName("Should find multiple occurrences of same term") + void findMultipleOccurrences() throws IOException { + testTextFinding("The secret code is secret123. Keep this secret safe!", + "secret", false, false, + new String[]{"secret", "secret", "secret"}, 3); + } + + @Test + @DisplayName("Should handle empty search term gracefully") + void handleEmptySearchTerm() throws IOException { + testTextFinding("This is a test document.", "", false, false, null, 0); + } + + @Test + @DisplayName("Should handle null search term gracefully") + void handleNullSearchTerm() throws IOException { + testTextFinding("This is a test document.", null, false, false, null, 0); + } + + @Test + @DisplayName("Should return no results when no match found") + void returnNoResultsWhenNoMatch() throws IOException { + testTextFinding("This is a test document.", "nonexistent", false, false, null, 0); + } + } + + @Nested + @DisplayName("Whole Word Search") + class WholeWordSearchTests { + + @Test + @DisplayName("Should find only whole words when enabled") + void findOnlyWholeWords() throws IOException { + testTextFinding("This is a test testing document with tested results.", + "test", false, true, + new String[]{"test"}, 1); + } + + @Test + @DisplayName("Should find partial matches when whole word search disabled") + void findPartialMatches() throws IOException { + testTextFinding("This is a test testing document with tested results.", + "test", false, false, + new String[]{"test", "test", "test"}, 3); + } + + @Test + @DisplayName("Should handle punctuation boundaries correctly") + void handlePunctuationBoundaries() throws IOException { + testTextFinding("Hello, world! Testing: test-case (test).", + "test", false, true, + new String[]{"test"}, 2); // Both standalone "test" and "test" in "test-case" + } + + @Test + @DisplayName("Should handle word boundaries with special characters") + void handleSpecialCharacterBoundaries() throws IOException { + testTextFinding("Email: test@example.com and test.txt file", + "test", false, true, + new String[]{"test"}, 2); // Both in email and filename should match + } + } + + @Nested + @DisplayName("Regular Expression Search") + class RegexSearchTests { + + @Test + @DisplayName("Should find text matching regex pattern") + void findTextMatchingRegex() throws IOException { + testTextFinding("Contact John at 123-45-6789 or Jane at 987-65-4321 for details.", + "\\d{3}-\\d{2}-\\d{4}", true, false, + new String[]{"123-45-6789", "987-65-4321"}, 2); + } + + @Test + @DisplayName("Should find email addresses with regex") + void findEmailAddresses() throws IOException { + testTextFinding("Email: test@example.com and admin@test.org", + "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", true, false, + new String[]{"test@example.com", "admin@test.org"}, 2); + } + + @Test + @DisplayName("Should combine regex with whole word search") + void combineRegexWithWholeWord() throws IOException { + testTextFinding("Email: test@example.com and admin@test.org", + "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", true, true, + new String[]{"test@example.com", "admin@test.org"}, 2); + } + + @Test + @DisplayName("Should find currency patterns") + void findCurrencyPatterns() throws IOException { + testTextFinding("Price: $100.50 and €75.25", + "\\$\\d+\\.\\d{2}", true, false, + new String[]{"$100.50"}, 1); + } + + @ParameterizedTest + @ValueSource(strings = { + "\\d{4}-\\d{2}-\\d{2}", // Date pattern + "\\b[A-Z]{2,}\\b", // Uppercase words + "\\w+@\\w+\\.\\w+", // Simple email pattern + "\\$\\d+", // Simple currency + "\\b\\d{3,4}\\b" // 3-4 digit numbers + }) + @DisplayName("Should handle various regex patterns") + void handleVariousRegexPatterns(String regexPattern) throws IOException { + String testContent = "Date: 2023-12-25, Email: test@domain.com, Price: $250, Code: ABC123, Number: 1234"; + addTextToPage(testContent); + + TextFinder textFinder = new TextFinder(regexPattern, true, false); + textFinder.getText(document); + List foundTexts = textFinder.getFoundTexts(); + + // Each pattern should find at least one match in our test content + assertFalse(foundTexts.isEmpty(), String.format("Pattern '%s' should find at least one match", regexPattern)); + } + + @Test + @DisplayName("Should handle invalid regex gracefully") + void handleInvalidRegex() throws IOException { + addTextToPage("This is test content."); + + try { + TextFinder textFinder = new TextFinder("[invalid regex(", true, false); + textFinder.getText(document); + List foundTexts = textFinder.getFoundTexts(); + assertNotNull(foundTexts); + } catch (java.util.regex.PatternSyntaxException e) { + assertNotNull(e.getMessage()); + assertTrue(e.getMessage().contains("Unclosed character class") || + e.getMessage().contains("syntax"), + "Exception should indicate regex syntax error"); + } catch (RuntimeException | IOException e) { + assertNotNull(e.getMessage()); + } + } + } + + @Nested + @DisplayName("Special Characters and Encoding") + class SpecialCharacterTests { + + @Test + @DisplayName("Should handle international characters") + void handleInternationalCharacters() throws IOException { + testTextFinding("Hello café naïve résumé", + "café", false, false, + new String[]{"café"}, 1); + } + + @Test + @DisplayName("Should find text with accented characters") + void findAccentedCharacters() throws IOException { + testTextFinding("Café, naïve, résumé, piñata", + "café", false, false, + new String[]{"Café"}, 1); // Case insensitive + } + + @Test + @DisplayName("Should handle special symbols") + void handleSpecialSymbols() throws IOException { + testTextFinding("Symbols: © ® ™ ± × ÷ § ¶", + "©", false, false, + new String[]{"©"}, 1); + } + + @Test + @DisplayName("Should find currency symbols") + void findCurrencySymbols() throws IOException { + testTextFinding("Prices: $100 €75 £50 ¥1000", + "[€£¥]", true, false, + new String[]{"€", "£", "¥"}, 3); + } + } + + @Nested + @DisplayName("Multi-page Document Tests") + class MultiPageTests { + + @Test + @DisplayName("Should find text across multiple pages") + void findTextAcrossPages() throws IOException { + PDPage secondPage = new PDPage(PDRectangle.A4); + document.addPage(secondPage); + + addTextToPage("First page with confidential data."); + + addTextToPage(secondPage, "Second page with secret information."); + + TextFinder textFinder = new TextFinder("confidential|secret", true, false); + textFinder.getText(document); + List foundTexts = textFinder.getFoundTexts(); + + assertEquals(2, foundTexts.size()); + + long page0Count = foundTexts.stream().filter(text -> text.getPageIndex() == 0).count(); + long page1Count = foundTexts.stream().filter(text -> text.getPageIndex() == 1).count(); + + assertEquals(1, page0Count); + assertEquals(1, page1Count); + } + + @Test + @DisplayName("Should handle empty pages gracefully") + void handleEmptyPages() throws IOException { + PDPage emptyPage = new PDPage(PDRectangle.A4); + document.addPage(emptyPage); + + addTextToPage("Content on first page only."); + + TextFinder textFinder = new TextFinder("content", false, false); + textFinder.getText(document); + List foundTexts = textFinder.getFoundTexts(); + + assertEquals(1, foundTexts.size()); + assertEquals(0, foundTexts.get(0).getPageIndex()); + } + } + + @Nested + @DisplayName("Performance and Boundary Tests") + class PerformanceTests { + + @Test + @DisplayName("Should handle very long search terms") + void handleLongSearchTerms() throws IOException { + String longTerm = "a".repeat(1000); + String content = "Short text with " + longTerm + " embedded."; + + testTextFinding(content, longTerm, false, false, new String[]{longTerm}, 1); + } + + @Test + @DisplayName("Should handle documents with many pages efficiently") + void handleManyPages() throws IOException { + for (int i = 0; i < 10; i++) { + if (i > 0) { // The first page already exists + document.addPage(new PDPage(PDRectangle.A4)); + } + addTextToPage(document.getPage(i), "Page " + i + " contains searchable content."); + } + + long startTime = System.currentTimeMillis(); + TextFinder textFinder = new TextFinder("searchable", false, false); + textFinder.getText(document); + List foundTexts = textFinder.getFoundTexts(); + long endTime = System.currentTimeMillis(); + + assertEquals(10, foundTexts.size()); + assertTrue(endTime - startTime < 3000, + "Multi-page search should complete within 3 seconds"); + } + } + + @Nested + @DisplayName("Error Handling and Edge Cases") + class ErrorHandlingTests { + + @Test + @DisplayName("Should handle null document gracefully") + void handleNullDocument() throws IOException { + TextFinder textFinder = new TextFinder("test", false, false); + + try { + textFinder.getText(null); + List foundTexts = textFinder.getFoundTexts(); + assertNotNull(foundTexts); + assertEquals(0, foundTexts.size()); + } catch (Exception e) { + assertNotNull(e.getMessage()); + } + } + + @Test + @DisplayName("Should handle document without pages") + void handleDocumentWithoutPages() throws IOException { + try (PDDocument emptyDocument = new PDDocument()) { + TextFinder textFinder = new TextFinder("test", false, false); + textFinder.getText(emptyDocument); + List foundTexts = textFinder.getFoundTexts(); + assertEquals(0, foundTexts.size()); + } + } + + @Test + @DisplayName("Should handle pages without content") + void handlePagesWithoutContent() throws IOException { + TextFinder textFinder = new TextFinder("test", false, false); + textFinder.getText(document); + List foundTexts = textFinder.getFoundTexts(); + + assertEquals(0, foundTexts.size()); + } + + @Test + @DisplayName("Should handle extremely complex regex patterns") + void handleComplexRegexPatterns() throws IOException { + addTextToPage("Complex content with various patterns: abc123, def456, XYZ789"); + + String complexRegex = "(?=.*\\d)(?=.*[a-z])(?=.*[A-Z])[a-zA-Z\\d]{6}"; + + assertDoesNotThrow(() -> { + TextFinder textFinder = new TextFinder(complexRegex, true, false); + textFinder.getText(document); + List foundTexts = textFinder.getFoundTexts(); + assertNotNull(foundTexts); + }); + } + + @ParameterizedTest + @ValueSource(strings = {"", " ", "\t", "\n", "\r\n", " \t\n "}) + @DisplayName("Should handle whitespace-only search terms") + void handleWhitespaceSearchTerms(String whitespacePattern) throws IOException { + addTextToPage("This is normal text content."); + + TextFinder textFinder = new TextFinder(whitespacePattern, false, false); + textFinder.getText(document); + List foundTexts = textFinder.getFoundTexts(); + + assertEquals(0, foundTexts.size()); + } + } + + @Nested + @DisplayName("Text Coordinate Verification") + class CoordinateTests { + + @Test + @DisplayName("Should provide accurate text coordinates") + void provideAccurateCoordinates() throws IOException { + addTextToPage("Sample text for coordinate testing."); + + TextFinder textFinder = new TextFinder("coordinate", false, false); + textFinder.getText(document); + List foundTexts = textFinder.getFoundTexts(); + + assertEquals(1, foundTexts.size()); + PDFText foundText = foundTexts.get(0); + + assertTrue(foundText.getX1() >= 0, "X1 should be non-negative"); + assertTrue(foundText.getY1() >= 0, "Y1 should be non-negative"); + assertTrue(foundText.getX2() > foundText.getX1(), "X2 should be greater than X1"); + assertTrue(foundText.getY2() > foundText.getY1(), "Y2 should be greater than Y1"); + + double width = foundText.getX2() - foundText.getX1(); + double height = foundText.getY2() - foundText.getY1(); + + assertTrue(width > 0, "Text width should be positive"); + assertTrue(height > 0, "Text height should be positive"); + assertTrue(width < 1000, "Text width should be reasonable"); + assertTrue(height < 100, "Text height should be reasonable"); + } + + @Test + @DisplayName("Should handle overlapping text regions") + void handleOverlappingTextRegions() throws IOException { + addTextToPage("Overlapping test text content."); + + TextFinder textFinder = new TextFinder("test", false, false); + textFinder.getText(document); + List foundTexts = textFinder.getFoundTexts(); + + assertFalse(foundTexts.isEmpty()); + foundTexts.forEach(text -> { + assertNotNull(text.getText()); + assertTrue(text.getX1() >= 0 && text.getY1() >= 0); + }); + } + } + + // Helper methods + private void addTextToPage(String text) throws IOException { + addTextToPage(page, text); + } + + private void addTextToPage(PDPage targetPage, String text) throws IOException { + try (PDPageContentStream contentStream = new PDPageContentStream(document, targetPage)) { + contentStream.beginText(); + contentStream.setFont(new PDType1Font(Standard14Fonts.FontName.HELVETICA), 12); + contentStream.newLineAtOffset(50, 750); + contentStream.showText(text); + contentStream.endText(); + } + } +} From 03093f50f67051b8c394440e0f7eab7b4d066bb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20Sz=C3=BCcs?= Date: Sat, 12 Jul 2025 13:29:29 +0200 Subject: [PATCH 03/13] feat: add safe string width calculation to handle unsupported characters in RedactController --- .../api/security/RedactController.java | 38 +++++++++++++++++-- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java index e76b48009..81507de45 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java @@ -679,6 +679,36 @@ public class RedactController { return result.toString(); } + /** + * Safely calculates the width of a string, handling characters that might not be supported by + * the font. If a character is not supported, it's replaced with a space or skipped. + */ + private float safeGetStringWidth(PDFont font, String text) throws IOException { + if (font == null || text == null || text.isEmpty()) { + return 0; + } + + StringBuilder safeText = new StringBuilder(); + for (int i = 0; i < text.length(); i++) { + char c = text.charAt(i); + try { + // Try to encode the character to check if it's supported + font.encode(String.valueOf(c)); + safeText.append(c); + } catch (IllegalArgumentException e) { + // If the character is not supported, replace it with a space + // This is a simple fallback - you might want to use a different strategy + safeText.append(' '); + log.debug( + "Replaced unsupported character U+{} with space in font {}", + Integer.toHexString(c | 0x10000).substring(1), + font.getName()); + } + } + + return font.getStringWidth(safeText.toString()); + } + private float calculateWidthAdjustment(TextSegment segment, List matches) throws IOException { float totalOriginalWidth = 0; @@ -695,11 +725,11 @@ public class RedactController { if (segment.getFont() != null) { totalOriginalWidth += - segment.getFont().getStringWidth(originalPart) + safeGetStringWidth(segment.getFont(), originalPart) / FONT_SCALE_FACTOR * segment.getFontSize(); totalPlaceholderWidth += - segment.getFont().getStringWidth(placeholderPart) + safeGetStringWidth(segment.getFont(), placeholderPart) / FONT_SCALE_FACTOR * segment.getFontSize(); } @@ -806,11 +836,11 @@ public class RedactController { if (modified && segment.getFont() != null && segment.getFontSize() > 0) { float originalWidth = - segment.getFont().getStringWidth(originalText) + safeGetStringWidth(segment.getFont(), originalText) / FONT_SCALE_FACTOR * segment.getFontSize(); float modifiedWidth = - segment.getFont().getStringWidth(modifiedString) + safeGetStringWidth(segment.getFont(), modifiedString) / FONT_SCALE_FACTOR * segment.getFontSize(); float adjustment = originalWidth - modifiedWidth; From 8329540e255fc2d1a4367d6dee1b1e7d3fb8becc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20Sz=C3=BCcs?= Date: Sun, 13 Jul 2025 16:00:32 +0200 Subject: [PATCH 04/13] Copilot suggestions, quality improvements. Tests for token removal/creation. Explicitly import stuff. JAVADOC comment. Super call for page call in TextFinder. --- .../api/security/RedactController.java | 76 +-- .../software/SPDF/pdf/TextFinder.java | 3 +- .../api/security/RedactControllerTest.java | 484 +++++++++++++++--- 3 files changed, 470 insertions(+), 93 deletions(-) diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java index 81507de45..e145312ba 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java @@ -3,7 +3,15 @@ package stirling.software.SPDF.controller.api.security; import java.awt.Color; import java.io.ByteArrayOutputStream; import java.io.IOException; -import java.util.*; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -296,7 +304,28 @@ public class RedactController { log.debug("Completed text block redactions"); } - private Color decodeOrDefault(String hex) { + String createPlaceholder(String originalWord) { + if (originalWord == null || originalWord.isEmpty()) { + return originalWord; + } + return " ".repeat(originalWord.length()); + } + + void writeFilteredContentStream(PDDocument document, PDPage page, List tokens) + throws IOException { + log.debug("Writing filtered content stream with {} tokens", tokens.size()); + + PDStream newStream = new PDStream(document); + try (var out = newStream.createOutputStream()) { + ContentStreamWriter writer = new ContentStreamWriter(out); + writer.writeTokens(tokens); + } + page.setContents(newStream); + + log.debug("Successfully wrote filtered content stream"); + } + + Color decodeOrDefault(String hex) { if (hex == null) { return Color.BLACK; } @@ -311,6 +340,10 @@ public class RedactController { } } + boolean isTextShowingOperator(String opName) { + return TEXT_SHOWING_OPERATORS.contains(opName); + } + private List getPageNumbers(ManualRedactPdfRequest request, int pagesCount) { String pageNumbersInput = request.getPageNumbers(); String[] parsedPageNumbers = @@ -424,7 +457,17 @@ public class RedactController { + "_redacted.pdf"); } - private List createTokensWithoutTargetText( + /** + * Creates a list of tokens from the page content stream, without the target text. + * + * @param page The PDF page to process. + * @param targetWords The set of words to redact. + * @param useRegex Whether to treat target words as regex patterns. + * @param wholeWordSearch Whether to match whole words only. + * @return A list of tokens with redactions applied. + * @throws IOException If an error occurs while parsing the PDF content stream. + */ + List createTokensWithoutTargetText( PDPage page, Set targetWords, boolean useRegex, boolean wholeWordSearch) throws IOException { log.debug( @@ -697,7 +740,7 @@ public class RedactController { safeText.append(c); } catch (IllegalArgumentException e) { // If the character is not supported, replace it with a space - // This is a simple fallback - you might want to use a different strategy + // This is a simple fallback safeText.append(' '); log.debug( "Replaced unsupported character U+{} with space in font {}", @@ -881,29 +924,4 @@ public class RedactController { default -> ""; }; } - - private String createPlaceholder(String originalWord) { - if (originalWord == null || originalWord.isEmpty()) { - return originalWord; - } - return "".repeat(originalWord.length()); - } - - private void writeFilteredContentStream(PDDocument document, PDPage page, List tokens) - throws IOException { - log.debug("Writing filtered content stream with {} tokens", tokens.size()); - - PDStream newStream = new PDStream(document); - try (var out = newStream.createOutputStream()) { - ContentStreamWriter writer = new ContentStreamWriter(out); - writer.writeTokens(tokens); - } - page.setContents(newStream); - - log.debug("Successfully wrote filtered content stream"); - } - - private boolean isTextShowingOperator(String opName) { - return TEXT_SHOWING_OPERATORS.contains(opName); - } } diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/pdf/TextFinder.java b/stirling-pdf/src/main/java/stirling/software/SPDF/pdf/TextFinder.java index d9ddf3b91..6efb5fde6 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/pdf/TextFinder.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/pdf/TextFinder.java @@ -32,7 +32,8 @@ public class TextFinder extends PDFTextStripper { } @Override - protected void startPage(PDPage page) { + protected void startPage(PDPage page) throws IOException { + super.startPage(page); pageTextPositions.clear(); pageTextBuilder.setLength(0); } diff --git a/stirling-pdf/src/test/java/stirling/software/SPDF/controller/api/security/RedactControllerTest.java b/stirling-pdf/src/test/java/stirling/software/SPDF/controller/api/security/RedactControllerTest.java index 60ea78e26..ab501f143 100644 --- a/stirling-pdf/src/test/java/stirling/software/SPDF/controller/api/security/RedactControllerTest.java +++ b/stirling-pdf/src/test/java/stirling/software/SPDF/controller/api/security/RedactControllerTest.java @@ -12,11 +12,21 @@ import java.io.InputStream; import java.util.ArrayList; import java.util.Collections; import java.util.List; +import java.util.Set; +import org.apache.pdfbox.contentstream.operator.Operator; +import org.apache.pdfbox.cos.COSArray; +import org.apache.pdfbox.cos.COSFloat; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.cos.COSString; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDPageContentStream; import org.apache.pdfbox.pdmodel.PDPageTree; +import org.apache.pdfbox.pdmodel.PDResources; import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.pdmodel.font.PDType1Font; +import org.apache.pdfbox.pdmodel.font.Standard14Fonts; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.DisplayName; @@ -58,10 +68,13 @@ class RedactControllerTest { private PDPageTree mockPages; private PDPage mockPage; + private PDDocument realDocument; + private PDPage realPage; + // Helpers private void testAutoRedaction(String searchText, boolean useRegex, boolean wholeWordSearch, - String redactColor, float padding, boolean convertToImage, - boolean expectSuccess) throws Exception { + String redactColor, float padding, boolean convertToImage, + boolean expectSuccess) throws Exception { RedactPdfRequest request = createRedactPdfRequest(); request.setListOfText(searchText); request.setUseRegex(useRegex); @@ -111,10 +124,10 @@ class RedactControllerTest { @BeforeEach void setUp() throws IOException { mockPdfFile = new MockMultipartFile( - "fileInput", - "test.pdf", - "application/pdf", - createSimplePdfContent() + "fileInput", + "test.pdf", + "application/pdf", + createSimplePdfContent() ); // Mock PDF document and related objects @@ -160,11 +173,28 @@ class RedactControllerTest { return null; }).when(mockDocument).save(any(ByteArrayOutputStream.class)); doNothing().when(mockDocument).close(); + + // Initialize a real document for unit tests + setupRealDocument(); + } + + private void setupRealDocument() throws IOException { + realDocument = new PDDocument(); + realPage = new PDPage(PDRectangle.A4); + realDocument.addPage(realPage); + + // Set up basic page resources + PDResources resources = new PDResources(); + resources.put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA)); + realPage.setResources(resources); } @AfterEach - void tearDown() { + void tearDown() throws IOException { reset(mockDocument, mockPages, mockPage, pdfDocumentFactory); + if (realDocument != null) { + realDocument.close(); + } } @Nested @@ -486,40 +516,28 @@ class RedactControllerTest { @Test @DisplayName("Should decode valid hex color with hash") void decodeValidHexColorWithHash() throws Exception { - java.lang.reflect.Method method = RedactController.class.getDeclaredMethod("decodeOrDefault", String.class); - method.setAccessible(true); - - Color result = (Color) method.invoke(redactController, "#FF0000"); + Color result = redactController.decodeOrDefault("#FF0000"); assertEquals(Color.RED, result); } @Test @DisplayName("Should decode valid hex color without hash") void decodeValidHexColorWithoutHash() throws Exception { - java.lang.reflect.Method method = RedactController.class.getDeclaredMethod("decodeOrDefault", String.class); - method.setAccessible(true); - - Color result = (Color) method.invoke(redactController, "FF0000"); + Color result = redactController.decodeOrDefault("FF0000"); assertEquals(Color.RED, result); } @Test @DisplayName("Should default to black for null color") void defaultToBlackForNullColor() throws Exception { - java.lang.reflect.Method method = RedactController.class.getDeclaredMethod("decodeOrDefault", String.class); - method.setAccessible(true); - - Color result = (Color) method.invoke(redactController, (String) null); + Color result = redactController.decodeOrDefault(null); assertEquals(Color.BLACK, result); } @Test @DisplayName("Should default to black for invalid color") void defaultToBlackForInvalidColor() throws Exception { - java.lang.reflect.Method method = RedactController.class.getDeclaredMethod("decodeOrDefault", String.class); - method.setAccessible(true); - - Color result = (Color) method.invoke(redactController, "invalid-color"); + Color result = redactController.decodeOrDefault("invalid-color"); assertEquals(Color.BLACK, result); } @@ -527,22 +545,18 @@ class RedactControllerTest { @ValueSource(strings = {"#FF0000", "#00FF00", "#0000FF", "#FFFFFF", "#000000", "FF0000", "00FF00", "0000FF"}) @DisplayName("Should handle various valid color formats") void handleVariousValidColorFormats(String colorInput) throws Exception { - java.lang.reflect.Method method = RedactController.class.getDeclaredMethod("decodeOrDefault", String.class); - method.setAccessible(true); - - Color result = (Color) method.invoke(redactController, colorInput); + Color result = redactController.decodeOrDefault(colorInput); assertNotNull(result); - assertTrue(result.equals(Color.BLACK) || !result.equals(Color.BLACK)); + assertTrue(result.getRed() >= 0 && result.getRed() <= 255, "Red component should be in valid range"); + assertTrue(result.getGreen() >= 0 && result.getGreen() <= 255, "Green component should be in valid range"); + assertTrue(result.getBlue() >= 0 && result.getBlue() <= 255, "Blue component should be in valid range"); } @Test @DisplayName("Should handle short hex codes appropriately") void handleShortHexCodes() throws Exception { - java.lang.reflect.Method method = RedactController.class.getDeclaredMethod("decodeOrDefault", String.class); - method.setAccessible(true); - - Color result1 = (Color) method.invoke(redactController, "123"); - Color result2 = (Color) method.invoke(redactController, "#12"); + Color result1 = redactController.decodeOrDefault("123"); + Color result2 = redactController.decodeOrDefault("#12"); assertNotNull(result1); assertNotNull(result2); @@ -550,44 +564,222 @@ class RedactControllerTest { } @Nested - @DisplayName("Performance and Boundary Tests") - class PerformanceTests { + @DisplayName("Content Stream Unit Tests") + class ContentStreamUnitTests { @Test - @DisplayName("Should handle large text lists efficiently") - void handleLargeTextListsEfficiently() throws Exception { - StringBuilder largeTextList = new StringBuilder(); - for (int i = 0; i < 1000; i++) { - largeTextList.append("term").append(i).append("\n"); - } + @DisplayName("createTokensWithoutTargetText should remove simple text tokens") + void shouldRemoveSimpleTextTokens() throws Exception { + createRealPageWithSimpleText("This document contains confidential information."); - long startTime = System.currentTimeMillis(); - testAutoRedaction(largeTextList.toString(), false, false, "#000000", 1.0f, false, true); - long endTime = System.currentTimeMillis(); + Set targetWords = Set.of("confidential"); - assertTrue(endTime - startTime < 10000, "Large text list processing should complete within 10 seconds"); + List tokens = redactController.createTokensWithoutTargetText(realPage, targetWords, false, false); + + assertNotNull(tokens); + assertFalse(tokens.isEmpty()); + + String reconstructedText = extractTextFromTokens(tokens); + assertFalse(reconstructedText.contains("confidential"), + "Target text should be replaced with placeholder"); + assertTrue(reconstructedText.contains("document"), + "Non-target text should remain"); } @Test - @DisplayName("Should handle many redaction areas efficiently") - void handleManyRedactionAreasEfficiently() throws Exception { - List manyAreas = new ArrayList<>(); - for (int i = 0; i < 100; i++) { - RedactionArea area = new RedactionArea(); - area.setPage(1); - area.setX(10.0 + i); - area.setY(10.0 + i); - area.setWidth(50.0); - area.setHeight(20.0); - area.setColor("000000"); - manyAreas.add(area); + @DisplayName("createTokensWithoutTargetText should handle TJ operator arrays") + void shouldHandleTJOperatorArrays() throws Exception { + createRealPageWithTJArrayText(); + + Set targetWords = Set.of("secret"); + + List tokens = redactController.createTokensWithoutTargetText(realPage, targetWords, false, false); + + assertNotNull(tokens); + + boolean foundModifiedTJArray = false; + for (Object token : tokens) { + if (token instanceof COSArray array) { + for (int i = 0; i < array.size(); i++) { + if (array.getObject(i) instanceof COSString cosString) { + String text = cosString.getString(); + if (text.contains("secret")) { + fail("Target text 'secret' should have been redacted from TJ array"); + } + foundModifiedTJArray = true; + } + } + } } + assertTrue(foundModifiedTJArray, "Should find at least one TJ array"); + } - long startTime = System.currentTimeMillis(); - testManualRedaction(manyAreas, false); - long endTime = System.currentTimeMillis(); + @Test + @DisplayName("createTokensWithoutTargetText should preserve non-text tokens") + void shouldPreserveNonTextTokens() throws Exception { + createRealPageWithMixedContent(); - assertTrue(endTime - startTime < 5000, "Many redaction areas should be processed within 5 seconds"); + Set targetWords = Set.of("redact"); + + List originalTokens = getOriginalTokens(); + List filteredTokens = redactController.createTokensWithoutTargetText(realPage, targetWords, false, false); + + long originalNonTextCount = originalTokens.stream() + .filter(token -> token instanceof Operator op && !redactController.isTextShowingOperator(op.getName())) + .count(); + + long filteredNonTextCount = filteredTokens.stream() + .filter(token -> token instanceof Operator op && !redactController.isTextShowingOperator(op.getName())) + .count(); + + assertTrue(filteredNonTextCount > 0, + "Non-text operators should be preserved"); + + assertTrue(filteredNonTextCount >= originalNonTextCount / 2, + "A reasonable number of non-text operators should be preserved"); + } + + @Test + @DisplayName("createTokensWithoutTargetText should handle regex patterns") + void shouldHandleRegexPatterns() throws Exception { + createRealPageWithSimpleText("Phone: 123-456-7890 and SSN: 111-22-3333"); + + Set targetWords = Set.of("\\d{3}-\\d{2}-\\d{4}"); // SSN pattern + + List tokens = redactController.createTokensWithoutTargetText(realPage, targetWords, true, false); + + String reconstructedText = extractTextFromTokens(tokens); + assertFalse(reconstructedText.contains("111-22-3333"), "SSN should be redacted"); + assertTrue(reconstructedText.contains("123-456-7890"), "Phone should remain"); + } + + @Test + @DisplayName("createTokensWithoutTargetText should handle whole word search") + void shouldHandleWholeWordSearch() throws Exception { + createRealPageWithSimpleText("This test testing tested document"); + + Set targetWords = Set.of("test"); + + List tokens = redactController.createTokensWithoutTargetText(realPage, targetWords, false, true); + + String reconstructedText = extractTextFromTokens(tokens); + assertTrue(reconstructedText.contains("testing"), "Partial matches should remain"); + assertTrue(reconstructedText.contains("tested"), "Partial matches should remain"); + } + + @ParameterizedTest + @ValueSource(strings = {"Tj", "TJ", "'", "\""}) + @DisplayName("createTokensWithoutTargetText should handle all text operators") + void shouldHandleAllTextOperators(String operatorName) throws Exception { + createRealPageWithSpecificOperator(operatorName); + + Set targetWords = Set.of("sensitive"); + + List tokens = redactController.createTokensWithoutTargetText(realPage, targetWords, false, false); + + String reconstructedText = extractTextFromTokens(tokens); + assertFalse(reconstructedText.contains("sensitive"), + "Text should be redacted regardless of operator type"); + } + + @Test + @DisplayName("writeFilteredContentStream should write tokens to new stream") + void shouldWriteTokensToNewContentStream() throws Exception { + List tokens = createSampleTokenList(); + + redactController.writeFilteredContentStream(realDocument, realPage, tokens); + + assertNotNull(realPage.getContents(), "Page should have content stream"); + + // Verify the content can be read back + try (InputStream inputStream = realPage.getContents()) { + byte[] content = readAllBytes(inputStream); + assertTrue(content.length > 0, "Content stream should not be empty"); + } + } + + @Test + @DisplayName("writeFilteredContentStream should handle empty token list") + void shouldHandleEmptyTokenList() throws Exception { + List emptyTokens = Collections.emptyList(); + + assertDoesNotThrow(() -> redactController.writeFilteredContentStream(realDocument, realPage, emptyTokens)); + + assertNotNull(realPage.getContents(), "Page should still have content stream"); + } + + @Test + @DisplayName("writeFilteredContentStream should replace existing content") + void shouldReplaceExistingContentStream() throws Exception { + createRealPageWithSimpleText("Original content"); + String originalContent = extractTextFromModifiedPage(realPage); + + List newTokens = createSampleTokenList(); + redactController.writeFilteredContentStream(realDocument, realPage, newTokens); + + String newContent = extractTextFromModifiedPage(realPage); + assertNotEquals(originalContent, newContent, "Content stream should be replaced"); + } + + @Test + @DisplayName("Placeholder creation should maintain text width") + void shouldCreateWidthMatchingPlaceholder() throws Exception { + String originalText = "confidential"; + String placeholder = redactController.createPlaceholder(originalText); + + assertEquals(originalText.length(), placeholder.length(), + "Placeholder should maintain character count for width preservation"); + } + + @Test + @DisplayName("Placeholder should handle special characters") + void shouldHandleSpecialCharactersInPlaceholder() throws Exception { + String originalText = "café naïve"; + String placeholder = redactController.createPlaceholder(originalText); + + assertEquals(originalText.length(), placeholder.length()); + assertFalse(placeholder.contains("café"), "Placeholder should not contain original text"); + } + + @Test + @DisplayName("Integration test: createTokens and writeStream") + void shouldIntegrateTokenCreationAndWriting() throws Exception { + createRealPageWithSimpleText("This document contains secret information."); + + Set targetWords = Set.of("secret"); + + List filteredTokens = redactController.createTokensWithoutTargetText(realPage, targetWords, false, false); + + redactController.writeFilteredContentStream(realDocument, realPage, filteredTokens); + assertNotNull(realPage.getContents()); + + String finalText = extractTextFromModifiedPage(realPage); + assertFalse(finalText.contains("secret"), "Target text should be completely removed"); + assertTrue(finalText.contains("document"), "Other text should remain"); + } + + @Test + @DisplayName("Should preserve text positioning operators") + void shouldPreserveTextPositioning() throws Exception { + createRealPageWithPositionedText(); + + Set targetWords = Set.of("confidential"); + + List originalTokens = getOriginalTokens(); + List filteredTokens = redactController.createTokensWithoutTargetText(realPage, targetWords, false, false); + + long originalPositioning = originalTokens.stream() + .filter(token -> token instanceof Operator op && + (op.getName().equals("Td") || op.getName().equals("TD") || op.getName().equals("Tm"))) + .count(); + + long filteredPositioning = filteredTokens.stream() + .filter(token -> token instanceof Operator op && + (op.getName().equals("Td") || op.getName().equals("TD") || op.getName().equals("Tm"))) + .count(); + + assertTrue(filteredPositioning > 0, + "Positioning operators should be preserved"); } } @@ -603,8 +795,21 @@ class RedactControllerTest { return request; } - private byte[] createSimplePdfContent() { - return "Mock PDF Content".getBytes(); + private byte[] createSimplePdfContent() throws IOException { + try (PDDocument doc = new PDDocument()) { + PDPage page = new PDPage(PDRectangle.A4); + doc.addPage(page); + try (PDPageContentStream contentStream = new PDPageContentStream(doc, page)) { + contentStream.beginText(); + contentStream.setFont(new PDType1Font(Standard14Fonts.FontName.HELVETICA), 12); + contentStream.newLineAtOffset(100, 700); + contentStream.showText("This is a simple PDF."); + contentStream.endText(); + } + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + doc.save(baos); + return baos.toByteArray(); + } } private List createValidRedactionAreas() { @@ -685,4 +890,157 @@ class RedactControllerTest { return areas; } + + // Helper methods for real PDF content creation + private void createRealPageWithSimpleText(String text) throws IOException { + realPage = new PDPage(PDRectangle.A4); + while (realDocument.getNumberOfPages() > 0) { + realDocument.removePage(0); + } + realDocument.addPage(realPage); + realPage.setResources(new PDResources()); + realPage.getResources().put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA)); + + try (PDPageContentStream contentStream = new PDPageContentStream(realDocument, realPage)) { + contentStream.beginText(); + contentStream.setFont(realPage.getResources().getFont(COSName.getPDFName("F1")), 12); + contentStream.newLineAtOffset(50, 750); + contentStream.showText(text); + contentStream.endText(); + } + } + + private void createRealPageWithTJArrayText() throws IOException { + realPage = new PDPage(PDRectangle.A4); + while (realDocument.getNumberOfPages() > 0) { + realDocument.removePage(0); + } + realDocument.addPage(realPage); + realPage.setResources(new PDResources()); + realPage.getResources().put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA)); + + try (PDPageContentStream contentStream = new PDPageContentStream(realDocument, realPage)) { + contentStream.beginText(); + contentStream.setFont(realPage.getResources().getFont(COSName.getPDFName("F1")), 12); + contentStream.newLineAtOffset(50, 750); + + contentStream.showText("This is "); + contentStream.newLineAtOffset(-10, 0); // Simulate positioning + contentStream.showText("secret"); + contentStream.newLineAtOffset(10, 0); // Reset positioning + contentStream.showText(" information"); + contentStream.endText(); + } + } + + private void createRealPageWithMixedContent() throws IOException { + realPage = new PDPage(PDRectangle.A4); + while (realDocument.getNumberOfPages() > 0) { + realDocument.removePage(0); + } + realDocument.addPage(realPage); + realPage.setResources(new PDResources()); + realPage.getResources().put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA)); + + try (PDPageContentStream contentStream = new PDPageContentStream(realDocument, realPage)) { + contentStream.setLineWidth(2); + contentStream.moveTo(100, 100); + contentStream.lineTo(200, 200); + contentStream.stroke(); + + contentStream.beginText(); + contentStream.setFont(realPage.getResources().getFont(COSName.getPDFName("F1")), 12); + contentStream.newLineAtOffset(50, 750); + contentStream.showText("Please redact this content"); + contentStream.endText(); + } + } + + private void createRealPageWithSpecificOperator(String operatorName) throws IOException { + createRealPageWithSimpleText("sensitive data"); + } + + private void createRealPageWithPositionedText() throws IOException { + realPage = new PDPage(PDRectangle.A4); + while (realDocument.getNumberOfPages() > 0) { + realDocument.removePage(0); + } + realDocument.addPage(realPage); + realPage.setResources(new PDResources()); + realPage.getResources().put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA)); + + try (PDPageContentStream contentStream = new PDPageContentStream(realDocument, realPage)) { + contentStream.beginText(); + contentStream.setFont(realPage.getResources().getFont(COSName.getPDFName("F1")), 12); + contentStream.newLineAtOffset(50, 750); + contentStream.showText("Normal text "); + contentStream.newLineAtOffset(100, 0); + contentStream.showText("confidential"); + contentStream.newLineAtOffset(100, 0); + contentStream.showText(" more text"); + contentStream.endText(); + } + } + + // Helper for token creation + private List createSampleTokenList() { + return List.of( + Operator.getOperator("BT"), + COSName.getPDFName("F1"), + new COSFloat(12), + Operator.getOperator("Tf"), + new COSString("Sample text"), + Operator.getOperator("Tj"), + Operator.getOperator("ET") + ); + } + + private List getOriginalTokens() throws Exception { + // Create a new page to avoid side effects from other tests + PDPage pageForTokenExtraction = new PDPage(PDRectangle.A4); + pageForTokenExtraction.setResources(realPage.getResources()); + try (PDPageContentStream contentStream = new PDPageContentStream(realDocument, pageForTokenExtraction)) { + contentStream.beginText(); + contentStream.setFont(realPage.getResources().getFont(COSName.getPDFName("F1")), 12); + contentStream.newLineAtOffset(50, 750); + contentStream.showText("Original content"); + contentStream.endText(); + } + return redactController.createTokensWithoutTargetText(pageForTokenExtraction, Collections.emptySet(), false, false); + } + + private String extractTextFromTokens(List tokens) { + StringBuilder text = new StringBuilder(); + for (Object token : tokens) { + if (token instanceof COSString cosString) { + text.append(cosString.getString()); + } else if (token instanceof COSArray array) { + for (int i = 0; i < array.size(); i++) { + if (array.getObject(i) instanceof COSString cosString) { + text.append(cosString.getString()); + } + } + } + } + return text.toString(); + } + + private String extractTextFromModifiedPage(PDPage page) throws IOException { + if (page.getContents() != null) { + try (InputStream inputStream = page.getContents()) { + return new String(readAllBytes(inputStream)); + } + } + return ""; + } + + private byte[] readAllBytes(InputStream inputStream) throws IOException { + ByteArrayOutputStream buffer = new ByteArrayOutputStream(); + int nRead; + byte[] data = new byte[1024]; + while ((nRead = inputStream.read(data, 0, data.length)) != -1) { + buffer.write(data, 0, nRead); + } + return buffer.toByteArray(); + } } From 314e6c4bc1f1a5edf201606111ab0442cf9bbe3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20Sz=C3=BCcs?= Date: Sun, 13 Jul 2025 21:52:22 +0200 Subject: [PATCH 05/13] feat: enhance redaction process with custom font handling and fallback mechanisms in RedactController --- .../api/security/RedactController.java | 337 +++++++++++++----- 1 file changed, 244 insertions(+), 93 deletions(-) diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java index e145312ba..4c3b4dcde 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java @@ -308,6 +308,7 @@ public class RedactController { if (originalWord == null || originalWord.isEmpty()) { return originalWord; } + // Use regular spaces - we'll handle width calculation separately return " ".repeat(originalWord.length()); } @@ -414,14 +415,37 @@ public class RedactController { log.debug("Total pages with found text: {}", allFoundTextsByPage.size()); - // Step 2: Process each page + // Step 2: Process each page with better font fallback handling log.debug("Step 2: Processing each page for text replacement"); - for (PDPage page : document.getPages()) { - // Replace text content - List filteredTokens = - createTokensWithoutTargetText( - page, allSearchTerms, useRegex, wholeWordSearchBool); - writeFilteredContentStream(document, page, filteredTokens); + boolean fallbackToBoxOnlyMode = false; + + // Check if document uses custom encoding fonts that may cause issues + boolean hasCustomEncodingFonts = detectCustomEncodingFonts(document); + if (hasCustomEncodingFonts) { + log.info( + "Detected fonts with custom encoding. Using box-only redaction mode to preserve document integrity."); + fallbackToBoxOnlyMode = true; + } + + if (!fallbackToBoxOnlyMode) { + try { + for (PDPage page : document.getPages()) { + // Replace text content + List filteredTokens = + createTokensWithoutTargetText( + page, allSearchTerms, useRegex, wholeWordSearchBool); + writeFilteredContentStream(document, page, filteredTokens); + } + } catch (Exception e) { + log.warn( + "Font encoding error encountered during text modification: {}. Falling back to box-only redaction mode.", + e.getMessage()); + fallbackToBoxOnlyMode = true; + + // Reload the document to reset any partial modifications + document.close(); + document = pdfDocumentFactory.load(file); + } } // Draw redaction boxes for all found texts @@ -433,6 +457,9 @@ public class RedactController { log.debug("Drawing redaction boxes for {} total found texts", allFoundTexts.size()); if (!allFoundTexts.isEmpty()) { + if (fallbackToBoxOnlyMode) { + log.info("Using fallback box-only redaction mode due to font encoding issues"); + } redactFoundText(document, allFoundTexts, customPadding, redactColor); } @@ -671,8 +698,11 @@ public class RedactController { try { float adjustment = calculateWidthAdjustment(segment, segmentMatches); tasks.add(new ModificationTask(segment, newText, adjustment)); - } catch (IOException e) { - log.warn("Failed to calculate width adjustment for redaction.", e); + } catch (Exception e) { + log.warn( + "Failed to calculate width adjustment for redaction due to font encoding issues: {}. Using zero adjustment.", + e.getMessage()); + tasks.add(new ModificationTask(segment, newText, 0)); } } else if ("TJ".equals(segment.operatorName)) { tasks.add(new ModificationTask(segment, null, 0)); @@ -723,62 +753,108 @@ public class RedactController { } /** - * Safely calculates the width of a string, handling characters that might not be supported by - * the font. If a character is not supported, it's replaced with a space or skipped. + * Safely calculates the width of a string using hardcoded estimates for fonts with custom + * encoding. This avoids issues with fonts that have non-standard character mappings. */ private float safeGetStringWidth(PDFont font, String text) throws IOException { if (font == null || text == null || text.isEmpty()) { return 0; } - StringBuilder safeText = new StringBuilder(); - for (int i = 0; i < text.length(); i++) { - char c = text.charAt(i); - try { - // Try to encode the character to check if it's supported - font.encode(String.valueOf(c)); - safeText.append(c); - } catch (IllegalArgumentException e) { - // If the character is not supported, replace it with a space - // This is a simple fallback - safeText.append(' '); - log.debug( - "Replaced unsupported character U+{} with space in font {}", - Integer.toHexString(c | 0x10000).substring(1), - font.getName()); + try { + // First, try to get the width directly for standard fonts + return font.getStringWidth(text); + } catch (Exception e) { + log.debug( + "Font width calculation failed for '{}' in font {}: {}. Using hardcoded width estimation.", + text, + font.getName(), + e.getMessage()); + + // Hardcoded width estimation based on font size and character count + // This provides consistent spacing even with problematic custom encoding fonts + return getHardcodedStringWidth(text, font); + } + } + + /** + * Provides hardcoded width estimation for text when font metrics are unreliable. Uses average + * character widths based on font type and character analysis. + */ + private float getHardcodedStringWidth(String text, PDFont font) { + if (text == null || text.isEmpty()) { + return 0; + } + + // Determine base character width based on font type + float baseCharWidth; + String fontName = font.getName().toLowerCase(); + + if (fontName.contains("courier") || fontName.contains("mono")) { + // Monospace fonts - all characters same width + baseCharWidth = 600; // Standard monospace width in font units + } else if (fontName.contains("times") || fontName.contains("serif")) { + // Serif fonts - slightly narrower average + baseCharWidth = 450; + } else { + // Sans-serif fonts (Arial, Helvetica, etc.) - standard width + baseCharWidth = 500; + } + + // Calculate total width with character-specific adjustments + float totalWidth = 0; + for (char c : text.toCharArray()) { + if (c == ' ') { + totalWidth += baseCharWidth * 0.3f; // Spaces are typically 30% of base width + } else if (Character.isUpperCase(c)) { + totalWidth += baseCharWidth * 1.2f; // Uppercase slightly wider + } else if (c == 'i' || c == 'l' || c == 'j' || c == 'f' || c == 't') { + totalWidth += baseCharWidth * 0.4f; // Narrow characters + } else if (c == 'm' || c == 'w' || c == 'W' || c == 'M') { + totalWidth += baseCharWidth * 1.5f; // Wide characters + } else { + totalWidth += baseCharWidth; // Standard width } } - return font.getStringWidth(safeText.toString()); + return totalWidth; } private float calculateWidthAdjustment(TextSegment segment, List matches) throws IOException { - float totalOriginalWidth = 0; - float totalPlaceholderWidth = 0; - String text = segment.getText(); + try { + float totalOriginalWidth = 0; + float totalPlaceholderWidth = 0; + String text = segment.getText(); - for (MatchRange match : matches) { - int segmentStart = Math.max(0, match.getStartPos() - segment.getStartPos()); - int segmentEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos()); + for (MatchRange match : matches) { + int segmentStart = Math.max(0, match.getStartPos() - segment.getStartPos()); + int segmentEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos()); - if (segmentStart < text.length() && segmentEnd > segmentStart) { - String originalPart = text.substring(segmentStart, segmentEnd); - String placeholderPart = createPlaceholder(originalPart); + if (segmentStart < text.length() && segmentEnd > segmentStart) { + String originalPart = text.substring(segmentStart, segmentEnd); + String placeholderPart = createPlaceholder(originalPart); - if (segment.getFont() != null) { - totalOriginalWidth += - safeGetStringWidth(segment.getFont(), originalPart) - / FONT_SCALE_FACTOR - * segment.getFontSize(); - totalPlaceholderWidth += - safeGetStringWidth(segment.getFont(), placeholderPart) - / FONT_SCALE_FACTOR - * segment.getFontSize(); + if (segment.getFont() != null) { + totalOriginalWidth += + safeGetStringWidth(segment.getFont(), originalPart) + / FONT_SCALE_FACTOR + * segment.getFontSize(); + totalPlaceholderWidth += + safeGetStringWidth(segment.getFont(), placeholderPart) + / FONT_SCALE_FACTOR + * segment.getFontSize(); + } } } + return totalOriginalWidth - totalPlaceholderWidth; + } catch (Exception e) { + log.warn( + "Failed to calculate width adjustment for segment '{}' due to font encoding issues: {}. Skipping adjustment.", + segment.getText(), + e.getMessage()); + return 0; // No adjustment when font operations fail } - return totalOriginalWidth - totalPlaceholderWidth; } private void modifyTokenForRedaction( @@ -834,71 +910,92 @@ public class RedactController { COSArray newArray = createRedactedTJArray((COSArray) token, segment, matches); tokens.set(segment.getTokenIndex(), newArray); } - } catch (IOException e) { - log.warn("Failed to modify token for redaction: {}", e.getMessage(), e); + } catch (Exception e) { + log.warn( + "Failed to modify token for redaction due to font encoding issues: {}. Skipping text modification for segment '{}'.", + e.getMessage(), + segment.getText()); + // Don't throw the exception - let the process continue with box-only redaction } } private COSArray createRedactedTJArray( COSArray originalArray, TextSegment segment, List matches) throws IOException { - COSArray newArray = new COSArray(); - int textOffsetInSegment = 0; + try { + COSArray newArray = new COSArray(); + int textOffsetInSegment = 0; - for (COSBase element : originalArray) { - if (element instanceof COSString cosString) { - String originalText = cosString.getString(); - StringBuilder newText = new StringBuilder(originalText); - boolean modified = false; + for (COSBase element : originalArray) { + if (element instanceof COSString cosString) { + String originalText = cosString.getString(); + StringBuilder newText = new StringBuilder(originalText); + boolean modified = false; - for (MatchRange match : matches) { - int stringStartInPage = segment.getStartPos() + textOffsetInSegment; - int stringEndInPage = stringStartInPage + originalText.length(); + for (MatchRange match : matches) { + int stringStartInPage = segment.getStartPos() + textOffsetInSegment; + int stringEndInPage = stringStartInPage + originalText.length(); - int overlapStart = Math.max(match.getStartPos(), stringStartInPage); - int overlapEnd = Math.min(match.getEndPos(), stringEndInPage); + int overlapStart = Math.max(match.getStartPos(), stringStartInPage); + int overlapEnd = Math.min(match.getEndPos(), stringEndInPage); - if (overlapStart < overlapEnd) { - modified = true; - int redactionStartInString = overlapStart - stringStartInPage; - int redactionEndInString = overlapEnd - stringStartInPage; - if (redactionStartInString >= 0 - && redactionEndInString <= originalText.length()) { - String placeholder = - createPlaceholder( - originalText.substring( - redactionStartInString, redactionEndInString)); - newText.replace( - redactionStartInString, redactionEndInString, placeholder); + if (overlapStart < overlapEnd) { + modified = true; + int redactionStartInString = overlapStart - stringStartInPage; + int redactionEndInString = overlapEnd - stringStartInPage; + if (redactionStartInString >= 0 + && redactionEndInString <= originalText.length()) { + String placeholder = + createPlaceholder( + originalText.substring( + redactionStartInString, + redactionEndInString)); + newText.replace( + redactionStartInString, redactionEndInString, placeholder); + } } } - } - String modifiedString = newText.toString(); - newArray.add(new COSString(modifiedString)); + String modifiedString = newText.toString(); + newArray.add(new COSString(modifiedString)); - if (modified && segment.getFont() != null && segment.getFontSize() > 0) { - float originalWidth = - safeGetStringWidth(segment.getFont(), originalText) - / FONT_SCALE_FACTOR - * segment.getFontSize(); - float modifiedWidth = - safeGetStringWidth(segment.getFont(), modifiedString) - / FONT_SCALE_FACTOR - * segment.getFontSize(); - float adjustment = originalWidth - modifiedWidth; - if (Math.abs(adjustment) > PRECISION_THRESHOLD) { - float kerning = -FONT_SCALE_FACTOR * adjustment / segment.getFontSize(); - newArray.add(new org.apache.pdfbox.cos.COSFloat(kerning)); + if (modified && segment.getFont() != null && segment.getFontSize() > 0) { + try { + float originalWidth = + safeGetStringWidth(segment.getFont(), originalText) + / FONT_SCALE_FACTOR + * segment.getFontSize(); + float modifiedWidth = + safeGetStringWidth(segment.getFont(), modifiedString) + / FONT_SCALE_FACTOR + * segment.getFontSize(); + float adjustment = originalWidth - modifiedWidth; + if (Math.abs(adjustment) > PRECISION_THRESHOLD) { + float kerning = + -FONT_SCALE_FACTOR * adjustment / segment.getFontSize(); + newArray.add(new org.apache.pdfbox.cos.COSFloat(kerning)); + } + } catch (Exception e) { + log.warn( + "Failed to calculate kerning adjustment for TJ array element due to font encoding issues: {}. Skipping adjustment.", + e.getMessage()); + // Continue without kerning adjustment + } } - } - textOffsetInSegment += originalText.length(); - } else { - newArray.add(element); + textOffsetInSegment += originalText.length(); + } else { + newArray.add(element); + } } + return newArray; + } catch (Exception e) { + log.warn( + "Failed to create redacted TJ array due to font encoding issues: {}. Returning original array.", + e.getMessage()); + // Return the original array if we can't modify it safely + return originalArray; } - return newArray; } private String extractTextFromToken(Object token, String operatorName) { @@ -924,4 +1021,58 @@ public class RedactController { default -> ""; }; } + + /** + * Detects if the document contains fonts with custom encoding that may cause text modification + * issues. Custom encoding fonts often have internal character mappings that don't follow + * Unicode standards. + */ + private boolean detectCustomEncodingFonts(PDDocument document) { + try { + for (PDPage page : document.getPages()) { + PDResources resources = page.getResources(); + if (resources != null) { + for (COSName fontName : resources.getFontNames()) { + try { + PDFont font = resources.getFont(fontName); + if (font != null) { + String name = font.getName(); + // Check for font names that commonly indicate custom encoding + if (name != null + && (name.contains("HOEP") + || // Common custom encoding prefix + name.contains("+") + || // Subset fonts often have custom encoding + name.matches(".*[A-Z]{6}\\+.*") // Six letter prefix + // pattern + )) { + log.debug("Detected potential custom encoding font: {}", name); + // Try a simple encoding test + try { + font.encode(" "); // Test space character + font.getStringWidth(" "); + } catch (Exception e) { + log.debug( + "Font {} failed basic encoding test: {}", + name, + e.getMessage()); + return true; + } + } + } + } catch (Exception e) { + log.debug( + "Error checking font for custom encoding: {}", e.getMessage()); + } + } + } + } + return false; + } catch (Exception e) { + log.warn( + "Error detecting custom encoding fonts: {}. Assuming custom encoding present.", + e.getMessage()); + return true; // Err on the side of caution + } + } } From d7ed471e5d807448519084805dec34cb2646e515 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20Sz=C3=BCcs?= Date: Mon, 14 Jul 2025 01:01:58 +0200 Subject: [PATCH 06/13] feat: improve logging and font width calculation strategies in RedactController --- .../api/security/RedactController.java | 418 ++++++++++++++++-- 1 file changed, 374 insertions(+), 44 deletions(-) diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java index 4c3b4dcde..ad34414bd 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java @@ -30,6 +30,8 @@ import org.apache.pdfbox.pdmodel.PDResources; import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.common.PDStream; import org.apache.pdfbox.pdmodel.font.PDFont; +import org.apache.pdfbox.pdmodel.font.PDType1Font; +import org.apache.pdfbox.pdmodel.font.Standard14Fonts; import org.springframework.http.ResponseEntity; import org.springframework.web.bind.WebDataBinder; import org.springframework.web.bind.annotation.InitBinder; @@ -308,7 +310,7 @@ public class RedactController { if (originalWord == null || originalWord.isEmpty()) { return originalWord; } - // Use regular spaces - we'll handle width calculation separately + return " ".repeat(originalWord.length()); } @@ -406,6 +408,20 @@ public class RedactController { log.debug("Found {} instances of pattern '{}'", foundTexts.size(), text); + // Log details of found text instances + for (int i = 0; i < foundTexts.size(); i++) { + PDFText found = foundTexts.get(i); + log.debug( + " Match {}: '{}' on page {} at ({}, {}) to ({}, {})", + i + 1, + found.getText(), + found.getPageIndex() + 1, + found.getX1(), + found.getY1(), + found.getX2(), + found.getY2()); + } + for (PDFText found : foundTexts) { allFoundTextsByPage .computeIfAbsent(found.getPageIndex(), k -> new ArrayList<>()) @@ -459,8 +475,17 @@ public class RedactController { if (!allFoundTexts.isEmpty()) { if (fallbackToBoxOnlyMode) { log.info("Using fallback box-only redaction mode due to font encoding issues"); + log.debug( + "Text removal was skipped to preserve document integrity. Only drawing redaction boxes over {} text instances.", + allFoundTexts.size()); + } else { + log.debug( + "Using full text replacement redaction mode with {} text instances.", + allFoundTexts.size()); } redactFoundText(document, allFoundTexts, customPadding, redactColor); + } else { + log.debug("No matching text found for redaction patterns"); } if (convertPDFToImage) { @@ -513,8 +538,31 @@ public class RedactController { List textSegments = extractTextSegments(page, tokens); log.debug("Extracted {} text segments", textSegments.size()); + // Log detailed text segment information + for (int i = 0; + i < Math.min(textSegments.size(), 20); + i++) { // Log first 20 segments to avoid spam + TextSegment segment = textSegments.get(i); + log.debug( + "Text segment {}: '{}' (font: {}, operator: {}, pos: {}-{})", + i, + segment.getText(), + segment.getFont() != null ? segment.getFont().getName() : "null", + segment.getOperatorName(), + segment.getStartPos(), + segment.getEndPos()); + } + if (textSegments.size() > 20) { + log.debug("... and {} more text segments", textSegments.size() - 20); + } + String completeText = buildCompleteText(textSegments); - log.debug("Built complete text of {} characters", completeText.length()); + log.debug( + "Built complete text of {} characters: '{}'", + completeText.length(), + completeText.length() > 200 + ? completeText.substring(0, 200) + "..." + : completeText); List matches = findAllMatches(completeText, targetWords, useRegex, wholeWordSearch); @@ -752,10 +800,6 @@ public class RedactController { return result.toString(); } - /** - * Safely calculates the width of a string using hardcoded estimates for fonts with custom - * encoding. This avoids issues with fonts that have non-standard character mappings. - */ private float safeGetStringWidth(PDFont font, String text) throws IOException { if (font == null || text == null || text.isEmpty()) { return 0; @@ -766,58 +810,311 @@ public class RedactController { return font.getStringWidth(text); } catch (Exception e) { log.debug( - "Font width calculation failed for '{}' in font {}: {}. Using hardcoded width estimation.", + "Font width calculation failed for '{}' in font {}: {}. Using fallback strategies.", text, font.getName(), e.getMessage()); - // Hardcoded width estimation based on font size and character count - // This provides consistent spacing even with problematic custom encoding fonts - return getHardcodedStringWidth(text, font); + // Strategy 1: Character-by-character encoding test + float charByCharWidth = getCharacterByCharacterWidth(font, text); + if (charByCharWidth > 0) { + return charByCharWidth; + } + + // Strategy 2: Use font substitution with Standard 14 fonts + float substitutionWidth = getWidthWithFontSubstitution(font, text); + if (substitutionWidth > 0) { + return substitutionWidth; + } + + // Strategy 3: Statistical estimation based on real font metrics + return getStatisticalWidth(text, font); } } - /** - * Provides hardcoded width estimation for text when font metrics are unreliable. Uses average - * character widths based on font type and character analysis. - */ - private float getHardcodedStringWidth(String text, PDFont font) { + private float getCharacterByCharacterWidth(PDFont font, String text) { if (text == null || text.isEmpty()) { return 0; } - // Determine base character width based on font type - float baseCharWidth; - String fontName = font.getName().toLowerCase(); + try { + float totalWidth = 0; + for (char c : text.toCharArray()) { + try { + String charStr = String.valueOf(c); + font.encode(charStr); // Test if character can be encoded + totalWidth += font.getStringWidth(charStr); + } catch (Exception e) { + try { + totalWidth += font.getStringWidth(" "); + } catch (Exception e2) { + totalWidth += 500; // Standard average width + } + } + } + return totalWidth; + } catch (Exception e) { + log.debug("Character-by-character width calculation failed: {}", e.getMessage()); + return 0; // Failed, try next strategy + } + } - if (fontName.contains("courier") || fontName.contains("mono")) { - // Monospace fonts - all characters same width - baseCharWidth = 600; // Standard monospace width in font units - } else if (fontName.contains("times") || fontName.contains("serif")) { - // Serif fonts - slightly narrower average - baseCharWidth = 450; - } else { - // Sans-serif fonts (Arial, Helvetica, etc.) - standard width - baseCharWidth = 500; + private float getWidthWithFontSubstitution(PDFont originalFont, String text) { + try { + PDFont substituteFont = findBestStandardFontSubstitute(originalFont); + float width = substituteFont.getStringWidth(text); + + FontCharacteristics characteristics = getFontCharacteristics(originalFont); + + return width; + } catch (Exception e) { + log.debug("Font substitution width calculation failed: {}", e.getMessage()); + } + return 0; // Failed, try next strategy + } + + private PDFont findBestStandardFontSubstitute(PDFont originalFont) { + String fontFamily = null; + String fontName = null; + boolean isBold = false; + boolean isItalic = false; + boolean isMonospace = false; + + try { + // Try to get font metadata from PDFontDescriptor + if (originalFont.getFontDescriptor() != null) { + fontFamily = originalFont.getFontDescriptor().getFontFamily(); + + if (fontFamily == null || fontFamily.isEmpty()) { + fontName = originalFont.getFontDescriptor().getFontName(); + } + + int flags = originalFont.getFontDescriptor().getFlags(); + isBold = (flags & 0x40) != 0; // Check if FORCE_BOLD flag is set (0x40) + isItalic = (flags & 0x40000) != 0; // Check if ITALIC flag is set (0x40000) + isMonospace = (flags & 0x1) != 0; // Check if FIXED_PITCH flag is set (0x1) + } + } catch (Exception e) { + log.debug("Error accessing font descriptor: {}", e.getMessage()); } - // Calculate total width with character-specific adjustments - float totalWidth = 0; - for (char c : text.toCharArray()) { - if (c == ' ') { - totalWidth += baseCharWidth * 0.3f; // Spaces are typically 30% of base width - } else if (Character.isUpperCase(c)) { - totalWidth += baseCharWidth * 1.2f; // Uppercase slightly wider - } else if (c == 'i' || c == 'l' || c == 'j' || c == 'f' || c == 't') { - totalWidth += baseCharWidth * 0.4f; // Narrow characters - } else if (c == 'm' || c == 'w' || c == 'W' || c == 'M') { - totalWidth += baseCharWidth * 1.5f; // Wide characters + // If we couldn't get metadata from descriptor, fall back to font name + if ((fontFamily == null || fontFamily.isEmpty()) + && (fontName == null || fontName.isEmpty())) { + fontName = originalFont.getName().toLowerCase(); + } else if (fontFamily != null) { + fontFamily = fontFamily.toLowerCase(); + } else { + fontName = fontName.toLowerCase(); + } + + // Determine font characteristics based on metadata or name + boolean isSerif = false; + boolean isCourier = false; + + // Check font family first + if (fontFamily != null) { + isCourier = fontFamily.contains("courier"); + isMonospace = + isMonospace + || isCourier + || fontFamily.contains("mono") + || fontFamily.contains("fixed"); + isSerif = + fontFamily.contains("times") + || fontFamily.contains("serif") + || fontFamily.contains("roman"); + } + + // If needed, check font name as fallback + if (fontName != null) { + isCourier = isCourier || fontName.contains("courier"); + isMonospace = + isMonospace + || isCourier + || fontName.contains("mono") + || fontName.contains("fixed"); + isSerif = + isSerif + || fontName.contains("times") + || fontName.contains("serif") + || fontName.contains("roman"); + isBold = isBold || fontName.contains("bold"); + isItalic = isItalic || fontName.contains("italic") || fontName.contains("oblique"); + } + + // Select the appropriate standard font based on characteristics + if (isMonospace) { + return new PDType1Font(Standard14Fonts.FontName.COURIER); + } + + if (isSerif) { + if (isBold && isItalic) { + return new PDType1Font(Standard14Fonts.FontName.TIMES_BOLD_ITALIC); + } else if (isBold) { + return new PDType1Font(Standard14Fonts.FontName.TIMES_BOLD); + } else if (isItalic) { + return new PDType1Font(Standard14Fonts.FontName.TIMES_ITALIC); } else { - totalWidth += baseCharWidth; // Standard width + return new PDType1Font(Standard14Fonts.FontName.TIMES_ROMAN); } } - return totalWidth; + // Sans-serif fonts (Helvetica) + if (isBold && isItalic) { + return new PDType1Font(Standard14Fonts.FontName.HELVETICA_BOLD_OBLIQUE); + } else if (isBold) { + return new PDType1Font(Standard14Fonts.FontName.HELVETICA_BOLD); + } else if (isItalic) { + return new PDType1Font(Standard14Fonts.FontName.HELVETICA_OBLIQUE); + } + + return new PDType1Font(Standard14Fonts.FontName.HELVETICA); + } + + private float getStatisticalWidth(String text, PDFont font) { + if (text == null || text.isEmpty()) { + return 0; + } + + PDFont referenceFont = findBestStandardFontSubstitute(font); + + // Get font characteristics using metadata + FontCharacteristics characteristics = getFontCharacteristics(font); + + try { + + return referenceFont.getStringWidth(text); + } catch (Exception e) { + float avgCharWidth = getAverageCharacterWidth(font); + return text.length() * avgCharWidth; + } + } + + private FontCharacteristics getFontCharacteristics(PDFont font) { + FontCharacteristics characteristics = new FontCharacteristics(); + + try { + // Try to get font metadata from PDFontDescriptor + if (font.getFontDescriptor() != null) { + characteristics.fontFamily = font.getFontDescriptor().getFontFamily(); + + if (characteristics.fontFamily == null || characteristics.fontFamily.isEmpty()) { + characteristics.fontName = font.getFontDescriptor().getFontName(); + } + + int flags = font.getFontDescriptor().getFlags(); + characteristics.isBold = (flags & 0x40) != 0; // FORCE_BOLD flag + characteristics.isItalic = (flags & 0x40000) != 0; // ITALIC flag + characteristics.isMonospace = (flags & 0x1) != 0; // FIXED_PITCH flag + } + } catch (Exception e) { + log.debug("Error accessing font descriptor: {}", e.getMessage()); + } + + // If we couldn't get metadata from descriptor, fall back to font name + if ((characteristics.fontFamily == null || characteristics.fontFamily.isEmpty()) + && (characteristics.fontName == null || characteristics.fontName.isEmpty())) { + characteristics.fontName = font.getName(); + } + + if (characteristics.fontFamily != null) { + characteristics.fontFamily = characteristics.fontFamily.toLowerCase(); + } + if (characteristics.fontName != null) { + characteristics.fontName = characteristics.fontName.toLowerCase(); + } + + if (characteristics.fontFamily != null) { + characteristics.isCourier = characteristics.fontFamily.contains("courier"); + characteristics.isMonospace = + characteristics.isMonospace + || characteristics.isCourier + || characteristics.fontFamily.contains("mono") + || characteristics.fontFamily.contains("fixed"); + characteristics.isSerif = + characteristics.fontFamily.contains("times") + || characteristics.fontFamily.contains("serif") + || characteristics.fontFamily.contains("roman"); + characteristics.isTimesNewRoman = + characteristics.fontFamily.contains("timesnewroman") + || characteristics.fontFamily.contains("timesnew"); + } + + if (characteristics.fontName != null) { + characteristics.isCourier = + characteristics.isCourier || characteristics.fontName.contains("courier"); + characteristics.isMonospace = + characteristics.isMonospace + || characteristics.isCourier + || characteristics.fontName.contains("mono") + || characteristics.fontName.contains("fixed"); + characteristics.isSerif = + characteristics.isSerif + || characteristics.fontName.contains("times") + || characteristics.fontName.contains("serif") + || characteristics.fontName.contains("roman"); + characteristics.isBold = + characteristics.isBold || characteristics.fontName.contains("bold"); + characteristics.isItalic = + characteristics.isItalic + || characteristics.fontName.contains("italic") + || characteristics.fontName.contains("oblique"); + characteristics.isTimesNewRoman = + characteristics.isTimesNewRoman + || (characteristics.fontName.contains("timesnewroman") + || characteristics.fontName.contains("timesnew")) + && (characteristics.fontName.contains("psmt") + || characteristics.fontName.contains("ps-")); + } + + return characteristics; + } + + private static class FontCharacteristics { + String fontFamily; + String fontName; + boolean isBold; + boolean isItalic; + boolean isMonospace; + boolean isSerif; + boolean isCourier; + boolean isTimesNewRoman; + } + + private float getAverageCharacterWidth(PDFont font) { + String sampleText = "etaoinshrdlucmfwypvbgkjqxz0123456789 ,."; + + FontCharacteristics characteristics = getFontCharacteristics(font); + + try { + + return font.getStringWidth(sampleText) / sampleText.length(); + } catch (Exception e) { + try { + PDFont substituteFont = findBestStandardFontSubstitute(font); + + return substituteFont.getStringWidth(sampleText) / sampleText.length(); + } catch (Exception e2) { + if (characteristics.isMonospace || characteristics.isCourier) { + return 600; // Monospace fonts + } else if (characteristics.isTimesNewRoman) { + return 550; // TimesNewRoman fonts - increased from standard Times + } else if (characteristics.isSerif) { + return 480; // Times-style serif fonts + } else if (characteristics.fontFamily != null + && (characteristics.fontFamily.contains("arial") + || characteristics.fontFamily.contains("helvetica"))) { + return 520; // Helvetica/Arial-style sans-serif + } else if (characteristics.fontName != null + && (characteristics.fontName.contains("arial") + || characteristics.fontName.contains("helvetica"))) { + return 520; // Helvetica/Arial-style sans-serif + } else { + return 500; // Generic sans-serif average + } + } + } } private float calculateWidthAdjustment(TextSegment segment, List matches) @@ -891,9 +1188,20 @@ public class RedactController { COSArray newArray = new COSArray(); newArray.add(new COSString(newText)); if (segment.getFontSize() > 0) { - float kerning = -FONT_SCALE_FACTOR * adjustment / segment.getFontSize(); + + float adjustmentFactor = + 1.2f; // Increase adjustment by 20% to compensate for spaces + float kerning = + -1 + * adjustment + * adjustmentFactor + * (FONT_SCALE_FACTOR / segment.getFontSize()); + newArray.add(new org.apache.pdfbox.cos.COSFloat(kerning)); - log.debug("Applied kerning adjustment: {}", kerning); + log.debug( + "Applied kerning adjustment: {} for width adjustment: {}", + kerning, + adjustment); } tokens.set(segment.getTokenIndex(), newArray); @@ -915,7 +1223,6 @@ public class RedactController { "Failed to modify token for redaction due to font encoding issues: {}. Skipping text modification for segment '{}'.", e.getMessage(), segment.getText()); - // Don't throw the exception - let the process continue with box-only redaction } } @@ -971,9 +1278,19 @@ public class RedactController { * segment.getFontSize(); float adjustment = originalWidth - modifiedWidth; if (Math.abs(adjustment) > PRECISION_THRESHOLD) { + + float adjustmentFactor = 0.8f; float kerning = - -FONT_SCALE_FACTOR * adjustment / segment.getFontSize(); + -1 + * adjustment + * adjustmentFactor + * (FONT_SCALE_FACTOR / segment.getFontSize()); + newArray.add(new org.apache.pdfbox.cos.COSFloat(kerning)); + log.debug( + "Applied kerning adjustment: {} for width adjustment: {}", + kerning, + adjustment); } } catch (Exception e) { log.warn( @@ -1029,14 +1346,23 @@ public class RedactController { */ private boolean detectCustomEncodingFonts(PDDocument document) { try { + log.debug("Starting font encoding detection..."); for (PDPage page : document.getPages()) { PDResources resources = page.getResources(); if (resources != null) { + int fontCount = 0; + for (COSName fn : resources.getFontNames()) fontCount++; + log.debug("Found {} fonts on page", fontCount); for (COSName fontName : resources.getFontNames()) { try { PDFont font = resources.getFont(fontName); if (font != null) { String name = font.getName(); + log.debug( + "Analyzing font: {} (type: {})", + name, + font.getClass().getSimpleName()); + // Check for font names that commonly indicate custom encoding if (name != null && (name.contains("HOEP") @@ -1051,6 +1377,7 @@ public class RedactController { try { font.encode(" "); // Test space character font.getStringWidth(" "); + log.debug("Font {} passed basic encoding test", name); } catch (Exception e) { log.debug( "Font {} failed basic encoding test: {}", @@ -1058,6 +1385,8 @@ public class RedactController { e.getMessage()); return true; } + } else { + log.debug("Font {} appears to use standard encoding", name); } } } catch (Exception e) { @@ -1067,6 +1396,7 @@ public class RedactController { } } } + log.debug("Font encoding detection complete - no problematic fonts found"); return false; } catch (Exception e) { log.warn( From 03e252f9e85c2bc0d7001b48fc06602d9c026a0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20Sz=C3=BCcs?= Date: Mon, 14 Jul 2025 01:30:45 +0200 Subject: [PATCH 07/13] feat: adjust text padding multiplier and refine kerning calculations --- .../api/security/RedactController.java | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java index ad34414bd..bd00bba3d 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java @@ -68,7 +68,7 @@ import stirling.software.common.util.propertyeditor.StringToArrayListPropertyEdi @RequiredArgsConstructor public class RedactController { - private static final float DEFAULT_TEXT_PADDING_MULTIPLIER = 0.3f; + private static final float DEFAULT_TEXT_PADDING_MULTIPLIER = 0.6f; private static final float PRECISION_THRESHOLD = 1e-3f; private static final int FONT_SCALE_FACTOR = 1000; @@ -1189,14 +1189,23 @@ public class RedactController { newArray.add(new COSString(newText)); if (segment.getFontSize() > 0) { - float adjustmentFactor = - 1.2f; // Increase adjustment by 20% to compensate for spaces + float adjustmentFactor = 1.05f; float kerning = -1 * adjustment * adjustmentFactor * (FONT_SCALE_FACTOR / segment.getFontSize()); + // Cap kerning value to prevent extreme outliers that mangle text + float maxKerning = 500f; + if (Math.abs(kerning) > maxKerning) { + log.warn( + "Kerning value {} is an outlier. Capping to {}.", + kerning, + kerning > 0 ? maxKerning : -maxKerning); + kerning = Math.max(-maxKerning, Math.min(maxKerning, kerning)); + } + newArray.add(new org.apache.pdfbox.cos.COSFloat(kerning)); log.debug( "Applied kerning adjustment: {} for width adjustment: {}", @@ -1279,7 +1288,8 @@ public class RedactController { float adjustment = originalWidth - modifiedWidth; if (Math.abs(adjustment) > PRECISION_THRESHOLD) { - float adjustmentFactor = 0.8f; + float adjustmentFactor = 1.05f; // Increase kerning, visually more + // natural float kerning = -1 * adjustment From 809d016f1773d6bd8b92ddd3c58bfd238373cd53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20Sz=C3=BCcs?= Date: Tue, 15 Jul 2025 17:41:45 +0200 Subject: [PATCH 08/13] feat: enhance PDF redaction capabilities with improved handling of complex document structures, and various redaction area scenarios --- .../api/security/RedactController.java | 1634 ++++++++--------- .../api/security/RedactControllerTest.java | 309 +++- 2 files changed, 1098 insertions(+), 845 deletions(-) diff --git a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java index bd00bba3d..902af6f65 100644 --- a/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java +++ b/stirling-pdf/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java @@ -4,21 +4,23 @@ import java.awt.Color; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; -import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Objects; import java.util.Set; -import java.util.regex.Matcher; import java.util.regex.Pattern; +import java.util.stream.Collectors; import org.apache.pdfbox.contentstream.operator.Operator; import org.apache.pdfbox.cos.COSArray; import org.apache.pdfbox.cos.COSBase; +import org.apache.pdfbox.cos.COSFloat; import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.cos.COSNumber; import org.apache.pdfbox.cos.COSString; import org.apache.pdfbox.pdfparser.PDFStreamParser; import org.apache.pdfbox.pdfwriter.ContentStreamWriter; @@ -30,8 +32,8 @@ import org.apache.pdfbox.pdmodel.PDResources; import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.common.PDStream; import org.apache.pdfbox.pdmodel.font.PDFont; -import org.apache.pdfbox.pdmodel.font.PDType1Font; -import org.apache.pdfbox.pdmodel.font.Standard14Fonts; +import org.apache.pdfbox.pdmodel.graphics.PDXObject; +import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject; import org.springframework.http.ResponseEntity; import org.springframework.web.bind.WebDataBinder; import org.springframework.web.bind.annotation.InitBinder; @@ -75,8 +77,14 @@ public class RedactController { // Text showing operators private static final Set TEXT_SHOWING_OPERATORS = Set.of("Tj", "TJ", "'", "\""); + private static final COSString EMPTY_COS_STRING = new COSString(""); + private final CustomPDFDocumentFactory pdfDocumentFactory; + private String removeFileExtension(String filename) { + return filename.replaceFirst("[.][^.]+$", ""); + } + @InitBinder public void initBinder(WebDataBinder binder) { binder.registerCustomEditor( @@ -85,78 +93,64 @@ public class RedactController { @PostMapping(value = "/redact", consumes = "multipart/form-data") @Operation( - summary = "Redacts areas and pages in a PDF document", + summary = "Redact PDF manually", description = - "This operation takes an input PDF file with a list of areas, page" - + " number(s)/range(s)/function(s) to redact. Input:PDF, Output:PDF," - + " Type:SISO") + "This endpoint redacts content from a PDF file based on manually specified areas. " + + "Users can specify areas to redact and optionally convert the PDF to an image. " + + "Input:PDF Output:PDF Type:SISO") public ResponseEntity redactPDF(@ModelAttribute ManualRedactPdfRequest request) throws IOException { - log.debug( - "Starting manual redaction for file: {}", - request.getFileInput().getOriginalFilename()); MultipartFile file = request.getFileInput(); List redactionAreas = request.getRedactions(); - log.debug( - "Processing {} redaction areas", - redactionAreas != null ? redactionAreas.size() : 0); + try (PDDocument document = pdfDocumentFactory.load(file)) { + PDPageTree allPages = document.getDocumentCatalog().getPages(); - PDDocument document = pdfDocumentFactory.load(file); - log.debug("Loaded PDF document with {} pages", document.getNumberOfPages()); + redactPages(request, document, allPages); - PDPageTree allPages = document.getDocumentCatalog().getPages(); + redactAreas(redactionAreas, document, allPages); - log.debug("Starting page redactions"); - redactPages(request, document, allPages); + if (Boolean.TRUE.equals(request.getConvertPDFToImage())) { + try (PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document)) { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + convertedPdf.save(baos); + byte[] pdfContent = baos.toByteArray(); - log.debug("Starting area redactions"); - redactAreas(redactionAreas, document, allPages); + return WebResponseUtils.bytesToWebResponse( + pdfContent, + removeFileExtension( + Objects.requireNonNull( + Filenames.toSimpleFileName( + file.getOriginalFilename()))) + + "_redacted.pdf"); + } + } - if (Boolean.TRUE.equals(request.getConvertPDFToImage())) { - log.debug("Converting PDF to image format"); - PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document); - document.close(); - document = convertedPdf; + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + document.save(baos); + byte[] pdfContent = baos.toByteArray(); + + return WebResponseUtils.bytesToWebResponse( + pdfContent, + removeFileExtension( + Objects.requireNonNull( + Filenames.toSimpleFileName(file.getOriginalFilename()))) + + "_redacted.pdf"); } - - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - document.save(baos); - document.close(); - - byte[] pdfContent = baos.toByteArray(); - log.debug("Manual redaction completed. Output PDF size: {} bytes", pdfContent.length); - - return WebResponseUtils.bytesToWebResponse( - pdfContent, - Objects.requireNonNull(Filenames.toSimpleFileName(file.getOriginalFilename())) - .replaceFirst("[.][^.]+$", "") - + "_redacted.pdf"); } private void redactAreas( List redactionAreas, PDDocument document, PDPageTree allPages) throws IOException { - log.debug("Processing redaction areas"); if (redactionAreas == null || redactionAreas.isEmpty()) { - log.debug("No redaction areas to process"); return; } - // Group redaction areas by page Map> redactionsByPage = new HashMap<>(); - // Process and validate each redaction area for (RedactionArea redactionArea : redactionAreas) { - log.debug( - "Validating redaction area on page {}: x={}, y={}, width={}, height={}", - redactionArea.getPage(), - redactionArea.getX(), - redactionArea.getY(), - redactionArea.getWidth(), - redactionArea.getHeight()); if (redactionArea.getPage() == null || redactionArea.getPage() <= 0 @@ -164,51 +158,31 @@ public class RedactController { || redactionArea.getHeight() <= 0.0D || redactionArea.getWidth() == null || redactionArea.getWidth() <= 0.0D) { - log.debug("Skipping invalid redaction area: {}", redactionArea); continue; } - // Group by page number redactionsByPage .computeIfAbsent(redactionArea.getPage(), k -> new ArrayList<>()) .add(redactionArea); } - log.debug("Grouped redactions by page: {} pages affected", redactionsByPage.size()); - - // Process each page only once for (Map.Entry> entry : redactionsByPage.entrySet()) { Integer pageNumber = entry.getKey(); List areasForPage = entry.getValue(); - log.debug( - "Processing page {} with {} redaction areas", pageNumber, areasForPage.size()); - if (pageNumber > allPages.getCount()) { - log.debug( - "Skipping page {} - out of bounds (total pages: {})", - pageNumber, - allPages.getCount()); continue; // Skip if the page number is out of bounds } PDPage page = allPages.get(pageNumber - 1); - // Create only one content stream per page to draw all redaction boxes try (PDPageContentStream contentStream = new PDPageContentStream( document, page, PDPageContentStream.AppendMode.APPEND, true, true)) { - // Process all redactions for this page + contentStream.saveGraphicsState(); for (RedactionArea redactionArea : areasForPage) { Color redactColor = decodeOrDefault(redactionArea.getColor()); - log.debug( - "Applying redaction with color {} at ({}, {}) size {}x{}", - redactColor, - redactionArea.getX(), - redactionArea.getY(), - redactionArea.getWidth(), - redactionArea.getHeight()); contentStream.setNonStrokingColor(redactColor); @@ -217,17 +191,14 @@ public class RedactController { float width = redactionArea.getWidth().floatValue(); float height = redactionArea.getHeight().floatValue(); - // The y-coordinate needs to be transformed from a top-left origin to a - // bottom-left origin. float pdfY = page.getBBox().getHeight() - y - height; contentStream.addRect(x, pdfY, width, height); contentStream.fill(); } + contentStream.restoreGraphicsState(); } } - - log.debug("Completed redaction areas processing"); } private void redactPages( @@ -237,10 +208,7 @@ public class RedactController { Color redactColor = decodeOrDefault(request.getPageRedactionColor()); List pageNumbers = getPageNumbers(request, allPages.getCount()); - log.debug("Redacting {} pages with color {}", pageNumbers.size(), redactColor); - for (Integer pageNumber : pageNumbers) { - log.debug("Redacting entire page {}", pageNumber + 1); PDPage page = allPages.get(pageNumber); @@ -250,60 +218,62 @@ public class RedactController { contentStream.setNonStrokingColor(redactColor); PDRectangle box = page.getBBox(); - log.debug( - "Page {} dimensions: {}x{}", - pageNumber + 1, - box.getWidth(), - box.getHeight()); contentStream.addRect(0, 0, box.getWidth(), box.getHeight()); contentStream.fill(); } } - - log.debug("Completed page redactions"); } private void redactFoundText( PDDocument document, List blocks, float customPadding, Color redactColor) throws IOException { - log.debug( - "Redacting {} text blocks with padding {} and color {}", - blocks.size(), - customPadding, - redactColor); var allPages = document.getDocumentCatalog().getPages(); + Map> blocksByPage = new HashMap<>(); for (PDFText block : blocks) { - log.debug( - "Redacting text block on page {}: '{}' at ({}, {}) to ({}, {})", - block.getPageIndex() + 1, - block.getText(), - block.getX1(), - block.getY1(), - block.getX2(), - block.getY2()); + blocksByPage.computeIfAbsent(block.getPageIndex(), k -> new ArrayList<>()).add(block); + } - var page = allPages.get(block.getPageIndex()); + for (Map.Entry> entry : blocksByPage.entrySet()) { + Integer pageIndex = entry.getKey(); + List pageBlocks = entry.getValue(); + + if (pageIndex >= allPages.getCount()) { + continue; // Skip if page index is out of bounds + } + + var page = allPages.get(pageIndex); try (PDPageContentStream contentStream = new PDPageContentStream( document, page, PDPageContentStream.AppendMode.APPEND, true, true)) { - contentStream.setNonStrokingColor(redactColor); - float padding = - (block.getY2() - block.getY1()) * DEFAULT_TEXT_PADDING_MULTIPLIER - + customPadding; - PDRectangle pageBox = page.getBBox(); - contentStream.addRect( - block.getX1(), - pageBox.getHeight() - block.getY2() - padding, - block.getX2() - block.getX1(), - block.getY2() - block.getY1() + 2 * padding); - contentStream.fill(); + + contentStream.saveGraphicsState(); + + try { + contentStream.setNonStrokingColor(redactColor); + PDRectangle pageBox = page.getBBox(); + + for (PDFText block : pageBlocks) { + float padding = + (block.getY2() - block.getY1()) * DEFAULT_TEXT_PADDING_MULTIPLIER + + customPadding; + + contentStream.addRect( + block.getX1(), + pageBox.getHeight() - block.getY2() - padding, + block.getX2() - block.getX1(), + block.getY2() - block.getY1() + 2 * padding); + } + + contentStream.fill(); + + } finally { + contentStream.restoreGraphicsState(); + } } } - - log.debug("Completed text block redactions"); } String createPlaceholder(String originalWord) { @@ -316,16 +286,20 @@ public class RedactController { void writeFilteredContentStream(PDDocument document, PDPage page, List tokens) throws IOException { - log.debug("Writing filtered content stream with {} tokens", tokens.size()); PDStream newStream = new PDStream(document); - try (var out = newStream.createOutputStream()) { - ContentStreamWriter writer = new ContentStreamWriter(out); - writer.writeTokens(tokens); - } - page.setContents(newStream); - log.debug("Successfully wrote filtered content stream"); + try { + try (var out = newStream.createOutputStream()) { + ContentStreamWriter writer = new ContentStreamWriter(out); + writer.writeTokens(tokens); + } + + page.setContents(newStream); + + } catch (IOException e) { + throw new IOException("Failed to write filtered content stream to page", e); + } } Color decodeOrDefault(String hex) { @@ -338,7 +312,6 @@ public class RedactController { try { return Color.decode(colorString); } catch (NumberFormatException e) { - log.warn("Invalid color string '{}'. Using default color BLACK.", hex); return Color.BLACK; } } @@ -359,172 +332,327 @@ public class RedactController { @PostMapping(value = "/auto-redact", consumes = "multipart/form-data") @Operation( - summary = "Redacts listOfText in a PDF document", + summary = "Redact PDF automatically", description = - "This operation takes an input PDF file and redacts the provided listOfText." - + " Input:PDF, Output:PDF, Type:SISO") - public ResponseEntity redactPdf(@ModelAttribute RedactPdfRequest request) - throws Exception { - log.debug( - "Starting auto-redaction for file: {}", - request.getFileInput().getOriginalFilename()); - - MultipartFile file = request.getFileInput(); - String listOfTextString = request.getListOfText(); + "This endpoint automatically redacts text from a PDF file based on specified patterns. " + + "Users can provide text patterns to redact, with options for regex and whole word matching. " + + "Input:PDF Output:PDF Type:SISO") + public ResponseEntity redactPdf(@ModelAttribute RedactPdfRequest request) { + String[] listOfText = request.getListOfText().split("\n"); boolean useRegex = Boolean.TRUE.equals(request.getUseRegex()); boolean wholeWordSearchBool = Boolean.TRUE.equals(request.getWholeWordSearch()); - String colorString = request.getRedactColor(); - float customPadding = request.getCustomPadding(); - boolean convertPDFToImage = Boolean.TRUE.equals(request.getConvertPDFToImage()); - log.debug( - "Auto-redaction parameters: useRegex={}, wholeWordSearch={}, customPadding={}, convertToImage={}", - useRegex, - wholeWordSearchBool, - customPadding, - convertPDFToImage); + if (listOfText.length == 0 || (listOfText.length == 1 && listOfText[0].trim().isEmpty())) { + throw new IllegalArgumentException("No text patterns provided for redaction"); + } - String[] listOfText = listOfTextString.split("\n"); - log.debug("Searching for {} text patterns", listOfText.length); + PDDocument document = null; + PDDocument fallbackDocument = null; - PDDocument document = pdfDocumentFactory.load(file); + try { + if (request.getFileInput() == null) { + log.error("File input is null"); + throw new IllegalArgumentException("File input cannot be null"); + } - Color redactColor = decodeOrDefault(colorString); - log.debug("Using redaction color: {}", redactColor); + document = pdfDocumentFactory.load(request.getFileInput()); - // Step 1: Find all text locations for all search terms - log.debug("Step 1: Finding all text locations"); + if (document == null) { + log.error("Failed to load PDF document"); + throw new IllegalArgumentException("Failed to load PDF document"); + } + + Map> allFoundTextsByPage = + findTextToRedact(document, listOfText, useRegex, wholeWordSearchBool); + + int totalMatches = allFoundTextsByPage.values().stream().mapToInt(List::size).sum(); + log.info( + "Redaction scan: {} occurrences across {} pages (patterns={}, regex={}, wholeWord={})", + totalMatches, + allFoundTextsByPage.size(), + listOfText.length, + useRegex, + wholeWordSearchBool); + + if (allFoundTextsByPage.isEmpty()) { + log.info("No text found matching redaction patterns"); + byte[] originalContent; + try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + document.save(baos); + originalContent = baos.toByteArray(); + } + + return WebResponseUtils.bytesToWebResponse( + originalContent, + removeFileExtension( + Objects.requireNonNull( + Filenames.toSimpleFileName( + request.getFileInput() + .getOriginalFilename()))) + + "_redacted.pdf"); + } + + boolean fallbackToBoxOnlyMode; + try { + fallbackToBoxOnlyMode = + performTextReplacement( + document, + allFoundTextsByPage, + listOfText, + useRegex, + wholeWordSearchBool); + } catch (Exception e) { + log.warn( + "Text replacement redaction failed, falling back to box-only mode: {}", + e.getMessage()); + fallbackToBoxOnlyMode = true; + } + + if (fallbackToBoxOnlyMode) { + log.warn( + "Font compatibility issues detected. Using box-only redaction mode for better reliability."); + + fallbackDocument = pdfDocumentFactory.load(request.getFileInput()); + + allFoundTextsByPage = + findTextToRedact( + fallbackDocument, listOfText, useRegex, wholeWordSearchBool); + + byte[] pdfContent = + finalizeRedaction( + fallbackDocument, + allFoundTextsByPage, + request.getRedactColor(), + request.getCustomPadding(), + request.getConvertPDFToImage()); + + return WebResponseUtils.bytesToWebResponse( + pdfContent, + removeFileExtension( + Objects.requireNonNull( + Filenames.toSimpleFileName( + request.getFileInput() + .getOriginalFilename()))) + + "_redacted.pdf"); + } + + byte[] pdfContent = + finalizeRedaction( + document, + allFoundTextsByPage, + request.getRedactColor(), + request.getCustomPadding(), + request.getConvertPDFToImage()); + + return WebResponseUtils.bytesToWebResponse( + pdfContent, + removeFileExtension( + Objects.requireNonNull( + Filenames.toSimpleFileName( + request.getFileInput().getOriginalFilename()))) + + "_redacted.pdf"); + + } catch (Exception e) { + log.error("Redaction operation failed: {}", e.getMessage(), e); + throw new RuntimeException("Failed to perform PDF redaction: " + e.getMessage(), e); + + } finally { + if (document != null) { + try { + if (fallbackDocument == null) { + document.close(); + } + } catch (IOException e) { + log.warn("Failed to close main document: {}", e.getMessage()); + } + } + + if (fallbackDocument != null) { + try { + fallbackDocument.close(); + } catch (IOException e) { + log.warn("Failed to close fallback document: {}", e.getMessage()); + } + } + } + } + + private Map> findTextToRedact( + PDDocument document, String[] listOfText, boolean useRegex, boolean wholeWordSearch) { Map> allFoundTextsByPage = new HashMap<>(); - Set allSearchTerms = new HashSet<>(); + for (String text : listOfText) { text = text.trim(); if (text.isEmpty()) continue; - log.debug("Searching for text pattern: '{}'", text); - allSearchTerms.add(text); - TextFinder textFinder = new TextFinder(text, useRegex, wholeWordSearchBool); - textFinder.getText(document); - List foundTexts = textFinder.getFoundTexts(); - - log.debug("Found {} instances of pattern '{}'", foundTexts.size(), text); - - // Log details of found text instances - for (int i = 0; i < foundTexts.size(); i++) { - PDFText found = foundTexts.get(i); - log.debug( - " Match {}: '{}' on page {} at ({}, {}) to ({}, {})", - i + 1, - found.getText(), - found.getPageIndex() + 1, - found.getX1(), - found.getY1(), - found.getX2(), - found.getY2()); - } - - for (PDFText found : foundTexts) { - allFoundTextsByPage - .computeIfAbsent(found.getPageIndex(), k -> new ArrayList<>()) - .add(found); - } - } - - log.debug("Total pages with found text: {}", allFoundTextsByPage.size()); - - // Step 2: Process each page with better font fallback handling - log.debug("Step 2: Processing each page for text replacement"); - boolean fallbackToBoxOnlyMode = false; - - // Check if document uses custom encoding fonts that may cause issues - boolean hasCustomEncodingFonts = detectCustomEncodingFonts(document); - if (hasCustomEncodingFonts) { - log.info( - "Detected fonts with custom encoding. Using box-only redaction mode to preserve document integrity."); - fallbackToBoxOnlyMode = true; - } - - if (!fallbackToBoxOnlyMode) { try { - for (PDPage page : document.getPages()) { - // Replace text content - List filteredTokens = - createTokensWithoutTargetText( - page, allSearchTerms, useRegex, wholeWordSearchBool); - writeFilteredContentStream(document, page, filteredTokens); + int pageCount = document.getNumberOfPages(); + for (int pageIndex = 0; pageIndex < pageCount; pageIndex++) { + try (PDDocument singlePageDoc = new PDDocument()) { + PDPage page = document.getPage(pageIndex); + singlePageDoc.addPage(page); + + TextFinder pageFinder = new TextFinder(text, useRegex, wholeWordSearch); + + pageFinder.getText(singlePageDoc); + + for (PDFText found : pageFinder.getFoundTexts()) { + PDFText adjustedText = + new PDFText( + pageIndex, + found.getX1(), + found.getY1(), + found.getX2(), + found.getY2(), + found.getText()); + + allFoundTextsByPage + .computeIfAbsent(pageIndex, k -> new ArrayList<>()) + .add(adjustedText); + } + } catch (Exception e) { + log.error( + "Error processing page {} for search term '{}': {}", + pageIndex, + text, + e.getMessage()); + } } } catch (Exception e) { - log.warn( - "Font encoding error encountered during text modification: {}. Falling back to box-only redaction mode.", + log.error( + "Error initializing TextFinder for search term '{}': {}", + text, e.getMessage()); - fallbackToBoxOnlyMode = true; - - // Reload the document to reset any partial modifications - document.close(); - document = pdfDocumentFactory.load(file); } } - // Draw redaction boxes for all found texts + return allFoundTextsByPage; + } + + private boolean performTextReplacement( + PDDocument document, + Map> allFoundTextsByPage, + String[] listOfText, + boolean useRegex, + boolean wholeWordSearchBool) { + if (allFoundTextsByPage.isEmpty()) { + return false; + } + + if (detectCustomEncodingFonts(document)) { + log.warn( + "Problematic fonts detected (custom encodings / Type3 / damaged). " + + "Skipping inline text replacement and using box-only redaction for safety."); + return true; // signal caller to fall back + } + + try { + Set allSearchTerms = + Arrays.stream(listOfText) + .map(String::trim) + .filter(s -> !s.isEmpty()) + .collect(Collectors.toSet()); + + for (PDPage page : document.getPages()) { + List filteredTokens = + createTokensWithoutTargetText( + document, page, allSearchTerms, useRegex, wholeWordSearchBool); + writeFilteredContentStream(document, page, filteredTokens); + } + log.info("Successfully performed text replacement redaction."); + return false; + } catch (Exception e) { + log.error( + "Text replacement redaction failed due to font or encoding issues. " + + "Will fall back to box-only redaction mode. Error: {}", + e.getMessage()); + return true; + } + } + + private byte[] finalizeRedaction( + PDDocument document, + Map> allFoundTextsByPage, + String colorString, + float customPadding, + Boolean convertToImage) + throws IOException { + List allFoundTexts = new ArrayList<>(); for (List pageTexts : allFoundTextsByPage.values()) { allFoundTexts.addAll(pageTexts); } - log.debug("Drawing redaction boxes for {} total found texts", allFoundTexts.size()); - if (!allFoundTexts.isEmpty()) { - if (fallbackToBoxOnlyMode) { - log.info("Using fallback box-only redaction mode due to font encoding issues"); - log.debug( - "Text removal was skipped to preserve document integrity. Only drawing redaction boxes over {} text instances.", - allFoundTexts.size()); - } else { - log.debug( - "Using full text replacement redaction mode with {} text instances.", - allFoundTexts.size()); - } + Color redactColor = decodeOrDefault(colorString); + redactFoundText(document, allFoundTexts, customPadding, redactColor); - } else { - log.debug("No matching text found for redaction patterns"); + + cleanDocumentMetadata(document); } - if (convertPDFToImage) { - log.debug("Converting redacted PDF to image format"); - PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document); - document.close(); - document = convertedPdf; + if (Boolean.TRUE.equals(convertToImage)) { + try (PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document)) { + cleanDocumentMetadata(convertedPdf); + + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + convertedPdf.save(baos); + byte[] out = baos.toByteArray(); + + log.info( + "Redaction finalized (image mode): {} pages ➜ {} KB", + convertedPdf.getNumberOfPages(), + out.length / 1024); + + return out; + } } ByteArrayOutputStream baos = new ByteArrayOutputStream(); document.save(baos); - document.close(); + byte[] out = baos.toByteArray(); - byte[] pdfContent = baos.toByteArray(); - log.debug("Auto-redaction completed. Output PDF size: {} bytes", pdfContent.length); + log.info( + "Redaction finalized: {} pages ➜ {} KB", + document.getNumberOfPages(), + out.length / 1024); - return WebResponseUtils.bytesToWebResponse( - pdfContent, - Objects.requireNonNull(Filenames.toSimpleFileName(file.getOriginalFilename())) - .replaceFirst("[.][^.]+$", "") - + "_redacted.pdf"); + return out; + } + + private void cleanDocumentMetadata(PDDocument document) { + try { + var documentInfo = document.getDocumentInformation(); + if (documentInfo != null) { + documentInfo.setAuthor(null); + documentInfo.setSubject(null); + documentInfo.setKeywords(null); + + documentInfo.setModificationDate(java.util.Calendar.getInstance()); + + log.debug("Cleaned document metadata for security"); + } + + if (document.getDocumentCatalog() != null) { + try { + document.getDocumentCatalog().setMetadata(null); + } catch (Exception e) { + log.debug("Could not clear XMP metadata: {}", e.getMessage()); + } + } + + } catch (Exception e) { + log.warn("Failed to clean document metadata: {}", e.getMessage()); + } } - /** - * Creates a list of tokens from the page content stream, without the target text. - * - * @param page The PDF page to process. - * @param targetWords The set of words to redact. - * @param useRegex Whether to treat target words as regex patterns. - * @param wholeWordSearch Whether to match whole words only. - * @return A list of tokens with redactions applied. - * @throws IOException If an error occurs while parsing the PDF content stream. - */ List createTokensWithoutTargetText( - PDPage page, Set targetWords, boolean useRegex, boolean wholeWordSearch) + PDDocument document, + PDPage page, + Set targetWords, + boolean useRegex, + boolean wholeWordSearch) throws IOException { - log.debug( - "Creating tokens without target text for page, searching for {} words", - targetWords.size()); PDFStreamParser parser = new PDFStreamParser(page); List tokens = new ArrayList<>(); @@ -533,44 +661,50 @@ public class RedactController { tokens.add(token); } - log.debug("Parsed {} tokens from page content stream", tokens.size()); + PDResources resources = page.getResources(); + if (resources != null) { + processPageXObjects(document, resources, targetWords, useRegex, wholeWordSearch); + } List textSegments = extractTextSegments(page, tokens); - log.debug("Extracted {} text segments", textSegments.size()); - - // Log detailed text segment information - for (int i = 0; - i < Math.min(textSegments.size(), 20); - i++) { // Log first 20 segments to avoid spam - TextSegment segment = textSegments.get(i); - log.debug( - "Text segment {}: '{}' (font: {}, operator: {}, pos: {}-{})", - i, - segment.getText(), - segment.getFont() != null ? segment.getFont().getName() : "null", - segment.getOperatorName(), - segment.getStartPos(), - segment.getEndPos()); - } - if (textSegments.size() > 20) { - log.debug("... and {} more text segments", textSegments.size() - 20); - } String completeText = buildCompleteText(textSegments); - log.debug( - "Built complete text of {} characters: '{}'", - completeText.length(), - completeText.length() > 200 - ? completeText.substring(0, 200) + "..." - : completeText); List matches = findAllMatches(completeText, targetWords, useRegex, wholeWordSearch); - log.debug("Found {} matches in complete text", matches.size()); return applyRedactionsToTokens(tokens, textSegments, matches); } + List createTokensWithoutTargetText( + PDPage page, Set targetWords, boolean useRegex, boolean wholeWordSearch) + throws IOException { + try (PDDocument tempDoc = new PDDocument()) { + return createTokensWithoutTargetText( + tempDoc, page, targetWords, useRegex, wholeWordSearch); + } + } + + private void processPageXObjects( + PDDocument document, + PDResources resources, + Set targetWords, + boolean useRegex, + boolean wholeWordSearch) { + + for (COSName xobjName : resources.getXObjectNames()) { + try { + PDXObject xobj = resources.getXObject(xobjName); + if (xobj instanceof PDFormXObject formXObj) { + processFormXObject(document, formXObj, targetWords, useRegex, wholeWordSearch); + log.debug("Processed Form XObject: {}", xobjName.getName()); + } + } catch (Exception e) { + log.warn("Failed to process XObject {}: {}", xobjName.getName(), e.getMessage()); + } + } + } + @Data private static class GraphicsState { private PDFont font = null; @@ -597,7 +731,6 @@ public class RedactController { } private List extractTextSegments(PDPage page, List tokens) { - log.debug("Extracting text segments from {} tokens", tokens.size()); List segments = new ArrayList<>(); int currentTextPos = 0; @@ -614,43 +747,23 @@ public class RedactController { try { COSName fontName = (COSName) tokens.get(i - 2); COSBase fontSizeBase = (COSBase) tokens.get(i - 1); - if (fontSizeBase instanceof org.apache.pdfbox.cos.COSNumber cosNumber) { + if (fontSizeBase instanceof COSNumber cosNumber) { graphicsState.setFont(resources.getFont(fontName)); graphicsState.setFontSize(cosNumber.floatValue()); - log.debug( - "Updated font state: {} size {}", - fontName.getName(), - graphicsState.getFontSize()); } } catch (ClassCastException | IOException e) { - log.warn("Failed to update font state", e); + log.debug( + "Failed to extract font and font size from Tf operator: {}", + e.getMessage()); } } - if (isTextShowingOperator(opName) && i > 0) { - String textContent = extractTextFromToken(tokens.get(i - 1), opName); - if (!textContent.isEmpty()) { - log.debug( - "Found text segment '{}' at position {} with operator {}", - textContent, - currentTextPos, - opName); - segments.add( - new TextSegment( - i - 1, - opName, - textContent, - currentTextPos, - currentTextPos + textContent.length(), - graphicsState.font, - graphicsState.fontSize)); - currentTextPos += textContent.length(); - } - } + currentTextPos = + getCurrentTextPos( + tokens, segments, currentTextPos, graphicsState, i, opName); } } - log.debug("Extracted {} text segments from page", segments.size()); return segments; } @@ -667,112 +780,82 @@ public class RedactController { Set targetWords, boolean useRegex, boolean wholeWordSearch) { - log.debug( - "Finding matches in text of {} characters for {} target words", - completeText.length(), - targetWords.size()); - List matches = new ArrayList<>(); - - for (String target : targetWords) { - log.debug("Searching for pattern: '{}'", target); - - String patternString = useRegex ? target : Pattern.quote(target); - if (wholeWordSearch) { - patternString = "\\b" + patternString + "\\b"; - } - Pattern pattern = Pattern.compile(patternString, Pattern.CASE_INSENSITIVE); - Matcher matcher = pattern.matcher(completeText); - - int matchCount = 0; - while (matcher.find()) { - matches.add(new MatchRange(matcher.start(), matcher.end())); - matchCount++; - log.debug( - "Found match for '{}' at positions {}-{}", - target, - matcher.start(), - matcher.end()); - } - - log.debug("Total matches for '{}': {}", target, matchCount); - } - - matches.sort(Comparator.comparingInt(a -> a.startPos)); - log.debug("Found {} total matches across all patterns", matches.size()); - - return matches; + return targetWords.stream() + .map( + target -> { + String patternString = useRegex ? target : Pattern.quote(target); + if (wholeWordSearch) { + patternString = "\\b" + patternString + "\\b"; + } + return Pattern.compile(patternString, Pattern.CASE_INSENSITIVE); + }) + .flatMap(pattern -> pattern.matcher(completeText).results()) + .map(matchResult -> new MatchRange(matchResult.start(), matchResult.end())) + .sorted(Comparator.comparingInt(MatchRange::getStartPos)) + .collect(Collectors.toList()); } private List applyRedactionsToTokens( List tokens, List textSegments, List matches) { - log.debug( - "Applying redactions to {} tokens with {} text segments and {} matches", - tokens.size(), - textSegments.size(), - matches.size()); - List newTokens = new ArrayList<>(tokens); + long startTime = System.currentTimeMillis(); - // Group matches by segment to pass to modification methods - Map> matchesBySegment = new HashMap<>(); - for (MatchRange match : matches) { - for (int i = 0; i < textSegments.size(); i++) { - TextSegment segment = textSegments.get(i); - int overlapStart = Math.max(match.startPos, segment.startPos); - int overlapEnd = Math.min(match.endPos, segment.endPos); - if (overlapStart < overlapEnd) { - matchesBySegment.computeIfAbsent(i, k -> new ArrayList<>()).add(match); + try { + List newTokens = new ArrayList<>(tokens); + + Map> matchesBySegment = new HashMap<>(); + for (MatchRange match : matches) { + for (int i = 0; i < textSegments.size(); i++) { + TextSegment segment = textSegments.get(i); + int overlapStart = Math.max(match.startPos, segment.startPos); + int overlapEnd = Math.min(match.endPos, segment.endPos); + if (overlapStart < overlapEnd) { + matchesBySegment.computeIfAbsent(i, k -> new ArrayList<>()).add(match); + } } } - } - log.debug("Grouped matches by segment: {} segments affected", matchesBySegment.size()); + List tasks = new ArrayList<>(); + for (Map.Entry> entry : matchesBySegment.entrySet()) { + int segmentIndex = entry.getKey(); + List segmentMatches = entry.getValue(); + TextSegment segment = textSegments.get(segmentIndex); - // Create a list of modification tasks - List tasks = new ArrayList<>(); - for (Map.Entry> entry : matchesBySegment.entrySet()) { - int segmentIndex = entry.getKey(); - List segmentMatches = entry.getValue(); - TextSegment segment = textSegments.get(segmentIndex); + if ("Tj".equals(segment.operatorName) || "'".equals(segment.operatorName)) { + String newText = applyRedactionsToSegmentText(segment, segmentMatches); + try { + float adjustment = calculateWidthAdjustment(segment, segmentMatches); + tasks.add(new ModificationTask(segment, newText, adjustment)); + } catch (Exception e) { + log.debug( + "Width adjustment calculation failed for segment: {}", + e.getMessage()); + } + } else if ("TJ".equals(segment.operatorName)) { + tasks.add(new ModificationTask(segment, null, 0)); + } + } + tasks.sort((a, b) -> Integer.compare(b.segment.tokenIndex, a.segment.tokenIndex)); + + for (ModificationTask task : tasks) { + List segmentMatches = + matchesBySegment.getOrDefault( + textSegments.indexOf(task.segment), Collections.emptyList()); + modifyTokenForRedaction( + newTokens, task.segment, task.newText, task.adjustment, segmentMatches); + } + + return newTokens; + + } finally { + long processingTime = System.currentTimeMillis() - startTime; log.debug( - "Creating modification task for segment {} with {} matches", - segmentIndex, - segmentMatches.size()); - - if ("Tj".equals(segment.operatorName) || "'".equals(segment.operatorName)) { - String newText = applyRedactionsToSegmentText(segment, segmentMatches); - try { - float adjustment = calculateWidthAdjustment(segment, segmentMatches); - tasks.add(new ModificationTask(segment, newText, adjustment)); - } catch (Exception e) { - log.warn( - "Failed to calculate width adjustment for redaction due to font encoding issues: {}. Using zero adjustment.", - e.getMessage()); - tasks.add(new ModificationTask(segment, newText, 0)); - } - } else if ("TJ".equals(segment.operatorName)) { - tasks.add(new ModificationTask(segment, null, 0)); - } + "Token redaction processing completed in {} ms for {} matches", + processingTime, + matches.size()); } - - // Sort tasks by token index in descending order to avoid index shifting issues - tasks.sort((a, b) -> Integer.compare(b.segment.tokenIndex, a.segment.tokenIndex)); - - log.debug("Applying {} modification tasks", tasks.size()); - - // Apply modifications - for (ModificationTask task : tasks) { - List segmentMatches = - matchesBySegment.getOrDefault( - textSegments.indexOf(task.segment), Collections.emptyList()); - modifyTokenForRedaction( - newTokens, task.segment, task.newText, task.adjustment, segmentMatches); - } - - log.debug("Completed applying redactions to tokens"); - return newTokens; } @Data @@ -800,357 +883,124 @@ public class RedactController { return result.toString(); } - private float safeGetStringWidth(PDFont font, String text) throws IOException { + private float safeGetStringWidth(PDFont font, String text) { if (font == null || text == null || text.isEmpty()) { return 0; } try { - // First, try to get the width directly for standard fonts return font.getStringWidth(text); } catch (Exception e) { - log.debug( - "Font width calculation failed for '{}' in font {}: {}. Using fallback strategies.", - text, - font.getName(), - e.getMessage()); - - // Strategy 1: Character-by-character encoding test - float charByCharWidth = getCharacterByCharacterWidth(font, text); - if (charByCharWidth > 0) { - return charByCharWidth; - } - - // Strategy 2: Use font substitution with Standard 14 fonts - float substitutionWidth = getWidthWithFontSubstitution(font, text); - if (substitutionWidth > 0) { - return substitutionWidth; - } - - // Strategy 3: Statistical estimation based on real font metrics - return getStatisticalWidth(text, font); - } - } - - private float getCharacterByCharacterWidth(PDFont font, String text) { - if (text == null || text.isEmpty()) { - return 0; - } - - try { - float totalWidth = 0; - for (char c : text.toCharArray()) { - try { - String charStr = String.valueOf(c); - font.encode(charStr); // Test if character can be encoded - totalWidth += font.getStringWidth(charStr); - } catch (Exception e) { + try { + float totalWidth = 0; + for (int i = 0; i < text.length(); i++) { + String character = text.substring(i, i + 1); try { - totalWidth += font.getStringWidth(" "); + byte[] encoded = font.encode(character); + if (encoded.length > 0) { + int glyphCode = encoded[0] & 0xFF; + + float glyphWidth = font.getWidth(glyphCode); + + if (glyphWidth == 0) { + try { + glyphWidth = font.getWidthFromFont(glyphCode); + } catch (Exception e2) { + glyphWidth = font.getAverageFontWidth(); + } + } + + totalWidth += glyphWidth; + } else { + totalWidth += font.getAverageFontWidth(); + } } catch (Exception e2) { - totalWidth += 500; // Standard average width + totalWidth += font.getAverageFontWidth(); } } - } - return totalWidth; - } catch (Exception e) { - log.debug("Character-by-character width calculation failed: {}", e.getMessage()); - return 0; // Failed, try next strategy - } - } - - private float getWidthWithFontSubstitution(PDFont originalFont, String text) { - try { - PDFont substituteFont = findBestStandardFontSubstitute(originalFont); - float width = substituteFont.getStringWidth(text); - - FontCharacteristics characteristics = getFontCharacteristics(originalFont); - - return width; - } catch (Exception e) { - log.debug("Font substitution width calculation failed: {}", e.getMessage()); - } - return 0; // Failed, try next strategy - } - - private PDFont findBestStandardFontSubstitute(PDFont originalFont) { - String fontFamily = null; - String fontName = null; - boolean isBold = false; - boolean isItalic = false; - boolean isMonospace = false; - - try { - // Try to get font metadata from PDFontDescriptor - if (originalFont.getFontDescriptor() != null) { - fontFamily = originalFont.getFontDescriptor().getFontFamily(); - - if (fontFamily == null || fontFamily.isEmpty()) { - fontName = originalFont.getFontDescriptor().getFontName(); - } - - int flags = originalFont.getFontDescriptor().getFlags(); - isBold = (flags & 0x40) != 0; // Check if FORCE_BOLD flag is set (0x40) - isItalic = (flags & 0x40000) != 0; // Check if ITALIC flag is set (0x40000) - isMonospace = (flags & 0x1) != 0; // Check if FIXED_PITCH flag is set (0x1) - } - } catch (Exception e) { - log.debug("Error accessing font descriptor: {}", e.getMessage()); - } - - // If we couldn't get metadata from descriptor, fall back to font name - if ((fontFamily == null || fontFamily.isEmpty()) - && (fontName == null || fontName.isEmpty())) { - fontName = originalFont.getName().toLowerCase(); - } else if (fontFamily != null) { - fontFamily = fontFamily.toLowerCase(); - } else { - fontName = fontName.toLowerCase(); - } - - // Determine font characteristics based on metadata or name - boolean isSerif = false; - boolean isCourier = false; - - // Check font family first - if (fontFamily != null) { - isCourier = fontFamily.contains("courier"); - isMonospace = - isMonospace - || isCourier - || fontFamily.contains("mono") - || fontFamily.contains("fixed"); - isSerif = - fontFamily.contains("times") - || fontFamily.contains("serif") - || fontFamily.contains("roman"); - } - - // If needed, check font name as fallback - if (fontName != null) { - isCourier = isCourier || fontName.contains("courier"); - isMonospace = - isMonospace - || isCourier - || fontName.contains("mono") - || fontName.contains("fixed"); - isSerif = - isSerif - || fontName.contains("times") - || fontName.contains("serif") - || fontName.contains("roman"); - isBold = isBold || fontName.contains("bold"); - isItalic = isItalic || fontName.contains("italic") || fontName.contains("oblique"); - } - - // Select the appropriate standard font based on characteristics - if (isMonospace) { - return new PDType1Font(Standard14Fonts.FontName.COURIER); - } - - if (isSerif) { - if (isBold && isItalic) { - return new PDType1Font(Standard14Fonts.FontName.TIMES_BOLD_ITALIC); - } else if (isBold) { - return new PDType1Font(Standard14Fonts.FontName.TIMES_BOLD); - } else if (isItalic) { - return new PDType1Font(Standard14Fonts.FontName.TIMES_ITALIC); - } else { - return new PDType1Font(Standard14Fonts.FontName.TIMES_ROMAN); - } - } - - // Sans-serif fonts (Helvetica) - if (isBold && isItalic) { - return new PDType1Font(Standard14Fonts.FontName.HELVETICA_BOLD_OBLIQUE); - } else if (isBold) { - return new PDType1Font(Standard14Fonts.FontName.HELVETICA_BOLD); - } else if (isItalic) { - return new PDType1Font(Standard14Fonts.FontName.HELVETICA_OBLIQUE); - } - - return new PDType1Font(Standard14Fonts.FontName.HELVETICA); - } - - private float getStatisticalWidth(String text, PDFont font) { - if (text == null || text.isEmpty()) { - return 0; - } - - PDFont referenceFont = findBestStandardFontSubstitute(font); - - // Get font characteristics using metadata - FontCharacteristics characteristics = getFontCharacteristics(font); - - try { - - return referenceFont.getStringWidth(text); - } catch (Exception e) { - float avgCharWidth = getAverageCharacterWidth(font); - return text.length() * avgCharWidth; - } - } - - private FontCharacteristics getFontCharacteristics(PDFont font) { - FontCharacteristics characteristics = new FontCharacteristics(); - - try { - // Try to get font metadata from PDFontDescriptor - if (font.getFontDescriptor() != null) { - characteristics.fontFamily = font.getFontDescriptor().getFontFamily(); - - if (characteristics.fontFamily == null || characteristics.fontFamily.isEmpty()) { - characteristics.fontName = font.getFontDescriptor().getFontName(); - } - - int flags = font.getFontDescriptor().getFlags(); - characteristics.isBold = (flags & 0x40) != 0; // FORCE_BOLD flag - characteristics.isItalic = (flags & 0x40000) != 0; // ITALIC flag - characteristics.isMonospace = (flags & 0x1) != 0; // FIXED_PITCH flag - } - } catch (Exception e) { - log.debug("Error accessing font descriptor: {}", e.getMessage()); - } - - // If we couldn't get metadata from descriptor, fall back to font name - if ((characteristics.fontFamily == null || characteristics.fontFamily.isEmpty()) - && (characteristics.fontName == null || characteristics.fontName.isEmpty())) { - characteristics.fontName = font.getName(); - } - - if (characteristics.fontFamily != null) { - characteristics.fontFamily = characteristics.fontFamily.toLowerCase(); - } - if (characteristics.fontName != null) { - characteristics.fontName = characteristics.fontName.toLowerCase(); - } - - if (characteristics.fontFamily != null) { - characteristics.isCourier = characteristics.fontFamily.contains("courier"); - characteristics.isMonospace = - characteristics.isMonospace - || characteristics.isCourier - || characteristics.fontFamily.contains("mono") - || characteristics.fontFamily.contains("fixed"); - characteristics.isSerif = - characteristics.fontFamily.contains("times") - || characteristics.fontFamily.contains("serif") - || characteristics.fontFamily.contains("roman"); - characteristics.isTimesNewRoman = - characteristics.fontFamily.contains("timesnewroman") - || characteristics.fontFamily.contains("timesnew"); - } - - if (characteristics.fontName != null) { - characteristics.isCourier = - characteristics.isCourier || characteristics.fontName.contains("courier"); - characteristics.isMonospace = - characteristics.isMonospace - || characteristics.isCourier - || characteristics.fontName.contains("mono") - || characteristics.fontName.contains("fixed"); - characteristics.isSerif = - characteristics.isSerif - || characteristics.fontName.contains("times") - || characteristics.fontName.contains("serif") - || characteristics.fontName.contains("roman"); - characteristics.isBold = - characteristics.isBold || characteristics.fontName.contains("bold"); - characteristics.isItalic = - characteristics.isItalic - || characteristics.fontName.contains("italic") - || characteristics.fontName.contains("oblique"); - characteristics.isTimesNewRoman = - characteristics.isTimesNewRoman - || (characteristics.fontName.contains("timesnewroman") - || characteristics.fontName.contains("timesnew")) - && (characteristics.fontName.contains("psmt") - || characteristics.fontName.contains("ps-")); - } - - return characteristics; - } - - private static class FontCharacteristics { - String fontFamily; - String fontName; - boolean isBold; - boolean isItalic; - boolean isMonospace; - boolean isSerif; - boolean isCourier; - boolean isTimesNewRoman; - } - - private float getAverageCharacterWidth(PDFont font) { - String sampleText = "etaoinshrdlucmfwypvbgkjqxz0123456789 ,."; - - FontCharacteristics characteristics = getFontCharacteristics(font); - - try { - - return font.getStringWidth(sampleText) / sampleText.length(); - } catch (Exception e) { - try { - PDFont substituteFont = findBestStandardFontSubstitute(font); - - return substituteFont.getStringWidth(sampleText) / sampleText.length(); + return totalWidth; } catch (Exception e2) { - if (characteristics.isMonospace || characteristics.isCourier) { - return 600; // Monospace fonts - } else if (characteristics.isTimesNewRoman) { - return 550; // TimesNewRoman fonts - increased from standard Times - } else if (characteristics.isSerif) { - return 480; // Times-style serif fonts - } else if (characteristics.fontFamily != null - && (characteristics.fontFamily.contains("arial") - || characteristics.fontFamily.contains("helvetica"))) { - return 520; // Helvetica/Arial-style sans-serif - } else if (characteristics.fontName != null - && (characteristics.fontName.contains("arial") - || characteristics.fontName.contains("helvetica"))) { - return 520; // Helvetica/Arial-style sans-serif - } else { - return 500; // Generic sans-serif average - } + log.debug("PDFBox API width calculation failed: {}", e2.getMessage()); } + + try { + if (font.getFontDescriptor() != null + && font.getFontDescriptor().getFontBoundingBox() != null) { + PDRectangle bbox = font.getFontDescriptor().getFontBoundingBox(); + float avgCharWidth = bbox.getHeight() / 1000f * 0.865f; + return text.length() * avgCharWidth * FONT_SCALE_FACTOR; + } + } catch (Exception e2) { + log.debug("Font bounding box width calculation failed: {}", e2.getMessage()); + } + + try { + float avgWidth = font.getAverageFontWidth(); + return text.length() * avgWidth; + } catch (Exception e2) { + log.debug("Average font width calculation failed: {}", e2.getMessage()); + } + + float conservativeWidth = text.length() * 500f; // 500 units per character + log.debug( + "All width calculation methods failed for font {}, using conservative estimate: {}", + font.getName(), + conservativeWidth); + return conservativeWidth; } } - private float calculateWidthAdjustment(TextSegment segment, List matches) - throws IOException { + private float calculateWidthAdjustment(TextSegment segment, List matches) { try { - float totalOriginalWidth = 0; - float totalPlaceholderWidth = 0; + if (segment.getFont() == null || segment.getFontSize() <= 0) { + return 0; + } + + String fontName = segment.getFont().getName(); + if (fontName != null && (fontName.contains("HOEPAP") || isFontSubset(fontName))) { + log.debug("Skipping width adjustment for problematic/subset font: {}", fontName); + return 0; + } + + float totalOriginal = 0; + float totalPlaceholder = 0; + String text = segment.getText(); for (MatchRange match : matches) { - int segmentStart = Math.max(0, match.getStartPos() - segment.getStartPos()); - int segmentEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos()); + int segStart = Math.max(0, match.getStartPos() - segment.getStartPos()); + int segEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos()); - if (segmentStart < text.length() && segmentEnd > segmentStart) { - String originalPart = text.substring(segmentStart, segmentEnd); + if (segStart < text.length() && segEnd > segStart) { + String originalPart = text.substring(segStart, segEnd); String placeholderPart = createPlaceholder(originalPart); - if (segment.getFont() != null) { - totalOriginalWidth += - safeGetStringWidth(segment.getFont(), originalPart) - / FONT_SCALE_FACTOR - * segment.getFontSize(); - totalPlaceholderWidth += - safeGetStringWidth(segment.getFont(), placeholderPart) - / FONT_SCALE_FACTOR - * segment.getFontSize(); - } + float origUnits = safeGetStringWidth(segment.getFont(), originalPart); + float placeUnits = safeGetStringWidth(segment.getFont(), placeholderPart); + + float orig = (origUnits / FONT_SCALE_FACTOR) * segment.getFontSize(); + float place = (placeUnits / FONT_SCALE_FACTOR) * segment.getFontSize(); + + totalOriginal += orig; + totalPlaceholder += place; } } - return totalOriginalWidth - totalPlaceholderWidth; - } catch (Exception e) { - log.warn( - "Failed to calculate width adjustment for segment '{}' due to font encoding issues: {}. Skipping adjustment.", - segment.getText(), - e.getMessage()); - return 0; // No adjustment when font operations fail + + float adjustment = totalOriginal - totalPlaceholder; + + float maxReasonableAdjustment = segment.getText().length() * segment.getFontSize() * 2; + if (Math.abs(adjustment) > maxReasonableAdjustment) { + log.debug( + "Width adjustment {} seems unreasonable for text length {}, capping to 0", + adjustment, + segment.getText().length()); + return 0; + } + + return adjustment; + } catch (Exception ex) { + log.debug("Width adjustment failed: {}", ex.getMessage()); + return 0; } } @@ -1160,17 +1010,8 @@ public class RedactController { String newText, float adjustment, List matches) { - log.debug( - "Modifying token at index {} for segment '{}' with operator {}", - segment.getTokenIndex(), - segment.getText(), - segment.getOperatorName()); if (segment.getTokenIndex() < 0 || segment.getTokenIndex() >= tokens.size()) { - log.debug( - "Token index {} out of bounds (0-{})", - segment.getTokenIndex(), - tokens.size() - 1); return; } @@ -1180,37 +1021,21 @@ public class RedactController { try { if (("Tj".equals(operatorName) || "'".equals(operatorName)) && token instanceof COSString) { - log.debug("Modifying Tj/quote operator with adjustment {}", adjustment); if (Math.abs(adjustment) < PRECISION_THRESHOLD) { - tokens.set(segment.getTokenIndex(), new COSString(newText)); + if (newText.isEmpty()) { + tokens.set(segment.getTokenIndex(), EMPTY_COS_STRING); + } else { + tokens.set(segment.getTokenIndex(), new COSString(newText)); + } } else { COSArray newArray = new COSArray(); newArray.add(new COSString(newText)); if (segment.getFontSize() > 0) { - float adjustmentFactor = 1.05f; - float kerning = - -1 - * adjustment - * adjustmentFactor - * (FONT_SCALE_FACTOR / segment.getFontSize()); + float kerning = (-adjustment / segment.getFontSize()) * FONT_SCALE_FACTOR; - // Cap kerning value to prevent extreme outliers that mangle text - float maxKerning = 500f; - if (Math.abs(kerning) > maxKerning) { - log.warn( - "Kerning value {} is an outlier. Capping to {}.", - kerning, - kerning > 0 ? maxKerning : -maxKerning); - kerning = Math.max(-maxKerning, Math.min(maxKerning, kerning)); - } - - newArray.add(new org.apache.pdfbox.cos.COSFloat(kerning)); - log.debug( - "Applied kerning adjustment: {} for width adjustment: {}", - kerning, - adjustment); + newArray.add(new COSFloat(kerning)); } tokens.set(segment.getTokenIndex(), newArray); @@ -1219,25 +1044,22 @@ public class RedactController { && tokens.get(operatorIndex) instanceof Operator op && op.getName().equals(operatorName)) { tokens.set(operatorIndex, Operator.getOperator("TJ")); - log.debug("Changed operator from {} to TJ", operatorName); } } } else if ("TJ".equals(operatorName) && token instanceof COSArray) { - log.debug("Modifying TJ operator array"); COSArray newArray = createRedactedTJArray((COSArray) token, segment, matches); tokens.set(segment.getTokenIndex(), newArray); } } catch (Exception e) { - log.warn( - "Failed to modify token for redaction due to font encoding issues: {}. Skipping text modification for segment '{}'.", - e.getMessage(), - segment.getText()); + log.debug( + "Token modification failed for segment at index {}: {}", + segment.getTokenIndex(), + e.getMessage()); } } private COSArray createRedactedTJArray( - COSArray originalArray, TextSegment segment, List matches) - throws IOException { + COSArray originalArray, TextSegment segment, List matches) { try { COSArray newArray = new COSArray(); int textOffsetInSegment = 0; @@ -1287,26 +1109,17 @@ public class RedactController { * segment.getFontSize(); float adjustment = originalWidth - modifiedWidth; if (Math.abs(adjustment) > PRECISION_THRESHOLD) { - - float adjustmentFactor = 1.05f; // Increase kerning, visually more - // natural float kerning = - -1 - * adjustment - * adjustmentFactor - * (FONT_SCALE_FACTOR / segment.getFontSize()); + (-adjustment / segment.getFontSize()) + * FONT_SCALE_FACTOR + * 1.10f; - newArray.add(new org.apache.pdfbox.cos.COSFloat(kerning)); - log.debug( - "Applied kerning adjustment: {} for width adjustment: {}", - kerning, - adjustment); + newArray.add(new COSFloat(kerning)); } } catch (Exception e) { - log.warn( - "Failed to calculate kerning adjustment for TJ array element due to font encoding issues: {}. Skipping adjustment.", + log.debug( + "Width adjustment calculation failed for segment: {}", e.getMessage()); - // Continue without kerning adjustment } } @@ -1317,10 +1130,6 @@ public class RedactController { } return newArray; } catch (Exception e) { - log.warn( - "Failed to create redacted TJ array due to font encoding issues: {}. Returning original array.", - e.getMessage()); - // Return the original array if we can't modify it safely return originalArray; } } @@ -1349,70 +1158,233 @@ public class RedactController { }; } - /** - * Detects if the document contains fonts with custom encoding that may cause text modification - * issues. Custom encoding fonts often have internal character mappings that don't follow - * Unicode standards. - */ private boolean detectCustomEncodingFonts(PDDocument document) { try { - log.debug("Starting font encoding detection..."); + var documentCatalog = document.getDocumentCatalog(); + if (documentCatalog == null) { + return false; + } + for (PDPage page : document.getPages()) { PDResources resources = page.getResources(); - if (resources != null) { - int fontCount = 0; - for (COSName fn : resources.getFontNames()) fontCount++; - log.debug("Found {} fonts on page", fontCount); - for (COSName fontName : resources.getFontNames()) { - try { - PDFont font = resources.getFont(fontName); - if (font != null) { - String name = font.getName(); - log.debug( - "Analyzing font: {} (type: {})", - name, - font.getClass().getSimpleName()); + if (resources == null) { + continue; + } - // Check for font names that commonly indicate custom encoding - if (name != null - && (name.contains("HOEP") - || // Common custom encoding prefix - name.contains("+") - || // Subset fonts often have custom encoding - name.matches(".*[A-Z]{6}\\+.*") // Six letter prefix - // pattern - )) { - log.debug("Detected potential custom encoding font: {}", name); - // Try a simple encoding test - try { - font.encode(" "); // Test space character - font.getStringWidth(" "); - log.debug("Font {} passed basic encoding test", name); - } catch (Exception e) { - log.debug( - "Font {} failed basic encoding test: {}", - name, - e.getMessage()); - return true; - } - } else { - log.debug("Font {} appears to use standard encoding", name); - } - } - } catch (Exception e) { + for (COSName fontName : resources.getFontNames()) { + try { + PDFont font = resources.getFont(fontName); + if (font != null && hasProblematicFontCharacteristics(font)) { log.debug( - "Error checking font for custom encoding: {}", e.getMessage()); + "Detected problematic font: {} (type: {})", + font.getName(), + font.getClass().getSimpleName()); + return true; } + } catch (IOException e) { + log.debug( + "Font loading failed for {}: {}", + fontName.getName(), + e.getMessage()); + return true; } } } - log.debug("Font encoding detection complete - no problematic fonts found"); return false; } catch (Exception e) { - log.warn( - "Error detecting custom encoding fonts: {}. Assuming custom encoding present.", - e.getMessage()); - return true; // Err on the side of caution + log.warn("Font detection analysis failed: {}", e.getMessage()); + return false; } } + + private boolean hasProblematicFontCharacteristics(PDFont font) { + try { + if (font.isDamaged()) { + log.debug("Font {} is marked as damaged by PDFBox", font.getName()); + return true; + } + + String fontName = font.getName(); + if (isFontSubset(fontName)) { + if (hasKnownProblematicPattern(fontName)) { + return cannotCalculateBasicWidths(font); + } + return false; + } + + String fontType = font.getClass().getSimpleName(); + if ("PDType3Font".equals(fontType)) { + return cannotCalculateBasicWidths(font); + } + + return false; + + } catch (Exception e) { + log.debug("Font analysis failed for {}: {}", font.getName(), e.getMessage()); + return true; + } + } + + private boolean cannotCalculateBasicWidths(PDFont font) { + try { + float spaceWidth = font.getStringWidth(" "); + if (spaceWidth <= 0) { + return true; + } + + String[] testChars = {"a", "A", "0", ".", "e", "!"}; + for (String ch : testChars) { + try { + float width = font.getStringWidth(ch); + if (width > 0) { + return false; // Found at least one character we can measure + } + } catch (Exception e) { + } + } + + return true; // Can't calculate width for any test characters + } catch (Exception e) { + return true; // Font failed basic width calculation + } + } + + private boolean isFontSubset(String fontName) { + if (fontName == null) { + return false; + } + return fontName.matches("^[A-Z]{6}\\+.*"); + } + + private boolean hasKnownProblematicPattern(String fontName) { + if (fontName == null) { + return false; + } + + return fontName.contains("HOEPAP") + || fontName.contains("HOEPGL") + || fontName.contains("HOEPNL") + || fontName.toLowerCase().contains("corrupt") + || fontName.toLowerCase().contains("damaged"); + } + + private void processFormXObject( + PDDocument document, + PDFormXObject formXObject, + Set targetWords, + boolean useRegex, + boolean wholeWordSearch) { + + try { + PDResources xobjResources = formXObject.getResources(); + if (xobjResources == null) { + return; + } + + for (COSName xobjName : xobjResources.getXObjectNames()) { + PDXObject nestedXObj = xobjResources.getXObject(xobjName); + if (nestedXObj instanceof PDFormXObject nestedFormXObj) { + processFormXObject( + document, nestedFormXObj, targetWords, useRegex, wholeWordSearch); + } + } + + PDFStreamParser parser = new PDFStreamParser(formXObject); + List tokens = new ArrayList<>(); + Object token; + while ((token = parser.parseNextToken()) != null) { + tokens.add(token); + } + + List textSegments = extractTextSegmentsFromXObject(xobjResources, tokens); + String completeText = buildCompleteText(textSegments); + + List matches = + findAllMatches(completeText, targetWords, useRegex, wholeWordSearch); + + if (!matches.isEmpty()) { + List redactedTokens = + applyRedactionsToTokens(tokens, textSegments, matches); + writeRedactedContentToXObject(document, formXObject, redactedTokens); + log.debug("Processed {} redactions in Form XObject", matches.size()); + } + + } catch (Exception e) { + log.warn("Failed to process Form XObject: {}", e.getMessage()); + } + } + + private List extractTextSegmentsFromXObject( + PDResources resources, List tokens) { + List segments = new ArrayList<>(); + int currentTextPos = 0; + GraphicsState graphicsState = new GraphicsState(); + + for (int i = 0; i < tokens.size(); i++) { + Object currentToken = tokens.get(i); + + if (currentToken instanceof Operator op) { + String opName = op.getName(); + + if ("Tf".equals(opName) && i >= 2) { + try { + COSName fontName = (COSName) tokens.get(i - 2); + COSBase fontSizeBase = (COSBase) tokens.get(i - 1); + if (fontSizeBase instanceof COSNumber cosNumber) { + graphicsState.setFont(resources.getFont(fontName)); + graphicsState.setFontSize(cosNumber.floatValue()); + } + } catch (ClassCastException | IOException e) { + log.debug("Font extraction failed in XObject: {}", e.getMessage()); + } + } + + currentTextPos = + getCurrentTextPos( + tokens, segments, currentTextPos, graphicsState, i, opName); + } + } + + return segments; + } + + private int getCurrentTextPos( + List tokens, + List segments, + int currentTextPos, + GraphicsState graphicsState, + int i, + String opName) { + if (isTextShowingOperator(opName) && i > 0) { + String textContent = extractTextFromToken(tokens.get(i - 1), opName); + if (!textContent.isEmpty()) { + segments.add( + new TextSegment( + i - 1, + opName, + textContent, + currentTextPos, + currentTextPos + textContent.length(), + graphicsState.font, + graphicsState.fontSize)); + currentTextPos += textContent.length(); + } + } + return currentTextPos; + } + + private void writeRedactedContentToXObject( + PDDocument document, PDFormXObject formXObject, List redactedTokens) + throws IOException { + + PDStream newStream = new PDStream(document); + + try (var out = newStream.createOutputStream()) { + ContentStreamWriter writer = new ContentStreamWriter(out); + writer.writeTokens(redactedTokens); + } + + formXObject.getCOSObject().removeItem(COSName.CONTENTS); + formXObject.getCOSObject().setItem(COSName.CONTENTS, newStream.getCOSObject()); + } } diff --git a/stirling-pdf/src/test/java/stirling/software/SPDF/controller/api/security/RedactControllerTest.java b/stirling-pdf/src/test/java/stirling/software/SPDF/controller/api/security/RedactControllerTest.java index ab501f143..3e83650d6 100644 --- a/stirling-pdf/src/test/java/stirling/software/SPDF/controller/api/security/RedactControllerTest.java +++ b/stirling-pdf/src/test/java/stirling/software/SPDF/controller/api/security/RedactControllerTest.java @@ -224,6 +224,78 @@ class RedactControllerTest { void redactMultipleSearchTerms() throws Exception { testAutoRedaction("confidential\nsecret\nprivate\nclassified", false, true, "#FF0000", 2.0f, false, true); } + + @Test + @DisplayName("Should handle very large number of search terms") + void handleLargeNumberOfSearchTerms() throws Exception { + StringBuilder terms = new StringBuilder(); + for (int i = 0; i < 100; i++) { + terms.append("term").append(i).append("\n"); + } + testAutoRedaction(terms.toString(), false, false, "#000000", 1.0f, false, true); + } + + @Test + @DisplayName("Should handle complex document structure") + void handleComplexDocumentStructure() throws Exception { + when(mockPages.getCount()).thenReturn(5); + when(mockDocument.getNumberOfPages()).thenReturn(5); + + List pageList = new ArrayList<>(); + for (int i = 0; i < 5; i++) { + PDPage page = mock(PDPage.class); + PDRectangle pageRect = new PDRectangle(0, 0, 612, 792); + when(page.getCropBox()).thenReturn(pageRect); + when(page.getMediaBox()).thenReturn(pageRect); + when(page.getBBox()).thenReturn(pageRect); + when(page.hasContents()).thenReturn(true); + + InputStream mockInputStream = new ByteArrayInputStream( + ("BT /F1 12 Tf 100 200 Td (page " + i + " content with confidential info) Tj ET").getBytes()); + when(page.getContents()).thenReturn(mockInputStream); + + pageList.add(page); + } + + when(mockPages.iterator()).thenReturn(pageList.iterator()); + for (int i = 0; i < 5; i++) { + when(mockPages.get(i)).thenReturn(pageList.get(i)); + } + + testAutoRedaction("confidential", false, false, "#000000", 1.0f, false, true); + + // Reset to original state + reset(mockPages); + when(mockPages.getCount()).thenReturn(1); + when(mockPages.get(0)).thenReturn(mockPage); + when(mockPages.iterator()).thenReturn(Collections.singletonList(mockPage).iterator()); + when(mockDocument.getNumberOfPages()).thenReturn(1); + } + + @Test + @DisplayName("Should handle document with metadata") + void handleDocumentWithMetadata() throws Exception { + RedactPdfRequest request = createRedactPdfRequest(); + request.setListOfText("confidential"); + request.setUseRegex(false); + request.setWholeWordSearch(false); + request.setRedactColor("#000000"); + request.setCustomPadding(1.0f); + request.setConvertPDFToImage(false); + + when(mockPages.get(0)).thenReturn(mockPage); + + org.apache.pdfbox.pdmodel.PDDocumentInformation mockInfo = mock(org.apache.pdfbox.pdmodel.PDDocumentInformation.class); + when(mockDocument.getDocumentInformation()).thenReturn(mockInfo); + + ResponseEntity response = redactController.redactPdf(request); + + assertNotNull(response); + assertEquals(200, response.getStatusCode().value()); + + verify(mockDocument).save(any(ByteArrayOutputStream.class)); + verify(mockDocument).close(); + } } @Nested @@ -283,14 +355,6 @@ class RedactControllerTest { void handleWordBoundariesCorrectly() throws Exception { testAutoRedaction("confidential", false, true, "#FF0000", 1.0f, false, true); } - - @Test - @DisplayName("Should distinguish between partial and whole word matches") - void distinguishBetweenPartialAndWholeWordMatches() throws Exception { - // Test both whole word and partial matching - testAutoRedaction("secret", false, true, "#000000", 1.0f, false, true); - testAutoRedaction("secret", false, false, "#000000", 1.0f, false, true); - } } @Nested @@ -419,6 +483,74 @@ class RedactControllerTest { List overlappingAreas = createOverlappingRedactionAreas(); testManualRedaction(overlappingAreas, false); } + + @Test + @DisplayName("Should handle redaction areas with different colors") + void handleRedactionAreasWithDifferentColors() throws Exception { + List areas = new ArrayList<>(); + + String[] colors = {"FF0000", "00FF00", "0000FF", "FFFF00", "FF00FF", "00FFFF"}; + for (int i = 0; i < colors.length; i++) { + RedactionArea area = new RedactionArea(); + area.setPage(1); + area.setX(50.0 + (i * 60)); + area.setY(50.0); + area.setWidth(50.0); + area.setHeight(30.0); + area.setColor(colors[i]); + areas.add(area); + } + + testManualRedaction(areas, false); + } + + @Test + @DisplayName("Should handle redaction areas on multiple pages") + void handleRedactionAreasOnMultiplePages() throws Exception { + when(mockPages.getCount()).thenReturn(3); + when(mockDocument.getNumberOfPages()).thenReturn(3); + + List pageList = new ArrayList<>(); + for (int i = 0; i < 3; i++) { + PDPage page = mock(PDPage.class); + PDRectangle pageRect = new PDRectangle(0, 0, 612, 792); + when(page.getCropBox()).thenReturn(pageRect); + when(page.getMediaBox()).thenReturn(pageRect); + when(page.getBBox()).thenReturn(pageRect); + when(page.hasContents()).thenReturn(true); + + InputStream mockInputStream = new ByteArrayInputStream( + ("BT /F1 12 Tf 100 200 Td (page " + i + " content) Tj ET").getBytes()); + when(page.getContents()).thenReturn(mockInputStream); + + pageList.add(page); + } + + when(mockPages.iterator()).thenReturn(pageList.iterator()); + for (int i = 0; i < 3; i++) { + when(mockPages.get(i)).thenReturn(pageList.get(i)); + } + + List areas = new ArrayList<>(); + for (int i = 0; i < 3; i++) { + RedactionArea area = new RedactionArea(); + area.setPage(i + 1); // Pages are 1-indexed + area.setX(100.0); + area.setY(100.0); + area.setWidth(200.0); + area.setHeight(50.0); + area.setColor("000000"); + areas.add(area); + } + + testManualRedaction(areas, false); + + reset(mockPages); + when(mockPages.getCount()).thenReturn(1); + when(mockPages.get(0)).thenReturn(mockPage); + when(mockPages.iterator()).thenReturn(Collections.singletonList(mockPage).iterator()); + when(mockDocument.getNumberOfPages()).thenReturn(1); + } } @Nested @@ -507,6 +639,55 @@ class RedactControllerTest { void handleWhitespaceOnlySearchTerms(String whitespacePattern) throws Exception { testAutoRedaction(whitespacePattern, false, false, "#000000", 1.0f, false, true); } + + @Test + @DisplayName("Should handle null redact color gracefully") + void handleNullRedactColor() throws Exception { + RedactPdfRequest request = createRedactPdfRequest(); + request.setListOfText("test"); + request.setRedactColor(null); + + ResponseEntity response = redactController.redactPdf(request); + + assertNotNull(response); + assertEquals(200, response.getStatusCode().value()); + } + + @Test + @DisplayName("Should handle negative padding gracefully") + void handleNegativePadding() throws Exception { + testAutoRedaction("test", false, false, "#000000", -1.0f, false, true); + } + + @Test + @DisplayName("Should handle extremely large padding") + void handleExtremelyLargePadding() throws Exception { + testAutoRedaction("test", false, false, "#000000", 100.0f, false, true); + } + + @Test + @DisplayName("Should handle null manual redaction areas gracefully") + void handleNullManualRedactionAreas() throws Exception { + ManualRedactPdfRequest request = createManualRedactPdfRequest(); + request.setRedactions(null); + + ResponseEntity response = redactController.redactPDF(request); + + assertNotNull(response); + assertEquals(200, response.getStatusCode().value()); + } + + @Test + @DisplayName("Should handle out of bounds page numbers gracefully") + void handleOutOfBoundsPageNumbers() throws Exception { + ManualRedactPdfRequest request = createManualRedactPdfRequest(); + request.setPageNumbers("100-200"); + + ResponseEntity response = redactController.redactPDF(request); + + assertNotNull(response); + assertEquals(200, response.getStatusCode().value()); + } } @Nested @@ -765,14 +946,8 @@ class RedactControllerTest { Set targetWords = Set.of("confidential"); - List originalTokens = getOriginalTokens(); List filteredTokens = redactController.createTokensWithoutTargetText(realPage, targetWords, false, false); - long originalPositioning = originalTokens.stream() - .filter(token -> token instanceof Operator op && - (op.getName().equals("Td") || op.getName().equals("TD") || op.getName().equals("Tm"))) - .count(); - long filteredPositioning = filteredTokens.stream() .filter(token -> token instanceof Operator op && (op.getName().equals("Td") || op.getName().equals("TD") || op.getName().equals("Tm"))) @@ -781,6 +956,112 @@ class RedactControllerTest { assertTrue(filteredPositioning > 0, "Positioning operators should be preserved"); } + + @Test + @DisplayName("Should handle complex content streams with multiple operators") + void shouldHandleComplexContentStreams() throws Exception { + realPage = new PDPage(PDRectangle.A4); + while (realDocument.getNumberOfPages() > 0) { + realDocument.removePage(0); + } + realDocument.addPage(realPage); + realPage.setResources(new PDResources()); + realPage.getResources().put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA)); + + try (PDPageContentStream contentStream = new PDPageContentStream(realDocument, realPage)) { + contentStream.setLineWidth(2); + contentStream.moveTo(100, 100); + contentStream.lineTo(200, 200); + contentStream.stroke(); + + contentStream.beginText(); + contentStream.setFont(realPage.getResources().getFont(COSName.getPDFName("F1")), 12); + contentStream.newLineAtOffset(50, 750); + contentStream.showText("This is a complex document with "); + contentStream.setTextRise(5); + contentStream.showText("confidential"); + contentStream.setTextRise(0); + contentStream.showText(" information."); + contentStream.endText(); + + contentStream.addRect(300, 300, 100, 100); + contentStream.fill(); + } + + Set targetWords = Set.of("confidential"); + + List tokens = redactController.createTokensWithoutTargetText(realPage, targetWords, false, false); + + assertNotNull(tokens); + assertFalse(tokens.isEmpty()); + + String reconstructedText = extractTextFromTokens(tokens); + assertFalse(reconstructedText.contains("confidential"), "Target text should be redacted"); + + boolean hasGraphicsOperators = tokens.stream() + .anyMatch(token -> token instanceof Operator op && + (op.getName().equals("re") || op.getName().equals("f") || + op.getName().equals("m") || op.getName().equals("l") || + op.getName().equals("S"))); + + assertTrue(hasGraphicsOperators, "Graphics operators should be preserved"); + } + + @Test + @DisplayName("Should handle documents with multiple text blocks") + void shouldHandleDocumentsWithMultipleTextBlocks() throws Exception { + // Create a document with multiple text blocks + realPage = new PDPage(PDRectangle.A4); + while (realDocument.getNumberOfPages() > 0) { + realDocument.removePage(0); + } + realDocument.addPage(realPage); + + // Create resources + PDResources resources = new PDResources(); + resources.put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA)); + realPage.setResources(resources); + + try (PDPageContentStream contentStream = new PDPageContentStream(realDocument, realPage)) { + contentStream.beginText(); + contentStream.setFont(new PDType1Font(Standard14Fonts.FontName.HELVETICA), 12); + contentStream.newLineAtOffset(50, 750); + contentStream.showText("This is the first text block"); + contentStream.endText(); + + contentStream.setLineWidth(2); + contentStream.moveTo(100, 700); + contentStream.lineTo(200, 700); + contentStream.stroke(); + + contentStream.beginText(); + contentStream.setFont(new PDType1Font(Standard14Fonts.FontName.HELVETICA), 12); + contentStream.newLineAtOffset(50, 650); + contentStream.showText("This block contains confidential information"); + contentStream.endText(); + + contentStream.addRect(100, 600, 100, 50); + contentStream.fill(); + + contentStream.beginText(); + contentStream.setFont(new PDType1Font(Standard14Fonts.FontName.HELVETICA), 12); + contentStream.newLineAtOffset(50, 550); + contentStream.showText("This is the third text block"); + contentStream.endText(); + } + + RedactPdfRequest request = createRedactPdfRequest(); + request.setListOfText("confidential"); + request.setUseRegex(false); + request.setWholeWordSearch(false); + + ResponseEntity response = redactController.redactPdf(request); + + assertNotNull(response); + assertEquals(200, response.getStatusCode().value()); + assertNotNull(response.getBody()); + assertTrue(response.getBody().length > 0); + } } private RedactPdfRequest createRedactPdfRequest() { From 344602cba4596f37b4f3872a074e2051f9c6ff9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20Sz=C3=BCcs?= Date: Tue, 15 Jul 2025 17:53:07 +0200 Subject: [PATCH 09/13] refactor: remove unused `createTokensWithoutTargetText` method in RedactController --- .../SPDF/controller/api/security/RedactController.java | 9 --------- 1 file changed, 9 deletions(-) diff --git a/app/core/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java b/app/core/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java index 902af6f65..f6f383398 100644 --- a/app/core/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java +++ b/app/core/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java @@ -676,15 +676,6 @@ public class RedactController { return applyRedactionsToTokens(tokens, textSegments, matches); } - List createTokensWithoutTargetText( - PDPage page, Set targetWords, boolean useRegex, boolean wholeWordSearch) - throws IOException { - try (PDDocument tempDoc = new PDDocument()) { - return createTokensWithoutTargetText( - tempDoc, page, targetWords, useRegex, wholeWordSearch); - } - } - private void processPageXObjects( PDDocument document, PDResources resources, From a1e0e6f2fdd4d90c60abba67c9561cafc23ba0fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20Sz=C3=BCcs?= Date: Tue, 15 Jul 2025 18:11:31 +0200 Subject: [PATCH 10/13] refactor: simplify text search logic in RedactController and improve `TextFinder` page handling. Fix potential document close issue. --- .../api/security/RedactController.java | 42 ++++--------------- .../software/SPDF/pdf/TextFinder.java | 5 ++- 2 files changed, 12 insertions(+), 35 deletions(-) diff --git a/app/core/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java b/app/core/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java index f6f383398..9eb303a41 100644 --- a/app/core/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java +++ b/app/core/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java @@ -462,6 +462,7 @@ public class RedactController { if (fallbackDocument == null) { document.close(); } + document.close(); } catch (IOException e) { log.warn("Failed to close main document: {}", e.getMessage()); } @@ -486,43 +487,16 @@ public class RedactController { if (text.isEmpty()) continue; try { - int pageCount = document.getNumberOfPages(); - for (int pageIndex = 0; pageIndex < pageCount; pageIndex++) { - try (PDDocument singlePageDoc = new PDDocument()) { - PDPage page = document.getPage(pageIndex); - singlePageDoc.addPage(page); + TextFinder textFinder = new TextFinder(text, useRegex, wholeWordSearch); + textFinder.getText(document); - TextFinder pageFinder = new TextFinder(text, useRegex, wholeWordSearch); - - pageFinder.getText(singlePageDoc); - - for (PDFText found : pageFinder.getFoundTexts()) { - PDFText adjustedText = - new PDFText( - pageIndex, - found.getX1(), - found.getY1(), - found.getX2(), - found.getY2(), - found.getText()); - - allFoundTextsByPage - .computeIfAbsent(pageIndex, k -> new ArrayList<>()) - .add(adjustedText); - } - } catch (Exception e) { - log.error( - "Error processing page {} for search term '{}': {}", - pageIndex, - text, - e.getMessage()); - } + for (PDFText found : textFinder.getFoundTexts()) { + allFoundTextsByPage + .computeIfAbsent(found.getPageIndex(), k -> new ArrayList<>()) + .add(found); } } catch (Exception e) { - log.error( - "Error initializing TextFinder for search term '{}': {}", - text, - e.getMessage()); + log.error("Error processing search term '{}': {}", text, e.getMessage()); } } diff --git a/app/core/src/main/java/stirling/software/SPDF/pdf/TextFinder.java b/app/core/src/main/java/stirling/software/SPDF/pdf/TextFinder.java index 6efb5fde6..69b4ddc42 100644 --- a/app/core/src/main/java/stirling/software/SPDF/pdf/TextFinder.java +++ b/app/core/src/main/java/stirling/software/SPDF/pdf/TextFinder.java @@ -57,9 +57,10 @@ public class TextFinder extends PDFTextStripper { } @Override - protected void endPage(PDPage page) { + protected void endPage(PDPage page) throws IOException { String text = pageTextBuilder.toString(); if (text.isEmpty() || this.searchTerm == null || this.searchTerm.isEmpty()) { + super.endPage(page); return; } @@ -107,6 +108,8 @@ public class TextFinder extends PDFTextStripper { matcher.group())); } } + + super.endPage(page); } public List getFoundTexts() { From 7a9f96217245a77569d15680f895f8b639ccbdfe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20Sz=C3=BCcs?= Date: Tue, 15 Jul 2025 20:55:29 +0200 Subject: [PATCH 11/13] feat: improve placeholder generation logic and custom font handling in RedactController --- .../api/security/RedactController.java | 353 ++++++++++++++++-- 1 file changed, 320 insertions(+), 33 deletions(-) diff --git a/app/core/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java b/app/core/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java index 9eb303a41..399c4adbf 100644 --- a/app/core/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java +++ b/app/core/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java @@ -32,6 +32,9 @@ import org.apache.pdfbox.pdmodel.PDResources; import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.common.PDStream; import org.apache.pdfbox.pdmodel.font.PDFont; +import org.apache.pdfbox.pdmodel.font.PDSimpleFont; +import org.apache.pdfbox.pdmodel.font.encoding.DictionaryEncoding; +import org.apache.pdfbox.pdmodel.font.encoding.Encoding; import org.apache.pdfbox.pdmodel.graphics.PDXObject; import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject; import org.springframework.http.ResponseEntity; @@ -276,14 +279,133 @@ public class RedactController { } } - String createPlaceholder(String originalWord) { + String createPlaceholderWithFont(String originalWord, PDFont font) { if (originalWord == null || originalWord.isEmpty()) { return originalWord; } + if (font != null && isFontSubset(font.getName())) { + try { + float originalWidth = safeGetStringWidth(font, originalWord) / FONT_SCALE_FACTOR; + return createAlternativePlaceholder(originalWord, originalWidth, font, 1.0f); + } catch (Exception e) { + log.debug( + "Subset font placeholder creation failed for {}: {}", + font.getName(), + e.getMessage()); + return ""; + } + } + return " ".repeat(originalWord.length()); } + String createPlaceholderWithWidth( + String originalWord, float targetWidth, PDFont font, float fontSize) { + if (originalWord == null || originalWord.isEmpty()) { + return originalWord; + } + + if (font == null || fontSize <= 0) { + return " ".repeat(originalWord.length()); + } + + try { + if (isFontSubset(font.getName())) { + return createSubsetFontPlaceholder(originalWord, targetWidth, font, fontSize); + } + + float spaceWidth = safeGetStringWidth(font, " ") / FONT_SCALE_FACTOR * fontSize; + + if (spaceWidth <= 0) { + return createAlternativePlaceholder(originalWord, targetWidth, font, fontSize); + } + + int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth)); + + int maxSpaces = originalWord.length() * 2; + spaceCount = Math.min(spaceCount, maxSpaces); + + return " ".repeat(spaceCount); + + } catch (Exception e) { + log.debug("Width-based placeholder creation failed: {}", e.getMessage()); + return createAlternativePlaceholder(originalWord, targetWidth, font, fontSize); + } + } + + private String createSubsetFontPlaceholder( + String originalWord, float targetWidth, PDFont font, float fontSize) { + try { + log.debug("Subset font {} - trying to find replacement characters", font.getName()); + String result = createAlternativePlaceholder(originalWord, targetWidth, font, fontSize); + + if (result.isEmpty()) { + log.debug( + "Subset font {} has no suitable replacement characters, using empty string", + font.getName()); + } + + return result; + + } catch (Exception e) { + log.debug("Subset font placeholder creation failed: {}", e.getMessage()); + return ""; + } + } + + private String createAlternativePlaceholder( + String originalWord, float targetWidth, PDFont font, float fontSize) { + try { + String[] alternatives = {" ", ".", "-", "_", "~", "°", "·"}; + + if (fontSupportsCharacter(font, " ")) { + float spaceWidth = safeGetStringWidth(font, " ") / FONT_SCALE_FACTOR * fontSize; + if (spaceWidth > 0) { + int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth)); + int maxSpaces = originalWord.length() * 2; + spaceCount = Math.min(spaceCount, maxSpaces); + log.debug("Using spaces for font {}", font.getName()); + return " ".repeat(spaceCount); + } + } + + for (String altChar : alternatives) { + if (altChar.equals(" ")) continue; // Already tried spaces + + try { + if (!fontSupportsCharacter(font, altChar)) { + continue; + } + + float charWidth = + safeGetStringWidth(font, altChar) / FONT_SCALE_FACTOR * fontSize; + if (charWidth > 0) { + int charCount = Math.max(1, Math.round(targetWidth / charWidth)); + int maxChars = originalWord.length() * 2; + charCount = Math.min(charCount, maxChars); + log.debug( + "Using character '{}' for width calculation but spaces for placeholder in font {}", + altChar, + font.getName()); + + return " ".repeat(charCount); + } + } catch (Exception e) { + } + } + + log.debug( + "All placeholder alternatives failed for font {}, using empty string", + font.getName()); + return ""; + + } catch (Exception e) { + log.debug("Alternative placeholder creation failed: {}", e.getMessage()); + return ""; + } + } + void writeFilteredContentStream(PDDocument document, PDPage page, List tokens) throws IOException { @@ -515,8 +637,8 @@ public class RedactController { if (detectCustomEncodingFonts(document)) { log.warn( - "Problematic fonts detected (custom encodings / Type3 / damaged). " - + "Skipping inline text replacement and using box-only redaction for safety."); + "Custom encoded fonts detected (non-standard encodings / DictionaryEncoding / damaged fonts). " + + "Text replacement is unreliable for these fonts. Falling back to box-only redaction mode."); return true; // signal caller to fall back } @@ -527,13 +649,15 @@ public class RedactController { .filter(s -> !s.isEmpty()) .collect(Collectors.toSet()); + int pageCount = 0; for (PDPage page : document.getPages()) { + pageCount++; List filteredTokens = createTokensWithoutTargetText( document, page, allSearchTerms, useRegex, wholeWordSearchBool); writeFilteredContentStream(document, page, filteredTokens); } - log.info("Successfully performed text replacement redaction."); + log.info("Successfully performed text replacement redaction on {} pages.", pageCount); return false; } catch (Exception e) { log.error( @@ -840,7 +964,31 @@ public class RedactController { int segmentEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos()); if (segmentStart < text.length() && segmentEnd > segmentStart) { - String placeholder = createPlaceholder(text.substring(segmentStart, segmentEnd)); + String originalPart = text.substring(segmentStart, segmentEnd); + + float originalWidth = 0; + if (segment.getFont() != null && segment.getFontSize() > 0) { + try { + originalWidth = + safeGetStringWidth(segment.getFont(), originalPart) + / FONT_SCALE_FACTOR + * segment.getFontSize(); + } catch (Exception e) { + log.debug( + "Failed to calculate original width for placeholder: {}", + e.getMessage()); + } + } + + String placeholder = + (originalWidth > 0) + ? createPlaceholderWithWidth( + originalPart, + originalWidth, + segment.getFont(), + segment.getFontSize()) + : createPlaceholderWithFont(originalPart, segment.getFont()); + result.replace(segmentStart, segmentEnd, placeholder); } } @@ -938,7 +1086,18 @@ public class RedactController { if (segStart < text.length() && segEnd > segStart) { String originalPart = text.substring(segStart, segEnd); - String placeholderPart = createPlaceholder(originalPart); + + float originalWidth = + safeGetStringWidth(segment.getFont(), originalPart) + / FONT_SCALE_FACTOR + * segment.getFontSize(); + + String placeholderPart = + createPlaceholderWithWidth( + originalPart, + originalWidth, + segment.getFont(), + segment.getFontSize()); float origUnits = safeGetStringWidth(segment.getFont(), originalPart); float placeUnits = safeGetStringWidth(segment.getFont(), placeholderPart); @@ -953,7 +1112,12 @@ public class RedactController { float adjustment = totalOriginal - totalPlaceholder; - float maxReasonableAdjustment = segment.getText().length() * segment.getFontSize() * 2; + float maxReasonableAdjustment = + Math.max( + segment.getText().length() * segment.getFontSize() * 2, + totalOriginal * 1.5f // Allow up to 50% more than original width + ); + if (Math.abs(adjustment) > maxReasonableAdjustment) { log.debug( "Width adjustment {} seems unreasonable for text length {}, capping to 0", @@ -1048,11 +1212,34 @@ public class RedactController { int redactionEndInString = overlapEnd - stringStartInPage; if (redactionStartInString >= 0 && redactionEndInString <= originalText.length()) { + String originalPart = + originalText.substring( + redactionStartInString, redactionEndInString); + + float originalWidth = 0; + if (segment.getFont() != null && segment.getFontSize() > 0) { + try { + originalWidth = + safeGetStringWidth(segment.getFont(), originalPart) + / FONT_SCALE_FACTOR + * segment.getFontSize(); + } catch (Exception e) { + log.debug( + "Failed to calculate original width for TJ placeholder: {}", + e.getMessage()); + } + } + String placeholder = - createPlaceholder( - originalText.substring( - redactionStartInString, - redactionEndInString)); + (originalWidth > 0) + ? createPlaceholderWithWidth( + originalPart, + originalWidth, + segment.getFont(), + segment.getFontSize()) + : createPlaceholderWithFont( + originalPart, segment.getFont()); + newText.replace( redactionStartInString, redactionEndInString, placeholder); } @@ -1130,6 +1317,10 @@ public class RedactController { return false; } + int totalFonts = 0; + int customEncodedFonts = 0; + int subsetFonts = 0; + for (PDPage page : document.getPages()) { PDResources resources = page.getResources(); if (resources == null) { @@ -1139,23 +1330,42 @@ public class RedactController { for (COSName fontName : resources.getFontNames()) { try { PDFont font = resources.getFont(fontName); - if (font != null && hasProblematicFontCharacteristics(font)) { - log.debug( - "Detected problematic font: {} (type: {})", - font.getName(), - font.getClass().getSimpleName()); - return true; + if (font != null) { + totalFonts++; + + boolean isSubset = isFontSubset(font.getName()); + boolean isProblematic = hasProblematicFontCharacteristics(font); + + if (isSubset) { + subsetFonts++; + } + + if (isProblematic) { + customEncodedFonts++; + log.debug( + "Detected problematic font: {} (type: {})", + font.getName(), + font.getClass().getSimpleName()); + } } } catch (IOException e) { log.debug( "Font loading failed for {}: {}", fontName.getName(), e.getMessage()); - return true; + customEncodedFonts++; } } } - return false; + + log.info( + "Font analysis: {}/{} fonts use custom encoding, {}/{} are subset fonts (subset fonts with standard encodings are fine)", + customEncodedFonts, + totalFonts, + subsetFonts, + totalFonts); + + return customEncodedFonts > 0; } catch (Exception e) { log.warn("Font detection analysis failed: {}", e.getMessage()); return false; @@ -1169,24 +1379,89 @@ public class RedactController { return true; } - String fontName = font.getName(); - if (isFontSubset(fontName)) { - if (hasKnownProblematicPattern(fontName)) { - return cannotCalculateBasicWidths(font); - } - return false; + if (hasCustomEncoding(font)) { + log.debug( + "Font {} uses custom encoding - text replacement will be unreliable", + font.getName()); + return true; } String fontType = font.getClass().getSimpleName(); if ("PDType3Font".equals(fontType)) { + log.debug("Font {} is Type3 - may have text replacement issues", font.getName()); return cannotCalculateBasicWidths(font); } + log.debug("Font {} appears suitable for text replacement", font.getName()); return false; } catch (Exception e) { log.debug("Font analysis failed for {}: {}", font.getName(), e.getMessage()); - return true; + return false; + } + } + + private boolean hasCustomEncoding(PDFont font) { + try { + if (font instanceof PDSimpleFont simpleFont) { + try { + Encoding encoding = simpleFont.getEncoding(); + if (encoding != null) { + String encodingName = encoding.getEncodingName(); + + // Check if it's one of the standard encodings + if ("WinAnsiEncoding".equals(encodingName) + || "MacRomanEncoding".equals(encodingName) + || "StandardEncoding".equals(encodingName) + || "MacExpertEncoding".equals(encodingName) + || "SymbolEncoding".equals(encodingName) + || "ZapfDingbatsEncoding".equals(encodingName)) { + + log.debug( + "Font {} uses standard encoding: {}", + font.getName(), + encodingName); + return false; + } + + if (encoding instanceof DictionaryEncoding) { + log.debug( + "Font {} uses DictionaryEncoding - likely custom", + font.getName()); + return true; + } + + log.debug( + "Font {} uses non-standard encoding: {}", + font.getName(), + encodingName); + return true; + } + } catch (Exception e) { + log.debug( + "Could not determine encoding for font {}: {}", + font.getName(), + e.getMessage()); + } + } + + if (font instanceof org.apache.pdfbox.pdmodel.font.PDType0Font) { + log.debug("Font {} is Type0 (CID) - generally uses standard CMaps", font.getName()); + return false; // Be forgiving with CID fonts + } + + log.debug( + "Font {} type {} - assuming standard encoding", + font.getName(), + font.getClass().getSimpleName()); + return false; + + } catch (Exception e) { + log.debug( + "Custom encoding detection failed for font {}: {}", + font.getName(), + e.getMessage()); + return false; // Be forgiving on detection failure } } @@ -1221,16 +1496,28 @@ public class RedactController { return fontName.matches("^[A-Z]{6}\\+.*"); } - private boolean hasKnownProblematicPattern(String fontName) { - if (fontName == null) { + private boolean fontSupportsCharacter(PDFont font, String character) { + if (font == null || character == null || character.isEmpty()) { return false; } - return fontName.contains("HOEPAP") - || fontName.contains("HOEPGL") - || fontName.contains("HOEPNL") - || fontName.toLowerCase().contains("corrupt") - || fontName.toLowerCase().contains("damaged"); + try { + byte[] encoded = font.encode(character); + if (encoded.length == 0) { + return false; + } + + float width = font.getStringWidth(character); + return width > 0; + + } catch (Exception e) { + log.debug( + "Character '{}' not supported by font {}: {}", + character, + font.getName(), + e.getMessage()); + return false; + } } private void processFormXObject( From 6315721e8f6e3f50819e77d13be5841a4fc67c38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20Sz=C3=BCcs?= Date: Fri, 18 Jul 2025 18:50:17 +0200 Subject: [PATCH 12/13] feat: add TextFinderUtils and WidthCalculator for text processing and font validation, TextEncodingHelper for encoding support --- .../api/security/RedactController.java | 502 ++++++++++-------- .../software/SPDF/pdf/TextFinder.java | 100 +++- .../SPDF/utils/text/TextEncodingHelper.java | 351 ++++++++++++ .../SPDF/utils/text/TextFinderUtils.java | 140 +++++ .../SPDF/utils/text/WidthCalculator.java | 136 +++++ .../software/SPDF/pdf/TextFinderTest.java | 107 +++- 6 files changed, 1106 insertions(+), 230 deletions(-) create mode 100644 app/core/src/main/java/stirling/software/SPDF/utils/text/TextEncodingHelper.java create mode 100644 app/core/src/main/java/stirling/software/SPDF/utils/text/TextFinderUtils.java create mode 100644 app/core/src/main/java/stirling/software/SPDF/utils/text/WidthCalculator.java diff --git a/app/core/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java b/app/core/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java index 399c4adbf..296108516 100644 --- a/app/core/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java +++ b/app/core/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java @@ -32,9 +32,6 @@ import org.apache.pdfbox.pdmodel.PDResources; import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.common.PDStream; import org.apache.pdfbox.pdmodel.font.PDFont; -import org.apache.pdfbox.pdmodel.font.PDSimpleFont; -import org.apache.pdfbox.pdmodel.font.encoding.DictionaryEncoding; -import org.apache.pdfbox.pdmodel.font.encoding.Encoding; import org.apache.pdfbox.pdmodel.graphics.PDXObject; import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject; import org.springframework.http.ResponseEntity; @@ -59,6 +56,9 @@ import stirling.software.SPDF.model.PDFText; import stirling.software.SPDF.model.api.security.ManualRedactPdfRequest; import stirling.software.SPDF.model.api.security.RedactPdfRequest; import stirling.software.SPDF.pdf.TextFinder; +import stirling.software.SPDF.utils.text.TextEncodingHelper; +import stirling.software.SPDF.utils.text.TextFinderUtils; +import stirling.software.SPDF.utils.text.WidthCalculator; import stirling.software.common.model.api.security.RedactionArea; import stirling.software.common.service.CustomPDFDocumentFactory; import stirling.software.common.util.GeneralUtils; @@ -77,6 +77,9 @@ public class RedactController { private static final float PRECISION_THRESHOLD = 1e-3f; private static final int FONT_SCALE_FACTOR = 1000; + // Redaction box width reduction factor (10% reduction) + private static final float REDACTION_WIDTH_REDUCTION_FACTOR = 0.9f; + // Text showing operators private static final Set TEXT_SHOWING_OPERATORS = Set.of("Tj", "TJ", "'", "\""); @@ -229,7 +232,11 @@ public class RedactController { } private void redactFoundText( - PDDocument document, List blocks, float customPadding, Color redactColor) + PDDocument document, + List blocks, + float customPadding, + Color redactColor, + boolean isTextRemovalMode) throws IOException { var allPages = document.getDocumentCatalog().getPages(); @@ -263,10 +270,28 @@ public class RedactController { (block.getY2() - block.getY1()) * DEFAULT_TEXT_PADDING_MULTIPLIER + customPadding; + float originalWidth = block.getX2() - block.getX1(); + float boxWidth; + float boxX; + + // Only apply width reduction when text is actually being removed + if (isTextRemovalMode) { + // Calculate reduced width and center the box + boxWidth = + originalWidth + * REDACTION_WIDTH_REDUCTION_FACTOR; // 10% reduction + float widthReduction = originalWidth - boxWidth; + boxX = block.getX1() + (widthReduction / 2); // Center the reduced box + } else { + // Use original width for box-only redaction + boxWidth = originalWidth; + boxX = block.getX1(); + } + contentStream.addRect( - block.getX1(), + boxX, pageBox.getHeight() - block.getY2() - padding, - block.getX2() - block.getX1(), + boxWidth, block.getY2() - block.getY1() + 2 * padding); } @@ -284,7 +309,7 @@ public class RedactController { return originalWord; } - if (font != null && isFontSubset(font.getName())) { + if (font != null && TextEncodingHelper.isFontSubset(font.getName())) { try { float originalWidth = safeGetStringWidth(font, originalWord) / FONT_SCALE_FACTOR; return createAlternativePlaceholder(originalWord, originalWidth, font, 1.0f); @@ -300,6 +325,10 @@ public class RedactController { return " ".repeat(originalWord.length()); } + /** + * Enhanced placeholder creation using advanced width calculation. Incorporates font validation + * and sophisticated fallback strategies. + */ String createPlaceholderWithWidth( String originalWord, float targetWidth, PDFont font, float fontSize) { if (originalWord == null || originalWord.isEmpty()) { @@ -311,11 +340,21 @@ public class RedactController { } try { - if (isFontSubset(font.getName())) { + // Check font reliability before proceeding + if (!WidthCalculator.isWidthCalculationReliable(font)) { + log.debug( + "Font {} unreliable for width calculation, using simple placeholder", + font.getName()); + return " ".repeat(originalWord.length()); + } + + // Use enhanced subset font detection + if (TextEncodingHelper.isFontSubset(font.getName())) { return createSubsetFontPlaceholder(originalWord, targetWidth, font, fontSize); } - float spaceWidth = safeGetStringWidth(font, " ") / FONT_SCALE_FACTOR * fontSize; + // Enhanced space width calculation + float spaceWidth = WidthCalculator.calculateAccurateWidth(font, " ", fontSize); if (spaceWidth <= 0) { return createAlternativePlaceholder(originalWord, targetWidth, font, fontSize); @@ -323,13 +362,16 @@ public class RedactController { int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth)); - int maxSpaces = originalWord.length() * 2; + // More conservative space limit based on original word characteristics + int maxSpaces = + Math.max( + originalWord.length() * 2, Math.round(targetWidth / spaceWidth * 1.5f)); spaceCount = Math.min(spaceCount, maxSpaces); return " ".repeat(spaceCount); } catch (Exception e) { - log.debug("Width-based placeholder creation failed: {}", e.getMessage()); + log.debug("Enhanced placeholder creation failed: {}", e.getMessage()); return createAlternativePlaceholder(originalWord, targetWidth, font, fontSize); } } @@ -359,7 +401,7 @@ public class RedactController { try { String[] alternatives = {" ", ".", "-", "_", "~", "°", "·"}; - if (fontSupportsCharacter(font, " ")) { + if (TextEncodingHelper.fontSupportsCharacter(font, " ")) { float spaceWidth = safeGetStringWidth(font, " ") / FONT_SCALE_FACTOR * fontSize; if (spaceWidth > 0) { int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth)); @@ -371,10 +413,10 @@ public class RedactController { } for (String altChar : alternatives) { - if (altChar.equals(" ")) continue; // Already tried spaces + if (" ".equals(altChar)) continue; // Already tried spaces try { - if (!fontSupportsCharacter(font, altChar)) { + if (!TextEncodingHelper.fontSupportsCharacter(font, altChar)) { continue; } @@ -546,7 +588,8 @@ public class RedactController { allFoundTextsByPage, request.getRedactColor(), request.getCustomPadding(), - request.getConvertPDFToImage()); + request.getConvertPDFToImage(), + false); // Box-only mode, use original box sizes return WebResponseUtils.bytesToWebResponse( pdfContent, @@ -564,7 +607,8 @@ public class RedactController { allFoundTextsByPage, request.getRedactColor(), request.getCustomPadding(), - request.getConvertPDFToImage()); + request.getConvertPDFToImage(), + true); // Text removal mode, use reduced box sizes return WebResponseUtils.bytesToWebResponse( pdfContent, @@ -608,14 +652,31 @@ public class RedactController { text = text.trim(); if (text.isEmpty()) continue; + log.debug( + "Searching for text: '{}' (regex: {}, wholeWord: {})", + text, + useRegex, + wholeWordSearch); + try { TextFinder textFinder = new TextFinder(text, useRegex, wholeWordSearch); textFinder.getText(document); - for (PDFText found : textFinder.getFoundTexts()) { + List foundTexts = textFinder.getFoundTexts(); + log.debug("TextFinder found {} instances of '{}'", foundTexts.size(), text); + + for (PDFText found : foundTexts) { allFoundTextsByPage .computeIfAbsent(found.getPageIndex(), k -> new ArrayList<>()) .add(found); + log.debug( + "Added match on page {} at ({},{},{},{}): '{}'", + found.getPageIndex(), + found.getX1(), + found.getY1(), + found.getX2(), + found.getY2(), + found.getText()); } } catch (Exception e) { log.error("Error processing search term '{}': {}", text, e.getMessage()); @@ -673,7 +734,8 @@ public class RedactController { Map> allFoundTextsByPage, String colorString, float customPadding, - Boolean convertToImage) + Boolean convertToImage, + boolean isTextRemovalMode) throws IOException { List allFoundTexts = new ArrayList<>(); @@ -684,7 +746,7 @@ public class RedactController { if (!allFoundTexts.isEmpty()) { Color redactColor = decodeOrDefault(colorString); - redactFoundText(document, allFoundTexts, customPadding, redactColor); + redactFoundText(document, allFoundTexts, customPadding, redactColor, isTextRemovalMode); cleanDocumentMetadata(document); } @@ -870,16 +932,24 @@ public class RedactController { boolean useRegex, boolean wholeWordSearch) { - return targetWords.stream() - .map( - target -> { - String patternString = useRegex ? target : Pattern.quote(target); - if (wholeWordSearch) { - patternString = "\\b" + patternString + "\\b"; + // Use the new utility for creating optimized patterns + List patterns = + TextFinderUtils.createOptimizedSearchPatterns( + targetWords, useRegex, wholeWordSearch); + + return patterns.stream() + .flatMap( + pattern -> { + try { + return pattern.matcher(completeText).results(); + } catch (Exception e) { + log.debug( + "Pattern matching failed for pattern {}: {}", + pattern.pattern(), + e.getMessage()); + return java.util.stream.Stream.empty(); } - return Pattern.compile(patternString, Pattern.CASE_INSENSITIVE); }) - .flatMap(pattern -> pattern.matcher(completeText).results()) .map(matchResult -> new MatchRange(matchResult.start(), matchResult.end())) .sorted(Comparator.comparingInt(MatchRange::getStartPos)) .collect(Collectors.toList()); @@ -957,6 +1027,16 @@ public class RedactController { private String applyRedactionsToSegmentText(TextSegment segment, List matches) { String text = segment.getText(); + + if (segment.getFont() != null + && !TextEncodingHelper.isTextSegmentRemovable(segment.getFont(), text)) { + log.debug( + "Skipping text segment '{}' - font {} cannot process this text reliably", + text, + segment.getFont().getName()); + return text; // Return original text unchanged + } + StringBuilder result = new StringBuilder(text); for (MatchRange match : matches) { @@ -966,6 +1046,15 @@ public class RedactController { if (segmentStart < text.length() && segmentEnd > segmentStart) { String originalPart = text.substring(segmentStart, segmentEnd); + if (segment.getFont() != null + && !TextEncodingHelper.isTextSegmentRemovable( + segment.getFont(), originalPart)) { + log.debug( + "Skipping text part '{}' within segment - cannot be processed reliably", + originalPart); + continue; // Skip this match, process others + } + float originalWidth = 0; if (segment.getFont() != null && segment.getFontSize() > 0) { try { @@ -1001,68 +1090,125 @@ public class RedactController { return 0; } + if (!WidthCalculator.isWidthCalculationReliable(font)) { + log.debug( + "Font {} flagged as unreliable for width calculation, using fallback", + font.getName()); + return calculateConservativeWidth(font, text); + } + + if (!TextEncodingHelper.canEncodeCharacters(font, text)) { + log.debug( + "Text cannot be encoded by font {}, using character-based fallback", + font.getName()); + return calculateCharacterBasedWidth(font, text); + } + try { - return font.getStringWidth(text); + float width = font.getStringWidth(text); + log.debug("Direct width calculation successful for '{}': {}", text, width); + return width; + } catch (Exception e) { - try { - float totalWidth = 0; - for (int i = 0; i < text.length(); i++) { - String character = text.substring(i, i + 1); - try { - byte[] encoded = font.encode(character); - if (encoded.length > 0) { - int glyphCode = encoded[0] & 0xFF; + log.debug( + "Direct width calculation failed for font {}: {}", + font.getName(), + e.getMessage()); + return calculateFallbackWidth(font, text); + } + } - float glyphWidth = font.getWidth(glyphCode); + private float calculateCharacterBasedWidth(PDFont font, String text) { + try { + float totalWidth = 0; + for (int i = 0; i < text.length(); i++) { + String character = text.substring(i, i + 1); + try { + // Validate character encoding first + if (!TextEncodingHelper.fontSupportsCharacter(font, character)) { + totalWidth += font.getAverageFontWidth(); + continue; + } - if (glyphWidth == 0) { - try { - glyphWidth = font.getWidthFromFont(glyphCode); - } catch (Exception e2) { - glyphWidth = font.getAverageFontWidth(); - } + byte[] encoded = font.encode(character); + if (encoded.length > 0) { + int glyphCode = encoded[0] & 0xFF; + float glyphWidth = font.getWidth(glyphCode); + + // Try alternative width methods if primary fails + if (glyphWidth == 0) { + try { + glyphWidth = font.getWidthFromFont(glyphCode); + } catch (Exception e2) { + glyphWidth = font.getAverageFontWidth(); } - - totalWidth += glyphWidth; - } else { - totalWidth += font.getAverageFontWidth(); } - } catch (Exception e2) { + + totalWidth += glyphWidth; + } else { totalWidth += font.getAverageFontWidth(); } + } catch (Exception e2) { + // Character processing failed, use average width + totalWidth += font.getAverageFontWidth(); } - return totalWidth; - } catch (Exception e2) { - log.debug("PDFBox API width calculation failed: {}", e2.getMessage()); } - try { - if (font.getFontDescriptor() != null - && font.getFontDescriptor().getFontBoundingBox() != null) { - PDRectangle bbox = font.getFontDescriptor().getFontBoundingBox(); - float avgCharWidth = bbox.getHeight() / 1000f * 0.865f; - return text.length() * avgCharWidth * FONT_SCALE_FACTOR; - } - } catch (Exception e2) { - log.debug("Font bounding box width calculation failed: {}", e2.getMessage()); + log.debug("Character-based width calculation: {}", totalWidth); + return totalWidth; + + } catch (Exception e) { + log.debug("Character-based width calculation failed: {}", e.getMessage()); + return calculateConservativeWidth(font, text); + } + } + + private float calculateFallbackWidth(PDFont font, String text) { + try { + // Method 1: Font bounding box approach + if (font.getFontDescriptor() != null + && font.getFontDescriptor().getFontBoundingBox() != null) { + + PDRectangle bbox = font.getFontDescriptor().getFontBoundingBox(); + float avgCharWidth = bbox.getWidth() * 0.6f; // Conservative estimate + float fallbackWidth = text.length() * avgCharWidth; + + log.debug("Bounding box fallback width: {}", fallbackWidth); + return fallbackWidth; } + // Method 2: Average font width try { float avgWidth = font.getAverageFontWidth(); - return text.length() * avgWidth; + if (avgWidth > 0) { + float fallbackWidth = text.length() * avgWidth; + log.debug("Average width fallback: {}", fallbackWidth); + return fallbackWidth; + } } catch (Exception e2) { log.debug("Average font width calculation failed: {}", e2.getMessage()); } - float conservativeWidth = text.length() * 500f; // 500 units per character - log.debug( - "All width calculation methods failed for font {}, using conservative estimate: {}", - font.getName(), - conservativeWidth); - return conservativeWidth; + // Method 3: Conservative estimate based on font metrics + return calculateConservativeWidth(font, text); + + } catch (Exception e) { + log.debug("Fallback width calculation failed: {}", e.getMessage()); + return calculateConservativeWidth(font, text); } } + private float calculateConservativeWidth(PDFont font, String text) { + float conservativeWidth = text.length() * 500f; + + log.debug( + "Conservative width estimate for font {} text '{}': {}", + font.getName(), + text, + conservativeWidth); + return conservativeWidth; + } + private float calculateWidthAdjustment(TextSegment segment, List matches) { try { if (segment.getFont() == null || segment.getFontSize() <= 0) { @@ -1070,7 +1216,8 @@ public class RedactController { } String fontName = segment.getFont().getName(); - if (fontName != null && (fontName.contains("HOEPAP") || isFontSubset(fontName))) { + if (fontName != null + && (fontName.contains("HOEPAP") || TextEncodingHelper.isFontSubset(fontName))) { log.debug("Skipping width adjustment for problematic/subset font: {}", fontName); return 0; } @@ -1196,6 +1343,19 @@ public class RedactController { for (COSBase element : originalArray) { if (element instanceof COSString cosString) { String originalText = cosString.getString(); + + if (segment.getFont() != null + && !TextEncodingHelper.isTextSegmentRemovable( + segment.getFont(), originalText)) { + log.debug( + "Skipping TJ text part '{}' - cannot be processed reliably with font {}", + originalText, + segment.getFont().getName()); + newArray.add(element); // Keep original unchanged + textOffsetInSegment += originalText.length(); + continue; + } + StringBuilder newText = new StringBuilder(originalText); boolean modified = false; @@ -1207,7 +1367,6 @@ public class RedactController { int overlapEnd = Math.min(match.getEndPos(), stringEndInPage); if (overlapStart < overlapEnd) { - modified = true; int redactionStartInString = overlapStart - stringStartInPage; int redactionEndInString = overlapEnd - stringStartInPage; if (redactionStartInString >= 0 @@ -1216,6 +1375,16 @@ public class RedactController { originalText.substring( redactionStartInString, redactionEndInString); + if (segment.getFont() != null + && !TextEncodingHelper.isTextSegmentRemovable( + segment.getFont(), originalPart)) { + log.debug( + "Skipping TJ text part '{}' - cannot be redacted reliably", + originalPart); + continue; // Skip this redaction, keep original text + } + + modified = true; float originalWidth = 0; if (segment.getFont() != null && segment.getFontSize() > 0) { try { @@ -1320,8 +1489,13 @@ public class RedactController { int totalFonts = 0; int customEncodedFonts = 0; int subsetFonts = 0; + int unreliableFonts = 0; for (PDPage page : document.getPages()) { + if (TextFinderUtils.hasProblematicFonts(page)) { + log.debug("Page contains fonts flagged as problematic by TextFinderUtils"); + } + PDResources resources = page.getResources(); if (resources == null) { continue; @@ -1333,190 +1507,64 @@ public class RedactController { if (font != null) { totalFonts++; - boolean isSubset = isFontSubset(font.getName()); - boolean isProblematic = hasProblematicFontCharacteristics(font); + // Enhanced analysis using helper classes + boolean isSubset = TextEncodingHelper.isFontSubset(font.getName()); + boolean hasCustomEncoding = TextEncodingHelper.hasCustomEncoding(font); + boolean isReliable = WidthCalculator.isWidthCalculationReliable(font); + boolean canCalculateWidths = + TextEncodingHelper.canCalculateBasicWidths(font); if (isSubset) { subsetFonts++; } - if (isProblematic) { + if (hasCustomEncoding) { customEncodedFonts++; + log.debug("Font {} has custom encoding", font.getName()); + } + + if (!isReliable || !canCalculateWidths) { + unreliableFonts++; log.debug( - "Detected problematic font: {} (type: {})", + "Font {} flagged as unreliable: reliable={}, canCalculateWidths={}", font.getName(), - font.getClass().getSimpleName()); + isReliable, + canCalculateWidths); + } + + if (!TextFinderUtils.validateFontReliability(font)) { + log.debug( + "Font {} failed comprehensive reliability check", + font.getName()); } } - } catch (IOException e) { + } catch (Exception e) { log.debug( - "Font loading failed for {}: {}", + "Font loading/analysis failed for {}: {}", fontName.getName(), e.getMessage()); customEncodedFonts++; + unreliableFonts++; + totalFonts++; } } } log.info( - "Font analysis: {}/{} fonts use custom encoding, {}/{} are subset fonts (subset fonts with standard encodings are fine)", + "Enhanced font analysis: {}/{} custom encoding, {}/{} subset, {}/{} unreliable fonts", customEncodedFonts, totalFonts, subsetFonts, + totalFonts, + unreliableFonts, totalFonts); - return customEncodedFonts > 0; - } catch (Exception e) { - log.warn("Font detection analysis failed: {}", e.getMessage()); - return false; - } - } - - private boolean hasProblematicFontCharacteristics(PDFont font) { - try { - if (font.isDamaged()) { - log.debug("Font {} is marked as damaged by PDFBox", font.getName()); - return true; - } - - if (hasCustomEncoding(font)) { - log.debug( - "Font {} uses custom encoding - text replacement will be unreliable", - font.getName()); - return true; - } - - String fontType = font.getClass().getSimpleName(); - if ("PDType3Font".equals(fontType)) { - log.debug("Font {} is Type3 - may have text replacement issues", font.getName()); - return cannotCalculateBasicWidths(font); - } - - log.debug("Font {} appears suitable for text replacement", font.getName()); - return false; + // Consider document problematic if we have custom encodings or unreliable fonts + return customEncodedFonts > 0 || unreliableFonts > 0; } catch (Exception e) { - log.debug("Font analysis failed for {}: {}", font.getName(), e.getMessage()); - return false; - } - } - - private boolean hasCustomEncoding(PDFont font) { - try { - if (font instanceof PDSimpleFont simpleFont) { - try { - Encoding encoding = simpleFont.getEncoding(); - if (encoding != null) { - String encodingName = encoding.getEncodingName(); - - // Check if it's one of the standard encodings - if ("WinAnsiEncoding".equals(encodingName) - || "MacRomanEncoding".equals(encodingName) - || "StandardEncoding".equals(encodingName) - || "MacExpertEncoding".equals(encodingName) - || "SymbolEncoding".equals(encodingName) - || "ZapfDingbatsEncoding".equals(encodingName)) { - - log.debug( - "Font {} uses standard encoding: {}", - font.getName(), - encodingName); - return false; - } - - if (encoding instanceof DictionaryEncoding) { - log.debug( - "Font {} uses DictionaryEncoding - likely custom", - font.getName()); - return true; - } - - log.debug( - "Font {} uses non-standard encoding: {}", - font.getName(), - encodingName); - return true; - } - } catch (Exception e) { - log.debug( - "Could not determine encoding for font {}: {}", - font.getName(), - e.getMessage()); - } - } - - if (font instanceof org.apache.pdfbox.pdmodel.font.PDType0Font) { - log.debug("Font {} is Type0 (CID) - generally uses standard CMaps", font.getName()); - return false; // Be forgiving with CID fonts - } - - log.debug( - "Font {} type {} - assuming standard encoding", - font.getName(), - font.getClass().getSimpleName()); - return false; - - } catch (Exception e) { - log.debug( - "Custom encoding detection failed for font {}: {}", - font.getName(), - e.getMessage()); - return false; // Be forgiving on detection failure - } - } - - private boolean cannotCalculateBasicWidths(PDFont font) { - try { - float spaceWidth = font.getStringWidth(" "); - if (spaceWidth <= 0) { - return true; - } - - String[] testChars = {"a", "A", "0", ".", "e", "!"}; - for (String ch : testChars) { - try { - float width = font.getStringWidth(ch); - if (width > 0) { - return false; // Found at least one character we can measure - } - } catch (Exception e) { - } - } - - return true; // Can't calculate width for any test characters - } catch (Exception e) { - return true; // Font failed basic width calculation - } - } - - private boolean isFontSubset(String fontName) { - if (fontName == null) { - return false; - } - return fontName.matches("^[A-Z]{6}\\+.*"); - } - - private boolean fontSupportsCharacter(PDFont font, String character) { - if (font == null || character == null || character.isEmpty()) { - return false; - } - - try { - byte[] encoded = font.encode(character); - if (encoded.length == 0) { - return false; - } - - float width = font.getStringWidth(character); - return width > 0; - - } catch (Exception e) { - log.debug( - "Character '{}' not supported by font {}: {}", - character, - font.getName(), - e.getMessage()); - return false; + log.warn("Enhanced font detection analysis failed: {}", e.getMessage()); + return true; // Assume problematic if analysis fails } } diff --git a/app/core/src/main/java/stirling/software/SPDF/pdf/TextFinder.java b/app/core/src/main/java/stirling/software/SPDF/pdf/TextFinder.java index 69b4ddc42..432fad101 100644 --- a/app/core/src/main/java/stirling/software/SPDF/pdf/TextFinder.java +++ b/app/core/src/main/java/stirling/software/SPDF/pdf/TextFinder.java @@ -10,8 +10,11 @@ import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.TextPosition; +import lombok.extern.slf4j.Slf4j; + import stirling.software.SPDF.model.PDFText; +@Slf4j public class TextFinder extends PDFTextStripper { private final String searchTerm; @@ -67,16 +70,40 @@ public class TextFinder extends PDFTextStripper { String processedSearchTerm = this.searchTerm.trim(); String regex = this.useRegex ? processedSearchTerm : "\\Q" + processedSearchTerm + "\\E"; if (this.wholeWordSearch) { - regex = "\\b" + regex + "\\b"; + if (processedSearchTerm.length() == 1 + && Character.isDigit(processedSearchTerm.charAt(0))) { + regex = "(?= pageTextPositions.size()) { + log.debug( + "Position index {} exceeds available positions ({})", + i, + pageTextPositions.size()); continue; } TextPosition pos = pageTextPositions.get(i); @@ -97,6 +128,27 @@ public class TextFinder extends PDFTextStripper { } } + if (!foundPosition && matchStart < pageTextPositions.size()) { + log.debug( + "Attempting to find nearby positions for match at {}-{}", + matchStart, + matchEnd); + + for (int i = Math.max(0, matchStart - 5); + i < Math.min(pageTextPositions.size(), matchEnd + 5); + i++) { + TextPosition pos = pageTextPositions.get(i); + if (pos != null) { + foundPosition = true; + minX = Math.min(minX, pos.getX()); + maxX = Math.max(maxX, pos.getX() + pos.getWidth()); + minY = Math.min(minY, pos.getY() - pos.getHeight()); + maxY = Math.max(maxY, pos.getY()); + break; + } + } + } + if (foundPosition) { foundTexts.add( new PDFText( @@ -106,13 +158,59 @@ public class TextFinder extends PDFTextStripper { maxX, maxY, matcher.group())); + log.debug( + "Added PDFText for match: page={}, bounds=({},{},{},{}), text='{}'", + getCurrentPageNo() - 1, + minX, + minY, + maxX, + maxY, + matcher.group()); + } else { + log.warn( + "Found text match '{}' but no valid position data at {}-{}", + matcher.group(), + matchStart, + matchEnd); } } + log.debug( + "Page {} search complete: found {} matches for '{}'", + getCurrentPageNo(), + matchCount, + processedSearchTerm); + super.endPage(page); } public List getFoundTexts() { return foundTexts; } + + public String getDebugInfo() { + StringBuilder debug = new StringBuilder(); + debug.append("Extracted text length: ").append(pageTextBuilder.length()).append("\n"); + debug.append("Position count: ").append(pageTextPositions.size()).append("\n"); + debug.append("Text content: '") + .append(pageTextBuilder.toString().replace("\n", "\\n").replace("\r", "\\r")) + .append("'\n"); + + String text = pageTextBuilder.toString(); + for (int i = 0; i < Math.min(text.length(), 50); i++) { + char c = text.charAt(i); + TextPosition pos = i < pageTextPositions.size() ? pageTextPositions.get(i) : null; + debug.append( + String.format( + " [%d] '%c' (0x%02X) -> %s\n", + i, + c, + (int) c, + pos != null + ? String.format("(%.1f,%.1f)", pos.getX(), pos.getY()) + : "null")); + } + + return debug.toString(); + } } diff --git a/app/core/src/main/java/stirling/software/SPDF/utils/text/TextEncodingHelper.java b/app/core/src/main/java/stirling/software/SPDF/utils/text/TextEncodingHelper.java new file mode 100644 index 000000000..4292e6c52 --- /dev/null +++ b/app/core/src/main/java/stirling/software/SPDF/utils/text/TextEncodingHelper.java @@ -0,0 +1,351 @@ +package stirling.software.SPDF.utils.text; + +import java.io.IOException; + +import org.apache.pdfbox.pdmodel.font.PDFont; +import org.apache.pdfbox.pdmodel.font.PDSimpleFont; +import org.apache.pdfbox.pdmodel.font.encoding.DictionaryEncoding; +import org.apache.pdfbox.pdmodel.font.encoding.Encoding; + +import lombok.extern.slf4j.Slf4j; + +@Slf4j +public class TextEncodingHelper { + + public static boolean canEncodeCharacters(PDFont font, String text) { + if (font == null || text == null || text.isEmpty()) { + return false; + } + + try { + // Step 1: Primary check - full-string encoding (permissive for "good" cases) + byte[] encoded = font.encode(text); + if (encoded.length > 0) { + log.debug( + "Text '{}' has good full-string encoding for font {} - permissively allowing", + text, + font.getName() != null ? font.getName() : "Unknown"); + return true; + } + + // Step 2: Smart array-based fallback for TJ operator-style text + log.debug( + "Full encoding failed for '{}' - using array-based fallback for font {}", + text, + font.getName() != null ? font.getName() : "Unknown"); + + return validateAsCodePointArray(font, text); + + } catch (IOException | IllegalArgumentException e) { + log.debug( + "Encoding exception for text '{}' with font {} - trying array fallback: {}", + text, + font.getName() != null ? font.getName() : "Unknown", + e.getMessage()); + + if (isFontSubset(font.getName()) || hasCustomEncoding(font)) { + return validateAsCodePointArray(font, text); + } + + return false; // Non-subset fonts with encoding exceptions are likely problematic + } + } + + private static boolean validateAsCodePointArray(PDFont font, String text) { + int totalCodePoints = 0; + int successfulCodePoints = 0; + + // Iterate through code points (handles surrogates correctly per Unicode docs) + for (int i = 0; i < text.length(); ) { + int codePoint = text.codePointAt(i); + String charStr = new String(Character.toChars(codePoint)); + totalCodePoints++; + + try { + // Test encoding for this code point + byte[] charEncoded = font.encode(charStr); + if (charEncoded.length > 0) { + float charWidth = font.getStringWidth(charStr); + + if (charWidth >= 0) { + successfulCodePoints++; + log.debug( + "Code point '{}' (U+{}) encoded successfully", + charStr, + Integer.toHexString(codePoint).toUpperCase()); + } else { + log.debug( + "Code point '{}' (U+{}) has invalid width: {}", + charStr, + Integer.toHexString(codePoint).toUpperCase(), + charWidth); + } + } else { + log.debug( + "Code point '{}' (U+{}) encoding failed - empty result", + charStr, + Integer.toHexString(codePoint).toUpperCase()); + } + } catch (IOException | IllegalArgumentException e) { + log.debug( + "Code point '{}' (U+{}) validation failed: {}", + charStr, + Integer.toHexString(codePoint).toUpperCase(), + e.getMessage()); + } + + i += Character.charCount(codePoint); // Handle surrogates properly + } + + double successRate = + totalCodePoints > 0 ? (double) successfulCodePoints / totalCodePoints : 0; + boolean isAcceptable = successRate >= 0.95; + + log.debug( + "Array validation for '{}': {}/{} code points successful ({:.1f}%) - {}", + text, + successfulCodePoints, + totalCodePoints, + successRate * 100, + isAcceptable ? "ALLOWING" : "rejecting"); + + return isAcceptable; + } + + public static boolean isTextSegmentRemovable(PDFont font, String text) { + if (font == null || text == null || text.isEmpty()) { + return false; + } + + // Log the attempt + log.debug( + "Evaluating text segment for removal: '{}' with font {}", + text, + font.getName() != null ? font.getName() : "Unknown Font"); + + if (isSimpleCharacter(text)) { + try { + font.encode(text); + font.getStringWidth(text); + log.debug( + "Text '{}' is a simple character and passed validation - allowing removal", + text); + return true; + } catch (Exception e) { + log.debug( + "Simple character '{}' failed basic validation with font {}: {}", + text, + font.getName() != null ? font.getName() : "Unknown", + e.getMessage()); + return false; + } + } + + // For complex text, require comprehensive validation + return isTextFullyRemovable(font, text); + } + + public static boolean isTextFullyRemovable(PDFont font, String text) { + if (font == null || text == null || text.isEmpty()) { + return false; + } + + try { + // Check 1: Verify encoding capability using new smart approach + if (!canEncodeCharacters(font, text)) { + log.debug( + "Text '{}' failed encoding validation for font {}", + text, + font.getName() != null ? font.getName() : "Unknown"); + return false; + } + + // Check 2: Validate width calculation capability + float width = font.getStringWidth(text); + if (width < 0) { // Allow zero width (invisible chars) but reject negative (invalid) + log.debug( + "Text '{}' has invalid width {} for font {}", + text, + width, + font.getName() != null ? font.getName() : "Unknown"); + return false; // Invalid metrics prevent accurate removal + } + + // Check 3: Verify font descriptor completeness for redaction area calculation + if (font.getFontDescriptor() == null) { + log.debug( + "Missing font descriptor for font {}", + font.getName() != null ? font.getName() : "Unknown"); + return false; + } + + // Check 4: Test bounding box calculation for redaction area + try { + font.getFontDescriptor().getFontBoundingBox(); + } catch (IllegalArgumentException e) { + log.debug( + "Font bounding box unavailable for font {}: {}", + font.getName() != null ? font.getName() : "Unknown", + e.getMessage()); + return false; + } + + log.debug( + "Text '{}' passed comprehensive validation for font {}", + text, + font.getName() != null ? font.getName() : "Unknown"); + return true; + + } catch (IOException e) { + log.debug( + "Text '{}' failed validation for font {} due to IO error: {}", + text, + font.getName() != null ? font.getName() : "Unknown", + e.getMessage()); + return false; + } catch (IllegalArgumentException e) { + log.debug( + "Text '{}' failed validation for font {} due to argument error: {}", + text, + font.getName() != null ? font.getName() : "Unknown", + e.getMessage()); + return false; + } + } + + private static boolean isSimpleCharacter(String text) { + if (text == null || text.isEmpty()) { + return false; + } + + if (text.length() > 20) { + return false; + } + + for (int i = 0; i < text.length(); i++) { + char c = text.charAt(i); + + // Allow letters, digits, and whitespace (most common cases) + if (Character.isLetterOrDigit(c) || Character.isWhitespace(c)) { + continue; + } + + // Allow common ASCII punctuation + if (c >= 32 && c <= 126 && ".,!?;:()-[]{}\"'/@#$%&*+=<>|\\~`".indexOf(c) >= 0) { + continue; + } + + return false; + } + + return true; + } + + public static boolean hasCustomEncoding(PDFont font) { + try { + if (font instanceof PDSimpleFont simpleFont) { + try { + Encoding encoding = simpleFont.getEncoding(); + if (encoding != null) { + // Check for dictionary-based custom encodings + if (encoding instanceof DictionaryEncoding) { + log.debug("Font {} uses DictionaryEncoding (custom)", font.getName()); + return true; + } + + String encodingName = encoding.getClass().getSimpleName(); + if (encodingName.contains("Custom") + || encodingName.contains("Dictionary")) { + log.debug( + "Font {} uses custom encoding: {}", + font.getName(), + encodingName); + return true; + } + } + } catch (Exception e) { + log.debug( + "Encoding detection failed for font {}: {}", + font.getName(), + e.getMessage()); + return true; // Assume custom if detection fails + } + } + + if (font instanceof org.apache.pdfbox.pdmodel.font.PDType0Font) { + log.debug( + "Font {} is Type0 (CID) - generally uses standard CMaps", + font.getName() != null ? font.getName() : "Unknown"); + return false; + } + + log.debug( + "Font {} type {} - assuming standard encoding", + font.getName() != null ? font.getName() : "Unknown", + font.getClass().getSimpleName()); + return false; + + } catch (IllegalArgumentException e) { + log.debug( + "Custom encoding detection failed for font {}: {}", + font.getName() != null ? font.getName() : "Unknown", + e.getMessage()); + return false; // Be forgiving on detection failure + } + } + + public static boolean fontSupportsCharacter(PDFont font, String character) { + if (font == null || character == null || character.isEmpty()) { + return false; + } + + try { + byte[] encoded = font.encode(character); + if (encoded.length == 0) { + return false; + } + + float width = font.getStringWidth(character); + return width > 0; + + } catch (IOException | IllegalArgumentException e) { + log.debug( + "Character '{}' not supported by font {}: {}", + character, + font.getName() != null ? font.getName() : "Unknown", + e.getMessage()); + return false; + } + } + + public static boolean isFontSubset(String fontName) { + if (fontName == null) { + return false; + } + return fontName.matches("^[A-Z]{6}\\+.*"); + } + + public static boolean canCalculateBasicWidths(PDFont font) { + try { + float spaceWidth = font.getStringWidth(" "); + if (spaceWidth <= 0) { + return false; + } + + String[] testChars = {"a", "A", "0", ".", "e", "!"}; + for (String ch : testChars) { + try { + float width = font.getStringWidth(ch); + if (width > 0) { + return true; + } + } catch (IOException | IllegalArgumentException e) { + } + } + + return false; // Can't calculate width for any test characters + } catch (IOException | IllegalArgumentException e) { + return false; // Font failed basic width calculation + } + } +} diff --git a/app/core/src/main/java/stirling/software/SPDF/utils/text/TextFinderUtils.java b/app/core/src/main/java/stirling/software/SPDF/utils/text/TextFinderUtils.java new file mode 100644 index 000000000..4c7d86abd --- /dev/null +++ b/app/core/src/main/java/stirling/software/SPDF/utils/text/TextFinderUtils.java @@ -0,0 +1,140 @@ +package stirling.software.SPDF.utils.text; + +import java.util.ArrayList; +import java.util.List; +import java.util.Set; +import java.util.regex.Pattern; + +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDResources; + +import lombok.extern.slf4j.Slf4j; + +@Slf4j +public class TextFinderUtils { + + public static boolean validateFontReliability(org.apache.pdfbox.pdmodel.font.PDFont font) { + if (font == null) { + return false; + } + + if (font.isDamaged()) { + log.debug( + "Font {} is marked as damaged - using TextEncodingHelper validation", + font.getName()); + } + + if (TextEncodingHelper.canCalculateBasicWidths(font)) { + log.debug( + "Font {} passed basic width calculations - considering reliable", + font.getName()); + return true; + } + + String[] basicTests = {"1", "2", "3", "a", "A", "e", "E", " "}; + + int workingChars = 0; + for (String testChar : basicTests) { + if (TextEncodingHelper.canEncodeCharacters(font, testChar)) { + workingChars++; + } + } + + if (workingChars > 0) { + log.debug( + "Font {} can process {}/{} basic characters - considering reliable", + font.getName(), + workingChars, + basicTests.length); + return true; + } + + log.debug("Font {} failed all basic tests - considering unreliable", font.getName()); + return false; + } + + public static List createOptimizedSearchPatterns( + Set searchTerms, boolean useRegex, boolean wholeWordSearch) { + List patterns = new ArrayList<>(); + + for (String term : searchTerms) { + if (term == null || term.trim().isEmpty()) { + continue; + } + + try { + String patternString = useRegex ? term.trim() : Pattern.quote(term.trim()); + + if (wholeWordSearch) { + patternString = applyWordBoundaries(term.trim(), patternString); + } + + Pattern pattern = + Pattern.compile( + patternString, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); + patterns.add(pattern); + + log.debug("Created search pattern: '{}' -> '{}'", term.trim(), patternString); + + } catch (Exception e) { + log.warn("Failed to create pattern for term '{}': {}", term, e.getMessage()); + } + } + + return patterns; + } + + private static String applyWordBoundaries(String originalTerm, String patternString) { + if (originalTerm.length() == 1 && Character.isDigit(originalTerm.charAt(0))) { + return "(? 0 && (completelyUnusableFonts * 2 > totalFonts); + log.debug( + "Page font analysis: {}/{} fonts are completely unusable - page {} problematic", + completelyUnusableFonts, + totalFonts, + hasProblems ? "IS" : "is NOT"); + + return hasProblems; + + } catch (Exception e) { + log.warn("Font analysis failed for page: {}", e.getMessage()); + return false; // Be permissive if analysis fails + } + } +} diff --git a/app/core/src/main/java/stirling/software/SPDF/utils/text/WidthCalculator.java b/app/core/src/main/java/stirling/software/SPDF/utils/text/WidthCalculator.java new file mode 100644 index 000000000..fde3809c4 --- /dev/null +++ b/app/core/src/main/java/stirling/software/SPDF/utils/text/WidthCalculator.java @@ -0,0 +1,136 @@ +package stirling.software.SPDF.utils.text; + +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.pdmodel.font.PDFont; + +import lombok.extern.slf4j.Slf4j; + +@Slf4j +public class WidthCalculator { + + private static final int FONT_SCALE_FACTOR = 1000; + + public static float calculateAccurateWidth(PDFont font, String text, float fontSize) { + if (font == null || text == null || text.isEmpty() || fontSize <= 0) { + return 0; + } + + if (!TextEncodingHelper.canEncodeCharacters(font, text)) { + log.debug( + "Text cannot be encoded by font {}, using fallback width calculation", + font.getName()); + return calculateFallbackWidth(font, text, fontSize); + } + + try { + float rawWidth = font.getStringWidth(text); + float scaledWidth = (rawWidth / FONT_SCALE_FACTOR) * fontSize; + + log.debug( + "Direct width calculation successful for font {}: {} -> {}", + font.getName(), + rawWidth, + scaledWidth); + return scaledWidth; + + } catch (Exception e) { + log.debug( + "Direct width calculation failed for font {}: {}", + font.getName(), + e.getMessage()); + return calculateWidthWithCharacterIteration(font, text, fontSize); + } + } + + private static float calculateWidthWithCharacterIteration( + PDFont font, String text, float fontSize) { + try { + float totalWidth = 0; + + for (int i = 0; i < text.length(); i++) { + String character = text.substring(i, i + 1); + try { + byte[] encoded = font.encode(character); + if (encoded.length > 0) { + int glyphCode = encoded[0] & 0xFF; + float glyphWidth = font.getWidth(glyphCode); + + if (glyphWidth == 0) { + try { + glyphWidth = font.getWidthFromFont(glyphCode); + } catch (Exception e2) { + glyphWidth = font.getAverageFontWidth(); + } + } + + totalWidth += (glyphWidth / FONT_SCALE_FACTOR) * fontSize; + } else { + totalWidth += (font.getAverageFontWidth() / FONT_SCALE_FACTOR) * fontSize; + } + } catch (Exception e2) { + totalWidth += (font.getAverageFontWidth() / FONT_SCALE_FACTOR) * fontSize; + } + } + + log.debug("Character iteration width calculation: {}", totalWidth); + return totalWidth; + + } catch (Exception e) { + log.debug("Character iteration failed: {}", e.getMessage()); + return calculateFallbackWidth(font, text, fontSize); + } + } + + private static float calculateFallbackWidth(PDFont font, String text, float fontSize) { + try { + if (font.getFontDescriptor() != null + && font.getFontDescriptor().getFontBoundingBox() != null) { + + PDRectangle bbox = font.getFontDescriptor().getFontBoundingBox(); + float avgCharWidth = + bbox.getWidth() / FONT_SCALE_FACTOR * 0.6f; // Conservative estimate + float fallbackWidth = text.length() * avgCharWidth * fontSize; + + log.debug("Bounding box fallback width: {}", fallbackWidth); + return fallbackWidth; + } + + float avgWidth = font.getAverageFontWidth(); + float fallbackWidth = (text.length() * avgWidth / FONT_SCALE_FACTOR) * fontSize; + + log.debug("Average width fallback: {}", fallbackWidth); + return fallbackWidth; + + } catch (Exception e) { + float conservativeWidth = text.length() * 0.5f * fontSize; + log.debug( + "Conservative fallback width for font {}: {}", + font.getName(), + conservativeWidth); + return conservativeWidth; + } + } + + public static boolean isWidthCalculationReliable(PDFont font) { + if (font == null) { + return false; + } + + if (font.isDamaged()) { + log.debug("Font {} is damaged", font.getName()); + return false; + } + + if (!TextEncodingHelper.canCalculateBasicWidths(font)) { + log.debug("Font {} cannot perform basic width calculations", font.getName()); + return false; + } + + if (TextEncodingHelper.hasCustomEncoding(font)) { + log.debug("Font {} has custom encoding", font.getName()); + return false; + } + + return true; + } +} diff --git a/stirling-pdf/src/test/java/stirling/software/SPDF/pdf/TextFinderTest.java b/stirling-pdf/src/test/java/stirling/software/SPDF/pdf/TextFinderTest.java index 246f10af7..ebb5bebf7 100644 --- a/stirling-pdf/src/test/java/stirling/software/SPDF/pdf/TextFinderTest.java +++ b/stirling-pdf/src/test/java/stirling/software/SPDF/pdf/TextFinderTest.java @@ -1,7 +1,5 @@ package stirling.software.SPDF.pdf; -import static org.junit.jupiter.api.Assertions.*; - import java.io.IOException; import java.util.List; @@ -12,6 +10,11 @@ import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.font.PDType1Font; import org.apache.pdfbox.pdmodel.font.Standard14Fonts; import org.junit.jupiter.api.AfterEach; +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.DisplayName; import org.junit.jupiter.api.Nested; @@ -468,6 +471,106 @@ class TextFinderTest { } } + @Nested + @DisplayName("Single Character and Digit Tests") + class SingleCharacterAndDigitTests { + + @Test + @DisplayName("Should find single digits in various contexts with whole word search") + void findSingleDigitsWholeWord() throws IOException { + String content = "Item 1 of 5 costs $2.50. Order number: 1234. Reference: A1B."; + addTextToPage(content); + + TextFinder textFinder = new TextFinder("1", false, true); + textFinder.getText(document); + List foundTexts = textFinder.getFoundTexts(); + + assertEquals(1, foundTexts.size(), + "Should find exactly one standalone '1', not the ones embedded in other numbers/codes"); + assertEquals("1", foundTexts.get(0).getText()); + } + + @Test + @DisplayName("Should find single digits without whole word search") + void findSingleDigitsNoWholeWord() throws IOException { + String content = "Item 1 of 5 costs $2.50. Order number: 1234. Reference: A1B."; + addTextToPage(content); + + TextFinder textFinder = new TextFinder("1", false, false); + textFinder.getText(document); + List foundTexts = textFinder.getFoundTexts(); + + assertTrue(foundTexts.size() >= 3, + "Should find multiple instances of '1' including standalone, in '1234', and in 'A1B'"); + } + + @Test + @DisplayName("Should find single characters in various contexts") + void findSingleCharacters() throws IOException { + String content = "Grade: A. Section B has item A-1. The letter A appears multiple times."; + addTextToPage(content); + + TextFinder textFinder = new TextFinder("A", false, true); + textFinder.getText(document); + List foundTexts = textFinder.getFoundTexts(); + + assertTrue(foundTexts.size() >= 2, "Should find multiple standalone 'A' characters"); + + for (PDFText found : foundTexts) { + assertEquals("A", found.getText()); + } + } + + @Test + @DisplayName("Should handle digits at word boundaries correctly") + void findDigitsAtWordBoundaries() throws IOException { + String content = "Numbers: 1, 2, 3. Code: 123. Version: 1.0. Item1 and Item2."; + addTextToPage(content); + + TextFinder textFinder1 = new TextFinder("1", false, true); + textFinder1.getText(document); + List foundTexts1 = textFinder1.getFoundTexts(); + + assertEquals(1, foundTexts1.size(), + "Should find only the standalone '1' at the beginning"); + + TextFinder textFinder2 = new TextFinder("2", false, true); + textFinder2.getText(document); + List foundTexts2 = textFinder2.getFoundTexts(); + + assertEquals(1, foundTexts2.size(), + "Should find only the standalone '2' in the number list"); + } + + @Test + @DisplayName("Should handle special characters and punctuation boundaries") + void findDigitsWithPunctuationBoundaries() throws IOException { + String content = "Items: (1), [2], {3}, item#4, price$5, and 6%."; + addTextToPage(content); + + TextFinder textFinder = new TextFinder("1", false, true); + textFinder.getText(document); + List foundTexts = textFinder.getFoundTexts(); + + assertEquals(1, foundTexts.size(), "Should find '1' surrounded by parentheses"); + assertEquals("1", foundTexts.get(0).getText()); + } + + @Test + @DisplayName("Should handle edge case with spacing and formatting") + void findDigitsWithSpacingIssues() throws IOException { + String content = "List: 1 , 2 , 3 and item 1 here."; + addTextToPage(content); + + TextFinder textFinder = new TextFinder("1", false, true); + textFinder.getText(document); + List foundTexts = textFinder.getFoundTexts(); + + assertEquals(2, foundTexts.size(), + "Should find both '1' instances despite spacing variations"); + } + } + // Helper methods private void addTextToPage(String text) throws IOException { addTextToPage(page, text); From c81cbb10c466a3a0deb74444ef897e40cde5575e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20Sz=C3=BCcs?= <127139797+balazs-szucs@users.noreply.github.com> Date: Fri, 18 Jul 2025 19:04:50 +0200 Subject: [PATCH 13/13] Update app/core/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../software/SPDF/controller/api/security/RedactController.java | 1 - 1 file changed, 1 deletion(-) diff --git a/app/core/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java b/app/core/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java index 296108516..51d5e5a53 100644 --- a/app/core/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java +++ b/app/core/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java @@ -628,7 +628,6 @@ public class RedactController { if (fallbackDocument == null) { document.close(); } - document.close(); } catch (IOException e) { log.warn("Failed to close main document: {}", e.getMessage()); }