diff --git a/app/core/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java b/app/core/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java index 399c4adbf..296108516 100644 --- a/app/core/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java +++ b/app/core/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java @@ -32,9 +32,6 @@ import org.apache.pdfbox.pdmodel.PDResources; import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.common.PDStream; import org.apache.pdfbox.pdmodel.font.PDFont; -import org.apache.pdfbox.pdmodel.font.PDSimpleFont; -import org.apache.pdfbox.pdmodel.font.encoding.DictionaryEncoding; -import org.apache.pdfbox.pdmodel.font.encoding.Encoding; import org.apache.pdfbox.pdmodel.graphics.PDXObject; import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject; import org.springframework.http.ResponseEntity; @@ -59,6 +56,9 @@ import stirling.software.SPDF.model.PDFText; import stirling.software.SPDF.model.api.security.ManualRedactPdfRequest; import stirling.software.SPDF.model.api.security.RedactPdfRequest; import stirling.software.SPDF.pdf.TextFinder; +import stirling.software.SPDF.utils.text.TextEncodingHelper; +import stirling.software.SPDF.utils.text.TextFinderUtils; +import stirling.software.SPDF.utils.text.WidthCalculator; import stirling.software.common.model.api.security.RedactionArea; import stirling.software.common.service.CustomPDFDocumentFactory; import stirling.software.common.util.GeneralUtils; @@ -77,6 +77,9 @@ public class RedactController { private static final float PRECISION_THRESHOLD = 1e-3f; private static final int FONT_SCALE_FACTOR = 1000; + // Redaction box width reduction factor (10% reduction) + private static final float REDACTION_WIDTH_REDUCTION_FACTOR = 0.9f; + // Text showing operators private static final Set TEXT_SHOWING_OPERATORS = Set.of("Tj", "TJ", "'", "\""); @@ -229,7 
+232,11 @@ public class RedactController { } private void redactFoundText( - PDDocument document, List blocks, float customPadding, Color redactColor) + PDDocument document, + List blocks, + float customPadding, + Color redactColor, + boolean isTextRemovalMode) throws IOException { var allPages = document.getDocumentCatalog().getPages(); @@ -263,10 +270,28 @@ public class RedactController { (block.getY2() - block.getY1()) * DEFAULT_TEXT_PADDING_MULTIPLIER + customPadding; + float originalWidth = block.getX2() - block.getX1(); + float boxWidth; + float boxX; + + // Only apply width reduction when text is actually being removed + if (isTextRemovalMode) { + // Calculate reduced width and center the box + boxWidth = + originalWidth + * REDACTION_WIDTH_REDUCTION_FACTOR; // 10% reduction + float widthReduction = originalWidth - boxWidth; + boxX = block.getX1() + (widthReduction / 2); // Center the reduced box + } else { + // Use original width for box-only redaction + boxWidth = originalWidth; + boxX = block.getX1(); + } + contentStream.addRect( - block.getX1(), + boxX, pageBox.getHeight() - block.getY2() - padding, - block.getX2() - block.getX1(), + boxWidth, block.getY2() - block.getY1() + 2 * padding); } @@ -284,7 +309,7 @@ public class RedactController { return originalWord; } - if (font != null && isFontSubset(font.getName())) { + if (font != null && TextEncodingHelper.isFontSubset(font.getName())) { try { float originalWidth = safeGetStringWidth(font, originalWord) / FONT_SCALE_FACTOR; return createAlternativePlaceholder(originalWord, originalWidth, font, 1.0f); @@ -300,6 +325,10 @@ public class RedactController { return " ".repeat(originalWord.length()); } + /** + * Enhanced placeholder creation using advanced width calculation. Incorporates font validation + * and sophisticated fallback strategies. 
+ */ String createPlaceholderWithWidth( String originalWord, float targetWidth, PDFont font, float fontSize) { if (originalWord == null || originalWord.isEmpty()) { @@ -311,11 +340,21 @@ public class RedactController { } try { - if (isFontSubset(font.getName())) { + // Check font reliability before proceeding + if (!WidthCalculator.isWidthCalculationReliable(font)) { + log.debug( + "Font {} unreliable for width calculation, using simple placeholder", + font.getName()); + return " ".repeat(originalWord.length()); + } + + // Use enhanced subset font detection + if (TextEncodingHelper.isFontSubset(font.getName())) { return createSubsetFontPlaceholder(originalWord, targetWidth, font, fontSize); } - float spaceWidth = safeGetStringWidth(font, " ") / FONT_SCALE_FACTOR * fontSize; + // Enhanced space width calculation + float spaceWidth = WidthCalculator.calculateAccurateWidth(font, " ", fontSize); if (spaceWidth <= 0) { return createAlternativePlaceholder(originalWord, targetWidth, font, fontSize); @@ -323,13 +362,16 @@ public class RedactController { int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth)); - int maxSpaces = originalWord.length() * 2; + // More conservative space limit based on original word characteristics + int maxSpaces = + Math.max( + originalWord.length() * 2, Math.round(targetWidth / spaceWidth * 1.5f)); spaceCount = Math.min(spaceCount, maxSpaces); return " ".repeat(spaceCount); } catch (Exception e) { - log.debug("Width-based placeholder creation failed: {}", e.getMessage()); + log.debug("Enhanced placeholder creation failed: {}", e.getMessage()); return createAlternativePlaceholder(originalWord, targetWidth, font, fontSize); } } @@ -359,7 +401,7 @@ public class RedactController { try { String[] alternatives = {" ", ".", "-", "_", "~", "°", "·"}; - if (fontSupportsCharacter(font, " ")) { + if (TextEncodingHelper.fontSupportsCharacter(font, " ")) { float spaceWidth = safeGetStringWidth(font, " ") / FONT_SCALE_FACTOR * fontSize; if 
(spaceWidth > 0) { int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth)); @@ -371,10 +413,10 @@ public class RedactController { } for (String altChar : alternatives) { - if (altChar.equals(" ")) continue; // Already tried spaces + if (" ".equals(altChar)) continue; // Already tried spaces try { - if (!fontSupportsCharacter(font, altChar)) { + if (!TextEncodingHelper.fontSupportsCharacter(font, altChar)) { continue; } @@ -546,7 +588,8 @@ public class RedactController { allFoundTextsByPage, request.getRedactColor(), request.getCustomPadding(), - request.getConvertPDFToImage()); + request.getConvertPDFToImage(), + false); // Box-only mode, use original box sizes return WebResponseUtils.bytesToWebResponse( pdfContent, @@ -564,7 +607,8 @@ public class RedactController { allFoundTextsByPage, request.getRedactColor(), request.getCustomPadding(), - request.getConvertPDFToImage()); + request.getConvertPDFToImage(), + true); // Text removal mode, use reduced box sizes return WebResponseUtils.bytesToWebResponse( pdfContent, @@ -608,14 +652,31 @@ public class RedactController { text = text.trim(); if (text.isEmpty()) continue; + log.debug( + "Searching for text: '{}' (regex: {}, wholeWord: {})", + text, + useRegex, + wholeWordSearch); + try { TextFinder textFinder = new TextFinder(text, useRegex, wholeWordSearch); textFinder.getText(document); - for (PDFText found : textFinder.getFoundTexts()) { + List foundTexts = textFinder.getFoundTexts(); + log.debug("TextFinder found {} instances of '{}'", foundTexts.size(), text); + + for (PDFText found : foundTexts) { allFoundTextsByPage .computeIfAbsent(found.getPageIndex(), k -> new ArrayList<>()) .add(found); + log.debug( + "Added match on page {} at ({},{},{},{}): '{}'", + found.getPageIndex(), + found.getX1(), + found.getY1(), + found.getX2(), + found.getY2(), + found.getText()); } } catch (Exception e) { log.error("Error processing search term '{}': {}", text, e.getMessage()); @@ -673,7 +734,8 @@ public class 
RedactController { Map> allFoundTextsByPage, String colorString, float customPadding, - Boolean convertToImage) + Boolean convertToImage, + boolean isTextRemovalMode) throws IOException { List allFoundTexts = new ArrayList<>(); @@ -684,7 +746,7 @@ public class RedactController { if (!allFoundTexts.isEmpty()) { Color redactColor = decodeOrDefault(colorString); - redactFoundText(document, allFoundTexts, customPadding, redactColor); + redactFoundText(document, allFoundTexts, customPadding, redactColor, isTextRemovalMode); cleanDocumentMetadata(document); } @@ -870,16 +932,24 @@ public class RedactController { boolean useRegex, boolean wholeWordSearch) { - return targetWords.stream() - .map( - target -> { - String patternString = useRegex ? target : Pattern.quote(target); - if (wholeWordSearch) { - patternString = "\\b" + patternString + "\\b"; + // Use the new utility for creating optimized patterns + List patterns = + TextFinderUtils.createOptimizedSearchPatterns( + targetWords, useRegex, wholeWordSearch); + + return patterns.stream() + .flatMap( + pattern -> { + try { + return pattern.matcher(completeText).results(); + } catch (Exception e) { + log.debug( + "Pattern matching failed for pattern {}: {}", + pattern.pattern(), + e.getMessage()); + return java.util.stream.Stream.empty(); } - return Pattern.compile(patternString, Pattern.CASE_INSENSITIVE); }) - .flatMap(pattern -> pattern.matcher(completeText).results()) .map(matchResult -> new MatchRange(matchResult.start(), matchResult.end())) .sorted(Comparator.comparingInt(MatchRange::getStartPos)) .collect(Collectors.toList()); @@ -957,6 +1027,16 @@ public class RedactController { private String applyRedactionsToSegmentText(TextSegment segment, List matches) { String text = segment.getText(); + + if (segment.getFont() != null + && !TextEncodingHelper.isTextSegmentRemovable(segment.getFont(), text)) { + log.debug( + "Skipping text segment '{}' - font {} cannot process this text reliably", + text, + 
segment.getFont().getName()); + return text; // Return original text unchanged + } + StringBuilder result = new StringBuilder(text); for (MatchRange match : matches) { @@ -966,6 +1046,15 @@ public class RedactController { if (segmentStart < text.length() && segmentEnd > segmentStart) { String originalPart = text.substring(segmentStart, segmentEnd); + if (segment.getFont() != null + && !TextEncodingHelper.isTextSegmentRemovable( + segment.getFont(), originalPart)) { + log.debug( + "Skipping text part '{}' within segment - cannot be processed reliably", + originalPart); + continue; // Skip this match, process others + } + float originalWidth = 0; if (segment.getFont() != null && segment.getFontSize() > 0) { try { @@ -1001,68 +1090,125 @@ public class RedactController { return 0; } + if (!WidthCalculator.isWidthCalculationReliable(font)) { + log.debug( + "Font {} flagged as unreliable for width calculation, using fallback", + font.getName()); + return calculateConservativeWidth(font, text); + } + + if (!TextEncodingHelper.canEncodeCharacters(font, text)) { + log.debug( + "Text cannot be encoded by font {}, using character-based fallback", + font.getName()); + return calculateCharacterBasedWidth(font, text); + } + try { - return font.getStringWidth(text); + float width = font.getStringWidth(text); + log.debug("Direct width calculation successful for '{}': {}", text, width); + return width; + } catch (Exception e) { - try { - float totalWidth = 0; - for (int i = 0; i < text.length(); i++) { - String character = text.substring(i, i + 1); - try { - byte[] encoded = font.encode(character); - if (encoded.length > 0) { - int glyphCode = encoded[0] & 0xFF; + log.debug( + "Direct width calculation failed for font {}: {}", + font.getName(), + e.getMessage()); + return calculateFallbackWidth(font, text); + } + } - float glyphWidth = font.getWidth(glyphCode); + private float calculateCharacterBasedWidth(PDFont font, String text) { + try { + float totalWidth = 0; + for (int i = 0; 
i < text.length(); i++) { + String character = text.substring(i, i + 1); + try { + // Validate character encoding first + if (!TextEncodingHelper.fontSupportsCharacter(font, character)) { + totalWidth += font.getAverageFontWidth(); + continue; + } - if (glyphWidth == 0) { - try { - glyphWidth = font.getWidthFromFont(glyphCode); - } catch (Exception e2) { - glyphWidth = font.getAverageFontWidth(); - } + byte[] encoded = font.encode(character); + if (encoded.length > 0) { + int glyphCode = encoded[0] & 0xFF; + float glyphWidth = font.getWidth(glyphCode); + + // Try alternative width methods if primary fails + if (glyphWidth == 0) { + try { + glyphWidth = font.getWidthFromFont(glyphCode); + } catch (Exception e2) { + glyphWidth = font.getAverageFontWidth(); } - - totalWidth += glyphWidth; - } else { - totalWidth += font.getAverageFontWidth(); } - } catch (Exception e2) { + + totalWidth += glyphWidth; + } else { totalWidth += font.getAverageFontWidth(); } + } catch (Exception e2) { + // Character processing failed, use average width + totalWidth += font.getAverageFontWidth(); } - return totalWidth; - } catch (Exception e2) { - log.debug("PDFBox API width calculation failed: {}", e2.getMessage()); } - try { - if (font.getFontDescriptor() != null - && font.getFontDescriptor().getFontBoundingBox() != null) { - PDRectangle bbox = font.getFontDescriptor().getFontBoundingBox(); - float avgCharWidth = bbox.getHeight() / 1000f * 0.865f; - return text.length() * avgCharWidth * FONT_SCALE_FACTOR; - } - } catch (Exception e2) { - log.debug("Font bounding box width calculation failed: {}", e2.getMessage()); + log.debug("Character-based width calculation: {}", totalWidth); + return totalWidth; + + } catch (Exception e) { + log.debug("Character-based width calculation failed: {}", e.getMessage()); + return calculateConservativeWidth(font, text); + } + } + + private float calculateFallbackWidth(PDFont font, String text) { + try { + // Method 1: Font bounding box approach + if 
(font.getFontDescriptor() != null + && font.getFontDescriptor().getFontBoundingBox() != null) { + + PDRectangle bbox = font.getFontDescriptor().getFontBoundingBox(); + float avgCharWidth = bbox.getWidth() * 0.6f; // Conservative estimate + float fallbackWidth = text.length() * avgCharWidth; + + log.debug("Bounding box fallback width: {}", fallbackWidth); + return fallbackWidth; } + // Method 2: Average font width try { float avgWidth = font.getAverageFontWidth(); - return text.length() * avgWidth; + if (avgWidth > 0) { + float fallbackWidth = text.length() * avgWidth; + log.debug("Average width fallback: {}", fallbackWidth); + return fallbackWidth; + } } catch (Exception e2) { log.debug("Average font width calculation failed: {}", e2.getMessage()); } - float conservativeWidth = text.length() * 500f; // 500 units per character - log.debug( - "All width calculation methods failed for font {}, using conservative estimate: {}", - font.getName(), - conservativeWidth); - return conservativeWidth; + // Method 3: Conservative estimate based on font metrics + return calculateConservativeWidth(font, text); + + } catch (Exception e) { + log.debug("Fallback width calculation failed: {}", e.getMessage()); + return calculateConservativeWidth(font, text); } } + private float calculateConservativeWidth(PDFont font, String text) { + float conservativeWidth = text.length() * 500f; + + log.debug( + "Conservative width estimate for font {} text '{}': {}", + font.getName(), + text, + conservativeWidth); + return conservativeWidth; + } + private float calculateWidthAdjustment(TextSegment segment, List matches) { try { if (segment.getFont() == null || segment.getFontSize() <= 0) { @@ -1070,7 +1216,8 @@ public class RedactController { } String fontName = segment.getFont().getName(); - if (fontName != null && (fontName.contains("HOEPAP") || isFontSubset(fontName))) { + if (fontName != null + && (fontName.contains("HOEPAP") || TextEncodingHelper.isFontSubset(fontName))) { 
log.debug("Skipping width adjustment for problematic/subset font: {}", fontName); return 0; } @@ -1196,6 +1343,19 @@ public class RedactController { for (COSBase element : originalArray) { if (element instanceof COSString cosString) { String originalText = cosString.getString(); + + if (segment.getFont() != null + && !TextEncodingHelper.isTextSegmentRemovable( + segment.getFont(), originalText)) { + log.debug( + "Skipping TJ text part '{}' - cannot be processed reliably with font {}", + originalText, + segment.getFont().getName()); + newArray.add(element); // Keep original unchanged + textOffsetInSegment += originalText.length(); + continue; + } + StringBuilder newText = new StringBuilder(originalText); boolean modified = false; @@ -1207,7 +1367,6 @@ public class RedactController { int overlapEnd = Math.min(match.getEndPos(), stringEndInPage); if (overlapStart < overlapEnd) { - modified = true; int redactionStartInString = overlapStart - stringStartInPage; int redactionEndInString = overlapEnd - stringStartInPage; if (redactionStartInString >= 0 @@ -1216,6 +1375,16 @@ public class RedactController { originalText.substring( redactionStartInString, redactionEndInString); + if (segment.getFont() != null + && !TextEncodingHelper.isTextSegmentRemovable( + segment.getFont(), originalPart)) { + log.debug( + "Skipping TJ text part '{}' - cannot be redacted reliably", + originalPart); + continue; // Skip this redaction, keep original text + } + + modified = true; float originalWidth = 0; if (segment.getFont() != null && segment.getFontSize() > 0) { try { @@ -1320,8 +1489,13 @@ public class RedactController { int totalFonts = 0; int customEncodedFonts = 0; int subsetFonts = 0; + int unreliableFonts = 0; for (PDPage page : document.getPages()) { + if (TextFinderUtils.hasProblematicFonts(page)) { + log.debug("Page contains fonts flagged as problematic by TextFinderUtils"); + } + PDResources resources = page.getResources(); if (resources == null) { continue; @@ -1333,190 
+1507,64 @@ public class RedactController { if (font != null) { totalFonts++; - boolean isSubset = isFontSubset(font.getName()); - boolean isProblematic = hasProblematicFontCharacteristics(font); + // Enhanced analysis using helper classes + boolean isSubset = TextEncodingHelper.isFontSubset(font.getName()); + boolean hasCustomEncoding = TextEncodingHelper.hasCustomEncoding(font); + boolean isReliable = WidthCalculator.isWidthCalculationReliable(font); + boolean canCalculateWidths = + TextEncodingHelper.canCalculateBasicWidths(font); if (isSubset) { subsetFonts++; } - if (isProblematic) { + if (hasCustomEncoding) { customEncodedFonts++; + log.debug("Font {} has custom encoding", font.getName()); + } + + if (!isReliable || !canCalculateWidths) { + unreliableFonts++; log.debug( - "Detected problematic font: {} (type: {})", + "Font {} flagged as unreliable: reliable={}, canCalculateWidths={}", font.getName(), - font.getClass().getSimpleName()); + isReliable, + canCalculateWidths); + } + + if (!TextFinderUtils.validateFontReliability(font)) { + log.debug( + "Font {} failed comprehensive reliability check", + font.getName()); } } - } catch (IOException e) { + } catch (Exception e) { log.debug( - "Font loading failed for {}: {}", + "Font loading/analysis failed for {}: {}", fontName.getName(), e.getMessage()); customEncodedFonts++; + unreliableFonts++; + totalFonts++; } } } log.info( - "Font analysis: {}/{} fonts use custom encoding, {}/{} are subset fonts (subset fonts with standard encodings are fine)", + "Enhanced font analysis: {}/{} custom encoding, {}/{} subset, {}/{} unreliable fonts", customEncodedFonts, totalFonts, subsetFonts, + totalFonts, + unreliableFonts, totalFonts); - return customEncodedFonts > 0; - } catch (Exception e) { - log.warn("Font detection analysis failed: {}", e.getMessage()); - return false; - } - } - - private boolean hasProblematicFontCharacteristics(PDFont font) { - try { - if (font.isDamaged()) { - log.debug("Font {} is marked as damaged 
by PDFBox", font.getName()); - return true; - } - - if (hasCustomEncoding(font)) { - log.debug( - "Font {} uses custom encoding - text replacement will be unreliable", - font.getName()); - return true; - } - - String fontType = font.getClass().getSimpleName(); - if ("PDType3Font".equals(fontType)) { - log.debug("Font {} is Type3 - may have text replacement issues", font.getName()); - return cannotCalculateBasicWidths(font); - } - - log.debug("Font {} appears suitable for text replacement", font.getName()); - return false; + // Consider document problematic if we have custom encodings or unreliable fonts + return customEncodedFonts > 0 || unreliableFonts > 0; } catch (Exception e) { - log.debug("Font analysis failed for {}: {}", font.getName(), e.getMessage()); - return false; - } - } - - private boolean hasCustomEncoding(PDFont font) { - try { - if (font instanceof PDSimpleFont simpleFont) { - try { - Encoding encoding = simpleFont.getEncoding(); - if (encoding != null) { - String encodingName = encoding.getEncodingName(); - - // Check if it's one of the standard encodings - if ("WinAnsiEncoding".equals(encodingName) - || "MacRomanEncoding".equals(encodingName) - || "StandardEncoding".equals(encodingName) - || "MacExpertEncoding".equals(encodingName) - || "SymbolEncoding".equals(encodingName) - || "ZapfDingbatsEncoding".equals(encodingName)) { - - log.debug( - "Font {} uses standard encoding: {}", - font.getName(), - encodingName); - return false; - } - - if (encoding instanceof DictionaryEncoding) { - log.debug( - "Font {} uses DictionaryEncoding - likely custom", - font.getName()); - return true; - } - - log.debug( - "Font {} uses non-standard encoding: {}", - font.getName(), - encodingName); - return true; - } - } catch (Exception e) { - log.debug( - "Could not determine encoding for font {}: {}", - font.getName(), - e.getMessage()); - } - } - - if (font instanceof org.apache.pdfbox.pdmodel.font.PDType0Font) { - log.debug("Font {} is Type0 (CID) - generally uses 
standard CMaps", font.getName()); - return false; // Be forgiving with CID fonts - } - - log.debug( - "Font {} type {} - assuming standard encoding", - font.getName(), - font.getClass().getSimpleName()); - return false; - - } catch (Exception e) { - log.debug( - "Custom encoding detection failed for font {}: {}", - font.getName(), - e.getMessage()); - return false; // Be forgiving on detection failure - } - } - - private boolean cannotCalculateBasicWidths(PDFont font) { - try { - float spaceWidth = font.getStringWidth(" "); - if (spaceWidth <= 0) { - return true; - } - - String[] testChars = {"a", "A", "0", ".", "e", "!"}; - for (String ch : testChars) { - try { - float width = font.getStringWidth(ch); - if (width > 0) { - return false; // Found at least one character we can measure - } - } catch (Exception e) { - } - } - - return true; // Can't calculate width for any test characters - } catch (Exception e) { - return true; // Font failed basic width calculation - } - } - - private boolean isFontSubset(String fontName) { - if (fontName == null) { - return false; - } - return fontName.matches("^[A-Z]{6}\\+.*"); - } - - private boolean fontSupportsCharacter(PDFont font, String character) { - if (font == null || character == null || character.isEmpty()) { - return false; - } - - try { - byte[] encoded = font.encode(character); - if (encoded.length == 0) { - return false; - } - - float width = font.getStringWidth(character); - return width > 0; - - } catch (Exception e) { - log.debug( - "Character '{}' not supported by font {}: {}", - character, - font.getName(), - e.getMessage()); - return false; + log.warn("Enhanced font detection analysis failed: {}", e.getMessage()); + return true; // Assume problematic if analysis fails } } diff --git a/app/core/src/main/java/stirling/software/SPDF/pdf/TextFinder.java b/app/core/src/main/java/stirling/software/SPDF/pdf/TextFinder.java index 69b4ddc42..432fad101 100644 --- 
a/app/core/src/main/java/stirling/software/SPDF/pdf/TextFinder.java +++ b/app/core/src/main/java/stirling/software/SPDF/pdf/TextFinder.java @@ -10,8 +10,11 @@ import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.TextPosition; +import lombok.extern.slf4j.Slf4j; + import stirling.software.SPDF.model.PDFText; +@Slf4j public class TextFinder extends PDFTextStripper { private final String searchTerm; @@ -67,16 +70,40 @@ public class TextFinder extends PDFTextStripper { String processedSearchTerm = this.searchTerm.trim(); String regex = this.useRegex ? processedSearchTerm : "\\Q" + processedSearchTerm + "\\E"; if (this.wholeWordSearch) { - regex = "\\b" + regex + "\\b"; + if (processedSearchTerm.length() == 1 + && Character.isDigit(processedSearchTerm.charAt(0))) { + regex = "(?= pageTextPositions.size()) { + log.debug( + "Position index {} exceeds available positions ({})", + i, + pageTextPositions.size()); continue; } TextPosition pos = pageTextPositions.get(i); @@ -97,6 +128,27 @@ public class TextFinder extends PDFTextStripper { } } + if (!foundPosition && matchStart < pageTextPositions.size()) { + log.debug( + "Attempting to find nearby positions for match at {}-{}", + matchStart, + matchEnd); + + for (int i = Math.max(0, matchStart - 5); + i < Math.min(pageTextPositions.size(), matchEnd + 5); + i++) { + TextPosition pos = pageTextPositions.get(i); + if (pos != null) { + foundPosition = true; + minX = Math.min(minX, pos.getX()); + maxX = Math.max(maxX, pos.getX() + pos.getWidth()); + minY = Math.min(minY, pos.getY() - pos.getHeight()); + maxY = Math.max(maxY, pos.getY()); + break; + } + } + } + if (foundPosition) { foundTexts.add( new PDFText( @@ -106,13 +158,59 @@ public class TextFinder extends PDFTextStripper { maxX, maxY, matcher.group())); + log.debug( + "Added PDFText for match: page={}, bounds=({},{},{},{}), text='{}'", + getCurrentPageNo() - 1, + minX, + minY, + maxX, + maxY, + 
matcher.group()); + } else { + log.warn( + "Found text match '{}' but no valid position data at {}-{}", + matcher.group(), + matchStart, + matchEnd); } } + log.debug( + "Page {} search complete: found {} matches for '{}'", + getCurrentPageNo(), + matchCount, + processedSearchTerm); + super.endPage(page); } public List getFoundTexts() { return foundTexts; } + + public String getDebugInfo() { + StringBuilder debug = new StringBuilder(); + debug.append("Extracted text length: ").append(pageTextBuilder.length()).append("\n"); + debug.append("Position count: ").append(pageTextPositions.size()).append("\n"); + debug.append("Text content: '") + .append(pageTextBuilder.toString().replace("\n", "\\n").replace("\r", "\\r")) + .append("'\n"); + + String text = pageTextBuilder.toString(); + for (int i = 0; i < Math.min(text.length(), 50); i++) { + char c = text.charAt(i); + TextPosition pos = i < pageTextPositions.size() ? pageTextPositions.get(i) : null; + debug.append( + String.format( + " [%d] '%c' (0x%02X) -> %s\n", + i, + c, + (int) c, + pos != null + ? 
String.format("(%.1f,%.1f)", pos.getX(), pos.getY()) + : "null")); + } + + return debug.toString(); + } } diff --git a/app/core/src/main/java/stirling/software/SPDF/utils/text/TextEncodingHelper.java b/app/core/src/main/java/stirling/software/SPDF/utils/text/TextEncodingHelper.java new file mode 100644 index 000000000..4292e6c52 --- /dev/null +++ b/app/core/src/main/java/stirling/software/SPDF/utils/text/TextEncodingHelper.java @@ -0,0 +1,351 @@ +package stirling.software.SPDF.utils.text; + +import java.io.IOException; + +import org.apache.pdfbox.pdmodel.font.PDFont; +import org.apache.pdfbox.pdmodel.font.PDSimpleFont; +import org.apache.pdfbox.pdmodel.font.encoding.DictionaryEncoding; +import org.apache.pdfbox.pdmodel.font.encoding.Encoding; + +import lombok.extern.slf4j.Slf4j; + +@Slf4j +public class TextEncodingHelper { + + public static boolean canEncodeCharacters(PDFont font, String text) { + if (font == null || text == null || text.isEmpty()) { + return false; + } + + try { + // Step 1: Primary check - full-string encoding (permissive for "good" cases) + byte[] encoded = font.encode(text); + if (encoded.length > 0) { + log.debug( + "Text '{}' has good full-string encoding for font {} - permissively allowing", + text, + font.getName() != null ? font.getName() : "Unknown"); + return true; + } + + // Step 2: Smart array-based fallback for TJ operator-style text + log.debug( + "Full encoding failed for '{}' - using array-based fallback for font {}", + text, + font.getName() != null ? font.getName() : "Unknown"); + + return validateAsCodePointArray(font, text); + + } catch (IOException | IllegalArgumentException e) { + log.debug( + "Encoding exception for text '{}' with font {} - trying array fallback: {}", + text, + font.getName() != null ? 
font.getName() : "Unknown", + e.getMessage()); + + if (isFontSubset(font.getName()) || hasCustomEncoding(font)) { + return validateAsCodePointArray(font, text); + } + + return false; // Non-subset fonts with encoding exceptions are likely problematic + } + } + + private static boolean validateAsCodePointArray(PDFont font, String text) { + int totalCodePoints = 0; + int successfulCodePoints = 0; + + // Iterate through code points (handles surrogates correctly per Unicode docs) + for (int i = 0; i < text.length(); ) { + int codePoint = text.codePointAt(i); + String charStr = new String(Character.toChars(codePoint)); + totalCodePoints++; + + try { + // Test encoding for this code point + byte[] charEncoded = font.encode(charStr); + if (charEncoded.length > 0) { + float charWidth = font.getStringWidth(charStr); + + if (charWidth >= 0) { + successfulCodePoints++; + log.debug( + "Code point '{}' (U+{}) encoded successfully", + charStr, + Integer.toHexString(codePoint).toUpperCase()); + } else { + log.debug( + "Code point '{}' (U+{}) has invalid width: {}", + charStr, + Integer.toHexString(codePoint).toUpperCase(), + charWidth); + } + } else { + log.debug( + "Code point '{}' (U+{}) encoding failed - empty result", + charStr, + Integer.toHexString(codePoint).toUpperCase()); + } + } catch (IOException | IllegalArgumentException e) { + log.debug( + "Code point '{}' (U+{}) validation failed: {}", + charStr, + Integer.toHexString(codePoint).toUpperCase(), + e.getMessage()); + } + + i += Character.charCount(codePoint); // Handle surrogates properly + } + + double successRate = + totalCodePoints > 0 ? (double) successfulCodePoints / totalCodePoints : 0; + boolean isAcceptable = successRate >= 0.95; + + log.debug( + "Array validation for '{}': {}/{} code points successful ({:.1f}%) - {}", + text, + successfulCodePoints, + totalCodePoints, + successRate * 100, + isAcceptable ? 
"ALLOWING" : "rejecting"); + + return isAcceptable; + } + + public static boolean isTextSegmentRemovable(PDFont font, String text) { + if (font == null || text == null || text.isEmpty()) { + return false; + } + + // Log the attempt + log.debug( + "Evaluating text segment for removal: '{}' with font {}", + text, + font.getName() != null ? font.getName() : "Unknown Font"); + + if (isSimpleCharacter(text)) { + try { + font.encode(text); + font.getStringWidth(text); + log.debug( + "Text '{}' is a simple character and passed validation - allowing removal", + text); + return true; + } catch (Exception e) { + log.debug( + "Simple character '{}' failed basic validation with font {}: {}", + text, + font.getName() != null ? font.getName() : "Unknown", + e.getMessage()); + return false; + } + } + + // For complex text, require comprehensive validation + return isTextFullyRemovable(font, text); + } + + public static boolean isTextFullyRemovable(PDFont font, String text) { + if (font == null || text == null || text.isEmpty()) { + return false; + } + + try { + // Check 1: Verify encoding capability using new smart approach + if (!canEncodeCharacters(font, text)) { + log.debug( + "Text '{}' failed encoding validation for font {}", + text, + font.getName() != null ? font.getName() : "Unknown"); + return false; + } + + // Check 2: Validate width calculation capability + float width = font.getStringWidth(text); + if (width < 0) { // Allow zero width (invisible chars) but reject negative (invalid) + log.debug( + "Text '{}' has invalid width {} for font {}", + text, + width, + font.getName() != null ? font.getName() : "Unknown"); + return false; // Invalid metrics prevent accurate removal + } + + // Check 3: Verify font descriptor completeness for redaction area calculation + if (font.getFontDescriptor() == null) { + log.debug( + "Missing font descriptor for font {}", + font.getName() != null ? 
font.getName() : "Unknown"); + return false; + } + + // Check 4: Test bounding box calculation for redaction area + try { + font.getFontDescriptor().getFontBoundingBox(); + } catch (IllegalArgumentException e) { + log.debug( + "Font bounding box unavailable for font {}: {}", + font.getName() != null ? font.getName() : "Unknown", + e.getMessage()); + return false; + } + + log.debug( + "Text '{}' passed comprehensive validation for font {}", + text, + font.getName() != null ? font.getName() : "Unknown"); + return true; + + } catch (IOException e) { + log.debug( + "Text '{}' failed validation for font {} due to IO error: {}", + text, + font.getName() != null ? font.getName() : "Unknown", + e.getMessage()); + return false; + } catch (IllegalArgumentException e) { + log.debug( + "Text '{}' failed validation for font {} due to argument error: {}", + text, + font.getName() != null ? font.getName() : "Unknown", + e.getMessage()); + return false; + } + }
+
+    /**
+     * Tells whether {@code text} is short and made only of letters, digits, whitespace and
+     * common ASCII punctuation.
+     *
+     * @param text candidate text; {@code null}, empty, or longer than 20 chars yields
+     *     {@code false}
+     * @return {@code true} when every {@code char} of {@code text} is in the accepted set
+     */
+    private static boolean isSimpleCharacter(String text) {
+        if (text == null || text.isEmpty()) {
+            return false;
+        }
+
+        if (text.length() > 20) {
+            return false;
+        }
+
+        for (int i = 0; i < text.length(); i++) {
+            char c = text.charAt(i);
+
+            // Allow letters, digits, and whitespace (most common cases)
+            if (Character.isLetterOrDigit(c) || Character.isWhitespace(c)) {
+                continue;
+            }
+
+            // Allow common ASCII punctuation
+            if (c >= 32 && c <= 126 && ".,!?;:()-[]{}\"'/@#$%&*+=<>|\\~`".indexOf(c) >= 0) {
+                continue;
+            }
+
+            return false;
+        }
+
+        return true;
+    }
+
+    /**
+     * Reports whether a font appears to use a custom (non-standard) encoding.
+     *
+     * <p>Only {@code PDSimpleFont} instances are inspected: a {@code DictionaryEncoding}, or an
+     * encoding whose class name contains "Custom" or "Dictionary", counts as custom, and so
+     * does any exception thrown while reading the encoding. Every other font, including
+     * {@code PDType0Font}, is reported as standard.
+     *
+     * @param font font to inspect; {@code null} yields {@code false}, matching the null
+     *     handling of the other public checks in this class
+     * @return {@code true} when a custom encoding was detected or detection itself failed
+     */
+    public static boolean hasCustomEncoding(PDFont font) {
+        if (font == null) {
+            // Without this guard the fall-through logging below would throw a
+            // NullPointerException on font.getName() / font.getClass().
+            return false;
+        }
+
+        try {
+            if (font instanceof PDSimpleFont simpleFont) {
+                try {
+                    Encoding encoding = simpleFont.getEncoding();
+                    if (encoding != null) {
+                        // Check for dictionary-based custom encodings
+                        if (encoding instanceof DictionaryEncoding) {
+                            log.debug("Font {} uses DictionaryEncoding (custom)", font.getName());
+                            return true;
+                        }
+
+                        String encodingName = encoding.getClass().getSimpleName();
+                        if (encodingName.contains("Custom")
+                                || encodingName.contains("Dictionary")) {
+                            log.debug(
+                                    "Font {} uses custom encoding: {}",
+                                    font.getName(),
+                                    encodingName);
+                            return true;
+                        }
+                    }
+                } catch (Exception e) {
+                    log.debug(
+                            "Encoding detection failed for font {}: {}",
+                            font.getName(),
+                            e.getMessage());
+                    return true; // Assume custom if detection fails
+                }
+            }
+
+            if (font instanceof org.apache.pdfbox.pdmodel.font.PDType0Font) {
+                log.debug(
+                        "Font {} is Type0 (CID) - generally uses standard CMaps",
+                        font.getName() != null ? font.getName() : "Unknown");
+                return false;
+            }
+
+            log.debug(
+                    "Font {} type {} - assuming standard encoding",
+                    font.getName() != null ? font.getName() : "Unknown",
+                    font.getClass().getSimpleName());
+            return false;
+
+        } catch (IllegalArgumentException e) {
+            log.debug(
+                    "Custom encoding detection failed for font {}: {}",
+                    font.getName() != null ? font.getName() : "Unknown",
+                    e.getMessage());
+            return false; // Be forgiving on detection failure
+        }
+    }
+
+ public static boolean fontSupportsCharacter(PDFont font, String character) { + if (font == null || character == null || character.isEmpty()) { + return false; + } + + try { + byte[] encoded = font.encode(character); + if (encoded.length == 0) { + return false; + } + + float width = font.getStringWidth(character); + return width > 0; + + } catch (IOException | IllegalArgumentException e) { + log.debug( + "Character '{}' not supported by font {}: {}", + character, + font.getName() != null ? 
font.getName() : "Unknown", + e.getMessage()); + return false; + } + }
+
+    /**
+     * Tests whether a font name starts with six upper-case letters {@code A-Z} followed by
+     * {@code '+'}, the prefix shape this class treats as a subset-font tag.
+     *
+     * @param fontName font name to test; {@code null} yields {@code false}
+     * @return {@code true} when the name matches {@code ^[A-Z]{6}\+.*}
+     */
+    public static boolean isFontSubset(String fontName) {
+        if (fontName == null) {
+            return false;
+        }
+        return fontName.matches("^[A-Z]{6}\\+.*");
+    }
+
+    /**
+     * Probes whether the font can report usable widths for a few basic characters.
+     *
+     * <p>The space character must have a positive width; after that, the first of
+     * {@code "a", "A", "0", ".", "e", "!"} that reports a positive width makes the font pass.
+     *
+     * @param font font to probe; {@code null} yields {@code false}
+     * @return {@code true} when the space width and at least one probe width are positive
+     */
+    public static boolean canCalculateBasicWidths(PDFont font) {
+        if (font == null) {
+            return false;
+        }
+
+        try {
+            float spaceWidth = font.getStringWidth(" ");
+            if (spaceWidth <= 0) {
+                return false;
+            }
+
+            String[] testChars = {"a", "A", "0", ".", "e", "!"};
+            for (String ch : testChars) {
+                try {
+                    float width = font.getStringWidth(ch);
+                    if (width > 0) {
+                        return true;
+                    }
+                } catch (IOException | IllegalArgumentException e) {
+                    // Best effort: one unmeasurable probe character is not fatal, keep
+                    // trying the remaining ones, but leave a trace for diagnosis.
+                    log.debug(
+                            "Width probe for '{}' failed on font {}: {}",
+                            ch,
+                            font.getName() != null ? font.getName() : "Unknown",
+                            e.getMessage());
+                }
+            }
+
+            return false; // Can't calculate width for any test characters
+        } catch (IOException | IllegalArgumentException e) {
+            return false; // Font failed basic width calculation
+        }
+    }
+}
diff --git a/app/core/src/main/java/stirling/software/SPDF/utils/text/TextFinderUtils.java b/app/core/src/main/java/stirling/software/SPDF/utils/text/TextFinderUtils.java new file mode 100644 index 000000000..4c7d86abd --- /dev/null +++ b/app/core/src/main/java/stirling/software/SPDF/utils/text/TextFinderUtils.java @@ -0,0 +1,140 @@ +package stirling.software.SPDF.utils.text; + +import java.util.ArrayList; +import java.util.List; +import java.util.Set; +import java.util.regex.Pattern; + +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDResources; + +import lombok.extern.slf4j.Slf4j; + +@Slf4j +public class TextFinderUtils { + + public static boolean validateFontReliability(org.apache.pdfbox.pdmodel.font.PDFont font) { + if (font == null) { + return false; + } + + if (font.isDamaged()) { + log.debug( + "Font {} is marked as damaged - using TextEncodingHelper validation", + font.getName()); + } + + if (TextEncodingHelper.canCalculateBasicWidths(font)) { + log.debug( + "Font {} passed basic width calculations - considering reliable", + font.getName()); + return true; + } + + String[] basicTests = {"1", "2", "3", "a", "A", "e", "E", " "}; + 
+ int workingChars = 0; + for (String testChar : basicTests) { + if (TextEncodingHelper.canEncodeCharacters(font, testChar)) { + workingChars++; + } + } + + if (workingChars > 0) { + log.debug( + "Font {} can process {}/{} basic characters - considering reliable", + font.getName(), + workingChars, + basicTests.length); + return true; + } + + log.debug("Font {} failed all basic tests - considering unreliable", font.getName()); + return false; + }
+
+    /**
+     * Compiles one case-insensitive, Unicode-aware {@link Pattern} per usable search term.
+     *
+     * <p>Terms are trimmed first; {@code null} and blank terms are skipped. A term whose
+     * pattern cannot be built is logged at warn level and skipped, so one bad term does not
+     * prevent the others from being compiled.
+     *
+     * @param searchTerms terms to search for; {@code null} yields an empty list
+     * @param useRegex when {@code true} each term is used as a regular expression, otherwise
+     *     it is quoted with {@link Pattern#quote(String)}
+     * @param wholeWordSearch when {@code true} the pattern is passed through
+     *     {@code applyWordBoundaries}
+     * @return a new mutable list with the compiled patterns, in iteration order of the set
+     */
+    public static List<Pattern> createOptimizedSearchPatterns(
+            Set<String> searchTerms, boolean useRegex, boolean wholeWordSearch) {
+        List<Pattern> patterns = new ArrayList<>();
+        if (searchTerms == null) {
+            return patterns;
+        }
+
+        for (String term : searchTerms) {
+            if (term == null) {
+                continue;
+            }
+            // Trim once and reuse, instead of re-trimming at every use.
+            String trimmed = term.trim();
+            if (trimmed.isEmpty()) {
+                continue;
+            }
+
+            try {
+                String patternString = useRegex ? trimmed : Pattern.quote(trimmed);
+
+                if (wholeWordSearch) {
+                    patternString = applyWordBoundaries(trimmed, patternString);
+                }
+
+                Pattern pattern =
+                        Pattern.compile(
+                                patternString, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
+                patterns.add(pattern);
+
+                log.debug("Created search pattern: '{}' -> '{}'", trimmed, patternString);
+
+            } catch (Exception e) {
+                // Deliberately best-effort: an invalid user-supplied regex must not abort
+                // the whole search.
+                log.warn("Failed to create pattern for term '{}': {}", term, e.getMessage());
+            }
+        }
+
+        return patterns;
+    }
+
+ private static String applyWordBoundaries(String originalTerm, String patternString) { + if (originalTerm.length() == 1 && Character.isDigit(originalTerm.charAt(0))) { + return "(? 0 && (completelyUnusableFonts * 2 > totalFonts); + log.debug( + "Page font analysis: {}/{} fonts are completely unusable - page {} problematic", + completelyUnusableFonts, + totalFonts, + hasProblems ? 
"IS" : "is NOT"); + + return hasProblems; + + } catch (Exception e) { + log.warn("Font analysis failed for page: {}", e.getMessage()); + return false; // Be permissive if analysis fails + } + } +} diff --git a/app/core/src/main/java/stirling/software/SPDF/utils/text/WidthCalculator.java b/app/core/src/main/java/stirling/software/SPDF/utils/text/WidthCalculator.java new file mode 100644 index 000000000..fde3809c4 --- /dev/null +++ b/app/core/src/main/java/stirling/software/SPDF/utils/text/WidthCalculator.java @@ -0,0 +1,136 @@ +package stirling.software.SPDF.utils.text; + +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.pdmodel.font.PDFont; + +import lombok.extern.slf4j.Slf4j; + +@Slf4j +public class WidthCalculator { + + private static final int FONT_SCALE_FACTOR = 1000; + + public static float calculateAccurateWidth(PDFont font, String text, float fontSize) { + if (font == null || text == null || text.isEmpty() || fontSize <= 0) { + return 0; + } + + if (!TextEncodingHelper.canEncodeCharacters(font, text)) { + log.debug( + "Text cannot be encoded by font {}, using fallback width calculation", + font.getName()); + return calculateFallbackWidth(font, text, fontSize); + } + + try { + float rawWidth = font.getStringWidth(text); + float scaledWidth = (rawWidth / FONT_SCALE_FACTOR) * fontSize; + + log.debug( + "Direct width calculation successful for font {}: {} -> {}", + font.getName(), + rawWidth, + scaledWidth); + return scaledWidth; + + } catch (Exception e) { + log.debug( + "Direct width calculation failed for font {}: {}", + font.getName(), + e.getMessage()); + return calculateWidthWithCharacterIteration(font, text, fontSize); + } + } + + private static float calculateWidthWithCharacterIteration( + PDFont font, String text, float fontSize) { + try { + float totalWidth = 0; + + for (int i = 0; i < text.length(); i++) { + String character = text.substring(i, i + 1); + try { + byte[] encoded = font.encode(character); + if 
(encoded.length > 0) { + int glyphCode = encoded[0] & 0xFF; + float glyphWidth = font.getWidth(glyphCode); + + if (glyphWidth == 0) { + try { + glyphWidth = font.getWidthFromFont(glyphCode); + } catch (Exception e2) { + glyphWidth = font.getAverageFontWidth(); + } + } + + totalWidth += (glyphWidth / FONT_SCALE_FACTOR) * fontSize; + } else { + totalWidth += (font.getAverageFontWidth() / FONT_SCALE_FACTOR) * fontSize; + } + } catch (Exception e2) { + totalWidth += (font.getAverageFontWidth() / FONT_SCALE_FACTOR) * fontSize; + } + } + + log.debug("Character iteration width calculation: {}", totalWidth); + return totalWidth; + + } catch (Exception e) { + log.debug("Character iteration failed: {}", e.getMessage()); + return calculateFallbackWidth(font, text, fontSize); + } + } + + private static float calculateFallbackWidth(PDFont font, String text, float fontSize) { + try { + if (font.getFontDescriptor() != null + && font.getFontDescriptor().getFontBoundingBox() != null) { + + PDRectangle bbox = font.getFontDescriptor().getFontBoundingBox(); + float avgCharWidth = + bbox.getWidth() / FONT_SCALE_FACTOR * 0.6f; // Conservative estimate + float fallbackWidth = text.length() * avgCharWidth * fontSize; + + log.debug("Bounding box fallback width: {}", fallbackWidth); + return fallbackWidth; + } + + float avgWidth = font.getAverageFontWidth(); + float fallbackWidth = (text.length() * avgWidth / FONT_SCALE_FACTOR) * fontSize; + + log.debug("Average width fallback: {}", fallbackWidth); + return fallbackWidth; + + } catch (Exception e) { + float conservativeWidth = text.length() * 0.5f * fontSize; + log.debug( + "Conservative fallback width for font {}: {}", + font.getName(), + conservativeWidth); + return conservativeWidth; + } + } + + public static boolean isWidthCalculationReliable(PDFont font) { + if (font == null) { + return false; + } + + if (font.isDamaged()) { + log.debug("Font {} is damaged", font.getName()); + return false; + } + + if 
(!TextEncodingHelper.canCalculateBasicWidths(font)) { + log.debug("Font {} cannot perform basic width calculations", font.getName()); + return false; + } + + if (TextEncodingHelper.hasCustomEncoding(font)) { + log.debug("Font {} has custom encoding", font.getName()); + return false; + } + + return true; + } +} diff --git a/stirling-pdf/src/test/java/stirling/software/SPDF/pdf/TextFinderTest.java b/stirling-pdf/src/test/java/stirling/software/SPDF/pdf/TextFinderTest.java index 246f10af7..ebb5bebf7 100644 --- a/stirling-pdf/src/test/java/stirling/software/SPDF/pdf/TextFinderTest.java +++ b/stirling-pdf/src/test/java/stirling/software/SPDF/pdf/TextFinderTest.java @@ -1,7 +1,5 @@ package stirling.software.SPDF.pdf; -import static org.junit.jupiter.api.Assertions.*; - import java.io.IOException; import java.util.List; @@ -12,6 +10,11 @@ import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.font.PDType1Font; import org.apache.pdfbox.pdmodel.font.Standard14Fonts; import org.junit.jupiter.api.AfterEach; +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.DisplayName; import org.junit.jupiter.api.Nested; @@ -468,6 +471,106 @@ class TextFinderTest { } }
+
+    /**
+     * Searches for one-character terms (single digits and a single letter), with and without
+     * whole-word matching, in text where the same character also appears inside longer
+     * tokens.
+     */
+    @Nested
+    @DisplayName("Single Character and Digit Tests")
+    class SingleCharacterAndDigitTests {
+
+        @Test
+        @DisplayName("Should find single digits in various contexts with whole word search")
+        void findSingleDigitsWholeWord() throws IOException {
+            String content = "Item 1 of 5 costs $2.50. Order number: 1234. Reference: A1B.";
+            addTextToPage(content);
+
+            TextFinder textFinder = new TextFinder("1", false, true);
+            textFinder.getText(document);
+            List<PDFText> foundTexts = textFinder.getFoundTexts();
+
+            assertEquals(1, foundTexts.size(),
+                "Should find exactly one standalone '1', not the ones embedded in other numbers/codes");
+            assertEquals("1", foundTexts.get(0).getText());
+        }
+
+        @Test
+        @DisplayName("Should find single digits without whole word search")
+        void findSingleDigitsNoWholeWord() throws IOException {
+            String content = "Item 1 of 5 costs $2.50. Order number: 1234. Reference: A1B.";
+            addTextToPage(content);
+
+            TextFinder textFinder = new TextFinder("1", false, false);
+            textFinder.getText(document);
+            List<PDFText> foundTexts = textFinder.getFoundTexts();
+
+            assertTrue(foundTexts.size() >= 3,
+                "Should find multiple instances of '1' including standalone, in '1234', and in 'A1B'");
+        }
+
+        @Test
+        @DisplayName("Should find single characters in various contexts")
+        void findSingleCharacters() throws IOException {
+            String content = "Grade: A. Section B has item A-1. The letter A appears multiple times.";
+            addTextToPage(content);
+
+            TextFinder textFinder = new TextFinder("A", false, true);
+            textFinder.getText(document);
+            List<PDFText> foundTexts = textFinder.getFoundTexts();
+
+            assertTrue(foundTexts.size() >= 2, "Should find multiple standalone 'A' characters");
+
+            for (PDFText found : foundTexts) {
+                assertEquals("A", found.getText());
+            }
+        }
+
+        @Test
+        @DisplayName("Should handle digits at word boundaries correctly")
+        void findDigitsAtWordBoundaries() throws IOException {
+            String content = "Numbers: 1, 2, 3. Code: 123. Version: 1.0. Item1 and Item2.";
+            addTextToPage(content);
+
+            TextFinder textFinder1 = new TextFinder("1", false, true);
+            textFinder1.getText(document);
+            List<PDFText> foundTexts1 = textFinder1.getFoundTexts();
+
+            assertEquals(1, foundTexts1.size(),
+                "Should find only the standalone '1' at the beginning");
+
+            TextFinder textFinder2 = new TextFinder("2", false, true);
+            textFinder2.getText(document);
+            List<PDFText> foundTexts2 = textFinder2.getFoundTexts();
+
+            assertEquals(1, foundTexts2.size(),
+                "Should find only the standalone '2' in the number list");
+        }
+
+        @Test
+        @DisplayName("Should handle special characters and punctuation boundaries")
+        void findDigitsWithPunctuationBoundaries() throws IOException {
+            String content = "Items: (1), [2], {3}, item#4, price$5, and 6%.";
+            addTextToPage(content);
+
+            TextFinder textFinder = new TextFinder("1", false, true);
+            textFinder.getText(document);
+            List<PDFText> foundTexts = textFinder.getFoundTexts();
+
+            assertEquals(1, foundTexts.size(), "Should find '1' surrounded by parentheses");
+            assertEquals("1", foundTexts.get(0).getText());
+        }
+
+        @Test
+        @DisplayName("Should handle edge case with spacing and formatting")
+        void findDigitsWithSpacingIssues() throws IOException {
+            String content = "List: 1 , 2 , 3 and item 1 here.";
+            addTextToPage(content);
+
+            TextFinder textFinder = new TextFinder("1", false, true);
+            textFinder.getText(document);
+            List<PDFText> foundTexts = textFinder.getFoundTexts();
+
+            assertEquals(2, foundTexts.size(),
+                "Should find both '1' instances despite spacing variations");
+        }
+    }
+ // Helper methods private void addTextToPage(String text) throws IOException { addTextToPage(page, text);