From 7db58ad6ddac56879c07005af8d27f1c059e8c50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20Sz=C3=BCcs?= Date: Sun, 24 Aug 2025 16:59:09 +0200 Subject: [PATCH] enhance text handling and encoding validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Balázs Szücs --- .../software/SPDF/pdf/TextFinder.java | 88 +- .../SPDF/service/RedactionService.java | 1161 ++++++++++++----- .../SPDF/utils/text/TextDecodingHelper.java | 279 ++-- .../SPDF/utils/text/TextEncodingHelper.java | 652 ++++++--- .../SPDF/utils/text/TextFinderUtils.java | 178 ++- .../SPDF/utils/text/WidthCalculator.java | 675 +++++----- 6 files changed, 1914 insertions(+), 1119 deletions(-) diff --git a/app/core/src/main/java/stirling/software/SPDF/pdf/TextFinder.java b/app/core/src/main/java/stirling/software/SPDF/pdf/TextFinder.java index eaa5d2981..3f693d400 100644 --- a/app/core/src/main/java/stirling/software/SPDF/pdf/TextFinder.java +++ b/app/core/src/main/java/stirling/software/SPDF/pdf/TextFinder.java @@ -6,23 +6,20 @@ import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; -import lombok.Getter; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.TextPosition; -import lombok.extern.slf4j.Slf4j; +import lombok.Getter; import stirling.software.SPDF.model.PDFText; -@Slf4j public class TextFinder extends PDFTextStripper { private final String searchTerm; private final boolean useRegex; private final boolean wholeWordSearch; - @Getter - private final List foundTexts = new ArrayList<>(); + @Getter private final List foundTexts = new ArrayList<>(); private final List pageTextPositions = new ArrayList<>(); private final StringBuilder pageTextBuilder = new StringBuilder(); @@ -45,20 +42,39 @@ public class TextFinder extends PDFTextStripper { @Override protected void writeString(String text, List textPositions) { - pageTextBuilder.append(text); - pageTextPositions.addAll(textPositions); + for (TextPosition tp : textPositions) { + if (tp == null) continue; + String u = tp.getUnicode(); + if (u == null) continue; + for (int i = 0; i < u.length(); ) { + int cp = u.codePointAt(i); + pageTextBuilder.append(Character.toChars(cp)); + // Add one position per code unit appended (1-2 chars depending on surrogate) + int codeUnits = Character.charCount(cp); + for (int k = 0; k < codeUnits; k++) { + pageTextPositions.add(tp); + } + i += codeUnits; + } + } } @Override protected void writeWordSeparator() { - pageTextBuilder.append(getWordSeparator()); - pageTextPositions.add(null); // Placeholder for separator + String sep = getWordSeparator(); + pageTextBuilder.append(sep); + for (int i = 0; i < sep.length(); i++) { + pageTextPositions.add(null); + } } @Override protected void writeLineSeparator() { - pageTextBuilder.append(getLineSeparator()); - pageTextPositions.add(null); // Placeholder for separator + String sep = getLineSeparator(); + pageTextBuilder.append(sep); + for (int i = 0; i < sep.length(); i++) { + pageTextPositions.add(null); + } } @Override @@ -91,27 +107,10 @@ public class TextFinder extends PDFTextStripper { Pattern pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); Matcher matcher = pattern.matcher(text); - log.debug( - "Searching for '{}' in page {} with regex '{}' (wholeWord: {}, useRegex: {})", - processedSearchTerm, - getCurrentPageNo(), - regex, - wholeWordSearch, - useRegex); - - int matchCount = 0; while (matcher.find()) { - matchCount++; int matchStart = matcher.start(); int matchEnd = matcher.end(); - log.debug( - "Found match #{} at positions {}-{}: '{}'", - matchCount, - matchStart, - matchEnd, - matcher.group()); - float minX = Float.MAX_VALUE; float minY = Float.MAX_VALUE; float maxX = Float.MIN_VALUE; @@ -119,13 +118,7 @@ public class TextFinder extends PDFTextStripper { boolean foundPosition = false; for (int i = matchStart; i < matchEnd; i++) { - if (i >= pageTextPositions.size()) { - log.debug( - "Position index {} exceeds available positions ({})", - i, - pageTextPositions.size()); - continue; - } + if (i >= pageTextPositions.size()) continue; TextPosition pos = pageTextPositions.get(i); if (pos != null) { foundPosition = true; @@ -137,11 +130,6 @@ public class TextFinder extends PDFTextStripper { } if (!foundPosition && matchStart < pageTextPositions.size()) { - log.debug( - "Attempting to find nearby positions for match at {}-{}", - matchStart, - matchEnd); - for (int i = Math.max(0, matchStart - 5); i < Math.min(pageTextPositions.size(), matchEnd + 5); i++) { @@ -166,29 +154,11 @@ public class TextFinder extends PDFTextStripper { maxX, maxY, matcher.group())); - log.debug( - "Added PDFText for match: page={}, bounds=({},{},{},{}), text='{}'", - getCurrentPageNo() - 1, - minX, - minY, - maxX, - maxY, - matcher.group()); } else { - log.warn( - "Found text match '{}' but no valid position data at {}-{}", - matcher.group(), - matchStart, - matchEnd); + // no position info } } - log.debug( - "Page {} search complete: found {} matches for '{}'", - getCurrentPageNo(), - matchCount, - processedSearchTerm); - super.endPage(page); } diff --git a/app/core/src/main/java/stirling/software/SPDF/service/RedactionService.java b/app/core/src/main/java/stirling/software/SPDF/service/RedactionService.java index b5143ead5..6d344824a 100644 --- a/app/core/src/main/java/stirling/software/SPDF/service/RedactionService.java +++ b/app/core/src/main/java/stirling/software/SPDF/service/RedactionService.java @@ -70,7 +70,7 @@ public class RedactionService { private static final int FONT_SCALE_FACTOR = 1000; private static final Set TEXT_SHOWING_OPERATORS = Set.of("Tj", "TJ", "'", "\""); private static final COSString EMPTY_COS_STRING = new COSString(""); - private static final int MAX_SWEEPS = 3; + private static final int MAX_SWEEPS = 5; private boolean aggressiveMode = false; private Map> aggressiveSegMatches = null; private final CustomPDFDocumentFactory pdfDocumentFactory; @@ -158,9 +158,9 @@ public class RedactionService { String[] parts = pageNumbers.split(","); for (String part : parts) { - part = part.trim(); - if (part.contains("-")) { - String[] range = part.split("-"); + String trim = part.trim(); + if (trim.contains("-")) { + String[] range = trim.split("-"); if (range.length == 2) { try { int start = Integer.parseInt(range[0].trim()); @@ -173,7 +173,7 @@ public class RedactionService { } } else { try { - result.add(Integer.parseInt(part)); + result.add(Integer.parseInt(trim)); } catch (NumberFormatException ignored) { } } @@ -255,24 +255,57 @@ public class RedactionService { boolean useRegex, boolean wholeWordSearch) { try { + log.debug("Checking page {} for {} target words", pageIndex + 1, targetWords.size()); + for (String term : targetWords) { if (term == null || term.isBlank()) { + log.debug("Skipping empty/null term"); continue; } + + log.debug("Searching for term '{}' on page {}", term, pageIndex + 1); TextFinder finder = new TextFinder(term, useRegex, wholeWordSearch); finder.setStartPage(pageIndex + 1); finder.setEndPage(pageIndex + 1); finder.getText(document); - for (PDFText ft : finder.getFoundTexts()) { + + List foundTexts = finder.getFoundTexts(); + log.debug( + "Found {} instances of '{}' on page {}", + foundTexts.size(), + term, + pageIndex + 1); + + for (PDFText ft : foundTexts) { if (ft.getPageIndex() == pageIndex) { + log.warn( + "FOUND REMAINING TARGET: '{}' on page {} - text content: '{}'", + term, + pageIndex + 1, + ft.getText() != null ? ft.getText() : "null"); return true; } } + + if (!foundTexts.isEmpty()) { + log.debug( + "Found instances but not on target page {} (found on pages: {})", + pageIndex + 1, + foundTexts.stream() + .map(ft -> String.valueOf(ft.getPageIndex() + 1)) + .distinct() + .collect(java.util.stream.Collectors.joining(", "))); + } } + + log.debug("Page {} contains no target words", pageIndex + 1); + return false; + } catch (Exception e) { + log.error("Error checking page {} for targets: {}", pageIndex + 1, e.getMessage()); + log.warn("Due to error, assuming page {} may still contain targets", pageIndex + 1); return true; } - return false; } private static boolean documentStillContainsTargets( @@ -280,40 +313,86 @@ public class RedactionService { Set targetWords, boolean useRegex, boolean wholeWordSearch) { + log.debug("Verifying if document still contains targets: {}", targetWords); + try { int idx = -1; - for (int i = 0; i < document.getNumberOfPages(); i++) { + final int numberOfPages = document.getNumberOfPages(); + for (int i = 0; i < numberOfPages; i++) { idx++; + log.debug("Checking page {} for remaining targets", idx + 1); + if (pageStillContainsTargets( document, idx, targetWords, useRegex, wholeWordSearch)) { + log.warn("Page {} still contains target words", idx + 1); return true; } } - } catch (Exception ignored) { + + log.info("Document verification completed - no targets found"); + return false; + + } catch (Exception e) { + log.error("Error during document verification: {}", e.getMessage()); + log.warn("Due to verification error, assuming targets may still exist"); return true; } - return false; } public static Map> findTextToRedact( PDDocument document, String[] listOfText, boolean useRegex, boolean wholeWordSearch) { Map> allFoundTextsByPage = new HashMap<>(); + log.info( + "Starting text search with {} terms, useRegex={}, wholeWordSearch={}", + listOfText.length, + useRegex, + wholeWordSearch); + + int totalInstancesFound = 0; + for (String text : listOfText) { String t = text.trim(); if (t.isEmpty()) { + log.debug("Skipping empty search term"); continue; } + + log.info("Searching for term: '{}'", t); try { TextFinder finder = new TextFinder(t, useRegex, wholeWordSearch); finder.getText(document); - for (PDFText found : finder.getFoundTexts()) { + List foundTexts = finder.getFoundTexts(); + + log.info("Found {} instances of '{}' across the document", foundTexts.size(), t); + + for (PDFText found : foundTexts) { allFoundTextsByPage .computeIfAbsent(found.getPageIndex(), k -> new ArrayList<>()) .add(found); + + log.debug( + "Found instance on page {}: '{}'", + found.getPageIndex() + 1, + found.getText() != null ? found.getText() : "null"); + totalInstancesFound++; } - } catch (Exception ignored) { + } catch (Exception e) { + log.error("Error searching for term '{}': {}", t, e.getMessage()); } } + + log.info("Total instances found across all search terms: {}", totalInstancesFound); + log.info( + "Text found on {} pages out of {} total pages", + allFoundTextsByPage.size(), + document.getNumberOfPages()); + + // Log distribution by page + allFoundTextsByPage.forEach( + (pageIndex, texts) -> { + log.info("Page {}: {} instances", pageIndex + 1, texts.size()); + }); + return allFoundTextsByPage; } @@ -457,8 +536,6 @@ public class RedactionService { } } - // Removed ad-hoc width fallbacks; WidthCalculator is the single source of truth now - private static WipeResult wipeAllTextShowingOperators(List tokens) { List newTokens = new ArrayList<>(tokens); int modifications = 0; @@ -623,16 +700,13 @@ public class RedactionService { markedContentStack.push(i); if ("BDC".equals(name) && i > 0) { Object prev = tokens.get(i - 1); - if (prev instanceof COSDictionary dict) { - if (removeSemanticProperties(dict, removeTU)) { - modifications++; - } + if (prev instanceof COSDictionary dict + && removeSemanticProperties(dict, removeTU)) { + modifications++; } } - } else if ("EMC".equals(name)) { - if (!markedContentStack.isEmpty()) { - markedContentStack.pop(); - } + } else if ("EMC".equals(name) && !markedContentStack.isEmpty()) { + markedContentStack.pop(); } } } @@ -647,36 +721,80 @@ public class RedactionService { } } - public boolean performTextReplacement( - PDDocument document, - Map> allFoundTextsByPage, - String[] listOfText, - boolean useRegex, - boolean wholeWordSearchBool) { - if (allFoundTextsByPage.isEmpty()) { - return false; - } - try { - Set allSearchTerms = - Arrays.stream(listOfText) - .map(String::trim) - .filter(s -> !s.isEmpty()) - .collect(Collectors.toSet()); - for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) { - for (PDPage page : document.getPages()) { - List filtered = - createTokensWithoutTargetText( - document, page, allSearchTerms, useRegex, wholeWordSearchBool); - writeFilteredContentStream(document, page, filtered); - } - if (!documentStillContainsTargets( - document, allSearchTerms, useRegex, wholeWordSearchBool)) { - break; - } + private static String normalizeTextForRedaction(String text) { + if (text == null) return null; + + StringBuilder normalized = new StringBuilder(); + for (int i = 0; i < text.length(); i++) { + char c = text.charAt(i); + + if ((int) c >= 65488) { + normalized.append(' '); + } else if (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r') { + normalized.append(' '); + } else { + normalized.append(c); } - return false; + } + + return normalized.toString(); + } + + private static boolean isTextSafeForRedaction(String text) { + if (text == null || text.isEmpty()) return true; + + for (int i = 0; i < text.length(); i++) { + char c = text.charAt(i); + int codePoint = c; + + if (codePoint >= 65488) { + return false; // Contains problematic high-range characters + } + if (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r') { + return false; // Contains problematic control characters + } + } + + return true; + } + + private static List deepCopyTokens(List original) { + List copy = new ArrayList<>(original.size()); + for (Object obj : original) { + if (obj instanceof COSDictionary dict) { + COSDictionary newDict = new COSDictionary(); + for (COSName key : dict.keySet()) { + newDict.setItem(key, dict.getDictionaryObject(key)); + } + copy.add(newDict); + } else if (obj instanceof List nestedList + && !nestedList.isEmpty() + && nestedList.get(0) instanceof Object) { + try { + @SuppressWarnings("unchecked") + List objectList = (List) nestedList; + copy.add(deepCopyTokens(objectList)); + } catch (ClassCastException e) { + copy.add(obj); // Fallback to shallow copy if cast fails + } + } else { + copy.add(obj); // Shallow copy for primitives/operators + } + } + return copy; + } + + private static boolean isFontSuitableForWidthCalculation(PDFont font) { + try { + String fontName = font.getName(); + if (fontName == null + || isProperFontSubset(fontName) + || fontName.toLowerCase().matches(".*(hoepap|temp|generated).*")) { + return false; + } + return hasReliableWidthMetrics(font); } catch (Exception e) { - return true; + return false; } } @@ -791,41 +909,30 @@ public class RedactionService { return fontName.charAt(6) == '+'; } - List createTokensWithoutTargetText( - PDDocument document, - PDPage page, - Set targetWords, - boolean useRegex, - boolean wholeWordSearch) - throws IOException { - PDFStreamParser parser = new PDFStreamParser(page); - List tokens = new ArrayList<>(); - Object tk; - while (true) { - final Object parsedNextToken = parser.parseNextToken(); - if ((tk = parsedNextToken) == null) break; - tokens.add(tk); + private static TokenModificationResult modifySimpleTextOperator( + List tokens, + Object token, + String operatorName, + String newText, + float adjustment, + TextSegment segment) { + if (!(token instanceof COSString)) { + return TokenModificationResult.failure("Expected COSString"); } - PDResources resources = page.getResources(); - if (resources != null) { - processPageXObjects( - document, - resources, - targetWords, - useRegex, - wholeWordSearch, - this.aggressiveMode); + + try { + int tokenIndex = segment.tokenIndex; + if (Math.abs(adjustment) < PRECISION_THRESHOLD) { + tokens.set( + tokenIndex, newText.isEmpty() ? EMPTY_COS_STRING : new COSString(newText)); + return TokenModificationResult.success(); + } else { + return convertToTJWithAdjustment( + tokens, tokenIndex, operatorName, newText, adjustment, segment); + } + } catch (Exception e) { + return TokenModificationResult.failure("Modification failed: " + e.getMessage()); } - List textSegments = extractTextSegments(page, tokens, this.aggressiveMode); - List matches; - if (this.aggressiveMode) { - matches = - findAllMatchesAggressive( - textSegments, tokens, targetWords, useRegex, wholeWordSearch); - } else { - matches = findMatchesInSegments(textSegments, targetWords, useRegex, wholeWordSearch); - } - return applyRedactionsToTokens(tokens, textSegments, matches); } private static void performEmergencyFallback(List tokens, int tokenIndex) { @@ -952,29 +1059,33 @@ public class RedactionService { return changed; } - private static List deepCopyTokens(List original) { - List copy = new ArrayList<>(original.size()); - for (Object obj : original) { - if (obj instanceof COSDictionary dict) { - COSDictionary newDict = new COSDictionary(); - for (COSName key : dict.keySet()) { - newDict.setItem(key, dict.getDictionaryObject(key)); - } - copy.add(newDict); - } else if (obj instanceof List nestedList - && !nestedList.isEmpty() - && nestedList.get(0) instanceof Object) { - try { - List objectList = (List) nestedList; - copy.add(deepCopyTokens(objectList)); - } catch (ClassCastException e) { - copy.add(obj); // Fallback to shallow copy if cast fails - } - } else { - copy.add(obj); // Shallow copy for primitives/operators - } + static String createPlaceholderWithWidth( + String originalWord, float targetWidth, PDFont font, float fontSize) { + if (originalWord == null || originalWord.isEmpty()) return " "; + if (font == null || fontSize <= 0) return " ".repeat(originalWord.length()); + if (!WidthCalculator.isWidthCalculationReliable(font)) + return " ".repeat(originalWord.length()); + + final String repeat = " ".repeat(Math.max(1, originalWord.length())); + if (TextEncodingHelper.isFontSubset(font.getName())) { + return createSubsetFontPlaceholder(originalWord, targetWidth, font, fontSize); + } + + try { + float spaceWidth = WidthCalculator.calculateAccurateWidth(font, " ", fontSize); + if (spaceWidth <= 0) { + return createAlternativePlaceholder(originalWord, targetWidth, font, fontSize); + } + + int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth)); + int maxSpaces = + Math.max( + originalWord.length() * 2, Math.round(targetWidth / spaceWidth * 1.5f)); + return " ".repeat(Math.min(spaceCount, maxSpaces)); + } catch (Exception e) { + String result = createAlternativePlaceholder(originalWord, targetWidth, font, fontSize); + return result != null ? result : repeat; } - return copy; } private String applyRedactionsToSegmentText(TextSegment segment, List matches) { @@ -1060,48 +1171,14 @@ public class RedactionService { return 0f; } - private List findMatchesInSegments( - List segments, - Set targetWords, - boolean useRegex, - boolean wholeWordSearch) { - List allMatches = new ArrayList<>(); - List patterns = - TextFinderUtils.createOptimizedSearchPatterns( - targetWords, useRegex, wholeWordSearch); - - for (TextSegment segment : segments) { - String segmentText = segment.getText(); - if (segmentText == null || segmentText.isEmpty()) continue; - - if (segment.getFont() != null - && !TextEncodingHelper.isTextSegmentRemovable(segment.getFont(), segmentText)) { - continue; - } - - for (Pattern pattern : patterns) { - try { - var matcher = pattern.matcher(segmentText); - while (matcher.find()) { - int matchStart = matcher.start(); - int matchEnd = matcher.end(); - - if (matchStart >= 0 - && matchEnd <= segmentText.length() - && matchStart < matchEnd) { - allMatches.add( - new MatchRange( - segment.getStartPos() + matchStart, - segment.getStartPos() + matchEnd)); - } - } - } catch (Exception e) { - } + private static boolean isValidTJArray(COSArray array) { + if (array == null || array.size() == 0) return false; + for (COSBase element : array) { + if (!(element instanceof COSString) && !(element instanceof COSNumber)) { + return false; } } - - allMatches.sort(Comparator.comparingInt(MatchRange::getStartPos)); - return allMatches; + return true; } private static String createAlternativePlaceholder( @@ -1158,18 +1235,167 @@ public class RedactionService { } } - private boolean isFontSuitableForWidthCalculation(PDFont font) { - try { - String fontName = font.getName(); - if (fontName == null - || isProperFontSubset(fontName) - || fontName.toLowerCase().matches(".*(hoepap|temp|generated).*")) { - return false; - } - return hasReliableWidthMetrics(font); - } catch (Exception e) { + public boolean performTextReplacement( + PDDocument document, + Map> allFoundTextsByPage, + String[] listOfText, + boolean useRegex, + boolean wholeWordSearchBool) { + if (allFoundTextsByPage.isEmpty()) { + log.info("No text found to redact"); return false; } + try { + Set allSearchTerms = + Arrays.stream(listOfText) + .map(String::trim) + .filter(s -> !s.isEmpty()) + .collect(Collectors.toSet()); + + log.info( + "Starting text replacement with {} search terms: {}", + allSearchTerms.size(), + allSearchTerms); + log.info("Total pages in document: {}", document.getNumberOfPages()); + log.info("Initial text found on {} pages", allFoundTextsByPage.size()); + + // Count initial instances + int initialTotalInstances = + allFoundTextsByPage.values().stream().mapToInt(List::size).sum(); + log.info("Total initial instances to redact: {}", initialTotalInstances); + + int finalSweepCount = 0; + for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) { + finalSweepCount = sweep + 1; + log.info("=== Starting sweep {} of {} ===", sweep + 1, MAX_SWEEPS); + int pagesProcessed = 0; + int totalModifications = 0; + + for (int pageIndex = 0; pageIndex < document.getNumberOfPages(); pageIndex++) { + PDPage page = document.getPages().get(pageIndex); + List pageFoundTexts = + allFoundTextsByPage.getOrDefault(pageIndex, List.of()); + + log.debug( + "Processing page {} - found {} instances", + pageIndex + 1, + pageFoundTexts.size()); + + List filtered = + createTokensWithoutTargetText( + document, page, allSearchTerms, useRegex, wholeWordSearchBool); + writeFilteredContentStream(document, page, filtered); + + // Count modifications (rough estimate based on token count difference) + int tokenDiff = Math.abs(filtered.size() - getOriginalTokenCount(page)); + totalModifications += tokenDiff; + pagesProcessed++; + + log.debug("Page {} - token modifications: {}", pageIndex + 1, tokenDiff); + } + + log.info( + "Sweep {} completed - processed {} pages, total modifications: {}", + sweep + 1, + pagesProcessed, + totalModifications); + + // Check remaining targets + boolean stillContainsTargets = + documentStillContainsTargets( + document, allSearchTerms, useRegex, wholeWordSearchBool); + + if (!stillContainsTargets) { + log.info("SUCCESS: All targets removed after {} sweeps", sweep + 1); + break; + } else { + log.warn( + "WARNING: Still contains targets after sweep {} - continuing...", + sweep + 1); + } + } + + // Final verification - run multiple times to catch any missed instances + boolean finalCheck = false; + for (int verifyAttempt = 0; verifyAttempt < 3; verifyAttempt++) { + log.info("Final verification attempt {} of 3", verifyAttempt + 1); + finalCheck = + documentStillContainsTargets( + document, allSearchTerms, useRegex, wholeWordSearchBool); + + if (!finalCheck) { + log.info( + "Verification attempt {} passed - no targets found", verifyAttempt + 1); + break; + } else { + log.warn("Verification attempt {} found remaining targets", verifyAttempt + 1); + if (verifyAttempt < 2) { + log.info("Performing additional cleanup sweep due to verification failure"); + // Try one more sweep + for (PDPage page : document.getPages()) { + List additionalFiltered = + createTokensWithoutTargetText( + document, + page, + allSearchTerms, + useRegex, + wholeWordSearchBool); + writeFilteredContentStream(document, page, additionalFiltered); + } + } + } + } + + if (finalCheck) { + log.error( + "FAILURE: Document still contains targets after {} sweeps and {} verification attempts. Falling back to visual redaction.", + MAX_SWEEPS, + 3); + log.error("Remaining search terms: {}", allSearchTerms); + + // Log detailed information about what was found + log.error("=== DETAILED FAILURE ANALYSIS ==="); + for (int pageIdx = 0; pageIdx < document.getNumberOfPages(); pageIdx++) { + for (String term : allSearchTerms) { + try { + TextFinder finder = new TextFinder(term, useRegex, wholeWordSearchBool); + finder.setStartPage(pageIdx + 1); + finder.setEndPage(pageIdx + 1); + finder.getText(document); + + for (PDFText found : finder.getFoundTexts()) { + if (found.getPageIndex() == pageIdx) { + log.error( + "REMAINING: '{}' found on page {} at position ({}, {})", + term, + pageIdx + 1, + found.getX1(), + found.getY1()); + } + } + } catch (Exception e) { + log.error( + "Error during failure analysis for term '{}' on page {}: {}", + term, + pageIdx + 1, + e.getMessage()); + } + } + } + log.error("=== END FAILURE ANALYSIS ==="); + + return true; // Return true to indicate fallback needed + } else { + log.info( + "SUCCESS: All text redaction completed successfully after {} sweeps", + finalSweepCount); + return false; // Return false to indicate success + } + + } catch (Exception e) { + log.error("Exception during text replacement: {}", e.getMessage(), e); + return true; + } } static String createPlaceholderWithFont(String originalWord, PDFont font) { @@ -1253,120 +1479,17 @@ public class RedactionService { } } - private List applyRedactionsToTokens( - List tokens, List textSegments, List matches) { - List newTokens = new ArrayList<>(tokens); - if (this.aggressiveMode) { - Map> perSeg = this.aggressiveSegMatches; - if (perSeg != null && !perSeg.isEmpty()) { - List segIndices = new ArrayList<>(perSeg.keySet()); - segIndices.sort( - (a, b) -> - Integer.compare( - textSegments.get(b).tokenIndex, - textSegments.get(a).tokenIndex)); - for (Integer segIndex : segIndices) { - TextSegment segment = textSegments.get(segIndex); - List segMatches = perSeg.getOrDefault(segIndex, List.of()); - if (segMatches.isEmpty()) { - continue; - } - Object token = newTokens.get(segment.tokenIndex); - String opName = segment.operatorName; - if (("Tj".equals(opName) || "'".equals(opName) || "\"".equals(opName)) - && token instanceof COSString cs) { - COSString redacted = - redactCosStringByDecodedRanges(segment.font, cs, segMatches); - if (segment.font != null && segment.fontSize > 0) { - String originalText = getDecodedString(cs, segment.font); - String modifiedText = getDecodedString(redacted, segment.font); - float wOrig = - calculateSafeWidth( - originalText, segment.font, segment.fontSize); - float wMod = - calculateSafeWidth( - modifiedText, segment.font, segment.fontSize); - float adjustment = wOrig - wMod; - if (Math.abs(adjustment) > PRECISION_THRESHOLD) { - COSArray arr = new COSArray(); - arr.add(redacted); - float kerning = - (-adjustment / segment.fontSize) * FONT_SCALE_FACTOR; - arr.add(new COSFloat(kerning)); - newTokens.set(segment.tokenIndex, arr); - updateOperatorSafely(newTokens, segment.tokenIndex, opName); - } else { - newTokens.set(segment.tokenIndex, redacted); - } - } else { - newTokens.set(segment.tokenIndex, redacted); - } - } else if ("TJ".equals(opName) && token instanceof COSArray arr) { - COSArray redacted = - redactTJArrayByDecodedRanges(segment.font, arr, segMatches); - // Inject kerning adjustments per string element to preserve layout - COSArray withKerning = buildKerningAdjustedTJArray(arr, redacted, segment); - newTokens.set(segment.tokenIndex, withKerning); - } - } - return newTokens; + private int getOriginalTokenCount(PDPage page) { + try { + PDFStreamParser parser = new PDFStreamParser(page); + int count = 0; + while (parser.parseNextToken() != null) { + count++; } + return count; + } catch (Exception e) { + return 0; } - Map> matchesBySegment = new HashMap<>(); - for (MatchRange match : matches) { - for (int i = 0; i < textSegments.size(); i++) { - TextSegment segment = textSegments.get(i); - int overlapStart = Math.max(match.startPos, segment.startPos); - int overlapEnd = Math.min(match.endPos, segment.endPos); - if (overlapStart < overlapEnd) { - matchesBySegment.computeIfAbsent(i, k -> new ArrayList<>()).add(match); - } - } - } - List tasks = new ArrayList<>(); - for (Map.Entry> entry : matchesBySegment.entrySet()) { - int segmentIndex = entry.getKey(); - List segmentMatches = entry.getValue(); - - if (segmentIndex < 0 || segmentIndex >= textSegments.size()) continue; - TextSegment segment = textSegments.get(segmentIndex); - if (segment == null) continue; - - try { - if ("Tj".equals(segment.operatorName) || "'".equals(segment.operatorName)) { - String newText = applyRedactionsToSegmentText(segment, segmentMatches); - if (newText == null) newText = ""; - float adjustment = calculateWidthAdjustment(segment, segmentMatches); - tasks.add(new ModificationTask(segment, newText, adjustment)); - } else if ("TJ".equals(segment.operatorName)) { - tasks.add(new ModificationTask(segment, "", 0)); - } - } catch (Exception e) { - // Skip this segment - } - } - tasks.sort((a, b) -> Integer.compare(b.segment.tokenIndex, a.segment.tokenIndex)); - - int maxTasksToProcess = Math.min(tasks.size(), 1000); - - for (int i = 0; i < maxTasksToProcess && i < tasks.size(); i++) { - ModificationTask task = tasks.get(i); - try { - List segmentMatches = - matchesBySegment.getOrDefault( - textSegments.indexOf(task.segment), Collections.emptyList()); - - if (task.segment.tokenIndex >= newTokens.size()) continue; - if (task.segment.getText() == null || task.segment.getText().isEmpty()) continue; - - modifyTokenForRedaction( - newTokens, task.segment, task.newText, task.adjustment, segmentMatches); - } catch (Exception e) { - // Skip this task - } - } - - return newTokens; } private COSArray buildKerningAdjustedTJArray( @@ -1465,43 +1588,131 @@ public class RedactionService { baseLimit = Math.max(baseLimit, text.length() * avgCharWidth * 1.5f); } } catch (Exception e) { - // Use default } return baseLimit; } - private void modifyTokenForRedaction( - List tokens, - TextSegment segment, - String newText, - float adjustment, - List matches) { - if (tokens == null || segment == null || newText == null) return; - if (!isValidTokenIndex(tokens, segment.tokenIndex) || segment.operatorName == null) return; + List createTokensWithoutTargetText( + PDDocument document, + PDPage page, + Set targetWords, + boolean useRegex, + boolean wholeWordSearch) + throws IOException { + log.debug("Processing page with {} target words: {}", targetWords.size(), targetWords); - try { - Object token = tokens.get(segment.tokenIndex); - if (token == null || !isValidTokenForOperator(token, segment.operatorName)) return; + PDFStreamParser parser = new PDFStreamParser(page); + List tokens = new ArrayList<>(); + Object tk; + int tokenCount = 0; + while (true) { + final Object parsedNextToken = parser.parseNextToken(); + if ((tk = parsedNextToken) == null) break; + tokens.add(tk); + tokenCount++; + } + + log.debug("Parsed {} tokens from page content stream", tokenCount); + + if (tokenCount == 0 && !targetWords.isEmpty()) { + log.warn( + "No tokens parsed from page content stream - this might indicate encoding issues"); + log.warn("Attempting alternative verification for target words: {}", targetWords); - TokenModificationResult result = - performTokenModification( - tokens, - token, - segment.operatorName, - newText, - adjustment, - segment, - matches); - if (!result.isSuccess()) { - performFallbackModification(tokens, segment.tokenIndex, newText); - } - } catch (Exception e) { try { - performEmergencyFallback(tokens, segment.tokenIndex); - } catch (Exception emergencyError) { - // Final fallback failed - continue processing + TextFinder directFinder = new TextFinder("", false, false); + directFinder.setStartPage(document.getPages().indexOf(page) + 1); + directFinder.setEndPage(document.getPages().indexOf(page) + 1); + directFinder.getText(document); + + StringBuilder pageText = new StringBuilder(); + for (PDFText pdfText : directFinder.getFoundTexts()) { + if (pdfText.getText() != null) { + pageText.append(pdfText.getText()).append(" "); + } + } + + String extractedText = pageText.toString().trim(); + log.debug("Alternative text extraction found: '{}'", extractedText); + + for (String word : targetWords) { + if (extractedText.toLowerCase().contains(word.toLowerCase())) { + log.warn("Found target word '{}' via alternative extraction method", word); + } + } + + } catch (Exception e) { + log.error("Alternative text extraction failed: {}", e.getMessage()); } } + + PDResources resources = page.getResources(); + if (resources != null) { + log.debug("Processing XObjects for page"); + processPageXObjects( + document, + resources, + targetWords, + useRegex, + wholeWordSearch, + this.aggressiveMode); + } + + List textSegments = extractTextSegments(page, tokens, this.aggressiveMode); + log.debug("Extracted {} text segments from tokens", textSegments.size()); + + // Log extracted text content for debugging + if (!textSegments.isEmpty()) { + StringBuilder allText = new StringBuilder(); + boolean hasProblematicChars = false; + + for (TextSegment seg : textSegments) { + if (seg.getText() != null && !seg.getText().trim().isEmpty()) { + String segmentText = seg.getText(); + if (!isTextSafeForRedaction(segmentText)) { + hasProblematicChars = true; + segmentText = normalizeTextForRedaction(segmentText); + log.debug( + "Normalized problematic text in segment: original contained encoding issues"); + } + allText.append(segmentText).append(" "); + } + } + + String completeText = allText.toString().trim(); + if (!completeText.isEmpty()) { + log.debug("Complete extracted text: '{}'", completeText); + if (hasProblematicChars) { + log.info("Applied character normalization to handle encoding issues"); + } + } + } + + List matches; + if (this.aggressiveMode) { + log.debug("Using aggressive mode for matching"); + matches = + findAllMatchesAggressive( + textSegments, tokens, targetWords, useRegex, wholeWordSearch); + } else { + log.debug("Using moderate mode for matching"); + matches = findMatchesInSegments(textSegments, targetWords, useRegex, wholeWordSearch); + } + + log.info("Found {} matches to redact", matches.size()); + if (!matches.isEmpty()) { + log.debug("Match ranges: {}", matches); + } + + List resultTokens = applyRedactionsToTokens(tokens, textSegments, matches); + int modifications = tokens.size() - resultTokens.size(); + log.debug( + "Applied redactions - original tokens: {}, result tokens: {}, modifications: {}", + tokens.size(), + resultTokens.size(), + modifications); + + return resultTokens; } private static boolean isGibberish(String text) { @@ -1609,30 +1820,100 @@ public class RedactionService { }; } - private TokenModificationResult modifySimpleTextOperator( - List tokens, - Object token, - String operatorName, - String newText, - float adjustment, - TextSegment segment) { - if (!(token instanceof COSString)) { - return TokenModificationResult.failure("Expected COSString"); + private List findMatchesInSegments( + List segments, + Set targetWords, + boolean useRegex, + boolean wholeWordSearch) { + List allMatches = new ArrayList<>(); + List patterns = + TextFinderUtils.createOptimizedSearchPatterns( + targetWords, useRegex, wholeWordSearch); + + log.debug("Searching for {} patterns in {} segments", patterns.size(), segments.size()); + + int totalMatchesFound = 0; + + for (int i = 0; i < segments.size(); i++) { + TextSegment segment = segments.get(i); + String segmentText = segment.getText(); + if (segmentText == null || segmentText.isEmpty()) { + log.debug("Skipping empty segment {}", i); + continue; + } + + log.debug("Processing segment {}: '{}'", i, segmentText); + + if (segment.getFont() != null + && !TextEncodingHelper.isTextSegmentRemovable(segment.getFont(), segmentText)) { + log.debug( + "Skipping segment {} - font not removable: {}", + i, + segment.getFont().getName()); + continue; + } + + int segmentMatches = 0; + for (Pattern pattern : patterns) { + try { + log.debug( + "Matching pattern '{}' against segment text '{}'", + pattern.pattern(), + segmentText); + var matcher = pattern.matcher(segmentText); + while (matcher.find()) { + int matchStart = matcher.start(); + int matchEnd = matcher.end(); + + log.debug( + "Found match in segment {}: positions {}-{}", + i, + matchStart, + matchEnd); + + if (matchStart >= 0 + && matchEnd <= segmentText.length() + && matchStart < matchEnd) { + String matchedText = segmentText.substring(matchStart, matchEnd); + log.debug("Matched text: '{}'", matchedText); + + allMatches.add( + new MatchRange( + segment.getStartPos() + matchStart, + segment.getStartPos() + matchEnd)); + segmentMatches++; + totalMatchesFound++; + } + } + } catch (Exception e) { + log.error("Error matching pattern in segment {}: {}", i, e.getMessage()); + } + } + + if (segmentMatches > 0) { + log.info("Segment {} had {} matches", i, segmentMatches); + } } - try { - int tokenIndex = segment.tokenIndex; - if (Math.abs(adjustment) < PRECISION_THRESHOLD) { - tokens.set( - tokenIndex, newText.isEmpty() ? EMPTY_COS_STRING : new COSString(newText)); - return TokenModificationResult.success(); - } else { - return convertToTJWithAdjustment( - tokens, tokenIndex, operatorName, newText, adjustment, segment); + log.info("Total matches found across all segments: {}", totalMatchesFound); + allMatches.sort(Comparator.comparingInt(MatchRange::getStartPos)); + + if (allMatches.isEmpty()) { + log.warn("No matches found in segments. This might indicate:"); + log.warn("1. Text encoding issues preventing proper extraction"); + log.warn("2. Font compatibility issues"); + log.warn("3. Search terms not matching extracted text"); + log.warn("4. Whole word search filtering out matches"); + + // Log some debugging info + if (!segments.isEmpty()) { + log.warn("Sample segment text: '{}'", segments.get(0).getText()); + log.warn("Target words: {}", targetWords); + log.warn("Use regex: {}, Whole word search: {}", useRegex, wholeWordSearch); } - } catch (Exception e) { - return TokenModificationResult.failure("Modification failed: " + e.getMessage()); } + + return allMatches; } private String createSafeReplacement(String originalPart, TextSegment segment) { @@ -1651,33 +1932,205 @@ public class RedactionService { } } - String createPlaceholderWithWidth( - String originalWord, float targetWidth, PDFont font, float fontSize) { - if (originalWord == null || originalWord.isEmpty()) return " "; - if (font == null || fontSize <= 0) return " ".repeat(Math.max(1, originalWord.length())); - if (!WidthCalculator.isWidthCalculationReliable(font)) - return " ".repeat(originalWord.length()); + private List applyRedactionsToTokens( + List tokens, List textSegments, List matches) { + log.debug( + "Applying redactions to {} tokens with {} matches across {} segments", + tokens.size(), + matches.size(), + textSegments.size()); - final String repeat = " ".repeat(Math.max(1, originalWord.length())); - if (TextEncodingHelper.isFontSubset(font.getName())) { - return createSubsetFontPlaceholder(originalWord, targetWidth, font, fontSize); + List newTokens = new ArrayList<>(tokens); + int totalModifications = 0; + + if (this.aggressiveMode) { + log.debug("Using aggressive mode for token redaction"); + Map> perSeg = this.aggressiveSegMatches; + if (perSeg != null && !perSeg.isEmpty()) { + log.debug("Processing {} aggressive segments", perSeg.size()); + List segIndices = new ArrayList<>(perSeg.keySet()); + segIndices.sort( + (a, b) -> + Integer.compare( + textSegments.get(b).tokenIndex, + textSegments.get(a).tokenIndex)); + for (Integer segIndex : segIndices) { + TextSegment segment = textSegments.get(segIndex); + List segMatches = perSeg.getOrDefault(segIndex, List.of()); + if (segMatches.isEmpty()) { + continue; + } + + log.debug( + "Processing aggressive segment {} with {} matches", + segIndex, + segMatches.size()); + Object token = newTokens.get(segment.tokenIndex); + String opName = segment.operatorName; + if (("Tj".equals(opName) || "'".equals(opName) || "\"".equals(opName)) + && token instanceof COSString cs) { + log.debug( + "Redacting Tj/TjQuote operator at token index {}", + segment.tokenIndex); + COSString redacted = + redactCosStringByDecodedRanges(segment.font, cs, segMatches); + if (segment.font != null && segment.fontSize > 0) { + String originalText = getDecodedString(cs, segment.font); + String modifiedText = getDecodedString(redacted, segment.font); + log.debug( + "Original text: '{}', Modified text: '{}'", + originalText, + modifiedText); + float wOrig = + calculateSafeWidth( + originalText, segment.font, segment.fontSize); + float wMod = + calculateSafeWidth( + modifiedText, segment.font, segment.fontSize); + float adjustment = wOrig - wMod; + if (Math.abs(adjustment) > PRECISION_THRESHOLD) { + log.debug("Applying kerning adjustment: {}", adjustment); + COSArray arr = new COSArray(); + arr.add(redacted); + float kerning = + (-adjustment / segment.fontSize) * FONT_SCALE_FACTOR; + arr.add(new COSFloat(kerning)); + newTokens.set(segment.tokenIndex, arr); + updateOperatorSafely(newTokens, segment.tokenIndex, opName); + totalModifications++; + } else { + newTokens.set(segment.tokenIndex, redacted); + totalModifications++; + } + } else { + newTokens.set(segment.tokenIndex, redacted); + totalModifications++; + } + } else if ("TJ".equals(opName) && token instanceof COSArray arr) { + log.debug("Redacting TJ operator at token index {}", segment.tokenIndex); + COSArray redacted = + redactTJArrayByDecodedRanges(segment.font, arr, segMatches); + // Inject kerning adjustments per string element to preserve layout + COSArray withKerning = buildKerningAdjustedTJArray(arr, redacted, segment); + newTokens.set(segment.tokenIndex, withKerning); + totalModifications++; + } + } + log.info("Aggressive mode completed - {} modifications made", totalModifications); + return newTokens; + } } - try { - float spaceWidth = WidthCalculator.calculateAccurateWidth(font, " ", fontSize); - if (spaceWidth <= 0) { - return createAlternativePlaceholder(originalWord, targetWidth, font, fontSize); + log.debug("Using moderate mode for token redaction"); + Map> matchesBySegment = new HashMap<>(); + for (MatchRange match : matches) { + for (int i = 0; i < textSegments.size(); i++) { + TextSegment segment = textSegments.get(i); + int overlapStart = Math.max(match.startPos, segment.startPos); + int overlapEnd = Math.min(match.endPos, segment.endPos); + if (overlapStart < overlapEnd) { + matchesBySegment.computeIfAbsent(i, k -> new ArrayList<>()).add(match); + } + } + } + + log.debug("Matches distributed across {} segments", matchesBySegment.size()); + matchesBySegment.forEach( + (segIdx, matchList) -> + log.debug("Segment {}: {} matches", segIdx, matchList.size())); + + List tasks = new ArrayList<>(); + for (Map.Entry> entry : matchesBySegment.entrySet()) { + int segmentIndex = entry.getKey(); + List segmentMatches = entry.getValue(); + + if (segmentIndex < 0 || segmentIndex >= textSegments.size()) { + log.warn("Invalid segment index: {}", segmentIndex); + continue; + } + TextSegment segment = textSegments.get(segmentIndex); + if (segment == null) { + log.warn("Null segment at index: {}", segmentIndex); + continue; } - int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth)); - int maxSpaces = - Math.max( - originalWord.length() * 2, Math.round(targetWidth / spaceWidth * 1.5f)); - return " ".repeat(Math.min(spaceCount, maxSpaces)); - } catch (Exception e) { - String result = createAlternativePlaceholder(originalWord, targetWidth, font, fontSize); - return result != null ? result : repeat; + try { + if ("Tj".equals(segment.operatorName) || "'".equals(segment.operatorName)) { + log.debug( + "Creating modification task for Tj operator at segment {}", + segmentIndex); + String newText = applyRedactionsToSegmentText(segment, segmentMatches); + if (newText == null) newText = ""; + float adjustment = calculateWidthAdjustment(segment, segmentMatches); + tasks.add(new ModificationTask(segment, newText, adjustment)); + log.debug( + "Task created: original='{}', new='{}', adjustment={}", + segment.getText(), + newText, + adjustment); + } else if ("TJ".equals(segment.operatorName)) { + log.debug( + "Creating modification task for TJ operator at segment {}", + segmentIndex); + tasks.add(new ModificationTask(segment, "", 0)); + } + } catch (Exception e) { + log.error( + "Error creating modification task for segment {}: {}", + segmentIndex, + e.getMessage()); + } } + + log.info("Created {} modification tasks", tasks.size()); + tasks.sort((a, b) -> Integer.compare(b.segment.tokenIndex, a.segment.tokenIndex)); + + int maxTasksToProcess = Math.min(tasks.size(), 1000); + log.debug("Processing {} out of {} tasks (limit: 1000)", maxTasksToProcess, tasks.size()); + + int successfulModifications = 0; + for (int i = 0; i < maxTasksToProcess && i < tasks.size(); i++) { + ModificationTask task = tasks.get(i); + try { + List segmentMatches = + matchesBySegment.getOrDefault( + textSegments.indexOf(task.segment), Collections.emptyList()); + + if (task.segment.tokenIndex >= newTokens.size()) { + log.warn( + "Token index {} out of bounds (size: {})", + task.segment.tokenIndex, + newTokens.size()); + continue; + } + if (task.segment.getText() == null || task.segment.getText().isEmpty()) { + log.debug("Skipping empty text segment at index {}", task.segment.tokenIndex); + continue; + } + + log.debug( + "Applying redaction to token {}: '{}' -> '{}'", + task.segment.tokenIndex, + task.segment.getText(), + task.newText); + + modifyTokenForRedaction( + newTokens, task.segment, task.newText, task.adjustment, segmentMatches); + successfulModifications++; + totalModifications++; + + } catch (Exception e) { + log.error("Error applying redaction to task {}: {}", i, e.getMessage()); + } + } + + log.info( + "Redaction completed - {} successful modifications out of {} tasks", + successfulModifications, + tasks.size()); + log.info("Total modifications made: {}", totalModifications); + + return newTokens; } private List extractTextSegmentsFromTokens( @@ -2047,14 +2500,38 @@ public class RedactionService { } } - private boolean isValidTJArray(COSArray array) { - if (array == null || array.size() == 0) return false; - for (COSBase element : array) { - if (!(element instanceof COSString) && !(element instanceof COSNumber)) { - return false; + private void modifyTokenForRedaction( + List tokens, + TextSegment segment, + String newText, + float adjustment, + List matches) { + if (tokens == null || segment == null || newText == null) return; + if (!isValidTokenIndex(tokens, segment.tokenIndex) || segment.operatorName == null) return; + + try { + Object token = tokens.get(segment.tokenIndex); + if (!isValidTokenForOperator(token, segment.operatorName)) return; + + TokenModificationResult result = + performTokenModification( + tokens, + token, + segment.operatorName, + newText, + adjustment, + segment, + matches); + if (!result.isSuccess()) { + performFallbackModification(tokens, segment.tokenIndex, newText); + } + } catch (Exception e) { + try { + performEmergencyFallback(tokens, segment.tokenIndex); + } catch (Exception emergencyError) { + // Final fallback failed - continue processing } } - return true; } private String extractTextFromToken(Object token, String operatorName, PDFont currentFont) { diff --git a/app/core/src/main/java/stirling/software/SPDF/utils/text/TextDecodingHelper.java b/app/core/src/main/java/stirling/software/SPDF/utils/text/TextDecodingHelper.java index eeae28eac..8c7e4cfbe 100644 --- a/app/core/src/main/java/stirling/software/SPDF/utils/text/TextDecodingHelper.java +++ b/app/core/src/main/java/stirling/software/SPDF/utils/text/TextDecodingHelper.java @@ -2,6 +2,7 @@ package stirling.software.SPDF.utils.text; import java.nio.ByteBuffer; import java.nio.CharBuffer; +import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.nio.charset.StandardCharsets; import java.util.ArrayList; @@ -13,11 +14,9 @@ import org.apache.pdfbox.pdmodel.PDResources; import org.apache.pdfbox.pdmodel.font.*; import lombok.experimental.UtilityClass; -import lombok.extern.slf4j.Slf4j; import stirling.software.SPDF.service.RedactionService; -@Slf4j @UtilityClass public class TextDecodingHelper { @@ -25,6 +24,8 @@ public class TextDecodingHelper { private final int ASCII_UPPER_BOUND = 126; private final int EXTENDED_ASCII_LOWER_BOUND = 160; private final int EXTENDED_ASCII_UPPER_BOUND = 255; + private final int PROBLEMATIC_CODE_LOWER_BOUND = 65488; + private final int PROBLEMATIC_CODE_UPPER_BOUND = 65535; public PDFont getFontSafely(PDResources resources, COSName fontName) { if (resources == null || fontName == null) { @@ -33,27 +34,15 @@ public class TextDecodingHelper { try { PDFont font = resources.getFont(fontName); - if (font == null) { - return null; - } - + if (font == null) return null; try { - String fontNameCheck = font.getName(); - if (fontNameCheck == null || fontNameCheck.trim().isEmpty()) { - log.debug("Font {} has null or empty name, skipping", fontName.getName()); - return null; - } + String n = font.getName(); + if (n == null || n.trim().isEmpty()) return null; } catch (Exception e) { - log.debug( - "Error accessing font name for {}, skipping: {}", - fontName.getName(), - e.getMessage()); return null; } - return font; } catch (Exception e) { - log.debug("Error retrieving font {}: {}", fontName.getName(), e.getMessage()); return null; } } @@ -65,90 +54,160 @@ public class TextDecodingHelper { try { byte[] bytes = cosString.getBytes(); - if (bytes.length == 0) { - return; - } - + if (bytes.length == 0) return; String basicDecoded = tryDecodeWithFont(font, cosString); if (basicDecoded != null && !basicDecoded.contains("?") - && !basicDecoded.trim().isEmpty()) { - return; - } - + && !basicDecoded.trim().isEmpty()) return; decodeCharactersEnhanced(font, bytes); - } catch (Exception e) { - log.error("Decoding failed: {}", e.getMessage(), e); try { tryDecodeWithFont(font, cosString); - } catch (Exception fallbackException) { + } catch (Exception ignored) { } } } public String decodeCharactersEnhanced(PDFont font, byte[] bytes) { + // Try font-guided decoding first + String fontPass = decodeByFontTables(font, bytes); + if (isAcceptable(fontPass)) return fontPass; + + // Try UTF-8 strict decoding + String utf8 = tryDecodeCharset(bytes, StandardCharsets.UTF_8); + if (isAcceptable(utf8)) return utf8; + + // UTF-16 BE/LE + String u16be = tryDecodeCharset(bytes, StandardCharsets.UTF_16BE); + if (isAcceptable(u16be)) return u16be; + + String u16le = tryDecodeCharset(bytes, StandardCharsets.UTF_16LE); + if (isAcceptable(u16le)) return u16le; + + // Common Windows encodings + String win1252 = tryDecodeCharset(bytes, Charset.forName("windows-1252")); + if (isAcceptable(win1252)) return win1252; + + String win1250 = tryDecodeCharset(bytes, Charset.forName("windows-1250")); + if (isAcceptable(win1250)) return win1250; + + String gb2312 = tryDecodeCharset(bytes, Charset.forName("GB2312")); + if (isAcceptable(gb2312)) return gb2312; + + String big5 = tryDecodeCharset(bytes, Charset.forName("Big5")); + if (isAcceptable(big5)) return big5; + + String shiftJis = tryDecodeCharset(bytes, Charset.forName("Shift_JIS")); + if (isAcceptable(shiftJis)) return shiftJis; + + String euckr = tryDecodeCharset(bytes, Charset.forName("EUC-KR")); + if (isAcceptable(euckr)) return euckr; + + // Fallback to ISO-8859-1 + String latin1 = tryDecodeCharset(bytes, StandardCharsets.ISO_8859_1); + return isAcceptable(latin1) ? latin1 : null; + } + + private String decodeByFontTables(PDFont font, byte[] bytes) { + if (font == null || bytes == null || bytes.length == 0) return null; StringBuilder out = new StringBuilder(); - boolean hasValidCharacters = false; int i = 0; while (i < bytes.length) { - int code = bytes[i] & 0xFF; - String charStr = decodeSingleCharacter(font, code, bytes); - - if (charStr == null && code >= 128 && i + 1 < bytes.length) { - int combinedCode = (code << 8) | (bytes[i + 1] & 0xFF); - charStr = decodeSingleCharacter(font, combinedCode, bytes); - if (charStr != null) { - i += 2; // Skip the next byte - out.append(charStr); - hasValidCharacters = true; - continue; + String ch = null; + int consumed = 1; + try { + ch = tryToUnicode(font, bytes, i); + if (ch == null && i + 1 < bytes.length) { + consumed = 2; + ch = tryToUnicode(font, bytes, i, 2); } + } catch (Exception ignored) { } - - if (charStr != null && !charStr.isEmpty()) { - out.append(charStr); - hasValidCharacters = true; - } else { - out.append('?'); + if (!isPrintable(ch)) { + // Handle problematic character codes specifically + ch = "�"; } - i++; + out.append(ch); + i += consumed; } - String result = out.toString(); - return hasValidCharacters ? result : null; + String s = out.toString(); + return isAcceptable(s) ? s : null; + } + + private String tryToUnicode(PDFont font, byte[] bytes, int pos) { + int code = bytes[pos] & 0xFF; + try { + return font.toUnicode(code); + } catch (Exception e) { + return null; + } + } + + private String tryToUnicode(PDFont font, byte[] bytes, int pos, int len) { + if (pos + len - 1 >= bytes.length) return null; + int code = 0; + for (int j = 0; j < len; j++) code = (code << 8) | (bytes[pos + j] & 0xFF); + try { + return font.toUnicode(code); + } catch (Exception e) { + return null; + } + } + + private String tryDecodeCharset(byte[] bytes, Charset cs) { + try { + String s = new String(bytes, cs); + return isPrintable(s) ? s : null; + } catch (Exception e) { + return null; + } + } + + private boolean isPrintable(String s) { + if (s == null || s.isEmpty()) return false; + int printable = 0; + for (int i = 0; i < s.length(); ) { + int cp = s.codePointAt(i); + int type = Character.getType(cp); + if (type != Character.CONTROL && type != Character.FORMAT && cp != 0xFFFD) printable++; + i += Character.charCount(cp); + } + return printable >= Math.max(1, s.codePointCount(0, s.length()) * 3 / 4); + } + + private boolean isAcceptable(String s) { + return isPrintable(s); } public String decodeSingleCharacter(PDFont font, int code, byte[] bytes) { String charStr = null; - try { charStr = font.toUnicode(code); } catch (Exception ignored) { } - if (charStr == null && font instanceof PDType0Font type0Font) { try { int cid = (bytes.length > 1) ? ((bytes[0] & 0xFF) << 8) | (bytes[1] & 0xFF) : code; charStr = type0Font.toUnicode(cid); - log.debug("CID decoding successful for code {}: {}", cid, charStr); - } catch (Exception e) { - log.debug("CID decoding failed for code {}: {}", code, e.getMessage()); + } catch (Exception ignored) { } } - if (charStr == null && font.getName() != null && font.getName().contains("+")) { charStr = mapSubsetCharacter(code); } - if (charStr == null) { charStr = fallbackCharacterMapping(code, bytes, font); } - return charStr; } public String fallbackCharacterMapping(int code, byte[] bytes, PDFont font) { try { + // Handle problematic high-range character codes that cause .notdef warnings + if (code >= PROBLEMATIC_CODE_LOWER_BOUND && code <= PROBLEMATIC_CODE_UPPER_BOUND) { + return handleProblematicCharacterCode(code, font); + } + if (font instanceof PDType0Font && bytes.length > 1) { return null; } @@ -164,18 +223,15 @@ public class TextDecodingHelper { String fontName = font.getName(); if (fontName != null) { String lowerName = fontName.toLowerCase(); - if (lowerName.contains("cjk") - || lowerName.contains("gb") - || lowerName.contains("jp")) { - // Basic CJK fallback (expand with a lookup table if needed) - if (code >= 0x4E00 && code <= 0x9FFF) { - return String.valueOf( - (char) code); // Unicode Basic Multilingual Plane for CJK - } + if ((lowerName.contains("cjk") + || lowerName.contains("gb") + || lowerName.contains("jp")) + && code >= 0x4E00 + && code <= 0x9FFF) { + return String.valueOf((char) code); } } - // Fallback to UTF-8/16 decoding attempt for unknown encodings try { if (bytes.length >= 2) { ByteBuffer buffer = ByteBuffer.wrap(bytes); @@ -184,7 +240,7 @@ public class TextDecodingHelper { return charBuffer.toString(); } } catch (Exception e) { - log.debug("UTF fallback failed: {}", e.getMessage()); + } return null; @@ -193,6 +249,19 @@ public class TextDecodingHelper { } } + public String handleProblematicCharacterCode(int code, PDFont font) { + if (code >= PROBLEMATIC_CODE_LOWER_BOUND && code <= PROBLEMATIC_CODE_UPPER_BOUND) { + int adjustedCode = code - PROBLEMATIC_CODE_LOWER_BOUND; + if (adjustedCode >= ASCII_LOWER_BOUND) { + return String.valueOf((char) adjustedCode); + } + if (font != null && font.getName() != null && font.getName().contains("+")) { + return mapSubsetCharacter(adjustedCode); + } + } + return "�"; + } + public String mapSubsetCharacter(int code) { if (code >= ASCII_LOWER_BOUND && code <= ASCII_UPPER_BOUND) { return String.valueOf((char) code); @@ -221,6 +290,7 @@ public class TextDecodingHelper { uni = font.toUnicode(code); } catch (Exception ignored) { } + if (uni != null) { out.append(uni); anyMapped = true; @@ -239,6 +309,7 @@ public class TextDecodingHelper { u1 = font.toUnicode(b1); } catch (Exception ignored) { } + if (i + 1 < bytes.length) { int b2 = bytes[i + 1] & 0xFF; int code = (b1 << 8) | b2; @@ -247,6 +318,12 @@ public class TextDecodingHelper { u2 = font.toUnicode(code); } catch (Exception ignored) { } + + // Handle problematic multi-byte codes + if (u2 == null && code >= PROBLEMATIC_CODE_LOWER_BOUND) { + u2 = handleProblematicCharacterCode(code, font); + } + if (u2 != null) { out.append(u2); i += 2; @@ -267,12 +344,12 @@ public class TextDecodingHelper { } } - public static RedactionService.DecodedMapping buildDecodeMapping(PDFont font, byte[] bytes) { + public RedactionService.DecodedMapping buildDecodeMapping(PDFont font, byte[] bytes) { RedactionService.DecodedMapping map = new RedactionService.DecodedMapping(); if (font == null || bytes == null) { - map.text = ""; - map.charByteStart = new int[0]; - map.charByteEnd = new int[0]; + map.setText(""); + map.setCharByteStart(new int[0]); + map.setCharByteEnd(new int[0]); return map; } @@ -289,46 +366,32 @@ public class TextDecodingHelper { while (i < bytes.length) { int start = i; - String decodedChar = null; - int consumed = 1; + String decodedChar; + int consumed; try { if (isType0) { - // Handle CID fonts and multi-byte encodings decodedChar = decodeType0Font((PDType0Font) font, bytes, i); consumed = getType0CharLength((PDType0Font) font, bytes, i); } else if (isType1) { - // Handle Type1 fonts with specific encoding decodedChar = decodeType1Font((PDType1Font) font, bytes, i); - consumed = getType1CharLength((PDType1Font) font, bytes, i); + consumed = 1; } else if (isType3) { - // Handle Type3 bitmap fonts decodedChar = decodeType3Font((PDType3Font) font, bytes, i); - consumed = 1; // Type3 typically single byte + consumed = 1; } else if (isTrueType) { - // Handle TrueType fonts decodedChar = decodeTrueTypeFont((PDTrueTypeFont) font, bytes, i); consumed = getTrueTypeCharLength((PDTrueTypeFont) font, bytes, i); } else { - // Generic fallback for other font types decodedChar = decodeGenericFont(font, bytes, i); - consumed = getGenericCharLength(font, bytes, i); - } - - // Validate the consumed length - if (consumed <= 0 || i + consumed > bytes.length) { consumed = 1; } - + if (consumed <= 0 || i + consumed > bytes.length) consumed = 1; } catch (Exception e) { - // Log the error for debugging purposes - System.err.println( - "Error decoding character at position " + i + ": " + e.getMessage()); decodedChar = null; consumed = 1; } - // Handle null or empty decoded characters if (decodedChar == null || decodedChar.isEmpty()) { decodedChar = handleUndecodableChar(bytes, i, consumed); } @@ -345,15 +408,14 @@ public class TextDecodingHelper { i += consumed; } - map.text = sb.toString(); - map.charByteStart = starts.stream().mapToInt(Integer::intValue).toArray(); - map.charByteEnd = ends.stream().mapToInt(Integer::intValue).toArray(); + map.setText(sb.toString()); + map.setCharByteStart(starts.stream().mapToInt(Integer::intValue).toArray()); + map.setCharByteEnd(ends.stream().mapToInt(Integer::intValue).toArray()); return map; } - private static String decodeType0Font(PDType0Font font, byte[] bytes, int position) { + private String decodeType0Font(PDType0Font font, byte[] bytes, int position) { try { - // Try multi-byte decoding first (common for CJK fonts) if (position + 1 < bytes.length) { int b1 = bytes[position] & 0xFF; int b2 = bytes[position + 1] & 0xFF; @@ -372,7 +434,7 @@ public class TextDecodingHelper { } } - private static int getType0CharLength(PDType0Font font, byte[] bytes, int position) { + private int getType0CharLength(PDType0Font font, byte[] bytes, int position) { try { if (position + 1 < bytes.length) { int b1 = bytes[position] & 0xFF; @@ -389,7 +451,7 @@ public class TextDecodingHelper { } } - private static String decodeType1Font(PDType1Font font, byte[] bytes, int position) { + private String decodeType1Font(PDType1Font font, byte[] bytes, int position) { try { int code = bytes[position] & 0xFF; return font.toUnicode(code); @@ -398,11 +460,7 @@ public class TextDecodingHelper { } } - private static int getType1CharLength(PDType1Font font, byte[] bytes, int position) { - return 1; // Type1 fonts are typically single-byte - } - - private static String decodeType3Font(PDType3Font font, byte[] bytes, int position) { + private String decodeType3Font(PDType3Font font, byte[] bytes, int position) { try { int code = bytes[position] & 0xFF; return font.toUnicode(code); @@ -411,7 +469,7 @@ public class TextDecodingHelper { } } - private static String decodeTrueTypeFont(PDTrueTypeFont font, byte[] bytes, int position) { + private String decodeTrueTypeFont(PDTrueTypeFont font, byte[] bytes, int position) { try { int code = bytes[position] & 0xFF; String unicode = font.toUnicode(code); @@ -429,7 +487,7 @@ public class TextDecodingHelper { } } - private static int getTrueTypeCharLength(PDTrueTypeFont font, byte[] bytes, int position) { + private int getTrueTypeCharLength(PDTrueTypeFont font, byte[] bytes, int position) { try { // First try single byte int code = bytes[position] & 0xFF; @@ -454,7 +512,7 @@ public class TextDecodingHelper { } } - private static String decodeGenericFont(PDFont font, byte[] bytes, int position) { + private String decodeGenericFont(PDFont font, byte[] bytes, int position) { try { int code = bytes[position] & 0xFF; return font.toUnicode(code); @@ -463,13 +521,8 @@ public class TextDecodingHelper { } } - private static int getGenericCharLength(PDFont font, byte[] bytes, int position) { - return 1; // Default to single byte for unknown font types - } + private String handleUndecodableChar(byte[] bytes, int position, int length) { - private static String handleUndecodableChar(byte[] bytes, int position, int length) { - - // Or try to interpret as ISO-8859-1 (Latin-1) as fallback try { byte[] charBytes = new byte[length]; System.arraycopy(bytes, position, charBytes, 0, length); @@ -478,9 +531,7 @@ public class TextDecodingHelper { return fallback; } } catch (Exception e) { - // Ignore and fall through to default } - - return "�"; // Unicode replacement character instead of "?" + return "�"; } } diff --git a/app/core/src/main/java/stirling/software/SPDF/utils/text/TextEncodingHelper.java b/app/core/src/main/java/stirling/software/SPDF/utils/text/TextEncodingHelper.java index 62d7130f2..748d1179f 100644 --- a/app/core/src/main/java/stirling/software/SPDF/utils/text/TextEncodingHelper.java +++ b/app/core/src/main/java/stirling/software/SPDF/utils/text/TextEncodingHelper.java @@ -1,11 +1,6 @@ package stirling.software.SPDF.utils.text; -import java.io.IOException; - import org.apache.pdfbox.pdmodel.font.PDFont; -import org.apache.pdfbox.pdmodel.font.PDSimpleFont; -import org.apache.pdfbox.pdmodel.font.encoding.DictionaryEncoding; -import org.apache.pdfbox.pdmodel.font.encoding.Encoding; import lombok.experimental.UtilityClass; import lombok.extern.slf4j.Slf4j; @@ -15,225 +10,360 @@ import lombok.extern.slf4j.Slf4j; public class TextEncodingHelper { public boolean canEncodeCharacters(PDFont font, String text) { - if (font == null || text == null || text.isEmpty()) { + if (font == null || text == null) { return false; } + if (text.isEmpty()) { + return true; + } + try { - // Step 1: Primary check - full-string encoding (permissive for "good" cases) byte[] encoded = font.encode(text); if (encoded.length > 0) { - log.debug( - "Text '{}' has good full-string encoding for font {} - permissively allowing", - text, - font.getName() != null ? font.getName() : "Unknown"); return true; } - - // Step 2: Smart array-based fallback for TJ operator-style text - log.debug( - "Full encoding failed for '{}' - using array-based fallback for font {}", - text, - font.getName() != null ? font.getName() : "Unknown"); - - return validateAsCodePointArray(font, text); - - } catch (IOException | IllegalArgumentException e) { - log.debug( - "Encoding exception for text '{}' with font {} - trying array fallback: {}", - text, - font.getName() != null ? font.getName() : "Unknown", - e.getMessage()); - - if (isFontSubset(font.getName()) || hasCustomEncoding(font)) { - return validateAsCodePointArray(font, text); - } - - return false; // Non-subset fonts with encoding exceptions are likely problematic + } catch (Exception e) { } + + return validateAsCodePointArray(font, text); } private boolean validateAsCodePointArray(PDFont font, String text) { + if (text == null || text.isEmpty()) { + return true; + } + int totalCodePoints = 0; int successfulCodePoints = 0; - // Iterate through code points (handles surrogates correctly per Unicode docs) for (int i = 0; i < text.length(); ) { int codePoint = text.codePointAt(i); String charStr = new String(Character.toChars(codePoint)); totalCodePoints++; try { - // Test encoding for this code point byte[] charEncoded = font.encode(charStr); if (charEncoded.length > 0) { - float charWidth = font.getStringWidth(charStr); - - if (charWidth >= 0) { - successfulCodePoints++; - log.debug( - "Code point '{}' (U+{}) encoded successfully", - charStr, - Integer.toHexString(codePoint).toUpperCase()); - } else { - log.debug( - "Code point '{}' (U+{}) has invalid width: {}", - charStr, - Integer.toHexString(codePoint).toUpperCase(), - charWidth); + try { + float charWidth = font.getStringWidth(charStr); + if (charWidth >= 0) { + successfulCodePoints++; + } + } catch (Exception e) { + try { + if (canDecodeCharacter(font, charStr)) { + successfulCodePoints++; + } + } catch (Exception e2) { + } } } else { - log.debug( - "Code point '{}' (U+{}) encoding failed - empty result", - charStr, - Integer.toHexString(codePoint).toUpperCase()); + try { + if (canDecodeCharacter(font, charStr)) { + successfulCodePoints++; + } + } catch (Exception e) { + } + } + } catch (Exception e) { + try { + if (canDecodeCharacter(font, charStr)) { + successfulCodePoints++; + } + } catch (Exception e2) { + if (isBasicCharacter(codePoint)) { + successfulCodePoints++; + } } - } catch (IOException | IllegalArgumentException e) { - log.debug( - "Code point '{}' (U+{}) validation failed: {}", - charStr, - Integer.toHexString(codePoint).toUpperCase(), - e.getMessage()); } - i += Character.charCount(codePoint); // Handle surrogates properly + i += Character.charCount(codePoint); } - double successRate = - totalCodePoints > 0 ? (double) successfulCodePoints / totalCodePoints : 0; - boolean isAcceptable = successRate >= 0.95; + if (totalCodePoints == 0) { + return true; + } - log.debug( - "Array validation for '{}': {}/{} code points successful ({:.1f}%) - {}", - text, - successfulCodePoints, - totalCodePoints, - successRate * 100, - isAcceptable ? "ALLOWING" : "rejecting"); - - return isAcceptable; + double successRate = (double) successfulCodePoints / totalCodePoints; + return successRate >= 0.1; } - public boolean isTextSegmentRemovable(PDFont font, String text) { - if (font == null || text == null || text.isEmpty()) { + private boolean canDecodeCharacter(PDFont font, String charStr) { + if (font == null || charStr == null || charStr.isEmpty()) { return false; } - // Log the attempt - log.debug( - "Evaluating text segment for removal: '{}' with font {}", - text, - font.getName() != null ? font.getName() : "Unknown Font"); + try { + for (int code = 0; code <= 0xFFFF; code++) { + try { + String decoded = font.toUnicode(code); + if (decoded != null && decoded.equals(charStr)) { + return true; + } + } catch (Exception e) { + } + } + } catch (Exception e) { + } + + return false; + } + + private boolean isBasicCharacter(int codePoint) { + return (codePoint >= 32 && codePoint <= 126) + || (codePoint >= 160 && codePoint <= 255) + || Character.isWhitespace(codePoint) + || Character.isLetterOrDigit(codePoint); + } + + public boolean isTextSegmentRemovable(PDFont font, String text) { + if (font == null || text == null) { + return false; + } + + if (text.isEmpty()) { + return true; + } if (isSimpleCharacter(text)) { try { font.encode(text); font.getStringWidth(text); - log.debug( - "Text '{}' is a simple character and passed validation - allowing removal", - text); return true; } catch (Exception e) { - log.debug( - "Simple character '{}' failed basic validation with font {}: {}", - text, - font.getName() != null ? font.getName() : "Unknown", - e.getMessage()); - return false; + try { + return canHandleText(font, text); + } catch (Exception e2) { + return false; + } } } - // For complex text, require comprehensive validation return isTextFullyRemovable(font, text); } - public boolean isTextFullyRemovable(PDFont font, String text) { - if (font == null || text == null || text.isEmpty()) { + private boolean canHandleText(PDFont font, String text) { + if (font == null || text == null) { return false; } + if (text.isEmpty()) { + return true; + } + + for (int i = 0; i < text.length(); ) { + int codePoint = text.codePointAt(i); + String charStr = new String(Character.toChars(codePoint)); + + boolean canHandle = false; + + try { + byte[] encoded = font.encode(charStr); + if (encoded.length > 0) { + canHandle = true; + } + } catch (Exception e) { + } + + if (!canHandle) { + try { + if (canDecodeCharacter(font, charStr)) { + canHandle = true; + } + } catch (Exception e) { + } + } + + if (!canHandle && isBasicCharacter(codePoint)) { + canHandle = true; + } + + if (!canHandle) { + return false; + } + + i += Character.charCount(codePoint); + } + + return true; + } + + public boolean isTextFullyRemovable(PDFont font, String text) { + if (font == null || text == null) { + return false; + } + + if (text.isEmpty()) { + return true; + } + try { - // Check 1: Verify encoding capability using new smart approach if (!canEncodeCharacters(font, text)) { - log.debug( - "Text '{}' failed encoding validation for font {}", - text, - font.getName() != null ? font.getName() : "Unknown"); return false; } - // Check 2: Validate width calculation capability - float width = font.getStringWidth(text); - if (width < 0) { // Allow zero width (invisible chars) but reject negative (invalid) - log.debug( - "Text '{}' has invalid width {} for font {}", - text, - width, - font.getName() != null ? font.getName() : "Unknown"); - return false; // Invalid metrics prevent accurate removal + try { + float width = font.getStringWidth(text); + if (width < 0) { + return false; + } + } catch (Exception e) { + try { + if (!canCalculateTextWidth(font, text)) { + return false; + } + } catch (Exception e2) { + return false; + } } - // Check 3: Verify font descriptor completeness for redaction area calculation - if (font.getFontDescriptor() == null) { - log.debug( - "Missing font descriptor for font {}", - font.getName() != null ? font.getName() : "Unknown"); - return false; + try { + if (font.getFontDescriptor() == null) { + try { + return canHandleWithoutDescriptor(font, text); + } catch (Exception e) { + return false; + } + } + } catch (Exception e) { + try { + return canHandleWithoutDescriptor(font, text); + } catch (Exception e2) { + return false; + } } - // Check 4: Test bounding box calculation for redaction area try { font.getFontDescriptor().getFontBoundingBox(); - } catch (IllegalArgumentException e) { - log.debug( - "Font bounding box unavailable for font {}: {}", - font.getName() != null ? font.getName() : "Unknown", - e.getMessage()); + } catch (Exception e) { + try { + return canHandleWithoutBoundingBox(font, text); + } catch (Exception e2) { + return false; + } + } + + return true; + + } catch (Exception e) { + try { + return canHandleText(font, text); + } catch (Exception e2) { + return false; + } + } + } + + private boolean canCalculateTextWidth(PDFont font, String text) { + if (font == null || text == null) { + return false; + } + + if (text.isEmpty()) { + return true; + } + + for (int i = 0; i < text.length(); ) { + int codePoint = text.codePointAt(i); + String charStr = new String(Character.toChars(codePoint)); + + boolean hasWidth = false; + try { + float charWidth = font.getStringWidth(charStr); + if (charWidth >= 0) { + hasWidth = true; + } + } catch (Exception e) { + try { + float defaultWidth = getDefaultCharWidth(font); + if (defaultWidth > 0) { + hasWidth = true; + } + } catch (Exception e2) { + } + } + + if (!hasWidth && isBasicCharacter(codePoint)) { + hasWidth = true; + } + + if (!hasWidth) { return false; } - log.debug( - "Text '{}' passed comprehensive validation for font {}", - text, - font.getName() != null ? font.getName() : "Unknown"); - return true; + i += Character.charCount(codePoint); + } - } catch (IOException e) { - log.debug( - "Text '{}' failed validation for font {} due to IO error: {}", - text, - font.getName() != null ? font.getName() : "Unknown", - e.getMessage()); - return false; - } catch (IllegalArgumentException e) { - log.debug( - "Text '{}' failed validation for font {} due to argument error: {}", - text, - font.getName() != null ? font.getName() : "Unknown", - e.getMessage()); - return false; + return true; + } + + private float getDefaultCharWidth(PDFont font) { + String[] testChars = {" ", "a", "A", "0", ".", "e", "!", "i", "l", "I"}; + for (String testChar : testChars) { + try { + float width = font.getStringWidth(testChar); + if (width > 0) { + return width; + } + } catch (Exception e) { + } + } + return 500; + } + + private boolean canHandleWithoutDescriptor(PDFont font, String text) { + try { + return canCalculateTextWidth(font, text); + } catch (Exception e) { + return canHandleText(font, text); + } + } + + private boolean canHandleWithoutBoundingBox(PDFont font, String text) { + try { + return canCalculateTextWidth(font, text); + } catch (Exception e) { + return canHandleText(font, text); } } private boolean isSimpleCharacter(String text) { - if (text == null || text.isEmpty()) { + if (text == null) { return false; } - if (text.length() > 20) { + if (text.isEmpty()) { + return true; + } + + if (text.length() > 50) { return false; } for (int i = 0; i < text.length(); i++) { char c = text.charAt(i); - // Allow letters, digits, and whitespace (most common cases) if (Character.isLetterOrDigit(c) || Character.isWhitespace(c)) { continue; } - // Allow common ASCII punctuation - if (c >= 32 && c <= 126 && ".,!?;:()-[]{}\"'/@#$%&*+=<>|\\~`".indexOf(c) >= 0) { + if (c >= 32 && c <= 126) { + continue; + } + + if (c >= 160 && c <= 255) { + continue; + } + + if (Character.getType(c) == Character.OTHER_PUNCTUATION + || Character.getType(c) == Character.DASH_PUNCTUATION + || Character.getType(c) == Character.START_PUNCTUATION + || Character.getType(c) == Character.END_PUNCTUATION + || Character.getType(c) == Character.CONNECTOR_PUNCTUATION + || Character.getType(c) == Character.OTHER_SYMBOL + || Character.getType(c) == Character.MATH_SYMBOL + || Character.getType(c) == Character.CURRENCY_SYMBOL) { continue; } @@ -243,111 +373,205 @@ public class TextEncodingHelper { return true; } - public boolean hasCustomEncoding(PDFont font) { - try { - if (font instanceof PDSimpleFont simpleFont) { - try { - Encoding encoding = simpleFont.getEncoding(); - if (encoding != null) { - // Check for dictionary-based custom encodings - if (encoding instanceof DictionaryEncoding) { - log.debug("Font {} uses DictionaryEncoding (custom)", font.getName()); - return true; - } - - String encodingName = encoding.getClass().getSimpleName(); - if (encodingName.contains("Custom") - || encodingName.contains("Dictionary")) { - log.debug( - "Font {} uses custom encoding: {}", - font.getName(), - encodingName); - return true; - } - } - } catch (Exception e) { - log.debug( - "Encoding detection failed for font {}: {}", - font.getName(), - e.getMessage()); - return true; // Assume custom if detection fails - } - } - - if (font instanceof org.apache.pdfbox.pdmodel.font.PDType0Font) { - log.debug( - "Font {} is Type0 (CID) - generally uses standard CMaps", - font.getName() != null ? font.getName() : "Unknown"); - return false; - } - - log.debug( - "Font {} type {} - assuming standard encoding", - font.getName() != null ? font.getName() : "Unknown", - font.getClass().getSimpleName()); - return false; - - } catch (IllegalArgumentException e) { - log.debug( - "Custom encoding detection failed for font {}: {}", - font.getName() != null ? font.getName() : "Unknown", - e.getMessage()); - return false; // Be forgiving on detection failure - } - } - public boolean fontSupportsCharacter(PDFont font, String character) { - if (font == null || character == null || character.isEmpty()) { + if (font == null || character == null) { return false; } + if (character.isEmpty()) { + return true; + } + try { byte[] encoded = font.encode(character); - if (encoded.length == 0) { - return false; + if (encoded.length > 0) { + try { + float width = font.getStringWidth(character); + if (width >= 0) { + return true; + } + } catch (Exception e) { + } + return true; } + } catch (Exception e) { + } - float width = font.getStringWidth(character); - return width > 0; + try { + if (canDecodeCharacter(font, character)) { + return true; + } + } catch (Exception e) { + } - } catch (IOException | IllegalArgumentException e) { - log.debug( - "Character '{}' not supported by font {}: {}", - character, - font.getName() != null ? font.getName() : "Unknown", - e.getMessage()); + for (int i = 0; i < character.length(); ) { + int codePoint = character.codePointAt(i); + if (isBasicCharacter(codePoint)) { + i += Character.charCount(codePoint); + continue; + } return false; } + + return true; } public boolean isFontSubset(String fontName) { if (fontName == null) { return false; } - return fontName.matches("^[A-Z]{6}\\+.*"); + + if (fontName.matches("^[A-Z]{6}\\+.*")) { + return true; + } + + if (fontName.matches("^[A-Z]{5}\\+.*")) { + return true; + } + + if (fontName.matches("^[A-Z]{4}\\+.*")) { + return true; + } + + if (fontName.contains("+")) { + String prefix = fontName.split("\\+")[0]; + if (prefix.matches("^[A-Z]+$") && prefix.length() >= 4) { + return true; + } + } + + return false; } public boolean canCalculateBasicWidths(PDFont font) { + if (font == null) { + return false; + } + try { float spaceWidth = font.getStringWidth(" "); - if (spaceWidth <= 0) { - return false; + if (spaceWidth > 0) { + return true; } + } catch (Exception e) { + } - String[] testChars = {"a", "A", "0", ".", "e", "!"}; - for (String ch : testChars) { + String[] testChars = { + "a", "A", "0", ".", "e", "!", "i", "l", "I", "m", "M", "W", "w", "1", "|", "-", "_", + "=", "+", "(", ")", "[", "]", "{", "}", "<", ">", "/", "\\", "?", ",", ";", ":", "\"", + "'", "`", "~", "@", "#", "$", "%", "^", "&", "*" + }; + int successCount = 0; + + for (String ch : testChars) { + try { + float width = font.getStringWidth(ch); + if (width > 0) { + successCount++; + if (successCount >= 3) { + return true; + } + } + } catch (Exception e) { + } + } + + try { + for (int code = 32; code <= 126; code++) { try { + String ch = String.valueOf((char) code); + float width = font.getStringWidth(ch); + if (width > 0) { + successCount++; + if (successCount >= 1) { + return true; + } + } + } catch (Exception e) { + } + } + } catch (Exception e) { + } + + try { + for (int code = 160; code <= 255; code++) { + try { + String ch = String.valueOf((char) code); float width = font.getStringWidth(ch); if (width > 0) { return true; } - } catch (IOException | IllegalArgumentException e) { + } catch (Exception e) { } } - - return false; // Can't calculate width for any test characters - } catch (IOException | IllegalArgumentException e) { - return false; // Font failed basic width calculation + } catch (Exception e) { } + + return false; + } + + public boolean canEncodeAnyCharacter(PDFont font) { + if (font == null) { + return false; + } + + String[] testStrings = { + "a", "A", "0", " ", ".", "!", "e", "i", "o", "u", "n", "t", "r", "s", "l", "1", "2", + "3", "4", "5", "6", "7", "8", "9", ",", ".", ";", ":", "?", "!", "(", ")", "[", "]", + "{", "}", "hello", "test", "sample", "abc", "123", "ABC" + }; + + for (String testStr : testStrings) { + try { + byte[] encoded = font.encode(testStr); + if (encoded.length > 0) { + return true; + } + } catch (Exception e) { + } + } + + for (int code = 0; code <= 0xFFFF; code += 100) { + try { + String testStr = String.valueOf((char) code); + byte[] encoded = font.encode(testStr); + if (encoded.length > 0) { + return true; + } + } catch (Exception e) { + } + } + + return false; + } + + public boolean isValidFont(PDFont font) { + if (font == null) { + return false; + } + + try { + String name = font.getName(); + if (name != null && !name.trim().isEmpty()) { + return true; + } + } catch (Exception e) { + } + + try { + if (canCalculateBasicWidths(font)) { + return true; + } + } catch (Exception e) { + } + + try { + if (canEncodeAnyCharacter(font)) { + return true; + } + } catch (Exception e) { + } + + return false; } } diff --git a/app/core/src/main/java/stirling/software/SPDF/utils/text/TextFinderUtils.java b/app/core/src/main/java/stirling/software/SPDF/utils/text/TextFinderUtils.java index 00b9d65cf..38d700572 100644 --- a/app/core/src/main/java/stirling/software/SPDF/utils/text/TextFinderUtils.java +++ b/app/core/src/main/java/stirling/software/SPDF/utils/text/TextFinderUtils.java @@ -5,10 +5,6 @@ import java.util.List; import java.util.Set; import java.util.regex.Pattern; -import org.apache.pdfbox.pdmodel.PDPage; -import org.apache.pdfbox.pdmodel.PDResources; -import org.apache.pdfbox.pdmodel.font.PDFont; - import lombok.experimental.UtilityClass; import lombok.extern.slf4j.Slf4j; @@ -16,128 +12,116 @@ import lombok.extern.slf4j.Slf4j; @UtilityClass public class TextFinderUtils { - public boolean validateFontReliability(PDFont font) { - if (font == null) { - return false; - } - - if (font.isDamaged()) { - log.debug( - "Font {} is marked as damaged - using TextEncodingHelper validation", - font.getName()); - } - - if (TextEncodingHelper.canCalculateBasicWidths(font)) { - log.debug( - "Font {} passed basic width calculations - considering reliable", - font.getName()); - return true; - } - - String[] basicTests = {"1", "2", "3", "a", "A", "e", "E", " "}; - - int workingChars = 0; - for (String testChar : basicTests) { - if (TextEncodingHelper.canEncodeCharacters(font, testChar)) { - workingChars++; - } - } - - if (workingChars > 0) { - log.debug( - "Font {} can process {}/{} basic characters - considering reliable", - font.getName(), - workingChars, - basicTests.length); - return true; - } - - log.debug("Font {} failed all basic tests - considering unreliable", font.getName()); - return false; - } - public List createOptimizedSearchPatterns( Set searchTerms, boolean useRegex, boolean wholeWordSearch) { List patterns = new ArrayList<>(); + if (searchTerms == null) { + return patterns; + } + for (String term : searchTerms) { - if (term == null || term.trim().isEmpty()) { + if (term == null) { + continue; + } + + String trimmedTerm = term.trim(); + if (trimmedTerm.isEmpty()) { continue; } try { - String patternString = useRegex ? term.trim() : Pattern.quote(term.trim()); - - if (wholeWordSearch) { - patternString = applyWordBoundaries(term.trim(), patternString); + String patternString; + if (useRegex) { + patternString = trimmedTerm; + try { + Pattern.compile(patternString); + } catch (Exception e) { + patternString = Pattern.quote(trimmedTerm); + } + } else { + patternString = Pattern.quote(trimmedTerm); } - Pattern pattern = - Pattern.compile( - patternString, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); + if (wholeWordSearch) { + patternString = applyWordBoundaries(trimmedTerm, patternString, useRegex); + } + + int flags = Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE | Pattern.DOTALL; + try { + flags |= Pattern.CANON_EQ; + } catch (Exception e) { + } + + Pattern pattern = Pattern.compile(patternString, flags); patterns.add(pattern); - log.debug("Created search pattern: '{}' -> '{}'", term.trim(), patternString); - } catch (Exception e) { - log.warn("Failed to create pattern for term '{}': {}", term, e.getMessage()); + try { + String quotedTerm = Pattern.quote(trimmedTerm); + if (wholeWordSearch) { + quotedTerm = applyWordBoundaries(trimmedTerm, quotedTerm, false); + } + Pattern fallbackPattern = + Pattern.compile( + quotedTerm, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); + patterns.add(fallbackPattern); + } catch (Exception e2) { + try { + Pattern simplestPattern = Pattern.compile(Pattern.quote(trimmedTerm)); + patterns.add(simplestPattern); + } catch (Exception e3) { + } + } } } return patterns; } - private String applyWordBoundaries(String originalTerm, String patternString) { - if (originalTerm.length() == 1 && Character.isDigit(originalTerm.charAt(0))) { - return "(? 0 && (completelyUnusableFonts * 2 > totalFonts); - log.debug( - "Page font analysis: {}/{} fonts are completely unusable - page {} problematic", - completelyUnusableFonts, - totalFonts, - hasProblems ? "IS" : "is NOT"); + boolean startsWithWordChar = Character.isLetterOrDigit(originalTerm.charAt(0)); + boolean endsWithWordChar = + Character.isLetterOrDigit(originalTerm.charAt(originalTerm.length() - 1)); - return hasProblems; + String result = patternString; + + if (startsWithWordChar) { + result = "(? widthCache = new ConcurrentHashMap<>(); - private final Map reliabilityCache = new ConcurrentHashMap<>(); - - private String createCacheKey(PDFont font, String text, float fontSize) { - return String.format("%s|%s|%.2f", font.getName(), text, fontSize); - } - - private String createReliabilityCacheKey(PDFont font) { - return font.getName(); - } - public float calculateAccurateWidth(PDFont font, String text, float fontSize) { - return calculateAccurateWidth(font, text, fontSize, true); - } + if (font == null || text == null || fontSize <= 0) { + return 0; + } - public float calculateAccurateWidth( - PDFont font, String text, float fontSize, boolean useCache) { - if (font == null || text == null || text.isEmpty() || fontSize <= 0) return 0; - - if (useCache) { - String cacheKey = createCacheKey(font, text, fontSize); - Float cachedWidth = widthCache.get(cacheKey); - if (cachedWidth != null) return cachedWidth; + if (text.isEmpty()) { + return 0; } String normalizedText = normalizeText(text); Float directWidth = calculateDirectWidth(font, normalizedText, fontSize); if (directWidth != null) { - if (useCache) widthCache.put(createCacheKey(font, text, fontSize), directWidth); return directWidth; } Float charByCharWidth = calculateCharacterByCharacterWidth(font, normalizedText, fontSize); if (charByCharWidth != null) { - if (useCache) widthCache.put(createCacheKey(font, text, fontSize), charByCharWidth); return charByCharWidth; } Float glyphWidth = calculateGlyphBasedWidth(font, normalizedText, fontSize); if (glyphWidth != null) { - if (useCache) widthCache.put(createCacheKey(font, text, fontSize), glyphWidth); return glyphWidth; } - float fallbackWidth = calculateComprehensiveFallbackWidth(font, normalizedText, fontSize); - if (useCache) widthCache.put(createCacheKey(font, text, fontSize), fallbackWidth); - return fallbackWidth; + return calculateComprehensiveFallbackWidth(font, normalizedText, fontSize); } private String normalizeText(String text) { - return Normalizer.normalize(text, Normalizer.Form.NFC); + if (text == null) return ""; + try { + return Normalizer.normalize(text, Normalizer.Form.NFC); + } catch (Exception e) { + return text; + } } private Float calculateDirectWidth(PDFont font, String text, float fontSize) { - if (!TextEncodingHelper.canEncodeCharacters(font, text)) return null; - try { - float rawWidth = font.getStringWidth(text); - float scaledWidth = (rawWidth / FONT_SCALE_FACTOR) * fontSize; - return rawWidth >= 0 && scaledWidth >= 0 ? scaledWidth : null; + if (!TextEncodingHelper.canEncodeCharacters(font, text)) { + return null; + } + float rawWidth = font.getStringWidth(text) / 1000f; + if (rawWidth < 0) return null; + float scaledWidth = rawWidth * fontSize; + return scaledWidth >= 0 ? scaledWidth : null; } catch (Exception e) { return null; } @@ -96,7 +77,12 @@ public class WidthCalculator { for (int codePoint : codePoints) { String character = new String(Character.toChars(codePoint)); - Float charWidth = calculateSingleCharacterWidth(font, character, fontSize); + Float charWidth = + calculateSingleCharacterWidth(font, character, fontSize, codePoint); + + if (charWidth == null) { + return null; + } totalWidth += charWidth; if (previousCodePoint != -1) { @@ -104,7 +90,7 @@ public class WidthCalculator { } previousCodePoint = codePoint; } - return totalWidth; + return totalWidth >= 0 ? totalWidth : null; } catch (Exception e) { return null; } @@ -112,80 +98,99 @@ public class WidthCalculator { private List getCodePoints(String text) { List codePoints = new ArrayList<>(); + if (text == null) return codePoints; + for (int i = 0; i < text.length(); ) { - int codePoint = text.codePointAt(i); - codePoints.add(codePoint); - i += Character.charCount(codePoint); + try { + int codePoint = text.codePointAt(i); + codePoints.add(codePoint); + i += Character.charCount(codePoint); + } catch (Exception e) { + i++; + } } return codePoints; } - private Float calculateSingleCharacterWidth(PDFont font, String character, float fontSize) { + private Float calculateSingleCharacterWidth( + PDFont font, String character, float fontSize, int codePoint) { try { - byte[] encoded = null; - - try { - encoded = font.encode(character); - if (encoded.length == 0) encoded = null; - } catch (Exception e) { - log.debug("Direct encoding failed for '{}': {}", character, e.getMessage()); - } - - if (encoded == null && font instanceof PDType0Font) { + if (TextEncodingHelper.fontSupportsCharacter(font, character)) { try { - encoded = character.getBytes(StandardCharsets.UTF_8); - } catch (Exception e) { - log.debug("UTF-8 encoding failed for '{}': {}", character, e.getMessage()); + float raw = font.getStringWidth(character) / 1000f; + if (raw >= 0) return raw * fontSize; + } catch (Exception ignored) { } } - - if (encoded != null && encoded.length > 0) { - Float width = calculateGlyphWidth(font, encoded, fontSize); - if (width != null && width >= 0) return width; - } - - return calculateAverageCharacterWidth(font, fontSize); - } catch (Exception e) { - log.debug( - "Single character width calculation failed for '{}': {}", - character, - e.getMessage()); - return calculateAverageCharacterWidth(font, fontSize); } - } - private Float calculateGlyphWidth(PDFont font, byte[] encoded, float fontSize) { - for (byte b : encoded) { - try { - int glyphCode = b & 0xFF; - float glyphWidth = font.getWidth(glyphCode); + try { + float w = font.getWidth(codePoint) / 1000f; + if (w >= 0) return w * fontSize; + } catch (Exception ignored) { + } - if (glyphWidth > 0) { - return (glyphWidth / FONT_SCALE_FACTOR) * fontSize; - } - - // Try alternative width methods - try { - glyphWidth = font.getWidthFromFont(glyphCode); - if (glyphWidth > 0) { - return (glyphWidth / FONT_SCALE_FACTOR) * fontSize; - } - } catch (Exception e) { - log.debug( - "getWidthFromFont failed for glyph {}: {}", glyphCode, e.getMessage()); - } - - } catch (Exception e) { - log.debug("Glyph width calculation failed for byte {}: {}", b, e.getMessage()); + try { + if (codePoint >= 0 && codePoint <= 0xFFFF) { + float w = font.getWidth(codePoint) / 1000f; + if (w >= 0) return w * fontSize; } + } catch (Exception ignored) { } - return null; + + try { + byte[] encoded = font.encode(character); + if (encoded.length > 0) { + for (byte b : encoded) { + try { + int glyphCode = b & 0xFF; + float w = font.getWidth(glyphCode) / 1000f; + if (w >= 0) return w * fontSize; + } catch (Exception ignored) { + } + } + } + } catch (Exception ignored) { + } + + return calculateCategoryBasedWidth(font, codePoint, fontSize); } private float calculateKerning( PDFont font, int leftCodePoint, int rightCodePoint, float fontSize) { - return 0; + try { + if (font instanceof PDSimpleFont) { + PDSimpleFont simpleFont = (PDSimpleFont) font; + try { + java.lang.reflect.Method getKerningMethod = + simpleFont.getClass().getMethod("getKerning", int.class, int.class); + float kerningValue = + (Float) + getKerningMethod.invoke( + simpleFont, leftCodePoint, rightCodePoint); + return (kerningValue / 1000f) * fontSize; + } catch (Exception e) { + } + } + } catch (Exception e) { + } + + try { + String leftChar = new String(Character.toChars(leftCodePoint)); + String rightChar = new String(Character.toChars(rightCodePoint)); + String combined = leftChar + rightChar; + + float combinedWidth = font.getStringWidth(combined) / 1000f; + float leftWidth = font.getStringWidth(leftChar) / 1000f; + float rightWidth = font.getStringWidth(rightChar) / 1000f; + + float kerning = combinedWidth - leftWidth - rightWidth; + return kerning * fontSize; + } catch (Exception e) { + } + + return 0f; } private Float calculateGlyphBasedWidth(PDFont font, String text, float fontSize) { @@ -196,7 +201,6 @@ public class WidthCalculator { int codePoint = text.codePointAt(i); String character = new String(Character.toChars(codePoint)); - // Try to get glyph information more comprehensively Float charWidth = calculateGlyphWidthComprehensively(font, character, codePoint, fontSize); if (charWidth == null) { @@ -207,11 +211,9 @@ public class WidthCalculator { i += Character.charCount(codePoint); } - log.debug("Glyph-based width calculation: {}", totalWidth); - return totalWidth; + return totalWidth >= 0 ? totalWidth : null; } catch (Exception e) { - log.debug("Glyph-based calculation failed: {}", e.getMessage()); return null; } } @@ -219,70 +221,118 @@ public class WidthCalculator { private Float calculateGlyphWidthComprehensively( PDFont font, String character, int codePoint, float fontSize) { try { - // Method 1: Try standard encoding - try { - byte[] encoded = font.encode(character); - if (encoded.length > 0) { - Float width = calculateWidthFromEncodedBytes(font, encoded, fontSize); - if (width != null && width >= 0) { - return width; - } + byte[] encoded = font.encode(character); + if (encoded.length > 0) { + Float width = calculateWidthFromEncodedBytes(font, encoded, fontSize); + if (width != null && width >= 0) { + return width; } - } catch (Exception e) { - log.debug( - "Standard encoding failed for U+{}: {}", - Integer.toHexString(codePoint), - e.getMessage()); } - - // Method 2: Try Unicode code point directly - try { - float glyphWidth = font.getWidth(codePoint); - if (glyphWidth > 0) { - return (glyphWidth / FONT_SCALE_FACTOR) * fontSize; - } - } catch (Exception e) { - log.debug( - "Unicode code point width failed for U+{}: {}", - Integer.toHexString(codePoint), - e.getMessage()); - } - - // Method 3: Character category based estimation - return calculateCategoryBasedWidth(font, codePoint, fontSize); - } catch (Exception e) { - log.debug("Comprehensive glyph width calculation failed: {}", e.getMessage()); - return calculateAverageCharacterWidth(font, fontSize); } + + try { + float glyphWidth = font.getWidth(codePoint) / 1000f; + if (glyphWidth >= 0) { + return glyphWidth * fontSize; + } + } catch (Exception e) { + } + + try { + if (codePoint <= 0xFFFF) { + float glyphWidth = font.getWidth(codePoint) / 1000f; + if (glyphWidth >= 0) { + return glyphWidth * fontSize; + } + } + } catch (Exception e) { + } + + try { + for (int code = 0; code <= 0xFF; code++) { + try { + String decoded = font.toUnicode(code); + if (decoded != null && decoded.equals(character)) { + float glyphWidth = font.getWidth(code) / 1000f; + if (glyphWidth >= 0) { + return glyphWidth * fontSize; + } + } + } catch (Exception e) { + } + } + } catch (Exception e) { + } + + return calculateCategoryBasedWidth(font, codePoint, fontSize); } private Float calculateWidthFromEncodedBytes(PDFont font, byte[] encoded, float fontSize) { - // Try each byte as a potential glyph code - for (byte b : encoded) { + if (encoded == null || encoded.length == 0) return null; + + if (font instanceof PDType0Font && encoded.length >= 2) { try { - int glyphCode = b & 0xFF; - float width = font.getWidth(glyphCode); - if (width > 0) { - return (width / FONT_SCALE_FACTOR) * fontSize; + int glyphCode = ((encoded[0] & 0xFF) << 8) | (encoded[1] & 0xFF); + float width = font.getWidth(glyphCode) / 1000f; + if (width >= 0) { + return width * fontSize; + } + } catch (Exception e) { + } + + try { + for (int i = 0; i <= encoded.length - 2; i++) { + int glyphCode = ((encoded[i] & 0xFF) << 8) | (encoded[i + 1] & 0xFF); + float width = font.getWidth(glyphCode) / 1000f; + if (width >= 0) { + return width * fontSize; + } } } catch (Exception e) { - // Continue trying other bytes } } - if (encoded.length >= 2 && font instanceof PDType0Font) { + for (byte b : encoded) { try { - int glyphCode = ((encoded[0] & 0xFF) << 8) | (encoded[1] & 0xFF); - float width = font.getWidth(glyphCode); - if (width > 0) { - return (width / FONT_SCALE_FACTOR) * fontSize; + int glyphCode = b & 0xFF; + float width = font.getWidth(glyphCode) / 1000f; + if (width >= 0) { + return width * fontSize; } } catch (Exception e) { - log.debug("Multi-byte glyph code interpretation failed: {}", e.getMessage()); } } + try { + if (encoded.length >= 3) { + int glyphCode = + ((encoded[0] & 0xFF) << 16) + | ((encoded[1] & 0xFF) << 8) + | (encoded[2] & 0xFF); + float width = font.getWidth(glyphCode) / 1000f; + if (width >= 0) { + return width * fontSize; + } + } + } catch (Exception e) { + } + + try { + if (encoded.length >= 4) { + int glyphCode = + ((encoded[0] & 0xFF) << 24) + | ((encoded[1] & 0xFF) << 16) + | ((encoded[2] & 0xFF) << 8) + | (encoded[3] & 0xFF); + float width = font.getWidth(glyphCode) / 1000f; + if (width >= 0) { + return width * fontSize; + } + } + } catch (Exception e) { + } + return null; } @@ -291,198 +341,237 @@ public class WidthCalculator { int category = Character.getType(codePoint); float baseWidth = calculateAverageCharacterWidth(font, fontSize); - // Adjust width based on character category float multiplier = switch (category) { case Character.UPPERCASE_LETTER -> 1.2f; case Character.LOWERCASE_LETTER -> 1.0f; - case Character.DECIMAL_DIGIT_NUMBER -> 1.0f; - case Character.SPACE_SEPARATOR -> 0.5f; - case Character.DASH_PUNCTUATION -> 0.8f; - case Character.OTHER_PUNCTUATION -> 0.6f; - case Character.CURRENCY_SYMBOL -> 1.1f; - case Character.MATH_SYMBOL -> 1.0f; + case Character.TITLECASE_LETTER -> 1.15f; case Character.MODIFIER_LETTER -> 0.7f; - case Character.NON_SPACING_MARK -> 0.0f; // Combining characters + case Character.OTHER_LETTER -> 1.0f; + case Character.DECIMAL_DIGIT_NUMBER -> 1.0f; + case Character.LETTER_NUMBER -> 1.0f; + case Character.OTHER_NUMBER -> 1.0f; + case Character.SPACE_SEPARATOR -> 0.5f; + case Character.LINE_SEPARATOR -> 0.0f; + case Character.PARAGRAPH_SEPARATOR -> 0.0f; + case Character.NON_SPACING_MARK -> 0.0f; case Character.ENCLOSING_MARK -> 0.0f; case Character.COMBINING_SPACING_MARK -> 0.3f; + case Character.DASH_PUNCTUATION -> 0.8f; + case Character.START_PUNCTUATION -> 0.6f; + case Character.END_PUNCTUATION -> 0.6f; + case Character.CONNECTOR_PUNCTUATION -> 0.6f; + case Character.OTHER_PUNCTUATION -> 0.6f; + case Character.MATH_SYMBOL -> 1.0f; + case Character.CURRENCY_SYMBOL -> 1.1f; + case Character.MODIFIER_SYMBOL -> 0.8f; + case Character.OTHER_SYMBOL -> 1.0f; + case Character.INITIAL_QUOTE_PUNCTUATION -> 0.6f; + case Character.FINAL_QUOTE_PUNCTUATION -> 0.6f; + case Character.CONTROL -> 0.0f; + case Character.FORMAT -> 0.0f; + case Character.PRIVATE_USE -> 1.0f; + case Character.SURROGATE -> 0.0f; + case Character.UNASSIGNED -> 1.0f; default -> 1.0f; }; - return baseWidth * multiplier; + float result = baseWidth * multiplier; + return result >= 0 ? result : baseWidth; } catch (Exception e) { - log.debug("Category-based width calculation failed: {}", e.getMessage()); return calculateAverageCharacterWidth(font, fontSize); } } private float calculateAverageCharacterWidth(PDFont font, float fontSize) { try { - float avgWidth = font.getAverageFontWidth(); - return (avgWidth / FONT_SCALE_FACTOR) * fontSize; + float avgWidth = font.getAverageFontWidth() / 1000f; + if (avgWidth > 0) { + return avgWidth * fontSize; + } } catch (Exception e) { - log.debug("Average character width calculation failed: {}", e.getMessage()); - return CONSERVATIVE_CHAR_WIDTH_RATIO * fontSize; } + + try { + String[] testChars = { + "a", "A", "e", "E", "i", "I", "o", "O", "n", "N", "t", "T", "r", "R", "s", "S", "0", + "1", "2", "3", "4", "5" + }; + float totalWidth = 0; + int successCount = 0; + + for (String testChar : testChars) { + try { + float width = font.getStringWidth(testChar) / 1000f; + if (width > 0) { + totalWidth += width; + successCount++; + } + } catch (Exception e) { + } + } + + if (successCount > 0) { + return (totalWidth / successCount) * fontSize; + } + } catch (Exception e) { + } + + try { + for (int code = 32; code <= 126; code++) { + try { + float width = font.getWidth(code) / 1000f; + if (width > 0) { + return width * fontSize; + } + } catch (Exception e) { + } + } + } catch (Exception e) { + } + + try { + if (font.getFontDescriptor() != null) { + PDRectangle bbox = font.getFontDescriptor().getFontBoundingBox(); + if (bbox != null) { + float avgCharWidth = bbox.getWidth() / 2000f; + return avgCharWidth * fontSize; + } + } + } catch (Exception e) { + } + + return CONSERVATIVE_CHAR_WIDTH_RATIO * fontSize; } private float calculateComprehensiveFallbackWidth(PDFont font, String text, float fontSize) { + if (text == null || text.isEmpty()) { + return 0; + } + + try { + float charWidth = calculateAverageCharacterWidth(font, fontSize); + float totalWidth = 0; + + for (int i = 0; i < text.length(); ) { + int codePoint = text.codePointAt(i); + Float specificWidth = calculateCategoryBasedWidth(font, codePoint, fontSize); + if (specificWidth != null) { + totalWidth += specificWidth; + } else { + totalWidth += charWidth; + } + i += Character.charCount(codePoint); + } + + return totalWidth; + } catch (Exception e) { + } + try { - // Strategy 1: Use font bounding box with character analysis if (font.getFontDescriptor() != null && font.getFontDescriptor().getFontBoundingBox() != null) { - PDRectangle bbox = font.getFontDescriptor().getFontBoundingBox(); - float avgCharWidth = bbox.getWidth() / FONT_SCALE_FACTOR; - - // Analyze text composition for better estimation - float adjustedWidth = analyzeTextComposition(text, avgCharWidth, fontSize); - log.debug("Bounding box based fallback width: {}", adjustedWidth); - return adjustedWidth; + float avgCharWidth = bbox.getWidth() / 1000f; + return text.length() * avgCharWidth * BBOX_CHAR_WIDTH_RATIO * fontSize; } - - // Strategy 2: Enhanced average width calculation - float enhancedAverage = calculateEnhancedAverageWidth(font, text, fontSize); - log.debug("Enhanced average fallback width: {}", enhancedAverage); - return enhancedAverage; - } catch (Exception e) { - float conservativeWidth = text.length() * CONSERVATIVE_CHAR_WIDTH_RATIO * fontSize; - log.debug("Conservative fallback width: {}", conservativeWidth); - return conservativeWidth; - } - } - - private float analyzeTextComposition(String text, float avgCharWidth, float fontSize) { - float totalWidth = 0; - int spaceCount = 0; - int upperCount = 0; - int lowerCount = 0; - int digitCount = 0; - int punctCount = 0; - - for (int i = 0; i < text.length(); ) { - int codePoint = text.codePointAt(i); - int category = Character.getType(codePoint); - - switch (category) { - case Character.SPACE_SEPARATOR -> { - spaceCount++; - totalWidth += avgCharWidth * 0.5f * fontSize; - } - case Character.UPPERCASE_LETTER -> { - upperCount++; - totalWidth += avgCharWidth * 1.2f * fontSize; - } - case Character.LOWERCASE_LETTER -> { - lowerCount++; - totalWidth += avgCharWidth * 1.0f * fontSize; - } - case Character.DECIMAL_DIGIT_NUMBER -> { - digitCount++; - totalWidth += avgCharWidth * 1.0f * fontSize; - } - case Character.OTHER_PUNCTUATION, Character.DASH_PUNCTUATION -> { - punctCount++; - totalWidth += avgCharWidth * 0.7f * fontSize; - } - default -> totalWidth += avgCharWidth * BBOX_CHAR_WIDTH_RATIO * fontSize; - } - - i += Character.charCount(codePoint); } - log.debug( - "Text composition analysis - Spaces: {}, Upper: {}, Lower: {}, Digits: {}, Punct: {}", - spaceCount, - upperCount, - lowerCount, - digitCount, - punctCount); - - return totalWidth; - } - - private float calculateEnhancedAverageWidth(PDFont font, String text, float fontSize) { - try { - float baseAverage = font.getAverageFontWidth(); - - float capHeight = 0; - float xHeight = 0; - - if (font.getFontDescriptor() != null) { - capHeight = font.getFontDescriptor().getCapHeight(); - xHeight = font.getFontDescriptor().getXHeight(); - } - - float adjustmentFactor = 1.0f; - if (capHeight > 0 && xHeight > 0) { - adjustmentFactor = Math.max(0.8f, Math.min(1.2f, xHeight / capHeight)); - } - - float adjustedAverage = (baseAverage * adjustmentFactor / FONT_SCALE_FACTOR) * fontSize; - return text.length() * adjustedAverage; - - } catch (Exception e) { - log.debug("Enhanced average width calculation failed: {}", e.getMessage()); - return text.length() * CONSERVATIVE_CHAR_WIDTH_RATIO * fontSize; - } + return text.length() * calculateAverageCharacterWidth(font, fontSize); } public boolean isWidthCalculationReliable(PDFont font) { - if (font == null) { - return false; + if (font == null) return false; + + try { + if (font.isDamaged()) return false; + } catch (Exception e) { } - String cacheKey = createReliabilityCacheKey(font); - Boolean cachedResult = reliabilityCache.get(cacheKey); - if (cachedResult != null) { - log.debug( - "Using cached reliability result for font {}: {}", - font.getName(), - cachedResult); - return cachedResult; + try { + if (!TextEncodingHelper.canCalculateBasicWidths(font)) return false; + } catch (Exception e) { } - boolean result = performReliabilityCheck(font); + try { + font.getStringWidth("A"); + return true; + } catch (Exception e) { + } - reliabilityCache.put(cacheKey, result); - return result; + try { + font.getAverageFontWidth(); + return true; + } catch (Exception e) { + } + + try { + float width = font.getWidth(65); + return width >= 0; + } catch (Exception e) { + } + + return false; } - private boolean performReliabilityCheck(PDFont font) { + public float calculateMinimumTextWidth(PDFont font, String text, float fontSize) { + if (font == null || text == null || text.isEmpty() || fontSize <= 0) { + return 0; + } + try { - if (font.isDamaged()) { - log.debug("Font {} is damaged", font.getName()); - return false; + float minWidth = calculateAccurateWidth(font, text, fontSize); + if (minWidth > 0) { + return minWidth * 0.8f; } - - if (!TextEncodingHelper.canCalculateBasicWidths(font)) { - log.debug("Font {} cannot perform basic width calculations", font.getName()); - return false; - } - - try { - font.getStringWidth("A"); - return true; - } catch (Exception e) { - log.debug("Font {} failed basic width test: {}", font.getName(), e.getMessage()); - } - - // Check if we can at least get average width - try { - float avgWidth = font.getAverageFontWidth(); - return avgWidth > 0; - } catch (Exception e) { - log.debug( - "Font {} cannot provide average width: {}", font.getName(), e.getMessage()); - } - - return false; - } catch (Exception e) { - log.debug("Reliability check failed for font {}: {}", font.getName(), e.getMessage()); + } + + return text.length() * fontSize * 0.3f; + } + + public float calculateMaximumTextWidth(PDFont font, String text, float fontSize) { + if (font == null || text == null || text.isEmpty() || fontSize <= 0) { + return 0; + } + + try { + float maxWidth = calculateAccurateWidth(font, text, fontSize); + if (maxWidth > 0) { + return maxWidth * 1.2f; + } + } catch (Exception e) { + } + + return text.length() * fontSize * 1.0f; + } + + public boolean canCalculateWidthForText(PDFont font, String text) { + if (font == null || text == null) { return false; } + + if (text.isEmpty()) { + return true; + } + + try { + Float width = calculateDirectWidth(font, text, 12f); + if (width != null) { + return true; + } + } catch (Exception e) { + } + + try { + Float width = calculateCharacterByCharacterWidth(font, text, 12f); + if (width != null) { + return true; + } + } catch (Exception e) { + } + + return true; } }