diff --git a/app/core/src/main/java/stirling/software/SPDF/pdf/TextFinder.java b/app/core/src/main/java/stirling/software/SPDF/pdf/TextFinder.java index d25847434..45c67a407 100644 --- a/app/core/src/main/java/stirling/software/SPDF/pdf/TextFinder.java +++ b/app/core/src/main/java/stirling/software/SPDF/pdf/TextFinder.java @@ -1,8 +1,12 @@ package stirling.software.SPDF.pdf; import java.io.IOException; +import java.text.Normalizer; import java.util.ArrayList; +import java.util.Collections; import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.text.PDFTextStripper; @@ -15,45 +19,41 @@ import stirling.software.SPDF.model.PDFText; @Slf4j public class TextFinder extends PDFTextStripper { - private final String searchTerm; - private final boolean useRegex; - private final boolean wholeWordSearch; - private final List foundTexts = new ArrayList<>(); - - private final List pageTextPositions = new ArrayList<>(); - private final StringBuilder pageTextBuilder = new StringBuilder(); - - public TextFinder(String searchTerm, boolean useRegex, boolean wholeWordSearch) - throws IOException { - this.searchTerm = searchTerm; - this.useRegex = useRegex; - this.wholeWordSearch = wholeWordSearch; - this.setWordSeparator(" "); + private static String removeDiacritics(String input) { + if (input == null || input.isEmpty()) return input; + String nfd = Normalizer.normalize(input, Normalizer.Form.NFD); + // remove combining diacritical marks + String stripped = nfd.replaceAll("\\p{M}+", ""); + return Normalizer.normalize(stripped, Normalizer.Form.NFC); } - @Override - protected void startPage(PDPage page) throws IOException { - super.startPage(page); - pageTextPositions.clear(); - pageTextBuilder.setLength(0); - } - - @Override - protected void writeString(String text, List textPositions) { - pageTextBuilder.append(text); - pageTextPositions.addAll(textPositions); - } - - @Override - protected void writeWordSeparator() { - pageTextBuilder.append(getWordSeparator()); - pageTextPositions.add(null); // Placeholder for separator - } - - @Override - protected void writeLineSeparator() { - pageTextBuilder.append(getLineSeparator()); - pageTextPositions.add(null); // Placeholder for separator + private static NormalizedMap buildNormalizedMap(String original) { + if (original == null) return new NormalizedMap("", new int[0]); + StringBuilder sb = new StringBuilder(original.length()); + // Worst case map size equals original length + int[] tempMap = new int[original.length() * 2]; + int normIdx = 0; + for (int i = 0; i < original.length(); i++) { + char ch = original.charAt(i); + // Normalize this single char; handle precomposed accents common in PDF text + String nfd = Normalizer.normalize(String.valueOf(ch), Normalizer.Form.NFD); + String base = nfd.replaceAll("\\p{M}+", ""); + // Append each resulting char and map back to original index i + for (int j = 0; j < base.length(); j++) { + char b = base.charAt(j); + sb.append(b); + if (normIdx >= tempMap.length) { + // expand temp map + int[] newMap = new int[tempMap.length * 2]; + System.arraycopy(tempMap, 0, newMap, 0, tempMap.length); + tempMap = newMap; + } + tempMap[normIdx++] = i; + } + } + int[] map = new int[normIdx]; + System.arraycopy(tempMap, 0, map, 0, normIdx); + return new NormalizedMap(sb.toString(), map); } @Override @@ -86,6 +86,71 @@ public class TextFinder extends PDFTextStripper { } } if (activePattern == null) { + if (!this.useRegex) { + NormalizedMap nm = buildNormalizedMap(text); + String normText = nm.normalized(); + String normTerm = removeDiacritics(processedSearchTerm); + List normPatterns = + stirling.software.SPDF.utils.text.TextFinderUtils + .createOptimizedSearchPatterns( + Collections.singleton(normTerm), + false, + this.wholeWordSearch); + Matcher nMatcher = null; + Pattern nActive = null; + for (Pattern p : normPatterns) { + nMatcher = p.matcher(normText); + if (nMatcher.find()) { + nActive = p; + break; + } + } + if (nActive != null) { + nMatcher = nActive.matcher(normText); + int matchCount = 0; + while (nMatcher.find()) { + matchCount++; + int nStart = nMatcher.start(); + int nEnd = nMatcher.end(); + int origStart = nm.indexMap()[nStart]; + int origEnd = nm.indexMap()[nEnd - 1] + 1; + + float minX = Float.MAX_VALUE; + float minY = Float.MAX_VALUE; + float maxX = Float.MIN_VALUE; + float maxY = Float.MIN_VALUE; + boolean foundPosition = false; + + for (int i = origStart; i < origEnd; i++) { + if (i >= pageTextPositions.size()) continue; + org.apache.pdfbox.text.TextPosition pos = pageTextPositions.get(i); + if (pos != null) { + foundPosition = true; + minX = Math.min(minX, pos.getX()); + maxX = Math.max(maxX, pos.getX() + pos.getWidth()); + minY = Math.min(minY, pos.getY() - pos.getHeight()); + maxY = Math.max(maxY, pos.getY()); + } + } + if (foundPosition) { + String matchedOriginal = + text.substring( + Math.max(0, origStart), + Math.min(text.length(), origEnd)); + foundTexts.add( + new PDFText( + this.getCurrentPageNo() - 1, + minX, + minY, + maxX, + maxY, + matchedOriginal)); + } + } + super.endPage(page); + return; + } + } super.endPage(page); return; } @@ -105,6 +170,26 @@ public class TextFinder extends PDFTextStripper { int matchStart = matcher.start(); int matchEnd = matcher.end(); + if (this.wholeWordSearch + && processedSearchTerm.length() == 1 + && Character.isDigit(processedSearchTerm.charAt(0))) { + char left = matchStart > 0 ? text.charAt(matchStart - 1) : '\0'; + char right = matchEnd < text.length() ? text.charAt(matchEnd) : '\0'; + if (Character.isLetterOrDigit(left) || Character.isLetterOrDigit(right)) { + continue; // skip + } + if ((right == '.' || right == ',') + && (matchEnd + 1 < text.length() + && Character.isDigit(text.charAt(matchEnd + 1)))) { + continue; // skip + } + if ((left == '.' || left == ',') + && (matchStart - 2 >= 0 + && Character.isDigit(text.charAt(matchStart - 2)))) { + continue; // skip + } + } + log.debug( "Found match #{} at positions {}-{}: '{}'", matchCount, @@ -192,6 +277,65 @@ public class TextFinder extends PDFTextStripper { super.endPage(page); } + private final String searchTerm; + private final boolean useRegex; + private final boolean wholeWordSearch; + private final List foundTexts = new ArrayList<>(); + + private final List pageTextPositions = new ArrayList<>(); + private final StringBuilder pageTextBuilder = new StringBuilder(); + + public TextFinder(String searchTerm, boolean useRegex, boolean wholeWordSearch) + throws IOException { + this.searchTerm = searchTerm; + this.useRegex = useRegex; + this.wholeWordSearch = wholeWordSearch; + this.setWordSeparator(" "); + } + + @Override + protected void startPage(PDPage page) throws IOException { + super.startPage(page); + pageTextPositions.clear(); + pageTextBuilder.setLength(0); + } + + @Override + protected void writeString(String text, List textPositions) { + pageTextBuilder.append(text); + pageTextPositions.addAll(textPositions); + } + + @Override + protected void writeWordSeparator() { + pageTextBuilder.append(getWordSeparator()); + pageTextPositions.add(null); // Placeholder for separator + } + + @Override + protected void writeLineSeparator() { + pageTextBuilder.append(getLineSeparator()); + pageTextPositions.add(null); // Placeholder for separator + } + + private static class NormalizedMap { + private final String normalized; + private final int[] indexMap; + + NormalizedMap(String normalized, int[] indexMap) { + this.normalized = normalized; + this.indexMap = indexMap; + } + + public String normalized() { + return normalized; + } + + public int[] indexMap() { + return indexMap; + } + } + public List getFoundTexts() { return foundTexts; } diff --git a/app/core/src/main/java/stirling/software/SPDF/service/RedactionService.java b/app/core/src/main/java/stirling/software/SPDF/service/RedactionService.java index 0c2879d28..ed086a213 100644 --- a/app/core/src/main/java/stirling/software/SPDF/service/RedactionService.java +++ b/app/core/src/main/java/stirling/software/SPDF/service/RedactionService.java @@ -5,7 +5,6 @@ import java.awt.image.BufferedImage; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; -import java.nio.charset.StandardCharsets; import java.nio.file.Path; import java.util.ArrayDeque; import java.util.ArrayList; @@ -317,7 +316,7 @@ public class RedactionService { int start = Integer.parseInt(range[0].trim()); int end = Integer.parseInt(range[1].trim()); - if (start <= end && start > 0 && end > 0) { + if (start <= end && start > 0) { for (int i = start; i <= end; i++) { result.add(i); } @@ -347,7 +346,7 @@ public class RedactionService { } String colorString = hex.trim(); - if (!colorString.startsWith("#")) { + if (colorString.charAt(0) != '#') { colorString = "#" + colorString; } @@ -852,7 +851,7 @@ public class RedactionService { copy.add(newDict); } else if (obj instanceof List nestedList && !nestedList.isEmpty() - && nestedList.get(0) instanceof Object) { + && nestedList.get(0) != null) { try { @SuppressWarnings("unchecked") List objectList = (List) nestedList; @@ -892,8 +891,7 @@ public class RedactionService { TextFinderUtils.createOptimizedSearchPatterns( targetWords, useRegex, wholeWordSearch); - for (int i = 0; i < segments.size(); i++) { - TextSegment segment = segments.get(i); + for (TextSegment segment : segments) { String segmentText = segment.getText(); if (segmentText == null || segmentText.isEmpty()) { continue; @@ -1658,39 +1656,6 @@ public class RedactionService { } } - private static String tryEncodingFallbacks(COSString cosString) { - try { - byte[] bytes = cosString.getBytes(); - if (bytes.length == 0) return ""; - - String[] encodings = {"UTF-8", "UTF-16BE", "UTF-16LE", "ISO-8859-1", "Windows-1252"}; - - for (String encoding : encodings) { - try { - if (bytes.length >= 2) { - if ((bytes[0] & 0xFF) == 0xFE && (bytes[1] & 0xFF) == 0xFF) { - // UTF-16BE BOM - return new String( - bytes, 2, bytes.length - 2, StandardCharsets.UTF_16BE); - } else if ((bytes[0] & 0xFF) == 0xFF && (bytes[1] & 0xFF) == 0xFE) { - // UTF-16LE BOM - return new String( - bytes, 2, bytes.length - 2, StandardCharsets.UTF_16LE); - } - } - - String decoded = new String(bytes, encoding); - if (!isGibberish(decoded)) { - return decoded; - } - } catch (Exception ignored) { - } - } - } catch (Exception e) { - } - return null; - } - private float applySafetyBounds( WidthCalculationResult result, TextSegment segment, String text) { if (result.processedMatches() == 0) return 0f; diff --git a/app/core/src/main/java/stirling/software/SPDF/utils/text/TextFinderUtils.java b/app/core/src/main/java/stirling/software/SPDF/utils/text/TextFinderUtils.java index 38d700572..e12d19383 100644 --- a/app/core/src/main/java/stirling/software/SPDF/utils/text/TextFinderUtils.java +++ b/app/core/src/main/java/stirling/software/SPDF/utils/text/TextFinderUtils.java @@ -88,7 +88,14 @@ public class TextFinderUtils { if (originalTerm.length() == 1) { char c = originalTerm.charAt(0); if (Character.isDigit(c)) { - return "(? - Converts to image with visual redactions for maximum security.