Fix TextFinder matching (diacritic-insensitive fallback, stricter single-digit whole-word search), minor simplifications

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
Balázs Szücs 2025-09-03 00:33:29 +02:00
parent e64bbebfd5
commit c249ab7487
4 changed files with 193 additions and 78 deletions

View File

@ -1,8 +1,12 @@
package stirling.software.SPDF.pdf; package stirling.software.SPDF.pdf;
import java.io.IOException; import java.io.IOException;
import java.text.Normalizer;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.PDFTextStripper;
@ -15,45 +19,41 @@ import stirling.software.SPDF.model.PDFText;
@Slf4j @Slf4j
public class TextFinder extends PDFTextStripper { public class TextFinder extends PDFTextStripper {
private final String searchTerm; private static String removeDiacritics(String input) {
private final boolean useRegex; if (input == null || input.isEmpty()) return input;
private final boolean wholeWordSearch; String nfd = Normalizer.normalize(input, Normalizer.Form.NFD);
private final List<PDFText> foundTexts = new ArrayList<>(); // remove combining diacritical marks
String stripped = nfd.replaceAll("\\p{M}+", "");
private final List<TextPosition> pageTextPositions = new ArrayList<>(); return Normalizer.normalize(stripped, Normalizer.Form.NFC);
private final StringBuilder pageTextBuilder = new StringBuilder();
public TextFinder(String searchTerm, boolean useRegex, boolean wholeWordSearch)
throws IOException {
this.searchTerm = searchTerm;
this.useRegex = useRegex;
this.wholeWordSearch = wholeWordSearch;
this.setWordSeparator(" ");
} }
@Override private static NormalizedMap buildNormalizedMap(String original) {
protected void startPage(PDPage page) throws IOException { if (original == null) return new NormalizedMap("", new int[0]);
super.startPage(page); StringBuilder sb = new StringBuilder(original.length());
pageTextPositions.clear(); // Worst case map size equals original length
pageTextBuilder.setLength(0); int[] tempMap = new int[original.length() * 2];
int normIdx = 0;
for (int i = 0; i < original.length(); i++) {
char ch = original.charAt(i);
// Normalize this single char; handle precomposed accents common in PDF text
String nfd = Normalizer.normalize(String.valueOf(ch), Normalizer.Form.NFD);
String base = nfd.replaceAll("\\p{M}+", "");
// Append each resulting char and map back to original index i
for (int j = 0; j < base.length(); j++) {
char b = base.charAt(j);
sb.append(b);
if (normIdx >= tempMap.length) {
// expand temp map
int[] newMap = new int[tempMap.length * 2];
System.arraycopy(tempMap, 0, newMap, 0, tempMap.length);
tempMap = newMap;
} }
tempMap[normIdx++] = i;
@Override
protected void writeString(String text, List<TextPosition> textPositions) {
pageTextBuilder.append(text);
pageTextPositions.addAll(textPositions);
} }
@Override
protected void writeWordSeparator() {
pageTextBuilder.append(getWordSeparator());
pageTextPositions.add(null); // Placeholder for separator
} }
int[] map = new int[normIdx];
@Override System.arraycopy(tempMap, 0, map, 0, normIdx);
protected void writeLineSeparator() { return new NormalizedMap(sb.toString(), map);
pageTextBuilder.append(getLineSeparator());
pageTextPositions.add(null); // Placeholder for separator
} }
@Override @Override
@ -86,6 +86,71 @@ public class TextFinder extends PDFTextStripper {
} }
} }
if (activePattern == null) { if (activePattern == null) {
if (!this.useRegex) {
    // Reached with activePattern == null for a literal (non-regex) search: retry the
    // search with combining diacritical marks stripped from both the page text and the
    // search term. nm.indexMap() translates offsets in the stripped text back to
    // offsets in the original page text.
    NormalizedMap nm = buildNormalizedMap(text);
    String normText = nm.normalized();
    String normTerm = removeDiacritics(processedSearchTerm);
    List<Pattern> normPatterns =
            stirling.software.SPDF.utils.text.TextFinderUtils
                    .createOptimizedSearchPatterns(
                            Collections.singleton(normTerm),
                            false,
                            this.wholeWordSearch);

    // Use the first pattern that matches anywhere in the normalized text.
    Pattern nActive = null;
    for (Pattern p : normPatterns) {
        if (p.matcher(normText).find()) {
            nActive = p;
            break;
        }
    }

    if (nActive != null) {
        int[] indexMap = nm.indexMap();
        Matcher nMatcher = nActive.matcher(normText);
        while (nMatcher.find()) {
            int nStart = nMatcher.start();
            int nEnd = nMatcher.end();
            if (nEnd <= nStart) {
                // A zero-length match covers no characters; mapping it would read
                // indexMap[nEnd - 1] / indexMap[nStart] out of bounds.
                continue;
            }
            int origStart = indexMap[nStart];
            int origEnd = indexMap[nEnd - 1] + 1;

            // Bounding box accumulators. The maxima start at the most negative finite
            // float: Float.MIN_VALUE is the smallest POSITIVE value and would yield a
            // wrong maximum for coordinates <= 0.
            float minX = Float.MAX_VALUE;
            float minY = Float.MAX_VALUE;
            float maxX = -Float.MAX_VALUE;
            float maxY = -Float.MAX_VALUE;
            boolean foundPosition = false;

            int lastExclusive = Math.min(origEnd, pageTextPositions.size());
            for (int i = origStart; i < lastExclusive; i++) {
                org.apache.pdfbox.text.TextPosition pos = pageTextPositions.get(i);
                if (pos == null) {
                    continue; // separator placeholder, has no geometry
                }
                foundPosition = true;
                minX = Math.min(minX, pos.getX());
                maxX = Math.max(maxX, pos.getX() + pos.getWidth());
                minY = Math.min(minY, pos.getY() - pos.getHeight());
                maxY = Math.max(maxY, pos.getY());
            }

            if (foundPosition) {
                // Report the text as it appears on the page (with diacritics).
                String matchedOriginal =
                        text.substring(
                                Math.max(0, origStart),
                                Math.min(text.length(), origEnd));
                foundTexts.add(
                        new PDFText(
                                this.getCurrentPageNo() - 1,
                                minX,
                                minY,
                                maxX,
                                maxY,
                                matchedOriginal));
            }
        }
        super.endPage(page);
        return;
    }
}
super.endPage(page); super.endPage(page);
return; return;
} }
@ -105,6 +170,26 @@ public class TextFinder extends PDFTextStripper {
int matchStart = matcher.start(); int matchStart = matcher.start();
int matchEnd = matcher.end(); int matchEnd = matcher.end();
// Extra filtering for whole-word searches of a single digit: reject matches that are
// glued to a letter/digit or that are one side of a decimal number such as 1.0 or 2,50.
if (this.wholeWordSearch
        && processedSearchTerm.length() == 1
        && Character.isDigit(processedSearchTerm.charAt(0))) {
    // '\0' stands in for "no neighbour" at either end of the text.
    char prevChar = (matchStart > 0) ? text.charAt(matchStart - 1) : '\0';
    char nextChar = (matchEnd < text.length()) ? text.charAt(matchEnd) : '\0';

    boolean touchesAlnum =
            Character.isLetterOrDigit(prevChar) || Character.isLetterOrDigit(nextChar);

    // digit followed by [.,] and another digit
    boolean startsDecimal =
            (nextChar == '.' || nextChar == ',')
                    && matchEnd + 1 < text.length()
                    && Character.isDigit(text.charAt(matchEnd + 1));

    // digit preceded by a digit and [.,]
    boolean endsDecimal =
            (prevChar == '.' || prevChar == ',')
                    && matchStart - 2 >= 0
                    && Character.isDigit(text.charAt(matchStart - 2));

    if (touchesAlnum || startsDecimal || endsDecimal) {
        continue; // skip
    }
}
log.debug( log.debug(
"Found match #{} at positions {}-{}: '{}'", "Found match #{} at positions {}-{}: '{}'",
matchCount, matchCount,
@ -192,6 +277,65 @@ public class TextFinder extends PDFTextStripper {
super.endPage(page); super.endPage(page);
} }
// Search configuration, fixed at construction time.
private final String searchTerm;
private final boolean useRegex;
private final boolean wholeWordSearch;
// Accumulated matches; exposed through getFoundTexts().
private final List<PDFText> foundTexts = new ArrayList<>();
// Per-page buffers, reset in startPage(). Separators are recorded as null entries in
// pageTextPositions alongside the separator text appended to pageTextBuilder.
private final List<TextPosition> pageTextPositions = new ArrayList<>();
private final StringBuilder pageTextBuilder = new StringBuilder();
/**
 * Creates a text finder for a single search term.
 *
 * <p>The word separator of the underlying stripper is set to a single space.
 *
 * @param searchTerm the term to look for
 * @param useRegex stored search flag; when {@code false} the diacritic-insensitive
 *     fallback search is enabled
 * @param wholeWordSearch stored search flag forwarded to pattern creation
 * @throws IOException declared for the superclass constructor
 */
public TextFinder(String searchTerm, boolean useRegex, boolean wholeWordSearch)
        throws IOException {
    super();
    this.searchTerm = searchTerm;
    this.useRegex = useRegex;
    this.wholeWordSearch = wholeWordSearch;
    setWordSeparator(" ");
}
/**
 * Resets the per-page text and position buffers before a new page is processed.
 *
 * @param page the page about to be processed
 * @throws IOException if the superclass hook fails
 */
@Override
protected void startPage(PDPage page) throws IOException {
    super.startPage(page);
    // Both buffers are page-scoped; the order of the two resets is irrelevant.
    this.pageTextBuilder.setLength(0);
    this.pageTextPositions.clear();
}
/**
 * Collects a run of extracted text and its glyph positions into the page buffers
 * instead of writing to an output stream.
 *
 * @param text the extracted text run
 * @param textPositions the positions that produced {@code text}
 */
@Override
protected void writeString(String text, List<TextPosition> textPositions) {
    // NOTE(review): later lookups index pageTextPositions by character offset, which
    // presumably relies on text.length() == textPositions.size() - confirm.
    this.pageTextBuilder.append(text);
    this.pageTextPositions.addAll(textPositions);
}
/**
 * Appends the word separator to the page text and records one {@code null}
 * placeholder per appended character, so that character offsets in the page text keep
 * lining up with indices in {@code pageTextPositions}.
 */
@Override
protected void writeWordSeparator() {
    int lengthBefore = pageTextBuilder.length();
    pageTextBuilder.append(getWordSeparator());
    // The separator is configurable and need not be exactly one character long.
    for (int n = pageTextBuilder.length() - lengthBefore; n > 0; n--) {
        pageTextPositions.add(null); // Placeholder for separator
    }
}
/**
 * Appends the line separator to the page text and records one {@code null}
 * placeholder per appended character, so that character offsets in the page text keep
 * lining up with indices in {@code pageTextPositions}.
 */
@Override
protected void writeLineSeparator() {
    int lengthBefore = pageTextBuilder.length();
    pageTextBuilder.append(getLineSeparator());
    // The line separator can be longer than one character (e.g. "\r\n"); a single
    // placeholder would shift every later position by one per line break.
    for (int n = pageTextBuilder.length() - lengthBefore; n > 0; n--) {
        pageTextPositions.add(null); // Placeholder for separator
    }
}
private static class NormalizedMap {
private final String normalized;
private final int[] indexMap;
NormalizedMap(String normalized, int[] indexMap) {
this.normalized = normalized;
this.indexMap = indexMap;
}
public String normalized() {
return normalized;
}
public int[] indexMap() {
return indexMap;
}
}
public List<PDFText> getFoundTexts() { public List<PDFText> getFoundTexts() {
return foundTexts; return foundTexts;
} }

View File

@ -5,7 +5,6 @@ import java.awt.image.BufferedImage;
import java.io.ByteArrayOutputStream; import java.io.ByteArrayOutputStream;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.ArrayDeque; import java.util.ArrayDeque;
import java.util.ArrayList; import java.util.ArrayList;
@ -317,7 +316,7 @@ public class RedactionService {
int start = Integer.parseInt(range[0].trim()); int start = Integer.parseInt(range[0].trim());
int end = Integer.parseInt(range[1].trim()); int end = Integer.parseInt(range[1].trim());
if (start <= end && start > 0 && end > 0) { if (start <= end && start > 0) {
for (int i = start; i <= end; i++) { for (int i = start; i <= end; i++) {
result.add(i); result.add(i);
} }
@ -347,7 +346,7 @@ public class RedactionService {
} }
String colorString = hex.trim(); String colorString = hex.trim();
if (!colorString.startsWith("#")) { if (colorString.charAt(0) != '#') {
colorString = "#" + colorString; colorString = "#" + colorString;
} }
@ -852,7 +851,7 @@ public class RedactionService {
copy.add(newDict); copy.add(newDict);
} else if (obj instanceof List<?> nestedList } else if (obj instanceof List<?> nestedList
&& !nestedList.isEmpty() && !nestedList.isEmpty()
&& nestedList.get(0) instanceof Object) { && nestedList.get(0) != null) {
try { try {
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
List<Object> objectList = (List<Object>) nestedList; List<Object> objectList = (List<Object>) nestedList;
@ -892,8 +891,7 @@ public class RedactionService {
TextFinderUtils.createOptimizedSearchPatterns( TextFinderUtils.createOptimizedSearchPatterns(
targetWords, useRegex, wholeWordSearch); targetWords, useRegex, wholeWordSearch);
for (int i = 0; i < segments.size(); i++) { for (TextSegment segment : segments) {
TextSegment segment = segments.get(i);
String segmentText = segment.getText(); String segmentText = segment.getText();
if (segmentText == null || segmentText.isEmpty()) { if (segmentText == null || segmentText.isEmpty()) {
continue; continue;
@ -1658,39 +1656,6 @@ public class RedactionService {
} }
} }
/**
 * Best-effort decoding of a COS string's raw bytes.
 *
 * <p>A UTF-16 byte-order mark, when present, decides the charset outright. Otherwise a
 * fixed list of candidate charsets is tried in order and the first result that is not
 * classified as gibberish is returned.
 *
 * @param cosString the string whose bytes should be decoded
 * @return the decoded text, {@code ""} for empty input, or {@code null} when no
 *     candidate produced acceptable text or any step failed
 */
private static String tryEncodingFallbacks(COSString cosString) {
    try {
        byte[] bytes = cosString.getBytes();
        if (bytes.length == 0) return "";

        // BOM detection does not depend on the candidate charset, so do it once.
        if (bytes.length >= 2) {
            int first = bytes[0] & 0xFF;
            int second = bytes[1] & 0xFF;
            if (first == 0xFE && second == 0xFF) {
                // UTF-16BE BOM
                return new String(bytes, 2, bytes.length - 2, StandardCharsets.UTF_16BE);
            }
            if (first == 0xFF && second == 0xFE) {
                // UTF-16LE BOM
                return new String(bytes, 2, bytes.length - 2, StandardCharsets.UTF_16LE);
            }
        }

        String[] encodings = {"UTF-8", "UTF-16BE", "UTF-16LE", "ISO-8859-1", "Windows-1252"};
        for (String encoding : encodings) {
            try {
                String decoded = new String(bytes, encoding);
                if (!isGibberish(decoded)) {
                    return decoded;
                }
            } catch (Exception ignored) {
                // Deliberately ignored: a charset that is unavailable or fails here is
                // simply not a usable candidate; move on to the next one.
            }
        }
    } catch (Exception ignored) {
        // Deliberately ignored: this is a last-resort fallback and must not throw;
        // callers receive null instead.
    }
    return null;
}
private float applySafetyBounds( private float applySafetyBounds(
WidthCalculationResult result, TextSegment segment, String text) { WidthCalculationResult result, TextSegment segment, String text) {
if (result.processedMatches() == 0) return 0f; if (result.processedMatches() == 0) return 0f;

View File

@ -88,7 +88,14 @@ public class TextFinderUtils {
if (originalTerm.length() == 1) { if (originalTerm.length() == 1) {
char c = originalTerm.charAt(0); char c = originalTerm.charAt(0);
if (Character.isDigit(c)) { if (Character.isDigit(c)) {
return "(?<![\\p{L}\\p{N}])" + patternString + "(?![\\p{L}\\p{N}])"; // Single digit as a strict standalone token:
// - Not adjacent to letters or digits
// - Not part of a decimal number (e.g., 1.0 or 2,50)
// by excluding cases where a digit is immediately followed by [.,]\d
// or immediately preceded by \d[.,]
String leftBoundary = "(?<![\\p{L}\\p{N}])(?<!\\d\\.)(?<!\\d,)";
String rightBoundary = "(?![\\p{L}\\p{N}])(?![.,]\\d)";
return leftBoundary + patternString + rightBoundary;
} else if (Character.isLetter(c)) { } else if (Character.isLetter(c)) {
return "(?<![\\p{L}\\p{N}])" + patternString + "(?![\\p{L}\\p{N}])"; return "(?<![\\p{L}\\p{N}])" + patternString + "(?![\\p{L}\\p{N}])";
} else { } else {

View File

@ -103,7 +103,6 @@
<div class="form-check mb-2"> <div class="form-check mb-2">
<input aria-describedby="visual-desc" checked class="form-check-input" id="visualImage" name="redactionMode" type="radio" value="visual"> <input aria-describedby="visual-desc" checked class="form-check-input" id="visualImage" name="redactionMode" type="radio" value="visual">
<label class="form-check-label" for="visualImage" th:text="#{autoRedact.visualRedactionLabel}">Visual</label> <label class="form-check-label" for="visualImage" th:text="#{autoRedact.visualRedactionLabel}">Visual</label>
<small class="form-text text-muted d-block mt-1" id="visual-desc" th:text="#{autoRedact.visualRedactionDescription}">Converts to image with visual redactions for maximum security.</small>
</div> </div>
<div class="form-check mb-2"> <div class="form-check mb-2">
<input aria-describedby="delete-desc" class="form-check-input" id="deleteText" name="redactionMode" type="radio" value="aggressive"> <input aria-describedby="delete-desc" class="form-check-input" id="deleteText" name="redactionMode" type="radio" value="aggressive">