mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-09-08 17:51:20 +02:00
fix TextFinder stuff, minor simplifications
Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
parent
e64bbebfd5
commit
c249ab7487
@ -1,8 +1,12 @@
|
||||
package stirling.software.SPDF.pdf;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.text.Normalizer;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
@ -15,45 +19,41 @@ import stirling.software.SPDF.model.PDFText;
|
||||
@Slf4j
|
||||
public class TextFinder extends PDFTextStripper {
|
||||
|
||||
private final String searchTerm;
|
||||
private final boolean useRegex;
|
||||
private final boolean wholeWordSearch;
|
||||
private final List<PDFText> foundTexts = new ArrayList<>();
|
||||
|
||||
private final List<TextPosition> pageTextPositions = new ArrayList<>();
|
||||
private final StringBuilder pageTextBuilder = new StringBuilder();
|
||||
|
||||
public TextFinder(String searchTerm, boolean useRegex, boolean wholeWordSearch)
|
||||
throws IOException {
|
||||
this.searchTerm = searchTerm;
|
||||
this.useRegex = useRegex;
|
||||
this.wholeWordSearch = wholeWordSearch;
|
||||
this.setWordSeparator(" ");
|
||||
private static String removeDiacritics(String input) {
|
||||
if (input == null || input.isEmpty()) return input;
|
||||
String nfd = Normalizer.normalize(input, Normalizer.Form.NFD);
|
||||
// remove combining diacritical marks
|
||||
String stripped = nfd.replaceAll("\\p{M}+", "");
|
||||
return Normalizer.normalize(stripped, Normalizer.Form.NFC);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void startPage(PDPage page) throws IOException {
|
||||
super.startPage(page);
|
||||
pageTextPositions.clear();
|
||||
pageTextBuilder.setLength(0);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void writeString(String text, List<TextPosition> textPositions) {
|
||||
pageTextBuilder.append(text);
|
||||
pageTextPositions.addAll(textPositions);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void writeWordSeparator() {
|
||||
pageTextBuilder.append(getWordSeparator());
|
||||
pageTextPositions.add(null); // Placeholder for separator
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void writeLineSeparator() {
|
||||
pageTextBuilder.append(getLineSeparator());
|
||||
pageTextPositions.add(null); // Placeholder for separator
|
||||
private static NormalizedMap buildNormalizedMap(String original) {
|
||||
if (original == null) return new NormalizedMap("", new int[0]);
|
||||
StringBuilder sb = new StringBuilder(original.length());
|
||||
// Worst case map size equals original length
|
||||
int[] tempMap = new int[original.length() * 2];
|
||||
int normIdx = 0;
|
||||
for (int i = 0; i < original.length(); i++) {
|
||||
char ch = original.charAt(i);
|
||||
// Normalize this single char; handle precomposed accents common in PDF text
|
||||
String nfd = Normalizer.normalize(String.valueOf(ch), Normalizer.Form.NFD);
|
||||
String base = nfd.replaceAll("\\p{M}+", "");
|
||||
// Append each resulting char and map back to original index i
|
||||
for (int j = 0; j < base.length(); j++) {
|
||||
char b = base.charAt(j);
|
||||
sb.append(b);
|
||||
if (normIdx >= tempMap.length) {
|
||||
// expand temp map
|
||||
int[] newMap = new int[tempMap.length * 2];
|
||||
System.arraycopy(tempMap, 0, newMap, 0, tempMap.length);
|
||||
tempMap = newMap;
|
||||
}
|
||||
tempMap[normIdx++] = i;
|
||||
}
|
||||
}
|
||||
int[] map = new int[normIdx];
|
||||
System.arraycopy(tempMap, 0, map, 0, normIdx);
|
||||
return new NormalizedMap(sb.toString(), map);
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -86,6 +86,71 @@ public class TextFinder extends PDFTextStripper {
|
||||
}
|
||||
}
|
||||
if (activePattern == null) {
|
||||
if (!this.useRegex) {
|
||||
NormalizedMap nm = buildNormalizedMap(text);
|
||||
String normText = nm.normalized();
|
||||
String normTerm = removeDiacritics(processedSearchTerm);
|
||||
List<Pattern> normPatterns =
|
||||
stirling.software.SPDF.utils.text.TextFinderUtils
|
||||
.createOptimizedSearchPatterns(
|
||||
Collections.singleton(normTerm),
|
||||
false,
|
||||
this.wholeWordSearch);
|
||||
Matcher nMatcher = null;
|
||||
Pattern nActive = null;
|
||||
for (Pattern p : normPatterns) {
|
||||
nMatcher = p.matcher(normText);
|
||||
if (nMatcher.find()) {
|
||||
nActive = p;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (nActive != null) {
|
||||
nMatcher = nActive.matcher(normText);
|
||||
int matchCount = 0;
|
||||
while (nMatcher.find()) {
|
||||
matchCount++;
|
||||
int nStart = nMatcher.start();
|
||||
int nEnd = nMatcher.end();
|
||||
int origStart = nm.indexMap()[nStart];
|
||||
int origEnd = nm.indexMap()[nEnd - 1] + 1;
|
||||
|
||||
float minX = Float.MAX_VALUE;
|
||||
float minY = Float.MAX_VALUE;
|
||||
float maxX = Float.MIN_VALUE;
|
||||
float maxY = Float.MIN_VALUE;
|
||||
boolean foundPosition = false;
|
||||
|
||||
for (int i = origStart; i < origEnd; i++) {
|
||||
if (i >= pageTextPositions.size()) continue;
|
||||
org.apache.pdfbox.text.TextPosition pos = pageTextPositions.get(i);
|
||||
if (pos != null) {
|
||||
foundPosition = true;
|
||||
minX = Math.min(minX, pos.getX());
|
||||
maxX = Math.max(maxX, pos.getX() + pos.getWidth());
|
||||
minY = Math.min(minY, pos.getY() - pos.getHeight());
|
||||
maxY = Math.max(maxY, pos.getY());
|
||||
}
|
||||
}
|
||||
if (foundPosition) {
|
||||
String matchedOriginal =
|
||||
text.substring(
|
||||
Math.max(0, origStart),
|
||||
Math.min(text.length(), origEnd));
|
||||
foundTexts.add(
|
||||
new PDFText(
|
||||
this.getCurrentPageNo() - 1,
|
||||
minX,
|
||||
minY,
|
||||
maxX,
|
||||
maxY,
|
||||
matchedOriginal));
|
||||
}
|
||||
}
|
||||
super.endPage(page);
|
||||
return;
|
||||
}
|
||||
}
|
||||
super.endPage(page);
|
||||
return;
|
||||
}
|
||||
@ -105,6 +170,26 @@ public class TextFinder extends PDFTextStripper {
|
||||
int matchStart = matcher.start();
|
||||
int matchEnd = matcher.end();
|
||||
|
||||
if (this.wholeWordSearch
|
||||
&& processedSearchTerm.length() == 1
|
||||
&& Character.isDigit(processedSearchTerm.charAt(0))) {
|
||||
char left = matchStart > 0 ? text.charAt(matchStart - 1) : '\0';
|
||||
char right = matchEnd < text.length() ? text.charAt(matchEnd) : '\0';
|
||||
if (Character.isLetterOrDigit(left) || Character.isLetterOrDigit(right)) {
|
||||
continue; // skip
|
||||
}
|
||||
if ((right == '.' || right == ',')
|
||||
&& (matchEnd + 1 < text.length()
|
||||
&& Character.isDigit(text.charAt(matchEnd + 1)))) {
|
||||
continue; // skip
|
||||
}
|
||||
if ((left == '.' || left == ',')
|
||||
&& (matchStart - 2 >= 0
|
||||
&& Character.isDigit(text.charAt(matchStart - 2)))) {
|
||||
continue; // skip
|
||||
}
|
||||
}
|
||||
|
||||
log.debug(
|
||||
"Found match #{} at positions {}-{}: '{}'",
|
||||
matchCount,
|
||||
@ -192,6 +277,65 @@ public class TextFinder extends PDFTextStripper {
|
||||
super.endPage(page);
|
||||
}
|
||||
|
||||
/** The term to search for, as supplied by the caller. */
private final String searchTerm;

/** Whether the caller asked for {@link #searchTerm} to be handled as a regular expression. */
private final boolean useRegex;

/** Whether only whole-word occurrences should be reported. */
private final boolean wholeWordSearch;

/** Matches collected so far, in the order they were found. */
private final List<PDFText> foundTexts = new ArrayList<>();

/** Positions buffered for the current page; {@code null} entries stand for separators. */
private final List<TextPosition> pageTextPositions = new ArrayList<>();

/** Text buffered for the current page. */
private final StringBuilder pageTextBuilder = new StringBuilder();

/**
 * Creates a finder for the given term and configures a single space as word separator.
 *
 * @param searchTerm the term to search for
 * @param useRegex whether {@code searchTerm} is a regular expression
 * @param wholeWordSearch whether only whole-word occurrences should match
 * @throws IOException declared for the superclass constructor
 */
public TextFinder(String searchTerm, boolean useRegex, boolean wholeWordSearch)
        throws IOException {
    this.setWordSeparator(" ");
    this.wholeWordSearch = wholeWordSearch;
    this.useRegex = useRegex;
    this.searchTerm = searchTerm;
}
|
||||
|
||||
@Override
|
||||
protected void startPage(PDPage page) throws IOException {
|
||||
super.startPage(page);
|
||||
pageTextPositions.clear();
|
||||
pageTextBuilder.setLength(0);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void writeString(String text, List<TextPosition> textPositions) {
|
||||
pageTextBuilder.append(text);
|
||||
pageTextPositions.addAll(textPositions);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void writeWordSeparator() {
|
||||
pageTextBuilder.append(getWordSeparator());
|
||||
pageTextPositions.add(null); // Placeholder for separator
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void writeLineSeparator() {
|
||||
pageTextBuilder.append(getLineSeparator());
|
||||
pageTextPositions.add(null); // Placeholder for separator
|
||||
}
|
||||
|
||||
private static class NormalizedMap {
|
||||
private final String normalized;
|
||||
private final int[] indexMap;
|
||||
|
||||
NormalizedMap(String normalized, int[] indexMap) {
|
||||
this.normalized = normalized;
|
||||
this.indexMap = indexMap;
|
||||
}
|
||||
|
||||
public String normalized() {
|
||||
return normalized;
|
||||
}
|
||||
|
||||
public int[] indexMap() {
|
||||
return indexMap;
|
||||
}
|
||||
}
|
||||
|
||||
public List<PDFText> getFoundTexts() {
|
||||
return foundTexts;
|
||||
}
|
||||
|
@ -5,7 +5,6 @@ import java.awt.image.BufferedImage;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayDeque;
|
||||
import java.util.ArrayList;
|
||||
@ -317,7 +316,7 @@ public class RedactionService {
|
||||
int start = Integer.parseInt(range[0].trim());
|
||||
int end = Integer.parseInt(range[1].trim());
|
||||
|
||||
if (start <= end && start > 0 && end > 0) {
|
||||
if (start <= end && start > 0) {
|
||||
for (int i = start; i <= end; i++) {
|
||||
result.add(i);
|
||||
}
|
||||
@ -347,7 +346,7 @@ public class RedactionService {
|
||||
}
|
||||
|
||||
String colorString = hex.trim();
|
||||
if (!colorString.startsWith("#")) {
|
||||
if (colorString.charAt(0) != '#') {
|
||||
colorString = "#" + colorString;
|
||||
}
|
||||
|
||||
@ -852,7 +851,7 @@ public class RedactionService {
|
||||
copy.add(newDict);
|
||||
} else if (obj instanceof List<?> nestedList
|
||||
&& !nestedList.isEmpty()
|
||||
&& nestedList.get(0) instanceof Object) {
|
||||
&& nestedList.get(0) != null) {
|
||||
try {
|
||||
@SuppressWarnings("unchecked")
|
||||
List<Object> objectList = (List<Object>) nestedList;
|
||||
@ -892,8 +891,7 @@ public class RedactionService {
|
||||
TextFinderUtils.createOptimizedSearchPatterns(
|
||||
targetWords, useRegex, wholeWordSearch);
|
||||
|
||||
for (int i = 0; i < segments.size(); i++) {
|
||||
TextSegment segment = segments.get(i);
|
||||
for (TextSegment segment : segments) {
|
||||
String segmentText = segment.getText();
|
||||
if (segmentText == null || segmentText.isEmpty()) {
|
||||
continue;
|
||||
@ -1658,39 +1656,6 @@ public class RedactionService {
|
||||
}
|
||||
}
|
||||
|
||||
private static String tryEncodingFallbacks(COSString cosString) {
|
||||
try {
|
||||
byte[] bytes = cosString.getBytes();
|
||||
if (bytes.length == 0) return "";
|
||||
|
||||
String[] encodings = {"UTF-8", "UTF-16BE", "UTF-16LE", "ISO-8859-1", "Windows-1252"};
|
||||
|
||||
for (String encoding : encodings) {
|
||||
try {
|
||||
if (bytes.length >= 2) {
|
||||
if ((bytes[0] & 0xFF) == 0xFE && (bytes[1] & 0xFF) == 0xFF) {
|
||||
// UTF-16BE BOM
|
||||
return new String(
|
||||
bytes, 2, bytes.length - 2, StandardCharsets.UTF_16BE);
|
||||
} else if ((bytes[0] & 0xFF) == 0xFF && (bytes[1] & 0xFF) == 0xFE) {
|
||||
// UTF-16LE BOM
|
||||
return new String(
|
||||
bytes, 2, bytes.length - 2, StandardCharsets.UTF_16LE);
|
||||
}
|
||||
}
|
||||
|
||||
String decoded = new String(bytes, encoding);
|
||||
if (!isGibberish(decoded)) {
|
||||
return decoded;
|
||||
}
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private float applySafetyBounds(
|
||||
WidthCalculationResult result, TextSegment segment, String text) {
|
||||
if (result.processedMatches() == 0) return 0f;
|
||||
|
@ -88,7 +88,14 @@ public class TextFinderUtils {
|
||||
if (originalTerm.length() == 1) {
|
||||
char c = originalTerm.charAt(0);
|
||||
if (Character.isDigit(c)) {
|
||||
return "(?<![\\p{L}\\p{N}])" + patternString + "(?![\\p{L}\\p{N}])";
|
||||
// Single digit as a strict standalone token:
|
||||
// - Not adjacent to letters or digits
|
||||
// - Not part of a decimal number (e.g., 1.0 or 2,50)
|
||||
// by excluding cases where a digit is immediately followed by [.,]\d
|
||||
// or immediately preceded by \d[.,]
|
||||
String leftBoundary = "(?<![\\p{L}\\p{N}])(?<!\\d\\.)(?<!\\d,)";
|
||||
String rightBoundary = "(?![\\p{L}\\p{N}])(?![.,]\\d)";
|
||||
return leftBoundary + patternString + rightBoundary;
|
||||
} else if (Character.isLetter(c)) {
|
||||
return "(?<![\\p{L}\\p{N}])" + patternString + "(?![\\p{L}\\p{N}])";
|
||||
} else {
|
||||
|
@ -103,7 +103,6 @@
|
||||
<div class="form-check mb-2">
|
||||
<input aria-describedby="visual-desc" checked class="form-check-input" id="visualImage" name="redactionMode" type="radio" value="visual">
|
||||
<label class="form-check-label" for="visualImage" th:text="#{autoRedact.visualRedactionLabel}">Visual</label>
|
||||
<small class="form-text text-muted d-block mt-1" id="visual-desc" th:text="#{autoRedact.visualRedactionDescription}">Converts to image with visual redactions for maximum security.</small>
|
||||
</div>
|
||||
<div class="form-check mb-2">
|
||||
<input aria-describedby="delete-desc" class="form-check-input" id="deleteText" name="redactionMode" type="radio" value="aggressive">
|
||||
|
Loading…
Reference in New Issue
Block a user