mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-09-08 17:51:20 +02:00
fix TextFinder stuff, minor simplifications
Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
parent
e64bbebfd5
commit
c249ab7487
@ -1,8 +1,12 @@
|
|||||||
package stirling.software.SPDF.pdf;
|
package stirling.software.SPDF.pdf;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.text.Normalizer;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import org.apache.pdfbox.pdmodel.PDPage;
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
import org.apache.pdfbox.text.PDFTextStripper;
|
import org.apache.pdfbox.text.PDFTextStripper;
|
||||||
@ -15,45 +19,41 @@ import stirling.software.SPDF.model.PDFText;
|
|||||||
@Slf4j
|
@Slf4j
|
||||||
public class TextFinder extends PDFTextStripper {
|
public class TextFinder extends PDFTextStripper {
|
||||||
|
|
||||||
private final String searchTerm;
|
private static String removeDiacritics(String input) {
|
||||||
private final boolean useRegex;
|
if (input == null || input.isEmpty()) return input;
|
||||||
private final boolean wholeWordSearch;
|
String nfd = Normalizer.normalize(input, Normalizer.Form.NFD);
|
||||||
private final List<PDFText> foundTexts = new ArrayList<>();
|
// remove combining diacritical marks
|
||||||
|
String stripped = nfd.replaceAll("\\p{M}+", "");
|
||||||
private final List<TextPosition> pageTextPositions = new ArrayList<>();
|
return Normalizer.normalize(stripped, Normalizer.Form.NFC);
|
||||||
private final StringBuilder pageTextBuilder = new StringBuilder();
|
|
||||||
|
|
||||||
public TextFinder(String searchTerm, boolean useRegex, boolean wholeWordSearch)
|
|
||||||
throws IOException {
|
|
||||||
this.searchTerm = searchTerm;
|
|
||||||
this.useRegex = useRegex;
|
|
||||||
this.wholeWordSearch = wholeWordSearch;
|
|
||||||
this.setWordSeparator(" ");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
private static NormalizedMap buildNormalizedMap(String original) {
|
||||||
protected void startPage(PDPage page) throws IOException {
|
if (original == null) return new NormalizedMap("", new int[0]);
|
||||||
super.startPage(page);
|
StringBuilder sb = new StringBuilder(original.length());
|
||||||
pageTextPositions.clear();
|
// Initial capacity is twice the original length; the map is grown below if more entries are needed
|
||||||
pageTextBuilder.setLength(0);
|
int[] tempMap = new int[original.length() * 2];
|
||||||
}
|
int normIdx = 0;
|
||||||
|
for (int i = 0; i < original.length(); i++) {
|
||||||
@Override
|
char ch = original.charAt(i);
|
||||||
protected void writeString(String text, List<TextPosition> textPositions) {
|
// Normalize this single char; handle precomposed accents common in PDF text
|
||||||
pageTextBuilder.append(text);
|
String nfd = Normalizer.normalize(String.valueOf(ch), Normalizer.Form.NFD);
|
||||||
pageTextPositions.addAll(textPositions);
|
String base = nfd.replaceAll("\\p{M}+", "");
|
||||||
}
|
// Append each resulting char and map back to original index i
|
||||||
|
for (int j = 0; j < base.length(); j++) {
|
||||||
@Override
|
char b = base.charAt(j);
|
||||||
protected void writeWordSeparator() {
|
sb.append(b);
|
||||||
pageTextBuilder.append(getWordSeparator());
|
if (normIdx >= tempMap.length) {
|
||||||
pageTextPositions.add(null); // Placeholder for separator
|
// expand temp map
|
||||||
}
|
int[] newMap = new int[tempMap.length * 2];
|
||||||
|
System.arraycopy(tempMap, 0, newMap, 0, tempMap.length);
|
||||||
@Override
|
tempMap = newMap;
|
||||||
protected void writeLineSeparator() {
|
}
|
||||||
pageTextBuilder.append(getLineSeparator());
|
tempMap[normIdx++] = i;
|
||||||
pageTextPositions.add(null); // Placeholder for separator
|
}
|
||||||
|
}
|
||||||
|
int[] map = new int[normIdx];
|
||||||
|
System.arraycopy(tempMap, 0, map, 0, normIdx);
|
||||||
|
return new NormalizedMap(sb.toString(), map);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@ -86,6 +86,71 @@ public class TextFinder extends PDFTextStripper {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (activePattern == null) {
|
if (activePattern == null) {
|
||||||
|
if (!this.useRegex) {
|
||||||
|
NormalizedMap nm = buildNormalizedMap(text);
|
||||||
|
String normText = nm.normalized();
|
||||||
|
String normTerm = removeDiacritics(processedSearchTerm);
|
||||||
|
List<Pattern> normPatterns =
|
||||||
|
stirling.software.SPDF.utils.text.TextFinderUtils
|
||||||
|
.createOptimizedSearchPatterns(
|
||||||
|
Collections.singleton(normTerm),
|
||||||
|
false,
|
||||||
|
this.wholeWordSearch);
|
||||||
|
Matcher nMatcher = null;
|
||||||
|
Pattern nActive = null;
|
||||||
|
for (Pattern p : normPatterns) {
|
||||||
|
nMatcher = p.matcher(normText);
|
||||||
|
if (nMatcher.find()) {
|
||||||
|
nActive = p;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (nActive != null) {
|
||||||
|
nMatcher = nActive.matcher(normText);
|
||||||
|
int matchCount = 0;
|
||||||
|
while (nMatcher.find()) {
|
||||||
|
matchCount++;
|
||||||
|
int nStart = nMatcher.start();
|
||||||
|
int nEnd = nMatcher.end();
|
||||||
|
int origStart = nm.indexMap()[nStart];
|
||||||
|
int origEnd = nm.indexMap()[nEnd - 1] + 1;
|
||||||
|
|
||||||
|
float minX = Float.MAX_VALUE;
|
||||||
|
float minY = Float.MAX_VALUE;
|
||||||
|
float maxX = Float.MIN_VALUE;
|
||||||
|
float maxY = Float.MIN_VALUE;
|
||||||
|
boolean foundPosition = false;
|
||||||
|
|
||||||
|
for (int i = origStart; i < origEnd; i++) {
|
||||||
|
if (i >= pageTextPositions.size()) continue;
|
||||||
|
org.apache.pdfbox.text.TextPosition pos = pageTextPositions.get(i);
|
||||||
|
if (pos != null) {
|
||||||
|
foundPosition = true;
|
||||||
|
minX = Math.min(minX, pos.getX());
|
||||||
|
maxX = Math.max(maxX, pos.getX() + pos.getWidth());
|
||||||
|
minY = Math.min(minY, pos.getY() - pos.getHeight());
|
||||||
|
maxY = Math.max(maxY, pos.getY());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (foundPosition) {
|
||||||
|
String matchedOriginal =
|
||||||
|
text.substring(
|
||||||
|
Math.max(0, origStart),
|
||||||
|
Math.min(text.length(), origEnd));
|
||||||
|
foundTexts.add(
|
||||||
|
new PDFText(
|
||||||
|
this.getCurrentPageNo() - 1,
|
||||||
|
minX,
|
||||||
|
minY,
|
||||||
|
maxX,
|
||||||
|
maxY,
|
||||||
|
matchedOriginal));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
super.endPage(page);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
super.endPage(page);
|
super.endPage(page);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -105,6 +170,26 @@ public class TextFinder extends PDFTextStripper {
|
|||||||
int matchStart = matcher.start();
|
int matchStart = matcher.start();
|
||||||
int matchEnd = matcher.end();
|
int matchEnd = matcher.end();
|
||||||
|
|
||||||
|
if (this.wholeWordSearch
|
||||||
|
&& processedSearchTerm.length() == 1
|
||||||
|
&& Character.isDigit(processedSearchTerm.charAt(0))) {
|
||||||
|
char left = matchStart > 0 ? text.charAt(matchStart - 1) : '\0';
|
||||||
|
char right = matchEnd < text.length() ? text.charAt(matchEnd) : '\0';
|
||||||
|
if (Character.isLetterOrDigit(left) || Character.isLetterOrDigit(right)) {
|
||||||
|
continue; // skip
|
||||||
|
}
|
||||||
|
if ((right == '.' || right == ',')
|
||||||
|
&& (matchEnd + 1 < text.length()
|
||||||
|
&& Character.isDigit(text.charAt(matchEnd + 1)))) {
|
||||||
|
continue; // skip
|
||||||
|
}
|
||||||
|
if ((left == '.' || left == ',')
|
||||||
|
&& (matchStart - 2 >= 0
|
||||||
|
&& Character.isDigit(text.charAt(matchStart - 2)))) {
|
||||||
|
continue; // skip
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
log.debug(
|
log.debug(
|
||||||
"Found match #{} at positions {}-{}: '{}'",
|
"Found match #{} at positions {}-{}: '{}'",
|
||||||
matchCount,
|
matchCount,
|
||||||
@ -192,6 +277,65 @@ public class TextFinder extends PDFTextStripper {
|
|||||||
super.endPage(page);
|
super.endPage(page);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private final String searchTerm;
|
||||||
|
private final boolean useRegex;
|
||||||
|
private final boolean wholeWordSearch;
|
||||||
|
private final List<PDFText> foundTexts = new ArrayList<>();
|
||||||
|
|
||||||
|
private final List<TextPosition> pageTextPositions = new ArrayList<>();
|
||||||
|
private final StringBuilder pageTextBuilder = new StringBuilder();
|
||||||
|
|
||||||
|
public TextFinder(String searchTerm, boolean useRegex, boolean wholeWordSearch)
|
||||||
|
throws IOException {
|
||||||
|
this.searchTerm = searchTerm;
|
||||||
|
this.useRegex = useRegex;
|
||||||
|
this.wholeWordSearch = wholeWordSearch;
|
||||||
|
this.setWordSeparator(" ");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void startPage(PDPage page) throws IOException {
|
||||||
|
super.startPage(page);
|
||||||
|
pageTextPositions.clear();
|
||||||
|
pageTextBuilder.setLength(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void writeString(String text, List<TextPosition> textPositions) {
|
||||||
|
pageTextBuilder.append(text);
|
||||||
|
pageTextPositions.addAll(textPositions);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void writeWordSeparator() {
|
||||||
|
pageTextBuilder.append(getWordSeparator());
|
||||||
|
pageTextPositions.add(null); // Placeholder for separator
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void writeLineSeparator() {
|
||||||
|
pageTextBuilder.append(getLineSeparator());
|
||||||
|
pageTextPositions.add(null); // Placeholder for separator
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class NormalizedMap {
|
||||||
|
private final String normalized;
|
||||||
|
private final int[] indexMap;
|
||||||
|
|
||||||
|
NormalizedMap(String normalized, int[] indexMap) {
|
||||||
|
this.normalized = normalized;
|
||||||
|
this.indexMap = indexMap;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String normalized() {
|
||||||
|
return normalized;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int[] indexMap() {
|
||||||
|
return indexMap;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public List<PDFText> getFoundTexts() {
|
public List<PDFText> getFoundTexts() {
|
||||||
return foundTexts;
|
return foundTexts;
|
||||||
}
|
}
|
||||||
|
@ -5,7 +5,6 @@ import java.awt.image.BufferedImage;
|
|||||||
import java.io.ByteArrayOutputStream;
|
import java.io.ByteArrayOutputStream;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.charset.StandardCharsets;
|
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.util.ArrayDeque;
|
import java.util.ArrayDeque;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
@ -317,7 +316,7 @@ public class RedactionService {
|
|||||||
int start = Integer.parseInt(range[0].trim());
|
int start = Integer.parseInt(range[0].trim());
|
||||||
int end = Integer.parseInt(range[1].trim());
|
int end = Integer.parseInt(range[1].trim());
|
||||||
|
|
||||||
if (start <= end && start > 0 && end > 0) {
|
if (start <= end && start > 0) {
|
||||||
for (int i = start; i <= end; i++) {
|
for (int i = start; i <= end; i++) {
|
||||||
result.add(i);
|
result.add(i);
|
||||||
}
|
}
|
||||||
@ -347,7 +346,7 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
String colorString = hex.trim();
|
String colorString = hex.trim();
|
||||||
if (!colorString.startsWith("#")) {
|
if (colorString.charAt(0) != '#') {
|
||||||
colorString = "#" + colorString;
|
colorString = "#" + colorString;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -852,7 +851,7 @@ public class RedactionService {
|
|||||||
copy.add(newDict);
|
copy.add(newDict);
|
||||||
} else if (obj instanceof List<?> nestedList
|
} else if (obj instanceof List<?> nestedList
|
||||||
&& !nestedList.isEmpty()
|
&& !nestedList.isEmpty()
|
||||||
&& nestedList.get(0) instanceof Object) {
|
&& nestedList.get(0) != null) {
|
||||||
try {
|
try {
|
||||||
@SuppressWarnings("unchecked")
|
@SuppressWarnings("unchecked")
|
||||||
List<Object> objectList = (List<Object>) nestedList;
|
List<Object> objectList = (List<Object>) nestedList;
|
||||||
@ -892,8 +891,7 @@ public class RedactionService {
|
|||||||
TextFinderUtils.createOptimizedSearchPatterns(
|
TextFinderUtils.createOptimizedSearchPatterns(
|
||||||
targetWords, useRegex, wholeWordSearch);
|
targetWords, useRegex, wholeWordSearch);
|
||||||
|
|
||||||
for (int i = 0; i < segments.size(); i++) {
|
for (TextSegment segment : segments) {
|
||||||
TextSegment segment = segments.get(i);
|
|
||||||
String segmentText = segment.getText();
|
String segmentText = segment.getText();
|
||||||
if (segmentText == null || segmentText.isEmpty()) {
|
if (segmentText == null || segmentText.isEmpty()) {
|
||||||
continue;
|
continue;
|
||||||
@ -1658,39 +1656,6 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String tryEncodingFallbacks(COSString cosString) {
|
|
||||||
try {
|
|
||||||
byte[] bytes = cosString.getBytes();
|
|
||||||
if (bytes.length == 0) return "";
|
|
||||||
|
|
||||||
String[] encodings = {"UTF-8", "UTF-16BE", "UTF-16LE", "ISO-8859-1", "Windows-1252"};
|
|
||||||
|
|
||||||
for (String encoding : encodings) {
|
|
||||||
try {
|
|
||||||
if (bytes.length >= 2) {
|
|
||||||
if ((bytes[0] & 0xFF) == 0xFE && (bytes[1] & 0xFF) == 0xFF) {
|
|
||||||
// UTF-16BE BOM
|
|
||||||
return new String(
|
|
||||||
bytes, 2, bytes.length - 2, StandardCharsets.UTF_16BE);
|
|
||||||
} else if ((bytes[0] & 0xFF) == 0xFF && (bytes[1] & 0xFF) == 0xFE) {
|
|
||||||
// UTF-16LE BOM
|
|
||||||
return new String(
|
|
||||||
bytes, 2, bytes.length - 2, StandardCharsets.UTF_16LE);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
String decoded = new String(bytes, encoding);
|
|
||||||
if (!isGibberish(decoded)) {
|
|
||||||
return decoded;
|
|
||||||
}
|
|
||||||
} catch (Exception ignored) {
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (Exception e) {
|
|
||||||
}
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
private float applySafetyBounds(
|
private float applySafetyBounds(
|
||||||
WidthCalculationResult result, TextSegment segment, String text) {
|
WidthCalculationResult result, TextSegment segment, String text) {
|
||||||
if (result.processedMatches() == 0) return 0f;
|
if (result.processedMatches() == 0) return 0f;
|
||||||
|
@ -88,7 +88,14 @@ public class TextFinderUtils {
|
|||||||
if (originalTerm.length() == 1) {
|
if (originalTerm.length() == 1) {
|
||||||
char c = originalTerm.charAt(0);
|
char c = originalTerm.charAt(0);
|
||||||
if (Character.isDigit(c)) {
|
if (Character.isDigit(c)) {
|
||||||
return "(?<![\\p{L}\\p{N}])" + patternString + "(?![\\p{L}\\p{N}])";
|
// Single digit as a strict standalone token:
|
||||||
|
// - Not adjacent to letters or digits
|
||||||
|
// - Not part of a decimal number (e.g., 1.0 or 2,50)
|
||||||
|
// by excluding cases where a digit is immediately followed by [.,]\d
|
||||||
|
// or immediately preceded by \d[.,]
|
||||||
|
String leftBoundary = "(?<![\\p{L}\\p{N}])(?<!\\d\\.)(?<!\\d,)";
|
||||||
|
String rightBoundary = "(?![\\p{L}\\p{N}])(?![.,]\\d)";
|
||||||
|
return leftBoundary + patternString + rightBoundary;
|
||||||
} else if (Character.isLetter(c)) {
|
} else if (Character.isLetter(c)) {
|
||||||
return "(?<![\\p{L}\\p{N}])" + patternString + "(?![\\p{L}\\p{N}])";
|
return "(?<![\\p{L}\\p{N}])" + patternString + "(?![\\p{L}\\p{N}])";
|
||||||
} else {
|
} else {
|
||||||
|
@ -103,7 +103,6 @@
|
|||||||
<div class="form-check mb-2">
|
<div class="form-check mb-2">
|
||||||
<input aria-describedby="visual-desc" checked class="form-check-input" id="visualImage" name="redactionMode" type="radio" value="visual">
|
<input aria-describedby="visual-desc" checked class="form-check-input" id="visualImage" name="redactionMode" type="radio" value="visual">
|
||||||
<label class="form-check-label" for="visualImage" th:text="#{autoRedact.visualRedactionLabel}">Visual</label>
|
<label class="form-check-label" for="visualImage" th:text="#{autoRedact.visualRedactionLabel}">Visual</label>
|
||||||
<small class="form-text text-muted d-block mt-1" id="visual-desc" th:text="#{autoRedact.visualRedactionDescription}">Converts to image with visual redactions for maximum security.</small>
|
|
||||||
</div>
|
</div>
|
||||||
<div class="form-check mb-2">
|
<div class="form-check mb-2">
|
||||||
<input aria-describedby="delete-desc" class="form-check-input" id="deleteText" name="redactionMode" type="radio" value="aggressive">
|
<input aria-describedby="delete-desc" class="form-check-input" id="deleteText" name="redactionMode" type="radio" value="aggressive">
|
||||||
|
Loading…
Reference in New Issue
Block a user