mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-09-08 17:51:20 +02:00
enhance kerning adjustments and improve text handling in RedactionService and TextFinder
Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
parent
150174cba9
commit
e396b6cbb8
@ -6,6 +6,7 @@ import java.util.List;
|
|||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
import lombok.Getter;
|
||||||
import org.apache.pdfbox.pdmodel.PDPage;
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
import org.apache.pdfbox.text.PDFTextStripper;
|
import org.apache.pdfbox.text.PDFTextStripper;
|
||||||
import org.apache.pdfbox.text.TextPosition;
|
import org.apache.pdfbox.text.TextPosition;
|
||||||
@ -20,6 +21,7 @@ public class TextFinder extends PDFTextStripper {
|
|||||||
private final String searchTerm;
|
private final String searchTerm;
|
||||||
private final boolean useRegex;
|
private final boolean useRegex;
|
||||||
private final boolean wholeWordSearch;
|
private final boolean wholeWordSearch;
|
||||||
|
@Getter
|
||||||
private final List<PDFText> foundTexts = new ArrayList<>();
|
private final List<PDFText> foundTexts = new ArrayList<>();
|
||||||
|
|
||||||
private final List<TextPosition> pageTextPositions = new ArrayList<>();
|
private final List<TextPosition> pageTextPositions = new ArrayList<>();
|
||||||
@ -68,11 +70,17 @@ public class TextFinder extends PDFTextStripper {
|
|||||||
}
|
}
|
||||||
|
|
||||||
String processedSearchTerm = this.searchTerm.trim();
|
String processedSearchTerm = this.searchTerm.trim();
|
||||||
|
|
||||||
|
if (processedSearchTerm.isEmpty()) {
|
||||||
|
super.endPage(page);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
String regex = this.useRegex ? processedSearchTerm : "\\Q" + processedSearchTerm + "\\E";
|
String regex = this.useRegex ? processedSearchTerm : "\\Q" + processedSearchTerm + "\\E";
|
||||||
if (this.wholeWordSearch) {
|
if (this.wholeWordSearch) {
|
||||||
if (processedSearchTerm.length() == 1
|
if (processedSearchTerm.length() == 1
|
||||||
&& Character.isDigit(processedSearchTerm.charAt(0))) {
|
&& Character.isDigit(processedSearchTerm.charAt(0))) {
|
||||||
regex = "(?<![\\w])" + regex + "(?![\\w])";
|
regex = "(?<![\\w])(?<!\\d[\\.,])" + regex + "(?![\\w])(?![\\.,]\\d)";
|
||||||
} else if (processedSearchTerm.length() == 1) {
|
} else if (processedSearchTerm.length() == 1) {
|
||||||
regex = "(?<![\\w])" + regex + "(?![\\w])";
|
regex = "(?<![\\w])" + regex + "(?![\\w])";
|
||||||
} else {
|
} else {
|
||||||
@ -184,10 +192,6 @@ public class TextFinder extends PDFTextStripper {
|
|||||||
super.endPage(page);
|
super.endPage(page);
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<PDFText> getFoundTexts() {
|
|
||||||
return foundTexts;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getDebugInfo() {
|
public String getDebugInfo() {
|
||||||
StringBuilder debug = new StringBuilder();
|
StringBuilder debug = new StringBuilder();
|
||||||
debug.append("Extracted text length: ").append(pageTextBuilder.length()).append("\n");
|
debug.append("Extracted text length: ").append(pageTextBuilder.length()).append("\n");
|
||||||
|
@ -1104,97 +1104,39 @@ public class RedactionService {
|
|||||||
return allMatches;
|
return allMatches;
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<Object> applyRedactionsToTokens(
|
private static String createAlternativePlaceholder(
|
||||||
List<Object> tokens, List<TextSegment> textSegments, List<MatchRange> matches) {
|
String originalWord, float targetWidth, PDFont font, float fontSize) {
|
||||||
List<Object> newTokens = new ArrayList<>(tokens);
|
final String repeat =
|
||||||
if (this.aggressiveMode) {
|
" ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1));
|
||||||
Map<Integer, List<AggressiveSegMatch>> perSeg = this.aggressiveSegMatches;
|
|
||||||
if (perSeg != null && !perSeg.isEmpty()) {
|
|
||||||
List<Integer> segIndices = new ArrayList<>(perSeg.keySet());
|
|
||||||
segIndices.sort(
|
|
||||||
(a, b) ->
|
|
||||||
Integer.compare(
|
|
||||||
textSegments.get(b).tokenIndex,
|
|
||||||
textSegments.get(a).tokenIndex));
|
|
||||||
for (Integer segIndex : segIndices) {
|
|
||||||
TextSegment segment = textSegments.get(segIndex);
|
|
||||||
List<AggressiveSegMatch> segMatches = perSeg.getOrDefault(segIndex, List.of());
|
|
||||||
if (segMatches.isEmpty()) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
Object token = newTokens.get(segment.tokenIndex);
|
|
||||||
String opName = segment.operatorName;
|
|
||||||
if (("Tj".equals(opName) || "'".equals(opName) || "\"".equals(opName))
|
|
||||||
&& token instanceof COSString cs) {
|
|
||||||
COSString redacted =
|
|
||||||
redactCosStringByDecodedRanges(segment.font, cs, segMatches);
|
|
||||||
newTokens.set(segment.tokenIndex, redacted);
|
|
||||||
} else if ("TJ".equals(opName) && token instanceof COSArray arr) {
|
|
||||||
COSArray redacted =
|
|
||||||
redactTJArrayByDecodedRanges(segment.font, arr, segMatches);
|
|
||||||
newTokens.set(segment.tokenIndex, redacted);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return newTokens;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Map<Integer, List<MatchRange>> matchesBySegment = new HashMap<>();
|
|
||||||
for (MatchRange match : matches) {
|
|
||||||
for (int i = 0; i < textSegments.size(); i++) {
|
|
||||||
TextSegment segment = textSegments.get(i);
|
|
||||||
int overlapStart = Math.max(match.startPos, segment.startPos);
|
|
||||||
int overlapEnd = Math.min(match.endPos, segment.endPos);
|
|
||||||
if (overlapStart < overlapEnd) {
|
|
||||||
matchesBySegment.computeIfAbsent(i, k -> new ArrayList<>()).add(match);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
List<ModificationTask> tasks = new ArrayList<>();
|
|
||||||
for (Map.Entry<Integer, List<MatchRange>> entry : matchesBySegment.entrySet()) {
|
|
||||||
int segmentIndex = entry.getKey();
|
|
||||||
List<MatchRange> segmentMatches = entry.getValue();
|
|
||||||
|
|
||||||
if (segmentIndex < 0 || segmentIndex >= textSegments.size()) continue;
|
|
||||||
TextSegment segment = textSegments.get(segmentIndex);
|
|
||||||
if (segment == null) continue;
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
if ("Tj".equals(segment.operatorName) || "'".equals(segment.operatorName)) {
|
String[] alternatives = {" ", ".", "-", "_", "~", "°", "·"};
|
||||||
String newText = applyRedactionsToSegmentText(segment, segmentMatches);
|
if (TextEncodingHelper.fontSupportsCharacter(font, " ")) {
|
||||||
if (newText == null) newText = "";
|
float spaceWidth = WidthCalculator.calculateAccurateWidth(font, " ", fontSize);
|
||||||
float adjustment = calculateWidthAdjustment(segment, segmentMatches);
|
if (spaceWidth > 0) {
|
||||||
tasks.add(new ModificationTask(segment, newText, adjustment));
|
int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth));
|
||||||
} else if ("TJ".equals(segment.operatorName)) {
|
int maxSpaces = (originalWord != null ? originalWord.length() : 1) * 2;
|
||||||
tasks.add(new ModificationTask(segment, "", 0));
|
return " ".repeat(Math.min(spaceCount, maxSpaces));
|
||||||
}
|
|
||||||
} catch (Exception e) {
|
|
||||||
// Skip this segment
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
tasks.sort((a, b) -> Integer.compare(b.segment.tokenIndex, a.segment.tokenIndex));
|
for (String alt : alternatives) {
|
||||||
|
if (" ".equals(alt)) continue;
|
||||||
int maxTasksToProcess = Math.min(tasks.size(), 1000);
|
|
||||||
|
|
||||||
for (int i = 0; i < maxTasksToProcess && i < tasks.size(); i++) {
|
|
||||||
ModificationTask task = tasks.get(i);
|
|
||||||
try {
|
try {
|
||||||
List<MatchRange> segmentMatches =
|
if (!TextEncodingHelper.fontSupportsCharacter(font, alt)) continue;
|
||||||
matchesBySegment.getOrDefault(
|
float cw = WidthCalculator.calculateAccurateWidth(font, alt, fontSize);
|
||||||
textSegments.indexOf(task.segment), Collections.emptyList());
|
if (cw > 0) {
|
||||||
|
int count = Math.max(1, Math.round(targetWidth / cw));
|
||||||
if (task.segment.tokenIndex >= newTokens.size()) continue;
|
int max = (originalWord != null ? originalWord.length() : 1) * 2;
|
||||||
if (task.segment.getText() == null || task.segment.getText().isEmpty()) continue;
|
return " ".repeat(Math.min(count, max));
|
||||||
|
}
|
||||||
modifyTokenForRedaction(
|
} catch (Exception ignored) {
|
||||||
newTokens, task.segment, task.newText, task.adjustment, segmentMatches);
|
}
|
||||||
|
}
|
||||||
|
return repeat;
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
// Skip this task
|
return repeat;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return newTokens;
|
|
||||||
}
|
|
||||||
|
|
||||||
private float calculateWidthAdjustment(TextSegment segment, List<MatchRange> matches) {
|
private float calculateWidthAdjustment(TextSegment segment, List<MatchRange> matches) {
|
||||||
if (segment == null
|
if (segment == null
|
||||||
|| matches == null
|
|| matches == null
|
||||||
@ -1311,36 +1253,164 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String createAlternativePlaceholder(
|
private List<Object> applyRedactionsToTokens(
|
||||||
String originalWord, float targetWidth, PDFont font, float fontSize) {
|
List<Object> tokens, List<TextSegment> textSegments, List<MatchRange> matches) {
|
||||||
final String repeat =
|
List<Object> newTokens = new ArrayList<>(tokens);
|
||||||
" ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1));
|
if (this.aggressiveMode) {
|
||||||
|
Map<Integer, List<AggressiveSegMatch>> perSeg = this.aggressiveSegMatches;
|
||||||
|
if (perSeg != null && !perSeg.isEmpty()) {
|
||||||
|
List<Integer> segIndices = new ArrayList<>(perSeg.keySet());
|
||||||
|
segIndices.sort(
|
||||||
|
(a, b) ->
|
||||||
|
Integer.compare(
|
||||||
|
textSegments.get(b).tokenIndex,
|
||||||
|
textSegments.get(a).tokenIndex));
|
||||||
|
for (Integer segIndex : segIndices) {
|
||||||
|
TextSegment segment = textSegments.get(segIndex);
|
||||||
|
List<AggressiveSegMatch> segMatches = perSeg.getOrDefault(segIndex, List.of());
|
||||||
|
if (segMatches.isEmpty()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
Object token = newTokens.get(segment.tokenIndex);
|
||||||
|
String opName = segment.operatorName;
|
||||||
|
if (("Tj".equals(opName) || "'".equals(opName) || "\"".equals(opName))
|
||||||
|
&& token instanceof COSString cs) {
|
||||||
|
COSString redacted =
|
||||||
|
redactCosStringByDecodedRanges(segment.font, cs, segMatches);
|
||||||
|
if (segment.font != null && segment.fontSize > 0) {
|
||||||
|
String originalText = getDecodedString(cs, segment.font);
|
||||||
|
String modifiedText = getDecodedString(redacted, segment.font);
|
||||||
|
float wOrig =
|
||||||
|
calculateSafeWidth(
|
||||||
|
originalText, segment.font, segment.fontSize);
|
||||||
|
float wMod =
|
||||||
|
calculateSafeWidth(
|
||||||
|
modifiedText, segment.font, segment.fontSize);
|
||||||
|
float adjustment = wOrig - wMod;
|
||||||
|
if (Math.abs(adjustment) > PRECISION_THRESHOLD) {
|
||||||
|
COSArray arr = new COSArray();
|
||||||
|
arr.add(redacted);
|
||||||
|
float kerning =
|
||||||
|
(-adjustment / segment.fontSize) * FONT_SCALE_FACTOR;
|
||||||
|
arr.add(new COSFloat(kerning));
|
||||||
|
newTokens.set(segment.tokenIndex, arr);
|
||||||
|
updateOperatorSafely(newTokens, segment.tokenIndex, opName);
|
||||||
|
} else {
|
||||||
|
newTokens.set(segment.tokenIndex, redacted);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
newTokens.set(segment.tokenIndex, redacted);
|
||||||
|
}
|
||||||
|
} else if ("TJ".equals(opName) && token instanceof COSArray arr) {
|
||||||
|
COSArray redacted =
|
||||||
|
redactTJArrayByDecodedRanges(segment.font, arr, segMatches);
|
||||||
|
// Inject kerning adjustments per string element to preserve layout
|
||||||
|
COSArray withKerning = buildKerningAdjustedTJArray(arr, redacted, segment);
|
||||||
|
newTokens.set(segment.tokenIndex, withKerning);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return newTokens;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Map<Integer, List<MatchRange>> matchesBySegment = new HashMap<>();
|
||||||
|
for (MatchRange match : matches) {
|
||||||
|
for (int i = 0; i < textSegments.size(); i++) {
|
||||||
|
TextSegment segment = textSegments.get(i);
|
||||||
|
int overlapStart = Math.max(match.startPos, segment.startPos);
|
||||||
|
int overlapEnd = Math.min(match.endPos, segment.endPos);
|
||||||
|
if (overlapStart < overlapEnd) {
|
||||||
|
matchesBySegment.computeIfAbsent(i, k -> new ArrayList<>()).add(match);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
List<ModificationTask> tasks = new ArrayList<>();
|
||||||
|
for (Map.Entry<Integer, List<MatchRange>> entry : matchesBySegment.entrySet()) {
|
||||||
|
int segmentIndex = entry.getKey();
|
||||||
|
List<MatchRange> segmentMatches = entry.getValue();
|
||||||
|
|
||||||
|
if (segmentIndex < 0 || segmentIndex >= textSegments.size()) continue;
|
||||||
|
TextSegment segment = textSegments.get(segmentIndex);
|
||||||
|
if (segment == null) continue;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
String[] alternatives = {" ", ".", "-", "_", "~", "°", "·"};
|
if ("Tj".equals(segment.operatorName) || "'".equals(segment.operatorName)) {
|
||||||
if (TextEncodingHelper.fontSupportsCharacter(font, " ")) {
|
String newText = applyRedactionsToSegmentText(segment, segmentMatches);
|
||||||
float spaceWidth = WidthCalculator.calculateAccurateWidth(font, " ", fontSize);
|
if (newText == null) newText = "";
|
||||||
if (spaceWidth > 0) {
|
float adjustment = calculateWidthAdjustment(segment, segmentMatches);
|
||||||
int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth));
|
tasks.add(new ModificationTask(segment, newText, adjustment));
|
||||||
int maxSpaces = originalWord.length() * 2;
|
} else if ("TJ".equals(segment.operatorName)) {
|
||||||
return " ".repeat(Math.min(spaceCount, maxSpaces));
|
tasks.add(new ModificationTask(segment, "", 0));
|
||||||
}
|
}
|
||||||
}
|
|
||||||
for (String alt : alternatives) {
|
|
||||||
if (" ".equals(alt)) continue;
|
|
||||||
try {
|
|
||||||
if (!TextEncodingHelper.fontSupportsCharacter(font, alt)) continue;
|
|
||||||
float cw = WidthCalculator.calculateAccurateWidth(font, alt, fontSize);
|
|
||||||
if (cw > 0) {
|
|
||||||
int count = Math.max(1, Math.round(targetWidth / cw));
|
|
||||||
int max = originalWord.length() * 2;
|
|
||||||
return " ".repeat(Math.min(count, max));
|
|
||||||
}
|
|
||||||
} catch (Exception ignored) {
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return repeat;
|
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
return repeat;
|
// Skip this segment
|
||||||
|
}
|
||||||
|
}
|
||||||
|
tasks.sort((a, b) -> Integer.compare(b.segment.tokenIndex, a.segment.tokenIndex));
|
||||||
|
|
||||||
|
int maxTasksToProcess = Math.min(tasks.size(), 1000);
|
||||||
|
|
||||||
|
for (int i = 0; i < maxTasksToProcess && i < tasks.size(); i++) {
|
||||||
|
ModificationTask task = tasks.get(i);
|
||||||
|
try {
|
||||||
|
List<MatchRange> segmentMatches =
|
||||||
|
matchesBySegment.getOrDefault(
|
||||||
|
textSegments.indexOf(task.segment), Collections.emptyList());
|
||||||
|
|
||||||
|
if (task.segment.tokenIndex >= newTokens.size()) continue;
|
||||||
|
if (task.segment.getText() == null || task.segment.getText().isEmpty()) continue;
|
||||||
|
|
||||||
|
modifyTokenForRedaction(
|
||||||
|
newTokens, task.segment, task.newText, task.adjustment, segmentMatches);
|
||||||
|
} catch (Exception e) {
|
||||||
|
// Skip this task
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return newTokens;
|
||||||
|
}
|
||||||
|
|
||||||
|
private COSArray buildKerningAdjustedTJArray(
|
||||||
|
COSArray originalArray, COSArray redactedArray, TextSegment segment) {
|
||||||
|
try {
|
||||||
|
if (segment == null || segment.getFont() == null || segment.getFontSize() <= 0)
|
||||||
|
return redactedArray;
|
||||||
|
|
||||||
|
COSArray out = new COSArray();
|
||||||
|
int size = redactedArray.size();
|
||||||
|
for (int i = 0; i < size; i++) {
|
||||||
|
COSBase redEl = redactedArray.get(i);
|
||||||
|
COSBase origEl =
|
||||||
|
(originalArray != null && i < originalArray.size())
|
||||||
|
? originalArray.get(i)
|
||||||
|
: null;
|
||||||
|
|
||||||
|
out.add(redEl);
|
||||||
|
|
||||||
|
if (redEl instanceof COSString redStr && origEl instanceof COSString origStr) {
|
||||||
|
String origText = getDecodedString(origStr, segment.getFont());
|
||||||
|
String modText = getDecodedString(redStr, segment.getFont());
|
||||||
|
float wOrig =
|
||||||
|
calculateSafeWidth(origText, segment.getFont(), segment.getFontSize());
|
||||||
|
float wMod =
|
||||||
|
calculateSafeWidth(modText, segment.getFont(), segment.getFontSize());
|
||||||
|
float adjustment = wOrig - wMod;
|
||||||
|
if (Math.abs(adjustment) > PRECISION_THRESHOLD) {
|
||||||
|
float kerning = (-adjustment / segment.getFontSize()) * FONT_SCALE_FACTOR;
|
||||||
|
// If next token is a number, combine; otherwise insert new number
|
||||||
|
if (i + 1 < size && redactedArray.get(i + 1) instanceof COSNumber num) {
|
||||||
|
// Skip adding the next separately and add combined value
|
||||||
|
i++;
|
||||||
|
float combined = num.floatValue() + kerning;
|
||||||
|
out.add(new COSFloat(combined));
|
||||||
|
} else {
|
||||||
|
out.add(new COSFloat(kerning));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out;
|
||||||
|
} catch (Exception e) {
|
||||||
|
return redactedArray;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user