mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-09-08 17:51:20 +02:00
refactor
Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
parent
e14941695e
commit
ebe17f4c93
@ -47,7 +47,6 @@ import org.apache.pdfbox.pdmodel.font.PDType0Font;
|
|||||||
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
|
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
|
||||||
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
|
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
|
||||||
import org.apache.pdfbox.pdmodel.graphics.pattern.PDTilingPattern;
|
import org.apache.pdfbox.pdmodel.graphics.pattern.PDTilingPattern;
|
||||||
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
|
|
||||||
import org.apache.pdfbox.rendering.PDFRenderer;
|
import org.apache.pdfbox.rendering.PDFRenderer;
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
import org.springframework.web.multipart.MultipartFile;
|
import org.springframework.web.multipart.MultipartFile;
|
||||||
@ -815,7 +814,7 @@ public class RedactionService {
|
|||||||
TextSegment segment) {
|
TextSegment segment) {
|
||||||
try {
|
try {
|
||||||
if (!isValidTokenIndex(tokens, tokenIndex) || segment == null) {
|
if (!isValidTokenIndex(tokens, tokenIndex) || segment == null) {
|
||||||
return TokenModificationResult.failure("Invalid token index or segment");
|
return TokenModificationResult.failure();
|
||||||
}
|
}
|
||||||
COSArray array = new COSArray();
|
COSArray array = new COSArray();
|
||||||
COSString cos =
|
COSString cos =
|
||||||
@ -831,21 +830,10 @@ public class RedactionService {
|
|||||||
updateOperatorSafely(tokens, tokenIndex, originalOperator);
|
updateOperatorSafely(tokens, tokenIndex, originalOperator);
|
||||||
return TokenModificationResult.success();
|
return TokenModificationResult.success();
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
return TokenModificationResult.failure("Conversion to TJ failed: " + e.getMessage());
|
return TokenModificationResult.failure();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static boolean isTextSafeForRedaction(String text) {
|
|
||||||
if (text == null || text.isEmpty()) return true;
|
|
||||||
|
|
||||||
for (char c : text.toCharArray()) {
|
|
||||||
if (c >= 65488 || (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r')) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static List<Object> deepCopyTokens(List<Object> original) {
|
private static List<Object> deepCopyTokens(List<Object> original) {
|
||||||
if (original == null) {
|
if (original == null) {
|
||||||
return new ArrayList<>();
|
return new ArrayList<>();
|
||||||
@ -904,8 +892,6 @@ public class RedactionService {
|
|||||||
TextFinderUtils.createOptimizedSearchPatterns(
|
TextFinderUtils.createOptimizedSearchPatterns(
|
||||||
targetWords, useRegex, wholeWordSearch);
|
targetWords, useRegex, wholeWordSearch);
|
||||||
|
|
||||||
int totalMatchesFound = 0;
|
|
||||||
|
|
||||||
for (int i = 0; i < segments.size(); i++) {
|
for (int i = 0; i < segments.size(); i++) {
|
||||||
TextSegment segment = segments.get(i);
|
TextSegment segment = segments.get(i);
|
||||||
String segmentText = segment.getText();
|
String segmentText = segment.getText();
|
||||||
@ -918,7 +904,6 @@ public class RedactionService {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
int segmentMatches = 0;
|
|
||||||
for (Pattern pattern : patterns) {
|
for (Pattern pattern : patterns) {
|
||||||
try {
|
try {
|
||||||
var matcher = pattern.matcher(segmentText);
|
var matcher = pattern.matcher(segmentText);
|
||||||
@ -929,17 +914,15 @@ public class RedactionService {
|
|||||||
if (matchStart >= 0
|
if (matchStart >= 0
|
||||||
&& matchEnd <= segmentText.length()
|
&& matchEnd <= segmentText.length()
|
||||||
&& matchStart < matchEnd) {
|
&& matchStart < matchEnd) {
|
||||||
String matchedText = segmentText.substring(matchStart, matchEnd);
|
|
||||||
|
|
||||||
allMatches.add(
|
allMatches.add(
|
||||||
new MatchRange(
|
new MatchRange(
|
||||||
segment.getStartPos() + matchStart,
|
segment.getStartPos() + matchStart,
|
||||||
segment.getStartPos() + matchEnd));
|
segment.getStartPos() + matchEnd));
|
||||||
segmentMatches++;
|
|
||||||
totalMatchesFound++;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
|
log.debug("Error matching pattern '{}': {}", pattern.pattern(), e.getMessage());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -952,25 +935,6 @@ public class RedactionService {
|
|||||||
return wipeAllSemanticTextInTokens(tokens, true);
|
return wipeAllSemanticTextInTokens(tokens, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String normalizeTextForRedaction(String text) {
|
|
||||||
if (text == null) return null;
|
|
||||||
|
|
||||||
StringBuilder normalized = new StringBuilder(text.length());
|
|
||||||
for (int i = 0; i < text.length(); i++) {
|
|
||||||
char c = text.charAt(i);
|
|
||||||
|
|
||||||
if (c >= 65488) {
|
|
||||||
normalized.append(' ');
|
|
||||||
} else if (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r') {
|
|
||||||
normalized.append(' ');
|
|
||||||
} else {
|
|
||||||
normalized.append(c);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return normalized.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
private static boolean isOcrMyPdfAvailable() {
|
private static boolean isOcrMyPdfAvailable() {
|
||||||
try {
|
try {
|
||||||
ProcessExecutorResult result =
|
ProcessExecutorResult result =
|
||||||
@ -1164,23 +1128,6 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public byte[] applySemanticScrubbing(MultipartFile file, Set<ScrubOption> scrubOptions)
|
|
||||||
throws IOException {
|
|
||||||
if (scrubOptions == null || scrubOptions.isEmpty()) {
|
|
||||||
return file.getBytes();
|
|
||||||
}
|
|
||||||
|
|
||||||
try (PDDocument document = pdfDocumentFactory.load(file)) {
|
|
||||||
DefaultSemanticScrubber scrubber = new DefaultSemanticScrubber();
|
|
||||||
scrubber.scrub(document, scrubOptions);
|
|
||||||
|
|
||||||
try (ByteArrayOutputStream output = new ByteArrayOutputStream()) {
|
|
||||||
document.save(output);
|
|
||||||
return output.toByteArray();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private static float calculateCharacterSumWidth(PDFont font, String text) {
|
private static float calculateCharacterSumWidth(PDFont font, String text) {
|
||||||
if (font == null || text == null || text.isEmpty()) {
|
if (font == null || text == null || text.isEmpty()) {
|
||||||
return -1f;
|
return -1f;
|
||||||
@ -1237,7 +1184,7 @@ public class RedactionService {
|
|||||||
float adjustment,
|
float adjustment,
|
||||||
TextSegment segment) {
|
TextSegment segment) {
|
||||||
if (!(token instanceof COSString)) {
|
if (!(token instanceof COSString)) {
|
||||||
return TokenModificationResult.failure("Expected COSString");
|
return TokenModificationResult.failure();
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@ -1251,7 +1198,7 @@ public class RedactionService {
|
|||||||
tokens, tokenIndex, operatorName, newText, adjustment, segment);
|
tokens, tokenIndex, operatorName, newText, adjustment, segment);
|
||||||
}
|
}
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
return TokenModificationResult.failure("Modification failed: " + e.getMessage());
|
return TokenModificationResult.failure();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1506,6 +1453,7 @@ public class RedactionService {
|
|||||||
return WidthCalculator.calculateAccurateWidth(font, text, fontSize);
|
return WidthCalculator.calculateAccurateWidth(font, text, fontSize);
|
||||||
}
|
}
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
|
log.error("Failed to calculate safe width for text: {}", text, e);
|
||||||
}
|
}
|
||||||
return 0f;
|
return 0f;
|
||||||
}
|
}
|
||||||
@ -1549,6 +1497,7 @@ public class RedactionService {
|
|||||||
return alt.repeat(Math.min(count, max));
|
return alt.repeat(Math.min(count, max));
|
||||||
}
|
}
|
||||||
} catch (Exception ignored) {
|
} catch (Exception ignored) {
|
||||||
|
log.error("Failed to calculate alternative placeholder width for {}", alt);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return repeat;
|
return repeat;
|
||||||
@ -1836,19 +1785,11 @@ public class RedactionService {
|
|||||||
return problematicRatio > 0.3;
|
return problematicRatio > 0.3;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void processResidualText(PDDocument document, PDPage page, List<Object> filtered) {
|
private static String handleTjOperator(Object token, PDFont font) {
|
||||||
try {
|
// Note: TJ vs Tj is different
|
||||||
var sem = wipeAllSemanticTextInTokens(filtered);
|
return (token instanceof COSString cosString)
|
||||||
filtered = sem.tokens;
|
? extractStringWithFallbacks(cosString, font)
|
||||||
PDResources res = page.getResources();
|
: "";
|
||||||
if (res != null) {
|
|
||||||
wipeAllSemanticTextInProperties(res);
|
|
||||||
wipeAllTextInXObjects(document, res);
|
|
||||||
wipeAllTextInPatterns(document, res);
|
|
||||||
}
|
|
||||||
writeFilteredContentStream(document, page, filtered);
|
|
||||||
} catch (Exception ignored) {
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean performTextReplacement(
|
public boolean performTextReplacement(
|
||||||
@ -1965,21 +1906,11 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private TokenModificationResult performTokenModification(
|
private static String handleQuotedOperator(Object token, PDFont font) {
|
||||||
List<Object> tokens,
|
// Do not add an extra newline; it shifts indices and breaks match ranges (important!!!)
|
||||||
Object token,
|
return (token instanceof COSString cosString)
|
||||||
String operatorName,
|
? extractStringWithFallbacks(cosString, font)
|
||||||
String newText,
|
: "";
|
||||||
float adjustment,
|
|
||||||
TextSegment segment,
|
|
||||||
List<MatchRange> matches) {
|
|
||||||
return switch (operatorName) {
|
|
||||||
case "Tj", "'", "\"" ->
|
|
||||||
modifySimpleTextOperator(
|
|
||||||
tokens, token, operatorName, newText, adjustment, segment);
|
|
||||||
case "TJ" -> modifyTJOperator(tokens, token, segment, matches);
|
|
||||||
default -> TokenModificationResult.failure("Unsupported operator: " + operatorName);
|
|
||||||
};
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void processPages(
|
private void processPages(
|
||||||
@ -2003,155 +1934,36 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String handleTjOperator(Object token, PDFont font) {
|
private static String extractTextFromToken(
|
||||||
return (token instanceof COSString cosString)
|
Object token, String operatorName, PDFont currentFont) {
|
||||||
? extractStringWithFallbacks(cosString, font)
|
if (token == null || operatorName == null) return "";
|
||||||
: "";
|
|
||||||
|
try {
|
||||||
|
return switch (operatorName) {
|
||||||
|
case "Tj" -> handleTjOperator(token, currentFont);
|
||||||
|
case "'", "\"" -> handleQuotedOperator(token, currentFont);
|
||||||
|
case "TJ" -> handleTJOperator(token, currentFont);
|
||||||
|
default -> "";
|
||||||
|
};
|
||||||
|
} catch (Exception e) {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<Object> applyRedactionsToTokens(
|
private void processResidualText(PDDocument document, PDPage page, List<Object> filtered) {
|
||||||
List<Object> tokens, List<TextSegment> textSegments, List<MatchRange> matches) {
|
try {
|
||||||
|
var sem = wipeAllSemanticTextInTokens(filtered);
|
||||||
List<Object> newTokens = new ArrayList<>(tokens);
|
filtered = sem.tokens;
|
||||||
|
PDResources res = page.getResources();
|
||||||
if (this.aggressiveMode) {
|
if (res != null) {
|
||||||
|
wipeAllSemanticTextInProperties(res);
|
||||||
Map<Integer, List<AggressiveSegMatch>> perSeg = this.aggressiveSegMatches;
|
wipeAllTextInXObjects(document, res);
|
||||||
if (perSeg != null && !perSeg.isEmpty()) {
|
wipeAllTextInPatterns(document, res);
|
||||||
|
|
||||||
List<Integer> segIndices = new ArrayList<>(perSeg.keySet());
|
|
||||||
segIndices.sort(
|
|
||||||
(a, b) ->
|
|
||||||
Integer.compare(
|
|
||||||
textSegments.get(b).tokenIndex,
|
|
||||||
textSegments.get(a).tokenIndex));
|
|
||||||
for (Integer segIndex : segIndices) {
|
|
||||||
TextSegment segment = textSegments.get(segIndex);
|
|
||||||
List<AggressiveSegMatch> segMatches = perSeg.getOrDefault(segIndex, List.of());
|
|
||||||
if (segMatches.isEmpty()) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
Object token = newTokens.get(segment.tokenIndex);
|
|
||||||
String opName = segment.operatorName;
|
|
||||||
if (("Tj".equals(opName) || "'".equals(opName) || "\"".equals(opName))
|
|
||||||
&& token instanceof COSString cs) {
|
|
||||||
|
|
||||||
COSString redacted =
|
|
||||||
redactCosStringByDecodedRanges(segment.font, cs, segMatches);
|
|
||||||
if (segment.font != null && segment.fontSize > 0) {
|
|
||||||
String originalText = getDecodedString(cs, segment.font);
|
|
||||||
String modifiedText = getDecodedString(redacted, segment.font);
|
|
||||||
|
|
||||||
float wOrig =
|
|
||||||
calculateSafeWidth(
|
|
||||||
originalText, segment.font, segment.fontSize);
|
|
||||||
float wMod =
|
|
||||||
calculateSafeWidth(
|
|
||||||
modifiedText, segment.font, segment.fontSize);
|
|
||||||
float adjustment = wOrig - wMod;
|
|
||||||
if (Math.abs(adjustment) > PRECISION_THRESHOLD) {
|
|
||||||
|
|
||||||
COSArray arr = new COSArray();
|
|
||||||
arr.add(redacted);
|
|
||||||
float kerning =
|
|
||||||
(-adjustment / segment.fontSize) * FONT_SCALE_FACTOR;
|
|
||||||
arr.add(new COSFloat(kerning));
|
|
||||||
newTokens.set(segment.tokenIndex, arr);
|
|
||||||
updateOperatorSafely(newTokens, segment.tokenIndex, opName);
|
|
||||||
} else {
|
|
||||||
newTokens.set(segment.tokenIndex, redacted);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
newTokens.set(segment.tokenIndex, redacted);
|
|
||||||
}
|
|
||||||
} else if ("TJ".equals(opName) && token instanceof COSArray arr) {
|
|
||||||
|
|
||||||
COSArray redacted =
|
|
||||||
redactTJArrayByDecodedRanges(segment.font, arr, segMatches);
|
|
||||||
COSArray withKerning = buildKerningAdjustedTJArray(arr, redacted, segment);
|
|
||||||
newTokens.set(segment.tokenIndex, withKerning);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return newTokens;
|
|
||||||
}
|
}
|
||||||
|
writeFilteredContentStream(document, page, filtered);
|
||||||
|
} catch (Exception ignored) {
|
||||||
|
log.debug("Error processing residual text: {}", ignored.getMessage());
|
||||||
}
|
}
|
||||||
|
|
||||||
Map<Integer, List<MatchRange>> matchesBySegment = new HashMap<>();
|
|
||||||
for (MatchRange match : matches) {
|
|
||||||
for (int i = 0; i < textSegments.size(); i++) {
|
|
||||||
TextSegment segment = textSegments.get(i);
|
|
||||||
int overlapStart = Math.max(match.startPos, segment.startPos);
|
|
||||||
int overlapEnd = Math.min(match.endPos, segment.endPos);
|
|
||||||
if (overlapStart < overlapEnd) {
|
|
||||||
matchesBySegment.computeIfAbsent(i, k -> new ArrayList<>()).add(match);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
List<ModificationTask> tasks = new ArrayList<>();
|
|
||||||
for (Map.Entry<Integer, List<MatchRange>> entry : matchesBySegment.entrySet()) {
|
|
||||||
int segmentIndex = entry.getKey();
|
|
||||||
List<MatchRange> segmentMatches = entry.getValue();
|
|
||||||
|
|
||||||
if (segmentIndex < 0 || segmentIndex >= textSegments.size()) {
|
|
||||||
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
TextSegment segment = textSegments.get(segmentIndex);
|
|
||||||
if (segment == null) {
|
|
||||||
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
if ("Tj".equals(segment.operatorName)
|
|
||||||
|| "'".equals(segment.operatorName)
|
|
||||||
|| "\"".equals(segment.operatorName)) {
|
|
||||||
|
|
||||||
String newText = applyRedactionsToSegmentText(segment, segmentMatches);
|
|
||||||
if (newText == null) newText = "";
|
|
||||||
float adjustment = calculateWidthAdjustment(segment, segmentMatches);
|
|
||||||
tasks.add(new ModificationTask(segment, newText, adjustment));
|
|
||||||
|
|
||||||
} else if ("TJ".equals(segment.operatorName)) {
|
|
||||||
|
|
||||||
tasks.add(new ModificationTask(segment, "", 0));
|
|
||||||
}
|
|
||||||
} catch (Exception e) {
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
tasks.sort((a, b) -> Integer.compare(b.segment.tokenIndex, a.segment.tokenIndex));
|
|
||||||
|
|
||||||
int maxTasksToProcess = Math.min(tasks.size(), 1000);
|
|
||||||
for (int i = 0; i < maxTasksToProcess && i < tasks.size(); i++) {
|
|
||||||
ModificationTask task = tasks.get(i);
|
|
||||||
try {
|
|
||||||
List<MatchRange> segmentMatches =
|
|
||||||
matchesBySegment.getOrDefault(
|
|
||||||
textSegments.indexOf(task.segment), Collections.emptyList());
|
|
||||||
|
|
||||||
if (task.segment.tokenIndex >= newTokens.size()) {
|
|
||||||
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (task.segment.getText() == null || task.segment.getText().isEmpty()) {
|
|
||||||
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
modifyTokenForRedaction(
|
|
||||||
newTokens, task.segment, task.newText, task.adjustment, segmentMatches);
|
|
||||||
|
|
||||||
} catch (Exception e) {
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return newTokens;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<TextSegment> extractTextSegmentsFromTokens(
|
private List<TextSegment> extractTextSegmentsFromTokens(
|
||||||
@ -2200,11 +2012,21 @@ public class RedactionService {
|
|||||||
return segments;
|
return segments;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String handleQuotedOperator(Object token, PDFont font) {
|
private TokenModificationResult performTokenModification(
|
||||||
// Do not add an extra newline; it shifts indices and breaks match ranges
|
List<Object> tokens,
|
||||||
return (token instanceof COSString cosString)
|
Object token,
|
||||||
? extractStringWithFallbacks(cosString, font)
|
String operatorName,
|
||||||
: "";
|
String newText,
|
||||||
|
float adjustment,
|
||||||
|
TextSegment segment,
|
||||||
|
List<MatchRange> matches) {
|
||||||
|
return switch (operatorName) {
|
||||||
|
case "Tj", "'", "\"" ->
|
||||||
|
modifySimpleTextOperator(
|
||||||
|
tokens, token, operatorName, newText, adjustment, segment);
|
||||||
|
case "TJ" -> modifyTJOperator(tokens, token, segment, matches);
|
||||||
|
default -> TokenModificationResult.failure();
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<MatchRange> findAllMatchesAggressive(
|
private List<MatchRange> findAllMatchesAggressive(
|
||||||
@ -2461,22 +2283,149 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private TokenModificationResult modifyTJOperator(
|
private List<Object> applyRedactionsToTokens(
|
||||||
List<Object> tokens, Object token, TextSegment segment, List<MatchRange> matches) {
|
List<Object> tokens, List<TextSegment> textSegments, List<MatchRange> matches) {
|
||||||
if (!(token instanceof COSArray originalArray)) {
|
|
||||||
return TokenModificationResult.failure("Expected COSArray for TJ operator");
|
List<Object> newTokens = new ArrayList<>(tokens);
|
||||||
|
|
||||||
|
if (this.aggressiveMode) {
|
||||||
|
|
||||||
|
Map<Integer, List<AggressiveSegMatch>> perSeg = this.aggressiveSegMatches;
|
||||||
|
if (perSeg != null && !perSeg.isEmpty()) {
|
||||||
|
|
||||||
|
List<Integer> segIndices = new ArrayList<>(perSeg.keySet());
|
||||||
|
segIndices.sort(
|
||||||
|
(a, b) ->
|
||||||
|
Integer.compare(
|
||||||
|
textSegments.get(b).tokenIndex,
|
||||||
|
textSegments.get(a).tokenIndex));
|
||||||
|
for (Integer segIndex : segIndices) {
|
||||||
|
TextSegment segment = textSegments.get(segIndex);
|
||||||
|
List<AggressiveSegMatch> segMatches = perSeg.getOrDefault(segIndex, List.of());
|
||||||
|
if (segMatches.isEmpty()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
Object token = newTokens.get(segment.tokenIndex);
|
||||||
|
String opName = segment.operatorName;
|
||||||
|
if (("Tj".equals(opName) || "'".equals(opName) || "\"".equals(opName))
|
||||||
|
&& token instanceof COSString cs) {
|
||||||
|
|
||||||
|
COSString redacted =
|
||||||
|
redactCosStringByDecodedRanges(segment.font, cs, segMatches);
|
||||||
|
if (segment.font != null && segment.fontSize > 0) {
|
||||||
|
String originalText = getDecodedString(cs, segment.font);
|
||||||
|
String modifiedText = getDecodedString(redacted, segment.font);
|
||||||
|
|
||||||
|
float wOrig =
|
||||||
|
calculateSafeWidth(
|
||||||
|
originalText, segment.font, segment.fontSize);
|
||||||
|
float wMod =
|
||||||
|
calculateSafeWidth(
|
||||||
|
modifiedText, segment.font, segment.fontSize);
|
||||||
|
float adjustment = wOrig - wMod;
|
||||||
|
if (Math.abs(adjustment) > PRECISION_THRESHOLD) {
|
||||||
|
|
||||||
|
COSArray arr = new COSArray();
|
||||||
|
arr.add(redacted);
|
||||||
|
float kerning =
|
||||||
|
(-adjustment / segment.fontSize) * FONT_SCALE_FACTOR;
|
||||||
|
arr.add(new COSFloat(kerning));
|
||||||
|
newTokens.set(segment.tokenIndex, arr);
|
||||||
|
updateOperatorSafely(newTokens, segment.tokenIndex, opName);
|
||||||
|
} else {
|
||||||
|
newTokens.set(segment.tokenIndex, redacted);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
newTokens.set(segment.tokenIndex, redacted);
|
||||||
|
}
|
||||||
|
} else if ("TJ".equals(opName) && token instanceof COSArray arr) {
|
||||||
|
|
||||||
|
COSArray redacted =
|
||||||
|
redactTJArrayByDecodedRanges(segment.font, arr, segMatches);
|
||||||
|
COSArray withKerning = buildKerningAdjustedTJArray(arr, redacted, segment);
|
||||||
|
newTokens.set(segment.tokenIndex, withKerning);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return newTokens;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
Map<Integer, List<MatchRange>> matchesBySegment = new HashMap<>();
|
||||||
COSArray newArray = createRedactedTJArray(originalArray, segment, matches);
|
for (MatchRange match : matches) {
|
||||||
if (!isValidTJArray(newArray)) {
|
for (int i = 0; i < textSegments.size(); i++) {
|
||||||
return TokenModificationResult.failure("Generated invalid TJ array");
|
TextSegment segment = textSegments.get(i);
|
||||||
|
int overlapStart = Math.max(match.startPos, segment.startPos);
|
||||||
|
int overlapEnd = Math.min(match.endPos, segment.endPos);
|
||||||
|
if (overlapStart < overlapEnd) {
|
||||||
|
matchesBySegment.computeIfAbsent(i, k -> new ArrayList<>()).add(match);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
tokens.set(segment.tokenIndex, newArray);
|
|
||||||
return TokenModificationResult.success();
|
|
||||||
} catch (Exception e) {
|
|
||||||
return TokenModificationResult.failure("TJ modification failed: " + e.getMessage());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
List<ModificationTask> tasks = new ArrayList<>();
|
||||||
|
for (Map.Entry<Integer, List<MatchRange>> entry : matchesBySegment.entrySet()) {
|
||||||
|
int segmentIndex = entry.getKey();
|
||||||
|
List<MatchRange> segmentMatches = entry.getValue();
|
||||||
|
|
||||||
|
if (segmentIndex < 0 || segmentIndex >= textSegments.size()) {
|
||||||
|
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
TextSegment segment = textSegments.get(segmentIndex);
|
||||||
|
if (segment == null) {
|
||||||
|
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
if ("Tj".equals(segment.operatorName)
|
||||||
|
|| "'".equals(segment.operatorName)
|
||||||
|
|| "\"".equals(segment.operatorName)) {
|
||||||
|
|
||||||
|
String newText = applyRedactionsToSegmentText(segment, segmentMatches);
|
||||||
|
if (newText == null) newText = "";
|
||||||
|
float adjustment = calculateWidthAdjustment(segment, segmentMatches);
|
||||||
|
tasks.add(new ModificationTask(segment, newText, adjustment));
|
||||||
|
|
||||||
|
} else if ("TJ".equals(segment.operatorName)) {
|
||||||
|
|
||||||
|
tasks.add(new ModificationTask(segment, "", 0));
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.warn("Error processing token: {}", e.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
tasks.sort((a, b) -> Integer.compare(b.segment.tokenIndex, a.segment.tokenIndex));
|
||||||
|
|
||||||
|
int maxTasksToProcess = Math.min(tasks.size(), 1000);
|
||||||
|
for (int i = 0; i < maxTasksToProcess && i < tasks.size(); i++) {
|
||||||
|
ModificationTask task = tasks.get(i);
|
||||||
|
try {
|
||||||
|
List<MatchRange> segmentMatches =
|
||||||
|
matchesBySegment.getOrDefault(
|
||||||
|
textSegments.indexOf(task.segment), Collections.emptyList());
|
||||||
|
|
||||||
|
if (task.segment.tokenIndex >= newTokens.size()) {
|
||||||
|
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (task.segment.getText() == null || task.segment.getText().isEmpty()) {
|
||||||
|
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
modifyTokenForRedaction(
|
||||||
|
newTokens, task.segment, task.newText, task.adjustment, segmentMatches);
|
||||||
|
|
||||||
|
} catch (Exception e) {
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return newTokens;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String extractStringWithFallbacks(COSString cosString, PDFont font) {
|
private static String extractStringWithFallbacks(COSString cosString, PDFont font) {
|
||||||
@ -2552,18 +2501,21 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private String extractTextFromToken(Object token, String operatorName, PDFont currentFont) {
|
private TokenModificationResult modifyTJOperator(
|
||||||
if (token == null || operatorName == null) return "";
|
List<Object> tokens, Object token, TextSegment segment, List<MatchRange> matches) {
|
||||||
|
if (!(token instanceof COSArray originalArray)) {
|
||||||
|
return TokenModificationResult.failure();
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
return switch (operatorName) {
|
COSArray newArray = createRedactedTJArray(originalArray, segment, matches);
|
||||||
case "Tj" -> handleTjOperator(token, currentFont);
|
if (!isValidTJArray(newArray)) {
|
||||||
case "'", "\"" -> handleQuotedOperator(token, currentFont);
|
return TokenModificationResult.failure();
|
||||||
case "TJ" -> handleTJOperator(token, currentFont);
|
}
|
||||||
default -> "";
|
tokens.set(segment.tokenIndex, newArray);
|
||||||
};
|
return TokenModificationResult.success();
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
return "";
|
return TokenModificationResult.failure();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2791,12 +2743,7 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private record WidthCalculationResult(float adjustment, int processedMatches) {
|
private record WidthCalculationResult(float adjustment, int processedMatches) {}
|
||||||
private WidthCalculationResult(float adjustment, int processedMatches) {
|
|
||||||
this.adjustment = adjustment;
|
|
||||||
this.processedMatches = processedMatches;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public enum FallbackStrategy {
|
public enum FallbackStrategy {
|
||||||
EMBED_WIDTH,
|
EMBED_WIDTH,
|
||||||
@ -2807,16 +2754,16 @@ public class RedactionService {
|
|||||||
private static class TokenModificationResult {
|
private static class TokenModificationResult {
|
||||||
@Getter private final boolean success;
|
@Getter private final boolean success;
|
||||||
|
|
||||||
private TokenModificationResult(boolean success, String errorMessage) {
|
private TokenModificationResult(boolean success) {
|
||||||
this.success = success;
|
this.success = success;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static TokenModificationResult success() {
|
public static TokenModificationResult success() {
|
||||||
return new TokenModificationResult(true, null);
|
return new TokenModificationResult(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static TokenModificationResult failure(String errorMessage) {
|
public static TokenModificationResult failure() {
|
||||||
return new TokenModificationResult(false, errorMessage);
|
return new TokenModificationResult(false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2883,15 +2830,6 @@ public class RedactionService {
|
|||||||
int modifications;
|
int modifications;
|
||||||
}
|
}
|
||||||
|
|
||||||
public enum ScrubOption {
|
|
||||||
REMOVE_ACTUALTEXT,
|
|
||||||
REMOVE_ALT,
|
|
||||||
REMOVE_TU,
|
|
||||||
NORMALIZE_WHITESPACE
|
|
||||||
}
|
|
||||||
|
|
||||||
public interface SemanticScrubber {}
|
|
||||||
|
|
||||||
private static class GlyphCoverageProbe {
|
private static class GlyphCoverageProbe {
|
||||||
private final PDFont font;
|
private final PDFont font;
|
||||||
private final Set<Integer> availableGlyphs;
|
private final Set<Integer> availableGlyphs;
|
||||||
@ -2901,7 +2839,7 @@ public class RedactionService {
|
|||||||
this.availableGlyphs = buildGlyphCoverage(font);
|
this.availableGlyphs = buildGlyphCoverage(font);
|
||||||
}
|
}
|
||||||
|
|
||||||
private Set<Integer> buildGlyphCoverage(PDFont font) {
|
private static Set<Integer> buildGlyphCoverage(PDFont font) {
|
||||||
Set<Integer> coverage = new HashSet<>();
|
Set<Integer> coverage = new HashSet<>();
|
||||||
if (font == null) return coverage;
|
if (font == null) return coverage;
|
||||||
|
|
||||||
@ -2938,21 +2876,8 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public float getWidthWithFallback(
|
private static float getLegacySumFallback(float fontSize) {
|
||||||
int codePoint, FallbackStrategy strategy, float fontSize) {
|
return fontSize * 0.6f;
|
||||||
if (hasGlyph(codePoint)) {
|
|
||||||
try {
|
|
||||||
String charStr = new String(Character.toChars(codePoint));
|
|
||||||
return font.getStringWidth(charStr) / FONT_SCALE_FACTOR * fontSize;
|
|
||||||
} catch (Exception e) {
|
|
||||||
log.debug("Failed to get width for codepoint {}", codePoint, e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return switch (strategy) {
|
|
||||||
case EMBED_WIDTH -> getEmbeddedProgramWidth(fontSize);
|
|
||||||
case AVERAGE_WIDTH -> getAverageFontWidth(fontSize);
|
|
||||||
case LEGACY_SUM -> getLegacySumFallback(codePoint, fontSize);
|
|
||||||
};
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private float getEmbeddedProgramWidth(float fontSize) {
|
private float getEmbeddedProgramWidth(float fontSize) {
|
||||||
@ -3002,110 +2927,21 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static float getLegacySumFallback(int codePoint, float fontSize) {
|
public float getWidthWithFallback(
|
||||||
return fontSize * 0.6f;
|
int codePoint, FallbackStrategy strategy, float fontSize) {
|
||||||
}
|
if (hasGlyph(codePoint)) {
|
||||||
}
|
try {
|
||||||
|
String charStr = new String(Character.toChars(codePoint));
|
||||||
public static class DefaultSemanticScrubber implements SemanticScrubber {
|
return font.getStringWidth(charStr) / FONT_SCALE_FACTOR * fontSize;
|
||||||
|
} catch (Exception e) {
|
||||||
private void scrub(PDDocument document, Set<ScrubOption> options) {
|
log.debug("Failed to get width for codepoint {}", codePoint, e);
|
||||||
if (document == null || options == null || options.isEmpty()) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
scrubStructureTree(document, options);
|
|
||||||
|
|
||||||
if (options.contains(ScrubOption.REMOVE_ACTUALTEXT)
|
|
||||||
|| options.contains(ScrubOption.REMOVE_ALT)
|
|
||||||
|| options.contains(ScrubOption.REMOVE_TU)) {
|
|
||||||
scrubAnnotations(document, options);
|
|
||||||
}
|
|
||||||
|
|
||||||
} catch (Exception e) {
|
|
||||||
log.debug("Failed to scrub document", e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void scrubStructureTree(PDDocument document, Set<ScrubOption> options) {
|
|
||||||
try {
|
|
||||||
COSDictionary catalog = document.getDocumentCatalog().getCOSObject();
|
|
||||||
COSBase structTreeRoot = catalog.getDictionaryObject(COSName.STRUCT_TREE_ROOT);
|
|
||||||
|
|
||||||
if (structTreeRoot instanceof COSDictionary structRoot) {
|
|
||||||
scrubStructureElement(structRoot, options);
|
|
||||||
}
|
|
||||||
} catch (Exception e) {
|
|
||||||
log.debug("Failed to scrub structure tree", e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private static void scrubStructureElement(COSDictionary element, Set<ScrubOption> options) {
|
|
||||||
if (element == null) return;
|
|
||||||
|
|
||||||
if (options.contains(ScrubOption.REMOVE_ACTUALTEXT)) {
|
|
||||||
element.removeItem(COSName.ACTUAL_TEXT);
|
|
||||||
}
|
|
||||||
if (options.contains(ScrubOption.REMOVE_ALT)) {
|
|
||||||
element.removeItem(COSName.ALT);
|
|
||||||
}
|
|
||||||
if (options.contains(ScrubOption.REMOVE_TU)) {
|
|
||||||
element.removeItem(COSName.TU);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (options.contains(ScrubOption.NORMALIZE_WHITESPACE)) {
|
|
||||||
normalizeWhitespaceInElement(element);
|
|
||||||
}
|
|
||||||
|
|
||||||
COSBase kids = element.getDictionaryObject(COSName.K);
|
|
||||||
if (kids instanceof COSArray kidsArray) {
|
|
||||||
for (COSBase kid : kidsArray) {
|
|
||||||
if (kid instanceof COSDictionary kidDict) {
|
|
||||||
scrubStructureElement(kidDict, options);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if (kids instanceof COSDictionary kidDict) {
|
|
||||||
scrubStructureElement(kidDict, options);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private static void normalizeWhitespaceInElement(COSDictionary element) {
|
|
||||||
for (COSName key : List.of(COSName.ACTUAL_TEXT, COSName.ALT, COSName.TU)) {
|
|
||||||
COSBase value = element.getDictionaryObject(key);
|
|
||||||
if (value instanceof COSString cosString) {
|
|
||||||
String text = cosString.getString();
|
|
||||||
String normalized = text.replaceAll("\\s+", " ").trim();
|
|
||||||
if (normalized.length() > 256) {
|
|
||||||
normalized = normalized.substring(0, 256);
|
|
||||||
}
|
|
||||||
element.setString(key, normalized);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
return switch (strategy) {
|
||||||
|
case EMBED_WIDTH -> getEmbeddedProgramWidth(fontSize);
|
||||||
private void scrubAnnotations(PDDocument document, Set<ScrubOption> options) {
|
case AVERAGE_WIDTH -> getAverageFontWidth(fontSize);
|
||||||
try {
|
case LEGACY_SUM -> getLegacySumFallback(fontSize);
|
||||||
for (PDPage page : document.getPages()) {
|
};
|
||||||
for (PDAnnotation annotation : page.getAnnotations()) {
|
|
||||||
COSDictionary annotDict = annotation.getCOSObject();
|
|
||||||
|
|
||||||
if (options.contains(ScrubOption.REMOVE_ACTUALTEXT)) {
|
|
||||||
annotDict.removeItem(COSName.ACTUAL_TEXT);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (options.contains(ScrubOption.REMOVE_ALT)) {
|
|
||||||
annotDict.removeItem(COSName.ALT);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (options.contains(ScrubOption.REMOVE_TU)) {
|
|
||||||
annotDict.removeItem(COSName.TU);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (Exception e) {
|
|
||||||
log.debug("Failed to scrub annotations", e);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user