Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
Balázs Szücs 2025-09-02 21:38:23 +02:00
parent e14941695e
commit ebe17f4c93


@@ -47,7 +47,6 @@ import org.apache.pdfbox.pdmodel.font.PDType0Font;
 import org.apache.pdfbox.pdmodel.graphics.PDXObject;
 import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
 import org.apache.pdfbox.pdmodel.graphics.pattern.PDTilingPattern;
-import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
 import org.apache.pdfbox.rendering.PDFRenderer;
 import org.springframework.stereotype.Service;
 import org.springframework.web.multipart.MultipartFile;
@@ -815,7 +814,7 @@ public class RedactionService {
             TextSegment segment) {
         try {
             if (!isValidTokenIndex(tokens, tokenIndex) || segment == null) {
-                return TokenModificationResult.failure("Invalid token index or segment");
+                return TokenModificationResult.failure();
             }
             COSArray array = new COSArray();
             COSString cos =
@@ -831,21 +830,10 @@ public class RedactionService {
             updateOperatorSafely(tokens, tokenIndex, originalOperator);
             return TokenModificationResult.success();
         } catch (Exception e) {
-            return TokenModificationResult.failure("Conversion to TJ failed: " + e.getMessage());
+            return TokenModificationResult.failure();
         }
     }
 
-    private static boolean isTextSafeForRedaction(String text) {
-        if (text == null || text.isEmpty()) return true;
-        for (char c : text.toCharArray()) {
-            if (c >= 65488 || (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r')) {
-                return false;
-            }
-        }
-        return true;
-    }
-
     private static List<Object> deepCopyTokens(List<Object> original) {
         if (original == null) {
             return new ArrayList<>();
@@ -904,8 +892,6 @@ public class RedactionService {
                 TextFinderUtils.createOptimizedSearchPatterns(
                         targetWords, useRegex, wholeWordSearch);
-        int totalMatchesFound = 0;
-
         for (int i = 0; i < segments.size(); i++) {
             TextSegment segment = segments.get(i);
             String segmentText = segment.getText();
@@ -918,7 +904,6 @@ public class RedactionService {
                 continue;
             }
-            int segmentMatches = 0;
             for (Pattern pattern : patterns) {
                 try {
                     var matcher = pattern.matcher(segmentText);
@@ -929,17 +914,15 @@ public class RedactionService {
                         if (matchStart >= 0
                                 && matchEnd <= segmentText.length()
                                 && matchStart < matchEnd) {
-                            String matchedText = segmentText.substring(matchStart, matchEnd);
                             allMatches.add(
                                     new MatchRange(
                                             segment.getStartPos() + matchStart,
                                             segment.getStartPos() + matchEnd));
-                            segmentMatches++;
-                            totalMatchesFound++;
                         }
                     }
                 } catch (Exception e) {
+                    log.debug("Error matching pattern '{}': {}", pattern.pattern(), e.getMessage());
                 }
             }
         }
@@ -952,25 +935,6 @@ public class RedactionService {
         return wipeAllSemanticTextInTokens(tokens, true);
     }
 
-    private static String normalizeTextForRedaction(String text) {
-        if (text == null) return null;
-        StringBuilder normalized = new StringBuilder(text.length());
-        for (int i = 0; i < text.length(); i++) {
-            char c = text.charAt(i);
-            if (c >= 65488) {
-                normalized.append(' ');
-            } else if (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r') {
-                normalized.append(' ');
-            } else {
-                normalized.append(c);
-            }
-        }
-        return normalized.toString();
-    }
-
     private static boolean isOcrMyPdfAvailable() {
         try {
             ProcessExecutorResult result =
@@ -1164,23 +1128,6 @@ public class RedactionService {
         }
     }
 
-    public byte[] applySemanticScrubbing(MultipartFile file, Set<ScrubOption> scrubOptions)
-            throws IOException {
-        if (scrubOptions == null || scrubOptions.isEmpty()) {
-            return file.getBytes();
-        }
-        try (PDDocument document = pdfDocumentFactory.load(file)) {
-            DefaultSemanticScrubber scrubber = new DefaultSemanticScrubber();
-            scrubber.scrub(document, scrubOptions);
-            try (ByteArrayOutputStream output = new ByteArrayOutputStream()) {
-                document.save(output);
-                return output.toByteArray();
-            }
-        }
-    }
-
     private static float calculateCharacterSumWidth(PDFont font, String text) {
         if (font == null || text == null || text.isEmpty()) {
             return -1f;
@@ -1237,7 +1184,7 @@ public class RedactionService {
             float adjustment,
             TextSegment segment) {
         if (!(token instanceof COSString)) {
-            return TokenModificationResult.failure("Expected COSString");
+            return TokenModificationResult.failure();
         }
         try {
@@ -1251,7 +1198,7 @@ public class RedactionService {
                         tokens, tokenIndex, operatorName, newText, adjustment, segment);
             }
         } catch (Exception e) {
-            return TokenModificationResult.failure("Modification failed: " + e.getMessage());
+            return TokenModificationResult.failure();
         }
     }
@@ -1506,6 +1453,7 @@ public class RedactionService {
                 return WidthCalculator.calculateAccurateWidth(font, text, fontSize);
             }
         } catch (Exception e) {
+            log.error("Failed to calculate safe width for text: {}", text, e);
         }
         return 0f;
     }
@@ -1549,6 +1497,7 @@ public class RedactionService {
                     return alt.repeat(Math.min(count, max));
                 }
             } catch (Exception ignored) {
+                log.error("Failed to calculate alternative placeholder width for {}", alt);
             }
         }
         return repeat;
@@ -1836,19 +1785,11 @@ public class RedactionService {
         return problematicRatio > 0.3;
     }
 
-    private void processResidualText(PDDocument document, PDPage page, List<Object> filtered) {
-        try {
-            var sem = wipeAllSemanticTextInTokens(filtered);
-            filtered = sem.tokens;
-            PDResources res = page.getResources();
-            if (res != null) {
-                wipeAllSemanticTextInProperties(res);
-                wipeAllTextInXObjects(document, res);
-                wipeAllTextInPatterns(document, res);
-            }
-            writeFilteredContentStream(document, page, filtered);
-        } catch (Exception ignored) {
-        }
+    private static String handleTjOperator(Object token, PDFont font) {
+        // Note: TJ vs Tj is different
+        return (token instanceof COSString cosString)
+                ? extractStringWithFallbacks(cosString, font)
+                : "";
     }
 
     public boolean performTextReplacement(
@@ -1965,21 +1906,11 @@ public class RedactionService {
         }
     }
 
-    private TokenModificationResult performTokenModification(
-            List<Object> tokens,
-            Object token,
-            String operatorName,
-            String newText,
-            float adjustment,
-            TextSegment segment,
-            List<MatchRange> matches) {
-        return switch (operatorName) {
-            case "Tj", "'", "\"" ->
-                    modifySimpleTextOperator(
-                            tokens, token, operatorName, newText, adjustment, segment);
-            case "TJ" -> modifyTJOperator(tokens, token, segment, matches);
-            default -> TokenModificationResult.failure("Unsupported operator: " + operatorName);
-        };
+    private static String handleQuotedOperator(Object token, PDFont font) {
+        // Do not add an extra newline; it shifts indices and breaks match ranges (important!!!)
+        return (token instanceof COSString cosString)
+                ? extractStringWithFallbacks(cosString, font)
+                : "";
     }
 
     private void processPages(
@@ -2003,155 +1934,36 @@ public class RedactionService {
         }
     }
 
-    private static String handleTjOperator(Object token, PDFont font) {
-        return (token instanceof COSString cosString)
-                ? extractStringWithFallbacks(cosString, font)
-                : "";
-    }
-
-    private List<Object> applyRedactionsToTokens(
-            List<Object> tokens, List<TextSegment> textSegments, List<MatchRange> matches) {
-        List<Object> newTokens = new ArrayList<>(tokens);
-        if (this.aggressiveMode) {
-            Map<Integer, List<AggressiveSegMatch>> perSeg = this.aggressiveSegMatches;
-            if (perSeg != null && !perSeg.isEmpty()) {
-                List<Integer> segIndices = new ArrayList<>(perSeg.keySet());
-                segIndices.sort(
-                        (a, b) ->
-                                Integer.compare(
-                                        textSegments.get(b).tokenIndex,
-                                        textSegments.get(a).tokenIndex));
-                for (Integer segIndex : segIndices) {
-                    TextSegment segment = textSegments.get(segIndex);
-                    List<AggressiveSegMatch> segMatches = perSeg.getOrDefault(segIndex, List.of());
-                    if (segMatches.isEmpty()) {
-                        continue;
-                    }
-                    Object token = newTokens.get(segment.tokenIndex);
-                    String opName = segment.operatorName;
-                    if (("Tj".equals(opName) || "'".equals(opName) || "\"".equals(opName))
-                            && token instanceof COSString cs) {
-                        COSString redacted =
-                                redactCosStringByDecodedRanges(segment.font, cs, segMatches);
-                        if (segment.font != null && segment.fontSize > 0) {
-                            String originalText = getDecodedString(cs, segment.font);
-                            String modifiedText = getDecodedString(redacted, segment.font);
-                            float wOrig =
-                                    calculateSafeWidth(
-                                            originalText, segment.font, segment.fontSize);
-                            float wMod =
-                                    calculateSafeWidth(
-                                            modifiedText, segment.font, segment.fontSize);
-                            float adjustment = wOrig - wMod;
-                            if (Math.abs(adjustment) > PRECISION_THRESHOLD) {
-                                COSArray arr = new COSArray();
-                                arr.add(redacted);
-                                float kerning =
-                                        (-adjustment / segment.fontSize) * FONT_SCALE_FACTOR;
-                                arr.add(new COSFloat(kerning));
-                                newTokens.set(segment.tokenIndex, arr);
-                                updateOperatorSafely(newTokens, segment.tokenIndex, opName);
-                            } else {
-                                newTokens.set(segment.tokenIndex, redacted);
-                            }
-                        } else {
-                            newTokens.set(segment.tokenIndex, redacted);
-                        }
-                    } else if ("TJ".equals(opName) && token instanceof COSArray arr) {
-                        COSArray redacted =
-                                redactTJArrayByDecodedRanges(segment.font, arr, segMatches);
-                        COSArray withKerning = buildKerningAdjustedTJArray(arr, redacted, segment);
-                        newTokens.set(segment.tokenIndex, withKerning);
-                    }
-                }
-                return newTokens;
-            }
-        }
-        Map<Integer, List<MatchRange>> matchesBySegment = new HashMap<>();
-        for (MatchRange match : matches) {
-            for (int i = 0; i < textSegments.size(); i++) {
-                TextSegment segment = textSegments.get(i);
-                int overlapStart = Math.max(match.startPos, segment.startPos);
-                int overlapEnd = Math.min(match.endPos, segment.endPos);
-                if (overlapStart < overlapEnd) {
-                    matchesBySegment.computeIfAbsent(i, k -> new ArrayList<>()).add(match);
-                }
-            }
-        }
-        List<ModificationTask> tasks = new ArrayList<>();
-        for (Map.Entry<Integer, List<MatchRange>> entry : matchesBySegment.entrySet()) {
-            int segmentIndex = entry.getKey();
-            List<MatchRange> segmentMatches = entry.getValue();
-            if (segmentIndex < 0 || segmentIndex >= textSegments.size()) {
-                continue;
-            }
-            TextSegment segment = textSegments.get(segmentIndex);
-            if (segment == null) {
-                continue;
-            }
-            try {
-                if ("Tj".equals(segment.operatorName)
-                        || "'".equals(segment.operatorName)
-                        || "\"".equals(segment.operatorName)) {
-                    String newText = applyRedactionsToSegmentText(segment, segmentMatches);
-                    if (newText == null) newText = "";
-                    float adjustment = calculateWidthAdjustment(segment, segmentMatches);
-                    tasks.add(new ModificationTask(segment, newText, adjustment));
-                } else if ("TJ".equals(segment.operatorName)) {
-                    tasks.add(new ModificationTask(segment, "", 0));
-                }
-            } catch (Exception e) {
-            }
-        }
-        tasks.sort((a, b) -> Integer.compare(b.segment.tokenIndex, a.segment.tokenIndex));
-        int maxTasksToProcess = Math.min(tasks.size(), 1000);
-        for (int i = 0; i < maxTasksToProcess && i < tasks.size(); i++) {
-            ModificationTask task = tasks.get(i);
-            try {
-                List<MatchRange> segmentMatches =
-                        matchesBySegment.getOrDefault(
-                                textSegments.indexOf(task.segment), Collections.emptyList());
-                if (task.segment.tokenIndex >= newTokens.size()) {
-                    continue;
-                }
-                if (task.segment.getText() == null || task.segment.getText().isEmpty()) {
-                    continue;
-                }
-                modifyTokenForRedaction(
-                        newTokens, task.segment, task.newText, task.adjustment, segmentMatches);
-            } catch (Exception e) {
-            }
-        }
-        return newTokens;
-    }
+    private static String extractTextFromToken(
+            Object token, String operatorName, PDFont currentFont) {
+        if (token == null || operatorName == null) return "";
+        try {
+            return switch (operatorName) {
+                case "Tj" -> handleTjOperator(token, currentFont);
+                case "'", "\"" -> handleQuotedOperator(token, currentFont);
+                case "TJ" -> handleTJOperator(token, currentFont);
+                default -> "";
+            };
+        } catch (Exception e) {
+            return "";
+        }
+    }
+
+    private void processResidualText(PDDocument document, PDPage page, List<Object> filtered) {
+        try {
+            var sem = wipeAllSemanticTextInTokens(filtered);
+            filtered = sem.tokens;
+            PDResources res = page.getResources();
+            if (res != null) {
+                wipeAllSemanticTextInProperties(res);
+                wipeAllTextInXObjects(document, res);
+                wipeAllTextInPatterns(document, res);
+            }
+            writeFilteredContentStream(document, page, filtered);
+        } catch (Exception ignored) {
+            log.debug("Error processing residual text: {}", ignored.getMessage());
+        }
+    }
 
     private List<TextSegment> extractTextSegmentsFromTokens(
@@ -2200,11 +2012,21 @@ public class RedactionService {
         return segments;
     }
 
-    private static String handleQuotedOperator(Object token, PDFont font) {
-        // Do not add an extra newline; it shifts indices and breaks match ranges
-        return (token instanceof COSString cosString)
-                ? extractStringWithFallbacks(cosString, font)
-                : "";
+    private TokenModificationResult performTokenModification(
+            List<Object> tokens,
+            Object token,
+            String operatorName,
+            String newText,
+            float adjustment,
+            TextSegment segment,
+            List<MatchRange> matches) {
+        return switch (operatorName) {
+            case "Tj", "'", "\"" ->
+                    modifySimpleTextOperator(
+                            tokens, token, operatorName, newText, adjustment, segment);
+            case "TJ" -> modifyTJOperator(tokens, token, segment, matches);
+            default -> TokenModificationResult.failure();
+        };
     }
 
     private List<MatchRange> findAllMatchesAggressive(
@@ -2461,24 +2283,151 @@ public class RedactionService {
         }
     }
 
-    private TokenModificationResult modifyTJOperator(
-            List<Object> tokens, Object token, TextSegment segment, List<MatchRange> matches) {
-        if (!(token instanceof COSArray originalArray)) {
-            return TokenModificationResult.failure("Expected COSArray for TJ operator");
-        }
-        try {
-            COSArray newArray = createRedactedTJArray(originalArray, segment, matches);
-            if (!isValidTJArray(newArray)) {
-                return TokenModificationResult.failure("Generated invalid TJ array");
-            }
-            tokens.set(segment.tokenIndex, newArray);
-            return TokenModificationResult.success();
-        } catch (Exception e) {
-            return TokenModificationResult.failure("TJ modification failed: " + e.getMessage());
-        }
-    }
+    private List<Object> applyRedactionsToTokens(
+            List<Object> tokens, List<TextSegment> textSegments, List<MatchRange> matches) {
+
+        List<Object> newTokens = new ArrayList<>(tokens);
+        if (this.aggressiveMode) {
+            Map<Integer, List<AggressiveSegMatch>> perSeg = this.aggressiveSegMatches;
+            if (perSeg != null && !perSeg.isEmpty()) {
+                List<Integer> segIndices = new ArrayList<>(perSeg.keySet());
+                segIndices.sort(
+                        (a, b) ->
+                                Integer.compare(
+                                        textSegments.get(b).tokenIndex,
+                                        textSegments.get(a).tokenIndex));
+                for (Integer segIndex : segIndices) {
+                    TextSegment segment = textSegments.get(segIndex);
+                    List<AggressiveSegMatch> segMatches = perSeg.getOrDefault(segIndex, List.of());
+                    if (segMatches.isEmpty()) {
+                        continue;
+                    }
+                    Object token = newTokens.get(segment.tokenIndex);
+                    String opName = segment.operatorName;
+                    if (("Tj".equals(opName) || "'".equals(opName) || "\"".equals(opName))
+                            && token instanceof COSString cs) {
+                        COSString redacted =
+                                redactCosStringByDecodedRanges(segment.font, cs, segMatches);
+                        if (segment.font != null && segment.fontSize > 0) {
+                            String originalText = getDecodedString(cs, segment.font);
+                            String modifiedText = getDecodedString(redacted, segment.font);
+                            float wOrig =
+                                    calculateSafeWidth(
+                                            originalText, segment.font, segment.fontSize);
+                            float wMod =
+                                    calculateSafeWidth(
+                                            modifiedText, segment.font, segment.fontSize);
+                            float adjustment = wOrig - wMod;
+                            if (Math.abs(adjustment) > PRECISION_THRESHOLD) {
+                                COSArray arr = new COSArray();
+                                arr.add(redacted);
+                                float kerning =
+                                        (-adjustment / segment.fontSize) * FONT_SCALE_FACTOR;
+                                arr.add(new COSFloat(kerning));
+                                newTokens.set(segment.tokenIndex, arr);
+                                updateOperatorSafely(newTokens, segment.tokenIndex, opName);
+                            } else {
+                                newTokens.set(segment.tokenIndex, redacted);
+                            }
+                        } else {
+                            newTokens.set(segment.tokenIndex, redacted);
+                        }
+                    } else if ("TJ".equals(opName) && token instanceof COSArray arr) {
+                        COSArray redacted =
+                                redactTJArrayByDecodedRanges(segment.font, arr, segMatches);
+                        COSArray withKerning = buildKerningAdjustedTJArray(arr, redacted, segment);
+                        newTokens.set(segment.tokenIndex, withKerning);
+                    }
+                }
+                return newTokens;
+            }
+        }
+        Map<Integer, List<MatchRange>> matchesBySegment = new HashMap<>();
+        for (MatchRange match : matches) {
+            for (int i = 0; i < textSegments.size(); i++) {
+                TextSegment segment = textSegments.get(i);
+                int overlapStart = Math.max(match.startPos, segment.startPos);
+                int overlapEnd = Math.min(match.endPos, segment.endPos);
+                if (overlapStart < overlapEnd) {
+                    matchesBySegment.computeIfAbsent(i, k -> new ArrayList<>()).add(match);
+                }
+            }
+        }
+        List<ModificationTask> tasks = new ArrayList<>();
+        for (Map.Entry<Integer, List<MatchRange>> entry : matchesBySegment.entrySet()) {
+            int segmentIndex = entry.getKey();
+            List<MatchRange> segmentMatches = entry.getValue();
+            if (segmentIndex < 0 || segmentIndex >= textSegments.size()) {
+                continue;
+            }
+            TextSegment segment = textSegments.get(segmentIndex);
+            if (segment == null) {
+                continue;
+            }
+            try {
+                if ("Tj".equals(segment.operatorName)
+                        || "'".equals(segment.operatorName)
+                        || "\"".equals(segment.operatorName)) {
+                    String newText = applyRedactionsToSegmentText(segment, segmentMatches);
+                    if (newText == null) newText = "";
+                    float adjustment = calculateWidthAdjustment(segment, segmentMatches);
+                    tasks.add(new ModificationTask(segment, newText, adjustment));
+                } else if ("TJ".equals(segment.operatorName)) {
+                    tasks.add(new ModificationTask(segment, "", 0));
+                }
+            } catch (Exception e) {
+                log.warn("Error processing token: {}", e.getMessage());
+            }
+        }
+        tasks.sort((a, b) -> Integer.compare(b.segment.tokenIndex, a.segment.tokenIndex));
+        int maxTasksToProcess = Math.min(tasks.size(), 1000);
+        for (int i = 0; i < maxTasksToProcess && i < tasks.size(); i++) {
+            ModificationTask task = tasks.get(i);
+            try {
+                List<MatchRange> segmentMatches =
+                        matchesBySegment.getOrDefault(
+                                textSegments.indexOf(task.segment), Collections.emptyList());
+                if (task.segment.tokenIndex >= newTokens.size()) {
+                    continue;
+                }
+                if (task.segment.getText() == null || task.segment.getText().isEmpty()) {
+                    continue;
+                }
+                modifyTokenForRedaction(
+                        newTokens, task.segment, task.newText, task.adjustment, segmentMatches);
+            } catch (Exception e) {
+            }
+        }
+        return newTokens;
+    }
 
     private static String extractStringWithFallbacks(COSString cosString, PDFont font) {
         if (cosString == null) return "";
@@ -2552,18 +2501,21 @@ public class RedactionService {
         }
     }
 
-    private String extractTextFromToken(Object token, String operatorName, PDFont currentFont) {
-        if (token == null || operatorName == null) return "";
-        try {
-            return switch (operatorName) {
-                case "Tj" -> handleTjOperator(token, currentFont);
-                case "'", "\"" -> handleQuotedOperator(token, currentFont);
-                case "TJ" -> handleTJOperator(token, currentFont);
-                default -> "";
-            };
-        } catch (Exception e) {
-            return "";
-        }
-    }
+    private TokenModificationResult modifyTJOperator(
+            List<Object> tokens, Object token, TextSegment segment, List<MatchRange> matches) {
+        if (!(token instanceof COSArray originalArray)) {
+            return TokenModificationResult.failure();
+        }
+        try {
+            COSArray newArray = createRedactedTJArray(originalArray, segment, matches);
+            if (!isValidTJArray(newArray)) {
+                return TokenModificationResult.failure();
+            }
+            tokens.set(segment.tokenIndex, newArray);
+            return TokenModificationResult.success();
+        } catch (Exception e) {
+            return TokenModificationResult.failure();
+        }
+    }
@@ -2791,12 +2743,7 @@ public class RedactionService {
         }
     }
 
-    private record WidthCalculationResult(float adjustment, int processedMatches) {
-        private WidthCalculationResult(float adjustment, int processedMatches) {
-            this.adjustment = adjustment;
-            this.processedMatches = processedMatches;
-        }
-    }
+    private record WidthCalculationResult(float adjustment, int processedMatches) {}
 
     public enum FallbackStrategy {
         EMBED_WIDTH,
@@ -2807,16 +2754,16 @@ public class RedactionService {
     private static class TokenModificationResult {
         @Getter private final boolean success;
 
-        private TokenModificationResult(boolean success, String errorMessage) {
+        private TokenModificationResult(boolean success) {
             this.success = success;
         }
 
         public static TokenModificationResult success() {
-            return new TokenModificationResult(true, null);
+            return new TokenModificationResult(true);
         }
 
-        public static TokenModificationResult failure(String errorMessage) {
-            return new TokenModificationResult(false, errorMessage);
+        public static TokenModificationResult failure() {
+            return new TokenModificationResult(false);
         }
     }
@@ -2883,15 +2830,6 @@ public class RedactionService {
         int modifications;
     }
 
-    public enum ScrubOption {
-        REMOVE_ACTUALTEXT,
-        REMOVE_ALT,
-        REMOVE_TU,
-        NORMALIZE_WHITESPACE
-    }
-
-    public interface SemanticScrubber {}
-
     private static class GlyphCoverageProbe {
         private final PDFont font;
         private final Set<Integer> availableGlyphs;
@@ -2901,7 +2839,7 @@ public class RedactionService {
             this.availableGlyphs = buildGlyphCoverage(font);
         }
 
-        private Set<Integer> buildGlyphCoverage(PDFont font) {
+        private static Set<Integer> buildGlyphCoverage(PDFont font) {
            Set<Integer> coverage = new HashSet<>();
            if (font == null) return coverage;
@@ -2938,21 +2876,8 @@ public class RedactionService {
             }
         }
 
-        public float getWidthWithFallback(
-                int codePoint, FallbackStrategy strategy, float fontSize) {
-            if (hasGlyph(codePoint)) {
-                try {
-                    String charStr = new String(Character.toChars(codePoint));
-                    return font.getStringWidth(charStr) / FONT_SCALE_FACTOR * fontSize;
-                } catch (Exception e) {
-                    log.debug("Failed to get width for codepoint {}", codePoint, e);
-                }
-            }
-            return switch (strategy) {
-                case EMBED_WIDTH -> getEmbeddedProgramWidth(fontSize);
-                case AVERAGE_WIDTH -> getAverageFontWidth(fontSize);
-                case LEGACY_SUM -> getLegacySumFallback(codePoint, fontSize);
-            };
+        private static float getLegacySumFallback(float fontSize) {
+            return fontSize * 0.6f;
         }
 
         private float getEmbeddedProgramWidth(float fontSize) {
@@ -3002,110 +2927,21 @@ public class RedactionService {
             }
         }
 
-        private static float getLegacySumFallback(int codePoint, float fontSize) {
-            return fontSize * 0.6f;
-        }
-    }
-
-    public static class DefaultSemanticScrubber implements SemanticScrubber {
-        private void scrub(PDDocument document, Set<ScrubOption> options) {
-            if (document == null || options == null || options.isEmpty()) {
-                return;
-            }
-            try {
-                scrubStructureTree(document, options);
-                if (options.contains(ScrubOption.REMOVE_ACTUALTEXT)
-                        || options.contains(ScrubOption.REMOVE_ALT)
-                        || options.contains(ScrubOption.REMOVE_TU)) {
-                    scrubAnnotations(document, options);
-                }
-            } catch (Exception e) {
-                log.debug("Failed to scrub document", e);
-            }
-        }
-
-        private void scrubStructureTree(PDDocument document, Set<ScrubOption> options) {
-            try {
-                COSDictionary catalog = document.getDocumentCatalog().getCOSObject();
-                COSBase structTreeRoot = catalog.getDictionaryObject(COSName.STRUCT_TREE_ROOT);
-                if (structTreeRoot instanceof COSDictionary structRoot) {
-                    scrubStructureElement(structRoot, options);
-                }
-            } catch (Exception e) {
-                log.debug("Failed to scrub structure tree", e);
-            }
-        }
-
-        private static void scrubStructureElement(COSDictionary element, Set<ScrubOption> options) {
-            if (element == null) return;
-            if (options.contains(ScrubOption.REMOVE_ACTUALTEXT)) {
-                element.removeItem(COSName.ACTUAL_TEXT);
-            }
-            if (options.contains(ScrubOption.REMOVE_ALT)) {
-                element.removeItem(COSName.ALT);
-            }
-            if (options.contains(ScrubOption.REMOVE_TU)) {
-                element.removeItem(COSName.TU);
-            }
-            if (options.contains(ScrubOption.NORMALIZE_WHITESPACE)) {
-                normalizeWhitespaceInElement(element);
-            }
-            COSBase kids = element.getDictionaryObject(COSName.K);
-            if (kids instanceof COSArray kidsArray) {
-                for (COSBase kid : kidsArray) {
-                    if (kid instanceof COSDictionary kidDict) {
-                        scrubStructureElement(kidDict, options);
-                    }
-                }
-            } else if (kids instanceof COSDictionary kidDict) {
-                scrubStructureElement(kidDict, options);
-            }
-        }
-
-        private static void normalizeWhitespaceInElement(COSDictionary element) {
-            for (COSName key : List.of(COSName.ACTUAL_TEXT, COSName.ALT, COSName.TU)) {
-                COSBase value = element.getDictionaryObject(key);
-                if (value instanceof COSString cosString) {
-                    String text = cosString.getString();
-                    String normalized = text.replaceAll("\\s+", " ").trim();
-                    if (normalized.length() > 256) {
-                        normalized = normalized.substring(0, 256);
-                    }
-                    element.setString(key, normalized);
-                }
-            }
-        }
-
-        private void scrubAnnotations(PDDocument document, Set<ScrubOption> options) {
-            try {
-                for (PDPage page : document.getPages()) {
-                    for (PDAnnotation annotation : page.getAnnotations()) {
-                        COSDictionary annotDict = annotation.getCOSObject();
-                        if (options.contains(ScrubOption.REMOVE_ACTUALTEXT)) {
-                            annotDict.removeItem(COSName.ACTUAL_TEXT);
-                        }
-                        if (options.contains(ScrubOption.REMOVE_ALT)) {
-                            annotDict.removeItem(COSName.ALT);
-                        }
-                        if (options.contains(ScrubOption.REMOVE_TU)) {
-                            annotDict.removeItem(COSName.TU);
-                        }
-                    }
-                }
-            } catch (Exception e) {
-                log.debug("Failed to scrub annotations", e);
-            }
-        }
-    }
+        public float getWidthWithFallback(
+                int codePoint, FallbackStrategy strategy, float fontSize) {
+            if (hasGlyph(codePoint)) {
+                try {
+                    String charStr = new String(Character.toChars(codePoint));
+                    return font.getStringWidth(charStr) / FONT_SCALE_FACTOR * fontSize;
+                } catch (Exception e) {
+                    log.debug("Failed to get width for codepoint {}", codePoint, e);
+                }
+            }
+            return switch (strategy) {
+                case EMBED_WIDTH -> getEmbeddedProgramWidth(fontSize);
+                case AVERAGE_WIDTH -> getAverageFontWidth(fontSize);
+                case LEGACY_SUM -> getLegacySumFallback(fontSize);
+            };
+        }
+    }
 }