enhance placeholder creation and width calculation with improved error handling and logging

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
Balázs Szücs 2025-08-22 20:08:17 +02:00
parent 8c6aa246a7
commit 04d9b6ace2
2 changed files with 966 additions and 236 deletions

View File

@ -41,6 +41,7 @@ import org.springframework.web.multipart.MultipartFile;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.Data; import lombok.Data;
import lombok.Getter;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
@ -704,55 +705,134 @@ public class RedactionService {
} }
String createPlaceholderWithFont(String originalWord, PDFont font) { String createPlaceholderWithFont(String originalWord, PDFont font) {
if (originalWord == null || originalWord.isEmpty()) { try {
return originalWord; if (originalWord == null || originalWord.isEmpty()) {
} log.debug(
if (font != null && TextEncodingHelper.isFontSubset(font.getName())) { "createPlaceholderWithFont: originalWord is null or empty, returning space");
try { return " ";
float originalWidth = safeGetStringWidth(font, originalWord) / FONT_SCALE_FACTOR;
return createAlternativePlaceholder(originalWord, originalWidth, font, 1.0f);
} catch (Exception e) {
return "";
} }
if (font != null && TextEncodingHelper.isFontSubset(font.getName())) {
try {
float originalWidth =
safeGetStringWidth(font, originalWord) / FONT_SCALE_FACTOR;
String result =
createAlternativePlaceholder(originalWord, originalWidth, font, 1.0f);
if (result == null) {
log.warn("createAlternativePlaceholder returned null, using fallback");
return " ".repeat(Math.max(1, originalWord.length()));
}
return result;
} catch (Exception e) {
log.debug(
"Error in createPlaceholderWithFont subset logic: {}", e.getMessage());
return " ".repeat(Math.max(1, originalWord.length()));
}
}
int length = Math.max(1, originalWord.length());
String result = " ".repeat(length);
log.debug("createPlaceholderWithFont: returning '{}' for '{}'", result, originalWord);
return result;
} catch (Exception e) {
log.error("Unexpected error in createPlaceholderWithFont: {}", e.getMessage());
return " ";
} }
return " ".repeat(originalWord.length());
} }
String createPlaceholderWithWidth( String createPlaceholderWithWidth(
String originalWord, float targetWidth, PDFont font, float fontSize) { String originalWord, float targetWidth, PDFont font, float fontSize) {
if (originalWord == null || originalWord.isEmpty()) {
return originalWord;
}
if (font == null || fontSize <= 0) {
return " ".repeat(originalWord.length());
}
try { try {
if (originalWord == null || originalWord.isEmpty()) {
log.debug(
"createPlaceholderWithWidth: originalWord is null or empty, returning space");
return " ";
}
if (font == null || fontSize <= 0) {
int length = Math.max(1, originalWord.length());
String result = " ".repeat(length);
log.debug(
"createPlaceholderWithWidth: invalid font/size, returning '{}' for '{}'",
result,
originalWord);
return result;
}
if (!WidthCalculator.isWidthCalculationReliable(font)) { if (!WidthCalculator.isWidthCalculationReliable(font)) {
return " ".repeat(originalWord.length()); int length = Math.max(1, originalWord.length());
String result = " ".repeat(length);
log.debug(
"createPlaceholderWithWidth: font not reliable, returning '{}' for '{}'",
result,
originalWord);
return result;
} }
if (TextEncodingHelper.isFontSubset(font.getName())) { if (TextEncodingHelper.isFontSubset(font.getName())) {
return createSubsetFontPlaceholder(originalWord, targetWidth, font, fontSize); String result =
createSubsetFontPlaceholder(originalWord, targetWidth, font, fontSize);
if (result == null) {
log.warn("createSubsetFontPlaceholder returned null, using fallback");
return " ".repeat(Math.max(1, originalWord.length()));
}
log.debug(
"createPlaceholderWithWidth: subset font, returning '{}' for '{}'",
result,
originalWord);
return result;
} }
float spaceWidth = WidthCalculator.calculateAccurateWidth(font, " ", fontSize);
if (spaceWidth <= 0) { try {
return createAlternativePlaceholder(originalWord, targetWidth, font, fontSize); float spaceWidth = WidthCalculator.calculateAccurateWidth(font, " ", fontSize);
if (spaceWidth <= 0) {
log.debug(
"createPlaceholderWithWidth: invalid space width, using alternative placeholder");
return createAlternativePlaceholder(originalWord, targetWidth, font, fontSize);
}
int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth));
int maxSpaces =
Math.max(
originalWord.length() * 2,
Math.round(targetWidth / spaceWidth * 1.5f));
int finalSpaces = Math.min(spaceCount, maxSpaces);
String result = " ".repeat(finalSpaces);
log.debug(
"createPlaceholderWithWidth: calculated {} spaces for '{}' (targetWidth: {}, spaceWidth: {})",
finalSpaces,
originalWord,
targetWidth,
spaceWidth);
return result;
} catch (Exception e) {
log.debug("Error calculating space width, using alternative: {}", e.getMessage());
String result =
createAlternativePlaceholder(originalWord, targetWidth, font, fontSize);
if (result == null) {
return " ".repeat(Math.max(1, originalWord.length()));
}
return result;
} }
int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth));
int maxSpaces =
Math.max(
originalWord.length() * 2, Math.round(targetWidth / spaceWidth * 1.5f));
return " ".repeat(Math.min(spaceCount, maxSpaces));
} catch (Exception e) { } catch (Exception e) {
return createAlternativePlaceholder(originalWord, targetWidth, font, fontSize); log.error("Unexpected error in createPlaceholderWithWidth: {}", e.getMessage());
return " ".repeat(Math.max(1, originalWord.length()));
} }
} }
private String createSubsetFontPlaceholder( private String createSubsetFontPlaceholder(
String originalWord, float targetWidth, PDFont font, float fontSize) { String originalWord, float targetWidth, PDFont font, float fontSize) {
try { try {
return createAlternativePlaceholder(originalWord, targetWidth, font, fontSize); String result = createAlternativePlaceholder(originalWord, targetWidth, font, fontSize);
if (result == null) {
log.warn(
"createAlternativePlaceholder returned null in subset font, using fallback");
return " ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1));
}
return result;
} catch (Exception e) { } catch (Exception e) {
return ""; log.error("Error in createSubsetFontPlaceholder: {}", e.getMessage());
return " ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1));
} }
} }
@ -785,9 +865,12 @@ public class RedactionService {
} catch (Exception ignored) { } catch (Exception ignored) {
} }
} }
return ""; log.debug(
"createAlternativePlaceholder: no suitable alternative found, returning spaces");
return " ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1));
} catch (Exception e) { } catch (Exception e) {
return ""; log.error("Unexpected error in createAlternativePlaceholder: {}", e.getMessage());
return " ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1));
} }
} }
@ -971,7 +1054,7 @@ public class RedactionService {
} }
if (isTextShowingOperator(opName) && i > 0) { if (isTextShowingOperator(opName) && i > 0) {
String textContent = extractTextFromToken(tokens.get(i - 1), opName, gs.font); String textContent = extractTextFromToken(tokens.get(i - 1), opName, gs.font);
if (!textContent.isEmpty()) { if (textContent != null && !textContent.trim().isEmpty()) {
if (aggressive if (aggressive
&& gs.font != null && gs.font != null
&& tokens.get(i - 1) instanceof COSString cs) { && tokens.get(i - 1) instanceof COSString cs) {
@ -1017,7 +1100,7 @@ public class RedactionService {
} }
if (isTextShowingOperator(opName) && i > 0) { if (isTextShowingOperator(opName) && i > 0) {
String textContent = extractTextFromToken(tokens.get(i - 1), opName, gs.font); String textContent = extractTextFromToken(tokens.get(i - 1), opName, gs.font);
if (!textContent.isEmpty()) { if (textContent != null && !textContent.trim().isEmpty()) {
segments.add( segments.add(
new TextSegment( new TextSegment(
i - 1, i - 1,
@ -1070,11 +1153,14 @@ public class RedactionService {
} }
List<TextSegment> textSegments = extractTextSegments(page, tokens, this.aggressiveMode); List<TextSegment> textSegments = extractTextSegments(page, tokens, this.aggressiveMode);
String completeText = buildCompleteText(textSegments); String completeText = buildCompleteText(textSegments);
List<MatchRange> matches = List<MatchRange> matches;
this.aggressiveMode if (this.aggressiveMode) {
? findAllMatchesAggressive( matches =
textSegments, tokens, targetWords, useRegex, wholeWordSearch) findAllMatchesAggressive(
: findAllMatches(completeText, targetWords, useRegex, wholeWordSearch); textSegments, tokens, targetWords, useRegex, wholeWordSearch);
} else {
matches = findMatchesInSegments(textSegments, targetWords, useRegex, wholeWordSearch);
}
return applyRedactionsToTokens(tokens, textSegments, matches); return applyRedactionsToTokens(tokens, textSegments, matches);
} }
@ -1329,49 +1415,65 @@ public class RedactionService {
} }
private String applyRedactionsToSegmentText(TextSegment segment, List<MatchRange> matches) { private String applyRedactionsToSegmentText(TextSegment segment, List<MatchRange> matches) {
if (segment == null || matches == null || matches.isEmpty()) {
return segment != null && segment.getText() != null ? segment.getText() : "";
}
String text = segment.getText(); String text = segment.getText();
if (!this.aggressiveMode if (text == null) return "";
if (!aggressiveMode
&& segment.getFont() != null && segment.getFont() != null
&& !TextEncodingHelper.isTextSegmentRemovable(segment.getFont(), text)) { && !TextEncodingHelper.isTextSegmentRemovable(segment.getFont(), text)) {
return text; return text;
} }
StringBuilder result = new StringBuilder(text); try {
for (MatchRange match : matches) { StringBuilder result = new StringBuilder(text);
int segmentStart = Math.max(0, match.getStartPos() - segment.getStartPos()); for (MatchRange match : matches) {
int segmentEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos()); int segmentStart = Math.max(0, match.getStartPos() - segment.getStartPos());
if (segmentStart < text.length() && segmentEnd > segmentStart) { int segmentEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos());
String originalPart = text.substring(segmentStart, segmentEnd);
if (!this.aggressiveMode
&& segment.getFont() != null
&& !TextEncodingHelper.isTextSegmentRemovable(
segment.getFont(), originalPart)) {
continue;
}
if (this.aggressiveMode) { if (segmentStart < text.length() && segmentEnd > segmentStart) {
result.replace(segmentStart, segmentEnd, ""); String originalPart = text.substring(segmentStart, segmentEnd);
} else {
float originalWidth = 0; if (!aggressiveMode
if (segment.getFont() != null && segment.getFontSize() > 0) { && segment.getFont() != null
originalWidth = && !TextEncodingHelper.isTextSegmentRemovable(
safeGetStringWidth(segment.getFont(), originalPart) segment.getFont(), originalPart)) {
/ FONT_SCALE_FACTOR continue;
* segment.getFontSize(); }
if (aggressiveMode) {
result.replace(segmentStart, segmentEnd, "");
} else {
float originalWidth = 0;
if (segment.getFont() != null && segment.getFontSize() > 0) {
originalWidth =
safeGetStringWidth(segment.getFont(), originalPart)
/ FONT_SCALE_FACTOR
* segment.getFontSize();
}
String placeholder =
originalWidth > 0
? createPlaceholderWithWidth(
originalPart,
originalWidth,
segment.getFont(),
segment.getFontSize())
: createPlaceholderWithFont(
originalPart, segment.getFont());
if (placeholder == null) placeholder = " ";
result.replace(segmentStart, segmentEnd, placeholder);
} }
String placeholder =
(originalWidth > 0)
? createPlaceholderWithWidth(
originalPart,
originalWidth,
segment.getFont(),
segment.getFontSize())
: createPlaceholderWithFont(originalPart, segment.getFont());
result.replace(segmentStart, segmentEnd, placeholder);
} }
} }
return result.toString();
} catch (Exception e) {
return text;
} }
return result.toString();
} }
private List<MatchRange> findAllMatchesAggressive( private List<MatchRange> findAllMatchesAggressive(
@ -1569,6 +1671,50 @@ public class RedactionService {
return result; return result;
} }
/**
 * Searches each text segment independently for the target words and returns the matches as
 * ranges in the coordinate space defined by {@code segment.getStartPos()}.
 *
 * <p>Segments with null or empty text are skipped, as are segments whose font is set and for
 * which {@code TextEncodingHelper.isTextSegmentRemovable} reports the text as not removable.
 * Because every segment is matched on its own, a match never spans two segments.
 *
 * @param segments text segments to scan
 * @param targetWords words (or regular expressions when {@code useRegex} is set) to look for
 * @param useRegex passed through to {@code TextFinderUtils.createOptimizedSearchPatterns}
 * @param wholeWordSearch passed through to {@code TextFinderUtils.createOptimizedSearchPatterns}
 * @return non-empty matches, sorted by ascending start position
 */
private List<MatchRange> findMatchesInSegments(
        List<TextSegment> segments,
        Set<String> targetWords,
        boolean useRegex,
        boolean wholeWordSearch) {
    List<MatchRange> allMatches = new ArrayList<>();
    List<Pattern> patterns =
            TextFinderUtils.createOptimizedSearchPatterns(
                    targetWords, useRegex, wholeWordSearch);

    for (TextSegment segment : segments) {
        String segmentText = segment.getText();
        if (segmentText == null || segmentText.isEmpty()) {
            continue;
        }

        if (segment.getFont() != null
                && !TextEncodingHelper.isTextSegmentRemovable(segment.getFont(), segmentText)) {
            continue;
        }

        int segmentOffset = segment.getStartPos();
        for (Pattern pattern : patterns) {
            try {
                var matcher = pattern.matcher(segmentText);
                while (matcher.find()) {
                    int matchStart = matcher.start();
                    int matchEnd = matcher.end();
                    // Matcher guarantees 0 <= start <= end <= input length, so only
                    // zero-length matches need to be filtered out here.
                    if (matchStart < matchEnd) {
                        allMatches.add(
                                new MatchRange(
                                        segmentOffset + matchStart, segmentOffset + matchEnd));
                    }
                }
            } catch (Exception e) {
                // Best effort: keep scanning the remaining patterns and segments, but leave a
                // trace, because a failed match means text may stay unredacted. The segment
                // text and the pattern are deliberately not logged (they may be sensitive).
                log.warn(
                        "Pattern matching failed for segment starting at {}: {}",
                        segmentOffset,
                        e.getMessage());
            }
        }
    }

    allMatches.sort(Comparator.comparingInt(MatchRange::getStartPos));
    return allMatches;
}
private List<Object> applyRedactionsToTokens( private List<Object> applyRedactionsToTokens(
List<Object> tokens, List<TextSegment> textSegments, List<MatchRange> matches) { List<Object> tokens, List<TextSegment> textSegments, List<MatchRange> matches) {
List<Object> newTokens = new ArrayList<>(tokens); List<Object> newTokens = new ArrayList<>(tokens);
@ -1618,24 +1764,97 @@ public class RedactionService {
for (Map.Entry<Integer, List<MatchRange>> entry : matchesBySegment.entrySet()) { for (Map.Entry<Integer, List<MatchRange>> entry : matchesBySegment.entrySet()) {
int segmentIndex = entry.getKey(); int segmentIndex = entry.getKey();
List<MatchRange> segmentMatches = entry.getValue(); List<MatchRange> segmentMatches = entry.getValue();
if (segmentIndex < 0 || segmentIndex >= textSegments.size()) {
log.warn(
"Invalid segment index: {} (textSegments size: {})",
segmentIndex,
textSegments.size());
continue;
}
TextSegment segment = textSegments.get(segmentIndex); TextSegment segment = textSegments.get(segmentIndex);
if ("Tj".equals(segment.operatorName) || "'".equals(segment.operatorName)) { if (segment == null) {
String newText = applyRedactionsToSegmentText(segment, segmentMatches); log.warn("Segment is null at index: {}", segmentIndex);
float adjustment = 0; continue;
adjustment = calculateWidthAdjustment(segment, segmentMatches); }
tasks.add(new ModificationTask(segment, newText, adjustment));
} else if ("TJ".equals(segment.operatorName)) { log.debug(
tasks.add(new ModificationTask(segment, null, 0)); "Creating task for segment {} with operator '{}' and {} matches",
segmentIndex,
segment.operatorName,
segmentMatches.size());
try {
if ("Tj".equals(segment.operatorName) || "'".equals(segment.operatorName)) {
String newText = applyRedactionsToSegmentText(segment, segmentMatches);
if (newText == null) {
log.warn(
"applyRedactionsToSegmentText returned null for segment {}, using empty string",
segmentIndex);
newText = ""; // Ensure it's never null
}
float adjustment = calculateWidthAdjustment(segment, segmentMatches);
tasks.add(new ModificationTask(segment, newText, adjustment));
log.debug(
"Created Tj/' task with newText: '{}' (length: {})",
newText,
newText.length());
} else if ("TJ".equals(segment.operatorName)) {
tasks.add(
new ModificationTask(
segment, "", 0)); // Use empty string instead of null for TJ
log.debug("Created TJ task with empty newText (was null)");
} else {
log.debug("Skipping segment with operator: {}", segment.operatorName);
}
} catch (Exception e) {
log.error("Error creating task for segment {}: {}", segmentIndex, e.getMessage());
} }
} }
tasks.sort((a, b) -> Integer.compare(b.segment.tokenIndex, a.segment.tokenIndex)); tasks.sort((a, b) -> Integer.compare(b.segment.tokenIndex, a.segment.tokenIndex));
for (ModificationTask task : tasks) {
List<MatchRange> segmentMatches = int processedCount = 0;
matchesBySegment.getOrDefault( int maxTasksToProcess = Math.min(tasks.size(), 1000); // Safety limit
textSegments.indexOf(task.segment), Collections.emptyList());
modifyTokenForRedaction( for (int i = 0; i < maxTasksToProcess && i < tasks.size(); i++) {
newTokens, task.segment, task.newText, task.adjustment, segmentMatches); ModificationTask task = tasks.get(i);
try {
List<MatchRange> segmentMatches =
matchesBySegment.getOrDefault(
textSegments.indexOf(task.segment), Collections.emptyList());
if (task.segment.tokenIndex >= newTokens.size()) {
log.debug(
"Skipping segment with invalid token index {} (tokens size: {})",
task.segment.tokenIndex,
newTokens.size());
continue;
}
if (task.segment.getText() == null || task.segment.getText().isEmpty()) {
log.debug(
"Skipping segment with empty text at index {}",
task.segment.tokenIndex);
continue;
}
modifyTokenForRedaction(
newTokens, task.segment, task.newText, task.adjustment, segmentMatches);
processedCount++;
} catch (Exception e) {
log.warn(
"Failed to process modification task for segment at {}: {}",
task.segment.tokenIndex,
e.getMessage());
}
} }
log.debug(
"Successfully processed {} out of {} modification tasks",
processedCount,
tasks.size());
return newTokens; return newTokens;
} }
@ -1837,20 +2056,67 @@ public class RedactionService {
String newText, String newText,
float adjustment, float adjustment,
List<MatchRange> matches) { List<MatchRange> matches) {
if (tokens == null // Defensive null handling
|| segment == null if (tokens == null || segment == null) {
|| newText == null log.warn(
|| !isValidTokenIndex(tokens, segment.tokenIndex) "Invalid input to modifyTokenForRedaction: tokens={}, segment={}",
|| segment.operatorName == null) { tokens == null ? "null" : "valid",
log.warn("Invalid input to modifyTokenForRedaction"); segment == null ? "null" : "valid");
return;
}
// Handle null newText by providing a default
if (newText == null) {
log.warn("newText is null, providing default empty string");
log.warn(
"Segment details: tokenIndex={}, operatorName={}, font={}, fontSize={}, text='{}'",
segment.tokenIndex,
segment.operatorName,
segment.getFont() != null ? segment.getFont().getName() : "null",
segment.getFontSize(),
segment.getText() != null ? segment.getText() : "null");
log.warn("This should not happen with the new null safety measures!");
newText = ""; // Default to empty string
}
if (!isValidTokenIndex(tokens, segment.tokenIndex)) {
log.warn(
"Invalid input to modifyTokenForRedaction: invalid token index {} (tokens size: {})",
segment.tokenIndex,
tokens.size());
log.debug(
"Segment details: operator={}, font={}, fontSize={}, startPos={}, endPos={}",
segment.operatorName,
segment.getFont(),
segment.getFontSize(),
segment.getStartPos(),
segment.getEndPos());
return;
}
if (segment.operatorName == null) {
log.warn("Invalid input to modifyTokenForRedaction: operatorName is null");
return; return;
} }
try { try {
Object token = tokens.get(segment.tokenIndex);
if (token == null) {
log.warn("Token at index {} is null, skipping modification", segment.tokenIndex);
return;
}
if (!isValidTokenForOperator(token, segment.operatorName)) {
log.warn(
"Token at index {} is not valid for operator {}, skipping modification",
segment.tokenIndex,
segment.operatorName);
return;
}
TokenModificationResult result = TokenModificationResult result =
performTokenModification( performTokenModification(
tokens, tokens,
tokens.get(segment.tokenIndex), token,
segment.operatorName, segment.operatorName,
newText, newText,
adjustment, adjustment,
@ -1860,12 +2126,24 @@ public class RedactionService {
if (!result.isSuccess()) { if (!result.isSuccess()) {
performFallbackModification(tokens, segment.tokenIndex, newText); performFallbackModification(tokens, segment.tokenIndex, newText);
} }
} catch (IndexOutOfBoundsException e) {
log.warn(
"Token index {} is out of bounds (tokens size: {}), skipping modification",
segment.tokenIndex,
tokens.size());
} catch (Exception e) { } catch (Exception e) {
log.error( log.error(
"Token modification failed at index {}: {}", "Token modification failed at index {}: {}",
segment.tokenIndex, segment.tokenIndex,
e.getMessage()); e.getMessage());
performEmergencyFallback(tokens, segment.tokenIndex); try {
performEmergencyFallback(tokens, segment.tokenIndex);
} catch (Exception emergencyError) {
log.error(
"Emergency fallback also failed at index {}: {}",
segment.tokenIndex,
emergencyError.getMessage());
}
} }
} }
@ -1873,6 +2151,18 @@ public class RedactionService {
return index >= 0 && index < tokens.size(); return index >= 0 && index < tokens.size();
} }
/**
 * Checks that the operand token has the type the given text-showing operator is handled with.
 *
 * <p>{@code TJ} requires a {@code COSArray}; {@code Tj}, {@code '} and {@code "} require a
 * {@code COSString}. Any other operator name is accepted without a type check.
 *
 * @param token operand token that precedes the operator
 * @param operatorName name of the operator the token belongs to
 * @return {@code false} if either argument is null or the token type does not fit the operator
 */
private boolean isValidTokenForOperator(Object token, String operatorName) {
    if (token == null || operatorName == null) {
        return false;
    }

    if ("TJ".equals(operatorName)) {
        return token instanceof COSArray;
    }

    boolean expectsString =
            "Tj".equals(operatorName) || "'".equals(operatorName) || "\"".equals(operatorName);
    // Operators outside the known set are not type-checked.
    return !expectsString || token instanceof COSString;
}
private COSArray createRedactedTJArray( private COSArray createRedactedTJArray(
COSArray originalArray, TextSegment segment, List<MatchRange> matches) { COSArray originalArray, TextSegment segment, List<MatchRange> matches) {
@ -2403,29 +2693,44 @@ public class RedactionService {
return totalMods; return totalMods;
} }
private static class WidthCalculationResult { private List<TextSegment> extractTextSegmentsFromXObject(
private final float adjustment; PDResources resources, List<Object> tokens) {
private final int processedMatches; List<TextSegment> segments = new ArrayList<>();
private final List<String> warnings; int currentTextPos = 0;
GraphicsState gs = new GraphicsState();
public WidthCalculationResult( for (int i = 0; i < tokens.size(); i++) {
float adjustment, int processedMatches, List<String> warnings) { Object currentToken = tokens.get(i);
this.adjustment = adjustment; if (currentToken instanceof Operator op) {
this.processedMatches = processedMatches; String opName = op.getName();
this.warnings = new ArrayList<>(warnings); if ("Tf".equals(opName) && i >= 2) {
} try {
COSName fontName = (COSName) tokens.get(i - 2);
public float getAdjustment() { COSBase fontSizeBase = (COSBase) tokens.get(i - 1);
return adjustment; if (fontSizeBase instanceof COSNumber cosNumber) {
} gs.setFont(resources.getFont(fontName));
gs.setFontSize(cosNumber.floatValue());
public int getProcessedMatches() { }
return processedMatches; } catch (ClassCastException | IOException ignored) {
} }
}
public List<String> getWarnings() { if (isTextShowingOperator(opName) && i > 0) {
return new ArrayList<>(warnings); String textContent = extractTextFromToken(tokens.get(i - 1), opName, gs.font);
if (textContent != null && !textContent.trim().isEmpty()) {
segments.add(
new TextSegment(
i - 1,
opName,
textContent,
currentTextPos,
currentTextPos + textContent.length(),
gs.font,
gs.fontSize));
currentTextPos += textContent.length();
}
}
}
} }
return segments;
} }
private int wipeAllTextInFormXObject(PDDocument document, PDFormXObject formXObject) private int wipeAllTextInFormXObject(PDDocument document, PDFormXObject formXObject)
@ -2485,25 +2790,20 @@ public class RedactionService {
} }
} }
private static class TokenModificationResult { private static class WidthCalculationResult {
private final boolean success; @Getter private final float adjustment;
private final String errorMessage; @Getter private final int processedMatches;
private final List<String> warnings;
private TokenModificationResult(boolean success, String errorMessage) { public WidthCalculationResult(
this.success = success; float adjustment, int processedMatches, List<String> warnings) {
this.errorMessage = errorMessage; this.adjustment = adjustment;
this.processedMatches = processedMatches;
this.warnings = new ArrayList<>(warnings);
} }
public static TokenModificationResult success() { public List<String> getWarnings() {
return new TokenModificationResult(true, null); return new ArrayList<>(warnings);
}
public static TokenModificationResult failure(String errorMessage) {
return new TokenModificationResult(false, errorMessage);
}
public boolean isSuccess() {
return success;
} }
} }
@ -2556,44 +2856,22 @@ public class RedactionService {
} }
} }
private List<TextSegment> extractTextSegmentsFromXObject( private static class TokenModificationResult {
PDResources resources, List<Object> tokens) { @Getter private final boolean success;
List<TextSegment> segments = new ArrayList<>(); private final String errorMessage;
int currentTextPos = 0;
GraphicsState gs = new GraphicsState(); private TokenModificationResult(boolean success, String errorMessage) {
for (int i = 0; i < tokens.size(); i++) { this.success = success;
Object currentToken = tokens.get(i); this.errorMessage = errorMessage;
if (currentToken instanceof Operator op) { }
String opName = op.getName();
if ("Tf".equals(opName) && i >= 2) { public static TokenModificationResult success() {
try { return new TokenModificationResult(true, null);
COSName fontName = (COSName) tokens.get(i - 2); }
COSBase fontSizeBase = (COSBase) tokens.get(i - 1);
if (fontSizeBase instanceof COSNumber cosNumber) { public static TokenModificationResult failure(String errorMessage) {
gs.setFont(resources.getFont(fontName)); return new TokenModificationResult(false, errorMessage);
gs.setFontSize(cosNumber.floatValue());
}
} catch (ClassCastException | IOException ignored) {
}
}
if (isTextShowingOperator(opName) && i > 0) {
String textContent = extractTextFromToken(tokens.get(i - 1), opName, gs.font);
if (!textContent.isEmpty()) {
segments.add(
new TextSegment(
i - 1,
opName,
textContent,
currentTextPos,
currentTextPos + textContent.length(),
gs.font,
gs.fontSize));
currentTextPos += textContent.length();
}
}
}
} }
return segments;
} }
@Data @Data

View File

@ -1,7 +1,14 @@
package stirling.software.SPDF.utils.text; package stirling.software.SPDF.utils.text;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType0Font;
import lombok.experimental.UtilityClass; import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
@ -11,127 +18,572 @@ import lombok.extern.slf4j.Slf4j;
public class WidthCalculator { public class WidthCalculator {
private final int FONT_SCALE_FACTOR = 1000; private final int FONT_SCALE_FACTOR = 1000;
private final float CONSERVATIVE_CHAR_WIDTH_RATIO = 0.55f;
private final float BBOX_CHAR_WIDTH_RATIO = 0.65f;
private final Map<String, Float> widthCache = new ConcurrentHashMap<>();
private final Map<String, Boolean> reliabilityCache = new ConcurrentHashMap<>();
/**
 * Builds the width-cache key for a (font, text, font size) combination.
 *
 * <p>The key is {@code name|text|size}, with the size rendered to two decimals, so sizes that
 * round to the same two-decimal value share one cache entry.
 *
 * @param font font whose name forms the first key component
 * @param text text being measured
 * @param fontSize font size the width is calculated for
 * @return the cache key
 */
private String createCacheKey(PDFont font, String text, float fontSize) {
    String fontName = font.getName();
    return "%s|%s|%.2f".formatted(fontName, text, fontSize);
}
/**
 * Builds the reliability-cache key for a font: its name.
 *
 * <p>Never returns null, because {@code ConcurrentHashMap} rejects null keys with a
 * {@code NullPointerException}. A font without a name maps to the literal {@code "null"}, the
 * same text {@code createCacheKey} produces for such a font.
 *
 * <p>NOTE(review): all unnamed fonts therefore share one key — confirm this is acceptable.
 *
 * @param font font to create the key for
 * @return the font name, or {@code "null"} when the font has no name
 */
private String createReliabilityCacheKey(PDFont font) {
    String fontName = font.getName();
    return fontName != null ? fontName : "null";
}
public float calculateAccurateWidth(PDFont font, String text, float fontSize) { public float calculateAccurateWidth(PDFont font, String text, float fontSize) {
if (font == null || text == null || text.isEmpty() || fontSize <= 0) { return calculateAccurateWidth(font, text, fontSize, true);
return 0; }
public float calculateAccurateWidth(
PDFont font, String text, float fontSize, boolean useCache) {
if (font == null || text == null || text.isEmpty() || fontSize <= 0) return 0;
if (useCache) {
String cacheKey = createCacheKey(font, text, fontSize);
Float cachedWidth = widthCache.get(cacheKey);
if (cachedWidth != null) return cachedWidth;
} }
if (!TextEncodingHelper.canEncodeCharacters(font, text)) { String normalizedText = normalizeText(text);
log.debug(
"Text cannot be encoded by font {}, using fallback width calculation", Float directWidth = calculateDirectWidth(font, normalizedText, fontSize);
font.getName()); if (directWidth != null) {
return calculateFallbackWidth(font, text, fontSize); if (useCache) widthCache.put(createCacheKey(font, text, fontSize), directWidth);
return directWidth;
} }
Float charByCharWidth = calculateCharacterByCharacterWidth(font, normalizedText, fontSize);
if (charByCharWidth != null) {
if (useCache) widthCache.put(createCacheKey(font, text, fontSize), charByCharWidth);
return charByCharWidth;
}
Float glyphWidth = calculateGlyphBasedWidth(font, normalizedText, fontSize);
if (glyphWidth != null) {
if (useCache) widthCache.put(createCacheKey(font, text, fontSize), glyphWidth);
return glyphWidth;
}
float fallbackWidth = calculateComprehensiveFallbackWidth(font, normalizedText, fontSize);
if (useCache) widthCache.put(createCacheKey(font, text, fontSize), fallbackWidth);
return fallbackWidth;
}
/**
 * Converts the text to Unicode Normalization Form C (canonical composition).
 *
 * @param text text to normalize
 * @return the NFC-normalized text
 */
private String normalizeText(String text) {
    Normalizer.Form composedForm = Normalizer.Form.NFC;
    return Normalizer.normalize(text, composedForm);
}
private Float calculateDirectWidth(PDFont font, String text, float fontSize) {
if (!TextEncodingHelper.canEncodeCharacters(font, text)) return null;
try { try {
float rawWidth = font.getStringWidth(text); float rawWidth = font.getStringWidth(text);
float scaledWidth = (rawWidth / FONT_SCALE_FACTOR) * fontSize; float scaledWidth = (rawWidth / FONT_SCALE_FACTOR) * fontSize;
return rawWidth >= 0 && scaledWidth >= 0 ? scaledWidth : null;
log.debug(
"Direct width calculation successful for font {}: {} -> {}",
font.getName(),
rawWidth,
scaledWidth);
return scaledWidth;
} catch (Exception e) { } catch (Exception e) {
log.debug( return null;
"Direct width calculation failed for font {}: {}",
font.getName(),
e.getMessage());
return calculateWidthWithCharacterIteration(font, text, fontSize);
} }
} }
private float calculateWidthWithCharacterIteration(PDFont font, String text, float fontSize) { private Float calculateCharacterByCharacterWidth(PDFont font, String text, float fontSize) {
try { try {
List<Integer> codePoints = getCodePoints(text);
float totalWidth = 0; float totalWidth = 0;
int previousCodePoint = -1;
for (int i = 0; i < text.length(); i++) { for (int codePoint : codePoints) {
String character = text.substring(i, i + 1); String character = new String(Character.toChars(codePoint));
Float charWidth = calculateSingleCharacterWidth(font, character, fontSize);
if (charWidth == null) return null;
totalWidth += charWidth;
if (previousCodePoint != -1) {
totalWidth += calculateKerning(font, previousCodePoint, codePoint, fontSize);
}
previousCodePoint = codePoint;
}
return totalWidth;
} catch (Exception e) {
return null;
}
}
/**
 * Splits the text into Unicode code points, so that a surrogate pair yields a single entry.
 *
 * @param text text to split
 * @return a new mutable list with the code points in order of appearance
 */
private List<Integer> getCodePoints(String text) {
    List<Integer> collected = new ArrayList<>();
    text.codePoints().forEach(collected::add);
    return collected;
}
private Float calculateSingleCharacterWidth(PDFont font, String character, float fontSize) {
try {
byte[] encoded = null;
try {
encoded = font.encode(character);
if (encoded.length == 0) encoded = null;
} catch (Exception e) {
log.debug("Direct encoding failed for '{}': {}", character, e.getMessage());
}
if (encoded == null && font instanceof PDType0Font) {
try { try {
byte[] encoded = font.encode(character); encoded = character.getBytes("UTF-8");
if (encoded.length > 0) { } catch (Exception e) {
int glyphCode = encoded[0] & 0xFF; log.debug("UTF-8 encoding failed for '{}': {}", character, e.getMessage());
float glyphWidth = font.getWidth(glyphCode);
if (glyphWidth == 0) {
try {
glyphWidth = font.getWidthFromFont(glyphCode);
} catch (Exception e2) {
glyphWidth = font.getAverageFontWidth();
}
}
totalWidth += (glyphWidth / FONT_SCALE_FACTOR) * fontSize;
} else {
totalWidth += (font.getAverageFontWidth() / FONT_SCALE_FACTOR) * fontSize;
}
} catch (Exception e2) {
totalWidth += (font.getAverageFontWidth() / FONT_SCALE_FACTOR) * fontSize;
} }
} }
log.debug("Character iteration width calculation: {}", totalWidth); if (encoded != null && encoded.length > 0) {
return totalWidth; Float width = calculateGlyphWidth(font, encoded, fontSize);
if (width != null && width >= 0) return width;
}
return calculateAverageCharacterWidth(font, fontSize);
} catch (Exception e) { } catch (Exception e) {
log.debug("Character iteration failed: {}", e.getMessage()); log.debug(
return calculateFallbackWidth(font, text, fontSize); "Single character width calculation failed for '{}': {}",
character,
e.getMessage());
return calculateAverageCharacterWidth(font, fontSize);
} }
} }
private float calculateFallbackWidth(PDFont font, String text, float fontSize) { private Float calculateGlyphWidth(PDFont font, byte[] encoded, float fontSize) {
for (byte b : encoded) {
try {
int glyphCode = b & 0xFF;
float glyphWidth = font.getWidth(glyphCode);
if (glyphWidth > 0) {
return (glyphWidth / FONT_SCALE_FACTOR) * fontSize;
}
// Try alternative width methods
try {
glyphWidth = font.getWidthFromFont(glyphCode);
if (glyphWidth > 0) {
return (glyphWidth / FONT_SCALE_FACTOR) * fontSize;
}
} catch (Exception e) {
log.debug(
"getWidthFromFont failed for glyph {}: {}", glyphCode, e.getMessage());
}
} catch (Exception e) {
log.debug("Glyph width calculation failed for byte {}: {}", b, e.getMessage());
}
}
return null;
}
/**
 * Kerning adjustment between two neighbouring code points.
 *
 * <p>Currently a stub: no kerning data is consulted and the adjustment is always zero.
 *
 * @param font font the pair is set in (unused)
 * @param leftCodePoint code point on the left of the pair (unused)
 * @param rightCodePoint code point on the right of the pair (unused)
 * @param fontSize font size (unused)
 * @return always {@code 0}
 */
private float calculateKerning(
        PDFont font, int leftCodePoint, int rightCodePoint, float fontSize) {
    final float noAdjustment = 0;
    return noAdjustment;
}
private Float calculateGlyphBasedWidth(PDFont font, String text, float fontSize) {
try { try {
float totalWidth = 0;
for (int i = 0; i < text.length(); ) {
int codePoint = text.codePointAt(i);
String character = new String(Character.toChars(codePoint));
// Try to get glyph information more comprehensively
Float charWidth =
calculateGlyphWidthComprehensively(font, character, codePoint, fontSize);
if (charWidth == null) {
return null;
}
totalWidth += charWidth;
i += Character.charCount(codePoint);
}
log.debug("Glyph-based width calculation: {}", totalWidth);
return totalWidth;
} catch (Exception e) {
log.debug("Glyph-based calculation failed: {}", e.getMessage());
return null;
}
}
/**
 * Determines the width of one code point by trying progressively rougher approaches.
 *
 * <ol>
 *   <li>Encode the character with the font and measure the encoded bytes.
 *   <li>Ask the font for a width using the code point value itself as the code.
 *   <li>Estimate from the code point's Unicode general category.
 * </ol>
 *
 * @param font font used for measuring
 * @param character the code point as a string
 * @param codePoint the code point value
 * @param fontSize font size the result is scaled to
 * @return the first usable width; the average-character-width estimate if an unexpected
 *     exception escapes the steps above
 */
private Float calculateGlyphWidthComprehensively(
        PDFont font, String character, int codePoint, float fontSize) {
    try {
        // Step 1: the font's own encoding of the character.
        try {
            byte[] bytes = font.encode(character);
            Float fromBytes =
                    bytes.length > 0
                            ? calculateWidthFromEncodedBytes(font, bytes, fontSize)
                            : null;
            if (fromBytes != null && fromBytes >= 0) {
                return fromBytes;
            }
        } catch (Exception e) {
            log.debug(
                    "Standard encoding failed for U+{}: {}",
                    Integer.toHexString(codePoint),
                    e.getMessage());
        }

        // Step 2: use the code point value directly as the lookup code.
        try {
            float rawWidth = font.getWidth(codePoint);
            if (rawWidth > 0) {
                return (rawWidth / FONT_SCALE_FACTOR) * fontSize;
            }
        } catch (Exception e) {
            log.debug(
                    "Unicode code point width failed for U+{}: {}",
                    Integer.toHexString(codePoint),
                    e.getMessage());
        }

        // Step 3: nothing measurable, estimate from the character category.
        return calculateCategoryBasedWidth(font, codePoint, fontSize);
    } catch (Exception e) {
        log.debug("Comprehensive glyph width calculation failed: {}", e.getMessage());
        return calculateAverageCharacterWidth(font, fontSize);
    }
}
/**
 * Derives a width from the encoded bytes of a single character.
 *
 * <p>Lookup order:
 *
 * <ol>
 *   <li>Let the font decode its own character code from the bytes ({@code PDFont.readCode})
 *       and query that code. This respects fonts whose codes span more than one byte.
 *   <li>For {@code PDType0Font} with at least two bytes, combine the first two bytes
 *       big-endian into one code.
 *   <li>As a last resort, try every byte on its own as a code.
 * </ol>
 *
 * <p>The multi-byte interpretations come before the per-byte scan. Otherwise the high byte of
 * a two-byte code would be looked up as if it were a complete code, and any positive width
 * stored for that unrelated code would be returned.
 *
 * @param font font used for measuring
 * @param encoded encoded bytes of the character; {@code null} or empty yields {@code null}
 * @param fontSize font size the result is scaled to
 * @return the scaled width, or {@code null} when no interpretation yields a positive width
 */
private Float calculateWidthFromEncodedBytes(PDFont font, byte[] encoded, float fontSize) {
    if (encoded == null || encoded.length == 0) {
        return null;
    }

    // 1) Decode the character code exactly the way the font itself would.
    try {
        int code = font.readCode(new java.io.ByteArrayInputStream(encoded));
        float width = font.getWidth(code);
        if (width > 0) {
            return (width / FONT_SCALE_FACTOR) * fontSize;
        }
    } catch (Exception e) {
        log.debug("Font-decoded character code lookup failed: {}", e.getMessage());
    }

    // 2) Try multi-byte interpretation for Unicode fonts
    if (encoded.length >= 2 && font instanceof PDType0Font) {
        try {
            int glyphCode = ((encoded[0] & 0xFF) << 8) | (encoded[1] & 0xFF);
            float width = font.getWidth(glyphCode);
            if (width > 0) {
                return (width / FONT_SCALE_FACTOR) * fontSize;
            }
        } catch (Exception e) {
            log.debug("Multi-byte glyph code interpretation failed: {}", e.getMessage());
        }
    }

    // 3) Try each byte as a potential glyph code
    for (byte b : encoded) {
        try {
            int glyphCode = b & 0xFF;
            float width = font.getWidth(glyphCode);
            if (width > 0) {
                return (width / FONT_SCALE_FACTOR) * fontSize;
            }
        } catch (Exception ignored) {
            // Deliberately ignored: a failing byte just means the next one is tried.
        }
    }

    return null;
}
/**
 * Estimates a character width from its Unicode general category.
 *
 * <p>The font's average character width is scaled by a fixed per-category factor (for example
 * upper-case letters are assumed wider than average, non-spacing marks take no width).
 * Categories without a dedicated factor use the plain average.
 *
 * @param font font whose average width is the baseline
 * @param codePoint code point to classify
 * @param fontSize font size the result is scaled to
 * @return the estimated width; the unscaled average width if the estimation itself fails
 */
private Float calculateCategoryBasedWidth(PDFont font, int codePoint, float fontSize) {
    try {
        final int generalCategory = Character.getType(codePoint);
        final float average = calculateAverageCharacterWidth(font, fontSize);

        final float factor;
        switch (generalCategory) {
            case Character.NON_SPACING_MARK: // Combining characters
            case Character.ENCLOSING_MARK:
                factor = 0.0f;
                break;
            case Character.COMBINING_SPACING_MARK:
                factor = 0.3f;
                break;
            case Character.SPACE_SEPARATOR:
                factor = 0.5f;
                break;
            case Character.OTHER_PUNCTUATION:
                factor = 0.6f;
                break;
            case Character.MODIFIER_LETTER:
                factor = 0.7f;
                break;
            case Character.DASH_PUNCTUATION:
                factor = 0.8f;
                break;
            case Character.CURRENCY_SYMBOL:
                factor = 1.1f;
                break;
            case Character.UPPERCASE_LETTER:
                factor = 1.2f;
                break;
            default:
                // Lower-case letters, digits, math symbols and everything else.
                factor = 1.0f;
                break;
        }

        return average * factor;
    } catch (Exception e) {
        log.debug("Category-based width calculation failed: {}", e.getMessage());
        return calculateAverageCharacterWidth(font, fontSize);
    }
}
/**
 * Returns the font's average character width scaled to {@code fontSize}.
 *
 * @param font font to query
 * @param fontSize font size the result is scaled to
 * @return {@code font.getAverageFontWidth()} divided by {@code FONT_SCALE_FACTOR} and
 *     multiplied by {@code fontSize}; if the font cannot supply an average,
 *     {@code CONSERVATIVE_CHAR_WIDTH_RATIO * fontSize}
 */
private float calculateAverageCharacterWidth(PDFont font, float fontSize) {
    try {
        final float rawAverage = font.getAverageFontWidth();
        final float normalizedAverage = rawAverage / FONT_SCALE_FACTOR;
        return normalizedAverage * fontSize;
    } catch (Exception e) {
        log.debug("Average character width calculation failed: {}", e.getMessage());
    }
    // Reached only when the font metrics were unavailable.
    return CONSERVATIVE_CHAR_WIDTH_RATIO * fontSize;
}
private float calculateComprehensiveFallbackWidth(PDFont font, String text, float fontSize) {
try {
// Strategy 1: Use font bounding box with character analysis
if (font.getFontDescriptor() != null if (font.getFontDescriptor() != null
&& font.getFontDescriptor().getFontBoundingBox() != null) { && font.getFontDescriptor().getFontBoundingBox() != null) {
PDRectangle bbox = font.getFontDescriptor().getFontBoundingBox(); PDRectangle bbox = font.getFontDescriptor().getFontBoundingBox();
float avgCharWidth = float avgCharWidth = bbox.getWidth() / FONT_SCALE_FACTOR;
bbox.getWidth() / FONT_SCALE_FACTOR * 0.6f; // Conservative estimate
float fallbackWidth = text.length() * avgCharWidth * fontSize;
log.debug("Bounding box fallback width: {}", fallbackWidth); // Analyze text composition for better estimation
return fallbackWidth; float adjustedWidth = analyzeTextComposition(text, avgCharWidth, fontSize);
log.debug("Bounding box based fallback width: {}", adjustedWidth);
return adjustedWidth;
} }
float avgWidth = font.getAverageFontWidth(); // Strategy 2: Enhanced average width calculation
float fallbackWidth = (text.length() * avgWidth / FONT_SCALE_FACTOR) * fontSize; float enhancedAverage = calculateEnhancedAverageWidth(font, text, fontSize);
log.debug("Enhanced average fallback width: {}", enhancedAverage);
log.debug("Average width fallback: {}", fallbackWidth); return enhancedAverage;
return fallbackWidth;
} catch (Exception e) { } catch (Exception e) {
float conservativeWidth = text.length() * 0.5f * fontSize; // Ultimate fallback
log.debug( float conservativeWidth = text.length() * CONSERVATIVE_CHAR_WIDTH_RATIO * fontSize;
"Conservative fallback width for font {}: {}", log.debug("Conservative fallback width: {}", conservativeWidth);
font.getName(),
conservativeWidth);
return conservativeWidth; return conservativeWidth;
} }
} }
/**
 * Estimates the width of {@code text} from a per-character baseline, weighting each code point
 * by its Unicode general category.
 *
 * <p>Spaces, upper-case letters, lower-case letters, digits and (dash/other) punctuation each
 * have a fixed weight; every other category uses {@code BBOX_CHAR_WIDTH_RATIO}. The
 * per-category counts are gathered for the debug log only and do not affect the result.
 *
 * @param text text to analyse
 * @param avgCharWidth baseline width of one character, before font-size scaling
 * @param fontSize font size the result is scaled to
 * @return the sum of the weighted per-character widths
 */
private float analyzeTextComposition(String text, float avgCharWidth, float fontSize) {
    float accumulated = 0;
    int spaces = 0;
    int uppers = 0;
    int lowers = 0;
    int digits = 0;
    int puncts = 0;

    int offset = 0;
    while (offset < text.length()) {
        final int cp = text.codePointAt(offset);
        // Step over the whole code point up front so 'continue' below is safe.
        offset += Character.charCount(cp);

        final float weight;
        switch (Character.getType(cp)) {
            case Character.SPACE_SEPARATOR:
                spaces++;
                weight = 0.5f;
                break;
            case Character.UPPERCASE_LETTER:
                uppers++;
                weight = 1.2f;
                break;
            case Character.LOWERCASE_LETTER:
                lowers++;
                weight = 1.0f;
                break;
            case Character.DECIMAL_DIGIT_NUMBER:
                digits++;
                weight = 1.0f;
                break;
            case Character.OTHER_PUNCTUATION:
            case Character.DASH_PUNCTUATION:
                puncts++;
                weight = 0.7f;
                break;
            default:
                accumulated += avgCharWidth * BBOX_CHAR_WIDTH_RATIO * fontSize;
                continue;
        }
        accumulated += avgCharWidth * weight * fontSize;
    }

    // Composition breakdown, for debugging only.
    log.debug(
            "Text composition analysis - Spaces: {}, Upper: {}, Lower: {}, Digits: {}, Punct: {}",
            spaces,
            uppers,
            lowers,
            digits,
            puncts);

    return accumulated;
}
/**
 * Estimates the width of {@code text} from the font's average character width, nudged by the
 * ratio of x-height to cap-height when the font descriptor provides both.
 *
 * <p>The adjustment factor is {@code xHeight / capHeight} clamped to {@code [0.8, 1.2]}; it
 * stays {@code 1.0} when either metric is missing or non-positive.
 *
 * <p>Characters are counted as Unicode code points, not UTF-16 units, so a supplementary
 * character (one glyph, two {@code char}s) contributes one average width instead of two.
 *
 * @param font font to read metrics from
 * @param text text whose width is estimated
 * @param fontSize font size the result is scaled to
 * @return the estimated width; {@code characterCount * CONSERVATIVE_CHAR_WIDTH_RATIO *
 *     fontSize} if the font metrics cannot be read
 */
private float calculateEnhancedAverageWidth(PDFont font, String text, float fontSize) {
    // Count glyph-producing units rather than UTF-16 code units.
    int characterCount = text.codePointCount(0, text.length());
    try {
        float baseAverage = font.getAverageFontWidth();

        // Try to get more specific metrics
        float capHeight = 0;
        float xHeight = 0;

        if (font.getFontDescriptor() != null) {
            capHeight = font.getFontDescriptor().getCapHeight();
            xHeight = font.getFontDescriptor().getXHeight();
        }

        // Use metrics to adjust the average width estimation
        float adjustmentFactor = 1.0f;
        if (capHeight > 0 && xHeight > 0) {
            adjustmentFactor = Math.max(0.8f, Math.min(1.2f, xHeight / capHeight));
        }

        float adjustedAverage = (baseAverage * adjustmentFactor / FONT_SCALE_FACTOR) * fontSize;
        return characterCount * adjustedAverage;
    } catch (Exception e) {
        log.debug("Enhanced average width calculation failed: {}", e.getMessage());
        return characterCount * CONSERVATIVE_CHAR_WIDTH_RATIO * fontSize;
    }
}
public boolean isWidthCalculationReliable(PDFont font) { public boolean isWidthCalculationReliable(PDFont font) {
if (font == null) { if (font == null) {
return false; return false;
} }
if (font.isDamaged()) { // Check cache first
log.debug("Font {} is damaged", font.getName()); String cacheKey = createReliabilityCacheKey(font);
return false; Boolean cachedResult = reliabilityCache.get(cacheKey);
if (cachedResult != null) {
log.debug(
"Using cached reliability result for font {}: {}",
font.getName(),
cachedResult);
return cachedResult;
} }
if (!TextEncodingHelper.canCalculateBasicWidths(font)) { boolean result = performReliabilityCheck(font);
log.debug("Font {} cannot perform basic width calculations", font.getName());
// Cache the result
reliabilityCache.put(cacheKey, result);
return result;
}
private boolean performReliabilityCheck(PDFont font) {
try {
// Check if font is damaged
if (font.isDamaged()) {
log.debug("Font {} is damaged", font.getName());
return false;
}
// Check basic width calculation capability
if (!TextEncodingHelper.canCalculateBasicWidths(font)) {
log.debug("Font {} cannot perform basic width calculations", font.getName());
return false;
}
// Test with a simple character
try {
font.getStringWidth("A");
return true;
} catch (Exception e) {
log.debug("Font {} failed basic width test: {}", font.getName(), e.getMessage());
}
// Check if we can at least get average width
try {
float avgWidth = font.getAverageFontWidth();
return avgWidth > 0;
} catch (Exception e) {
log.debug(
"Font {} cannot provide average width: {}", font.getName(), e.getMessage());
}
return false;
} catch (Exception e) {
log.debug("Reliability check failed for font {}: {}", font.getName(), e.getMessage());
return false; return false;
} }
}
if (TextEncodingHelper.hasCustomEncoding(font)) { public float calculateCharacterWidth(PDFont font, String character, float fontSize) {
log.debug("Font {} has custom encoding", font.getName()); if (font == null || character == null || character.isEmpty() || fontSize <= 0) return 0;
return false;
String cacheKey = createCacheKey(font, character, fontSize);
Float cachedWidth = widthCache.get(cacheKey);
if (cachedWidth != null) return cachedWidth;
Float width = calculateSingleCharacterWidth(font, character, fontSize);
if (width == null) width = calculateAverageCharacterWidth(font, fontSize);
widthCache.put(cacheKey, width);
return width;
}
/**
 * Builds a placeholder made of repeated {@code placeholderChar} whose rendered width
 * approximates {@code targetWidth}.
 *
 * <p>The repeat count is {@code round(targetWidth / widthOf(placeholderChar))}, kept within
 * {@code [1, max(3 * originalText.length(), 10)]}. The upper bound stops a very narrow
 * placeholder glyph, or a non-finite target width, from producing an enormous string. The
 * bound may make the placeholder narrower than {@code targetWidth} when the original glyphs
 * are more than three times wider than the placeholder glyph.
 *
 * @param originalText the text being replaced
 * @param targetWidth the width to match
 * @param font font the placeholder will be rendered in
 * @param fontSize font size used for measuring
 * @param placeholderChar the string to repeat; {@code null} or empty means a single space
 * @return the placeholder; an empty string when {@code originalText} is {@code null} or empty
 *     or {@code targetWidth} is not positive; one space per original {@code char} (at least
 *     one) when the placeholder width cannot be determined
 */
public String createWidthMatchingPlaceholder(
        String originalText,
        float targetWidth,
        PDFont font,
        float fontSize,
        String placeholderChar) {
    if (originalText == null || originalText.isEmpty() || targetWidth <= 0) return "";
    if (placeholderChar == null || placeholderChar.isEmpty()) placeholderChar = " ";

    try {
        float placeholderCharWidth = calculateCharacterWidth(font, placeholderChar, fontSize);
        if (placeholderCharWidth <= 0) {
            return " ".repeat(Math.max(1, originalText.length()));
        }

        int placeholderCount = Math.max(1, Math.round(targetWidth / placeholderCharWidth));

        // The bound must not depend on placeholderCount itself, or it never limits anything.
        int maxReasonableLength = Math.max(originalText.length() * 3, 10);
        placeholderCount = Math.min(placeholderCount, maxReasonableLength);

        return placeholderChar.repeat(placeholderCount);
    } catch (Exception e) {
        log.debug("Width-matching placeholder creation failed: {}", e.getMessage());
        return " ".repeat(Math.max(1, originalText.length()));
    }
}
public boolean canCalculateTextWidth(PDFont font, String text) {
if (font == null || text == null || text.isEmpty()) return false;
if (!isWidthCalculationReliable(font)) return false;
List<Integer> codePoints = getCodePoints(text);
int testSampleSize = Math.min(5, codePoints.size());
for (int i = 0; i < testSampleSize; i++) {
int codePoint = codePoints.get(i);
String character = new String(Character.toChars(codePoint));
try {
if (!TextEncodingHelper.canEncodeCharacters(font, character)) {
log.debug(
"Cannot encode character U+{} in text '{}'",
Integer.toHexString(codePoint),
text);
return false;
}
float width = calculateCharacterWidth(font, character, 12.0f);
if (width <= 0) {
log.debug(
"Character U+{} has invalid width: {}",
Integer.toHexString(codePoint),
width);
return false;
}
} catch (Exception e) {
log.debug(
"Error testing character U+{}: {}",
Integer.toHexString(codePoint),
e.getMessage());
return false;
}
} }
return true; return true;
} }
/** Removes every entry from {@code widthCache}. */
public void clearWidthCache() {
    this.widthCache.clear();
}
/** Removes every entry from {@code reliabilityCache}. */
public void clearReliabilityCache() {
    this.reliabilityCache.clear();
}
} }