Improve font handling and page-number parsing in RedactionService and TextDecodingHelper

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
Balázs Szücs 2025-08-23 14:22:58 +02:00
parent 48967f7061
commit 4cafb998f7
2 changed files with 481 additions and 346 deletions

View File

@ -4,10 +4,12 @@ import java.awt.Color;
import java.io.ByteArrayOutputStream; import java.io.ByteArrayOutputStream;
import java.io.IOException; import java.io.IOException;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.ArrayDeque;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collections; import java.util.Collections;
import java.util.Comparator; import java.util.Comparator;
import java.util.Deque;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
@ -33,7 +35,6 @@ import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.common.PDStream; import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType0Font;
import org.apache.pdfbox.pdmodel.graphics.PDXObject; import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject; import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.pattern.PDTilingPattern; import org.apache.pdfbox.pdmodel.graphics.pattern.PDTilingPattern;
@ -64,7 +65,6 @@ import stirling.software.common.util.PdfUtils;
public class RedactionService { public class RedactionService {
private static final Pattern FUZZY_STRIP = Pattern.compile("[^a-z0-9]+"); private static final Pattern FUZZY_STRIP = Pattern.compile("[^a-z0-9]+");
private static final Pattern PAGE_SPLIT = Pattern.compile("[,\\s]+");
private static final float DEFAULT_TEXT_PADDING_MULTIPLIER = 0.6f; private static final float DEFAULT_TEXT_PADDING_MULTIPLIER = 0.6f;
private static final float PRECISION_THRESHOLD = 1e-3f; private static final float PRECISION_THRESHOLD = 1e-3f;
private static final int FONT_SCALE_FACTOR = 1000; private static final int FONT_SCALE_FACTOR = 1000;
@ -75,38 +75,6 @@ public class RedactionService {
private Map<Integer, List<AggressiveSegMatch>> aggressiveSegMatches = null; private Map<Integer, List<AggressiveSegMatch>> aggressiveSegMatches = null;
private final CustomPDFDocumentFactory pdfDocumentFactory; private final CustomPDFDocumentFactory pdfDocumentFactory;
/**
 * Looks up {@code fontName} in {@code resources} without letting lookup failures escape.
 *
 * <p>A font is only returned when it resolves and reports a non-blank name; every other
 * outcome (missing inputs, unresolved font, blank name, any exception) yields {@code null}
 * and is logged at debug level.
 *
 * @param resources the resource dictionary to search; may be {@code null}
 * @param fontName the resource name of the font; may be {@code null}
 * @return the resolved font, or {@code null} when it cannot be used
 */
private static PDFont getFontSafely(PDResources resources, COSName fontName) {
    if (resources == null || fontName == null) {
        return null;
    }
    try {
        final PDFont candidate = resources.getFont(fontName);
        if (candidate == null) {
            return null;
        }

        final String reportedName;
        try {
            reportedName = candidate.getName();
        } catch (Exception nameFailure) {
            log.debug(
                    "Error accessing font name for {}, skipping: {}",
                    fontName.getName(),
                    nameFailure.getMessage());
            return null;
        }

        boolean blankName = reportedName == null || reportedName.trim().isEmpty();
        if (blankName) {
            log.debug("Font {} has null or empty name, skipping", fontName.getName());
            return null;
        }
        return candidate;
    } catch (Exception lookupFailure) {
        log.debug(
                "Error retrieving font {}: {}", fontName.getName(), lookupFailure.getMessage());
        return null;
    }
}
private static void redactAreas( private static void redactAreas(
List<RedactionArea> redactionAreas, PDDocument document, PDPageTree allPages) List<RedactionArea> redactionAreas, PDDocument document, PDPageTree allPages)
throws IOException { throws IOException {
@ -161,10 +129,15 @@ public class RedactionService {
ManualRedactPdfRequest request, PDDocument document, PDPageTree allPages) ManualRedactPdfRequest request, PDDocument document, PDPageTree allPages)
throws IOException { throws IOException {
Color redactColor = decodeOrDefault(request.getPageRedactionColor()); Color redactColor = decodeOrDefault(request.getPageRedactionColor());
List<Integer> pageNumbers = getPageNumbers(request, allPages.getCount()); String pageNumbers = request.getPageNumbers();
for (Integer pageNumber : pageNumbers) { List<Integer> pageNumberList = parsePageNumbers(pageNumbers);
PDPage page = allPages.get(pageNumber);
for (Integer pageNumber : pageNumberList) {
if (pageNumber <= 0 || pageNumber > allPages.getCount()) {
continue; // Skip invalid page numbers
}
PDPage page = allPages.get(pageNumber - 1); // Convert to 0-based index
try (PDPageContentStream contentStream = try (PDPageContentStream contentStream =
new PDPageContentStream( new PDPageContentStream(
document, page, PDPageContentStream.AppendMode.APPEND, true, true)) { document, page, PDPageContentStream.AppendMode.APPEND, true, true)) {
@ -176,6 +149,39 @@ public class RedactionService {
} }
} }
/**
 * Parses a comma-separated page specification such as {@code "1, 3-5, 8"} into the numbers
 * it names, in the order they were written.
 *
 * <p>Each comma-separated part is trimmed. A part containing {@code '-'} is read as an
 * inclusive {@code start-end} range; any other part is read as a single number. Parts that
 * are not valid integers, ranges that do not split into exactly two pieces, and ranges whose
 * start exceeds their end contribute nothing. Values are not checked against any document
 * page count here.
 *
 * <p>NOTE(review): a range is expanded eagerly, so a very wide range allocates one entry per
 * page number; callers that know the page count may want to bound the input first.
 *
 * @param pageNumbers the raw specification; may be {@code null} or blank
 * @return the parsed numbers, possibly empty, never {@code null}
 */
private static List<Integer> parsePageNumbers(String pageNumbers) {
    if (pageNumbers == null || pageNumbers.trim().isEmpty()) {
        return Collections.emptyList();
    }

    List<Integer> result = new ArrayList<>();
    for (String rawPart : pageNumbers.split(",")) {
        String part = rawPart.trim();
        try {
            if (!part.contains("-")) {
                result.add(Integer.parseInt(part));
                continue;
            }
            String[] range = part.split("-");
            if (range.length != 2) {
                continue;
            }
            int start = Integer.parseInt(range[0].trim());
            int end = Integer.parseInt(range[1].trim());
            // A long counter is required: with an int counter, "i <= end" can never become
            // false when end is Integer.MAX_VALUE because i++ wraps around to a negative value.
            for (long page = start; page <= end; page++) {
                result.add((int) page);
            }
        } catch (NumberFormatException ignored) {
            // Malformed part: skip it and keep parsing the remaining parts.
        }
    }
    return result;
}
private static Color decodeOrDefault(String hex) { private static Color decodeOrDefault(String hex) {
if (hex == null) { if (hex == null) {
return Color.BLACK; return Color.BLACK;
@ -188,41 +194,6 @@ public class RedactionService {
} }
} }
/**
 * Converts the request's page specification into zero-based page indexes.
 *
 * <p>The specification is split on commas and/or whitespace. A token containing {@code '-'}
 * is an inclusive one-based {@code start-end} range; any other token is a single one-based
 * page number. Only pages in {@code 1..pagesCount} are kept, and each is returned minus one.
 * Empty and malformed tokens are skipped rather than aborting the whole parse.
 *
 * @param request the request carrying the raw page specification
 * @param pagesCount number of pages in the document; upper bound for accepted page numbers
 * @return zero-based page indexes in the order written; empty when nothing valid was given
 */
private static List<Integer> getPageNumbers(ManualRedactPdfRequest request, int pagesCount) {
    String pageNumbersInput = request.getPageNumbers();
    List<Integer> pageNumbers = new ArrayList<>();
    if (pageNumbersInput == null) {
        return pageNumbers;
    }

    for (String token : PAGE_SPLIT.split(pageNumbersInput)) {
        // A leading separator produces an empty first token; skip it instead of
        // discarding the whole specification.
        if (token.isEmpty()) {
            continue;
        }
        try {
            if (token.contains("-")) {
                String[] range = token.split("-");
                if (range.length != 2) {
                    continue;
                }
                int start = Integer.parseInt(range[0]);
                int end = Integer.parseInt(range[1]);
                if (start > 0 && start <= end) {
                    // Stop at the last real page so an oversized range end costs nothing.
                    int last = Math.min(end, pagesCount);
                    for (int page = start; page <= last; page++) {
                        pageNumbers.add(page - 1);
                    }
                }
            } else {
                int num = Integer.parseInt(token);
                if (num > 0 && num <= pagesCount) {
                    pageNumbers.add(num - 1);
                }
            }
        } catch (NumberFormatException ignored) {
            // Malformed token (single number or range bound): skip it, keep the rest.
        }
    }
    return pageNumbers;
}
private static void redactFoundText( private static void redactFoundText(
PDDocument document, List<PDFText> blocks, float customPadding, Color redactColor) PDDocument document, List<PDFText> blocks, float customPadding, Color redactColor)
throws IOException { throws IOException {
@ -363,11 +334,9 @@ public class RedactionService {
Color redactColor = decodeOrDefault(colorString); Color redactColor = decodeOrDefault(colorString);
redactFoundText(document, allFoundTexts, customPadding, redactColor); redactFoundText(document, allFoundTexts, customPadding, redactColor);
} }
cleanDocumentMetadata(document);
} }
if (Boolean.TRUE.equals(convertToImage)) { if (Boolean.TRUE.equals(convertToImage)) {
try (PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document)) { try (PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document)) {
cleanDocumentMetadata(convertedPdf);
ByteArrayOutputStream baos = new ByteArrayOutputStream(); ByteArrayOutputStream baos = new ByteArrayOutputStream();
convertedPdf.save(baos); convertedPdf.save(baos);
return baos.toByteArray(); return baos.toByteArray();
@ -378,22 +347,6 @@ public class RedactionService {
return baos.toByteArray(); return baos.toByteArray();
} }
/**
 * Best-effort scrub of document-level metadata.
 *
 * <p>Clears the author, subject and keywords entries of the document information, stamps
 * the modification date with the current time, and removes the catalog's metadata stream.
 * Other information entries are left untouched. Failures never propagate, but they are
 * logged so that an incomplete scrub is visible.
 *
 * @param document the document whose metadata is cleaned in place
 */
private static void cleanDocumentMetadata(PDDocument document) {
    try {
        var info = document.getDocumentInformation();
        if (info != null) {
            info.setAuthor(null);
            info.setSubject(null);
            info.setKeywords(null);
            info.setModificationDate(java.util.Calendar.getInstance());
        }
        if (document.getDocumentCatalog() != null) {
            document.getDocumentCatalog().setMetadata(null);
        }
    } catch (Exception e) {
        // Still best-effort, but an unscrubbed document should not go unnoticed.
        log.warn("Failed to clean document metadata", e);
    }
}
private static String normalizeForFuzzy(String s) { private static String normalizeForFuzzy(String s) {
if (s == null) { if (s == null) {
return ""; return "";
@ -445,64 +398,6 @@ public class RedactionService {
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
/**
 * Decodes {@code bytes} with {@code font} and records, for every decoded character, the
 * byte span of the character code it came from.
 *
 * <p>For {@link PDType0Font} instances a two-byte code is tried first and used whenever the
 * font returns a non-null mapping for it; otherwise the single leading byte is decoded.
 * A code the font cannot map becomes {@code "?"}. Entry {@code k} of {@code charByteStart}
 * / {@code charByteEnd} is the half-open byte range behind character {@code k} of
 * {@code text}; a code that decodes to several characters repeats its range for each.
 *
 * @param font the font used for decoding; {@code null} yields an empty mapping
 * @param bytes the raw string bytes; {@code null} yields an empty mapping
 * @return the decoded text together with its per-character byte ranges
 */
private static DecodedMapping buildDecodeMapping(PDFont font, byte[] bytes) {
    DecodedMapping mapping = new DecodedMapping();
    if (font == null || bytes == null) {
        mapping.text = "";
        mapping.charByteStart = new int[0];
        mapping.charByteEnd = new int[0];
        return mapping;
    }

    final boolean tryTwoByteCodes = font instanceof PDType0Font;
    StringBuilder decoded = new StringBuilder();
    List<Integer> spanStarts = new ArrayList<>();
    List<Integer> spanEnds = new ArrayList<>();

    int pos = 0;
    while (pos < bytes.length) {
        final int lead = bytes[pos] & 0xFF;
        String unicode = null;
        int width = 1;

        if (tryTwoByteCodes && pos + 1 < bytes.length) {
            try {
                unicode = font.toUnicode((lead << 8) | (bytes[pos + 1] & 0xFF));
            } catch (Exception ignored) {
                // Fall back to the single-byte attempt below.
            }
            if (unicode != null) {
                width = 2;
            }
        }
        if (unicode == null) {
            try {
                unicode = font.toUnicode(lead);
            } catch (Exception ignored) {
                // Handled by the placeholder below.
            }
            if (unicode == null) {
                unicode = "?";
            }
        }

        final int spanEnd = pos + width;
        for (char c : unicode.toCharArray()) {
            decoded.append(c);
            spanStarts.add(pos);
            spanEnds.add(spanEnd);
        }
        pos = spanEnd;
    }

    final int count = spanStarts.size();
    int[] startArray = new int[count];
    int[] endArray = new int[count];
    for (int k = 0; k < count; k++) {
        startArray[k] = spanStarts.get(k);
        endArray[k] = spanEnds.get(k);
    }
    mapping.text = decoded.toString();
    mapping.charByteStart = startArray;
    mapping.charByteEnd = endArray;
    return mapping;
}
private static void performFallbackModification( private static void performFallbackModification(
List<Object> tokens, int tokenIndex, String newText) { List<Object> tokens, int tokenIndex, String newText) {
try { try {
@ -520,7 +415,7 @@ public class RedactionService {
for (COSBase element : originalArray) { for (COSBase element : originalArray) {
if (element instanceof COSString cosString) { if (element instanceof COSString cosString) {
byte[] bytes = cosString.getBytes(); byte[] bytes = cosString.getBytes();
DecodedMapping dm = buildDecodeMapping(font, bytes); DecodedMapping dm = TextDecodingHelper.buildDecodeMapping(font, bytes);
int decodedLen = dm.text.length(); int decodedLen = dm.text.length();
if (decodedLen == 0 || dm.charByteStart.length == 0) { if (decodedLen == 0 || dm.charByteStart.length == 0) {
newArray.add(element); newArray.add(element);
@ -576,8 +471,9 @@ public class RedactionService {
&& newTokens.get(i - 1) instanceof COSString) { && newTokens.get(i - 1) instanceof COSString) {
newTokens.set(i - 1, EMPTY_COS_STRING); newTokens.set(i - 1, EMPTY_COS_STRING);
modifications++; modifications++;
} else if ("TJ".equals(name) && i > 0 && newTokens.get(i - 1) instanceof COSArray) { } else if ("TJ".equals(name)
COSArray arr = (COSArray) newTokens.get(i - 1); && i > 0
&& newTokens.get(i - 1) instanceof COSArray arr) {
COSArray newArr = new COSArray(); COSArray newArr = new COSArray();
for (int j = 0; j < arr.size(); j++) { for (int j = 0; j < arr.size(); j++) {
COSBase el = arr.get(j); COSBase el = arr.get(j);
@ -717,7 +613,7 @@ public class RedactionService {
private static int processSemanticTokens(List<Object> tokens, boolean removeTU) { private static int processSemanticTokens(List<Object> tokens, boolean removeTU) {
int modifications = 0; int modifications = 0;
java.util.Stack<Integer> markedContentStack = new java.util.Stack<>(); Deque<Integer> markedContentStack = new ArrayDeque<>();
for (int i = 0; i < tokens.size(); i++) { for (int i = 0; i < tokens.size(); i++) {
Object t = tokens.get(i); Object t = tokens.get(i);
@ -784,37 +680,12 @@ public class RedactionService {
} }
} }
private COSString redactCosStringByDecodedRanges( private static String createSubsetFontPlaceholder(
PDFont font, COSString cosString, List<AggressiveSegMatch> decRanges) { String originalWord, float targetWidth, PDFont font, float fontSize) {
try { String result = createAlternativePlaceholder(originalWord, targetWidth, font, fontSize);
byte[] bytes = cosString.getBytes(); return result != null
DecodedMapping dm = buildDecodeMapping(font, bytes); ? result
if (dm.text.isEmpty() || dm.charByteStart.length == 0) { : " ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1));
return cosString;
}
boolean[] delete = new boolean[bytes.length];
for (AggressiveSegMatch r : decRanges) {
int ds = Math.max(0, Math.min(r.decodedStart, dm.charByteStart.length));
int de = Math.max(ds, Math.min(r.decodedEnd, dm.charByteStart.length));
if (ds >= de) {
continue;
}
int byteStart = dm.charByteStart[ds];
int byteEnd = dm.charByteEnd[de - 1];
for (int bi = Math.max(0, byteStart); bi < Math.min(bytes.length, byteEnd); bi++) {
delete[bi] = true;
}
}
ByteArrayOutputStream baos = new ByteArrayOutputStream(bytes.length);
for (int bi = 0; bi < bytes.length; bi++) {
if (!delete[bi]) {
baos.write(bytes[bi]);
}
}
return new COSString(baos.toByteArray());
} catch (Exception e) {
return this.aggressiveMode ? EMPTY_COS_STRING : cosString;
}
} }
public void performTextReplacementAggressive( public void performTextReplacementAggressive(
@ -904,15 +775,7 @@ public class RedactionService {
return index >= 0 && index < tokens.size(); return index >= 0 && index < tokens.size();
} }
private String createSubsetFontPlaceholder( private static String buildCompleteText(List<TextSegment> segments) {
String originalWord, float targetWidth, PDFont font, float fontSize) {
String result = createAlternativePlaceholder(originalWord, targetWidth, font, fontSize);
return result != null
? result
: " ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1));
}
private String buildCompleteText(List<TextSegment> segments) {
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
for (TextSegment segment : segments) { for (TextSegment segment : segments) {
sb.append(segment.text); sb.append(segment.text);
@ -920,6 +783,14 @@ public class RedactionService {
return sb.toString(); return sb.toString();
} }
/**
 * Tells whether {@code fontName} starts with a subset tag: exactly six uppercase ASCII
 * letters followed by {@code '+'} (for example {@code "ABCDEF+Arial"}).
 *
 * @param fontName the font name to inspect; {@code null} is treated as "not a subset"
 * @return {@code true} when the name carries the six-letter-plus prefix
 */
private static boolean isProperFontSubset(String fontName) {
    // A font may report no name at all; that is simply not a subset name.
    if (fontName == null || fontName.length() < 7) {
        return false;
    }
    for (int i = 0; i < 6; i++) {
        char c = fontName.charAt(i);
        if (c < 'A' || c > 'Z') {
            return false;
        }
    }
    return fontName.charAt(6) == '+';
}
List<Object> createTokensWithoutTargetText( List<Object> createTokensWithoutTargetText(
PDDocument document, PDDocument document,
PDPage page, PDPage page,
@ -1006,50 +877,16 @@ public class RedactionService {
return extractTextSegmentsFromTokens(page.getResources(), tokens, aggressive); return extractTextSegmentsFromTokens(page.getResources(), tokens, aggressive);
} }
private List<TextSegment> extractTextSegmentsFromTokens( private static boolean hasReliableWidthMetrics(PDFont font) {
PDResources resources, List<Object> tokens, boolean aggressive) { try {
List<TextSegment> segments = new ArrayList<>(); String testString = "AbCdEf123";
int currentTextPos = 0; float width1 = font.getStringWidth(testString);
GraphicsState gs = new GraphicsState(); float width2 = calculateCharacterSumWidth(font, testString);
for (int i = 0; i < tokens.size(); i++) { if (width1 <= 0 || width2 <= 0) return false;
Object currentToken = tokens.get(i); return Math.abs(width1 - width2) / Math.max(width1, width2) < 0.05f;
if (currentToken instanceof Operator op) { } catch (Exception e) {
String opName = op.getName(); return false;
if ("Tf".equals(opName) && i >= 2) {
try {
COSName fontName = (COSName) tokens.get(i - 2);
COSBase fontSizeBase = (COSBase) tokens.get(i - 1);
if (fontSizeBase instanceof COSNumber cosNumber) {
PDFont safeFont = getFontSafely(resources, fontName);
gs.setFont(safeFont);
gs.setFontSize(cosNumber.floatValue());
}
} catch (Exception ignored) {
}
}
if (isTextShowingOperator(opName) && i > 0) {
String textContent = extractTextFromToken(tokens.get(i - 1), opName, gs.font);
if (textContent != null && !textContent.trim().isEmpty()) {
if (aggressive
&& gs.font != null
&& tokens.get(i - 1) instanceof COSString cs) {
TextDecodingHelper.tryDecodeWithFontEnhanced(gs.font, cs);
}
segments.add(
new TextSegment(
i - 1,
opName,
textContent,
currentTextPos,
currentTextPos + textContent.length(),
gs.font,
gs.fontSize));
currentTextPos += textContent.length();
}
}
}
} }
return segments;
} }
private static String sanitizeText(String text) { private static String sanitizeText(String text) {
@ -1393,23 +1230,47 @@ public class RedactionService {
} }
} }
private boolean isProperFontSubset(String fontName) { static String createPlaceholderWithFont(String originalWord, PDFont font) {
if (fontName.length() < 7) return false; if (originalWord == null || originalWord.isEmpty()) return " ";
for (int i = 0; i < 6; i++) {
if (fontName.charAt(i) < 'A' || fontName.charAt(i) > 'Z') return false; final String repeat = " ".repeat(Math.max(1, originalWord.length()));
if (font != null && TextEncodingHelper.isFontSubset(font.getName())) {
try {
float originalWidth =
WidthCalculator.calculateAccurateWidth(font, originalWord, 1.0f);
String result =
createAlternativePlaceholder(originalWord, originalWidth, font, 1.0f);
return result != null ? result : repeat;
} catch (Exception e) {
return repeat;
}
} }
return fontName.charAt(6) == '+';
return repeat;
} }
private boolean hasReliableWidthMetrics(PDFont font) { private static TokenModificationResult convertToTJWithAdjustment(
List<Object> tokens,
int tokenIndex,
String originalOperator,
String newText,
float adjustment,
TextSegment segment) {
try { try {
String testString = "AbCdEf123"; COSArray newArray = new COSArray();
float width1 = font.getStringWidth(testString); newArray.add(new COSString(newText));
float width2 = calculateCharacterSumWidth(font, testString);
if (width1 <= 0 || width2 <= 0) return false; if (segment.getFontSize() > 0) {
return Math.abs(width1 - width2) / Math.max(width1, width2) < 0.05f; float kerning = (-adjustment / segment.getFontSize()) * FONT_SCALE_FACTOR;
if (Math.abs(kerning) <= 10000f) {
newArray.add(new COSFloat(kerning));
}
}
tokens.set(tokenIndex, newArray);
return updateOperatorSafely(tokens, tokenIndex, originalOperator);
} catch (Exception e) { } catch (Exception e) {
return false; return TokenModificationResult.failure("TJ conversion failed: " + e.getMessage());
} }
} }
@ -1450,24 +1311,36 @@ public class RedactionService {
} }
} }
private WidthMeasurement measureTextWidth(PDFont font, String text, float fontSize) { private static String createAlternativePlaceholder(
String originalWord, float targetWidth, PDFont font, float fontSize) {
final String repeat =
" ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1));
try { try {
float fontUnits = safeGetStringWidth(font, text); String[] alternatives = {" ", ".", "-", "_", "~", "°", "·"};
if (fontUnits < 0) return WidthMeasurement.invalid(); if (TextEncodingHelper.fontSupportsCharacter(font, " ")) {
float spaceWidth = WidthCalculator.calculateAccurateWidth(font, " ", fontSize);
float actualWidth = (fontUnits / FONT_SCALE_FACTOR) * fontSize; if (spaceWidth > 0) {
float characterSumWidth = calculateCharacterSumWidth(font, text); int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth));
int maxSpaces = originalWord.length() * 2;
if (characterSumWidth > 0) { return " ".repeat(Math.min(spaceCount, maxSpaces));
float characterActualWidth = (characterSumWidth / FONT_SCALE_FACTOR) * fontSize;
if (Math.abs(actualWidth - characterActualWidth) / actualWidth > 0.1f) {
actualWidth = Math.max(actualWidth, characterActualWidth);
} }
} }
for (String alt : alternatives) {
return new WidthMeasurement(actualWidth, true); if (" ".equals(alt)) continue;
try {
if (!TextEncodingHelper.fontSupportsCharacter(font, alt)) continue;
float cw = WidthCalculator.calculateAccurateWidth(font, alt, fontSize);
if (cw > 0) {
int count = Math.max(1, Math.round(targetWidth / cw));
int max = originalWord.length() * 2;
return " ".repeat(Math.min(count, max));
}
} catch (Exception ignored) {
}
}
return repeat;
} catch (Exception e) { } catch (Exception e) {
return WidthMeasurement.invalid(); return repeat;
} }
} }
@ -1616,24 +1489,37 @@ public class RedactionService {
} }
} }
String createPlaceholderWithFont(String originalWord, PDFont font) { private COSString redactCosStringByDecodedRanges(
if (originalWord == null || originalWord.isEmpty()) return " "; PDFont font, COSString cosString, List<AggressiveSegMatch> decRanges) {
try {
final String repeat = " ".repeat(Math.max(1, originalWord.length())); byte[] bytes = cosString.getBytes();
if (font != null && TextEncodingHelper.isFontSubset(font.getName())) { DecodedMapping dm = TextDecodingHelper.buildDecodeMapping(font, bytes);
try { if (dm.text.isEmpty() || dm.charByteStart.length == 0) {
// Use helper to get accurate width at fontSize=1.0 return cosString;
float originalWidth =
WidthCalculator.calculateAccurateWidth(font, originalWord, 1.0f);
String result =
createAlternativePlaceholder(originalWord, originalWidth, font, 1.0f);
return result != null ? result : repeat;
} catch (Exception e) {
return repeat;
} }
boolean[] delete = new boolean[bytes.length];
for (AggressiveSegMatch r : decRanges) {
int ds = Math.max(0, Math.min(r.decodedStart, dm.charByteStart.length));
int de = Math.max(ds, Math.min(r.decodedEnd, dm.charByteStart.length));
if (ds >= de) {
continue;
}
int byteStart = dm.charByteStart[ds];
int byteEnd = dm.charByteEnd[de - 1];
for (int bi = Math.max(0, byteStart); bi < Math.min(bytes.length, byteEnd); bi++) {
delete[bi] = true;
}
}
ByteArrayOutputStream baos = new ByteArrayOutputStream(bytes.length);
for (int bi = 0; bi < bytes.length; bi++) {
if (!delete[bi]) {
baos.write(bytes[bi]);
}
}
return new COSString(baos.toByteArray());
} catch (Exception e) {
return this.aggressiveMode ? EMPTY_COS_STRING : cosString;
} }
return repeat;
} }
private TokenModificationResult performTokenModification( private TokenModificationResult performTokenModification(
@ -1724,61 +1610,71 @@ public class RedactionService {
} }
} }
private TokenModificationResult convertToTJWithAdjustment( private List<TextSegment> extractTextSegmentsFromTokens(
List<Object> tokens, PDResources resources, List<Object> tokens, boolean aggressive) {
int tokenIndex, List<TextSegment> segments = new ArrayList<>();
String originalOperator, int currentTextPos = 0;
String newText, GraphicsState gs = new GraphicsState();
float adjustment, for (int i = 0; i < tokens.size(); i++) {
TextSegment segment) { Object currentToken = tokens.get(i);
try { if (currentToken instanceof Operator op) {
COSArray newArray = new COSArray(); String opName = op.getName();
newArray.add(new COSString(newText)); if ("Tf".equals(opName) && i >= 2) {
try {
if (segment.getFontSize() > 0) { COSName fontName = (COSName) tokens.get(i - 2);
float kerning = (-adjustment / segment.getFontSize()) * FONT_SCALE_FACTOR; COSBase fontSizeBase = (COSBase) tokens.get(i - 1);
if (Math.abs(kerning) <= 10000f) { if (fontSizeBase instanceof COSNumber cosNumber) {
newArray.add(new COSFloat(kerning)); PDFont safeFont = TextDecodingHelper.getFontSafely(resources, fontName);
gs.setFont(safeFont);
gs.setFontSize(cosNumber.floatValue());
}
} catch (Exception ignored) {
}
}
if (isTextShowingOperator(opName) && i > 0) {
String textContent = extractTextFromToken(tokens.get(i - 1), opName, gs.font);
if (textContent != null && !textContent.trim().isEmpty()) {
if (aggressive
&& gs.font != null
&& tokens.get(i - 1) instanceof COSString cs) {
TextDecodingHelper.tryDecodeWithFontEnhanced(gs.font, cs);
}
segments.add(
new TextSegment(
i - 1,
opName,
textContent,
currentTextPos,
currentTextPos + textContent.length(),
gs.font,
gs.fontSize));
currentTextPos += textContent.length();
}
} }
} }
tokens.set(tokenIndex, newArray);
return updateOperatorSafely(tokens, tokenIndex, originalOperator);
} catch (Exception e) {
return TokenModificationResult.failure("TJ conversion failed: " + e.getMessage());
} }
return segments;
} }
private String createAlternativePlaceholder( private WidthMeasurement measureTextWidth(PDFont font, String text, float fontSize) {
String originalWord, float targetWidth, PDFont font, float fontSize) {
final String repeat =
" ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1));
try { try {
String[] alternatives = {" ", ".", "-", "_", "~", "°", "·"}; float fontUnits = safeGetStringWidth(font, text);
if (TextEncodingHelper.fontSupportsCharacter(font, " ")) { if (fontUnits < 0) return WidthMeasurement.invalid();
float spaceWidth = WidthCalculator.calculateAccurateWidth(font, " ", fontSize);
if (spaceWidth > 0) { float actualWidth = (fontUnits / FONT_SCALE_FACTOR) * fontSize;
int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth)); float characterSumWidth = calculateCharacterSumWidth(font, text);
int maxSpaces = originalWord.length() * 2;
return " ".repeat(Math.min(spaceCount, maxSpaces)); if (characterSumWidth > 0) {
float characterActualWidth = (characterSumWidth / FONT_SCALE_FACTOR) * fontSize;
if (actualWidth != 0
&& Math.abs(actualWidth - characterActualWidth) / actualWidth > 0.1f) {
actualWidth = Math.max(actualWidth, characterActualWidth);
} }
} }
for (String alt : alternatives) {
if (" ".equals(alt)) continue; return new WidthMeasurement(actualWidth, true);
try {
if (!TextEncodingHelper.fontSupportsCharacter(font, alt)) continue;
float cw = WidthCalculator.calculateAccurateWidth(font, alt, fontSize);
if (cw > 0) {
int count = Math.max(1, Math.round(targetWidth / cw));
int max = originalWord.length() * 2;
return " ".repeat(Math.min(count, max));
}
} catch (Exception ignored) {
}
}
return repeat;
} catch (Exception e) { } catch (Exception e) {
return repeat; return WidthMeasurement.invalid();
} }
} }
@ -1876,13 +1772,7 @@ public class RedactionService {
int gStart = idx; int gStart = idx;
int gEnd = idx + w.length(); int gEnd = idx + w.length();
mapStartToEnd( mapStartToEnd(
(List<TextSegment>) segments, segments, result, perSegMatches, decStarts, decEnds, gStart, gEnd);
(List<MatchRange>) result,
(Map<Integer, List<AggressiveSegMatch>>) perSegMatches,
decStarts,
decEnds,
gStart,
gEnd);
idx = lower.indexOf(w, idx + 1); idx = lower.indexOf(w, idx + 1);
} }
} }
@ -2083,7 +1973,7 @@ public class RedactionService {
segment.getFont(), segment.getFont(),
segment.getFontSize()); segment.getFontSize());
} catch (Exception e) { } catch (Exception e) {
return "".repeat(Math.max(1, originalText.length())); return " ".repeat(Math.max(1, originalText.length()));
} }
} }
@ -2321,11 +2211,6 @@ public class RedactionService {
this.processedMatches = processedMatches; this.processedMatches = processedMatches;
this.warnings = new ArrayList<>(warnings); this.warnings = new ArrayList<>(warnings);
} }
@Override
public List<String> warnings() {
return new ArrayList<>(warnings);
}
} }
private void processFormXObject( private void processFormXObject(
@ -2380,12 +2265,8 @@ public class RedactionService {
private static class TokenModificationResult { private static class TokenModificationResult {
@Getter private final boolean success; @Getter private final boolean success;
@SuppressWarnings("unused")
private final String errorMessage;
private TokenModificationResult(boolean success, String errorMessage) { private TokenModificationResult(boolean success, String errorMessage) {
this.success = success; this.success = success;
this.errorMessage = errorMessage;
} }
public static TokenModificationResult success() { public static TokenModificationResult success() {
@ -2440,10 +2321,10 @@ public class RedactionService {
} }
@Data @Data
private static class DecodedMapping { public static class DecodedMapping {
String text; public String text;
int[] charByteStart; public int[] charByteStart;
int[] charByteEnd; public int[] charByteEnd;
} }
@Data @Data

View File

@ -4,14 +4,19 @@ import java.nio.ByteBuffer;
import java.nio.CharBuffer; import java.nio.CharBuffer;
import java.nio.charset.CharsetDecoder; import java.nio.charset.CharsetDecoder;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSString; import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.font.PDType0Font; import org.apache.pdfbox.pdmodel.font.*;
import lombok.experimental.UtilityClass; import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.service.RedactionService;
@Slf4j @Slf4j
@UtilityClass @UtilityClass
public class TextDecodingHelper { public class TextDecodingHelper {
@ -21,6 +26,38 @@ public class TextDecodingHelper {
private final int EXTENDED_ASCII_LOWER_BOUND = 160; private final int EXTENDED_ASCII_LOWER_BOUND = 160;
private final int EXTENDED_ASCII_UPPER_BOUND = 255; private final int EXTENDED_ASCII_UPPER_BOUND = 255;
/**
 * Resolves {@code fontName} from {@code resources}, returning {@code null} instead of
 * throwing when the font is unusable.
 *
 * <p>The font is accepted only if it resolves and reports a non-blank name. Missing inputs,
 * an unresolved font, a blank name, or any exception during lookup all produce
 * {@code null}; the exceptional and blank-name cases are logged at debug level.
 *
 * @param resources the resource dictionary to search; may be {@code null}
 * @param fontName the resource name of the font; may be {@code null}
 * @return the usable font, or {@code null}
 */
public PDFont getFontSafely(PDResources resources, COSName fontName) {
    boolean missingInput = resources == null || fontName == null;
    if (missingInput) {
        return null;
    }

    PDFont usable = null;
    try {
        PDFont font = resources.getFont(fontName);
        if (font != null) {
            try {
                String declaredName = font.getName();
                if (declaredName != null && !declaredName.trim().isEmpty()) {
                    usable = font;
                } else {
                    log.debug("Font {} has null or empty name, skipping", fontName.getName());
                }
            } catch (Exception e) {
                log.debug(
                        "Error accessing font name for {}, skipping: {}",
                        fontName.getName(),
                        e.getMessage());
            }
        }
    } catch (Exception e) {
        log.debug("Error retrieving font {}: {}", fontName.getName(), e.getMessage());
    }
    return usable;
}
public void tryDecodeWithFontEnhanced(PDFont font, COSString cosString) { public void tryDecodeWithFontEnhanced(PDFont font, COSString cosString) {
if (font == null || cosString == null) { if (font == null || cosString == null) {
return; return;
@ -229,4 +266,221 @@ public class TextDecodingHelper {
return null; return null;
} }
} }
/**
 * Decodes {@code bytes} with {@code font} and records, for every decoded character, the
 * byte span of the character code that produced it.
 *
 * <p>Decoding is dispatched on the concrete font class (Type0, Type1, Type3, TrueType,
 * otherwise generic); each branch reports the decoded text and how many bytes the code
 * occupied. A reported length that is non-positive or runs past the end of {@code bytes}
 * is reset to one byte. Codes that decode to nothing go through
 * {@code handleUndecodableChar}. Entry {@code k} of {@code charByteStart} /
 * {@code charByteEnd} is the half-open byte range behind character {@code k} of
 * {@code text}; a code that decodes to several characters repeats its range for each.
 *
 * @param font the font used for decoding; {@code null} yields an empty mapping
 * @param bytes the raw string bytes; {@code null} yields an empty mapping
 * @return the decoded text together with its per-character byte ranges
 */
public static RedactionService.DecodedMapping buildDecodeMapping(PDFont font, byte[] bytes) {
    RedactionService.DecodedMapping map = new RedactionService.DecodedMapping();
    if (font == null || bytes == null) {
        map.text = "";
        map.charByteStart = new int[0];
        map.charByteEnd = new int[0];
        return map;
    }

    StringBuilder sb = new StringBuilder();
    List<Integer> starts = new ArrayList<>();
    List<Integer> ends = new ArrayList<>();
    int i = 0;

    while (i < bytes.length) {
        int start = i;
        String decodedChar = null;
        int consumed = 1;

        try {
            if (font instanceof PDType0Font type0) {
                // CID fonts and multi-byte encodings.
                decodedChar = decodeType0Font(type0, bytes, i);
                consumed = getType0CharLength(type0, bytes, i);
            } else if (font instanceof PDType1Font type1) {
                decodedChar = decodeType1Font(type1, bytes, i);
                consumed = getType1CharLength(type1, bytes, i);
            } else if (font instanceof PDType3Font type3) {
                decodedChar = decodeType3Font(type3, bytes, i);
                consumed = 1; // Type3 codes are treated as single-byte here.
            } else if (font instanceof PDTrueTypeFont trueType) {
                decodedChar = decodeTrueTypeFont(trueType, bytes, i);
                consumed = getTrueTypeCharLength(trueType, bytes, i);
            } else {
                // Generic fallback for any other font class.
                decodedChar = decodeGenericFont(font, bytes, i);
                consumed = getGenericCharLength(font, bytes, i);
            }

            // Never trust a length that would stall the loop or overrun the buffer.
            if (consumed <= 0 || i + consumed > bytes.length) {
                consumed = 1;
            }
        } catch (Exception e) {
            // Route diagnostics through the class logger rather than stderr.
            log.debug("Error decoding character at position {}: {}", i, e.getMessage());
            decodedChar = null;
            consumed = 1;
        }

        if (decodedChar == null || decodedChar.isEmpty()) {
            decodedChar = handleUndecodableChar(bytes, i, consumed);
        }

        int end = i + consumed;

        // One mapping entry per UTF-16 unit, all pointing at the same source bytes.
        for (int k = 0; k < decodedChar.length(); k++) {
            sb.append(decodedChar.charAt(k));
            starts.add(start);
            ends.add(end);
        }
        i += consumed;
    }

    map.text = sb.toString();
    map.charByteStart = starts.stream().mapToInt(Integer::intValue).toArray();
    map.charByteEnd = ends.stream().mapToInt(Integer::intValue).toArray();
    return map;
}
/**
 * Decodes the character code at {@code position} for a Type0 font.
 *
 * <p>When a second byte is available the big-endian two-byte code is tried first and its
 * mapping is returned if it is non-empty; otherwise the single byte at {@code position}
 * is decoded.
 *
 * @return the font's Unicode mapping, or {@code null} when decoding throws or the
 *     single-byte lookup has no mapping
 */
private static String decodeType0Font(PDType0Font font, byte[] bytes, int position) {
    try {
        boolean hasSecondByte = position + 1 < bytes.length;
        if (hasSecondByte) {
            int twoByteCode = ((bytes[position] & 0xFF) << 8) | (bytes[position + 1] & 0xFF);
            String wide = font.toUnicode(twoByteCode);
            if (wide != null && !wide.isEmpty()) {
                return wide;
            }
        }
        return font.toUnicode(bytes[position] & 0xFF);
    } catch (Exception ignored) {
        return null;
    }
}
/**
 * Reports how many bytes the character code at {@code position} occupies for a Type0 font.
 *
 * @return {@code 2} when a second byte exists and the big-endian two-byte code has a
 *     non-empty Unicode mapping; {@code 1} otherwise, including when the lookup throws
 */
private static int getType0CharLength(PDType0Font font, byte[] bytes, int position) {
    try {
        if (position + 1 >= bytes.length) {
            return 1;
        }
        int high = bytes[position] & 0xFF;
        int low = bytes[position + 1] & 0xFF;
        String mapped = font.toUnicode((high << 8) | low);
        boolean twoByteCodeMaps = mapped != null && !mapped.isEmpty();
        return twoByteCodeMaps ? 2 : 1;
    } catch (Exception ignored) {
        return 1;
    }
}
/**
 * Decodes the single byte at {@code position} as a character code of a Type1 font.
 *
 * @return the font's Unicode mapping for that code, or {@code null} when the lookup throws
 */
private static String decodeType1Font(PDType1Font font, byte[] bytes, int position) {
    try {
        return font.toUnicode(bytes[position] & 0xFF);
    } catch (Exception ignored) {
        return null;
    }
}
/**
 * Reports the byte length of a Type1 character code.
 *
 * @return always {@code 1}; the arguments are not inspected
 */
private static int getType1CharLength(PDType1Font font, byte[] bytes, int position) {
    // Every Type1 code is treated as one byte wide.
    return 1;
}
/**
 * Decodes the single byte at {@code position} as a character code of a Type3 font.
 *
 * @return the font's Unicode mapping for that code, or {@code null} when the lookup throws
 */
private static String decodeType3Font(PDType3Font font, byte[] bytes, int position) {
    try {
        return font.toUnicode(bytes[position] & 0xFF);
    } catch (Exception ignored) {
        return null;
    }
}
/**
 * Decodes the character code at {@code position} for a TrueType font.
 *
 * <p>The single byte is tried first. Only when that yields no text and a second byte is
 * available is the big-endian two-byte code looked up, and its result returned as-is.
 *
 * @return the decoded text, possibly {@code null} or empty when the font has no mapping;
 *     {@code null} when the lookup throws
 */
private static String decodeTrueTypeFont(PDTrueTypeFont font, byte[] bytes, int position) {
    try {
        final int first = bytes[position] & 0xFF;
        String single = font.toUnicode(first);

        boolean singleByteDecoded = single != null && !single.isEmpty();
        if (singleByteDecoded || position + 1 >= bytes.length) {
            return single;
        }

        int second = bytes[position + 1] & 0xFF;
        return font.toUnicode((first << 8) | second);
    } catch (Exception ignored) {
        return null;
    }
}
/**
 * Reports how many bytes the character code at {@code position} occupies for a TrueType
 * font.
 *
 * @return {@code 2} only when the single byte has no non-empty mapping, a second byte
 *     exists, and the big-endian two-byte code maps to non-empty text; {@code 1} in every
 *     other case, including when the lookup throws
 */
private static int getTrueTypeCharLength(PDTrueTypeFont font, byte[] bytes, int position) {
    try {
        final int first = bytes[position] & 0xFF;
        String single = font.toUnicode(first);
        if (single != null && !single.isEmpty()) {
            return 1;
        }
        if (position + 1 >= bytes.length) {
            return 1;
        }
        String pair = font.toUnicode((first << 8) | (bytes[position + 1] & 0xFF));
        boolean pairDecoded = pair != null && !pair.isEmpty();
        return pairDecoded ? 2 : 1;
    } catch (Exception ignored) {
        return 1;
    }
}
/**
 * Decodes the single byte at {@code position} with a font of no specifically handled class.
 *
 * @return the font's Unicode mapping for that code, or {@code null} when the lookup throws
 */
private static String decodeGenericFont(PDFont font, byte[] bytes, int position) {
    try {
        return font.toUnicode(bytes[position] & 0xFF);
    } catch (Exception ignored) {
        return null;
    }
}
/**
 * Reports the byte length of a character code for a font of no specifically handled class.
 *
 * @return always {@code 1}; the arguments are not inspected
 */
private static int getGenericCharLength(PDFont font, byte[] bytes, int position) {
    // Unknown font classes default to one byte per code.
    return 1;
}
/**
 * Produces stand-in text for a character code the font could not decode.
 *
 * <p>The {@code length} bytes starting at {@code position} are interpreted as ISO-8859-1
 * (Latin-1). If that yields anything other than whitespace/control characters it is
 * returned; otherwise, or when the byte range is invalid, the Unicode replacement
 * character U+FFFD is returned.
 *
 * @param bytes the raw string bytes
 * @param position offset of the undecodable code
 * @param length number of bytes the code occupies
 * @return the Latin-1 interpretation, or the single character U+FFFD; never {@code null}
 */
private static String handleUndecodableChar(byte[] bytes, int position, int length) {
    try {
        String fallback = new String(bytes, position, length, StandardCharsets.ISO_8859_1);
        if (!fallback.trim().isEmpty()) {
            return fallback;
        }
    } catch (RuntimeException ignored) {
        // Invalid range or null input: fall through to the replacement character.
    }
    // Written as an escape so the literal cannot be corrupted by a file-encoding mishap.
    return "\uFFFD";
}
} }