Refactor RedactionService and add TextDecodingHelper class

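- Updated RedactionService to delegate font-based string decoding to TextDecodingHelper, removing the local decoding helpers and unused modification counters, and adjusted indentation and formatting for readability.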
- Introduced a new TextDecodingHelper class to enhance text decoding capabilities for PDF documents.
- Implemented methods for decoding characters with improved handling of various font types and encodings.
- Added fallback mechanisms for character mapping to ensure better text extraction from PDFs.

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
Balázs Szücs 2025-08-20 22:35:33 +02:00
parent a5a1a6218c
commit f9d2d9bbe5
2 changed files with 242 additions and 117 deletions

View File

@ -46,6 +46,7 @@ import stirling.software.SPDF.model.PDFText;
import stirling.software.SPDF.model.api.security.ManualRedactPdfRequest; import stirling.software.SPDF.model.api.security.ManualRedactPdfRequest;
import stirling.software.SPDF.model.api.security.RedactPdfRequest; import stirling.software.SPDF.model.api.security.RedactPdfRequest;
import stirling.software.SPDF.pdf.TextFinder; import stirling.software.SPDF.pdf.TextFinder;
import stirling.software.SPDF.utils.text.TextDecodingHelper;
import stirling.software.SPDF.utils.text.TextEncodingHelper; import stirling.software.SPDF.utils.text.TextEncodingHelper;
import stirling.software.SPDF.utils.text.TextFinderUtils; import stirling.software.SPDF.utils.text.TextFinderUtils;
import stirling.software.SPDF.utils.text.WidthCalculator; import stirling.software.SPDF.utils.text.WidthCalculator;
@ -337,48 +338,7 @@ public class RedactionService {
} }
} }
private static String tryDecodeWithFontEnhanced(PDFont font, COSString cosString) { // Local decoding helpers removed in favor of TextDecodingHelper
try {
if (font == null || cosString == null) {
return null;
}
byte[] bytes = cosString.getBytes();
if (bytes.length == 0) {
return "";
}
String basicDecoded = tryDecodeWithFont(font, cosString);
if (basicDecoded != null && !basicDecoded.contains("?")) {
return basicDecoded;
}
StringBuilder out = new StringBuilder();
for (byte aByte : bytes) {
int code = aByte & 0xFF;
String charStr = null;
try {
charStr = font.toUnicode(code);
} catch (Exception ignored) {
}
if (charStr == null && font.getName() != null && font.getName().contains("+")) {
charStr = mapSubsetCharacter(code);
}
out.append(charStr != null ? charStr : "");
}
return out.toString();
} catch (Exception e) {
return tryDecodeWithFont(font, cosString);
}
}
private static String mapSubsetCharacter(int code) {
if (code >= 32 && code <= 126) {
return String.valueOf((char) code);
}
if (code >= 160 && code <= 255) {
return String.valueOf((char) (code - 128));
}
return null;
}
private static String normalizeForFuzzy(String s) { private static String normalizeForFuzzy(String s) {
if (s == null) { if (s == null) {
@ -632,71 +592,6 @@ public class RedactionService {
return text.length() * 500f; return text.length() * 500f;
} }
private static String tryDecodeWithFont(PDFont font, COSString cosString) {
try {
if (font == null || cosString == null) {
return null;
}
byte[] bytes = cosString.getBytes();
if (bytes.length == 0) {
return "";
}
boolean anyMapped = false;
StringBuilder out = new StringBuilder();
for (byte b : bytes) {
int code = b & 0xFF;
String uni = null;
try {
uni = font.toUnicode(code);
} catch (Exception ignored) {
}
if (uni != null) {
out.append(uni);
anyMapped = true;
} else {
out.append('?');
}
}
if (anyMapped) {
return out.toString();
}
out.setLength(0);
anyMapped = false;
for (int i = 0; i < bytes.length; ) {
int b1 = bytes[i] & 0xFF;
String u1 = null;
try {
u1 = font.toUnicode(b1);
} catch (Exception ignored) {
}
if (i + 1 < bytes.length) {
int b2 = bytes[i + 1] & 0xFF;
int code = (b1 << 8) | b2;
String u2 = null;
try {
u2 = font.toUnicode(code);
} catch (Exception ignored) {
}
if (u2 != null) {
out.append(u2);
i += 2;
anyMapped = true;
continue;
}
}
if (u1 != null) {
out.append(u1);
} else {
out.append('?');
}
i += 1;
}
return anyMapped ? out.toString() : null;
} catch (Exception e) {
return null;
}
}
private static WipeResult wipeAllTextShowingOperators(List<Object> tokens) { private static WipeResult wipeAllTextShowingOperators(List<Object> tokens) {
List<Object> newTokens = new ArrayList<>(tokens); List<Object> newTokens = new ArrayList<>(tokens);
int modifications = 0; int modifications = 0;
@ -1062,7 +957,7 @@ public class RedactionService {
if (aggressive if (aggressive
&& gs.font != null && gs.font != null
&& tokens.get(i - 1) instanceof COSString cs) { && tokens.get(i - 1) instanceof COSString cs) {
tryDecodeWithFontEnhanced(gs.font, cs); TextDecodingHelper.tryDecodeWithFontEnhanced(gs.font, cs);
} }
segments.add( segments.add(
new TextSegment( new TextSegment(
@ -1175,12 +1070,12 @@ public class RedactionService {
|| "'".equals(seg.getOperatorName()) || "'".equals(seg.getOperatorName())
|| "\"".equals(seg.getOperatorName())) || "\"".equals(seg.getOperatorName()))
&& tok instanceof COSString cs) { && tok instanceof COSString cs) {
decoded = tryDecodeWithFont(seg.getFont(), cs); decoded = TextDecodingHelper.tryDecodeWithFont(seg.getFont(), cs);
} else if ("TJ".equals(seg.getOperatorName()) && tok instanceof COSArray arr) { } else if ("TJ".equals(seg.getOperatorName()) && tok instanceof COSArray arr) {
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
for (COSBase el : arr) { for (COSBase el : arr) {
if (el instanceof COSString s) { if (el instanceof COSString s) {
String d = tryDecodeWithFont(seg.getFont(), s); String d = TextDecodingHelper.tryDecodeWithFont(seg.getFont(), s);
sb.append(d != null ? d : s.getString()); sb.append(d != null ? d : s.getString());
} }
} }
@ -1272,12 +1167,12 @@ public class RedactionService {
Object tok = tokens.get(seg.getTokenIndex()); Object tok = tokens.get(seg.getTokenIndex());
if (("Tj".equals(seg.getOperatorName()) || "'".equals(seg.getOperatorName())) if (("Tj".equals(seg.getOperatorName()) || "'".equals(seg.getOperatorName()))
&& tok instanceof COSString cs) { && tok instanceof COSString cs) {
decoded = tryDecodeWithFont(seg.getFont(), cs); decoded = TextDecodingHelper.tryDecodeWithFont(seg.getFont(), cs);
} else if ("TJ".equals(seg.getOperatorName()) && tok instanceof COSArray arr) { } else if ("TJ".equals(seg.getOperatorName()) && tok instanceof COSArray arr) {
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
for (COSBase el : arr) { for (COSBase el : arr) {
if (el instanceof COSString s) { if (el instanceof COSString s) {
String d = tryDecodeWithFont(seg.getFont(), s); String d = TextDecodingHelper.tryDecodeWithFont(seg.getFont(), s);
sb.append(d != null ? d : s.getString()); sb.append(d != null ? d : s.getString());
} }
} }
@ -1715,7 +1610,7 @@ public class RedactionService {
} }
private int wipeAllTextInResources(PDDocument document, PDResources resources) { private int wipeAllTextInResources(PDDocument document, PDResources resources) {
int totalMods = 0; int totalMods = 0; // aggregated but currently not returned to caller
try { try {
totalMods += wipeAllSemanticTextInProperties(resources); totalMods += wipeAllSemanticTextInProperties(resources);
for (COSName xobjName : resources.getXObjectNames()) { for (COSName xobjName : resources.getXObjectNames()) {
@ -1776,7 +1671,6 @@ public class RedactionService {
} }
private void wipeAllTextInPatterns(PDDocument document, PDResources resources) { private void wipeAllTextInPatterns(PDDocument document, PDResources resources) {
int totalMods = 0;
try { try {
for (COSName patName : resources.getPatternNames()) { for (COSName patName : resources.getPatternNames()) {
try { try {
@ -1786,7 +1680,7 @@ public class RedactionService {
org.apache.pdfbox.pdmodel.graphics.pattern.PDTilingPattern tiling) { org.apache.pdfbox.pdmodel.graphics.pattern.PDTilingPattern tiling) {
PDResources patRes = tiling.getResources(); PDResources patRes = tiling.getResources();
if (patRes != null) { if (patRes != null) {
totalMods += wipeAllTextInResources(document, patRes); wipeAllTextInResources(document, patRes);
} }
PDFStreamParser parser = new PDFStreamParser(tiling); PDFStreamParser parser = new PDFStreamParser(tiling);
List<Object> tokens = new ArrayList<>(); List<Object> tokens = new ArrayList<>();
@ -1795,9 +1689,7 @@ public class RedactionService {
tokens.add(token); tokens.add(token);
} }
WipeResult wrText = wipeAllTextShowingOperators(tokens); WipeResult wrText = wipeAllTextShowingOperators(tokens);
totalMods += wrText.modifications;
WipeResult wrSem = wipeAllSemanticTextInTokens(wrText.tokens); WipeResult wrSem = wipeAllSemanticTextInTokens(wrText.tokens);
totalMods += wrSem.modifications;
if (wrText.modifications > 0 || wrSem.modifications > 0) { if (wrText.modifications > 0 || wrSem.modifications > 0) {
writeRedactedContentToPattern(tiling, wrSem.tokens); writeRedactedContentToPattern(tiling, wrSem.tokens);
} }
@ -1809,6 +1701,7 @@ public class RedactionService {
} }
} }
@SuppressWarnings("unused")
private int wipeAllTextInAnnotations(PDDocument document, PDPage page) { private int wipeAllTextInAnnotations(PDDocument document, PDPage page) {
int totalMods = 0; int totalMods = 0;
try { try {

View File

@ -0,0 +1,232 @@
package stirling.software.SPDF.utils.text;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType0Font;
import lombok.extern.slf4j.Slf4j;
/**
 * Best-effort decoding of raw PDF string operands into Unicode text.
 *
 * <p>All methods are static and side-effect free apart from logging. They never throw for
 * undecodable input: an unmapped character code is rendered as {@code '?'} and a string in which
 * nothing could be mapped is reported as {@code null}.
 */
@Slf4j
public final class TextDecodingHelper {

    private static final int ASCII_LOWER_BOUND = 32;
    private static final int ASCII_UPPER_BOUND = 126;
    private static final int EXTENDED_ASCII_LOWER_BOUND = 160;
    private static final int EXTENDED_ASCII_UPPER_BOUND = 255;

    /** Largest value a one-byte character code can take. */
    private static final int SINGLE_BYTE_MAX = 0xFF;

    /** Marker appended in place of a character code that could not be mapped. */
    private static final char PLACEHOLDER = '?';

    private TextDecodingHelper() {
        // static utility class; not meant to be instantiated
    }

    /**
     * Decodes {@code cosString} with {@code font}, falling back to heuristic per-character
     * decoding when the plain font lookup leaves gaps.
     *
     * <p>The plain result of {@link #tryDecodeWithFont} is used when it is non-blank and contains
     * no {@code '?'} placeholder. Otherwise {@link #decodeCharactersEnhanced} is tried, and the
     * plain result is returned only if the enhanced pass maps nothing at all.
     *
     * @param font font that was active when the string was shown; may be {@code null}
     * @param cosString raw string operand; may be {@code null}
     * @return the decoded text, {@code ""} for an empty operand, or {@code null} when either
     *     argument is {@code null} or nothing could be decoded
     */
    public static String tryDecodeWithFontEnhanced(PDFont font, COSString cosString) {
        if (font == null || cosString == null) {
            return null;
        }
        try {
            byte[] bytes = cosString.getBytes();
            if (bytes.length == 0) {
                return "";
            }

            String basicDecoded = tryDecodeWithFont(font, cosString);
            if (basicDecoded != null
                    && basicDecoded.indexOf(PLACEHOLDER) < 0
                    && !basicDecoded.trim().isEmpty()) {
                return basicDecoded;
            }

            String enhanced = decodeCharactersEnhanced(font, bytes);
            return enhanced != null ? enhanced : basicDecoded;
        } catch (Exception e) {
            log.error("Decoding failed: {}", e.getMessage(), e);
            // tryDecodeWithFont handles lookup failures internally, so it is safe to call here.
            return tryDecodeWithFont(font, cosString);
        }
    }

    /**
     * Decodes {@code bytes} one character code at a time using the font plus heuristics.
     *
     * <p>For composite ({@link PDType0Font}) fonts the two-byte code starting at the current
     * position is looked up first. Otherwise the single byte is decoded through
     * {@link #decodeSingleCharacter}; if that fails and the byte is {@code >= 128}, the byte is
     * combined with its successor and decoded as one two-byte code.
     *
     * @param font font used for the lookups; may be {@code null}
     * @param bytes raw bytes of the string operand; may be {@code null}
     * @return the decoded text with {@code '?'} for unmapped codes, or {@code null} when no code
     *     could be mapped or an argument is {@code null}
     */
    public static String decodeCharactersEnhanced(PDFont font, byte[] bytes) {
        if (font == null || bytes == null) {
            return null;
        }

        boolean composite = font instanceof PDType0Font;
        StringBuilder out = new StringBuilder(bytes.length);
        boolean hasValidCharacters = false;

        int i = 0;
        while (i < bytes.length) {
            int code = bytes[i] & 0xFF;
            boolean hasNext = i + 1 < bytes.length;
            int pairCode = hasNext ? (code << 8) | (bytes[i + 1] & 0xFF) : -1;

            // Composite fonts normally use two-byte codes, so try the pair at this position
            // before treating the leading byte as a character of its own.
            if (composite && hasNext) {
                String pair = toUnicodeOrNull(font, pairCode);
                if (hasText(pair)) {
                    out.append(pair);
                    hasValidCharacters = true;
                    i += 2;
                    continue;
                }
            }

            String charStr = decodeSingleCharacter(font, code, bytes);

            // Heuristic for multi-byte: an unmapped high byte may be the lead byte of a pair.
            if (charStr == null && code >= 128 && hasNext) {
                String pair = decodeSingleCharacter(font, pairCode, bytes);
                if (hasText(pair)) {
                    out.append(pair);
                    hasValidCharacters = true;
                    i += 2; // the successor byte was consumed as part of the pair
                    continue;
                }
            }

            if (hasText(charStr)) {
                out.append(charStr);
                hasValidCharacters = true;
            } else {
                out.append(PLACEHOLDER);
            }
            i++;
        }

        return hasValidCharacters ? out.toString() : null;
    }

    /**
     * Maps one character code to Unicode, trying progressively weaker strategies.
     *
     * <ol>
     *   <li>{@code font.toUnicode(code)}; for composite fonts this call already dispatches to
     *       the {@link PDType0Font} implementation, so no separate CID lookup is needed;
     *   <li>{@link #mapSubsetCharacter} when the font name contains {@code '+'};
     *   <li>{@link #fallbackCharacterMapping}.
     * </ol>
     *
     * @param font font used for the lookup; may be {@code null}
     * @param code character code to map (one- or two-byte value)
     * @param bytes raw bytes of the whole string operand; only forwarded to
     *     {@link #fallbackCharacterMapping}
     * @return the mapped text, or {@code null} when every strategy failed
     */
    public static String decodeSingleCharacter(PDFont font, int code, byte[] bytes) {
        String charStr = toUnicodeOrNull(font, code);

        if (charStr == null && font != null) {
            String fontName = font.getName();
            if (fontName != null && fontName.contains("+")) {
                charStr = mapSubsetCharacter(code);
            }
        }

        if (charStr == null) {
            charStr = fallbackCharacterMapping(code, bytes, font);
        }
        return charStr;
    }

    /**
     * Last-resort guess for a code the font could not map.
     *
     * <p>No guess is made for a composite font whose operand is longer than one byte. Otherwise
     * codes 32-126 and 160-255 are returned as the character with the same value, and a two-byte
     * code is interpreted as a single UTF-16BE code unit when that unit is a defined,
     * non-surrogate, non-control character (this covers the CJK ideograph range).
     *
     * @param code character code to map
     * @param bytes raw bytes of the whole string operand; may be {@code null}
     * @param font font the code belongs to; may be {@code null}
     * @return a one-character string, or {@code null} when no guess applies
     */
    public static String fallbackCharacterMapping(int code, byte[] bytes, PDFont font) {
        if (font instanceof PDType0Font && bytes != null && bytes.length > 1) {
            return null;
        }

        if (code >= ASCII_LOWER_BOUND && code <= ASCII_UPPER_BOUND) {
            return String.valueOf((char) code);
        }
        if (code >= EXTENDED_ASCII_LOWER_BOUND && code <= EXTENDED_ASCII_UPPER_BOUND) {
            return String.valueOf((char) code);
        }

        // Only the code itself is decoded here. Decoding the whole operand would hand back the
        // entire string as the mapping of a single character.
        if (code > SINGLE_BYTE_MAX && code <= Character.MAX_VALUE) {
            char unit = (char) code;
            if (Character.isDefined(unit)
                    && !Character.isSurrogate(unit)
                    && !Character.isISOControl(unit)) {
                return String.valueOf(unit);
            }
        }
        return null;
    }

    /**
     * Heuristic mapping applied to fonts whose name contains {@code '+'}.
     *
     * @param code character code to map
     * @return the character with value {@code code} for 32-126, the character with value
     *     {@code code - 128} for 160-255, otherwise {@code null}
     */
    public static String mapSubsetCharacter(int code) {
        if (code >= ASCII_LOWER_BOUND && code <= ASCII_UPPER_BOUND) {
            return String.valueOf((char) code);
        }
        if (code >= EXTENDED_ASCII_LOWER_BOUND && code <= EXTENDED_ASCII_UPPER_BOUND) {
            return String.valueOf((char) (code - 128));
        }
        return null;
    }

    /**
     * Decodes {@code cosString} using only the font's own {@code toUnicode} mapping.
     *
     * <p>The bytes are first read as one-byte codes. If at least one maps, that result is
     * returned with {@code '?'} for the rest. Only when no single byte maps are the bytes re-read
     * as two-byte codes.
     *
     * @param font font used for the lookups; may be {@code null}
     * @param cosString raw string operand; may be {@code null}
     * @return the decoded text, {@code ""} for an empty operand, or {@code null} when an argument
     *     is {@code null} or no code could be mapped
     */
    public static String tryDecodeWithFont(PDFont font, COSString cosString) {
        if (font == null || cosString == null) {
            return null;
        }
        byte[] bytes = cosString.getBytes();
        if (bytes.length == 0) {
            return "";
        }

        String singleByte = decodeSingleByteCodes(font, bytes);
        if (singleByte != null) {
            return singleByte;
        }
        return decodeDoubleByteCodes(font, bytes);
    }

    /** Reads every byte as its own code; returns {@code null} when none of them maps. */
    private static String decodeSingleByteCodes(PDFont font, byte[] bytes) {
        StringBuilder out = new StringBuilder(bytes.length);
        boolean anyMapped = false;
        for (byte b : bytes) {
            String uni = toUnicodeOrNull(font, b & 0xFF);
            if (uni != null) {
                out.append(uni);
                anyMapped = true;
            } else {
                out.append(PLACEHOLDER);
            }
        }
        return anyMapped ? out.toString() : null;
    }

    /**
     * Reads the bytes as two-byte codes; returns {@code null} when no pair maps.
     *
     * <p>Only called after {@link #decodeSingleByteCodes} found no mapping, so a byte that is not
     * part of a mapped pair is known to be unmapped and becomes a placeholder directly.
     */
    private static String decodeDoubleByteCodes(PDFont font, byte[] bytes) {
        StringBuilder out = new StringBuilder(bytes.length);
        boolean anyMapped = false;
        int i = 0;
        while (i < bytes.length) {
            if (i + 1 < bytes.length) {
                int code = ((bytes[i] & 0xFF) << 8) | (bytes[i + 1] & 0xFF);
                String uni = toUnicodeOrNull(font, code);
                if (uni != null) {
                    out.append(uni);
                    anyMapped = true;
                    i += 2;
                    continue;
                }
            }
            out.append(PLACEHOLDER);
            i++;
        }
        return anyMapped ? out.toString() : null;
    }

    /** Returns {@code font.toUnicode(code)}, or {@code null} if the font is null or the lookup fails. */
    private static String toUnicodeOrNull(PDFont font, int code) {
        if (font == null) {
            return null;
        }
        try {
            return font.toUnicode(code);
        } catch (Exception ignored) {
            // A failed lookup is treated exactly like a missing mapping; callers substitute a
            // placeholder, so there is nothing useful to report per character.
            return null;
        }
    }

    /** True when {@code s} is neither {@code null} nor empty. */
    private static boolean hasText(String s) {
        return s != null && !s.isEmpty();
    }
}