mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-09-08 17:51:20 +02:00
Refactor VisualRedactionService and add TextDecodingHelper class
- Updated VisualRedactionService to improve code readability by adjusting indentation and formatting. - Introduced a new TextDecodingHelper class to enhance text decoding capabilities for PDF documents. - Implemented methods for decoding characters with improved handling of various font types and encodings. - Added fallback mechanisms for character mapping to ensure better text extraction from PDFs. Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
parent
a5a1a6218c
commit
f9d2d9bbe5
@ -46,6 +46,7 @@ import stirling.software.SPDF.model.PDFText;
|
||||
import stirling.software.SPDF.model.api.security.ManualRedactPdfRequest;
|
||||
import stirling.software.SPDF.model.api.security.RedactPdfRequest;
|
||||
import stirling.software.SPDF.pdf.TextFinder;
|
||||
import stirling.software.SPDF.utils.text.TextDecodingHelper;
|
||||
import stirling.software.SPDF.utils.text.TextEncodingHelper;
|
||||
import stirling.software.SPDF.utils.text.TextFinderUtils;
|
||||
import stirling.software.SPDF.utils.text.WidthCalculator;
|
||||
@ -337,48 +338,7 @@ public class RedactionService {
|
||||
}
|
||||
}
|
||||
|
||||
private static String tryDecodeWithFontEnhanced(PDFont font, COSString cosString) {
|
||||
try {
|
||||
if (font == null || cosString == null) {
|
||||
return null;
|
||||
}
|
||||
byte[] bytes = cosString.getBytes();
|
||||
if (bytes.length == 0) {
|
||||
return "";
|
||||
}
|
||||
String basicDecoded = tryDecodeWithFont(font, cosString);
|
||||
if (basicDecoded != null && !basicDecoded.contains("?")) {
|
||||
return basicDecoded;
|
||||
}
|
||||
StringBuilder out = new StringBuilder();
|
||||
for (byte aByte : bytes) {
|
||||
int code = aByte & 0xFF;
|
||||
String charStr = null;
|
||||
try {
|
||||
charStr = font.toUnicode(code);
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
if (charStr == null && font.getName() != null && font.getName().contains("+")) {
|
||||
charStr = mapSubsetCharacter(code);
|
||||
}
|
||||
|
||||
out.append(charStr != null ? charStr : "");
|
||||
}
|
||||
return out.toString();
|
||||
} catch (Exception e) {
|
||||
return tryDecodeWithFont(font, cosString);
|
||||
}
|
||||
}
|
||||
|
||||
private static String mapSubsetCharacter(int code) {
|
||||
if (code >= 32 && code <= 126) {
|
||||
return String.valueOf((char) code);
|
||||
}
|
||||
if (code >= 160 && code <= 255) {
|
||||
return String.valueOf((char) (code - 128));
|
||||
}
|
||||
return null;
|
||||
}
|
||||
// Local decoding helpers removed in favor of TextDecodingHelper
|
||||
|
||||
private static String normalizeForFuzzy(String s) {
|
||||
if (s == null) {
|
||||
@ -632,71 +592,6 @@ public class RedactionService {
|
||||
return text.length() * 500f;
|
||||
}
|
||||
|
||||
private static String tryDecodeWithFont(PDFont font, COSString cosString) {
|
||||
try {
|
||||
if (font == null || cosString == null) {
|
||||
return null;
|
||||
}
|
||||
byte[] bytes = cosString.getBytes();
|
||||
if (bytes.length == 0) {
|
||||
return "";
|
||||
}
|
||||
boolean anyMapped = false;
|
||||
StringBuilder out = new StringBuilder();
|
||||
for (byte b : bytes) {
|
||||
int code = b & 0xFF;
|
||||
String uni = null;
|
||||
try {
|
||||
uni = font.toUnicode(code);
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
if (uni != null) {
|
||||
out.append(uni);
|
||||
anyMapped = true;
|
||||
} else {
|
||||
out.append('?');
|
||||
}
|
||||
}
|
||||
if (anyMapped) {
|
||||
return out.toString();
|
||||
}
|
||||
out.setLength(0);
|
||||
anyMapped = false;
|
||||
for (int i = 0; i < bytes.length; ) {
|
||||
int b1 = bytes[i] & 0xFF;
|
||||
String u1 = null;
|
||||
try {
|
||||
u1 = font.toUnicode(b1);
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
if (i + 1 < bytes.length) {
|
||||
int b2 = bytes[i + 1] & 0xFF;
|
||||
int code = (b1 << 8) | b2;
|
||||
String u2 = null;
|
||||
try {
|
||||
u2 = font.toUnicode(code);
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
if (u2 != null) {
|
||||
out.append(u2);
|
||||
i += 2;
|
||||
anyMapped = true;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (u1 != null) {
|
||||
out.append(u1);
|
||||
} else {
|
||||
out.append('?');
|
||||
}
|
||||
i += 1;
|
||||
}
|
||||
return anyMapped ? out.toString() : null;
|
||||
} catch (Exception e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private static WipeResult wipeAllTextShowingOperators(List<Object> tokens) {
|
||||
List<Object> newTokens = new ArrayList<>(tokens);
|
||||
int modifications = 0;
|
||||
@ -1062,7 +957,7 @@ public class RedactionService {
|
||||
if (aggressive
|
||||
&& gs.font != null
|
||||
&& tokens.get(i - 1) instanceof COSString cs) {
|
||||
tryDecodeWithFontEnhanced(gs.font, cs);
|
||||
TextDecodingHelper.tryDecodeWithFontEnhanced(gs.font, cs);
|
||||
}
|
||||
segments.add(
|
||||
new TextSegment(
|
||||
@ -1175,12 +1070,12 @@ public class RedactionService {
|
||||
|| "'".equals(seg.getOperatorName())
|
||||
|| "\"".equals(seg.getOperatorName()))
|
||||
&& tok instanceof COSString cs) {
|
||||
decoded = tryDecodeWithFont(seg.getFont(), cs);
|
||||
decoded = TextDecodingHelper.tryDecodeWithFont(seg.getFont(), cs);
|
||||
} else if ("TJ".equals(seg.getOperatorName()) && tok instanceof COSArray arr) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (COSBase el : arr) {
|
||||
if (el instanceof COSString s) {
|
||||
String d = tryDecodeWithFont(seg.getFont(), s);
|
||||
String d = TextDecodingHelper.tryDecodeWithFont(seg.getFont(), s);
|
||||
sb.append(d != null ? d : s.getString());
|
||||
}
|
||||
}
|
||||
@ -1272,12 +1167,12 @@ public class RedactionService {
|
||||
Object tok = tokens.get(seg.getTokenIndex());
|
||||
if (("Tj".equals(seg.getOperatorName()) || "'".equals(seg.getOperatorName()))
|
||||
&& tok instanceof COSString cs) {
|
||||
decoded = tryDecodeWithFont(seg.getFont(), cs);
|
||||
decoded = TextDecodingHelper.tryDecodeWithFont(seg.getFont(), cs);
|
||||
} else if ("TJ".equals(seg.getOperatorName()) && tok instanceof COSArray arr) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (COSBase el : arr) {
|
||||
if (el instanceof COSString s) {
|
||||
String d = tryDecodeWithFont(seg.getFont(), s);
|
||||
String d = TextDecodingHelper.tryDecodeWithFont(seg.getFont(), s);
|
||||
sb.append(d != null ? d : s.getString());
|
||||
}
|
||||
}
|
||||
@ -1715,7 +1610,7 @@ public class RedactionService {
|
||||
}
|
||||
|
||||
private int wipeAllTextInResources(PDDocument document, PDResources resources) {
|
||||
int totalMods = 0;
|
||||
int totalMods = 0; // aggregated but currently not returned to caller
|
||||
try {
|
||||
totalMods += wipeAllSemanticTextInProperties(resources);
|
||||
for (COSName xobjName : resources.getXObjectNames()) {
|
||||
@ -1776,7 +1671,6 @@ public class RedactionService {
|
||||
}
|
||||
|
||||
private void wipeAllTextInPatterns(PDDocument document, PDResources resources) {
|
||||
int totalMods = 0;
|
||||
try {
|
||||
for (COSName patName : resources.getPatternNames()) {
|
||||
try {
|
||||
@ -1786,7 +1680,7 @@ public class RedactionService {
|
||||
org.apache.pdfbox.pdmodel.graphics.pattern.PDTilingPattern tiling) {
|
||||
PDResources patRes = tiling.getResources();
|
||||
if (patRes != null) {
|
||||
totalMods += wipeAllTextInResources(document, patRes);
|
||||
wipeAllTextInResources(document, patRes);
|
||||
}
|
||||
PDFStreamParser parser = new PDFStreamParser(tiling);
|
||||
List<Object> tokens = new ArrayList<>();
|
||||
@ -1795,9 +1689,7 @@ public class RedactionService {
|
||||
tokens.add(token);
|
||||
}
|
||||
WipeResult wrText = wipeAllTextShowingOperators(tokens);
|
||||
totalMods += wrText.modifications;
|
||||
WipeResult wrSem = wipeAllSemanticTextInTokens(wrText.tokens);
|
||||
totalMods += wrSem.modifications;
|
||||
if (wrText.modifications > 0 || wrSem.modifications > 0) {
|
||||
writeRedactedContentToPattern(tiling, wrSem.tokens);
|
||||
}
|
||||
@ -1809,6 +1701,7 @@ public class RedactionService {
|
||||
}
|
||||
}
|
||||
|
||||
@SuppressWarnings("unused")
|
||||
private int wipeAllTextInAnnotations(PDDocument document, PDPage page) {
|
||||
int totalMods = 0;
|
||||
try {
|
||||
|
@ -0,0 +1,232 @@
|
||||
package stirling.software.SPDF.utils.text;
|
||||
|
||||
import org.apache.pdfbox.cos.COSString;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType0Font;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
public class TextDecodingHelper {
|
||||
|
||||
private static final int ASCII_LOWER_BOUND = 32;
|
||||
private static final int ASCII_UPPER_BOUND = 126;
|
||||
private static final int EXTENDED_ASCII_LOWER_BOUND = 160;
|
||||
private static final int EXTENDED_ASCII_UPPER_BOUND = 255;
|
||||
|
||||
public static void tryDecodeWithFontEnhanced(PDFont font, COSString cosString) {
|
||||
if (font == null || cosString == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
byte[] bytes = cosString.getBytes();
|
||||
if (bytes.length == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
String basicDecoded = tryDecodeWithFont(font, cosString);
|
||||
if (basicDecoded != null
|
||||
&& !basicDecoded.contains("?")
|
||||
&& !basicDecoded.trim().isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
decodeCharactersEnhanced(font, bytes);
|
||||
|
||||
} catch (Exception e) {
|
||||
log.error("Decoding failed: {}", e.getMessage(), e);
|
||||
try {
|
||||
tryDecodeWithFont(font, cosString);
|
||||
} catch (Exception fallbackException) {
|
||||
// Ultimate fallback: return hex representation for analysis
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static String decodeCharactersEnhanced(PDFont font, byte[] bytes) {
|
||||
StringBuilder out = new StringBuilder();
|
||||
boolean hasValidCharacters = false;
|
||||
int i = 0;
|
||||
while (i < bytes.length) {
|
||||
int code = bytes[i] & 0xFF;
|
||||
String charStr = decodeSingleCharacter(font, code, bytes);
|
||||
|
||||
// Heuristic for multi-byte: if high byte, try combining with next
|
||||
if (charStr == null && code >= 128 && i + 1 < bytes.length) {
|
||||
int combinedCode = (code << 8) | (bytes[i + 1] & 0xFF);
|
||||
charStr = decodeSingleCharacter(font, combinedCode, bytes);
|
||||
if (charStr != null) {
|
||||
i += 2; // Skip the next byte
|
||||
out.append(charStr);
|
||||
hasValidCharacters = true;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (charStr != null && !charStr.isEmpty()) {
|
||||
out.append(charStr);
|
||||
hasValidCharacters = true;
|
||||
} else {
|
||||
out.append('?');
|
||||
}
|
||||
i++;
|
||||
}
|
||||
String result = out.toString();
|
||||
return hasValidCharacters ? result : null;
|
||||
}
|
||||
|
||||
public static String decodeSingleCharacter(PDFont font, int code, byte[] bytes) {
|
||||
String charStr = null;
|
||||
|
||||
try {
|
||||
charStr = font.toUnicode(code);
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
|
||||
// Enhanced CID Font and Composite Font Handling
|
||||
if (charStr == null
|
||||
&& font instanceof org.apache.pdfbox.pdmodel.font.PDType0Font type0Font) {
|
||||
try {
|
||||
// Attempt CID-specific decoding for multi-byte codes
|
||||
int cid = (bytes.length > 1) ? ((bytes[0] & 0xFF) << 8) | (bytes[1] & 0xFF) : code;
|
||||
charStr = type0Font.toUnicode(cid);
|
||||
log.debug("CID decoding successful for code {}: {}", cid, charStr);
|
||||
} catch (Exception e) {
|
||||
log.debug("CID decoding failed for code {}: {}", code, e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
if (charStr == null && font.getName() != null && font.getName().contains("+")) {
|
||||
charStr = mapSubsetCharacter(code);
|
||||
}
|
||||
|
||||
if (charStr == null) {
|
||||
charStr = fallbackCharacterMapping(code, bytes, font);
|
||||
}
|
||||
|
||||
return charStr;
|
||||
}
|
||||
|
||||
public static String fallbackCharacterMapping(int code, byte[] bytes, PDFont font) {
|
||||
try {
|
||||
if (font instanceof PDType0Font && bytes.length > 1) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (code >= ASCII_LOWER_BOUND && code <= ASCII_UPPER_BOUND) {
|
||||
return String.valueOf((char) code);
|
||||
}
|
||||
|
||||
if (code >= EXTENDED_ASCII_LOWER_BOUND && code <= EXTENDED_ASCII_UPPER_BOUND) {
|
||||
return String.valueOf((char) code);
|
||||
}
|
||||
|
||||
String fontName = font.getName();
|
||||
if (fontName != null) {
|
||||
String lowerName = fontName.toLowerCase();
|
||||
if (lowerName.contains("cjk")
|
||||
|| lowerName.contains("gb")
|
||||
|| lowerName.contains("jp")) {
|
||||
// Basic CJK fallback (expand with a lookup table if needed)
|
||||
if (code >= 0x4E00 && code <= 0x9FFF) {
|
||||
return String.valueOf(
|
||||
(char) code); // Unicode Basic Multilingual Plane for CJK
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback to UTF-8/16 decoding attempt for unknown encodings
|
||||
try {
|
||||
if (bytes.length >= 2) {
|
||||
java.nio.ByteBuffer buffer = java.nio.ByteBuffer.wrap(bytes);
|
||||
java.nio.charset.CharsetDecoder decoder =
|
||||
java.nio.charset.StandardCharsets.UTF_16BE.newDecoder();
|
||||
java.nio.CharBuffer charBuffer = decoder.decode(buffer);
|
||||
return charBuffer.toString();
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.debug("UTF fallback failed: {}", e.getMessage());
|
||||
}
|
||||
|
||||
return null;
|
||||
} catch (Exception e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public static String mapSubsetCharacter(int code) {
|
||||
if (code >= ASCII_LOWER_BOUND && code <= ASCII_UPPER_BOUND) {
|
||||
return String.valueOf((char) code);
|
||||
}
|
||||
if (code >= EXTENDED_ASCII_LOWER_BOUND && code <= EXTENDED_ASCII_UPPER_BOUND) {
|
||||
return String.valueOf((char) (code - 128));
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public static String tryDecodeWithFont(PDFont font, COSString cosString) {
|
||||
try {
|
||||
if (font == null || cosString == null) {
|
||||
return null;
|
||||
}
|
||||
byte[] bytes = cosString.getBytes();
|
||||
if (bytes.length == 0) {
|
||||
return "";
|
||||
}
|
||||
boolean anyMapped = false;
|
||||
StringBuilder out = new StringBuilder();
|
||||
for (byte b : bytes) {
|
||||
int code = b & 0xFF;
|
||||
String uni = null;
|
||||
try {
|
||||
uni = font.toUnicode(code);
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
if (uni != null) {
|
||||
out.append(uni);
|
||||
anyMapped = true;
|
||||
} else {
|
||||
out.append('?');
|
||||
}
|
||||
}
|
||||
if (anyMapped) {
|
||||
return out.toString();
|
||||
}
|
||||
out.setLength(0);
|
||||
anyMapped = false;
|
||||
for (int i = 0; i < bytes.length; ) {
|
||||
int b1 = bytes[i] & 0xFF;
|
||||
String u1 = null;
|
||||
try {
|
||||
u1 = font.toUnicode(b1);
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
if (i + 1 < bytes.length) {
|
||||
int b2 = bytes[i + 1] & 0xFF;
|
||||
int code = (b1 << 8) | b2;
|
||||
String u2 = null;
|
||||
try {
|
||||
u2 = font.toUnicode(code);
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
if (u2 != null) {
|
||||
out.append(u2);
|
||||
i += 2;
|
||||
anyMapped = true;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (u1 != null) {
|
||||
out.append(u1);
|
||||
} else {
|
||||
out.append('?');
|
||||
}
|
||||
i += 1;
|
||||
}
|
||||
return anyMapped ? out.toString() : null;
|
||||
} catch (Exception e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user