mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-09-08 17:51:20 +02:00
enhance text extraction with font support and improved error handling
Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
parent
5dc7358219
commit
0bbf1dd344
@ -999,7 +999,7 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (isTextShowingOperator(opName) && i > 0) {
|
if (isTextShowingOperator(opName) && i > 0) {
|
||||||
String textContent = extractTextFromToken(tokens.get(i - 1), opName);
|
String textContent = extractTextFromToken(tokens.get(i - 1), opName, gs.font);
|
||||||
if (!textContent.isEmpty()) {
|
if (!textContent.isEmpty()) {
|
||||||
if (aggressive
|
if (aggressive
|
||||||
&& gs.font != null
|
&& gs.font != null
|
||||||
@ -1045,7 +1045,7 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (isTextShowingOperator(opName) && i > 0) {
|
if (isTextShowingOperator(opName) && i > 0) {
|
||||||
String textContent = extractTextFromToken(tokens.get(i - 1), opName);
|
String textContent = extractTextFromToken(tokens.get(i - 1), opName, gs.font);
|
||||||
if (!textContent.isEmpty()) {
|
if (!textContent.isEmpty()) {
|
||||||
segments.add(
|
segments.add(
|
||||||
new TextSegment(
|
new TextSegment(
|
||||||
@ -1752,65 +1752,318 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private String extractTextFromToken(Object token, String operatorName) {
|
private String extractTextFromToken(Object token, String operatorName) {
|
||||||
return switch (operatorName) {
|
return extractTextFromToken(token, operatorName, null);
|
||||||
case "Tj", "'", "\"" -> {
|
|
||||||
if (token instanceof COSString cosString) {
|
|
||||||
yield cosString.getString();
|
|
||||||
}
|
|
||||||
yield "";
|
|
||||||
}
|
|
||||||
case "TJ" -> {
|
|
||||||
if (token instanceof COSArray cosArray) {
|
|
||||||
StringBuilder sb = new StringBuilder();
|
|
||||||
for (COSBase element : cosArray) {
|
|
||||||
if (element instanceof COSString cosString) {
|
|
||||||
sb.append(cosString.getString());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
yield sb.toString();
|
|
||||||
}
|
|
||||||
yield "";
|
|
||||||
}
|
|
||||||
default -> "";
|
|
||||||
};
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private WipeResult wipeAllSemanticTextInTokens(List<Object> tokens) {
|
private String extractTextFromToken(Object token, String operatorName, PDFont currentFont) {
|
||||||
List<Object> newTokens = new ArrayList<>(tokens);
|
if (token == null || operatorName == null) {
|
||||||
int modifications = 0;
|
return "";
|
||||||
for (int i = 0; i < newTokens.size(); i++) {
|
}
|
||||||
Object t = newTokens.get(i);
|
|
||||||
if (t instanceof Operator op) {
|
try {
|
||||||
String name = op.getName();
|
return switch (operatorName) {
|
||||||
if ("BDC".equals(name) && i > 0) {
|
case "Tj" -> handleTjOperator(token, currentFont);
|
||||||
Object maybeDict = newTokens.get(i - 1);
|
case "'" -> handleSingleQuoteOperator(token, currentFont);
|
||||||
if (maybeDict instanceof COSDictionary dict) {
|
case "\"" -> handleDoubleQuoteOperator(token, currentFont);
|
||||||
boolean changed = false;
|
case "TJ" -> handleTJOperator(token, currentFont);
|
||||||
if (dict.containsKey(COSName.getPDFName("ActualText"))) {
|
default -> "";
|
||||||
dict.removeItem(COSName.getPDFName("ActualText"));
|
};
|
||||||
changed = true;
|
} catch (Exception e) {
|
||||||
}
|
log.warn(
|
||||||
if (dict.containsKey(COSName.getPDFName("Alt"))) {
|
"Failed to extract text from token for operator {}: {}",
|
||||||
dict.removeItem(COSName.getPDFName("Alt"));
|
operatorName,
|
||||||
changed = true;
|
e.getMessage());
|
||||||
}
|
return "";
|
||||||
if (dict.containsKey(COSName.getPDFName("TU"))) {
|
}
|
||||||
dict.removeItem(COSName.getPDFName("TU"));
|
}
|
||||||
changed = true;
|
|
||||||
}
|
private String handleTjOperator(Object token, PDFont font) {
|
||||||
if (changed) {
|
if (token instanceof COSString cosString) {
|
||||||
modifications++;
|
return extractStringWithFallbacks(cosString, font);
|
||||||
}
|
}
|
||||||
}
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
|
private String handleSingleQuoteOperator(Object token, PDFont font) {
|
||||||
|
if (token instanceof COSString cosString) {
|
||||||
|
return "\n" + extractStringWithFallbacks(cosString, font);
|
||||||
|
}
|
||||||
|
return "\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
private String handleDoubleQuoteOperator(Object token, PDFont font) {
|
||||||
|
if (token instanceof COSString cosString) {
|
||||||
|
return "\n" + extractStringWithFallbacks(cosString, font);
|
||||||
|
}
|
||||||
|
return "\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
private String handleTJOperator(Object token, PDFont font) {
|
||||||
|
if (!(token instanceof COSArray cosArray)) {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
|
StringBuilder textBuilder = new StringBuilder();
|
||||||
|
|
||||||
|
for (COSBase element : cosArray) {
|
||||||
|
if (element instanceof COSString cosString) {
|
||||||
|
String extractedText = extractStringWithFallbacks(cosString, font);
|
||||||
|
textBuilder.append(extractedText);
|
||||||
|
|
||||||
|
} else if (element instanceof COSNumber cosNumber) {
|
||||||
|
double displacement = cosNumber.floatValue();
|
||||||
|
|
||||||
|
if (displacement < -100.0) {
|
||||||
|
textBuilder.append(" "); // Add space for significant gaps
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return textBuilder.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
private String extractStringWithFallbacks(COSString cosString, PDFont font) {
|
||||||
|
if (cosString == null) {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
String text = cosString.getString();
|
||||||
|
|
||||||
|
if (text != null && !text.trim().isEmpty() && !isGibberish(text)) {
|
||||||
|
return text;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback 1: Try enhanced font-based decoding if available
|
||||||
|
if (font != null) {
|
||||||
|
String fontBasedText = tryFontBasedExtraction(cosString, font);
|
||||||
|
if (fontBasedText != null && !isGibberish(fontBasedText)) {
|
||||||
|
log.debug("Used font-based fallback extraction");
|
||||||
|
return fontBasedText;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback 2: Try different encoding interpretations
|
||||||
|
String encodingFallback = tryEncodingFallbacks(cosString);
|
||||||
|
if (encodingFallback != null && !isGibberish(encodingFallback)) {
|
||||||
|
log.debug("Used encoding fallback extraction");
|
||||||
|
return encodingFallback;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback 3: Return original with sanitization
|
||||||
|
return sanitizeText(text != null ? text : "");
|
||||||
|
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.debug("All extraction methods failed for COSString: {}", e.getMessage());
|
||||||
|
return "\uFFFD"; // Unicode replacement character
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private String tryFontBasedExtraction(COSString cosString, PDFont font) {
|
||||||
|
try {
|
||||||
|
byte[] bytes = cosString.getBytes();
|
||||||
|
if (bytes.length == 0) return "";
|
||||||
|
|
||||||
|
StringBuilder result = new StringBuilder();
|
||||||
|
for (byte b : bytes) {
|
||||||
|
int code = b & 0xFF;
|
||||||
|
try {
|
||||||
|
String unicode = font.toUnicode(code);
|
||||||
|
if (unicode != null && !unicode.isEmpty()) {
|
||||||
|
result.append(unicode);
|
||||||
|
} else {
|
||||||
|
result.append("\uFFFD");
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
result.append("\uFFFD");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result.toString();
|
||||||
|
} catch (Exception e) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private String tryEncodingFallbacks(COSString cosString) {
|
||||||
|
try {
|
||||||
|
byte[] bytes = cosString.getBytes();
|
||||||
|
if (bytes.length == 0) return "";
|
||||||
|
|
||||||
|
String[] encodings = {"UTF-8", "UTF-16BE", "UTF-16LE", "ISO-8859-1", "Windows-1252"};
|
||||||
|
|
||||||
|
for (String encoding : encodings) {
|
||||||
|
try {
|
||||||
|
if (bytes.length >= 2) {
|
||||||
|
if ((bytes[0] & 0xFF) == 0xFE && (bytes[1] & 0xFF) == 0xFF) {
|
||||||
|
return new String(bytes, 2, bytes.length - 2, "UTF-16BE");
|
||||||
|
} else if ((bytes[0] & 0xFF) == 0xFF && (bytes[1] & 0xFF) == 0xFE) {
|
||||||
|
return new String(bytes, 2, bytes.length - 2, "UTF-16LE");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
String decoded = new String(bytes, encoding);
|
||||||
|
if (!isGibberish(decoded)) {
|
||||||
|
return decoded;
|
||||||
|
}
|
||||||
|
} catch (Exception ignored) {
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean isGibberish(String text) {
|
||||||
|
if (text == null || text.trim().isEmpty()) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
int questionMarks = 0;
|
||||||
|
int replacementChars = 0;
|
||||||
|
int totalChars = text.length();
|
||||||
|
|
||||||
|
for (char c : text.toCharArray()) {
|
||||||
|
if (c == '?') questionMarks++;
|
||||||
|
if (c == '\uFFFD') replacementChars++;
|
||||||
|
}
|
||||||
|
|
||||||
|
double problematicRatio = (double) (questionMarks + replacementChars) / totalChars;
|
||||||
|
return problematicRatio > 0.3;
|
||||||
|
}
|
||||||
|
|
||||||
|
private String sanitizeText(String text) {
|
||||||
|
if (text == null) return "";
|
||||||
|
|
||||||
|
StringBuilder sanitized = new StringBuilder();
|
||||||
|
for (char c : text.toCharArray()) {
|
||||||
|
if (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r') {
|
||||||
|
sanitized.append('\uFFFD');
|
||||||
|
} else {
|
||||||
|
sanitized.append(c);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sanitized.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
private WipeResult wipeAllSemanticTextInTokens(List<Object> tokens) {
|
||||||
|
return wipeAllSemanticTextInTokens(
|
||||||
|
tokens, true); // Default to removing TU for backward compatibility
|
||||||
|
}
|
||||||
|
|
||||||
|
private WipeResult wipeAllSemanticTextInTokens(List<Object> tokens, boolean removeTU) {
|
||||||
|
if (tokens == null || tokens.isEmpty()) {
|
||||||
|
log.warn("Empty or null token list; no modifications made");
|
||||||
|
WipeResult res = new WipeResult();
|
||||||
|
res.tokens = new ArrayList<>();
|
||||||
|
res.modifications = 0;
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
List<Object> newTokens = deepCopyTokens(tokens);
|
||||||
|
int modifications = processSemanticTokens(newTokens, removeTU);
|
||||||
|
|
||||||
WipeResult res = new WipeResult();
|
WipeResult res = new WipeResult();
|
||||||
res.tokens = newTokens;
|
res.tokens = newTokens;
|
||||||
res.modifications = modifications;
|
res.modifications = modifications;
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private int processSemanticTokens(List<Object> tokens, boolean removeTU) {
|
||||||
|
int modifications = 0;
|
||||||
|
java.util.Stack<Integer> markedContentStack =
|
||||||
|
new java.util.Stack<>(); // Track nesting for correctness
|
||||||
|
|
||||||
|
for (int i = 0; i < tokens.size(); i++) {
|
||||||
|
Object t = tokens.get(i);
|
||||||
|
if (t instanceof Operator op) {
|
||||||
|
String name = op.getName();
|
||||||
|
|
||||||
|
// Handle BDC (with dictionary) and BMC (without, for completeness)
|
||||||
|
if ("BDC".equals(name) || "BMC".equals(name)) {
|
||||||
|
markedContentStack.push(i); // Track start for nesting validation
|
||||||
|
|
||||||
|
if ("BDC".equals(name) && i > 0) {
|
||||||
|
Object prev = tokens.get(i - 1);
|
||||||
|
if (prev instanceof COSDictionary dict) {
|
||||||
|
boolean changed = removeSemanticProperties(dict, removeTU);
|
||||||
|
if (changed) {
|
||||||
|
modifications++;
|
||||||
|
log.debug(
|
||||||
|
"Removed semantic properties from dictionary at index {}",
|
||||||
|
i - 1);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
log.warn("BDC at index {} lacks preceding COSDictionary; skipping", i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if ("EMC".equals(name)) {
|
||||||
|
if (markedContentStack.isEmpty()) {
|
||||||
|
log.warn(
|
||||||
|
"Unmatched EMC at index {}; potential malformed content stream", i);
|
||||||
|
} else {
|
||||||
|
markedContentStack.pop(); // Validate pairing
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!markedContentStack.isEmpty()) {
|
||||||
|
log.warn(
|
||||||
|
"Unmatched marked content starts: {} (potential nesting issues)",
|
||||||
|
markedContentStack.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
return modifications;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean removeSemanticProperties(COSDictionary dict, boolean removeTU) {
|
||||||
|
boolean changed = false;
|
||||||
|
COSName actualText = COSName.getPDFName("ActualText");
|
||||||
|
COSName alt = COSName.getPDFName("Alt");
|
||||||
|
COSName tu = COSName.getPDFName("TU");
|
||||||
|
|
||||||
|
if (dict.containsKey(actualText)) {
|
||||||
|
dict.removeItem(actualText);
|
||||||
|
changed = true;
|
||||||
|
}
|
||||||
|
if (dict.containsKey(alt)) {
|
||||||
|
dict.removeItem(alt);
|
||||||
|
changed = true;
|
||||||
|
}
|
||||||
|
if (removeTU && dict.containsKey(tu)) {
|
||||||
|
dict.removeItem(tu);
|
||||||
|
changed = true;
|
||||||
|
log.info("Removed non-standard TU property (confirm if needed for your PDFs)");
|
||||||
|
}
|
||||||
|
|
||||||
|
return changed;
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<Object> deepCopyTokens(List<Object> original) {
|
||||||
|
List<Object> copy = new ArrayList<>(original.size());
|
||||||
|
for (Object obj : original) {
|
||||||
|
if (obj instanceof COSDictionary dict) {
|
||||||
|
COSDictionary newDict = new COSDictionary();
|
||||||
|
for (COSName key : dict.keySet()) {
|
||||||
|
newDict.setItem(key, dict.getDictionaryObject(key));
|
||||||
|
}
|
||||||
|
copy.add(newDict);
|
||||||
|
} else if (obj instanceof List<?> nestedList
|
||||||
|
&& !nestedList.isEmpty()
|
||||||
|
&& nestedList.get(0) instanceof Object) {
|
||||||
|
try {
|
||||||
|
List<Object> objectList = (List<Object>) nestedList;
|
||||||
|
copy.add(deepCopyTokens(objectList));
|
||||||
|
} catch (ClassCastException e) {
|
||||||
|
copy.add(obj); // Fallback to shallow copy if cast fails
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
copy.add(obj); // Shallow copy for primitives/operators
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return copy;
|
||||||
|
}
|
||||||
|
|
||||||
private int wipeAllTextInResources(PDDocument document, PDResources resources) {
|
private int wipeAllTextInResources(PDDocument document, PDResources resources) {
|
||||||
int totalMods = 0; // aggregated but currently not returned to caller
|
int totalMods = 0; // aggregated but currently not returned to caller
|
||||||
try {
|
try {
|
||||||
@ -2018,7 +2271,7 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (isTextShowingOperator(opName) && i > 0) {
|
if (isTextShowingOperator(opName) && i > 0) {
|
||||||
String textContent = extractTextFromToken(tokens.get(i - 1), opName);
|
String textContent = extractTextFromToken(tokens.get(i - 1), opName, gs.font);
|
||||||
if (!textContent.isEmpty()) {
|
if (!textContent.isEmpty()) {
|
||||||
segments.add(
|
segments.add(
|
||||||
new TextSegment(
|
new TextSegment(
|
||||||
|
Loading…
Reference in New Issue
Block a user