enhance text handling and encoding validation

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
Balázs Szücs 2025-08-24 16:59:09 +02:00
parent e396b6cbb8
commit 7db58ad6dd
6 changed files with 1914 additions and 1119 deletions

View File

@ -6,23 +6,20 @@ import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import lombok.Getter;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import lombok.extern.slf4j.Slf4j;
import lombok.Getter;
import stirling.software.SPDF.model.PDFText;
@Slf4j
public class TextFinder extends PDFTextStripper {
private final String searchTerm;
private final boolean useRegex;
private final boolean wholeWordSearch;
@Getter
private final List<PDFText> foundTexts = new ArrayList<>();
@Getter private final List<PDFText> foundTexts = new ArrayList<>();
private final List<TextPosition> pageTextPositions = new ArrayList<>();
private final StringBuilder pageTextBuilder = new StringBuilder();
@ -45,20 +42,39 @@ public class TextFinder extends PDFTextStripper {
/**
 * Records one run of text for the current page.
 *
 * <p>The page buffer is built from each position's own Unicode value rather than from
 * {@code text}, so that every {@code char} appended to {@code pageTextBuilder} has exactly one
 * entry at the same index in {@code pageTextPositions}. Match offsets found in the buffer are
 * later used as indexes into that list, so the two must never drift apart. Appending
 * {@code text} and all positions in addition to the per-position pass would record the run
 * twice.
 *
 * @param text the run as a string; unused, the buffer is derived from {@code textPositions}
 * @param textPositions positions of the run; null entries and entries whose Unicode value is
 *     null are skipped
 */
@Override
protected void writeString(String text, List<TextPosition> textPositions) {
    for (TextPosition tp : textPositions) {
        if (tp == null) continue;
        String u = tp.getUnicode();
        if (u == null) continue;
        pageTextBuilder.append(u);
        // One list entry per UTF-16 code unit: a supplementary code point (two chars)
        // therefore maps both of its chars to the same position.
        for (int k = 0; k < u.length(); k++) {
            pageTextPositions.add(tp);
        }
    }
}
/**
 * Appends the word separator to the page buffer exactly once and pads the position list so it
 * stays index-aligned with the buffer.
 */
@Override
protected void writeWordSeparator() {
    String sep = getWordSeparator();
    pageTextBuilder.append(sep);
    // A separator has no glyph: add one null placeholder per appended char.
    for (int i = 0; i < sep.length(); i++) {
        pageTextPositions.add(null);
    }
}
/**
 * Appends the line separator to the page buffer exactly once and pads the position list so it
 * stays index-aligned with the buffer, including for multi-char separators such as "\r\n".
 */
@Override
protected void writeLineSeparator() {
    String sep = getLineSeparator();
    pageTextBuilder.append(sep);
    // A separator has no glyph: add one null placeholder per appended char.
    for (int i = 0; i < sep.length(); i++) {
        pageTextPositions.add(null);
    }
}
@Override
@ -91,27 +107,10 @@ public class TextFinder extends PDFTextStripper {
Pattern pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
Matcher matcher = pattern.matcher(text);
log.debug(
"Searching for '{}' in page {} with regex '{}' (wholeWord: {}, useRegex: {})",
processedSearchTerm,
getCurrentPageNo(),
regex,
wholeWordSearch,
useRegex);
int matchCount = 0;
while (matcher.find()) {
matchCount++;
int matchStart = matcher.start();
int matchEnd = matcher.end();
log.debug(
"Found match #{} at positions {}-{}: '{}'",
matchCount,
matchStart,
matchEnd,
matcher.group());
float minX = Float.MAX_VALUE;
float minY = Float.MAX_VALUE;
float maxX = Float.MIN_VALUE;
@ -119,13 +118,7 @@ public class TextFinder extends PDFTextStripper {
boolean foundPosition = false;
for (int i = matchStart; i < matchEnd; i++) {
if (i >= pageTextPositions.size()) {
log.debug(
"Position index {} exceeds available positions ({})",
i,
pageTextPositions.size());
continue;
}
if (i >= pageTextPositions.size()) continue;
TextPosition pos = pageTextPositions.get(i);
if (pos != null) {
foundPosition = true;
@ -137,11 +130,6 @@ public class TextFinder extends PDFTextStripper {
}
if (!foundPosition && matchStart < pageTextPositions.size()) {
log.debug(
"Attempting to find nearby positions for match at {}-{}",
matchStart,
matchEnd);
for (int i = Math.max(0, matchStart - 5);
i < Math.min(pageTextPositions.size(), matchEnd + 5);
i++) {
@ -166,29 +154,11 @@ public class TextFinder extends PDFTextStripper {
maxX,
maxY,
matcher.group()));
log.debug(
"Added PDFText for match: page={}, bounds=({},{},{},{}), text='{}'",
getCurrentPageNo() - 1,
minX,
minY,
maxX,
maxY,
matcher.group());
} else {
log.warn(
"Found text match '{}' but no valid position data at {}-{}",
matcher.group(),
matchStart,
matchEnd);
// no position info
}
}
log.debug(
"Page {} search complete: found {} matches for '{}'",
getCurrentPageNo(),
matchCount,
processedSearchTerm);
super.endPage(page);
}

View File

@ -2,6 +2,7 @@ package stirling.software.SPDF.utils.text;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
@ -13,11 +14,9 @@ import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.font.*;
import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.service.RedactionService;
@Slf4j
@UtilityClass
public class TextDecodingHelper {
@ -25,6 +24,8 @@ public class TextDecodingHelper {
private final int ASCII_UPPER_BOUND = 126;
private final int EXTENDED_ASCII_LOWER_BOUND = 160;
private final int EXTENDED_ASCII_UPPER_BOUND = 255;
private final int PROBLEMATIC_CODE_LOWER_BOUND = 65488;
private final int PROBLEMATIC_CODE_UPPER_BOUND = 65535;
public PDFont getFontSafely(PDResources resources, COSName fontName) {
if (resources == null || fontName == null) {
@ -33,27 +34,15 @@ public class TextDecodingHelper {
try {
PDFont font = resources.getFont(fontName);
if (font == null) {
return null;
}
if (font == null) return null;
try {
String fontNameCheck = font.getName();
if (fontNameCheck == null || fontNameCheck.trim().isEmpty()) {
log.debug("Font {} has null or empty name, skipping", fontName.getName());
return null;
}
String n = font.getName();
if (n == null || n.trim().isEmpty()) return null;
} catch (Exception e) {
log.debug(
"Error accessing font name for {}, skipping: {}",
fontName.getName(),
e.getMessage());
return null;
}
return font;
} catch (Exception e) {
log.debug("Error retrieving font {}: {}", fontName.getName(), e.getMessage());
return null;
}
}
@ -65,90 +54,160 @@ public class TextDecodingHelper {
try {
byte[] bytes = cosString.getBytes();
if (bytes.length == 0) {
return;
}
if (bytes.length == 0) return;
String basicDecoded = tryDecodeWithFont(font, cosString);
if (basicDecoded != null
&& !basicDecoded.contains("?")
&& !basicDecoded.trim().isEmpty()) {
return;
}
&& !basicDecoded.trim().isEmpty()) return;
decodeCharactersEnhanced(font, bytes);
} catch (Exception e) {
log.error("Decoding failed: {}", e.getMessage(), e);
try {
tryDecodeWithFont(font, cosString);
} catch (Exception fallbackException) {
} catch (Exception ignored) {
}
}
}
/**
 * Decodes raw string bytes by trying a fixed sequence of strategies and returning the first
 * result that passes {@code isAcceptable}.
 *
 * <p>Order: the font's own code-to-Unicode lookup, then UTF-8, UTF-16BE, UTF-16LE, then a list
 * of legacy charsets, and finally ISO-8859-1. The charset attempts go through
 * {@code tryDecodeCharset}, which decodes with {@code new String(bytes, cs)}; that decoding is
 * lenient, so rejection of bad input relies on the printable check rather than on a decode
 * error.
 *
 * @param font font used for the first, table-driven attempt
 * @param bytes raw bytes to decode
 * @return the first acceptable decoding, or {@code null} if every attempt is rejected
 */
public String decodeCharactersEnhanced(PDFont font, byte[] bytes) {
    String fontPass = decodeByFontTables(font, bytes);
    if (isAcceptable(fontPass)) return fontPass;

    Charset[] unicodeCharsets = {
        StandardCharsets.UTF_8, StandardCharsets.UTF_16BE, StandardCharsets.UTF_16LE
    };
    for (Charset cs : unicodeCharsets) {
        String decoded = tryDecodeCharset(bytes, cs);
        if (isAcceptable(decoded)) return decoded;
    }

    String[] legacyCharsets = {
        "windows-1252", "windows-1250", "GB2312", "Big5", "Shift_JIS", "EUC-KR"
    };
    for (String name : legacyCharsets) {
        // These charsets are optional in a Java runtime. Charset.forName would throw
        // UnsupportedCharsetException and abort the whole chain, so skip missing ones.
        if (!Charset.isSupported(name)) continue;
        String decoded = tryDecodeCharset(bytes, Charset.forName(name));
        if (isAcceptable(decoded)) return decoded;
    }

    String latin1 = tryDecodeCharset(bytes, StandardCharsets.ISO_8859_1);
    return isAcceptable(latin1) ? latin1 : null;
}
private String decodeByFontTables(PDFont font, byte[] bytes) {
if (font == null || bytes == null || bytes.length == 0) return null;
StringBuilder out = new StringBuilder();
boolean hasValidCharacters = false;
int i = 0;
while (i < bytes.length) {
int code = bytes[i] & 0xFF;
String charStr = decodeSingleCharacter(font, code, bytes);
if (charStr == null && code >= 128 && i + 1 < bytes.length) {
int combinedCode = (code << 8) | (bytes[i + 1] & 0xFF);
charStr = decodeSingleCharacter(font, combinedCode, bytes);
if (charStr != null) {
i += 2; // Skip the next byte
out.append(charStr);
hasValidCharacters = true;
continue;
String ch = null;
int consumed = 1;
try {
ch = tryToUnicode(font, bytes, i);
if (ch == null && i + 1 < bytes.length) {
consumed = 2;
ch = tryToUnicode(font, bytes, i, 2);
}
} catch (Exception ignored) {
}
if (charStr != null && !charStr.isEmpty()) {
out.append(charStr);
hasValidCharacters = true;
} else {
out.append('?');
if (!isPrintable(ch)) {
// Handle problematic character codes specifically
ch = "<EFBFBD>";
}
i++;
out.append(ch);
i += consumed;
}
String result = out.toString();
return hasValidCharacters ? result : null;
String s = out.toString();
return isAcceptable(s) ? s : null;
}
/**
 * Looks up the Unicode text for the single byte at {@code pos}.
 *
 * @return whatever {@code font.toUnicode} returns for the unsigned byte value, or {@code null}
 *     if that call throws
 */
private String tryToUnicode(PDFont font, byte[] bytes, int pos) {
    final int singleByteCode = bytes[pos] & 0xFF;
    String unicode;
    try {
        unicode = font.toUnicode(singleByteCode);
    } catch (Exception e) {
        unicode = null;
    }
    return unicode;
}
/**
 * Looks up the Unicode text for a multi-byte code read big-endian from {@code bytes}.
 *
 * @param pos index of the first byte of the code
 * @param len number of bytes that make up the code
 * @return whatever {@code font.toUnicode} returns for the combined code, or {@code null} if
 *     the requested range runs past the end of {@code bytes} or the lookup throws
 */
private String tryToUnicode(PDFont font, byte[] bytes, int pos, int len) {
    final int lastIndex = pos + len - 1;
    if (lastIndex >= bytes.length) return null;

    int combined = 0;
    int idx = pos;
    while (idx <= lastIndex) {
        // Shift the bytes read so far up and append the next one as the low byte.
        combined = (combined << 8) | (bytes[idx] & 0xFF);
        idx++;
    }

    String unicode;
    try {
        unicode = font.toUnicode(combined);
    } catch (Exception e) {
        unicode = null;
    }
    return unicode;
}
/**
 * Decodes {@code bytes} with {@code cs} and keeps the result only if it looks printable.
 *
 * @return the decoded string when {@code isPrintable} accepts it; {@code null} otherwise,
 *     including when decoding or the check throws
 */
private String tryDecodeCharset(byte[] bytes, Charset cs) {
    String decoded;
    try {
        decoded = new String(bytes, cs);
        if (!isPrintable(decoded)) {
            decoded = null;
        }
    } catch (Exception e) {
        decoded = null;
    }
    return decoded;
}
/**
 * Heuristic check that a string is mostly visible text.
 *
 * <p>A code point counts as printable unless its general category is CONTROL or FORMAT or it
 * is U+FFFD. The string passes when the printable count is at least three quarters of its code
 * points (integer division) and at least one.
 *
 * @return {@code false} for null or empty input, otherwise the result of the ratio test
 */
private boolean isPrintable(String s) {
    if (s == null || s.isEmpty()) return false;

    long visible =
            s.codePoints()
                    .filter(
                            cp -> {
                                int category = Character.getType(cp);
                                return category != Character.CONTROL
                                        && category != Character.FORMAT
                                        && cp != 0xFFFD;
                            })
                    .count();

    int total = s.codePointCount(0, s.length());
    int required = Math.max(1, total * 3 / 4);
    return visible >= required;
}
/**
 * Acceptance test for a decoding candidate; currently identical to {@code isPrintable}.
 */
private boolean isAcceptable(String s) {
    final boolean printable = isPrintable(s);
    return printable;
}
public String decodeSingleCharacter(PDFont font, int code, byte[] bytes) {
String charStr = null;
try {
charStr = font.toUnicode(code);
} catch (Exception ignored) {
}
if (charStr == null && font instanceof PDType0Font type0Font) {
try {
int cid = (bytes.length > 1) ? ((bytes[0] & 0xFF) << 8) | (bytes[1] & 0xFF) : code;
charStr = type0Font.toUnicode(cid);
log.debug("CID decoding successful for code {}: {}", cid, charStr);
} catch (Exception e) {
log.debug("CID decoding failed for code {}: {}", code, e.getMessage());
} catch (Exception ignored) {
}
}
if (charStr == null && font.getName() != null && font.getName().contains("+")) {
charStr = mapSubsetCharacter(code);
}
if (charStr == null) {
charStr = fallbackCharacterMapping(code, bytes, font);
}
return charStr;
}
public String fallbackCharacterMapping(int code, byte[] bytes, PDFont font) {
try {
// Handle problematic high-range character codes that cause .notdef warnings
if (code >= PROBLEMATIC_CODE_LOWER_BOUND && code <= PROBLEMATIC_CODE_UPPER_BOUND) {
return handleProblematicCharacterCode(code, font);
}
if (font instanceof PDType0Font && bytes.length > 1) {
return null;
}
@ -164,18 +223,15 @@ public class TextDecodingHelper {
String fontName = font.getName();
if (fontName != null) {
String lowerName = fontName.toLowerCase();
if (lowerName.contains("cjk")
|| lowerName.contains("gb")
|| lowerName.contains("jp")) {
// Basic CJK fallback (expand with a lookup table if needed)
if (code >= 0x4E00 && code <= 0x9FFF) {
return String.valueOf(
(char) code); // Unicode Basic Multilingual Plane for CJK
}
if ((lowerName.contains("cjk")
|| lowerName.contains("gb")
|| lowerName.contains("jp"))
&& code >= 0x4E00
&& code <= 0x9FFF) {
return String.valueOf((char) code);
}
}
// Fallback to UTF-8/16 decoding attempt for unknown encodings
try {
if (bytes.length >= 2) {
ByteBuffer buffer = ByteBuffer.wrap(bytes);
@ -184,7 +240,7 @@ public class TextDecodingHelper {
return charBuffer.toString();
}
} catch (Exception e) {
log.debug("UTF fallback failed: {}", e.getMessage());
}
return null;
@ -193,6 +249,19 @@ public class TextDecodingHelper {
}
}
/**
 * Produces replacement text for a character code in the high range
 * {@code [PROBLEMATIC_CODE_LOWER_BOUND, PROBLEMATIC_CODE_UPPER_BOUND]}.
 *
 * <p>The code is rebased by subtracting the lower bound. If the rebased value is at least
 * {@code ASCII_LOWER_BOUND} it is returned as that single char. Otherwise, when the font name
 * contains {@code '+'}, the rebased value is handed to {@code mapSubsetCharacter}. In every
 * other case the Unicode replacement character U+FFFD is returned.
 *
 * @param code character code to map
 * @param font font the code came from; may be {@code null}
 * @return the mapped text, the result of {@code mapSubsetCharacter} (returned unchanged, so
 *     whatever that helper yields for an unmapped value), or U+FFFD
 */
public String handleProblematicCharacterCode(int code, PDFont font) {
    boolean inProblematicRange =
            code >= PROBLEMATIC_CODE_LOWER_BOUND && code <= PROBLEMATIC_CODE_UPPER_BOUND;
    if (inProblematicRange) {
        int adjustedCode = code - PROBLEMATIC_CODE_LOWER_BOUND;
        if (adjustedCode >= ASCII_LOWER_BOUND) {
            return String.valueOf((char) adjustedCode);
        }
        if (font != null && font.getName() != null && font.getName().contains("+")) {
            return mapSubsetCharacter(adjustedCode);
        }
    }
    // Written as an escape so the literal cannot be corrupted by a file-encoding round trip.
    return "\uFFFD";
}
public String mapSubsetCharacter(int code) {
if (code >= ASCII_LOWER_BOUND && code <= ASCII_UPPER_BOUND) {
return String.valueOf((char) code);
@ -221,6 +290,7 @@ public class TextDecodingHelper {
uni = font.toUnicode(code);
} catch (Exception ignored) {
}
if (uni != null) {
out.append(uni);
anyMapped = true;
@ -239,6 +309,7 @@ public class TextDecodingHelper {
u1 = font.toUnicode(b1);
} catch (Exception ignored) {
}
if (i + 1 < bytes.length) {
int b2 = bytes[i + 1] & 0xFF;
int code = (b1 << 8) | b2;
@ -247,6 +318,12 @@ public class TextDecodingHelper {
u2 = font.toUnicode(code);
} catch (Exception ignored) {
}
// Handle problematic multi-byte codes
if (u2 == null && code >= PROBLEMATIC_CODE_LOWER_BOUND) {
u2 = handleProblematicCharacterCode(code, font);
}
if (u2 != null) {
out.append(u2);
i += 2;
@ -267,12 +344,12 @@ public class TextDecodingHelper {
}
}
public static RedactionService.DecodedMapping buildDecodeMapping(PDFont font, byte[] bytes) {
public RedactionService.DecodedMapping buildDecodeMapping(PDFont font, byte[] bytes) {
RedactionService.DecodedMapping map = new RedactionService.DecodedMapping();
if (font == null || bytes == null) {
map.text = "";
map.charByteStart = new int[0];
map.charByteEnd = new int[0];
map.setText("");
map.setCharByteStart(new int[0]);
map.setCharByteEnd(new int[0]);
return map;
}
@ -289,46 +366,32 @@ public class TextDecodingHelper {
while (i < bytes.length) {
int start = i;
String decodedChar = null;
int consumed = 1;
String decodedChar;
int consumed;
try {
if (isType0) {
// Handle CID fonts and multi-byte encodings
decodedChar = decodeType0Font((PDType0Font) font, bytes, i);
consumed = getType0CharLength((PDType0Font) font, bytes, i);
} else if (isType1) {
// Handle Type1 fonts with specific encoding
decodedChar = decodeType1Font((PDType1Font) font, bytes, i);
consumed = getType1CharLength((PDType1Font) font, bytes, i);
consumed = 1;
} else if (isType3) {
// Handle Type3 bitmap fonts
decodedChar = decodeType3Font((PDType3Font) font, bytes, i);
consumed = 1; // Type3 typically single byte
consumed = 1;
} else if (isTrueType) {
// Handle TrueType fonts
decodedChar = decodeTrueTypeFont((PDTrueTypeFont) font, bytes, i);
consumed = getTrueTypeCharLength((PDTrueTypeFont) font, bytes, i);
} else {
// Generic fallback for other font types
decodedChar = decodeGenericFont(font, bytes, i);
consumed = getGenericCharLength(font, bytes, i);
}
// Validate the consumed length
if (consumed <= 0 || i + consumed > bytes.length) {
consumed = 1;
}
if (consumed <= 0 || i + consumed > bytes.length) consumed = 1;
} catch (Exception e) {
// Log the error for debugging purposes
System.err.println(
"Error decoding character at position " + i + ": " + e.getMessage());
decodedChar = null;
consumed = 1;
}
// Handle null or empty decoded characters
if (decodedChar == null || decodedChar.isEmpty()) {
decodedChar = handleUndecodableChar(bytes, i, consumed);
}
@ -345,15 +408,14 @@ public class TextDecodingHelper {
i += consumed;
}
map.text = sb.toString();
map.charByteStart = starts.stream().mapToInt(Integer::intValue).toArray();
map.charByteEnd = ends.stream().mapToInt(Integer::intValue).toArray();
map.setText(sb.toString());
map.setCharByteStart(starts.stream().mapToInt(Integer::intValue).toArray());
map.setCharByteEnd(ends.stream().mapToInt(Integer::intValue).toArray());
return map;
}
private static String decodeType0Font(PDType0Font font, byte[] bytes, int position) {
private String decodeType0Font(PDType0Font font, byte[] bytes, int position) {
try {
// Try multi-byte decoding first (common for CJK fonts)
if (position + 1 < bytes.length) {
int b1 = bytes[position] & 0xFF;
int b2 = bytes[position + 1] & 0xFF;
@ -372,7 +434,7 @@ public class TextDecodingHelper {
}
}
private static int getType0CharLength(PDType0Font font, byte[] bytes, int position) {
private int getType0CharLength(PDType0Font font, byte[] bytes, int position) {
try {
if (position + 1 < bytes.length) {
int b1 = bytes[position] & 0xFF;
@ -389,7 +451,7 @@ public class TextDecodingHelper {
}
}
private static String decodeType1Font(PDType1Font font, byte[] bytes, int position) {
private String decodeType1Font(PDType1Font font, byte[] bytes, int position) {
try {
int code = bytes[position] & 0xFF;
return font.toUnicode(code);
@ -398,11 +460,7 @@ public class TextDecodingHelper {
}
}
private static int getType1CharLength(PDType1Font font, byte[] bytes, int position) {
return 1; // Type1 fonts are typically single-byte
}
private static String decodeType3Font(PDType3Font font, byte[] bytes, int position) {
private String decodeType3Font(PDType3Font font, byte[] bytes, int position) {
try {
int code = bytes[position] & 0xFF;
return font.toUnicode(code);
@ -411,7 +469,7 @@ public class TextDecodingHelper {
}
}
private static String decodeTrueTypeFont(PDTrueTypeFont font, byte[] bytes, int position) {
private String decodeTrueTypeFont(PDTrueTypeFont font, byte[] bytes, int position) {
try {
int code = bytes[position] & 0xFF;
String unicode = font.toUnicode(code);
@ -429,7 +487,7 @@ public class TextDecodingHelper {
}
}
private static int getTrueTypeCharLength(PDTrueTypeFont font, byte[] bytes, int position) {
private int getTrueTypeCharLength(PDTrueTypeFont font, byte[] bytes, int position) {
try {
// First try single byte
int code = bytes[position] & 0xFF;
@ -454,7 +512,7 @@ public class TextDecodingHelper {
}
}
private static String decodeGenericFont(PDFont font, byte[] bytes, int position) {
private String decodeGenericFont(PDFont font, byte[] bytes, int position) {
try {
int code = bytes[position] & 0xFF;
return font.toUnicode(code);
@ -463,13 +521,8 @@ public class TextDecodingHelper {
}
}
private static int getGenericCharLength(PDFont font, byte[] bytes, int position) {
return 1; // Default to single byte for unknown font types
}
private String handleUndecodableChar(byte[] bytes, int position, int length) {
private static String handleUndecodableChar(byte[] bytes, int position, int length) {
// Or try to interpret as ISO-8859-1 (Latin-1) as fallback
try {
byte[] charBytes = new byte[length];
System.arraycopy(bytes, position, charBytes, 0, length);
@ -478,9 +531,7 @@ public class TextDecodingHelper {
return fallback;
}
} catch (Exception e) {
// Ignore and fall through to default
}
return "<EFBFBD>"; // Unicode replacement character instead of "?"
return "<EFBFBD>";
}
}

View File

@ -1,11 +1,6 @@
package stirling.software.SPDF.utils.text;
import java.io.IOException;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
import org.apache.pdfbox.pdmodel.font.encoding.DictionaryEncoding;
import org.apache.pdfbox.pdmodel.font.encoding.Encoding;
import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j;
@ -15,225 +10,360 @@ import lombok.extern.slf4j.Slf4j;
public class TextEncodingHelper {
public boolean canEncodeCharacters(PDFont font, String text) {
if (font == null || text == null || text.isEmpty()) {
if (font == null || text == null) {
return false;
}
if (text.isEmpty()) {
return true;
}
try {
// Step 1: Primary check - full-string encoding (permissive for "good" cases)
byte[] encoded = font.encode(text);
if (encoded.length > 0) {
log.debug(
"Text '{}' has good full-string encoding for font {} - permissively allowing",
text,
font.getName() != null ? font.getName() : "Unknown");
return true;
}
// Step 2: Smart array-based fallback for TJ operator-style text
log.debug(
"Full encoding failed for '{}' - using array-based fallback for font {}",
text,
font.getName() != null ? font.getName() : "Unknown");
return validateAsCodePointArray(font, text);
} catch (IOException | IllegalArgumentException e) {
log.debug(
"Encoding exception for text '{}' with font {} - trying array fallback: {}",
text,
font.getName() != null ? font.getName() : "Unknown",
e.getMessage());
if (isFontSubset(font.getName()) || hasCustomEncoding(font)) {
return validateAsCodePointArray(font, text);
}
return false; // Non-subset fonts with encoding exceptions are likely problematic
} catch (Exception e) {
}
return validateAsCodePointArray(font, text);
}
private boolean validateAsCodePointArray(PDFont font, String text) {
if (text == null || text.isEmpty()) {
return true;
}
int totalCodePoints = 0;
int successfulCodePoints = 0;
// Iterate through code points (handles surrogates correctly per Unicode docs)
for (int i = 0; i < text.length(); ) {
int codePoint = text.codePointAt(i);
String charStr = new String(Character.toChars(codePoint));
totalCodePoints++;
try {
// Test encoding for this code point
byte[] charEncoded = font.encode(charStr);
if (charEncoded.length > 0) {
float charWidth = font.getStringWidth(charStr);
if (charWidth >= 0) {
successfulCodePoints++;
log.debug(
"Code point '{}' (U+{}) encoded successfully",
charStr,
Integer.toHexString(codePoint).toUpperCase());
} else {
log.debug(
"Code point '{}' (U+{}) has invalid width: {}",
charStr,
Integer.toHexString(codePoint).toUpperCase(),
charWidth);
try {
float charWidth = font.getStringWidth(charStr);
if (charWidth >= 0) {
successfulCodePoints++;
}
} catch (Exception e) {
try {
if (canDecodeCharacter(font, charStr)) {
successfulCodePoints++;
}
} catch (Exception e2) {
}
}
} else {
log.debug(
"Code point '{}' (U+{}) encoding failed - empty result",
charStr,
Integer.toHexString(codePoint).toUpperCase());
try {
if (canDecodeCharacter(font, charStr)) {
successfulCodePoints++;
}
} catch (Exception e) {
}
}
} catch (Exception e) {
try {
if (canDecodeCharacter(font, charStr)) {
successfulCodePoints++;
}
} catch (Exception e2) {
if (isBasicCharacter(codePoint)) {
successfulCodePoints++;
}
}
} catch (IOException | IllegalArgumentException e) {
log.debug(
"Code point '{}' (U+{}) validation failed: {}",
charStr,
Integer.toHexString(codePoint).toUpperCase(),
e.getMessage());
}
i += Character.charCount(codePoint); // Handle surrogates properly
i += Character.charCount(codePoint);
}
double successRate =
totalCodePoints > 0 ? (double) successfulCodePoints / totalCodePoints : 0;
boolean isAcceptable = successRate >= 0.95;
if (totalCodePoints == 0) {
return true;
}
log.debug(
"Array validation for '{}': {}/{} code points successful ({:.1f}%) - {}",
text,
successfulCodePoints,
totalCodePoints,
successRate * 100,
isAcceptable ? "ALLOWING" : "rejecting");
return isAcceptable;
double successRate = (double) successfulCodePoints / totalCodePoints;
return successRate >= 0.1;
}
public boolean isTextSegmentRemovable(PDFont font, String text) {
if (font == null || text == null || text.isEmpty()) {
/**
 * Checks whether any character code of the font decodes to exactly {@code charStr}.
 *
 * <p>This is a brute-force reverse lookup: every code from 0 to 0xFFFF is passed to
 * {@code font.toUnicode} and the result compared with {@code charStr}. Codes whose lookup
 * throws are skipped. The scan stops at the first match.
 *
 * @param font font to probe
 * @param charStr text of a single character to look for
 * @return {@code true} if some code maps to {@code charStr}; {@code false} if none does or if
 *     {@code font} is null or {@code charStr} is null or empty
 */
private boolean canDecodeCharacter(PDFont font, String charStr) {
    if (font == null || charStr == null || charStr.isEmpty()) {
        return false;
    }

    for (int code = 0; code <= 0xFFFF; code++) {
        try {
            String decoded = font.toUnicode(code);
            if (decoded != null && decoded.equals(charStr)) {
                return true;
            }
        } catch (Exception ignored) {
            // A code the font cannot decode is simply not a match; keep scanning.
        }
    }
    return false;
}
/**
 * Tells whether a code point is treated as "basic": in 32..126, in 160..255, whitespace, or a
 * letter or digit.
 */
private boolean isBasicCharacter(int codePoint) {
    if (codePoint >= 32 && codePoint <= 126) {
        return true;
    }
    if (codePoint >= 160 && codePoint <= 255) {
        return true;
    }
    if (Character.isWhitespace(codePoint)) {
        return true;
    }
    return Character.isLetterOrDigit(codePoint);
}
public boolean isTextSegmentRemovable(PDFont font, String text) {
if (font == null || text == null) {
return false;
}
if (text.isEmpty()) {
return true;
}
if (isSimpleCharacter(text)) {
try {
font.encode(text);
font.getStringWidth(text);
log.debug(
"Text '{}' is a simple character and passed validation - allowing removal",
text);
return true;
} catch (Exception e) {
log.debug(
"Simple character '{}' failed basic validation with font {}: {}",
text,
font.getName() != null ? font.getName() : "Unknown",
e.getMessage());
return false;
try {
return canHandleText(font, text);
} catch (Exception e2) {
return false;
}
}
}
// For complex text, require comprehensive validation
return isTextFullyRemovable(font, text);
}
public boolean isTextFullyRemovable(PDFont font, String text) {
if (font == null || text == null || text.isEmpty()) {
/**
 * Checks, code point by code point, that the font can cope with every character of
 * {@code text}.
 *
 * <p>A code point is accepted if the font encodes it to a non-empty byte array, or failing
 * that if {@code canDecodeCharacter} finds it, or failing that if {@code isBasicCharacter}
 * accepts it. Exceptions from the first two checks count as "not supported by that check".
 *
 * @return {@code false} if {@code font} or {@code text} is null or any code point fails all
 *     three checks; {@code true} otherwise, including for empty text
 */
private boolean canHandleText(PDFont font, String text) {
    if (font == null || text == null) {
        return false;
    }

    int offset = 0;
    while (offset < text.length()) {
        final int cp = text.codePointAt(offset);
        final String glyph = new String(Character.toChars(cp));

        boolean supported;
        try {
            supported = font.encode(glyph).length > 0;
        } catch (Exception e) {
            supported = false;
        }

        if (!supported) {
            try {
                supported = canDecodeCharacter(font, glyph);
            } catch (Exception e) {
                supported = false;
            }
        }

        if (!supported && !isBasicCharacter(cp)) {
            return false;
        }

        offset += Character.charCount(cp);
    }
    return true;
}
public boolean isTextFullyRemovable(PDFont font, String text) {
if (font == null || text == null) {
return false;
}
if (text.isEmpty()) {
return true;
}
try {
// Check 1: Verify encoding capability using new smart approach
if (!canEncodeCharacters(font, text)) {
log.debug(
"Text '{}' failed encoding validation for font {}",
text,
font.getName() != null ? font.getName() : "Unknown");
return false;
}
// Check 2: Validate width calculation capability
float width = font.getStringWidth(text);
if (width < 0) { // Allow zero width (invisible chars) but reject negative (invalid)
log.debug(
"Text '{}' has invalid width {} for font {}",
text,
width,
font.getName() != null ? font.getName() : "Unknown");
return false; // Invalid metrics prevent accurate removal
try {
float width = font.getStringWidth(text);
if (width < 0) {
return false;
}
} catch (Exception e) {
try {
if (!canCalculateTextWidth(font, text)) {
return false;
}
} catch (Exception e2) {
return false;
}
}
// Check 3: Verify font descriptor completeness for redaction area calculation
if (font.getFontDescriptor() == null) {
log.debug(
"Missing font descriptor for font {}",
font.getName() != null ? font.getName() : "Unknown");
return false;
try {
if (font.getFontDescriptor() == null) {
try {
return canHandleWithoutDescriptor(font, text);
} catch (Exception e) {
return false;
}
}
} catch (Exception e) {
try {
return canHandleWithoutDescriptor(font, text);
} catch (Exception e2) {
return false;
}
}
// Check 4: Test bounding box calculation for redaction area
try {
font.getFontDescriptor().getFontBoundingBox();
} catch (IllegalArgumentException e) {
log.debug(
"Font bounding box unavailable for font {}: {}",
font.getName() != null ? font.getName() : "Unknown",
e.getMessage());
} catch (Exception e) {
try {
return canHandleWithoutBoundingBox(font, text);
} catch (Exception e2) {
return false;
}
}
return true;
} catch (Exception e) {
try {
return canHandleText(font, text);
} catch (Exception e2) {
return false;
}
}
}
private boolean canCalculateTextWidth(PDFont font, String text) {
if (font == null || text == null) {
return false;
}
if (text.isEmpty()) {
return true;
}
for (int i = 0; i < text.length(); ) {
int codePoint = text.codePointAt(i);
String charStr = new String(Character.toChars(codePoint));
boolean hasWidth = false;
try {
float charWidth = font.getStringWidth(charStr);
if (charWidth >= 0) {
hasWidth = true;
}
} catch (Exception e) {
try {
float defaultWidth = getDefaultCharWidth(font);
if (defaultWidth > 0) {
hasWidth = true;
}
} catch (Exception e2) {
}
}
if (!hasWidth && isBasicCharacter(codePoint)) {
hasWidth = true;
}
if (!hasWidth) {
return false;
}
log.debug(
"Text '{}' passed comprehensive validation for font {}",
text,
font.getName() != null ? font.getName() : "Unknown");
return true;
i += Character.charCount(codePoint);
}
} catch (IOException e) {
log.debug(
"Text '{}' failed validation for font {} due to IO error: {}",
text,
font.getName() != null ? font.getName() : "Unknown",
e.getMessage());
return false;
} catch (IllegalArgumentException e) {
log.debug(
"Text '{}' failed validation for font {} due to argument error: {}",
text,
font.getName() != null ? font.getName() : "Unknown",
e.getMessage());
return false;
return true;
}
/**
 * Finds a usable fallback glyph width for a font.
 *
 * <p>Probes a fixed list of common characters in order and returns the first strictly positive
 * string width the font reports. Probes that throw are skipped.
 *
 * @return the first positive width found, or 500 if no probe yields one
 */
private float getDefaultCharWidth(PDFont font) {
    final String probes = " aA0.e!ilI";
    for (int idx = 0; idx < probes.length(); idx++) {
        String probe = String.valueOf(probes.charAt(idx));
        float measured;
        try {
            measured = font.getStringWidth(probe);
        } catch (Exception e) {
            continue;
        }
        if (measured > 0) {
            return measured;
        }
    }
    return 500;
}
/**
 * Fallback check used when the font descriptor is missing or cannot be read: answers with
 * {@code canCalculateTextWidth}, or with {@code canHandleText} if that throws.
 */
private boolean canHandleWithoutDescriptor(PDFont font, String text) {
    boolean widthsUsable;
    try {
        widthsUsable = canCalculateTextWidth(font, text);
    } catch (Exception e) {
        return canHandleText(font, text);
    }
    return widthsUsable;
}
/**
 * Fallback check used when the font bounding box cannot be read: answers with
 * {@code canCalculateTextWidth}, or with {@code canHandleText} if that throws.
 */
private boolean canHandleWithoutBoundingBox(PDFont font, String text) {
    boolean widthsUsable;
    try {
        widthsUsable = canCalculateTextWidth(font, text);
    } catch (Exception e) {
        return canHandleText(font, text);
    }
    return widthsUsable;
}
private boolean isSimpleCharacter(String text) {
if (text == null || text.isEmpty()) {
if (text == null) {
return false;
}
if (text.length() > 20) {
if (text.isEmpty()) {
return true;
}
if (text.length() > 50) {
return false;
}
for (int i = 0; i < text.length(); i++) {
char c = text.charAt(i);
// Allow letters, digits, and whitespace (most common cases)
if (Character.isLetterOrDigit(c) || Character.isWhitespace(c)) {
continue;
}
// Allow common ASCII punctuation
if (c >= 32 && c <= 126 && ".,!?;:()-[]{}\"'/@#$%&*+=<>|\\~`".indexOf(c) >= 0) {
if (c >= 32 && c <= 126) {
continue;
}
if (c >= 160 && c <= 255) {
continue;
}
if (Character.getType(c) == Character.OTHER_PUNCTUATION
|| Character.getType(c) == Character.DASH_PUNCTUATION
|| Character.getType(c) == Character.START_PUNCTUATION
|| Character.getType(c) == Character.END_PUNCTUATION
|| Character.getType(c) == Character.CONNECTOR_PUNCTUATION
|| Character.getType(c) == Character.OTHER_SYMBOL
|| Character.getType(c) == Character.MATH_SYMBOL
|| Character.getType(c) == Character.CURRENCY_SYMBOL) {
continue;
}
@ -243,111 +373,205 @@ public class TextEncodingHelper {
return true;
}
public boolean hasCustomEncoding(PDFont font) {
try {
if (font instanceof PDSimpleFont simpleFont) {
try {
Encoding encoding = simpleFont.getEncoding();
if (encoding != null) {
// Check for dictionary-based custom encodings
if (encoding instanceof DictionaryEncoding) {
log.debug("Font {} uses DictionaryEncoding (custom)", font.getName());
return true;
}
String encodingName = encoding.getClass().getSimpleName();
if (encodingName.contains("Custom")
|| encodingName.contains("Dictionary")) {
log.debug(
"Font {} uses custom encoding: {}",
font.getName(),
encodingName);
return true;
}
}
} catch (Exception e) {
log.debug(
"Encoding detection failed for font {}: {}",
font.getName(),
e.getMessage());
return true; // Assume custom if detection fails
}
}
if (font instanceof org.apache.pdfbox.pdmodel.font.PDType0Font) {
log.debug(
"Font {} is Type0 (CID) - generally uses standard CMaps",
font.getName() != null ? font.getName() : "Unknown");
return false;
}
log.debug(
"Font {} type {} - assuming standard encoding",
font.getName() != null ? font.getName() : "Unknown",
font.getClass().getSimpleName());
return false;
} catch (IllegalArgumentException e) {
log.debug(
"Custom encoding detection failed for font {}: {}",
font.getName() != null ? font.getName() : "Unknown",
e.getMessage());
return false; // Be forgiving on detection failure
}
}
public boolean fontSupportsCharacter(PDFont font, String character) {
if (font == null || character == null || character.isEmpty()) {
if (font == null || character == null) {
return false;
}
if (character.isEmpty()) {
return true;
}
try {
byte[] encoded = font.encode(character);
if (encoded.length == 0) {
return false;
if (encoded.length > 0) {
try {
float width = font.getStringWidth(character);
if (width >= 0) {
return true;
}
} catch (Exception e) {
}
return true;
}
} catch (Exception e) {
}
float width = font.getStringWidth(character);
return width > 0;
try {
if (canDecodeCharacter(font, character)) {
return true;
}
} catch (Exception e) {
}
} catch (IOException | IllegalArgumentException e) {
log.debug(
"Character '{}' not supported by font {}: {}",
character,
font.getName() != null ? font.getName() : "Unknown",
e.getMessage());
for (int i = 0; i < character.length(); ) {
int codePoint = character.codePointAt(i);
if (isBasicCharacter(codePoint)) {
i += Character.charCount(codePoint);
continue;
}
return false;
}
return true;
}
/**
 * Reports whether {@code fontName} carries a subset tag: a prefix of at least four uppercase
 * ASCII letters ({@code A}-{@code Z}) immediately followed by {@code '+'}, for example
 * {@code "ABCDEF+Arial"}.
 *
 * @param fontName the font name to inspect; may be {@code null}
 * @return {@code true} if the text before the first {@code '+'} is four or more uppercase ASCII
 *     letters; {@code false} otherwise, including for {@code null}
 */
public boolean isFontSubset(String fontName) {
    if (fontName == null) {
        return false;
    }

    // indexOf replaces split("\\+")[0]: split returns an empty array for names made up solely
    // of '+' characters, so indexing [0] threw ArrayIndexOutOfBoundsException.
    int plusIndex = fontName.indexOf('+');
    if (plusIndex < 4) {
        return false; // no '+' at all, or a prefix shorter than four characters
    }

    for (int i = 0; i < plusIndex; i++) {
        char c = fontName.charAt(i);
        if (c < 'A' || c > 'Z') {
            return false;
        }
    }
    return true;
}
public boolean canCalculateBasicWidths(PDFont font) {
if (font == null) {
return false;
}
try {
float spaceWidth = font.getStringWidth(" ");
if (spaceWidth <= 0) {
return false;
if (spaceWidth > 0) {
return true;
}
} catch (Exception e) {
}
String[] testChars = {"a", "A", "0", ".", "e", "!"};
for (String ch : testChars) {
String[] testChars = {
"a", "A", "0", ".", "e", "!", "i", "l", "I", "m", "M", "W", "w", "1", "|", "-", "_",
"=", "+", "(", ")", "[", "]", "{", "}", "<", ">", "/", "\\", "?", ",", ";", ":", "\"",
"'", "`", "~", "@", "#", "$", "%", "^", "&", "*"
};
int successCount = 0;
for (String ch : testChars) {
try {
float width = font.getStringWidth(ch);
if (width > 0) {
successCount++;
if (successCount >= 3) {
return true;
}
}
} catch (Exception e) {
}
}
try {
for (int code = 32; code <= 126; code++) {
try {
String ch = String.valueOf((char) code);
float width = font.getStringWidth(ch);
if (width > 0) {
successCount++;
if (successCount >= 1) {
return true;
}
}
} catch (Exception e) {
}
}
} catch (Exception e) {
}
try {
for (int code = 160; code <= 255; code++) {
try {
String ch = String.valueOf((char) code);
float width = font.getStringWidth(ch);
if (width > 0) {
return true;
}
} catch (IOException | IllegalArgumentException e) {
} catch (Exception e) {
}
}
return false; // Can't calculate width for any test characters
} catch (IOException | IllegalArgumentException e) {
return false; // Font failed basic width calculation
} catch (Exception e) {
}
return false;
}
/**
 * Reports whether {@code font} can encode at least one of a set of probe strings.
 *
 * <p>Probes a fixed list of common characters and short words first, then every 100th UTF-16
 * code unit from 0 up to 0xFFFF.
 *
 * @param font the font to probe; {@code null} yields {@code false}
 * @return {@code true} as soon as one probe encodes to a non-empty byte array
 */
public boolean canEncodeAnyCharacter(PDFont font) {
    if (font == null) {
        return false;
    }

    final String[] probes = {
        "a", "A", "0", " ", ".", "!", "e", "i", "o", "u", "n", "t", "r", "s", "l", "1", "2",
        "3", "4", "5", "6", "7", "8", "9", ",", ".", ";", ":", "?", "!", "(", ")", "[", "]",
        "{", "}", "hello", "test", "sample", "abc", "123", "ABC"
    };
    for (String probe : probes) {
        if (encodesToBytes(font, probe)) {
            return true;
        }
    }

    // Coarse sweep across the BMP in steps of 100 code units.
    int code = 0;
    while (code <= 0xFFFF) {
        if (encodesToBytes(font, String.valueOf((char) code))) {
            return true;
        }
        code += 100;
    }
    return false;
}

/** Returns true if the font encodes the text to at least one byte; any failure counts as false. */
private boolean encodesToBytes(PDFont font, String text) {
    try {
        return font.encode(text).length > 0;
    } catch (Exception ignored) {
        return false;
    }
}
/**
 * Lenient validity check: a font is valid if it has a non-blank name, or basic widths can be
 * calculated for it, or it can encode at least one probe character.
 *
 * @param font the font to check; {@code null} yields {@code false}
 * @return {@code true} if any of the three checks passes; a check that throws counts as failed
 */
public boolean isValidFont(PDFont font) {
    if (font == null) {
        return false;
    }

    boolean hasName = false;
    try {
        String fontName = font.getName();
        hasName = fontName != null && !fontName.trim().isEmpty();
    } catch (Exception ignored) {
        // Name lookup failed; rely on the remaining checks.
    }
    if (hasName) {
        return true;
    }

    boolean widthsWork = false;
    try {
        widthsWork = canCalculateBasicWidths(font);
    } catch (Exception ignored) {
        // Width probing failed; rely on the encoding check.
    }
    if (widthsWork) {
        return true;
    }

    boolean encodes = false;
    try {
        encodes = canEncodeAnyCharacter(font);
    } catch (Exception ignored) {
        // Encoding probing failed as well.
    }
    return encodes;
}
}

View File

@ -5,10 +5,6 @@ import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.font.PDFont;
import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j;
@ -16,128 +12,116 @@ import lombok.extern.slf4j.Slf4j;
@UtilityClass
public class TextFinderUtils {
/**
 * Decides whether {@code font} is usable enough to be trusted for text processing.
 *
 * <p>A damaged font is only logged, not rejected. The font is reliable if
 * {@code TextEncodingHelper.canCalculateBasicWidths} succeeds, or otherwise if
 * {@code TextEncodingHelper.canEncodeCharacters} accepts at least one of eight basic characters.
 *
 * @param font the font to validate; {@code null} yields {@code false}
 * @return {@code true} if the font passed the width check or encoded at least one basic
 *     character
 */
public boolean validateFontReliability(PDFont font) {
    if (font == null) {
        return false;
    }

    final boolean damaged = font.isDamaged();
    if (damaged) {
        log.debug(
                "Font {} is marked as damaged - using TextEncodingHelper validation",
                font.getName());
    }

    final boolean widthsOk = TextEncodingHelper.canCalculateBasicWidths(font);
    if (widthsOk) {
        log.debug(
                "Font {} passed basic width calculations - considering reliable",
                font.getName());
        return true;
    }

    // Count every probe that works; the total is reported in the log line below.
    final String[] probes = {"1", "2", "3", "a", "A", "e", "E", " "};
    int encodable = 0;
    for (int idx = 0; idx < probes.length; idx++) {
        if (TextEncodingHelper.canEncodeCharacters(font, probes[idx])) {
            encodable += 1;
        }
    }

    if (encodable <= 0) {
        log.debug("Font {} failed all basic tests - considering unreliable", font.getName());
        return false;
    }

    log.debug(
            "Font {} can process {}/{} basic characters - considering reliable",
            font.getName(),
            encodable,
            probes.length);
    return true;
}
public List<Pattern> createOptimizedSearchPatterns(
Set<String> searchTerms, boolean useRegex, boolean wholeWordSearch) {
List<Pattern> patterns = new ArrayList<>();
if (searchTerms == null) {
return patterns;
}
for (String term : searchTerms) {
if (term == null || term.trim().isEmpty()) {
if (term == null) {
continue;
}
String trimmedTerm = term.trim();
if (trimmedTerm.isEmpty()) {
continue;
}
try {
String patternString = useRegex ? term.trim() : Pattern.quote(term.trim());
if (wholeWordSearch) {
patternString = applyWordBoundaries(term.trim(), patternString);
String patternString;
if (useRegex) {
patternString = trimmedTerm;
try {
Pattern.compile(patternString);
} catch (Exception e) {
patternString = Pattern.quote(trimmedTerm);
}
} else {
patternString = Pattern.quote(trimmedTerm);
}
Pattern pattern =
Pattern.compile(
patternString, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
if (wholeWordSearch) {
patternString = applyWordBoundaries(trimmedTerm, patternString, useRegex);
}
int flags = Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE | Pattern.DOTALL;
try {
flags |= Pattern.CANON_EQ;
} catch (Exception e) {
}
Pattern pattern = Pattern.compile(patternString, flags);
patterns.add(pattern);
log.debug("Created search pattern: '{}' -> '{}'", term.trim(), patternString);
} catch (Exception e) {
log.warn("Failed to create pattern for term '{}': {}", term, e.getMessage());
try {
String quotedTerm = Pattern.quote(trimmedTerm);
if (wholeWordSearch) {
quotedTerm = applyWordBoundaries(trimmedTerm, quotedTerm, false);
}
Pattern fallbackPattern =
Pattern.compile(
quotedTerm, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
patterns.add(fallbackPattern);
} catch (Exception e2) {
try {
Pattern simplestPattern = Pattern.compile(Pattern.quote(trimmedTerm));
patterns.add(simplestPattern);
} catch (Exception e3) {
}
}
}
}
return patterns;
}
/**
 * Wraps {@code patternString} so that it only matches as a whole word.
 *
 * <p>Single-character terms are guarded with look-arounds that reject an adjacent {@code \w}
 * character; all other terms are wrapped in {@code \b} anchors.
 *
 * @param originalTerm the term as entered; only its length decides the wrapping
 * @param patternString the (possibly quoted) pattern to wrap
 * @return the wrapped pattern string
 */
private String applyWordBoundaries(String originalTerm, String patternString) {
    final boolean singleCharacter = originalTerm.length() == 1;
    if (!singleCharacter) {
        return "\\b" + patternString + "\\b";
    }
    // Digits and every other single character share the same look-around guards.
    return "(?<![\\w])" + patternString + "(?![\\w])";
}
public boolean hasProblematicFonts(PDPage page) {
if (page == null) {
return false;
private String applyWordBoundaries(String originalTerm, String patternString, boolean isRegex) {
if (originalTerm == null || originalTerm.isEmpty()) {
return patternString;
}
try {
PDResources resources = page.getResources();
if (resources == null) {
return false;
}
int totalFonts = 0;
int completelyUnusableFonts = 0;
for (org.apache.pdfbox.cos.COSName fontName : resources.getFontNames()) {
try {
org.apache.pdfbox.pdmodel.font.PDFont font = resources.getFont(fontName);
if (font != null) {
totalFonts++;
if (!validateFontReliability(font)) {
completelyUnusableFonts++;
}
}
} catch (Exception e) {
log.debug("Font loading failed for {}: {}", fontName.getName(), e.getMessage());
totalFonts++;
if (originalTerm.length() == 1) {
char c = originalTerm.charAt(0);
if (Character.isDigit(c)) {
return "(?<![\\p{L}\\p{N}])" + patternString + "(?![\\p{L}\\p{N}])";
} else if (Character.isLetter(c)) {
return "(?<![\\p{L}\\p{N}])" + patternString + "(?![\\p{L}\\p{N}])";
} else {
return "(?<!\\S)" + patternString + "(?!\\S)";
}
}
boolean hasProblems = totalFonts > 0 && (completelyUnusableFonts * 2 > totalFonts);
log.debug(
"Page font analysis: {}/{} fonts are completely unusable - page {} problematic",
completelyUnusableFonts,
totalFonts,
hasProblems ? "IS" : "is NOT");
boolean startsWithWordChar = Character.isLetterOrDigit(originalTerm.charAt(0));
boolean endsWithWordChar =
Character.isLetterOrDigit(originalTerm.charAt(originalTerm.length() - 1));
return hasProblems;
String result = patternString;
if (startsWithWordChar) {
result = "(?<![\\p{L}\\p{N}])" + result;
} else {
result = "(?<!\\S)" + result;
}
if (endsWithWordChar) {
result = result + "(?![\\p{L}\\p{N}])";
} else {
result = result + "(?!\\S)";
}
return result;
} catch (Exception e) {
log.warn("Font analysis failed for page: {}", e.getMessage());
return false; // Be permissive if analysis fails
try {
return "\\b" + patternString + "\\b";
} catch (Exception e2) {
return patternString;
}
}
}
}

View File

@ -1,88 +1,69 @@
package stirling.software.SPDF.utils.text;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
import org.apache.pdfbox.pdmodel.font.PDType0Font;
import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@UtilityClass
public class WidthCalculator {
// Divisor applied to raw font widths before they are multiplied by the font size.
private final int FONT_SCALE_FACTOR = 1000;
// Per-character width, as a fraction of the font size, used when nothing better is available.
private final float CONSERVATIVE_CHAR_WIDTH_RATIO = 0.55f;
// Multiplier applied to bounding-box based character widths in the fallback estimates.
private final float BBOX_CHAR_WIDTH_RATIO = 0.65f;
// Width results keyed by the string built in createCacheKey (font name, text, font size).
private final Map<String, Float> widthCache = new ConcurrentHashMap<>();
// Reliability results keyed by the string built in createReliabilityCacheKey (font name).
private final Map<String, Boolean> reliabilityCache = new ConcurrentHashMap<>();
/**
 * Builds the width-cache key for a measurement: font name, text and font size (two decimals),
 * joined with {@code '|'}.
 */
private String createCacheKey(PDFont font, String text, float fontSize) {
    final String fontName = font.getName();
    final Object[] keyParts = {fontName, text, fontSize};
    return String.format("%s|%s|%.2f", keyParts);
}
/** Builds the reliability-cache key for a font, which is simply the font's name. */
private String createReliabilityCacheKey(PDFont font) {
    final String fontName = font.getName();
    return fontName;
}
public float calculateAccurateWidth(PDFont font, String text, float fontSize) {
return calculateAccurateWidth(font, text, fontSize, true);
}
if (font == null || text == null || fontSize <= 0) {
return 0;
}
public float calculateAccurateWidth(
PDFont font, String text, float fontSize, boolean useCache) {
if (font == null || text == null || text.isEmpty() || fontSize <= 0) return 0;
if (useCache) {
String cacheKey = createCacheKey(font, text, fontSize);
Float cachedWidth = widthCache.get(cacheKey);
if (cachedWidth != null) return cachedWidth;
if (text.isEmpty()) {
return 0;
}
String normalizedText = normalizeText(text);
Float directWidth = calculateDirectWidth(font, normalizedText, fontSize);
if (directWidth != null) {
if (useCache) widthCache.put(createCacheKey(font, text, fontSize), directWidth);
return directWidth;
}
Float charByCharWidth = calculateCharacterByCharacterWidth(font, normalizedText, fontSize);
if (charByCharWidth != null) {
if (useCache) widthCache.put(createCacheKey(font, text, fontSize), charByCharWidth);
return charByCharWidth;
}
Float glyphWidth = calculateGlyphBasedWidth(font, normalizedText, fontSize);
if (glyphWidth != null) {
if (useCache) widthCache.put(createCacheKey(font, text, fontSize), glyphWidth);
return glyphWidth;
}
float fallbackWidth = calculateComprehensiveFallbackWidth(font, normalizedText, fontSize);
if (useCache) widthCache.put(createCacheKey(font, text, fontSize), fallbackWidth);
return fallbackWidth;
return calculateComprehensiveFallbackWidth(font, normalizedText, fontSize);
}
/**
 * Normalizes {@code text} to Unicode NFC so that composed and decomposed forms measure alike.
 *
 * @param text the text to normalize; may be {@code null}
 * @return the NFC form, the empty string for {@code null}, or the input unchanged if
 *     normalization fails
 */
private String normalizeText(String text) {
    if (text == null) {
        return "";
    }
    try {
        return Normalizer.normalize(text, Normalizer.Form.NFC);
    } catch (Exception ignored) {
        // Best effort: measuring the raw text is better than failing the whole calculation.
        return text;
    }
}
private Float calculateDirectWidth(PDFont font, String text, float fontSize) {
if (!TextEncodingHelper.canEncodeCharacters(font, text)) return null;
try {
float rawWidth = font.getStringWidth(text);
float scaledWidth = (rawWidth / FONT_SCALE_FACTOR) * fontSize;
return rawWidth >= 0 && scaledWidth >= 0 ? scaledWidth : null;
if (!TextEncodingHelper.canEncodeCharacters(font, text)) {
return null;
}
float rawWidth = font.getStringWidth(text) / 1000f;
if (rawWidth < 0) return null;
float scaledWidth = rawWidth * fontSize;
return scaledWidth >= 0 ? scaledWidth : null;
} catch (Exception e) {
return null;
}
@ -96,7 +77,12 @@ public class WidthCalculator {
for (int codePoint : codePoints) {
String character = new String(Character.toChars(codePoint));
Float charWidth = calculateSingleCharacterWidth(font, character, fontSize);
Float charWidth =
calculateSingleCharacterWidth(font, character, fontSize, codePoint);
if (charWidth == null) {
return null;
}
totalWidth += charWidth;
if (previousCodePoint != -1) {
@ -104,7 +90,7 @@ public class WidthCalculator {
}
previousCodePoint = codePoint;
}
return totalWidth;
return totalWidth >= 0 ? totalWidth : null;
} catch (Exception e) {
return null;
}
@ -112,80 +98,99 @@ public class WidthCalculator {
/**
 * Splits {@code text} into its Unicode code points, so that a surrogate pair yields one entry.
 *
 * @param text the text to split; may be {@code null}
 * @return a new mutable list of code points, empty for {@code null} or empty input
 */
private List<Integer> getCodePoints(String text) {
    List<Integer> codePoints = new ArrayList<>();
    if (text == null) {
        return codePoints;
    }
    // codePointAt cannot fail for an in-range index, so no per-iteration try/catch is needed.
    for (int i = 0; i < text.length(); ) {
        int codePoint = text.codePointAt(i);
        codePoints.add(codePoint);
        i += Character.charCount(codePoint);
    }
    return codePoints;
}
private Float calculateSingleCharacterWidth(PDFont font, String character, float fontSize) {
private Float calculateSingleCharacterWidth(
PDFont font, String character, float fontSize, int codePoint) {
try {
byte[] encoded = null;
try {
encoded = font.encode(character);
if (encoded.length == 0) encoded = null;
} catch (Exception e) {
log.debug("Direct encoding failed for '{}': {}", character, e.getMessage());
}
if (encoded == null && font instanceof PDType0Font) {
if (TextEncodingHelper.fontSupportsCharacter(font, character)) {
try {
encoded = character.getBytes(StandardCharsets.UTF_8);
} catch (Exception e) {
log.debug("UTF-8 encoding failed for '{}': {}", character, e.getMessage());
float raw = font.getStringWidth(character) / 1000f;
if (raw >= 0) return raw * fontSize;
} catch (Exception ignored) {
}
}
if (encoded != null && encoded.length > 0) {
Float width = calculateGlyphWidth(font, encoded, fontSize);
if (width != null && width >= 0) return width;
}
return calculateAverageCharacterWidth(font, fontSize);
} catch (Exception e) {
log.debug(
"Single character width calculation failed for '{}': {}",
character,
e.getMessage());
return calculateAverageCharacterWidth(font, fontSize);
}
}
private Float calculateGlyphWidth(PDFont font, byte[] encoded, float fontSize) {
for (byte b : encoded) {
try {
int glyphCode = b & 0xFF;
float glyphWidth = font.getWidth(glyphCode);
try {
float w = font.getWidth(codePoint) / 1000f;
if (w >= 0) return w * fontSize;
} catch (Exception ignored) {
}
if (glyphWidth > 0) {
return (glyphWidth / FONT_SCALE_FACTOR) * fontSize;
}
// Try alternative width methods
try {
glyphWidth = font.getWidthFromFont(glyphCode);
if (glyphWidth > 0) {
return (glyphWidth / FONT_SCALE_FACTOR) * fontSize;
}
} catch (Exception e) {
log.debug(
"getWidthFromFont failed for glyph {}: {}", glyphCode, e.getMessage());
}
} catch (Exception e) {
log.debug("Glyph width calculation failed for byte {}: {}", b, e.getMessage());
try {
if (codePoint >= 0 && codePoint <= 0xFFFF) {
float w = font.getWidth(codePoint) / 1000f;
if (w >= 0) return w * fontSize;
}
} catch (Exception ignored) {
}
return null;
try {
byte[] encoded = font.encode(character);
if (encoded.length > 0) {
for (byte b : encoded) {
try {
int glyphCode = b & 0xFF;
float w = font.getWidth(glyphCode) / 1000f;
if (w >= 0) return w * fontSize;
} catch (Exception ignored) {
}
}
}
} catch (Exception ignored) {
}
return calculateCategoryBasedWidth(font, codePoint, fontSize);
}
/**
 * Best-effort kerning adjustment between two adjacent code points, scaled to {@code fontSize}.
 *
 * <p>First looks, via reflection, for a public {@code getKerning(int, int)} method on a simple
 * font. Otherwise the adjustment is the width of the pair measured together minus the widths
 * of the two characters measured separately.
 *
 * @param font the font in use
 * @param leftCodePoint the code point on the left
 * @param rightCodePoint the code point on the right
 * @param fontSize the font size to scale to
 * @return the kerning adjustment, or {@code 0} if neither approach produced a value
 */
private float calculateKerning(
        PDFont font, int leftCodePoint, int rightCodePoint, float fontSize) {
    if (font instanceof PDSimpleFont) {
        try {
            // NOTE(review): PDFBox's documented PDFont API does not appear to declare
            // getKerning, so this lookup presumably always falls through - confirm.
            java.lang.reflect.Method getKerningMethod =
                    font.getClass().getMethod("getKerning", int.class, int.class);
            Object kerningValue = getKerningMethod.invoke(font, leftCodePoint, rightCodePoint);
            if (kerningValue instanceof Float) {
                return (((Float) kerningValue) / 1000f) * fontSize;
            }
        } catch (Exception ignored) {
            // Method missing or invocation failed; use the width-difference estimate.
        }
    }

    try {
        String leftChar = new String(Character.toChars(leftCodePoint));
        String rightChar = new String(Character.toChars(rightCodePoint));

        // NOTE(review): if getStringWidth merely sums per-glyph advances this difference is
        // always zero - confirm against the PDFBox implementation.
        float combinedWidth = font.getStringWidth(leftChar + rightChar) / 1000f;
        float leftWidth = font.getStringWidth(leftChar) / 1000f;
        float rightWidth = font.getStringWidth(rightChar) / 1000f;

        return (combinedWidth - leftWidth - rightWidth) * fontSize;
    } catch (Exception ignored) {
        // The pair cannot be measured; report no kerning.
    }

    return 0f;
}
private Float calculateGlyphBasedWidth(PDFont font, String text, float fontSize) {
@ -196,7 +201,6 @@ public class WidthCalculator {
int codePoint = text.codePointAt(i);
String character = new String(Character.toChars(codePoint));
// Try to get glyph information more comprehensively
Float charWidth =
calculateGlyphWidthComprehensively(font, character, codePoint, fontSize);
if (charWidth == null) {
@ -207,11 +211,9 @@ public class WidthCalculator {
i += Character.charCount(codePoint);
}
log.debug("Glyph-based width calculation: {}", totalWidth);
return totalWidth;
return totalWidth >= 0 ? totalWidth : null;
} catch (Exception e) {
log.debug("Glyph-based calculation failed: {}", e.getMessage());
return null;
}
}
@ -219,70 +221,118 @@ public class WidthCalculator {
private Float calculateGlyphWidthComprehensively(
PDFont font, String character, int codePoint, float fontSize) {
try {
// Method 1: Try standard encoding
try {
byte[] encoded = font.encode(character);
if (encoded.length > 0) {
Float width = calculateWidthFromEncodedBytes(font, encoded, fontSize);
if (width != null && width >= 0) {
return width;
}
byte[] encoded = font.encode(character);
if (encoded.length > 0) {
Float width = calculateWidthFromEncodedBytes(font, encoded, fontSize);
if (width != null && width >= 0) {
return width;
}
} catch (Exception e) {
log.debug(
"Standard encoding failed for U+{}: {}",
Integer.toHexString(codePoint),
e.getMessage());
}
// Method 2: Try Unicode code point directly
try {
float glyphWidth = font.getWidth(codePoint);
if (glyphWidth > 0) {
return (glyphWidth / FONT_SCALE_FACTOR) * fontSize;
}
} catch (Exception e) {
log.debug(
"Unicode code point width failed for U+{}: {}",
Integer.toHexString(codePoint),
e.getMessage());
}
// Method 3: Character category based estimation
return calculateCategoryBasedWidth(font, codePoint, fontSize);
} catch (Exception e) {
log.debug("Comprehensive glyph width calculation failed: {}", e.getMessage());
return calculateAverageCharacterWidth(font, fontSize);
}
try {
float glyphWidth = font.getWidth(codePoint) / 1000f;
if (glyphWidth >= 0) {
return glyphWidth * fontSize;
}
} catch (Exception e) {
}
try {
if (codePoint <= 0xFFFF) {
float glyphWidth = font.getWidth(codePoint) / 1000f;
if (glyphWidth >= 0) {
return glyphWidth * fontSize;
}
}
} catch (Exception e) {
}
try {
for (int code = 0; code <= 0xFF; code++) {
try {
String decoded = font.toUnicode(code);
if (decoded != null && decoded.equals(character)) {
float glyphWidth = font.getWidth(code) / 1000f;
if (glyphWidth >= 0) {
return glyphWidth * fontSize;
}
}
} catch (Exception e) {
}
}
} catch (Exception e) {
}
return calculateCategoryBasedWidth(font, codePoint, fontSize);
}
private Float calculateWidthFromEncodedBytes(PDFont font, byte[] encoded, float fontSize) {
// Try each byte as a potential glyph code
for (byte b : encoded) {
if (encoded == null || encoded.length == 0) return null;
if (font instanceof PDType0Font && encoded.length >= 2) {
try {
int glyphCode = b & 0xFF;
float width = font.getWidth(glyphCode);
if (width > 0) {
return (width / FONT_SCALE_FACTOR) * fontSize;
int glyphCode = ((encoded[0] & 0xFF) << 8) | (encoded[1] & 0xFF);
float width = font.getWidth(glyphCode) / 1000f;
if (width >= 0) {
return width * fontSize;
}
} catch (Exception e) {
}
try {
for (int i = 0; i <= encoded.length - 2; i++) {
int glyphCode = ((encoded[i] & 0xFF) << 8) | (encoded[i + 1] & 0xFF);
float width = font.getWidth(glyphCode) / 1000f;
if (width >= 0) {
return width * fontSize;
}
}
} catch (Exception e) {
// Continue trying other bytes
}
}
if (encoded.length >= 2 && font instanceof PDType0Font) {
for (byte b : encoded) {
try {
int glyphCode = ((encoded[0] & 0xFF) << 8) | (encoded[1] & 0xFF);
float width = font.getWidth(glyphCode);
if (width > 0) {
return (width / FONT_SCALE_FACTOR) * fontSize;
int glyphCode = b & 0xFF;
float width = font.getWidth(glyphCode) / 1000f;
if (width >= 0) {
return width * fontSize;
}
} catch (Exception e) {
log.debug("Multi-byte glyph code interpretation failed: {}", e.getMessage());
}
}
try {
if (encoded.length >= 3) {
int glyphCode =
((encoded[0] & 0xFF) << 16)
| ((encoded[1] & 0xFF) << 8)
| (encoded[2] & 0xFF);
float width = font.getWidth(glyphCode) / 1000f;
if (width >= 0) {
return width * fontSize;
}
}
} catch (Exception e) {
}
try {
if (encoded.length >= 4) {
int glyphCode =
((encoded[0] & 0xFF) << 24)
| ((encoded[1] & 0xFF) << 16)
| ((encoded[2] & 0xFF) << 8)
| (encoded[3] & 0xFF);
float width = font.getWidth(glyphCode) / 1000f;
if (width >= 0) {
return width * fontSize;
}
}
} catch (Exception e) {
}
return null;
}
@ -291,198 +341,237 @@ public class WidthCalculator {
int category = Character.getType(codePoint);
float baseWidth = calculateAverageCharacterWidth(font, fontSize);
// Adjust width based on character category
float multiplier =
switch (category) {
case Character.UPPERCASE_LETTER -> 1.2f;
case Character.LOWERCASE_LETTER -> 1.0f;
case Character.DECIMAL_DIGIT_NUMBER -> 1.0f;
case Character.SPACE_SEPARATOR -> 0.5f;
case Character.DASH_PUNCTUATION -> 0.8f;
case Character.OTHER_PUNCTUATION -> 0.6f;
case Character.CURRENCY_SYMBOL -> 1.1f;
case Character.MATH_SYMBOL -> 1.0f;
case Character.TITLECASE_LETTER -> 1.15f;
case Character.MODIFIER_LETTER -> 0.7f;
case Character.NON_SPACING_MARK -> 0.0f; // Combining characters
case Character.OTHER_LETTER -> 1.0f;
case Character.DECIMAL_DIGIT_NUMBER -> 1.0f;
case Character.LETTER_NUMBER -> 1.0f;
case Character.OTHER_NUMBER -> 1.0f;
case Character.SPACE_SEPARATOR -> 0.5f;
case Character.LINE_SEPARATOR -> 0.0f;
case Character.PARAGRAPH_SEPARATOR -> 0.0f;
case Character.NON_SPACING_MARK -> 0.0f;
case Character.ENCLOSING_MARK -> 0.0f;
case Character.COMBINING_SPACING_MARK -> 0.3f;
case Character.DASH_PUNCTUATION -> 0.8f;
case Character.START_PUNCTUATION -> 0.6f;
case Character.END_PUNCTUATION -> 0.6f;
case Character.CONNECTOR_PUNCTUATION -> 0.6f;
case Character.OTHER_PUNCTUATION -> 0.6f;
case Character.MATH_SYMBOL -> 1.0f;
case Character.CURRENCY_SYMBOL -> 1.1f;
case Character.MODIFIER_SYMBOL -> 0.8f;
case Character.OTHER_SYMBOL -> 1.0f;
case Character.INITIAL_QUOTE_PUNCTUATION -> 0.6f;
case Character.FINAL_QUOTE_PUNCTUATION -> 0.6f;
case Character.CONTROL -> 0.0f;
case Character.FORMAT -> 0.0f;
case Character.PRIVATE_USE -> 1.0f;
case Character.SURROGATE -> 0.0f;
case Character.UNASSIGNED -> 1.0f;
default -> 1.0f;
};
return baseWidth * multiplier;
float result = baseWidth * multiplier;
return result >= 0 ? result : baseWidth;
} catch (Exception e) {
log.debug("Category-based width calculation failed: {}", e.getMessage());
return calculateAverageCharacterWidth(font, fontSize);
}
}
/**
 * Estimates the width of an average character at {@code fontSize}.
 *
 * <p>Order: the font's reported average width, the mean width of a fixed set of letters and
 * digits, the first positive width among codes 32-126, half the font bounding-box width, and
 * finally {@code CONSERVATIVE_CHAR_WIDTH_RATIO} times the font size.
 *
 * @param font the font to query
 * @param fontSize the font size to scale to
 * @return the estimated average character width
 */
private float calculateAverageCharacterWidth(PDFont font, float fontSize) {
    try {
        float avgWidth = font.getAverageFontWidth() / 1000f;
        if (avgWidth > 0) {
            return avgWidth * fontSize;
        }
    } catch (Exception ignored) {
        // The font reports no average width; sample characters instead.
    }

    String[] testChars = {
        "a", "A", "e", "E", "i", "I", "o", "O", "n", "N", "t", "T", "r", "R", "s", "S", "0",
        "1", "2", "3", "4", "5"
    };
    float totalWidth = 0;
    int successCount = 0;
    for (String testChar : testChars) {
        try {
            float width = font.getStringWidth(testChar) / 1000f;
            if (width > 0) {
                totalWidth += width;
                successCount++;
            }
        } catch (Exception ignored) {
            // This sample cannot be measured; it is left out of the mean.
        }
    }
    if (successCount > 0) {
        return (totalWidth / successCount) * fontSize;
    }

    for (int code = 32; code <= 126; code++) {
        try {
            float width = font.getWidth(code) / 1000f;
            if (width > 0) {
                return width * fontSize;
            }
        } catch (Exception ignored) {
            // No width for this code; try the next one.
        }
    }

    try {
        if (font.getFontDescriptor() != null) {
            PDRectangle bbox = font.getFontDescriptor().getFontBoundingBox();
            if (bbox != null) {
                // Half the bounding-box width, in thousandths of the font size.
                float avgCharWidth = bbox.getWidth() / 2000f;
                return avgCharWidth * fontSize;
            }
        }
    } catch (Exception ignored) {
        // No usable descriptor; use the fixed ratio.
    }

    return CONSERVATIVE_CHAR_WIDTH_RATIO * fontSize;
}
/**
 * Last-resort width estimate for {@code text}, used when no exact measurement is available.
 *
 * <p>Order: the sum of category-based estimates per code point, then a bounding-box based
 * estimate, then the average character width times the number of characters.
 *
 * @param font the font in use
 * @param text the text to estimate; {@code null} or empty yields {@code 0}
 * @param fontSize the font size to scale to
 * @return the estimated width
 */
private float calculateComprehensiveFallbackWidth(PDFont font, String text, float fontSize) {
    if (text == null || text.isEmpty()) {
        return 0;
    }

    try {
        float charWidth = calculateAverageCharacterWidth(font, fontSize);
        float totalWidth = 0;

        for (int i = 0; i < text.length(); ) {
            int codePoint = text.codePointAt(i);
            Float specificWidth = calculateCategoryBasedWidth(font, codePoint, fontSize);
            totalWidth += (specificWidth != null) ? specificWidth : charWidth;
            i += Character.charCount(codePoint);
        }

        return totalWidth;
    } catch (Exception ignored) {
        // Per-character estimation failed; try the coarser estimates.
    }

    // Count characters as code points so a surrogate pair is not counted twice, matching the
    // per-code-point loop above.
    int characterCount = text.codePointCount(0, text.length());

    try {
        if (font.getFontDescriptor() != null
                && font.getFontDescriptor().getFontBoundingBox() != null) {
            PDRectangle bbox = font.getFontDescriptor().getFontBoundingBox();
            float avgCharWidth = bbox.getWidth() / 1000f;
            return characterCount * avgCharWidth * BBOX_CHAR_WIDTH_RATIO * fontSize;
        }
    } catch (Exception ignored) {
        // No usable bounding box; use the average character width.
    }

    return characterCount * calculateAverageCharacterWidth(font, fontSize);
}
/**
 * Estimates the width of {@code text} by weighting each code point according to its Unicode
 * general category, and logs how many characters fell into each counted category.
 *
 * @param text the text to analyze
 * @param avgCharWidth the average character width before scaling by the font size
 * @param fontSize the font size to scale to
 * @return the sum of the weighted per-character widths
 */
private float analyzeTextComposition(String text, float avgCharWidth, float fontSize) {
    int[] tally = new int[5]; // spaces, upper, lower, digits, punctuation
    float width = 0;

    int pos = 0;
    while (pos < text.length()) {
        final int cp = text.codePointAt(pos);
        pos += Character.charCount(cp);

        final float factor =
                switch (Character.getType(cp)) {
                    case Character.SPACE_SEPARATOR -> {
                        tally[0]++;
                        yield 0.5f;
                    }
                    case Character.UPPERCASE_LETTER -> {
                        tally[1]++;
                        yield 1.2f;
                    }
                    case Character.LOWERCASE_LETTER -> {
                        tally[2]++;
                        yield 1.0f;
                    }
                    case Character.DECIMAL_DIGIT_NUMBER -> {
                        tally[3]++;
                        yield 1.0f;
                    }
                    case Character.OTHER_PUNCTUATION, Character.DASH_PUNCTUATION -> {
                        tally[4]++;
                        yield 0.7f;
                    }
                    // Every other category is weighted with the bounding-box ratio.
                    default -> BBOX_CHAR_WIDTH_RATIO;
                };
        width += avgCharWidth * factor * fontSize;
    }

    log.debug(
            "Text composition analysis - Spaces: {}, Upper: {}, Lower: {}, Digits: {}, Punct: {}",
            tally[0],
            tally[1],
            tally[2],
            tally[3],
            tally[4]);

    return width;
}
/**
 * Estimates the width of {@code text} from the font's average width, adjusted by the ratio of
 * x-height to cap-height (clamped to the range 0.8 to 1.2) when both are available.
 *
 * @param font the font to query
 * @param text the text to estimate
 * @param fontSize the font size to scale to
 * @return the estimated width; falls back to {@code CONSERVATIVE_CHAR_WIDTH_RATIO} per
 *     character if the font metrics cannot be read
 */
private float calculateEnhancedAverageWidth(PDFont font, String text, float fontSize) {
    try {
        float baseAverage = font.getAverageFontWidth();

        float capHeight = 0;
        float xHeight = 0;

        if (font.getFontDescriptor() != null) {
            capHeight = font.getFontDescriptor().getCapHeight();
            xHeight = font.getFontDescriptor().getXHeight();
        }

        float adjustmentFactor = 1.0f;
        if (capHeight > 0 && xHeight > 0) {
            adjustmentFactor = Math.max(0.8f, Math.min(1.2f, xHeight / capHeight));
        }

        float adjustedAverage = (baseAverage * adjustmentFactor / FONT_SCALE_FACTOR) * fontSize;
        return text.length() * adjustedAverage;
    } catch (Exception e) {
        log.debug("Enhanced average width calculation failed: {}", e.getMessage());
        return text.length() * CONSERVATIVE_CHAR_WIDTH_RATIO * fontSize;
    }
}
public boolean isWidthCalculationReliable(PDFont font) {
if (font == null) {
return false;
if (font == null) return false;
try {
if (font.isDamaged()) return false;
} catch (Exception e) {
}
String cacheKey = createReliabilityCacheKey(font);
Boolean cachedResult = reliabilityCache.get(cacheKey);
if (cachedResult != null) {
log.debug(
"Using cached reliability result for font {}: {}",
font.getName(),
cachedResult);
return cachedResult;
try {
if (!TextEncodingHelper.canCalculateBasicWidths(font)) return false;
} catch (Exception e) {
}
boolean result = performReliabilityCheck(font);
try {
font.getStringWidth("A");
return true;
} catch (Exception e) {
}
reliabilityCache.put(cacheKey, result);
return result;
try {
font.getAverageFontWidth();
return true;
} catch (Exception e) {
}
try {
float width = font.getWidth(65);
return width >= 0;
} catch (Exception e) {
}
return false;
}
private boolean performReliabilityCheck(PDFont font) {
public float calculateMinimumTextWidth(PDFont font, String text, float fontSize) {
if (font == null || text == null || text.isEmpty() || fontSize <= 0) {
return 0;
}
try {
if (font.isDamaged()) {
log.debug("Font {} is damaged", font.getName());
return false;
float minWidth = calculateAccurateWidth(font, text, fontSize);
if (minWidth > 0) {
return minWidth * 0.8f;
}
if (!TextEncodingHelper.canCalculateBasicWidths(font)) {
log.debug("Font {} cannot perform basic width calculations", font.getName());
return false;
}
try {
font.getStringWidth("A");
return true;
} catch (Exception e) {
log.debug("Font {} failed basic width test: {}", font.getName(), e.getMessage());
}
// Check if we can at least get average width
try {
float avgWidth = font.getAverageFontWidth();
return avgWidth > 0;
} catch (Exception e) {
log.debug(
"Font {} cannot provide average width: {}", font.getName(), e.getMessage());
}
return false;
} catch (Exception e) {
log.debug("Reliability check failed for font {}: {}", font.getName(), e.getMessage());
}
return text.length() * fontSize * 0.3f;
}
/**
 * Returns an upper bound for the width of {@code text}: 120% of the calculated width, or one
 * font size per character when no positive width could be calculated.
 *
 * @param font the font to measure with
 * @param text the text to measure
 * @param fontSize the font size; must be greater than zero
 * @return the upper bound, or {@code 0} if {@code font} or {@code text} is {@code null}, the
 *     text is empty, or {@code fontSize} is not positive
 */
public float calculateMaximumTextWidth(PDFont font, String text, float fontSize) {
    final boolean usable = font != null && text != null && !text.isEmpty() && fontSize > 0;
    if (!usable) {
        return 0;
    }

    float measured = 0;
    try {
        measured = calculateAccurateWidth(font, text, fontSize);
    } catch (Exception e) {
        // Measurement failed; the per-character estimate below is used.
    }

    return measured > 0 ? measured * 1.2f : text.length() * fontSize * 1.0f;
}
/**
 * Reports whether a width can be produced for {@code text} with {@code font}.
 *
 * <p>The earlier revision first attempted a direct and a character-by-character measurement,
 * but returned {@code true} whether or not they succeeded, so the attempts could not change
 * the answer and are omitted here.
 *
 * @param font the font to measure with
 * @param text the text to measure; the empty string is accepted
 * @return {@code true} if both arguments are non-null
 */
public boolean canCalculateWidthForText(PDFont font, String text) {
    return font != null && text != null;
}
}