enhance text handling and encoding validation

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
Balázs Szücs 2025-08-24 16:59:09 +02:00
parent e396b6cbb8
commit 7db58ad6dd
6 changed files with 1914 additions and 1119 deletions

View File

@ -6,23 +6,20 @@ import java.util.List;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import lombok.Getter;
import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition; import org.apache.pdfbox.text.TextPosition;
import lombok.extern.slf4j.Slf4j; import lombok.Getter;
import stirling.software.SPDF.model.PDFText; import stirling.software.SPDF.model.PDFText;
@Slf4j
public class TextFinder extends PDFTextStripper { public class TextFinder extends PDFTextStripper {
private final String searchTerm; private final String searchTerm;
private final boolean useRegex; private final boolean useRegex;
private final boolean wholeWordSearch; private final boolean wholeWordSearch;
@Getter @Getter private final List<PDFText> foundTexts = new ArrayList<>();
private final List<PDFText> foundTexts = new ArrayList<>();
private final List<TextPosition> pageTextPositions = new ArrayList<>(); private final List<TextPosition> pageTextPositions = new ArrayList<>();
private final StringBuilder pageTextBuilder = new StringBuilder(); private final StringBuilder pageTextBuilder = new StringBuilder();
@ -45,20 +42,39 @@ public class TextFinder extends PDFTextStripper {
@Override @Override
protected void writeString(String text, List<TextPosition> textPositions) { protected void writeString(String text, List<TextPosition> textPositions) {
pageTextBuilder.append(text); for (TextPosition tp : textPositions) {
pageTextPositions.addAll(textPositions); if (tp == null) continue;
String u = tp.getUnicode();
if (u == null) continue;
for (int i = 0; i < u.length(); ) {
int cp = u.codePointAt(i);
pageTextBuilder.append(Character.toChars(cp));
// Add one position per code unit appended (1-2 chars depending on surrogate)
int codeUnits = Character.charCount(cp);
for (int k = 0; k < codeUnits; k++) {
pageTextPositions.add(tp);
}
i += codeUnits;
}
}
} }
@Override @Override
protected void writeWordSeparator() { protected void writeWordSeparator() {
pageTextBuilder.append(getWordSeparator()); String sep = getWordSeparator();
pageTextPositions.add(null); // Placeholder for separator pageTextBuilder.append(sep);
for (int i = 0; i < sep.length(); i++) {
pageTextPositions.add(null);
}
} }
@Override @Override
protected void writeLineSeparator() { protected void writeLineSeparator() {
pageTextBuilder.append(getLineSeparator()); String sep = getLineSeparator();
pageTextPositions.add(null); // Placeholder for separator pageTextBuilder.append(sep);
for (int i = 0; i < sep.length(); i++) {
pageTextPositions.add(null);
}
} }
@Override @Override
@ -91,27 +107,10 @@ public class TextFinder extends PDFTextStripper {
Pattern pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); Pattern pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
Matcher matcher = pattern.matcher(text); Matcher matcher = pattern.matcher(text);
log.debug(
"Searching for '{}' in page {} with regex '{}' (wholeWord: {}, useRegex: {})",
processedSearchTerm,
getCurrentPageNo(),
regex,
wholeWordSearch,
useRegex);
int matchCount = 0;
while (matcher.find()) { while (matcher.find()) {
matchCount++;
int matchStart = matcher.start(); int matchStart = matcher.start();
int matchEnd = matcher.end(); int matchEnd = matcher.end();
log.debug(
"Found match #{} at positions {}-{}: '{}'",
matchCount,
matchStart,
matchEnd,
matcher.group());
float minX = Float.MAX_VALUE; float minX = Float.MAX_VALUE;
float minY = Float.MAX_VALUE; float minY = Float.MAX_VALUE;
float maxX = Float.MIN_VALUE; float maxX = Float.MIN_VALUE;
@ -119,13 +118,7 @@ public class TextFinder extends PDFTextStripper {
boolean foundPosition = false; boolean foundPosition = false;
for (int i = matchStart; i < matchEnd; i++) { for (int i = matchStart; i < matchEnd; i++) {
if (i >= pageTextPositions.size()) { if (i >= pageTextPositions.size()) continue;
log.debug(
"Position index {} exceeds available positions ({})",
i,
pageTextPositions.size());
continue;
}
TextPosition pos = pageTextPositions.get(i); TextPosition pos = pageTextPositions.get(i);
if (pos != null) { if (pos != null) {
foundPosition = true; foundPosition = true;
@ -137,11 +130,6 @@ public class TextFinder extends PDFTextStripper {
} }
if (!foundPosition && matchStart < pageTextPositions.size()) { if (!foundPosition && matchStart < pageTextPositions.size()) {
log.debug(
"Attempting to find nearby positions for match at {}-{}",
matchStart,
matchEnd);
for (int i = Math.max(0, matchStart - 5); for (int i = Math.max(0, matchStart - 5);
i < Math.min(pageTextPositions.size(), matchEnd + 5); i < Math.min(pageTextPositions.size(), matchEnd + 5);
i++) { i++) {
@ -166,29 +154,11 @@ public class TextFinder extends PDFTextStripper {
maxX, maxX,
maxY, maxY,
matcher.group())); matcher.group()));
log.debug(
"Added PDFText for match: page={}, bounds=({},{},{},{}), text='{}'",
getCurrentPageNo() - 1,
minX,
minY,
maxX,
maxY,
matcher.group());
} else { } else {
log.warn( // no position info
"Found text match '{}' but no valid position data at {}-{}",
matcher.group(),
matchStart,
matchEnd);
} }
} }
log.debug(
"Page {} search complete: found {} matches for '{}'",
getCurrentPageNo(),
matchCount,
processedSearchTerm);
super.endPage(page); super.endPage(page);
} }

View File

@ -2,6 +2,7 @@ package stirling.software.SPDF.utils.text;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
import java.nio.CharBuffer; import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder; import java.nio.charset.CharsetDecoder;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.ArrayList; import java.util.ArrayList;
@ -13,11 +14,9 @@ import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.font.*; import org.apache.pdfbox.pdmodel.font.*;
import lombok.experimental.UtilityClass; import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.service.RedactionService; import stirling.software.SPDF.service.RedactionService;
@Slf4j
@UtilityClass @UtilityClass
public class TextDecodingHelper { public class TextDecodingHelper {
@ -25,6 +24,8 @@ public class TextDecodingHelper {
private final int ASCII_UPPER_BOUND = 126; private final int ASCII_UPPER_BOUND = 126;
private final int EXTENDED_ASCII_LOWER_BOUND = 160; private final int EXTENDED_ASCII_LOWER_BOUND = 160;
private final int EXTENDED_ASCII_UPPER_BOUND = 255; private final int EXTENDED_ASCII_UPPER_BOUND = 255;
private final int PROBLEMATIC_CODE_LOWER_BOUND = 65488;
private final int PROBLEMATIC_CODE_UPPER_BOUND = 65535;
public PDFont getFontSafely(PDResources resources, COSName fontName) { public PDFont getFontSafely(PDResources resources, COSName fontName) {
if (resources == null || fontName == null) { if (resources == null || fontName == null) {
@ -33,27 +34,15 @@ public class TextDecodingHelper {
try { try {
PDFont font = resources.getFont(fontName); PDFont font = resources.getFont(fontName);
if (font == null) { if (font == null) return null;
return null;
}
try { try {
String fontNameCheck = font.getName(); String n = font.getName();
if (fontNameCheck == null || fontNameCheck.trim().isEmpty()) { if (n == null || n.trim().isEmpty()) return null;
log.debug("Font {} has null or empty name, skipping", fontName.getName());
return null;
}
} catch (Exception e) { } catch (Exception e) {
log.debug(
"Error accessing font name for {}, skipping: {}",
fontName.getName(),
e.getMessage());
return null; return null;
} }
return font; return font;
} catch (Exception e) { } catch (Exception e) {
log.debug("Error retrieving font {}: {}", fontName.getName(), e.getMessage());
return null; return null;
} }
} }
@ -65,90 +54,160 @@ public class TextDecodingHelper {
try { try {
byte[] bytes = cosString.getBytes(); byte[] bytes = cosString.getBytes();
if (bytes.length == 0) { if (bytes.length == 0) return;
return;
}
String basicDecoded = tryDecodeWithFont(font, cosString); String basicDecoded = tryDecodeWithFont(font, cosString);
if (basicDecoded != null if (basicDecoded != null
&& !basicDecoded.contains("?") && !basicDecoded.contains("?")
&& !basicDecoded.trim().isEmpty()) { && !basicDecoded.trim().isEmpty()) return;
return;
}
decodeCharactersEnhanced(font, bytes); decodeCharactersEnhanced(font, bytes);
} catch (Exception e) { } catch (Exception e) {
log.error("Decoding failed: {}", e.getMessage(), e);
try { try {
tryDecodeWithFont(font, cosString); tryDecodeWithFont(font, cosString);
} catch (Exception fallbackException) { } catch (Exception ignored) {
} }
} }
} }
public String decodeCharactersEnhanced(PDFont font, byte[] bytes) { public String decodeCharactersEnhanced(PDFont font, byte[] bytes) {
// Try font-guided decoding first
String fontPass = decodeByFontTables(font, bytes);
if (isAcceptable(fontPass)) return fontPass;
// Try UTF-8 strict decoding
String utf8 = tryDecodeCharset(bytes, StandardCharsets.UTF_8);
if (isAcceptable(utf8)) return utf8;
// UTF-16 BE/LE
String u16be = tryDecodeCharset(bytes, StandardCharsets.UTF_16BE);
if (isAcceptable(u16be)) return u16be;
String u16le = tryDecodeCharset(bytes, StandardCharsets.UTF_16LE);
if (isAcceptable(u16le)) return u16le;
// Common Windows encodings
String win1252 = tryDecodeCharset(bytes, Charset.forName("windows-1252"));
if (isAcceptable(win1252)) return win1252;
String win1250 = tryDecodeCharset(bytes, Charset.forName("windows-1250"));
if (isAcceptable(win1250)) return win1250;
String gb2312 = tryDecodeCharset(bytes, Charset.forName("GB2312"));
if (isAcceptable(gb2312)) return gb2312;
String big5 = tryDecodeCharset(bytes, Charset.forName("Big5"));
if (isAcceptable(big5)) return big5;
String shiftJis = tryDecodeCharset(bytes, Charset.forName("Shift_JIS"));
if (isAcceptable(shiftJis)) return shiftJis;
String euckr = tryDecodeCharset(bytes, Charset.forName("EUC-KR"));
if (isAcceptable(euckr)) return euckr;
// Fallback to ISO-8859-1
String latin1 = tryDecodeCharset(bytes, StandardCharsets.ISO_8859_1);
return isAcceptable(latin1) ? latin1 : null;
}
private String decodeByFontTables(PDFont font, byte[] bytes) {
if (font == null || bytes == null || bytes.length == 0) return null;
StringBuilder out = new StringBuilder(); StringBuilder out = new StringBuilder();
boolean hasValidCharacters = false;
int i = 0; int i = 0;
while (i < bytes.length) { while (i < bytes.length) {
int code = bytes[i] & 0xFF; String ch = null;
String charStr = decodeSingleCharacter(font, code, bytes); int consumed = 1;
try {
ch = tryToUnicode(font, bytes, i);
if (ch == null && i + 1 < bytes.length) {
consumed = 2;
ch = tryToUnicode(font, bytes, i, 2);
}
} catch (Exception ignored) {
}
if (!isPrintable(ch)) {
// Handle problematic character codes specifically
ch = "\uFFFD";
}
out.append(ch);
i += consumed;
}
String s = out.toString();
return isAcceptable(s) ? s : null;
}
if (charStr == null && code >= 128 && i + 1 < bytes.length) { private String tryToUnicode(PDFont font, byte[] bytes, int pos) {
int combinedCode = (code << 8) | (bytes[i + 1] & 0xFF); int code = bytes[pos] & 0xFF;
charStr = decodeSingleCharacter(font, combinedCode, bytes); try {
if (charStr != null) { return font.toUnicode(code);
i += 2; // Skip the next byte } catch (Exception e) {
out.append(charStr); return null;
hasValidCharacters = true;
continue;
} }
} }
if (charStr != null && !charStr.isEmpty()) { private String tryToUnicode(PDFont font, byte[] bytes, int pos, int len) {
out.append(charStr); if (pos + len - 1 >= bytes.length) return null;
hasValidCharacters = true; int code = 0;
} else { for (int j = 0; j < len; j++) code = (code << 8) | (bytes[pos + j] & 0xFF);
out.append('?'); try {
return font.toUnicode(code);
} catch (Exception e) {
return null;
} }
i++;
} }
String result = out.toString();
return hasValidCharacters ? result : null; private String tryDecodeCharset(byte[] bytes, Charset cs) {
try {
String s = new String(bytes, cs);
return isPrintable(s) ? s : null;
} catch (Exception e) {
return null;
}
}
/**
 * Reports whether a decoded string consists mostly of visible characters.
 *
 * <p>A code point counts as visible unless its general category is {@code CONTROL} or
 * {@code FORMAT}, or it is the replacement character U+FFFD. The string passes when the
 * number of visible code points is at least three quarters (integer division) of the total
 * number of code points, and never fewer than one.
 *
 * @param s candidate text; may be {@code null}
 * @return {@code false} for {@code null} or empty input, otherwise whether the visible
 *     share meets the threshold described above
 */
private boolean isPrintable(String s) {
    if (s == null || s.isEmpty()) {
        return false;
    }

    // Work on code points rather than chars so surrogate pairs count once.
    long visible =
            s.codePoints()
                    .filter(codePoint -> codePoint != 0xFFFD)
                    .filter(
                            codePoint -> {
                                int category = Character.getType(codePoint);
                                return category != Character.CONTROL
                                        && category != Character.FORMAT;
                            })
                    .count();

    int required = Math.max(1, s.codePointCount(0, s.length()) * 3 / 4);
    return visible >= required;
}
private boolean isAcceptable(String s) {
return isPrintable(s);
} }
public String decodeSingleCharacter(PDFont font, int code, byte[] bytes) { public String decodeSingleCharacter(PDFont font, int code, byte[] bytes) {
String charStr = null; String charStr = null;
try { try {
charStr = font.toUnicode(code); charStr = font.toUnicode(code);
} catch (Exception ignored) { } catch (Exception ignored) {
} }
if (charStr == null && font instanceof PDType0Font type0Font) { if (charStr == null && font instanceof PDType0Font type0Font) {
try { try {
int cid = (bytes.length > 1) ? ((bytes[0] & 0xFF) << 8) | (bytes[1] & 0xFF) : code; int cid = (bytes.length > 1) ? ((bytes[0] & 0xFF) << 8) | (bytes[1] & 0xFF) : code;
charStr = type0Font.toUnicode(cid); charStr = type0Font.toUnicode(cid);
log.debug("CID decoding successful for code {}: {}", cid, charStr); } catch (Exception ignored) {
} catch (Exception e) {
log.debug("CID decoding failed for code {}: {}", code, e.getMessage());
} }
} }
if (charStr == null && font.getName() != null && font.getName().contains("+")) { if (charStr == null && font.getName() != null && font.getName().contains("+")) {
charStr = mapSubsetCharacter(code); charStr = mapSubsetCharacter(code);
} }
if (charStr == null) { if (charStr == null) {
charStr = fallbackCharacterMapping(code, bytes, font); charStr = fallbackCharacterMapping(code, bytes, font);
} }
return charStr; return charStr;
} }
public String fallbackCharacterMapping(int code, byte[] bytes, PDFont font) { public String fallbackCharacterMapping(int code, byte[] bytes, PDFont font) {
try { try {
// Handle problematic high-range character codes that cause .notdef warnings
if (code >= PROBLEMATIC_CODE_LOWER_BOUND && code <= PROBLEMATIC_CODE_UPPER_BOUND) {
return handleProblematicCharacterCode(code, font);
}
if (font instanceof PDType0Font && bytes.length > 1) { if (font instanceof PDType0Font && bytes.length > 1) {
return null; return null;
} }
@ -164,18 +223,15 @@ public class TextDecodingHelper {
String fontName = font.getName(); String fontName = font.getName();
if (fontName != null) { if (fontName != null) {
String lowerName = fontName.toLowerCase(); String lowerName = fontName.toLowerCase();
if (lowerName.contains("cjk") if ((lowerName.contains("cjk")
|| lowerName.contains("gb") || lowerName.contains("gb")
|| lowerName.contains("jp")) { || lowerName.contains("jp"))
// Basic CJK fallback (expand with a lookup table if needed) && code >= 0x4E00
if (code >= 0x4E00 && code <= 0x9FFF) { && code <= 0x9FFF) {
return String.valueOf( return String.valueOf((char) code);
(char) code); // Unicode Basic Multilingual Plane for CJK
}
} }
} }
// Fallback to UTF-8/16 decoding attempt for unknown encodings
try { try {
if (bytes.length >= 2) { if (bytes.length >= 2) {
ByteBuffer buffer = ByteBuffer.wrap(bytes); ByteBuffer buffer = ByteBuffer.wrap(bytes);
@ -184,7 +240,7 @@ public class TextDecodingHelper {
return charBuffer.toString(); return charBuffer.toString();
} }
} catch (Exception e) { } catch (Exception e) {
log.debug("UTF fallback failed: {}", e.getMessage());
} }
return null; return null;
@ -193,6 +249,19 @@ public class TextDecodingHelper {
} }
} }
/**
 * Produces a substitute string for a character code in the range
 * {@code PROBLEMATIC_CODE_LOWER_BOUND}..{@code PROBLEMATIC_CODE_UPPER_BOUND} (inclusive).
 *
 * <p>The code is rebased by subtracting {@code PROBLEMATIC_CODE_LOWER_BOUND}. If the rebased
 * value is at least {@code ASCII_LOWER_BOUND} it is returned as a single character. Otherwise,
 * when the font name contains {@code '+'}, the rebased value is passed to
 * {@link #mapSubsetCharacter(int)} and that result is returned as-is.
 * NOTE(review): the {@code '+'} check presumably identifies subset fonts — confirm.
 *
 * @param code character code to translate
 * @param font font the code came from; may be {@code null}
 * @return the substitute described above, or the Unicode replacement character U+FFFD when
 *     the code is outside the handled range or neither rule applies
 */
public String handleProblematicCharacterCode(int code, PDFont font) {
    if (code >= PROBLEMATIC_CODE_LOWER_BOUND && code <= PROBLEMATIC_CODE_UPPER_BOUND) {
        int adjustedCode = code - PROBLEMATIC_CODE_LOWER_BOUND;
        if (adjustedCode >= ASCII_LOWER_BOUND) {
            return String.valueOf((char) adjustedCode);
        }
        if (font != null && font.getName() != null && font.getName().contains("+")) {
            return mapSubsetCharacter(adjustedCode);
        }
    }
    // Written as an escape so the replacement character survives any re-encoding of the file.
    return "\uFFFD";
}
public String mapSubsetCharacter(int code) { public String mapSubsetCharacter(int code) {
if (code >= ASCII_LOWER_BOUND && code <= ASCII_UPPER_BOUND) { if (code >= ASCII_LOWER_BOUND && code <= ASCII_UPPER_BOUND) {
return String.valueOf((char) code); return String.valueOf((char) code);
@ -221,6 +290,7 @@ public class TextDecodingHelper {
uni = font.toUnicode(code); uni = font.toUnicode(code);
} catch (Exception ignored) { } catch (Exception ignored) {
} }
if (uni != null) { if (uni != null) {
out.append(uni); out.append(uni);
anyMapped = true; anyMapped = true;
@ -239,6 +309,7 @@ public class TextDecodingHelper {
u1 = font.toUnicode(b1); u1 = font.toUnicode(b1);
} catch (Exception ignored) { } catch (Exception ignored) {
} }
if (i + 1 < bytes.length) { if (i + 1 < bytes.length) {
int b2 = bytes[i + 1] & 0xFF; int b2 = bytes[i + 1] & 0xFF;
int code = (b1 << 8) | b2; int code = (b1 << 8) | b2;
@ -247,6 +318,12 @@ public class TextDecodingHelper {
u2 = font.toUnicode(code); u2 = font.toUnicode(code);
} catch (Exception ignored) { } catch (Exception ignored) {
} }
// Handle problematic multi-byte codes
if (u2 == null && code >= PROBLEMATIC_CODE_LOWER_BOUND) {
u2 = handleProblematicCharacterCode(code, font);
}
if (u2 != null) { if (u2 != null) {
out.append(u2); out.append(u2);
i += 2; i += 2;
@ -267,12 +344,12 @@ public class TextDecodingHelper {
} }
} }
public static RedactionService.DecodedMapping buildDecodeMapping(PDFont font, byte[] bytes) { public RedactionService.DecodedMapping buildDecodeMapping(PDFont font, byte[] bytes) {
RedactionService.DecodedMapping map = new RedactionService.DecodedMapping(); RedactionService.DecodedMapping map = new RedactionService.DecodedMapping();
if (font == null || bytes == null) { if (font == null || bytes == null) {
map.text = ""; map.setText("");
map.charByteStart = new int[0]; map.setCharByteStart(new int[0]);
map.charByteEnd = new int[0]; map.setCharByteEnd(new int[0]);
return map; return map;
} }
@ -289,46 +366,32 @@ public class TextDecodingHelper {
while (i < bytes.length) { while (i < bytes.length) {
int start = i; int start = i;
String decodedChar = null; String decodedChar;
int consumed = 1; int consumed;
try { try {
if (isType0) { if (isType0) {
// Handle CID fonts and multi-byte encodings
decodedChar = decodeType0Font((PDType0Font) font, bytes, i); decodedChar = decodeType0Font((PDType0Font) font, bytes, i);
consumed = getType0CharLength((PDType0Font) font, bytes, i); consumed = getType0CharLength((PDType0Font) font, bytes, i);
} else if (isType1) { } else if (isType1) {
// Handle Type1 fonts with specific encoding
decodedChar = decodeType1Font((PDType1Font) font, bytes, i); decodedChar = decodeType1Font((PDType1Font) font, bytes, i);
consumed = getType1CharLength((PDType1Font) font, bytes, i); consumed = 1;
} else if (isType3) { } else if (isType3) {
// Handle Type3 bitmap fonts
decodedChar = decodeType3Font((PDType3Font) font, bytes, i); decodedChar = decodeType3Font((PDType3Font) font, bytes, i);
consumed = 1; // Type3 typically single byte consumed = 1;
} else if (isTrueType) { } else if (isTrueType) {
// Handle TrueType fonts
decodedChar = decodeTrueTypeFont((PDTrueTypeFont) font, bytes, i); decodedChar = decodeTrueTypeFont((PDTrueTypeFont) font, bytes, i);
consumed = getTrueTypeCharLength((PDTrueTypeFont) font, bytes, i); consumed = getTrueTypeCharLength((PDTrueTypeFont) font, bytes, i);
} else { } else {
// Generic fallback for other font types
decodedChar = decodeGenericFont(font, bytes, i); decodedChar = decodeGenericFont(font, bytes, i);
consumed = getGenericCharLength(font, bytes, i);
}
// Validate the consumed length
if (consumed <= 0 || i + consumed > bytes.length) {
consumed = 1; consumed = 1;
} }
if (consumed <= 0 || i + consumed > bytes.length) consumed = 1;
} catch (Exception e) { } catch (Exception e) {
// Log the error for debugging purposes
System.err.println(
"Error decoding character at position " + i + ": " + e.getMessage());
decodedChar = null; decodedChar = null;
consumed = 1; consumed = 1;
} }
// Handle null or empty decoded characters
if (decodedChar == null || decodedChar.isEmpty()) { if (decodedChar == null || decodedChar.isEmpty()) {
decodedChar = handleUndecodableChar(bytes, i, consumed); decodedChar = handleUndecodableChar(bytes, i, consumed);
} }
@ -345,15 +408,14 @@ public class TextDecodingHelper {
i += consumed; i += consumed;
} }
map.text = sb.toString(); map.setText(sb.toString());
map.charByteStart = starts.stream().mapToInt(Integer::intValue).toArray(); map.setCharByteStart(starts.stream().mapToInt(Integer::intValue).toArray());
map.charByteEnd = ends.stream().mapToInt(Integer::intValue).toArray(); map.setCharByteEnd(ends.stream().mapToInt(Integer::intValue).toArray());
return map; return map;
} }
private static String decodeType0Font(PDType0Font font, byte[] bytes, int position) { private String decodeType0Font(PDType0Font font, byte[] bytes, int position) {
try { try {
// Try multi-byte decoding first (common for CJK fonts)
if (position + 1 < bytes.length) { if (position + 1 < bytes.length) {
int b1 = bytes[position] & 0xFF; int b1 = bytes[position] & 0xFF;
int b2 = bytes[position + 1] & 0xFF; int b2 = bytes[position + 1] & 0xFF;
@ -372,7 +434,7 @@ public class TextDecodingHelper {
} }
} }
private static int getType0CharLength(PDType0Font font, byte[] bytes, int position) { private int getType0CharLength(PDType0Font font, byte[] bytes, int position) {
try { try {
if (position + 1 < bytes.length) { if (position + 1 < bytes.length) {
int b1 = bytes[position] & 0xFF; int b1 = bytes[position] & 0xFF;
@ -389,7 +451,7 @@ public class TextDecodingHelper {
} }
} }
private static String decodeType1Font(PDType1Font font, byte[] bytes, int position) { private String decodeType1Font(PDType1Font font, byte[] bytes, int position) {
try { try {
int code = bytes[position] & 0xFF; int code = bytes[position] & 0xFF;
return font.toUnicode(code); return font.toUnicode(code);
@ -398,11 +460,7 @@ public class TextDecodingHelper {
} }
} }
private static int getType1CharLength(PDType1Font font, byte[] bytes, int position) { private String decodeType3Font(PDType3Font font, byte[] bytes, int position) {
return 1; // Type1 fonts are typically single-byte
}
private static String decodeType3Font(PDType3Font font, byte[] bytes, int position) {
try { try {
int code = bytes[position] & 0xFF; int code = bytes[position] & 0xFF;
return font.toUnicode(code); return font.toUnicode(code);
@ -411,7 +469,7 @@ public class TextDecodingHelper {
} }
} }
private static String decodeTrueTypeFont(PDTrueTypeFont font, byte[] bytes, int position) { private String decodeTrueTypeFont(PDTrueTypeFont font, byte[] bytes, int position) {
try { try {
int code = bytes[position] & 0xFF; int code = bytes[position] & 0xFF;
String unicode = font.toUnicode(code); String unicode = font.toUnicode(code);
@ -429,7 +487,7 @@ public class TextDecodingHelper {
} }
} }
private static int getTrueTypeCharLength(PDTrueTypeFont font, byte[] bytes, int position) { private int getTrueTypeCharLength(PDTrueTypeFont font, byte[] bytes, int position) {
try { try {
// First try single byte // First try single byte
int code = bytes[position] & 0xFF; int code = bytes[position] & 0xFF;
@ -454,7 +512,7 @@ public class TextDecodingHelper {
} }
} }
private static String decodeGenericFont(PDFont font, byte[] bytes, int position) { private String decodeGenericFont(PDFont font, byte[] bytes, int position) {
try { try {
int code = bytes[position] & 0xFF; int code = bytes[position] & 0xFF;
return font.toUnicode(code); return font.toUnicode(code);
@ -463,13 +521,8 @@ public class TextDecodingHelper {
} }
} }
private static int getGenericCharLength(PDFont font, byte[] bytes, int position) { private String handleUndecodableChar(byte[] bytes, int position, int length) {
return 1; // Default to single byte for unknown font types
}
private static String handleUndecodableChar(byte[] bytes, int position, int length) {
// Or try to interpret as ISO-8859-1 (Latin-1) as fallback
try { try {
byte[] charBytes = new byte[length]; byte[] charBytes = new byte[length];
System.arraycopy(bytes, position, charBytes, 0, length); System.arraycopy(bytes, position, charBytes, 0, length);
@ -478,9 +531,7 @@ public class TextDecodingHelper {
return fallback; return fallback;
} }
} catch (Exception e) { } catch (Exception e) {
// Ignore and fall through to default
} }
return "\uFFFD";
return "\uFFFD"; // Unicode replacement character instead of "?"
} }
} }

View File

@ -1,11 +1,6 @@
package stirling.software.SPDF.utils.text; package stirling.software.SPDF.utils.text;
import java.io.IOException;
import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
import org.apache.pdfbox.pdmodel.font.encoding.DictionaryEncoding;
import org.apache.pdfbox.pdmodel.font.encoding.Encoding;
import lombok.experimental.UtilityClass; import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
@ -15,225 +10,360 @@ import lombok.extern.slf4j.Slf4j;
public class TextEncodingHelper { public class TextEncodingHelper {
public boolean canEncodeCharacters(PDFont font, String text) { public boolean canEncodeCharacters(PDFont font, String text) {
if (font == null || text == null || text.isEmpty()) { if (font == null || text == null) {
return false; return false;
} }
try { if (text.isEmpty()) {
// Step 1: Primary check - full-string encoding (permissive for "good" cases)
byte[] encoded = font.encode(text);
if (encoded.length > 0) {
log.debug(
"Text '{}' has good full-string encoding for font {} - permissively allowing",
text,
font.getName() != null ? font.getName() : "Unknown");
return true; return true;
} }
// Step 2: Smart array-based fallback for TJ operator-style text try {
log.debug( byte[] encoded = font.encode(text);
"Full encoding failed for '{}' - using array-based fallback for font {}", if (encoded.length > 0) {
text, return true;
font.getName() != null ? font.getName() : "Unknown"); }
} catch (Exception e) {
return validateAsCodePointArray(font, text);
} catch (IOException | IllegalArgumentException e) {
log.debug(
"Encoding exception for text '{}' with font {} - trying array fallback: {}",
text,
font.getName() != null ? font.getName() : "Unknown",
e.getMessage());
if (isFontSubset(font.getName()) || hasCustomEncoding(font)) {
return validateAsCodePointArray(font, text);
} }
return false; // Non-subset fonts with encoding exceptions are likely problematic return validateAsCodePointArray(font, text);
}
} }
private boolean validateAsCodePointArray(PDFont font, String text) { private boolean validateAsCodePointArray(PDFont font, String text) {
if (text == null || text.isEmpty()) {
return true;
}
int totalCodePoints = 0; int totalCodePoints = 0;
int successfulCodePoints = 0; int successfulCodePoints = 0;
// Iterate through code points (handles surrogates correctly per Unicode docs)
for (int i = 0; i < text.length(); ) { for (int i = 0; i < text.length(); ) {
int codePoint = text.codePointAt(i); int codePoint = text.codePointAt(i);
String charStr = new String(Character.toChars(codePoint)); String charStr = new String(Character.toChars(codePoint));
totalCodePoints++; totalCodePoints++;
try { try {
// Test encoding for this code point
byte[] charEncoded = font.encode(charStr); byte[] charEncoded = font.encode(charStr);
if (charEncoded.length > 0) { if (charEncoded.length > 0) {
try {
float charWidth = font.getStringWidth(charStr); float charWidth = font.getStringWidth(charStr);
if (charWidth >= 0) { if (charWidth >= 0) {
successfulCodePoints++; successfulCodePoints++;
log.debug( }
"Code point '{}' (U+{}) encoded successfully", } catch (Exception e) {
charStr, try {
Integer.toHexString(codePoint).toUpperCase()); if (canDecodeCharacter(font, charStr)) {
} else { successfulCodePoints++;
log.debug( }
"Code point '{}' (U+{}) has invalid width: {}", } catch (Exception e2) {
charStr, }
Integer.toHexString(codePoint).toUpperCase(),
charWidth);
} }
} else { } else {
log.debug( try {
"Code point '{}' (U+{}) encoding failed - empty result", if (canDecodeCharacter(font, charStr)) {
charStr, successfulCodePoints++;
Integer.toHexString(codePoint).toUpperCase()); }
} catch (Exception e) {
}
}
} catch (Exception e) {
try {
if (canDecodeCharacter(font, charStr)) {
successfulCodePoints++;
}
} catch (Exception e2) {
if (isBasicCharacter(codePoint)) {
successfulCodePoints++;
}
} }
} catch (IOException | IllegalArgumentException e) {
log.debug(
"Code point '{}' (U+{}) validation failed: {}",
charStr,
Integer.toHexString(codePoint).toUpperCase(),
e.getMessage());
} }
i += Character.charCount(codePoint); // Handle surrogates properly i += Character.charCount(codePoint);
} }
double successRate = if (totalCodePoints == 0) {
totalCodePoints > 0 ? (double) successfulCodePoints / totalCodePoints : 0; return true;
boolean isAcceptable = successRate >= 0.95;
log.debug(
"Array validation for '{}': {}/{} code points successful ({:.1f}%) - {}",
text,
successfulCodePoints,
totalCodePoints,
successRate * 100,
isAcceptable ? "ALLOWING" : "rejecting");
return isAcceptable;
} }
public boolean isTextSegmentRemovable(PDFont font, String text) { double successRate = (double) successfulCodePoints / totalCodePoints;
if (font == null || text == null || text.isEmpty()) { return successRate >= 0.1;
}
private boolean canDecodeCharacter(PDFont font, String charStr) {
if (font == null || charStr == null || charStr.isEmpty()) {
return false; return false;
} }
// Log the attempt try {
log.debug( for (int code = 0; code <= 0xFFFF; code++) {
"Evaluating text segment for removal: '{}' with font {}", try {
text, String decoded = font.toUnicode(code);
font.getName() != null ? font.getName() : "Unknown Font"); if (decoded != null && decoded.equals(charStr)) {
return true;
}
} catch (Exception e) {
}
}
} catch (Exception e) {
}
return false;
}
/**
 * Reports whether a code point is treated as "basic".
 *
 * <p>A code point qualifies if it lies in 32..126 or 160..255 (both inclusive), or if
 * {@link Character#isWhitespace(int)} or {@link Character#isLetterOrDigit(int)} accepts it.
 *
 * @param codePoint Unicode code point to classify
 * @return {@code true} when any of the rules above matches
 */
private boolean isBasicCharacter(int codePoint) {
    if (codePoint >= 32 && codePoint <= 126) {
        return true;
    }
    if (codePoint >= 160 && codePoint <= 255) {
        return true;
    }
    if (Character.isWhitespace(codePoint)) {
        return true;
    }
    return Character.isLetterOrDigit(codePoint);
}
public boolean isTextSegmentRemovable(PDFont font, String text) {
if (font == null || text == null) {
return false;
}
if (text.isEmpty()) {
return true;
}
if (isSimpleCharacter(text)) { if (isSimpleCharacter(text)) {
try { try {
font.encode(text); font.encode(text);
font.getStringWidth(text); font.getStringWidth(text);
log.debug(
"Text '{}' is a simple character and passed validation - allowing removal",
text);
return true; return true;
} catch (Exception e) { } catch (Exception e) {
log.debug( try {
"Simple character '{}' failed basic validation with font {}: {}", return canHandleText(font, text);
text, } catch (Exception e2) {
font.getName() != null ? font.getName() : "Unknown",
e.getMessage());
return false; return false;
} }
} }
}
// For complex text, require comprehensive validation
return isTextFullyRemovable(font, text); return isTextFullyRemovable(font, text);
} }
/**
 * Checks, one code point at a time, whether the font can deal with every character of the
 * text.
 *
 * <p>A code point is accepted as soon as one of these holds, tried in order: {@code
 * font.encode} returns a non-empty byte array; {@code canDecodeCharacter} returns {@code
 * true}; {@code isBasicCharacter} returns {@code true}. Exceptions thrown by the first two
 * checks are treated as "not supported" and the next check is tried.
 *
 * @param font font to test against
 * @param text text to examine
 * @return {@code false} if either argument is {@code null} or any code point fails all three
 *     checks; {@code true} otherwise (including for empty text)
 */
private boolean canHandleText(PDFont font, String text) {
    if (font == null || text == null) {
        return false;
    }

    // An empty string never enters the loop and is therefore accepted.
    int index = 0;
    while (index < text.length()) {
        final int codePoint = text.codePointAt(index);
        final String glyph = String.valueOf(Character.toChars(codePoint));

        boolean supported;
        try {
            supported = font.encode(glyph).length > 0;
        } catch (Exception ignored) {
            // Encoding failure is not fatal; fall through to the decode-based check.
            supported = false;
        }

        if (!supported) {
            try {
                supported = canDecodeCharacter(font, glyph);
            } catch (Exception ignored) {
                // Still undecided; the basic-character rule below gets the last word.
                supported = false;
            }
        }

        if (!supported && !isBasicCharacter(codePoint)) {
            return false;
        }

        index += Character.charCount(codePoint);
    }
    return true;
}
public boolean isTextFullyRemovable(PDFont font, String text) { public boolean isTextFullyRemovable(PDFont font, String text) {
if (font == null || text == null || text.isEmpty()) { if (font == null || text == null) {
return false;
}
if (text.isEmpty()) {
return true;
}
try {
if (!canEncodeCharacters(font, text)) {
return false; return false;
} }
try { try {
// Check 1: Verify encoding capability using new smart approach
if (!canEncodeCharacters(font, text)) {
log.debug(
"Text '{}' failed encoding validation for font {}",
text,
font.getName() != null ? font.getName() : "Unknown");
return false;
}
// Check 2: Validate width calculation capability
float width = font.getStringWidth(text); float width = font.getStringWidth(text);
if (width < 0) { // Allow zero width (invisible chars) but reject negative (invalid) if (width < 0) {
log.debug(
"Text '{}' has invalid width {} for font {}",
text,
width,
font.getName() != null ? font.getName() : "Unknown");
return false; // Invalid metrics prevent accurate removal
}
// Check 3: Verify font descriptor completeness for redaction area calculation
if (font.getFontDescriptor() == null) {
log.debug(
"Missing font descriptor for font {}",
font.getName() != null ? font.getName() : "Unknown");
return false; return false;
} }
} catch (Exception e) {
try {
if (!canCalculateTextWidth(font, text)) {
return false;
}
} catch (Exception e2) {
return false;
}
}
try {
if (font.getFontDescriptor() == null) {
try {
return canHandleWithoutDescriptor(font, text);
} catch (Exception e) {
return false;
}
}
} catch (Exception e) {
try {
return canHandleWithoutDescriptor(font, text);
} catch (Exception e2) {
return false;
}
}
// Check 4: Test bounding box calculation for redaction area
try { try {
font.getFontDescriptor().getFontBoundingBox(); font.getFontDescriptor().getFontBoundingBox();
} catch (IllegalArgumentException e) { } catch (Exception e) {
log.debug( try {
"Font bounding box unavailable for font {}: {}", return canHandleWithoutBoundingBox(font, text);
font.getName() != null ? font.getName() : "Unknown", } catch (Exception e2) {
e.getMessage()); return false;
}
}
return true;
} catch (Exception e) {
try {
return canHandleText(font, text);
} catch (Exception e2) {
return false;
}
}
}
private boolean canCalculateTextWidth(PDFont font, String text) {
if (font == null || text == null) {
return false; return false;
} }
log.debug( if (text.isEmpty()) {
"Text '{}' passed comprehensive validation for font {}",
text,
font.getName() != null ? font.getName() : "Unknown");
return true; return true;
}
} catch (IOException e) { for (int i = 0; i < text.length(); ) {
log.debug( int codePoint = text.codePointAt(i);
"Text '{}' failed validation for font {} due to IO error: {}", String charStr = new String(Character.toChars(codePoint));
text,
font.getName() != null ? font.getName() : "Unknown", boolean hasWidth = false;
e.getMessage()); try {
return false; float charWidth = font.getStringWidth(charStr);
} catch (IllegalArgumentException e) { if (charWidth >= 0) {
log.debug( hasWidth = true;
"Text '{}' failed validation for font {} due to argument error: {}", }
text, } catch (Exception e) {
font.getName() != null ? font.getName() : "Unknown", try {
e.getMessage()); float defaultWidth = getDefaultCharWidth(font);
if (defaultWidth > 0) {
hasWidth = true;
}
} catch (Exception e2) {
}
}
if (!hasWidth && isBasicCharacter(codePoint)) {
hasWidth = true;
}
if (!hasWidth) {
return false; return false;
} }
i += Character.charCount(codePoint);
}
return true;
}
/**
 * Picks a stand-in character width for the font.
 *
 * <p>A fixed list of common characters is measured in order with {@code font.getStringWidth};
 * the first strictly positive result is returned. Samples that throw are skipped.
 *
 * @param font the font to measure with
 * @return the first positive sample width, or {@code 500} when no sample yields one
 */
private float getDefaultCharWidth(PDFont font) {
    final float fallbackWidth = 500;
    String[] samples = {" ", "a", "A", "0", ".", "e", "!", "i", "l", "I"};

    for (int idx = 0; idx < samples.length; idx++) {
        float measured;
        try {
            measured = font.getStringWidth(samples[idx]);
        } catch (Exception ignored) {
            // This sample cannot be measured; try the next one.
            continue;
        }
        if (measured > 0) {
            return measured;
        }
    }
    return fallbackWidth;
}
/**
 * Decides whether the text can be handled by relying on width calculation alone.
 *
 * <p>Delegates to {@code canCalculateTextWidth}; if that call throws, the answer comes from
 * {@code canHandleText} instead.
 *
 * @param font the font to probe
 * @param text the text to check
 * @return the result of the width check, or of the text-handling check when the former throws
 */
private boolean canHandleWithoutDescriptor(PDFont font, String text) {
    boolean handled;
    try {
        handled = canCalculateTextWidth(font, text);
    } catch (Exception widthFailure) {
        // Width probing blew up; fall back to the per-character check.
        handled = canHandleText(font, text);
    }
    return handled;
}
private boolean canHandleWithoutBoundingBox(PDFont font, String text) {
try {
return canCalculateTextWidth(font, text);
} catch (Exception e) {
return canHandleText(font, text);
}
} }
private boolean isSimpleCharacter(String text) { private boolean isSimpleCharacter(String text) {
if (text == null || text.isEmpty()) { if (text == null) {
return false; return false;
} }
if (text.length() > 20) { if (text.isEmpty()) {
return true;
}
if (text.length() > 50) {
return false; return false;
} }
for (int i = 0; i < text.length(); i++) { for (int i = 0; i < text.length(); i++) {
char c = text.charAt(i); char c = text.charAt(i);
// Allow letters, digits, and whitespace (most common cases)
if (Character.isLetterOrDigit(c) || Character.isWhitespace(c)) { if (Character.isLetterOrDigit(c) || Character.isWhitespace(c)) {
continue; continue;
} }
// Allow common ASCII punctuation if (c >= 32 && c <= 126) {
if (c >= 32 && c <= 126 && ".,!?;:()-[]{}\"'/@#$%&*+=<>|\\~`".indexOf(c) >= 0) { continue;
}
if (c >= 160 && c <= 255) {
continue;
}
if (Character.getType(c) == Character.OTHER_PUNCTUATION
|| Character.getType(c) == Character.DASH_PUNCTUATION
|| Character.getType(c) == Character.START_PUNCTUATION
|| Character.getType(c) == Character.END_PUNCTUATION
|| Character.getType(c) == Character.CONNECTOR_PUNCTUATION
|| Character.getType(c) == Character.OTHER_SYMBOL
|| Character.getType(c) == Character.MATH_SYMBOL
|| Character.getType(c) == Character.CURRENCY_SYMBOL) {
continue; continue;
} }
@ -243,111 +373,205 @@ public class TextEncodingHelper {
return true; return true;
} }
/**
 * Heuristically reports whether a font uses a custom (non-standard) encoding.
 *
 * <ul>
 *   <li>Simple fonts: {@code true} when the encoding is a {@code DictionaryEncoding}, when the
 *       encoding class's simple name contains "Custom" or "Dictionary", or when reading the
 *       encoding throws.
 *   <li>Type0 fonts and every other font type: {@code false}.
 * </ul>
 *
 * @param font the font to inspect; {@code null} yields {@code false}
 * @return {@code true} if the font appears to use a custom encoding
 */
public boolean hasCustomEncoding(PDFont font) {
    // Without this guard a null font skipped both instanceof branches and then threw a
    // NullPointerException from font.getName() in the final log statement.
    if (font == null) {
        return false;
    }
    try {
        if (font instanceof PDSimpleFont simpleFont) {
            try {
                Encoding encoding = simpleFont.getEncoding();
                if (encoding != null) {
                    // Check for dictionary-based custom encodings
                    if (encoding instanceof DictionaryEncoding) {
                        log.debug("Font {} uses DictionaryEncoding (custom)", font.getName());
                        return true;
                    }
                    // Name-based check for encoding classes not matched by the instanceof test.
                    String encodingName = encoding.getClass().getSimpleName();
                    if (encodingName.contains("Custom")
                            || encodingName.contains("Dictionary")) {
                        log.debug(
                                "Font {} uses custom encoding: {}",
                                font.getName(),
                                encodingName);
                        return true;
                    }
                }
            } catch (Exception e) {
                log.debug(
                        "Encoding detection failed for font {}: {}",
                        font.getName(),
                        e.getMessage());
                return true; // Assume custom if detection fails
            }
        }
        if (font instanceof org.apache.pdfbox.pdmodel.font.PDType0Font) {
            log.debug(
                    "Font {} is Type0 (CID) - generally uses standard CMaps",
                    font.getName() != null ? font.getName() : "Unknown");
            return false;
        }
        log.debug(
                "Font {} type {} - assuming standard encoding",
                font.getName() != null ? font.getName() : "Unknown",
                font.getClass().getSimpleName());
        return false;
    } catch (IllegalArgumentException e) {
        log.debug(
                "Custom encoding detection failed for font {}: {}",
                font.getName() != null ? font.getName() : "Unknown",
                e.getMessage());
        return false; // Be forgiving on detection failure
    }
}
public boolean fontSupportsCharacter(PDFont font, String character) { public boolean fontSupportsCharacter(PDFont font, String character) {
if (font == null || character == null || character.isEmpty()) { if (font == null || character == null) {
return false; return false;
} }
if (character.isEmpty()) {
return true;
}
try { try {
byte[] encoded = font.encode(character); byte[] encoded = font.encode(character);
if (encoded.length == 0) { if (encoded.length > 0) {
return false; try {
}
float width = font.getStringWidth(character); float width = font.getStringWidth(character);
return width > 0; if (width >= 0) {
return true;
}
} catch (Exception e) {
}
return true;
}
} catch (Exception e) {
}
} catch (IOException | IllegalArgumentException e) { try {
log.debug( if (canDecodeCharacter(font, character)) {
"Character '{}' not supported by font {}: {}", return true;
character, }
font.getName() != null ? font.getName() : "Unknown", } catch (Exception e) {
e.getMessage()); }
for (int i = 0; i < character.length(); ) {
int codePoint = character.codePointAt(i);
if (isBasicCharacter(codePoint)) {
i += Character.charCount(codePoint);
continue;
}
return false; return false;
} }
return true;
} }
public boolean isFontSubset(String fontName) { public boolean isFontSubset(String fontName) {
if (fontName == null) { if (fontName == null) {
return false; return false;
} }
return fontName.matches("^[A-Z]{6}\\+.*");
if (fontName.matches("^[A-Z]{6}\\+.*")) {
return true;
}
if (fontName.matches("^[A-Z]{5}\\+.*")) {
return true;
}
if (fontName.matches("^[A-Z]{4}\\+.*")) {
return true;
}
if (fontName.contains("+")) {
String prefix = fontName.split("\\+")[0];
if (prefix.matches("^[A-Z]+$") && prefix.length() >= 4) {
return true;
}
} }
public boolean canCalculateBasicWidths(PDFont font) {
try {
float spaceWidth = font.getStringWidth(" ");
if (spaceWidth <= 0) {
return false; return false;
} }
String[] testChars = {"a", "A", "0", ".", "e", "!"}; public boolean canCalculateBasicWidths(PDFont font) {
if (font == null) {
return false;
}
try {
float spaceWidth = font.getStringWidth(" ");
if (spaceWidth > 0) {
return true;
}
} catch (Exception e) {
}
String[] testChars = {
"a", "A", "0", ".", "e", "!", "i", "l", "I", "m", "M", "W", "w", "1", "|", "-", "_",
"=", "+", "(", ")", "[", "]", "{", "}", "<", ">", "/", "\\", "?", ",", ";", ":", "\"",
"'", "`", "~", "@", "#", "$", "%", "^", "&", "*"
};
int successCount = 0;
for (String ch : testChars) { for (String ch : testChars) {
try { try {
float width = font.getStringWidth(ch); float width = font.getStringWidth(ch);
if (width > 0) { if (width > 0) {
successCount++;
if (successCount >= 3) {
return true; return true;
} }
} catch (IOException | IllegalArgumentException e) { }
} catch (Exception e) {
} }
} }
return false; // Can't calculate width for any test characters try {
} catch (IOException | IllegalArgumentException e) { for (int code = 32; code <= 126; code++) {
return false; // Font failed basic width calculation try {
String ch = String.valueOf((char) code);
float width = font.getStringWidth(ch);
if (width > 0) {
successCount++;
if (successCount >= 1) {
return true;
} }
} }
} catch (Exception e) {
}
}
} catch (Exception e) {
}
try {
for (int code = 160; code <= 255; code++) {
try {
String ch = String.valueOf((char) code);
float width = font.getStringWidth(ch);
if (width > 0) {
return true;
}
} catch (Exception e) {
}
}
} catch (Exception e) {
}
return false;
}
/**
 * Reports whether the font can encode at least one of a set of probe strings.
 *
 * <p>A fixed list of common characters and short words is tried first; if none encodes,
 * every 100th UTF-16 code unit from 0 to 0xFFFF is tried as a single-character string.
 *
 * @param font the font to probe; {@code null} yields {@code false}
 * @return {@code true} as soon as one probe encodes to a non-empty byte array
 */
public boolean canEncodeAnyCharacter(PDFont font) {
    if (font == null) {
        return false;
    }

    String[] probes = {
        "a", "A", "0", " ", ".", "!", "e", "i", "o", "u", "n", "t", "r", "s", "l", "1", "2",
        "3", "4", "5", "6", "7", "8", "9", ",", ".", ";", ":", "?", "!", "(", ")", "[", "]",
        "{", "}", "hello", "test", "sample", "abc", "123", "ABC"
    };
    for (String probe : probes) {
        if (encodesToBytes(font, probe)) {
            return true;
        }
    }

    // Coarse sweep over the 16-bit range for fonts that cover none of the common probes.
    for (int code = 0; code <= 0xFFFF; code += 100) {
        if (encodesToBytes(font, String.valueOf((char) code))) {
            return true;
        }
    }
    return false;
}

/** Returns true when the font encodes the sample to at least one byte; any failure means false. */
private boolean encodesToBytes(PDFont font, String sample) {
    try {
        return font.encode(sample).length > 0;
    } catch (Exception ignored) {
        // Best effort: an unencodable sample is simply not a hit.
        return false;
    }
}
/**
 * Lenient validity check for a font.
 *
 * <p>The font is accepted when any one of these succeeds, tried in order: it has a non-blank
 * name, {@code canCalculateBasicWidths} accepts it, or {@code canEncodeAnyCharacter} accepts
 * it. An exception in one check only moves on to the next.
 *
 * @param font the font to check; {@code null} yields {@code false}
 * @return {@code true} if at least one check passes
 */
public boolean isValidFont(PDFont font) {
    if (font == null) {
        return false;
    }

    boolean hasName = false;
    try {
        String fontName = font.getName();
        hasName = fontName != null && !fontName.trim().isEmpty();
    } catch (Exception ignored) {
        // The name is unreadable; rely on the remaining checks.
    }
    if (hasName) {
        return true;
    }

    boolean measurable = false;
    try {
        measurable = canCalculateBasicWidths(font);
    } catch (Exception ignored) {
        // Width probing failed; rely on the encoding check.
    }
    if (measurable) {
        return true;
    }

    boolean encodable = false;
    try {
        encodable = canEncodeAnyCharacter(font);
    } catch (Exception ignored) {
        // Nothing left to try.
    }
    return encodable;
}
} }

View File

@ -5,10 +5,6 @@ import java.util.List;
import java.util.Set; import java.util.Set;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.font.PDFont;
import lombok.experimental.UtilityClass; import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
@ -16,128 +12,116 @@ import lombok.extern.slf4j.Slf4j;
@UtilityClass @UtilityClass
public class TextFinderUtils { public class TextFinderUtils {
/**
 * Decides whether a font is usable enough to be treated as reliable.
 *
 * <p>The font is reliable when {@code TextEncodingHelper.canCalculateBasicWidths} accepts it
 * or, failing that, when {@code TextEncodingHelper.canEncodeCharacters} accepts at least one
 * of a few basic probe characters. A font flagged as damaged is only logged; the flag does
 * not change the outcome.
 *
 * @param font the font to validate; {@code null} yields {@code false}
 * @return {@code true} if the font passes either check
 */
public boolean validateFontReliability(PDFont font) {
    if (font == null) {
        return false;
    }

    if (font.isDamaged()) {
        log.debug(
                "Font {} is marked as damaged - using TextEncodingHelper validation",
                font.getName());
    }

    boolean widthsUsable = TextEncodingHelper.canCalculateBasicWidths(font);
    if (widthsUsable) {
        log.debug(
                "Font {} passed basic width calculations - considering reliable",
                font.getName());
        return true;
    }

    String[] probes = {"1", "2", "3", "a", "A", "e", "E", " "};
    int encodable = 0;
    for (int idx = 0; idx < probes.length; idx++) {
        if (TextEncodingHelper.canEncodeCharacters(font, probes[idx])) {
            encodable++;
        }
    }

    if (encodable == 0) {
        log.debug("Font {} failed all basic tests - considering unreliable", font.getName());
        return false;
    }

    log.debug(
            "Font {} can process {}/{} basic characters - considering reliable",
            font.getName(),
            encodable,
            probes.length);
    return true;
}
public List<Pattern> createOptimizedSearchPatterns( public List<Pattern> createOptimizedSearchPatterns(
Set<String> searchTerms, boolean useRegex, boolean wholeWordSearch) { Set<String> searchTerms, boolean useRegex, boolean wholeWordSearch) {
List<Pattern> patterns = new ArrayList<>(); List<Pattern> patterns = new ArrayList<>();
if (searchTerms == null) {
return patterns;
}
for (String term : searchTerms) { for (String term : searchTerms) {
if (term == null || term.trim().isEmpty()) { if (term == null) {
continue;
}
String trimmedTerm = term.trim();
if (trimmedTerm.isEmpty()) {
continue; continue;
} }
try { try {
String patternString = useRegex ? term.trim() : Pattern.quote(term.trim()); String patternString;
if (useRegex) {
if (wholeWordSearch) { patternString = trimmedTerm;
patternString = applyWordBoundaries(term.trim(), patternString); try {
Pattern.compile(patternString);
} catch (Exception e) {
patternString = Pattern.quote(trimmedTerm);
}
} else {
patternString = Pattern.quote(trimmedTerm);
} }
Pattern pattern = if (wholeWordSearch) {
Pattern.compile( patternString = applyWordBoundaries(trimmedTerm, patternString, useRegex);
patternString, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); }
int flags = Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE | Pattern.DOTALL;
try {
flags |= Pattern.CANON_EQ;
} catch (Exception e) {
}
Pattern pattern = Pattern.compile(patternString, flags);
patterns.add(pattern); patterns.add(pattern);
log.debug("Created search pattern: '{}' -> '{}'", term.trim(), patternString);
} catch (Exception e) { } catch (Exception e) {
log.warn("Failed to create pattern for term '{}': {}", term, e.getMessage()); try {
String quotedTerm = Pattern.quote(trimmedTerm);
if (wholeWordSearch) {
quotedTerm = applyWordBoundaries(trimmedTerm, quotedTerm, false);
}
Pattern fallbackPattern =
Pattern.compile(
quotedTerm, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
patterns.add(fallbackPattern);
} catch (Exception e2) {
try {
Pattern simplestPattern = Pattern.compile(Pattern.quote(trimmedTerm));
patterns.add(simplestPattern);
} catch (Exception e3) {
}
}
} }
} }
return patterns; return patterns;
} }
private String applyWordBoundaries(String originalTerm, String patternString) { private String applyWordBoundaries(String originalTerm, String patternString, boolean isRegex) {
if (originalTerm.length() == 1 && Character.isDigit(originalTerm.charAt(0))) { if (originalTerm == null || originalTerm.isEmpty()) {
return "(?<![\\w])" + patternString + "(?![\\w])"; return patternString;
} else if (originalTerm.length() == 1) { }
return "(?<![\\w])" + patternString + "(?![\\w])";
try {
if (originalTerm.length() == 1) {
char c = originalTerm.charAt(0);
if (Character.isDigit(c)) {
return "(?<![\\p{L}\\p{N}])" + patternString + "(?![\\p{L}\\p{N}])";
} else if (Character.isLetter(c)) {
return "(?<![\\p{L}\\p{N}])" + patternString + "(?![\\p{L}\\p{N}])";
} else { } else {
return "(?<!\\S)" + patternString + "(?!\\S)";
}
}
boolean startsWithWordChar = Character.isLetterOrDigit(originalTerm.charAt(0));
boolean endsWithWordChar =
Character.isLetterOrDigit(originalTerm.charAt(originalTerm.length() - 1));
String result = patternString;
if (startsWithWordChar) {
result = "(?<![\\p{L}\\p{N}])" + result;
} else {
result = "(?<!\\S)" + result;
}
if (endsWithWordChar) {
result = result + "(?![\\p{L}\\p{N}])";
} else {
result = result + "(?!\\S)";
}
return result;
} catch (Exception e) {
try {
return "\\b" + patternString + "\\b"; return "\\b" + patternString + "\\b";
} catch (Exception e2) {
return patternString;
} }
} }
public boolean hasProblematicFonts(PDPage page) {
if (page == null) {
return false;
}
try {
PDResources resources = page.getResources();
if (resources == null) {
return false;
}
int totalFonts = 0;
int completelyUnusableFonts = 0;
for (org.apache.pdfbox.cos.COSName fontName : resources.getFontNames()) {
try {
org.apache.pdfbox.pdmodel.font.PDFont font = resources.getFont(fontName);
if (font != null) {
totalFonts++;
if (!validateFontReliability(font)) {
completelyUnusableFonts++;
}
}
} catch (Exception e) {
log.debug("Font loading failed for {}: {}", fontName.getName(), e.getMessage());
totalFonts++;
}
}
boolean hasProblems = totalFonts > 0 && (completelyUnusableFonts * 2 > totalFonts);
log.debug(
"Page font analysis: {}/{} fonts are completely unusable - page {} problematic",
completelyUnusableFonts,
totalFonts,
hasProblems ? "IS" : "is NOT");
return hasProblems;
} catch (Exception e) {
log.warn("Font analysis failed for page: {}", e.getMessage());
return false; // Be permissive if analysis fails
}
} }
} }

View File

@ -1,88 +1,69 @@
package stirling.software.SPDF.utils.text; package stirling.software.SPDF.utils.text;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer; import java.text.Normalizer;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
import org.apache.pdfbox.pdmodel.font.PDType0Font; import org.apache.pdfbox.pdmodel.font.PDType0Font;
import lombok.experimental.UtilityClass; import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@UtilityClass @UtilityClass
public class WidthCalculator { public class WidthCalculator {
private final int FONT_SCALE_FACTOR = 1000;
private final float CONSERVATIVE_CHAR_WIDTH_RATIO = 0.55f; private final float CONSERVATIVE_CHAR_WIDTH_RATIO = 0.55f;
private final float BBOX_CHAR_WIDTH_RATIO = 0.65f; private final float BBOX_CHAR_WIDTH_RATIO = 0.65f;
private final Map<String, Float> widthCache = new ConcurrentHashMap<>();
private final Map<String, Boolean> reliabilityCache = new ConcurrentHashMap<>();
/**
 * Builds the width-cache key for a font/text/size combination.
 *
 * <p>The key has the form {@code name|text|size}, with the size formatted to two decimals.
 *
 * @param font the font whose name forms the first key segment
 * @param text the measured text
 * @param fontSize the font size used for the measurement
 * @return the composed cache key
 */
private String createCacheKey(PDFont font, String text, float fontSize) {
    String sizePart = String.format("%.2f", fontSize);
    return font.getName() + "|" + text + "|" + sizePart;
}
/**
 * Builds the reliability-cache key for a font, which is simply the font's name.
 *
 * @param font the font to key on
 * @return the value of {@code font.getName()}
 */
private String createReliabilityCacheKey(PDFont font) {
    String key = font.getName();
    return key;
}
public float calculateAccurateWidth(PDFont font, String text, float fontSize) { public float calculateAccurateWidth(PDFont font, String text, float fontSize) {
return calculateAccurateWidth(font, text, fontSize, true); if (font == null || text == null || fontSize <= 0) {
return 0;
} }
public float calculateAccurateWidth( if (text.isEmpty()) {
PDFont font, String text, float fontSize, boolean useCache) { return 0;
if (font == null || text == null || text.isEmpty() || fontSize <= 0) return 0;
if (useCache) {
String cacheKey = createCacheKey(font, text, fontSize);
Float cachedWidth = widthCache.get(cacheKey);
if (cachedWidth != null) return cachedWidth;
} }
String normalizedText = normalizeText(text); String normalizedText = normalizeText(text);
Float directWidth = calculateDirectWidth(font, normalizedText, fontSize); Float directWidth = calculateDirectWidth(font, normalizedText, fontSize);
if (directWidth != null) { if (directWidth != null) {
if (useCache) widthCache.put(createCacheKey(font, text, fontSize), directWidth);
return directWidth; return directWidth;
} }
Float charByCharWidth = calculateCharacterByCharacterWidth(font, normalizedText, fontSize); Float charByCharWidth = calculateCharacterByCharacterWidth(font, normalizedText, fontSize);
if (charByCharWidth != null) { if (charByCharWidth != null) {
if (useCache) widthCache.put(createCacheKey(font, text, fontSize), charByCharWidth);
return charByCharWidth; return charByCharWidth;
} }
Float glyphWidth = calculateGlyphBasedWidth(font, normalizedText, fontSize); Float glyphWidth = calculateGlyphBasedWidth(font, normalizedText, fontSize);
if (glyphWidth != null) { if (glyphWidth != null) {
if (useCache) widthCache.put(createCacheKey(font, text, fontSize), glyphWidth);
return glyphWidth; return glyphWidth;
} }
float fallbackWidth = calculateComprehensiveFallbackWidth(font, normalizedText, fontSize); return calculateComprehensiveFallbackWidth(font, normalizedText, fontSize);
if (useCache) widthCache.put(createCacheKey(font, text, fontSize), fallbackWidth);
return fallbackWidth;
} }
private String normalizeText(String text) { private String normalizeText(String text) {
if (text == null) return "";
try {
return Normalizer.normalize(text, Normalizer.Form.NFC); return Normalizer.normalize(text, Normalizer.Form.NFC);
} catch (Exception e) {
return text;
}
} }
private Float calculateDirectWidth(PDFont font, String text, float fontSize) { private Float calculateDirectWidth(PDFont font, String text, float fontSize) {
if (!TextEncodingHelper.canEncodeCharacters(font, text)) return null;
try { try {
float rawWidth = font.getStringWidth(text); if (!TextEncodingHelper.canEncodeCharacters(font, text)) {
float scaledWidth = (rawWidth / FONT_SCALE_FACTOR) * fontSize; return null;
return rawWidth >= 0 && scaledWidth >= 0 ? scaledWidth : null; }
float rawWidth = font.getStringWidth(text) / 1000f;
if (rawWidth < 0) return null;
float scaledWidth = rawWidth * fontSize;
return scaledWidth >= 0 ? scaledWidth : null;
} catch (Exception e) { } catch (Exception e) {
return null; return null;
} }
@ -96,7 +77,12 @@ public class WidthCalculator {
for (int codePoint : codePoints) { for (int codePoint : codePoints) {
String character = new String(Character.toChars(codePoint)); String character = new String(Character.toChars(codePoint));
Float charWidth = calculateSingleCharacterWidth(font, character, fontSize); Float charWidth =
calculateSingleCharacterWidth(font, character, fontSize, codePoint);
if (charWidth == null) {
return null;
}
totalWidth += charWidth; totalWidth += charWidth;
if (previousCodePoint != -1) { if (previousCodePoint != -1) {
@ -104,7 +90,7 @@ public class WidthCalculator {
} }
previousCodePoint = codePoint; previousCodePoint = codePoint;
} }
return totalWidth; return totalWidth >= 0 ? totalWidth : null;
} catch (Exception e) { } catch (Exception e) {
return null; return null;
} }
@ -112,80 +98,99 @@ public class WidthCalculator {
private List<Integer> getCodePoints(String text) { private List<Integer> getCodePoints(String text) {
List<Integer> codePoints = new ArrayList<>(); List<Integer> codePoints = new ArrayList<>();
if (text == null) return codePoints;
for (int i = 0; i < text.length(); ) { for (int i = 0; i < text.length(); ) {
try {
int codePoint = text.codePointAt(i); int codePoint = text.codePointAt(i);
codePoints.add(codePoint); codePoints.add(codePoint);
i += Character.charCount(codePoint); i += Character.charCount(codePoint);
} catch (Exception e) {
i++;
}
} }
return codePoints; return codePoints;
} }
private Float calculateSingleCharacterWidth(PDFont font, String character, float fontSize) { private Float calculateSingleCharacterWidth(
PDFont font, String character, float fontSize, int codePoint) {
try { try {
byte[] encoded = null; if (TextEncodingHelper.fontSupportsCharacter(font, character)) {
try {
float raw = font.getStringWidth(character) / 1000f;
if (raw >= 0) return raw * fontSize;
} catch (Exception ignored) {
}
}
} catch (Exception e) {
}
try { try {
encoded = font.encode(character); float w = font.getWidth(codePoint) / 1000f;
if (encoded.length == 0) encoded = null; if (w >= 0) return w * fontSize;
} catch (Exception e) { } catch (Exception ignored) {
log.debug("Direct encoding failed for '{}': {}", character, e.getMessage());
} }
if (encoded == null && font instanceof PDType0Font) {
try { try {
encoded = character.getBytes(StandardCharsets.UTF_8); if (codePoint >= 0 && codePoint <= 0xFFFF) {
} catch (Exception e) { float w = font.getWidth(codePoint) / 1000f;
log.debug("UTF-8 encoding failed for '{}': {}", character, e.getMessage()); if (w >= 0) return w * fontSize;
} }
} catch (Exception ignored) {
} }
if (encoded != null && encoded.length > 0) { try {
Float width = calculateGlyphWidth(font, encoded, fontSize); byte[] encoded = font.encode(character);
if (width != null && width >= 0) return width; if (encoded.length > 0) {
}
return calculateAverageCharacterWidth(font, fontSize);
} catch (Exception e) {
log.debug(
"Single character width calculation failed for '{}': {}",
character,
e.getMessage());
return calculateAverageCharacterWidth(font, fontSize);
}
}
private Float calculateGlyphWidth(PDFont font, byte[] encoded, float fontSize) {
for (byte b : encoded) { for (byte b : encoded) {
try { try {
int glyphCode = b & 0xFF; int glyphCode = b & 0xFF;
float glyphWidth = font.getWidth(glyphCode); float w = font.getWidth(glyphCode) / 1000f;
if (w >= 0) return w * fontSize;
if (glyphWidth > 0) { } catch (Exception ignored) {
return (glyphWidth / FONT_SCALE_FACTOR) * fontSize; }
}
}
} catch (Exception ignored) {
} }
// Try alternative width methods return calculateCategoryBasedWidth(font, codePoint, fontSize);
try {
glyphWidth = font.getWidthFromFont(glyphCode);
if (glyphWidth > 0) {
return (glyphWidth / FONT_SCALE_FACTOR) * fontSize;
}
} catch (Exception e) {
log.debug(
"getWidthFromFont failed for glyph {}: {}", glyphCode, e.getMessage());
}
} catch (Exception e) {
log.debug("Glyph width calculation failed for byte {}: {}", b, e.getMessage());
}
}
return null;
} }
private float calculateKerning( private float calculateKerning(
PDFont font, int leftCodePoint, int rightCodePoint, float fontSize) { PDFont font, int leftCodePoint, int rightCodePoint, float fontSize) {
return 0; try {
if (font instanceof PDSimpleFont) {
PDSimpleFont simpleFont = (PDSimpleFont) font;
try {
java.lang.reflect.Method getKerningMethod =
simpleFont.getClass().getMethod("getKerning", int.class, int.class);
float kerningValue =
(Float)
getKerningMethod.invoke(
simpleFont, leftCodePoint, rightCodePoint);
return (kerningValue / 1000f) * fontSize;
} catch (Exception e) {
}
}
} catch (Exception e) {
}
try {
String leftChar = new String(Character.toChars(leftCodePoint));
String rightChar = new String(Character.toChars(rightCodePoint));
String combined = leftChar + rightChar;
float combinedWidth = font.getStringWidth(combined) / 1000f;
float leftWidth = font.getStringWidth(leftChar) / 1000f;
float rightWidth = font.getStringWidth(rightChar) / 1000f;
float kerning = combinedWidth - leftWidth - rightWidth;
return kerning * fontSize;
} catch (Exception e) {
}
return 0f;
} }
private Float calculateGlyphBasedWidth(PDFont font, String text, float fontSize) { private Float calculateGlyphBasedWidth(PDFont font, String text, float fontSize) {
@ -196,7 +201,6 @@ public class WidthCalculator {
int codePoint = text.codePointAt(i); int codePoint = text.codePointAt(i);
String character = new String(Character.toChars(codePoint)); String character = new String(Character.toChars(codePoint));
// Try to get glyph information more comprehensively
Float charWidth = Float charWidth =
calculateGlyphWidthComprehensively(font, character, codePoint, fontSize); calculateGlyphWidthComprehensively(font, character, codePoint, fontSize);
if (charWidth == null) { if (charWidth == null) {
@ -207,19 +211,15 @@ public class WidthCalculator {
i += Character.charCount(codePoint); i += Character.charCount(codePoint);
} }
log.debug("Glyph-based width calculation: {}", totalWidth); return totalWidth >= 0 ? totalWidth : null;
return totalWidth;
} catch (Exception e) { } catch (Exception e) {
log.debug("Glyph-based calculation failed: {}", e.getMessage());
return null; return null;
} }
} }
private Float calculateGlyphWidthComprehensively( private Float calculateGlyphWidthComprehensively(
PDFont font, String character, int codePoint, float fontSize) { PDFont font, String character, int codePoint, float fontSize) {
try {
// Method 1: Try standard encoding
try { try {
byte[] encoded = font.encode(character); byte[] encoded = font.encode(character);
if (encoded.length > 0) { if (encoded.length > 0) {
@ -229,60 +229,110 @@ public class WidthCalculator {
} }
} }
} catch (Exception e) { } catch (Exception e) {
log.debug(
"Standard encoding failed for U+{}: {}",
Integer.toHexString(codePoint),
e.getMessage());
} }
// Method 2: Try Unicode code point directly
try { try {
float glyphWidth = font.getWidth(codePoint); float glyphWidth = font.getWidth(codePoint) / 1000f;
if (glyphWidth > 0) { if (glyphWidth >= 0) {
return (glyphWidth / FONT_SCALE_FACTOR) * fontSize; return glyphWidth * fontSize;
}
} catch (Exception e) {
}
try {
if (codePoint <= 0xFFFF) {
float glyphWidth = font.getWidth(codePoint) / 1000f;
if (glyphWidth >= 0) {
return glyphWidth * fontSize;
}
}
} catch (Exception e) {
}
try {
for (int code = 0; code <= 0xFF; code++) {
try {
String decoded = font.toUnicode(code);
if (decoded != null && decoded.equals(character)) {
float glyphWidth = font.getWidth(code) / 1000f;
if (glyphWidth >= 0) {
return glyphWidth * fontSize;
}
}
} catch (Exception e) {
}
} }
} catch (Exception e) { } catch (Exception e) {
log.debug(
"Unicode code point width failed for U+{}: {}",
Integer.toHexString(codePoint),
e.getMessage());
} }
// Method 3: Character category based estimation
return calculateCategoryBasedWidth(font, codePoint, fontSize); return calculateCategoryBasedWidth(font, codePoint, fontSize);
} catch (Exception e) {
log.debug("Comprehensive glyph width calculation failed: {}", e.getMessage());
return calculateAverageCharacterWidth(font, fontSize);
}
} }
/**
 * Tries to derive a glyph width by reinterpreting font-encoded bytes as glyph codes.
 *
 * NOTE(review): this span is a side-by-side diff rendering, not compilable Java. Most lines carry
 * the left-column text followed by the right-column text; the left column is presumably the
 * pre-commit revision and the right column the post-commit revision - confirm against the
 * repository before relying on either reading.
 *
 * Right-column (post-commit) flow, in order:
 *   1. a null or empty {@code encoded} array yields {@code null};
 *   2. for a {@code PDType0Font} with at least two bytes: the first two bytes as one 16-bit code
 *      (high byte first), then every adjacent byte pair;
 *   3. every single byte as an 8-bit code;
 *   4. the first three bytes as one 24-bit code, then the first four bytes as one 32-bit code;
 *   5. {@code null} when nothing was accepted.
 * A candidate is accepted as soon as {@code font.getWidth(code) / 1000f} is {@code >= 0}; the
 * returned value is that quotient multiplied by {@code fontSize}. Exceptions thrown while probing
 * are caught and ignored so the next interpretation is tried.
 *
 * Left-column (pre-commit) differences: single bytes were tried before the two-byte code, a
 * width had to be strictly {@code > 0}, scaling used {@code FONT_SCALE_FACTOR}, and a failed
 * two-byte interpretation was logged at debug level.
 *
 * NOTE(review): with the {@code >= 0} test a width of exactly 0 counts as a success; if the font
 * reports 0 for codes it has no width for, the later interpretations are never reached - verify
 * that this is intended.
 *
 * @param font font queried through {@code getWidth(int)}
 * @param encoded bytes to reinterpret as glyph codes; may be null or empty
 * @param fontSize multiplier applied to the accepted width
 * @return the scaled width of the first accepted code, or {@code null} if none was accepted
 */
private Float calculateWidthFromEncodedBytes(PDFont font, byte[] encoded, float fontSize) { private Float calculateWidthFromEncodedBytes(PDFont font, byte[] encoded, float fontSize) {
// Try each byte as a potential glyph code if (encoded == null || encoded.length == 0) return null;
for (byte b : encoded) {
if (font instanceof PDType0Font && encoded.length >= 2) {
try { try {
int glyphCode = b & 0xFF; int glyphCode = ((encoded[0] & 0xFF) << 8) | (encoded[1] & 0xFF);
float width = font.getWidth(glyphCode); float width = font.getWidth(glyphCode) / 1000f;
if (width > 0) { if (width >= 0) {
return (width / FONT_SCALE_FACTOR) * fontSize; return width * fontSize;
}
} catch (Exception e) {
}
// Right column only: slide a two-byte window over the whole array.
try {
for (int i = 0; i <= encoded.length - 2; i++) {
int glyphCode = ((encoded[i] & 0xFF) << 8) | (encoded[i + 1] & 0xFF);
float width = font.getWidth(glyphCode) / 1000f;
if (width >= 0) {
return width * fontSize;
}
} }
} catch (Exception e) { } catch (Exception e) {
// Continue trying other bytes
} }
} }
if (encoded.length >= 2 && font instanceof PDType0Font) { for (byte b : encoded) {
try { try {
int glyphCode = ((encoded[0] & 0xFF) << 8) | (encoded[1] & 0xFF); int glyphCode = b & 0xFF;
float width = font.getWidth(glyphCode); float width = font.getWidth(glyphCode) / 1000f;
if (width > 0) { if (width >= 0) {
return (width / FONT_SCALE_FACTOR) * fontSize; return width * fontSize;
} }
} catch (Exception e) { } catch (Exception e) {
log.debug("Multi-byte glyph code interpretation failed: {}", e.getMessage());
} }
} }
// Right column only: first three bytes combined into one code, high byte first.
try {
if (encoded.length >= 3) {
int glyphCode =
((encoded[0] & 0xFF) << 16)
| ((encoded[1] & 0xFF) << 8)
| (encoded[2] & 0xFF);
float width = font.getWidth(glyphCode) / 1000f;
if (width >= 0) {
return width * fontSize;
}
}
} catch (Exception e) {
}
// Right column only: first four bytes combined into one code. The shift by 24 makes the
// int negative whenever the top bit of encoded[0] is set.
try {
if (encoded.length >= 4) {
int glyphCode =
((encoded[0] & 0xFF) << 24)
| ((encoded[1] & 0xFF) << 16)
| ((encoded[2] & 0xFF) << 8)
| (encoded[3] & 0xFF);
float width = font.getWidth(glyphCode) / 1000f;
if (width >= 0) {
return width * fontSize;
}
}
} catch (Exception e) {
}
return null; return null;
} }
@ -291,198 +341,237 @@ public class WidthCalculator {
int category = Character.getType(codePoint); int category = Character.getType(codePoint);
float baseWidth = calculateAverageCharacterWidth(font, fontSize); float baseWidth = calculateAverageCharacterWidth(font, fontSize);
// Adjust width based on character category
float multiplier = float multiplier =
switch (category) { switch (category) {
case Character.UPPERCASE_LETTER -> 1.2f; case Character.UPPERCASE_LETTER -> 1.2f;
case Character.LOWERCASE_LETTER -> 1.0f; case Character.LOWERCASE_LETTER -> 1.0f;
case Character.DECIMAL_DIGIT_NUMBER -> 1.0f; case Character.TITLECASE_LETTER -> 1.15f;
case Character.SPACE_SEPARATOR -> 0.5f;
case Character.DASH_PUNCTUATION -> 0.8f;
case Character.OTHER_PUNCTUATION -> 0.6f;
case Character.CURRENCY_SYMBOL -> 1.1f;
case Character.MATH_SYMBOL -> 1.0f;
case Character.MODIFIER_LETTER -> 0.7f; case Character.MODIFIER_LETTER -> 0.7f;
case Character.NON_SPACING_MARK -> 0.0f; // Combining characters case Character.OTHER_LETTER -> 1.0f;
case Character.DECIMAL_DIGIT_NUMBER -> 1.0f;
case Character.LETTER_NUMBER -> 1.0f;
case Character.OTHER_NUMBER -> 1.0f;
case Character.SPACE_SEPARATOR -> 0.5f;
case Character.LINE_SEPARATOR -> 0.0f;
case Character.PARAGRAPH_SEPARATOR -> 0.0f;
case Character.NON_SPACING_MARK -> 0.0f;
case Character.ENCLOSING_MARK -> 0.0f; case Character.ENCLOSING_MARK -> 0.0f;
case Character.COMBINING_SPACING_MARK -> 0.3f; case Character.COMBINING_SPACING_MARK -> 0.3f;
case Character.DASH_PUNCTUATION -> 0.8f;
case Character.START_PUNCTUATION -> 0.6f;
case Character.END_PUNCTUATION -> 0.6f;
case Character.CONNECTOR_PUNCTUATION -> 0.6f;
case Character.OTHER_PUNCTUATION -> 0.6f;
case Character.MATH_SYMBOL -> 1.0f;
case Character.CURRENCY_SYMBOL -> 1.1f;
case Character.MODIFIER_SYMBOL -> 0.8f;
case Character.OTHER_SYMBOL -> 1.0f;
case Character.INITIAL_QUOTE_PUNCTUATION -> 0.6f;
case Character.FINAL_QUOTE_PUNCTUATION -> 0.6f;
case Character.CONTROL -> 0.0f;
case Character.FORMAT -> 0.0f;
case Character.PRIVATE_USE -> 1.0f;
case Character.SURROGATE -> 0.0f;
case Character.UNASSIGNED -> 1.0f;
default -> 1.0f; default -> 1.0f;
}; };
return baseWidth * multiplier; float result = baseWidth * multiplier;
return result >= 0 ? result : baseWidth;
} catch (Exception e) { } catch (Exception e) {
log.debug("Category-based width calculation failed: {}", e.getMessage());
return calculateAverageCharacterWidth(font, fontSize); return calculateAverageCharacterWidth(font, fontSize);
} }
} }
/**
 * Estimates a typical single-character width for {@code font} at {@code fontSize}.
 *
 * NOTE(review): this span is a side-by-side diff rendering, not compilable Java. Lines that show
 * two statements carry the left-column text followed by the right-column text; the left column
 * is presumably the pre-commit revision and the right column the post-commit one - confirm.
 *
 * Right-column (post-commit) fallback chain; each step runs only if the previous one produced
 * nothing or threw (exceptions are caught and ignored):
 *   1. {@code font.getAverageFontWidth() / 1000f}, used when strictly positive;
 *   2. the mean of {@code font.getStringWidth(s) / 1000f} over a fixed set of sample letters and
 *      digits, counting only strictly positive results;
 *   3. the first strictly positive {@code font.getWidth(code) / 1000f} for codes 32 through 126
 *      (the first hit is returned as-is, it is not averaged);
 *   4. the font descriptor's bounding-box width divided by 2000f (returned without a sign check);
 *   5. {@code CONSERVATIVE_CHAR_WIDTH_RATIO}.
 * Every result is multiplied by {@code fontSize}.
 *
 * Left-column (pre-commit) version: returned {@code getAverageFontWidth()} scaled by
 * {@code FONT_SCALE_FACTOR} and {@code fontSize}, and on any exception logged at debug level and
 * returned {@code CONSERVATIVE_CHAR_WIDTH_RATIO * fontSize}.
 *
 * @param font font to query
 * @param fontSize multiplier applied to the chosen width
 * @return the estimated width of one character
 */
private float calculateAverageCharacterWidth(PDFont font, float fontSize) { private float calculateAverageCharacterWidth(PDFont font, float fontSize) {
try { try {
float avgWidth = font.getAverageFontWidth(); float avgWidth = font.getAverageFontWidth() / 1000f;
return (avgWidth / FONT_SCALE_FACTOR) * fontSize; if (avgWidth > 0) {
} catch (Exception e) { return avgWidth * fontSize;
log.debug("Average character width calculation failed: {}", e.getMessage());
return CONSERVATIVE_CHAR_WIDTH_RATIO * fontSize;
}
} catch (Exception e) {
}
// Right column only: average the measured widths of a few sample characters.
try {
String[] testChars = {
"a", "A", "e", "E", "i", "I", "o", "O", "n", "N", "t", "T", "r", "R", "s", "S", "0",
"1", "2", "3", "4", "5"
};
float totalWidth = 0;
int successCount = 0;
for (String testChar : testChars) {
try {
float width = font.getStringWidth(testChar) / 1000f;
if (width > 0) {
totalWidth += width;
successCount++;
}
} catch (Exception e) {
}
}
if (successCount > 0) {
return (totalWidth / successCount) * fontSize;
}
} catch (Exception e) {
}
// Right column only: take the first code in 32..126 that reports a positive width.
try {
for (int code = 32; code <= 126; code++) {
try {
float width = font.getWidth(code) / 1000f;
if (width > 0) {
return width * fontSize;
}
} catch (Exception e) {
}
}
} catch (Exception e) {
}
// Right column only: derive an estimate from the font bounding box width.
try {
if (font.getFontDescriptor() != null) {
PDRectangle bbox = font.getFontDescriptor().getFontBoundingBox();
if (bbox != null) {
float avgCharWidth = bbox.getWidth() / 2000f;
return avgCharWidth * fontSize;
}
}
} catch (Exception e) {
}
return CONSERVATIVE_CHAR_WIDTH_RATIO * fontSize;
} }
/**
 * Last-resort width estimate for a whole string.
 *
 * NOTE(review): this span is a side-by-side diff rendering, not compilable Java. Lines that show
 * two statements carry the left-column text followed by the right-column text; the left column
 * is presumably the pre-commit revision and the right column the post-commit one - confirm.
 * The left column of this span also contains two further methods, {@code analyzeTextComposition}
 * and {@code calculateEnhancedAverageWidth}, which have no right-column counterpart here.
 *
 * Right-column (post-commit) flow:
 *   1. null or empty {@code text} yields 0;
 *   2. walk {@code text} code point by code point and sum
 *      {@code calculateCategoryBasedWidth(font, codePoint, fontSize)}, substituting the average
 *      character width when that result is null;
 *   3. if step 2 throws, use {@code text.length()} times the bounding-box width / 1000f times
 *      {@code BBOX_CHAR_WIDTH_RATIO} times {@code fontSize}, when a bounding box is available;
 *   4. otherwise {@code text.length()} times the average character width.
 * Steps 3 and 4 count UTF-16 units via {@code text.length()}, whereas step 2 counts code points.
 *
 * NOTE(review): the declaration of {@code calculateCategoryBasedWidth} is not visible in this
 * chunk; if it returns a primitive {@code float}, the {@code specificWidth != null} test is
 * always true and the {@code else} branch is dead - confirm.
 *
 * Left-column (pre-commit) flow: bounding-box width scaled by {@code FONT_SCALE_FACTOR} fed into
 * {@code analyzeTextComposition}; otherwise {@code calculateEnhancedAverageWidth}; on exception
 * {@code text.length() * CONSERVATIVE_CHAR_WIDTH_RATIO * fontSize}, each path logged at debug.
 *
 * @param font font to query
 * @param text text to measure; may be null or empty
 * @param fontSize multiplier applied to the estimate
 * @return the estimated width of {@code text}
 */
private float calculateComprehensiveFallbackWidth(PDFont font, String text, float fontSize) { private float calculateComprehensiveFallbackWidth(PDFont font, String text, float fontSize) {
if (text == null || text.isEmpty()) {
return 0;
}
try { try {
// Strategy 1: Use font bounding box with character analysis float charWidth = calculateAverageCharacterWidth(font, fontSize);
if (font.getFontDescriptor() != null
&& font.getFontDescriptor().getFontBoundingBox() != null) {
PDRectangle bbox = font.getFontDescriptor().getFontBoundingBox();
float avgCharWidth = bbox.getWidth() / FONT_SCALE_FACTOR;
// Analyze text composition for better estimation
float adjustedWidth = analyzeTextComposition(text, avgCharWidth, fontSize);
log.debug("Bounding box based fallback width: {}", adjustedWidth);
return adjustedWidth;
}
// Strategy 2: Enhanced average width calculation
float enhancedAverage = calculateEnhancedAverageWidth(font, text, fontSize);
log.debug("Enhanced average fallback width: {}", enhancedAverage);
return enhancedAverage;
} catch (Exception e) {
float conservativeWidth = text.length() * CONSERVATIVE_CHAR_WIDTH_RATIO * fontSize;
log.debug("Conservative fallback width: {}", conservativeWidth);
return conservativeWidth;
}
}
// Left column only from the next line: analyzeTextComposition weights an average width per
// Unicode category. Its loop lines are shared with the right-column code-point loop.
private float analyzeTextComposition(String text, float avgCharWidth, float fontSize) {
float totalWidth = 0; float totalWidth = 0;
int spaceCount = 0;
int upperCount = 0;
int lowerCount = 0;
int digitCount = 0;
int punctCount = 0;
for (int i = 0; i < text.length(); ) { for (int i = 0; i < text.length(); ) {
int codePoint = text.codePointAt(i); int codePoint = text.codePointAt(i);
int category = Character.getType(codePoint); Float specificWidth = calculateCategoryBasedWidth(font, codePoint, fontSize);
if (specificWidth != null) {
switch (category) { totalWidth += specificWidth;
case Character.SPACE_SEPARATOR -> { } else {
spaceCount++; totalWidth += charWidth;
totalWidth += avgCharWidth * 0.5f * fontSize;
} }
case Character.UPPERCASE_LETTER -> {
upperCount++;
totalWidth += avgCharWidth * 1.2f * fontSize;
}
case Character.LOWERCASE_LETTER -> {
lowerCount++;
totalWidth += avgCharWidth * 1.0f * fontSize;
}
case Character.DECIMAL_DIGIT_NUMBER -> {
digitCount++;
totalWidth += avgCharWidth * 1.0f * fontSize;
}
case Character.OTHER_PUNCTUATION, Character.DASH_PUNCTUATION -> {
punctCount++;
totalWidth += avgCharWidth * 0.7f * fontSize;
}
default -> totalWidth += avgCharWidth * BBOX_CHAR_WIDTH_RATIO * fontSize;
}
// Advance by one or two chars so supplementary code points are consumed whole.
i += Character.charCount(codePoint); i += Character.charCount(codePoint);
} }
log.debug(
"Text composition analysis - Spaces: {}, Upper: {}, Lower: {}, Digits: {}, Punct: {}",
spaceCount,
upperCount,
lowerCount,
digitCount,
punctCount);
return totalWidth; return totalWidth;
}
// Left column only from the next line: calculateEnhancedAverageWidth scales the font's average
// width by the x-height / cap-height ratio clamped to the range 0.8 to 1.2.
private float calculateEnhancedAverageWidth(PDFont font, String text, float fontSize) {
try {
float baseAverage = font.getAverageFontWidth();
float capHeight = 0;
float xHeight = 0;
if (font.getFontDescriptor() != null) {
capHeight = font.getFontDescriptor().getCapHeight();
xHeight = font.getFontDescriptor().getXHeight();
}
float adjustmentFactor = 1.0f;
if (capHeight > 0 && xHeight > 0) {
adjustmentFactor = Math.max(0.8f, Math.min(1.2f, xHeight / capHeight));
}
float adjustedAverage = (baseAverage * adjustmentFactor / FONT_SCALE_FACTOR) * fontSize;
return text.length() * adjustedAverage;
} catch (Exception e) { } catch (Exception e) {
log.debug("Enhanced average width calculation failed: {}", e.getMessage());
return text.length() * CONSERVATIVE_CHAR_WIDTH_RATIO * fontSize;
} }
// Right column only: bounding-box estimate used when the per-code-point sum above threw.
try {
if (font.getFontDescriptor() != null
&& font.getFontDescriptor().getFontBoundingBox() != null) {
PDRectangle bbox = font.getFontDescriptor().getFontBoundingBox();
float avgCharWidth = bbox.getWidth() / 1000f;
return text.length() * avgCharWidth * BBOX_CHAR_WIDTH_RATIO * fontSize;
}
} catch (Exception e) {
}
return text.length() * calculateAverageCharacterWidth(font, fontSize);
} }
/**
 * Reports whether width queries against {@code font} can be expected to work.
 *
 * NOTE(review): this span is a side-by-side diff rendering, not compilable Java. Lines that show
 * two statements carry the left-column text followed by the right-column text; the left column
 * is presumably the pre-commit revision and the right column the post-commit one - confirm.
 * The left-column helper {@code performReliabilityCheck} overlaps the right-column methods
 * {@code calculateMinimumTextWidth}, {@code calculateMaximumTextWidth} and
 * {@code canCalculateWidthForText}, so all of them are kept in this one span.
 *
 * Right-column (post-commit) flow of {@code isWidthCalculationReliable}:
 *   1. null font yields false;
 *   2. {@code font.isDamaged()} yields false;
 *   3. {@code TextEncodingHelper.canCalculateBasicWidths(font)} being false yields false;
 *   4. true if {@code font.getStringWidth("A")} completes without throwing;
 *   5. else true if {@code font.getAverageFontWidth()} completes without throwing (the returned
 *      value is not inspected);
 *   6. else the result of {@code font.getWidth(65) >= 0};
 *   7. else false.
 * Exceptions thrown by any step are caught and ignored, and the next step is tried.
 *
 * Left-column (pre-commit) flow: the result was looked up in and stored to
 * {@code reliabilityCache} under {@code createReliabilityCacheKey(font)}, computed by
 * {@code performReliabilityCheck}, which logged each failure at debug level and required
 * {@code getAverageFontWidth() > 0}.
 *
 * @param font font to probe; may be null
 * @return whether one of the probes succeeded
 */
public boolean isWidthCalculationReliable(PDFont font) { public boolean isWidthCalculationReliable(PDFont font) {
if (font == null) { if (font == null) return false;
return false;
}
String cacheKey = createReliabilityCacheKey(font);
Boolean cachedResult = reliabilityCache.get(cacheKey);
if (cachedResult != null) {
log.debug(
"Using cached reliability result for font {}: {}",
font.getName(),
cachedResult);
return cachedResult;
}
boolean result = performReliabilityCheck(font);
reliabilityCache.put(cacheKey, result);
return result;
}
// The next header appears in the left column only.
private boolean performReliabilityCheck(PDFont font) {
try { try {
if (font.isDamaged()) { if (font.isDamaged()) return false;
log.debug("Font {} is damaged", font.getName()); } catch (Exception e) {
return false;
} }
if (!TextEncodingHelper.canCalculateBasicWidths(font)) { try {
log.debug("Font {} cannot perform basic width calculations", font.getName()); if (!TextEncodingHelper.canCalculateBasicWidths(font)) return false;
return false; } catch (Exception e) {
} }
try { try {
font.getStringWidth("A"); font.getStringWidth("A");
return true; return true;
} catch (Exception e) { } catch (Exception e) {
log.debug("Font {} failed basic width test: {}", font.getName(), e.getMessage());
} }
// Check if we can at least get average width
try { try {
float avgWidth = font.getAverageFontWidth(); font.getAverageFontWidth();
return avgWidth > 0; return true;
} catch (Exception e) {
}
try {
float width = font.getWidth(65);
return width >= 0;
} catch (Exception e) { } catch (Exception e) {
log.debug(
"Font {} cannot provide average width: {}", font.getName(), e.getMessage());
} }
return false; return false;
}
/**
 * Right column: lower-bound estimate for the width of {@code text}.
 * Returns 0 for a null font, null or empty text, or a non-positive font size; otherwise 0.8
 * times {@code calculateAccurateWidth(font, text, fontSize)} when that is positive; otherwise
 * (including when it throws) {@code text.length() * fontSize * 0.3f}.
 */
public float calculateMinimumTextWidth(PDFont font, String text, float fontSize) {
if (font == null || text == null || text.isEmpty() || fontSize <= 0) {
return 0;
}
try {
float minWidth = calculateAccurateWidth(font, text, fontSize);
if (minWidth > 0) {
return minWidth * 0.8f;
}
} catch (Exception e) { } catch (Exception e) {
log.debug("Reliability check failed for font {}: {}", font.getName(), e.getMessage()); }
return text.length() * fontSize * 0.3f;
}
/**
 * Right column: upper-bound estimate for the width of {@code text}.
 * Returns 0 for a null font, null or empty text, or a non-positive font size; otherwise 1.2
 * times {@code calculateAccurateWidth(font, text, fontSize)} when that is positive; otherwise
 * (including when it throws) {@code text.length() * fontSize * 1.0f}.
 */
public float calculateMaximumTextWidth(PDFont font, String text, float fontSize) {
if (font == null || text == null || text.isEmpty() || fontSize <= 0) {
return 0;
}
try {
float maxWidth = calculateAccurateWidth(font, text, fontSize);
if (maxWidth > 0) {
return maxWidth * 1.2f;
}
} catch (Exception e) {
}
return text.length() * fontSize * 1.0f;
}
/**
 * Right column: reports whether a width can be produced for {@code text}.
 * Returns false when {@code font} or {@code text} is null and true for empty text. It then
 * probes {@code calculateDirectWidth} and {@code calculateCharacterByCharacterWidth} at size 12.
 *
 * NOTE(review): the final statement returns true regardless of the probes, so every non-null
 * input yields true and the two probes do not influence the result - confirm this is intended.
 */
public boolean canCalculateWidthForText(PDFont font, String text) {
if (font == null || text == null) {
return false; return false;
} }
if (text.isEmpty()) {
return true;
}
try {
Float width = calculateDirectWidth(font, text, 12f);
if (width != null) {
return true;
}
} catch (Exception e) {
}
try {
Float width = calculateCharacterByCharacterWidth(font, text, 12f);
if (width != null) {
return true;
}
} catch (Exception e) {
}
return true;
} }
} }