mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-09-08 17:51:20 +02:00
enhance text handling and encoding validation
Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
parent
e396b6cbb8
commit
7db58ad6dd
@ -6,23 +6,20 @@ import java.util.List;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import lombok.Getter;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import lombok.Getter;
|
||||
|
||||
import stirling.software.SPDF.model.PDFText;
|
||||
|
||||
@Slf4j
|
||||
public class TextFinder extends PDFTextStripper {
|
||||
|
||||
private final String searchTerm;
|
||||
private final boolean useRegex;
|
||||
private final boolean wholeWordSearch;
|
||||
@Getter
|
||||
private final List<PDFText> foundTexts = new ArrayList<>();
|
||||
@Getter private final List<PDFText> foundTexts = new ArrayList<>();
|
||||
|
||||
private final List<TextPosition> pageTextPositions = new ArrayList<>();
|
||||
private final StringBuilder pageTextBuilder = new StringBuilder();
|
||||
@ -45,20 +42,39 @@ public class TextFinder extends PDFTextStripper {
|
||||
|
||||
@Override
|
||||
protected void writeString(String text, List<TextPosition> textPositions) {
|
||||
pageTextBuilder.append(text);
|
||||
pageTextPositions.addAll(textPositions);
|
||||
for (TextPosition tp : textPositions) {
|
||||
if (tp == null) continue;
|
||||
String u = tp.getUnicode();
|
||||
if (u == null) continue;
|
||||
for (int i = 0; i < u.length(); ) {
|
||||
int cp = u.codePointAt(i);
|
||||
pageTextBuilder.append(Character.toChars(cp));
|
||||
// Add one position per code unit appended (1-2 chars depending on surrogate)
|
||||
int codeUnits = Character.charCount(cp);
|
||||
for (int k = 0; k < codeUnits; k++) {
|
||||
pageTextPositions.add(tp);
|
||||
}
|
||||
i += codeUnits;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void writeWordSeparator() {
|
||||
pageTextBuilder.append(getWordSeparator());
|
||||
pageTextPositions.add(null); // Placeholder for separator
|
||||
String sep = getWordSeparator();
|
||||
pageTextBuilder.append(sep);
|
||||
for (int i = 0; i < sep.length(); i++) {
|
||||
pageTextPositions.add(null);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void writeLineSeparator() {
|
||||
pageTextBuilder.append(getLineSeparator());
|
||||
pageTextPositions.add(null); // Placeholder for separator
|
||||
String sep = getLineSeparator();
|
||||
pageTextBuilder.append(sep);
|
||||
for (int i = 0; i < sep.length(); i++) {
|
||||
pageTextPositions.add(null);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -91,27 +107,10 @@ public class TextFinder extends PDFTextStripper {
|
||||
Pattern pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
|
||||
Matcher matcher = pattern.matcher(text);
|
||||
|
||||
log.debug(
|
||||
"Searching for '{}' in page {} with regex '{}' (wholeWord: {}, useRegex: {})",
|
||||
processedSearchTerm,
|
||||
getCurrentPageNo(),
|
||||
regex,
|
||||
wholeWordSearch,
|
||||
useRegex);
|
||||
|
||||
int matchCount = 0;
|
||||
while (matcher.find()) {
|
||||
matchCount++;
|
||||
int matchStart = matcher.start();
|
||||
int matchEnd = matcher.end();
|
||||
|
||||
log.debug(
|
||||
"Found match #{} at positions {}-{}: '{}'",
|
||||
matchCount,
|
||||
matchStart,
|
||||
matchEnd,
|
||||
matcher.group());
|
||||
|
||||
float minX = Float.MAX_VALUE;
|
||||
float minY = Float.MAX_VALUE;
|
||||
float maxX = Float.MIN_VALUE;
|
||||
@ -119,13 +118,7 @@ public class TextFinder extends PDFTextStripper {
|
||||
boolean foundPosition = false;
|
||||
|
||||
for (int i = matchStart; i < matchEnd; i++) {
|
||||
if (i >= pageTextPositions.size()) {
|
||||
log.debug(
|
||||
"Position index {} exceeds available positions ({})",
|
||||
i,
|
||||
pageTextPositions.size());
|
||||
continue;
|
||||
}
|
||||
if (i >= pageTextPositions.size()) continue;
|
||||
TextPosition pos = pageTextPositions.get(i);
|
||||
if (pos != null) {
|
||||
foundPosition = true;
|
||||
@ -137,11 +130,6 @@ public class TextFinder extends PDFTextStripper {
|
||||
}
|
||||
|
||||
if (!foundPosition && matchStart < pageTextPositions.size()) {
|
||||
log.debug(
|
||||
"Attempting to find nearby positions for match at {}-{}",
|
||||
matchStart,
|
||||
matchEnd);
|
||||
|
||||
for (int i = Math.max(0, matchStart - 5);
|
||||
i < Math.min(pageTextPositions.size(), matchEnd + 5);
|
||||
i++) {
|
||||
@ -166,29 +154,11 @@ public class TextFinder extends PDFTextStripper {
|
||||
maxX,
|
||||
maxY,
|
||||
matcher.group()));
|
||||
log.debug(
|
||||
"Added PDFText for match: page={}, bounds=({},{},{},{}), text='{}'",
|
||||
getCurrentPageNo() - 1,
|
||||
minX,
|
||||
minY,
|
||||
maxX,
|
||||
maxY,
|
||||
matcher.group());
|
||||
} else {
|
||||
log.warn(
|
||||
"Found text match '{}' but no valid position data at {}-{}",
|
||||
matcher.group(),
|
||||
matchStart,
|
||||
matchEnd);
|
||||
// no position info
|
||||
}
|
||||
}
|
||||
|
||||
log.debug(
|
||||
"Page {} search complete: found {} matches for '{}'",
|
||||
getCurrentPageNo(),
|
||||
matchCount,
|
||||
processedSearchTerm);
|
||||
|
||||
super.endPage(page);
|
||||
}
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -2,6 +2,7 @@ package stirling.software.SPDF.utils.text;
|
||||
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.CharBuffer;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.CharsetDecoder;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
@ -13,11 +14,9 @@ import org.apache.pdfbox.pdmodel.PDResources;
|
||||
import org.apache.pdfbox.pdmodel.font.*;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import stirling.software.SPDF.service.RedactionService;
|
||||
|
||||
@Slf4j
|
||||
@UtilityClass
|
||||
public class TextDecodingHelper {
|
||||
|
||||
@ -25,6 +24,8 @@ public class TextDecodingHelper {
|
||||
private final int ASCII_UPPER_BOUND = 126;
|
||||
private final int EXTENDED_ASCII_LOWER_BOUND = 160;
|
||||
private final int EXTENDED_ASCII_UPPER_BOUND = 255;
|
||||
private final int PROBLEMATIC_CODE_LOWER_BOUND = 65488;
|
||||
private final int PROBLEMATIC_CODE_UPPER_BOUND = 65535;
|
||||
|
||||
public PDFont getFontSafely(PDResources resources, COSName fontName) {
|
||||
if (resources == null || fontName == null) {
|
||||
@ -33,27 +34,15 @@ public class TextDecodingHelper {
|
||||
|
||||
try {
|
||||
PDFont font = resources.getFont(fontName);
|
||||
if (font == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (font == null) return null;
|
||||
try {
|
||||
String fontNameCheck = font.getName();
|
||||
if (fontNameCheck == null || fontNameCheck.trim().isEmpty()) {
|
||||
log.debug("Font {} has null or empty name, skipping", fontName.getName());
|
||||
return null;
|
||||
}
|
||||
String n = font.getName();
|
||||
if (n == null || n.trim().isEmpty()) return null;
|
||||
} catch (Exception e) {
|
||||
log.debug(
|
||||
"Error accessing font name for {}, skipping: {}",
|
||||
fontName.getName(),
|
||||
e.getMessage());
|
||||
return null;
|
||||
}
|
||||
|
||||
return font;
|
||||
} catch (Exception e) {
|
||||
log.debug("Error retrieving font {}: {}", fontName.getName(), e.getMessage());
|
||||
return null;
|
||||
}
|
||||
}
|
||||
@ -65,90 +54,160 @@ public class TextDecodingHelper {
|
||||
|
||||
try {
|
||||
byte[] bytes = cosString.getBytes();
|
||||
if (bytes.length == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (bytes.length == 0) return;
|
||||
String basicDecoded = tryDecodeWithFont(font, cosString);
|
||||
if (basicDecoded != null
|
||||
&& !basicDecoded.contains("?")
|
||||
&& !basicDecoded.trim().isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
&& !basicDecoded.trim().isEmpty()) return;
|
||||
decodeCharactersEnhanced(font, bytes);
|
||||
|
||||
} catch (Exception e) {
|
||||
log.error("Decoding failed: {}", e.getMessage(), e);
|
||||
try {
|
||||
tryDecodeWithFont(font, cosString);
|
||||
} catch (Exception fallbackException) {
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public String decodeCharactersEnhanced(PDFont font, byte[] bytes) {
|
||||
// Try font-guided decoding first
|
||||
String fontPass = decodeByFontTables(font, bytes);
|
||||
if (isAcceptable(fontPass)) return fontPass;
|
||||
|
||||
// Try UTF-8 strict decoding
|
||||
String utf8 = tryDecodeCharset(bytes, StandardCharsets.UTF_8);
|
||||
if (isAcceptable(utf8)) return utf8;
|
||||
|
||||
// UTF-16 BE/LE
|
||||
String u16be = tryDecodeCharset(bytes, StandardCharsets.UTF_16BE);
|
||||
if (isAcceptable(u16be)) return u16be;
|
||||
|
||||
String u16le = tryDecodeCharset(bytes, StandardCharsets.UTF_16LE);
|
||||
if (isAcceptable(u16le)) return u16le;
|
||||
|
||||
// Common Windows encodings
|
||||
String win1252 = tryDecodeCharset(bytes, Charset.forName("windows-1252"));
|
||||
if (isAcceptable(win1252)) return win1252;
|
||||
|
||||
String win1250 = tryDecodeCharset(bytes, Charset.forName("windows-1250"));
|
||||
if (isAcceptable(win1250)) return win1250;
|
||||
|
||||
String gb2312 = tryDecodeCharset(bytes, Charset.forName("GB2312"));
|
||||
if (isAcceptable(gb2312)) return gb2312;
|
||||
|
||||
String big5 = tryDecodeCharset(bytes, Charset.forName("Big5"));
|
||||
if (isAcceptable(big5)) return big5;
|
||||
|
||||
String shiftJis = tryDecodeCharset(bytes, Charset.forName("Shift_JIS"));
|
||||
if (isAcceptable(shiftJis)) return shiftJis;
|
||||
|
||||
String euckr = tryDecodeCharset(bytes, Charset.forName("EUC-KR"));
|
||||
if (isAcceptable(euckr)) return euckr;
|
||||
|
||||
// Fallback to ISO-8859-1
|
||||
String latin1 = tryDecodeCharset(bytes, StandardCharsets.ISO_8859_1);
|
||||
return isAcceptable(latin1) ? latin1 : null;
|
||||
}
|
||||
|
||||
private String decodeByFontTables(PDFont font, byte[] bytes) {
|
||||
if (font == null || bytes == null || bytes.length == 0) return null;
|
||||
StringBuilder out = new StringBuilder();
|
||||
boolean hasValidCharacters = false;
|
||||
int i = 0;
|
||||
while (i < bytes.length) {
|
||||
int code = bytes[i] & 0xFF;
|
||||
String charStr = decodeSingleCharacter(font, code, bytes);
|
||||
|
||||
if (charStr == null && code >= 128 && i + 1 < bytes.length) {
|
||||
int combinedCode = (code << 8) | (bytes[i + 1] & 0xFF);
|
||||
charStr = decodeSingleCharacter(font, combinedCode, bytes);
|
||||
if (charStr != null) {
|
||||
i += 2; // Skip the next byte
|
||||
out.append(charStr);
|
||||
hasValidCharacters = true;
|
||||
continue;
|
||||
String ch = null;
|
||||
int consumed = 1;
|
||||
try {
|
||||
ch = tryToUnicode(font, bytes, i);
|
||||
if (ch == null && i + 1 < bytes.length) {
|
||||
consumed = 2;
|
||||
ch = tryToUnicode(font, bytes, i, 2);
|
||||
}
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
|
||||
if (charStr != null && !charStr.isEmpty()) {
|
||||
out.append(charStr);
|
||||
hasValidCharacters = true;
|
||||
} else {
|
||||
out.append('?');
|
||||
if (!isPrintable(ch)) {
|
||||
// Handle problematic character codes specifically
|
||||
ch = "<EFBFBD>";
|
||||
}
|
||||
i++;
|
||||
out.append(ch);
|
||||
i += consumed;
|
||||
}
|
||||
String result = out.toString();
|
||||
return hasValidCharacters ? result : null;
|
||||
String s = out.toString();
|
||||
return isAcceptable(s) ? s : null;
|
||||
}
|
||||
|
||||
private String tryToUnicode(PDFont font, byte[] bytes, int pos) {
|
||||
int code = bytes[pos] & 0xFF;
|
||||
try {
|
||||
return font.toUnicode(code);
|
||||
} catch (Exception e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private String tryToUnicode(PDFont font, byte[] bytes, int pos, int len) {
|
||||
if (pos + len - 1 >= bytes.length) return null;
|
||||
int code = 0;
|
||||
for (int j = 0; j < len; j++) code = (code << 8) | (bytes[pos + j] & 0xFF);
|
||||
try {
|
||||
return font.toUnicode(code);
|
||||
} catch (Exception e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private String tryDecodeCharset(byte[] bytes, Charset cs) {
|
||||
try {
|
||||
String s = new String(bytes, cs);
|
||||
return isPrintable(s) ? s : null;
|
||||
} catch (Exception e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isPrintable(String s) {
|
||||
if (s == null || s.isEmpty()) return false;
|
||||
int printable = 0;
|
||||
for (int i = 0; i < s.length(); ) {
|
||||
int cp = s.codePointAt(i);
|
||||
int type = Character.getType(cp);
|
||||
if (type != Character.CONTROL && type != Character.FORMAT && cp != 0xFFFD) printable++;
|
||||
i += Character.charCount(cp);
|
||||
}
|
||||
return printable >= Math.max(1, s.codePointCount(0, s.length()) * 3 / 4);
|
||||
}
|
||||
|
||||
private boolean isAcceptable(String s) {
|
||||
return isPrintable(s);
|
||||
}
|
||||
|
||||
public String decodeSingleCharacter(PDFont font, int code, byte[] bytes) {
|
||||
String charStr = null;
|
||||
|
||||
try {
|
||||
charStr = font.toUnicode(code);
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
|
||||
if (charStr == null && font instanceof PDType0Font type0Font) {
|
||||
try {
|
||||
int cid = (bytes.length > 1) ? ((bytes[0] & 0xFF) << 8) | (bytes[1] & 0xFF) : code;
|
||||
charStr = type0Font.toUnicode(cid);
|
||||
log.debug("CID decoding successful for code {}: {}", cid, charStr);
|
||||
} catch (Exception e) {
|
||||
log.debug("CID decoding failed for code {}: {}", code, e.getMessage());
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
}
|
||||
|
||||
if (charStr == null && font.getName() != null && font.getName().contains("+")) {
|
||||
charStr = mapSubsetCharacter(code);
|
||||
}
|
||||
|
||||
if (charStr == null) {
|
||||
charStr = fallbackCharacterMapping(code, bytes, font);
|
||||
}
|
||||
|
||||
return charStr;
|
||||
}
|
||||
|
||||
public String fallbackCharacterMapping(int code, byte[] bytes, PDFont font) {
|
||||
try {
|
||||
// Handle problematic high-range character codes that cause .notdef warnings
|
||||
if (code >= PROBLEMATIC_CODE_LOWER_BOUND && code <= PROBLEMATIC_CODE_UPPER_BOUND) {
|
||||
return handleProblematicCharacterCode(code, font);
|
||||
}
|
||||
|
||||
if (font instanceof PDType0Font && bytes.length > 1) {
|
||||
return null;
|
||||
}
|
||||
@ -164,18 +223,15 @@ public class TextDecodingHelper {
|
||||
String fontName = font.getName();
|
||||
if (fontName != null) {
|
||||
String lowerName = fontName.toLowerCase();
|
||||
if (lowerName.contains("cjk")
|
||||
|| lowerName.contains("gb")
|
||||
|| lowerName.contains("jp")) {
|
||||
// Basic CJK fallback (expand with a lookup table if needed)
|
||||
if (code >= 0x4E00 && code <= 0x9FFF) {
|
||||
return String.valueOf(
|
||||
(char) code); // Unicode Basic Multilingual Plane for CJK
|
||||
}
|
||||
if ((lowerName.contains("cjk")
|
||||
|| lowerName.contains("gb")
|
||||
|| lowerName.contains("jp"))
|
||||
&& code >= 0x4E00
|
||||
&& code <= 0x9FFF) {
|
||||
return String.valueOf((char) code);
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback to UTF-8/16 decoding attempt for unknown encodings
|
||||
try {
|
||||
if (bytes.length >= 2) {
|
||||
ByteBuffer buffer = ByteBuffer.wrap(bytes);
|
||||
@ -184,7 +240,7 @@ public class TextDecodingHelper {
|
||||
return charBuffer.toString();
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.debug("UTF fallback failed: {}", e.getMessage());
|
||||
|
||||
}
|
||||
|
||||
return null;
|
||||
@ -193,6 +249,19 @@ public class TextDecodingHelper {
|
||||
}
|
||||
}
|
||||
|
||||
public String handleProblematicCharacterCode(int code, PDFont font) {
|
||||
if (code >= PROBLEMATIC_CODE_LOWER_BOUND && code <= PROBLEMATIC_CODE_UPPER_BOUND) {
|
||||
int adjustedCode = code - PROBLEMATIC_CODE_LOWER_BOUND;
|
||||
if (adjustedCode >= ASCII_LOWER_BOUND) {
|
||||
return String.valueOf((char) adjustedCode);
|
||||
}
|
||||
if (font != null && font.getName() != null && font.getName().contains("+")) {
|
||||
return mapSubsetCharacter(adjustedCode);
|
||||
}
|
||||
}
|
||||
return "<EFBFBD>";
|
||||
}
|
||||
|
||||
public String mapSubsetCharacter(int code) {
|
||||
if (code >= ASCII_LOWER_BOUND && code <= ASCII_UPPER_BOUND) {
|
||||
return String.valueOf((char) code);
|
||||
@ -221,6 +290,7 @@ public class TextDecodingHelper {
|
||||
uni = font.toUnicode(code);
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
|
||||
if (uni != null) {
|
||||
out.append(uni);
|
||||
anyMapped = true;
|
||||
@ -239,6 +309,7 @@ public class TextDecodingHelper {
|
||||
u1 = font.toUnicode(b1);
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
|
||||
if (i + 1 < bytes.length) {
|
||||
int b2 = bytes[i + 1] & 0xFF;
|
||||
int code = (b1 << 8) | b2;
|
||||
@ -247,6 +318,12 @@ public class TextDecodingHelper {
|
||||
u2 = font.toUnicode(code);
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
|
||||
// Handle problematic multi-byte codes
|
||||
if (u2 == null && code >= PROBLEMATIC_CODE_LOWER_BOUND) {
|
||||
u2 = handleProblematicCharacterCode(code, font);
|
||||
}
|
||||
|
||||
if (u2 != null) {
|
||||
out.append(u2);
|
||||
i += 2;
|
||||
@ -267,12 +344,12 @@ public class TextDecodingHelper {
|
||||
}
|
||||
}
|
||||
|
||||
public static RedactionService.DecodedMapping buildDecodeMapping(PDFont font, byte[] bytes) {
|
||||
public RedactionService.DecodedMapping buildDecodeMapping(PDFont font, byte[] bytes) {
|
||||
RedactionService.DecodedMapping map = new RedactionService.DecodedMapping();
|
||||
if (font == null || bytes == null) {
|
||||
map.text = "";
|
||||
map.charByteStart = new int[0];
|
||||
map.charByteEnd = new int[0];
|
||||
map.setText("");
|
||||
map.setCharByteStart(new int[0]);
|
||||
map.setCharByteEnd(new int[0]);
|
||||
return map;
|
||||
}
|
||||
|
||||
@ -289,46 +366,32 @@ public class TextDecodingHelper {
|
||||
|
||||
while (i < bytes.length) {
|
||||
int start = i;
|
||||
String decodedChar = null;
|
||||
int consumed = 1;
|
||||
String decodedChar;
|
||||
int consumed;
|
||||
|
||||
try {
|
||||
if (isType0) {
|
||||
// Handle CID fonts and multi-byte encodings
|
||||
decodedChar = decodeType0Font((PDType0Font) font, bytes, i);
|
||||
consumed = getType0CharLength((PDType0Font) font, bytes, i);
|
||||
} else if (isType1) {
|
||||
// Handle Type1 fonts with specific encoding
|
||||
decodedChar = decodeType1Font((PDType1Font) font, bytes, i);
|
||||
consumed = getType1CharLength((PDType1Font) font, bytes, i);
|
||||
consumed = 1;
|
||||
} else if (isType3) {
|
||||
// Handle Type3 bitmap fonts
|
||||
decodedChar = decodeType3Font((PDType3Font) font, bytes, i);
|
||||
consumed = 1; // Type3 typically single byte
|
||||
consumed = 1;
|
||||
} else if (isTrueType) {
|
||||
// Handle TrueType fonts
|
||||
decodedChar = decodeTrueTypeFont((PDTrueTypeFont) font, bytes, i);
|
||||
consumed = getTrueTypeCharLength((PDTrueTypeFont) font, bytes, i);
|
||||
} else {
|
||||
// Generic fallback for other font types
|
||||
decodedChar = decodeGenericFont(font, bytes, i);
|
||||
consumed = getGenericCharLength(font, bytes, i);
|
||||
}
|
||||
|
||||
// Validate the consumed length
|
||||
if (consumed <= 0 || i + consumed > bytes.length) {
|
||||
consumed = 1;
|
||||
}
|
||||
|
||||
if (consumed <= 0 || i + consumed > bytes.length) consumed = 1;
|
||||
} catch (Exception e) {
|
||||
// Log the error for debugging purposes
|
||||
System.err.println(
|
||||
"Error decoding character at position " + i + ": " + e.getMessage());
|
||||
decodedChar = null;
|
||||
consumed = 1;
|
||||
}
|
||||
|
||||
// Handle null or empty decoded characters
|
||||
if (decodedChar == null || decodedChar.isEmpty()) {
|
||||
decodedChar = handleUndecodableChar(bytes, i, consumed);
|
||||
}
|
||||
@ -345,15 +408,14 @@ public class TextDecodingHelper {
|
||||
i += consumed;
|
||||
}
|
||||
|
||||
map.text = sb.toString();
|
||||
map.charByteStart = starts.stream().mapToInt(Integer::intValue).toArray();
|
||||
map.charByteEnd = ends.stream().mapToInt(Integer::intValue).toArray();
|
||||
map.setText(sb.toString());
|
||||
map.setCharByteStart(starts.stream().mapToInt(Integer::intValue).toArray());
|
||||
map.setCharByteEnd(ends.stream().mapToInt(Integer::intValue).toArray());
|
||||
return map;
|
||||
}
|
||||
|
||||
private static String decodeType0Font(PDType0Font font, byte[] bytes, int position) {
|
||||
private String decodeType0Font(PDType0Font font, byte[] bytes, int position) {
|
||||
try {
|
||||
// Try multi-byte decoding first (common for CJK fonts)
|
||||
if (position + 1 < bytes.length) {
|
||||
int b1 = bytes[position] & 0xFF;
|
||||
int b2 = bytes[position + 1] & 0xFF;
|
||||
@ -372,7 +434,7 @@ public class TextDecodingHelper {
|
||||
}
|
||||
}
|
||||
|
||||
private static int getType0CharLength(PDType0Font font, byte[] bytes, int position) {
|
||||
private int getType0CharLength(PDType0Font font, byte[] bytes, int position) {
|
||||
try {
|
||||
if (position + 1 < bytes.length) {
|
||||
int b1 = bytes[position] & 0xFF;
|
||||
@ -389,7 +451,7 @@ public class TextDecodingHelper {
|
||||
}
|
||||
}
|
||||
|
||||
private static String decodeType1Font(PDType1Font font, byte[] bytes, int position) {
|
||||
private String decodeType1Font(PDType1Font font, byte[] bytes, int position) {
|
||||
try {
|
||||
int code = bytes[position] & 0xFF;
|
||||
return font.toUnicode(code);
|
||||
@ -398,11 +460,7 @@ public class TextDecodingHelper {
|
||||
}
|
||||
}
|
||||
|
||||
private static int getType1CharLength(PDType1Font font, byte[] bytes, int position) {
|
||||
return 1; // Type1 fonts are typically single-byte
|
||||
}
|
||||
|
||||
private static String decodeType3Font(PDType3Font font, byte[] bytes, int position) {
|
||||
private String decodeType3Font(PDType3Font font, byte[] bytes, int position) {
|
||||
try {
|
||||
int code = bytes[position] & 0xFF;
|
||||
return font.toUnicode(code);
|
||||
@ -411,7 +469,7 @@ public class TextDecodingHelper {
|
||||
}
|
||||
}
|
||||
|
||||
private static String decodeTrueTypeFont(PDTrueTypeFont font, byte[] bytes, int position) {
|
||||
private String decodeTrueTypeFont(PDTrueTypeFont font, byte[] bytes, int position) {
|
||||
try {
|
||||
int code = bytes[position] & 0xFF;
|
||||
String unicode = font.toUnicode(code);
|
||||
@ -429,7 +487,7 @@ public class TextDecodingHelper {
|
||||
}
|
||||
}
|
||||
|
||||
private static int getTrueTypeCharLength(PDTrueTypeFont font, byte[] bytes, int position) {
|
||||
private int getTrueTypeCharLength(PDTrueTypeFont font, byte[] bytes, int position) {
|
||||
try {
|
||||
// First try single byte
|
||||
int code = bytes[position] & 0xFF;
|
||||
@ -454,7 +512,7 @@ public class TextDecodingHelper {
|
||||
}
|
||||
}
|
||||
|
||||
private static String decodeGenericFont(PDFont font, byte[] bytes, int position) {
|
||||
private String decodeGenericFont(PDFont font, byte[] bytes, int position) {
|
||||
try {
|
||||
int code = bytes[position] & 0xFF;
|
||||
return font.toUnicode(code);
|
||||
@ -463,13 +521,8 @@ public class TextDecodingHelper {
|
||||
}
|
||||
}
|
||||
|
||||
private static int getGenericCharLength(PDFont font, byte[] bytes, int position) {
|
||||
return 1; // Default to single byte for unknown font types
|
||||
}
|
||||
private String handleUndecodableChar(byte[] bytes, int position, int length) {
|
||||
|
||||
private static String handleUndecodableChar(byte[] bytes, int position, int length) {
|
||||
|
||||
// Or try to interpret as ISO-8859-1 (Latin-1) as fallback
|
||||
try {
|
||||
byte[] charBytes = new byte[length];
|
||||
System.arraycopy(bytes, position, charBytes, 0, length);
|
||||
@ -478,9 +531,7 @@ public class TextDecodingHelper {
|
||||
return fallback;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
// Ignore and fall through to default
|
||||
}
|
||||
|
||||
return "<EFBFBD>"; // Unicode replacement character instead of "?"
|
||||
return "<EFBFBD>";
|
||||
}
|
||||
}
|
||||
|
@ -1,11 +1,6 @@
|
||||
package stirling.software.SPDF.utils.text;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
|
||||
import org.apache.pdfbox.pdmodel.font.encoding.DictionaryEncoding;
|
||||
import org.apache.pdfbox.pdmodel.font.encoding.Encoding;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
@ -15,225 +10,360 @@ import lombok.extern.slf4j.Slf4j;
|
||||
public class TextEncodingHelper {
|
||||
|
||||
public boolean canEncodeCharacters(PDFont font, String text) {
|
||||
if (font == null || text == null || text.isEmpty()) {
|
||||
if (font == null || text == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (text.isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
try {
|
||||
// Step 1: Primary check - full-string encoding (permissive for "good" cases)
|
||||
byte[] encoded = font.encode(text);
|
||||
if (encoded.length > 0) {
|
||||
log.debug(
|
||||
"Text '{}' has good full-string encoding for font {} - permissively allowing",
|
||||
text,
|
||||
font.getName() != null ? font.getName() : "Unknown");
|
||||
return true;
|
||||
}
|
||||
|
||||
// Step 2: Smart array-based fallback for TJ operator-style text
|
||||
log.debug(
|
||||
"Full encoding failed for '{}' - using array-based fallback for font {}",
|
||||
text,
|
||||
font.getName() != null ? font.getName() : "Unknown");
|
||||
|
||||
return validateAsCodePointArray(font, text);
|
||||
|
||||
} catch (IOException | IllegalArgumentException e) {
|
||||
log.debug(
|
||||
"Encoding exception for text '{}' with font {} - trying array fallback: {}",
|
||||
text,
|
||||
font.getName() != null ? font.getName() : "Unknown",
|
||||
e.getMessage());
|
||||
|
||||
if (isFontSubset(font.getName()) || hasCustomEncoding(font)) {
|
||||
return validateAsCodePointArray(font, text);
|
||||
}
|
||||
|
||||
return false; // Non-subset fonts with encoding exceptions are likely problematic
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
return validateAsCodePointArray(font, text);
|
||||
}
|
||||
|
||||
private boolean validateAsCodePointArray(PDFont font, String text) {
|
||||
if (text == null || text.isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
int totalCodePoints = 0;
|
||||
int successfulCodePoints = 0;
|
||||
|
||||
// Iterate through code points (handles surrogates correctly per Unicode docs)
|
||||
for (int i = 0; i < text.length(); ) {
|
||||
int codePoint = text.codePointAt(i);
|
||||
String charStr = new String(Character.toChars(codePoint));
|
||||
totalCodePoints++;
|
||||
|
||||
try {
|
||||
// Test encoding for this code point
|
||||
byte[] charEncoded = font.encode(charStr);
|
||||
if (charEncoded.length > 0) {
|
||||
float charWidth = font.getStringWidth(charStr);
|
||||
|
||||
if (charWidth >= 0) {
|
||||
successfulCodePoints++;
|
||||
log.debug(
|
||||
"Code point '{}' (U+{}) encoded successfully",
|
||||
charStr,
|
||||
Integer.toHexString(codePoint).toUpperCase());
|
||||
} else {
|
||||
log.debug(
|
||||
"Code point '{}' (U+{}) has invalid width: {}",
|
||||
charStr,
|
||||
Integer.toHexString(codePoint).toUpperCase(),
|
||||
charWidth);
|
||||
try {
|
||||
float charWidth = font.getStringWidth(charStr);
|
||||
if (charWidth >= 0) {
|
||||
successfulCodePoints++;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
try {
|
||||
if (canDecodeCharacter(font, charStr)) {
|
||||
successfulCodePoints++;
|
||||
}
|
||||
} catch (Exception e2) {
|
||||
}
|
||||
}
|
||||
} else {
|
||||
log.debug(
|
||||
"Code point '{}' (U+{}) encoding failed - empty result",
|
||||
charStr,
|
||||
Integer.toHexString(codePoint).toUpperCase());
|
||||
try {
|
||||
if (canDecodeCharacter(font, charStr)) {
|
||||
successfulCodePoints++;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
try {
|
||||
if (canDecodeCharacter(font, charStr)) {
|
||||
successfulCodePoints++;
|
||||
}
|
||||
} catch (Exception e2) {
|
||||
if (isBasicCharacter(codePoint)) {
|
||||
successfulCodePoints++;
|
||||
}
|
||||
}
|
||||
} catch (IOException | IllegalArgumentException e) {
|
||||
log.debug(
|
||||
"Code point '{}' (U+{}) validation failed: {}",
|
||||
charStr,
|
||||
Integer.toHexString(codePoint).toUpperCase(),
|
||||
e.getMessage());
|
||||
}
|
||||
|
||||
i += Character.charCount(codePoint); // Handle surrogates properly
|
||||
i += Character.charCount(codePoint);
|
||||
}
|
||||
|
||||
double successRate =
|
||||
totalCodePoints > 0 ? (double) successfulCodePoints / totalCodePoints : 0;
|
||||
boolean isAcceptable = successRate >= 0.95;
|
||||
if (totalCodePoints == 0) {
|
||||
return true;
|
||||
}
|
||||
|
||||
log.debug(
|
||||
"Array validation for '{}': {}/{} code points successful ({:.1f}%) - {}",
|
||||
text,
|
||||
successfulCodePoints,
|
||||
totalCodePoints,
|
||||
successRate * 100,
|
||||
isAcceptable ? "ALLOWING" : "rejecting");
|
||||
|
||||
return isAcceptable;
|
||||
double successRate = (double) successfulCodePoints / totalCodePoints;
|
||||
return successRate >= 0.1;
|
||||
}
|
||||
|
||||
public boolean isTextSegmentRemovable(PDFont font, String text) {
|
||||
if (font == null || text == null || text.isEmpty()) {
|
||||
private boolean canDecodeCharacter(PDFont font, String charStr) {
|
||||
if (font == null || charStr == null || charStr.isEmpty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Log the attempt
|
||||
log.debug(
|
||||
"Evaluating text segment for removal: '{}' with font {}",
|
||||
text,
|
||||
font.getName() != null ? font.getName() : "Unknown Font");
|
||||
try {
|
||||
for (int code = 0; code <= 0xFFFF; code++) {
|
||||
try {
|
||||
String decoded = font.toUnicode(code);
|
||||
if (decoded != null && decoded.equals(charStr)) {
|
||||
return true;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean isBasicCharacter(int codePoint) {
|
||||
return (codePoint >= 32 && codePoint <= 126)
|
||||
|| (codePoint >= 160 && codePoint <= 255)
|
||||
|| Character.isWhitespace(codePoint)
|
||||
|| Character.isLetterOrDigit(codePoint);
|
||||
}
|
||||
|
||||
public boolean isTextSegmentRemovable(PDFont font, String text) {
|
||||
if (font == null || text == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (text.isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (isSimpleCharacter(text)) {
|
||||
try {
|
||||
font.encode(text);
|
||||
font.getStringWidth(text);
|
||||
log.debug(
|
||||
"Text '{}' is a simple character and passed validation - allowing removal",
|
||||
text);
|
||||
return true;
|
||||
} catch (Exception e) {
|
||||
log.debug(
|
||||
"Simple character '{}' failed basic validation with font {}: {}",
|
||||
text,
|
||||
font.getName() != null ? font.getName() : "Unknown",
|
||||
e.getMessage());
|
||||
return false;
|
||||
try {
|
||||
return canHandleText(font, text);
|
||||
} catch (Exception e2) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// For complex text, require comprehensive validation
|
||||
return isTextFullyRemovable(font, text);
|
||||
}
|
||||
|
||||
public boolean isTextFullyRemovable(PDFont font, String text) {
|
||||
if (font == null || text == null || text.isEmpty()) {
|
||||
private boolean canHandleText(PDFont font, String text) {
|
||||
if (font == null || text == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (text.isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
for (int i = 0; i < text.length(); ) {
|
||||
int codePoint = text.codePointAt(i);
|
||||
String charStr = new String(Character.toChars(codePoint));
|
||||
|
||||
boolean canHandle = false;
|
||||
|
||||
try {
|
||||
byte[] encoded = font.encode(charStr);
|
||||
if (encoded.length > 0) {
|
||||
canHandle = true;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
if (!canHandle) {
|
||||
try {
|
||||
if (canDecodeCharacter(font, charStr)) {
|
||||
canHandle = true;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
}
|
||||
|
||||
if (!canHandle && isBasicCharacter(codePoint)) {
|
||||
canHandle = true;
|
||||
}
|
||||
|
||||
if (!canHandle) {
|
||||
return false;
|
||||
}
|
||||
|
||||
i += Character.charCount(codePoint);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean isTextFullyRemovable(PDFont font, String text) {
|
||||
if (font == null || text == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (text.isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
try {
|
||||
// Check 1: Verify encoding capability using new smart approach
|
||||
if (!canEncodeCharacters(font, text)) {
|
||||
log.debug(
|
||||
"Text '{}' failed encoding validation for font {}",
|
||||
text,
|
||||
font.getName() != null ? font.getName() : "Unknown");
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check 2: Validate width calculation capability
|
||||
float width = font.getStringWidth(text);
|
||||
if (width < 0) { // Allow zero width (invisible chars) but reject negative (invalid)
|
||||
log.debug(
|
||||
"Text '{}' has invalid width {} for font {}",
|
||||
text,
|
||||
width,
|
||||
font.getName() != null ? font.getName() : "Unknown");
|
||||
return false; // Invalid metrics prevent accurate removal
|
||||
try {
|
||||
float width = font.getStringWidth(text);
|
||||
if (width < 0) {
|
||||
return false;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
try {
|
||||
if (!canCalculateTextWidth(font, text)) {
|
||||
return false;
|
||||
}
|
||||
} catch (Exception e2) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Check 3: Verify font descriptor completeness for redaction area calculation
|
||||
if (font.getFontDescriptor() == null) {
|
||||
log.debug(
|
||||
"Missing font descriptor for font {}",
|
||||
font.getName() != null ? font.getName() : "Unknown");
|
||||
return false;
|
||||
try {
|
||||
if (font.getFontDescriptor() == null) {
|
||||
try {
|
||||
return canHandleWithoutDescriptor(font, text);
|
||||
} catch (Exception e) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
try {
|
||||
return canHandleWithoutDescriptor(font, text);
|
||||
} catch (Exception e2) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Check 4: Test bounding box calculation for redaction area
|
||||
try {
|
||||
font.getFontDescriptor().getFontBoundingBox();
|
||||
} catch (IllegalArgumentException e) {
|
||||
log.debug(
|
||||
"Font bounding box unavailable for font {}: {}",
|
||||
font.getName() != null ? font.getName() : "Unknown",
|
||||
e.getMessage());
|
||||
} catch (Exception e) {
|
||||
try {
|
||||
return canHandleWithoutBoundingBox(font, text);
|
||||
} catch (Exception e2) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
|
||||
} catch (Exception e) {
|
||||
try {
|
||||
return canHandleText(font, text);
|
||||
} catch (Exception e2) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private boolean canCalculateTextWidth(PDFont font, String text) {
|
||||
if (font == null || text == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (text.isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
for (int i = 0; i < text.length(); ) {
|
||||
int codePoint = text.codePointAt(i);
|
||||
String charStr = new String(Character.toChars(codePoint));
|
||||
|
||||
boolean hasWidth = false;
|
||||
try {
|
||||
float charWidth = font.getStringWidth(charStr);
|
||||
if (charWidth >= 0) {
|
||||
hasWidth = true;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
try {
|
||||
float defaultWidth = getDefaultCharWidth(font);
|
||||
if (defaultWidth > 0) {
|
||||
hasWidth = true;
|
||||
}
|
||||
} catch (Exception e2) {
|
||||
}
|
||||
}
|
||||
|
||||
if (!hasWidth && isBasicCharacter(codePoint)) {
|
||||
hasWidth = true;
|
||||
}
|
||||
|
||||
if (!hasWidth) {
|
||||
return false;
|
||||
}
|
||||
|
||||
log.debug(
|
||||
"Text '{}' passed comprehensive validation for font {}",
|
||||
text,
|
||||
font.getName() != null ? font.getName() : "Unknown");
|
||||
return true;
|
||||
i += Character.charCount(codePoint);
|
||||
}
|
||||
|
||||
} catch (IOException e) {
|
||||
log.debug(
|
||||
"Text '{}' failed validation for font {} due to IO error: {}",
|
||||
text,
|
||||
font.getName() != null ? font.getName() : "Unknown",
|
||||
e.getMessage());
|
||||
return false;
|
||||
} catch (IllegalArgumentException e) {
|
||||
log.debug(
|
||||
"Text '{}' failed validation for font {} due to argument error: {}",
|
||||
text,
|
||||
font.getName() != null ? font.getName() : "Unknown",
|
||||
e.getMessage());
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
private float getDefaultCharWidth(PDFont font) {
|
||||
String[] testChars = {" ", "a", "A", "0", ".", "e", "!", "i", "l", "I"};
|
||||
for (String testChar : testChars) {
|
||||
try {
|
||||
float width = font.getStringWidth(testChar);
|
||||
if (width > 0) {
|
||||
return width;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
}
|
||||
return 500;
|
||||
}
|
||||
|
||||
private boolean canHandleWithoutDescriptor(PDFont font, String text) {
|
||||
try {
|
||||
return canCalculateTextWidth(font, text);
|
||||
} catch (Exception e) {
|
||||
return canHandleText(font, text);
|
||||
}
|
||||
}
|
||||
|
||||
private boolean canHandleWithoutBoundingBox(PDFont font, String text) {
|
||||
try {
|
||||
return canCalculateTextWidth(font, text);
|
||||
} catch (Exception e) {
|
||||
return canHandleText(font, text);
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isSimpleCharacter(String text) {
|
||||
if (text == null || text.isEmpty()) {
|
||||
if (text == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (text.length() > 20) {
|
||||
if (text.isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (text.length() > 50) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = 0; i < text.length(); i++) {
|
||||
char c = text.charAt(i);
|
||||
|
||||
// Allow letters, digits, and whitespace (most common cases)
|
||||
if (Character.isLetterOrDigit(c) || Character.isWhitespace(c)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Allow common ASCII punctuation
|
||||
if (c >= 32 && c <= 126 && ".,!?;:()-[]{}\"'/@#$%&*+=<>|\\~`".indexOf(c) >= 0) {
|
||||
if (c >= 32 && c <= 126) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (c >= 160 && c <= 255) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (Character.getType(c) == Character.OTHER_PUNCTUATION
|
||||
|| Character.getType(c) == Character.DASH_PUNCTUATION
|
||||
|| Character.getType(c) == Character.START_PUNCTUATION
|
||||
|| Character.getType(c) == Character.END_PUNCTUATION
|
||||
|| Character.getType(c) == Character.CONNECTOR_PUNCTUATION
|
||||
|| Character.getType(c) == Character.OTHER_SYMBOL
|
||||
|| Character.getType(c) == Character.MATH_SYMBOL
|
||||
|| Character.getType(c) == Character.CURRENCY_SYMBOL) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -243,111 +373,205 @@ public class TextEncodingHelper {
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean hasCustomEncoding(PDFont font) {
|
||||
try {
|
||||
if (font instanceof PDSimpleFont simpleFont) {
|
||||
try {
|
||||
Encoding encoding = simpleFont.getEncoding();
|
||||
if (encoding != null) {
|
||||
// Check for dictionary-based custom encodings
|
||||
if (encoding instanceof DictionaryEncoding) {
|
||||
log.debug("Font {} uses DictionaryEncoding (custom)", font.getName());
|
||||
return true;
|
||||
}
|
||||
|
||||
String encodingName = encoding.getClass().getSimpleName();
|
||||
if (encodingName.contains("Custom")
|
||||
|| encodingName.contains("Dictionary")) {
|
||||
log.debug(
|
||||
"Font {} uses custom encoding: {}",
|
||||
font.getName(),
|
||||
encodingName);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.debug(
|
||||
"Encoding detection failed for font {}: {}",
|
||||
font.getName(),
|
||||
e.getMessage());
|
||||
return true; // Assume custom if detection fails
|
||||
}
|
||||
}
|
||||
|
||||
if (font instanceof org.apache.pdfbox.pdmodel.font.PDType0Font) {
|
||||
log.debug(
|
||||
"Font {} is Type0 (CID) - generally uses standard CMaps",
|
||||
font.getName() != null ? font.getName() : "Unknown");
|
||||
return false;
|
||||
}
|
||||
|
||||
log.debug(
|
||||
"Font {} type {} - assuming standard encoding",
|
||||
font.getName() != null ? font.getName() : "Unknown",
|
||||
font.getClass().getSimpleName());
|
||||
return false;
|
||||
|
||||
} catch (IllegalArgumentException e) {
|
||||
log.debug(
|
||||
"Custom encoding detection failed for font {}: {}",
|
||||
font.getName() != null ? font.getName() : "Unknown",
|
||||
e.getMessage());
|
||||
return false; // Be forgiving on detection failure
|
||||
}
|
||||
}
|
||||
|
||||
public boolean fontSupportsCharacter(PDFont font, String character) {
|
||||
if (font == null || character == null || character.isEmpty()) {
|
||||
if (font == null || character == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (character.isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
try {
|
||||
byte[] encoded = font.encode(character);
|
||||
if (encoded.length == 0) {
|
||||
return false;
|
||||
if (encoded.length > 0) {
|
||||
try {
|
||||
float width = font.getStringWidth(character);
|
||||
if (width >= 0) {
|
||||
return true;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
return true;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
float width = font.getStringWidth(character);
|
||||
return width > 0;
|
||||
try {
|
||||
if (canDecodeCharacter(font, character)) {
|
||||
return true;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
} catch (IOException | IllegalArgumentException e) {
|
||||
log.debug(
|
||||
"Character '{}' not supported by font {}: {}",
|
||||
character,
|
||||
font.getName() != null ? font.getName() : "Unknown",
|
||||
e.getMessage());
|
||||
for (int i = 0; i < character.length(); ) {
|
||||
int codePoint = character.codePointAt(i);
|
||||
if (isBasicCharacter(codePoint)) {
|
||||
i += Character.charCount(codePoint);
|
||||
continue;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean isFontSubset(String fontName) {
|
||||
if (fontName == null) {
|
||||
return false;
|
||||
}
|
||||
return fontName.matches("^[A-Z]{6}\\+.*");
|
||||
|
||||
if (fontName.matches("^[A-Z]{6}\\+.*")) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (fontName.matches("^[A-Z]{5}\\+.*")) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (fontName.matches("^[A-Z]{4}\\+.*")) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (fontName.contains("+")) {
|
||||
String prefix = fontName.split("\\+")[0];
|
||||
if (prefix.matches("^[A-Z]+$") && prefix.length() >= 4) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean canCalculateBasicWidths(PDFont font) {
|
||||
if (font == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
try {
|
||||
float spaceWidth = font.getStringWidth(" ");
|
||||
if (spaceWidth <= 0) {
|
||||
return false;
|
||||
if (spaceWidth > 0) {
|
||||
return true;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
String[] testChars = {"a", "A", "0", ".", "e", "!"};
|
||||
for (String ch : testChars) {
|
||||
String[] testChars = {
|
||||
"a", "A", "0", ".", "e", "!", "i", "l", "I", "m", "M", "W", "w", "1", "|", "-", "_",
|
||||
"=", "+", "(", ")", "[", "]", "{", "}", "<", ">", "/", "\\", "?", ",", ";", ":", "\"",
|
||||
"'", "`", "~", "@", "#", "$", "%", "^", "&", "*"
|
||||
};
|
||||
int successCount = 0;
|
||||
|
||||
for (String ch : testChars) {
|
||||
try {
|
||||
float width = font.getStringWidth(ch);
|
||||
if (width > 0) {
|
||||
successCount++;
|
||||
if (successCount >= 3) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
for (int code = 32; code <= 126; code++) {
|
||||
try {
|
||||
String ch = String.valueOf((char) code);
|
||||
float width = font.getStringWidth(ch);
|
||||
if (width > 0) {
|
||||
successCount++;
|
||||
if (successCount >= 1) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
try {
|
||||
for (int code = 160; code <= 255; code++) {
|
||||
try {
|
||||
String ch = String.valueOf((char) code);
|
||||
float width = font.getStringWidth(ch);
|
||||
if (width > 0) {
|
||||
return true;
|
||||
}
|
||||
} catch (IOException | IllegalArgumentException e) {
|
||||
} catch (Exception e) {
|
||||
}
|
||||
}
|
||||
|
||||
return false; // Can't calculate width for any test characters
|
||||
} catch (IOException | IllegalArgumentException e) {
|
||||
return false; // Font failed basic width calculation
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean canEncodeAnyCharacter(PDFont font) {
|
||||
if (font == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
String[] testStrings = {
|
||||
"a", "A", "0", " ", ".", "!", "e", "i", "o", "u", "n", "t", "r", "s", "l", "1", "2",
|
||||
"3", "4", "5", "6", "7", "8", "9", ",", ".", ";", ":", "?", "!", "(", ")", "[", "]",
|
||||
"{", "}", "hello", "test", "sample", "abc", "123", "ABC"
|
||||
};
|
||||
|
||||
for (String testStr : testStrings) {
|
||||
try {
|
||||
byte[] encoded = font.encode(testStr);
|
||||
if (encoded.length > 0) {
|
||||
return true;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
}
|
||||
|
||||
for (int code = 0; code <= 0xFFFF; code += 100) {
|
||||
try {
|
||||
String testStr = String.valueOf((char) code);
|
||||
byte[] encoded = font.encode(testStr);
|
||||
if (encoded.length > 0) {
|
||||
return true;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean isValidFont(PDFont font) {
|
||||
if (font == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
try {
|
||||
String name = font.getName();
|
||||
if (name != null && !name.trim().isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
try {
|
||||
if (canCalculateBasicWidths(font)) {
|
||||
return true;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
try {
|
||||
if (canEncodeAnyCharacter(font)) {
|
||||
return true;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -5,10 +5,6 @@ import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDResources;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@ -16,128 +12,116 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@UtilityClass
|
||||
public class TextFinderUtils {
|
||||
|
||||
public boolean validateFontReliability(PDFont font) {
|
||||
if (font == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (font.isDamaged()) {
|
||||
log.debug(
|
||||
"Font {} is marked as damaged - using TextEncodingHelper validation",
|
||||
font.getName());
|
||||
}
|
||||
|
||||
if (TextEncodingHelper.canCalculateBasicWidths(font)) {
|
||||
log.debug(
|
||||
"Font {} passed basic width calculations - considering reliable",
|
||||
font.getName());
|
||||
return true;
|
||||
}
|
||||
|
||||
String[] basicTests = {"1", "2", "3", "a", "A", "e", "E", " "};
|
||||
|
||||
int workingChars = 0;
|
||||
for (String testChar : basicTests) {
|
||||
if (TextEncodingHelper.canEncodeCharacters(font, testChar)) {
|
||||
workingChars++;
|
||||
}
|
||||
}
|
||||
|
||||
if (workingChars > 0) {
|
||||
log.debug(
|
||||
"Font {} can process {}/{} basic characters - considering reliable",
|
||||
font.getName(),
|
||||
workingChars,
|
||||
basicTests.length);
|
||||
return true;
|
||||
}
|
||||
|
||||
log.debug("Font {} failed all basic tests - considering unreliable", font.getName());
|
||||
return false;
|
||||
}
|
||||
|
||||
public List<Pattern> createOptimizedSearchPatterns(
|
||||
Set<String> searchTerms, boolean useRegex, boolean wholeWordSearch) {
|
||||
List<Pattern> patterns = new ArrayList<>();
|
||||
|
||||
if (searchTerms == null) {
|
||||
return patterns;
|
||||
}
|
||||
|
||||
for (String term : searchTerms) {
|
||||
if (term == null || term.trim().isEmpty()) {
|
||||
if (term == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
String trimmedTerm = term.trim();
|
||||
if (trimmedTerm.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
try {
|
||||
String patternString = useRegex ? term.trim() : Pattern.quote(term.trim());
|
||||
|
||||
if (wholeWordSearch) {
|
||||
patternString = applyWordBoundaries(term.trim(), patternString);
|
||||
String patternString;
|
||||
if (useRegex) {
|
||||
patternString = trimmedTerm;
|
||||
try {
|
||||
Pattern.compile(patternString);
|
||||
} catch (Exception e) {
|
||||
patternString = Pattern.quote(trimmedTerm);
|
||||
}
|
||||
} else {
|
||||
patternString = Pattern.quote(trimmedTerm);
|
||||
}
|
||||
|
||||
Pattern pattern =
|
||||
Pattern.compile(
|
||||
patternString, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
|
||||
if (wholeWordSearch) {
|
||||
patternString = applyWordBoundaries(trimmedTerm, patternString, useRegex);
|
||||
}
|
||||
|
||||
int flags = Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE | Pattern.DOTALL;
|
||||
try {
|
||||
flags |= Pattern.CANON_EQ;
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
Pattern pattern = Pattern.compile(patternString, flags);
|
||||
patterns.add(pattern);
|
||||
|
||||
log.debug("Created search pattern: '{}' -> '{}'", term.trim(), patternString);
|
||||
|
||||
} catch (Exception e) {
|
||||
log.warn("Failed to create pattern for term '{}': {}", term, e.getMessage());
|
||||
try {
|
||||
String quotedTerm = Pattern.quote(trimmedTerm);
|
||||
if (wholeWordSearch) {
|
||||
quotedTerm = applyWordBoundaries(trimmedTerm, quotedTerm, false);
|
||||
}
|
||||
Pattern fallbackPattern =
|
||||
Pattern.compile(
|
||||
quotedTerm, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
|
||||
patterns.add(fallbackPattern);
|
||||
} catch (Exception e2) {
|
||||
try {
|
||||
Pattern simplestPattern = Pattern.compile(Pattern.quote(trimmedTerm));
|
||||
patterns.add(simplestPattern);
|
||||
} catch (Exception e3) {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return patterns;
|
||||
}
|
||||
|
||||
private String applyWordBoundaries(String originalTerm, String patternString) {
|
||||
if (originalTerm.length() == 1 && Character.isDigit(originalTerm.charAt(0))) {
|
||||
return "(?<![\\w])" + patternString + "(?![\\w])";
|
||||
} else if (originalTerm.length() == 1) {
|
||||
return "(?<![\\w])" + patternString + "(?![\\w])";
|
||||
} else {
|
||||
return "\\b" + patternString + "\\b";
|
||||
}
|
||||
}
|
||||
|
||||
public boolean hasProblematicFonts(PDPage page) {
|
||||
if (page == null) {
|
||||
return false;
|
||||
private String applyWordBoundaries(String originalTerm, String patternString, boolean isRegex) {
|
||||
if (originalTerm == null || originalTerm.isEmpty()) {
|
||||
return patternString;
|
||||
}
|
||||
|
||||
try {
|
||||
PDResources resources = page.getResources();
|
||||
if (resources == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
int totalFonts = 0;
|
||||
int completelyUnusableFonts = 0;
|
||||
|
||||
for (org.apache.pdfbox.cos.COSName fontName : resources.getFontNames()) {
|
||||
try {
|
||||
org.apache.pdfbox.pdmodel.font.PDFont font = resources.getFont(fontName);
|
||||
if (font != null) {
|
||||
totalFonts++;
|
||||
if (!validateFontReliability(font)) {
|
||||
completelyUnusableFonts++;
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.debug("Font loading failed for {}: {}", fontName.getName(), e.getMessage());
|
||||
totalFonts++;
|
||||
if (originalTerm.length() == 1) {
|
||||
char c = originalTerm.charAt(0);
|
||||
if (Character.isDigit(c)) {
|
||||
return "(?<![\\p{L}\\p{N}])" + patternString + "(?![\\p{L}\\p{N}])";
|
||||
} else if (Character.isLetter(c)) {
|
||||
return "(?<![\\p{L}\\p{N}])" + patternString + "(?![\\p{L}\\p{N}])";
|
||||
} else {
|
||||
return "(?<!\\S)" + patternString + "(?!\\S)";
|
||||
}
|
||||
}
|
||||
|
||||
boolean hasProblems = totalFonts > 0 && (completelyUnusableFonts * 2 > totalFonts);
|
||||
log.debug(
|
||||
"Page font analysis: {}/{} fonts are completely unusable - page {} problematic",
|
||||
completelyUnusableFonts,
|
||||
totalFonts,
|
||||
hasProblems ? "IS" : "is NOT");
|
||||
boolean startsWithWordChar = Character.isLetterOrDigit(originalTerm.charAt(0));
|
||||
boolean endsWithWordChar =
|
||||
Character.isLetterOrDigit(originalTerm.charAt(originalTerm.length() - 1));
|
||||
|
||||
return hasProblems;
|
||||
String result = patternString;
|
||||
|
||||
if (startsWithWordChar) {
|
||||
result = "(?<![\\p{L}\\p{N}])" + result;
|
||||
} else {
|
||||
result = "(?<!\\S)" + result;
|
||||
}
|
||||
|
||||
if (endsWithWordChar) {
|
||||
result = result + "(?![\\p{L}\\p{N}])";
|
||||
} else {
|
||||
result = result + "(?!\\S)";
|
||||
}
|
||||
|
||||
return result;
|
||||
|
||||
} catch (Exception e) {
|
||||
log.warn("Font analysis failed for page: {}", e.getMessage());
|
||||
return false; // Be permissive if analysis fails
|
||||
try {
|
||||
return "\\b" + patternString + "\\b";
|
||||
} catch (Exception e2) {
|
||||
return patternString;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,88 +1,69 @@
|
||||
package stirling.software.SPDF.utils.text;
|
||||
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.text.Normalizer;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType0Font;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@UtilityClass
|
||||
public class WidthCalculator {
|
||||
|
||||
private final int FONT_SCALE_FACTOR = 1000;
|
||||
private final float CONSERVATIVE_CHAR_WIDTH_RATIO = 0.55f;
|
||||
private final float BBOX_CHAR_WIDTH_RATIO = 0.65f;
|
||||
|
||||
private final Map<String, Float> widthCache = new ConcurrentHashMap<>();
|
||||
private final Map<String, Boolean> reliabilityCache = new ConcurrentHashMap<>();
|
||||
|
||||
private String createCacheKey(PDFont font, String text, float fontSize) {
|
||||
return String.format("%s|%s|%.2f", font.getName(), text, fontSize);
|
||||
}
|
||||
|
||||
private String createReliabilityCacheKey(PDFont font) {
|
||||
return font.getName();
|
||||
}
|
||||
|
||||
public float calculateAccurateWidth(PDFont font, String text, float fontSize) {
|
||||
return calculateAccurateWidth(font, text, fontSize, true);
|
||||
}
|
||||
if (font == null || text == null || fontSize <= 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
public float calculateAccurateWidth(
|
||||
PDFont font, String text, float fontSize, boolean useCache) {
|
||||
if (font == null || text == null || text.isEmpty() || fontSize <= 0) return 0;
|
||||
|
||||
if (useCache) {
|
||||
String cacheKey = createCacheKey(font, text, fontSize);
|
||||
Float cachedWidth = widthCache.get(cacheKey);
|
||||
if (cachedWidth != null) return cachedWidth;
|
||||
if (text.isEmpty()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
String normalizedText = normalizeText(text);
|
||||
|
||||
Float directWidth = calculateDirectWidth(font, normalizedText, fontSize);
|
||||
if (directWidth != null) {
|
||||
if (useCache) widthCache.put(createCacheKey(font, text, fontSize), directWidth);
|
||||
return directWidth;
|
||||
}
|
||||
|
||||
Float charByCharWidth = calculateCharacterByCharacterWidth(font, normalizedText, fontSize);
|
||||
if (charByCharWidth != null) {
|
||||
if (useCache) widthCache.put(createCacheKey(font, text, fontSize), charByCharWidth);
|
||||
return charByCharWidth;
|
||||
}
|
||||
|
||||
Float glyphWidth = calculateGlyphBasedWidth(font, normalizedText, fontSize);
|
||||
if (glyphWidth != null) {
|
||||
if (useCache) widthCache.put(createCacheKey(font, text, fontSize), glyphWidth);
|
||||
return glyphWidth;
|
||||
}
|
||||
|
||||
float fallbackWidth = calculateComprehensiveFallbackWidth(font, normalizedText, fontSize);
|
||||
if (useCache) widthCache.put(createCacheKey(font, text, fontSize), fallbackWidth);
|
||||
return fallbackWidth;
|
||||
return calculateComprehensiveFallbackWidth(font, normalizedText, fontSize);
|
||||
}
|
||||
|
||||
private String normalizeText(String text) {
|
||||
return Normalizer.normalize(text, Normalizer.Form.NFC);
|
||||
if (text == null) return "";
|
||||
try {
|
||||
return Normalizer.normalize(text, Normalizer.Form.NFC);
|
||||
} catch (Exception e) {
|
||||
return text;
|
||||
}
|
||||
}
|
||||
|
||||
private Float calculateDirectWidth(PDFont font, String text, float fontSize) {
|
||||
if (!TextEncodingHelper.canEncodeCharacters(font, text)) return null;
|
||||
|
||||
try {
|
||||
float rawWidth = font.getStringWidth(text);
|
||||
float scaledWidth = (rawWidth / FONT_SCALE_FACTOR) * fontSize;
|
||||
return rawWidth >= 0 && scaledWidth >= 0 ? scaledWidth : null;
|
||||
if (!TextEncodingHelper.canEncodeCharacters(font, text)) {
|
||||
return null;
|
||||
}
|
||||
float rawWidth = font.getStringWidth(text) / 1000f;
|
||||
if (rawWidth < 0) return null;
|
||||
float scaledWidth = rawWidth * fontSize;
|
||||
return scaledWidth >= 0 ? scaledWidth : null;
|
||||
} catch (Exception e) {
|
||||
return null;
|
||||
}
|
||||
@ -96,7 +77,12 @@ public class WidthCalculator {
|
||||
|
||||
for (int codePoint : codePoints) {
|
||||
String character = new String(Character.toChars(codePoint));
|
||||
Float charWidth = calculateSingleCharacterWidth(font, character, fontSize);
|
||||
Float charWidth =
|
||||
calculateSingleCharacterWidth(font, character, fontSize, codePoint);
|
||||
|
||||
if (charWidth == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
totalWidth += charWidth;
|
||||
if (previousCodePoint != -1) {
|
||||
@ -104,7 +90,7 @@ public class WidthCalculator {
|
||||
}
|
||||
previousCodePoint = codePoint;
|
||||
}
|
||||
return totalWidth;
|
||||
return totalWidth >= 0 ? totalWidth : null;
|
||||
} catch (Exception e) {
|
||||
return null;
|
||||
}
|
||||
@ -112,80 +98,99 @@ public class WidthCalculator {
|
||||
|
||||
private List<Integer> getCodePoints(String text) {
|
||||
List<Integer> codePoints = new ArrayList<>();
|
||||
if (text == null) return codePoints;
|
||||
|
||||
for (int i = 0; i < text.length(); ) {
|
||||
int codePoint = text.codePointAt(i);
|
||||
codePoints.add(codePoint);
|
||||
i += Character.charCount(codePoint);
|
||||
try {
|
||||
int codePoint = text.codePointAt(i);
|
||||
codePoints.add(codePoint);
|
||||
i += Character.charCount(codePoint);
|
||||
} catch (Exception e) {
|
||||
i++;
|
||||
}
|
||||
}
|
||||
return codePoints;
|
||||
}
|
||||
|
||||
private Float calculateSingleCharacterWidth(PDFont font, String character, float fontSize) {
|
||||
private Float calculateSingleCharacterWidth(
|
||||
PDFont font, String character, float fontSize, int codePoint) {
|
||||
try {
|
||||
byte[] encoded = null;
|
||||
|
||||
try {
|
||||
encoded = font.encode(character);
|
||||
if (encoded.length == 0) encoded = null;
|
||||
} catch (Exception e) {
|
||||
log.debug("Direct encoding failed for '{}': {}", character, e.getMessage());
|
||||
}
|
||||
|
||||
if (encoded == null && font instanceof PDType0Font) {
|
||||
if (TextEncodingHelper.fontSupportsCharacter(font, character)) {
|
||||
try {
|
||||
encoded = character.getBytes(StandardCharsets.UTF_8);
|
||||
} catch (Exception e) {
|
||||
log.debug("UTF-8 encoding failed for '{}': {}", character, e.getMessage());
|
||||
float raw = font.getStringWidth(character) / 1000f;
|
||||
if (raw >= 0) return raw * fontSize;
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
}
|
||||
|
||||
if (encoded != null && encoded.length > 0) {
|
||||
Float width = calculateGlyphWidth(font, encoded, fontSize);
|
||||
if (width != null && width >= 0) return width;
|
||||
}
|
||||
|
||||
return calculateAverageCharacterWidth(font, fontSize);
|
||||
|
||||
} catch (Exception e) {
|
||||
log.debug(
|
||||
"Single character width calculation failed for '{}': {}",
|
||||
character,
|
||||
e.getMessage());
|
||||
return calculateAverageCharacterWidth(font, fontSize);
|
||||
}
|
||||
}
|
||||
|
||||
private Float calculateGlyphWidth(PDFont font, byte[] encoded, float fontSize) {
|
||||
for (byte b : encoded) {
|
||||
try {
|
||||
int glyphCode = b & 0xFF;
|
||||
float glyphWidth = font.getWidth(glyphCode);
|
||||
try {
|
||||
float w = font.getWidth(codePoint) / 1000f;
|
||||
if (w >= 0) return w * fontSize;
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
|
||||
if (glyphWidth > 0) {
|
||||
return (glyphWidth / FONT_SCALE_FACTOR) * fontSize;
|
||||
}
|
||||
|
||||
// Try alternative width methods
|
||||
try {
|
||||
glyphWidth = font.getWidthFromFont(glyphCode);
|
||||
if (glyphWidth > 0) {
|
||||
return (glyphWidth / FONT_SCALE_FACTOR) * fontSize;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.debug(
|
||||
"getWidthFromFont failed for glyph {}: {}", glyphCode, e.getMessage());
|
||||
}
|
||||
|
||||
} catch (Exception e) {
|
||||
log.debug("Glyph width calculation failed for byte {}: {}", b, e.getMessage());
|
||||
try {
|
||||
if (codePoint >= 0 && codePoint <= 0xFFFF) {
|
||||
float w = font.getWidth(codePoint) / 1000f;
|
||||
if (w >= 0) return w * fontSize;
|
||||
}
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
return null;
|
||||
|
||||
try {
|
||||
byte[] encoded = font.encode(character);
|
||||
if (encoded.length > 0) {
|
||||
for (byte b : encoded) {
|
||||
try {
|
||||
int glyphCode = b & 0xFF;
|
||||
float w = font.getWidth(glyphCode) / 1000f;
|
||||
if (w >= 0) return w * fontSize;
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
|
||||
return calculateCategoryBasedWidth(font, codePoint, fontSize);
|
||||
}
|
||||
|
||||
private float calculateKerning(
|
||||
PDFont font, int leftCodePoint, int rightCodePoint, float fontSize) {
|
||||
return 0;
|
||||
try {
|
||||
if (font instanceof PDSimpleFont) {
|
||||
PDSimpleFont simpleFont = (PDSimpleFont) font;
|
||||
try {
|
||||
java.lang.reflect.Method getKerningMethod =
|
||||
simpleFont.getClass().getMethod("getKerning", int.class, int.class);
|
||||
float kerningValue =
|
||||
(Float)
|
||||
getKerningMethod.invoke(
|
||||
simpleFont, leftCodePoint, rightCodePoint);
|
||||
return (kerningValue / 1000f) * fontSize;
|
||||
} catch (Exception e) {
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
try {
|
||||
String leftChar = new String(Character.toChars(leftCodePoint));
|
||||
String rightChar = new String(Character.toChars(rightCodePoint));
|
||||
String combined = leftChar + rightChar;
|
||||
|
||||
float combinedWidth = font.getStringWidth(combined) / 1000f;
|
||||
float leftWidth = font.getStringWidth(leftChar) / 1000f;
|
||||
float rightWidth = font.getStringWidth(rightChar) / 1000f;
|
||||
|
||||
float kerning = combinedWidth - leftWidth - rightWidth;
|
||||
return kerning * fontSize;
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
return 0f;
|
||||
}
|
||||
|
||||
private Float calculateGlyphBasedWidth(PDFont font, String text, float fontSize) {
|
||||
@ -196,7 +201,6 @@ public class WidthCalculator {
|
||||
int codePoint = text.codePointAt(i);
|
||||
String character = new String(Character.toChars(codePoint));
|
||||
|
||||
// Try to get glyph information more comprehensively
|
||||
Float charWidth =
|
||||
calculateGlyphWidthComprehensively(font, character, codePoint, fontSize);
|
||||
if (charWidth == null) {
|
||||
@ -207,11 +211,9 @@ public class WidthCalculator {
|
||||
i += Character.charCount(codePoint);
|
||||
}
|
||||
|
||||
log.debug("Glyph-based width calculation: {}", totalWidth);
|
||||
return totalWidth;
|
||||
return totalWidth >= 0 ? totalWidth : null;
|
||||
|
||||
} catch (Exception e) {
|
||||
log.debug("Glyph-based calculation failed: {}", e.getMessage());
|
||||
return null;
|
||||
}
|
||||
}
|
||||
@ -219,70 +221,118 @@ public class WidthCalculator {
|
||||
private Float calculateGlyphWidthComprehensively(
|
||||
PDFont font, String character, int codePoint, float fontSize) {
|
||||
try {
|
||||
// Method 1: Try standard encoding
|
||||
try {
|
||||
byte[] encoded = font.encode(character);
|
||||
if (encoded.length > 0) {
|
||||
Float width = calculateWidthFromEncodedBytes(font, encoded, fontSize);
|
||||
if (width != null && width >= 0) {
|
||||
return width;
|
||||
}
|
||||
byte[] encoded = font.encode(character);
|
||||
if (encoded.length > 0) {
|
||||
Float width = calculateWidthFromEncodedBytes(font, encoded, fontSize);
|
||||
if (width != null && width >= 0) {
|
||||
return width;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.debug(
|
||||
"Standard encoding failed for U+{}: {}",
|
||||
Integer.toHexString(codePoint),
|
||||
e.getMessage());
|
||||
}
|
||||
|
||||
// Method 2: Try Unicode code point directly
|
||||
try {
|
||||
float glyphWidth = font.getWidth(codePoint);
|
||||
if (glyphWidth > 0) {
|
||||
return (glyphWidth / FONT_SCALE_FACTOR) * fontSize;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.debug(
|
||||
"Unicode code point width failed for U+{}: {}",
|
||||
Integer.toHexString(codePoint),
|
||||
e.getMessage());
|
||||
}
|
||||
|
||||
// Method 3: Character category based estimation
|
||||
return calculateCategoryBasedWidth(font, codePoint, fontSize);
|
||||
|
||||
} catch (Exception e) {
|
||||
log.debug("Comprehensive glyph width calculation failed: {}", e.getMessage());
|
||||
return calculateAverageCharacterWidth(font, fontSize);
|
||||
}
|
||||
|
||||
try {
|
||||
float glyphWidth = font.getWidth(codePoint) / 1000f;
|
||||
if (glyphWidth >= 0) {
|
||||
return glyphWidth * fontSize;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
try {
|
||||
if (codePoint <= 0xFFFF) {
|
||||
float glyphWidth = font.getWidth(codePoint) / 1000f;
|
||||
if (glyphWidth >= 0) {
|
||||
return glyphWidth * fontSize;
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
try {
|
||||
for (int code = 0; code <= 0xFF; code++) {
|
||||
try {
|
||||
String decoded = font.toUnicode(code);
|
||||
if (decoded != null && decoded.equals(character)) {
|
||||
float glyphWidth = font.getWidth(code) / 1000f;
|
||||
if (glyphWidth >= 0) {
|
||||
return glyphWidth * fontSize;
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
return calculateCategoryBasedWidth(font, codePoint, fontSize);
|
||||
}
|
||||
|
||||
private Float calculateWidthFromEncodedBytes(PDFont font, byte[] encoded, float fontSize) {
|
||||
// Try each byte as a potential glyph code
|
||||
for (byte b : encoded) {
|
||||
if (encoded == null || encoded.length == 0) return null;
|
||||
|
||||
if (font instanceof PDType0Font && encoded.length >= 2) {
|
||||
try {
|
||||
int glyphCode = b & 0xFF;
|
||||
float width = font.getWidth(glyphCode);
|
||||
if (width > 0) {
|
||||
return (width / FONT_SCALE_FACTOR) * fontSize;
|
||||
int glyphCode = ((encoded[0] & 0xFF) << 8) | (encoded[1] & 0xFF);
|
||||
float width = font.getWidth(glyphCode) / 1000f;
|
||||
if (width >= 0) {
|
||||
return width * fontSize;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
try {
|
||||
for (int i = 0; i <= encoded.length - 2; i++) {
|
||||
int glyphCode = ((encoded[i] & 0xFF) << 8) | (encoded[i + 1] & 0xFF);
|
||||
float width = font.getWidth(glyphCode) / 1000f;
|
||||
if (width >= 0) {
|
||||
return width * fontSize;
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
// Continue trying other bytes
|
||||
}
|
||||
}
|
||||
|
||||
if (encoded.length >= 2 && font instanceof PDType0Font) {
|
||||
for (byte b : encoded) {
|
||||
try {
|
||||
int glyphCode = ((encoded[0] & 0xFF) << 8) | (encoded[1] & 0xFF);
|
||||
float width = font.getWidth(glyphCode);
|
||||
if (width > 0) {
|
||||
return (width / FONT_SCALE_FACTOR) * fontSize;
|
||||
int glyphCode = b & 0xFF;
|
||||
float width = font.getWidth(glyphCode) / 1000f;
|
||||
if (width >= 0) {
|
||||
return width * fontSize;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.debug("Multi-byte glyph code interpretation failed: {}", e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
if (encoded.length >= 3) {
|
||||
int glyphCode =
|
||||
((encoded[0] & 0xFF) << 16)
|
||||
| ((encoded[1] & 0xFF) << 8)
|
||||
| (encoded[2] & 0xFF);
|
||||
float width = font.getWidth(glyphCode) / 1000f;
|
||||
if (width >= 0) {
|
||||
return width * fontSize;
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
try {
|
||||
if (encoded.length >= 4) {
|
||||
int glyphCode =
|
||||
((encoded[0] & 0xFF) << 24)
|
||||
| ((encoded[1] & 0xFF) << 16)
|
||||
| ((encoded[2] & 0xFF) << 8)
|
||||
| (encoded[3] & 0xFF);
|
||||
float width = font.getWidth(glyphCode) / 1000f;
|
||||
if (width >= 0) {
|
||||
return width * fontSize;
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
@ -291,198 +341,237 @@ public class WidthCalculator {
|
||||
int category = Character.getType(codePoint);
|
||||
float baseWidth = calculateAverageCharacterWidth(font, fontSize);
|
||||
|
||||
// Adjust width based on character category
|
||||
float multiplier =
|
||||
switch (category) {
|
||||
case Character.UPPERCASE_LETTER -> 1.2f;
|
||||
case Character.LOWERCASE_LETTER -> 1.0f;
|
||||
case Character.DECIMAL_DIGIT_NUMBER -> 1.0f;
|
||||
case Character.SPACE_SEPARATOR -> 0.5f;
|
||||
case Character.DASH_PUNCTUATION -> 0.8f;
|
||||
case Character.OTHER_PUNCTUATION -> 0.6f;
|
||||
case Character.CURRENCY_SYMBOL -> 1.1f;
|
||||
case Character.MATH_SYMBOL -> 1.0f;
|
||||
case Character.TITLECASE_LETTER -> 1.15f;
|
||||
case Character.MODIFIER_LETTER -> 0.7f;
|
||||
case Character.NON_SPACING_MARK -> 0.0f; // Combining characters
|
||||
case Character.OTHER_LETTER -> 1.0f;
|
||||
case Character.DECIMAL_DIGIT_NUMBER -> 1.0f;
|
||||
case Character.LETTER_NUMBER -> 1.0f;
|
||||
case Character.OTHER_NUMBER -> 1.0f;
|
||||
case Character.SPACE_SEPARATOR -> 0.5f;
|
||||
case Character.LINE_SEPARATOR -> 0.0f;
|
||||
case Character.PARAGRAPH_SEPARATOR -> 0.0f;
|
||||
case Character.NON_SPACING_MARK -> 0.0f;
|
||||
case Character.ENCLOSING_MARK -> 0.0f;
|
||||
case Character.COMBINING_SPACING_MARK -> 0.3f;
|
||||
case Character.DASH_PUNCTUATION -> 0.8f;
|
||||
case Character.START_PUNCTUATION -> 0.6f;
|
||||
case Character.END_PUNCTUATION -> 0.6f;
|
||||
case Character.CONNECTOR_PUNCTUATION -> 0.6f;
|
||||
case Character.OTHER_PUNCTUATION -> 0.6f;
|
||||
case Character.MATH_SYMBOL -> 1.0f;
|
||||
case Character.CURRENCY_SYMBOL -> 1.1f;
|
||||
case Character.MODIFIER_SYMBOL -> 0.8f;
|
||||
case Character.OTHER_SYMBOL -> 1.0f;
|
||||
case Character.INITIAL_QUOTE_PUNCTUATION -> 0.6f;
|
||||
case Character.FINAL_QUOTE_PUNCTUATION -> 0.6f;
|
||||
case Character.CONTROL -> 0.0f;
|
||||
case Character.FORMAT -> 0.0f;
|
||||
case Character.PRIVATE_USE -> 1.0f;
|
||||
case Character.SURROGATE -> 0.0f;
|
||||
case Character.UNASSIGNED -> 1.0f;
|
||||
default -> 1.0f;
|
||||
};
|
||||
|
||||
return baseWidth * multiplier;
|
||||
float result = baseWidth * multiplier;
|
||||
return result >= 0 ? result : baseWidth;
|
||||
} catch (Exception e) {
|
||||
log.debug("Category-based width calculation failed: {}", e.getMessage());
|
||||
return calculateAverageCharacterWidth(font, fontSize);
|
||||
}
|
||||
}
|
||||
|
||||
private float calculateAverageCharacterWidth(PDFont font, float fontSize) {
|
||||
try {
|
||||
float avgWidth = font.getAverageFontWidth();
|
||||
return (avgWidth / FONT_SCALE_FACTOR) * fontSize;
|
||||
float avgWidth = font.getAverageFontWidth() / 1000f;
|
||||
if (avgWidth > 0) {
|
||||
return avgWidth * fontSize;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.debug("Average character width calculation failed: {}", e.getMessage());
|
||||
return CONSERVATIVE_CHAR_WIDTH_RATIO * fontSize;
|
||||
}
|
||||
|
||||
try {
|
||||
String[] testChars = {
|
||||
"a", "A", "e", "E", "i", "I", "o", "O", "n", "N", "t", "T", "r", "R", "s", "S", "0",
|
||||
"1", "2", "3", "4", "5"
|
||||
};
|
||||
float totalWidth = 0;
|
||||
int successCount = 0;
|
||||
|
||||
for (String testChar : testChars) {
|
||||
try {
|
||||
float width = font.getStringWidth(testChar) / 1000f;
|
||||
if (width > 0) {
|
||||
totalWidth += width;
|
||||
successCount++;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
}
|
||||
|
||||
if (successCount > 0) {
|
||||
return (totalWidth / successCount) * fontSize;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
try {
|
||||
for (int code = 32; code <= 126; code++) {
|
||||
try {
|
||||
float width = font.getWidth(code) / 1000f;
|
||||
if (width > 0) {
|
||||
return width * fontSize;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
try {
|
||||
if (font.getFontDescriptor() != null) {
|
||||
PDRectangle bbox = font.getFontDescriptor().getFontBoundingBox();
|
||||
if (bbox != null) {
|
||||
float avgCharWidth = bbox.getWidth() / 2000f;
|
||||
return avgCharWidth * fontSize;
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
return CONSERVATIVE_CHAR_WIDTH_RATIO * fontSize;
|
||||
}
|
||||
|
||||
private float calculateComprehensiveFallbackWidth(PDFont font, String text, float fontSize) {
|
||||
if (text == null || text.isEmpty()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
try {
|
||||
float charWidth = calculateAverageCharacterWidth(font, fontSize);
|
||||
float totalWidth = 0;
|
||||
|
||||
for (int i = 0; i < text.length(); ) {
|
||||
int codePoint = text.codePointAt(i);
|
||||
Float specificWidth = calculateCategoryBasedWidth(font, codePoint, fontSize);
|
||||
if (specificWidth != null) {
|
||||
totalWidth += specificWidth;
|
||||
} else {
|
||||
totalWidth += charWidth;
|
||||
}
|
||||
i += Character.charCount(codePoint);
|
||||
}
|
||||
|
||||
return totalWidth;
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
try {
|
||||
// Strategy 1: Use font bounding box with character analysis
|
||||
if (font.getFontDescriptor() != null
|
||||
&& font.getFontDescriptor().getFontBoundingBox() != null) {
|
||||
|
||||
PDRectangle bbox = font.getFontDescriptor().getFontBoundingBox();
|
||||
float avgCharWidth = bbox.getWidth() / FONT_SCALE_FACTOR;
|
||||
|
||||
// Analyze text composition for better estimation
|
||||
float adjustedWidth = analyzeTextComposition(text, avgCharWidth, fontSize);
|
||||
log.debug("Bounding box based fallback width: {}", adjustedWidth);
|
||||
return adjustedWidth;
|
||||
float avgCharWidth = bbox.getWidth() / 1000f;
|
||||
return text.length() * avgCharWidth * BBOX_CHAR_WIDTH_RATIO * fontSize;
|
||||
}
|
||||
|
||||
// Strategy 2: Enhanced average width calculation
|
||||
float enhancedAverage = calculateEnhancedAverageWidth(font, text, fontSize);
|
||||
log.debug("Enhanced average fallback width: {}", enhancedAverage);
|
||||
return enhancedAverage;
|
||||
|
||||
} catch (Exception e) {
|
||||
float conservativeWidth = text.length() * CONSERVATIVE_CHAR_WIDTH_RATIO * fontSize;
|
||||
log.debug("Conservative fallback width: {}", conservativeWidth);
|
||||
return conservativeWidth;
|
||||
}
|
||||
}
|
||||
|
||||
private float analyzeTextComposition(String text, float avgCharWidth, float fontSize) {
|
||||
float totalWidth = 0;
|
||||
int spaceCount = 0;
|
||||
int upperCount = 0;
|
||||
int lowerCount = 0;
|
||||
int digitCount = 0;
|
||||
int punctCount = 0;
|
||||
|
||||
for (int i = 0; i < text.length(); ) {
|
||||
int codePoint = text.codePointAt(i);
|
||||
int category = Character.getType(codePoint);
|
||||
|
||||
switch (category) {
|
||||
case Character.SPACE_SEPARATOR -> {
|
||||
spaceCount++;
|
||||
totalWidth += avgCharWidth * 0.5f * fontSize;
|
||||
}
|
||||
case Character.UPPERCASE_LETTER -> {
|
||||
upperCount++;
|
||||
totalWidth += avgCharWidth * 1.2f * fontSize;
|
||||
}
|
||||
case Character.LOWERCASE_LETTER -> {
|
||||
lowerCount++;
|
||||
totalWidth += avgCharWidth * 1.0f * fontSize;
|
||||
}
|
||||
case Character.DECIMAL_DIGIT_NUMBER -> {
|
||||
digitCount++;
|
||||
totalWidth += avgCharWidth * 1.0f * fontSize;
|
||||
}
|
||||
case Character.OTHER_PUNCTUATION, Character.DASH_PUNCTUATION -> {
|
||||
punctCount++;
|
||||
totalWidth += avgCharWidth * 0.7f * fontSize;
|
||||
}
|
||||
default -> totalWidth += avgCharWidth * BBOX_CHAR_WIDTH_RATIO * fontSize;
|
||||
}
|
||||
|
||||
i += Character.charCount(codePoint);
|
||||
}
|
||||
|
||||
log.debug(
|
||||
"Text composition analysis - Spaces: {}, Upper: {}, Lower: {}, Digits: {}, Punct: {}",
|
||||
spaceCount,
|
||||
upperCount,
|
||||
lowerCount,
|
||||
digitCount,
|
||||
punctCount);
|
||||
|
||||
return totalWidth;
|
||||
}
|
||||
|
||||
private float calculateEnhancedAverageWidth(PDFont font, String text, float fontSize) {
|
||||
try {
|
||||
float baseAverage = font.getAverageFontWidth();
|
||||
|
||||
float capHeight = 0;
|
||||
float xHeight = 0;
|
||||
|
||||
if (font.getFontDescriptor() != null) {
|
||||
capHeight = font.getFontDescriptor().getCapHeight();
|
||||
xHeight = font.getFontDescriptor().getXHeight();
|
||||
}
|
||||
|
||||
float adjustmentFactor = 1.0f;
|
||||
if (capHeight > 0 && xHeight > 0) {
|
||||
adjustmentFactor = Math.max(0.8f, Math.min(1.2f, xHeight / capHeight));
|
||||
}
|
||||
|
||||
float adjustedAverage = (baseAverage * adjustmentFactor / FONT_SCALE_FACTOR) * fontSize;
|
||||
return text.length() * adjustedAverage;
|
||||
|
||||
} catch (Exception e) {
|
||||
log.debug("Enhanced average width calculation failed: {}", e.getMessage());
|
||||
return text.length() * CONSERVATIVE_CHAR_WIDTH_RATIO * fontSize;
|
||||
}
|
||||
return text.length() * calculateAverageCharacterWidth(font, fontSize);
|
||||
}
|
||||
|
||||
public boolean isWidthCalculationReliable(PDFont font) {
|
||||
if (font == null) {
|
||||
return false;
|
||||
if (font == null) return false;
|
||||
|
||||
try {
|
||||
if (font.isDamaged()) return false;
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
String cacheKey = createReliabilityCacheKey(font);
|
||||
Boolean cachedResult = reliabilityCache.get(cacheKey);
|
||||
if (cachedResult != null) {
|
||||
log.debug(
|
||||
"Using cached reliability result for font {}: {}",
|
||||
font.getName(),
|
||||
cachedResult);
|
||||
return cachedResult;
|
||||
try {
|
||||
if (!TextEncodingHelper.canCalculateBasicWidths(font)) return false;
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
boolean result = performReliabilityCheck(font);
|
||||
try {
|
||||
font.getStringWidth("A");
|
||||
return true;
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
reliabilityCache.put(cacheKey, result);
|
||||
return result;
|
||||
try {
|
||||
font.getAverageFontWidth();
|
||||
return true;
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
try {
|
||||
float width = font.getWidth(65);
|
||||
return width >= 0;
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean performReliabilityCheck(PDFont font) {
|
||||
public float calculateMinimumTextWidth(PDFont font, String text, float fontSize) {
|
||||
if (font == null || text == null || text.isEmpty() || fontSize <= 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
try {
|
||||
if (font.isDamaged()) {
|
||||
log.debug("Font {} is damaged", font.getName());
|
||||
return false;
|
||||
float minWidth = calculateAccurateWidth(font, text, fontSize);
|
||||
if (minWidth > 0) {
|
||||
return minWidth * 0.8f;
|
||||
}
|
||||
|
||||
if (!TextEncodingHelper.canCalculateBasicWidths(font)) {
|
||||
log.debug("Font {} cannot perform basic width calculations", font.getName());
|
||||
return false;
|
||||
}
|
||||
|
||||
try {
|
||||
font.getStringWidth("A");
|
||||
return true;
|
||||
} catch (Exception e) {
|
||||
log.debug("Font {} failed basic width test: {}", font.getName(), e.getMessage());
|
||||
}
|
||||
|
||||
// Check if we can at least get average width
|
||||
try {
|
||||
float avgWidth = font.getAverageFontWidth();
|
||||
return avgWidth > 0;
|
||||
} catch (Exception e) {
|
||||
log.debug(
|
||||
"Font {} cannot provide average width: {}", font.getName(), e.getMessage());
|
||||
}
|
||||
|
||||
return false;
|
||||
|
||||
} catch (Exception e) {
|
||||
log.debug("Reliability check failed for font {}: {}", font.getName(), e.getMessage());
|
||||
}
|
||||
|
||||
return text.length() * fontSize * 0.3f;
|
||||
}
|
||||
|
||||
public float calculateMaximumTextWidth(PDFont font, String text, float fontSize) {
|
||||
if (font == null || text == null || text.isEmpty() || fontSize <= 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
try {
|
||||
float maxWidth = calculateAccurateWidth(font, text, fontSize);
|
||||
if (maxWidth > 0) {
|
||||
return maxWidth * 1.2f;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
return text.length() * fontSize * 1.0f;
|
||||
}
|
||||
|
||||
public boolean canCalculateWidthForText(PDFont font, String text) {
|
||||
if (font == null || text == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (text.isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
try {
|
||||
Float width = calculateDirectWidth(font, text, 12f);
|
||||
if (width != null) {
|
||||
return true;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
try {
|
||||
Float width = calculateCharacterByCharacterWidth(font, text, 12f);
|
||||
if (width != null) {
|
||||
return true;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user