mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2026-03-13 02:18:16 +01:00
feat: Auto-redact to support text removal on True PDFs/non-custom encoded PDFs, JUnit tests for RedactController, and TextFinder (#3936)
# Description of Changes ## Overview This enhancement adds **true PDF text removal** to RedactController. It changes auto-redaction from visual covering to actual text removal. The feature removes text from True PDFs completely while keeping compatibility with other PDF types. ## Features ### 1. True PDF Text Removal - Removes text from PDF structure instead of just hiding it - No impact on manual redaction or other types of PDFs (e.g., searchable PDFs or custom-encoded PDFs) ### 2. Advanced Content Stream Processing #### How It Works (only high level overview) - Token Processing: Breaks PDF content into small pieces for exact text finding - Font Tracking: Keeps track of fonts and formatting - Text Operators: Finds PDF commands that show text (`Tj`, `TJ`, `'`, `"`) - Position Mapping: Maps text to exact locations for removal - Rebuilds PDF: Rebuilds PDFs without the text, while keeping formatting operators #### No change for other types of PDFs - Because the iteration through the PDF for token/text removal and the iteration for box placing are two completely separate methods - This means that when there is a custom-encoded PDF, the token/text removal won't find any text to remove (because there is no decoding logic for now), but the box-finding method still reliably finds the words to redact and puts a box over them. So no change. ### 3. Enhanced TextFinder Integration #### Minor Improvements - Page Grouping: Groups found text by page for faster processing ### JUnit tests for both files - Added JUnit tests for both files. - Might need future improvement. 
### TODOs - Support for additional PDF types besides true PDFs (currently a WIP), e.g.: searchable PDF/custom encoded PDF - Feature to be expected in few weeks (best case scenario, and only if I succeed), sadly that is significantly harder task so only true PDFs for now ### UI - No UI change for now ### Sample files: [Free_Test_Data_500KB_PDF_redacted.pdf](https://github.com/user-attachments/files/21195841/Free_Test_Data_500KB_PDF_redacted.pdf) [lorem-ipsum_redacted.pdf](https://github.com/user-attachments/files/21195842/lorem-ipsum_redacted.pdf) [true-pdf-sample-1_redacted.pdf](https://github.com/user-attachments/files/21195843/true-pdf-sample-1_redacted.pdf) [true-pdf-sample-2_redacted.pdf](https://github.com/user-attachments/files/21195844/true-pdf-sample-2_redacted.pdf) [true-pdf-sample-3_redacted.pdf](https://github.com/user-attachments/files/21195845/true-pdf-sample-3_redacted.pdf) Closes: does not actually close any issues, since it only works with true PDFs --- ## Checklist ### General - [x] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md) - [x] I have read the [Stirling-PDF Developer Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md) (if applicable) - [ ] I have read the [How to add new languages to Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md) (if applicable) - [x] I have performed a self-review of my own code - [x] My changes generate no new warnings ### Documentation - [ ] I have updated relevant docs on [Stirling-PDF's doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) (if functionality has heavily changed) - [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only) ### UI Changes (if applicable) - [ ] Screenshots 
or videos demonstrating the UI changes are attached (e.g., as comments or direct attachments in the PR) ### Testing (if applicable) - [x] I have tested my changes locally. Refer to the [Testing Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md#6-testing) for more details. --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Anthony Stirling <77850077+Frooodle@users.noreply.github.com>
This commit is contained in:
File diff suppressed because it is too large
Load Diff
@@ -6,7 +6,7 @@ import java.util.List;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
@@ -17,91 +17,200 @@ import stirling.software.SPDF.model.PDFText;
|
||||
@Slf4j
|
||||
public class TextFinder extends PDFTextStripper {
|
||||
|
||||
private final String searchText;
|
||||
private final String searchTerm;
|
||||
private final boolean useRegex;
|
||||
private final boolean wholeWordSearch;
|
||||
private final List<PDFText> textOccurrences = new ArrayList<>();
|
||||
private final List<PDFText> foundTexts = new ArrayList<>();
|
||||
|
||||
public TextFinder(String searchText, boolean useRegex, boolean wholeWordSearch)
|
||||
private final List<TextPosition> pageTextPositions = new ArrayList<>();
|
||||
private final StringBuilder pageTextBuilder = new StringBuilder();
|
||||
|
||||
public TextFinder(String searchTerm, boolean useRegex, boolean wholeWordSearch)
|
||||
throws IOException {
|
||||
this.searchText = searchText.toLowerCase();
|
||||
super();
|
||||
this.searchTerm = searchTerm;
|
||||
this.useRegex = useRegex;
|
||||
this.wholeWordSearch = wholeWordSearch;
|
||||
setSortByPosition(true);
|
||||
this.setWordSeparator(" ");
|
||||
}
|
||||
|
||||
private List<MatchInfo> findOccurrencesInText(String searchText, String content) {
|
||||
List<MatchInfo> matches = new ArrayList<>();
|
||||
|
||||
Pattern pattern;
|
||||
|
||||
if (useRegex) {
|
||||
// Use regex-based search
|
||||
pattern =
|
||||
wholeWordSearch
|
||||
? Pattern.compile("\\b" + searchText + "\\b")
|
||||
: Pattern.compile(searchText);
|
||||
} else {
|
||||
// Use normal text search
|
||||
pattern =
|
||||
wholeWordSearch
|
||||
? Pattern.compile("\\b" + Pattern.quote(searchText) + "\\b")
|
||||
: Pattern.compile(Pattern.quote(searchText));
|
||||
}
|
||||
|
||||
Matcher matcher = pattern.matcher(content);
|
||||
while (matcher.find()) {
|
||||
matches.add(new MatchInfo(matcher.start(), matcher.end() - matcher.start()));
|
||||
}
|
||||
return matches;
|
||||
@Override
|
||||
protected void startPage(PDPage page) throws IOException {
|
||||
super.startPage(page);
|
||||
pageTextPositions.clear();
|
||||
pageTextBuilder.setLength(0);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void writeString(String text, List<TextPosition> textPositions) {
|
||||
for (MatchInfo match : findOccurrencesInText(searchText, text.toLowerCase())) {
|
||||
int index = match.startIndex;
|
||||
if (index + match.matchLength <= textPositions.size()) {
|
||||
// Initial values based on the first character
|
||||
TextPosition first = textPositions.get(index);
|
||||
float minX = first.getX();
|
||||
float minY = first.getY();
|
||||
float maxX = first.getX() + first.getWidth();
|
||||
float maxY = first.getY() + first.getHeight();
|
||||
pageTextBuilder.append(text);
|
||||
pageTextPositions.addAll(textPositions);
|
||||
}
|
||||
|
||||
// Loop over the rest of the characters and adjust bounding box values
|
||||
for (int i = index; i < index + match.matchLength; i++) {
|
||||
TextPosition position = textPositions.get(i);
|
||||
minX = Math.min(minX, position.getX());
|
||||
minY = Math.min(minY, position.getY());
|
||||
maxX = Math.max(maxX, position.getX() + position.getWidth());
|
||||
maxY = Math.max(maxY, position.getY() + position.getHeight());
|
||||
}
|
||||
@Override
|
||||
protected void writeWordSeparator() {
|
||||
pageTextBuilder.append(getWordSeparator());
|
||||
pageTextPositions.add(null); // Placeholder for separator
|
||||
}
|
||||
|
||||
textOccurrences.add(
|
||||
new PDFText(getCurrentPageNo() - 1, minX, minY, maxX, maxY, text));
|
||||
@Override
|
||||
protected void writeLineSeparator() {
|
||||
pageTextBuilder.append(getLineSeparator());
|
||||
pageTextPositions.add(null); // Placeholder for separator
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void endPage(PDPage page) throws IOException {
|
||||
String text = pageTextBuilder.toString();
|
||||
if (text.isEmpty() || this.searchTerm == null || this.searchTerm.isEmpty()) {
|
||||
super.endPage(page);
|
||||
return;
|
||||
}
|
||||
|
||||
String processedSearchTerm = this.searchTerm.trim();
|
||||
String regex = this.useRegex ? processedSearchTerm : "\\Q" + processedSearchTerm + "\\E";
|
||||
if (this.wholeWordSearch) {
|
||||
if (processedSearchTerm.length() == 1
|
||||
&& Character.isDigit(processedSearchTerm.charAt(0))) {
|
||||
regex = "(?<![\\w])" + regex + "(?![\\w])";
|
||||
} else if (processedSearchTerm.length() == 1) {
|
||||
regex = "(?<![\\w])" + regex + "(?![\\w])";
|
||||
} else {
|
||||
regex = "\\b" + regex + "\\b";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public List<PDFText> getTextLocations(PDDocument document) throws Exception {
|
||||
this.getText(document);
|
||||
Pattern pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
|
||||
Matcher matcher = pattern.matcher(text);
|
||||
|
||||
log.debug(
|
||||
"Found "
|
||||
+ textOccurrences.size()
|
||||
+ " occurrences of '"
|
||||
+ searchText
|
||||
+ "' in the document.");
|
||||
"Searching for '{}' in page {} with regex '{}' (wholeWord: {}, useRegex: {})",
|
||||
processedSearchTerm,
|
||||
getCurrentPageNo(),
|
||||
regex,
|
||||
wholeWordSearch,
|
||||
useRegex);
|
||||
|
||||
return textOccurrences;
|
||||
int matchCount = 0;
|
||||
while (matcher.find()) {
|
||||
matchCount++;
|
||||
int matchStart = matcher.start();
|
||||
int matchEnd = matcher.end();
|
||||
|
||||
log.debug(
|
||||
"Found match #{} at positions {}-{}: '{}'",
|
||||
matchCount,
|
||||
matchStart,
|
||||
matchEnd,
|
||||
matcher.group());
|
||||
|
||||
float minX = Float.MAX_VALUE;
|
||||
float minY = Float.MAX_VALUE;
|
||||
float maxX = Float.MIN_VALUE;
|
||||
float maxY = Float.MIN_VALUE;
|
||||
boolean foundPosition = false;
|
||||
|
||||
for (int i = matchStart; i < matchEnd; i++) {
|
||||
if (i >= pageTextPositions.size()) {
|
||||
log.debug(
|
||||
"Position index {} exceeds available positions ({})",
|
||||
i,
|
||||
pageTextPositions.size());
|
||||
continue;
|
||||
}
|
||||
TextPosition pos = pageTextPositions.get(i);
|
||||
if (pos != null) {
|
||||
foundPosition = true;
|
||||
minX = Math.min(minX, pos.getX());
|
||||
maxX = Math.max(maxX, pos.getX() + pos.getWidth());
|
||||
minY = Math.min(minY, pos.getY() - pos.getHeight());
|
||||
maxY = Math.max(maxY, pos.getY());
|
||||
}
|
||||
}
|
||||
|
||||
if (!foundPosition && matchStart < pageTextPositions.size()) {
|
||||
log.debug(
|
||||
"Attempting to find nearby positions for match at {}-{}",
|
||||
matchStart,
|
||||
matchEnd);
|
||||
|
||||
for (int i = Math.max(0, matchStart - 5);
|
||||
i < Math.min(pageTextPositions.size(), matchEnd + 5);
|
||||
i++) {
|
||||
TextPosition pos = pageTextPositions.get(i);
|
||||
if (pos != null) {
|
||||
foundPosition = true;
|
||||
minX = Math.min(minX, pos.getX());
|
||||
maxX = Math.max(maxX, pos.getX() + pos.getWidth());
|
||||
minY = Math.min(minY, pos.getY() - pos.getHeight());
|
||||
maxY = Math.max(maxY, pos.getY());
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (foundPosition) {
|
||||
foundTexts.add(
|
||||
new PDFText(
|
||||
this.getCurrentPageNo() - 1,
|
||||
minX,
|
||||
minY,
|
||||
maxX,
|
||||
maxY,
|
||||
matcher.group()));
|
||||
log.debug(
|
||||
"Added PDFText for match: page={}, bounds=({},{},{},{}), text='{}'",
|
||||
getCurrentPageNo() - 1,
|
||||
minX,
|
||||
minY,
|
||||
maxX,
|
||||
maxY,
|
||||
matcher.group());
|
||||
} else {
|
||||
log.warn(
|
||||
"Found text match '{}' but no valid position data at {}-{}",
|
||||
matcher.group(),
|
||||
matchStart,
|
||||
matchEnd);
|
||||
}
|
||||
}
|
||||
|
||||
log.debug(
|
||||
"Page {} search complete: found {} matches for '{}'",
|
||||
getCurrentPageNo(),
|
||||
matchCount,
|
||||
processedSearchTerm);
|
||||
|
||||
super.endPage(page);
|
||||
}
|
||||
|
||||
private class MatchInfo {
|
||||
int startIndex;
|
||||
int matchLength;
|
||||
public List<PDFText> getFoundTexts() {
|
||||
return foundTexts;
|
||||
}
|
||||
|
||||
MatchInfo(int startIndex, int matchLength) {
|
||||
this.startIndex = startIndex;
|
||||
this.matchLength = matchLength;
|
||||
public String getDebugInfo() {
|
||||
StringBuilder debug = new StringBuilder();
|
||||
debug.append("Extracted text length: ").append(pageTextBuilder.length()).append("\n");
|
||||
debug.append("Position count: ").append(pageTextPositions.size()).append("\n");
|
||||
debug.append("Text content: '")
|
||||
.append(pageTextBuilder.toString().replace("\n", "\\n").replace("\r", "\\r"))
|
||||
.append("'\n");
|
||||
|
||||
String text = pageTextBuilder.toString();
|
||||
for (int i = 0; i < Math.min(text.length(), 50); i++) {
|
||||
char c = text.charAt(i);
|
||||
TextPosition pos = i < pageTextPositions.size() ? pageTextPositions.get(i) : null;
|
||||
debug.append(
|
||||
String.format(
|
||||
" [%d] '%c' (0x%02X) -> %s\n",
|
||||
i,
|
||||
c,
|
||||
(int) c,
|
||||
pos != null
|
||||
? String.format("(%.1f,%.1f)", pos.getX(), pos.getY())
|
||||
: "null"));
|
||||
}
|
||||
|
||||
return debug.toString();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,351 @@
|
||||
package stirling.software.SPDF.utils.text;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
|
||||
import org.apache.pdfbox.pdmodel.font.encoding.DictionaryEncoding;
|
||||
import org.apache.pdfbox.pdmodel.font.encoding.Encoding;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
public class TextEncodingHelper {
|
||||
|
||||
public static boolean canEncodeCharacters(PDFont font, String text) {
|
||||
if (font == null || text == null || text.isEmpty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
try {
|
||||
// Step 1: Primary check - full-string encoding (permissive for "good" cases)
|
||||
byte[] encoded = font.encode(text);
|
||||
if (encoded.length > 0) {
|
||||
log.debug(
|
||||
"Text '{}' has good full-string encoding for font {} - permissively allowing",
|
||||
text,
|
||||
font.getName() != null ? font.getName() : "Unknown");
|
||||
return true;
|
||||
}
|
||||
|
||||
// Step 2: Smart array-based fallback for TJ operator-style text
|
||||
log.debug(
|
||||
"Full encoding failed for '{}' - using array-based fallback for font {}",
|
||||
text,
|
||||
font.getName() != null ? font.getName() : "Unknown");
|
||||
|
||||
return validateAsCodePointArray(font, text);
|
||||
|
||||
} catch (IOException | IllegalArgumentException e) {
|
||||
log.debug(
|
||||
"Encoding exception for text '{}' with font {} - trying array fallback: {}",
|
||||
text,
|
||||
font.getName() != null ? font.getName() : "Unknown",
|
||||
e.getMessage());
|
||||
|
||||
if (isFontSubset(font.getName()) || hasCustomEncoding(font)) {
|
||||
return validateAsCodePointArray(font, text);
|
||||
}
|
||||
|
||||
return false; // Non-subset fonts with encoding exceptions are likely problematic
|
||||
}
|
||||
}
|
||||
|
||||
private static boolean validateAsCodePointArray(PDFont font, String text) {
|
||||
int totalCodePoints = 0;
|
||||
int successfulCodePoints = 0;
|
||||
|
||||
// Iterate through code points (handles surrogates correctly per Unicode docs)
|
||||
for (int i = 0; i < text.length(); ) {
|
||||
int codePoint = text.codePointAt(i);
|
||||
String charStr = new String(Character.toChars(codePoint));
|
||||
totalCodePoints++;
|
||||
|
||||
try {
|
||||
// Test encoding for this code point
|
||||
byte[] charEncoded = font.encode(charStr);
|
||||
if (charEncoded.length > 0) {
|
||||
float charWidth = font.getStringWidth(charStr);
|
||||
|
||||
if (charWidth >= 0) {
|
||||
successfulCodePoints++;
|
||||
log.debug(
|
||||
"Code point '{}' (U+{}) encoded successfully",
|
||||
charStr,
|
||||
Integer.toHexString(codePoint).toUpperCase());
|
||||
} else {
|
||||
log.debug(
|
||||
"Code point '{}' (U+{}) has invalid width: {}",
|
||||
charStr,
|
||||
Integer.toHexString(codePoint).toUpperCase(),
|
||||
charWidth);
|
||||
}
|
||||
} else {
|
||||
log.debug(
|
||||
"Code point '{}' (U+{}) encoding failed - empty result",
|
||||
charStr,
|
||||
Integer.toHexString(codePoint).toUpperCase());
|
||||
}
|
||||
} catch (IOException | IllegalArgumentException e) {
|
||||
log.debug(
|
||||
"Code point '{}' (U+{}) validation failed: {}",
|
||||
charStr,
|
||||
Integer.toHexString(codePoint).toUpperCase(),
|
||||
e.getMessage());
|
||||
}
|
||||
|
||||
i += Character.charCount(codePoint); // Handle surrogates properly
|
||||
}
|
||||
|
||||
double successRate =
|
||||
totalCodePoints > 0 ? (double) successfulCodePoints / totalCodePoints : 0;
|
||||
boolean isAcceptable = successRate >= 0.95;
|
||||
|
||||
log.debug(
|
||||
"Array validation for '{}': {}/{} code points successful ({:.1f}%) - {}",
|
||||
text,
|
||||
successfulCodePoints,
|
||||
totalCodePoints,
|
||||
successRate * 100,
|
||||
isAcceptable ? "ALLOWING" : "rejecting");
|
||||
|
||||
return isAcceptable;
|
||||
}
|
||||
|
||||
public static boolean isTextSegmentRemovable(PDFont font, String text) {
|
||||
if (font == null || text == null || text.isEmpty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Log the attempt
|
||||
log.debug(
|
||||
"Evaluating text segment for removal: '{}' with font {}",
|
||||
text,
|
||||
font.getName() != null ? font.getName() : "Unknown Font");
|
||||
|
||||
if (isSimpleCharacter(text)) {
|
||||
try {
|
||||
font.encode(text);
|
||||
font.getStringWidth(text);
|
||||
log.debug(
|
||||
"Text '{}' is a simple character and passed validation - allowing removal",
|
||||
text);
|
||||
return true;
|
||||
} catch (Exception e) {
|
||||
log.debug(
|
||||
"Simple character '{}' failed basic validation with font {}: {}",
|
||||
text,
|
||||
font.getName() != null ? font.getName() : "Unknown",
|
||||
e.getMessage());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// For complex text, require comprehensive validation
|
||||
return isTextFullyRemovable(font, text);
|
||||
}
|
||||
|
||||
public static boolean isTextFullyRemovable(PDFont font, String text) {
|
||||
if (font == null || text == null || text.isEmpty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
try {
|
||||
// Check 1: Verify encoding capability using new smart approach
|
||||
if (!canEncodeCharacters(font, text)) {
|
||||
log.debug(
|
||||
"Text '{}' failed encoding validation for font {}",
|
||||
text,
|
||||
font.getName() != null ? font.getName() : "Unknown");
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check 2: Validate width calculation capability
|
||||
float width = font.getStringWidth(text);
|
||||
if (width < 0) { // Allow zero width (invisible chars) but reject negative (invalid)
|
||||
log.debug(
|
||||
"Text '{}' has invalid width {} for font {}",
|
||||
text,
|
||||
width,
|
||||
font.getName() != null ? font.getName() : "Unknown");
|
||||
return false; // Invalid metrics prevent accurate removal
|
||||
}
|
||||
|
||||
// Check 3: Verify font descriptor completeness for redaction area calculation
|
||||
if (font.getFontDescriptor() == null) {
|
||||
log.debug(
|
||||
"Missing font descriptor for font {}",
|
||||
font.getName() != null ? font.getName() : "Unknown");
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check 4: Test bounding box calculation for redaction area
|
||||
try {
|
||||
font.getFontDescriptor().getFontBoundingBox();
|
||||
} catch (IllegalArgumentException e) {
|
||||
log.debug(
|
||||
"Font bounding box unavailable for font {}: {}",
|
||||
font.getName() != null ? font.getName() : "Unknown",
|
||||
e.getMessage());
|
||||
return false;
|
||||
}
|
||||
|
||||
log.debug(
|
||||
"Text '{}' passed comprehensive validation for font {}",
|
||||
text,
|
||||
font.getName() != null ? font.getName() : "Unknown");
|
||||
return true;
|
||||
|
||||
} catch (IOException e) {
|
||||
log.debug(
|
||||
"Text '{}' failed validation for font {} due to IO error: {}",
|
||||
text,
|
||||
font.getName() != null ? font.getName() : "Unknown",
|
||||
e.getMessage());
|
||||
return false;
|
||||
} catch (IllegalArgumentException e) {
|
||||
log.debug(
|
||||
"Text '{}' failed validation for font {} due to argument error: {}",
|
||||
text,
|
||||
font.getName() != null ? font.getName() : "Unknown",
|
||||
e.getMessage());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
private static boolean isSimpleCharacter(String text) {
|
||||
if (text == null || text.isEmpty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (text.length() > 20) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = 0; i < text.length(); i++) {
|
||||
char c = text.charAt(i);
|
||||
|
||||
// Allow letters, digits, and whitespace (most common cases)
|
||||
if (Character.isLetterOrDigit(c) || Character.isWhitespace(c)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Allow common ASCII punctuation
|
||||
if (c >= 32 && c <= 126 && ".,!?;:()-[]{}\"'/@#$%&*+=<>|\\~`".indexOf(c) >= 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public static boolean hasCustomEncoding(PDFont font) {
|
||||
try {
|
||||
if (font instanceof PDSimpleFont simpleFont) {
|
||||
try {
|
||||
Encoding encoding = simpleFont.getEncoding();
|
||||
if (encoding != null) {
|
||||
// Check for dictionary-based custom encodings
|
||||
if (encoding instanceof DictionaryEncoding) {
|
||||
log.debug("Font {} uses DictionaryEncoding (custom)", font.getName());
|
||||
return true;
|
||||
}
|
||||
|
||||
String encodingName = encoding.getClass().getSimpleName();
|
||||
if (encodingName.contains("Custom")
|
||||
|| encodingName.contains("Dictionary")) {
|
||||
log.debug(
|
||||
"Font {} uses custom encoding: {}",
|
||||
font.getName(),
|
||||
encodingName);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.debug(
|
||||
"Encoding detection failed for font {}: {}",
|
||||
font.getName(),
|
||||
e.getMessage());
|
||||
return true; // Assume custom if detection fails
|
||||
}
|
||||
}
|
||||
|
||||
if (font instanceof org.apache.pdfbox.pdmodel.font.PDType0Font) {
|
||||
log.debug(
|
||||
"Font {} is Type0 (CID) - generally uses standard CMaps",
|
||||
font.getName() != null ? font.getName() : "Unknown");
|
||||
return false;
|
||||
}
|
||||
|
||||
log.debug(
|
||||
"Font {} type {} - assuming standard encoding",
|
||||
font.getName() != null ? font.getName() : "Unknown",
|
||||
font.getClass().getSimpleName());
|
||||
return false;
|
||||
|
||||
} catch (IllegalArgumentException e) {
|
||||
log.debug(
|
||||
"Custom encoding detection failed for font {}: {}",
|
||||
font.getName() != null ? font.getName() : "Unknown",
|
||||
e.getMessage());
|
||||
return false; // Be forgiving on detection failure
|
||||
}
|
||||
}
|
||||
|
||||
public static boolean fontSupportsCharacter(PDFont font, String character) {
|
||||
if (font == null || character == null || character.isEmpty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
try {
|
||||
byte[] encoded = font.encode(character);
|
||||
if (encoded.length == 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
float width = font.getStringWidth(character);
|
||||
return width > 0;
|
||||
|
||||
} catch (IOException | IllegalArgumentException e) {
|
||||
log.debug(
|
||||
"Character '{}' not supported by font {}: {}",
|
||||
character,
|
||||
font.getName() != null ? font.getName() : "Unknown",
|
||||
e.getMessage());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
public static boolean isFontSubset(String fontName) {
|
||||
if (fontName == null) {
|
||||
return false;
|
||||
}
|
||||
return fontName.matches("^[A-Z]{6}\\+.*");
|
||||
}
|
||||
|
||||
public static boolean canCalculateBasicWidths(PDFont font) {
|
||||
try {
|
||||
float spaceWidth = font.getStringWidth(" ");
|
||||
if (spaceWidth <= 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
String[] testChars = {"a", "A", "0", ".", "e", "!"};
|
||||
for (String ch : testChars) {
|
||||
try {
|
||||
float width = font.getStringWidth(ch);
|
||||
if (width > 0) {
|
||||
return true;
|
||||
}
|
||||
} catch (IOException | IllegalArgumentException e) {
|
||||
}
|
||||
}
|
||||
|
||||
return false; // Can't calculate width for any test characters
|
||||
} catch (IOException | IllegalArgumentException e) {
|
||||
return false; // Font failed basic width calculation
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,140 @@
|
||||
package stirling.software.SPDF.utils.text;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDResources;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
public class TextFinderUtils {
|
||||
|
||||
public static boolean validateFontReliability(org.apache.pdfbox.pdmodel.font.PDFont font) {
|
||||
if (font == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (font.isDamaged()) {
|
||||
log.debug(
|
||||
"Font {} is marked as damaged - using TextEncodingHelper validation",
|
||||
font.getName());
|
||||
}
|
||||
|
||||
if (TextEncodingHelper.canCalculateBasicWidths(font)) {
|
||||
log.debug(
|
||||
"Font {} passed basic width calculations - considering reliable",
|
||||
font.getName());
|
||||
return true;
|
||||
}
|
||||
|
||||
String[] basicTests = {"1", "2", "3", "a", "A", "e", "E", " "};
|
||||
|
||||
int workingChars = 0;
|
||||
for (String testChar : basicTests) {
|
||||
if (TextEncodingHelper.canEncodeCharacters(font, testChar)) {
|
||||
workingChars++;
|
||||
}
|
||||
}
|
||||
|
||||
if (workingChars > 0) {
|
||||
log.debug(
|
||||
"Font {} can process {}/{} basic characters - considering reliable",
|
||||
font.getName(),
|
||||
workingChars,
|
||||
basicTests.length);
|
||||
return true;
|
||||
}
|
||||
|
||||
log.debug("Font {} failed all basic tests - considering unreliable", font.getName());
|
||||
return false;
|
||||
}
|
||||
|
||||
public static List<Pattern> createOptimizedSearchPatterns(
|
||||
Set<String> searchTerms, boolean useRegex, boolean wholeWordSearch) {
|
||||
List<Pattern> patterns = new ArrayList<>();
|
||||
|
||||
for (String term : searchTerms) {
|
||||
if (term == null || term.trim().isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
try {
|
||||
String patternString = useRegex ? term.trim() : Pattern.quote(term.trim());
|
||||
|
||||
if (wholeWordSearch) {
|
||||
patternString = applyWordBoundaries(term.trim(), patternString);
|
||||
}
|
||||
|
||||
Pattern pattern =
|
||||
Pattern.compile(
|
||||
patternString, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
|
||||
patterns.add(pattern);
|
||||
|
||||
log.debug("Created search pattern: '{}' -> '{}'", term.trim(), patternString);
|
||||
|
||||
} catch (Exception e) {
|
||||
log.warn("Failed to create pattern for term '{}': {}", term, e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
return patterns;
|
||||
}
|
||||
|
||||
private static String applyWordBoundaries(String originalTerm, String patternString) {
|
||||
if (originalTerm.length() == 1 && Character.isDigit(originalTerm.charAt(0))) {
|
||||
return "(?<![\\w])" + patternString + "(?![\\w])";
|
||||
} else if (originalTerm.length() == 1) {
|
||||
return "(?<![\\w])" + patternString + "(?![\\w])";
|
||||
} else {
|
||||
return "\\b" + patternString + "\\b";
|
||||
}
|
||||
}
|
||||
|
||||
public static boolean hasProblematicFonts(PDPage page) {
|
||||
if (page == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
try {
|
||||
PDResources resources = page.getResources();
|
||||
if (resources == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
int totalFonts = 0;
|
||||
int completelyUnusableFonts = 0;
|
||||
|
||||
for (org.apache.pdfbox.cos.COSName fontName : resources.getFontNames()) {
|
||||
try {
|
||||
org.apache.pdfbox.pdmodel.font.PDFont font = resources.getFont(fontName);
|
||||
if (font != null) {
|
||||
totalFonts++;
|
||||
if (!validateFontReliability(font)) {
|
||||
completelyUnusableFonts++;
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.debug("Font loading failed for {}: {}", fontName.getName(), e.getMessage());
|
||||
totalFonts++;
|
||||
}
|
||||
}
|
||||
|
||||
boolean hasProblems = totalFonts > 0 && (completelyUnusableFonts * 2 > totalFonts);
|
||||
log.debug(
|
||||
"Page font analysis: {}/{} fonts are completely unusable - page {} problematic",
|
||||
completelyUnusableFonts,
|
||||
totalFonts,
|
||||
hasProblems ? "IS" : "is NOT");
|
||||
|
||||
return hasProblems;
|
||||
|
||||
} catch (Exception e) {
|
||||
log.warn("Font analysis failed for page: {}", e.getMessage());
|
||||
return false; // Be permissive if analysis fails
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,136 @@
|
||||
package stirling.software.SPDF.utils.text;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
public class WidthCalculator {
|
||||
|
||||
private static final int FONT_SCALE_FACTOR = 1000;
|
||||
|
||||
public static float calculateAccurateWidth(PDFont font, String text, float fontSize) {
|
||||
if (font == null || text == null || text.isEmpty() || fontSize <= 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (!TextEncodingHelper.canEncodeCharacters(font, text)) {
|
||||
log.debug(
|
||||
"Text cannot be encoded by font {}, using fallback width calculation",
|
||||
font.getName());
|
||||
return calculateFallbackWidth(font, text, fontSize);
|
||||
}
|
||||
|
||||
try {
|
||||
float rawWidth = font.getStringWidth(text);
|
||||
float scaledWidth = (rawWidth / FONT_SCALE_FACTOR) * fontSize;
|
||||
|
||||
log.debug(
|
||||
"Direct width calculation successful for font {}: {} -> {}",
|
||||
font.getName(),
|
||||
rawWidth,
|
||||
scaledWidth);
|
||||
return scaledWidth;
|
||||
|
||||
} catch (Exception e) {
|
||||
log.debug(
|
||||
"Direct width calculation failed for font {}: {}",
|
||||
font.getName(),
|
||||
e.getMessage());
|
||||
return calculateWidthWithCharacterIteration(font, text, fontSize);
|
||||
}
|
||||
}
|
||||
|
||||
private static float calculateWidthWithCharacterIteration(
|
||||
PDFont font, String text, float fontSize) {
|
||||
try {
|
||||
float totalWidth = 0;
|
||||
|
||||
for (int i = 0; i < text.length(); i++) {
|
||||
String character = text.substring(i, i + 1);
|
||||
try {
|
||||
byte[] encoded = font.encode(character);
|
||||
if (encoded.length > 0) {
|
||||
int glyphCode = encoded[0] & 0xFF;
|
||||
float glyphWidth = font.getWidth(glyphCode);
|
||||
|
||||
if (glyphWidth == 0) {
|
||||
try {
|
||||
glyphWidth = font.getWidthFromFont(glyphCode);
|
||||
} catch (Exception e2) {
|
||||
glyphWidth = font.getAverageFontWidth();
|
||||
}
|
||||
}
|
||||
|
||||
totalWidth += (glyphWidth / FONT_SCALE_FACTOR) * fontSize;
|
||||
} else {
|
||||
totalWidth += (font.getAverageFontWidth() / FONT_SCALE_FACTOR) * fontSize;
|
||||
}
|
||||
} catch (Exception e2) {
|
||||
totalWidth += (font.getAverageFontWidth() / FONT_SCALE_FACTOR) * fontSize;
|
||||
}
|
||||
}
|
||||
|
||||
log.debug("Character iteration width calculation: {}", totalWidth);
|
||||
return totalWidth;
|
||||
|
||||
} catch (Exception e) {
|
||||
log.debug("Character iteration failed: {}", e.getMessage());
|
||||
return calculateFallbackWidth(font, text, fontSize);
|
||||
}
|
||||
}
|
||||
|
||||
private static float calculateFallbackWidth(PDFont font, String text, float fontSize) {
|
||||
try {
|
||||
if (font.getFontDescriptor() != null
|
||||
&& font.getFontDescriptor().getFontBoundingBox() != null) {
|
||||
|
||||
PDRectangle bbox = font.getFontDescriptor().getFontBoundingBox();
|
||||
float avgCharWidth =
|
||||
bbox.getWidth() / FONT_SCALE_FACTOR * 0.6f; // Conservative estimate
|
||||
float fallbackWidth = text.length() * avgCharWidth * fontSize;
|
||||
|
||||
log.debug("Bounding box fallback width: {}", fallbackWidth);
|
||||
return fallbackWidth;
|
||||
}
|
||||
|
||||
float avgWidth = font.getAverageFontWidth();
|
||||
float fallbackWidth = (text.length() * avgWidth / FONT_SCALE_FACTOR) * fontSize;
|
||||
|
||||
log.debug("Average width fallback: {}", fallbackWidth);
|
||||
return fallbackWidth;
|
||||
|
||||
} catch (Exception e) {
|
||||
float conservativeWidth = text.length() * 0.5f * fontSize;
|
||||
log.debug(
|
||||
"Conservative fallback width for font {}: {}",
|
||||
font.getName(),
|
||||
conservativeWidth);
|
||||
return conservativeWidth;
|
||||
}
|
||||
}
|
||||
|
||||
public static boolean isWidthCalculationReliable(PDFont font) {
|
||||
if (font == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (font.isDamaged()) {
|
||||
log.debug("Font {} is damaged", font.getName());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!TextEncodingHelper.canCalculateBasicWidths(font)) {
|
||||
log.debug("Font {} cannot perform basic width calculations", font.getName());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (TextEncodingHelper.hasCustomEncoding(font)) {
|
||||
log.debug("Font {} has custom encoding", font.getName());
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user