This commit is contained in:
Balázs Szücs 2025-07-26 17:27:43 -04:00 committed by GitHub
commit f63aff1153
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 4250 additions and 179 deletions

View File

@ -6,7 +6,7 @@ import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
@ -17,91 +17,200 @@ import stirling.software.SPDF.model.PDFText;
@Slf4j
public class TextFinder extends PDFTextStripper {
private final String searchText;
private final String searchTerm;
private final boolean useRegex;
private final boolean wholeWordSearch;
private final List<PDFText> textOccurrences = new ArrayList<>();
private final List<PDFText> foundTexts = new ArrayList<>();
public TextFinder(String searchText, boolean useRegex, boolean wholeWordSearch)
private final List<TextPosition> pageTextPositions = new ArrayList<>();
private final StringBuilder pageTextBuilder = new StringBuilder();
public TextFinder(String searchTerm, boolean useRegex, boolean wholeWordSearch)
throws IOException {
this.searchText = searchText.toLowerCase();
super();
this.searchTerm = searchTerm;
this.useRegex = useRegex;
this.wholeWordSearch = wholeWordSearch;
setSortByPosition(true);
this.setWordSeparator(" ");
}
private List<MatchInfo> findOccurrencesInText(String searchText, String content) {
List<MatchInfo> matches = new ArrayList<>();
Pattern pattern;
if (useRegex) {
// Use regex-based search
pattern =
wholeWordSearch
? Pattern.compile("\\b" + searchText + "\\b")
: Pattern.compile(searchText);
} else {
// Use normal text search
pattern =
wholeWordSearch
? Pattern.compile("\\b" + Pattern.quote(searchText) + "\\b")
: Pattern.compile(Pattern.quote(searchText));
}
Matcher matcher = pattern.matcher(content);
while (matcher.find()) {
matches.add(new MatchInfo(matcher.start(), matcher.end() - matcher.start()));
}
return matches;
/**
 * Begins a new page: delegates to the superclass, then empties the per-page text buffer and
 * the per-page position list so that they only ever describe the page being processed.
 *
 * @param page the page about to be processed
 * @throws IOException if the superclass hook fails
 */
@Override
protected void startPage(PDPage page) throws IOException {
    super.startPage(page);
    // The two accumulators are independent of each other; reset both for the new page.
    this.pageTextBuilder.setLength(0);
    this.pageTextPositions.clear();
}
@Override
protected void writeString(String text, List<TextPosition> textPositions) {
for (MatchInfo match : findOccurrencesInText(searchText, text.toLowerCase())) {
int index = match.startIndex;
if (index + match.matchLength <= textPositions.size()) {
// Initial values based on the first character
TextPosition first = textPositions.get(index);
float minX = first.getX();
float minY = first.getY();
float maxX = first.getX() + first.getWidth();
float maxY = first.getY() + first.getHeight();
pageTextBuilder.append(text);
pageTextPositions.addAll(textPositions);
}
// Loop over the rest of the characters and adjust bounding box values
for (int i = index; i < index + match.matchLength; i++) {
TextPosition position = textPositions.get(i);
minX = Math.min(minX, position.getX());
minY = Math.min(minY, position.getY());
maxX = Math.max(maxX, position.getX() + position.getWidth());
maxY = Math.max(maxY, position.getY() + position.getHeight());
}
/**
 * Appends the configured word separator to the page buffer and pads the position list with
 * one {@code null} placeholder per appended character.
 *
 * <p>Match offsets found in the page buffer are used directly as indices into the position
 * list, so both must grow by the same number of entries. The previous version always added a
 * single placeholder, which shifted every later index when the separator was not exactly one
 * character long.
 */
@Override
protected void writeWordSeparator() {
    int lengthBefore = pageTextBuilder.length();
    pageTextBuilder.append(getWordSeparator());
    // Measure what was actually appended instead of assuming a one-character separator.
    int appended = pageTextBuilder.length() - lengthBefore;
    for (int i = 0; i < appended; i++) {
        pageTextPositions.add(null); // Placeholder for separator
    }
}
textOccurrences.add(
new PDFText(getCurrentPageNo() - 1, minX, minY, maxX, maxY, text));
/**
 * Appends the configured line separator to the page buffer and pads the position list with
 * one {@code null} placeholder per appended character.
 *
 * <p>The line separator may be longer than one character (for example {@code "\r\n"}). The
 * previous version always added a single placeholder, so after the first such separator the
 * character offsets of the page text no longer lined up with the position list.
 */
@Override
protected void writeLineSeparator() {
    int lengthBefore = pageTextBuilder.length();
    pageTextBuilder.append(getLineSeparator());
    // Measure what was actually appended instead of assuming a one-character separator.
    int appended = pageTextBuilder.length() - lengthBefore;
    for (int i = 0; i < appended; i++) {
        pageTextPositions.add(null); // Placeholder for separator
    }
}
@Override
protected void endPage(PDPage page) throws IOException {
String text = pageTextBuilder.toString();
if (text.isEmpty() || this.searchTerm == null || this.searchTerm.isEmpty()) {
super.endPage(page);
return;
}
String processedSearchTerm = this.searchTerm.trim();
String regex = this.useRegex ? processedSearchTerm : "\\Q" + processedSearchTerm + "\\E";
if (this.wholeWordSearch) {
if (processedSearchTerm.length() == 1
&& Character.isDigit(processedSearchTerm.charAt(0))) {
regex = "(?<![\\w])" + regex + "(?![\\w])";
} else if (processedSearchTerm.length() == 1) {
regex = "(?<![\\w])" + regex + "(?![\\w])";
} else {
regex = "\\b" + regex + "\\b";
}
}
}
public List<PDFText> getTextLocations(PDDocument document) throws Exception {
this.getText(document);
Pattern pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
Matcher matcher = pattern.matcher(text);
log.debug(
"Found "
+ textOccurrences.size()
+ " occurrences of '"
+ searchText
+ "' in the document.");
"Searching for '{}' in page {} with regex '{}' (wholeWord: {}, useRegex: {})",
processedSearchTerm,
getCurrentPageNo(),
regex,
wholeWordSearch,
useRegex);
return textOccurrences;
int matchCount = 0;
while (matcher.find()) {
matchCount++;
int matchStart = matcher.start();
int matchEnd = matcher.end();
log.debug(
"Found match #{} at positions {}-{}: '{}'",
matchCount,
matchStart,
matchEnd,
matcher.group());
float minX = Float.MAX_VALUE;
float minY = Float.MAX_VALUE;
float maxX = Float.MIN_VALUE;
float maxY = Float.MIN_VALUE;
boolean foundPosition = false;
for (int i = matchStart; i < matchEnd; i++) {
if (i >= pageTextPositions.size()) {
log.debug(
"Position index {} exceeds available positions ({})",
i,
pageTextPositions.size());
continue;
}
TextPosition pos = pageTextPositions.get(i);
if (pos != null) {
foundPosition = true;
minX = Math.min(minX, pos.getX());
maxX = Math.max(maxX, pos.getX() + pos.getWidth());
minY = Math.min(minY, pos.getY() - pos.getHeight());
maxY = Math.max(maxY, pos.getY());
}
}
if (!foundPosition && matchStart < pageTextPositions.size()) {
log.debug(
"Attempting to find nearby positions for match at {}-{}",
matchStart,
matchEnd);
for (int i = Math.max(0, matchStart - 5);
i < Math.min(pageTextPositions.size(), matchEnd + 5);
i++) {
TextPosition pos = pageTextPositions.get(i);
if (pos != null) {
foundPosition = true;
minX = Math.min(minX, pos.getX());
maxX = Math.max(maxX, pos.getX() + pos.getWidth());
minY = Math.min(minY, pos.getY() - pos.getHeight());
maxY = Math.max(maxY, pos.getY());
break;
}
}
}
if (foundPosition) {
foundTexts.add(
new PDFText(
this.getCurrentPageNo() - 1,
minX,
minY,
maxX,
maxY,
matcher.group()));
log.debug(
"Added PDFText for match: page={}, bounds=({},{},{},{}), text='{}'",
getCurrentPageNo() - 1,
minX,
minY,
maxX,
maxY,
matcher.group());
} else {
log.warn(
"Found text match '{}' but no valid position data at {}-{}",
matcher.group(),
matchStart,
matchEnd);
}
}
log.debug(
"Page {} search complete: found {} matches for '{}'",
getCurrentPageNo(),
matchCount,
processedSearchTerm);
super.endPage(page);
}
private class MatchInfo {
int startIndex;
int matchLength;
/**
 * Returns the matches collected so far.
 *
 * @return the finder's own {@code foundTexts} list (the same instance, not a copy)
 */
public List<PDFText> getFoundTexts() {
return foundTexts;
}
MatchInfo(int startIndex, int matchLength) {
this.startIndex = startIndex;
this.matchLength = matchLength;
/**
 * Builds a human-readable dump of the current page buffer for troubleshooting.
 *
 * <p>The dump contains the buffer length, the number of recorded positions, the buffer content
 * with CR/LF escaped, and for each of the first 50 characters its index, value, code unit and
 * the coordinates of the position stored at the same index (or {@code null}).
 *
 * @return the multi-line debug report
 */
public String getDebugInfo() {
    final String pageText = pageTextBuilder.toString();
    final int positionCount = pageTextPositions.size();

    StringBuilder report = new StringBuilder();
    report.append("Extracted text length: ").append(pageText.length()).append("\n");
    report.append("Position count: ").append(positionCount).append("\n");

    String escaped = pageText.replace("\n", "\\n").replace("\r", "\\r");
    report.append("Text content: '").append(escaped).append("'\n");

    int limit = Math.min(pageText.length(), 50);
    for (int index = 0; index < limit; index++) {
        char ch = pageText.charAt(index);

        TextPosition position = null;
        if (index < positionCount) {
            position = pageTextPositions.get(index);
        }

        String location;
        if (position == null) {
            location = "null";
        } else {
            location = String.format("(%.1f,%.1f)", position.getX(), position.getY());
        }

        report.append(
                String.format(" [%d] '%c' (0x%02X) -> %s\n", index, ch, (int) ch, location));
    }
    return report.toString();
}
}

View File

@ -0,0 +1,351 @@
package stirling.software.SPDF.utils.text;
import java.io.IOException;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
import org.apache.pdfbox.pdmodel.font.encoding.DictionaryEncoding;
import org.apache.pdfbox.pdmodel.font.encoding.Encoding;
import lombok.extern.slf4j.Slf4j;
@Slf4j
public class TextEncodingHelper {
/**
 * Decides whether {@code font} can encode {@code text}.
 *
 * <p>The whole string is tried first; a non-empty result is accepted immediately. An empty
 * result falls back to per-code-point validation. If encoding throws, the per-code-point
 * fallback is only attempted for subset fonts or fonts with a custom encoding.
 *
 * @param font the font to test; {@code null} yields {@code false}
 * @param text the text to encode; {@code null} or empty yields {@code false}
 * @return {@code true} if the text is considered encodable
 */
public static boolean canEncodeCharacters(PDFont font, String text) {
    if (font == null || text == null || text.isEmpty()) {
        return false;
    }

    final String rawName = font.getName();
    final String fontLabel = rawName != null ? rawName : "Unknown";

    try {
        byte[] bytes = font.encode(text);
        if (bytes.length == 0) {
            // Nothing came back for the full string: judge it code point by code point.
            log.debug(
                    "Full encoding failed for '{}' - using array-based fallback for font {}",
                    text,
                    fontLabel);
            return validateAsCodePointArray(font, text);
        }

        log.debug(
                "Text '{}' has good full-string encoding for font {} - permissively allowing",
                text,
                fontLabel);
        return true;
    } catch (IOException | IllegalArgumentException e) {
        log.debug(
                "Encoding exception for text '{}' with font {} - trying array fallback: {}",
                text,
                fontLabel,
                e.getMessage());

        boolean worthRetrying = isFontSubset(rawName) || hasCustomEncoding(font);
        if (!worthRetrying) {
            // Non-subset fonts with encoding exceptions are likely problematic
            return false;
        }
        return validateAsCodePointArray(font, text);
    }
}
/**
 * Validates {@code text} one Unicode code point at a time and accepts it when at least 95% of
 * the code points can be encoded by {@code font} and report a non-negative width.
 *
 * <p>Fix: the summary log line used {@code {:.1f}}, which is not an SLF4J placeholder. It was
 * printed literally, the percentage was substituted into the final {@code {}} and the
 * ALLOWING/rejecting verdict was dropped. The percentage is now pre-formatted and passed to a
 * plain {@code {}} placeholder.
 *
 * @param font the font used for encoding and width lookup
 * @param text the text to validate
 * @return {@code true} if the success rate is at least 0.95
 */
private static boolean validateAsCodePointArray(PDFont font, String text) {
    int totalCodePoints = 0;
    int successfulCodePoints = 0;

    for (int i = 0; i < text.length(); ) {
        int codePoint = text.codePointAt(i);
        // Advance by one or two chars so surrogate pairs are never split.
        i += Character.charCount(codePoint);

        String charStr = new String(Character.toChars(codePoint));
        String hex = Integer.toHexString(codePoint).toUpperCase();
        totalCodePoints++;

        try {
            byte[] charEncoded = font.encode(charStr);
            if (charEncoded.length == 0) {
                log.debug(
                        "Code point '{}' (U+{}) encoding failed - empty result", charStr, hex);
                continue;
            }

            float charWidth = font.getStringWidth(charStr);
            if (charWidth >= 0) {
                successfulCodePoints++;
                log.debug("Code point '{}' (U+{}) encoded successfully", charStr, hex);
            } else {
                log.debug(
                        "Code point '{}' (U+{}) has invalid width: {}",
                        charStr,
                        hex,
                        charWidth);
            }
        } catch (IOException | IllegalArgumentException e) {
            // A failing code point simply does not count as successful.
            log.debug(
                    "Code point '{}' (U+{}) validation failed: {}",
                    charStr,
                    hex,
                    e.getMessage());
        }
    }

    double successRate =
            totalCodePoints > 0 ? (double) successfulCodePoints / totalCodePoints : 0;
    boolean isAcceptable = successRate >= 0.95;

    if (log.isDebugEnabled()) {
        log.debug(
                "Array validation for '{}': {}/{} code points successful ({}%) - {}",
                text,
                successfulCodePoints,
                totalCodePoints,
                String.format("%.1f", successRate * 100),
                isAcceptable ? "ALLOWING" : "rejecting");
    }
    return isAcceptable;
}
/**
 * Decides whether a text segment can be removed when drawn with {@code font}.
 *
 * <p>Short "simple" text (see {@code isSimpleCharacter}) only has to survive an encode call
 * and a width lookup; anything else goes through {@code isTextFullyRemovable}.
 *
 * @param font the font the text is drawn with; {@code null} yields {@code false}
 * @param text the segment to evaluate; {@code null} or empty yields {@code false}
 * @return {@code true} if the segment passed the applicable validation
 */
public static boolean isTextSegmentRemovable(PDFont font, String text) {
    if (font == null || text == null || text.isEmpty()) {
        return false;
    }

    log.debug(
            "Evaluating text segment for removal: '{}' with font {}",
            text,
            font.getName() != null ? font.getName() : "Unknown Font");

    if (!isSimpleCharacter(text)) {
        // Complex text needs the comprehensive checks.
        return isTextFullyRemovable(font, text);
    }

    try {
        // The return values are irrelevant; only whether the calls throw matters.
        font.encode(text);
        font.getStringWidth(text);
        log.debug(
                "Text '{}' is a simple character and passed validation - allowing removal",
                text);
        return true;
    } catch (Exception e) {
        log.debug(
                "Simple character '{}' failed basic validation with font {}: {}",
                text,
                font.getName() != null ? font.getName() : "Unknown",
                e.getMessage());
        return false;
    }
}
/**
 * Runs the comprehensive removability checks for {@code text} drawn with {@code font}.
 *
 * <p>In order: the text must be encodable ({@code canEncodeCharacters}), its string width must
 * not be negative, the font must have a font descriptor, and reading the descriptor's bounding
 * box must not throw {@link IllegalArgumentException}.
 *
 * @param font the font the text is drawn with; {@code null} yields {@code false}
 * @param text the text to evaluate; {@code null} or empty yields {@code false}
 * @return {@code true} only if every check passes
 */
public static boolean isTextFullyRemovable(PDFont font, String text) {
    if (font == null || text == null || text.isEmpty()) {
        return false;
    }

    final String fontLabel = font.getName() != null ? font.getName() : "Unknown";

    try {
        // Check 1: encoding capability
        if (!canEncodeCharacters(font, text)) {
            log.debug("Text '{}' failed encoding validation for font {}", text, fontLabel);
            return false;
        }

        // Check 2: width must be computable; zero is allowed, negative is not
        float width = font.getStringWidth(text);
        if (width < 0) {
            log.debug("Text '{}' has invalid width {} for font {}", text, width, fontLabel);
            return false;
        }

        // Check 3: a font descriptor must exist
        if (font.getFontDescriptor() == null) {
            log.debug("Missing font descriptor for font {}", fontLabel);
            return false;
        }

        // Check 4: the bounding box lookup must not fail
        try {
            font.getFontDescriptor().getFontBoundingBox();
        } catch (IllegalArgumentException e) {
            log.debug(
                    "Font bounding box unavailable for font {}: {}",
                    fontLabel,
                    e.getMessage());
            return false;
        }

        log.debug("Text '{}' passed comprehensive validation for font {}", text, fontLabel);
        return true;
    } catch (IOException e) {
        log.debug(
                "Text '{}' failed validation for font {} due to IO error: {}",
                text,
                fontLabel,
                e.getMessage());
        return false;
    } catch (IllegalArgumentException e) {
        log.debug(
                "Text '{}' failed validation for font {} due to argument error: {}",
                text,
                fontLabel,
                e.getMessage());
        return false;
    }
}
/**
 * Tells whether {@code text} is short (at most 20 chars) and consists solely of letters,
 * digits, whitespace and a fixed set of common ASCII punctuation characters.
 *
 * @param text the text to classify
 * @return {@code false} for {@code null}, empty or over-long text, or if any character falls
 *     outside the allowed set; {@code true} otherwise
 */
private static boolean isSimpleCharacter(String text) {
    if (text == null || text.isEmpty() || text.length() > 20) {
        return false;
    }

    final String allowedPunctuation = ".,!?;:()-[]{}\"'/@#$%&*+=<>|\\~`";
    for (char ch : text.toCharArray()) {
        boolean alphanumericOrSpace = Character.isLetterOrDigit(ch) || Character.isWhitespace(ch);
        boolean commonPunctuation =
                ch >= 32 && ch <= 126 && allowedPunctuation.indexOf(ch) >= 0;
        if (!alphanumericOrSpace && !commonPunctuation) {
            return false;
        }
    }
    return true;
}
/**
 * Heuristically reports whether {@code font} uses a custom (non-standard) encoding.
 *
 * <p>Simple fonts are inspected through their {@link Encoding}: a {@link DictionaryEncoding},
 * or an encoding class whose simple name contains "Custom" or "Dictionary", counts as custom.
 * If inspecting a simple font's encoding throws, the font is assumed to be custom. Type0 fonts
 * and all other font types are reported as not custom.
 *
 * <p>Changes from the previous version: a {@code null} font now returns {@code false} instead
 * of throwing a {@link NullPointerException} (matching the other public helpers in this
 * class), and every log line uses the same "Unknown" placeholder for unnamed fonts.
 *
 * @param font the font to inspect; may be {@code null}
 * @return {@code true} if the font is considered to use a custom encoding
 */
public static boolean hasCustomEncoding(PDFont font) {
    if (font == null) {
        return false;
    }

    final String fontLabel = font.getName() != null ? font.getName() : "Unknown";

    try {
        if (font instanceof PDSimpleFont simpleFont) {
            try {
                Encoding encoding = simpleFont.getEncoding();
                if (encoding != null) {
                    // Check for dictionary-based custom encodings
                    if (encoding instanceof DictionaryEncoding) {
                        log.debug("Font {} uses DictionaryEncoding (custom)", fontLabel);
                        return true;
                    }

                    String encodingName = encoding.getClass().getSimpleName();
                    if (encodingName.contains("Custom")
                            || encodingName.contains("Dictionary")) {
                        log.debug(
                                "Font {} uses custom encoding: {}", fontLabel, encodingName);
                        return true;
                    }
                }
            } catch (Exception e) {
                log.debug(
                        "Encoding detection failed for font {}: {}",
                        fontLabel,
                        e.getMessage());
                return true; // Assume custom if detection fails
            }
        }

        if (font instanceof org.apache.pdfbox.pdmodel.font.PDType0Font) {
            log.debug("Font {} is Type0 (CID) - generally uses standard CMaps", fontLabel);
            return false;
        }

        log.debug(
                "Font {} type {} - assuming standard encoding",
                fontLabel,
                font.getClass().getSimpleName());
        return false;
    } catch (IllegalArgumentException e) {
        log.debug(
                "Custom encoding detection failed for font {}: {}",
                fontLabel,
                e.getMessage());
        return false; // Be forgiving on detection failure
    }
}
/**
 * Reports whether {@code font} can both encode {@code character} to a non-empty byte sequence
 * and report a strictly positive width for it.
 *
 * @param font the font to test; {@code null} yields {@code false}
 * @param character the character (as a string) to test; {@code null} or empty yields
 *     {@code false}
 * @return {@code true} if encoding is non-empty and the width is greater than zero
 */
public static boolean fontSupportsCharacter(PDFont font, String character) {
    boolean missingInput = font == null || character == null || character.isEmpty();
    if (missingInput) {
        return false;
    }

    try {
        if (font.encode(character).length == 0) {
            return false;
        }
        return font.getStringWidth(character) > 0;
    } catch (IOException | IllegalArgumentException e) {
        log.debug(
                "Character '{}' not supported by font {}: {}",
                character,
                font.getName() != null ? font.getName() : "Unknown",
                e.getMessage());
        return false;
    }
}
/**
 * Reports whether {@code fontName} starts with a subset tag: exactly six uppercase ASCII
 * letters followed by a plus sign (for example {@code ABCDEF+Arial}).
 *
 * @param fontName the font name to inspect; {@code null} yields {@code false}
 * @return {@code true} if the name carries the six-letter subset prefix
 */
public static boolean isFontSubset(String fontName) {
    return fontName != null && fontName.matches("^[A-Z]{6}\\+.*");
}
/**
 * Probes whether {@code font} can report usable glyph widths.
 *
 * <p>The space character must have a strictly positive width, and at least one of a handful
 * of common test characters must have one as well.
 *
 * <p>Changes from the previous version: a {@code null} font returns {@code false} instead of
 * throwing a {@link NullPointerException} (matching the other helpers in this class), and the
 * deliberately ignored per-character failure is now documented instead of being an empty
 * catch block.
 *
 * @param font the font to probe; may be {@code null}
 * @return {@code true} if basic width calculation works for this font
 */
public static boolean canCalculateBasicWidths(PDFont font) {
    if (font == null) {
        return false;
    }

    try {
        if (font.getStringWidth(" ") <= 0) {
            return false;
        }
    } catch (IOException | IllegalArgumentException e) {
        return false; // Font failed basic width calculation
    }

    String[] testChars = {"a", "A", "0", ".", "e", "!"};
    for (String ch : testChars) {
        try {
            if (font.getStringWidth(ch) > 0) {
                return true;
            }
        } catch (IOException | IllegalArgumentException ignored) {
            // Best effort: a single unsupported probe character is not decisive,
            // so move on to the next one.
        }
    }
    return false; // Can't calculate width for any test characters
}
}

View File

@ -0,0 +1,140 @@
package stirling.software.SPDF.utils.text;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import lombok.extern.slf4j.Slf4j;
@Slf4j
public class TextFinderUtils {
/**
 * Classifies a font as reliable or unreliable for text processing.
 *
 * <p>A font is reliable if {@code TextEncodingHelper.canCalculateBasicWidths} succeeds, or,
 * failing that, if at least one of a few basic test characters can be encoded. A damaged font
 * is only logged; it still goes through the same checks.
 *
 * @param font the font to check; {@code null} yields {@code false}
 * @return {@code true} if the font is considered reliable
 */
public static boolean validateFontReliability(org.apache.pdfbox.pdmodel.font.PDFont font) {
    if (font == null) {
        return false;
    }

    final String name = font.getName();

    if (font.isDamaged()) {
        log.debug("Font {} is marked as damaged - using TextEncodingHelper validation", name);
    }

    if (TextEncodingHelper.canCalculateBasicWidths(font)) {
        log.debug("Font {} passed basic width calculations - considering reliable", name);
        return true;
    }

    final String[] basicTests = {"1", "2", "3", "a", "A", "e", "E", " "};
    int workingChars = 0;
    for (int i = 0; i < basicTests.length; i++) {
        if (TextEncodingHelper.canEncodeCharacters(font, basicTests[i])) {
            workingChars++;
        }
    }

    if (workingChars == 0) {
        log.debug("Font {} failed all basic tests - considering unreliable", name);
        return false;
    }

    log.debug(
            "Font {} can process {}/{} basic characters - considering reliable",
            name,
            workingChars,
            basicTests.length);
    return true;
}
/**
 * Compiles one case-insensitive {@link Pattern} per usable search term.
 *
 * <p>Terms that are {@code null} or blank are skipped. Each remaining term is trimmed, quoted
 * unless {@code useRegex} is set, optionally wrapped by {@code applyWordBoundaries}, and
 * compiled with {@code CASE_INSENSITIVE | UNICODE_CASE}. A term whose pattern cannot be built
 * is logged at warn level and skipped.
 *
 * @param searchTerms the raw terms to compile
 * @param useRegex whether terms are regular expressions rather than literal text
 * @param wholeWordSearch whether matches must be delimited as whole words
 * @return the compiled patterns, in iteration order of {@code searchTerms}
 */
public static List<Pattern> createOptimizedSearchPatterns(
        Set<String> searchTerms, boolean useRegex, boolean wholeWordSearch) {
    final int flags = Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;
    List<Pattern> compiled = new ArrayList<>();

    for (String rawTerm : searchTerms) {
        String term = rawTerm == null ? "" : rawTerm.trim();
        if (term.isEmpty()) {
            continue;
        }

        try {
            String expression = useRegex ? term : Pattern.quote(term);
            if (wholeWordSearch) {
                expression = applyWordBoundaries(term, expression);
            }

            compiled.add(Pattern.compile(expression, flags));
            log.debug("Created search pattern: '{}' -> '{}'", term, expression);
        } catch (Exception e) {
            // The warning reports the term as supplied, before trimming.
            log.warn("Failed to create pattern for term '{}': {}", rawTerm, e.getMessage());
        }
    }
    return compiled;
}
/**
 * Wraps {@code patternString} so that it only matches as a whole word.
 *
 * <p>Single-character terms are delimited with look-arounds that forbid an adjacent word
 * character; longer terms use {@code \b} on both sides.
 *
 * <p>Fix: the pattern is now enclosed in a non-capturing group before the delimiters are
 * attached. Without the group, a regex with top-level alternation such as {@code cat|dog}
 * became {@code \bcat|dog\b}, where each boundary applies to only one alternative (so "dog"
 * inside "hotdog" and "cat" inside "cats" matched). A non-capturing group leaves capture
 * group numbering untouched. The two identical single-character branches were merged.
 *
 * @param originalTerm the trimmed search term, used only to decide the delimiter style
 * @param patternString the regex (already quoted if the term is literal text)
 * @return the pattern surrounded by whole-word delimiters
 */
private static String applyWordBoundaries(String originalTerm, String patternString) {
    String grouped = "(?:" + patternString + ")";
    if (originalTerm.length() == 1) {
        return "(?<![\\w])" + grouped + "(?![\\w])";
    }
    return "\\b" + grouped + "\\b";
}
/**
 * Reports whether more than half of the fonts referenced by {@code page} are unusable
 * according to {@code validateFontReliability}.
 *
 * <p>A {@code null} page, a page without resources, or a failure of the analysis itself all
 * yield {@code false}.
 *
 * @param page the page whose font resources are inspected; may be {@code null}
 * @return {@code true} if the unusable fonts are a strict majority of the counted fonts
 */
public static boolean hasProblematicFonts(PDPage page) {
    if (page == null) {
        return false;
    }

    try {
        PDResources resources = page.getResources();
        if (resources == null) {
            return false;
        }

        int counted = 0;
        int unusable = 0;

        for (org.apache.pdfbox.cos.COSName key : resources.getFontNames()) {
            try {
                org.apache.pdfbox.pdmodel.font.PDFont font = resources.getFont(key);
                if (font == null) {
                    continue;
                }
                counted++;
                if (!validateFontReliability(font)) {
                    unusable++;
                }
            } catch (Exception e) {
                // NOTE(review): a font that fails here is added to the total but not to the
                // unusable count, which lowers the unusable ratio - confirm this is intended.
                log.debug("Font loading failed for {}: {}", key.getName(), e.getMessage());
                counted++;
            }
        }

        boolean majorityUnusable = counted > 0 && (unusable * 2 > counted);
        log.debug(
                "Page font analysis: {}/{} fonts are completely unusable - page {} problematic",
                unusable,
                counted,
                majorityUnusable ? "IS" : "is NOT");
        return majorityUnusable;
    } catch (Exception e) {
        log.warn("Font analysis failed for page: {}", e.getMessage());
        return false; // Be permissive if analysis fails
    }
}
}

View File

@ -0,0 +1,136 @@
package stirling.software.SPDF.utils.text;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDFont;
import lombok.extern.slf4j.Slf4j;
@Slf4j
public class WidthCalculator {
private static final int FONT_SCALE_FACTOR = 1000;
/**
 * Computes the rendered width of {@code text} at {@code fontSize}.
 *
 * <p>If the font cannot encode the text, the estimate from {@code calculateFallbackWidth} is
 * returned. Otherwise the font's string width is scaled by {@code fontSize / 1000}; if that
 * lookup throws, the width is summed character by character instead.
 *
 * @param font the font the text is drawn with
 * @param text the text to measure
 * @param fontSize the font size
 * @return the width, or {@code 0} if {@code font} or {@code text} is {@code null}, the text is
 *     empty, or {@code fontSize <= 0}
 */
public static float calculateAccurateWidth(PDFont font, String text, float fontSize) {
    boolean nothingToMeasure =
            font == null || text == null || text.isEmpty() || fontSize <= 0;
    if (nothingToMeasure) {
        return 0;
    }

    if (TextEncodingHelper.canEncodeCharacters(font, text)) {
        try {
            float raw = font.getStringWidth(text);
            float scaled = (raw / FONT_SCALE_FACTOR) * fontSize;
            log.debug(
                    "Direct width calculation successful for font {}: {} -> {}",
                    font.getName(),
                    raw,
                    scaled);
            return scaled;
        } catch (Exception e) {
            log.debug(
                    "Direct width calculation failed for font {}: {}",
                    font.getName(),
                    e.getMessage());
            return calculateWidthWithCharacterIteration(font, text, fontSize);
        }
    }

    log.debug(
            "Text cannot be encoded by font {}, using fallback width calculation",
            font.getName());
    return calculateFallbackWidth(font, text, fontSize);
}
/**
 * Sums the width of {@code text} one code point at a time, substituting the font's average
 * width for any code point whose width cannot be determined.
 *
 * <p>Fixes over the previous version:
 *
 * <ul>
 *   <li>Iteration is by Unicode code point, so characters outside the BMP are passed to the
 *       font as one unit instead of as two lone surrogates.
 *   <li>The character code is decoded from the encoded bytes with {@code PDFont.readCode}
 *       instead of taking only the first byte, which produced a wrong code for fonts that
 *       encode a character as more than one byte.
 * </ul>
 *
 * @param font the font the text is drawn with
 * @param text the text to measure
 * @param fontSize the font size
 * @return the summed width, or the result of {@code calculateFallbackWidth} if the iteration
 *     itself fails
 */
private static float calculateWidthWithCharacterIteration(
        PDFont font, String text, float fontSize) {
    try {
        float totalWidth = 0;
        for (int i = 0; i < text.length(); ) {
            int codePoint = text.codePointAt(i);
            // Advance by one or two chars so surrogate pairs are never split.
            i += Character.charCount(codePoint);

            String character = new String(Character.toChars(codePoint));
            totalWidth += (glyphWidthOrAverage(font, character) / FONT_SCALE_FACTOR) * fontSize;
        }

        log.debug("Character iteration width calculation: {}", totalWidth);
        return totalWidth;
    } catch (Exception e) {
        log.debug("Character iteration failed: {}", e.getMessage());
        return calculateFallbackWidth(font, text, fontSize);
    }
}

/**
 * Returns the unscaled width of a single character, or the font's average width when the
 * character cannot be encoded or measured.
 */
private static float glyphWidthOrAverage(PDFont font, String character) throws Exception {
    try {
        byte[] encoded = font.encode(character);
        if (encoded.length == 0) {
            return font.getAverageFontWidth();
        }

        // Let the font decode its own byte sequence; the code may span several bytes.
        int glyphCode = font.readCode(new java.io.ByteArrayInputStream(encoded));
        float glyphWidth = font.getWidth(glyphCode);
        if (glyphWidth == 0) {
            try {
                glyphWidth = font.getWidthFromFont(glyphCode);
            } catch (Exception e2) {
                glyphWidth = font.getAverageFontWidth();
            }
        }
        return glyphWidth;
    } catch (Exception e) {
        return font.getAverageFontWidth();
    }
}
/**
 * Estimates the width of {@code text} when it cannot be measured directly.
 *
 * <p>Preference order: 60% of the font bounding box width per character, then the font's
 * average glyph width per character, and, if either lookup throws, half the font size per
 * character.
 *
 * @param font the font the text is drawn with
 * @param text the text to measure
 * @param fontSize the font size
 * @return the estimated width
 */
private static float calculateFallbackWidth(PDFont font, String text, float fontSize) {
    final int charCount = text.length();
    try {
        var descriptor = font.getFontDescriptor();
        PDRectangle bbox = descriptor != null ? descriptor.getFontBoundingBox() : null;

        if (bbox == null) {
            float avgWidth = font.getAverageFontWidth();
            float fallbackWidth = (charCount * avgWidth / FONT_SCALE_FACTOR) * fontSize;
            log.debug("Average width fallback: {}", fallbackWidth);
            return fallbackWidth;
        }

        // Conservative estimate
        float avgCharWidth = bbox.getWidth() / FONT_SCALE_FACTOR * 0.6f;
        float fallbackWidth = charCount * avgCharWidth * fontSize;
        log.debug("Bounding box fallback width: {}", fallbackWidth);
        return fallbackWidth;
    } catch (Exception e) {
        float conservativeWidth = charCount * 0.5f * fontSize;
        log.debug(
                "Conservative fallback width for font {}: {}",
                font.getName(),
                conservativeWidth);
        return conservativeWidth;
    }
}
/**
 * Reports whether width calculations for {@code font} can be trusted.
 *
 * <p>The font must be non-null, not damaged, able to perform basic width calculations, and
 * must not use a custom encoding. The checks run in that order and stop at the first failure.
 *
 * @param font the font to check; may be {@code null}
 * @return {@code true} if every check passes
 */
public static boolean isWidthCalculationReliable(PDFont font) {
    boolean reliable = false;

    if (font == null) {
        // Nothing to inspect.
    } else if (font.isDamaged()) {
        log.debug("Font {} is damaged", font.getName());
    } else if (!TextEncodingHelper.canCalculateBasicWidths(font)) {
        log.debug("Font {} cannot perform basic width calculations", font.getName());
    } else if (TextEncodingHelper.hasCustomEncoding(font)) {
        log.debug("Font {} has custom encoding", font.getName());
    } else {
        reliable = true;
    }

    return reliable;
}
}

View File

@ -0,0 +1,588 @@
package stirling.software.SPDF.pdf;
import java.io.IOException;
import java.util.List;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
import org.junit.jupiter.api.AfterEach;
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Nested;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;
import org.mockito.junit.jupiter.MockitoExtension;
import stirling.software.SPDF.model.PDFText;
@DisplayName("PDF Text Finder tests")
@ExtendWith(MockitoExtension.class)
class TextFinderTest {
private PDDocument document;
private PDPage page;
// Helpers
/**
 * Shared driver: writes {@code pageContent} to the test page, runs a {@code TextFinder} with
 * the given options over the document and checks the result.
 *
 * <p>Checked: the number of matches, that every entry of {@code expectedTexts} (if given)
 * equals the text of at least one match, and that each match has non-null text,
 * non-negative and ordered coordinates, and page index 0.
 */
private void testTextFinding(String pageContent, String searchTerm, boolean useRegex, boolean wholeWord,
        String[] expectedTexts, int expectedCount) throws IOException {
    addTextToPage(pageContent);

    TextFinder finder = new TextFinder(searchTerm, useRegex, wholeWord);
    finder.getText(document);
    List<PDFText> matches = finder.getFoundTexts();

    assertEquals(expectedCount, matches.size(),
            String.format("Expected %d matches for search term '%s'", expectedCount, searchTerm));

    String[] required = expectedTexts != null ? expectedTexts : new String[0];
    for (String wanted : required) {
        boolean present = false;
        for (PDFText match : matches) {
            if (match.getText().equals(wanted)) {
                present = true;
                break;
            }
        }
        assertTrue(present, String.format("Expected to find text: '%s'", wanted));
    }

    // Verify basic properties of found texts
    for (PDFText match : matches) {
        assertNotNull(match.getText());
        assertTrue(match.getX1() >= 0);
        assertTrue(match.getY1() >= 0);
        assertTrue(match.getX2() >= match.getX1());
        assertTrue(match.getY2() >= match.getY1());
        assertEquals(0, match.getPageIndex()); // Single page test
    }
}
/** Creates a fresh in-memory document containing a single empty A4 page before each test. */
@BeforeEach
void setUp() {
document = new PDDocument();
page = new PDPage(PDRectangle.A4);
document.addPage(page);
}
/** Closes the per-test document, if one was created. */
@AfterEach
void tearDown() throws IOException {
    if (document == null) {
        return;
    }
    document.close();
}
/**
 * Literal (non-regex, non-whole-word) searches. Every case delegates to
 * {@code testTextFinding}, passing the page content, the search term, the expected match
 * texts (or {@code null} to skip that check) and the expected match count.
 */
@Nested
@DisplayName("Basic Text Search")
class BasicSearchTests {
@Test
@DisplayName("Should find simple text correctly")
void findSimpleText() throws IOException {
testTextFinding("This is a confidential document with secret information.",
"confidential", false, false,
new String[]{"confidential"}, 1);
}
@Test
@DisplayName("Should perform case-insensitive search")
void performCaseInsensitiveSearch() throws IOException {
// The expected text is the page's spelling (upper case), not the search term's.
testTextFinding("This document contains CONFIDENTIAL information.",
"confidential", false, false,
new String[]{"CONFIDENTIAL"}, 1);
}
@Test
@DisplayName("Should find multiple occurrences of same term")
void findMultipleOccurrences() throws IOException {
// Whole-word matching is off, so the "secret" inside "secret123" counts as well.
testTextFinding("The secret code is secret123. Keep this secret safe!",
"secret", false, false,
new String[]{"secret", "secret", "secret"}, 3);
}
@Test
@DisplayName("Should handle empty search term gracefully")
void handleEmptySearchTerm() throws IOException {
testTextFinding("This is a test document.", "", false, false, null, 0);
}
@Test
@DisplayName("Should handle null search term gracefully")
void handleNullSearchTerm() throws IOException {
testTextFinding("This is a test document.", null, false, false, null, 0);
}
@Test
@DisplayName("Should return no results when no match found")
void returnNoResultsWhenNoMatch() throws IOException {
testTextFinding("This is a test document.", "nonexistent", false, false, null, 0);
}
}
/**
 * Literal searches for "test" with the whole-word flag on and off, over content that mixes
 * the bare word with longer words and punctuation-delimited occurrences.
 */
@Nested
@DisplayName("Whole Word Search")
class WholeWordSearchTests {
@Test
@DisplayName("Should find only whole words when enabled")
void findOnlyWholeWords() throws IOException {
// "testing" and "tested" must not count; only the standalone "test" does.
testTextFinding("This is a test testing document with tested results.",
"test", false, true,
new String[]{"test"}, 1);
}
@Test
@DisplayName("Should find partial matches when whole word search disabled")
void findPartialMatches() throws IOException {
// Same content as above, but the prefixes of "testing" and "tested" now count too.
testTextFinding("This is a test testing document with tested results.",
"test", false, false,
new String[]{"test", "test", "test"}, 3);
}
@Test
@DisplayName("Should handle punctuation boundaries correctly")
void handlePunctuationBoundaries() throws IOException {
testTextFinding("Hello, world! Testing: test-case (test).",
"test", false, true,
new String[]{"test"}, 2); // Both standalone "test" and "test" in "test-case"
}
@Test
@DisplayName("Should handle word boundaries with special characters")
void handleSpecialCharacterBoundaries() throws IOException {
testTextFinding("Email: test@example.com and test.txt file",
"test", false, true,
new String[]{"test"}, 2); // Both in email and filename should match
}
}
/**
 * Searches with {@code useRegex} enabled: fixed patterns with exact expected matches, a
 * parameterized smoke test over several patterns, and an invalid-pattern case.
 */
@Nested
@DisplayName("Regular Expression Search")
class RegexSearchTests {
@Test
@DisplayName("Should find text matching regex pattern")
void findTextMatchingRegex() throws IOException {
testTextFinding("Contact John at 123-45-6789 or Jane at 987-65-4321 for details.",
"\\d{3}-\\d{2}-\\d{4}", true, false,
new String[]{"123-45-6789", "987-65-4321"}, 2);
}
@Test
@DisplayName("Should find email addresses with regex")
void findEmailAddresses() throws IOException {
testTextFinding("Email: test@example.com and admin@test.org",
"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", true, false,
new String[]{"test@example.com", "admin@test.org"}, 2);
}
@Test
@DisplayName("Should combine regex with whole word search")
void combineRegexWithWholeWord() throws IOException {
// Same pattern and content as findEmailAddresses, with the whole-word flag switched on.
testTextFinding("Email: test@example.com and admin@test.org",
"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", true, true,
new String[]{"test@example.com", "admin@test.org"}, 2);
}
@Test
@DisplayName("Should find currency patterns")
void findCurrencyPatterns() throws IOException {
// Only the dollar amount is targeted by the pattern; the euro amount must not match.
testTextFinding("Price: $100.50 and €75.25",
"\\$\\d+\\.\\d{2}", true, false,
new String[]{"$100.50"}, 1);
}
@ParameterizedTest
@ValueSource(strings = {
"\\d{4}-\\d{2}-\\d{2}", // Date pattern
"\\b[A-Z]{2,}\\b", // Uppercase words
"\\w+@\\w+\\.\\w+", // Simple email pattern
"\\$\\d+", // Simple currency
"\\b\\d{3,4}\\b" // 3-4 digit numbers
})
@DisplayName("Should handle various regex patterns")
void handleVariousRegexPatterns(String regexPattern) throws IOException {
String testContent = "Date: 2023-12-25, Email: test@domain.com, Price: $250, Code: ABC123, Number: 1234";
addTextToPage(testContent);
TextFinder textFinder = new TextFinder(regexPattern, true, false);
textFinder.getText(document);
List<PDFText> foundTexts = textFinder.getFoundTexts();
// Each pattern should find at least one match in our test content
assertFalse(foundTexts.isEmpty(), String.format("Pattern '%s' should find at least one match", regexPattern));
}
@Test
@DisplayName("Should handle invalid regex gracefully")
void handleInvalidRegex() throws IOException {
// NOTE(review): this test accepts every outcome it anticipates - no exception, a
// PatternSyntaxException with a matching message, or any RuntimeException/IOException
// with a non-null message - so it does not pin down one expected behavior. Consider
// asserting the single intended outcome.
addTextToPage("This is test content.");
try {
TextFinder textFinder = new TextFinder("[invalid regex(", true, false);
textFinder.getText(document);
List<PDFText> foundTexts = textFinder.getFoundTexts();
assertNotNull(foundTexts);
} catch (java.util.regex.PatternSyntaxException e) {
assertNotNull(e.getMessage());
assertTrue(e.getMessage().contains("Unclosed character class") ||
e.getMessage().contains("syntax"),
"Exception should indicate regex syntax error");
} catch (RuntimeException | IOException e) {
assertNotNull(e.getMessage());
}
}
}
/**
 * Searches involving non-ASCII content: accented letters, typographic symbols and currency
 * signs. Every case delegates to {@code testTextFinding}.
 *
 * <p>Fix: {@code findCurrencySymbols} listed an empty string as its first expected match. No
 * match of the character class {@code [€£¥]} can have empty text, so the expectation could
 * never be satisfied; it is now the euro sign, the first of the three symbols the pattern
 * targets.
 */
@Nested
@DisplayName("Special Characters and Encoding")
class SpecialCharacterTests {
    @Test
    @DisplayName("Should handle international characters")
    void handleInternationalCharacters() throws IOException {
        testTextFinding("Hello café naïve résumé",
                "café", false, false,
                new String[]{"café"}, 1);
    }

    @Test
    @DisplayName("Should find text with accented characters")
    void findAccentedCharacters() throws IOException {
        testTextFinding("Café, naïve, résumé, piñata",
                "café", false, false,
                new String[]{"Café"}, 1); // Case insensitive
    }

    @Test
    @DisplayName("Should handle special symbols")
    void handleSpecialSymbols() throws IOException {
        testTextFinding("Symbols: © ® ™ ± × ÷ § ¶",
                "©", false, false,
                new String[]{"©"}, 1);
    }

    @Test
    @DisplayName("Should find currency symbols")
    void findCurrencySymbols() throws IOException {
        // The dollar sign is deliberately outside the character class; three matches remain.
        testTextFinding("Prices: $100 €75 £50 ¥1000",
                "[€£¥]", true, false,
                new String[]{"€", "£", "¥"}, 3);
    }
}
/**
 * Checks that matches carry the index of the page they were found on when the document has
 * more than one page.
 */
@Nested
@DisplayName("Multi-page Document Tests")
class MultiPageTests {
@Test
@DisplayName("Should find text across multiple pages")
void findTextAcrossPages() throws IOException {
PDPage secondPage = new PDPage(PDRectangle.A4);
document.addPage(secondPage);
addTextToPage("First page with confidential data.");
addTextToPage(secondPage, "Second page with secret information.");
// The alternation matches one word on each page.
TextFinder textFinder = new TextFinder("confidential|secret", true, false);
textFinder.getText(document);
List<PDFText> foundTexts = textFinder.getFoundTexts();
assertEquals(2, foundTexts.size());
long page0Count = foundTexts.stream().filter(text -> text.getPageIndex() == 0).count();
long page1Count = foundTexts.stream().filter(text -> text.getPageIndex() == 1).count();
assertEquals(1, page0Count);
assertEquals(1, page1Count);
}
@Test
@DisplayName("Should handle empty pages gracefully")
void handleEmptyPages() throws IOException {
// The added second page gets no content; only the first page has text.
PDPage emptyPage = new PDPage(PDRectangle.A4);
document.addPage(emptyPage);
addTextToPage("Content on first page only.");
TextFinder textFinder = new TextFinder("content", false, false);
textFinder.getText(document);
List<PDFText> foundTexts = textFinder.getFoundTexts();
assertEquals(1, foundTexts.size());
assertEquals(0, foundTexts.get(0).getPageIndex());
}
}
/** Boundary-size inputs: a very long search term and a document with several pages. */
@Nested
@DisplayName("Performance and Boundary Tests")
class PerformanceTests {

    /** Embeds a 1000-character term in the page text and expects it to be found once. */
    @Test
    @DisplayName("Should handle very long search terms")
    void handleLongSearchTerms() throws IOException {
        String longTerm = "a".repeat(1000);
        String content = "Short text with " + longTerm + " embedded.";
        testTextFinding(content, longTerm, false, false, new String[]{longTerm}, 1);
    }

    /**
     * Fills ten pages with one matching word each and checks both the hit count and that the
     * search finishes within three seconds.
     */
    @Test
    @DisplayName("Should handle documents with many pages efficiently")
    void handleManyPages() throws IOException {
        // Single source of truth for the loop bound and the expected number of hits.
        final int pageCount = 10;
        for (int i = 0; i < pageCount; i++) {
            if (i > 0) { // The first page already exists
                document.addPage(new PDPage(PDRectangle.A4));
            }
            addTextToPage(document.getPage(i), "Page " + i + " contains searchable content.");
        }

        // System.nanoTime() is monotonic; the wall clock behind currentTimeMillis() may be
        // adjusted while the test runs and would then distort the measured duration.
        long startNanos = System.nanoTime();
        TextFinder textFinder = new TextFinder("searchable", false, false);
        textFinder.getText(document);
        List<PDFText> foundTexts = textFinder.getFoundTexts();
        long elapsedMillis = (System.nanoTime() - startNanos) / 1_000_000L;

        assertEquals(pageCount, foundTexts.size());
        assertTrue(elapsedMillis < 3000,
                "Multi-page search should complete within 3 seconds");
    }
}
/** Degenerate inputs: null or empty documents, blank pages, heavy regexes, blank terms. */
@Nested
@DisplayName("Error Handling and Edge Cases")
class ErrorHandlingTests {

    /**
     * Passes {@code null} as the document. Either the call succeeds and yields an empty list,
     * or it throws an exception that carries a message.
     */
    @Test
    @DisplayName("Should handle null document gracefully")
    void handleNullDocument() throws IOException {
        TextFinder finder = new TextFinder("test", false, false);
        try {
            finder.getText(null);
            // No exception: the result list must exist and hold nothing.
            List<PDFText> matches = finder.getFoundTexts();
            assertNotNull(matches);
            assertEquals(0, matches.size());
        } catch (Exception failure) {
            // An exception is accepted provided it has a message.
            assertNotNull(failure.getMessage());
        }
    }

    /** A freshly created document with no pages must produce no hits. */
    @Test
    @DisplayName("Should handle document without pages")
    void handleDocumentWithoutPages() throws IOException {
        try (PDDocument pagelessDocument = new PDDocument()) {
            TextFinder finder = new TextFinder("test", false, false);
            finder.getText(pagelessDocument);
            assertEquals(0, finder.getFoundTexts().size());
        }
    }

    /** Searches the test document without writing any text to it first. */
    @Test
    @DisplayName("Should handle pages without content")
    void handlePagesWithoutContent() throws IOException {
        TextFinder finder = new TextFinder("test", false, false);
        finder.getText(document);
        assertEquals(0, finder.getFoundTexts().size());
    }

    /** A look-ahead-heavy pattern must not make the search throw. */
    @Test
    @DisplayName("Should handle extremely complex regex patterns")
    void handleComplexRegexPatterns() throws IOException {
        addTextToPage("Complex content with various patterns: abc123, def456, XYZ789");
        final String lookaheadPattern = "(?=.*\\d)(?=.*[a-z])(?=.*[A-Z])[a-zA-Z\\d]{6}";

        assertDoesNotThrow(() -> {
            TextFinder finder = new TextFinder(lookaheadPattern, true, false);
            finder.getText(document);
            assertNotNull(finder.getFoundTexts());
        });
    }

    /** Empty and whitespace-only search terms are expected to yield no hits at all. */
    @ParameterizedTest
    @ValueSource(strings = {"", " ", "\t", "\n", "\r\n", " \t\n "})
    @DisplayName("Should handle whitespace-only search terms")
    void handleWhitespaceSearchTerms(String whitespacePattern) throws IOException {
        addTextToPage("This is normal text content.");

        TextFinder finder = new TextFinder(whitespacePattern, false, false);
        finder.getText(document);

        assertEquals(0, finder.getFoundTexts().size());
    }
}
/** Sanity checks on the bounding-box coordinates reported for each hit. */
@Nested
@DisplayName("Text Coordinate Verification")
class CoordinateTests {

    /**
     * Expects a single hit whose box has non-negative origin, positive extent in both
     * directions, and a size below 1000 (width) and 100 (height).
     */
    @Test
    @DisplayName("Should provide accurate text coordinates")
    void provideAccurateCoordinates() throws IOException {
        addTextToPage("Sample text for coordinate testing.");

        TextFinder finder = new TextFinder("coordinate", false, false);
        finder.getText(document);
        List<PDFText> matches = finder.getFoundTexts();
        assertEquals(1, matches.size());

        PDFText hit = matches.get(0);

        // Origin and ordering of the corners.
        assertTrue(hit.getX1() >= 0, "X1 should be non-negative");
        assertTrue(hit.getY1() >= 0, "Y1 should be non-negative");
        assertTrue(hit.getX2() > hit.getX1(), "X2 should be greater than X1");
        assertTrue(hit.getY2() > hit.getY1(), "Y2 should be greater than Y1");

        // Extent of the box.
        double boxWidth = hit.getX2() - hit.getX1();
        double boxHeight = hit.getY2() - hit.getY1();
        assertTrue(boxWidth > 0, "Text width should be positive");
        assertTrue(boxHeight > 0, "Text height should be positive");
        assertTrue(boxWidth < 1000, "Text width should be reasonable");
        assertTrue(boxHeight < 100, "Text height should be reasonable");
    }

    /** Every hit must carry its text and a non-negative origin. */
    @Test
    @DisplayName("Should handle overlapping text regions")
    void handleOverlappingTextRegions() throws IOException {
        addTextToPage("Overlapping test text content.");

        TextFinder finder = new TextFinder("test", false, false);
        finder.getText(document);
        List<PDFText> matches = finder.getFoundTexts();

        assertFalse(matches.isEmpty());
        for (PDFText hit : matches) {
            assertNotNull(hit.getText());
            assertTrue(hit.getX1() >= 0 && hit.getY1() >= 0);
        }
    }
}
/** Literal searches for one-character terms, with and without whole-word matching. */
@Nested
@DisplayName("Single Character and Digit Tests")
class SingleCharacterAndDigitTests {

    /**
     * Runs a literal (non-regex) search for {@code term} over the enclosing test's
     * {@code document} and returns the finder's results.
     */
    private List<PDFText> findLiteral(String term, boolean wholeWord) throws IOException {
        TextFinder finder = new TextFinder(term, false, wholeWord);
        finder.getText(document);
        return finder.getFoundTexts();
    }

    @Test
    @DisplayName("Should find single digits in various contexts with whole word search")
    void findSingleDigitsWholeWord() throws IOException {
        addTextToPage("Item 1 of 5 costs $2.50. Order number: 1234. Reference: A1B.");

        List<PDFText> hits = findLiteral("1", true);

        assertEquals(1, hits.size(),
                "Should find exactly one standalone '1', not the ones embedded in other numbers/codes");
        assertEquals("1", hits.get(0).getText());
    }

    @Test
    @DisplayName("Should find single digits without whole word search")
    void findSingleDigitsNoWholeWord() throws IOException {
        addTextToPage("Item 1 of 5 costs $2.50. Order number: 1234. Reference: A1B.");

        List<PDFText> hits = findLiteral("1", false);

        assertTrue(hits.size() >= 3,
                "Should find multiple instances of '1' including standalone, in '1234', and in 'A1B'");
    }

    @Test
    @DisplayName("Should find single characters in various contexts")
    void findSingleCharacters() throws IOException {
        addTextToPage("Grade: A. Section B has item A-1. The letter A appears multiple times.");

        List<PDFText> hits = findLiteral("A", true);

        assertTrue(hits.size() >= 2, "Should find multiple standalone 'A' characters");
        // Whatever was matched must be exactly the searched letter.
        hits.forEach(hit -> assertEquals("A", hit.getText()));
    }

    @Test
    @DisplayName("Should handle digits at word boundaries correctly")
    void findDigitsAtWordBoundaries() throws IOException {
        addTextToPage("Numbers: 1, 2, 3. Code: 123. Version: 1.0. Item1 and Item2.");

        // Two independent searches over the same page content.
        List<PDFText> hitsForOne = findLiteral("1", true);
        assertEquals(1, hitsForOne.size(),
                "Should find only the standalone '1' at the beginning");

        List<PDFText> hitsForTwo = findLiteral("2", true);
        assertEquals(1, hitsForTwo.size(),
                "Should find only the standalone '2' in the number list");
    }

    @Test
    @DisplayName("Should handle special characters and punctuation boundaries")
    void findDigitsWithPunctuationBoundaries() throws IOException {
        addTextToPage("Items: (1), [2], {3}, item#4, price$5, and 6%.");

        List<PDFText> hits = findLiteral("1", true);

        assertEquals(1, hits.size(), "Should find '1' surrounded by parentheses");
        assertEquals("1", hits.get(0).getText());
    }

    @Test
    @DisplayName("Should handle edge case with spacing and formatting")
    void findDigitsWithSpacingIssues() throws IOException {
        addTextToPage("List: 1 , 2 , 3 and item 1 here.");

        List<PDFText> hits = findLiteral("1", true);

        assertEquals(2, hits.size(),
                "Should find both '1' instances despite spacing variations");
    }
}
// Helper methods
/**
 * Convenience overload that writes {@code text} onto this test's {@code page}.
 *
 * @param text the string handed on to the two-argument overload
 * @throws IOException if the two-argument overload fails
 */
private void addTextToPage(String text) throws IOException {
    this.addTextToPage(this.page, text);
}
/**
 * Draws a single line of {@code text} on {@code targetPage} in 12-point Helvetica, starting at
 * offset (50, 750).
 *
 * <p>NOTE(review): the two-argument {@code PDPageContentStream} constructor presumably replaces
 * any content already on the page; confirm against the PDFBox documentation before calling
 * this twice for the same page.
 *
 * @param targetPage the page of {@code document} to draw on
 * @param text the string to show
 * @throws IOException if the content stream cannot be opened, written or closed
 */
private void addTextToPage(PDPage targetPage, String text) throws IOException {
    try (PDPageContentStream stream = new PDPageContentStream(document, targetPage)) {
        stream.beginText();
        PDType1Font helvetica = new PDType1Font(Standard14Fonts.FontName.HELVETICA);
        stream.setFont(helvetica, 12);
        stream.newLineAtOffset(50, 750);
        stream.showText(text);
        stream.endText();
    }
}
}