feat: add TextFinderUtils and WidthCalculator for text processing and font validation, TextEncodingHelper for encoding support

This commit is contained in:
Balázs Szücs 2025-07-18 18:50:17 +02:00
parent 7a9f962172
commit 6315721e8f
6 changed files with 1106 additions and 230 deletions

View File

@ -32,9 +32,6 @@ import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
import org.apache.pdfbox.pdmodel.font.encoding.DictionaryEncoding;
import org.apache.pdfbox.pdmodel.font.encoding.Encoding;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.springframework.http.ResponseEntity;
@ -59,6 +56,9 @@ import stirling.software.SPDF.model.PDFText;
import stirling.software.SPDF.model.api.security.ManualRedactPdfRequest;
import stirling.software.SPDF.model.api.security.RedactPdfRequest;
import stirling.software.SPDF.pdf.TextFinder;
import stirling.software.SPDF.utils.text.TextEncodingHelper;
import stirling.software.SPDF.utils.text.TextFinderUtils;
import stirling.software.SPDF.utils.text.WidthCalculator;
import stirling.software.common.model.api.security.RedactionArea;
import stirling.software.common.service.CustomPDFDocumentFactory;
import stirling.software.common.util.GeneralUtils;
@ -77,6 +77,9 @@ public class RedactController {
private static final float PRECISION_THRESHOLD = 1e-3f;
private static final int FONT_SCALE_FACTOR = 1000;
// Redaction box width reduction factor (10% reduction)
private static final float REDACTION_WIDTH_REDUCTION_FACTOR = 0.9f;
// Text showing operators
private static final Set<String> TEXT_SHOWING_OPERATORS = Set.of("Tj", "TJ", "'", "\"");
@ -229,7 +232,11 @@ public class RedactController {
}
private void redactFoundText(
PDDocument document, List<PDFText> blocks, float customPadding, Color redactColor)
PDDocument document,
List<PDFText> blocks,
float customPadding,
Color redactColor,
boolean isTextRemovalMode)
throws IOException {
var allPages = document.getDocumentCatalog().getPages();
@ -263,10 +270,28 @@ public class RedactController {
(block.getY2() - block.getY1()) * DEFAULT_TEXT_PADDING_MULTIPLIER
+ customPadding;
float originalWidth = block.getX2() - block.getX1();
float boxWidth;
float boxX;
// Only apply width reduction when text is actually being removed
if (isTextRemovalMode) {
// Calculate reduced width and center the box
boxWidth =
originalWidth
* REDACTION_WIDTH_REDUCTION_FACTOR; // 10% reduction
float widthReduction = originalWidth - boxWidth;
boxX = block.getX1() + (widthReduction / 2); // Center the reduced box
} else {
// Use original width for box-only redaction
boxWidth = originalWidth;
boxX = block.getX1();
}
contentStream.addRect(
block.getX1(),
boxX,
pageBox.getHeight() - block.getY2() - padding,
block.getX2() - block.getX1(),
boxWidth,
block.getY2() - block.getY1() + 2 * padding);
}
@ -284,7 +309,7 @@ public class RedactController {
return originalWord;
}
if (font != null && isFontSubset(font.getName())) {
if (font != null && TextEncodingHelper.isFontSubset(font.getName())) {
try {
float originalWidth = safeGetStringWidth(font, originalWord) / FONT_SCALE_FACTOR;
return createAlternativePlaceholder(originalWord, originalWidth, font, 1.0f);
@ -300,6 +325,10 @@ public class RedactController {
return " ".repeat(originalWord.length());
}
/**
* Enhanced placeholder creation using advanced width calculation. Incorporates font validation
* and sophisticated fallback strategies.
*/
String createPlaceholderWithWidth(
String originalWord, float targetWidth, PDFont font, float fontSize) {
if (originalWord == null || originalWord.isEmpty()) {
@ -311,11 +340,21 @@ public class RedactController {
}
try {
if (isFontSubset(font.getName())) {
// Check font reliability before proceeding
if (!WidthCalculator.isWidthCalculationReliable(font)) {
log.debug(
"Font {} unreliable for width calculation, using simple placeholder",
font.getName());
return " ".repeat(originalWord.length());
}
// Use enhanced subset font detection
if (TextEncodingHelper.isFontSubset(font.getName())) {
return createSubsetFontPlaceholder(originalWord, targetWidth, font, fontSize);
}
float spaceWidth = safeGetStringWidth(font, " ") / FONT_SCALE_FACTOR * fontSize;
// Enhanced space width calculation
float spaceWidth = WidthCalculator.calculateAccurateWidth(font, " ", fontSize);
if (spaceWidth <= 0) {
return createAlternativePlaceholder(originalWord, targetWidth, font, fontSize);
@ -323,13 +362,16 @@ public class RedactController {
int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth));
int maxSpaces = originalWord.length() * 2;
// More conservative space limit based on original word characteristics
int maxSpaces =
Math.max(
originalWord.length() * 2, Math.round(targetWidth / spaceWidth * 1.5f));
spaceCount = Math.min(spaceCount, maxSpaces);
return " ".repeat(spaceCount);
} catch (Exception e) {
log.debug("Width-based placeholder creation failed: {}", e.getMessage());
log.debug("Enhanced placeholder creation failed: {}", e.getMessage());
return createAlternativePlaceholder(originalWord, targetWidth, font, fontSize);
}
}
@ -359,7 +401,7 @@ public class RedactController {
try {
String[] alternatives = {" ", ".", "-", "_", "~", "°", "·"};
if (fontSupportsCharacter(font, " ")) {
if (TextEncodingHelper.fontSupportsCharacter(font, " ")) {
float spaceWidth = safeGetStringWidth(font, " ") / FONT_SCALE_FACTOR * fontSize;
if (spaceWidth > 0) {
int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth));
@ -371,10 +413,10 @@ public class RedactController {
}
for (String altChar : alternatives) {
if (altChar.equals(" ")) continue; // Already tried spaces
if (" ".equals(altChar)) continue; // Already tried spaces
try {
if (!fontSupportsCharacter(font, altChar)) {
if (!TextEncodingHelper.fontSupportsCharacter(font, altChar)) {
continue;
}
@ -546,7 +588,8 @@ public class RedactController {
allFoundTextsByPage,
request.getRedactColor(),
request.getCustomPadding(),
request.getConvertPDFToImage());
request.getConvertPDFToImage(),
false); // Box-only mode, use original box sizes
return WebResponseUtils.bytesToWebResponse(
pdfContent,
@ -564,7 +607,8 @@ public class RedactController {
allFoundTextsByPage,
request.getRedactColor(),
request.getCustomPadding(),
request.getConvertPDFToImage());
request.getConvertPDFToImage(),
true); // Text removal mode, use reduced box sizes
return WebResponseUtils.bytesToWebResponse(
pdfContent,
@ -608,14 +652,31 @@ public class RedactController {
text = text.trim();
if (text.isEmpty()) continue;
log.debug(
"Searching for text: '{}' (regex: {}, wholeWord: {})",
text,
useRegex,
wholeWordSearch);
try {
TextFinder textFinder = new TextFinder(text, useRegex, wholeWordSearch);
textFinder.getText(document);
for (PDFText found : textFinder.getFoundTexts()) {
List<PDFText> foundTexts = textFinder.getFoundTexts();
log.debug("TextFinder found {} instances of '{}'", foundTexts.size(), text);
for (PDFText found : foundTexts) {
allFoundTextsByPage
.computeIfAbsent(found.getPageIndex(), k -> new ArrayList<>())
.add(found);
log.debug(
"Added match on page {} at ({},{},{},{}): '{}'",
found.getPageIndex(),
found.getX1(),
found.getY1(),
found.getX2(),
found.getY2(),
found.getText());
}
} catch (Exception e) {
log.error("Error processing search term '{}': {}", text, e.getMessage());
@ -673,7 +734,8 @@ public class RedactController {
Map<Integer, List<PDFText>> allFoundTextsByPage,
String colorString,
float customPadding,
Boolean convertToImage)
Boolean convertToImage,
boolean isTextRemovalMode)
throws IOException {
List<PDFText> allFoundTexts = new ArrayList<>();
@ -684,7 +746,7 @@ public class RedactController {
if (!allFoundTexts.isEmpty()) {
Color redactColor = decodeOrDefault(colorString);
redactFoundText(document, allFoundTexts, customPadding, redactColor);
redactFoundText(document, allFoundTexts, customPadding, redactColor, isTextRemovalMode);
cleanDocumentMetadata(document);
}
@ -870,16 +932,24 @@ public class RedactController {
boolean useRegex,
boolean wholeWordSearch) {
return targetWords.stream()
.map(
target -> {
String patternString = useRegex ? target : Pattern.quote(target);
if (wholeWordSearch) {
patternString = "\\b" + patternString + "\\b";
// Use the new utility for creating optimized patterns
List<Pattern> patterns =
TextFinderUtils.createOptimizedSearchPatterns(
targetWords, useRegex, wholeWordSearch);
return patterns.stream()
.flatMap(
pattern -> {
try {
return pattern.matcher(completeText).results();
} catch (Exception e) {
log.debug(
"Pattern matching failed for pattern {}: {}",
pattern.pattern(),
e.getMessage());
return java.util.stream.Stream.empty();
}
return Pattern.compile(patternString, Pattern.CASE_INSENSITIVE);
})
.flatMap(pattern -> pattern.matcher(completeText).results())
.map(matchResult -> new MatchRange(matchResult.start(), matchResult.end()))
.sorted(Comparator.comparingInt(MatchRange::getStartPos))
.collect(Collectors.toList());
@ -957,6 +1027,16 @@ public class RedactController {
private String applyRedactionsToSegmentText(TextSegment segment, List<MatchRange> matches) {
String text = segment.getText();
if (segment.getFont() != null
&& !TextEncodingHelper.isTextSegmentRemovable(segment.getFont(), text)) {
log.debug(
"Skipping text segment '{}' - font {} cannot process this text reliably",
text,
segment.getFont().getName());
return text; // Return original text unchanged
}
StringBuilder result = new StringBuilder(text);
for (MatchRange match : matches) {
@ -966,6 +1046,15 @@ public class RedactController {
if (segmentStart < text.length() && segmentEnd > segmentStart) {
String originalPart = text.substring(segmentStart, segmentEnd);
if (segment.getFont() != null
&& !TextEncodingHelper.isTextSegmentRemovable(
segment.getFont(), originalPart)) {
log.debug(
"Skipping text part '{}' within segment - cannot be processed reliably",
originalPart);
continue; // Skip this match, process others
}
float originalWidth = 0;
if (segment.getFont() != null && segment.getFontSize() > 0) {
try {
@ -1001,68 +1090,125 @@ public class RedactController {
return 0;
}
if (!WidthCalculator.isWidthCalculationReliable(font)) {
log.debug(
"Font {} flagged as unreliable for width calculation, using fallback",
font.getName());
return calculateConservativeWidth(font, text);
}
if (!TextEncodingHelper.canEncodeCharacters(font, text)) {
log.debug(
"Text cannot be encoded by font {}, using character-based fallback",
font.getName());
return calculateCharacterBasedWidth(font, text);
}
try {
return font.getStringWidth(text);
float width = font.getStringWidth(text);
log.debug("Direct width calculation successful for '{}': {}", text, width);
return width;
} catch (Exception e) {
try {
float totalWidth = 0;
for (int i = 0; i < text.length(); i++) {
String character = text.substring(i, i + 1);
try {
byte[] encoded = font.encode(character);
if (encoded.length > 0) {
int glyphCode = encoded[0] & 0xFF;
log.debug(
"Direct width calculation failed for font {}: {}",
font.getName(),
e.getMessage());
return calculateFallbackWidth(font, text);
}
}
float glyphWidth = font.getWidth(glyphCode);
private float calculateCharacterBasedWidth(PDFont font, String text) {
try {
float totalWidth = 0;
for (int i = 0; i < text.length(); i++) {
String character = text.substring(i, i + 1);
try {
// Validate character encoding first
if (!TextEncodingHelper.fontSupportsCharacter(font, character)) {
totalWidth += font.getAverageFontWidth();
continue;
}
if (glyphWidth == 0) {
try {
glyphWidth = font.getWidthFromFont(glyphCode);
} catch (Exception e2) {
glyphWidth = font.getAverageFontWidth();
}
byte[] encoded = font.encode(character);
if (encoded.length > 0) {
int glyphCode = encoded[0] & 0xFF;
float glyphWidth = font.getWidth(glyphCode);
// Try alternative width methods if primary fails
if (glyphWidth == 0) {
try {
glyphWidth = font.getWidthFromFont(glyphCode);
} catch (Exception e2) {
glyphWidth = font.getAverageFontWidth();
}
totalWidth += glyphWidth;
} else {
totalWidth += font.getAverageFontWidth();
}
} catch (Exception e2) {
totalWidth += glyphWidth;
} else {
totalWidth += font.getAverageFontWidth();
}
} catch (Exception e2) {
// Character processing failed, use average width
totalWidth += font.getAverageFontWidth();
}
return totalWidth;
} catch (Exception e2) {
log.debug("PDFBox API width calculation failed: {}", e2.getMessage());
}
try {
if (font.getFontDescriptor() != null
&& font.getFontDescriptor().getFontBoundingBox() != null) {
PDRectangle bbox = font.getFontDescriptor().getFontBoundingBox();
float avgCharWidth = bbox.getHeight() / 1000f * 0.865f;
return text.length() * avgCharWidth * FONT_SCALE_FACTOR;
}
} catch (Exception e2) {
log.debug("Font bounding box width calculation failed: {}", e2.getMessage());
log.debug("Character-based width calculation: {}", totalWidth);
return totalWidth;
} catch (Exception e) {
log.debug("Character-based width calculation failed: {}", e.getMessage());
return calculateConservativeWidth(font, text);
}
}
private float calculateFallbackWidth(PDFont font, String text) {
try {
// Method 1: Font bounding box approach
if (font.getFontDescriptor() != null
&& font.getFontDescriptor().getFontBoundingBox() != null) {
PDRectangle bbox = font.getFontDescriptor().getFontBoundingBox();
float avgCharWidth = bbox.getWidth() * 0.6f; // Conservative estimate
float fallbackWidth = text.length() * avgCharWidth;
log.debug("Bounding box fallback width: {}", fallbackWidth);
return fallbackWidth;
}
// Method 2: Average font width
try {
float avgWidth = font.getAverageFontWidth();
return text.length() * avgWidth;
if (avgWidth > 0) {
float fallbackWidth = text.length() * avgWidth;
log.debug("Average width fallback: {}", fallbackWidth);
return fallbackWidth;
}
} catch (Exception e2) {
log.debug("Average font width calculation failed: {}", e2.getMessage());
}
float conservativeWidth = text.length() * 500f; // 500 units per character
log.debug(
"All width calculation methods failed for font {}, using conservative estimate: {}",
font.getName(),
conservativeWidth);
return conservativeWidth;
// Method 3: Conservative estimate based on font metrics
return calculateConservativeWidth(font, text);
} catch (Exception e) {
log.debug("Fallback width calculation failed: {}", e.getMessage());
return calculateConservativeWidth(font, text);
}
}
/**
 * Last-resort width estimate used when no font metric can be trusted.
 *
 * <p>The estimate is a flat 500 units for every {@code char} in {@code text}; no font metric is
 * consulted. The font is only used to label the debug log entry.
 *
 * @param font font the text is set in; its name is written to the debug log
 * @param text text whose width is being estimated
 * @return {@code text.length() * 500}
 */
private float calculateConservativeWidth(PDFont font, String text) {
    final float estimate = 500f * text.length();
    log.debug(
            "Conservative width estimate for font {} text '{}': {}",
            font.getName(),
            text,
            estimate);
    return estimate;
}
private float calculateWidthAdjustment(TextSegment segment, List<MatchRange> matches) {
try {
if (segment.getFont() == null || segment.getFontSize() <= 0) {
@ -1070,7 +1216,8 @@ public class RedactController {
}
String fontName = segment.getFont().getName();
if (fontName != null && (fontName.contains("HOEPAP") || isFontSubset(fontName))) {
if (fontName != null
&& (fontName.contains("HOEPAP") || TextEncodingHelper.isFontSubset(fontName))) {
log.debug("Skipping width adjustment for problematic/subset font: {}", fontName);
return 0;
}
@ -1196,6 +1343,19 @@ public class RedactController {
for (COSBase element : originalArray) {
if (element instanceof COSString cosString) {
String originalText = cosString.getString();
if (segment.getFont() != null
&& !TextEncodingHelper.isTextSegmentRemovable(
segment.getFont(), originalText)) {
log.debug(
"Skipping TJ text part '{}' - cannot be processed reliably with font {}",
originalText,
segment.getFont().getName());
newArray.add(element); // Keep original unchanged
textOffsetInSegment += originalText.length();
continue;
}
StringBuilder newText = new StringBuilder(originalText);
boolean modified = false;
@ -1207,7 +1367,6 @@ public class RedactController {
int overlapEnd = Math.min(match.getEndPos(), stringEndInPage);
if (overlapStart < overlapEnd) {
modified = true;
int redactionStartInString = overlapStart - stringStartInPage;
int redactionEndInString = overlapEnd - stringStartInPage;
if (redactionStartInString >= 0
@ -1216,6 +1375,16 @@ public class RedactController {
originalText.substring(
redactionStartInString, redactionEndInString);
if (segment.getFont() != null
&& !TextEncodingHelper.isTextSegmentRemovable(
segment.getFont(), originalPart)) {
log.debug(
"Skipping TJ text part '{}' - cannot be redacted reliably",
originalPart);
continue; // Skip this redaction, keep original text
}
modified = true;
float originalWidth = 0;
if (segment.getFont() != null && segment.getFontSize() > 0) {
try {
@ -1320,8 +1489,13 @@ public class RedactController {
int totalFonts = 0;
int customEncodedFonts = 0;
int subsetFonts = 0;
int unreliableFonts = 0;
for (PDPage page : document.getPages()) {
if (TextFinderUtils.hasProblematicFonts(page)) {
log.debug("Page contains fonts flagged as problematic by TextFinderUtils");
}
PDResources resources = page.getResources();
if (resources == null) {
continue;
@ -1333,190 +1507,64 @@ public class RedactController {
if (font != null) {
totalFonts++;
boolean isSubset = isFontSubset(font.getName());
boolean isProblematic = hasProblematicFontCharacteristics(font);
// Enhanced analysis using helper classes
boolean isSubset = TextEncodingHelper.isFontSubset(font.getName());
boolean hasCustomEncoding = TextEncodingHelper.hasCustomEncoding(font);
boolean isReliable = WidthCalculator.isWidthCalculationReliable(font);
boolean canCalculateWidths =
TextEncodingHelper.canCalculateBasicWidths(font);
if (isSubset) {
subsetFonts++;
}
if (isProblematic) {
if (hasCustomEncoding) {
customEncodedFonts++;
log.debug("Font {} has custom encoding", font.getName());
}
if (!isReliable || !canCalculateWidths) {
unreliableFonts++;
log.debug(
"Detected problematic font: {} (type: {})",
"Font {} flagged as unreliable: reliable={}, canCalculateWidths={}",
font.getName(),
font.getClass().getSimpleName());
isReliable,
canCalculateWidths);
}
if (!TextFinderUtils.validateFontReliability(font)) {
log.debug(
"Font {} failed comprehensive reliability check",
font.getName());
}
}
} catch (IOException e) {
} catch (Exception e) {
log.debug(
"Font loading failed for {}: {}",
"Font loading/analysis failed for {}: {}",
fontName.getName(),
e.getMessage());
customEncodedFonts++;
unreliableFonts++;
totalFonts++;
}
}
}
log.info(
"Font analysis: {}/{} fonts use custom encoding, {}/{} are subset fonts (subset fonts with standard encodings are fine)",
"Enhanced font analysis: {}/{} custom encoding, {}/{} subset, {}/{} unreliable fonts",
customEncodedFonts,
totalFonts,
subsetFonts,
totalFonts,
unreliableFonts,
totalFonts);
return customEncodedFonts > 0;
} catch (Exception e) {
log.warn("Font detection analysis failed: {}", e.getMessage());
return false;
}
}
private boolean hasProblematicFontCharacteristics(PDFont font) {
try {
if (font.isDamaged()) {
log.debug("Font {} is marked as damaged by PDFBox", font.getName());
return true;
}
if (hasCustomEncoding(font)) {
log.debug(
"Font {} uses custom encoding - text replacement will be unreliable",
font.getName());
return true;
}
String fontType = font.getClass().getSimpleName();
if ("PDType3Font".equals(fontType)) {
log.debug("Font {} is Type3 - may have text replacement issues", font.getName());
return cannotCalculateBasicWidths(font);
}
log.debug("Font {} appears suitable for text replacement", font.getName());
return false;
// Consider document problematic if we have custom encodings or unreliable fonts
return customEncodedFonts > 0 || unreliableFonts > 0;
} catch (Exception e) {
log.debug("Font analysis failed for {}: {}", font.getName(), e.getMessage());
return false;
}
}
private boolean hasCustomEncoding(PDFont font) {
try {
if (font instanceof PDSimpleFont simpleFont) {
try {
Encoding encoding = simpleFont.getEncoding();
if (encoding != null) {
String encodingName = encoding.getEncodingName();
// Check if it's one of the standard encodings
if ("WinAnsiEncoding".equals(encodingName)
|| "MacRomanEncoding".equals(encodingName)
|| "StandardEncoding".equals(encodingName)
|| "MacExpertEncoding".equals(encodingName)
|| "SymbolEncoding".equals(encodingName)
|| "ZapfDingbatsEncoding".equals(encodingName)) {
log.debug(
"Font {} uses standard encoding: {}",
font.getName(),
encodingName);
return false;
}
if (encoding instanceof DictionaryEncoding) {
log.debug(
"Font {} uses DictionaryEncoding - likely custom",
font.getName());
return true;
}
log.debug(
"Font {} uses non-standard encoding: {}",
font.getName(),
encodingName);
return true;
}
} catch (Exception e) {
log.debug(
"Could not determine encoding for font {}: {}",
font.getName(),
e.getMessage());
}
}
if (font instanceof org.apache.pdfbox.pdmodel.font.PDType0Font) {
log.debug("Font {} is Type0 (CID) - generally uses standard CMaps", font.getName());
return false; // Be forgiving with CID fonts
}
log.debug(
"Font {} type {} - assuming standard encoding",
font.getName(),
font.getClass().getSimpleName());
return false;
} catch (Exception e) {
log.debug(
"Custom encoding detection failed for font {}: {}",
font.getName(),
e.getMessage());
return false; // Be forgiving on detection failure
}
}
private boolean cannotCalculateBasicWidths(PDFont font) {
try {
float spaceWidth = font.getStringWidth(" ");
if (spaceWidth <= 0) {
return true;
}
String[] testChars = {"a", "A", "0", ".", "e", "!"};
for (String ch : testChars) {
try {
float width = font.getStringWidth(ch);
if (width > 0) {
return false; // Found at least one character we can measure
}
} catch (Exception e) {
}
}
return true; // Can't calculate width for any test characters
} catch (Exception e) {
return true; // Font failed basic width calculation
}
}
private boolean isFontSubset(String fontName) {
if (fontName == null) {
return false;
}
return fontName.matches("^[A-Z]{6}\\+.*");
}
private boolean fontSupportsCharacter(PDFont font, String character) {
if (font == null || character == null || character.isEmpty()) {
return false;
}
try {
byte[] encoded = font.encode(character);
if (encoded.length == 0) {
return false;
}
float width = font.getStringWidth(character);
return width > 0;
} catch (Exception e) {
log.debug(
"Character '{}' not supported by font {}: {}",
character,
font.getName(),
e.getMessage());
return false;
log.warn("Enhanced font detection analysis failed: {}", e.getMessage());
return true; // Assume problematic if analysis fails
}
}

View File

@ -10,8 +10,11 @@ import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.PDFText;
@Slf4j
public class TextFinder extends PDFTextStripper {
private final String searchTerm;
@ -67,16 +70,40 @@ public class TextFinder extends PDFTextStripper {
String processedSearchTerm = this.searchTerm.trim();
String regex = this.useRegex ? processedSearchTerm : "\\Q" + processedSearchTerm + "\\E";
if (this.wholeWordSearch) {
regex = "\\b" + regex + "\\b";
if (processedSearchTerm.length() == 1
&& Character.isDigit(processedSearchTerm.charAt(0))) {
regex = "(?<![\\w])" + regex + "(?![\\w])";
} else if (processedSearchTerm.length() == 1) {
regex = "(?<![\\w])" + regex + "(?![\\w])";
} else {
regex = "\\b" + regex + "\\b";
}
}
Pattern pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
Matcher matcher = pattern.matcher(text);
log.debug(
"Searching for '{}' in page {} with regex '{}' (wholeWord: {}, useRegex: {})",
processedSearchTerm,
getCurrentPageNo(),
regex,
wholeWordSearch,
useRegex);
int matchCount = 0;
while (matcher.find()) {
matchCount++;
int matchStart = matcher.start();
int matchEnd = matcher.end();
log.debug(
"Found match #{} at positions {}-{}: '{}'",
matchCount,
matchStart,
matchEnd,
matcher.group());
float minX = Float.MAX_VALUE;
float minY = Float.MAX_VALUE;
float maxX = Float.MIN_VALUE;
@ -85,6 +112,10 @@ public class TextFinder extends PDFTextStripper {
for (int i = matchStart; i < matchEnd; i++) {
if (i >= pageTextPositions.size()) {
log.debug(
"Position index {} exceeds available positions ({})",
i,
pageTextPositions.size());
continue;
}
TextPosition pos = pageTextPositions.get(i);
@ -97,6 +128,27 @@ public class TextFinder extends PDFTextStripper {
}
}
if (!foundPosition && matchStart < pageTextPositions.size()) {
log.debug(
"Attempting to find nearby positions for match at {}-{}",
matchStart,
matchEnd);
for (int i = Math.max(0, matchStart - 5);
i < Math.min(pageTextPositions.size(), matchEnd + 5);
i++) {
TextPosition pos = pageTextPositions.get(i);
if (pos != null) {
foundPosition = true;
minX = Math.min(minX, pos.getX());
maxX = Math.max(maxX, pos.getX() + pos.getWidth());
minY = Math.min(minY, pos.getY() - pos.getHeight());
maxY = Math.max(maxY, pos.getY());
break;
}
}
}
if (foundPosition) {
foundTexts.add(
new PDFText(
@ -106,13 +158,59 @@ public class TextFinder extends PDFTextStripper {
maxX,
maxY,
matcher.group()));
log.debug(
"Added PDFText for match: page={}, bounds=({},{},{},{}), text='{}'",
getCurrentPageNo() - 1,
minX,
minY,
maxX,
maxY,
matcher.group());
} else {
log.warn(
"Found text match '{}' but no valid position data at {}-{}",
matcher.group(),
matchStart,
matchEnd);
}
}
log.debug(
"Page {} search complete: found {} matches for '{}'",
getCurrentPageNo(),
matchCount,
processedSearchTerm);
super.endPage(page);
}
/**
 * Returns the matches collected by this finder.
 *
 * <p>The returned list is the finder's own {@code foundTexts} instance, not a copy.
 *
 * @return the list of {@link PDFText} matches recorded so far
 */
public List<PDFText> getFoundTexts() {
return foundTexts;
}
/**
 * Builds a human-readable dump of the text buffer and position list held by this finder.
 *
 * <p>The dump contains the buffer length, the number of recorded positions, the buffer content
 * with line breaks escaped, and one line per character for at most the first 50 characters
 * showing the character, its code in hex and the coordinates of the position stored at the same
 * index ({@code null} when there is none).
 *
 * @return the multi-line debug description
 */
public String getDebugInfo() {
    final String text = pageTextBuilder.toString();
    final String escaped = text.replace("\n", "\\n").replace("\r", "\\r");

    StringBuilder out = new StringBuilder();
    out.append("Extracted text length: ");
    out.append(pageTextBuilder.length());
    out.append("\n");
    out.append("Position count: ");
    out.append(pageTextPositions.size());
    out.append("\n");
    out.append("Text content: '");
    out.append(escaped);
    out.append("'\n");

    // Only the leading characters are listed individually to keep the dump short.
    final int shown = Math.min(text.length(), 50);
    for (int index = 0; index < shown; index++) {
        char current = text.charAt(index);

        TextPosition position = null;
        if (index < pageTextPositions.size()) {
            position = pageTextPositions.get(index);
        }

        String location;
        if (position != null) {
            location = String.format("(%.1f,%.1f)", position.getX(), position.getY());
        } else {
            location = "null";
        }

        out.append(
                String.format(
                        " [%d] '%c' (0x%02X) -> %s\n", index, current, (int) current, location));
    }
    return out.toString();
}
}

View File

@ -0,0 +1,351 @@
package stirling.software.SPDF.utils.text;
import java.io.IOException;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
import org.apache.pdfbox.pdmodel.font.encoding.DictionaryEncoding;
import org.apache.pdfbox.pdmodel.font.encoding.Encoding;
import lombok.extern.slf4j.Slf4j;
@Slf4j
public class TextEncodingHelper {
/**
 * Decides whether {@code font} is able to encode {@code text}.
 *
 * <p>The whole string is encoded first; a non-empty result is accepted immediately. An empty
 * result falls back to the per-code-point check in {@code validateAsCodePointArray}. When
 * encoding throws, the same fallback is used only for subset fonts or fonts with a custom
 * encoding; any other font is rejected.
 *
 * @param font font to test; {@code null} yields {@code false}
 * @param text text to test; {@code null} or empty yields {@code false}
 * @return {@code true} when the text is considered encodable by the font
 */
public static boolean canEncodeCharacters(PDFont font, String text) {
    boolean hasInput = font != null && text != null && !text.isEmpty();
    if (!hasInput) {
        return false;
    }

    try {
        // Primary path: try the complete string in one go.
        boolean encodedWhole = font.encode(text).length > 0;
        String rawName = font.getName();
        String label = rawName != null ? rawName : "Unknown";

        if (!encodedWhole) {
            // Empty result: judge the text code point by code point instead.
            log.debug(
                    "Full encoding failed for '{}' - using array-based fallback for font {}",
                    text,
                    label);
            return validateAsCodePointArray(font, text);
        }

        log.debug(
                "Text '{}' has good full-string encoding for font {} - permissively allowing",
                text,
                label);
        return true;
    } catch (IOException | IllegalArgumentException ex) {
        String rawName = font.getName();
        log.debug(
                "Encoding exception for text '{}' with font {} - trying array fallback: {}",
                text,
                rawName != null ? rawName : "Unknown",
                ex.getMessage());

        // Only subset or custom-encoded fonts get a second chance after an exception.
        boolean fallbackApplies = isFontSubset(font.getName()) || hasCustomEncoding(font);
        return fallbackApplies && validateAsCodePointArray(font, text);
    }
}
/**
 * Validates {@code text} against {@code font} one Unicode code point at a time.
 *
 * <p>A code point counts as successful when the font encodes it to at least one byte and
 * reports a non-negative width for it. Encoding or width failures are logged and counted as
 * unsuccessful rather than aborting the scan. The text is accepted when at least 95% of its
 * code points are successful.
 *
 * @param font font used for encoding and width lookup
 * @param text text to validate
 * @return {@code true} when the success rate is at least 0.95
 */
private static boolean validateAsCodePointArray(PDFont font, String text) {
    int totalCodePoints = 0;
    int successfulCodePoints = 0;

    // Advance by code point rather than by char so surrogate pairs are tested as one unit.
    for (int i = 0; i < text.length(); ) {
        int codePoint = text.codePointAt(i);
        String charStr = new String(Character.toChars(codePoint));
        // Locale.ROOT keeps the hex label independent of the JVM's default locale.
        String hex = Integer.toHexString(codePoint).toUpperCase(java.util.Locale.ROOT);
        totalCodePoints++;

        try {
            byte[] charEncoded = font.encode(charStr);
            if (charEncoded.length > 0) {
                float charWidth = font.getStringWidth(charStr);
                if (charWidth >= 0) {
                    successfulCodePoints++;
                    log.debug("Code point '{}' (U+{}) encoded successfully", charStr, hex);
                } else {
                    log.debug(
                            "Code point '{}' (U+{}) has invalid width: {}",
                            charStr,
                            hex,
                            charWidth);
                }
            } else {
                log.debug(
                        "Code point '{}' (U+{}) encoding failed - empty result", charStr, hex);
            }
        } catch (IOException | IllegalArgumentException e) {
            log.debug(
                    "Code point '{}' (U+{}) validation failed: {}",
                    charStr,
                    hex,
                    e.getMessage());
        }

        i += Character.charCount(codePoint);
    }

    double successRate =
            totalCodePoints > 0 ? (double) successfulCodePoints / totalCodePoints : 0;
    boolean isAcceptable = successRate >= 0.95;

    // SLF4J only substitutes plain "{}" anchors; a "{:.1f}" token is printed literally and
    // shifts the remaining arguments, so the percentage is pre-formatted here.
    if (log.isDebugEnabled()) {
        log.debug(
                "Array validation for '{}': {}/{} code points successful ({}%) - {}",
                text,
                successfulCodePoints,
                totalCodePoints,
                String.format(java.util.Locale.ROOT, "%.1f", successRate * 100),
                isAcceptable ? "ALLOWING" : "rejecting");
    }

    return isAcceptable;
}
/**
 * Decides whether a text segment can be removed from content drawn with {@code font}.
 *
 * <p>Text classified as simple by {@code isSimpleCharacter} only has to survive an encode call
 * and a width lookup without throwing. Anything else is delegated to {@link
 * #isTextFullyRemovable(PDFont, String)}.
 *
 * @param font font the segment is set in; {@code null} yields {@code false}
 * @param text segment text; {@code null} or empty yields {@code false}
 * @return {@code true} when the segment is considered removable
 */
public static boolean isTextSegmentRemovable(PDFont font, String text) {
    if (font == null || text == null || text.isEmpty()) {
        return false;
    }

    String nameForEntry = font.getName();
    log.debug(
            "Evaluating text segment for removal: '{}' with font {}",
            text,
            nameForEntry != null ? nameForEntry : "Unknown Font");

    // Non-trivial text goes through the full set of checks.
    if (!isSimpleCharacter(text)) {
        return isTextFullyRemovable(font, text);
    }

    boolean usable;
    try {
        // Both calls are made purely to see whether they throw.
        font.encode(text);
        font.getStringWidth(text);
        log.debug(
                "Text '{}' is a simple character and passed validation - allowing removal",
                text);
        usable = true;
    } catch (Exception ex) {
        String nameForFailure = font.getName();
        log.debug(
                "Simple character '{}' failed basic validation with font {}: {}",
                text,
                nameForFailure != null ? nameForFailure : "Unknown",
                ex.getMessage());
        usable = false;
    }
    return usable;
}
/**
 * Runs the full set of checks that decide whether {@code text} can be removed reliably.
 *
 * <p>In order: the text must pass {@link #canEncodeCharacters(PDFont, String)}, the font must
 * report a non-negative string width (zero is allowed), the font must have a font descriptor,
 * and reading the descriptor's bounding box must not throw {@link IllegalArgumentException}.
 * An {@link IOException} or {@link IllegalArgumentException} raised along the way yields
 * {@code false}.
 *
 * @param font font the text is set in; {@code null} yields {@code false}
 * @param text text to check; {@code null} or empty yields {@code false}
 * @return {@code true} only when every check passes
 */
public static boolean isTextFullyRemovable(PDFont font, String text) {
    if (font == null || text == null || text.isEmpty()) {
        return false;
    }

    final String rawName = font.getName();
    final String label = rawName != null ? rawName : "Unknown";

    try {
        // 1. Encoding
        boolean encodable = canEncodeCharacters(font, text);
        if (!encodable) {
            log.debug("Text '{}' failed encoding validation for font {}", text, label);
            return false;
        }

        // 2. Width: zero is tolerated (invisible glyphs), negative is not
        float measured = font.getStringWidth(text);
        if (measured < 0) {
            log.debug("Text '{}' has invalid width {} for font {}", text, measured, label);
            return false;
        }

        // 3. Descriptor must exist
        var descriptor = font.getFontDescriptor();
        if (descriptor == null) {
            log.debug("Missing font descriptor for font {}", label);
            return false;
        }

        // 4. Bounding box lookup must not blow up
        try {
            descriptor.getFontBoundingBox();
        } catch (IllegalArgumentException boxFailure) {
            log.debug(
                    "Font bounding box unavailable for font {}: {}",
                    label,
                    boxFailure.getMessage());
            return false;
        }

        log.debug("Text '{}' passed comprehensive validation for font {}", text, label);
        return true;
    } catch (IOException ioFailure) {
        log.debug(
                "Text '{}' failed validation for font {} due to IO error: {}",
                text,
                label,
                ioFailure.getMessage());
        return false;
    } catch (IllegalArgumentException argFailure) {
        log.debug(
                "Text '{}' failed validation for font {} due to argument error: {}",
                text,
                label,
                argFailure.getMessage());
        return false;
    }
}
/**
 * Tells whether {@code text} is short and made up only of "simple" characters.
 *
 * <p>A character is simple when it is a letter, a digit, whitespace, or one of the ASCII
 * punctuation marks listed in the literal below.
 *
 * @param text candidate text
 * @return {@code false} for {@code null}, empty text, text longer than 20 chars, or text
 *     containing any other character; {@code true} otherwise
 */
private static boolean isSimpleCharacter(String text) {
    if (text == null || text.isEmpty() || text.length() > 20) {
        return false;
    }
    for (char current : text.toCharArray()) {
        // Letters, digits and whitespace cover the most common cases.
        final boolean alphanumericOrSpace =
                Character.isLetterOrDigit(current) || Character.isWhitespace(current);
        // Otherwise only the listed printable-ASCII punctuation is accepted.
        final boolean commonPunctuation =
                current >= 32
                        && current <= 126
                        && ".,!?;:()-[]{}\"'/@#$%&*+=<>|\\~`".indexOf(current) >= 0;
        if (!alphanumericOrSpace && !commonPunctuation) {
            return false;
        }
    }
    return true;
}
/**
 * Reports whether {@code font} appears to use a custom (non-standard) encoding.
 *
 * <p>For simple fonts the encoding object is inspected: a {@code DictionaryEncoding}, or any
 * encoding whose class simple name contains "Custom" or "Dictionary", counts as custom. If
 * reading the encoding throws, the font is treated as custom. Type0 fonts and all remaining
 * font types are reported as not custom.
 *
 * @param font the font to inspect; {@code null} yields {@code false}
 * @return {@code true} when a custom encoding is detected, or when encoding detection on a
 *     simple font fails; {@code false} otherwise
 */
public static boolean hasCustomEncoding(PDFont font) {
    if (font == null) {
        // Without this guard the getName()/getClass() calls below would throw an NPE,
        // which the IllegalArgumentException handler does not catch.
        return false;
    }
    try {
        if (font instanceof PDSimpleFont simpleFont) {
            try {
                Encoding encoding = simpleFont.getEncoding();
                if (encoding != null) {
                    // Check for dictionary-based custom encodings
                    if (encoding instanceof DictionaryEncoding) {
                        log.debug("Font {} uses DictionaryEncoding (custom)", font.getName());
                        return true;
                    }
                    // Name-based heuristic for encoding classes not matched above
                    String encodingName = encoding.getClass().getSimpleName();
                    if (encodingName.contains("Custom")
                            || encodingName.contains("Dictionary")) {
                        log.debug(
                                "Font {} uses custom encoding: {}",
                                font.getName(),
                                encodingName);
                        return true;
                    }
                }
            } catch (Exception e) {
                log.debug(
                        "Encoding detection failed for font {}: {}",
                        font.getName(),
                        e.getMessage());
                return true; // Assume custom if detection fails
            }
        }
        if (font instanceof org.apache.pdfbox.pdmodel.font.PDType0Font) {
            log.debug(
                    "Font {} is Type0 (CID) - generally uses standard CMaps",
                    font.getName() != null ? font.getName() : "Unknown");
            return false;
        }
        log.debug(
                "Font {} type {} - assuming standard encoding",
                font.getName() != null ? font.getName() : "Unknown",
                font.getClass().getSimpleName());
        return false;
    } catch (IllegalArgumentException e) {
        log.debug(
                "Custom encoding detection failed for font {}: {}",
                font.getName() != null ? font.getName() : "Unknown",
                e.getMessage());
        return false; // Be forgiving on detection failure
    }
}
/**
 * Checks whether {@code font} can both encode {@code character} and report a positive width
 * for it.
 *
 * @param font font to probe; {@code null} yields {@code false}
 * @param character text to probe; {@code null} or empty yields {@code false}
 * @return {@code true} when encoding produces at least one byte and the string width is
 *     greater than zero; {@code false} otherwise, including when encoding or measuring
 *     throws {@link IOException} or {@link IllegalArgumentException}
 */
public static boolean fontSupportsCharacter(PDFont font, String character) {
    final boolean missingInput = font == null || character == null || character.isEmpty();
    if (missingInput) {
        return false;
    }
    try {
        // An empty byte sequence means the font produced no code for the character.
        if (font.encode(character).length == 0) {
            return false;
        }
        return font.getStringWidth(character) > 0;
    } catch (IOException | IllegalArgumentException e) {
        final String fontLabel = font.getName() != null ? font.getName() : "Unknown";
        log.debug(
                "Character '{}' not supported by font {}: {}",
                character,
                fontLabel,
                e.getMessage());
        return false;
    }
}
/**
 * Matches names that start with six upper-case ASCII letters followed by '+'. Compiled once
 * so repeated calls do not recompile the expression.
 */
private static final java.util.regex.Pattern SUBSET_FONT_NAME_PATTERN =
        java.util.regex.Pattern.compile("^[A-Z]{6}\\+.*");

/**
 * Tells whether {@code fontName} carries a subset tag, i.e. begins with exactly six
 * upper-case ASCII letters immediately followed by a plus sign (for example
 * {@code "ABCDEF+Arial"}).
 *
 * @param fontName font name to test; {@code null} yields {@code false}
 * @return {@code true} when the whole name matches the subset-tag pattern
 */
public static boolean isFontSubset(String fontName) {
    if (fontName == null) {
        return false;
    }
    return SUBSET_FONT_NAME_PATTERN.matcher(fontName).matches();
}
/**
 * Probes whether {@code font} can measure a handful of basic characters.
 *
 * <p>The width of a single space must be positive; after that, a positive width for any one
 * of the probe characters is enough.
 *
 * @param font font to probe; {@code null} yields {@code false}
 * @return {@code true} when the space width and at least one probe character width are
 *     positive; {@code false} otherwise, including when measuring the space throws
 */
public static boolean canCalculateBasicWidths(PDFont font) {
    if (font == null) {
        // Guard against an NPE that the handlers below would not catch.
        return false;
    }
    try {
        float spaceWidth = font.getStringWidth(" ");
        if (spaceWidth <= 0) {
            return false;
        }
        String[] testChars = {"a", "A", "0", ".", "e", "!"};
        for (String ch : testChars) {
            try {
                float width = font.getStringWidth(ch);
                if (width > 0) {
                    return true;
                }
            } catch (IOException | IllegalArgumentException ignored) {
                // Deliberately ignored: this probe character is unsupported, try the next.
            }
        }
        return false; // Can't calculate width for any test characters
    } catch (IOException | IllegalArgumentException e) {
        return false; // Font failed basic width calculation
    }
}
}

View File

@ -0,0 +1,140 @@
package stirling.software.SPDF.utils.text;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import lombok.extern.slf4j.Slf4j;
/**
 * Static helpers for text searching: font reliability probing, search-pattern construction
 * and page-level font analysis.
 */
@Slf4j
public class TextFinderUtils {

    /**
     * Decides whether {@code font} is usable enough to be considered reliable.
     *
     * <p>A damaged font is only logged, not rejected. The font is reliable when
     * {@code TextEncodingHelper.canCalculateBasicWidths} succeeds, or failing that, when
     * {@code TextEncodingHelper.canEncodeCharacters} succeeds for at least one of a small set
     * of basic characters.
     *
     * @param font font to check; {@code null} yields {@code false}
     * @return {@code true} when either probe succeeds
     */
    public static boolean validateFontReliability(org.apache.pdfbox.pdmodel.font.PDFont font) {
        if (font == null) {
            return false;
        }
        if (font.isDamaged()) {
            log.debug(
                    "Font {} is marked as damaged - using TextEncodingHelper validation",
                    font.getName());
        }
        if (TextEncodingHelper.canCalculateBasicWidths(font)) {
            log.debug(
                    "Font {} passed basic width calculations - considering reliable",
                    font.getName());
            return true;
        }
        String[] basicTests = {"1", "2", "3", "a", "A", "e", "E", " "};
        int workingChars = 0;
        for (String testChar : basicTests) {
            if (TextEncodingHelper.canEncodeCharacters(font, testChar)) {
                workingChars++;
            }
        }
        if (workingChars > 0) {
            log.debug(
                    "Font {} can process {}/{} basic characters - considering reliable",
                    font.getName(),
                    workingChars,
                    basicTests.length);
            return true;
        }
        log.debug("Font {} failed all basic tests - considering unreliable", font.getName());
        return false;
    }

    /**
     * Compiles one case-insensitive pattern per usable search term.
     *
     * <p>Terms that are {@code null} or blank after trimming are skipped. Terms whose pattern
     * fails to compile are logged at warn level and skipped.
     *
     * @param searchTerms terms to compile; {@code null} yields an empty list
     * @param useRegex when {@code true} the trimmed term is used as a regular expression,
     *     otherwise it is quoted and matched literally
     * @param wholeWordSearch when {@code true} the pattern is wrapped so it only matches
     *     between word boundaries
     * @return the compiled patterns, in iteration order of {@code searchTerms}
     */
    public static List<Pattern> createOptimizedSearchPatterns(
            Set<String> searchTerms, boolean useRegex, boolean wholeWordSearch) {
        List<Pattern> patterns = new ArrayList<>();
        if (searchTerms == null) {
            return patterns;
        }
        for (String term : searchTerms) {
            if (term == null || term.trim().isEmpty()) {
                continue;
            }
            String trimmedTerm = term.trim();
            try {
                String patternString = useRegex ? trimmedTerm : Pattern.quote(trimmedTerm);
                if (wholeWordSearch) {
                    patternString = applyWordBoundaries(trimmedTerm, patternString);
                }
                Pattern pattern =
                        Pattern.compile(
                                patternString, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
                patterns.add(pattern);
                log.debug("Created search pattern: '{}' -> '{}'", trimmedTerm, patternString);
            } catch (IllegalArgumentException e) {
                // Pattern.compile reports bad syntax via PatternSyntaxException, a subclass
                // of IllegalArgumentException; one bad term must not abort the others.
                log.warn("Failed to create pattern for term '{}': {}", term, e.getMessage());
            }
        }
        return patterns;
    }

    /**
     * Wraps {@code patternString} so it only matches as a whole word.
     *
     * <p>Single-character terms use explicit look-arounds on {@code \w}; longer terms use
     * {@code \b}. The pattern is first placed in a non-capturing group so that a top-level
     * alternation in a user-supplied regex (for example {@code cat|dog}) stays inside the
     * boundaries instead of splitting them between the alternatives.
     */
    private static String applyWordBoundaries(String originalTerm, String patternString) {
        String grouped = "(?:" + patternString + ")";
        if (originalTerm.length() == 1) {
            return "(?<![\\w])" + grouped + "(?![\\w])";
        }
        return "\\b" + grouped + "\\b";
    }

    /**
     * Reports whether more than half of the fonts on {@code page} fail
     * {@link #validateFontReliability}.
     *
     * <p>A font whose loading throws is counted toward the total but not as unusable.
     *
     * @param page page to analyse; {@code null} or a page without resources yields
     *     {@code false}
     * @return {@code true} when the unusable fonts form a strict majority; {@code false}
     *     otherwise, including when the analysis itself throws
     */
    public static boolean hasProblematicFonts(PDPage page) {
        if (page == null) {
            return false;
        }
        try {
            PDResources resources = page.getResources();
            if (resources == null) {
                return false;
            }
            int totalFonts = 0;
            int completelyUnusableFonts = 0;
            for (org.apache.pdfbox.cos.COSName fontName : resources.getFontNames()) {
                try {
                    org.apache.pdfbox.pdmodel.font.PDFont font = resources.getFont(fontName);
                    if (font != null) {
                        totalFonts++;
                        if (!validateFontReliability(font)) {
                            completelyUnusableFonts++;
                        }
                    }
                } catch (Exception e) {
                    log.debug("Font loading failed for {}: {}", fontName.getName(), e.getMessage());
                    totalFonts++;
                }
            }
            // Strict majority: with an even split the page is not flagged.
            boolean hasProblems = totalFonts > 0 && (completelyUnusableFonts * 2 > totalFonts);
            log.debug(
                    "Page font analysis: {}/{} fonts are completely unusable - page {} problematic",
                    completelyUnusableFonts,
                    totalFonts,
                    hasProblems ? "IS" : "is NOT");
            return hasProblems;
        } catch (Exception e) {
            log.warn("Font analysis failed for page: {}", e.getMessage());
            return false; // Be permissive if analysis fails
        }
    }
}

View File

@ -0,0 +1,136 @@
package stirling.software.SPDF.utils.text;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDFont;
import lombok.extern.slf4j.Slf4j;
/**
 * Computes rendered text widths for a font, with progressively coarser fallbacks when the
 * font cannot measure the text directly.
 */
@Slf4j
public class WidthCalculator {

    /** Divisor applied to raw font widths before multiplying by the font size. */
    private static final int FONT_SCALE_FACTOR = 1000;

    /**
     * Calculates the width of {@code text} at {@code fontSize}.
     *
     * <p>If the font cannot encode the text, the estimate from the bounding-box / average
     * width fallback is returned. Otherwise the font's own string width is used; if that
     * throws, widths are summed character by character.
     *
     * @param font font used to render the text
     * @param text text to measure
     * @param fontSize font size the width is scaled to
     * @return the scaled width, or {@code 0} when {@code font} or {@code text} is
     *     {@code null}, {@code text} is empty, or {@code fontSize} is not positive
     */
    public static float calculateAccurateWidth(PDFont font, String text, float fontSize) {
        if (font == null || text == null || text.isEmpty() || fontSize <= 0) {
            return 0;
        }
        if (!TextEncodingHelper.canEncodeCharacters(font, text)) {
            log.debug(
                    "Text cannot be encoded by font {}, using fallback width calculation",
                    font.getName());
            return calculateFallbackWidth(font, text, fontSize);
        }
        try {
            float rawWidth = font.getStringWidth(text);
            float scaledWidth = (rawWidth / FONT_SCALE_FACTOR) * fontSize;
            log.debug(
                    "Direct width calculation successful for font {}: {} -> {}",
                    font.getName(),
                    rawWidth,
                    scaledWidth);
            return scaledWidth;
        } catch (Exception e) {
            log.debug(
                    "Direct width calculation failed for font {}: {}",
                    font.getName(),
                    e.getMessage());
            return calculateWidthWithCharacterIteration(font, text, fontSize);
        }
    }

    /**
     * Sums the width of {@code text} one code point at a time.
     *
     * <p>Each code point is encoded on its own; a code point that cannot be encoded or
     * measured contributes the font's average width instead. Iterating by code point keeps
     * surrogate pairs together, and the character code is built from every encoded byte so
     * multi-byte encodings are not truncated to their first byte.
     */
    private static float calculateWidthWithCharacterIteration(
            PDFont font, String text, float fontSize) {
        try {
            float totalWidth = 0;
            int index = 0;
            while (index < text.length()) {
                int codePoint = text.codePointAt(index);
                index += Character.charCount(codePoint);
                String character = new String(Character.toChars(codePoint));
                try {
                    byte[] encoded = font.encode(character);
                    if (encoded.length > 0) {
                        // Combine the bytes big-endian; identical to the single byte value
                        // when the encoding is one byte long.
                        int glyphCode = 0;
                        for (byte encodedByte : encoded) {
                            glyphCode = (glyphCode << 8) | (encodedByte & 0xFF);
                        }
                        float glyphWidth = font.getWidth(glyphCode);
                        if (glyphWidth == 0) {
                            try {
                                glyphWidth = font.getWidthFromFont(glyphCode);
                            } catch (Exception e2) {
                                glyphWidth = font.getAverageFontWidth();
                            }
                        }
                        totalWidth += (glyphWidth / FONT_SCALE_FACTOR) * fontSize;
                    } else {
                        totalWidth += (font.getAverageFontWidth() / FONT_SCALE_FACTOR) * fontSize;
                    }
                } catch (Exception e2) {
                    totalWidth += (font.getAverageFontWidth() / FONT_SCALE_FACTOR) * fontSize;
                }
            }
            log.debug("Character iteration width calculation: {}", totalWidth);
            return totalWidth;
        } catch (Exception e) {
            log.debug("Character iteration failed: {}", e.getMessage());
            return calculateFallbackWidth(font, text, fontSize);
        }
    }

    /**
     * Estimates the width of {@code text} without measuring individual glyphs.
     *
     * <p>Uses 60% of the font bounding-box width per character when a bounding box is
     * available, otherwise the font's average width; if either lookup throws, half the font
     * size per character is used.
     */
    private static float calculateFallbackWidth(PDFont font, String text, float fontSize) {
        try {
            if (font.getFontDescriptor() != null
                    && font.getFontDescriptor().getFontBoundingBox() != null) {
                PDRectangle bbox = font.getFontDescriptor().getFontBoundingBox();
                float avgCharWidth =
                        bbox.getWidth() / FONT_SCALE_FACTOR * 0.6f; // Conservative estimate
                float fallbackWidth = text.length() * avgCharWidth * fontSize;
                log.debug("Bounding box fallback width: {}", fallbackWidth);
                return fallbackWidth;
            }
            float avgWidth = font.getAverageFontWidth();
            float fallbackWidth = (text.length() * avgWidth / FONT_SCALE_FACTOR) * fontSize;
            log.debug("Average width fallback: {}", fallbackWidth);
            return fallbackWidth;
        } catch (Exception e) {
            float conservativeWidth = text.length() * 0.5f * fontSize;
            log.debug(
                    "Conservative fallback width for font {}: {}",
                    font.getName(),
                    conservativeWidth);
            return conservativeWidth;
        }
    }

    /**
     * Reports whether widths computed for {@code font} can be trusted.
     *
     * @param font font to check; {@code null} yields {@code false}
     * @return {@code false} when the font is damaged, fails
     *     {@code TextEncodingHelper.canCalculateBasicWidths}, or is reported by
     *     {@code TextEncodingHelper.hasCustomEncoding} as custom-encoded; {@code true}
     *     otherwise
     */
    public static boolean isWidthCalculationReliable(PDFont font) {
        if (font == null) {
            return false;
        }
        if (font.isDamaged()) {
            log.debug("Font {} is damaged", font.getName());
            return false;
        }
        if (!TextEncodingHelper.canCalculateBasicWidths(font)) {
            log.debug("Font {} cannot perform basic width calculations", font.getName());
            return false;
        }
        if (TextEncodingHelper.hasCustomEncoding(font)) {
            log.debug("Font {} has custom encoding", font.getName());
            return false;
        }
        return true;
    }
}

View File

@ -1,7 +1,5 @@
package stirling.software.SPDF.pdf;
import static org.junit.jupiter.api.Assertions.*;
import java.io.IOException;
import java.util.List;
@ -12,6 +10,11 @@ import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
import org.junit.jupiter.api.AfterEach;
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Nested;
@ -468,6 +471,106 @@ class TextFinderTest {
}
}
@Nested
@DisplayName("Single Character and Digit Tests")
class SingleCharacterAndDigitTests {

    /**
     * Runs a literal (non-regex) search for {@code term} over the test document and returns
     * the matches the finder collected.
     */
    private List<PDFText> runLiteralSearch(String term, boolean wholeWord) throws IOException {
        TextFinder finder = new TextFinder(term, false, wholeWord);
        finder.getText(document);
        return finder.getFoundTexts();
    }

    @Test
    @DisplayName("Should find single digits in various contexts with whole word search")
    void findSingleDigitsWholeWord() throws IOException {
        addTextToPage("Item 1 of 5 costs $2.50. Order number: 1234. Reference: A1B.");

        List<PDFText> matches = runLiteralSearch("1", true);

        assertEquals(
                1,
                matches.size(),
                "Should find exactly one standalone '1', not the ones embedded in other numbers/codes");
        assertEquals("1", matches.get(0).getText());
    }

    @Test
    @DisplayName("Should find single digits without whole word search")
    void findSingleDigitsNoWholeWord() throws IOException {
        addTextToPage("Item 1 of 5 costs $2.50. Order number: 1234. Reference: A1B.");

        List<PDFText> matches = runLiteralSearch("1", false);

        assertTrue(
                matches.size() >= 3,
                "Should find multiple instances of '1' including standalone, in '1234', and in 'A1B'");
    }

    @Test
    @DisplayName("Should find single characters in various contexts")
    void findSingleCharacters() throws IOException {
        addTextToPage("Grade: A. Section B has item A-1. The letter A appears multiple times.");

        List<PDFText> matches = runLiteralSearch("A", true);

        assertTrue(matches.size() >= 2, "Should find multiple standalone 'A' characters");
        // Every hit must be exactly the searched letter.
        for (PDFText match : matches) {
            assertEquals("A", match.getText());
        }
    }

    @Test
    @DisplayName("Should handle digits at word boundaries correctly")
    void findDigitsAtWordBoundaries() throws IOException {
        addTextToPage("Numbers: 1, 2, 3. Code: 123. Version: 1.0. Item1 and Item2.");

        List<PDFText> onesFound = runLiteralSearch("1", true);
        assertEquals(
                1, onesFound.size(), "Should find only the standalone '1' at the beginning");

        List<PDFText> twosFound = runLiteralSearch("2", true);
        assertEquals(
                1, twosFound.size(), "Should find only the standalone '2' in the number list");
    }

    @Test
    @DisplayName("Should handle special characters and punctuation boundaries")
    void findDigitsWithPunctuationBoundaries() throws IOException {
        addTextToPage("Items: (1), [2], {3}, item#4, price$5, and 6%.");

        List<PDFText> matches = runLiteralSearch("1", true);

        assertEquals(1, matches.size(), "Should find '1' surrounded by parentheses");
        assertEquals("1", matches.get(0).getText());
    }

    @Test
    @DisplayName("Should handle edge case with spacing and formatting")
    void findDigitsWithSpacingIssues() throws IOException {
        addTextToPage("List: 1 , 2 , 3 and item 1 here.");

        List<PDFText> matches = runLiteralSearch("1", true);

        assertEquals(
                2, matches.size(), "Should find both '1' instances despite spacing variations");
    }
}
// Helper methods
private void addTextToPage(String text) throws IOException {
addTextToPage(page, text);