mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-09-08 17:51:20 +02:00
Improve RedactionService and TextDecodingHelper for better font handling and page-number parsing
Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
parent
48967f7061
commit
4cafb998f7
@ -4,10 +4,12 @@ import java.awt.Color;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayDeque;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.Deque;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
@ -33,7 +35,6 @@ import org.apache.pdfbox.pdmodel.PDResources;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.apache.pdfbox.pdmodel.common.PDStream;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType0Font;
|
||||
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
|
||||
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
|
||||
import org.apache.pdfbox.pdmodel.graphics.pattern.PDTilingPattern;
|
||||
@ -64,7 +65,6 @@ import stirling.software.common.util.PdfUtils;
|
||||
public class RedactionService {
|
||||
|
||||
private static final Pattern FUZZY_STRIP = Pattern.compile("[^a-z0-9]+");
|
||||
private static final Pattern PAGE_SPLIT = Pattern.compile("[,\\s]+");
|
||||
private static final float DEFAULT_TEXT_PADDING_MULTIPLIER = 0.6f;
|
||||
private static final float PRECISION_THRESHOLD = 1e-3f;
|
||||
private static final int FONT_SCALE_FACTOR = 1000;
|
||||
@ -75,38 +75,6 @@ public class RedactionService {
|
||||
private Map<Integer, List<AggressiveSegMatch>> aggressiveSegMatches = null;
|
||||
private final CustomPDFDocumentFactory pdfDocumentFactory;
|
||||
|
||||
private static PDFont getFontSafely(PDResources resources, COSName fontName) {
|
||||
if (resources == null || fontName == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
try {
|
||||
PDFont font = resources.getFont(fontName);
|
||||
if (font == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
try {
|
||||
String fontNameCheck = font.getName();
|
||||
if (fontNameCheck == null || fontNameCheck.trim().isEmpty()) {
|
||||
log.debug("Font {} has null or empty name, skipping", fontName.getName());
|
||||
return null;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.debug(
|
||||
"Error accessing font name for {}, skipping: {}",
|
||||
fontName.getName(),
|
||||
e.getMessage());
|
||||
return null;
|
||||
}
|
||||
|
||||
return font;
|
||||
} catch (Exception e) {
|
||||
log.debug("Error retrieving font {}: {}", fontName.getName(), e.getMessage());
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private static void redactAreas(
|
||||
List<RedactionArea> redactionAreas, PDDocument document, PDPageTree allPages)
|
||||
throws IOException {
|
||||
@ -161,10 +129,15 @@ public class RedactionService {
|
||||
ManualRedactPdfRequest request, PDDocument document, PDPageTree allPages)
|
||||
throws IOException {
|
||||
Color redactColor = decodeOrDefault(request.getPageRedactionColor());
|
||||
List<Integer> pageNumbers = getPageNumbers(request, allPages.getCount());
|
||||
String pageNumbers = request.getPageNumbers();
|
||||
|
||||
for (Integer pageNumber : pageNumbers) {
|
||||
PDPage page = allPages.get(pageNumber);
|
||||
List<Integer> pageNumberList = parsePageNumbers(pageNumbers);
|
||||
|
||||
for (Integer pageNumber : pageNumberList) {
|
||||
if (pageNumber <= 0 || pageNumber > allPages.getCount()) {
|
||||
continue; // Skip invalid page numbers
|
||||
}
|
||||
PDPage page = allPages.get(pageNumber - 1); // Convert to 0-based index
|
||||
try (PDPageContentStream contentStream =
|
||||
new PDPageContentStream(
|
||||
document, page, PDPageContentStream.AppendMode.APPEND, true, true)) {
|
||||
@ -176,6 +149,39 @@ public class RedactionService {
|
||||
}
|
||||
}
|
||||
|
||||
private static List<Integer> parsePageNumbers(String pageNumbers) {
|
||||
if (pageNumbers == null || pageNumbers.trim().isEmpty()) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
List<Integer> result = new ArrayList<>();
|
||||
String[] parts = pageNumbers.split(",");
|
||||
|
||||
for (String part : parts) {
|
||||
part = part.trim();
|
||||
if (part.contains("-")) {
|
||||
String[] range = part.split("-");
|
||||
if (range.length == 2) {
|
||||
try {
|
||||
int start = Integer.parseInt(range[0].trim());
|
||||
int end = Integer.parseInt(range[1].trim());
|
||||
for (int i = start; i <= end; i++) {
|
||||
result.add(i);
|
||||
}
|
||||
} catch (NumberFormatException ignored) {
|
||||
}
|
||||
}
|
||||
} else {
|
||||
try {
|
||||
result.add(Integer.parseInt(part));
|
||||
} catch (NumberFormatException ignored) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
private static Color decodeOrDefault(String hex) {
|
||||
if (hex == null) {
|
||||
return Color.BLACK;
|
||||
@ -188,41 +194,6 @@ public class RedactionService {
|
||||
}
|
||||
}
|
||||
|
||||
private static List<Integer> getPageNumbers(ManualRedactPdfRequest request, int pagesCount) {
|
||||
String pageNumbersInput = request.getPageNumbers();
|
||||
String[] parts =
|
||||
(pageNumbersInput != null) ? PAGE_SPLIT.split(pageNumbersInput) : new String[0];
|
||||
List<Integer> pageNumbers = new ArrayList<>();
|
||||
if (parts.length == 0 || parts[0].isEmpty()) {
|
||||
return pageNumbers;
|
||||
}
|
||||
for (String token : parts) {
|
||||
if (token.contains("-")) {
|
||||
String[] range = token.split("-");
|
||||
if (range.length == 2) {
|
||||
int start = Integer.parseInt(range[0]);
|
||||
int end = Integer.parseInt(range[1]);
|
||||
if (start > 0 && end > 0 && start <= end) {
|
||||
for (int i = start; i <= end; i++) {
|
||||
if (i <= pagesCount) {
|
||||
pageNumbers.add(i - 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
try {
|
||||
int num = Integer.parseInt(token);
|
||||
if (num > 0 && num <= pagesCount) {
|
||||
pageNumbers.add(num - 1);
|
||||
}
|
||||
} catch (NumberFormatException ignored) {
|
||||
}
|
||||
}
|
||||
}
|
||||
return pageNumbers;
|
||||
}
|
||||
|
||||
private static void redactFoundText(
|
||||
PDDocument document, List<PDFText> blocks, float customPadding, Color redactColor)
|
||||
throws IOException {
|
||||
@ -363,11 +334,9 @@ public class RedactionService {
|
||||
Color redactColor = decodeOrDefault(colorString);
|
||||
redactFoundText(document, allFoundTexts, customPadding, redactColor);
|
||||
}
|
||||
cleanDocumentMetadata(document);
|
||||
}
|
||||
if (Boolean.TRUE.equals(convertToImage)) {
|
||||
try (PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document)) {
|
||||
cleanDocumentMetadata(convertedPdf);
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
convertedPdf.save(baos);
|
||||
return baos.toByteArray();
|
||||
@ -378,22 +347,6 @@ public class RedactionService {
|
||||
return baos.toByteArray();
|
||||
}
|
||||
|
||||
private static void cleanDocumentMetadata(PDDocument document) {
|
||||
try {
|
||||
var info = document.getDocumentInformation();
|
||||
if (info != null) {
|
||||
info.setAuthor(null);
|
||||
info.setSubject(null);
|
||||
info.setKeywords(null);
|
||||
info.setModificationDate(java.util.Calendar.getInstance());
|
||||
}
|
||||
if (document.getDocumentCatalog() != null) {
|
||||
document.getDocumentCatalog().setMetadata(null);
|
||||
}
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
}
|
||||
|
||||
private static String normalizeForFuzzy(String s) {
|
||||
if (s == null) {
|
||||
return "";
|
||||
@ -445,64 +398,6 @@ public class RedactionService {
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private static DecodedMapping buildDecodeMapping(PDFont font, byte[] bytes) {
|
||||
DecodedMapping map = new DecodedMapping();
|
||||
if (font == null || bytes == null) {
|
||||
map.text = "";
|
||||
map.charByteStart = new int[0];
|
||||
map.charByteEnd = new int[0];
|
||||
return map;
|
||||
}
|
||||
StringBuilder sb = new StringBuilder();
|
||||
List<Integer> starts = new ArrayList<>();
|
||||
List<Integer> ends = new ArrayList<>();
|
||||
int i = 0;
|
||||
boolean isType0 = font instanceof PDType0Font;
|
||||
while (i < bytes.length) {
|
||||
int b1 = bytes[i] & 0xFF;
|
||||
String u = null;
|
||||
int consumed = 1;
|
||||
try {
|
||||
if (isType0 && i + 1 < bytes.length) {
|
||||
int b2 = bytes[i + 1] & 0xFF;
|
||||
int code = (b1 << 8) | b2;
|
||||
String u2 = null;
|
||||
try {
|
||||
u2 = font.toUnicode(code);
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
if (u2 != null) {
|
||||
u = u2;
|
||||
consumed = 2;
|
||||
}
|
||||
}
|
||||
if (u == null) {
|
||||
try {
|
||||
u = font.toUnicode(b1);
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
if (u == null) {
|
||||
u = "?";
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
u = "?";
|
||||
}
|
||||
int start = i;
|
||||
int end = i + consumed;
|
||||
for (int k = 0; k < u.length(); k++) {
|
||||
sb.append(u.charAt(k));
|
||||
starts.add(start);
|
||||
ends.add(end);
|
||||
}
|
||||
i += consumed;
|
||||
}
|
||||
map.text = sb.toString();
|
||||
map.charByteStart = starts.stream().mapToInt(Integer::intValue).toArray();
|
||||
map.charByteEnd = ends.stream().mapToInt(Integer::intValue).toArray();
|
||||
return map;
|
||||
}
|
||||
|
||||
private static void performFallbackModification(
|
||||
List<Object> tokens, int tokenIndex, String newText) {
|
||||
try {
|
||||
@ -520,7 +415,7 @@ public class RedactionService {
|
||||
for (COSBase element : originalArray) {
|
||||
if (element instanceof COSString cosString) {
|
||||
byte[] bytes = cosString.getBytes();
|
||||
DecodedMapping dm = buildDecodeMapping(font, bytes);
|
||||
DecodedMapping dm = TextDecodingHelper.buildDecodeMapping(font, bytes);
|
||||
int decodedLen = dm.text.length();
|
||||
if (decodedLen == 0 || dm.charByteStart.length == 0) {
|
||||
newArray.add(element);
|
||||
@ -576,8 +471,9 @@ public class RedactionService {
|
||||
&& newTokens.get(i - 1) instanceof COSString) {
|
||||
newTokens.set(i - 1, EMPTY_COS_STRING);
|
||||
modifications++;
|
||||
} else if ("TJ".equals(name) && i > 0 && newTokens.get(i - 1) instanceof COSArray) {
|
||||
COSArray arr = (COSArray) newTokens.get(i - 1);
|
||||
} else if ("TJ".equals(name)
|
||||
&& i > 0
|
||||
&& newTokens.get(i - 1) instanceof COSArray arr) {
|
||||
COSArray newArr = new COSArray();
|
||||
for (int j = 0; j < arr.size(); j++) {
|
||||
COSBase el = arr.get(j);
|
||||
@ -717,7 +613,7 @@ public class RedactionService {
|
||||
|
||||
private static int processSemanticTokens(List<Object> tokens, boolean removeTU) {
|
||||
int modifications = 0;
|
||||
java.util.Stack<Integer> markedContentStack = new java.util.Stack<>();
|
||||
Deque<Integer> markedContentStack = new ArrayDeque<>();
|
||||
|
||||
for (int i = 0; i < tokens.size(); i++) {
|
||||
Object t = tokens.get(i);
|
||||
@ -784,37 +680,12 @@ public class RedactionService {
|
||||
}
|
||||
}
|
||||
|
||||
private COSString redactCosStringByDecodedRanges(
|
||||
PDFont font, COSString cosString, List<AggressiveSegMatch> decRanges) {
|
||||
try {
|
||||
byte[] bytes = cosString.getBytes();
|
||||
DecodedMapping dm = buildDecodeMapping(font, bytes);
|
||||
if (dm.text.isEmpty() || dm.charByteStart.length == 0) {
|
||||
return cosString;
|
||||
}
|
||||
boolean[] delete = new boolean[bytes.length];
|
||||
for (AggressiveSegMatch r : decRanges) {
|
||||
int ds = Math.max(0, Math.min(r.decodedStart, dm.charByteStart.length));
|
||||
int de = Math.max(ds, Math.min(r.decodedEnd, dm.charByteStart.length));
|
||||
if (ds >= de) {
|
||||
continue;
|
||||
}
|
||||
int byteStart = dm.charByteStart[ds];
|
||||
int byteEnd = dm.charByteEnd[de - 1];
|
||||
for (int bi = Math.max(0, byteStart); bi < Math.min(bytes.length, byteEnd); bi++) {
|
||||
delete[bi] = true;
|
||||
}
|
||||
}
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream(bytes.length);
|
||||
for (int bi = 0; bi < bytes.length; bi++) {
|
||||
if (!delete[bi]) {
|
||||
baos.write(bytes[bi]);
|
||||
}
|
||||
}
|
||||
return new COSString(baos.toByteArray());
|
||||
} catch (Exception e) {
|
||||
return this.aggressiveMode ? EMPTY_COS_STRING : cosString;
|
||||
}
|
||||
private static String createSubsetFontPlaceholder(
|
||||
String originalWord, float targetWidth, PDFont font, float fontSize) {
|
||||
String result = createAlternativePlaceholder(originalWord, targetWidth, font, fontSize);
|
||||
return result != null
|
||||
? result
|
||||
: " ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1));
|
||||
}
|
||||
|
||||
public void performTextReplacementAggressive(
|
||||
@ -904,15 +775,7 @@ public class RedactionService {
|
||||
return index >= 0 && index < tokens.size();
|
||||
}
|
||||
|
||||
private String createSubsetFontPlaceholder(
|
||||
String originalWord, float targetWidth, PDFont font, float fontSize) {
|
||||
String result = createAlternativePlaceholder(originalWord, targetWidth, font, fontSize);
|
||||
return result != null
|
||||
? result
|
||||
: " ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1));
|
||||
}
|
||||
|
||||
private String buildCompleteText(List<TextSegment> segments) {
|
||||
private static String buildCompleteText(List<TextSegment> segments) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (TextSegment segment : segments) {
|
||||
sb.append(segment.text);
|
||||
@ -920,6 +783,14 @@ public class RedactionService {
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
private static boolean isProperFontSubset(String fontName) {
|
||||
if (fontName.length() < 7) return false;
|
||||
for (int i = 0; i < 6; i++) {
|
||||
if (fontName.charAt(i) < 'A' || fontName.charAt(i) > 'Z') return false;
|
||||
}
|
||||
return fontName.charAt(6) == '+';
|
||||
}
|
||||
|
||||
List<Object> createTokensWithoutTargetText(
|
||||
PDDocument document,
|
||||
PDPage page,
|
||||
@ -1006,50 +877,16 @@ public class RedactionService {
|
||||
return extractTextSegmentsFromTokens(page.getResources(), tokens, aggressive);
|
||||
}
|
||||
|
||||
private List<TextSegment> extractTextSegmentsFromTokens(
|
||||
PDResources resources, List<Object> tokens, boolean aggressive) {
|
||||
List<TextSegment> segments = new ArrayList<>();
|
||||
int currentTextPos = 0;
|
||||
GraphicsState gs = new GraphicsState();
|
||||
for (int i = 0; i < tokens.size(); i++) {
|
||||
Object currentToken = tokens.get(i);
|
||||
if (currentToken instanceof Operator op) {
|
||||
String opName = op.getName();
|
||||
if ("Tf".equals(opName) && i >= 2) {
|
||||
try {
|
||||
COSName fontName = (COSName) tokens.get(i - 2);
|
||||
COSBase fontSizeBase = (COSBase) tokens.get(i - 1);
|
||||
if (fontSizeBase instanceof COSNumber cosNumber) {
|
||||
PDFont safeFont = getFontSafely(resources, fontName);
|
||||
gs.setFont(safeFont);
|
||||
gs.setFontSize(cosNumber.floatValue());
|
||||
}
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
}
|
||||
if (isTextShowingOperator(opName) && i > 0) {
|
||||
String textContent = extractTextFromToken(tokens.get(i - 1), opName, gs.font);
|
||||
if (textContent != null && !textContent.trim().isEmpty()) {
|
||||
if (aggressive
|
||||
&& gs.font != null
|
||||
&& tokens.get(i - 1) instanceof COSString cs) {
|
||||
TextDecodingHelper.tryDecodeWithFontEnhanced(gs.font, cs);
|
||||
}
|
||||
segments.add(
|
||||
new TextSegment(
|
||||
i - 1,
|
||||
opName,
|
||||
textContent,
|
||||
currentTextPos,
|
||||
currentTextPos + textContent.length(),
|
||||
gs.font,
|
||||
gs.fontSize));
|
||||
currentTextPos += textContent.length();
|
||||
}
|
||||
}
|
||||
}
|
||||
private static boolean hasReliableWidthMetrics(PDFont font) {
|
||||
try {
|
||||
String testString = "AbCdEf123";
|
||||
float width1 = font.getStringWidth(testString);
|
||||
float width2 = calculateCharacterSumWidth(font, testString);
|
||||
if (width1 <= 0 || width2 <= 0) return false;
|
||||
return Math.abs(width1 - width2) / Math.max(width1, width2) < 0.05f;
|
||||
} catch (Exception e) {
|
||||
return false;
|
||||
}
|
||||
return segments;
|
||||
}
|
||||
|
||||
private static String sanitizeText(String text) {
|
||||
@ -1393,23 +1230,47 @@ public class RedactionService {
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isProperFontSubset(String fontName) {
|
||||
if (fontName.length() < 7) return false;
|
||||
for (int i = 0; i < 6; i++) {
|
||||
if (fontName.charAt(i) < 'A' || fontName.charAt(i) > 'Z') return false;
|
||||
static String createPlaceholderWithFont(String originalWord, PDFont font) {
|
||||
if (originalWord == null || originalWord.isEmpty()) return " ";
|
||||
|
||||
final String repeat = " ".repeat(Math.max(1, originalWord.length()));
|
||||
if (font != null && TextEncodingHelper.isFontSubset(font.getName())) {
|
||||
try {
|
||||
float originalWidth =
|
||||
WidthCalculator.calculateAccurateWidth(font, originalWord, 1.0f);
|
||||
String result =
|
||||
createAlternativePlaceholder(originalWord, originalWidth, font, 1.0f);
|
||||
return result != null ? result : repeat;
|
||||
} catch (Exception e) {
|
||||
return repeat;
|
||||
}
|
||||
}
|
||||
return fontName.charAt(6) == '+';
|
||||
|
||||
return repeat;
|
||||
}
|
||||
|
||||
private boolean hasReliableWidthMetrics(PDFont font) {
|
||||
private static TokenModificationResult convertToTJWithAdjustment(
|
||||
List<Object> tokens,
|
||||
int tokenIndex,
|
||||
String originalOperator,
|
||||
String newText,
|
||||
float adjustment,
|
||||
TextSegment segment) {
|
||||
try {
|
||||
String testString = "AbCdEf123";
|
||||
float width1 = font.getStringWidth(testString);
|
||||
float width2 = calculateCharacterSumWidth(font, testString);
|
||||
if (width1 <= 0 || width2 <= 0) return false;
|
||||
return Math.abs(width1 - width2) / Math.max(width1, width2) < 0.05f;
|
||||
COSArray newArray = new COSArray();
|
||||
newArray.add(new COSString(newText));
|
||||
|
||||
if (segment.getFontSize() > 0) {
|
||||
float kerning = (-adjustment / segment.getFontSize()) * FONT_SCALE_FACTOR;
|
||||
if (Math.abs(kerning) <= 10000f) {
|
||||
newArray.add(new COSFloat(kerning));
|
||||
}
|
||||
}
|
||||
|
||||
tokens.set(tokenIndex, newArray);
|
||||
return updateOperatorSafely(tokens, tokenIndex, originalOperator);
|
||||
} catch (Exception e) {
|
||||
return false;
|
||||
return TokenModificationResult.failure("TJ conversion failed: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
@ -1450,24 +1311,36 @@ public class RedactionService {
|
||||
}
|
||||
}
|
||||
|
||||
private WidthMeasurement measureTextWidth(PDFont font, String text, float fontSize) {
|
||||
private static String createAlternativePlaceholder(
|
||||
String originalWord, float targetWidth, PDFont font, float fontSize) {
|
||||
final String repeat =
|
||||
" ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1));
|
||||
try {
|
||||
float fontUnits = safeGetStringWidth(font, text);
|
||||
if (fontUnits < 0) return WidthMeasurement.invalid();
|
||||
|
||||
float actualWidth = (fontUnits / FONT_SCALE_FACTOR) * fontSize;
|
||||
float characterSumWidth = calculateCharacterSumWidth(font, text);
|
||||
|
||||
if (characterSumWidth > 0) {
|
||||
float characterActualWidth = (characterSumWidth / FONT_SCALE_FACTOR) * fontSize;
|
||||
if (Math.abs(actualWidth - characterActualWidth) / actualWidth > 0.1f) {
|
||||
actualWidth = Math.max(actualWidth, characterActualWidth);
|
||||
String[] alternatives = {" ", ".", "-", "_", "~", "°", "·"};
|
||||
if (TextEncodingHelper.fontSupportsCharacter(font, " ")) {
|
||||
float spaceWidth = WidthCalculator.calculateAccurateWidth(font, " ", fontSize);
|
||||
if (spaceWidth > 0) {
|
||||
int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth));
|
||||
int maxSpaces = originalWord.length() * 2;
|
||||
return " ".repeat(Math.min(spaceCount, maxSpaces));
|
||||
}
|
||||
}
|
||||
|
||||
return new WidthMeasurement(actualWidth, true);
|
||||
for (String alt : alternatives) {
|
||||
if (" ".equals(alt)) continue;
|
||||
try {
|
||||
if (!TextEncodingHelper.fontSupportsCharacter(font, alt)) continue;
|
||||
float cw = WidthCalculator.calculateAccurateWidth(font, alt, fontSize);
|
||||
if (cw > 0) {
|
||||
int count = Math.max(1, Math.round(targetWidth / cw));
|
||||
int max = originalWord.length() * 2;
|
||||
return " ".repeat(Math.min(count, max));
|
||||
}
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
}
|
||||
return repeat;
|
||||
} catch (Exception e) {
|
||||
return WidthMeasurement.invalid();
|
||||
return repeat;
|
||||
}
|
||||
}
|
||||
|
||||
@ -1616,24 +1489,37 @@ public class RedactionService {
|
||||
}
|
||||
}
|
||||
|
||||
String createPlaceholderWithFont(String originalWord, PDFont font) {
|
||||
if (originalWord == null || originalWord.isEmpty()) return " ";
|
||||
|
||||
final String repeat = " ".repeat(Math.max(1, originalWord.length()));
|
||||
if (font != null && TextEncodingHelper.isFontSubset(font.getName())) {
|
||||
try {
|
||||
// Use helper to get accurate width at fontSize=1.0
|
||||
float originalWidth =
|
||||
WidthCalculator.calculateAccurateWidth(font, originalWord, 1.0f);
|
||||
String result =
|
||||
createAlternativePlaceholder(originalWord, originalWidth, font, 1.0f);
|
||||
return result != null ? result : repeat;
|
||||
} catch (Exception e) {
|
||||
return repeat;
|
||||
private COSString redactCosStringByDecodedRanges(
|
||||
PDFont font, COSString cosString, List<AggressiveSegMatch> decRanges) {
|
||||
try {
|
||||
byte[] bytes = cosString.getBytes();
|
||||
DecodedMapping dm = TextDecodingHelper.buildDecodeMapping(font, bytes);
|
||||
if (dm.text.isEmpty() || dm.charByteStart.length == 0) {
|
||||
return cosString;
|
||||
}
|
||||
boolean[] delete = new boolean[bytes.length];
|
||||
for (AggressiveSegMatch r : decRanges) {
|
||||
int ds = Math.max(0, Math.min(r.decodedStart, dm.charByteStart.length));
|
||||
int de = Math.max(ds, Math.min(r.decodedEnd, dm.charByteStart.length));
|
||||
if (ds >= de) {
|
||||
continue;
|
||||
}
|
||||
int byteStart = dm.charByteStart[ds];
|
||||
int byteEnd = dm.charByteEnd[de - 1];
|
||||
for (int bi = Math.max(0, byteStart); bi < Math.min(bytes.length, byteEnd); bi++) {
|
||||
delete[bi] = true;
|
||||
}
|
||||
}
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream(bytes.length);
|
||||
for (int bi = 0; bi < bytes.length; bi++) {
|
||||
if (!delete[bi]) {
|
||||
baos.write(bytes[bi]);
|
||||
}
|
||||
}
|
||||
return new COSString(baos.toByteArray());
|
||||
} catch (Exception e) {
|
||||
return this.aggressiveMode ? EMPTY_COS_STRING : cosString;
|
||||
}
|
||||
|
||||
return repeat;
|
||||
}
|
||||
|
||||
private TokenModificationResult performTokenModification(
|
||||
@ -1724,61 +1610,71 @@ public class RedactionService {
|
||||
}
|
||||
}
|
||||
|
||||
private TokenModificationResult convertToTJWithAdjustment(
|
||||
List<Object> tokens,
|
||||
int tokenIndex,
|
||||
String originalOperator,
|
||||
String newText,
|
||||
float adjustment,
|
||||
TextSegment segment) {
|
||||
try {
|
||||
COSArray newArray = new COSArray();
|
||||
newArray.add(new COSString(newText));
|
||||
|
||||
if (segment.getFontSize() > 0) {
|
||||
float kerning = (-adjustment / segment.getFontSize()) * FONT_SCALE_FACTOR;
|
||||
if (Math.abs(kerning) <= 10000f) {
|
||||
newArray.add(new COSFloat(kerning));
|
||||
private List<TextSegment> extractTextSegmentsFromTokens(
|
||||
PDResources resources, List<Object> tokens, boolean aggressive) {
|
||||
List<TextSegment> segments = new ArrayList<>();
|
||||
int currentTextPos = 0;
|
||||
GraphicsState gs = new GraphicsState();
|
||||
for (int i = 0; i < tokens.size(); i++) {
|
||||
Object currentToken = tokens.get(i);
|
||||
if (currentToken instanceof Operator op) {
|
||||
String opName = op.getName();
|
||||
if ("Tf".equals(opName) && i >= 2) {
|
||||
try {
|
||||
COSName fontName = (COSName) tokens.get(i - 2);
|
||||
COSBase fontSizeBase = (COSBase) tokens.get(i - 1);
|
||||
if (fontSizeBase instanceof COSNumber cosNumber) {
|
||||
PDFont safeFont = TextDecodingHelper.getFontSafely(resources, fontName);
|
||||
gs.setFont(safeFont);
|
||||
gs.setFontSize(cosNumber.floatValue());
|
||||
}
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
}
|
||||
if (isTextShowingOperator(opName) && i > 0) {
|
||||
String textContent = extractTextFromToken(tokens.get(i - 1), opName, gs.font);
|
||||
if (textContent != null && !textContent.trim().isEmpty()) {
|
||||
if (aggressive
|
||||
&& gs.font != null
|
||||
&& tokens.get(i - 1) instanceof COSString cs) {
|
||||
TextDecodingHelper.tryDecodeWithFontEnhanced(gs.font, cs);
|
||||
}
|
||||
segments.add(
|
||||
new TextSegment(
|
||||
i - 1,
|
||||
opName,
|
||||
textContent,
|
||||
currentTextPos,
|
||||
currentTextPos + textContent.length(),
|
||||
gs.font,
|
||||
gs.fontSize));
|
||||
currentTextPos += textContent.length();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tokens.set(tokenIndex, newArray);
|
||||
return updateOperatorSafely(tokens, tokenIndex, originalOperator);
|
||||
} catch (Exception e) {
|
||||
return TokenModificationResult.failure("TJ conversion failed: " + e.getMessage());
|
||||
}
|
||||
return segments;
|
||||
}
|
||||
|
||||
private String createAlternativePlaceholder(
|
||||
String originalWord, float targetWidth, PDFont font, float fontSize) {
|
||||
final String repeat =
|
||||
" ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1));
|
||||
private WidthMeasurement measureTextWidth(PDFont font, String text, float fontSize) {
|
||||
try {
|
||||
String[] alternatives = {" ", ".", "-", "_", "~", "°", "·"};
|
||||
if (TextEncodingHelper.fontSupportsCharacter(font, " ")) {
|
||||
float spaceWidth = WidthCalculator.calculateAccurateWidth(font, " ", fontSize);
|
||||
if (spaceWidth > 0) {
|
||||
int spaceCount = Math.max(1, Math.round(targetWidth / spaceWidth));
|
||||
int maxSpaces = originalWord.length() * 2;
|
||||
return " ".repeat(Math.min(spaceCount, maxSpaces));
|
||||
float fontUnits = safeGetStringWidth(font, text);
|
||||
if (fontUnits < 0) return WidthMeasurement.invalid();
|
||||
|
||||
float actualWidth = (fontUnits / FONT_SCALE_FACTOR) * fontSize;
|
||||
float characterSumWidth = calculateCharacterSumWidth(font, text);
|
||||
|
||||
if (characterSumWidth > 0) {
|
||||
float characterActualWidth = (characterSumWidth / FONT_SCALE_FACTOR) * fontSize;
|
||||
if (actualWidth != 0
|
||||
&& Math.abs(actualWidth - characterActualWidth) / actualWidth > 0.1f) {
|
||||
actualWidth = Math.max(actualWidth, characterActualWidth);
|
||||
}
|
||||
}
|
||||
for (String alt : alternatives) {
|
||||
if (" ".equals(alt)) continue;
|
||||
try {
|
||||
if (!TextEncodingHelper.fontSupportsCharacter(font, alt)) continue;
|
||||
float cw = WidthCalculator.calculateAccurateWidth(font, alt, fontSize);
|
||||
if (cw > 0) {
|
||||
int count = Math.max(1, Math.round(targetWidth / cw));
|
||||
int max = originalWord.length() * 2;
|
||||
return " ".repeat(Math.min(count, max));
|
||||
}
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
}
|
||||
return repeat;
|
||||
|
||||
return new WidthMeasurement(actualWidth, true);
|
||||
} catch (Exception e) {
|
||||
return repeat;
|
||||
return WidthMeasurement.invalid();
|
||||
}
|
||||
}
|
||||
|
||||
@ -1876,13 +1772,7 @@ public class RedactionService {
|
||||
int gStart = idx;
|
||||
int gEnd = idx + w.length();
|
||||
mapStartToEnd(
|
||||
(List<TextSegment>) segments,
|
||||
(List<MatchRange>) result,
|
||||
(Map<Integer, List<AggressiveSegMatch>>) perSegMatches,
|
||||
decStarts,
|
||||
decEnds,
|
||||
gStart,
|
||||
gEnd);
|
||||
segments, result, perSegMatches, decStarts, decEnds, gStart, gEnd);
|
||||
idx = lower.indexOf(w, idx + 1);
|
||||
}
|
||||
}
|
||||
@ -2083,7 +1973,7 @@ public class RedactionService {
|
||||
segment.getFont(),
|
||||
segment.getFontSize());
|
||||
} catch (Exception e) {
|
||||
return "█".repeat(Math.max(1, originalText.length()));
|
||||
return " ".repeat(Math.max(1, originalText.length()));
|
||||
}
|
||||
}
|
||||
|
||||
@ -2321,11 +2211,6 @@ public class RedactionService {
|
||||
this.processedMatches = processedMatches;
|
||||
this.warnings = new ArrayList<>(warnings);
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> warnings() {
|
||||
return new ArrayList<>(warnings);
|
||||
}
|
||||
}
|
||||
|
||||
private void processFormXObject(
|
||||
@ -2380,12 +2265,8 @@ public class RedactionService {
|
||||
private static class TokenModificationResult {
|
||||
@Getter private final boolean success;
|
||||
|
||||
@SuppressWarnings("unused")
|
||||
private final String errorMessage;
|
||||
|
||||
private TokenModificationResult(boolean success, String errorMessage) {
|
||||
this.success = success;
|
||||
this.errorMessage = errorMessage;
|
||||
}
|
||||
|
||||
public static TokenModificationResult success() {
|
||||
@ -2440,10 +2321,10 @@ public class RedactionService {
|
||||
}
|
||||
|
||||
@Data
|
||||
private static class DecodedMapping {
|
||||
String text;
|
||||
int[] charByteStart;
|
||||
int[] charByteEnd;
|
||||
public static class DecodedMapping {
|
||||
public String text;
|
||||
public int[] charByteStart;
|
||||
public int[] charByteEnd;
|
||||
}
|
||||
|
||||
@Data
|
||||
|
@ -4,14 +4,19 @@ import java.nio.ByteBuffer;
|
||||
import java.nio.CharBuffer;
|
||||
import java.nio.charset.CharsetDecoder;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.cos.COSString;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType0Font;
|
||||
import org.apache.pdfbox.pdmodel.PDResources;
|
||||
import org.apache.pdfbox.pdmodel.font.*;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import stirling.software.SPDF.service.RedactionService;
|
||||
|
||||
@Slf4j
|
||||
@UtilityClass
|
||||
public class TextDecodingHelper {
|
||||
@ -21,6 +26,38 @@ public class TextDecodingHelper {
|
||||
private final int EXTENDED_ASCII_LOWER_BOUND = 160;
|
||||
private final int EXTENDED_ASCII_UPPER_BOUND = 255;
|
||||
|
||||
public PDFont getFontSafely(PDResources resources, COSName fontName) {
|
||||
if (resources == null || fontName == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
try {
|
||||
PDFont font = resources.getFont(fontName);
|
||||
if (font == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
try {
|
||||
String fontNameCheck = font.getName();
|
||||
if (fontNameCheck == null || fontNameCheck.trim().isEmpty()) {
|
||||
log.debug("Font {} has null or empty name, skipping", fontName.getName());
|
||||
return null;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.debug(
|
||||
"Error accessing font name for {}, skipping: {}",
|
||||
fontName.getName(),
|
||||
e.getMessage());
|
||||
return null;
|
||||
}
|
||||
|
||||
return font;
|
||||
} catch (Exception e) {
|
||||
log.debug("Error retrieving font {}: {}", fontName.getName(), e.getMessage());
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public void tryDecodeWithFontEnhanced(PDFont font, COSString cosString) {
|
||||
if (font == null || cosString == null) {
|
||||
return;
|
||||
@ -229,4 +266,221 @@ public class TextDecodingHelper {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public static RedactionService.DecodedMapping buildDecodeMapping(PDFont font, byte[] bytes) {
|
||||
RedactionService.DecodedMapping map = new RedactionService.DecodedMapping();
|
||||
if (font == null || bytes == null) {
|
||||
map.text = "";
|
||||
map.charByteStart = new int[0];
|
||||
map.charByteEnd = new int[0];
|
||||
return map;
|
||||
}
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
List<Integer> starts = new ArrayList<>();
|
||||
List<Integer> ends = new ArrayList<>();
|
||||
int i = 0;
|
||||
|
||||
// Determine font type and encoding characteristics
|
||||
boolean isType0 = font instanceof PDType0Font;
|
||||
boolean isType1 = font instanceof PDType1Font;
|
||||
boolean isType3 = font instanceof PDType3Font;
|
||||
boolean isTrueType = font instanceof PDTrueTypeFont;
|
||||
|
||||
while (i < bytes.length) {
|
||||
int start = i;
|
||||
String decodedChar = null;
|
||||
int consumed = 1;
|
||||
|
||||
try {
|
||||
if (isType0) {
|
||||
// Handle CID fonts and multi-byte encodings
|
||||
decodedChar = decodeType0Font((PDType0Font) font, bytes, i);
|
||||
consumed = getType0CharLength((PDType0Font) font, bytes, i);
|
||||
} else if (isType1) {
|
||||
// Handle Type1 fonts with specific encoding
|
||||
decodedChar = decodeType1Font((PDType1Font) font, bytes, i);
|
||||
consumed = getType1CharLength((PDType1Font) font, bytes, i);
|
||||
} else if (isType3) {
|
||||
// Handle Type3 bitmap fonts
|
||||
decodedChar = decodeType3Font((PDType3Font) font, bytes, i);
|
||||
consumed = 1; // Type3 typically single byte
|
||||
} else if (isTrueType) {
|
||||
// Handle TrueType fonts
|
||||
decodedChar = decodeTrueTypeFont((PDTrueTypeFont) font, bytes, i);
|
||||
consumed = getTrueTypeCharLength((PDTrueTypeFont) font, bytes, i);
|
||||
} else {
|
||||
// Generic fallback for other font types
|
||||
decodedChar = decodeGenericFont(font, bytes, i);
|
||||
consumed = getGenericCharLength(font, bytes, i);
|
||||
}
|
||||
|
||||
// Validate the consumed length
|
||||
if (consumed <= 0 || i + consumed > bytes.length) {
|
||||
consumed = 1;
|
||||
}
|
||||
|
||||
} catch (Exception e) {
|
||||
// Log the error for debugging purposes
|
||||
System.err.println(
|
||||
"Error decoding character at position " + i + ": " + e.getMessage());
|
||||
decodedChar = null;
|
||||
consumed = 1;
|
||||
}
|
||||
|
||||
// Handle null or empty decoded characters
|
||||
if (decodedChar == null || decodedChar.isEmpty()) {
|
||||
decodedChar = handleUndecodableChar(bytes, i, consumed);
|
||||
}
|
||||
|
||||
int end = i + consumed;
|
||||
|
||||
// Add each Unicode character separately
|
||||
for (int k = 0; k < decodedChar.length(); k++) {
|
||||
sb.append(decodedChar.charAt(k));
|
||||
starts.add(start);
|
||||
ends.add(end);
|
||||
}
|
||||
|
||||
i += consumed;
|
||||
}
|
||||
|
||||
map.text = sb.toString();
|
||||
map.charByteStart = starts.stream().mapToInt(Integer::intValue).toArray();
|
||||
map.charByteEnd = ends.stream().mapToInt(Integer::intValue).toArray();
|
||||
return map;
|
||||
}
|
||||
|
||||
private static String decodeType0Font(PDType0Font font, byte[] bytes, int position) {
|
||||
try {
|
||||
// Try multi-byte decoding first (common for CJK fonts)
|
||||
if (position + 1 < bytes.length) {
|
||||
int b1 = bytes[position] & 0xFF;
|
||||
int b2 = bytes[position + 1] & 0xFF;
|
||||
int code = (b1 << 8) | b2;
|
||||
String unicode = font.toUnicode(code);
|
||||
if (unicode != null && !unicode.isEmpty()) {
|
||||
return unicode;
|
||||
}
|
||||
}
|
||||
|
||||
int code = bytes[position] & 0xFF;
|
||||
return font.toUnicode(code);
|
||||
|
||||
} catch (Exception e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private static int getType0CharLength(PDType0Font font, byte[] bytes, int position) {
|
||||
try {
|
||||
if (position + 1 < bytes.length) {
|
||||
int b1 = bytes[position] & 0xFF;
|
||||
int b2 = bytes[position + 1] & 0xFF;
|
||||
int code = (b1 << 8) | b2;
|
||||
String unicode = font.toUnicode(code);
|
||||
if (unicode != null && !unicode.isEmpty()) {
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
return 1;
|
||||
} catch (Exception e) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
private static String decodeType1Font(PDType1Font font, byte[] bytes, int position) {
|
||||
try {
|
||||
int code = bytes[position] & 0xFF;
|
||||
return font.toUnicode(code);
|
||||
} catch (Exception e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private static int getType1CharLength(PDType1Font font, byte[] bytes, int position) {
|
||||
return 1; // Type1 fonts are typically single-byte
|
||||
}
|
||||
|
||||
private static String decodeType3Font(PDType3Font font, byte[] bytes, int position) {
|
||||
try {
|
||||
int code = bytes[position] & 0xFF;
|
||||
return font.toUnicode(code);
|
||||
} catch (Exception e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private static String decodeTrueTypeFont(PDTrueTypeFont font, byte[] bytes, int position) {
|
||||
try {
|
||||
int code = bytes[position] & 0xFF;
|
||||
String unicode = font.toUnicode(code);
|
||||
|
||||
if ((unicode == null || unicode.isEmpty()) && position + 1 < bytes.length) {
|
||||
int b1 = bytes[position] & 0xFF;
|
||||
int b2 = bytes[position + 1] & 0xFF;
|
||||
int multiByteCode = (b1 << 8) | b2;
|
||||
unicode = font.toUnicode(multiByteCode);
|
||||
}
|
||||
|
||||
return unicode;
|
||||
} catch (Exception e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private static int getTrueTypeCharLength(PDTrueTypeFont font, byte[] bytes, int position) {
|
||||
try {
|
||||
// First try single byte
|
||||
int code = bytes[position] & 0xFF;
|
||||
String unicode = font.toUnicode(code);
|
||||
if (unicode != null && !unicode.isEmpty()) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (position + 1 < bytes.length) {
|
||||
int b1 = bytes[position] & 0xFF;
|
||||
int b2 = bytes[position + 1] & 0xFF;
|
||||
int multiByteCode = (b1 << 8) | b2;
|
||||
unicode = font.toUnicode(multiByteCode);
|
||||
if (unicode != null && !unicode.isEmpty()) {
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
|
||||
return 1; // Default fallback
|
||||
} catch (Exception e) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
private static String decodeGenericFont(PDFont font, byte[] bytes, int position) {
|
||||
try {
|
||||
int code = bytes[position] & 0xFF;
|
||||
return font.toUnicode(code);
|
||||
} catch (Exception e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private static int getGenericCharLength(PDFont font, byte[] bytes, int position) {
|
||||
return 1; // Default to single byte for unknown font types
|
||||
}
|
||||
|
||||
private static String handleUndecodableChar(byte[] bytes, int position, int length) {
|
||||
|
||||
// Or try to interpret as ISO-8859-1 (Latin-1) as fallback
|
||||
try {
|
||||
byte[] charBytes = new byte[length];
|
||||
System.arraycopy(bytes, position, charBytes, 0, length);
|
||||
String fallback = new String(charBytes, StandardCharsets.ISO_8859_1);
|
||||
if (!fallback.trim().isEmpty()) {
|
||||
return fallback;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
// Ignore and fall through to default
|
||||
}
|
||||
|
||||
return "<EFBFBD>"; // Unicode replacement character instead of "?"
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user