mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-09-08 17:51:20 +02:00
enhance null checks and improve error handling in RedactionService and auto-redact.html
Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
parent
e2ac7edad9
commit
5777b8e27e
@ -15,6 +15,7 @@ import java.util.HashMap;
|
|||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Objects;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
@ -89,6 +90,10 @@ public class RedactionService {
|
|||||||
private final TempFileManager tempFileManager;
|
private final TempFileManager tempFileManager;
|
||||||
|
|
||||||
private static List<Object> parseAllTokens(PDFStreamParser parser) throws IOException {
|
private static List<Object> parseAllTokens(PDFStreamParser parser) throws IOException {
|
||||||
|
if (parser == null) {
|
||||||
|
return Collections.emptyList();
|
||||||
|
}
|
||||||
|
|
||||||
List<Object> tokens = new ArrayList<>();
|
List<Object> tokens = new ArrayList<>();
|
||||||
Object token;
|
Object token;
|
||||||
while ((token = parser.parseNextToken()) != null) {
|
while ((token = parser.parseNextToken()) != null) {
|
||||||
@ -98,8 +103,16 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private static String buildLanguageOption(RedactPdfRequest request) {
|
private static String buildLanguageOption(RedactPdfRequest request) {
|
||||||
List<String> langs = (request != null) ? request.getLanguages() : null;
|
if (request == null) {
|
||||||
return (langs == null || langs.isEmpty()) ? "eng" : String.join("+", langs);
|
return "eng";
|
||||||
|
}
|
||||||
|
|
||||||
|
List<String> langs = request.getLanguages();
|
||||||
|
if (langs == null || langs.isEmpty()) {
|
||||||
|
return "eng";
|
||||||
|
}
|
||||||
|
|
||||||
|
return String.join("+", langs);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static byte[] processWithOcrMyPdfForRestoration(
|
private static byte[] processWithOcrMyPdfForRestoration(
|
||||||
@ -183,17 +196,15 @@ public class RedactionService {
|
|||||||
if (originalWord == null || originalWord.isEmpty()) return " ";
|
if (originalWord == null || originalWord.isEmpty()) return " ";
|
||||||
if (font == null || fontSize <= 0) return " ".repeat(originalWord.length());
|
if (font == null || fontSize <= 0) return " ".repeat(originalWord.length());
|
||||||
|
|
||||||
// Enhanced font subset handling
|
|
||||||
if (TextEncodingHelper.isFontSubset(font.getName())) {
|
|
||||||
return createEnhancedSubsetPlaceholder(originalWord, targetWidth, font, fontSize);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!WidthCalculator.isWidthCalculationReliable(font))
|
|
||||||
return " ".repeat(originalWord.length());
|
|
||||||
|
|
||||||
final String repeat = " ".repeat(Math.max(1, originalWord.length()));
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
if (TextEncodingHelper.isFontSubset(font.getName())) {
|
||||||
|
return createEnhancedSubsetPlaceholder(originalWord, targetWidth, font, fontSize);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!WidthCalculator.isWidthCalculationReliable(font)) {
|
||||||
|
return " ".repeat(originalWord.length());
|
||||||
|
}
|
||||||
|
|
||||||
float spaceWidth = WidthCalculator.calculateAccurateWidth(font, " ", fontSize);
|
float spaceWidth = WidthCalculator.calculateAccurateWidth(font, " ", fontSize);
|
||||||
if (spaceWidth <= 0) {
|
if (spaceWidth <= 0) {
|
||||||
return createAlternativePlaceholder(originalWord, targetWidth, font, fontSize);
|
return createAlternativePlaceholder(originalWord, targetWidth, font, fontSize);
|
||||||
@ -205,8 +216,9 @@ public class RedactionService {
|
|||||||
originalWord.length() * 2, Math.round(targetWidth / spaceWidth * 1.5f));
|
originalWord.length() * 2, Math.round(targetWidth / spaceWidth * 1.5f));
|
||||||
return " ".repeat(Math.min(spaceCount, maxSpaces));
|
return " ".repeat(Math.min(spaceCount, maxSpaces));
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
|
log.debug("Error creating placeholder with width: {}", e.getMessage());
|
||||||
String result = createAlternativePlaceholder(originalWord, targetWidth, font, fontSize);
|
String result = createAlternativePlaceholder(originalWord, targetWidth, font, fontSize);
|
||||||
return result != null ? result : repeat;
|
return result != null ? result : " ".repeat(Math.max(1, originalWord.length()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -304,23 +316,33 @@ public class RedactionService {
|
|||||||
String[] parts = pageNumbers.split(",");
|
String[] parts = pageNumbers.split(",");
|
||||||
|
|
||||||
for (String part : parts) {
|
for (String part : parts) {
|
||||||
String trim = part.trim();
|
String trimmedPart = part.trim();
|
||||||
if (trim.contains("-")) {
|
if (trimmedPart.isEmpty()) continue;
|
||||||
String[] range = trim.split("-");
|
|
||||||
|
if (trimmedPart.contains("-")) {
|
||||||
|
String[] range = trimmedPart.split("-", 2);
|
||||||
if (range.length == 2) {
|
if (range.length == 2) {
|
||||||
try {
|
try {
|
||||||
int start = Integer.parseInt(range[0].trim());
|
int start = Integer.parseInt(range[0].trim());
|
||||||
int end = Integer.parseInt(range[1].trim());
|
int end = Integer.parseInt(range[1].trim());
|
||||||
for (int i = start; i <= end; i++) {
|
|
||||||
result.add(i);
|
if (start <= end && start > 0 && end > 0) {
|
||||||
|
for (int i = start; i <= end; i++) {
|
||||||
|
result.add(i);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} catch (NumberFormatException ignored) {
|
} catch (NumberFormatException e) {
|
||||||
|
log.warn("Invalid page range format: '{}'", trimmedPart);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
try {
|
try {
|
||||||
result.add(Integer.parseInt(trim));
|
int pageNum = Integer.parseInt(trimmedPart);
|
||||||
} catch (NumberFormatException ignored) {
|
if (pageNum > 0) {
|
||||||
|
result.add(pageNum);
|
||||||
|
}
|
||||||
|
} catch (NumberFormatException e) {
|
||||||
|
log.warn("Invalid page number: '{}'", trimmedPart);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -329,13 +351,19 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private static Color decodeOrDefault(String hex) {
|
private static Color decodeOrDefault(String hex) {
|
||||||
if (hex == null) {
|
if (hex == null || hex.trim().isEmpty()) {
|
||||||
return Color.BLACK;
|
return Color.BLACK;
|
||||||
}
|
}
|
||||||
String colorString = (!hex.isEmpty() && hex.charAt(0) == '#') ? hex : "#" + hex;
|
|
||||||
|
String colorString = hex.trim();
|
||||||
|
if (!colorString.startsWith("#")) {
|
||||||
|
colorString = "#" + colorString;
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
return Color.decode(colorString);
|
return Color.decode(colorString);
|
||||||
} catch (NumberFormatException e) {
|
} catch (NumberFormatException e) {
|
||||||
|
log.warn("Invalid color format '{}', using default black", hex);
|
||||||
return Color.BLACK;
|
return Color.BLACK;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -343,18 +371,30 @@ public class RedactionService {
|
|||||||
private static void redactFoundText(
|
private static void redactFoundText(
|
||||||
PDDocument document, List<PDFText> blocks, float customPadding, Color redactColor)
|
PDDocument document, List<PDFText> blocks, float customPadding, Color redactColor)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
if (document == null || blocks == null || blocks.isEmpty()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
var allPages = document.getDocumentCatalog().getPages();
|
var allPages = document.getDocumentCatalog().getPages();
|
||||||
Map<Integer, List<PDFText>> blocksByPage = new HashMap<>();
|
Map<Integer, List<PDFText>> blocksByPage = new HashMap<>();
|
||||||
|
|
||||||
for (PDFText block : blocks) {
|
for (PDFText block : blocks) {
|
||||||
blocksByPage.computeIfAbsent(block.getPageIndex(), k -> new ArrayList<>()).add(block);
|
if (block != null && block.getPageIndex() >= 0) {
|
||||||
|
blocksByPage
|
||||||
|
.computeIfAbsent(block.getPageIndex(), k -> new ArrayList<>())
|
||||||
|
.add(block);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (Map.Entry<Integer, List<PDFText>> entry : blocksByPage.entrySet()) {
|
for (Map.Entry<Integer, List<PDFText>> entry : blocksByPage.entrySet()) {
|
||||||
Integer pageIndex = entry.getKey();
|
Integer pageIndex = entry.getKey();
|
||||||
if (pageIndex >= allPages.getCount()) {
|
if (pageIndex == null || pageIndex >= allPages.getCount()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
PDPage page = allPages.get(pageIndex);
|
PDPage page = allPages.get(pageIndex);
|
||||||
List<PDFText> pageBlocks = entry.getValue();
|
List<PDFText> pageBlocks = entry.getValue();
|
||||||
|
|
||||||
try (PDPageContentStream cs =
|
try (PDPageContentStream cs =
|
||||||
new PDPageContentStream(
|
new PDPageContentStream(
|
||||||
document, page, PDPageContentStream.AppendMode.APPEND, true, true)) {
|
document, page, PDPageContentStream.AppendMode.APPEND, true, true)) {
|
||||||
@ -362,16 +402,19 @@ public class RedactionService {
|
|||||||
try {
|
try {
|
||||||
cs.setNonStrokingColor(redactColor);
|
cs.setNonStrokingColor(redactColor);
|
||||||
PDRectangle pageBox = page.getBBox();
|
PDRectangle pageBox = page.getBBox();
|
||||||
for (PDFText b : pageBlocks) {
|
|
||||||
|
for (PDFText block : pageBlocks) {
|
||||||
|
if (block == null) continue;
|
||||||
|
|
||||||
float padding =
|
float padding =
|
||||||
(b.getY2() - b.getY1()) * DEFAULT_TEXT_PADDING_MULTIPLIER
|
(block.getY2() - block.getY1()) * DEFAULT_TEXT_PADDING_MULTIPLIER
|
||||||
+ customPadding;
|
+ customPadding;
|
||||||
float width = b.getX2() - b.getX1();
|
float width = block.getX2() - block.getX1();
|
||||||
cs.addRect(
|
cs.addRect(
|
||||||
b.getX1(),
|
block.getX1(),
|
||||||
pageBox.getHeight() - b.getY2() - padding,
|
pageBox.getHeight() - block.getY2() - padding,
|
||||||
width,
|
width,
|
||||||
b.getY2() - b.getY1() + 2 * padding);
|
block.getY2() - block.getY1() + 2 * padding);
|
||||||
}
|
}
|
||||||
cs.fill();
|
cs.fill();
|
||||||
} finally {
|
} finally {
|
||||||
@ -383,6 +426,10 @@ public class RedactionService {
|
|||||||
|
|
||||||
static void writeFilteredContentStream(PDDocument document, PDPage page, List<Object> tokens)
|
static void writeFilteredContentStream(PDDocument document, PDPage page, List<Object> tokens)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
if (document == null || page == null || tokens == null) {
|
||||||
|
throw new IllegalArgumentException("Document, page, and tokens cannot be null");
|
||||||
|
}
|
||||||
|
|
||||||
PDStream newStream = new PDStream(document);
|
PDStream newStream = new PDStream(document);
|
||||||
try (var out = newStream.createOutputStream()) {
|
try (var out = newStream.createOutputStream()) {
|
||||||
new ContentStreamWriter(out).writeTokens(tokens);
|
new ContentStreamWriter(out).writeTokens(tokens);
|
||||||
@ -400,6 +447,10 @@ public class RedactionService {
|
|||||||
Set<String> targetWords,
|
Set<String> targetWords,
|
||||||
boolean useRegex,
|
boolean useRegex,
|
||||||
boolean wholeWordSearch) {
|
boolean wholeWordSearch) {
|
||||||
|
if (document == null || targetWords == null || targetWords.isEmpty() || pageIndex < 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
for (String term : targetWords) {
|
for (String term : targetWords) {
|
||||||
if (term == null || term.isBlank()) continue;
|
if (term == null || term.isBlank()) continue;
|
||||||
@ -417,6 +468,10 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
|
log.warn(
|
||||||
|
"Error checking if page {} still contains targets: {}",
|
||||||
|
pageIndex,
|
||||||
|
e.getMessage());
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -426,6 +481,10 @@ public class RedactionService {
|
|||||||
Set<String> targetWords,
|
Set<String> targetWords,
|
||||||
boolean useRegex,
|
boolean useRegex,
|
||||||
boolean wholeWordSearch) {
|
boolean wholeWordSearch) {
|
||||||
|
if (document == null || targetWords == null || targetWords.isEmpty()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
for (int pageIndex = 0; pageIndex < document.getNumberOfPages(); pageIndex++) {
|
for (int pageIndex = 0; pageIndex < document.getNumberOfPages(); pageIndex++) {
|
||||||
if (pageStillContainsTargets(
|
if (pageStillContainsTargets(
|
||||||
@ -435,21 +494,28 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
|
log.warn("Error checking if document still contains targets: {}", e.getMessage());
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Map<Integer, List<PDFText>> findTextToRedact(
|
public static Map<Integer, List<PDFText>> findTextToRedact(
|
||||||
PDDocument document, String[] listOfText, boolean useRegex, boolean wholeWordSearch) {
|
PDDocument document, String[] listOfText, boolean useRegex, boolean wholeWordSearch) {
|
||||||
|
if (document == null || listOfText == null) {
|
||||||
|
return Collections.emptyMap();
|
||||||
|
}
|
||||||
|
|
||||||
Map<Integer, List<PDFText>> allFoundTextsByPage = new HashMap<>();
|
Map<Integer, List<PDFText>> allFoundTextsByPage = new HashMap<>();
|
||||||
|
|
||||||
for (String text : listOfText) {
|
for (String text : listOfText) {
|
||||||
String t = text.trim();
|
if (text == null) continue;
|
||||||
if (t.isEmpty()) {
|
|
||||||
|
String trimmedText = text.trim();
|
||||||
|
if (trimmedText.isEmpty()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
TextFinder finder = new TextFinder(t, useRegex, wholeWordSearch);
|
TextFinder finder = new TextFinder(trimmedText, useRegex, wholeWordSearch);
|
||||||
finder.getText(document);
|
finder.getText(document);
|
||||||
List<PDFText> foundTexts = finder.getFoundTexts();
|
List<PDFText> foundTexts = finder.getFoundTexts();
|
||||||
|
|
||||||
@ -459,6 +525,7 @@ public class RedactionService {
|
|||||||
.add(found);
|
.add(found);
|
||||||
}
|
}
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
|
log.warn("Error finding text '{}': {}", trimmedText, e.getMessage());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -473,10 +540,19 @@ public class RedactionService {
|
|||||||
Boolean convertToImage,
|
Boolean convertToImage,
|
||||||
boolean isTextRemovalMode)
|
boolean isTextRemovalMode)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
List<PDFText> allFoundTexts = new ArrayList<>();
|
if (document == null) {
|
||||||
for (List<PDFText> pageTexts : allFoundTextsByPage.values()) {
|
throw new IllegalArgumentException("Document cannot be null");
|
||||||
allFoundTexts.addAll(pageTexts);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
List<PDFText> allFoundTexts = new ArrayList<>();
|
||||||
|
if (allFoundTextsByPage != null) {
|
||||||
|
for (List<PDFText> pageTexts : allFoundTextsByPage.values()) {
|
||||||
|
if (pageTexts != null) {
|
||||||
|
allFoundTexts.addAll(pageTexts);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (!allFoundTexts.isEmpty() && !isTextRemovalMode) {
|
if (!allFoundTexts.isEmpty() && !isTextRemovalMode) {
|
||||||
Color redactColor = decodeOrDefault(colorString);
|
Color redactColor = decodeOrDefault(colorString);
|
||||||
redactFoundText(document, allFoundTexts, customPadding, redactColor);
|
redactFoundText(document, allFoundTexts, customPadding, redactColor);
|
||||||
@ -528,15 +604,24 @@ public class RedactionService {
|
|||||||
Set<String> targetWords,
|
Set<String> targetWords,
|
||||||
boolean useRegex,
|
boolean useRegex,
|
||||||
boolean wholeWordSearch) {
|
boolean wholeWordSearch) {
|
||||||
|
if (completeText == null || targetWords == null || targetWords.isEmpty()) {
|
||||||
|
return Collections.emptyList();
|
||||||
|
}
|
||||||
|
|
||||||
List<Pattern> patterns =
|
List<Pattern> patterns =
|
||||||
TextFinderUtils.createOptimizedSearchPatterns(
|
TextFinderUtils.createOptimizedSearchPatterns(
|
||||||
targetWords, useRegex, wholeWordSearch);
|
targetWords, useRegex, wholeWordSearch);
|
||||||
|
|
||||||
return patterns.stream()
|
return patterns.stream()
|
||||||
.flatMap(
|
.flatMap(
|
||||||
pattern -> {
|
pattern -> {
|
||||||
try {
|
try {
|
||||||
return pattern.matcher(completeText).results();
|
return pattern.matcher(completeText).results();
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
|
log.debug(
|
||||||
|
"Error matching pattern '{}': {}",
|
||||||
|
pattern.pattern(),
|
||||||
|
e.getMessage());
|
||||||
return java.util.stream.Stream.empty();
|
return java.util.stream.Stream.empty();
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
@ -547,9 +632,16 @@ public class RedactionService {
|
|||||||
|
|
||||||
private static void performFallbackModification(
|
private static void performFallbackModification(
|
||||||
List<Object> tokens, int tokenIndex, String newText) {
|
List<Object> tokens, int tokenIndex, String newText) {
|
||||||
|
if (tokens == null || tokenIndex < 0 || tokenIndex >= tokens.size() || newText == null) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
tokens.set(tokenIndex, newText.isEmpty() ? EMPTY_COS_STRING : new COSString(newText));
|
tokens.set(tokenIndex, newText.isEmpty() ? EMPTY_COS_STRING : new COSString(newText));
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
|
log.debug(
|
||||||
|
"Fallback modification failed, attempting emergency fallback: {}",
|
||||||
|
e.getMessage());
|
||||||
performEmergencyFallback(tokens, tokenIndex);
|
performEmergencyFallback(tokens, tokenIndex);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -675,6 +767,10 @@ public class RedactionService {
|
|||||||
private static void writeRedactedContentToXObject(
|
private static void writeRedactedContentToXObject(
|
||||||
PDDocument document, PDFormXObject formXObject, List<Object> redactedTokens)
|
PDDocument document, PDFormXObject formXObject, List<Object> redactedTokens)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
if (document == null || formXObject == null || redactedTokens == null) {
|
||||||
|
throw new IllegalArgumentException("Document, form XObject, and tokens cannot be null");
|
||||||
|
}
|
||||||
|
|
||||||
var cosStream = formXObject.getCOSObject();
|
var cosStream = formXObject.getCOSObject();
|
||||||
try (var out = cosStream.createOutputStream()) {
|
try (var out = cosStream.createOutputStream()) {
|
||||||
new ContentStreamWriter(out).writeTokens(redactedTokens);
|
new ContentStreamWriter(out).writeTokens(redactedTokens);
|
||||||
@ -791,12 +887,19 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private static List<Object> deepCopyTokens(List<Object> original) {
|
private static List<Object> deepCopyTokens(List<Object> original) {
|
||||||
|
if (original == null) {
|
||||||
|
return new ArrayList<>();
|
||||||
|
}
|
||||||
|
|
||||||
List<Object> copy = new ArrayList<>(original.size());
|
List<Object> copy = new ArrayList<>(original.size());
|
||||||
for (Object obj : original) {
|
for (Object obj : original) {
|
||||||
if (obj instanceof COSDictionary dict) {
|
if (obj instanceof COSDictionary dict) {
|
||||||
COSDictionary newDict = new COSDictionary();
|
COSDictionary newDict = new COSDictionary();
|
||||||
for (COSName key : dict.keySet()) {
|
for (COSName key : dict.keySet()) {
|
||||||
newDict.setItem(key, dict.getDictionaryObject(key));
|
COSBase value = dict.getDictionaryObject(key);
|
||||||
|
if (value != null) {
|
||||||
|
newDict.setItem(key, value);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
copy.add(newDict);
|
copy.add(newDict);
|
||||||
} else if (obj instanceof List<?> nestedList
|
} else if (obj instanceof List<?> nestedList
|
||||||
@ -838,7 +941,7 @@ public class RedactionService {
|
|||||||
private static String normalizeTextForRedaction(String text) {
|
private static String normalizeTextForRedaction(String text) {
|
||||||
if (text == null) return null;
|
if (text == null) return null;
|
||||||
|
|
||||||
StringBuilder normalized = new StringBuilder();
|
StringBuilder normalized = new StringBuilder(text.length());
|
||||||
for (int i = 0; i < text.length(); i++) {
|
for (int i = 0; i < text.length(); i++) {
|
||||||
char c = text.charAt(i);
|
char c = text.charAt(i);
|
||||||
|
|
||||||
@ -961,9 +1064,11 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private static String sanitizeText(String text) {
|
private static String sanitizeText(String text) {
|
||||||
if (text == null) return "";
|
if (text == null || text.isEmpty()) {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
StringBuilder sanitized = new StringBuilder();
|
StringBuilder sanitized = new StringBuilder(text.length());
|
||||||
for (char c : text.toCharArray()) {
|
for (char c : text.toCharArray()) {
|
||||||
sanitized.append(
|
sanitized.append(
|
||||||
(Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r')
|
(Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r')
|
||||||
@ -1283,6 +1388,10 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private static float calculateCharacterSumWidth(PDFont font, String text) {
|
private static float calculateCharacterSumWidth(PDFont font, String text) {
|
||||||
|
if (font == null || text == null || text.isEmpty()) {
|
||||||
|
return -1f;
|
||||||
|
}
|
||||||
|
|
||||||
float totalWidth = 0f;
|
float totalWidth = 0f;
|
||||||
for (char c : text.toCharArray()) {
|
for (char c : text.toCharArray()) {
|
||||||
try {
|
try {
|
||||||
@ -1295,21 +1404,33 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private static boolean isValidTokenIndex(List<Object> tokens, int index) {
|
private static boolean isValidTokenIndex(List<Object> tokens, int index) {
|
||||||
return index >= 0 && index < tokens.size();
|
return tokens != null && index >= 0 && index < tokens.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String buildCompleteText(List<TextSegment> segments) {
|
private static String buildCompleteText(List<TextSegment> segments) {
|
||||||
|
if (segments == null || segments.isEmpty()) {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
StringBuilder sb = new StringBuilder();
|
StringBuilder sb = new StringBuilder();
|
||||||
for (TextSegment segment : segments) {
|
for (TextSegment segment : segments) {
|
||||||
sb.append(segment.text);
|
if (segment != null && segment.text != null) {
|
||||||
|
sb.append(segment.text);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return sb.toString();
|
return sb.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
private static boolean isProperFontSubset(String fontName) {
|
private static boolean isProperFontSubset(String fontName) {
|
||||||
if (fontName.length() < 7) return false;
|
if (fontName == null || fontName.length() < 7) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
for (int i = 0; i < 6; i++) {
|
for (int i = 0; i < 6; i++) {
|
||||||
if (fontName.charAt(i) < 'A' || fontName.charAt(i) > 'Z') return false;
|
char c = fontName.charAt(i);
|
||||||
|
if (c < 'A' || c > 'Z') {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return fontName.charAt(6) == '+';
|
return fontName.charAt(6) == '+';
|
||||||
}
|
}
|
||||||
@ -1341,10 +1462,15 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private static void performEmergencyFallback(List<Object> tokens, int tokenIndex) {
|
private static void performEmergencyFallback(List<Object> tokens, int tokenIndex) {
|
||||||
|
if (tokens == null || tokenIndex < 0 || tokenIndex >= tokens.size()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
tokens.set(tokenIndex, EMPTY_COS_STRING);
|
tokens.set(tokenIndex, EMPTY_COS_STRING);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
log.error("Emergency fallback failed: {}", e.getMessage());
|
log.error(
|
||||||
|
"Emergency fallback failed for token index {}: {}", tokenIndex, e.getMessage());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1380,12 +1506,21 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private static boolean hasReliableWidthMetrics(PDFont font) {
|
private static boolean hasReliableWidthMetrics(PDFont font) {
|
||||||
|
if (font == null) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
String testString = "AbCdEf123";
|
String testString = "AbCdEf123";
|
||||||
float width1 = font.getStringWidth(testString);
|
float width1 = font.getStringWidth(testString);
|
||||||
float width2 = calculateCharacterSumWidth(font, testString);
|
float width2 = calculateCharacterSumWidth(font, testString);
|
||||||
if (width1 <= 0 || width2 <= 0) return false;
|
|
||||||
return Math.abs(width1 - width2) / Math.max(width1, width2) < 0.05f;
|
if (width1 <= 0 || width2 <= 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
float maxWidth = Math.max(width1, width2);
|
||||||
|
return Math.abs(width1 - width2) / maxWidth < 0.05f;
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -1555,8 +1690,15 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private static int getActualStringLength(COSString cosString, PDFont font) {
|
private static int getActualStringLength(COSString cosString, PDFont font) {
|
||||||
|
if (cosString == null) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
if (font == null) return cosString.getString().length();
|
if (font == null) {
|
||||||
|
return cosString.getString().length();
|
||||||
|
}
|
||||||
|
|
||||||
String decodedText = TextDecodingHelper.tryDecodeWithFont(font, cosString);
|
String decodedText = TextDecodingHelper.tryDecodeWithFont(font, cosString);
|
||||||
return decodedText != null ? decodedText.length() : cosString.getString().length();
|
return decodedText != null ? decodedText.length() : cosString.getString().length();
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
@ -1575,7 +1717,10 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private static boolean isValidTJArray(COSArray array) {
|
private static boolean isValidTJArray(COSArray array) {
|
||||||
if (array == null || array.size() == 0) return false;
|
if (array == null || array.size() == 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
for (COSBase element : array) {
|
for (COSBase element : array) {
|
||||||
if (!(element instanceof COSString) && !(element instanceof COSNumber)) {
|
if (!(element instanceof COSString) && !(element instanceof COSNumber)) {
|
||||||
return false;
|
return false;
|
||||||
@ -1746,14 +1891,24 @@ public class RedactionService {
|
|||||||
String[] listOfText,
|
String[] listOfText,
|
||||||
boolean useRegex,
|
boolean useRegex,
|
||||||
boolean wholeWordSearchBool) {
|
boolean wholeWordSearchBool) {
|
||||||
if (allFoundTextsByPage.isEmpty()) return;
|
if (document == null
|
||||||
|
|| allFoundTextsByPage == null
|
||||||
|
|| allFoundTextsByPage.isEmpty()
|
||||||
|
|| listOfText == null) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
Set<String> allSearchTerms =
|
Set<String> allSearchTerms =
|
||||||
Arrays.stream(listOfText)
|
Arrays.stream(listOfText)
|
||||||
|
.filter(Objects::nonNull)
|
||||||
.map(String::trim)
|
.map(String::trim)
|
||||||
.filter(s -> !s.isEmpty())
|
.filter(s -> !s.isEmpty())
|
||||||
.collect(Collectors.toSet());
|
.collect(Collectors.toSet());
|
||||||
|
|
||||||
|
if (allSearchTerms.isEmpty()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
this.aggressiveMode = true;
|
this.aggressiveMode = true;
|
||||||
this.aggressiveSegMatches = new HashMap<>();
|
this.aggressiveSegMatches = new HashMap<>();
|
||||||
|
|
||||||
@ -1783,7 +1938,11 @@ public class RedactionService {
|
|||||||
anyResidual = true;
|
anyResidual = true;
|
||||||
processResidualText(document, page, filtered);
|
processResidualText(document, page, filtered);
|
||||||
}
|
}
|
||||||
} catch (Exception ignored) {
|
} catch (Exception e) {
|
||||||
|
log.warn(
|
||||||
|
"Error processing page {} in aggressive mode: {}",
|
||||||
|
pageIndex,
|
||||||
|
e.getMessage());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1938,17 +2097,26 @@ public class RedactionService {
|
|||||||
String[] listOfText,
|
String[] listOfText,
|
||||||
boolean useRegex,
|
boolean useRegex,
|
||||||
boolean wholeWordSearchBool) {
|
boolean wholeWordSearchBool) {
|
||||||
if (allFoundTextsByPage.isEmpty()) {
|
if (document == null
|
||||||
log.info("No text found to redact");
|
|| allFoundTextsByPage == null
|
||||||
|
|| allFoundTextsByPage.isEmpty()
|
||||||
|
|| listOfText == null) {
|
||||||
|
log.info("No text found to redact or invalid input parameters");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
Set<String> allSearchTerms =
|
Set<String> allSearchTerms =
|
||||||
Arrays.stream(listOfText)
|
Arrays.stream(listOfText)
|
||||||
|
.filter(Objects::nonNull)
|
||||||
.map(String::trim)
|
.map(String::trim)
|
||||||
.filter(s -> !s.isEmpty())
|
.filter(s -> !s.isEmpty())
|
||||||
.collect(Collectors.toSet());
|
.collect(Collectors.toSet());
|
||||||
|
|
||||||
|
if (allSearchTerms.isEmpty()) {
|
||||||
|
log.info("No valid search terms provided");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
log.info("Starting text replacement with {} search terms", allSearchTerms.size());
|
log.info("Starting text replacement with {} search terms", allSearchTerms.size());
|
||||||
|
|
||||||
for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) {
|
for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) {
|
||||||
@ -1961,7 +2129,6 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Verification attempts
|
|
||||||
for (int attempt = 0; attempt < 3; attempt++) {
|
for (int attempt = 0; attempt < 3; attempt++) {
|
||||||
if (!documentStillContainsTargets(
|
if (!documentStillContainsTargets(
|
||||||
document, allSearchTerms, useRegex, wholeWordSearchBool)) {
|
document, allSearchTerms, useRegex, wholeWordSearchBool)) {
|
||||||
@ -2063,6 +2230,10 @@ public class RedactionService {
|
|||||||
Set<String> allSearchTerms,
|
Set<String> allSearchTerms,
|
||||||
boolean useRegex,
|
boolean useRegex,
|
||||||
boolean wholeWordSearchBool) {
|
boolean wholeWordSearchBool) {
|
||||||
|
if (document == null || allSearchTerms == null || allSearchTerms.isEmpty()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
for (PDPage page : document.getPages()) {
|
for (PDPage page : document.getPages()) {
|
||||||
try {
|
try {
|
||||||
List<Object> filtered =
|
List<Object> filtered =
|
||||||
|
@ -73,7 +73,7 @@
|
|||||||
<div class="redaction-options-group">
|
<div class="redaction-options-group">
|
||||||
<label class="form-label fw-bold mb-3" th:text="#{autoRedact.redactionStyleLabel}"></label>
|
<label class="form-label fw-bold mb-3" th:text="#{autoRedact.redactionStyleLabel}"></label>
|
||||||
<div class="form-check mb-2">
|
<div class="form-check mb-2">
|
||||||
<input aria-describedby="visual-desc" class="form-check-input" id="visualImage" name="redactionMode" type="radio" value="visual">
|
<input aria-describedby="visual-desc" checked class="form-check-input" id="visualImage" name="redactionMode" type="radio" value="visual">
|
||||||
<label class="form-check-label" for="visualImage" th:text="#{autoRedact.visualRedactionLabel}">Visual</label>
|
<label class="form-check-label" for="visualImage" th:text="#{autoRedact.visualRedactionLabel}">Visual</label>
|
||||||
<small class="form-text text-muted d-block mt-1" id="visual-desc" th:text="#{autoRedact.visualRedactionDescription}">Converts to image with visual redactions for maximum security.</small>
|
<small class="form-text text-muted d-block mt-1" id="visual-desc" th:text="#{autoRedact.visualRedactionDescription}">Converts to image with visual redactions for maximum security.</small>
|
||||||
</div>
|
</div>
|
||||||
@ -83,7 +83,7 @@
|
|||||||
<small class="form-text text-muted d-block mt-1" id="delete-desc" th:text="#{autoRedact.deleteTextDescription}">Removes the text completely. This may alter the original layout or leave a gap.</small>
|
<small class="form-text text-muted d-block mt-1" id="delete-desc" th:text="#{autoRedact.deleteTextDescription}">Removes the text completely. This may alter the original layout or leave a gap.</small>
|
||||||
</div>
|
</div>
|
||||||
<div class="form-check mb-3">
|
<div class="form-check mb-3">
|
||||||
<input aria-describedby="keep-desc" checked class="form-check-input" id="keepLayout" name="redactionMode" type="radio" value="moderate">
|
<input aria-describedby="keep-desc" class="form-check-input" id="keepLayout" name="redactionMode" type="radio" value="moderate">
|
||||||
<label class="form-check-label" for="keepLayout" th:text="#{autoRedact.keepLayoutLabel}">Keep Layout</label>
|
<label class="form-check-label" for="keepLayout" th:text="#{autoRedact.keepLayoutLabel}">Keep Layout</label>
|
||||||
<small class="form-text text-muted d-block mt-1" id="keep-desc" th:text="#{autoRedact.keepLayoutDescription}">Covers text with a redaction box, preserving the page's original design.</small>
|
<small class="form-text text-muted d-block mt-1" id="keep-desc" th:text="#{autoRedact.keepLayoutDescription}">Covers text with a redaction box, preserving the page's original design.</small>
|
||||||
</div>
|
</div>
|
||||||
@ -126,7 +126,7 @@
|
|||||||
<label class="form-label" for="languages">OCR Languages</label>
|
<label class="form-label" for="languages">OCR Languages</label>
|
||||||
<div id="languages">
|
<div id="languages">
|
||||||
<div class="form-check" th:each="language, iterStat : ${languages}">
|
<div class="form-check" th:each="language, iterStat : ${languages}">
|
||||||
<input onchange="handleLangSelection()" required th:id="${'language-' + language}" th:name="languages" th:value="${language}" type="checkbox" />
|
<input onchange="handleLangSelection()" required th:checked="${language == 'eng'}" th:id="${'language-' + language}" th:name="languages" th:value="${language}" type="checkbox" />
|
||||||
<label th:for="${'language-' + language}" th:text="${language}"></label>
|
<label th:for="${'language-' + language}" th:text="${language}"></label>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@ -275,6 +275,9 @@
|
|||||||
|
|
||||||
// Initialize language list ordering & labels
|
// Initialize language list ordering & labels
|
||||||
prioritizeLanguages();
|
prioritizeLanguages();
|
||||||
|
|
||||||
|
// Handle pre-selected English language
|
||||||
|
handleLangSelection();
|
||||||
});
|
});
|
||||||
</script>
|
</script>
|
||||||
</body>
|
</body>
|
||||||
|
Loading…
Reference in New Issue
Block a user