mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-09-08 17:51:20 +02:00
refactor redaction services to improve resource management and streamline text processing
Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
parent
f236505cae
commit
3ac7f0df4c
@ -30,51 +30,39 @@ class AggressiveRedactionService implements RedactionModeStrategy {
|
||||
boolean useRegex = Boolean.TRUE.equals(request.getUseRegex());
|
||||
boolean wholeWord = Boolean.TRUE.equals(request.getWholeWordSearch());
|
||||
|
||||
PDDocument doc = null;
|
||||
PDDocument fb = null;
|
||||
try {
|
||||
doc = pdfDocumentFactory.load(request.getFileInput());
|
||||
try (PDDocument doc = pdfDocumentFactory.load(request.getFileInput())) {
|
||||
Map<Integer, List<PDFText>> allFound =
|
||||
RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord);
|
||||
if (allFound.isEmpty()) {
|
||||
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
|
||||
doc.save(baos);
|
||||
return baos.toByteArray();
|
||||
}
|
||||
return toByteArray(doc);
|
||||
}
|
||||
|
||||
helper.performTextReplacementAggressive(doc, allFound, listOfText, useRegex, wholeWord);
|
||||
Map<Integer, List<PDFText>> residual =
|
||||
RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord);
|
||||
boolean residualExists = residual.values().stream().mapToInt(List::size).sum() > 0;
|
||||
String effectiveColor =
|
||||
(request.getRedactColor() == null || request.getRedactColor().isBlank())
|
||||
? "#000000"
|
||||
: request.getRedactColor();
|
||||
|
||||
if (residualExists) {
|
||||
// Use the new visual redaction with OCR restoration fallback
|
||||
return helper.performVisualRedactionWithOcrRestoration(
|
||||
request, listOfText, useRegex, wholeWord);
|
||||
}
|
||||
|
||||
return RedactionService.finalizeRedaction(
|
||||
doc,
|
||||
allFound,
|
||||
request.getRedactColor(),
|
||||
request.getCustomPadding(),
|
||||
request.getConvertPDFToImage(), /*text removal*/
|
||||
request.getConvertPDFToImage(),
|
||||
true);
|
||||
} catch (Exception e) {
|
||||
throw new IOException("Aggressive redaction failed: " + e.getMessage(), e);
|
||||
} finally {
|
||||
if (doc != null)
|
||||
try {
|
||||
doc.close();
|
||||
} catch (IOException ignore) {
|
||||
}
|
||||
if (fb != null)
|
||||
try {
|
||||
fb.close();
|
||||
} catch (IOException ignore) {
|
||||
}
|
||||
|
||||
private byte[] toByteArray(PDDocument doc) throws IOException {
|
||||
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
|
||||
doc.save(baos);
|
||||
return baos.toByteArray();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -29,49 +29,36 @@ class ModerateRedactionService implements RedactionModeStrategy {
|
||||
boolean useRegex = Boolean.TRUE.equals(request.getUseRegex());
|
||||
boolean wholeWord = Boolean.TRUE.equals(request.getWholeWordSearch());
|
||||
|
||||
PDDocument doc = null;
|
||||
PDDocument fallback = null;
|
||||
try {
|
||||
doc = pdfDocumentFactory.load(request.getFileInput());
|
||||
try (PDDocument doc = pdfDocumentFactory.load(request.getFileInput())) {
|
||||
Map<Integer, List<PDFText>> allFound =
|
||||
RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord);
|
||||
if (allFound.isEmpty()) {
|
||||
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
|
||||
doc.save(baos);
|
||||
return baos.toByteArray();
|
||||
}
|
||||
return toByteArray(doc);
|
||||
}
|
||||
|
||||
boolean fallbackToBoxOnly =
|
||||
helper.performTextReplacement(doc, allFound, listOfText, useRegex, wholeWord);
|
||||
String effectiveColor =
|
||||
(request.getRedactColor() == null || request.getRedactColor().isBlank())
|
||||
? "#000000"
|
||||
: request.getRedactColor();
|
||||
if (fallbackToBoxOnly) {
|
||||
// Use the new visual redaction with OCR restoration fallback
|
||||
return helper.performVisualRedactionWithOcrRestoration(
|
||||
request, listOfText, useRegex, wholeWord);
|
||||
}
|
||||
|
||||
return RedactionService.finalizeRedaction(
|
||||
doc,
|
||||
allFound,
|
||||
effectiveColor,
|
||||
request.getRedactColor(),
|
||||
request.getCustomPadding(),
|
||||
request.getConvertPDFToImage(),
|
||||
false);
|
||||
} catch (Exception e) {
|
||||
throw new IOException("Moderate redaction failed: " + e.getMessage(), e);
|
||||
} finally {
|
||||
if (doc != null)
|
||||
try {
|
||||
doc.close();
|
||||
} catch (IOException ignore) {
|
||||
}
|
||||
if (fallback != null)
|
||||
try {
|
||||
fallback.close();
|
||||
} catch (IOException ignore) {
|
||||
}
|
||||
|
||||
private byte[] toByteArray(PDDocument doc) throws IOException {
|
||||
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
|
||||
doc.save(baos);
|
||||
return baos.toByteArray();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -81,6 +81,7 @@ public class RedactionService {
|
||||
private static final Set<String> TEXT_SHOWING_OPERATORS = Set.of("Tj", "TJ", "'", "\"");
|
||||
private static final COSString EMPTY_COS_STRING = new COSString("");
|
||||
private static final int MAX_SWEEPS = 3;
|
||||
private static final Pattern PATTERN = Pattern.compile(".*(hoepap|temp|generated).*");
|
||||
private boolean aggressiveMode = false;
|
||||
private Map<Integer, List<AggressiveSegMatch>> aggressiveSegMatches = null;
|
||||
private final CustomPDFDocumentFactory pdfDocumentFactory;
|
||||
@ -266,26 +267,20 @@ public class RedactionService {
|
||||
boolean wholeWordSearch) {
|
||||
try {
|
||||
for (String term : targetWords) {
|
||||
if (term == null || term.isBlank()) {
|
||||
continue;
|
||||
}
|
||||
if (term == null || term.isBlank()) continue;
|
||||
|
||||
TextFinder finder = new TextFinder(term, useRegex, wholeWordSearch);
|
||||
finder.setStartPage(pageIndex + 1);
|
||||
finder.setEndPage(pageIndex + 1);
|
||||
finder.getText(document);
|
||||
|
||||
List<PDFText> foundTexts = finder.getFoundTexts();
|
||||
for (PDFText ft : foundTexts) {
|
||||
if (ft.getPageIndex() == pageIndex) {
|
||||
for (PDFText text : finder.getFoundTexts()) {
|
||||
if (text.getPageIndex() == pageIndex) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
if (!foundTexts.isEmpty()) {}
|
||||
}
|
||||
|
||||
return false;
|
||||
|
||||
} catch (Exception e) {
|
||||
return true;
|
||||
}
|
||||
@ -297,18 +292,13 @@ public class RedactionService {
|
||||
boolean useRegex,
|
||||
boolean wholeWordSearch) {
|
||||
try {
|
||||
int idx = -1;
|
||||
final int numberOfPages = document.getNumberOfPages();
|
||||
for (int i = 0; i < numberOfPages; i++) {
|
||||
idx++;
|
||||
for (int pageIndex = 0; pageIndex < document.getNumberOfPages(); pageIndex++) {
|
||||
if (pageStillContainsTargets(
|
||||
document, idx, targetWords, useRegex, wholeWordSearch)) {
|
||||
document, pageIndex, targetWords, useRegex, wholeWordSearch)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
|
||||
} catch (Exception e) {
|
||||
return true;
|
||||
}
|
||||
@ -352,12 +342,11 @@ public class RedactionService {
|
||||
for (List<PDFText> pageTexts : allFoundTextsByPage.values()) {
|
||||
allFoundTexts.addAll(pageTexts);
|
||||
}
|
||||
if (!allFoundTexts.isEmpty()) {
|
||||
if (!isTextRemovalMode) {
|
||||
if (!allFoundTexts.isEmpty() && !isTextRemovalMode) {
|
||||
Color redactColor = decodeOrDefault(colorString);
|
||||
redactFoundText(document, allFoundTexts, customPadding, redactColor);
|
||||
}
|
||||
}
|
||||
|
||||
if (Boolean.TRUE.equals(convertToImage)) {
|
||||
try (PDDocument convertedPdf = PdfUtils.convertPdfToPdfImage(document)) {
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
@ -597,18 +586,11 @@ public class RedactionService {
|
||||
private static boolean isTextSafeForRedaction(String text) {
|
||||
if (text == null || text.isEmpty()) return true;
|
||||
|
||||
for (int i = 0; i < text.length(); i++) {
|
||||
char c = text.charAt(i);
|
||||
int codePoint = c;
|
||||
|
||||
if (codePoint >= 65488) {
|
||||
return false;
|
||||
}
|
||||
if (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r') {
|
||||
for (char c : text.toCharArray()) {
|
||||
if (c >= 65488 || (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r')) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -657,56 +639,33 @@ public class RedactionService {
|
||||
return wipeAllSemanticTextInTokens(tokens, true);
|
||||
}
|
||||
|
||||
public byte[] performVisualRedactionWithOcrRestoration(
|
||||
RedactPdfRequest request,
|
||||
String[] listOfText,
|
||||
boolean useRegex,
|
||||
boolean wholeWordSearch)
|
||||
throws IOException {
|
||||
PDDocument visualRedactedDoc = null;
|
||||
try {
|
||||
visualRedactedDoc = pdfDocumentFactory.load(request.getFileInput());
|
||||
Map<Integer, List<PDFText>> allFound =
|
||||
findTextToRedact(visualRedactedDoc, listOfText, useRegex, wholeWordSearch);
|
||||
String effectiveColor =
|
||||
(request.getRedactColor() == null || request.getRedactColor().isBlank())
|
||||
? "#000000"
|
||||
: request.getRedactColor();
|
||||
byte[] visualRedactedBytes =
|
||||
finalizeRedaction(
|
||||
visualRedactedDoc,
|
||||
allFound,
|
||||
effectiveColor,
|
||||
request.getCustomPadding(),
|
||||
true,
|
||||
false);
|
||||
return performOcrRestoration(visualRedactedBytes, request);
|
||||
} catch (Exception e) {
|
||||
throw new IOException(
|
||||
"Visual redaction with OCR restoration failed: " + e.getMessage(), e);
|
||||
} finally {
|
||||
if (visualRedactedDoc != null) {
|
||||
try {
|
||||
visualRedactedDoc.close();
|
||||
} catch (IOException ignore) {
|
||||
}
|
||||
}
|
||||
private static String normalizeTextForRedaction(String text) {
|
||||
if (text == null) return null;
|
||||
|
||||
StringBuilder normalized = new StringBuilder();
|
||||
for (int i = 0; i < text.length(); i++) {
|
||||
char c = text.charAt(i);
|
||||
|
||||
if (c >= 65488) {
|
||||
normalized.append(' ');
|
||||
} else if (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r') {
|
||||
normalized.append(' ');
|
||||
} else {
|
||||
normalized.append(c);
|
||||
}
|
||||
}
|
||||
|
||||
private byte[] performOcrRestoration(byte[] redactedPdfBytes, RedactPdfRequest request)
|
||||
throws IOException, InterruptedException {
|
||||
try (TempFile tempInputFile = new TempFile(tempFileManager, ".pdf");
|
||||
TempFile tempOutputFile = new TempFile(tempFileManager, ".pdf")) {
|
||||
java.nio.file.Files.write(tempInputFile.getPath(), redactedPdfBytes);
|
||||
if (isOcrMyPdfAvailable()) {
|
||||
return processWithOcrMyPdfForRestoration(
|
||||
tempInputFile.getPath(), tempOutputFile.getPath(), request);
|
||||
} else if (isTesseractAvailable()) {
|
||||
return processWithTesseractForRestoration(
|
||||
tempInputFile.getPath(), tempOutputFile.getPath(), request);
|
||||
return normalized.toString();
|
||||
}
|
||||
return redactedPdfBytes;
|
||||
|
||||
private static boolean isOcrMyPdfAvailable() {
|
||||
try {
|
||||
ProcessExecutorResult result =
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
|
||||
.runCommandWithOutputHandling(Arrays.asList("ocrmypdf", "--version"));
|
||||
return result.getRc() == 0;
|
||||
} catch (Exception e) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@ -780,37 +739,7 @@ public class RedactionService {
|
||||
}
|
||||
}
|
||||
|
||||
private static String normalizeTextForRedaction(String text) {
|
||||
if (text == null) return null;
|
||||
|
||||
StringBuilder normalized = new StringBuilder();
|
||||
for (int i = 0; i < text.length(); i++) {
|
||||
char c = text.charAt(i);
|
||||
|
||||
if ((int) c >= 65488) {
|
||||
normalized.append(' ');
|
||||
} else if (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r') {
|
||||
normalized.append(' ');
|
||||
} else {
|
||||
normalized.append(c);
|
||||
}
|
||||
}
|
||||
|
||||
return normalized.toString();
|
||||
}
|
||||
|
||||
private boolean isOcrMyPdfAvailable() {
|
||||
try {
|
||||
ProcessExecutorResult result =
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
|
||||
.runCommandWithOutputHandling(Arrays.asList("ocrmypdf", "--version"));
|
||||
return result.getRc() == 0;
|
||||
} catch (Exception e) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isTesseractAvailable() {
|
||||
private static boolean isTesseractAvailable() {
|
||||
try {
|
||||
ProcessExecutorResult result =
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.TESSERACT)
|
||||
@ -826,7 +755,7 @@ public class RedactionService {
|
||||
String fontName = font.getName();
|
||||
if (fontName == null
|
||||
|| isProperFontSubset(fontName)
|
||||
|| fontName.toLowerCase().matches(".*(hoepap|temp|generated).*")) {
|
||||
|| PATTERN.matcher(fontName.toLowerCase()).matches()) {
|
||||
return false;
|
||||
}
|
||||
return hasReliableWidthMetrics(font);
|
||||
@ -835,6 +764,58 @@ public class RedactionService {
|
||||
}
|
||||
}
|
||||
|
||||
private static String sanitizeText(String text) {
|
||||
if (text == null) return "";
|
||||
|
||||
StringBuilder sanitized = new StringBuilder();
|
||||
for (char c : text.toCharArray()) {
|
||||
sanitized.append(
|
||||
(Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r')
|
||||
? '\uFFFD'
|
||||
: c);
|
||||
}
|
||||
return sanitized.toString();
|
||||
}
|
||||
|
||||
private static byte[] processWithOcrMyPdfForRestoration(
|
||||
java.nio.file.Path inputPath, java.nio.file.Path outputPath, RedactPdfRequest request)
|
||||
throws IOException, InterruptedException {
|
||||
List<String> command =
|
||||
Arrays.asList(
|
||||
"ocrmypdf",
|
||||
"--verbose",
|
||||
"1",
|
||||
"--output-type",
|
||||
"pdf",
|
||||
"--pdf-renderer",
|
||||
"sandwich",
|
||||
"--language",
|
||||
"eng",
|
||||
"--optimize",
|
||||
"0",
|
||||
"--jpeg-quality",
|
||||
"100",
|
||||
"--png-quality",
|
||||
"9",
|
||||
"--force-ocr",
|
||||
"--deskew",
|
||||
"--clean",
|
||||
"--clean-final",
|
||||
inputPath.toString(),
|
||||
outputPath.toString());
|
||||
ProcessExecutorResult result =
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
|
||||
.runCommandWithOutputHandling(command);
|
||||
if (result.getRc() != 0) {
|
||||
throw new IOException(
|
||||
"OCRmyPDF restoration failed with return code: "
|
||||
+ result.getRc()
|
||||
+ ". Error: "
|
||||
+ result.getMessages());
|
||||
}
|
||||
return java.nio.file.Files.readAllBytes(outputPath);
|
||||
}
|
||||
|
||||
private static String createSubsetFontPlaceholder(
|
||||
String originalWord, float targetWidth, PDFont font, float fontSize) {
|
||||
String result = createAlternativePlaceholder(originalWord, targetWidth, font, fontSize);
|
||||
@ -843,77 +824,144 @@ public class RedactionService {
|
||||
: " ".repeat(Math.max(1, originalWord != null ? originalWord.length() : 1));
|
||||
}
|
||||
|
||||
public void performTextReplacementAggressive(
|
||||
PDDocument document,
|
||||
Map<Integer, List<PDFText>> allFoundTextsByPage,
|
||||
String[] listOfText,
|
||||
private static COSArray buildKerningAdjustedTJArray(
|
||||
COSArray originalArray, COSArray redactedArray, TextSegment segment) {
|
||||
try {
|
||||
if (segment == null || segment.getFont() == null || segment.getFontSize() <= 0)
|
||||
return redactedArray;
|
||||
|
||||
COSArray out = new COSArray();
|
||||
int size = redactedArray.size();
|
||||
for (int i = 0; i < size; i++) {
|
||||
COSBase redEl = redactedArray.get(i);
|
||||
COSBase origEl =
|
||||
(originalArray != null && i < originalArray.size())
|
||||
? originalArray.get(i)
|
||||
: null;
|
||||
|
||||
out.add(redEl);
|
||||
|
||||
if (redEl instanceof COSString redStr && origEl instanceof COSString origStr) {
|
||||
String origText = getDecodedString(origStr, segment.getFont());
|
||||
String modText = getDecodedString(redStr, segment.getFont());
|
||||
float wOrig =
|
||||
calculateSafeWidth(origText, segment.getFont(), segment.getFontSize());
|
||||
float wMod =
|
||||
calculateSafeWidth(modText, segment.getFont(), segment.getFontSize());
|
||||
float adjustment = wOrig - wMod;
|
||||
if (Math.abs(adjustment) > PRECISION_THRESHOLD) {
|
||||
float kerning = (-adjustment / segment.getFontSize()) * FONT_SCALE_FACTOR;
|
||||
if (i + 1 < size && redactedArray.get(i + 1) instanceof COSNumber num) {
|
||||
i++;
|
||||
float combined = num.floatValue() + kerning;
|
||||
out.add(new COSFloat(combined));
|
||||
} else {
|
||||
out.add(new COSFloat(kerning));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return out;
|
||||
} catch (Exception e) {
|
||||
return redactedArray;
|
||||
}
|
||||
}
|
||||
|
||||
private static List<MatchRange> findMatchesInSegments(
|
||||
List<TextSegment> segments,
|
||||
Set<String> targetWords,
|
||||
boolean useRegex,
|
||||
boolean wholeWordSearchBool) {
|
||||
if (allFoundTextsByPage.isEmpty()) {
|
||||
return;
|
||||
boolean wholeWordSearch) {
|
||||
List<MatchRange> allMatches = new ArrayList<>();
|
||||
List<Pattern> patterns =
|
||||
TextFinderUtils.createOptimizedSearchPatterns(
|
||||
targetWords, useRegex, wholeWordSearch);
|
||||
|
||||
log.debug("Searching for {} patterns in {} segments", patterns.size(), segments.size());
|
||||
|
||||
int totalMatchesFound = 0;
|
||||
|
||||
for (int i = 0; i < segments.size(); i++) {
|
||||
TextSegment segment = segments.get(i);
|
||||
String segmentText = segment.getText();
|
||||
if (segmentText == null || segmentText.isEmpty()) {
|
||||
log.debug("Skipping empty segment {}", i);
|
||||
continue;
|
||||
}
|
||||
Set<String> allSearchTerms =
|
||||
Arrays.stream(listOfText)
|
||||
.map(String::trim)
|
||||
.filter(s -> !s.isEmpty())
|
||||
.collect(Collectors.toSet());
|
||||
this.aggressiveMode = true;
|
||||
this.aggressiveSegMatches = new HashMap<>();
|
||||
|
||||
log.debug("Processing segment {}: '{}'", i, segmentText);
|
||||
|
||||
if (segment.getFont() != null
|
||||
&& !TextEncodingHelper.isTextSegmentRemovable(segment.getFont(), segmentText)) {
|
||||
log.debug(
|
||||
"Skipping segment {} - font not removable: {}",
|
||||
i,
|
||||
segment.getFont().getName());
|
||||
continue;
|
||||
}
|
||||
|
||||
int segmentMatches = 0;
|
||||
for (Pattern pattern : patterns) {
|
||||
try {
|
||||
for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) {
|
||||
boolean anyResidual = false;
|
||||
int pageIndex = -1;
|
||||
for (PDPage page : document.getPages()) {
|
||||
pageIndex++;
|
||||
try {
|
||||
this.aggressiveSegMatches = new HashMap<>();
|
||||
List<Object> filtered =
|
||||
createTokensWithoutTargetText(
|
||||
document,
|
||||
page,
|
||||
allSearchTerms,
|
||||
useRegex,
|
||||
wholeWordSearchBool);
|
||||
writeFilteredContentStream(document, page, filtered);
|
||||
boolean residual =
|
||||
pageStillContainsTargets(
|
||||
document,
|
||||
pageIndex,
|
||||
allSearchTerms,
|
||||
useRegex,
|
||||
wholeWordSearchBool);
|
||||
if (residual) {
|
||||
anyResidual = true;
|
||||
try {
|
||||
var sem = wipeAllSemanticTextInTokens(filtered);
|
||||
filtered = sem.tokens;
|
||||
PDResources res = page.getResources();
|
||||
if (res != null) {
|
||||
wipeAllSemanticTextInProperties(res);
|
||||
wipeAllTextInXObjects(document, res);
|
||||
wipeAllTextInPatterns(document, res);
|
||||
}
|
||||
writeFilteredContentStream(document, page, filtered);
|
||||
} catch (Exception ignored) {
|
||||
log.debug(
|
||||
"Matching pattern '{}' against segment text '{}'",
|
||||
pattern.pattern(),
|
||||
segmentText);
|
||||
var matcher = pattern.matcher(segmentText);
|
||||
while (matcher.find()) {
|
||||
int matchStart = matcher.start();
|
||||
int matchEnd = matcher.end();
|
||||
|
||||
log.debug(
|
||||
"Found match in segment {}: positions {}-{}",
|
||||
i,
|
||||
matchStart,
|
||||
matchEnd);
|
||||
|
||||
if (matchStart >= 0
|
||||
&& matchEnd <= segmentText.length()
|
||||
&& matchStart < matchEnd) {
|
||||
String matchedText = segmentText.substring(matchStart, matchEnd);
|
||||
log.debug("Matched text: '{}'", matchedText);
|
||||
|
||||
allMatches.add(
|
||||
new MatchRange(
|
||||
segment.getStartPos() + matchStart,
|
||||
segment.getStartPos() + matchEnd));
|
||||
segmentMatches++;
|
||||
totalMatchesFound++;
|
||||
}
|
||||
}
|
||||
} catch (Exception ignored) {
|
||||
} catch (Exception e) {
|
||||
log.error("Error matching pattern in segment {}: {}", i, e.getMessage());
|
||||
}
|
||||
}
|
||||
if (!anyResidual) {
|
||||
break;
|
||||
}
|
||||
if (!documentStillContainsTargets(
|
||||
document, allSearchTerms, useRegex, wholeWordSearchBool)) {
|
||||
break;
|
||||
|
||||
if (segmentMatches > 0) {
|
||||
log.info("Segment {} had {} matches", i, segmentMatches);
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
this.aggressiveMode = false;
|
||||
this.aggressiveSegMatches = null;
|
||||
|
||||
log.info("Total matches found across all segments: {}", totalMatchesFound);
|
||||
allMatches.sort(Comparator.comparingInt(MatchRange::getStartPos));
|
||||
|
||||
if (allMatches.isEmpty()) {
|
||||
log.warn("No matches found in segments. This might indicate:");
|
||||
log.warn("1. Text encoding issues preventing proper extraction");
|
||||
log.warn("2. Font compatibility issues");
|
||||
log.warn("3. Search terms not matching extracted text");
|
||||
log.warn("4. Whole word search filtering out matches");
|
||||
|
||||
if (!segments.isEmpty()) {
|
||||
log.warn("Sample segment text: '{}'", segments.get(0).getText());
|
||||
log.warn("Target words: {}", targetWords);
|
||||
log.warn("Use regex: {}, Whole word search: {}", useRegex, wholeWordSearch);
|
||||
}
|
||||
}
|
||||
|
||||
return allMatches;
|
||||
}
|
||||
|
||||
private static float calculateCharacterSumWidth(PDFont font, String text) {
|
||||
float totalWidth = 0f;
|
||||
for (char c : text.toCharArray()) {
|
||||
@ -1033,19 +1081,29 @@ public class RedactionService {
|
||||
}
|
||||
}
|
||||
|
||||
private static String sanitizeText(String text) {
|
||||
if (text == null) return "";
|
||||
|
||||
StringBuilder sanitized = new StringBuilder();
|
||||
for (char c : text.toCharArray()) {
|
||||
if (Character.isISOControl(c) && c != '\n' && c != '\t' && c != '\r') {
|
||||
sanitized.append('\uFFFD');
|
||||
} else {
|
||||
sanitized.append(c);
|
||||
public byte[] performVisualRedactionWithOcrRestoration(
|
||||
RedactPdfRequest request,
|
||||
String[] listOfText,
|
||||
boolean useRegex,
|
||||
boolean wholeWordSearch)
|
||||
throws IOException {
|
||||
try (PDDocument doc = pdfDocumentFactory.load(request.getFileInput())) {
|
||||
Map<Integer, List<PDFText>> allFound =
|
||||
findTextToRedact(doc, listOfText, useRegex, wholeWordSearch);
|
||||
byte[] visualRedactedBytes =
|
||||
finalizeRedaction(
|
||||
doc,
|
||||
allFound,
|
||||
request.getRedactColor(),
|
||||
request.getCustomPadding(),
|
||||
true,
|
||||
false);
|
||||
return performOcrRestoration(visualRedactedBytes, request);
|
||||
} catch (Exception e) {
|
||||
throw new IOException(
|
||||
"Visual redaction with OCR restoration failed: " + e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
return sanitized.toString();
|
||||
}
|
||||
|
||||
private static WipeResult wipeAllSemanticTextInTokens(List<Object> tokens, boolean removeTU) {
|
||||
if (tokens == null || tokens.isEmpty()) {
|
||||
@ -1064,43 +1122,21 @@ public class RedactionService {
|
||||
return res;
|
||||
}
|
||||
|
||||
private byte[] processWithOcrMyPdfForRestoration(
|
||||
java.nio.file.Path inputPath, java.nio.file.Path outputPath, RedactPdfRequest request)
|
||||
private byte[] performOcrRestoration(byte[] redactedPdfBytes, RedactPdfRequest request)
|
||||
throws IOException, InterruptedException {
|
||||
List<String> command =
|
||||
Arrays.asList(
|
||||
"ocrmypdf",
|
||||
"--verbose",
|
||||
"1",
|
||||
"--output-type",
|
||||
"pdf",
|
||||
"--pdf-renderer",
|
||||
"sandwich",
|
||||
"--language",
|
||||
"eng",
|
||||
"--optimize",
|
||||
"0",
|
||||
"--jpeg-quality",
|
||||
"100",
|
||||
"--png-quality",
|
||||
"9",
|
||||
"--force-ocr",
|
||||
"--deskew",
|
||||
"--clean",
|
||||
"--clean-final",
|
||||
inputPath.toString(),
|
||||
outputPath.toString());
|
||||
ProcessExecutorResult result =
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF)
|
||||
.runCommandWithOutputHandling(command);
|
||||
if (result.getRc() != 0) {
|
||||
throw new IOException(
|
||||
"OCRmyPDF restoration failed with return code: "
|
||||
+ result.getRc()
|
||||
+ ". Error: "
|
||||
+ result.getMessages());
|
||||
try (TempFile tempInputFile = new TempFile(tempFileManager, ".pdf");
|
||||
TempFile tempOutputFile = new TempFile(tempFileManager, ".pdf")) {
|
||||
java.nio.file.Files.write(tempInputFile.getPath(), redactedPdfBytes);
|
||||
|
||||
if (isOcrMyPdfAvailable()) {
|
||||
return processWithOcrMyPdfForRestoration(
|
||||
tempInputFile.getPath(), tempOutputFile.getPath(), request);
|
||||
} else if (isTesseractAvailable()) {
|
||||
return processWithTesseractForRestoration(
|
||||
tempInputFile.getPath(), tempOutputFile.getPath(), request);
|
||||
}
|
||||
return redactedPdfBytes;
|
||||
}
|
||||
return java.nio.file.Files.readAllBytes(outputPath);
|
||||
}
|
||||
|
||||
private static boolean removeSemanticProperties(COSDictionary dict, boolean removeTU) {
|
||||
@ -1427,59 +1463,62 @@ public class RedactionService {
|
||||
}
|
||||
}
|
||||
|
||||
private int getOriginalTokenCount(PDPage page) {
|
||||
public void performTextReplacementAggressive(
|
||||
PDDocument document,
|
||||
Map<Integer, List<PDFText>> allFoundTextsByPage,
|
||||
String[] listOfText,
|
||||
boolean useRegex,
|
||||
boolean wholeWordSearchBool) {
|
||||
if (allFoundTextsByPage.isEmpty()) return;
|
||||
|
||||
Set<String> allSearchTerms =
|
||||
Arrays.stream(listOfText)
|
||||
.map(String::trim)
|
||||
.filter(s -> !s.isEmpty())
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
this.aggressiveMode = true;
|
||||
this.aggressiveSegMatches = new HashMap<>();
|
||||
|
||||
try {
|
||||
PDFStreamParser parser = new PDFStreamParser(page);
|
||||
int count = 0;
|
||||
while (parser.parseNextToken() != null) {
|
||||
count++;
|
||||
}
|
||||
return count;
|
||||
} catch (Exception e) {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) {
|
||||
boolean anyResidual = false;
|
||||
|
||||
private COSArray buildKerningAdjustedTJArray(
|
||||
COSArray originalArray, COSArray redactedArray, TextSegment segment) {
|
||||
for (int pageIndex = 0; pageIndex < document.getNumberOfPages(); pageIndex++) {
|
||||
PDPage page = document.getPages().get(pageIndex);
|
||||
try {
|
||||
if (segment == null || segment.getFont() == null || segment.getFontSize() <= 0)
|
||||
return redactedArray;
|
||||
this.aggressiveSegMatches = new HashMap<>();
|
||||
List<Object> filtered =
|
||||
createTokensWithoutTargetText(
|
||||
document,
|
||||
page,
|
||||
allSearchTerms,
|
||||
useRegex,
|
||||
wholeWordSearchBool);
|
||||
writeFilteredContentStream(document, page, filtered);
|
||||
|
||||
COSArray out = new COSArray();
|
||||
int size = redactedArray.size();
|
||||
for (int i = 0; i < size; i++) {
|
||||
COSBase redEl = redactedArray.get(i);
|
||||
COSBase origEl =
|
||||
(originalArray != null && i < originalArray.size())
|
||||
? originalArray.get(i)
|
||||
: null;
|
||||
if (pageStillContainsTargets(
|
||||
document,
|
||||
pageIndex,
|
||||
allSearchTerms,
|
||||
useRegex,
|
||||
wholeWordSearchBool)) {
|
||||
anyResidual = true;
|
||||
processResidualText(document, page, filtered);
|
||||
}
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
}
|
||||
|
||||
out.add(redEl);
|
||||
|
||||
if (redEl instanceof COSString redStr && origEl instanceof COSString origStr) {
|
||||
String origText = getDecodedString(origStr, segment.getFont());
|
||||
String modText = getDecodedString(redStr, segment.getFont());
|
||||
float wOrig =
|
||||
calculateSafeWidth(origText, segment.getFont(), segment.getFontSize());
|
||||
float wMod =
|
||||
calculateSafeWidth(modText, segment.getFont(), segment.getFontSize());
|
||||
float adjustment = wOrig - wMod;
|
||||
if (Math.abs(adjustment) > PRECISION_THRESHOLD) {
|
||||
float kerning = (-adjustment / segment.getFontSize()) * FONT_SCALE_FACTOR;
|
||||
if (i + 1 < size && redactedArray.get(i + 1) instanceof COSNumber num) {
|
||||
i++;
|
||||
float combined = num.floatValue() + kerning;
|
||||
out.add(new COSFloat(combined));
|
||||
} else {
|
||||
out.add(new COSFloat(kerning));
|
||||
if (!anyResidual
|
||||
|| !documentStillContainsTargets(
|
||||
document, allSearchTerms, useRegex, wholeWordSearchBool)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return out;
|
||||
} catch (Exception e) {
|
||||
return redactedArray;
|
||||
} finally {
|
||||
this.aggressiveMode = false;
|
||||
this.aggressiveSegMatches = null;
|
||||
}
|
||||
}
|
||||
|
||||
@ -1678,6 +1717,21 @@ public class RedactionService {
|
||||
return problematicRatio > 0.3;
|
||||
}
|
||||
|
||||
private void processResidualText(PDDocument document, PDPage page, List<Object> filtered) {
|
||||
try {
|
||||
var sem = wipeAllSemanticTextInTokens(filtered);
|
||||
filtered = sem.tokens;
|
||||
PDResources res = page.getResources();
|
||||
if (res != null) {
|
||||
wipeAllSemanticTextInProperties(res);
|
||||
wipeAllTextInXObjects(document, res);
|
||||
wipeAllTextInPatterns(document, res);
|
||||
}
|
||||
writeFilteredContentStream(document, page, filtered);
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
}
|
||||
|
||||
public boolean performTextReplacement(
|
||||
PDDocument document,
|
||||
Map<Integer, List<PDFText>> allFoundTextsByPage,
|
||||
@ -1688,151 +1742,38 @@ public class RedactionService {
|
||||
log.info("No text found to redact");
|
||||
return false;
|
||||
}
|
||||
try {
|
||||
|
||||
Set<String> allSearchTerms =
|
||||
Arrays.stream(listOfText)
|
||||
.map(String::trim)
|
||||
.filter(s -> !s.isEmpty())
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
log.info(
|
||||
"Starting text replacement with {} search terms: {}",
|
||||
allSearchTerms.size(),
|
||||
allSearchTerms);
|
||||
log.info("Total pages in document: {}", document.getNumberOfPages());
|
||||
log.info("Initial text found on {} pages", allFoundTextsByPage.size());
|
||||
log.info("Starting text replacement with {} search terms", allSearchTerms.size());
|
||||
|
||||
int initialTotalInstances =
|
||||
allFoundTextsByPage.values().stream().mapToInt(List::size).sum();
|
||||
log.info("Total initial instances to redact: {}", initialTotalInstances);
|
||||
|
||||
int finalSweepCount = 0;
|
||||
for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) {
|
||||
finalSweepCount = sweep + 1;
|
||||
log.info("=== Starting sweep {} of {} ===", sweep + 1, MAX_SWEEPS);
|
||||
int pagesProcessed = 0;
|
||||
int totalModifications = 0;
|
||||
processPages(document, allSearchTerms, useRegex, wholeWordSearchBool);
|
||||
|
||||
for (int pageIndex = 0; pageIndex < document.getNumberOfPages(); pageIndex++) {
|
||||
PDPage page = document.getPages().get(pageIndex);
|
||||
List<PDFText> pageFoundTexts =
|
||||
allFoundTextsByPage.getOrDefault(pageIndex, List.of());
|
||||
|
||||
log.debug(
|
||||
"Processing page {} - found {} instances",
|
||||
pageIndex + 1,
|
||||
pageFoundTexts.size());
|
||||
|
||||
List<Object> filtered =
|
||||
createTokensWithoutTargetText(
|
||||
document, page, allSearchTerms, useRegex, wholeWordSearchBool);
|
||||
writeFilteredContentStream(document, page, filtered);
|
||||
|
||||
int tokenDiff = Math.abs(filtered.size() - getOriginalTokenCount(page));
|
||||
totalModifications += tokenDiff;
|
||||
pagesProcessed++;
|
||||
|
||||
log.debug("Page {} - token modifications: {}", pageIndex + 1, tokenDiff);
|
||||
}
|
||||
|
||||
log.info(
|
||||
"Sweep {} completed - processed {} pages, total modifications: {}",
|
||||
sweep + 1,
|
||||
pagesProcessed,
|
||||
totalModifications);
|
||||
|
||||
boolean stillContainsTargets =
|
||||
documentStillContainsTargets(
|
||||
document, allSearchTerms, useRegex, wholeWordSearchBool);
|
||||
|
||||
if (!stillContainsTargets) {
|
||||
if (!documentStillContainsTargets(
|
||||
document, allSearchTerms, useRegex, wholeWordSearchBool)) {
|
||||
log.info("SUCCESS: All targets removed after {} sweeps", sweep + 1);
|
||||
break;
|
||||
} else {
|
||||
log.warn(
|
||||
"WARNING: Still contains targets after sweep {} - continuing...",
|
||||
sweep + 1);
|
||||
}
|
||||
}
|
||||
|
||||
boolean finalCheck = false;
|
||||
for (int verifyAttempt = 0; verifyAttempt < 3; verifyAttempt++) {
|
||||
log.info("Final verification attempt {} of 3", verifyAttempt + 1);
|
||||
finalCheck =
|
||||
documentStillContainsTargets(
|
||||
document, allSearchTerms, useRegex, wholeWordSearchBool);
|
||||
|
||||
if (!finalCheck) {
|
||||
log.info(
|
||||
"Verification attempt {} passed - no targets found", verifyAttempt + 1);
|
||||
break;
|
||||
} else {
|
||||
log.warn("Verification attempt {} found remaining targets", verifyAttempt + 1);
|
||||
if (verifyAttempt < 2) {
|
||||
log.info("Performing additional cleanup sweep due to verification failure");
|
||||
for (PDPage page : document.getPages()) {
|
||||
List<Object> additionalFiltered =
|
||||
createTokensWithoutTargetText(
|
||||
document,
|
||||
page,
|
||||
allSearchTerms,
|
||||
useRegex,
|
||||
wholeWordSearchBool);
|
||||
writeFilteredContentStream(document, page, additionalFiltered);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (finalCheck) {
|
||||
log.error(
|
||||
"FAILURE: Document still contains targets after {} sweeps and {} verification attempts. Falling back to visual redaction with OCR restoration.",
|
||||
MAX_SWEEPS,
|
||||
3);
|
||||
log.error("Remaining search terms: {}", allSearchTerms);
|
||||
|
||||
log.error("=== DETAILED FAILURE ANALYSIS ===");
|
||||
for (int pageIdx = 0; pageIdx < document.getNumberOfPages(); pageIdx++) {
|
||||
for (String term : allSearchTerms) {
|
||||
try {
|
||||
TextFinder finder = new TextFinder(term, useRegex, wholeWordSearchBool);
|
||||
finder.setStartPage(pageIdx + 1);
|
||||
finder.setEndPage(pageIdx + 1);
|
||||
finder.getText(document);
|
||||
|
||||
for (PDFText found : finder.getFoundTexts()) {
|
||||
if (found.getPageIndex() == pageIdx) {
|
||||
log.error(
|
||||
"REMAINING: '{}' found on page {} at position ({}, {})",
|
||||
term,
|
||||
pageIdx + 1,
|
||||
found.getX1(),
|
||||
found.getY1());
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.error(
|
||||
"Error during failure analysis for term '{}' on page {}: {}",
|
||||
term,
|
||||
pageIdx + 1,
|
||||
e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
log.error("=== END FAILURE ANALYSIS ===");
|
||||
|
||||
return true;
|
||||
} else {
|
||||
log.info(
|
||||
"SUCCESS: All text redaction completed successfully after {} sweeps",
|
||||
finalSweepCount);
|
||||
return false;
|
||||
}
|
||||
|
||||
} catch (Exception e) {
|
||||
log.error("Exception during text replacement: {}", e.getMessage(), e);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Verification attempts
|
||||
for (int attempt = 0; attempt < 3; attempt++) {
|
||||
if (!documentStillContainsTargets(
|
||||
document, allSearchTerms, useRegex, wholeWordSearchBool)) {
|
||||
return false;
|
||||
}
|
||||
if (attempt < 2) {
|
||||
processPages(document, allSearchTerms, useRegex, wholeWordSearchBool);
|
||||
}
|
||||
}
|
||||
|
||||
log.error("FAILURE: Document still contains targets after {} sweeps", MAX_SWEEPS);
|
||||
return true;
|
||||
}
|
||||
|
||||
private COSArray createRedactedTJArray(
|
||||
@ -1917,99 +1858,21 @@ public class RedactionService {
|
||||
};
|
||||
}
|
||||
|
||||
private List<MatchRange> findMatchesInSegments(
|
||||
List<TextSegment> segments,
|
||||
Set<String> targetWords,
|
||||
private void processPages(
|
||||
PDDocument document,
|
||||
Set<String> allSearchTerms,
|
||||
boolean useRegex,
|
||||
boolean wholeWordSearch) {
|
||||
List<MatchRange> allMatches = new ArrayList<>();
|
||||
List<Pattern> patterns =
|
||||
TextFinderUtils.createOptimizedSearchPatterns(
|
||||
targetWords, useRegex, wholeWordSearch);
|
||||
|
||||
log.debug("Searching for {} patterns in {} segments", patterns.size(), segments.size());
|
||||
|
||||
int totalMatchesFound = 0;
|
||||
|
||||
for (int i = 0; i < segments.size(); i++) {
|
||||
TextSegment segment = segments.get(i);
|
||||
String segmentText = segment.getText();
|
||||
if (segmentText == null || segmentText.isEmpty()) {
|
||||
log.debug("Skipping empty segment {}", i);
|
||||
continue;
|
||||
}
|
||||
|
||||
log.debug("Processing segment {}: '{}'", i, segmentText);
|
||||
|
||||
if (segment.getFont() != null
|
||||
&& !TextEncodingHelper.isTextSegmentRemovable(segment.getFont(), segmentText)) {
|
||||
log.debug(
|
||||
"Skipping segment {} - font not removable: {}",
|
||||
i,
|
||||
segment.getFont().getName());
|
||||
continue;
|
||||
}
|
||||
|
||||
int segmentMatches = 0;
|
||||
for (Pattern pattern : patterns) {
|
||||
boolean wholeWordSearchBool) {
|
||||
for (PDPage page : document.getPages()) {
|
||||
try {
|
||||
log.debug(
|
||||
"Matching pattern '{}' against segment text '{}'",
|
||||
pattern.pattern(),
|
||||
segmentText);
|
||||
var matcher = pattern.matcher(segmentText);
|
||||
while (matcher.find()) {
|
||||
int matchStart = matcher.start();
|
||||
int matchEnd = matcher.end();
|
||||
|
||||
log.debug(
|
||||
"Found match in segment {}: positions {}-{}",
|
||||
i,
|
||||
matchStart,
|
||||
matchEnd);
|
||||
|
||||
if (matchStart >= 0
|
||||
&& matchEnd <= segmentText.length()
|
||||
&& matchStart < matchEnd) {
|
||||
String matchedText = segmentText.substring(matchStart, matchEnd);
|
||||
log.debug("Matched text: '{}'", matchedText);
|
||||
|
||||
allMatches.add(
|
||||
new MatchRange(
|
||||
segment.getStartPos() + matchStart,
|
||||
segment.getStartPos() + matchEnd));
|
||||
segmentMatches++;
|
||||
totalMatchesFound++;
|
||||
}
|
||||
}
|
||||
List<Object> filtered =
|
||||
createTokensWithoutTargetText(
|
||||
document, page, allSearchTerms, useRegex, wholeWordSearchBool);
|
||||
writeFilteredContentStream(document, page, filtered);
|
||||
} catch (Exception e) {
|
||||
log.error("Error matching pattern in segment {}: {}", i, e.getMessage());
|
||||
log.warn("Error processing page: {}", e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
if (segmentMatches > 0) {
|
||||
log.info("Segment {} had {} matches", i, segmentMatches);
|
||||
}
|
||||
}
|
||||
|
||||
log.info("Total matches found across all segments: {}", totalMatchesFound);
|
||||
allMatches.sort(Comparator.comparingInt(MatchRange::getStartPos));
|
||||
|
||||
if (allMatches.isEmpty()) {
|
||||
log.warn("No matches found in segments. This might indicate:");
|
||||
log.warn("1. Text encoding issues preventing proper extraction");
|
||||
log.warn("2. Font compatibility issues");
|
||||
log.warn("3. Search terms not matching extracted text");
|
||||
log.warn("4. Whole word search filtering out matches");
|
||||
|
||||
if (!segments.isEmpty()) {
|
||||
log.warn("Sample segment text: '{}'", segments.get(0).getText());
|
||||
log.warn("Target words: {}", targetWords);
|
||||
log.warn("Use regex: {}, Whole word search: {}", useRegex, wholeWordSearch);
|
||||
}
|
||||
}
|
||||
|
||||
return allMatches;
|
||||
}
|
||||
|
||||
private String createSafeReplacement(String originalPart, TextSegment segment) {
|
||||
@ -2962,9 +2825,9 @@ public class RedactionService {
|
||||
|
||||
@Data
|
||||
public static class DecodedMapping {
|
||||
public String text;
|
||||
public int[] charByteStart;
|
||||
public int[] charByteEnd;
|
||||
private String text;
|
||||
private int[] charByteStart;
|
||||
private int[] charByteEnd;
|
||||
}
|
||||
|
||||
@Data
|
||||
|
@ -5,10 +5,17 @@ import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
import lombok.experimental.UtilityClass;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
@Slf4j
|
||||
@UtilityClass
|
||||
public class TextEncodingHelper {
|
||||
|
||||
private final Pattern PATTERN = Pattern.compile("^[A-Z]+$");
|
||||
private final Pattern REGEX = Pattern.compile("^[A-Z]{6}\\+.*");
|
||||
private final Pattern REGEXP = Pattern.compile("^[A-Z]{5}\\+.*");
|
||||
private final Pattern PATTERN1 = Pattern.compile("^[A-Z]{4}\\+.*");
|
||||
|
||||
public boolean canEncodeCharacters(PDFont font, String text) {
|
||||
if (font == null || text == null) {
|
||||
return false;
|
||||
@ -421,21 +428,21 @@ public class TextEncodingHelper {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (fontName.matches("^[A-Z]{6}\\+.*")) {
|
||||
if (REGEX.matcher(fontName).matches()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (fontName.matches("^[A-Z]{5}\\+.*")) {
|
||||
if (REGEXP.matcher(fontName).matches()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (fontName.matches("^[A-Z]{4}\\+.*")) {
|
||||
if (PATTERN1.matcher(fontName).matches()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (fontName.contains("+")) {
|
||||
String prefix = fontName.split("\\+")[0];
|
||||
if (prefix.matches("^[A-Z]+$") && prefix.length() >= 4) {
|
||||
if (PATTERN.matcher(prefix).matches() && prefix.length() >= 4) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@ -510,68 +517,4 @@ public class TextEncodingHelper {
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean canEncodeAnyCharacter(PDFont font) {
|
||||
if (font == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
String[] testStrings = {
|
||||
"a", "A", "0", " ", ".", "!", "e", "i", "o", "u", "n", "t", "r", "s", "l", "1", "2",
|
||||
"3", "4", "5", "6", "7", "8", "9", ",", ".", ";", ":", "?", "!", "(", ")", "[", "]",
|
||||
"{", "}", "hello", "test", "sample", "abc", "123", "ABC"
|
||||
};
|
||||
|
||||
for (String testStr : testStrings) {
|
||||
try {
|
||||
byte[] encoded = font.encode(testStr);
|
||||
if (encoded.length > 0) {
|
||||
return true;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
}
|
||||
|
||||
for (int code = 0; code <= 0xFFFF; code += 100) {
|
||||
try {
|
||||
String testStr = String.valueOf((char) code);
|
||||
byte[] encoded = font.encode(testStr);
|
||||
if (encoded.length > 0) {
|
||||
return true;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean isValidFont(PDFont font) {
|
||||
if (font == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
try {
|
||||
String name = font.getName();
|
||||
if (name != null && !name.trim().isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
try {
|
||||
if (canCalculateBasicWidths(font)) {
|
||||
return true;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
try {
|
||||
if (canEncodeAnyCharacter(font)) {
|
||||
return true;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -80,10 +80,6 @@ public class WidthCalculator {
|
||||
Float charWidth =
|
||||
calculateSingleCharacterWidth(font, character, fontSize, codePoint);
|
||||
|
||||
if (charWidth == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
totalWidth += charWidth;
|
||||
if (previousCodePoint != -1) {
|
||||
totalWidth += calculateKerning(font, previousCodePoint, codePoint, fontSize);
|
||||
@ -203,9 +199,6 @@ public class WidthCalculator {
|
||||
|
||||
Float charWidth =
|
||||
calculateGlyphWidthComprehensively(font, character, codePoint, fontSize);
|
||||
if (charWidth == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
totalWidth += charWidth;
|
||||
i += Character.charCount(codePoint);
|
||||
@ -514,64 +507,4 @@ public class WidthCalculator {
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
public float calculateMinimumTextWidth(PDFont font, String text, float fontSize) {
|
||||
if (font == null || text == null || text.isEmpty() || fontSize <= 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
try {
|
||||
float minWidth = calculateAccurateWidth(font, text, fontSize);
|
||||
if (minWidth > 0) {
|
||||
return minWidth * 0.8f;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
return text.length() * fontSize * 0.3f;
|
||||
}
|
||||
|
||||
public float calculateMaximumTextWidth(PDFont font, String text, float fontSize) {
|
||||
if (font == null || text == null || text.isEmpty() || fontSize <= 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
try {
|
||||
float maxWidth = calculateAccurateWidth(font, text, fontSize);
|
||||
if (maxWidth > 0) {
|
||||
return maxWidth * 1.2f;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
return text.length() * fontSize * 1.0f;
|
||||
}
|
||||
|
||||
public boolean canCalculateWidthForText(PDFont font, String text) {
|
||||
if (font == null || text == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (text.isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
try {
|
||||
Float width = calculateDirectWidth(font, text, 12f);
|
||||
if (width != null) {
|
||||
return true;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
try {
|
||||
Float width = calculateCharacterByCharacterWidth(font, text, 12f);
|
||||
if (width != null) {
|
||||
return true;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
@ -13,20 +13,7 @@
|
||||
color: #6c757d !important;
|
||||
}
|
||||
|
||||
.btn-primary:focus {
|
||||
box-shadow: 0 0 0 0.2rem rgba(13, 110, 253, 0.25);
|
||||
outline: 2px solid #0d6efd;
|
||||
outline-offset: 2px;
|
||||
}
|
||||
|
||||
.form-check-input:focus {
|
||||
box-shadow: 0 0 0 0.2rem rgba(13, 110, 253, 0.25);
|
||||
outline: 2px solid #0d6efd;
|
||||
outline-offset: 2px;
|
||||
}
|
||||
|
||||
.form-control:focus, .form-select:focus {
|
||||
border-color: #0d6efd;
|
||||
.btn-primary:focus, .form-check-input:focus, .form-control:focus, .form-select:focus {
|
||||
box-shadow: 0 0 0 0.2rem rgba(13, 110, 253, 0.25);
|
||||
outline: 2px solid #0d6efd;
|
||||
outline-offset: 2px;
|
||||
@ -36,20 +23,6 @@
|
||||
background-color: #0d6efd;
|
||||
border-color: #0d6efd;
|
||||
}
|
||||
|
||||
|
||||
|
||||
.sr-only {
|
||||
position: absolute;
|
||||
width: 1px;
|
||||
height: 1px;
|
||||
padding: 0;
|
||||
margin: -1px;
|
||||
overflow: hidden;
|
||||
clip: rect(0, 0, 0, 0);
|
||||
white-space: nowrap;
|
||||
border: 0;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user