Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
Balázs Szücs 2025-09-02 21:29:15 +02:00
parent f00952b856
commit e14941695e

View File

@ -6,6 +6,7 @@ import java.io.ByteArrayOutputStream;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.util.ArrayDeque; import java.util.ArrayDeque;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
@ -46,6 +47,7 @@ import org.apache.pdfbox.pdmodel.font.PDType0Font;
import org.apache.pdfbox.pdmodel.graphics.PDXObject; import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject; import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.pattern.PDTilingPattern; import org.apache.pdfbox.pdmodel.graphics.pattern.PDTilingPattern;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.rendering.PDFRenderer; import org.apache.pdfbox.rendering.PDFRenderer;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile; import org.springframework.web.multipart.MultipartFile;
@ -2377,12 +2379,10 @@ public class RedactionService {
for (COSBase element : cosArray) { for (COSBase element : cosArray) {
if (element instanceof COSString cosString) { if (element instanceof COSString cosString) {
textBuilder.append(extractStringWithFallbacks(cosString, font)); textBuilder.append(extractStringWithFallbacks(cosString, font));
} else if (element instanceof COSNumber cosNumber) { } else if (element instanceof COSNumber cosNumber && cosNumber.floatValue() < -100.0) {
if (cosNumber.floatValue() < -100.0) {
textBuilder.append(" "); textBuilder.append(" ");
} }
} }
}
return textBuilder.toString(); return textBuilder.toString();
} }
@ -2417,29 +2417,16 @@ public class RedactionService {
TextSegment segment, List<MatchRange> matches, String text) { TextSegment segment, List<MatchRange> matches, String text) {
float totalOriginalWidth = 0f, totalPlaceholderWidth = 0f; float totalOriginalWidth = 0f, totalPlaceholderWidth = 0f;
int processedMatches = 0; int processedMatches = 0;
List<String> warnings = new ArrayList<>();
for (MatchRange match : matches) { for (MatchRange match : matches) {
try { try {
int segStart = Math.max(0, match.getStartPos() - segment.getStartPos()); int segStart = Math.max(0, match.getStartPos() - segment.getStartPos());
int segEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos()); int segEnd = Math.min(text.length(), match.getEndPos() - segment.getStartPos());
if (segStart >= text.length() || segEnd <= segStart || segStart < 0) {
warnings.add("Invalid bounds: " + segStart + "-" + segEnd);
continue;
}
String originalPart = text.substring(segStart, segEnd); String originalPart = text.substring(segStart, segEnd);
WidthMeasurement originalMeasurement = WidthMeasurement originalMeasurement =
measureTextWidth(segment.getFont(), originalPart, segment.getFontSize()); measureTextWidth(segment.getFont(), originalPart, segment.getFontSize());
if (!originalMeasurement.valid()) {
warnings.add(
"Cannot measure: "
+ originalPart.substring(
0, Math.min(10, originalPart.length())));
continue;
}
String placeholderPart = createSafePlaceholder(originalPart, segment); String placeholderPart = createSafePlaceholder(originalPart, segment);
WidthMeasurement placeholderMeasurement = WidthMeasurement placeholderMeasurement =
@ -2453,12 +2440,12 @@ public class RedactionService {
processedMatches++; processedMatches++;
} catch (Exception e) { } catch (Exception e) {
warnings.add("Error: " + e.getMessage()); log.warn("Error processing match: {}", e.getMessage());
} }
} }
return new WidthCalculationResult( return new WidthCalculationResult(
totalOriginalWidth - totalPlaceholderWidth, processedMatches, warnings); totalOriginalWidth - totalPlaceholderWidth, processedMatches);
} }
private static String createSafePlaceholder(String originalText, TextSegment segment) { private static String createSafePlaceholder(String originalText, TextSegment segment) {
@ -2528,6 +2515,7 @@ public class RedactionService {
float kerning = (-adjustment / segment.getFontSize()) * FONT_SCALE_FACTOR; float kerning = (-adjustment / segment.getFontSize()) * FONT_SCALE_FACTOR;
array.add(new COSFloat(kerning)); array.add(new COSFloat(kerning));
} catch (Exception ignored) { } catch (Exception ignored) {
log.warn("Failed to add kerning adjustment", ignored);
} }
} }
@ -2584,8 +2572,7 @@ public class RedactionService {
TextSegment segment, TextSegment segment,
List<MatchRange> matches, List<MatchRange> matches,
COSArray newArray, COSArray newArray,
int textOffsetInSegment) int textOffsetInSegment) {
throws Exception {
String originalText = getDecodedString(cosString, segment.getFont()); String originalText = getDecodedString(cosString, segment.getFont());
@ -2657,13 +2644,12 @@ public class RedactionService {
} }
private byte[] processWithTesseractForRestoration( private byte[] processWithTesseractForRestoration(
java.nio.file.Path inputPath, java.nio.file.Path outputPath, RedactPdfRequest request) Path inputPath, Path outputPath, RedactPdfRequest request)
throws IOException, InterruptedException { throws IOException, InterruptedException {
try (TempDirectory tempDir = new TempDirectory(tempFileManager)) { try (TempDirectory tempDir = new TempDirectory(tempFileManager)) {
java.io.File tempOutputDir = new java.io.File(tempDir.getPath().toFile(), "output"); File tempOutputDir = new File(tempDir.getPath().toFile(), "output");
java.io.File tempImagesDir = new java.io.File(tempDir.getPath().toFile(), "images"); File tempImagesDir = new File(tempDir.getPath().toFile(), "images");
java.io.File finalOutputFile = File finalOutputFile = new File(tempDir.getPath().toFile(), "final_output.pdf");
new java.io.File(tempDir.getPath().toFile(), "final_output.pdf");
tempOutputDir.mkdirs(); tempOutputDir.mkdirs();
tempImagesDir.mkdirs(); tempImagesDir.mkdirs();
try (PDDocument document = pdfDocumentFactory.load(inputPath.toFile())) { try (PDDocument document = pdfDocumentFactory.load(inputPath.toFile())) {
@ -2727,13 +2713,6 @@ public class RedactionService {
directFinder.setStartPage(document.getPages().indexOf(page) + 1); directFinder.setStartPage(document.getPages().indexOf(page) + 1);
directFinder.setEndPage(document.getPages().indexOf(page) + 1); directFinder.setEndPage(document.getPages().indexOf(page) + 1);
directFinder.getText(document); directFinder.getText(document);
StringBuilder pageText = new StringBuilder();
for (PDFText pdfText : directFinder.getFoundTexts()) {
if (pdfText.getText() != null) {
pageText.append(pdfText.getText()).append(" ");
}
}
} catch (Exception e) { } catch (Exception e) {
log.debug("Failed to get direct text from page", e); log.debug("Failed to get direct text from page", e);
} }
@ -2753,24 +2732,6 @@ public class RedactionService {
List<TextSegment> textSegments = List<TextSegment> textSegments =
extractTextSegmentsFromTokens(page.getResources(), tokens, this.aggressiveMode); extractTextSegmentsFromTokens(page.getResources(), tokens, this.aggressiveMode);
if (!textSegments.isEmpty()) {
StringBuilder allText = new StringBuilder();
boolean hasProblematicChars = false;
for (TextSegment seg : textSegments) {
if (seg.getText() != null && !seg.getText().trim().isEmpty()) {
String segmentText = seg.getText();
if (!isTextSafeForRedaction(segmentText)) {
hasProblematicChars = true;
segmentText = normalizeTextForRedaction(segmentText);
}
allText.append(segmentText).append(" ");
}
}
String completeText = allText.toString().trim();
}
List<MatchRange> matches; List<MatchRange> matches;
if (this.aggressiveMode) { if (this.aggressiveMode) {
matches = matches =
@ -2792,9 +2753,11 @@ public class RedactionService {
wipeAllTextInFormXObject(document, form); wipeAllTextInFormXObject(document, form);
} }
} catch (Exception ignored) { } catch (Exception ignored) {
log.debug("Failed to wipe text in xobject {}", xobjName);
} }
} }
} catch (Exception ignored) { } catch (Exception ignored) {
log.debug("Failed to wipe all text in XObjects", ignored);
} }
} }
@ -2828,13 +2791,10 @@ public class RedactionService {
} }
} }
private record WidthCalculationResult( private record WidthCalculationResult(float adjustment, int processedMatches) {
float adjustment, int processedMatches, List<String> warnings) { private WidthCalculationResult(float adjustment, int processedMatches) {
private WidthCalculationResult(
float adjustment, int processedMatches, List<String> warnings) {
this.adjustment = adjustment; this.adjustment = adjustment;
this.processedMatches = processedMatches; this.processedMatches = processedMatches;
this.warnings = new ArrayList<>(warnings);
} }
} }
@ -3064,7 +3024,7 @@ public class RedactionService {
} }
} catch (Exception e) { } catch (Exception e) {
log.debug("Failed to scrub document", e);
} }
} }
@ -3077,7 +3037,7 @@ public class RedactionService {
scrubStructureElement(structRoot, options); scrubStructureElement(structRoot, options);
} }
} catch (Exception e) { } catch (Exception e) {
log.debug("Failed to scrub structure tree", e);
} }
} }
@ -3126,9 +3086,8 @@ public class RedactionService {
private void scrubAnnotations(PDDocument document, Set<ScrubOption> options) { private void scrubAnnotations(PDDocument document, Set<ScrubOption> options) {
try { try {
for (org.apache.pdfbox.pdmodel.PDPage page : document.getPages()) { for (PDPage page : document.getPages()) {
for (org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation annotation : for (PDAnnotation annotation : page.getAnnotations()) {
page.getAnnotations()) {
COSDictionary annotDict = annotation.getCOSObject(); COSDictionary annotDict = annotation.getCOSObject();
if (options.contains(ScrubOption.REMOVE_ACTUALTEXT)) { if (options.contains(ScrubOption.REMOVE_ACTUALTEXT)) {