Refactor redaction services and utilities for improved readability and maintainability

- Adjusted indentation and formatting across multiple files for consistency.
- Improved import ordering in utility classes for better organization.
- Enhanced `performTextReplacementAggressive` method with multi-sweep logic to handle residual text more effectively.
- Added helper methods for verifying document text targets to streamline aggressive redaction.
- Simplified logic and formatting in `RedactionService` and related classes.

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
Balázs Szücs 2025-08-20 22:45:08 +02:00
parent 8f19369c58
commit 1fac74a3ca
10 changed files with 441 additions and 396 deletions

View File

@ -53,12 +53,12 @@ public class RedactController {
throws IOException { throws IOException {
byte[] pdfContent = redactionService.redactPDF(request); byte[] pdfContent = redactionService.redactPDF(request);
return WebResponseUtils.bytesToWebResponse( return WebResponseUtils.bytesToWebResponse(
pdfContent, pdfContent,
removeFileExtension( removeFileExtension(
Objects.requireNonNull( Objects.requireNonNull(
Filenames.toSimpleFileName( Filenames.toSimpleFileName(
request.getFileInput().getOriginalFilename()))) request.getFileInput().getOriginalFilename())))
+ "_redacted.pdf"); + "_redacted.pdf");
} }
@PostMapping(value = "/auto-redact", consumes = "multipart/form-data") @PostMapping(value = "/auto-redact", consumes = "multipart/form-data")
@ -69,14 +69,14 @@ public class RedactController {
+ "Users can provide text patterns to redact, with options for regex and whole word matching. " + "Users can provide text patterns to redact, with options for regex and whole word matching. "
+ "Input:PDF Output:PDF Type:SISO") + "Input:PDF Output:PDF Type:SISO")
public ResponseEntity<byte[]> redactPdf(@ModelAttribute RedactPdfRequest request) public ResponseEntity<byte[]> redactPdf(@ModelAttribute RedactPdfRequest request)
throws IOException { throws IOException {
byte[] pdfContent = redactionService.redactPdf(request); byte[] pdfContent = redactionService.redactPdf(request);
return WebResponseUtils.bytesToWebResponse( return WebResponseUtils.bytesToWebResponse(
pdfContent, pdfContent,
removeFileExtension( removeFileExtension(
Objects.requireNonNull( Objects.requireNonNull(
Filenames.toSimpleFileName( Filenames.toSimpleFileName(
request.getFileInput().getOriginalFilename()))) request.getFileInput().getOriginalFilename())))
+ "_redacted.pdf"); + "_redacted.pdf");
} }
} }

View File

@ -48,9 +48,9 @@ public class RedactPdfRequest extends PDFFile {
private Boolean convertPDFToImage; private Boolean convertPDFToImage;
@Schema( @Schema(
description = "Redaction mode: moderate, visual, or aggressive", description = "Redaction mode: moderate, visual, or aggressive",
defaultValue = "moderate", defaultValue = "moderate",
allowableValues = {"moderate", "visual", "aggressive"}, allowableValues = {"moderate", "visual", "aggressive"},
requiredMode = Schema.RequiredMode.NOT_REQUIRED) requiredMode = Schema.RequiredMode.NOT_REQUIRED)
private String redactionMode; private String redactionMode;
} }

View File

@ -17,7 +17,7 @@ class AggressiveRedactionService implements RedactionModeStrategy {
private final RedactionService helper; private final RedactionService helper;
AggressiveRedactionService( AggressiveRedactionService(
CustomPDFDocumentFactory pdfDocumentFactory, RedactionService helper) { CustomPDFDocumentFactory pdfDocumentFactory, RedactionService helper) {
this.pdfDocumentFactory = pdfDocumentFactory; this.pdfDocumentFactory = pdfDocumentFactory;
this.helper = helper; this.helper = helper;
} }
@ -33,7 +33,7 @@ class AggressiveRedactionService implements RedactionModeStrategy {
try { try {
doc = pdfDocumentFactory.load(request.getFileInput()); doc = pdfDocumentFactory.load(request.getFileInput());
Map<Integer, List<PDFText>> allFound = Map<Integer, List<PDFText>> allFound =
RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord); RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord);
if (allFound.isEmpty()) { if (allFound.isEmpty()) {
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
doc.save(baos); doc.save(baos);
@ -42,31 +42,31 @@ class AggressiveRedactionService implements RedactionModeStrategy {
} }
helper.performTextReplacementAggressive(doc, allFound, listOfText, useRegex, wholeWord); helper.performTextReplacementAggressive(doc, allFound, listOfText, useRegex, wholeWord);
Map<Integer, List<PDFText>> residual = Map<Integer, List<PDFText>> residual =
RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord); RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord);
boolean residualExists = residual.values().stream().mapToInt(List::size).sum() > 0; boolean residualExists = residual.values().stream().mapToInt(List::size).sum() > 0;
String effectiveColor = String effectiveColor =
(request.getRedactColor() == null || request.getRedactColor().isBlank()) (request.getRedactColor() == null || request.getRedactColor().isBlank())
? "#000000" ? "#000000"
: request.getRedactColor(); : request.getRedactColor();
if (residualExists) { if (residualExists) {
fb = pdfDocumentFactory.load(request.getFileInput()); fb = pdfDocumentFactory.load(request.getFileInput());
Map<Integer, List<PDFText>> fbFound = Map<Integer, List<PDFText>> fbFound =
RedactionService.findTextToRedact(fb, listOfText, useRegex, wholeWord); RedactionService.findTextToRedact(fb, listOfText, useRegex, wholeWord);
return RedactionService.finalizeRedaction( return RedactionService.finalizeRedaction(
fb, fb,
fbFound, fbFound,
effectiveColor, effectiveColor,
request.getCustomPadding(), /*force*/ request.getCustomPadding(), /*force*/
true, true,
false); false);
} }
return RedactionService.finalizeRedaction( return RedactionService.finalizeRedaction(
doc, doc,
allFound, allFound,
request.getRedactColor(), request.getRedactColor(),
request.getCustomPadding(), request.getCustomPadding(),
request.getConvertPDFToImage(), /*text removal*/ request.getConvertPDFToImage(), /*text removal*/
true); true);
} catch (Exception e) { } catch (Exception e) {
throw new IOException("Aggressive redaction failed: " + e.getMessage(), e); throw new IOException("Aggressive redaction failed: " + e.getMessage(), e);
} finally { } finally {

View File

@ -32,7 +32,7 @@ class ModerateRedactionService implements RedactionModeStrategy {
try { try {
doc = pdfDocumentFactory.load(request.getFileInput()); doc = pdfDocumentFactory.load(request.getFileInput());
Map<Integer, List<PDFText>> allFound = Map<Integer, List<PDFText>> allFound =
RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord); RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord);
if (allFound.isEmpty()) { if (allFound.isEmpty()) {
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
doc.save(baos); doc.save(baos);
@ -40,31 +40,31 @@ class ModerateRedactionService implements RedactionModeStrategy {
} }
} }
boolean fallbackToBoxOnly = boolean fallbackToBoxOnly =
helper.performTextReplacement(doc, allFound, listOfText, useRegex, wholeWord); helper.performTextReplacement(doc, allFound, listOfText, useRegex, wholeWord);
String effectiveColor = String effectiveColor =
(request.getRedactColor() == null || request.getRedactColor().isBlank()) (request.getRedactColor() == null || request.getRedactColor().isBlank())
? "#000000" ? "#000000"
: request.getRedactColor(); : request.getRedactColor();
if (fallbackToBoxOnly) { if (fallbackToBoxOnly) {
fallback = pdfDocumentFactory.load(request.getFileInput()); fallback = pdfDocumentFactory.load(request.getFileInput());
allFound = allFound =
RedactionService.findTextToRedact( RedactionService.findTextToRedact(
fallback, listOfText, useRegex, wholeWord); fallback, listOfText, useRegex, wholeWord);
return RedactionService.finalizeRedaction( return RedactionService.finalizeRedaction(
fallback, fallback,
allFound,
effectiveColor,
request.getCustomPadding(),
request.getConvertPDFToImage(),
false);
}
return RedactionService.finalizeRedaction(
doc,
allFound, allFound,
effectiveColor, effectiveColor,
request.getCustomPadding(), request.getCustomPadding(),
request.getConvertPDFToImage(), request.getConvertPDFToImage(),
false); false);
}
return RedactionService.finalizeRedaction(
doc,
allFound,
effectiveColor,
request.getCustomPadding(),
request.getConvertPDFToImage(),
false);
} catch (Exception e) { } catch (Exception e) {
throw new IOException("Moderate redaction failed: " + e.getMessage(), e); throw new IOException("Moderate redaction failed: " + e.getMessage(), e);
} finally { } finally {

View File

@ -27,7 +27,7 @@ class VisualRedactionService implements RedactionModeStrategy {
try (PDDocument document = pdfDocumentFactory.load(request.getFileInput())) { try (PDDocument document = pdfDocumentFactory.load(request.getFileInput())) {
Map<Integer, List<PDFText>> allFound = Map<Integer, List<PDFText>> allFound =
RedactionService.findTextToRedact(document, listOfText, useRegex, wholeWord); RedactionService.findTextToRedact(document, listOfText, useRegex, wholeWord);
if (allFound.isEmpty()) { if (allFound.isEmpty()) {
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
document.save(baos); document.save(baos);
@ -35,16 +35,16 @@ class VisualRedactionService implements RedactionModeStrategy {
} }
} }
String effectiveColor = String effectiveColor =
(request.getRedactColor() == null || request.getRedactColor().isBlank()) (request.getRedactColor() == null || request.getRedactColor().isBlank())
? "#000000" ? "#000000"
: request.getRedactColor(); : request.getRedactColor();
return RedactionService.finalizeRedaction( return RedactionService.finalizeRedaction(
document, document,
allFound, allFound,
effectiveColor, effectiveColor,
request.getCustomPadding(), request.getCustomPadding(),
request.getConvertPDFToImage(), request.getConvertPDFToImage(),
false); false);
} }
} }
} }

View File

@ -1,17 +1,17 @@
package stirling.software.SPDF.utils.text; package stirling.software.SPDF.utils.text;
import lombok.experimental.UtilityClass;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType0Font;
import lombok.extern.slf4j.Slf4j;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
import java.nio.CharBuffer; import java.nio.CharBuffer;
import java.nio.charset.CharsetDecoder; import java.nio.charset.CharsetDecoder;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType0Font;
import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j;
@Slf4j @Slf4j
@UtilityClass @UtilityClass
public class TextDecodingHelper { public class TextDecodingHelper {
@ -34,8 +34,8 @@ public class TextDecodingHelper {
String basicDecoded = tryDecodeWithFont(font, cosString); String basicDecoded = tryDecodeWithFont(font, cosString);
if (basicDecoded != null if (basicDecoded != null
&& !basicDecoded.contains("?") && !basicDecoded.contains("?")
&& !basicDecoded.trim().isEmpty()) { && !basicDecoded.trim().isEmpty()) {
return; return;
} }
@ -89,8 +89,7 @@ public class TextDecodingHelper {
} catch (Exception ignored) { } catch (Exception ignored) {
} }
if (charStr == null if (charStr == null && font instanceof PDType0Font type0Font) {
&& font instanceof PDType0Font type0Font) {
try { try {
int cid = (bytes.length > 1) ? ((bytes[0] & 0xFF) << 8) | (bytes[1] & 0xFF) : code; int cid = (bytes.length > 1) ? ((bytes[0] & 0xFF) << 8) | (bytes[1] & 0xFF) : code;
charStr = type0Font.toUnicode(cid); charStr = type0Font.toUnicode(cid);
@ -129,12 +128,12 @@ public class TextDecodingHelper {
if (fontName != null) { if (fontName != null) {
String lowerName = fontName.toLowerCase(); String lowerName = fontName.toLowerCase();
if (lowerName.contains("cjk") if (lowerName.contains("cjk")
|| lowerName.contains("gb") || lowerName.contains("gb")
|| lowerName.contains("jp")) { || lowerName.contains("jp")) {
// Basic CJK fallback (expand with a lookup table if needed) // Basic CJK fallback (expand with a lookup table if needed)
if (code >= 0x4E00 && code <= 0x9FFF) { if (code >= 0x4E00 && code <= 0x9FFF) {
return String.valueOf( return String.valueOf(
(char) code); // Unicode Basic Multilingual Plane for CJK (char) code); // Unicode Basic Multilingual Plane for CJK
} }
} }
} }
@ -143,8 +142,7 @@ public class TextDecodingHelper {
try { try {
if (bytes.length >= 2) { if (bytes.length >= 2) {
ByteBuffer buffer = ByteBuffer.wrap(bytes); ByteBuffer buffer = ByteBuffer.wrap(bytes);
CharsetDecoder decoder = CharsetDecoder decoder = StandardCharsets.UTF_16BE.newDecoder();
StandardCharsets.UTF_16BE.newDecoder();
CharBuffer charBuffer = decoder.decode(buffer); CharBuffer charBuffer = decoder.decode(buffer);
return charBuffer.toString(); return charBuffer.toString();
} }

View File

@ -2,12 +2,12 @@ package stirling.software.SPDF.utils.text;
import java.io.IOException; import java.io.IOException;
import lombok.experimental.UtilityClass;
import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDSimpleFont; import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
import org.apache.pdfbox.pdmodel.font.encoding.DictionaryEncoding; import org.apache.pdfbox.pdmodel.font.encoding.DictionaryEncoding;
import org.apache.pdfbox.pdmodel.font.encoding.Encoding; import org.apache.pdfbox.pdmodel.font.encoding.Encoding;
import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
@Slf4j @Slf4j

View File

@ -5,13 +5,13 @@ import java.util.List;
import java.util.Set; import java.util.Set;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import lombok.experimental.UtilityClass;
import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources; import org.apache.pdfbox.pdmodel.PDResources;
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.pdmodel.font.PDFont;
import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j;
@Slf4j @Slf4j
@UtilityClass @UtilityClass
public class TextFinderUtils { public class TextFinderUtils {
@ -57,7 +57,7 @@ public class TextFinderUtils {
} }
public List<Pattern> createOptimizedSearchPatterns( public List<Pattern> createOptimizedSearchPatterns(
Set<String> searchTerms, boolean useRegex, boolean wholeWordSearch) { Set<String> searchTerms, boolean useRegex, boolean wholeWordSearch) {
List<Pattern> patterns = new ArrayList<>(); List<Pattern> patterns = new ArrayList<>();
for (String term : searchTerms) { for (String term : searchTerms) {

View File

@ -1,9 +1,9 @@
package stirling.software.SPDF.utils.text; package stirling.software.SPDF.utils.text;
import lombok.experimental.UtilityClass;
import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.pdmodel.font.PDFont;
import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
@Slf4j @Slf4j
@ -44,8 +44,7 @@ public class WidthCalculator {
} }
} }
private float calculateWidthWithCharacterIteration( private float calculateWidthWithCharacterIteration(PDFont font, String text, float fontSize) {
PDFont font, String text, float fontSize) {
try { try {
float totalWidth = 0; float totalWidth = 0;