Add multiple redaction strategies for PDF processing

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
Balázs Szücs 2025-08-19 21:54:57 +02:00
parent d23c2eaa30
commit 39a2b5054e
8 changed files with 2310 additions and 1631 deletions

View File

@ -46,4 +46,11 @@ public class RedactPdfRequest extends PDFFile {
defaultValue = "false",
requiredMode = Schema.RequiredMode.REQUIRED)
private Boolean convertPDFToImage;
/**
 * Requested redaction mode. The schema advertises "moderate", "visual" and "aggressive",
 * with "moderate" as the documented default, and marks the field as not required.
 *
 * <p>NOTE(review): the annotation only describes the API contract; confirm where a missing
 * or unrecognised value is actually defaulted or rejected.
 */
@Schema(
description = "Redaction mode: moderate, visual, or aggressive",
defaultValue = "moderate",
allowableValues = {"moderate", "visual", "aggressive"},
requiredMode = Schema.RequiredMode.NOT_REQUIRED)
private String redactionMode;
}

View File

@ -0,0 +1,85 @@
package stirling.software.SPDF.service;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import org.apache.pdfbox.pdmodel.PDDocument;
import stirling.software.SPDF.model.PDFText;
import stirling.software.SPDF.model.api.security.RedactPdfRequest;
import stirling.software.common.service.CustomPDFDocumentFactory;
/**
 * "Aggressive" redaction strategy.
 *
 * <p>Runs {@code performTextReplacementAggressive} on the loaded document and then searches the
 * document again. If any match is still found, the input is reloaded from the request and
 * finalized with the fifth (convert-to-image position) argument forced to {@code true} and the
 * text-removal flag off; otherwise the modified document is finalized with text removal on.
 */
class AggressiveRedactionService implements RedactionModeStrategy {

    /** Color applied when the request carries no, or only a blank, redaction color. */
    private static final String DEFAULT_REDACT_COLOR = "#000000";

    private final CustomPDFDocumentFactory pdfDocumentFactory;
    private final RedactionService helper;

    AggressiveRedactionService(
            CustomPDFDocumentFactory pdfDocumentFactory, RedactionService helper) {
        this.pdfDocumentFactory = pdfDocumentFactory;
        this.helper = helper;
    }

    /**
     * Redacts every occurrence of the newline-separated search terms in the uploaded PDF.
     *
     * @param request redaction parameters and the input file
     * @return the bytes of the resulting document; the unmodified document is re-saved when
     *     nothing matches
     * @throws IOException if loading, searching or finalizing fails; any other exception raised
     *     inside the processing block is wrapped with the message prefix
     *     {@code "Aggressive redaction failed: "}
     */
    @Override
    public byte[] redact(RedactPdfRequest request) throws IOException {
        String[] listOfText = request.getListOfText().split("\n");
        boolean useRegex = Boolean.TRUE.equals(request.getUseRegex());
        boolean wholeWord = Boolean.TRUE.equals(request.getWholeWordSearch());

        PDDocument doc = null;
        PDDocument fb = null;
        try {
            doc = pdfDocumentFactory.load(request.getFileInput());
            Map<Integer, List<PDFText>> allFound =
                    RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord);

            // Nothing to redact: hand back the document as loaded.
            if (allFound.isEmpty()) {
                try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
                    doc.save(baos);
                    return baos.toByteArray();
                }
            }

            helper.performTextReplacementAggressive(doc, allFound, listOfText, useRegex, wholeWord);

            // Search again after the replacement pass to see whether any match survived it.
            Map<Integer, List<PDFText>> residual =
                    RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord);
            boolean residualExists =
                    residual.values().stream().anyMatch(matches -> !matches.isEmpty());

            String effectiveColor = resolveColor(request.getRedactColor());

            if (residualExists) {
                // Start over from a fresh copy of the input instead of the partially edited one.
                fb = pdfDocumentFactory.load(request.getFileInput());
                Map<Integer, List<PDFText>> fbFound =
                        RedactionService.findTextToRedact(fb, listOfText, useRegex, wholeWord);
                return RedactionService.finalizeRedaction(
                        fb,
                        fbFound,
                        effectiveColor,
                        request.getCustomPadding(),
                        /* force */ true,
                        false);
            }

            // Use the defaulted color here as well: the raw request value may be null or blank,
            // and every other finalize call across the strategies receives the defaulted one.
            return RedactionService.finalizeRedaction(
                    doc,
                    allFound,
                    effectiveColor,
                    request.getCustomPadding(),
                    request.getConvertPDFToImage(),
                    /* text removal */ true);
        } catch (Exception e) {
            throw new IOException("Aggressive redaction failed: " + e.getMessage(), e);
        } finally {
            closeQuietly(doc);
            closeQuietly(fb);
        }
    }

    /** Returns the requested color, or the default when it is {@code null} or blank. */
    private static String resolveColor(String requested) {
        return (requested == null || requested.isBlank()) ? DEFAULT_REDACT_COLOR : requested;
    }

    /** Closes the document if one was opened; close failures are deliberately ignored. */
    private static void closeQuietly(PDDocument document) {
        if (document == null) {
            return;
        }
        try {
            document.close();
        } catch (IOException ignored) {
            // Best-effort cleanup: the result bytes (or the primary failure) are already
            // determined by the time this runs, so a close error must not replace them.
        }
    }
}

View File

@ -0,0 +1,83 @@
package stirling.software.SPDF.service;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import org.apache.pdfbox.pdmodel.PDDocument;
import stirling.software.SPDF.model.PDFText;
import stirling.software.SPDF.model.api.security.RedactPdfRequest;
import stirling.software.common.service.CustomPDFDocumentFactory;
/**
 * "Moderate" redaction strategy.
 *
 * <p>Attempts text replacement on the loaded document; when the helper reports that it had to
 * fall back, the input is reloaded, searched again and that fresh copy is finalized instead.
 * Finalization always runs with the last (text-removal) flag set to {@code false}.
 */
class ModerateRedactionService implements RedactionModeStrategy {

    private final CustomPDFDocumentFactory pdfDocumentFactory;
    private final RedactionService helper;

    ModerateRedactionService(CustomPDFDocumentFactory pdfDocumentFactory, RedactionService helper) {
        this.pdfDocumentFactory = pdfDocumentFactory;
        this.helper = helper;
    }

    /**
     * Redacts every occurrence of the newline-separated search terms in the uploaded PDF.
     *
     * @param request redaction parameters and the input file
     * @return the bytes of the resulting document; the unmodified document is re-saved when
     *     nothing matches
     * @throws IOException on any failure inside the processing block, with the message prefix
     *     {@code "Moderate redaction failed: "}
     */
    @Override
    public byte[] redact(RedactPdfRequest request) throws IOException {
        final String[] searchTerms = request.getListOfText().split("\n");
        final boolean regexEnabled = Boolean.TRUE.equals(request.getUseRegex());
        final boolean wholeWordsOnly = Boolean.TRUE.equals(request.getWholeWordSearch());

        PDDocument primary = null;
        PDDocument reloaded = null;
        try {
            primary = pdfDocumentFactory.load(request.getFileInput());
            Map<Integer, List<PDFText>> matches =
                    RedactionService.findTextToRedact(
                            primary, searchTerms, regexEnabled, wholeWordsOnly);

            // No hits at all: return the document exactly as it was loaded.
            if (matches.isEmpty()) {
                ByteArrayOutputStream unchanged = new ByteArrayOutputStream();
                primary.save(unchanged);
                return unchanged.toByteArray();
            }

            final boolean boxOnly =
                    helper.performTextReplacement(
                            primary, matches, searchTerms, regexEnabled, wholeWordsOnly);

            String color = request.getRedactColor();
            if (color == null || color.isBlank()) {
                color = "#000000";
            }

            // Pick the document to finalize: the edited one, or a fresh copy on fallback.
            PDDocument target = primary;
            if (boxOnly) {
                reloaded = pdfDocumentFactory.load(request.getFileInput());
                matches =
                        RedactionService.findTextToRedact(
                                reloaded, searchTerms, regexEnabled, wholeWordsOnly);
                target = reloaded;
            }

            return RedactionService.finalizeRedaction(
                    target,
                    matches,
                    color,
                    request.getCustomPadding(),
                    request.getConvertPDFToImage(),
                    false);
        } catch (Exception failure) {
            throw new IOException("Moderate redaction failed: " + failure.getMessage(), failure);
        } finally {
            closeQuietly(primary);
            closeQuietly(reloaded);
        }
    }

    /** Closes {@code document} when non-null, ignoring any {@link IOException} from the close. */
    private static void closeQuietly(PDDocument document) {
        if (document != null) {
            try {
                document.close();
            } catch (IOException ignore) {
            }
        }
    }
}

View File

@ -0,0 +1,9 @@
package stirling.software.SPDF.service;
import java.io.IOException;
import stirling.software.SPDF.model.api.security.RedactPdfRequest;
/**
 * Strategy contract for a single redaction mode: given a redaction request, produce the
 * processed document as a byte array.
 */
public interface RedactionModeStrategy {
/**
 * Applies this strategy to the request.
 *
 * @param request the redaction request to process
 * @return the resulting document bytes
 * @throws IOException if the redaction cannot be completed
 */
byte[] redact(RedactPdfRequest request) throws IOException;
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,50 @@
package stirling.software.SPDF.service;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import org.apache.pdfbox.pdmodel.PDDocument;
import stirling.software.SPDF.model.PDFText;
import stirling.software.SPDF.model.api.security.RedactPdfRequest;
import stirling.software.common.service.CustomPDFDocumentFactory;
/**
 * "Visual" redaction strategy: locates the search terms and passes the matches straight to
 * {@code RedactionService.finalizeRedaction} with the last (text-removal) flag set to
 * {@code false}, without any text-replacement pass beforehand.
 */
class VisualRedactionService implements RedactionModeStrategy {

    private final CustomPDFDocumentFactory pdfDocumentFactory;

    /**
     * @param pdfDocumentFactory factory used to load the uploaded PDF
     * @param helper not used by this strategy; the parameter mirrors the constructors of the
     *     moderate and aggressive strategies
     */
    VisualRedactionService(CustomPDFDocumentFactory pdfDocumentFactory, RedactionService helper) {
        this.pdfDocumentFactory = pdfDocumentFactory;
    }

    /**
     * Redacts every occurrence of the newline-separated search terms in the uploaded PDF.
     *
     * @param request redaction parameters and the input file
     * @return the bytes of the resulting document; the unmodified document is re-saved when
     *     nothing matches
     * @throws IOException if loading, searching, saving or finalizing fails
     */
    @Override
    public byte[] redact(RedactPdfRequest request) throws IOException {
        final String[] searchTerms = request.getListOfText().split("\n");
        final boolean regexEnabled = Boolean.TRUE.equals(request.getUseRegex());
        final boolean wholeWordsOnly = Boolean.TRUE.equals(request.getWholeWordSearch());

        try (PDDocument pdf = pdfDocumentFactory.load(request.getFileInput())) {
            Map<Integer, List<PDFText>> matches =
                    RedactionService.findTextToRedact(
                            pdf, searchTerms, regexEnabled, wholeWordsOnly);

            if (!matches.isEmpty()) {
                String requestedColor = request.getRedactColor();
                String color =
                        (requestedColor != null && !requestedColor.isBlank())
                                ? requestedColor
                                : "#000000";
                return RedactionService.finalizeRedaction(
                        pdf,
                        matches,
                        color,
                        request.getCustomPadding(),
                        request.getConvertPDFToImage(),
                        false);
            }

            // Nothing matched: return the document as loaded.
            ByteArrayOutputStream unchanged = new ByteArrayOutputStream();
            pdf.save(unchanged);
            return unchanged.toByteArray();
        }
    }
}

View File

@ -20,7 +20,7 @@
</svg>
<span class="tool-header-text" th:text="#{autoRedact.header}"></span>
</div>
<form th:action="@{'api/v1/security/auto-redact'}" method="post" enctype="multipart/form-data">
<form enctype="multipart/form-data" id="autoRedactForm" method="post" th:action="@{'api/v1/security/auto-redact'}">
<div class="mb-3">
<input type="file" class="form-control" id="fileInput" name="fileInput" required
accept="application/pdf">
@ -53,13 +53,42 @@
<script>
/**
 * Reacts to a change of the color dropdown.
 *
 * When "custom" is selected the custom color container is shown and the color input is
 * seeded with '#000000' if it has no value yet; for any other selection the container is
 * hidden and the selected value is copied into the color input.
 *
 * @param {string} selectedValue value of the chosen dropdown option
 */
function handleColorChange(selectedValue) {
  // Look each element up once and reuse the reference in both branches.
  const container = document.getElementById('customColorContainer');
  const input = document.getElementById('customColor');
  if (selectedValue === "custom") {
    container.style.display = 'block';
    // Give the input a starting value so it is not left empty.
    if (!input.value) {
      input.value = '#000000';
    }
  } else {
    container.style.display = 'none';
    input.value = selectedValue;
  }
}
// Once the DOM is ready, wire the redaction-mode dropdown to the hidden aggressiveMode
// field and the convert-to-image checkbox, then apply the current selections once.
document.addEventListener('DOMContentLoaded', () => {
  const modeSelect = document.getElementById('redactionMode');
  const aggressiveField = document.getElementById('aggressiveMode');
  const imageCheckbox = document.getElementById('convertPDFToImage');
  const colorSelect = document.getElementById('defaultColor');

  // Mirrors the selected mode into the hidden field; 'visual' also ticks the checkbox
  // (it is never unticked from here).
  const applyMode = () => {
    const selectedMode = modeSelect.value;
    aggressiveField.value = selectedMode === 'aggressive' ? 'true' : 'false';
    if (selectedMode === 'visual') {
      imageCheckbox.checked = true;
    }
  };

  modeSelect.addEventListener('change', applyMode);

  // Initial pass: color first, then mode, in the same order as on every page load.
  if (colorSelect) {
    handleColorChange(colorSelect.value);
  }
  applyMode();
});
</script>
<div class="mb-3 form-check">
<input type="checkbox" id="useRegex" name="useRegex">
@ -82,6 +111,21 @@
<label for="convertPDFToImage" th:text="#{autoRedact.convertPDFToImageLabel}"></label>
</div>
<div class="mb-3">
<label class="form-label" for="redactionMode" th:text="#{autoRedact.redactionModeLabel}">Redaction Mode</label>
<select class="form-control" id="redactionMode" name="redactionMode">
<option th:text="#{autoRedact.redactionMode.moderate}" value="moderate">Moderate - Smart text removal with
fallback
</option>
<option th:text="#{autoRedact.redactionMode.visual}" value="visual">Visual - Black boxes only</option>
<option th:text="#{autoRedact.redactionMode.aggressive}" value="aggressive">Aggressive - Force text removal
</option>
</select>
</div>
<!-- Keep for backward compatibility -->
<input id="aggressiveMode" name="aggressiveMode" type="hidden" value="false">
<button type="submit" id="submitBtn" class="btn btn-primary"
th:text="#{autoRedact.submitButton}"></button>
</form>