improve search term extraction and enhance redaction options

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
Balázs Szücs 2025-08-26 12:24:11 +02:00
parent 8c38ecf899
commit 38c261a82e
5 changed files with 125 additions and 40 deletions

View File

@ -2,8 +2,10 @@ package stirling.software.SPDF.service;
import java.io.ByteArrayOutputStream; import java.io.ByteArrayOutputStream;
import java.io.IOException; import java.io.IOException;
import java.util.Arrays;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.stream.Collectors;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
@ -13,7 +15,7 @@ import stirling.software.SPDF.model.api.security.RedactPdfRequest;
import stirling.software.common.service.CustomPDFDocumentFactory; import stirling.software.common.service.CustomPDFDocumentFactory;
@Service @Service
class AggressiveRedactionService implements RedactionModeStrategy { public final class AggressiveRedactionService implements RedactionModeStrategy {
private final CustomPDFDocumentFactory pdfDocumentFactory; private final CustomPDFDocumentFactory pdfDocumentFactory;
private final RedactionService helper; private final RedactionService helper;
@ -24,13 +26,40 @@ class AggressiveRedactionService implements RedactionModeStrategy {
this.helper = helper; this.helper = helper;
} }
/**
 * Builds the array of search terms from the request's multi-line text field.
 *
 * <p>The raw text is split on any line break sequence ({@code \R}), each piece is trimmed,
 * empty pieces are discarded, and repeated terms are collapsed to their first occurrence so
 * the original ordering is kept.
 *
 * @param request the redaction request; may be {@code null}
 * @return the cleaned terms, or an empty array when the request or its text is {@code null}
 */
private static String[] extractSearchTerms(RedactPdfRequest request) {
    String rawText = (request == null) ? null : request.getListOfText();
    if (rawText == null) {
        return new String[0];
    }
    // Stream.distinct() keeps encounter order on an ordered stream, so the first
    // occurrence of each term wins.
    return Arrays.stream(rawText.split("\\R"))
            .map(String::trim)
            .filter(term -> !term.isEmpty())
            .distinct()
            .toArray(String[]::new);
}
/**
 * Saves the given document into an in-memory buffer and returns the buffer's contents.
 *
 * @param doc the document to serialize
 * @return the bytes produced by {@code doc.save(...)}
 * @throws IOException if saving the document fails
 */
private static byte[] toByteArray(PDDocument doc) throws IOException {
    try (ByteArrayOutputStream buffer = new ByteArrayOutputStream()) {
        doc.save(buffer);
        byte[] serialized = buffer.toByteArray();
        return serialized;
    }
}
@Override @Override
public byte[] redact(RedactPdfRequest request) throws IOException { public byte[] redact(RedactPdfRequest request) throws IOException {
String[] listOfText = request.getListOfText().split("\n"); String[] listOfText = extractSearchTerms(request);
boolean useRegex = Boolean.TRUE.equals(request.getUseRegex()); boolean useRegex = Boolean.TRUE.equals(request.getUseRegex());
boolean wholeWord = Boolean.TRUE.equals(request.getWholeWordSearch()); boolean wholeWord = Boolean.TRUE.equals(request.getWholeWordSearch());
try (PDDocument doc = pdfDocumentFactory.load(request.getFileInput())) { try (PDDocument doc = pdfDocumentFactory.load(request.getFileInput())) {
// If no valid search terms provided, return original document unmodified
if (listOfText.length == 0) {
return toByteArray(doc);
}
Map<Integer, List<PDFText>> allFound = Map<Integer, List<PDFText>> allFound =
RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord); RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord);
if (allFound.isEmpty()) { if (allFound.isEmpty()) {
@ -58,11 +87,4 @@ class AggressiveRedactionService implements RedactionModeStrategy {
throw new IOException("Aggressive redaction failed: " + e.getMessage(), e); throw new IOException("Aggressive redaction failed: " + e.getMessage(), e);
} }
} }
/**
 * Saves the given document into an in-memory buffer and returns the buffer's contents.
 *
 * @param doc the document to serialize
 * @return the bytes produced by {@code doc.save(...)}
 * @throws IOException if saving the document fails
 */
private byte[] toByteArray(PDDocument doc) throws IOException {
    try (ByteArrayOutputStream buffer = new ByteArrayOutputStream()) {
        doc.save(buffer);
        byte[] serialized = buffer.toByteArray();
        return serialized;
    }
}
} }

View File

@ -2,8 +2,10 @@ package stirling.software.SPDF.service;
import java.io.ByteArrayOutputStream; import java.io.ByteArrayOutputStream;
import java.io.IOException; import java.io.IOException;
import java.util.Arrays;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.stream.Collectors;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
@ -13,7 +15,7 @@ import stirling.software.SPDF.model.api.security.RedactPdfRequest;
import stirling.software.common.service.CustomPDFDocumentFactory; import stirling.software.common.service.CustomPDFDocumentFactory;
@Service @Service
class ModerateRedactionService implements RedactionModeStrategy { public final class ModerateRedactionService implements RedactionModeStrategy {
private final CustomPDFDocumentFactory pdfDocumentFactory; private final CustomPDFDocumentFactory pdfDocumentFactory;
private final RedactionService helper; private final RedactionService helper;
@ -23,13 +25,40 @@ class ModerateRedactionService implements RedactionModeStrategy {
this.helper = helper; this.helper = helper;
} }
/**
 * Builds the array of search terms from the request's multi-line text field.
 *
 * <p>The raw text is split on any line break sequence ({@code \R}), each piece is trimmed,
 * empty pieces are discarded, and repeated terms are collapsed to their first occurrence so
 * the original ordering is kept.
 *
 * @param request the redaction request; may be {@code null}
 * @return the cleaned terms, or an empty array when the request or its text is {@code null}
 */
private static String[] extractSearchTerms(RedactPdfRequest request) {
    String rawText = (request == null) ? null : request.getListOfText();
    if (rawText == null) {
        return new String[0];
    }
    // Stream.distinct() keeps encounter order on an ordered stream, so the first
    // occurrence of each term wins.
    return Arrays.stream(rawText.split("\\R"))
            .map(String::trim)
            .filter(term -> !term.isEmpty())
            .distinct()
            .toArray(String[]::new);
}
/**
 * Saves the given document into an in-memory buffer and returns the buffer's contents.
 *
 * @param doc the document to serialize
 * @return the bytes produced by {@code doc.save(...)}
 * @throws IOException if saving the document fails
 */
private static byte[] toByteArray(PDDocument doc) throws IOException {
    try (ByteArrayOutputStream buffer = new ByteArrayOutputStream()) {
        doc.save(buffer);
        byte[] serialized = buffer.toByteArray();
        return serialized;
    }
}
@Override @Override
public byte[] redact(RedactPdfRequest request) throws IOException { public byte[] redact(RedactPdfRequest request) throws IOException {
String[] listOfText = request.getListOfText().split("\n"); String[] listOfText = extractSearchTerms(request);
boolean useRegex = Boolean.TRUE.equals(request.getUseRegex()); boolean useRegex = Boolean.TRUE.equals(request.getUseRegex());
boolean wholeWord = Boolean.TRUE.equals(request.getWholeWordSearch()); boolean wholeWord = Boolean.TRUE.equals(request.getWholeWordSearch());
try (PDDocument doc = pdfDocumentFactory.load(request.getFileInput())) { try (PDDocument doc = pdfDocumentFactory.load(request.getFileInput())) {
// If no valid search terms provided, return original document unmodified
if (listOfText.length == 0) {
return toByteArray(doc);
}
Map<Integer, List<PDFText>> allFound = Map<Integer, List<PDFText>> allFound =
RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord); RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord);
if (allFound.isEmpty()) { if (allFound.isEmpty()) {
@ -54,11 +83,4 @@ class ModerateRedactionService implements RedactionModeStrategy {
throw new IOException("Moderate redaction failed: " + e.getMessage(), e); throw new IOException("Moderate redaction failed: " + e.getMessage(), e);
} }
} }
/**
 * Saves the given document into an in-memory buffer and returns the buffer's contents.
 *
 * @param doc the document to serialize
 * @return the bytes produced by {@code doc.save(...)}
 * @throws IOException if saving the document fails
 */
private byte[] toByteArray(PDDocument doc) throws IOException {
    try (ByteArrayOutputStream buffer = new ByteArrayOutputStream()) {
        doc.save(buffer);
        byte[] serialized = buffer.toByteArray();
        return serialized;
    }
}
} }

View File

@ -1654,7 +1654,8 @@ public class RedactionService {
if (cw > 0) { if (cw > 0) {
int count = Math.max(1, Math.round(targetWidth / cw)); int count = Math.max(1, Math.round(targetWidth / cw));
int max = (originalWord != null ? originalWord.length() : 1) * 2; int max = (originalWord != null ? originalWord.length() : 1) * 2;
return " ".repeat(Math.min(count, max)); // Repeat the chosen alternative character, not spaces
return alt.repeat(Math.min(count, max));
} }
} catch (Exception ignored) { } catch (Exception ignored) {
} }
@ -1969,9 +1970,11 @@ public class RedactionService {
try { try {
if (bytes.length >= 2) { if (bytes.length >= 2) {
if ((bytes[0] & 0xFF) == 0xFE && (bytes[1] & 0xFF) == 0xFF) { if ((bytes[0] & 0xFF) == 0xFE && (bytes[1] & 0xFF) == 0xFF) {
// UTF-16BE BOM
return new String( return new String(
bytes, 2, bytes.length - 2, StandardCharsets.UTF_16LE); bytes, 2, bytes.length - 2, StandardCharsets.UTF_16BE);
} else if ((bytes[0] & 0xFF) == 0xFF && (bytes[1] & 0xFF) == 0xFE) { } else if ((bytes[0] & 0xFF) == 0xFF && (bytes[1] & 0xFF) == 0xFE) {
// UTF-16LE BOM
return new String( return new String(
bytes, 2, bytes.length - 2, StandardCharsets.UTF_16LE); bytes, 2, bytes.length - 2, StandardCharsets.UTF_16LE);
} }
@ -2384,7 +2387,9 @@ public class RedactionService {
} }
try { try {
if ("Tj".equals(segment.operatorName) || "'".equals(segment.operatorName)) { if ("Tj".equals(segment.operatorName)
|| "'".equals(segment.operatorName)
|| "\"".equals(segment.operatorName)) {
log.debug( log.debug(
"Creating modification task for Tj operator at segment {}", "Creating modification task for Tj operator at segment {}",
segmentIndex); segmentIndex);
@ -2702,9 +2707,10 @@ public class RedactionService {
} }
private String handleQuotedOperator(Object token, PDFont font) { private String handleQuotedOperator(Object token, PDFont font) {
// Do not add an extra newline; it shifts indices and breaks match ranges
return (token instanceof COSString cosString) return (token instanceof COSString cosString)
? "\n" + extractStringWithFallbacks(cosString, font) ? extractStringWithFallbacks(cosString, font)
: "\n"; : "";
} }
private String handleTJOperator(Object token, PDFont font) { private String handleTJOperator(Object token, PDFont font) {

View File

@ -926,10 +926,10 @@ autoRedact.redactionStyleLabel=Redaction Style
autoRedact.pdfImageDescription=For maximum security, uses an image-based method to ensure text is unrecoverable. May slightly affect document quality. autoRedact.pdfImageDescription=For maximum security, uses an image-based method to ensure text is unrecoverable. May slightly affect document quality.
autoRedact.visualRedactionLabel=Visual autoRedact.visualRedactionLabel=Visual
autoRedact.visualRedactionDescription=Converts to image with visual redactions for maximum security. autoRedact.visualRedactionDescription=Converts to image with visual redactions for maximum security.
autoRedact.deleteTextLabel=Delete Text autoRedact.deleteTextLabel=Remove Text
autoRedact.deleteTextDescription=Removes the text completely. This may alter the original layout or leave a gap. autoRedact.deleteTextDescription=Removes text completely, allowing the surrounding content to shift. This may change the document's original appearance.
autoRedact.keepLayoutLabel=Keep Layout autoRedact.keepLayoutLabel=Remove Text & Cover (Preserve Layout)
autoRedact.keepLayoutDescription=Covers text with a redaction box, preserving the page's original design. autoRedact.keepLayoutDescription=Removes the underlying text and places a redaction box in its place, preserving the document's original layout.
#redact #redact
redact.title=Manual Redaction redact.title=Manual Redaction

View File

@ -32,6 +32,34 @@
border-radius: 5px; border-radius: 5px;
padding: 10px; padding: 10px;
} }
/* Better visibility for selected redaction option */
/* Each option inside .redaction-options-group is drawn as a bordered, rounded card;
   border, background and shadow changes are animated over .15s. */
.redaction-options-group .form-check {
border: 1px solid var(--md-sys-color-surface-3);
border-radius: 8px;
padding: 10px 12px;
transition: border-color .15s ease, background-color .15s ease, box-shadow .15s ease;
}
/* Vertical gap between consecutive option cards. */
.redaction-options-group .form-check + .form-check { margin-top: .5rem; }
/* Hover feedback: change the card background. */
.redaction-options-group .form-check:hover {
background-color: var(--md-sys-color-surface-1);
}
/* Highlight for the card carrying the "selected" class (toggled by updateMode() in this
   page's script). This rule has the same specificity as the :hover rule above and comes
   later, so its background wins while a selected card is hovered. */
.redaction-options-group .form-check.selected {
border-color: #0d6efd;
background-color: rgba(13,110,253,0.06);
box-shadow: 0 0 0 2px rgba(13,110,253,0.1) inset;
}
/* Option titles are rendered semi-bold. */
.redaction-options-group .form-check .form-check-label {
font-weight: 600;
}
.redaction-options-group small.form-text {
margin-left: 1.8rem; /* align with radio */
}
</style> </style>
</head> </head>
@ -79,13 +107,13 @@
</div> </div>
<div class="form-check mb-2"> <div class="form-check mb-2">
<input aria-describedby="delete-desc" class="form-check-input" id="deleteText" name="redactionMode" type="radio" value="aggressive"> <input aria-describedby="delete-desc" class="form-check-input" id="deleteText" name="redactionMode" type="radio" value="aggressive">
<label class="form-check-label" for="deleteText" th:text="#{autoRedact.deleteTextLabel}">Delete Text</label> <label class="form-check-label" for="deleteText" th:text="#{autoRedact.deleteTextLabel}">Remove Text</label>
<small class="form-text text-muted d-block mt-1" id="delete-desc" th:text="#{autoRedact.deleteTextDescription}">Removes the text completely. This may alter the original layout or leave a gap.</small> <small class="form-text text-muted d-block mt-1" id="delete-desc" th:text="#{autoRedact.deleteTextDescription}">Removes text completely, allowing the surrounding content to shift. This may change the document's original appearance.</small>
</div> </div>
<div class="form-check mb-3"> <div class="form-check mb-3">
<input aria-describedby="keep-desc" class="form-check-input" id="keepLayout" name="redactionMode" type="radio" value="moderate"> <input aria-describedby="keep-desc" class="form-check-input" id="keepLayout" name="redactionMode" type="radio" value="moderate">
<label class="form-check-label" for="keepLayout" th:text="#{autoRedact.keepLayoutLabel}">Keep Layout</label> <label class="form-check-label" for="keepLayout" th:text="#{autoRedact.keepLayoutLabel}">Remove Text & Cover (Preserve Layout)</label>
<small class="form-text text-muted d-block mt-1" id="keep-desc" th:text="#{autoRedact.keepLayoutDescription}">Covers text with a redaction box, preserving the page's original design.</small> <small class="form-text text-muted d-block mt-1" id="keep-desc" th:text="#{autoRedact.keepLayoutDescription}">Removes the underlying text and places a redaction box in its place, preserving the document's original layout.</small>
</div> </div>
<div class="form-check"> <div class="form-check">
@ -127,7 +155,9 @@
<div id="languages"> <div id="languages">
<div class="form-check" th:each="language, iterStat : ${languages}"> <div class="form-check" th:each="language, iterStat : ${languages}">
<input onchange="handleLangSelection()" required th:checked="${language == 'eng'}" th:id="${'language-' + language}" th:name="languages" th:value="${language}" type="checkbox" /> <input onchange="handleLangSelection()" required th:checked="${language == 'eng'}" th:id="${'language-' + language}" th:name="languages" th:value="${language}" type="checkbox" />
<label th:for="${'language-' + language}" th:text="${language}"></label> <label th:attr="data-lang-code=${language}, data-lang-name=#{'lang.' + language}"
th:for="${'language-' + language}"
th:text="${language}"></label>
</div> </div>
</div> </div>
<small class="form-text text-muted" id="ocr-desc">Used when OCR restoration is needed</small> <small class="form-text text-muted" id="ocr-desc">Used when OCR restoration is needed</small>
@ -180,11 +210,7 @@
} }
} }
// Translations for language names // Helper to get translated language from data attribute
const languageTranslations = {};
/*[# th:each="lang : ${languages}"]*/
languageTranslations['[(${lang})]'] = /*[[#{${'lang.' + lang}}]]*/[(${lang})];
/*[/]*/
const localeToTesseract = { const localeToTesseract = {
'en': 'eng', 'fr': 'fra', 'de': 'deu', 'es': 'spa', 'it': 'ita', 'pt': 'por', 'ru': 'rus', 'en': 'eng', 'fr': 'fra', 'de': 'deu', 'es': 'spa', 'it': 'ita', 'pt': 'por', 'ru': 'rus',
@ -195,7 +221,9 @@
}; };
function getTranslatedLanguageName(shortCode) { function getTranslatedLanguageName(shortCode) {
return languageTranslations[shortCode] || shortCode; // Try to find a label with matching code and read its data-lang-name
const label = document.querySelector(`#languages label[for="language-${shortCode}"]`);
return (label && (label.dataset.langName || label.textContent)) || shortCode;
} }
function prioritizeLanguages() { function prioritizeLanguages() {
@ -207,7 +235,9 @@
const label = element.querySelector('label'); const label = element.querySelector('label');
if (label) { if (label) {
const langCode = label.getAttribute('for').split('-')[1]; const langCode = label.getAttribute('for').split('-')[1];
label.textContent = getTranslatedLanguageName(langCode); // Display translated name if available
const translated = label.dataset.langName;
if (translated) label.textContent = translated;
} }
}); });
const browserLanguage = document.documentElement.lang || navigator.language || navigator.userLanguage; const browserLanguage = document.documentElement.lang || navigator.language || navigator.userLanguage;
@ -242,7 +272,7 @@
const guaranteeRedactionCheckbox = document.getElementById('guaranteeRedaction'); const guaranteeRedactionCheckbox = document.getElementById('guaranteeRedaction');
const defaultColor = document.getElementById('defaultColor'); const defaultColor = document.getElementById('defaultColor');
function updateMode() { function updateMode() {
const selectedMode = document.querySelector('input[name="redactionMode"]:checked'); const selectedMode = document.querySelector('input[name="redactionMode"]:checked');
if (selectedMode) { if (selectedMode) {
// Set aggressive mode for delete text option // Set aggressive mode for delete text option
@ -256,6 +286,11 @@
// Delete Text and Keep Layout modes disable PDF image // Delete Text and Keep Layout modes disable PDF image
guaranteeRedactionCheckbox.checked = false; guaranteeRedactionCheckbox.checked = false;
} }
// Highlight selected card
document.querySelectorAll('.redaction-options-group .form-check').forEach(div => div.classList.remove('selected'));
const parent = selectedMode.closest('.form-check');
if (parent) parent.classList.add('selected');
} }
} }