mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-09-08 17:51:20 +02:00
improve search term extraction and enhance redaction options
Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
parent
8c38ecf899
commit
38c261a82e
@ -2,8 +2,10 @@ package stirling.software.SPDF.service;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.springframework.stereotype.Service;
|
||||
@ -13,7 +15,7 @@ import stirling.software.SPDF.model.api.security.RedactPdfRequest;
|
||||
import stirling.software.common.service.CustomPDFDocumentFactory;
|
||||
|
||||
@Service
|
||||
class AggressiveRedactionService implements RedactionModeStrategy {
|
||||
public final class AggressiveRedactionService implements RedactionModeStrategy {
|
||||
|
||||
private final CustomPDFDocumentFactory pdfDocumentFactory;
|
||||
private final RedactionService helper;
|
||||
@ -24,13 +26,40 @@ class AggressiveRedactionService implements RedactionModeStrategy {
|
||||
this.helper = helper;
|
||||
}
|
||||
|
||||
private static String[] extractSearchTerms(RedactPdfRequest request) {
|
||||
if (request == null || request.getListOfText() == null) {
|
||||
return new String[0];
|
||||
}
|
||||
// Normalize by line breaks (handles \n, \r\n, etc.), trim, and drop blanks/duplicates while
|
||||
// preserving order
|
||||
List<String> terms =
|
||||
Arrays.stream(request.getListOfText().split("\\R"))
|
||||
.map(String::trim)
|
||||
.filter(s -> !s.isEmpty())
|
||||
.distinct()
|
||||
.collect(Collectors.toList());
|
||||
return terms.toArray(new String[0]);
|
||||
}
|
||||
|
||||
private static byte[] toByteArray(PDDocument doc) throws IOException {
|
||||
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
|
||||
doc.save(baos);
|
||||
return baos.toByteArray();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte[] redact(RedactPdfRequest request) throws IOException {
|
||||
String[] listOfText = request.getListOfText().split("\n");
|
||||
String[] listOfText = extractSearchTerms(request);
|
||||
boolean useRegex = Boolean.TRUE.equals(request.getUseRegex());
|
||||
boolean wholeWord = Boolean.TRUE.equals(request.getWholeWordSearch());
|
||||
|
||||
try (PDDocument doc = pdfDocumentFactory.load(request.getFileInput())) {
|
||||
// If no valid search terms provided, return original document unmodified
|
||||
if (listOfText.length == 0) {
|
||||
return toByteArray(doc);
|
||||
}
|
||||
|
||||
Map<Integer, List<PDFText>> allFound =
|
||||
RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord);
|
||||
if (allFound.isEmpty()) {
|
||||
@ -58,11 +87,4 @@ class AggressiveRedactionService implements RedactionModeStrategy {
|
||||
throw new IOException("Aggressive redaction failed: " + e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
private byte[] toByteArray(PDDocument doc) throws IOException {
|
||||
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
|
||||
doc.save(baos);
|
||||
return baos.toByteArray();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -2,8 +2,10 @@ package stirling.software.SPDF.service;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.springframework.stereotype.Service;
|
||||
@ -13,7 +15,7 @@ import stirling.software.SPDF.model.api.security.RedactPdfRequest;
|
||||
import stirling.software.common.service.CustomPDFDocumentFactory;
|
||||
|
||||
@Service
|
||||
class ModerateRedactionService implements RedactionModeStrategy {
|
||||
public final class ModerateRedactionService implements RedactionModeStrategy {
|
||||
|
||||
private final CustomPDFDocumentFactory pdfDocumentFactory;
|
||||
private final RedactionService helper;
|
||||
@ -23,13 +25,40 @@ class ModerateRedactionService implements RedactionModeStrategy {
|
||||
this.helper = helper;
|
||||
}
|
||||
|
||||
private static String[] extractSearchTerms(RedactPdfRequest request) {
|
||||
if (request == null || request.getListOfText() == null) {
|
||||
return new String[0];
|
||||
}
|
||||
// Normalize by line breaks (handles \n, \r\n, etc.), trim, and drop blanks/duplicates while
|
||||
// preserving order
|
||||
List<String> terms =
|
||||
Arrays.stream(request.getListOfText().split("\\R"))
|
||||
.map(String::trim)
|
||||
.filter(s -> !s.isEmpty())
|
||||
.distinct()
|
||||
.collect(Collectors.toList());
|
||||
return terms.toArray(new String[0]);
|
||||
}
|
||||
|
||||
private static byte[] toByteArray(PDDocument doc) throws IOException {
|
||||
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
|
||||
doc.save(baos);
|
||||
return baos.toByteArray();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte[] redact(RedactPdfRequest request) throws IOException {
|
||||
String[] listOfText = request.getListOfText().split("\n");
|
||||
String[] listOfText = extractSearchTerms(request);
|
||||
boolean useRegex = Boolean.TRUE.equals(request.getUseRegex());
|
||||
boolean wholeWord = Boolean.TRUE.equals(request.getWholeWordSearch());
|
||||
|
||||
try (PDDocument doc = pdfDocumentFactory.load(request.getFileInput())) {
|
||||
// If no valid search terms provided, return original document unmodified
|
||||
if (listOfText.length == 0) {
|
||||
return toByteArray(doc);
|
||||
}
|
||||
|
||||
Map<Integer, List<PDFText>> allFound =
|
||||
RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord);
|
||||
if (allFound.isEmpty()) {
|
||||
@ -54,11 +83,4 @@ class ModerateRedactionService implements RedactionModeStrategy {
|
||||
throw new IOException("Moderate redaction failed: " + e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
private byte[] toByteArray(PDDocument doc) throws IOException {
|
||||
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
|
||||
doc.save(baos);
|
||||
return baos.toByteArray();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1654,7 +1654,8 @@ public class RedactionService {
|
||||
if (cw > 0) {
|
||||
int count = Math.max(1, Math.round(targetWidth / cw));
|
||||
int max = (originalWord != null ? originalWord.length() : 1) * 2;
|
||||
return " ".repeat(Math.min(count, max));
|
||||
// Repeat the chosen alternative character, not spaces
|
||||
return alt.repeat(Math.min(count, max));
|
||||
}
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
@ -1969,9 +1970,11 @@ public class RedactionService {
|
||||
try {
|
||||
if (bytes.length >= 2) {
|
||||
if ((bytes[0] & 0xFF) == 0xFE && (bytes[1] & 0xFF) == 0xFF) {
|
||||
// UTF-16BE BOM
|
||||
return new String(
|
||||
bytes, 2, bytes.length - 2, StandardCharsets.UTF_16LE);
|
||||
bytes, 2, bytes.length - 2, StandardCharsets.UTF_16BE);
|
||||
} else if ((bytes[0] & 0xFF) == 0xFF && (bytes[1] & 0xFF) == 0xFE) {
|
||||
// UTF-16LE BOM
|
||||
return new String(
|
||||
bytes, 2, bytes.length - 2, StandardCharsets.UTF_16LE);
|
||||
}
|
||||
@ -2384,7 +2387,9 @@ public class RedactionService {
|
||||
}
|
||||
|
||||
try {
|
||||
if ("Tj".equals(segment.operatorName) || "'".equals(segment.operatorName)) {
|
||||
if ("Tj".equals(segment.operatorName)
|
||||
|| "'".equals(segment.operatorName)
|
||||
|| "\"".equals(segment.operatorName)) {
|
||||
log.debug(
|
||||
"Creating modification task for Tj operator at segment {}",
|
||||
segmentIndex);
|
||||
@ -2702,9 +2707,10 @@ public class RedactionService {
|
||||
}
|
||||
|
||||
private String handleQuotedOperator(Object token, PDFont font) {
|
||||
// Do not add an extra newline; it shifts indices and breaks match ranges
|
||||
return (token instanceof COSString cosString)
|
||||
? "\n" + extractStringWithFallbacks(cosString, font)
|
||||
: "\n";
|
||||
? extractStringWithFallbacks(cosString, font)
|
||||
: "";
|
||||
}
|
||||
|
||||
private String handleTJOperator(Object token, PDFont font) {
|
||||
|
@ -926,10 +926,10 @@ autoRedact.redactionStyleLabel=Redaction Style
|
||||
autoRedact.pdfImageDescription=For maximum security, uses an image-based method to ensure text is unrecoverable. May slightly affect document quality.
|
||||
autoRedact.visualRedactionLabel=Visual
|
||||
autoRedact.visualRedactionDescription=Converts to image with visual redactions for maximum security.
|
||||
autoRedact.deleteTextLabel=Delete Text
|
||||
autoRedact.deleteTextDescription=Removes the text completely. This may alter the original layout or leave a gap.
|
||||
autoRedact.keepLayoutLabel=Keep Layout
|
||||
autoRedact.keepLayoutDescription=Covers text with a redaction box, preserving the page's original design.
|
||||
autoRedact.deleteTextLabel=Remove Text
|
||||
autoRedact.deleteTextDescription=Removes text completely, allowing the surrounding content to shift. This may change the document's original appearance.
|
||||
autoRedact.keepLayoutLabel=Remove Text & Cover (Preserve Layout)
|
||||
autoRedact.keepLayoutDescription=Removes the underlying text and places a redaction box in its place, preserving the document's original layout.
|
||||
|
||||
#redact
|
||||
redact.title=Manual Redaction
|
||||
|
@ -32,6 +32,34 @@
|
||||
border-radius: 5px;
|
||||
padding: 10px;
|
||||
}
|
||||
|
||||
/* Better visibility for selected redaction option */
|
||||
.redaction-options-group .form-check {
|
||||
border: 1px solid var(--md-sys-color-surface-3);
|
||||
border-radius: 8px;
|
||||
padding: 10px 12px;
|
||||
transition: border-color .15s ease, background-color .15s ease, box-shadow .15s ease;
|
||||
}
|
||||
|
||||
.redaction-options-group .form-check + .form-check { margin-top: .5rem; }
|
||||
|
||||
.redaction-options-group .form-check:hover {
|
||||
background-color: var(--md-sys-color-surface-1);
|
||||
}
|
||||
|
||||
.redaction-options-group .form-check.selected {
|
||||
border-color: #0d6efd;
|
||||
background-color: rgba(13,110,253,0.06);
|
||||
box-shadow: 0 0 0 2px rgba(13,110,253,0.1) inset;
|
||||
}
|
||||
|
||||
.redaction-options-group .form-check .form-check-label {
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
.redaction-options-group small.form-text {
|
||||
margin-left: 1.8rem; /* align with radio */
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
|
||||
@ -79,13 +107,13 @@
|
||||
</div>
|
||||
<div class="form-check mb-2">
|
||||
<input aria-describedby="delete-desc" class="form-check-input" id="deleteText" name="redactionMode" type="radio" value="aggressive">
|
||||
<label class="form-check-label" for="deleteText" th:text="#{autoRedact.deleteTextLabel}">Delete Text</label>
|
||||
<small class="form-text text-muted d-block mt-1" id="delete-desc" th:text="#{autoRedact.deleteTextDescription}">Removes the text completely. This may alter the original layout or leave a gap.</small>
|
||||
<label class="form-check-label" for="deleteText" th:text="#{autoRedact.deleteTextLabel}">Remove Text</label>
|
||||
<small class="form-text text-muted d-block mt-1" id="delete-desc" th:text="#{autoRedact.deleteTextDescription}">Removes text completely, allowing the surrounding content to shift. This may change the document's original appearance.</small>
|
||||
</div>
|
||||
<div class="form-check mb-3">
|
||||
<input aria-describedby="keep-desc" class="form-check-input" id="keepLayout" name="redactionMode" type="radio" value="moderate">
|
||||
<label class="form-check-label" for="keepLayout" th:text="#{autoRedact.keepLayoutLabel}">Keep Layout</label>
|
||||
<small class="form-text text-muted d-block mt-1" id="keep-desc" th:text="#{autoRedact.keepLayoutDescription}">Covers text with a redaction box, preserving the page's original design.</small>
|
||||
<label class="form-check-label" for="keepLayout" th:text="#{autoRedact.keepLayoutLabel}">Remove Text &amp; Cover (Preserve Layout)</label>
|
||||
<small class="form-text text-muted d-block mt-1" id="keep-desc" th:text="#{autoRedact.keepLayoutDescription}">Removes the underlying text and places a redaction box in its place, preserving the document's original layout.</small>
|
||||
</div>
|
||||
|
||||
<div class="form-check">
|
||||
@ -127,7 +155,9 @@
|
||||
<div id="languages">
|
||||
<div class="form-check" th:each="language, iterStat : ${languages}">
|
||||
<input onchange="handleLangSelection()" required th:checked="${language == 'eng'}" th:id="${'language-' + language}" th:name="languages" th:value="${language}" type="checkbox" />
|
||||
<label th:for="${'language-' + language}" th:text="${language}"></label>
|
||||
<label th:attr="data-lang-code=${language}, data-lang-name=#{'lang.' + language}"
|
||||
th:for="${'language-' + language}"
|
||||
th:text="${language}"></label>
|
||||
</div>
|
||||
</div>
|
||||
<small class="form-text text-muted" id="ocr-desc">Used when OCR restoration is needed</small>
|
||||
@ -180,11 +210,7 @@
|
||||
}
|
||||
}
|
||||
|
||||
// Translations for language names
|
||||
const languageTranslations = {};
|
||||
/*[# th:each="lang : ${languages}"]*/
|
||||
languageTranslations['[(${lang})]'] = /*[[#{${'lang.' + lang}}]]*/[(${lang})];
|
||||
/*[/]*/
|
||||
// Helper to get translated language from data attribute
|
||||
|
||||
const localeToTesseract = {
|
||||
'en': 'eng', 'fr': 'fra', 'de': 'deu', 'es': 'spa', 'it': 'ita', 'pt': 'por', 'ru': 'rus',
|
||||
@ -195,7 +221,9 @@
|
||||
};
|
||||
|
||||
/**
 * Resolves the display name for an OCR language code.
 *
 * Looks up the label inside #languages whose `for` attribute is `language-<shortCode>` and
 * prefers its `data-lang-name` attribute, then its visible text. Falls back to the code itself
 * when no such label exists or both values are empty.
 *
 * @param {string} shortCode language code as used in the checkbox ids
 * @returns {string} the translated name, or shortCode when none is available
 */
function getTranslatedLanguageName(shortCode) {
  // Try to find a label with matching code and read its data-lang-name
  const label = document.querySelector(`#languages label[for="language-${shortCode}"]`);
  if (!label) {
    return shortCode;
  }
  return label.dataset.langName || label.textContent || shortCode;
}
|
||||
|
||||
function prioritizeLanguages() {
|
||||
@ -207,7 +235,9 @@
|
||||
const label = element.querySelector('label');
|
||||
if (label) {
|
||||
const langCode = label.getAttribute('for').split('-')[1];
|
||||
label.textContent = getTranslatedLanguageName(langCode);
|
||||
// Display translated name if available
|
||||
const translated = label.dataset.langName;
|
||||
if (translated) label.textContent = translated;
|
||||
}
|
||||
});
|
||||
const browserLanguage = document.documentElement.lang || navigator.language || navigator.userLanguage;
|
||||
@ -256,6 +286,11 @@
|
||||
// Delete Text and Keep Layout modes disable PDF image
|
||||
guaranteeRedactionCheckbox.checked = false;
|
||||
}
|
||||
|
||||
// Highlight selected card
|
||||
document.querySelectorAll('.redaction-options-group .form-check').forEach(div => div.classList.remove('selected'));
|
||||
const parent = selectedMode.closest('.form-check');
|
||||
if (parent) parent.classList.add('selected');
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user