mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-09-08 17:51:20 +02:00
improve search term extraction and enhance redaction options
Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
parent
8c38ecf899
commit
38c261a82e
@ -2,8 +2,10 @@ package stirling.software.SPDF.service;
|
|||||||
|
|
||||||
import java.io.ByteArrayOutputStream;
|
import java.io.ByteArrayOutputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
@ -13,7 +15,7 @@ import stirling.software.SPDF.model.api.security.RedactPdfRequest;
|
|||||||
import stirling.software.common.service.CustomPDFDocumentFactory;
|
import stirling.software.common.service.CustomPDFDocumentFactory;
|
||||||
|
|
||||||
@Service
|
@Service
|
||||||
class AggressiveRedactionService implements RedactionModeStrategy {
|
public final class AggressiveRedactionService implements RedactionModeStrategy {
|
||||||
|
|
||||||
private final CustomPDFDocumentFactory pdfDocumentFactory;
|
private final CustomPDFDocumentFactory pdfDocumentFactory;
|
||||||
private final RedactionService helper;
|
private final RedactionService helper;
|
||||||
@ -24,13 +26,40 @@ class AggressiveRedactionService implements RedactionModeStrategy {
|
|||||||
this.helper = helper;
|
this.helper = helper;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static String[] extractSearchTerms(RedactPdfRequest request) {
|
||||||
|
if (request == null || request.getListOfText() == null) {
|
||||||
|
return new String[0];
|
||||||
|
}
|
||||||
|
// Normalize by line breaks (handles \n, \r\n, etc.), trim, and drop blanks/duplicates while
|
||||||
|
// preserving order
|
||||||
|
List<String> terms =
|
||||||
|
Arrays.stream(request.getListOfText().split("\\R"))
|
||||||
|
.map(String::trim)
|
||||||
|
.filter(s -> !s.isEmpty())
|
||||||
|
.distinct()
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
return terms.toArray(new String[0]);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static byte[] toByteArray(PDDocument doc) throws IOException {
|
||||||
|
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
|
||||||
|
doc.save(baos);
|
||||||
|
return baos.toByteArray();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public byte[] redact(RedactPdfRequest request) throws IOException {
|
public byte[] redact(RedactPdfRequest request) throws IOException {
|
||||||
String[] listOfText = request.getListOfText().split("\n");
|
String[] listOfText = extractSearchTerms(request);
|
||||||
boolean useRegex = Boolean.TRUE.equals(request.getUseRegex());
|
boolean useRegex = Boolean.TRUE.equals(request.getUseRegex());
|
||||||
boolean wholeWord = Boolean.TRUE.equals(request.getWholeWordSearch());
|
boolean wholeWord = Boolean.TRUE.equals(request.getWholeWordSearch());
|
||||||
|
|
||||||
try (PDDocument doc = pdfDocumentFactory.load(request.getFileInput())) {
|
try (PDDocument doc = pdfDocumentFactory.load(request.getFileInput())) {
|
||||||
|
// If no valid search terms provided, return original document unmodified
|
||||||
|
if (listOfText.length == 0) {
|
||||||
|
return toByteArray(doc);
|
||||||
|
}
|
||||||
|
|
||||||
Map<Integer, List<PDFText>> allFound =
|
Map<Integer, List<PDFText>> allFound =
|
||||||
RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord);
|
RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord);
|
||||||
if (allFound.isEmpty()) {
|
if (allFound.isEmpty()) {
|
||||||
@ -58,11 +87,4 @@ class AggressiveRedactionService implements RedactionModeStrategy {
|
|||||||
throw new IOException("Aggressive redaction failed: " + e.getMessage(), e);
|
throw new IOException("Aggressive redaction failed: " + e.getMessage(), e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private byte[] toByteArray(PDDocument doc) throws IOException {
|
|
||||||
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
|
|
||||||
doc.save(baos);
|
|
||||||
return baos.toByteArray();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -2,8 +2,10 @@ package stirling.software.SPDF.service;
|
|||||||
|
|
||||||
import java.io.ByteArrayOutputStream;
|
import java.io.ByteArrayOutputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
@ -13,7 +15,7 @@ import stirling.software.SPDF.model.api.security.RedactPdfRequest;
|
|||||||
import stirling.software.common.service.CustomPDFDocumentFactory;
|
import stirling.software.common.service.CustomPDFDocumentFactory;
|
||||||
|
|
||||||
@Service
|
@Service
|
||||||
class ModerateRedactionService implements RedactionModeStrategy {
|
public final class ModerateRedactionService implements RedactionModeStrategy {
|
||||||
|
|
||||||
private final CustomPDFDocumentFactory pdfDocumentFactory;
|
private final CustomPDFDocumentFactory pdfDocumentFactory;
|
||||||
private final RedactionService helper;
|
private final RedactionService helper;
|
||||||
@ -23,13 +25,40 @@ class ModerateRedactionService implements RedactionModeStrategy {
|
|||||||
this.helper = helper;
|
this.helper = helper;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static String[] extractSearchTerms(RedactPdfRequest request) {
|
||||||
|
if (request == null || request.getListOfText() == null) {
|
||||||
|
return new String[0];
|
||||||
|
}
|
||||||
|
// Normalize by line breaks (handles \n, \r\n, etc.), trim, and drop blanks/duplicates while
|
||||||
|
// preserving order
|
||||||
|
List<String> terms =
|
||||||
|
Arrays.stream(request.getListOfText().split("\\R"))
|
||||||
|
.map(String::trim)
|
||||||
|
.filter(s -> !s.isEmpty())
|
||||||
|
.distinct()
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
return terms.toArray(new String[0]);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static byte[] toByteArray(PDDocument doc) throws IOException {
|
||||||
|
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
|
||||||
|
doc.save(baos);
|
||||||
|
return baos.toByteArray();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public byte[] redact(RedactPdfRequest request) throws IOException {
|
public byte[] redact(RedactPdfRequest request) throws IOException {
|
||||||
String[] listOfText = request.getListOfText().split("\n");
|
String[] listOfText = extractSearchTerms(request);
|
||||||
boolean useRegex = Boolean.TRUE.equals(request.getUseRegex());
|
boolean useRegex = Boolean.TRUE.equals(request.getUseRegex());
|
||||||
boolean wholeWord = Boolean.TRUE.equals(request.getWholeWordSearch());
|
boolean wholeWord = Boolean.TRUE.equals(request.getWholeWordSearch());
|
||||||
|
|
||||||
try (PDDocument doc = pdfDocumentFactory.load(request.getFileInput())) {
|
try (PDDocument doc = pdfDocumentFactory.load(request.getFileInput())) {
|
||||||
|
// If no valid search terms provided, return original document unmodified
|
||||||
|
if (listOfText.length == 0) {
|
||||||
|
return toByteArray(doc);
|
||||||
|
}
|
||||||
|
|
||||||
Map<Integer, List<PDFText>> allFound =
|
Map<Integer, List<PDFText>> allFound =
|
||||||
RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord);
|
RedactionService.findTextToRedact(doc, listOfText, useRegex, wholeWord);
|
||||||
if (allFound.isEmpty()) {
|
if (allFound.isEmpty()) {
|
||||||
@ -54,11 +83,4 @@ class ModerateRedactionService implements RedactionModeStrategy {
|
|||||||
throw new IOException("Moderate redaction failed: " + e.getMessage(), e);
|
throw new IOException("Moderate redaction failed: " + e.getMessage(), e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private byte[] toByteArray(PDDocument doc) throws IOException {
|
|
||||||
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
|
|
||||||
doc.save(baos);
|
|
||||||
return baos.toByteArray();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -1654,7 +1654,8 @@ public class RedactionService {
|
|||||||
if (cw > 0) {
|
if (cw > 0) {
|
||||||
int count = Math.max(1, Math.round(targetWidth / cw));
|
int count = Math.max(1, Math.round(targetWidth / cw));
|
||||||
int max = (originalWord != null ? originalWord.length() : 1) * 2;
|
int max = (originalWord != null ? originalWord.length() : 1) * 2;
|
||||||
return " ".repeat(Math.min(count, max));
|
// Repeat the chosen alternative character, not spaces
|
||||||
|
return alt.repeat(Math.min(count, max));
|
||||||
}
|
}
|
||||||
} catch (Exception ignored) {
|
} catch (Exception ignored) {
|
||||||
}
|
}
|
||||||
@ -1969,9 +1970,11 @@ public class RedactionService {
|
|||||||
try {
|
try {
|
||||||
if (bytes.length >= 2) {
|
if (bytes.length >= 2) {
|
||||||
if ((bytes[0] & 0xFF) == 0xFE && (bytes[1] & 0xFF) == 0xFF) {
|
if ((bytes[0] & 0xFF) == 0xFE && (bytes[1] & 0xFF) == 0xFF) {
|
||||||
|
// UTF-16BE BOM
|
||||||
return new String(
|
return new String(
|
||||||
bytes, 2, bytes.length - 2, StandardCharsets.UTF_16LE);
|
bytes, 2, bytes.length - 2, StandardCharsets.UTF_16BE);
|
||||||
} else if ((bytes[0] & 0xFF) == 0xFF && (bytes[1] & 0xFF) == 0xFE) {
|
} else if ((bytes[0] & 0xFF) == 0xFF && (bytes[1] & 0xFF) == 0xFE) {
|
||||||
|
// UTF-16LE BOM
|
||||||
return new String(
|
return new String(
|
||||||
bytes, 2, bytes.length - 2, StandardCharsets.UTF_16LE);
|
bytes, 2, bytes.length - 2, StandardCharsets.UTF_16LE);
|
||||||
}
|
}
|
||||||
@ -2384,7 +2387,9 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
if ("Tj".equals(segment.operatorName) || "'".equals(segment.operatorName)) {
|
if ("Tj".equals(segment.operatorName)
|
||||||
|
|| "'".equals(segment.operatorName)
|
||||||
|
|| "\"".equals(segment.operatorName)) {
|
||||||
log.debug(
|
log.debug(
|
||||||
"Creating modification task for Tj operator at segment {}",
|
"Creating modification task for Tj operator at segment {}",
|
||||||
segmentIndex);
|
segmentIndex);
|
||||||
@ -2702,9 +2707,10 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private String handleQuotedOperator(Object token, PDFont font) {
|
private String handleQuotedOperator(Object token, PDFont font) {
|
||||||
|
// Do not add an extra newline; it shifts indices and breaks match ranges
|
||||||
return (token instanceof COSString cosString)
|
return (token instanceof COSString cosString)
|
||||||
? "\n" + extractStringWithFallbacks(cosString, font)
|
? extractStringWithFallbacks(cosString, font)
|
||||||
: "\n";
|
: "";
|
||||||
}
|
}
|
||||||
|
|
||||||
private String handleTJOperator(Object token, PDFont font) {
|
private String handleTJOperator(Object token, PDFont font) {
|
||||||
|
@ -926,10 +926,10 @@ autoRedact.redactionStyleLabel=Redaction Style
|
|||||||
autoRedact.pdfImageDescription=For maximum security, uses an image-based method to ensure text is unrecoverable. May slightly affect document quality.
|
autoRedact.pdfImageDescription=For maximum security, uses an image-based method to ensure text is unrecoverable. May slightly affect document quality.
|
||||||
autoRedact.visualRedactionLabel=Visual
|
autoRedact.visualRedactionLabel=Visual
|
||||||
autoRedact.visualRedactionDescription=Converts to image with visual redactions for maximum security.
|
autoRedact.visualRedactionDescription=Converts to image with visual redactions for maximum security.
|
||||||
autoRedact.deleteTextLabel=Delete Text
|
autoRedact.deleteTextLabel=Remove Text
|
||||||
autoRedact.deleteTextDescription=Removes the text completely. This may alter the original layout or leave a gap.
|
autoRedact.deleteTextDescription=Removes text completely, allowing the surrounding content to shift. This may change the document's original appearance.
|
||||||
autoRedact.keepLayoutLabel=Keep Layout
|
autoRedact.keepLayoutLabel=Remove Text & Cover (Preserve Layout)
|
||||||
autoRedact.keepLayoutDescription=Covers text with a redaction box, preserving the page's original design.
|
autoRedact.keepLayoutDescription=Removes the underlying text and places a redaction box in its place, preserving the document's original layout.
|
||||||
|
|
||||||
#redact
|
#redact
|
||||||
redact.title=Manual Redaction
|
redact.title=Manual Redaction
|
||||||
|
@ -32,6 +32,34 @@
|
|||||||
border-radius: 5px;
|
border-radius: 5px;
|
||||||
padding: 10px;
|
padding: 10px;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Better visibility for selected redaction option */
|
||||||
|
.redaction-options-group .form-check {
|
||||||
|
border: 1px solid var(--md-sys-color-surface-3);
|
||||||
|
border-radius: 8px;
|
||||||
|
padding: 10px 12px;
|
||||||
|
transition: border-color .15s ease, background-color .15s ease, box-shadow .15s ease;
|
||||||
|
}
|
||||||
|
|
||||||
|
.redaction-options-group .form-check + .form-check { margin-top: .5rem; }
|
||||||
|
|
||||||
|
.redaction-options-group .form-check:hover {
|
||||||
|
background-color: var(--md-sys-color-surface-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
.redaction-options-group .form-check.selected {
|
||||||
|
border-color: #0d6efd;
|
||||||
|
background-color: rgba(13,110,253,0.06);
|
||||||
|
box-shadow: 0 0 0 2px rgba(13,110,253,0.1) inset;
|
||||||
|
}
|
||||||
|
|
||||||
|
.redaction-options-group .form-check .form-check-label {
|
||||||
|
font-weight: 600;
|
||||||
|
}
|
||||||
|
|
||||||
|
.redaction-options-group small.form-text {
|
||||||
|
margin-left: 1.8rem; /* align with radio */
|
||||||
|
}
|
||||||
</style>
|
</style>
|
||||||
</head>
|
</head>
|
||||||
|
|
||||||
@ -79,13 +107,13 @@
|
|||||||
</div>
|
</div>
|
||||||
<div class="form-check mb-2">
|
<div class="form-check mb-2">
|
||||||
<input aria-describedby="delete-desc" class="form-check-input" id="deleteText" name="redactionMode" type="radio" value="aggressive">
|
<input aria-describedby="delete-desc" class="form-check-input" id="deleteText" name="redactionMode" type="radio" value="aggressive">
|
||||||
<label class="form-check-label" for="deleteText" th:text="#{autoRedact.deleteTextLabel}">Delete Text</label>
|
<label class="form-check-label" for="deleteText" th:text="#{autoRedact.deleteTextLabel}">Remove Text</label>
|
||||||
<small class="form-text text-muted d-block mt-1" id="delete-desc" th:text="#{autoRedact.deleteTextDescription}">Removes the text completely. This may alter the original layout or leave a gap.</small>
|
<small class="form-text text-muted d-block mt-1" id="delete-desc" th:text="#{autoRedact.deleteTextDescription}">Removes text completely, allowing the surrounding content to shift. This may change the document's original appearance.</small>
|
||||||
</div>
|
</div>
|
||||||
<div class="form-check mb-3">
|
<div class="form-check mb-3">
|
||||||
<input aria-describedby="keep-desc" class="form-check-input" id="keepLayout" name="redactionMode" type="radio" value="moderate">
|
<input aria-describedby="keep-desc" class="form-check-input" id="keepLayout" name="redactionMode" type="radio" value="moderate">
|
||||||
<label class="form-check-label" for="keepLayout" th:text="#{autoRedact.keepLayoutLabel}">Keep Layout</label>
|
<label class="form-check-label" for="keepLayout" th:text="#{autoRedact.keepLayoutLabel}">Remove Text & Cover (Preserve Layout)</label>
|
||||||
<small class="form-text text-muted d-block mt-1" id="keep-desc" th:text="#{autoRedact.keepLayoutDescription}">Covers text with a redaction box, preserving the page's original design.</small>
|
<small class="form-text text-muted d-block mt-1" id="keep-desc" th:text="#{autoRedact.keepLayoutDescription}">Removes the underlying text and places a redaction box in its place, preserving the document's original layout.</small>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="form-check">
|
<div class="form-check">
|
||||||
@ -127,7 +155,9 @@
|
|||||||
<div id="languages">
|
<div id="languages">
|
||||||
<div class="form-check" th:each="language, iterStat : ${languages}">
|
<div class="form-check" th:each="language, iterStat : ${languages}">
|
||||||
<input onchange="handleLangSelection()" required th:checked="${language == 'eng'}" th:id="${'language-' + language}" th:name="languages" th:value="${language}" type="checkbox" />
|
<input onchange="handleLangSelection()" required th:checked="${language == 'eng'}" th:id="${'language-' + language}" th:name="languages" th:value="${language}" type="checkbox" />
|
||||||
<label th:for="${'language-' + language}" th:text="${language}"></label>
|
<label th:attr="data-lang-code=${language}, data-lang-name=#{'lang.' + language}"
|
||||||
|
th:for="${'language-' + language}"
|
||||||
|
th:text="${language}"></label>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<small class="form-text text-muted" id="ocr-desc">Used when OCR restoration is needed</small>
|
<small class="form-text text-muted" id="ocr-desc">Used when OCR restoration is needed</small>
|
||||||
@ -180,11 +210,7 @@
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Translations for language names
|
// Helper to get translated language from data attribute
|
||||||
const languageTranslations = {};
|
|
||||||
/*[# th:each="lang : ${languages}"]*/
|
|
||||||
languageTranslations['[(${lang})]'] = /*[[#{${'lang.' + lang}}]]*/[(${lang})];
|
|
||||||
/*[/]*/
|
|
||||||
|
|
||||||
const localeToTesseract = {
|
const localeToTesseract = {
|
||||||
'en': 'eng', 'fr': 'fra', 'de': 'deu', 'es': 'spa', 'it': 'ita', 'pt': 'por', 'ru': 'rus',
|
'en': 'eng', 'fr': 'fra', 'de': 'deu', 'es': 'spa', 'it': 'ita', 'pt': 'por', 'ru': 'rus',
|
||||||
@ -195,7 +221,9 @@
|
|||||||
};
|
};
|
||||||
|
|
||||||
function getTranslatedLanguageName(shortCode) {
|
function getTranslatedLanguageName(shortCode) {
|
||||||
return languageTranslations[shortCode] || shortCode;
|
// Try to find a label with matching code and read its data-lang-name
|
||||||
|
const label = document.querySelector(`#languages label[for="language-${shortCode}"]`);
|
||||||
|
return (label && (label.dataset.langName || label.textContent)) || shortCode;
|
||||||
}
|
}
|
||||||
|
|
||||||
function prioritizeLanguages() {
|
function prioritizeLanguages() {
|
||||||
@ -207,7 +235,9 @@
|
|||||||
const label = element.querySelector('label');
|
const label = element.querySelector('label');
|
||||||
if (label) {
|
if (label) {
|
||||||
const langCode = label.getAttribute('for').split('-')[1];
|
const langCode = label.getAttribute('for').split('-')[1];
|
||||||
label.textContent = getTranslatedLanguageName(langCode);
|
// Display translated name if available
|
||||||
|
const translated = label.dataset.langName;
|
||||||
|
if (translated) label.textContent = translated;
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
const browserLanguage = document.documentElement.lang || navigator.language || navigator.userLanguage;
|
const browserLanguage = document.documentElement.lang || navigator.language || navigator.userLanguage;
|
||||||
@ -256,6 +286,11 @@
|
|||||||
// Delete Text and Keep Layout modes disable PDF image
|
// Delete Text and Keep Layout modes disable PDF image
|
||||||
guaranteeRedactionCheckbox.checked = false;
|
guaranteeRedactionCheckbox.checked = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Highlight selected card
|
||||||
|
document.querySelectorAll('.redaction-options-group .form-check').forEach(div => div.classList.remove('selected'));
|
||||||
|
const parent = selectedMode.closest('.form-check');
|
||||||
|
if (parent) parent.classList.add('selected');
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user