mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-09-08 17:51:20 +02:00
Refactor redaction services and utilities for improved readability and maintainability
- Adjusted indentation and formatting across multiple files for consistency. - Improved import ordering in utility classes for better organization. - Enhanced `performTextReplacementAggressive` method with multi-sweep logic to handle residual text more effectively. - Added helper methods for verifying document text targets to streamline aggressive redaction. - Simplified logic and formatting in `RedactionService` and related classes. Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
parent
8f19369c58
commit
1fac74a3ca
@ -65,6 +65,7 @@ public class RedactionService {
|
||||
private static final int FONT_SCALE_FACTOR = 1000;
|
||||
private static final Set<String> TEXT_SHOWING_OPERATORS = Set.of("Tj", "TJ", "'", "\"");
|
||||
private static final COSString EMPTY_COS_STRING = new COSString("");
|
||||
private static final int MAX_SWEEPS = 3;
|
||||
private static final ThreadLocal<Boolean> AGGRESSIVE_MODE =
|
||||
ThreadLocal.withInitial(() -> Boolean.FALSE);
|
||||
private static final ThreadLocal<Map<Integer, List<AggressiveSegMatch>>> AGGR_SEG_MATCHES =
|
||||
@ -268,6 +269,26 @@ public class RedactionService {
|
||||
return false;
|
||||
}
|
||||
|
||||
private static boolean documentStillContainsTargets(
|
||||
PDDocument document,
|
||||
Set<String> targetWords,
|
||||
boolean useRegex,
|
||||
boolean wholeWordSearch) {
|
||||
try {
|
||||
int idx = -1;
|
||||
for (int i = 0; i < document.getNumberOfPages(); i++) {
|
||||
idx++;
|
||||
if (pageStillContainsTargets(
|
||||
document, idx, targetWords, useRegex, wholeWordSearch)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
} catch (Exception ignored) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public static Map<Integer, List<PDFText>> findTextToRedact(
|
||||
PDDocument document, String[] listOfText, boolean useRegex, boolean wholeWordSearch) {
|
||||
Map<Integer, List<PDFText>> allFoundTextsByPage = new HashMap<>();
|
||||
@ -809,6 +830,8 @@ public class RedactionService {
|
||||
.collect(Collectors.toSet());
|
||||
AGGRESSIVE_MODE.set(Boolean.TRUE);
|
||||
try {
|
||||
for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) {
|
||||
boolean anyResidual = false;
|
||||
int pageIndex = -1;
|
||||
for (PDPage page : document.getPages()) {
|
||||
pageIndex++;
|
||||
@ -816,7 +839,11 @@ public class RedactionService {
|
||||
AGGR_SEG_MATCHES.remove();
|
||||
List<Object> filtered =
|
||||
createTokensWithoutTargetText(
|
||||
document, page, allSearchTerms, useRegex, wholeWordSearchBool);
|
||||
document,
|
||||
page,
|
||||
allSearchTerms,
|
||||
useRegex,
|
||||
wholeWordSearchBool);
|
||||
writeFilteredContentStream(document, page, filtered);
|
||||
boolean residual =
|
||||
pageStillContainsTargets(
|
||||
@ -826,6 +853,7 @@ public class RedactionService {
|
||||
useRegex,
|
||||
wholeWordSearchBool);
|
||||
if (residual) {
|
||||
anyResidual = true;
|
||||
try {
|
||||
var sem = wipeAllSemanticTextInTokens(filtered);
|
||||
filtered = sem.tokens;
|
||||
@ -842,6 +870,16 @@ public class RedactionService {
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
}
|
||||
// If no residuals detected in this sweep, stop early
|
||||
if (!anyResidual) {
|
||||
break;
|
||||
}
|
||||
// As a safety, if nothing left in the doc, stop
|
||||
if (!documentStillContainsTargets(
|
||||
document, allSearchTerms, useRegex, wholeWordSearchBool)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
AGGRESSIVE_MODE.remove();
|
||||
}
|
||||
@ -862,12 +900,19 @@ public class RedactionService {
|
||||
.map(String::trim)
|
||||
.filter(s -> !s.isEmpty())
|
||||
.collect(Collectors.toSet());
|
||||
for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) {
|
||||
for (PDPage page : document.getPages()) {
|
||||
List<Object> filtered =
|
||||
createTokensWithoutTargetText(
|
||||
document, page, allSearchTerms, useRegex, wholeWordSearchBool);
|
||||
writeFilteredContentStream(document, page, filtered);
|
||||
}
|
||||
// Stop early if nothing remains
|
||||
if (!documentStillContainsTargets(
|
||||
document, allSearchTerms, useRegex, wholeWordSearchBool)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
} catch (Exception e) {
|
||||
return true;
|
||||
@ -1473,7 +1518,8 @@ public class RedactionService {
|
||||
String originalPart =
|
||||
originalText.substring(
|
||||
redactionStartInString, redactionEndInString);
|
||||
if (!Boolean.TRUE.equals(AGGRESSIVE_MODE.get()) && segment.getFont() != null
|
||||
if (!Boolean.TRUE.equals(AGGRESSIVE_MODE.get())
|
||||
&& segment.getFont() != null
|
||||
&& !TextEncodingHelper.isTextSegmentRemovable(
|
||||
segment.getFont(), originalPart)) {
|
||||
continue;
|
||||
@ -1514,7 +1560,10 @@ public class RedactionService {
|
||||
}
|
||||
String modifiedString = newText.toString();
|
||||
newArray.add(new COSString(modifiedString));
|
||||
if (!Boolean.TRUE.equals(AGGRESSIVE_MODE.get()) && modified && segment.getFont() != null && segment.getFontSize() > 0) {
|
||||
if (!Boolean.TRUE.equals(AGGRESSIVE_MODE.get())
|
||||
&& modified
|
||||
&& segment.getFont() != null
|
||||
&& segment.getFontSize() > 0) {
|
||||
try {
|
||||
float originalWidth =
|
||||
safeGetStringWidth(segment.getFont(), originalText)
|
||||
@ -1847,8 +1896,7 @@ public class RedactionService {
|
||||
private PDFont font = null;
|
||||
private float fontSize = 0;
|
||||
|
||||
public GraphicsState() {
|
||||
}
|
||||
public GraphicsState() {}
|
||||
}
|
||||
|
||||
@Data
|
||||
|
@ -1,17 +1,17 @@
|
||||
package stirling.software.SPDF.utils.text;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
import org.apache.pdfbox.cos.COSString;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType0Font;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.CharBuffer;
|
||||
import java.nio.charset.CharsetDecoder;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import org.apache.pdfbox.cos.COSString;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType0Font;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@UtilityClass
|
||||
public class TextDecodingHelper {
|
||||
@ -89,8 +89,7 @@ public class TextDecodingHelper {
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
|
||||
if (charStr == null
|
||||
&& font instanceof PDType0Font type0Font) {
|
||||
if (charStr == null && font instanceof PDType0Font type0Font) {
|
||||
try {
|
||||
int cid = (bytes.length > 1) ? ((bytes[0] & 0xFF) << 8) | (bytes[1] & 0xFF) : code;
|
||||
charStr = type0Font.toUnicode(cid);
|
||||
@ -143,8 +142,7 @@ public class TextDecodingHelper {
|
||||
try {
|
||||
if (bytes.length >= 2) {
|
||||
ByteBuffer buffer = ByteBuffer.wrap(bytes);
|
||||
CharsetDecoder decoder =
|
||||
StandardCharsets.UTF_16BE.newDecoder();
|
||||
CharsetDecoder decoder = StandardCharsets.UTF_16BE.newDecoder();
|
||||
CharBuffer charBuffer = decoder.decode(buffer);
|
||||
return charBuffer.toString();
|
||||
}
|
||||
|
@ -2,12 +2,12 @@ package stirling.software.SPDF.utils.text;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
|
||||
import org.apache.pdfbox.pdmodel.font.encoding.DictionaryEncoding;
|
||||
import org.apache.pdfbox.pdmodel.font.encoding.Encoding;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
|
@ -5,13 +5,13 @@ import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDResources;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@UtilityClass
|
||||
public class TextFinderUtils {
|
||||
|
@ -1,9 +1,9 @@
|
||||
package stirling.software.SPDF.utils.text;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@ -44,8 +44,7 @@ public class WidthCalculator {
|
||||
}
|
||||
}
|
||||
|
||||
private float calculateWidthWithCharacterIteration(
|
||||
PDFont font, String text, float fontSize) {
|
||||
private float calculateWidthWithCharacterIteration(PDFont font, String text, float fontSize) {
|
||||
try {
|
||||
float totalWidth = 0;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user