mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-09-08 17:51:20 +02:00
Refactor redaction services and utilities for improved readability and maintainability
- Adjusted indentation and formatting across multiple files for consistency.
- Improved import ordering in utility classes for better organization.
- Enhanced the `performTextReplacementAggressive` method with multi-sweep logic to handle residual text more effectively.
- Added helper methods for verifying document text targets to streamline aggressive redaction.
- Simplified logic and formatting in `RedactionService` and related classes.

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
parent
8f19369c58
commit
1fac74a3ca
@ -65,6 +65,7 @@ public class RedactionService {
|
|||||||
private static final int FONT_SCALE_FACTOR = 1000;
|
private static final int FONT_SCALE_FACTOR = 1000;
|
||||||
private static final Set<String> TEXT_SHOWING_OPERATORS = Set.of("Tj", "TJ", "'", "\"");
|
private static final Set<String> TEXT_SHOWING_OPERATORS = Set.of("Tj", "TJ", "'", "\"");
|
||||||
private static final COSString EMPTY_COS_STRING = new COSString("");
|
private static final COSString EMPTY_COS_STRING = new COSString("");
|
||||||
|
private static final int MAX_SWEEPS = 3;
|
||||||
private static final ThreadLocal<Boolean> AGGRESSIVE_MODE =
|
private static final ThreadLocal<Boolean> AGGRESSIVE_MODE =
|
||||||
ThreadLocal.withInitial(() -> Boolean.FALSE);
|
ThreadLocal.withInitial(() -> Boolean.FALSE);
|
||||||
private static final ThreadLocal<Map<Integer, List<AggressiveSegMatch>>> AGGR_SEG_MATCHES =
|
private static final ThreadLocal<Map<Integer, List<AggressiveSegMatch>>> AGGR_SEG_MATCHES =
|
||||||
@ -268,6 +269,26 @@ public class RedactionService {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static boolean documentStillContainsTargets(
|
||||||
|
PDDocument document,
|
||||||
|
Set<String> targetWords,
|
||||||
|
boolean useRegex,
|
||||||
|
boolean wholeWordSearch) {
|
||||||
|
try {
|
||||||
|
int idx = -1;
|
||||||
|
for (int i = 0; i < document.getNumberOfPages(); i++) {
|
||||||
|
idx++;
|
||||||
|
if (pageStillContainsTargets(
|
||||||
|
document, idx, targetWords, useRegex, wholeWordSearch)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (Exception ignored) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
public static Map<Integer, List<PDFText>> findTextToRedact(
|
public static Map<Integer, List<PDFText>> findTextToRedact(
|
||||||
PDDocument document, String[] listOfText, boolean useRegex, boolean wholeWordSearch) {
|
PDDocument document, String[] listOfText, boolean useRegex, boolean wholeWordSearch) {
|
||||||
Map<Integer, List<PDFText>> allFoundTextsByPage = new HashMap<>();
|
Map<Integer, List<PDFText>> allFoundTextsByPage = new HashMap<>();
|
||||||
@ -809,6 +830,8 @@ public class RedactionService {
|
|||||||
.collect(Collectors.toSet());
|
.collect(Collectors.toSet());
|
||||||
AGGRESSIVE_MODE.set(Boolean.TRUE);
|
AGGRESSIVE_MODE.set(Boolean.TRUE);
|
||||||
try {
|
try {
|
||||||
|
for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) {
|
||||||
|
boolean anyResidual = false;
|
||||||
int pageIndex = -1;
|
int pageIndex = -1;
|
||||||
for (PDPage page : document.getPages()) {
|
for (PDPage page : document.getPages()) {
|
||||||
pageIndex++;
|
pageIndex++;
|
||||||
@ -816,7 +839,11 @@ public class RedactionService {
|
|||||||
AGGR_SEG_MATCHES.remove();
|
AGGR_SEG_MATCHES.remove();
|
||||||
List<Object> filtered =
|
List<Object> filtered =
|
||||||
createTokensWithoutTargetText(
|
createTokensWithoutTargetText(
|
||||||
document, page, allSearchTerms, useRegex, wholeWordSearchBool);
|
document,
|
||||||
|
page,
|
||||||
|
allSearchTerms,
|
||||||
|
useRegex,
|
||||||
|
wholeWordSearchBool);
|
||||||
writeFilteredContentStream(document, page, filtered);
|
writeFilteredContentStream(document, page, filtered);
|
||||||
boolean residual =
|
boolean residual =
|
||||||
pageStillContainsTargets(
|
pageStillContainsTargets(
|
||||||
@ -826,6 +853,7 @@ public class RedactionService {
|
|||||||
useRegex,
|
useRegex,
|
||||||
wholeWordSearchBool);
|
wholeWordSearchBool);
|
||||||
if (residual) {
|
if (residual) {
|
||||||
|
anyResidual = true;
|
||||||
try {
|
try {
|
||||||
var sem = wipeAllSemanticTextInTokens(filtered);
|
var sem = wipeAllSemanticTextInTokens(filtered);
|
||||||
filtered = sem.tokens;
|
filtered = sem.tokens;
|
||||||
@ -842,6 +870,16 @@ public class RedactionService {
|
|||||||
} catch (Exception ignored) {
|
} catch (Exception ignored) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// If no residuals detected in this sweep, stop early
|
||||||
|
if (!anyResidual) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
// As a safety, if nothing left in the doc, stop
|
||||||
|
if (!documentStillContainsTargets(
|
||||||
|
document, allSearchTerms, useRegex, wholeWordSearchBool)) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
} finally {
|
} finally {
|
||||||
AGGRESSIVE_MODE.remove();
|
AGGRESSIVE_MODE.remove();
|
||||||
}
|
}
|
||||||
@ -862,12 +900,19 @@ public class RedactionService {
|
|||||||
.map(String::trim)
|
.map(String::trim)
|
||||||
.filter(s -> !s.isEmpty())
|
.filter(s -> !s.isEmpty())
|
||||||
.collect(Collectors.toSet());
|
.collect(Collectors.toSet());
|
||||||
|
for (int sweep = 0; sweep < MAX_SWEEPS; sweep++) {
|
||||||
for (PDPage page : document.getPages()) {
|
for (PDPage page : document.getPages()) {
|
||||||
List<Object> filtered =
|
List<Object> filtered =
|
||||||
createTokensWithoutTargetText(
|
createTokensWithoutTargetText(
|
||||||
document, page, allSearchTerms, useRegex, wholeWordSearchBool);
|
document, page, allSearchTerms, useRegex, wholeWordSearchBool);
|
||||||
writeFilteredContentStream(document, page, filtered);
|
writeFilteredContentStream(document, page, filtered);
|
||||||
}
|
}
|
||||||
|
// Stop early if nothing remains
|
||||||
|
if (!documentStillContainsTargets(
|
||||||
|
document, allSearchTerms, useRegex, wholeWordSearchBool)) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
return false;
|
return false;
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
return true;
|
return true;
|
||||||
@ -1473,7 +1518,8 @@ public class RedactionService {
|
|||||||
String originalPart =
|
String originalPart =
|
||||||
originalText.substring(
|
originalText.substring(
|
||||||
redactionStartInString, redactionEndInString);
|
redactionStartInString, redactionEndInString);
|
||||||
if (!Boolean.TRUE.equals(AGGRESSIVE_MODE.get()) && segment.getFont() != null
|
if (!Boolean.TRUE.equals(AGGRESSIVE_MODE.get())
|
||||||
|
&& segment.getFont() != null
|
||||||
&& !TextEncodingHelper.isTextSegmentRemovable(
|
&& !TextEncodingHelper.isTextSegmentRemovable(
|
||||||
segment.getFont(), originalPart)) {
|
segment.getFont(), originalPart)) {
|
||||||
continue;
|
continue;
|
||||||
@ -1514,7 +1560,10 @@ public class RedactionService {
|
|||||||
}
|
}
|
||||||
String modifiedString = newText.toString();
|
String modifiedString = newText.toString();
|
||||||
newArray.add(new COSString(modifiedString));
|
newArray.add(new COSString(modifiedString));
|
||||||
if (!Boolean.TRUE.equals(AGGRESSIVE_MODE.get()) && modified && segment.getFont() != null && segment.getFontSize() > 0) {
|
if (!Boolean.TRUE.equals(AGGRESSIVE_MODE.get())
|
||||||
|
&& modified
|
||||||
|
&& segment.getFont() != null
|
||||||
|
&& segment.getFontSize() > 0) {
|
||||||
try {
|
try {
|
||||||
float originalWidth =
|
float originalWidth =
|
||||||
safeGetStringWidth(segment.getFont(), originalText)
|
safeGetStringWidth(segment.getFont(), originalText)
|
||||||
@ -1847,8 +1896,7 @@ public class RedactionService {
|
|||||||
private PDFont font = null;
|
private PDFont font = null;
|
||||||
private float fontSize = 0;
|
private float fontSize = 0;
|
||||||
|
|
||||||
public GraphicsState() {
|
public GraphicsState() {}
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Data
|
@Data
|
||||||
|
@ -1,17 +1,17 @@
|
|||||||
package stirling.software.SPDF.utils.text;
|
package stirling.software.SPDF.utils.text;
|
||||||
|
|
||||||
import lombok.experimental.UtilityClass;
|
|
||||||
import org.apache.pdfbox.cos.COSString;
|
|
||||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
|
||||||
import org.apache.pdfbox.pdmodel.font.PDType0Font;
|
|
||||||
|
|
||||||
import lombok.extern.slf4j.Slf4j;
|
|
||||||
|
|
||||||
import java.nio.ByteBuffer;
|
import java.nio.ByteBuffer;
|
||||||
import java.nio.CharBuffer;
|
import java.nio.CharBuffer;
|
||||||
import java.nio.charset.CharsetDecoder;
|
import java.nio.charset.CharsetDecoder;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
|
|
||||||
|
import org.apache.pdfbox.cos.COSString;
|
||||||
|
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||||
|
import org.apache.pdfbox.pdmodel.font.PDType0Font;
|
||||||
|
|
||||||
|
import lombok.experimental.UtilityClass;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
@Slf4j
|
@Slf4j
|
||||||
@UtilityClass
|
@UtilityClass
|
||||||
public class TextDecodingHelper {
|
public class TextDecodingHelper {
|
||||||
@ -89,8 +89,7 @@ public class TextDecodingHelper {
|
|||||||
} catch (Exception ignored) {
|
} catch (Exception ignored) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (charStr == null
|
if (charStr == null && font instanceof PDType0Font type0Font) {
|
||||||
&& font instanceof PDType0Font type0Font) {
|
|
||||||
try {
|
try {
|
||||||
int cid = (bytes.length > 1) ? ((bytes[0] & 0xFF) << 8) | (bytes[1] & 0xFF) : code;
|
int cid = (bytes.length > 1) ? ((bytes[0] & 0xFF) << 8) | (bytes[1] & 0xFF) : code;
|
||||||
charStr = type0Font.toUnicode(cid);
|
charStr = type0Font.toUnicode(cid);
|
||||||
@ -143,8 +142,7 @@ public class TextDecodingHelper {
|
|||||||
try {
|
try {
|
||||||
if (bytes.length >= 2) {
|
if (bytes.length >= 2) {
|
||||||
ByteBuffer buffer = ByteBuffer.wrap(bytes);
|
ByteBuffer buffer = ByteBuffer.wrap(bytes);
|
||||||
CharsetDecoder decoder =
|
CharsetDecoder decoder = StandardCharsets.UTF_16BE.newDecoder();
|
||||||
StandardCharsets.UTF_16BE.newDecoder();
|
|
||||||
CharBuffer charBuffer = decoder.decode(buffer);
|
CharBuffer charBuffer = decoder.decode(buffer);
|
||||||
return charBuffer.toString();
|
return charBuffer.toString();
|
||||||
}
|
}
|
||||||
|
@ -2,12 +2,12 @@ package stirling.software.SPDF.utils.text;
|
|||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
import lombok.experimental.UtilityClass;
|
|
||||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||||
import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
|
import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
|
||||||
import org.apache.pdfbox.pdmodel.font.encoding.DictionaryEncoding;
|
import org.apache.pdfbox.pdmodel.font.encoding.DictionaryEncoding;
|
||||||
import org.apache.pdfbox.pdmodel.font.encoding.Encoding;
|
import org.apache.pdfbox.pdmodel.font.encoding.Encoding;
|
||||||
|
|
||||||
|
import lombok.experimental.UtilityClass;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
@Slf4j
|
@Slf4j
|
||||||
|
@ -5,13 +5,13 @@ import java.util.List;
|
|||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import lombok.experimental.UtilityClass;
|
|
||||||
import org.apache.pdfbox.pdmodel.PDPage;
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
import org.apache.pdfbox.pdmodel.PDResources;
|
import org.apache.pdfbox.pdmodel.PDResources;
|
||||||
|
|
||||||
import lombok.extern.slf4j.Slf4j;
|
|
||||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||||
|
|
||||||
|
import lombok.experimental.UtilityClass;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
@Slf4j
|
@Slf4j
|
||||||
@UtilityClass
|
@UtilityClass
|
||||||
public class TextFinderUtils {
|
public class TextFinderUtils {
|
||||||
|
@ -1,9 +1,9 @@
|
|||||||
package stirling.software.SPDF.utils.text;
|
package stirling.software.SPDF.utils.text;
|
||||||
|
|
||||||
import lombok.experimental.UtilityClass;
|
|
||||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||||
|
|
||||||
|
import lombok.experimental.UtilityClass;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
@Slf4j
|
@Slf4j
|
||||||
@ -44,8 +44,7 @@ public class WidthCalculator {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private float calculateWidthWithCharacterIteration(
|
private float calculateWidthWithCharacterIteration(PDFont font, String text, float fontSize) {
|
||||||
PDFont font, String text, float fontSize) {
|
|
||||||
try {
|
try {
|
||||||
float totalWidth = 0;
|
float totalWidth = 0;
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user