refactor: clean up code formatting and improve readability in RedactionService

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
Balázs Szücs 2025-09-02 21:13:43 +02:00
parent 338b77de99
commit f00952b856

View File

@ -42,6 +42,7 @@ import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.common.PDStream; import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType0Font;
import org.apache.pdfbox.pdmodel.graphics.PDXObject; import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject; import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.pattern.PDTilingPattern; import org.apache.pdfbox.pdmodel.graphics.pattern.PDTilingPattern;
@ -2386,13 +2387,13 @@ public class RedactionService {
} }
private static void mapStartToEnd( private static void mapStartToEnd(
List<TextSegment> segments, List<TextSegment> segments,
List<MatchRange> result, List<MatchRange> result,
Map<Integer, List<AggressiveSegMatch>> perSegMatches, Map<Integer, List<AggressiveSegMatch>> perSegMatches,
List<Integer> decStarts, List<Integer> decStarts,
List<Integer> decEnds, List<Integer> decEnds,
int gStart, int gStart,
int gEnd) { int gEnd) {
for (int sIdx = 0; sIdx < segments.size(); sIdx++) { for (int sIdx = 0; sIdx < segments.size(); sIdx++) {
int sStart = decStarts.get(sIdx); int sStart = decStarts.get(sIdx);
int sEnd = decEnds.get(sIdx); int sEnd = decEnds.get(sIdx);
@ -2413,7 +2414,7 @@ public class RedactionService {
} }
private static WidthCalculationResult calculatePreciseWidthAdjustment( private static WidthCalculationResult calculatePreciseWidthAdjustment(
TextSegment segment, List<MatchRange> matches, String text) { TextSegment segment, List<MatchRange> matches, String text) {
float totalOriginalWidth = 0f, totalPlaceholderWidth = 0f; float totalOriginalWidth = 0f, totalPlaceholderWidth = 0f;
int processedMatches = 0; int processedMatches = 0;
List<String> warnings = new ArrayList<>(); List<String> warnings = new ArrayList<>();
@ -2513,7 +2514,7 @@ public class RedactionService {
} }
private static void addSpacingAdjustment( private static void addSpacingAdjustment(
COSArray array, TextSegment segment, String originalText, String modifiedText) { COSArray array, TextSegment segment, String originalText, String modifiedText) {
try { try {
if (array == null || segment == null || segment.getFont() == null) return; if (array == null || segment == null || segment.getFont() == null) return;
if (Objects.equals(originalText, modifiedText)) return; if (Objects.equals(originalText, modifiedText)) return;
@ -2672,16 +2673,14 @@ public class RedactionService {
merger.setDestinationFileName(finalOutputFile.toString()); merger.setDestinationFileName(finalOutputFile.toString());
for (int pageNum = 0; pageNum < pageCount; pageNum++) { for (int pageNum = 0; pageNum < pageCount; pageNum++) {
BufferedImage image = pdfRenderer.renderImageWithDPI(pageNum, 600); BufferedImage image = pdfRenderer.renderImageWithDPI(pageNum, 600);
File imagePath = File imagePath = new File(tempImagesDir, "page_" + pageNum + ".png");
new File(tempImagesDir, "page_" + pageNum + ".png");
ImageIO.write(image, "png", imagePath); ImageIO.write(image, "png", imagePath);
List<String> command = List<String> command =
new ArrayList<>( new ArrayList<>(
Arrays.asList( Arrays.asList(
"tesseract", "tesseract",
imagePath.toString(), imagePath.toString(),
new File(tempOutputDir, "page_" + pageNum) new File(tempOutputDir, "page_" + pageNum).toString(),
.toString(),
"-l", "-l",
buildLanguageOption(request), buildLanguageOption(request),
"--dpi", "--dpi",
@ -2696,8 +2695,7 @@ public class RedactionService {
throw new IOException( throw new IOException(
"Tesseract restoration failed with return code: " + result.getRc()); "Tesseract restoration failed with return code: " + result.getRc());
} }
java.io.File pageOutputPath = File pageOutputPath = new File(tempOutputDir, "page_" + pageNum + ".pdf");
new java.io.File(tempOutputDir, "page_" + pageNum + ".pdf");
merger.addSource(pageOutputPath); merger.addSource(pageOutputPath);
} }
merger.mergeDocuments(null); merger.mergeDocuments(null);
@ -2736,13 +2734,8 @@ public class RedactionService {
pageText.append(pdfText.getText()).append(" "); pageText.append(pdfText.getText()).append(" ");
} }
} }
String extractedText = pageText.toString().trim();
for (String word : targetWords) {
if (extractedText.toLowerCase().contains(word.toLowerCase())) {}
}
} catch (Exception e) { } catch (Exception e) {
log.debug("Failed to get direct text from page", e);
} }
} }
@ -2776,8 +2769,6 @@ public class RedactionService {
} }
String completeText = allText.toString().trim(); String completeText = allText.toString().trim();
if (!completeText.isEmpty() && hasProblematicChars) {}
} }
List<MatchRange> matches; List<MatchRange> matches;
@ -2939,8 +2930,7 @@ public class RedactionService {
NORMALIZE_WHITESPACE NORMALIZE_WHITESPACE
} }
public interface SemanticScrubber { public interface SemanticScrubber {}
}
private static class GlyphCoverageProbe { private static class GlyphCoverageProbe {
private final PDFont font; private final PDFont font;
@ -2956,7 +2946,7 @@ public class RedactionService {
if (font == null) return coverage; if (font == null) return coverage;
try { try {
if (font instanceof org.apache.pdfbox.pdmodel.font.PDType0Font) { if (font instanceof PDType0Font) {
for (int cid = 0; cid < 65536; cid++) { for (int cid = 0; cid < 65536; cid++) {
try { try {
String unicode = font.toUnicode(cid); String unicode = font.toUnicode(cid);
@ -2964,12 +2954,12 @@ public class RedactionService {
coverage.add(cid); coverage.add(cid);
} }
} catch (Exception e) { } catch (Exception e) {
log.debug("Failed to get unicode for cid {}", cid, e);
} }
} }
} }
} catch (Exception e) { } catch (Exception e) {
log.debug("Failed to get glyph coverage for font {}", font, e);
} }
return coverage; return coverage;
} }
@ -2995,17 +2985,17 @@ public class RedactionService {
String charStr = new String(Character.toChars(codePoint)); String charStr = new String(Character.toChars(codePoint));
return font.getStringWidth(charStr) / FONT_SCALE_FACTOR * fontSize; return font.getStringWidth(charStr) / FONT_SCALE_FACTOR * fontSize;
} catch (Exception e) { } catch (Exception e) {
log.debug("Failed to get width for codepoint {}", codePoint, e);
} }
} }
return switch (strategy) { return switch (strategy) {
case EMBED_WIDTH -> getEmbeddedProgramWidth(codePoint, fontSize); case EMBED_WIDTH -> getEmbeddedProgramWidth(fontSize);
case AVERAGE_WIDTH -> getAverageFontWidth(fontSize); case AVERAGE_WIDTH -> getAverageFontWidth(fontSize);
case LEGACY_SUM -> getLegacySumFallback(codePoint, fontSize); case LEGACY_SUM -> getLegacySumFallback(codePoint, fontSize);
}; };
} }
private float getEmbeddedProgramWidth(int codePoint, float fontSize) { private float getEmbeddedProgramWidth(float fontSize) {
try { try {
if (font.getFontDescriptor() != null) { if (font.getFontDescriptor() != null) {
float avgWidth = font.getFontDescriptor().getAverageWidth(); float avgWidth = font.getFontDescriptor().getAverageWidth();
@ -3033,7 +3023,7 @@ public class RedactionService {
validChars++; validChars++;
} }
} catch (Exception e) { } catch (Exception e) {
log.debug("Failed to get width for char {}", ch, e);
} }
} }
@ -3091,7 +3081,7 @@ public class RedactionService {
} }
} }
private void scrubStructureElement(COSDictionary element, Set<ScrubOption> options) { private static void scrubStructureElement(COSDictionary element, Set<ScrubOption> options) {
if (element == null) return; if (element == null) return;
if (options.contains(ScrubOption.REMOVE_ACTUALTEXT)) { if (options.contains(ScrubOption.REMOVE_ACTUALTEXT)) {
@ -3120,18 +3110,16 @@ public class RedactionService {
} }
} }
private void normalizeWhitespaceInElement(COSDictionary element) { private static void normalizeWhitespaceInElement(COSDictionary element) {
for (COSName key : List.of(COSName.ACTUAL_TEXT, COSName.ALT, COSName.TU)) { for (COSName key : List.of(COSName.ACTUAL_TEXT, COSName.ALT, COSName.TU)) {
COSBase value = element.getDictionaryObject(key); COSBase value = element.getDictionaryObject(key);
if (value instanceof COSString cosString) { if (value instanceof COSString cosString) {
String text = cosString.getString(); String text = cosString.getString();
if (text != null) { String normalized = text.replaceAll("\\s+", " ").trim();
String normalized = text.replaceAll("\\s+", " ").trim(); if (normalized.length() > 256) {
if (normalized.length() > 256) { normalized = normalized.substring(0, 256);
normalized = normalized.substring(0, 256);
}
element.setString(key, normalized);
} }
element.setString(key, normalized);
} }
} }
} }
@ -3157,7 +3145,7 @@ public class RedactionService {
} }
} }
} catch (Exception e) { } catch (Exception e) {
log.debug("Failed to scrub annotations", e);
} }
} }
} }