diff --git a/app/core/src/main/java/stirling/software/SPDF/service/RedactionService.java b/app/core/src/main/java/stirling/software/SPDF/service/RedactionService.java index d464298fc..ce2cc513e 100644 --- a/app/core/src/main/java/stirling/software/SPDF/service/RedactionService.java +++ b/app/core/src/main/java/stirling/software/SPDF/service/RedactionService.java @@ -46,6 +46,7 @@ import stirling.software.SPDF.model.PDFText; import stirling.software.SPDF.model.api.security.ManualRedactPdfRequest; import stirling.software.SPDF.model.api.security.RedactPdfRequest; import stirling.software.SPDF.pdf.TextFinder; +import stirling.software.SPDF.utils.text.TextDecodingHelper; import stirling.software.SPDF.utils.text.TextEncodingHelper; import stirling.software.SPDF.utils.text.TextFinderUtils; import stirling.software.SPDF.utils.text.WidthCalculator; @@ -337,48 +338,7 @@ public class RedactionService { } } - private static String tryDecodeWithFontEnhanced(PDFont font, COSString cosString) { - try { - if (font == null || cosString == null) { - return null; - } - byte[] bytes = cosString.getBytes(); - if (bytes.length == 0) { - return ""; - } - String basicDecoded = tryDecodeWithFont(font, cosString); - if (basicDecoded != null && !basicDecoded.contains("?")) { - return basicDecoded; - } - StringBuilder out = new StringBuilder(); - for (byte aByte : bytes) { - int code = aByte & 0xFF; - String charStr = null; - try { - charStr = font.toUnicode(code); - } catch (Exception ignored) { - } - if (charStr == null && font.getName() != null && font.getName().contains("+")) { - charStr = mapSubsetCharacter(code); - } - - out.append(charStr != null ? 
charStr : ""); - } - return out.toString(); - } catch (Exception e) { - return tryDecodeWithFont(font, cosString); - } - } - - private static String mapSubsetCharacter(int code) { - if (code >= 32 && code <= 126) { - return String.valueOf((char) code); - } - if (code >= 160 && code <= 255) { - return String.valueOf((char) (code - 128)); - } - return null; - } + // Local decoding helpers removed in favor of TextDecodingHelper private static String normalizeForFuzzy(String s) { if (s == null) { @@ -632,71 +592,6 @@ public class RedactionService { return text.length() * 500f; } - private static String tryDecodeWithFont(PDFont font, COSString cosString) { - try { - if (font == null || cosString == null) { - return null; - } - byte[] bytes = cosString.getBytes(); - if (bytes.length == 0) { - return ""; - } - boolean anyMapped = false; - StringBuilder out = new StringBuilder(); - for (byte b : bytes) { - int code = b & 0xFF; - String uni = null; - try { - uni = font.toUnicode(code); - } catch (Exception ignored) { - } - if (uni != null) { - out.append(uni); - anyMapped = true; - } else { - out.append('?'); - } - } - if (anyMapped) { - return out.toString(); - } - out.setLength(0); - anyMapped = false; - for (int i = 0; i < bytes.length; ) { - int b1 = bytes[i] & 0xFF; - String u1 = null; - try { - u1 = font.toUnicode(b1); - } catch (Exception ignored) { - } - if (i + 1 < bytes.length) { - int b2 = bytes[i + 1] & 0xFF; - int code = (b1 << 8) | b2; - String u2 = null; - try { - u2 = font.toUnicode(code); - } catch (Exception ignored) { - } - if (u2 != null) { - out.append(u2); - i += 2; - anyMapped = true; - continue; - } - } - if (u1 != null) { - out.append(u1); - } else { - out.append('?'); - } - i += 1; - } - return anyMapped ? 
out.toString() : null; - } catch (Exception e) { - return null; - } - } - private static WipeResult wipeAllTextShowingOperators(List tokens) { List newTokens = new ArrayList<>(tokens); int modifications = 0; @@ -1062,7 +957,7 @@ public class RedactionService { if (aggressive && gs.font != null && tokens.get(i - 1) instanceof COSString cs) { - tryDecodeWithFontEnhanced(gs.font, cs); + TextDecodingHelper.tryDecodeWithFontEnhanced(gs.font, cs); } segments.add( new TextSegment( @@ -1175,12 +1070,12 @@ public class RedactionService { || "'".equals(seg.getOperatorName()) || "\"".equals(seg.getOperatorName())) && tok instanceof COSString cs) { - decoded = tryDecodeWithFont(seg.getFont(), cs); + decoded = TextDecodingHelper.tryDecodeWithFont(seg.getFont(), cs); } else if ("TJ".equals(seg.getOperatorName()) && tok instanceof COSArray arr) { StringBuilder sb = new StringBuilder(); for (COSBase el : arr) { if (el instanceof COSString s) { - String d = tryDecodeWithFont(seg.getFont(), s); + String d = TextDecodingHelper.tryDecodeWithFont(seg.getFont(), s); sb.append(d != null ? d : s.getString()); } } @@ -1272,12 +1167,12 @@ public class RedactionService { Object tok = tokens.get(seg.getTokenIndex()); if (("Tj".equals(seg.getOperatorName()) || "'".equals(seg.getOperatorName())) && tok instanceof COSString cs) { - decoded = tryDecodeWithFont(seg.getFont(), cs); + decoded = TextDecodingHelper.tryDecodeWithFont(seg.getFont(), cs); } else if ("TJ".equals(seg.getOperatorName()) && tok instanceof COSArray arr) { StringBuilder sb = new StringBuilder(); for (COSBase el : arr) { if (el instanceof COSString s) { - String d = tryDecodeWithFont(seg.getFont(), s); + String d = TextDecodingHelper.tryDecodeWithFont(seg.getFont(), s); sb.append(d != null ? 
d : s.getString()); } } @@ -1715,7 +1610,7 @@ public class RedactionService { } private int wipeAllTextInResources(PDDocument document, PDResources resources) { - int totalMods = 0; + int totalMods = 0; // aggregated but currently not returned to caller try { totalMods += wipeAllSemanticTextInProperties(resources); for (COSName xobjName : resources.getXObjectNames()) { @@ -1776,7 +1671,6 @@ public class RedactionService { } private void wipeAllTextInPatterns(PDDocument document, PDResources resources) { - int totalMods = 0; try { for (COSName patName : resources.getPatternNames()) { try { @@ -1786,7 +1680,7 @@ public class RedactionService { org.apache.pdfbox.pdmodel.graphics.pattern.PDTilingPattern tiling) { PDResources patRes = tiling.getResources(); if (patRes != null) { - totalMods += wipeAllTextInResources(document, patRes); + wipeAllTextInResources(document, patRes); } PDFStreamParser parser = new PDFStreamParser(tiling); List tokens = new ArrayList<>(); @@ -1795,9 +1689,7 @@ public class RedactionService { tokens.add(token); } WipeResult wrText = wipeAllTextShowingOperators(tokens); - totalMods += wrText.modifications; WipeResult wrSem = wipeAllSemanticTextInTokens(wrText.tokens); - totalMods += wrSem.modifications; if (wrText.modifications > 0 || wrSem.modifications > 0) { writeRedactedContentToPattern(tiling, wrSem.tokens); } @@ -1809,6 +1701,7 @@ public class RedactionService { } } + @SuppressWarnings("unused") private int wipeAllTextInAnnotations(PDDocument document, PDPage page) { int totalMods = 0; try { diff --git a/app/core/src/main/java/stirling/software/SPDF/utils/text/TextDecodingHelper.java b/app/core/src/main/java/stirling/software/SPDF/utils/text/TextDecodingHelper.java new file mode 100644 index 000000000..80e7267f6 --- /dev/null +++ b/app/core/src/main/java/stirling/software/SPDF/utils/text/TextDecodingHelper.java @@ -0,0 +1,232 @@ +package stirling.software.SPDF.utils.text; + +import org.apache.pdfbox.cos.COSString; +import 
org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType0Font;

import lombok.extern.slf4j.Slf4j;

/**
 * Best-effort helpers that turn the raw bytes of a PDF string operand into text using the
 * font's code-to-Unicode lookup ({@link PDFont#toUnicode(int)}), with heuristic fallbacks for
 * codes the font cannot map.
 *
 * <p>Every method is static and tolerant of failure: lookups that throw are treated as
 * "unmapped", and an unmapped code is rendered as {@code '?'} in the output.
 */
@Slf4j
public class TextDecodingHelper {

    private static final int ASCII_LOWER_BOUND = 32;
    private static final int ASCII_UPPER_BOUND = 126;
    private static final int EXTENDED_ASCII_LOWER_BOUND = 160;
    private static final int EXTENDED_ASCII_UPPER_BOUND = 255;

    /** Largest code that fits in a single byte; anything above was built from two bytes. */
    private static final int SINGLE_BYTE_MAX = 0xFF;

    /** Largest code that fits in one UTF-16 code unit. */
    private static final int BMP_MAX = 0xFFFF;

    private static final int CJK_UNIFIED_LOWER_BOUND = 0x4E00;
    private static final int CJK_UNIFIED_UPPER_BOUND = 0x9FFF;

    /** Placeholder appended for every code that could not be mapped. */
    private static final char UNMAPPED = '?';

    /**
     * Decodes {@code cosString} with {@code font}, falling back to the heuristic decoder when
     * the plain decode is missing, blank, or contains {@code '?'} placeholders.
     *
     * <p>Previously this method returned {@code void} and threw every decoded value away; it
     * now returns the result. Callers that invoke it as a statement are unaffected.
     *
     * @param font font used for the code-to-Unicode lookup; may be {@code null}
     * @param cosString string operand to decode; may be {@code null}
     * @return {@code null} when either argument is {@code null} or nothing could be decoded,
     *     {@code ""} for an empty operand, otherwise the decoded text (which may still contain
     *     {@code '?'} for unmapped codes)
     */
    public static String tryDecodeWithFontEnhanced(PDFont font, COSString cosString) {
        if (font == null || cosString == null) {
            return null;
        }

        String basicDecoded = null;
        try {
            byte[] bytes = cosString.getBytes();
            if (bytes.length == 0) {
                return "";
            }

            basicDecoded = tryDecodeWithFont(font, cosString);
            if (basicDecoded != null
                    && !basicDecoded.contains("?")
                    && !basicDecoded.trim().isEmpty()) {
                return basicDecoded;
            }

            String enhanced = decodeCharactersEnhanced(font, bytes);
            // A partial basic decode is still better than nothing when the heuristics fail.
            return enhanced != null ? enhanced : basicDecoded;
        } catch (Exception e) {
            log.error("Decoding failed: {}", e.getMessage(), e);
            return basicDecoded != null ? basicDecoded : tryDecodeWithFont(font, cosString);
        }
    }

    /**
     * Decodes {@code bytes} one code at a time via {@link #decodeSingleCharacter}. When a byte
     * {@code >= 128} cannot be decoded on its own, it is combined with the following byte into
     * a big-endian two-byte code and retried.
     *
     * @param font font used for the lookups
     * @param bytes raw bytes of the string operand
     * @return the decoded text with {@code '?'} for unmapped codes, or {@code null} when
     *     {@code font} or {@code bytes} is {@code null} or no code could be decoded at all
     */
    public static String decodeCharactersEnhanced(PDFont font, byte[] bytes) {
        if (font == null || bytes == null) {
            return null;
        }

        StringBuilder out = new StringBuilder(bytes.length);
        boolean hasValidCharacters = false;
        int i = 0;
        while (i < bytes.length) {
            int code = bytes[i] & 0xFF;
            String charStr = decodeSingleCharacter(font, code, bytes);

            // Heuristic for multi-byte: if high byte, try combining with next
            if (charStr == null && code >= 128 && i + 1 < bytes.length) {
                int combinedCode = (code << 8) | (bytes[i + 1] & 0xFF);
                String combined = decodeSingleCharacter(font, combinedCode, bytes);
                if (combined != null && !combined.isEmpty()) {
                    out.append(combined);
                    hasValidCharacters = true;
                    i += 2; // both bytes were consumed by the combined code
                    continue;
                }
            }

            if (charStr != null && !charStr.isEmpty()) {
                out.append(charStr);
                hasValidCharacters = true;
            } else {
                out.append(UNMAPPED);
            }
            i++;
        }
        return hasValidCharacters ? out.toString() : null;
    }

    /**
     * Maps a single character code to text. Tries, in order: the font's own Unicode lookup,
     * {@link #mapSubsetCharacter} when the font name contains {@code '+'}, and finally
     * {@link #fallbackCharacterMapping}.
     *
     * <p>An earlier revision retried Type0 fonts with a "CID" assembled from {@code bytes[0]}
     * and {@code bytes[1]}. That always used the first two bytes of the string no matter which
     * code was being decoded, so it returned the wrong character. It has been removed: the
     * lookup for {@code code} itself has already been attempted at that point.
     *
     * @param font font used for the lookup; {@code null} yields {@code null}
     * @param code character code (one byte, or two bytes combined big-endian)
     * @param bytes the complete operand the code came from; only passed on to
     *     {@link #fallbackCharacterMapping}
     * @return the mapped text, or {@code null} when no strategy produced a mapping
     */
    public static String decodeSingleCharacter(PDFont font, int code, byte[] bytes) {
        if (font == null) {
            return null;
        }

        String charStr = toUnicodeOrNull(font, code);

        if (charStr == null && font.getName() != null && font.getName().contains("+")) {
            charStr = mapSubsetCharacter(code);
        }

        if (charStr == null) {
            charStr = fallbackCharacterMapping(code, bytes, font);
        }

        return charStr;
    }

    /**
     * Last-resort guess for a code the font could not map.
     *
     * <ul>
     *   <li>Type0 fonts with a multi-byte operand are never guessed ({@code null}).
     *   <li>Codes 32-126 and 160-255 are returned as the character with the same value.
     *   <li>For font names containing {@code cjk}, {@code gb} or {@code jp}, codes in
     *       U+4E00..U+9FFF are returned as that character.
     *   <li>Any other two-byte code is interpreted as a single UTF-16BE code unit.
     * </ul>
     *
     * <p>The UTF-16 step previously decoded the <em>entire</em> {@code bytes} array and
     * returned it as the mapping for one code, which duplicated the whole string into the
     * output for each unmapped character. It is now limited to {@code code}.
     *
     * @param code character code to map
     * @param bytes the complete operand the code came from; only its length is consulted
     * @param font font the code belongs to; may be {@code null}
     * @return the guessed text, or {@code null} when no guess applies
     */
    public static String fallbackCharacterMapping(int code, byte[] bytes, PDFont font) {
        try {
            int length = bytes == null ? 0 : bytes.length;
            if (font instanceof PDType0Font && length > 1) {
                return null;
            }

            if (code >= ASCII_LOWER_BOUND && code <= ASCII_UPPER_BOUND) {
                return String.valueOf((char) code);
            }

            if (code >= EXTENDED_ASCII_LOWER_BOUND && code <= EXTENDED_ASCII_UPPER_BOUND) {
                return String.valueOf((char) code);
            }

            String fontName = font != null ? font.getName() : null;
            if (fontName != null) {
                // Locale.ROOT keeps the comparison independent of the JVM default locale.
                String lowerName = fontName.toLowerCase(java.util.Locale.ROOT);
                boolean looksCjk =
                        lowerName.contains("cjk")
                                || lowerName.contains("gb")
                                || lowerName.contains("jp");
                // Basic CJK fallback (expand with a lookup table if needed)
                if (looksCjk
                        && code >= CJK_UNIFIED_LOWER_BOUND
                        && code <= CJK_UNIFIED_UPPER_BOUND) {
                    return String.valueOf((char) code);
                }
            }

            return decodeUtf16CodeUnit(code);
        } catch (Exception e) {
            log.debug("Fallback character mapping failed for code {}: {}", code, e.getMessage());
            return null;
        }
    }

    /**
     * Heuristic mapping applied when the font name contains {@code '+'}: codes 32-126 map to
     * themselves and codes 160-255 map to {@code code - 128}.
     *
     * @param code single-byte character code
     * @return the mapped character as a string, or {@code null} outside both ranges
     */
    public static String mapSubsetCharacter(int code) {
        if (code >= ASCII_LOWER_BOUND && code <= ASCII_UPPER_BOUND) {
            return String.valueOf((char) code);
        }
        if (code >= EXTENDED_ASCII_LOWER_BOUND && code <= EXTENDED_ASCII_UPPER_BOUND) {
            return String.valueOf((char) (code - 128));
        }
        return null;
    }

    /**
     * Decodes {@code cosString} using only the font's Unicode lookup. The bytes are first
     * treated as single-byte codes; if none of them map, they are re-read as big-endian
     * two-byte codes (falling back to a single byte wherever a pair does not map).
     *
     * @param font font used for the lookup; may be {@code null}
     * @param cosString string operand to decode; may be {@code null}
     * @return {@code null} when an argument is {@code null}, nothing mapped, or decoding
     *     failed; {@code ""} for an empty operand; otherwise the decoded text with
     *     {@code '?'} for unmapped codes
     */
    public static String tryDecodeWithFont(PDFont font, COSString cosString) {
        try {
            if (font == null || cosString == null) {
                return null;
            }
            byte[] bytes = cosString.getBytes();
            if (bytes.length == 0) {
                return "";
            }
            String singleByte = decodeAsSingleByteCodes(font, bytes);
            if (singleByte != null) {
                return singleByte;
            }
            return decodeAsTwoByteCodes(font, bytes);
        } catch (Exception e) {
            return null;
        }
    }

    /** Looks up {@code code} in the font, treating any lookup failure as "unmapped". */
    private static String toUnicodeOrNull(PDFont font, int code) {
        try {
            return font.toUnicode(code);
        } catch (Exception ignored) {
            // Best effort: a failing lookup is handled like a missing mapping.
            return null;
        }
    }

    /** Decodes every byte as its own code; returns {@code null} if not a single byte mapped. */
    private static String decodeAsSingleByteCodes(PDFont font, byte[] bytes) {
        StringBuilder out = new StringBuilder(bytes.length);
        boolean anyMapped = false;
        for (byte b : bytes) {
            String uni = toUnicodeOrNull(font, b & 0xFF);
            if (uni != null) {
                out.append(uni);
                anyMapped = true;
            } else {
                out.append(UNMAPPED);
            }
        }
        return anyMapped ? out.toString() : null;
    }

    /** Decodes byte pairs as big-endian codes; returns {@code null} if no pair mapped. */
    private static String decodeAsTwoByteCodes(PDFont font, byte[] bytes) {
        StringBuilder out = new StringBuilder(bytes.length);
        boolean anyMapped = false;
        int i = 0;
        while (i < bytes.length) {
            int b1 = bytes[i] & 0xFF;
            if (i + 1 < bytes.length) {
                int pairCode = (b1 << 8) | (bytes[i + 1] & 0xFF);
                String pair = toUnicodeOrNull(font, pairCode);
                if (pair != null) {
                    out.append(pair);
                    anyMapped = true;
                    i += 2;
                    continue;
                }
            }
            String single = toUnicodeOrNull(font, b1);
            if (single != null) {
                out.append(single);
            } else {
                out.append(UNMAPPED);
            }
            i++;
        }
        return anyMapped ? out.toString() : null;
    }

    /**
     * Interprets a two-byte code as one UTF-16BE code unit. Single-byte codes, values beyond
     * the BMP, surrogates and unassigned code points are rejected.
     */
    private static String decodeUtf16CodeUnit(int code) {
        if (code <= SINGLE_BYTE_MAX || code > BMP_MAX) {
            return null;
        }
        char unit = (char) code;
        if (Character.isSurrogate(unit) || !Character.isDefined(unit)) {
            return null;
        }
        return String.valueOf(unit);
    }
}