fix(conversion): improve PDF/A conversion quality, color accuracy, and compliance (#5396)

# Description of Changes

* Transparency Fix: Implemented a pre-processing step that adds an opaque white background to pages before conversion. This ensures that transparent elements are correctly flattened against white rather than defaulting to black (a common issue in Ghostscript flattening).
* Color Distortion Fix: Removed a misconfigured `-sDefaultCMYKProfile` setting in the Ghostscript command that was incorrectly pointing to an RGB profile. This resolves the "dark/black" color corruption previously seen in print-ready CMYK PDFs.
* PDF/A Compliance Improvements:
  * Font Handling: Updated `fixType1FontCharSet` to only add a standard CharSet if it is missing or empty. This prevents validation errors where subsetted fonts were being forced to declare glyphs they did not contain.
  * Spot Color Unification: Added `fixSeparationColorSpaces` to detect and unify TintTransform objects for Separation colors with the same colorant name, ensuring consistency across document resources.
  * OCG Naming: Ensured all Optional Content Groups have a valid Name entry.
---

## Checklist

### General

- [X] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md)
- [X] I have read the [Stirling-PDF Developer Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md) (if applicable)
- [ ] I have read the [How to add new languages to Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md) (if applicable)
- [X] I have performed a self-review of my own code
- [X] My changes generate no new warnings

### Documentation

- [ ] I have updated relevant docs on [Stirling-PDF's doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) (if functionality has heavily changed)
- [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only)

### Translations (if applicable)

- [ ] I ran [`scripts/counter_translation.py`](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/docs/counter_translation.md)

### UI Changes (if applicable)

- [ ] Screenshots or videos demonstrating the UI changes are attached (e.g., as comments or direct attachments in the PR)

### Testing (if applicable)

- [X] I have tested my changes locally. Refer to the [Testing Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md#6-testing) for more details.

---------

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
2026-04-22 23:08:53 +02:00 · 2026-01-06 00:53:39 +01:00
parent 91bf9abbaa
commit b6e675fab3
1 changed files with 263 additions and 24 deletions
--- a/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToPDFA.java
+++ b/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToPDFA.java
@@ -21,6 +21,7 @@ import org.apache.pdfbox.cos.COSArray;
 import org.apache.pdfbox.cos.COSBase;
 import org.apache.pdfbox.cos.COSDictionary;
 import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSStream;
 import org.apache.pdfbox.io.RandomAccessRead;
 import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
 import org.apache.pdfbox.pdfwriter.compress.CompressParameters;
@@ -378,7 +379,6 @@ public class ConvertPDFToPDFA {
        command.add("-sOutputICCProfile=" + colorProfiles.rgb().toAbsolutePath());
        command.add("-sDefaultRGBProfile=" + colorProfiles.rgb().toAbsolutePath());
        command.add("-sDefaultGrayProfile=" + colorProfiles.gray().toAbsolutePath());
-        command.add("-sDefaultCMYKProfile=" + colorProfiles.rgb().toAbsolutePath());

        // Font handling optimized for PDF/A CIDSet compliance
        command.add("-dEmbedAllFonts=true");
@@ -673,25 +673,55 @@ public class ConvertPDFToPDFA {
                    if (descriptor == null) continue;

                    // Check if this is a Type1 font
-                    if (fontNameStr.contains("Type1")
-                            || descriptor.getFontFile() != null
-                            || (descriptor.getFontFile2() == null
-                                    && descriptor.getFontFile3() == null)) {
+                    boolean isType1 =
+                            isType1Font(font)
+                                    || descriptor.getFontFile() != null
+                                    || (descriptor.getFontFile2() == null
+                                            && descriptor.getFontFile3() == null);

-                        String existingCharSet =
-                                descriptor.getCOSObject().getString(COSName.CHAR_SET);
+                    if (isType1) {
+                        COSDictionary descDict = descriptor.getCOSObject();
+                        String existingCharSet = descDict.getString(COSName.CHAR_SET);

-                        String glyphSet = buildStandardType1GlyphSet();
-                        if (!glyphSet.isEmpty()) {
-                            if (existingCharSet == null
-                                    || existingCharSet.trim().isEmpty()
-                                    || countGlyphs(existingCharSet) < countGlyphs(glyphSet)) {
-                                descriptor.getCOSObject().setString(COSName.CHAR_SET, glyphSet);
+                        // Check if font is embedded and if CharSet might be invalid
+                        boolean fontEmbedded = font.isEmbedded();
+                        boolean hasFontFile =
+                                descriptor.getFontFile() != null
+                                        || descriptor.getFontFile2() != null
+                                        || descriptor.getFontFile3() != null;
+
+                        // For PDF/A compliance: if CharSet exists but font is subsetted or
+                        // we can't verify it matches the font file, remove it to avoid validation
+                        // errors
+                        if (existingCharSet != null && !existingCharSet.trim().isEmpty()) {
+                            // If the font appears to be subsetted (indicated by subset prefix in
+                            // name)
+                            // or if we can't verify the CharSet is correct, remove it
+                            if (fontNameStr.contains("+") || fontNameStr.contains("Subset")) {
+                                descDict.removeItem(COSName.CHAR_SET);
                                log.debug(
-                                        "Fixed CharSet for Type1 font {} with {} glyphs (was: {})",
-                                        fontNameStr,
-                                        countGlyphs(glyphSet),
-                                        existingCharSet != null ? countGlyphs(existingCharSet) : 0);
+                                        "Removed potentially invalid CharSet from subsetted Type1 font: {}",
+                                        fontNameStr);
+                            } else if (!hasFontFile && fontEmbedded) {
+                                // Font is embedded but we can't verify CharSet, remove it
+                                descDict.removeItem(COSName.CHAR_SET);
+                                log.debug(
+                                        "Removed unverifiable CharSet from embedded Type1 font: {}",
+                                        fontNameStr);
+                            }
+                        } else if (existingCharSet == null || existingCharSet.trim().isEmpty()) {
+                            // Only add CharSet if font is not subsetted and we can verify it
+                            if (!fontNameStr.contains("+")
+                                    && !fontNameStr.contains("Subset")
+                                    && hasFontFile) {
+                                String glyphSet = buildStandardType1GlyphSet();
+                                if (!glyphSet.isEmpty()) {
+                                    descDict.setString(COSName.CHAR_SET, glyphSet);
+                                    log.debug(
+                                            "Added missing CharSet for Type1 font {} with {} glyphs",
+                                            fontNameStr,
+                                            countGlyphs(glyphSet));
+                                }
                            }
                        }
                    }
@@ -1349,13 +1379,22 @@ public class ConvertPDFToPDFA {

            for (COSBase base : ocgArray) {
                if (base instanceof COSDictionary ocgDict) {
-                    if (!ocgDict.containsKey(COSName.NAME)) {
+                    // Ensure Name entry exists and is not empty
+                    String nameValue = ocgDict.getString(COSName.NAME);
+                    if (nameValue == null || nameValue.trim().isEmpty()) {
                        String newName = "Layer " + unnamedCount++;
                        ocgDict.setString(COSName.NAME, newName);
-                        log.debug("Fixed OCG missing name, set to: {}", newName);
+                        log.debug("Fixed OCG missing or empty name, set to: {}", newName);
                    }
                }
            }
+        } else if (ocgs instanceof COSDictionary ocgDict) {
+            // Handle case where OCGS is a single dictionary instead of array
+            String nameValue = ocgDict.getString(COSName.NAME);
+            if (nameValue == null || nameValue.trim().isEmpty()) {
+                ocgDict.setString(COSName.NAME, "Layer 1");
+                log.debug("Fixed single OCG missing or empty name");
+            }
        }
    }

@@ -1479,7 +1518,9 @@ public class ConvertPDFToPDFA {
        Path pdfaDefFile = createPdfaDefFile(workingDir, colorProfiles, profile);

        // Preprocess PDF for PDF/A compliance using the sanitizer
-        Path sanitizedInputPdf = sanitizePdfWithPdfBox(inputPdf);
+        // We add a white background to ensure transparency is flattened correctly against white
+        // instead of black, addressing common PDF/A conversion issues.
+        Path sanitizedInputPdf = sanitizePdfWithPdfBox(inputPdf, true);
        Path preprocessedPdf = sanitizedInputPdf != null ? sanitizedInputPdf : inputPdf;

        // For PDF/A-1, clean CIDSet issues that may cause validation failures
@@ -1500,11 +1541,14 @@ public class ConvertPDFToPDFA {
                    buildGhostscriptCommand(
                            inputForGs, outputPdf, colorProfiles, workingDir, profile, pdfaDefFile);

+            log.info("Running Ghostscript command: {}", String.join(" ", command));
+
            ProcessExecutorResult result =
                    ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
                            .runCommandWithOutputHandling(command);

            if (result.getRc() != 0) {
+                log.error("Ghostscript failed with output: {}", result.getMessages());
                throw new IOException("Ghostscript exited with code " + result.getRc());
            }

@@ -1665,6 +1709,7 @@ public class ConvertPDFToPDFA {
    }

    private byte[] convertWithPdfBoxMethod(Path inputPath, PdfaProfile profile) throws Exception {
+        log.info("Starting PDFBox/LibreOffice conversion for PDF/A-{}", profile.getPart());
        Path tempInputFile = null;
        byte[] fileBytes;
        Path loPdfPath = null;
@@ -1720,17 +1765,20 @@ public class ConvertPDFToPDFA {
        ColorProfiles colorProfiles = prepareColorProfiles(workingDir);

        // Sanitize the PDF before PDF/X conversion for better Ghostscript compatibility
-        Path sanitizedInputPdf = sanitizePdfWithPdfBox(inputPdf);
+        Path sanitizedInputPdf = sanitizePdfWithPdfBox(inputPdf, true);
        Path inputForGs = sanitizedInputPdf != null ? sanitizedInputPdf : inputPdf;

        List<String> command =
                buildGhostscriptCommandX(inputForGs, outputPdf, colorProfiles, workingDir, profile);

+        log.info("Running Ghostscript PDF/X command: {}", String.join(" ", command));
+
        ProcessExecutorResult result =
                ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
                        .runCommandWithOutputHandling(command);

        if (result.getRc() != 0) {
+            log.error("Ghostscript PDF/X failed with output: {}", result.getMessages());
            throw new IOException("Ghostscript exited with code " + result.getRc());
        }

@@ -1796,12 +1844,12 @@ public class ConvertPDFToPDFA {
        }
    }

-    private Path sanitizePdfWithPdfBox(Path inputPdf) {
+    private Path sanitizePdfWithPdfBox(Path inputPdf, boolean addWhiteBackground) {
        try {
            Path sanitizedPath =
                    inputPdf.getParent().resolve("sanitized_" + inputPdf.getFileName().toString());

-            sanitizeDocument(inputPdf, sanitizedPath);
+            sanitizeDocument(inputPdf, sanitizedPath, addWhiteBackground);

            log.info("PDF sanitized with PDFBox for better Ghostscript compatibility");
            return sanitizedPath;
@@ -1813,7 +1861,8 @@ public class ConvertPDFToPDFA {
        }
    }

-    private void sanitizeDocument(Path inputPath, Path outputPath) throws IOException {
+    private void sanitizeDocument(Path inputPath, Path outputPath, boolean addWhiteBackground)
+            throws IOException {
        try (PDDocument doc = Loader.loadPDF(inputPath.toFile())) {
            Map<String, DocumentSanitizer> sanitizers = new LinkedHashMap<>();
            sanitizers.put("Flatten highlight annotations", this::flattenHighlightsToContent);
@@ -1824,6 +1873,11 @@ public class ConvertPDFToPDFA {
            sanitizers.put("Ensure embedded file compliance", this::ensureEmbeddedFileCompliance);
            sanitizers.put(
                    "Fix optional content groups", ConvertPDFToPDFA::fixOptionalContentGroups);
+            sanitizers.put("Fix separation color spaces", this::fixSeparationColorSpaces);
+
+            if (addWhiteBackground) {
+                sanitizers.put("Add white background", this::addWhiteBackground);
+            }

            for (Map.Entry<String, DocumentSanitizer> entry : sanitizers.entrySet()) {
                try {
@@ -1841,6 +1895,191 @@ public class ConvertPDFToPDFA {
        }
    }

+    /**
+     * Makes Separation color spaces that share a colorant name point at one TintTransform.
+     *
+     * <p>Collection pass: the resources of every page, followed by the AcroForm default resources
+     * when an AcroForm is present, are handed to {@code processResourcesForSeparation}.
+     * Enforcement pass: the visited set is cleared and the page resources are walked a second
+     * time through {@code enforceSeparationConsistency}.
+     *
+     * <p>NOTE(review): the collection pass already rewrites differing TintTransforms while it
+     * collects (see {@code checkAndFixSeparation}), so the enforcement pass looks redundant for
+     * page resources, and it does not revisit the AcroForm default resources - confirm whether it
+     * is still needed.
+     *
+     * @param doc document whose resource dictionaries are scanned and modified in place
+     * @throws IOException not thrown directly by this method; presumably declared to match the
+     *     sanitizer callback signature - verify against {@code DocumentSanitizer}
+     */
+    private void fixSeparationColorSpaces(PDDocument doc) throws IOException {
+        Map<String, COSBase> tintByColorant = new HashMap<>();
+        Set<COSBase> seenResources = new HashSet<>();
+
+        // Collection pass, part 1: everything reachable from the pages.
+        for (PDPage page : doc.getPages()) {
+            processResourcesForSeparation(page.getResources(), tintByColorant, seenResources);
+        }
+
+        // Collection pass, part 2: AcroForm default resources, if the document has a form.
+        PDDocumentCatalog catalog = doc.getDocumentCatalog();
+        if (catalog != null && catalog.getAcroForm() != null) {
+            PDResources formDefaults = catalog.getAcroForm().getDefaultResources();
+            if (formDefaults != null) {
+                processResourcesForSeparation(formDefaults, tintByColorant, seenResources);
+            }
+        }
+
+        // Enforcement pass: start from an empty visited set so every page is walked again.
+        seenResources.clear();
+        for (PDPage page : doc.getPages()) {
+            enforceSeparationConsistency(page.getResources(), tintByColorant, seenResources);
+        }
+    }
+
+    /**
+     * Scans one resource dictionary for Separation color spaces and recurses into the resources
+     * of its Form XObjects.
+     *
+     * <p>Every value of the /ColorSpace sub-dictionary is passed to {@code checkAndFixSeparation}.
+     * Entries of unexpected type (a /ColorSpace or /XObject that is not a dictionary, a /Subtype
+     * that is not a name) are skipped instead of being cast, so a malformed input file cannot
+     * abort the scan with a {@link ClassCastException}.
+     *
+     * @param resources resources to scan; {@code null} is ignored
+     * @param knownTintTransforms colorant name to TintTransform map, passed through to {@code
+     *     checkAndFixSeparation}
+     * @param visitedResources resource dictionaries already scanned; a dictionary found here is
+     *     not scanned again, which stops recursion on shared or cyclic resources
+     */
+    private void processResourcesForSeparation(
+            PDResources resources,
+            Map<String, COSBase> knownTintTransforms,
+            Set<COSBase> visitedResources) {
+        if (resources == null) return;
+
+        COSDictionary resourceDict = resources.getCOSObject();
+
+        // Prevent infinite recursion if resources are shared or cyclic
+        if (!visitedResources.add(resourceDict)) {
+            return;
+        }
+
+        // Check defined ColorSpaces
+        if (resourceDict.getDictionaryObject(COSName.COLORSPACE) instanceof COSDictionary csDict) {
+            for (COSName name : csDict.keySet()) {
+                checkAndFixSeparation(csDict.getDictionaryObject(name), knownTintTransforms);
+            }
+        }
+
+        // Recursively check XObjects (Forms)
+        if (resourceDict.getDictionaryObject(COSName.XOBJECT) instanceof COSDictionary xObjDict) {
+            for (COSName name : xObjDict.keySet()) {
+                // Only Form XObjects that carry their own /Resources dictionary are descended into
+                if (xObjDict.getDictionaryObject(name) instanceof COSStream stream
+                        && COSName.FORM.equals(stream.getDictionaryObject(COSName.SUBTYPE))
+                        && stream.getDictionaryObject(COSName.RESOURCES)
+                                instanceof COSDictionary formResDict) {
+                    processResourcesForSeparation(
+                            new PDResources(formResDict), knownTintTransforms, visitedResources);
+                }
+            }
+        }
+    }
+
+    /**
+     * Registers or unifies the TintTransform of a single Separation color space array.
+     *
+     * <p>The value is only acted on when it is an array of at least four entries laid out as
+     * {@code [/Separation name altSpace tintTransform]} with a name object at index 1. The first
+     * TintTransform seen for a colorant name is stored in {@code knownTintTransforms}; any later
+     * array for the same name whose TintTransform is a different object has index 3 replaced by
+     * the stored one.
+     *
+     * <p>A TintTransform that resolves to {@code null} is never stored as the canonical value, so
+     * it cannot later be written over a valid TintTransform in another array.
+     *
+     * <p>NOTE(review): only index 3 is rewritten; the alternate space at index 2 is left as is.
+     * If two same-named Separations use different alternate spaces, the shared function may not
+     * match the alternate space of the rewritten array - confirm whether index 2 must be unified
+     * as well.
+     *
+     * @param cs candidate color space value from a /ColorSpace dictionary; may be any COS type
+     * @param knownTintTransforms colorant name to first-seen TintTransform; updated in place
+     */
+    private void checkAndFixSeparation(COSBase cs, Map<String, COSBase> knownTintTransforms) {
+        if (!(cs instanceof COSArray arr) || arr.size() < 4) return;
+        if (!COSName.SEPARATION.equals(arr.getObject(0))) return;
+
+        // Separation: [/Separation name altSpace tintTransform]
+        if (!(arr.getObject(1) instanceof COSName colorName)) return;
+
+        String name = colorName.getName();
+        COSBase tintTransform = arr.getObject(3);
+        COSBase known = knownTintTransforms.get(name);
+
+        if (known == null) {
+            // Store the first usable tintTransform for this color name; skip null ones
+            if (tintTransform != null) {
+                knownTintTransforms.put(name, tintTransform);
+            }
+            return;
+        }
+
+        // If objects are not identical (same reference), unify them
+        if (known != tintTransform) {
+            arr.set(3, known);
+            log.debug("Unified TintTransform for Separation color: {}", name);
+        }
+    }
+
+    /**
+     * Walks one resource dictionary and applies the recorded TintTransforms to its Separation
+     * color spaces, then recurses into the resources of its Form XObjects.
+     *
+     * <p>Every value of the /ColorSpace sub-dictionary is passed to {@code
+     * enforceSeparationTintTransform}. Entries of unexpected type (a /ColorSpace or /XObject that
+     * is not a dictionary, a /Subtype that is not a name) are skipped instead of being cast, so a
+     * malformed input file cannot abort the walk with a {@link ClassCastException}.
+     *
+     * @param resources resources to walk; {@code null} is ignored
+     * @param knownTintTransforms colorant name to TintTransform map, passed through to {@code
+     *     enforceSeparationTintTransform}
+     * @param visitedResources resource dictionaries already walked; a dictionary found here is
+     *     not walked again, which stops recursion on shared or cyclic resources
+     */
+    private void enforceSeparationConsistency(
+            PDResources resources,
+            Map<String, COSBase> knownTintTransforms,
+            Set<COSBase> visitedResources) {
+        if (resources == null) return;
+
+        COSDictionary resourceDict = resources.getCOSObject();
+
+        // Prevent infinite recursion
+        if (!visitedResources.add(resourceDict)) {
+            return;
+        }
+
+        // Check defined ColorSpaces
+        if (resourceDict.getDictionaryObject(COSName.COLORSPACE) instanceof COSDictionary csDict) {
+            for (COSName name : csDict.keySet()) {
+                enforceSeparationTintTransform(
+                        csDict.getDictionaryObject(name), knownTintTransforms);
+            }
+        }
+
+        // Recursively check XObjects (Forms)
+        if (resourceDict.getDictionaryObject(COSName.XOBJECT) instanceof COSDictionary xObjDict) {
+            for (COSName name : xObjDict.keySet()) {
+                // Only Form XObjects that carry their own /Resources dictionary are descended into
+                if (xObjDict.getDictionaryObject(name) instanceof COSStream stream
+                        && COSName.FORM.equals(stream.getDictionaryObject(COSName.SUBTYPE))
+                        && stream.getDictionaryObject(COSName.RESOURCES)
+                                instanceof COSDictionary formResDict) {
+                    enforceSeparationConsistency(
+                            new PDResources(formResDict), knownTintTransforms, visitedResources);
+                }
+            }
+        }
+    }
+
+    /**
+     * Replaces the TintTransform of a Separation color space array with the one recorded for its
+     * colorant name, when one is recorded and it is a different object.
+     *
+     * <p>The value is only acted on when it is an array of at least four entries whose first
+     * entry is /Separation and whose second entry is a name. Nothing is added to {@code
+     * knownTintTransforms} here. A colorant with no recorded TintTransform, or with a recorded
+     * {@code null}, leaves the array untouched so a valid TintTransform is never overwritten with
+     * {@code null}.
+     *
+     * @param cs candidate color space value from a /ColorSpace dictionary; may be any COS type
+     * @param knownTintTransforms colorant name to canonical TintTransform; read only
+     */
+    private void enforceSeparationTintTransform(
+            COSBase cs, Map<String, COSBase> knownTintTransforms) {
+        if (!(cs instanceof COSArray arr) || arr.size() < 4) return;
+        if (!COSName.SEPARATION.equals(arr.getObject(0))) return;
+        if (!(arr.getObject(1) instanceof COSName colorName)) return;
+
+        String name = colorName.getName();
+        COSBase known = knownTintTransforms.get(name);
+
+        // Ensure all separations with the same name use the same tintTransform reference;
+        // identity comparison is intentional (same object, not merely equal content)
+        if (known != null && known != arr.getObject(3)) {
+            arr.set(3, known);
+            log.debug("Enforced consistent TintTransform for Separation color: {}", name);
+        }
+    }
+
+    /**
+     * Paints an opaque white rectangle covering the MediaBox beneath the existing content of
+     * every page.
+     *
+     * <p>The rectangle is written to a content stream that is prepended to the page, so it is
+     * drawn before the original content. The drawing is bracketed by a graphics-state save and
+     * restore: the streams of a page are interpreted as one sequence, so without the bracket the
+     * white non-stroking color set here would remain active when the original content starts and
+     * anything relying on the default fill color would be painted white on white.
+     *
+     * @param doc document whose pages are modified in place
+     * @throws IOException if a page content stream cannot be created or written
+     */
+    private void addWhiteBackground(PDDocument doc) throws IOException {
+        for (PDPage page : doc.getPages()) {
+            PDRectangle mediaBox = page.getMediaBox();
+            try (PDPageContentStream cs =
+                    new PDPageContentStream(
+                            doc, page, PDPageContentStream.AppendMode.PREPEND, true, true)) {
+                // Keep the fill color change local to the background rectangle
+                cs.saveGraphicsState();
+                cs.setNonStrokingColor(Color.WHITE);
+                cs.addRect(
+                        mediaBox.getLowerLeftX(),
+                        mediaBox.getLowerLeftY(),
+                        mediaBox.getWidth(),
+                        mediaBox.getHeight());
+                cs.fill();
+                cs.restoreGraphicsState();
+            }
+        }
+    }
+
    private void flattenHighlightsToContent(PDDocument doc) throws IOException {
        for (PDPage page : doc.getPages()) {
            List<PDAnnotation> annotations = new ArrayList<>(page.getAnnotations());