fix(conversion): improve PDF/A conversion quality, color accuracy, and compliance (#5396)

# Description of Changes


* Transparency Fix: Implemented a pre-processing step that adds an
opaque white background to pages before conversion. This ensures that
transparent elements are correctly
flattened against white rather than defaulting to black (a common issue
in Ghostscript flattening).
* Color Distortion Fix: Removed a misconfigured -sDefaultCMYKProfile
setting in the Ghostscript command that was incorrectly pointing to an
RGB profile. This resolves the
"dark/black" color corruption previously seen in print-ready CMYK PDFs.
   * PDF/A Compliance Improvements:
* Font Handling: Updated fixType1FontCharSet to only add a standard
CharSet if it is missing or empty. This prevents validation errors where
subsetted fonts were being forced
         to declare glyphs they did not contain.
* Spot Color Unification: Added fixSeparationColorSpaces to detect and
unify TintTransform objects for Separation colors with the same colorant
name, ensuring consistency
         across document resources.
* OCG Naming: Ensured all Optional Content Groups have a valid Name
entry.





<!--
Please provide a summary of the changes, including:

- What was changed
- Why the change was made
- Any challenges encountered

Closes #(issue_number)
-->

---

## Checklist

### General

- [X] I have read the [Contribution
Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md)
- [X] I have read the [Stirling-PDF Developer
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md)
(if applicable)
- [ ] I have read the [How to add new languages to
Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md)
(if applicable)
- [X] I have performed a self-review of my own code
- [X] My changes generate no new warnings

### Documentation

- [ ] I have updated relevant docs on [Stirling-PDF's doc
repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/)
(if functionality has heavily changed)
- [ ] I have read the section [Add New Translation
Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md#add-new-translation-tags)
(for new translation tags only)

### Translations (if applicable)

- [ ] I ran
[`scripts/counter_translation.py`](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/docs/counter_translation.md)

### UI Changes (if applicable)

- [ ] Screenshots or videos demonstrating the UI changes are attached
(e.g., as comments or direct attachments in the PR)

### Testing (if applicable)

- [X] I have tested my changes locally. Refer to the [Testing
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md#6-testing)
for more details.

---------

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
Balázs Szücs
2026-01-06 00:53:39 +01:00
committed by GitHub
parent 91bf9abbaa
commit b6e675fab3

View File

@@ -21,6 +21,7 @@ import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.io.RandomAccessRead;
import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
import org.apache.pdfbox.pdfwriter.compress.CompressParameters;
@@ -378,7 +379,6 @@ public class ConvertPDFToPDFA {
command.add("-sOutputICCProfile=" + colorProfiles.rgb().toAbsolutePath());
command.add("-sDefaultRGBProfile=" + colorProfiles.rgb().toAbsolutePath());
command.add("-sDefaultGrayProfile=" + colorProfiles.gray().toAbsolutePath());
command.add("-sDefaultCMYKProfile=" + colorProfiles.rgb().toAbsolutePath());
// Font handling optimized for PDF/A CIDSet compliance
command.add("-dEmbedAllFonts=true");
@@ -673,25 +673,55 @@ public class ConvertPDFToPDFA {
if (descriptor == null) continue;
// Check if this is a Type1 font
if (fontNameStr.contains("Type1")
|| descriptor.getFontFile() != null
|| (descriptor.getFontFile2() == null
&& descriptor.getFontFile3() == null)) {
boolean isType1 =
isType1Font(font)
|| descriptor.getFontFile() != null
|| (descriptor.getFontFile2() == null
&& descriptor.getFontFile3() == null);
String existingCharSet =
descriptor.getCOSObject().getString(COSName.CHAR_SET);
if (isType1) {
COSDictionary descDict = descriptor.getCOSObject();
String existingCharSet = descDict.getString(COSName.CHAR_SET);
String glyphSet = buildStandardType1GlyphSet();
if (!glyphSet.isEmpty()) {
if (existingCharSet == null
|| existingCharSet.trim().isEmpty()
|| countGlyphs(existingCharSet) < countGlyphs(glyphSet)) {
descriptor.getCOSObject().setString(COSName.CHAR_SET, glyphSet);
// Check if font is embedded and if CharSet might be invalid
boolean fontEmbedded = font.isEmbedded();
boolean hasFontFile =
descriptor.getFontFile() != null
|| descriptor.getFontFile2() != null
|| descriptor.getFontFile3() != null;
// For PDF/A compliance: if CharSet exists but font is subsetted or
// we can't verify it matches the font file, remove it to avoid validation
// errors
if (existingCharSet != null && !existingCharSet.trim().isEmpty()) {
// If the font appears to be subsetted (indicated by subset prefix in
// name)
// or if we can't verify the CharSet is correct, remove it
if (fontNameStr.contains("+") || fontNameStr.contains("Subset")) {
descDict.removeItem(COSName.CHAR_SET);
log.debug(
"Fixed CharSet for Type1 font {} with {} glyphs (was: {})",
fontNameStr,
countGlyphs(glyphSet),
existingCharSet != null ? countGlyphs(existingCharSet) : 0);
"Removed potentially invalid CharSet from subsetted Type1 font: {}",
fontNameStr);
} else if (!hasFontFile && fontEmbedded) {
// Font is embedded but we can't verify CharSet, remove it
descDict.removeItem(COSName.CHAR_SET);
log.debug(
"Removed unverifiable CharSet from embedded Type1 font: {}",
fontNameStr);
}
} else if (existingCharSet == null || existingCharSet.trim().isEmpty()) {
// Only add CharSet if font is not subsetted and we can verify it
if (!fontNameStr.contains("+")
&& !fontNameStr.contains("Subset")
&& hasFontFile) {
String glyphSet = buildStandardType1GlyphSet();
if (!glyphSet.isEmpty()) {
descDict.setString(COSName.CHAR_SET, glyphSet);
log.debug(
"Added missing CharSet for Type1 font {} with {} glyphs",
fontNameStr,
countGlyphs(glyphSet));
}
}
}
}
@@ -1349,13 +1379,22 @@ public class ConvertPDFToPDFA {
for (COSBase base : ocgArray) {
if (base instanceof COSDictionary ocgDict) {
if (!ocgDict.containsKey(COSName.NAME)) {
// Ensure Name entry exists and is not empty
String nameValue = ocgDict.getString(COSName.NAME);
if (nameValue == null || nameValue.trim().isEmpty()) {
String newName = "Layer " + unnamedCount++;
ocgDict.setString(COSName.NAME, newName);
log.debug("Fixed OCG missing name, set to: {}", newName);
log.debug("Fixed OCG missing or empty name, set to: {}", newName);
}
}
}
} else if (ocgs instanceof COSDictionary ocgDict) {
// Handle case where OCGS is a single dictionary instead of array
String nameValue = ocgDict.getString(COSName.NAME);
if (nameValue == null || nameValue.trim().isEmpty()) {
ocgDict.setString(COSName.NAME, "Layer 1");
log.debug("Fixed single OCG missing or empty name");
}
}
}
@@ -1479,7 +1518,9 @@ public class ConvertPDFToPDFA {
Path pdfaDefFile = createPdfaDefFile(workingDir, colorProfiles, profile);
// Preprocess PDF for PDF/A compliance using the sanitizer
Path sanitizedInputPdf = sanitizePdfWithPdfBox(inputPdf);
// We add a white background to ensure transparency is flattened correctly against white
// instead of black, addressing common PDF/A conversion issues.
Path sanitizedInputPdf = sanitizePdfWithPdfBox(inputPdf, true);
Path preprocessedPdf = sanitizedInputPdf != null ? sanitizedInputPdf : inputPdf;
// For PDF/A-1, clean CIDSet issues that may cause validation failures
@@ -1500,11 +1541,14 @@ public class ConvertPDFToPDFA {
buildGhostscriptCommand(
inputForGs, outputPdf, colorProfiles, workingDir, profile, pdfaDefFile);
log.info("Running Ghostscript command: {}", String.join(" ", command));
ProcessExecutorResult result =
ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
.runCommandWithOutputHandling(command);
if (result.getRc() != 0) {
log.error("Ghostscript failed with output: {}", result.getMessages());
throw new IOException("Ghostscript exited with code " + result.getRc());
}
@@ -1665,6 +1709,7 @@ public class ConvertPDFToPDFA {
}
private byte[] convertWithPdfBoxMethod(Path inputPath, PdfaProfile profile) throws Exception {
log.info("Starting PDFBox/LibreOffice conversion for PDF/A-{}", profile.getPart());
Path tempInputFile = null;
byte[] fileBytes;
Path loPdfPath = null;
@@ -1720,17 +1765,20 @@ public class ConvertPDFToPDFA {
ColorProfiles colorProfiles = prepareColorProfiles(workingDir);
// Sanitize the PDF before PDF/X conversion for better Ghostscript compatibility
Path sanitizedInputPdf = sanitizePdfWithPdfBox(inputPdf);
Path sanitizedInputPdf = sanitizePdfWithPdfBox(inputPdf, true);
Path inputForGs = sanitizedInputPdf != null ? sanitizedInputPdf : inputPdf;
List<String> command =
buildGhostscriptCommandX(inputForGs, outputPdf, colorProfiles, workingDir, profile);
log.info("Running Ghostscript PDF/X command: {}", String.join(" ", command));
ProcessExecutorResult result =
ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
.runCommandWithOutputHandling(command);
if (result.getRc() != 0) {
log.error("Ghostscript PDF/X failed with output: {}", result.getMessages());
throw new IOException("Ghostscript exited with code " + result.getRc());
}
@@ -1796,12 +1844,12 @@ public class ConvertPDFToPDFA {
}
}
private Path sanitizePdfWithPdfBox(Path inputPdf) {
private Path sanitizePdfWithPdfBox(Path inputPdf, boolean addWhiteBackground) {
try {
Path sanitizedPath =
inputPdf.getParent().resolve("sanitized_" + inputPdf.getFileName().toString());
sanitizeDocument(inputPdf, sanitizedPath);
sanitizeDocument(inputPdf, sanitizedPath, addWhiteBackground);
log.info("PDF sanitized with PDFBox for better Ghostscript compatibility");
return sanitizedPath;
@@ -1813,7 +1861,8 @@ public class ConvertPDFToPDFA {
}
}
private void sanitizeDocument(Path inputPath, Path outputPath) throws IOException {
private void sanitizeDocument(Path inputPath, Path outputPath, boolean addWhiteBackground)
throws IOException {
try (PDDocument doc = Loader.loadPDF(inputPath.toFile())) {
Map<String, DocumentSanitizer> sanitizers = new LinkedHashMap<>();
sanitizers.put("Flatten highlight annotations", this::flattenHighlightsToContent);
@@ -1824,6 +1873,11 @@ public class ConvertPDFToPDFA {
sanitizers.put("Ensure embedded file compliance", this::ensureEmbeddedFileCompliance);
sanitizers.put(
"Fix optional content groups", ConvertPDFToPDFA::fixOptionalContentGroups);
sanitizers.put("Fix separation color spaces", this::fixSeparationColorSpaces);
if (addWhiteBackground) {
sanitizers.put("Add white background", this::addWhiteBackground);
}
for (Map.Entry<String, DocumentSanitizer> entry : sanitizers.entrySet()) {
try {
@@ -1841,6 +1895,191 @@ public class ConvertPDFToPDFA {
}
}
private void fixSeparationColorSpaces(PDDocument doc) throws IOException {
Map<String, COSBase> knownTintTransforms = new HashMap<>();
Set<COSBase> visitedResources = new HashSet<>();
// Process all pages first to collect all separation color spaces
for (PDPage page : doc.getPages()) {
PDResources resources = page.getResources();
processResourcesForSeparation(resources, knownTintTransforms, visitedResources);
}
// Process document-level resources if they exist
PDDocumentCatalog catalog = doc.getDocumentCatalog();
if (catalog != null) {
PDResources docResources =
catalog.getAcroForm() != null
? catalog.getAcroForm().getDefaultResources()
: null;
if (docResources != null) {
processResourcesForSeparation(docResources, knownTintTransforms, visitedResources);
}
}
// Second pass: ensure all separations with the same name use the same tintTransform
visitedResources.clear();
for (PDPage page : doc.getPages()) {
PDResources resources = page.getResources();
enforceSeparationConsistency(resources, knownTintTransforms, visitedResources);
}
}
private void processResourcesForSeparation(
PDResources resources,
Map<String, COSBase> knownTintTransforms,
Set<COSBase> visitedResources) {
if (resources == null) return;
// Prevent infinite recursion if resources are shared or cyclic
if (!visitedResources.add(resources.getCOSObject())) {
return;
}
// Check defined ColorSpaces
COSDictionary csDict =
(COSDictionary) resources.getCOSObject().getDictionaryObject(COSName.COLORSPACE);
if (csDict != null) {
for (COSName name : csDict.keySet()) {
COSBase csVal = csDict.getDictionaryObject(name);
checkAndFixSeparation(csVal, knownTintTransforms);
}
}
// Recursively check XObjects (Forms)
COSDictionary xObjDict =
(COSDictionary) resources.getCOSObject().getDictionaryObject(COSName.XOBJECT);
if (xObjDict != null) {
for (COSName name : xObjDict.keySet()) {
COSBase xObj = xObjDict.getDictionaryObject(name);
if (xObj instanceof COSStream stream) {
COSName type = (COSName) stream.getDictionaryObject(COSName.SUBTYPE);
if (COSName.FORM.equals(type)) {
COSBase formRes = stream.getDictionaryObject(COSName.RESOURCES);
if (formRes instanceof COSDictionary formResDict) {
processResourcesForSeparation(
new PDResources(formResDict),
knownTintTransforms,
visitedResources);
}
}
}
}
}
}
private void checkAndFixSeparation(COSBase cs, Map<String, COSBase> knownTintTransforms) {
if (cs instanceof COSArray arr && arr.size() >= 4) {
COSBase type = arr.getObject(0);
if (COSName.SEPARATION.equals(type)) {
// Separation: [/Separation name altSpace tintTransform]
COSBase nameBase = arr.getObject(1);
if (nameBase instanceof COSName colorName) {
String name = colorName.getName();
COSBase tintTransform = arr.getObject(3);
if (knownTintTransforms.containsKey(name)) {
COSBase known = knownTintTransforms.get(name);
// If objects are not identical (same reference), unify them
if (known != tintTransform) {
arr.set(3, known);
log.debug("Unified TintTransform for Separation color: {}", name);
}
} else {
// Store the first encountered tintTransform for this color name
knownTintTransforms.put(name, tintTransform);
}
}
}
}
}
private void enforceSeparationConsistency(
PDResources resources,
Map<String, COSBase> knownTintTransforms,
Set<COSBase> visitedResources) {
if (resources == null) return;
// Prevent infinite recursion
if (!visitedResources.add(resources.getCOSObject())) {
return;
}
// Check defined ColorSpaces
COSDictionary csDict =
(COSDictionary) resources.getCOSObject().getDictionaryObject(COSName.COLORSPACE);
if (csDict != null) {
for (COSName name : csDict.keySet()) {
COSBase csVal = csDict.getDictionaryObject(name);
enforceSeparationTintTransform(csVal, knownTintTransforms);
}
}
// Recursively check XObjects (Forms)
COSDictionary xObjDict =
(COSDictionary) resources.getCOSObject().getDictionaryObject(COSName.XOBJECT);
if (xObjDict != null) {
for (COSName name : xObjDict.keySet()) {
COSBase xObj = xObjDict.getDictionaryObject(name);
if (xObj instanceof COSStream stream) {
COSName type = (COSName) stream.getDictionaryObject(COSName.SUBTYPE);
if (COSName.FORM.equals(type)) {
COSBase formRes = stream.getDictionaryObject(COSName.RESOURCES);
if (formRes instanceof COSDictionary formResDict) {
enforceSeparationConsistency(
new PDResources(formResDict),
knownTintTransforms,
visitedResources);
}
}
}
}
}
}
private void enforceSeparationTintTransform(
COSBase cs, Map<String, COSBase> knownTintTransforms) {
if (cs instanceof COSArray arr && arr.size() >= 4) {
COSBase type = arr.getObject(0);
if (COSName.SEPARATION.equals(type)) {
COSBase nameBase = arr.getObject(1);
if (nameBase instanceof COSName colorName) {
String name = colorName.getName();
COSBase tintTransform = arr.getObject(3);
// Ensure all separations with the same name use the same tintTransform
// reference
if (knownTintTransforms.containsKey(name)) {
COSBase known = knownTintTransforms.get(name);
if (known != tintTransform) {
arr.set(3, known);
log.debug(
"Enforced consistent TintTransform for Separation color: {}",
name);
}
}
}
}
}
}
private void addWhiteBackground(PDDocument doc) throws IOException {
for (PDPage page : doc.getPages()) {
PDRectangle mediaBox = page.getMediaBox();
try (PDPageContentStream cs =
new PDPageContentStream(
doc, page, PDPageContentStream.AppendMode.PREPEND, true, true)) {
cs.setNonStrokingColor(Color.WHITE);
cs.addRect(
mediaBox.getLowerLeftX(),
mediaBox.getLowerLeftY(),
mediaBox.getWidth(),
mediaBox.getHeight());
cs.fill();
}
}
}
private void flattenHighlightsToContent(PDDocument doc) throws IOException {
for (PDPage page : doc.getPages()) {
List<PDAnnotation> annotations = new ArrayList<>(page.getAnnotations());