refactor(pdf-conversion): optimize PDF/A and font embedding flow

- Replaced redundant streams and lists initialization with more efficient alternatives
- Centralized stream reading logic to prevent reuse issues and ensure proper closing
- Enhanced logging for PDF/A validation to differentiate warnings from errors
- Simplified methods by removing redundant parameters and improving clarity
- Updated GregorianCalendar usage to modern java.time classes
- Ensured static state for utility-like methods for cleaner invocation
- Improved PDF/A metadata handling by aligning structure and removing redundancy

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
Balázs Szücs 2025-11-08 11:10:31 +01:00
parent 3728a123b3
commit a5e55e598a

View File

@ -3,28 +3,14 @@ package stirling.software.SPDF.controller.api.converters;
import java.awt.Color; import java.awt.Color;
import java.awt.color.ColorSpace; import java.awt.color.ColorSpace;
import java.awt.color.ICC_Profile; import java.awt.color.ICC_Profile;
import java.io.ByteArrayOutputStream; import java.io.*;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.nio.file.StandardCopyOption; import java.nio.file.StandardCopyOption;
import java.time.Instant; import java.time.Instant;
import java.time.ZoneId; import java.time.ZoneId;
import java.time.ZonedDateTime; import java.time.ZonedDateTime;
import java.util.ArrayList; import java.util.*;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.GregorianCalendar;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
@ -105,7 +91,12 @@ public class ConvertPDFToPDFA {
Path tempPdfPath = null; Path tempPdfPath = null;
try { try {
tempPdfPath = Files.createTempFile("validate_", ".pdf"); tempPdfPath = Files.createTempFile("validate_", ".pdf");
Files.write(tempPdfPath, pdfBytes);
try (OutputStream out = Files.newOutputStream(tempPdfPath)) {
out.write(pdfBytes);
}
try {
validatePdfaOutput(tempPdfPath, profile); validatePdfaOutput(tempPdfPath, profile);
log.info("PDF/A validation passed for {} using {}", profile.displayName(), method); log.info("PDF/A validation passed for {} using {}", profile.displayName(), method);
} catch (IOException e) { } catch (IOException e) {
@ -114,6 +105,13 @@ public class ConvertPDFToPDFA {
profile.displayName(), profile.displayName(),
method, method,
e.getMessage()); e.getMessage());
}
} catch (IOException e) {
log.warn(
"PDF/A validation warning for {} using {}: {}",
profile.displayName(),
method,
e.getMessage());
} finally { } finally {
if (tempPdfPath != null) { if (tempPdfPath != null) {
try { try {
@ -152,7 +150,7 @@ public class ConvertPDFToPDFA {
PdfaProfile profile, PdfaProfile profile,
Path pdfaDefFile) { Path pdfaDefFile) {
List<String> command = new ArrayList<>(); List<String> command = new ArrayList<>(25);
command.add("gs"); command.add("gs");
command.add("--permit-file-read=" + workingDir.toAbsolutePath()); command.add("--permit-file-read=" + workingDir.toAbsolutePath());
command.add("--permit-file-read=" + colorProfiles.rgb().toAbsolutePath()); command.add("--permit-file-read=" + colorProfiles.rgb().toAbsolutePath());
@ -276,31 +274,37 @@ public class ConvertPDFToPDFA {
Path pdfaDefFile = workingDir.resolve("PDFA_def.ps"); Path pdfaDefFile = workingDir.resolve("PDFA_def.ps");
String title = "Converted to " + profile.displayName(); String title = "Converted to " + profile.displayName();
String rgbProfilePath = colorProfiles.rgb().toAbsolutePath().toString().replace("\\", "/");
String pdfaDefContent = String pdfaDefContent =
String.format( String.format(
"%% This is a sample prefix file for creating a PDF/A document.\n" """
+ "%% Feel free to modify entries marked with \"Customize\".\n\n" %% This is a sample prefix file for creating a PDF/A document.
+ "%% Define entries in the document Info dictionary.\n" %% Feel free to modify entries marked with "Customize".
+ "[/Title (%s)\n"
+ " /DOCINFO pdfmark\n\n" %% Define entries in the document Info dictionary.
+ "%% Define an ICC profile.\n" [/Title (%s)
+ "[/_objdef {icc_PDFA} /type /stream /OBJ pdfmark\n" /DOCINFO pdfmark
+ "[{icc_PDFA} <<\n"
+ " /N 3\n" %% Define an ICC profile.
+ ">> /PUT pdfmark\n" [/_objdef {icc_PDFA} /type /stream /OBJ pdfmark
+ "[{icc_PDFA} (%s) (r) file /PUT pdfmark\n\n" [{icc_PDFA} <<
+ "%% Define the output intent dictionary.\n" /N 3
+ "[/_objdef {OutputIntent_PDFA} /type /dict /OBJ pdfmark\n" >> /PUT pdfmark
+ "[{OutputIntent_PDFA} <<\n" [{icc_PDFA} (%s) (r) file /PUT pdfmark
+ " /Type /OutputIntent\n"
+ " /S /GTS_PDFA1\n" %% Define the output intent dictionary.
+ " /DestOutputProfile {icc_PDFA}\n" [/_objdef {OutputIntent_PDFA} /type /dict /OBJ pdfmark
+ " /OutputConditionIdentifier (sRGB IEC61966-2.1)\n" [{OutputIntent_PDFA} <<
+ " /Info (sRGB IEC61966-2.1)\n" /Type /OutputIntent
+ " /RegistryName (http://www.color.org)\n" /S /GTS_PDFA1
+ ">> /PUT pdfmark\n" /DestOutputProfile {icc_PDFA}
+ "[{Catalog} <</OutputIntents [ {OutputIntent_PDFA} ]>> /PUT pdfmark\n", /OutputConditionIdentifier (sRGB IEC61966-2.1)
title, colorProfiles.rgb().toAbsolutePath().toString().replace("\\", "/")); /Info (sRGB IEC61966-2.1)
/RegistryName (http://www.color.org)
>> /PUT pdfmark
[{Catalog} <</OutputIntents [ {OutputIntent_PDFA} ]>> /PUT pdfmark
""",
title, rgbProfilePath);
Files.writeString(pdfaDefFile, pdfaDefContent); Files.writeString(pdfaDefFile, pdfaDefContent);
return pdfaDefFile; return pdfaDefFile;
@ -313,7 +317,7 @@ public class ConvertPDFToPDFA {
Path workingDir, Path workingDir,
PdfXProfile profile) { PdfXProfile profile) {
List<String> command = new ArrayList<>(); List<String> command = new ArrayList<>(25);
command.add("gs"); command.add("gs");
command.add("--permit-file-read=" + workingDir.toAbsolutePath()); command.add("--permit-file-read=" + workingDir.toAbsolutePath());
command.add("--permit-file-read=" + colorProfiles.rgb().toAbsolutePath()); command.add("--permit-file-read=" + colorProfiles.rgb().toAbsolutePath());
@ -340,11 +344,11 @@ public class ConvertPDFToPDFA {
return command; return command;
} }
private static void embedMissingFonts(PDDocument loDoc, PDDocument baseDoc, Set<String> missingFonts) private static void embedMissingFonts(
throws IOException { PDDocument loDoc, PDDocument baseDoc, Set<String> missingFonts) throws IOException {
List<PDPage> loPages = new ArrayList<>(); List<PDPage> loPages = new ArrayList<>(loDoc.getNumberOfPages());
loDoc.getPages().forEach(loPages::add); loDoc.getPages().forEach(loPages::add);
List<PDPage> basePages = new ArrayList<>(); List<PDPage> basePages = new ArrayList<>(baseDoc.getNumberOfPages());
baseDoc.getPages().forEach(basePages::add); baseDoc.getPages().forEach(basePages::add);
for (int i = 0; i < loPages.size(); i++) { for (int i = 0; i < loPages.size(); i++) {
@ -371,21 +375,31 @@ public class ConvertPDFToPDFA {
} }
if (fontStream == null) continue; if (fontStream == null) continue;
// Read the font stream into memory once so we can create fresh
// InputStreams for multiple load attempts. This avoids reusing a
// consumed stream and allows try-with-resources for each attempt.
byte[] fontBytes;
try (InputStream in = fontStream.createInputStream()) { try (InputStream in = fontStream.createInputStream()) {
PDFont newFont; fontBytes = in.readAllBytes();
try { }
newFont = PDType0Font.load(baseDoc, in, false);
PDFont embeddedFont = null;
// First try PDType0 (CID) font
try (InputStream tryIn = new ByteArrayInputStream(fontBytes)) {
embeddedFont = PDType0Font.load(baseDoc, tryIn, false);
} catch (IOException e1) { } catch (IOException e1) {
// Fallback to TrueType
try (InputStream tryIn2 = new ByteArrayInputStream(fontBytes)) {
try { try {
newFont = PDTrueTypeFont.load(baseDoc, in, null); embeddedFont = PDTrueTypeFont.load(baseDoc, tryIn2, null);
} catch (IOException | IllegalArgumentException e2) { } catch (IllegalArgumentException | IOException e2) {
log.error("Could not embed font {}: {}", psName, e2.getMessage()); log.error("Could not embed font {}: {}", psName, e2.getMessage());
continue;
} }
} }
if (newFont != null) {
baseRes.put(fontKey, newFont);
} }
if (embeddedFont != null) {
baseRes.put(fontKey, embeddedFont);
} }
} }
} }
@ -417,59 +431,18 @@ public class ConvertPDFToPDFA {
} }
} }
private ResponseEntity<byte[]> handlePdfAConversion( private static Set<String> findUnembeddedFontNames(PDDocument doc) throws IOException {
MultipartFile inputFile, String outputFormat) throws Exception { Set<String> missing = new HashSet<>(16);
PdfaProfile profile = PdfaProfile.fromRequest(outputFormat); for (PDPage page : doc.getPages()) {
PDResources res = page.getResources();
// Get the original filename without extension for (COSName name : res.getFontNames()) {
String originalFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename()); PDFont font = res.getFont(name);
if (originalFileName == null || originalFileName.trim().isEmpty()) { if (font != null && !font.isEmbedded()) {
originalFileName = "output.pdf"; missing.add(font.getName());
} }
String baseFileName =
originalFileName.contains(".")
? originalFileName.substring(0, originalFileName.lastIndexOf('.'))
: originalFileName;
Path workingDir = Files.createTempDirectory("pdfa_conversion_");
Path inputPath = workingDir.resolve("input.pdf");
inputFile.transferTo(inputPath);
try {
byte[] converted;
// Try Ghostscript first (preferred method)
if (isGhostscriptAvailable()) {
log.info("Using Ghostscript for PDF/A conversion to {}", profile.displayName());
try {
converted = convertWithGhostscript(inputPath, workingDir, profile);
String outputFilename = baseFileName + profile.outputSuffix();
validateAndWarnPdfA(converted, profile, "Ghostscript");
return WebResponseUtils.bytesToWebResponse(
converted, outputFilename, MediaType.APPLICATION_PDF);
} catch (Exception e) {
log.warn(
"Ghostscript conversion failed, falling back to PDFBox/LibreOffice method",
e);
} }
} else {
log.info("Ghostscript not available, using PDFBox/LibreOffice fallback method");
}
converted = convertWithPdfBoxMethod(inputPath, workingDir, profile);
String outputFilename = baseFileName + profile.outputSuffix();
// Validate with PDFBox preflight and warn if issues found
validateAndWarnPdfA(converted, profile, "PDFBox/LibreOffice");
return WebResponseUtils.bytesToWebResponse(
converted, outputFilename, MediaType.APPLICATION_PDF);
} finally {
deleteQuietly(workingDir);
} }
return missing;
} }
private ResponseEntity<byte[]> handlePdfXConversion( private ResponseEntity<byte[]> handlePdfXConversion(
@ -551,44 +524,29 @@ public class ConvertPDFToPDFA {
return Files.readAllBytes(outputPdf); return Files.readAllBytes(outputPdf);
} }
private byte[] convertWithPdfBoxMethod(Path inputPath, Path workingDir, PdfaProfile profile) private static void importFlattenedImages(PDDocument loDoc, PDDocument baseDoc)
throws Exception { throws IOException {
Path tempInputFile = null; List<PDPage> loPages = new ArrayList<>(loDoc.getNumberOfPages());
byte[] fileBytes; loDoc.getPages().forEach(loPages::add);
Path loPdfPath = null; List<PDPage> basePages = new ArrayList<>(baseDoc.getNumberOfPages());
File preProcessedFile = null; baseDoc.getPages().forEach(basePages::add);
int pdfaPart = profile.part();
try { for (int i = 0; i < loPages.size(); i++) {
tempInputFile = inputPath; PDPage loPage = loPages.get(i);
PDPage basePage = basePages.get(i);
if (pdfaPart == 2 || pdfaPart == 3) { PDResources loRes = loPage.getResources();
preProcessedFile = tempInputFile.toFile(); PDResources baseRes = basePage.getResources();
} else { Set<COSName> toReplace = detectTransparentXObjects(basePage);
preProcessedFile = preProcessHighlights(tempInputFile.toFile());
}
Set<String> missingFonts; for (COSName name : toReplace) {
boolean needImgs; PDXObject loXo = loRes.getXObject(name);
try (PDDocument doc = Loader.loadPDF(preProcessedFile)) { if (!(loXo instanceof PDImageXObject img)) continue;
missingFonts = findUnembeddedFontNames(doc);
needImgs = (pdfaPart == 1) && hasTransparentImages(doc);
if (!missingFonts.isEmpty() || needImgs) {
loPdfPath = runLibreOfficeConversion(preProcessedFile.toPath(), pdfaPart);
}
}
fileBytes =
convertToPdfA(
preProcessedFile.toPath(), loPdfPath, pdfaPart, missingFonts, needImgs);
return fileBytes; PDImageXObject newImg = LosslessFactory.createFromImage(baseDoc, img.getImage());
} finally { // replace the resource under the same name
if (loPdfPath != null && loPdfPath.getParent() != null) { baseRes.put(name, newImg);
FileUtils.deleteDirectory(loPdfPath.getParent().toFile());
}
if (preProcessedFile != null && !preProcessedFile.equals(tempInputFile.toFile())) {
Files.deleteIfExists(preProcessedFile.toPath());
} }
} }
} }
@ -608,13 +566,27 @@ public class ConvertPDFToPDFA {
return new ColorProfiles(rgbProfile, grayProfile); return new ColorProfiles(rgbProfile, grayProfile);
} }
private void copyResourceIcc(Path target) throws IOException { private static Set<COSName> detectTransparentXObjects(PDPage page) {
try (InputStream in = getClass().getResourceAsStream(ICC_RESOURCE_PATH)) { Set<COSName> transparentObjects = new HashSet<>();
if (in == null) { PDResources res = page.getResources();
throw new IOException("ICC profile resource not found: " + ICC_RESOURCE_PATH); if (res == null) return transparentObjects;
for (COSName name : res.getXObjectNames()) {
try {
PDXObject xo = res.getXObject(name);
if (xo instanceof PDImageXObject img) {
COSDictionary d = img.getCOSObject();
if (d.containsKey(COSName.SMASK)
|| isTransparencyGroup(d)
|| d.getBoolean(COSName.INTERPOLATE, false)) {
transparentObjects.add(name);
} }
Files.copy(in, target, StandardCopyOption.REPLACE_EXISTING);
} }
} catch (IOException ioe) {
log.error("Error processing XObject {}: {}", name.getName(), ioe.getMessage());
}
}
return transparentObjects;
} }
/** /**
@ -738,76 +710,13 @@ public class ConvertPDFToPDFA {
return Files.readAllBytes(outputPdf); return Files.readAllBytes(outputPdf);
} }
private Set<String> findUnembeddedFontNames(PDDocument doc) throws IOException { private static boolean isTransparencyGroup(COSDictionary dict) {
Set<String> missing = new HashSet<>();
for (PDPage page : doc.getPages()) {
PDResources res = page.getResources();
for (COSName name : res.getFontNames()) {
PDFont font = res.getFont(name);
if (font != null && !font.isEmbedded()) {
missing.add(font.getName());
}
}
}
return missing;
}
private void importFlattenedImages(PDDocument loDoc, PDDocument baseDoc) throws IOException {
List<PDPage> loPages = new ArrayList<>();
loDoc.getPages().forEach(loPages::add);
List<PDPage> basePages = new ArrayList<>();
baseDoc.getPages().forEach(basePages::add);
for (int i = 0; i < loPages.size(); i++) {
PDPage loPage = loPages.get(i);
PDPage basePage = basePages.get(i);
PDResources loRes = loPage.getResources();
PDResources baseRes = basePage.getResources();
Set<COSName> toReplace = detectTransparentXObjects(basePage);
for (COSName name : toReplace) {
PDXObject loXo = loRes.getXObject(name);
if (!(loXo instanceof PDImageXObject img)) continue;
PDImageXObject newImg = LosslessFactory.createFromImage(baseDoc, img.getImage());
// replace the resource under the same name
baseRes.put(name, newImg);
}
}
}
private Set<COSName> detectTransparentXObjects(PDPage page) {
Set<COSName> transparentObjects = new HashSet<>();
PDResources res = page.getResources();
if (res == null) return transparentObjects;
for (COSName name : res.getXObjectNames()) {
try {
PDXObject xo = res.getXObject(name);
if (xo instanceof PDImageXObject img) {
COSDictionary d = img.getCOSObject();
if (d.containsKey(COSName.SMASK)
|| isTransparencyGroup(d)
|| d.getBoolean(COSName.INTERPOLATE, false)) {
transparentObjects.add(name);
}
}
} catch (IOException ioe) {
log.error("Error processing XObject {}: {}", name.getName(), ioe.getMessage());
}
}
return transparentObjects;
}
private boolean isTransparencyGroup(COSDictionary dict) {
COSBase g = dict.getDictionaryObject(COSName.GROUP); COSBase g = dict.getDictionaryObject(COSName.GROUP);
return g instanceof COSDictionary gd return g instanceof COSDictionary gd
&& COSName.TRANSPARENCY.equals(gd.getCOSName(COSName.S)); && COSName.TRANSPARENCY.equals(gd.getCOSName(COSName.S));
} }
private boolean hasTransparentImages(PDDocument doc) { private static boolean hasTransparentImages(PDDocument doc) {
for (PDPage page : doc.getPages()) { for (PDPage page : doc.getPages()) {
PDResources res = page.getResources(); PDResources res = page.getResources();
if (res == null) continue; if (res == null) continue;
@ -832,7 +741,7 @@ public class ConvertPDFToPDFA {
return false; return false;
} }
private void sanitizePdfA(COSBase base, PDResources resources, int pdfaPart) { private static void sanitizePdfA(COSBase base, int pdfaPart) {
if (base instanceof COSDictionary dict) { if (base instanceof COSDictionary dict) {
if (pdfaPart == 1) { if (pdfaPart == 1) {
// Remove transparency-related elements // Remove transparency-related elements
@ -868,18 +777,18 @@ public class ConvertPDFToPDFA {
// Recurse through all entries in the dictionary // Recurse through all entries in the dictionary
for (Map.Entry<COSName, COSBase> entry : dict.entrySet()) { for (Map.Entry<COSName, COSBase> entry : dict.entrySet()) {
sanitizePdfA(entry.getValue(), resources, pdfaPart); sanitizePdfA(entry.getValue(), pdfaPart);
} }
} else if (base instanceof COSArray arr) { } else if (base instanceof COSArray arr) {
// Recursively sanitize each item in the array // Recursively sanitize each item in the array
for (COSBase item : arr) { for (COSBase item : arr) {
sanitizePdfA(item, resources, pdfaPart); sanitizePdfA(item, pdfaPart);
} }
} }
} }
private void removeElementsForPdfA(PDDocument doc, int pdfaPart) { private static void removeElementsForPdfA(PDDocument doc, int pdfaPart) {
if (pdfaPart == 1) { if (pdfaPart == 1) {
// Remove Optional Content (Layers) - not allowed in PDF/A-1 // Remove Optional Content (Layers) - not allowed in PDF/A-1
@ -892,7 +801,7 @@ public class ConvertPDFToPDFA {
} }
PDResources res = page.getResources(); PDResources res = page.getResources();
// Clean page-level dictionary // Clean page-level dictionary
sanitizePdfA(page.getCOSObject(), res, pdfaPart); sanitizePdfA(page.getCOSObject(), pdfaPart);
// sanitize each Form XObject // sanitize each Form XObject
if (res != null) { if (res != null) {
@ -900,9 +809,9 @@ public class ConvertPDFToPDFA {
try { try {
PDXObject xo = res.getXObject(name); PDXObject xo = res.getXObject(name);
if (xo instanceof PDFormXObject form) { if (xo instanceof PDFormXObject form) {
sanitizePdfA(form.getCOSObject(), res, pdfaPart); sanitizePdfA(form.getCOSObject(), pdfaPart);
} else if (xo instanceof PDImageXObject img) { } else if (xo instanceof PDImageXObject img) {
sanitizePdfA(img.getCOSObject(), res, pdfaPart); sanitizePdfA(img.getCOSObject(), pdfaPart);
} }
} catch (IOException ioe) { } catch (IOException ioe) {
log.error("Cannot load XObject {}: {}", name.getName(), ioe.getMessage()); log.error("Cannot load XObject {}: {}", name.getName(), ioe.getMessage());
@ -913,7 +822,7 @@ public class ConvertPDFToPDFA {
} }
/** Embbeds the XMP metadata required for PDF/A compliance. */ /** Embbeds the XMP metadata required for PDF/A compliance. */
private void mergeAndAddXmpMetadata(PDDocument document, int pdfaPart) throws Exception { private static void mergeAndAddXmpMetadata(PDDocument document, int pdfaPart) throws Exception {
PDMetadata existingMetadata = document.getDocumentCatalog().getMetadata(); PDMetadata existingMetadata = document.getDocumentCatalog().getMetadata();
XMPMetadata xmp; XMPMetadata xmp;
@ -998,31 +907,30 @@ public class ConvertPDFToPDFA {
adobePdfSchema.setKeywords(keywords); adobePdfSchema.setKeywords(keywords);
} }
// Set creation and modification dates using java.time and convert to GregorianCalendar // Set creation and modification dates using modern java.time API
Instant nowInstant = Instant.now(); Instant nowInstant = Instant.now();
ZonedDateTime nowZdt = ZonedDateTime.ofInstant(nowInstant, ZoneId.of("UTC")); ZonedDateTime nowZdt = ZonedDateTime.ofInstant(nowInstant, ZoneId.of("UTC"));
GregorianCalendar nowCal = GregorianCalendar.from(nowZdt);
java.util.Calendar originalCreationDate = docInfo.getCreationDate(); // Determine creation date from document info or use current time
GregorianCalendar creationCal; Instant creationInstant;
if (originalCreationDate == null) { Calendar originalCreationDate = docInfo.getCreationDate();
creationCal = nowCal; if (originalCreationDate != null) {
} else if (originalCreationDate instanceof GregorianCalendar) { creationInstant = originalCreationDate.toInstant();
creationCal = (GregorianCalendar) originalCreationDate;
} else { } else {
// convert other Calendar implementations to GregorianCalendar preserving instant creationInstant = nowInstant;
creationCal =
GregorianCalendar.from(
ZonedDateTime.ofInstant(
originalCreationDate.toInstant(), ZoneId.of("UTC")));
} }
ZonedDateTime creationZdt = ZonedDateTime.ofInstant(creationInstant, ZoneId.of("UTC"));
// Convert to GregorianCalendar for PDFBox API compatibility
GregorianCalendar creationCal = java.util.GregorianCalendar.from(creationZdt);
GregorianCalendar modificationCal = java.util.GregorianCalendar.from(nowZdt);
docInfo.setCreationDate(creationCal); docInfo.setCreationDate(creationCal);
xmpBasicSchema.setCreateDate(creationCal); xmpBasicSchema.setCreateDate(creationCal);
docInfo.setModificationDate(nowCal); docInfo.setModificationDate(modificationCal);
xmpBasicSchema.setModifyDate(nowCal); xmpBasicSchema.setModifyDate(modificationCal);
xmpBasicSchema.setMetadataDate(nowCal); xmpBasicSchema.setMetadataDate(modificationCal);
// Serialize the created metadata so it can be attached to the existent metadata // Serialize the created metadata so it can be attached to the existent metadata
ByteArrayOutputStream xmpOut = new ByteArrayOutputStream(); ByteArrayOutputStream xmpOut = new ByteArrayOutputStream();
@ -1033,22 +941,7 @@ public class ConvertPDFToPDFA {
document.getDocumentCatalog().setMetadata(newMetadata); document.getDocumentCatalog().setMetadata(newMetadata);
} }
private void addICCProfileIfNotPresent(PDDocument document) throws Exception { private static File preProcessHighlights(File inputPdf) throws Exception {
if (document.getDocumentCatalog().getOutputIntents().isEmpty()) {
try (InputStream colorProfile = getClass().getResourceAsStream("/icc/sRGB2014.icc")) {
PDOutputIntent outputIntent = new PDOutputIntent(document, colorProfile);
outputIntent.setInfo("sRGB IEC61966-2.1");
outputIntent.setOutputCondition("sRGB IEC61966-2.1");
outputIntent.setOutputConditionIdentifier("sRGB IEC61966-2.1");
outputIntent.setRegistryName("http://www.color.org");
document.getDocumentCatalog().addOutputIntent(outputIntent);
} catch (Exception e) {
log.error("Failed to load ICC profile: {}", e.getMessage());
}
}
}
private File preProcessHighlights(File inputPdf) throws Exception {
try (PDDocument document = Loader.loadPDF(inputPdf)) { try (PDDocument document = Loader.loadPDF(inputPdf)) {
@ -1127,17 +1020,17 @@ public class ConvertPDFToPDFA {
COSDictionary groupDict = COSDictionary groupDict =
(COSDictionary) pageDict.getDictionaryObject(COSName.GROUP); (COSDictionary) pageDict.getDictionaryObject(COSName.GROUP);
if (groupDict != null) { if (groupDict != null
if (COSName.TRANSPARENCY && COSName.TRANSPARENCY
.getName() .getName()
.equalsIgnoreCase(groupDict.getNameAsString(COSName.S))) { .equalsIgnoreCase(
groupDict.getNameAsString(COSName.S))) {
pageDict.removeItem(COSName.GROUP); pageDict.removeItem(COSName.GROUP);
} }
} }
} }
} }
} }
}
// Save the modified document to a temporary file. // Save the modified document to a temporary file.
File preProcessedFile = Files.createTempFile("preprocessed_", ".pdf").toFile(); File preProcessedFile = Files.createTempFile("preprocessed_", ".pdf").toFile();
document.save(preProcessedFile); document.save(preProcessedFile);
@ -1145,7 +1038,133 @@ public class ConvertPDFToPDFA {
} }
} }
/** Enum representing different PDF/A profiles */ private ResponseEntity<byte[]> handlePdfAConversion(
MultipartFile inputFile, String outputFormat) throws Exception {
PdfaProfile profile = PdfaProfile.fromRequest(outputFormat);
// Get the original filename without extension
String originalFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename());
if (originalFileName == null || originalFileName.trim().isEmpty()) {
originalFileName = "output.pdf";
}
String baseFileName =
originalFileName.contains(".")
? originalFileName.substring(0, originalFileName.lastIndexOf('.'))
: originalFileName;
Path workingDir = Files.createTempDirectory("pdfa_conversion_");
Path inputPath = workingDir.resolve("input.pdf");
inputFile.transferTo(inputPath);
try {
byte[] converted;
// Try Ghostscript first (preferred method)
if (isGhostscriptAvailable()) {
log.info("Using Ghostscript for PDF/A conversion to {}", profile.displayName());
try {
converted = convertWithGhostscript(inputPath, workingDir, profile);
String outputFilename = baseFileName + profile.outputSuffix();
validateAndWarnPdfA(converted, profile, "Ghostscript");
return WebResponseUtils.bytesToWebResponse(
converted, outputFilename, MediaType.APPLICATION_PDF);
} catch (Exception e) {
log.warn(
"Ghostscript conversion failed, falling back to PDFBox/LibreOffice method",
e);
}
} else {
log.info("Ghostscript not available, using PDFBox/LibreOffice fallback method");
}
converted = convertWithPdfBoxMethod(inputPath, profile);
String outputFilename = baseFileName + profile.outputSuffix();
// Validate with PDFBox preflight and warn if issues found
validateAndWarnPdfA(converted, profile, "PDFBox/LibreOffice");
return WebResponseUtils.bytesToWebResponse(
converted, outputFilename, MediaType.APPLICATION_PDF);
} finally {
deleteQuietly(workingDir);
}
}
private byte[] convertWithPdfBoxMethod(Path inputPath, PdfaProfile profile) throws Exception {
Path tempInputFile = null;
byte[] fileBytes;
Path loPdfPath = null;
File preProcessedFile = null;
int pdfaPart = profile.part();
try {
tempInputFile = inputPath;
if (pdfaPart == 2 || pdfaPart == 3) {
preProcessedFile = tempInputFile.toFile();
} else {
preProcessedFile = preProcessHighlights(tempInputFile.toFile());
}
Set<String> missingFonts;
boolean needImgs;
try (PDDocument doc = Loader.loadPDF(preProcessedFile)) {
missingFonts = findUnembeddedFontNames(doc);
needImgs = (pdfaPart == 1) && hasTransparentImages(doc);
if (!missingFonts.isEmpty() || needImgs) {
loPdfPath = runLibreOfficeConversion(preProcessedFile.toPath(), pdfaPart);
}
}
fileBytes =
convertToPdfA(
preProcessedFile.toPath(), loPdfPath, pdfaPart, missingFonts, needImgs);
return fileBytes;
} finally {
if (loPdfPath != null && loPdfPath.getParent() != null) {
FileUtils.deleteDirectory(loPdfPath.getParent().toFile());
}
if (preProcessedFile != null && !preProcessedFile.equals(tempInputFile.toFile())) {
Files.deleteIfExists(preProcessedFile.toPath());
}
}
}
private void copyResourceIcc(Path target) throws IOException {
try (InputStream in = getClass().getResourceAsStream(ICC_RESOURCE_PATH)) {
if (in == null) {
throw ExceptionUtils.createIllegalArgumentException(
"error.resourceNotFound", "Resource not found: {0}", ICC_RESOURCE_PATH);
}
Files.copy(in, target, StandardCopyOption.REPLACE_EXISTING);
}
}
private void addICCProfileIfNotPresent(PDDocument document) {
if (document.getDocumentCatalog().getOutputIntents().isEmpty()) {
try (InputStream colorProfile = getClass().getResourceAsStream("/icc/sRGB2014.icc")) {
if (colorProfile == null) {
throw ExceptionUtils.createIllegalArgumentException(
"error.resourceNotFound",
"Resource not found: {0}",
"/icc/sRGB2014.icc");
}
PDOutputIntent outputIntent = new PDOutputIntent(document, colorProfile);
outputIntent.setInfo("sRGB IEC61966-2.1");
outputIntent.setOutputCondition("sRGB IEC61966-2.1");
outputIntent.setOutputConditionIdentifier("sRGB IEC61966-2.1");
outputIntent.setRegistryName("http://www.color.org");
document.getDocumentCatalog().addOutputIntent(outputIntent);
} catch (Exception e) {
log.error("Failed to load ICC profile: {}", e.getMessage());
}
}
}
private enum PdfaProfile { private enum PdfaProfile {
PDF_A_1B(1, "PDF/A-1b", "_PDFA-1b.pdf", "1.4", Format.PDF_A1B, "pdfa-1"), PDF_A_1B(1, "PDF/A-1b", "_PDFA-1b.pdf", "1.4", Format.PDF_A1B, "pdfa-1"),
PDF_A_2B(2, "PDF/A-2b", "_PDFA-2b.pdf", "1.7", null, "pdfa", "pdfa-2", "pdfa-2b"), PDF_A_2B(2, "PDF/A-2b", "_PDFA-2b.pdf", "1.7", null, "pdfa", "pdfa-2", "pdfa-2b"),
@ -1212,9 +1231,9 @@ public class ConvertPDFToPDFA {
} }
private enum PdfXProfile { private enum PdfXProfile {
PDF_X_1(1, "PDF/X-1", "_PDFX-1.pdf", "1.3", "2001", "pdfx-1", "pdfx"), PDF_X_1("PDF/X-1", "_PDFX-1.pdf", "1.3", "2001", "pdfx-1", "pdfx"),
PDF_X_3(3, "PDF/X-3", "_PDFX-3.pdf", "1.3", "2003", "pdfx-3"), PDF_X_3("PDF/X-3", "_PDFX-3.pdf", "1.3", "2003", "pdfx-3"),
PDF_X_4(4, "PDF/X-4", "_PDFX-4.pdf", "1.4", "2008", "pdfx-4"); PDF_X_4("PDF/X-4", "_PDFX-4.pdf", "1.4", "2008", "pdfx-4");
private final String displayName; private final String displayName;
private final String suffix; private final String suffix;
@ -1223,7 +1242,6 @@ public class ConvertPDFToPDFA {
private final List<String> requestTokens; private final List<String> requestTokens;
PdfXProfile( PdfXProfile(
int version,
String displayName, String displayName,
String suffix, String suffix,
String compatibilityLevel, String compatibilityLevel,
@ -1270,6 +1288,5 @@ public class ConvertPDFToPDFA {
} }
} }
/** Record to hold color profile paths */
private record ColorProfiles(Path rgb, Path gray) {} private record ColorProfiles(Path rgb, Path gray) {}
} }