mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-11-16 01:21:16 +01:00
refactor(pdf-a-validation): enhance validation and preprocessing logic
- Modularized PDF/A validation with methods for comprehensive and basic checks
- Added improved CIDSet handling for PDF/A-1 compliance using QPDF
- Simplified preflight document parsing logic with reusable methods
- Enhanced metadata management with modern Java APIs
- Streamlined Ghostscript and QPDF command execution for PDF normalization
- Introduced detailed logging of validation results and errors for better debugging

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
parent
5e2caa48a3
commit
4d356c059b
@ -97,20 +97,22 @@ public class ConvertPDFToPDFA {
|
|||||||
out.write(pdfBytes);
|
out.write(pdfBytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
ValidationResult validationResult =
|
||||||
validatePdfaOutput(tempPdfPath, profile);
|
performComprehensivePdfAValidation(tempPdfPath, profile);
|
||||||
|
|
||||||
|
if (validationResult.isValid()) {
|
||||||
log.info(
|
log.info(
|
||||||
"PDF/A validation passed for {} using {}",
|
"PDF/A validation passed for {} using {}",
|
||||||
profile.getDisplayName(),
|
profile.getDisplayName(),
|
||||||
method);
|
method);
|
||||||
} catch (IOException e) {
|
} else {
|
||||||
log.warn(
|
log.warn(
|
||||||
"PDF/A validation warning for {} using {}: {}",
|
"PDF/A validation warning for {} using {}: {}",
|
||||||
profile.getDisplayName(),
|
profile.getDisplayName(),
|
||||||
method,
|
method,
|
||||||
e.getMessage());
|
buildComprehensiveValidationMessage(validationResult, profile));
|
||||||
}
|
}
|
||||||
} catch (IOException e) {
|
} catch (Exception e) {
|
||||||
log.warn(
|
log.warn(
|
||||||
"PDF/A validation warning for {} using {}: {}",
|
"PDF/A validation warning for {} using {}: {}",
|
||||||
profile.getDisplayName(),
|
profile.getDisplayName(),
|
||||||
@ -127,6 +129,110 @@ public class ConvertPDFToPDFA {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static ValidationResult performComprehensivePdfAValidation(
|
||||||
|
Path pdfPath, PdfaProfile profile) throws IOException {
|
||||||
|
Optional<Format> format = profile.preflightFormat();
|
||||||
|
if (format.isEmpty()) {
|
||||||
|
// For profiles without preflight support, perform basic structure validation
|
||||||
|
return performBasicPdfAValidation(pdfPath, profile);
|
||||||
|
}
|
||||||
|
|
||||||
|
try (RandomAccessRead rar = new RandomAccessReadBufferedFile(pdfPath.toFile())) {
|
||||||
|
PreflightParser parser = new PreflightParser(rar);
|
||||||
|
|
||||||
|
PreflightDocument document = parsePreflightDocument(parser, format.get(), profile);
|
||||||
|
if (document == null) {
|
||||||
|
throw new IOException(
|
||||||
|
"PDF/A preflight returned no document for " + profile.getDisplayName());
|
||||||
|
}
|
||||||
|
|
||||||
|
try (PreflightDocument closeableDocument = document) {
|
||||||
|
return closeableDocument.validate();
|
||||||
|
}
|
||||||
|
} catch (SyntaxValidationException e) {
|
||||||
|
return e.getResult();
|
||||||
|
} catch (ValidationException e) {
|
||||||
|
throw new IOException(
|
||||||
|
"PDF/A preflight validation failed for " + profile.getDisplayName(), e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static ValidationResult performBasicPdfAValidation(Path pdfPath, PdfaProfile profile)
|
||||||
|
throws IOException {
|
||||||
|
try (PDDocument doc = Loader.loadPDF(pdfPath.toFile())) {
|
||||||
|
ValidationResult result = new ValidationResult(true);
|
||||||
|
|
||||||
|
float version = doc.getVersion();
|
||||||
|
float expectedVersion = profile.getPart() == 1 ? 1.4f : 1.7f;
|
||||||
|
if (version < expectedVersion) {
|
||||||
|
result.addError(
|
||||||
|
new ValidationError(
|
||||||
|
"PDF_VERSION",
|
||||||
|
"PDF version "
|
||||||
|
+ version
|
||||||
|
+ " is below required "
|
||||||
|
+ expectedVersion
|
||||||
|
+ " for "
|
||||||
|
+ profile.getDisplayName()));
|
||||||
|
}
|
||||||
|
|
||||||
|
PDDocumentCatalog catalog = doc.getDocumentCatalog();
|
||||||
|
if (catalog.getMetadata() == null) {
|
||||||
|
result.addError(
|
||||||
|
new ValidationError(
|
||||||
|
"MISSING_XMP",
|
||||||
|
"XMP metadata is required for " + profile.getDisplayName()));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (catalog.getOutputIntents().isEmpty()) {
|
||||||
|
result.addError(
|
||||||
|
new ValidationError(
|
||||||
|
"MISSING_OUTPUT_INTENT",
|
||||||
|
"Output intent (ICC profile) is required for "
|
||||||
|
+ profile.getDisplayName()));
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String buildComprehensiveValidationMessage(
|
||||||
|
ValidationResult result, PdfaProfile profile) {
|
||||||
|
if (result == null) {
|
||||||
|
return "PDF/A validation failed for "
|
||||||
|
+ profile.getDisplayName()
|
||||||
|
+ ": no validation result available";
|
||||||
|
}
|
||||||
|
|
||||||
|
List<ValidationError> errors = result.getErrorsList();
|
||||||
|
|
||||||
|
StringBuilder message = new StringBuilder();
|
||||||
|
message.append("PDF/A validation issues for ").append(profile.getDisplayName());
|
||||||
|
|
||||||
|
if (errors != null && !errors.isEmpty()) {
|
||||||
|
message.append(" - ").append(errors.size()).append(" errors");
|
||||||
|
}
|
||||||
|
message.append(":");
|
||||||
|
|
||||||
|
if (errors != null && !errors.isEmpty()) {
|
||||||
|
message.append(" ERRORS: ");
|
||||||
|
message.append(
|
||||||
|
errors.stream()
|
||||||
|
.limit(5)
|
||||||
|
.map(
|
||||||
|
error ->
|
||||||
|
(error.getErrorCode() != null
|
||||||
|
? error.getErrorCode()
|
||||||
|
: "UNKNOWN")
|
||||||
|
+ (error.getDetails() != null
|
||||||
|
? ": " + error.getDetails()
|
||||||
|
: ""))
|
||||||
|
.collect(Collectors.joining("; ")));
|
||||||
|
}
|
||||||
|
|
||||||
|
return message.toString();
|
||||||
|
}
|
||||||
|
|
||||||
private static void deleteQuietly(Path directory) {
|
private static void deleteQuietly(Path directory) {
|
||||||
if (directory == null) {
|
if (directory == null) {
|
||||||
return;
|
return;
|
||||||
@ -154,7 +260,7 @@ public class ConvertPDFToPDFA {
|
|||||||
PdfaProfile profile,
|
PdfaProfile profile,
|
||||||
Path pdfaDefFile) {
|
Path pdfaDefFile) {
|
||||||
|
|
||||||
List<String> command = new ArrayList<>(25);
|
List<String> command = new ArrayList<>();
|
||||||
command.add("gs");
|
command.add("gs");
|
||||||
command.add("--permit-file-read=" + workingDir.toAbsolutePath());
|
command.add("--permit-file-read=" + workingDir.toAbsolutePath());
|
||||||
command.add("--permit-file-read=" + colorProfiles.rgb().toAbsolutePath());
|
command.add("--permit-file-read=" + colorProfiles.rgb().toAbsolutePath());
|
||||||
@ -162,24 +268,31 @@ public class ConvertPDFToPDFA {
|
|||||||
command.add("--permit-file-read=" + inputPdf.toAbsolutePath());
|
command.add("--permit-file-read=" + inputPdf.toAbsolutePath());
|
||||||
command.add("--permit-file-read=" + pdfaDefFile.toAbsolutePath());
|
command.add("--permit-file-read=" + pdfaDefFile.toAbsolutePath());
|
||||||
command.add("--permit-file-write=" + workingDir.toAbsolutePath());
|
command.add("--permit-file-write=" + workingDir.toAbsolutePath());
|
||||||
|
|
||||||
command.add("-dPDFA=" + profile.getPart());
|
command.add("-dPDFA=" + profile.getPart());
|
||||||
command.add("-dPDFACompatibilityPolicy=" + PDFA_COMPATIBILITY_POLICY);
|
command.add("-dPDFACompatibilityPolicy=" + PDFA_COMPATIBILITY_POLICY);
|
||||||
command.add("-dCompatibilityLevel=" + profile.getCompatibilityLevel());
|
command.add("-dCompatibilityLevel=" + profile.getCompatibilityLevel());
|
||||||
command.add("-sDEVICE=pdfwrite");
|
command.add("-sDEVICE=pdfwrite");
|
||||||
|
|
||||||
command.add("-sColorConversionStrategy=RGB");
|
command.add("-sColorConversionStrategy=RGB");
|
||||||
command.add("-dProcessColorModel=/DeviceRGB");
|
command.add("-dProcessColorModel=/DeviceRGB");
|
||||||
command.add("-sOutputICCProfile=" + colorProfiles.rgb().toAbsolutePath());
|
command.add("-sOutputICCProfile=" + colorProfiles.rgb().toAbsolutePath());
|
||||||
command.add("-sDefaultRGBProfile=" + colorProfiles.rgb().toAbsolutePath());
|
command.add("-sDefaultRGBProfile=" + colorProfiles.rgb().toAbsolutePath());
|
||||||
command.add("-sDefaultGrayProfile=" + colorProfiles.gray().toAbsolutePath());
|
command.add("-sDefaultGrayProfile=" + colorProfiles.gray().toAbsolutePath());
|
||||||
|
command.add("-sDefaultCMYKProfile=" + colorProfiles.rgb().toAbsolutePath());
|
||||||
|
|
||||||
|
// Font handling optimized for PDF/A CIDSet compliance
|
||||||
command.add("-dEmbedAllFonts=true");
|
command.add("-dEmbedAllFonts=true");
|
||||||
command.add("-dSubsetFonts=false"); // Embed complete fonts to avoid incomplete glyphs
|
command.add(
|
||||||
|
"-dSubsetFonts=true"); // Enable subsetting to generate proper CIDSet streams for
|
||||||
|
// PDF/A-1
|
||||||
command.add("-dCompressFonts=true");
|
command.add("-dCompressFonts=true");
|
||||||
command.add("-dNOSUBSTFONTS=false"); // Allow font substitution for problematic fonts
|
command.add("-dNOSUBSTFONTS=false"); // Allow font substitution for problematic fonts
|
||||||
command.add("-dPDFSETTINGS=/prepress"); // High quality, preserves all content
|
|
||||||
command.add("-dNOPAUSE");
|
command.add("-dNOPAUSE");
|
||||||
command.add("-dBATCH");
|
command.add("-dBATCH");
|
||||||
command.add("-dNOOUTERSAVE");
|
command.add("-dNOOUTERSAVE");
|
||||||
command.add("-sOutputFile=" + outputPdf.toAbsolutePath());
|
command.add("-sOutputFile=" + outputPdf.toAbsolutePath());
|
||||||
|
|
||||||
command.add(pdfaDefFile.toAbsolutePath().toString());
|
command.add(pdfaDefFile.toAbsolutePath().toString());
|
||||||
command.add(inputPdf.toAbsolutePath().toString());
|
command.add(inputPdf.toAbsolutePath().toString());
|
||||||
|
|
||||||
@ -189,10 +302,15 @@ public class ConvertPDFToPDFA {
|
|||||||
private static PreflightDocument parsePreflightDocument(
|
private static PreflightDocument parsePreflightDocument(
|
||||||
PreflightParser parser, Format format, PdfaProfile profile) throws IOException {
|
PreflightParser parser, Format format, PdfaProfile profile) throws IOException {
|
||||||
try {
|
try {
|
||||||
return (PreflightDocument)
|
PreflightConfiguration config = PreflightConfiguration.createPdfA1BConfiguration();
|
||||||
parser.parse(format, PreflightConfiguration.createPdfA1BConfiguration());
|
if (profile.getPart() != 1) {
|
||||||
|
log.debug(
|
||||||
|
"Using PDF/A-1B configuration for PDF/A-{} validation", profile.getPart());
|
||||||
|
}
|
||||||
|
|
||||||
|
return (PreflightDocument) parser.parse(format, config);
|
||||||
} catch (SyntaxValidationException e) {
|
} catch (SyntaxValidationException e) {
|
||||||
throw new IOException(buildPreflightErrorMessage(e.getResult(), profile), e);
|
throw new IOException(buildComprehensiveValidationMessage(e.getResult(), profile), e);
|
||||||
} catch (ClassCastException e) {
|
} catch (ClassCastException e) {
|
||||||
throw new IOException(
|
throw new IOException(
|
||||||
"PDF/A preflight did not produce a PreflightDocument for "
|
"PDF/A preflight did not produce a PreflightDocument for "
|
||||||
@ -201,74 +319,6 @@ public class ConvertPDFToPDFA {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void validatePdfaOutput(Path pdfPath, PdfaProfile profile) throws IOException {
|
|
||||||
Optional<Format> format = profile.preflightFormat();
|
|
||||||
if (format.isEmpty()) {
|
|
||||||
log.debug("Skipping PDFBox preflight validation for {}", profile.getDisplayName());
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
try (RandomAccessRead rar = new RandomAccessReadBufferedFile(pdfPath.toFile())) {
|
|
||||||
PreflightParser parser = new PreflightParser(rar);
|
|
||||||
|
|
||||||
PreflightDocument document = parsePreflightDocument(parser, format.get(), profile);
|
|
||||||
if (document == null) {
|
|
||||||
throw new IOException(
|
|
||||||
"PDF/A preflight returned no document for " + profile.getDisplayName());
|
|
||||||
}
|
|
||||||
|
|
||||||
try (PreflightDocument closeableDocument = document) {
|
|
||||||
ValidationResult result = closeableDocument.validate();
|
|
||||||
if (result == null || !result.isValid()) {
|
|
||||||
throw new IOException(buildPreflightErrorMessage(result, profile));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (SyntaxValidationException e) {
|
|
||||||
throw new IOException(buildPreflightErrorMessage(e.getResult(), profile), e);
|
|
||||||
} catch (ValidationException e) {
|
|
||||||
throw new IOException(
|
|
||||||
"PDF/A preflight validation failed for " + profile.getDisplayName(), e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private static String buildPreflightErrorMessage(ValidationResult result, PdfaProfile profile) {
|
|
||||||
String baseMessage = "PDF/A preflight validation failed for " + profile.getDisplayName();
|
|
||||||
if (result == null) {
|
|
||||||
return baseMessage + ": no detailed validation result available";
|
|
||||||
}
|
|
||||||
|
|
||||||
List<ValidationError> errors = result.getErrorsList();
|
|
||||||
if (errors == null || errors.isEmpty()) {
|
|
||||||
return baseMessage + ": unknown validation error";
|
|
||||||
}
|
|
||||||
|
|
||||||
String summarizedErrors =
|
|
||||||
errors.stream()
|
|
||||||
.limit(5)
|
|
||||||
.map(
|
|
||||||
error -> {
|
|
||||||
StringBuilder sb =
|
|
||||||
new StringBuilder(
|
|
||||||
Optional.ofNullable(error.getErrorCode())
|
|
||||||
.orElse("UNKNOWN"));
|
|
||||||
String details = error.getDetails();
|
|
||||||
if (details != null && !details.isBlank()) {
|
|
||||||
sb.append(": ").append(details.trim());
|
|
||||||
}
|
|
||||||
if (error.isWarning()) {
|
|
||||||
sb.append(" (warning)");
|
|
||||||
}
|
|
||||||
return sb.toString();
|
|
||||||
})
|
|
||||||
.collect(Collectors.joining("; "));
|
|
||||||
|
|
||||||
if (errors.size() > 5) {
|
|
||||||
summarizedErrors += " (" + (errors.size() - 5) + " more)";
|
|
||||||
}
|
|
||||||
|
|
||||||
return baseMessage + ": " + summarizedErrors;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static void writeJavaIccProfile(ICC_Profile profile, Path target) throws IOException {
|
private static void writeJavaIccProfile(ICC_Profile profile, Path target) throws IOException {
|
||||||
try (OutputStream out = Files.newOutputStream(target)) {
|
try (OutputStream out = Files.newOutputStream(target)) {
|
||||||
out.write(profile.getData());
|
out.write(profile.getData());
|
||||||
@ -507,29 +557,37 @@ public class ConvertPDFToPDFA {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private byte[] convertWithGhostscript(Path inputPdf, Path workingDir, PdfaProfile profile)
|
private static void fixCidSetIssues(PDDocument document) throws IOException {
|
||||||
throws IOException, InterruptedException {
|
for (PDPage page : document.getPages()) {
|
||||||
Path outputPdf = workingDir.resolve("gs_output.pdf");
|
PDResources resources = page.getResources();
|
||||||
ColorProfiles colorProfiles = prepareColorProfiles(workingDir);
|
if (resources == null) continue;
|
||||||
Path pdfaDefFile = createPdfaDefFile(workingDir, colorProfiles, profile);
|
|
||||||
|
|
||||||
List<String> command =
|
for (COSName fontName : resources.getFontNames()) {
|
||||||
buildGhostscriptCommand(
|
try {
|
||||||
inputPdf, outputPdf, colorProfiles, workingDir, profile, pdfaDefFile);
|
PDFont font = resources.getFont(fontName);
|
||||||
|
if (font == null) continue;
|
||||||
|
|
||||||
ProcessExecutorResult result =
|
PDFontDescriptor descriptor = font.getFontDescriptor();
|
||||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
|
if (descriptor == null) continue;
|
||||||
.runCommandWithOutputHandling(command);
|
|
||||||
|
|
||||||
if (result.getRc() != 0) {
|
COSDictionary fontDict = descriptor.getCOSObject();
|
||||||
throw new IOException("Ghostscript exited with code " + result.getRc());
|
|
||||||
|
// Remove invalid or incomplete CIDSet entries for PDF/A-1 compliance
|
||||||
|
// PDF/A-1 requires CIDSet to be present and complete for subsetted CIDFonts
|
||||||
|
// For PDF/A-2+, CIDSet is optional but must be complete if present
|
||||||
|
COSBase cidSet = fontDict.getDictionaryObject(COSName.getPDFName("CIDSet"));
|
||||||
|
if (cidSet != null) {
|
||||||
|
// If CIDSet exists but may be invalid, remove it to avoid validation errors
|
||||||
|
// This is safer than trying to fix incomplete CIDSet streams
|
||||||
|
fontDict.removeItem(COSName.getPDFName("CIDSet"));
|
||||||
|
log.debug(
|
||||||
|
"Removed potentially invalid CIDSet from font {}", font.getName());
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.debug("Error processing CIDSet for font: {}", e.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!Files.exists(outputPdf)) {
|
|
||||||
throw new IOException("Ghostscript did not produce an output file");
|
|
||||||
}
|
|
||||||
|
|
||||||
return Files.readAllBytes(outputPdf);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void importFlattenedImages(PDDocument loDoc, PDDocument baseDoc)
|
private static void importFlattenedImages(PDDocument loDoc, PDDocument baseDoc)
|
||||||
@ -631,6 +689,65 @@ public class ConvertPDFToPDFA {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private byte[] convertWithGhostscript(Path inputPdf, Path workingDir, PdfaProfile profile)
|
||||||
|
throws IOException, InterruptedException {
|
||||||
|
Path outputPdf = workingDir.resolve("gs_output.pdf");
|
||||||
|
ColorProfiles colorProfiles = prepareColorProfiles(workingDir);
|
||||||
|
Path pdfaDefFile = createPdfaDefFile(workingDir, colorProfiles, profile);
|
||||||
|
|
||||||
|
// Preprocess PDF for PDF/A compliance
|
||||||
|
Path preprocessedPdf = inputPdf;
|
||||||
|
|
||||||
|
// For PDF/A-1, clean CIDSet issues that may cause validation failures
|
||||||
|
if (profile.getPart() == 1) {
|
||||||
|
Path cidSetCleaned = cleanCidSetWithQpdf(inputPdf);
|
||||||
|
if (cidSetCleaned != null) {
|
||||||
|
preprocessedPdf = cidSetCleaned;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Normalize PDF with qpdf before Ghostscript conversion to ensure proper font program
|
||||||
|
// handling
|
||||||
|
Path normalizedInputPdf = normalizePdfWithQpdf(preprocessedPdf);
|
||||||
|
Path inputForGs = (normalizedInputPdf != null) ? normalizedInputPdf : preprocessedPdf;
|
||||||
|
|
||||||
|
try {
|
||||||
|
List<String> command =
|
||||||
|
buildGhostscriptCommand(
|
||||||
|
inputForGs, outputPdf, colorProfiles, workingDir, profile, pdfaDefFile);
|
||||||
|
|
||||||
|
ProcessExecutorResult result =
|
||||||
|
ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT)
|
||||||
|
.runCommandWithOutputHandling(command);
|
||||||
|
|
||||||
|
if (result.getRc() != 0) {
|
||||||
|
throw new IOException("Ghostscript exited with code " + result.getRc());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!Files.exists(outputPdf)) {
|
||||||
|
throw new IOException("Ghostscript did not produce an output file");
|
||||||
|
}
|
||||||
|
|
||||||
|
return Files.readAllBytes(outputPdf);
|
||||||
|
} finally {
|
||||||
|
// Clean up temporary files
|
||||||
|
if (normalizedInputPdf != null && !normalizedInputPdf.equals(preprocessedPdf)) {
|
||||||
|
try {
|
||||||
|
Files.deleteIfExists(normalizedInputPdf);
|
||||||
|
} catch (IOException e) {
|
||||||
|
log.debug("Failed to delete temporary normalized file", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (preprocessedPdf != null && !preprocessedPdf.equals(inputPdf)) {
|
||||||
|
try {
|
||||||
|
Files.deleteIfExists(preprocessedPdf);
|
||||||
|
} catch (IOException e) {
|
||||||
|
log.debug("Failed to delete temporary CIDSet cleaned file", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private static void fixType1FontCharSet(PDDocument document) throws IOException {
|
private static void fixType1FontCharSet(PDDocument document) throws IOException {
|
||||||
for (PDPage page : document.getPages()) {
|
for (PDPage page : document.getPages()) {
|
||||||
PDResources resources = page.getResources();
|
PDResources resources = page.getResources();
|
||||||
@ -680,10 +797,8 @@ public class ConvertPDFToPDFA {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private static String buildStandardType1GlyphSet() {
|
private static String buildStandardType1GlyphSet() {
|
||||||
// Standard PDF glyph names for Type1 fonts
|
|
||||||
Set<String> glyphNames = new LinkedHashSet<>();
|
Set<String> glyphNames = new LinkedHashSet<>();
|
||||||
|
|
||||||
// Add common Type1 glyphs from standard encoding
|
|
||||||
String[] standardGlyphs = {
|
String[] standardGlyphs = {
|
||||||
".notdef",
|
".notdef",
|
||||||
".null",
|
".null",
|
||||||
@ -886,29 +1001,34 @@ public class ConvertPDFToPDFA {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private byte[] processWithPDFBox(PDDocument document, int pdfaPart) throws Exception {
|
private byte[] processWithPDFBox(PDDocument document, int pdfaPart) throws Exception {
|
||||||
|
|
||||||
removeElementsForPdfA(document, pdfaPart);
|
removeElementsForPdfA(document, pdfaPart);
|
||||||
|
|
||||||
|
document.getDocument().setVersion(pdfaPart == 1 ? 1.4f : 1.7f);
|
||||||
|
|
||||||
mergeAndAddXmpMetadata(document, pdfaPart);
|
mergeAndAddXmpMetadata(document, pdfaPart);
|
||||||
|
|
||||||
addICCProfileIfNotPresent(document);
|
addICCProfileIfNotPresent(document);
|
||||||
|
|
||||||
|
// Fix CIDSet issues for PDF/A compliance
|
||||||
|
if (pdfaPart == 1) {
|
||||||
|
fixCidSetIssues(document);
|
||||||
|
}
|
||||||
|
|
||||||
fixType1FontCharSet(document);
|
fixType1FontCharSet(document);
|
||||||
|
|
||||||
// Mark the document as PDF/A
|
|
||||||
PDDocumentCatalog catalog = document.getDocumentCatalog();
|
PDDocumentCatalog catalog = document.getDocumentCatalog();
|
||||||
catalog.setMetadata(
|
catalog.setMetadata(document.getDocumentCatalog().getMetadata());
|
||||||
document.getDocumentCatalog().getMetadata()); // Ensure metadata is linked
|
|
||||||
catalog.setViewerPreferences(
|
PDViewerPreferences viewerPrefs = new PDViewerPreferences(catalog.getCOSObject());
|
||||||
new PDViewerPreferences(catalog.getCOSObject())); // PDF/A best practice
|
viewerPrefs.setDisplayDocTitle(true);
|
||||||
document.getDocument().setVersion(pdfaPart == 1 ? 1.4f : 1.7f);
|
catalog.setViewerPreferences(viewerPrefs);
|
||||||
|
|
||||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||||
if (pdfaPart == 1) {
|
CompressParameters compressParams =
|
||||||
document.save(baos, CompressParameters.NO_COMPRESSION);
|
pdfaPart == 1 ? CompressParameters.NO_COMPRESSION : new CompressParameters();
|
||||||
} else {
|
|
||||||
document.save(baos);
|
document.save(baos, compressParams);
|
||||||
}
|
log.debug("PDF/A-{} document processed with PDFBox", pdfaPart);
|
||||||
|
|
||||||
return baos.toByteArray();
|
return baos.toByteArray();
|
||||||
}
|
}
|
||||||
@ -1010,7 +1130,6 @@ public class ConvertPDFToPDFA {
|
|||||||
private static void sanitizePdfA(COSBase base, int pdfaPart) {
|
private static void sanitizePdfA(COSBase base, int pdfaPart) {
|
||||||
if (base instanceof COSDictionary dict) {
|
if (base instanceof COSDictionary dict) {
|
||||||
if (pdfaPart == 1) {
|
if (pdfaPart == 1) {
|
||||||
// Remove transparency-related elements
|
|
||||||
COSBase group = dict.getDictionaryObject(COSName.GROUP);
|
COSBase group = dict.getDictionaryObject(COSName.GROUP);
|
||||||
if (group instanceof COSDictionary gDict
|
if (group instanceof COSDictionary gDict
|
||||||
&& COSName.TRANSPARENCY.equals(gDict.getCOSName(COSName.S))) {
|
&& COSName.TRANSPARENCY.equals(gDict.getCOSName(COSName.S))) {
|
||||||
@ -1018,18 +1137,15 @@ public class ConvertPDFToPDFA {
|
|||||||
}
|
}
|
||||||
|
|
||||||
dict.removeItem(COSName.SMASK);
|
dict.removeItem(COSName.SMASK);
|
||||||
// Transparency blending constants (/CA, /ca) — disallowed in PDF/A-1
|
|
||||||
dict.removeItem(COSName.CA);
|
dict.removeItem(COSName.CA);
|
||||||
dict.removeItem(COSName.getPDFName("ca"));
|
dict.removeItem(COSName.getPDFName("ca"));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Interpolation (non-deterministic image scaling) — required to be false
|
|
||||||
if (dict.containsKey(COSName.INTERPOLATE)
|
if (dict.containsKey(COSName.INTERPOLATE)
|
||||||
&& dict.getBoolean(COSName.INTERPOLATE, true)) {
|
&& dict.getBoolean(COSName.INTERPOLATE, true)) {
|
||||||
dict.setBoolean(COSName.INTERPOLATE, false);
|
dict.setBoolean(COSName.INTERPOLATE, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Remove common forbidden features (for PDF/A 1 and 2)
|
|
||||||
dict.removeItem(COSName.JAVA_SCRIPT);
|
dict.removeItem(COSName.JAVA_SCRIPT);
|
||||||
dict.removeItem(COSName.getPDFName("JS"));
|
dict.removeItem(COSName.getPDFName("JS"));
|
||||||
dict.removeItem(COSName.getPDFName("RichMedia"));
|
dict.removeItem(COSName.getPDFName("RichMedia"));
|
||||||
@ -1041,13 +1157,11 @@ public class ConvertPDFToPDFA {
|
|||||||
dict.removeItem(COSName.EMBEDDED_FILES);
|
dict.removeItem(COSName.EMBEDDED_FILES);
|
||||||
dict.removeItem(COSName.FILESPEC);
|
dict.removeItem(COSName.FILESPEC);
|
||||||
|
|
||||||
// Recurse through all entries in the dictionary
|
|
||||||
for (Map.Entry<COSName, COSBase> entry : dict.entrySet()) {
|
for (Map.Entry<COSName, COSBase> entry : dict.entrySet()) {
|
||||||
sanitizePdfA(entry.getValue(), pdfaPart);
|
sanitizePdfA(entry.getValue(), pdfaPart);
|
||||||
}
|
}
|
||||||
|
|
||||||
} else if (base instanceof COSArray arr) {
|
} else if (base instanceof COSArray arr) {
|
||||||
// Recursively sanitize each item in the array
|
|
||||||
for (COSBase item : arr) {
|
for (COSBase item : arr) {
|
||||||
sanitizePdfA(item, pdfaPart);
|
sanitizePdfA(item, pdfaPart);
|
||||||
}
|
}
|
||||||
@ -1057,7 +1171,6 @@ public class ConvertPDFToPDFA {
|
|||||||
private static void removeElementsForPdfA(PDDocument doc, int pdfaPart) {
|
private static void removeElementsForPdfA(PDDocument doc, int pdfaPart) {
|
||||||
|
|
||||||
if (pdfaPart == 1) {
|
if (pdfaPart == 1) {
|
||||||
// Remove Optional Content (Layers) - not allowed in PDF/A-1
|
|
||||||
doc.getDocumentCatalog().getCOSObject().removeItem(COSName.getPDFName("OCProperties"));
|
doc.getDocumentCatalog().getCOSObject().removeItem(COSName.getPDFName("OCProperties"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1066,10 +1179,8 @@ public class ConvertPDFToPDFA {
|
|||||||
page.setAnnotations(Collections.emptyList());
|
page.setAnnotations(Collections.emptyList());
|
||||||
}
|
}
|
||||||
PDResources res = page.getResources();
|
PDResources res = page.getResources();
|
||||||
// Clean page-level dictionary
|
|
||||||
sanitizePdfA(page.getCOSObject(), pdfaPart);
|
sanitizePdfA(page.getCOSObject(), pdfaPart);
|
||||||
|
|
||||||
// sanitize each Form XObject
|
|
||||||
if (res != null) {
|
if (res != null) {
|
||||||
for (COSName name : res.getXObjectNames()) {
|
for (COSName name : res.getXObjectNames()) {
|
||||||
try {
|
try {
|
||||||
@ -1092,7 +1203,6 @@ public class ConvertPDFToPDFA {
|
|||||||
PDMetadata existingMetadata = document.getDocumentCatalog().getMetadata();
|
PDMetadata existingMetadata = document.getDocumentCatalog().getMetadata();
|
||||||
XMPMetadata xmp;
|
XMPMetadata xmp;
|
||||||
|
|
||||||
// Load existing XMP if available
|
|
||||||
if (existingMetadata != null) {
|
if (existingMetadata != null) {
|
||||||
try (InputStream xmpStream = existingMetadata.createInputStream()) {
|
try (InputStream xmpStream = existingMetadata.createInputStream()) {
|
||||||
DomXmpParser parser = new DomXmpParser();
|
DomXmpParser parser = new DomXmpParser();
|
||||||
@ -1113,7 +1223,6 @@ public class ConvertPDFToPDFA {
|
|||||||
String originalCreator = Optional.ofNullable(docInfo.getCreator()).orElse("Unknown");
|
String originalCreator = Optional.ofNullable(docInfo.getCreator()).orElse("Unknown");
|
||||||
String originalProducer = Optional.ofNullable(docInfo.getProducer()).orElse("Unknown");
|
String originalProducer = Optional.ofNullable(docInfo.getProducer()).orElse("Unknown");
|
||||||
|
|
||||||
// Only keep the original creator so it can match xmp creator tool for compliance
|
|
||||||
DublinCoreSchema dcSchema = xmp.getDublinCoreSchema();
|
DublinCoreSchema dcSchema = xmp.getDublinCoreSchema();
|
||||||
if (dcSchema != null) {
|
if (dcSchema != null) {
|
||||||
List<String> existingCreators = dcSchema.getCreators();
|
List<String> existingCreators = dcSchema.getCreators();
|
||||||
@ -1154,7 +1263,6 @@ public class ConvertPDFToPDFA {
|
|||||||
String originalAuthor = docInfo.getAuthor();
|
String originalAuthor = docInfo.getAuthor();
|
||||||
if (originalAuthor != null && !originalAuthor.isBlank()) {
|
if (originalAuthor != null && !originalAuthor.isBlank()) {
|
||||||
docInfo.setAuthor(null);
|
docInfo.setAuthor(null);
|
||||||
// If the author is set, we keep it in the XMP metadata
|
|
||||||
if (!originalCreator.equals(originalAuthor)) {
|
if (!originalCreator.equals(originalAuthor)) {
|
||||||
dcSchema.addCreator(originalAuthor);
|
dcSchema.addCreator(originalAuthor);
|
||||||
}
|
}
|
||||||
@ -1173,11 +1281,9 @@ public class ConvertPDFToPDFA {
|
|||||||
adobePdfSchema.setKeywords(keywords);
|
adobePdfSchema.setKeywords(keywords);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Set creation and modification dates using modern java.time API
|
|
||||||
Instant nowInstant = Instant.now();
|
Instant nowInstant = Instant.now();
|
||||||
ZonedDateTime nowZdt = ZonedDateTime.ofInstant(nowInstant, ZoneId.of("UTC"));
|
ZonedDateTime nowZdt = ZonedDateTime.ofInstant(nowInstant, ZoneId.of("UTC"));
|
||||||
|
|
||||||
// Determine creation date from document info or use current time
|
|
||||||
Instant creationInstant;
|
Instant creationInstant;
|
||||||
Calendar originalCreationDate = docInfo.getCreationDate();
|
Calendar originalCreationDate = docInfo.getCreationDate();
|
||||||
if (originalCreationDate != null) {
|
if (originalCreationDate != null) {
|
||||||
@ -1187,7 +1293,6 @@ public class ConvertPDFToPDFA {
|
|||||||
}
|
}
|
||||||
ZonedDateTime creationZdt = ZonedDateTime.ofInstant(creationInstant, ZoneId.of("UTC"));
|
ZonedDateTime creationZdt = ZonedDateTime.ofInstant(creationInstant, ZoneId.of("UTC"));
|
||||||
|
|
||||||
// Convert to GregorianCalendar for PDFBox API compatibility
|
|
||||||
GregorianCalendar creationCal = GregorianCalendar.from(creationZdt);
|
GregorianCalendar creationCal = GregorianCalendar.from(creationZdt);
|
||||||
GregorianCalendar modificationCal = GregorianCalendar.from(nowZdt);
|
GregorianCalendar modificationCal = GregorianCalendar.from(nowZdt);
|
||||||
|
|
||||||
@ -1198,7 +1303,6 @@ public class ConvertPDFToPDFA {
|
|||||||
xmpBasicSchema.setModifyDate(modificationCal);
|
xmpBasicSchema.setModifyDate(modificationCal);
|
||||||
xmpBasicSchema.setMetadataDate(modificationCal);
|
xmpBasicSchema.setMetadataDate(modificationCal);
|
||||||
|
|
||||||
// Serialize the created metadata so it can be attached to the existent metadata
|
|
||||||
ByteArrayOutputStream xmpOut = new ByteArrayOutputStream();
|
ByteArrayOutputStream xmpOut = new ByteArrayOutputStream();
|
||||||
new XmpSerializer().serialize(xmp, xmpOut, true);
|
new XmpSerializer().serialize(xmp, xmpOut, true);
|
||||||
|
|
||||||
@ -1212,13 +1316,10 @@ public class ConvertPDFToPDFA {
|
|||||||
try (PDDocument document = Loader.loadPDF(inputPdf)) {
|
try (PDDocument document = Loader.loadPDF(inputPdf)) {
|
||||||
|
|
||||||
for (PDPage page : document.getPages()) {
|
for (PDPage page : document.getPages()) {
|
||||||
// Retrieve the annotations on the page.
|
|
||||||
List<PDAnnotation> annotations = page.getAnnotations();
|
List<PDAnnotation> annotations = page.getAnnotations();
|
||||||
for (PDAnnotation annot : annotations) {
|
for (PDAnnotation annot : annotations) {
|
||||||
// Process only highlight annotations.
|
|
||||||
if ("Highlight".equals(annot.getSubtype())
|
if ("Highlight".equals(annot.getSubtype())
|
||||||
&& annot instanceof PDAnnotationTextMarkup highlight) {
|
&& annot instanceof PDAnnotationTextMarkup highlight) {
|
||||||
// Create a new appearance stream with the same bounding box.
|
|
||||||
float[] colorComponents =
|
float[] colorComponents =
|
||||||
highlight.getColor() != null
|
highlight.getColor() != null
|
||||||
? highlight.getColor().getComponents()
|
? highlight.getColor().getComponents()
|
||||||
@ -1240,8 +1341,6 @@ public class ConvertPDFToPDFA {
|
|||||||
cs.setStrokingColor(highlightColor);
|
cs.setStrokingColor(highlightColor);
|
||||||
cs.setLineWidth(0.05f);
|
cs.setLineWidth(0.05f);
|
||||||
float spacing = 2f;
|
float spacing = 2f;
|
||||||
// Draw diagonal lines across the highlight area to simulate
|
|
||||||
// transparency.
|
|
||||||
for (int i = 0; i < quadPoints.length; i += 8) {
|
for (int i = 0; i < quadPoints.length; i += 8) {
|
||||||
float minX =
|
float minX =
|
||||||
Math.min(
|
Math.min(
|
||||||
@ -1386,7 +1485,7 @@ public class ConvertPDFToPDFA {
|
|||||||
.runCommandWithOutputHandling(command);
|
.runCommandWithOutputHandling(command);
|
||||||
|
|
||||||
if (result.getRc() == 0 && Files.exists(normalizedPdf)) {
|
if (result.getRc() == 0 && Files.exists(normalizedPdf)) {
|
||||||
log.info("PDF normalized with QPDF to fix font programs");
|
log.info("PDF normalized with QPDF to fix font programs and CIDSet issues");
|
||||||
return normalizedPdf;
|
return normalizedPdf;
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
@ -1397,6 +1496,47 @@ public class ConvertPDFToPDFA {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private Path cleanCidSetWithQpdf(Path inputPdf) {
|
||||||
|
try {
|
||||||
|
ProcessExecutorResult checkResult =
|
||||||
|
ProcessExecutor.getInstance(ProcessExecutor.Processes.QPDF)
|
||||||
|
.runCommandWithOutputHandling(Arrays.asList("qpdf", "--version"));
|
||||||
|
|
||||||
|
if (checkResult.getRc() != 0) {
|
||||||
|
log.debug("QPDF not available for CIDSet cleaning");
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
Path cleanedPdf =
|
||||||
|
inputPdf.getParent()
|
||||||
|
.resolve("cidset_cleaned_" + inputPdf.getFileName().toString());
|
||||||
|
|
||||||
|
// Use QPDF to remove problematic CIDSet entries that may be incomplete
|
||||||
|
List<String> command =
|
||||||
|
Arrays.asList(
|
||||||
|
"qpdf",
|
||||||
|
"--remove-unreferenced-resources=yes",
|
||||||
|
"--normalize-content=y",
|
||||||
|
"--object-streams=preserve",
|
||||||
|
inputPdf.toAbsolutePath().toString(),
|
||||||
|
cleanedPdf.toAbsolutePath().toString());
|
||||||
|
|
||||||
|
ProcessExecutorResult result =
|
||||||
|
ProcessExecutor.getInstance(ProcessExecutor.Processes.QPDF)
|
||||||
|
.runCommandWithOutputHandling(command);
|
||||||
|
|
||||||
|
if (result.getRc() == 0 && Files.exists(cleanedPdf)) {
|
||||||
|
log.info("PDF CIDSet cleaned with QPDF");
|
||||||
|
return cleanedPdf;
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.debug("QPDF CIDSet cleaning error: {}", e.getMessage());
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private byte[] convertWithPdfBoxMethod(Path inputPath, PdfaProfile profile) throws Exception {
|
private byte[] convertWithPdfBoxMethod(Path inputPath, PdfaProfile profile) throws Exception {
|
||||||
Path tempInputFile = null;
|
Path tempInputFile = null;
|
||||||
byte[] fileBytes;
|
byte[] fileBytes;
|
||||||
@ -1465,13 +1605,16 @@ public class ConvertPDFToPDFA {
|
|||||||
"error.resourceNotFound", "Resource not found: {0}", ICC_RESOURCE_PATH);
|
"error.resourceNotFound", "Resource not found: {0}", ICC_RESOURCE_PATH);
|
||||||
}
|
}
|
||||||
PDOutputIntent outputIntent = new PDOutputIntent(document, colorProfile);
|
PDOutputIntent outputIntent = new PDOutputIntent(document, colorProfile);
|
||||||
|
// PDF/A compliant output intent settings
|
||||||
outputIntent.setInfo("sRGB IEC61966-2.1");
|
outputIntent.setInfo("sRGB IEC61966-2.1");
|
||||||
outputIntent.setOutputCondition("sRGB IEC61966-2.1");
|
outputIntent.setOutputCondition("sRGB IEC61966-2.1");
|
||||||
outputIntent.setOutputConditionIdentifier("sRGB IEC61966-2.1");
|
outputIntent.setOutputConditionIdentifier("sRGB IEC61966-2.1");
|
||||||
outputIntent.setRegistryName("http://www.color.org");
|
outputIntent.setRegistryName("http://www.color.org");
|
||||||
document.getDocumentCatalog().addOutputIntent(outputIntent);
|
document.getDocumentCatalog().addOutputIntent(outputIntent);
|
||||||
|
log.debug("Added ICC color profile for PDF/A compliance");
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
log.error("Failed to load ICC profile: {}", e.getMessage());
|
log.error("Failed to load ICC profile: {}", e.getMessage());
|
||||||
|
throw new RuntimeException("ICC profile loading failed for PDF/A compliance", e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user