From 40b80a7baa8ea991e26a2be67b4644bc4e62116e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20Sz=C3=BCcs?= Date: Fri, 7 Nov 2025 22:57:33 +0100 Subject: [PATCH 01/14] feat(pdf-conversion): add support for PDF/A-3b, PDF/X formats improve current PDF/A conversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Updated `PdfToPdfARequest` to include PDF/X in supported output formats - Expanded input handling and model validation for PDF/A and PDF/X - Added Ghostscript as a preferred backend for PDF/A and PDF/X conversions - Implemented PDF/X-specific conversion logic with detailed validation - Updated UI templates to separate PDF/A and PDF/X format options - Enhanced error handling and warnings during conversion processes - Revised localized strings to reflect expanded functionality Signed-off-by: Balázs Szücs --- .../api/converters/ConvertPDFToPDFA.java | 698 ++++++++++++++++-- .../api/converters/PdfToPdfARequest.java | 7 +- .../main/resources/messages_en_US.properties | 12 +- .../templates/convert/pdf-to-pdfa.html | 13 +- 4 files changed, 652 insertions(+), 78 deletions(-) diff --git a/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToPDFA.java b/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToPDFA.java index 16e1a65e7..41e3726e1 100644 --- a/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToPDFA.java +++ b/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToPDFA.java @@ -1,24 +1,32 @@ package stirling.software.SPDF.controller.api.converters; import java.awt.Color; +import java.awt.color.ColorSpace; +import java.awt.color.ICC_Profile; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; +import java.io.OutputStream; import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.StandardCopyOption; import java.time.Instant; import java.time.ZoneId; import java.time.ZonedDateTime; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.Comparator; import java.util.GregorianCalendar; import java.util.HashSet; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Optional; import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; import org.apache.commons.io.FileUtils; import org.apache.pdfbox.Loader; @@ -26,6 +34,8 @@ import org.apache.pdfbox.cos.COSArray; import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSDictionary; import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.io.RandomAccessRead; +import org.apache.pdfbox.io.RandomAccessReadBufferedFile; import org.apache.pdfbox.pdfwriter.compress.CompressParameters; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocumentCatalog; @@ -47,6 +57,14 @@ import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup; import org.apache.pdfbox.pdmodel.interactive.viewerpreferences.PDViewerPreferences; +import org.apache.pdfbox.preflight.Format; +import org.apache.pdfbox.preflight.PreflightConfiguration; +import org.apache.pdfbox.preflight.PreflightDocument; +import org.apache.pdfbox.preflight.ValidationResult; +import org.apache.pdfbox.preflight.ValidationResult.ValidationError; +import org.apache.pdfbox.preflight.exception.SyntaxValidationException; +import org.apache.pdfbox.preflight.exception.ValidationException; +import org.apache.pdfbox.preflight.parser.PreflightParser; import org.apache.xmpbox.XMPMetadata; import org.apache.xmpbox.schema.AdobePDFSchema; import org.apache.xmpbox.schema.DublinCoreSchema; @@ -80,11 +98,304 @@ import stirling.software.common.util.WebResponseUtils; @Tag(name = "Convert", description = "Convert APIs") public class ConvertPDFToPDFA { + private static final String ICC_RESOURCE_PATH = "/icc/sRGB2014.icc"; + private static final int PDFA_COMPATIBILITY_POLICY = 1; + + private static void validateAndWarnPdfA(byte[] pdfBytes, PdfaProfile profile, String method) { + Path tempPdfPath = null; + try { + tempPdfPath = Files.createTempFile("validate_", ".pdf"); + Files.write(tempPdfPath, pdfBytes); + validatePdfaOutput(tempPdfPath, profile); + log.info("PDF/A validation passed for {} using {}", profile.displayName(), method); + } catch (IOException e) { + log.warn( + "PDF/A validation warning for {} using {}: {}", + profile.displayName(), + method, + e.getMessage()); + } finally { + if (tempPdfPath != null) { + try { + Files.deleteIfExists(tempPdfPath); + } catch (IOException e) { + log.debug("Failed to delete temporary validation file", e); + } + } + } + } + + private static void deleteQuietly(Path directory) { + if (directory == null) { + return; + } + try (Stream stream = Files.walk(directory)) { + stream.sorted(Comparator.reverseOrder()) + .forEach( + path -> { + try { + Files.deleteIfExists(path); + } catch (IOException e) { + log.warn("Failed to delete temporary file: {}", path, e); + } + }); + } catch (IOException e) { + log.warn("Failed to clean temporary directory: {}", directory, e); + } + } + + private static List buildGhostscriptCommand( + Path inputPdf, + Path outputPdf, + ColorProfiles colorProfiles, + Path workingDir, + PdfaProfile profile, + Path pdfaDefFile) { + + List command = new ArrayList<>(); + command.add("gs"); + command.add("--permit-file-read=" + workingDir.toAbsolutePath()); + command.add("--permit-file-read=" + colorProfiles.rgb().toAbsolutePath()); + command.add("--permit-file-read=" + colorProfiles.gray().toAbsolutePath()); + command.add("--permit-file-read=" + inputPdf.toAbsolutePath()); + command.add("--permit-file-read=" + pdfaDefFile.toAbsolutePath()); + command.add("--permit-file-write=" + workingDir.toAbsolutePath()); + command.add("-dPDFA=" + profile.part()); + command.add("-dPDFACompatibilityPolicy=" + PDFA_COMPATIBILITY_POLICY); + command.add("-dCompatibilityLevel=" + profile.compatibilityLevel()); + command.add("-sDEVICE=pdfwrite"); + command.add("-sColorConversionStrategy=RGB"); + command.add("-dProcessColorModel=/DeviceRGB"); + command.add("-sOutputICCProfile=" + colorProfiles.rgb().toAbsolutePath()); + command.add("-sDefaultRGBProfile=" + colorProfiles.rgb().toAbsolutePath()); + command.add("-sDefaultGrayProfile=" + colorProfiles.gray().toAbsolutePath()); + command.add("-dEmbedAllFonts=true"); + command.add("-dSubsetFonts=true"); + command.add("-dCompressFonts=true"); + command.add("-dNOPAUSE"); + command.add("-dBATCH"); + command.add("-dNOOUTERSAVE"); + command.add("-sOutputFile=" + outputPdf.toAbsolutePath()); + command.add(pdfaDefFile.toAbsolutePath().toString()); + command.add(inputPdf.toAbsolutePath().toString()); + + return command; + } + + private static void validatePdfaOutput(Path pdfPath, PdfaProfile profile) throws IOException { + Optional format = profile.preflightFormat(); + if (format.isEmpty()) { + log.debug("Skipping PDFBox preflight validation for {}", profile.displayName()); + return; + } + + try (RandomAccessRead rar = new RandomAccessReadBufferedFile(pdfPath.toFile())) { + PreflightParser parser = new PreflightParser(rar); + + PreflightDocument document; + try { + document = + (PreflightDocument) + parser.parse( + format.get(), + PreflightConfiguration.createPdfA1BConfiguration()); + } catch (SyntaxValidationException e) { + throw new IOException(buildPreflightErrorMessage(e.getResult(), profile), e); + } catch (ClassCastException e) { + throw new IOException( + "PDF/A preflight did not produce a PreflightDocument for " + + profile.displayName(), + e); + } + + if (document == null) { + throw new IOException( + "PDF/A preflight returned no document for " + profile.displayName()); + } + + try (PreflightDocument closeableDocument = document) { + ValidationResult result = closeableDocument.validate(); + if (result == null || !result.isValid()) { + throw new IOException(buildPreflightErrorMessage(result, profile)); + } + } + } catch (SyntaxValidationException e) { + throw new IOException(buildPreflightErrorMessage(e.getResult(), profile), e); + } catch (ValidationException e) { + throw new IOException( + "PDF/A preflight validation failed for " + profile.displayName(), e); + } + } + + private static String buildPreflightErrorMessage(ValidationResult result, PdfaProfile profile) { + String baseMessage = "PDF/A preflight validation failed for " + profile.displayName(); + if (result == null) { + return baseMessage + ": no detailed validation result available"; + } + + List errors = result.getErrorsList(); + if (errors == null || errors.isEmpty()) { + return baseMessage + ": unknown validation error"; + } + + String summarizedErrors = + errors.stream() + .limit(5) + .map( + error -> { + StringBuilder sb = + new StringBuilder( + Optional.ofNullable(error.getErrorCode()) + .orElse("UNKNOWN")); + String details = error.getDetails(); + if (details != null && !details.isBlank()) { + sb.append(": ").append(details.trim()); + } + if (error.isWarning()) { + sb.append(" (warning)"); + } + return sb.toString(); + }) + .collect(Collectors.joining("; ")); + + if (errors.size() > 5) { + summarizedErrors += " (" + (errors.size() - 5) + " more)"; + } + + return baseMessage + ": " + summarizedErrors; + } + + private static void writeJavaIccProfile(ICC_Profile profile, Path target) throws IOException { + try (OutputStream out = Files.newOutputStream(target)) { + out.write(profile.getData()); + } + } + + private static Path createPdfaDefFile( + Path workingDir, ColorProfiles colorProfiles, PdfaProfile profile) throws IOException { + Path pdfaDefFile = workingDir.resolve("PDFA_def.ps"); + + String title = "Converted to " + profile.displayName(); + String pdfaDefContent = + String.format( + "%% This is a sample prefix file for creating a PDF/A document.\n" + + "%% Feel free to modify entries marked with \"Customize\".\n\n" + + "%% Define entries in the document Info dictionary.\n" + + "[/Title (%s)\n" + + " /DOCINFO pdfmark\n\n" + + "%% Define an ICC profile.\n" + + "[/_objdef {icc_PDFA} /type /stream /OBJ pdfmark\n" + + "[{icc_PDFA} <<\n" + + " /N 3\n" + + ">> /PUT pdfmark\n" + + "[{icc_PDFA} (%s) (r) file /PUT pdfmark\n\n" + + "%% Define the output intent dictionary.\n" + + "[/_objdef {OutputIntent_PDFA} /type /dict /OBJ pdfmark\n" + + "[{OutputIntent_PDFA} <<\n" + + " /Type /OutputIntent\n" + + " /S /GTS_PDFA1\n" + + " /DestOutputProfile {icc_PDFA}\n" + + " /OutputConditionIdentifier (sRGB IEC61966-2.1)\n" + + " /Info (sRGB IEC61966-2.1)\n" + + " /RegistryName (http://www.color.org)\n" + + ">> /PUT pdfmark\n" + + "[{Catalog} <> /PUT pdfmark\n", + title, colorProfiles.rgb().toAbsolutePath().toString().replace("\\", "/")); + + Files.writeString(pdfaDefFile, pdfaDefContent); + return pdfaDefFile; + } + + private static List buildGhostscriptCommandX( + Path inputPdf, + Path outputPdf, + ColorProfiles colorProfiles, + Path workingDir, + PdfXProfile profile) { + + List command = new ArrayList<>(); + command.add("gs"); + command.add("--permit-file-read=" + workingDir.toAbsolutePath()); + command.add("--permit-file-read=" + colorProfiles.rgb().toAbsolutePath()); + command.add("--permit-file-read=" + colorProfiles.gray().toAbsolutePath()); + command.add("--permit-file-read=" + inputPdf.toAbsolutePath()); + command.add("--permit-file-write=" + workingDir.toAbsolutePath()); + command.add("-dPDFX=" + profile.pdfxVersion()); + command.add("-dCompatibilityLevel=" + profile.compatibilityLevel()); + command.add("-sDEVICE=pdfwrite"); + command.add("-sColorConversionStrategy=RGB"); + command.add("-dProcessColorModel=/DeviceRGB"); + command.add("-sOutputICCProfile=" + colorProfiles.rgb().toAbsolutePath()); + command.add("-sDefaultRGBProfile=" + colorProfiles.rgb().toAbsolutePath()); + command.add("-sDefaultGrayProfile=" + colorProfiles.gray().toAbsolutePath()); + command.add("-dEmbedAllFonts=true"); + command.add("-dSubsetFonts=true"); + command.add("-dCompressFonts=true"); + command.add("-dNOPAUSE"); + command.add("-dBATCH"); + command.add("-dNOOUTERSAVE"); + command.add("-sOutputFile=" + outputPdf.toAbsolutePath()); + command.add(inputPdf.toAbsolutePath().toString()); + + return command; + } + + private static void embedMissingFonts(PDDocument loDoc, PDDocument baseDoc, Set missingFonts) + throws IOException { + List loPages = new ArrayList<>(); + loDoc.getPages().forEach(loPages::add); + List basePages = new ArrayList<>(); + baseDoc.getPages().forEach(basePages::add); + + for (int i = 0; i < loPages.size(); i++) { + PDResources loRes = loPages.get(i).getResources(); + PDResources baseRes = basePages.get(i).getResources(); + + for (COSName fontKey : loRes.getFontNames()) { + PDFont loFont = loRes.getFont(fontKey); + if (loFont == null) continue; + + String psName = loFont.getName(); + if (!missingFonts.contains(psName)) continue; + + PDFontDescriptor desc = loFont.getFontDescriptor(); + if (desc == null) continue; + + PDStream fontStream = null; + if (desc.getFontFile() != null) { + fontStream = desc.getFontFile(); + } else if (desc.getFontFile2() != null) { + fontStream = desc.getFontFile2(); + } else if (desc.getFontFile3() != null) { + fontStream = desc.getFontFile3(); + } + if (fontStream == null) continue; + + try (InputStream in = fontStream.createInputStream()) { + PDFont newFont; + try { + newFont = PDType0Font.load(baseDoc, in, false); + } catch (IOException e1) { + try { + newFont = PDTrueTypeFont.load(baseDoc, in, null); + } catch (IOException | IllegalArgumentException e2) { + log.error("Could not embed font {}: {}", psName, e2.getMessage()); + continue; + } + } + if (newFont != null) { + baseRes.put(fontKey, newFont); + } + } + } + } + } + @PostMapping(consumes = MediaType.MULTIPART_FORM_DATA_VALUE, value = "/pdf/pdfa") @Operation( - summary = "Convert a PDF to a PDF/A", + summary = "Convert a PDF to a PDF/A or PDF/X", description = - "This endpoint converts a PDF file to a PDF/A file using LibreOffice. PDF/A is a format designed for long-term archiving of digital documents. Input:PDF Output:PDF Type:SISO") + "This endpoint converts a PDF file to a PDF/A or PDF/X file using Ghostscript (preferred) or PDFBox/LibreOffice (fallback). PDF/A is a format designed for long-term archiving, while PDF/X is optimized for print production. Input:PDF Output:PDF Type:SISO") public ResponseEntity pdfToPdfA(@ModelAttribute PdfToPdfARequest request) throws Exception { MultipartFile inputFile = request.getFileInput(); @@ -96,6 +407,20 @@ public class ConvertPDFToPDFA { throw ExceptionUtils.createPdfFileRequiredException(); } + // Determine if this is PDF/A or PDF/X conversion + boolean isPdfX = outputFormat != null && outputFormat.toLowerCase().startsWith("pdfx"); + + if (isPdfX) { + return handlePdfXConversion(inputFile, outputFormat); + } else { + return handlePdfAConversion(inputFile, outputFormat); + } + } + + private ResponseEntity handlePdfAConversion( + MultipartFile inputFile, String outputFormat) throws Exception { + PdfaProfile profile = PdfaProfile.fromRequest(outputFormat); + // Get the original filename without extension String originalFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename()); if (originalFileName == null || originalFileName.trim().isEmpty()) { @@ -106,31 +431,149 @@ public class ConvertPDFToPDFA { ? originalFileName.substring(0, originalFileName.lastIndexOf('.')) : originalFileName; - Path tempInputFile = null; - byte[] fileBytes; - Path loPdfPath = null; // Used for LibreOffice conversion output - File preProcessedFile = null; - int pdfaPart = 2; + Path workingDir = Files.createTempDirectory("pdfa_conversion_"); + Path inputPath = workingDir.resolve("input.pdf"); + inputFile.transferTo(inputPath); try { - // Save uploaded file to temp location - tempInputFile = Files.createTempFile("input_", ".pdf"); - inputFile.transferTo(tempInputFile); + byte[] converted; - // Branch conversion based on desired output PDF/A format - if ("pdfa".equals(outputFormat)) { + // Try Ghostscript first (preferred method) + if (isGhostscriptAvailable()) { + log.info("Using Ghostscript for PDF/A conversion to {}", profile.displayName()); + try { + converted = convertWithGhostscript(inputPath, workingDir, profile); + String outputFilename = baseFileName + profile.outputSuffix(); + + validateAndWarnPdfA(converted, profile, "Ghostscript"); + + return WebResponseUtils.bytesToWebResponse( + converted, outputFilename, MediaType.APPLICATION_PDF); + } catch (Exception e) { + log.warn( + "Ghostscript conversion failed, falling back to PDFBox/LibreOffice method", + e); + } + } else { + log.info("Ghostscript not available, using PDFBox/LibreOffice fallback method"); + } + + converted = convertWithPdfBoxMethod(inputPath, workingDir, profile); + String outputFilename = baseFileName + profile.outputSuffix(); + + // Validate with PDFBox preflight and warn if issues found + validateAndWarnPdfA(converted, profile, "PDFBox/LibreOffice"); + + return WebResponseUtils.bytesToWebResponse( + converted, outputFilename, MediaType.APPLICATION_PDF); + + } finally { + deleteQuietly(workingDir); + } + } + + private ResponseEntity handlePdfXConversion( + MultipartFile inputFile, String outputFormat) throws Exception { + PdfXProfile profile = PdfXProfile.fromRequest(outputFormat); + + String originalFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename()); + if (originalFileName == null || originalFileName.trim().isEmpty()) { + originalFileName = "output.pdf"; + } + String baseFileName = + originalFileName.contains(".") + ? originalFileName.substring(0, originalFileName.lastIndexOf('.')) + : originalFileName; + + Path workingDir = Files.createTempDirectory("pdfx_conversion_"); + Path inputPath = workingDir.resolve("input.pdf"); + inputFile.transferTo(inputPath); + + try { + // PDF/X conversion uses Ghostscript (no fallback currently) + if (!isGhostscriptAvailable()) { + log.error("Ghostscript is required for PDF/X conversion"); + throw new IOException( + "Ghostscript is required for PDF/X conversion but is not available on the system"); + } + + log.info("Using Ghostscript for PDF/X conversion to {}", profile.displayName()); + byte[] converted = convertWithGhostscriptX(inputPath, workingDir, profile); + String outputFilename = baseFileName + profile.outputSuffix(); + + log.info("PDF/X conversion completed successfully to {}", profile.displayName()); + + return WebResponseUtils.bytesToWebResponse( + converted, outputFilename, MediaType.APPLICATION_PDF); + + } catch (IOException | InterruptedException e) { + log.error("PDF/X conversion failed", e); + throw ExceptionUtils.createPdfaConversionFailedException(); + } finally { + deleteQuietly(workingDir); + } + } + + private boolean isGhostscriptAvailable() { + try { + ProcessExecutorResult result = + ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT) + .runCommandWithOutputHandling(Arrays.asList("gs", "--version")); + return result.getRc() == 0; + } catch (Exception e) { + log.debug("Ghostscript availability check failed", e); + return false; + } + } + + private byte[] convertWithGhostscript(Path inputPdf, Path workingDir, PdfaProfile profile) + throws IOException, InterruptedException { + Path outputPdf = workingDir.resolve("gs_output.pdf"); + ColorProfiles colorProfiles = prepareColorProfiles(workingDir); + Path pdfaDefFile = createPdfaDefFile(workingDir, colorProfiles, profile); + + List command = + buildGhostscriptCommand( + inputPdf, outputPdf, colorProfiles, workingDir, profile, pdfaDefFile); + + ProcessExecutorResult result = + ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT) + .runCommandWithOutputHandling(command); + + if (result.getRc() != 0) { + throw new IOException("Ghostscript exited with code " + result.getRc()); + } + + if (!Files.exists(outputPdf)) { + throw new IOException("Ghostscript did not produce an output file"); + } + + return Files.readAllBytes(outputPdf); + } + + private byte[] convertWithPdfBoxMethod(Path inputPath, Path workingDir, PdfaProfile profile) + throws Exception { + Path tempInputFile = null; + byte[] fileBytes; + Path loPdfPath = null; + File preProcessedFile = null; + int pdfaPart = profile.part(); + + try { + tempInputFile = inputPath; + + if (pdfaPart == 2 || pdfaPart == 3) { preProcessedFile = tempInputFile.toFile(); } else { - pdfaPart = 1; preProcessedFile = preProcessHighlights(tempInputFile.toFile()); } - Set missingFonts = new HashSet<>(); + + Set missingFonts; boolean needImgs; try (PDDocument doc = Loader.loadPDF(preProcessedFile)) { missingFonts = findUnembeddedFontNames(doc); needImgs = (pdfaPart == 1) && hasTransparentImages(doc); if (!missingFonts.isEmpty() || needImgs) { - // Run LibreOffice conversion to get flattened images and embedded fonts loPdfPath = runLibreOfficeConversion(preProcessedFile.toPath(), pdfaPart); } } @@ -138,25 +581,42 @@ public class ConvertPDFToPDFA { convertToPdfA( preProcessedFile.toPath(), loPdfPath, pdfaPart, missingFonts, needImgs); - String outputFilename = baseFileName + "_PDFA.pdf"; - - return WebResponseUtils.bytesToWebResponse( - fileBytes, outputFilename, MediaType.APPLICATION_PDF); + return fileBytes; } finally { - // Clean up temporary files - if (tempInputFile != null) { - Files.deleteIfExists(tempInputFile); - } if (loPdfPath != null && loPdfPath.getParent() != null) { FileUtils.deleteDirectory(loPdfPath.getParent().toFile()); } - if (preProcessedFile != null) { + if (preProcessedFile != null && !preProcessedFile.equals(tempInputFile.toFile())) { Files.deleteIfExists(preProcessedFile.toPath()); } } } + private ColorProfiles prepareColorProfiles(Path workingDir) throws IOException { + Path rgbProfile = workingDir.resolve("sRGB.icc"); + copyResourceIcc(rgbProfile); + + Path grayProfile = workingDir.resolve("Gray.icc"); + try { + writeJavaIccProfile(ICC_Profile.getInstance(ColorSpace.CS_GRAY), grayProfile); + } catch (IllegalArgumentException e) { + log.warn("Falling back to sRGB ICC profile for grayscale defaults", e); + Files.copy(rgbProfile, grayProfile, StandardCopyOption.REPLACE_EXISTING); + } + + return new ColorProfiles(rgbProfile, grayProfile); + } + + private void copyResourceIcc(Path target) throws IOException { + try (InputStream in = getClass().getResourceAsStream(ICC_RESOURCE_PATH)) { + if (in == null) { + throw new IOException("ICC profile resource not found: " + ICC_RESOURCE_PATH); + } + Files.copy(in, target, StandardCopyOption.REPLACE_EXISTING); + } + } + /** * Merge fonts & flattened images from loPdfPath into basePdfPath, then run the standard * PDFBox/A pipeline. @@ -255,55 +715,27 @@ public class ConvertPDFToPDFA { return outputFiles[0].toPath(); } - private void embedMissingFonts(PDDocument loDoc, PDDocument baseDoc, Set missingFonts) - throws IOException { - List loPages = new ArrayList<>(); - loDoc.getPages().forEach(loPages::add); - List basePages = new ArrayList<>(); - baseDoc.getPages().forEach(basePages::add); + private byte[] convertWithGhostscriptX(Path inputPdf, Path workingDir, PdfXProfile profile) + throws IOException, InterruptedException { + Path outputPdf = workingDir.resolve("gs_output_pdfx.pdf"); + ColorProfiles colorProfiles = prepareColorProfiles(workingDir); - for (int i = 0; i < loPages.size(); i++) { - PDResources loRes = loPages.get(i).getResources(); - PDResources baseRes = basePages.get(i).getResources(); + List command = + buildGhostscriptCommandX(inputPdf, outputPdf, colorProfiles, workingDir, profile); - for (COSName fontKey : loRes.getFontNames()) { - PDFont loFont = loRes.getFont(fontKey); - if (loFont == null) continue; + ProcessExecutorResult result = + ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT) + .runCommandWithOutputHandling(command); - String psName = loFont.getName(); - if (!missingFonts.contains(psName)) continue; - - PDFontDescriptor desc = loFont.getFontDescriptor(); - if (desc == null) continue; - - PDStream fontStream = null; - if (desc.getFontFile() != null) { - fontStream = desc.getFontFile(); - } else if (desc.getFontFile2() != null) { - fontStream = desc.getFontFile2(); - } else if (desc.getFontFile3() != null) { - fontStream = desc.getFontFile3(); - } - if (fontStream == null) continue; - - try (InputStream in = fontStream.createInputStream()) { - PDFont newFont; - try { - newFont = PDType0Font.load(baseDoc, in, false); - } catch (IOException e1) { - try { - newFont = PDTrueTypeFont.load(baseDoc, in, null); - } catch (IOException | IllegalArgumentException e2) { - log.error("Could not embed font {}: {}", psName, e2.getMessage()); - continue; - } - } - if (newFont != null) { - baseRes.put(fontKey, newFont); - } - } - } + if (result.getRc() != 0) { + throw new IOException("Ghostscript exited with code " + result.getRc()); } + + if (!Files.exists(outputPdf)) { + throw new IOException("Ghostscript did not produce an output file"); + } + + return Files.readAllBytes(outputPdf); } private Set findUnembeddedFontNames(PDDocument doc) throws IOException { @@ -712,4 +1144,132 @@ public class ConvertPDFToPDFA { return preProcessedFile; } } + + /** Enum representing different PDF/A profiles */ + private enum PdfaProfile { + PDF_A_1B(1, "PDF/A-1b", "_PDFA-1b.pdf", "1.4", Format.PDF_A1B, "pdfa-1"), + PDF_A_2B(2, "PDF/A-2b", "_PDFA-2b.pdf", "1.7", null, "pdfa", "pdfa-2", "pdfa-2b"), + PDF_A_3B(3, "PDF/A-3b", "_PDFA-3b.pdf", "1.7", null, "pdfa-3", "pdfa-3b"); + + private final int part; + private final String displayName; + private final String suffix; + private final String compatibilityLevel; + private final Format preflightFormat; + private final List requestTokens; + + PdfaProfile( + int part, + String displayName, + String suffix, + String compatibilityLevel, + Format preflightFormat, + String... requestTokens) { + this.part = part; + this.displayName = displayName; + this.suffix = suffix; + this.compatibilityLevel = compatibilityLevel; + this.preflightFormat = preflightFormat; + this.requestTokens = Arrays.asList(requestTokens); + } + + static PdfaProfile fromRequest(String requestToken) { + if (requestToken == null) { + return PDF_A_2B; + } + String normalized = requestToken.trim().toLowerCase(Locale.ROOT); + Optional match = + Arrays.stream(values()) + .filter( + profile -> + profile.requestTokens.stream() + .map(token -> token.toLowerCase(Locale.ROOT)) + .anyMatch(token -> token.equals(normalized))) + .findFirst(); + + return match.orElse(PDF_A_2B); + } + + int part() { + return part; + } + + String displayName() { + return displayName; + } + + String outputSuffix() { + return suffix; + } + + String compatibilityLevel() { + return compatibilityLevel; + } + + Optional preflightFormat() { + return Optional.ofNullable(preflightFormat); + } + } + + private enum PdfXProfile { + PDF_X_1(1, "PDF/X-1", "_PDFX-1.pdf", "1.3", "2001", "pdfx-1", "pdfx"), + PDF_X_3(3, "PDF/X-3", "_PDFX-3.pdf", "1.3", "2003", "pdfx-3"), + PDF_X_4(4, "PDF/X-4", "_PDFX-4.pdf", "1.4", "2008", "pdfx-4"); + + private final String displayName; + private final String suffix; + private final String compatibilityLevel; + private final String pdfxVersion; + private final List requestTokens; + + PdfXProfile( + int version, + String displayName, + String suffix, + String compatibilityLevel, + String pdfxVersion, + String... requestTokens) { + this.displayName = displayName; + this.suffix = suffix; + this.compatibilityLevel = compatibilityLevel; + this.pdfxVersion = pdfxVersion; + this.requestTokens = Arrays.asList(requestTokens); + } + + static PdfXProfile fromRequest(String requestToken) { + if (requestToken == null) { + return PDF_X_4; + } + String normalized = requestToken.trim().toLowerCase(Locale.ROOT); + Optional match = + Arrays.stream(values()) + .filter( + profile -> + profile.requestTokens.stream() + .map(token -> token.toLowerCase(Locale.ROOT)) + .anyMatch(token -> token.equals(normalized))) + .findFirst(); + + return match.orElse(PDF_X_4); + } + + String displayName() { + return displayName; + } + + String outputSuffix() { + return suffix; + } + + String compatibilityLevel() { + return compatibilityLevel; + } + + String pdfxVersion() { + return pdfxVersion; + } + } + + /** Record to hold color profile paths */ + private record ColorProfiles(Path rgb, Path gray) {} } diff --git a/app/core/src/main/java/stirling/software/SPDF/model/api/converters/PdfToPdfARequest.java b/app/core/src/main/java/stirling/software/SPDF/model/api/converters/PdfToPdfARequest.java index 0553988ca..4bda52cc7 100644 --- a/app/core/src/main/java/stirling/software/SPDF/model/api/converters/PdfToPdfARequest.java +++ b/app/core/src/main/java/stirling/software/SPDF/model/api/converters/PdfToPdfARequest.java @@ -12,8 +12,11 @@ import stirling.software.common.model.api.PDFFile; public class PdfToPdfARequest extends PDFFile { @Schema( - description = "The output PDF/A type", + description = "The output format type (PDF/A or PDF/X)", requiredMode = Schema.RequiredMode.REQUIRED, - allowableValues = {"pdfa", "pdfa-1"}) + allowableValues = { + "pdfa", "pdfa-1", "pdfa-2", "pdfa-2b", "pdfa-3", "pdfa-3b", "pdfx", "pdfx-1", + "pdfx-3", "pdfx-4" + }) private String outputFormat; } diff --git a/app/core/src/main/resources/messages_en_US.properties b/app/core/src/main/resources/messages_en_US.properties index 287955226..7b7f8bc82 100644 --- a/app/core/src/main/resources/messages_en_US.properties +++ b/app/core/src/main/resources/messages_en_US.properties @@ -694,9 +694,9 @@ home.extractImages.title=Extract Images home.extractImages.desc=Extracts all images from a PDF and saves them to zip extractImages.tags=picture,photo,save,archive,zip,capture,grab -home.pdfToPDFA.title=PDF to PDF/A -home.pdfToPDFA.desc=Convert PDF to PDF/A for long-term storage -pdfToPDFA.tags=archive,long-term,standard,conversion,storage,preservation +home.pdfToPDFA.title=PDF to PDF/A & PDF/X +home.pdfToPDFA.desc=Convert PDF to PDF/A for long-term storage or PDF/X for print production +pdfToPDFA.tags=archive,long-term,standard,conversion,storage,preservation,print,pdf-x home.PDFToWord.title=PDF to Word home.PDFToWord.desc=Convert PDF to Word formats (DOC, DOCX and ODT) @@ -1621,11 +1621,13 @@ unlockPDFForms.submit=Remove #pdfToPDFA pdfToPDFA.title=PDF To PDF/A pdfToPDFA.header=PDF To PDF/A -pdfToPDFA.credit=This service uses libreoffice for PDF/A conversion +pdfToPDFA.credit=This service uses Ghostscript (preferred) or LibreOffice for PDF/A conversion, and Ghostscript for PDF/X conversion pdfToPDFA.submit=Convert -pdfToPDFA.tip=Currently does not work for multiple inputs at once +pdfToPDFA.tip=Convert PDF to PDF/A (long-term archiving) or PDF/X (print production) pdfToPDFA.outputFormat=Output format pdfToPDFA.pdfWithDigitalSignature=The PDF contains a digital signature. This will be removed in the next step. +pdfToPDFA.pdfaFormats=PDF/A Formats (Long-term Archiving) +pdfToPDFA.pdfxFormats=PDF/X Formats (Print Production) #PDFToWord diff --git a/app/core/src/main/resources/templates/convert/pdf-to-pdfa.html b/app/core/src/main/resources/templates/convert/pdf-to-pdfa.html index 33c07acb9..2e9ed11ef 100644 --- a/app/core/src/main/resources/templates/convert/pdf-to-pdfa.html +++ b/app/core/src/main/resources/templates/convert/pdf-to-pdfa.html @@ -23,8 +23,17 @@
From 3728a123b3480abede8ad2f12463acb94227e2ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20Sz=C3=BCcs?= Date: Fri, 7 Nov 2025 23:01:20 +0100 Subject: [PATCH 02/14] fix(i18n): synchronize PDF/A and PDF/X localization updates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Resolved inconsistencies in messages_en_US and messages_en_GB property files - Updated descriptions, tags, and credits for PDF/A and PDF/X conversion - Adjusted localization strings to ensure proper format support handling Signed-off-by: Balázs Szücs --- .../src/main/resources/messages_en_GB.properties | 12 +++++++----- .../src/main/resources/messages_en_US.properties | 12 +++++------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/app/core/src/main/resources/messages_en_GB.properties b/app/core/src/main/resources/messages_en_GB.properties index 41e0ef4ee..f6570a550 100644 --- a/app/core/src/main/resources/messages_en_GB.properties +++ b/app/core/src/main/resources/messages_en_GB.properties @@ -694,9 +694,9 @@ home.extractImages.title=Extract Images home.extractImages.desc=Extracts all images from a PDF and saves them to zip extractImages.tags=picture,photo,save,archive,zip,capture,grab -home.pdfToPDFA.title=PDF to PDF/A -home.pdfToPDFA.desc=Convert PDF to PDF/A for long-term storage -pdfToPDFA.tags=archive,long-term,standard,conversion,storage,preservation +home.pdfToPDFA.title=PDF to PDF/A & PDF/X +home.pdfToPDFA.desc=Convert PDF to PDF/A for long-term storage or PDF/X for print production +pdfToPDFA.tags=archive,long-term,standard,conversion,storage,preservation,print,pdf-x home.PDFToWord.title=PDF to Word home.PDFToWord.desc=Convert PDF to Word formats (DOC, DOCX and ODT) @@ -1621,11 +1621,13 @@ unlockPDFForms.submit=Remove #pdfToPDFA pdfToPDFA.title=PDF To PDF/A pdfToPDFA.header=PDF To PDF/A -pdfToPDFA.credit=This service uses LibreOffice for PDF/A conversion +pdfToPDFA.credit=This service uses Ghostscript (preferred) or LibreOffice for PDF/A conversion, and Ghostscript for PDF/X conversion pdfToPDFA.submit=Convert -pdfToPDFA.tip=Currently does not work for multiple inputs at once +pdfToPDFA.tip=Convert PDF to PDF/A (long-term archiving) or PDF/X (print production) pdfToPDFA.outputFormat=Output format pdfToPDFA.pdfWithDigitalSignature=The PDF contains a digital signature. This will be removed in the next step. +pdfToPDFA.pdfaFormats=PDF/A Formats (Long-term Archiving) +pdfToPDFA.pdfxFormats=PDF/X Formats (Print Production) #PDFToWord diff --git a/app/core/src/main/resources/messages_en_US.properties b/app/core/src/main/resources/messages_en_US.properties index 7b7f8bc82..287955226 100644 --- a/app/core/src/main/resources/messages_en_US.properties +++ b/app/core/src/main/resources/messages_en_US.properties @@ -694,9 +694,9 @@ home.extractImages.title=Extract Images home.extractImages.desc=Extracts all images from a PDF and saves them to zip extractImages.tags=picture,photo,save,archive,zip,capture,grab -home.pdfToPDFA.title=PDF to PDF/A & PDF/X -home.pdfToPDFA.desc=Convert PDF to PDF/A for long-term storage or PDF/X for print production -pdfToPDFA.tags=archive,long-term,standard,conversion,storage,preservation,print,pdf-x +home.pdfToPDFA.title=PDF to PDF/A +home.pdfToPDFA.desc=Convert PDF to PDF/A for long-term storage +pdfToPDFA.tags=archive,long-term,standard,conversion,storage,preservation home.PDFToWord.title=PDF to Word home.PDFToWord.desc=Convert PDF to Word formats (DOC, DOCX and ODT) @@ -1621,13 +1621,11 @@ unlockPDFForms.submit=Remove #pdfToPDFA pdfToPDFA.title=PDF To PDF/A pdfToPDFA.header=PDF To PDF/A -pdfToPDFA.credit=This service uses Ghostscript (preferred) or LibreOffice for PDF/A conversion, and Ghostscript for PDF/X conversion +pdfToPDFA.credit=This service uses libreoffice for PDF/A conversion pdfToPDFA.submit=Convert -pdfToPDFA.tip=Convert PDF to PDF/A (long-term archiving) or PDF/X (print production) +pdfToPDFA.tip=Currently does not work for multiple inputs at once pdfToPDFA.outputFormat=Output format pdfToPDFA.pdfWithDigitalSignature=The PDF contains a digital signature. This will be removed in the next step. -pdfToPDFA.pdfaFormats=PDF/A Formats (Long-term Archiving) -pdfToPDFA.pdfxFormats=PDF/X Formats (Print Production) #PDFToWord From a5e55e598a16a8fd894bd6f0b268c08e18beb105 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20Sz=C3=BCcs?= Date: Sat, 8 Nov 2025 11:10:31 +0100 Subject: [PATCH 03/14] refactor(pdf-conversion): optimize PDF/A and font embedding flow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replaced redundant streams and lists initialization with more efficient alternatives - Centralized stream reading logic to prevent reuse issues and ensure proper closing - Enhanced logging for PDF/A validation to differentiate warnings from errors - Simplified methods by removing redundant parameters and improving clarity - Updated GregorianCalendar usage to modern java.time classes - Ensured static state for utility-like methods for cleaner invocation - Improved PDF/A metadata handling by aligning structure and removing redundancy Signed-off-by: Balázs Szücs --- .../api/converters/ConvertPDFToPDFA.java | 543 +++++++++--------- 1 file changed, 280 insertions(+), 263 deletions(-) diff --git a/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToPDFA.java b/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToPDFA.java index 41e3726e1..022f5fe61 100644 --- a/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToPDFA.java +++ b/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToPDFA.java @@ -3,28 +3,14 @@ package stirling.software.SPDF.controller.api.converters; import java.awt.Color; import java.awt.color.ColorSpace; import java.awt.color.ICC_Profile; -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; +import java.io.*; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardCopyOption; import java.time.Instant; import java.time.ZoneId; import java.time.ZonedDateTime; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.Comparator; -import java.util.GregorianCalendar; -import java.util.HashSet; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import java.util.Optional; -import java.util.Set; +import java.util.*; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -105,9 +91,21 @@ public class ConvertPDFToPDFA { Path tempPdfPath = null; try { tempPdfPath = Files.createTempFile("validate_", ".pdf"); - Files.write(tempPdfPath, pdfBytes); - validatePdfaOutput(tempPdfPath, profile); - log.info("PDF/A validation passed for {} using {}", profile.displayName(), method); + + try (OutputStream out = Files.newOutputStream(tempPdfPath)) { + out.write(pdfBytes); + } + + try { + validatePdfaOutput(tempPdfPath, profile); + log.info("PDF/A validation passed for {} using {}", profile.displayName(), method); + } catch (IOException e) { + log.warn( + "PDF/A validation warning for {} using {}: {}", + profile.displayName(), + method, + e.getMessage()); + } } catch (IOException e) { log.warn( "PDF/A validation warning for {} using {}: {}", @@ -152,7 +150,7 @@ public class ConvertPDFToPDFA { PdfaProfile profile, Path pdfaDefFile) { - List command = new ArrayList<>(); + List command = new ArrayList<>(25); command.add("gs"); command.add("--permit-file-read=" + workingDir.toAbsolutePath()); command.add("--permit-file-read=" + colorProfiles.rgb().toAbsolutePath()); @@ -276,31 +274,37 @@ public class ConvertPDFToPDFA { Path pdfaDefFile = workingDir.resolve("PDFA_def.ps"); String title = "Converted to " + profile.displayName(); + String rgbProfilePath = colorProfiles.rgb().toAbsolutePath().toString().replace("\\", "/"); String pdfaDefContent = String.format( - "%% This is a sample prefix file for creating a PDF/A document.\n" - + "%% Feel free to modify entries marked with \"Customize\".\n\n" - + "%% Define entries in the document Info dictionary.\n" - + "[/Title (%s)\n" - + " /DOCINFO pdfmark\n\n" - + "%% Define an ICC profile.\n" - + "[/_objdef {icc_PDFA} /type /stream /OBJ pdfmark\n" - + "[{icc_PDFA} <<\n" - + " /N 3\n" - + ">> /PUT pdfmark\n" - + "[{icc_PDFA} (%s) (r) file /PUT pdfmark\n\n" - + "%% Define the output intent dictionary.\n" - + "[/_objdef {OutputIntent_PDFA} /type /dict /OBJ pdfmark\n" - + "[{OutputIntent_PDFA} <<\n" - + " /Type /OutputIntent\n" - + " /S /GTS_PDFA1\n" - + " /DestOutputProfile {icc_PDFA}\n" - + " /OutputConditionIdentifier (sRGB IEC61966-2.1)\n" - + " /Info (sRGB IEC61966-2.1)\n" - + " /RegistryName (http://www.color.org)\n" - + ">> /PUT pdfmark\n" - + "[{Catalog} <> /PUT pdfmark\n", - title, colorProfiles.rgb().toAbsolutePath().toString().replace("\\", "/")); + """ + %% This is a sample prefix file for creating a PDF/A document. + %% Feel free to modify entries marked with "Customize". + + %% Define entries in the document Info dictionary. + [/Title (%s) + /DOCINFO pdfmark + + %% Define an ICC profile. + [/_objdef {icc_PDFA} /type /stream /OBJ pdfmark + [{icc_PDFA} << + /N 3 + >> /PUT pdfmark + [{icc_PDFA} (%s) (r) file /PUT pdfmark + + %% Define the output intent dictionary. + [/_objdef {OutputIntent_PDFA} /type /dict /OBJ pdfmark + [{OutputIntent_PDFA} << + /Type /OutputIntent + /S /GTS_PDFA1 + /DestOutputProfile {icc_PDFA} + /OutputConditionIdentifier (sRGB IEC61966-2.1) + /Info (sRGB IEC61966-2.1) + /RegistryName (http://www.color.org) + >> /PUT pdfmark + [{Catalog} <> /PUT pdfmark + """, + title, rgbProfilePath); Files.writeString(pdfaDefFile, pdfaDefContent); return pdfaDefFile; @@ -313,7 +317,7 @@ public class ConvertPDFToPDFA { Path workingDir, PdfXProfile profile) { - List command = new ArrayList<>(); + List command = new ArrayList<>(25); command.add("gs"); command.add("--permit-file-read=" + workingDir.toAbsolutePath()); command.add("--permit-file-read=" + colorProfiles.rgb().toAbsolutePath()); @@ -340,11 +344,11 @@ public class ConvertPDFToPDFA { return command; } - private static void embedMissingFonts(PDDocument loDoc, PDDocument baseDoc, Set missingFonts) - throws IOException { - List loPages = new ArrayList<>(); + private static void embedMissingFonts( + PDDocument loDoc, PDDocument baseDoc, Set missingFonts) throws IOException { + List loPages = new ArrayList<>(loDoc.getNumberOfPages()); loDoc.getPages().forEach(loPages::add); - List basePages = new ArrayList<>(); + List basePages = new ArrayList<>(baseDoc.getNumberOfPages()); baseDoc.getPages().forEach(basePages::add); for (int i = 0; i < loPages.size(); i++) { @@ -371,21 +375,31 @@ public class ConvertPDFToPDFA { } if (fontStream == null) continue; + // Read the font stream into memory once so we can create fresh + // InputStreams for multiple load attempts. This avoids reusing a + // consumed stream and allows try-with-resources for each attempt. + byte[] fontBytes; try (InputStream in = fontStream.createInputStream()) { - PDFont newFont; - try { - newFont = PDType0Font.load(baseDoc, in, false); - } catch (IOException e1) { + fontBytes = in.readAllBytes(); + } + + PDFont embeddedFont = null; + // First try PDType0 (CID) font + try (InputStream tryIn = new ByteArrayInputStream(fontBytes)) { + embeddedFont = PDType0Font.load(baseDoc, tryIn, false); + } catch (IOException e1) { + // Fallback to TrueType + try (InputStream tryIn2 = new ByteArrayInputStream(fontBytes)) { try { - newFont = PDTrueTypeFont.load(baseDoc, in, null); - } catch (IOException | IllegalArgumentException e2) { + embeddedFont = PDTrueTypeFont.load(baseDoc, tryIn2, null); + } catch (IllegalArgumentException | IOException e2) { log.error("Could not embed font {}: {}", psName, e2.getMessage()); - continue; } } - if (newFont != null) { - baseRes.put(fontKey, newFont); - } + } + + if (embeddedFont != null) { + baseRes.put(fontKey, embeddedFont); } } } @@ -417,59 +431,18 @@ public class ConvertPDFToPDFA { } } - private ResponseEntity handlePdfAConversion( - MultipartFile inputFile, String outputFormat) throws Exception { - PdfaProfile profile = PdfaProfile.fromRequest(outputFormat); - - // Get the original filename without extension - String originalFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename()); - if (originalFileName == null || originalFileName.trim().isEmpty()) { - originalFileName = "output.pdf"; - } - String baseFileName = - originalFileName.contains(".") - ? originalFileName.substring(0, originalFileName.lastIndexOf('.')) - : originalFileName; - - Path workingDir = Files.createTempDirectory("pdfa_conversion_"); - Path inputPath = workingDir.resolve("input.pdf"); - inputFile.transferTo(inputPath); - - try { - byte[] converted; - - // Try Ghostscript first (preferred method) - if (isGhostscriptAvailable()) { - log.info("Using Ghostscript for PDF/A conversion to {}", profile.displayName()); - try { - converted = convertWithGhostscript(inputPath, workingDir, profile); - String outputFilename = baseFileName + profile.outputSuffix(); - - validateAndWarnPdfA(converted, profile, "Ghostscript"); - - return WebResponseUtils.bytesToWebResponse( - converted, outputFilename, MediaType.APPLICATION_PDF); - } catch (Exception e) { - log.warn( - "Ghostscript conversion failed, falling back to PDFBox/LibreOffice method", - e); + private static Set findUnembeddedFontNames(PDDocument doc) throws IOException { + Set missing = new HashSet<>(16); + for (PDPage page : doc.getPages()) { + PDResources res = page.getResources(); + for (COSName name : res.getFontNames()) { + PDFont font = res.getFont(name); + if (font != null && !font.isEmbedded()) { + missing.add(font.getName()); } - } else { - log.info("Ghostscript not available, using PDFBox/LibreOffice fallback method"); } - - converted = convertWithPdfBoxMethod(inputPath, workingDir, profile); - String outputFilename = baseFileName + profile.outputSuffix(); - - // Validate with PDFBox preflight and warn if issues found - validateAndWarnPdfA(converted, profile, "PDFBox/LibreOffice"); - - return WebResponseUtils.bytesToWebResponse( - converted, outputFilename, MediaType.APPLICATION_PDF); - - } finally { - deleteQuietly(workingDir); } + return missing; } private ResponseEntity handlePdfXConversion( @@ -551,44 +524,29 @@ public class ConvertPDFToPDFA { return Files.readAllBytes(outputPdf); } - private byte[] convertWithPdfBoxMethod(Path inputPath, Path workingDir, PdfaProfile profile) - throws Exception { - Path tempInputFile = null; - byte[] fileBytes; - Path loPdfPath = null; - File preProcessedFile = null; - int pdfaPart = profile.part(); + private static void importFlattenedImages(PDDocument loDoc, PDDocument baseDoc) + throws IOException { + List loPages = new ArrayList<>(loDoc.getNumberOfPages()); + loDoc.getPages().forEach(loPages::add); + List basePages = new ArrayList<>(baseDoc.getNumberOfPages()); + baseDoc.getPages().forEach(basePages::add); - try { - tempInputFile = inputPath; + for (int i = 0; i < loPages.size(); i++) { + PDPage loPage = loPages.get(i); + PDPage basePage = basePages.get(i); - if (pdfaPart == 2 || pdfaPart == 3) { - preProcessedFile = tempInputFile.toFile(); - } else { - preProcessedFile = preProcessHighlights(tempInputFile.toFile()); - } + PDResources loRes = loPage.getResources(); + PDResources baseRes = basePage.getResources(); + Set toReplace = detectTransparentXObjects(basePage); - Set missingFonts; - boolean needImgs; - try (PDDocument doc = Loader.loadPDF(preProcessedFile)) { - missingFonts = findUnembeddedFontNames(doc); - needImgs = (pdfaPart == 1) && hasTransparentImages(doc); - if (!missingFonts.isEmpty() || needImgs) { - loPdfPath = runLibreOfficeConversion(preProcessedFile.toPath(), pdfaPart); - } - } - fileBytes = - convertToPdfA( - preProcessedFile.toPath(), loPdfPath, pdfaPart, missingFonts, needImgs); + for (COSName name : toReplace) { + PDXObject loXo = loRes.getXObject(name); + if (!(loXo instanceof PDImageXObject img)) continue; - return fileBytes; + PDImageXObject newImg = LosslessFactory.createFromImage(baseDoc, img.getImage()); - } finally { - if (loPdfPath != null && loPdfPath.getParent() != null) { - FileUtils.deleteDirectory(loPdfPath.getParent().toFile()); - } - if (preProcessedFile != null && !preProcessedFile.equals(tempInputFile.toFile())) { - Files.deleteIfExists(preProcessedFile.toPath()); + // replace the resource under the same name + baseRes.put(name, newImg); } } } @@ -608,13 +566,27 @@ public class ConvertPDFToPDFA { return new ColorProfiles(rgbProfile, grayProfile); } - private void copyResourceIcc(Path target) throws IOException { - try (InputStream in = getClass().getResourceAsStream(ICC_RESOURCE_PATH)) { - if (in == null) { - throw new IOException("ICC profile resource not found: " + ICC_RESOURCE_PATH); + private static Set detectTransparentXObjects(PDPage page) { + Set transparentObjects = new HashSet<>(); + PDResources res = page.getResources(); + if (res == null) return transparentObjects; + + for (COSName name : res.getXObjectNames()) { + try { + PDXObject xo = res.getXObject(name); + if (xo instanceof PDImageXObject img) { + COSDictionary d = img.getCOSObject(); + if (d.containsKey(COSName.SMASK) + || isTransparencyGroup(d) + || d.getBoolean(COSName.INTERPOLATE, false)) { + transparentObjects.add(name); + } + } + } catch (IOException ioe) { + log.error("Error processing XObject {}: {}", name.getName(), ioe.getMessage()); } - Files.copy(in, target, StandardCopyOption.REPLACE_EXISTING); } + return transparentObjects; } /** @@ -738,76 +710,13 @@ public class ConvertPDFToPDFA { return Files.readAllBytes(outputPdf); } - private Set findUnembeddedFontNames(PDDocument doc) throws IOException { - Set missing = new HashSet<>(); - for (PDPage page : doc.getPages()) { - PDResources res = page.getResources(); - for (COSName name : res.getFontNames()) { - PDFont font = res.getFont(name); - if (font != null && !font.isEmbedded()) { - missing.add(font.getName()); - } - } - } - return missing; - } - - private void importFlattenedImages(PDDocument loDoc, PDDocument baseDoc) throws IOException { - List loPages = new ArrayList<>(); - loDoc.getPages().forEach(loPages::add); - List basePages = new ArrayList<>(); - baseDoc.getPages().forEach(basePages::add); - - for (int i = 0; i < loPages.size(); i++) { - PDPage loPage = loPages.get(i); - PDPage basePage = basePages.get(i); - - PDResources loRes = loPage.getResources(); - PDResources baseRes = basePage.getResources(); - Set toReplace = detectTransparentXObjects(basePage); - - for (COSName name : toReplace) { - PDXObject loXo = loRes.getXObject(name); - if (!(loXo instanceof PDImageXObject img)) continue; - - PDImageXObject newImg = LosslessFactory.createFromImage(baseDoc, img.getImage()); - - // replace the resource under the same name - baseRes.put(name, newImg); - } - } - } - - private Set detectTransparentXObjects(PDPage page) { - Set transparentObjects = new HashSet<>(); - PDResources res = page.getResources(); - if (res == null) return transparentObjects; - - for (COSName name : res.getXObjectNames()) { - try { - PDXObject xo = res.getXObject(name); - if (xo instanceof PDImageXObject img) { - COSDictionary d = img.getCOSObject(); - if (d.containsKey(COSName.SMASK) - || isTransparencyGroup(d) - || d.getBoolean(COSName.INTERPOLATE, false)) { - transparentObjects.add(name); - } - } - } catch (IOException ioe) { - log.error("Error processing XObject {}: {}", name.getName(), ioe.getMessage()); - } - } - return transparentObjects; - } - - private boolean isTransparencyGroup(COSDictionary dict) { + private static boolean isTransparencyGroup(COSDictionary dict) { COSBase g = dict.getDictionaryObject(COSName.GROUP); return g instanceof COSDictionary gd && COSName.TRANSPARENCY.equals(gd.getCOSName(COSName.S)); } - private boolean hasTransparentImages(PDDocument doc) { + private static boolean hasTransparentImages(PDDocument doc) { for (PDPage page : doc.getPages()) { PDResources res = page.getResources(); if (res == null) continue; @@ -832,7 +741,7 @@ public class ConvertPDFToPDFA { return false; } - private void sanitizePdfA(COSBase base, PDResources resources, int pdfaPart) { + private static void sanitizePdfA(COSBase base, int pdfaPart) { if (base instanceof COSDictionary dict) { if (pdfaPart == 1) { // Remove transparency-related elements @@ -868,18 +777,18 @@ public class ConvertPDFToPDFA { // Recurse through all entries in the dictionary for (Map.Entry entry : dict.entrySet()) { - sanitizePdfA(entry.getValue(), resources, pdfaPart); + sanitizePdfA(entry.getValue(), pdfaPart); } } else if (base instanceof COSArray arr) { // Recursively sanitize each item in the array for (COSBase item : arr) { - sanitizePdfA(item, resources, pdfaPart); + sanitizePdfA(item, pdfaPart); } } } - private void removeElementsForPdfA(PDDocument doc, int pdfaPart) { + private static void removeElementsForPdfA(PDDocument doc, int pdfaPart) { if (pdfaPart == 1) { // Remove Optional Content (Layers) - not allowed in PDF/A-1 @@ -892,7 +801,7 @@ public class ConvertPDFToPDFA { } PDResources res = page.getResources(); // Clean page-level dictionary - sanitizePdfA(page.getCOSObject(), res, pdfaPart); + sanitizePdfA(page.getCOSObject(), pdfaPart); // sanitize each Form XObject if (res != null) { @@ -900,9 +809,9 @@ public class ConvertPDFToPDFA { try { PDXObject xo = res.getXObject(name); if (xo instanceof PDFormXObject form) { - sanitizePdfA(form.getCOSObject(), res, pdfaPart); + sanitizePdfA(form.getCOSObject(), pdfaPart); } else if (xo instanceof PDImageXObject img) { - sanitizePdfA(img.getCOSObject(), res, pdfaPart); + sanitizePdfA(img.getCOSObject(), pdfaPart); } } catch (IOException ioe) { log.error("Cannot load XObject {}: {}", name.getName(), ioe.getMessage()); @@ -913,7 +822,7 @@ public class ConvertPDFToPDFA { } /** Embbeds the XMP metadata required for PDF/A compliance. */ - private void mergeAndAddXmpMetadata(PDDocument document, int pdfaPart) throws Exception { + private static void mergeAndAddXmpMetadata(PDDocument document, int pdfaPart) throws Exception { PDMetadata existingMetadata = document.getDocumentCatalog().getMetadata(); XMPMetadata xmp; @@ -998,31 +907,30 @@ public class ConvertPDFToPDFA { adobePdfSchema.setKeywords(keywords); } - // Set creation and modification dates using java.time and convert to GregorianCalendar + // Set creation and modification dates using modern java.time API Instant nowInstant = Instant.now(); ZonedDateTime nowZdt = ZonedDateTime.ofInstant(nowInstant, ZoneId.of("UTC")); - GregorianCalendar nowCal = GregorianCalendar.from(nowZdt); - java.util.Calendar originalCreationDate = docInfo.getCreationDate(); - GregorianCalendar creationCal; - if (originalCreationDate == null) { - creationCal = nowCal; - } else if (originalCreationDate instanceof GregorianCalendar) { - creationCal = (GregorianCalendar) originalCreationDate; + // Determine creation date from document info or use current time + Instant creationInstant; + Calendar originalCreationDate = docInfo.getCreationDate(); + if (originalCreationDate != null) { + creationInstant = originalCreationDate.toInstant(); } else { - // convert other Calendar implementations to GregorianCalendar preserving instant - creationCal = - GregorianCalendar.from( - ZonedDateTime.ofInstant( - originalCreationDate.toInstant(), ZoneId.of("UTC"))); + creationInstant = nowInstant; } + ZonedDateTime creationZdt = ZonedDateTime.ofInstant(creationInstant, ZoneId.of("UTC")); + + // Convert to GregorianCalendar for PDFBox API compatibility + GregorianCalendar creationCal = java.util.GregorianCalendar.from(creationZdt); + GregorianCalendar modificationCal = java.util.GregorianCalendar.from(nowZdt); docInfo.setCreationDate(creationCal); xmpBasicSchema.setCreateDate(creationCal); - docInfo.setModificationDate(nowCal); - xmpBasicSchema.setModifyDate(nowCal); - xmpBasicSchema.setMetadataDate(nowCal); + docInfo.setModificationDate(modificationCal); + xmpBasicSchema.setModifyDate(modificationCal); + xmpBasicSchema.setMetadataDate(modificationCal); // Serialize the created metadata so it can be attached to the existent metadata ByteArrayOutputStream xmpOut = new ByteArrayOutputStream(); @@ -1033,22 +941,7 @@ public class ConvertPDFToPDFA { document.getDocumentCatalog().setMetadata(newMetadata); } - private void addICCProfileIfNotPresent(PDDocument document) throws Exception { - if (document.getDocumentCatalog().getOutputIntents().isEmpty()) { - try (InputStream colorProfile = getClass().getResourceAsStream("/icc/sRGB2014.icc")) { - PDOutputIntent outputIntent = new PDOutputIntent(document, colorProfile); - outputIntent.setInfo("sRGB IEC61966-2.1"); - outputIntent.setOutputCondition("sRGB IEC61966-2.1"); - outputIntent.setOutputConditionIdentifier("sRGB IEC61966-2.1"); - outputIntent.setRegistryName("http://www.color.org"); - document.getDocumentCatalog().addOutputIntent(outputIntent); - } catch (Exception e) { - log.error("Failed to load ICC profile: {}", e.getMessage()); - } - } - } - - private File preProcessHighlights(File inputPdf) throws Exception { + private static File preProcessHighlights(File inputPdf) throws Exception { try (PDDocument document = Loader.loadPDF(inputPdf)) { @@ -1127,12 +1020,12 @@ public class ConvertPDFToPDFA { COSDictionary groupDict = (COSDictionary) pageDict.getDictionaryObject(COSName.GROUP); - if (groupDict != null) { - if (COSName.TRANSPARENCY - .getName() - .equalsIgnoreCase(groupDict.getNameAsString(COSName.S))) { - pageDict.removeItem(COSName.GROUP); - } + if (groupDict != null + && COSName.TRANSPARENCY + .getName() + .equalsIgnoreCase( + groupDict.getNameAsString(COSName.S))) { + pageDict.removeItem(COSName.GROUP); } } } @@ -1145,7 +1038,133 @@ public class ConvertPDFToPDFA { } } - /** Enum representing different PDF/A profiles */ + private ResponseEntity handlePdfAConversion( + MultipartFile inputFile, String outputFormat) throws Exception { + PdfaProfile profile = PdfaProfile.fromRequest(outputFormat); + + // Get the original filename without extension + String originalFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename()); + if (originalFileName == null || originalFileName.trim().isEmpty()) { + originalFileName = "output.pdf"; + } + String baseFileName = + originalFileName.contains(".") + ? originalFileName.substring(0, originalFileName.lastIndexOf('.')) + : originalFileName; + + Path workingDir = Files.createTempDirectory("pdfa_conversion_"); + Path inputPath = workingDir.resolve("input.pdf"); + inputFile.transferTo(inputPath); + + try { + byte[] converted; + + // Try Ghostscript first (preferred method) + if (isGhostscriptAvailable()) { + log.info("Using Ghostscript for PDF/A conversion to {}", profile.displayName()); + try { + converted = convertWithGhostscript(inputPath, workingDir, profile); + String outputFilename = baseFileName + profile.outputSuffix(); + + validateAndWarnPdfA(converted, profile, "Ghostscript"); + + return WebResponseUtils.bytesToWebResponse( + converted, outputFilename, MediaType.APPLICATION_PDF); + } catch (Exception e) { + log.warn( + "Ghostscript conversion failed, falling back to PDFBox/LibreOffice method", + e); + } + } else { + log.info("Ghostscript not available, using PDFBox/LibreOffice fallback method"); + } + + converted = convertWithPdfBoxMethod(inputPath, profile); + String outputFilename = baseFileName + profile.outputSuffix(); + + // Validate with PDFBox preflight and warn if issues found + validateAndWarnPdfA(converted, profile, "PDFBox/LibreOffice"); + + return WebResponseUtils.bytesToWebResponse( + converted, outputFilename, MediaType.APPLICATION_PDF); + + } finally { + deleteQuietly(workingDir); + } + } + + private byte[] convertWithPdfBoxMethod(Path inputPath, PdfaProfile profile) throws Exception { + Path tempInputFile = null; + byte[] fileBytes; + Path loPdfPath = null; + File preProcessedFile = null; + int pdfaPart = profile.part(); + + try { + tempInputFile = inputPath; + + if (pdfaPart == 2 || pdfaPart == 3) { + preProcessedFile = tempInputFile.toFile(); + } else { + preProcessedFile = preProcessHighlights(tempInputFile.toFile()); + } + + Set missingFonts; + boolean needImgs; + try (PDDocument doc = Loader.loadPDF(preProcessedFile)) { + missingFonts = findUnembeddedFontNames(doc); + needImgs = (pdfaPart == 1) && hasTransparentImages(doc); + if (!missingFonts.isEmpty() || needImgs) { + loPdfPath = runLibreOfficeConversion(preProcessedFile.toPath(), pdfaPart); + } + } + fileBytes = + convertToPdfA( + preProcessedFile.toPath(), loPdfPath, pdfaPart, missingFonts, needImgs); + + return fileBytes; + + } finally { + if (loPdfPath != null && loPdfPath.getParent() != null) { + FileUtils.deleteDirectory(loPdfPath.getParent().toFile()); + } + if (preProcessedFile != null && !preProcessedFile.equals(tempInputFile.toFile())) { + Files.deleteIfExists(preProcessedFile.toPath()); + } + } + } + + private void copyResourceIcc(Path target) throws IOException { + try (InputStream in = getClass().getResourceAsStream(ICC_RESOURCE_PATH)) { + if (in == null) { + throw ExceptionUtils.createIllegalArgumentException( + "error.resourceNotFound", "Resource not found: {0}", ICC_RESOURCE_PATH); + } + Files.copy(in, target, StandardCopyOption.REPLACE_EXISTING); + } + } + + private void addICCProfileIfNotPresent(PDDocument document) { + if (document.getDocumentCatalog().getOutputIntents().isEmpty()) { + try (InputStream colorProfile = getClass().getResourceAsStream("/icc/sRGB2014.icc")) { + if (colorProfile == null) { + throw ExceptionUtils.createIllegalArgumentException( + "error.resourceNotFound", + "Resource not found: {0}", + "/icc/sRGB2014.icc"); + } + PDOutputIntent outputIntent = new PDOutputIntent(document, colorProfile); + outputIntent.setInfo("sRGB IEC61966-2.1"); + outputIntent.setOutputCondition("sRGB IEC61966-2.1"); + outputIntent.setOutputConditionIdentifier("sRGB IEC61966-2.1"); + outputIntent.setRegistryName("http://www.color.org"); + document.getDocumentCatalog().addOutputIntent(outputIntent); + } catch (Exception e) { + log.error("Failed to load ICC profile: {}", e.getMessage()); + } + } + } + private enum PdfaProfile { PDF_A_1B(1, "PDF/A-1b", "_PDFA-1b.pdf", "1.4", Format.PDF_A1B, "pdfa-1"), PDF_A_2B(2, "PDF/A-2b", "_PDFA-2b.pdf", "1.7", null, "pdfa", "pdfa-2", "pdfa-2b"), @@ -1212,9 +1231,9 @@ public class ConvertPDFToPDFA { } private enum PdfXProfile { - PDF_X_1(1, "PDF/X-1", "_PDFX-1.pdf", "1.3", "2001", "pdfx-1", "pdfx"), - PDF_X_3(3, "PDF/X-3", "_PDFX-3.pdf", "1.3", "2003", "pdfx-3"), - PDF_X_4(4, "PDF/X-4", "_PDFX-4.pdf", "1.4", "2008", "pdfx-4"); + PDF_X_1("PDF/X-1", "_PDFX-1.pdf", "1.3", "2001", "pdfx-1", "pdfx"), + PDF_X_3("PDF/X-3", "_PDFX-3.pdf", "1.3", "2003", "pdfx-3"), + PDF_X_4("PDF/X-4", "_PDFX-4.pdf", "1.4", "2008", "pdfx-4"); private final String displayName; private final String suffix; @@ -1223,7 +1242,6 @@ public class ConvertPDFToPDFA { private final List requestTokens; PdfXProfile( - int version, String displayName, String suffix, String compatibilityLevel, @@ -1270,6 +1288,5 @@ public class ConvertPDFToPDFA { } } - /** Record to hold color profile paths */ private record ColorProfiles(Path rgb, Path gray) {} } From 69140b4b03bab2c6e25d37610cc064bbee5e8e05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20Sz=C3=BCcs?= Date: Sat, 8 Nov 2025 12:15:19 +0100 Subject: [PATCH 04/14] refactor(pdf-conversion): improve request token handling and resource validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Added null checks for PDResources to prevent possible null pointer exceptions - Streamlined request token processing by normalizing input directly in initialization - Simplified filter logic for profile matching by leveraging pre-normalized tokens - Removed redundant option from the PDF/A format dropdown in the UI template Signed-off-by: Balázs Szücs --- .../api/converters/ConvertPDFToPDFA.java | 24 +++++++++---------- .../templates/convert/pdf-to-pdfa.html | 1 - 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToPDFA.java b/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToPDFA.java index 022f5fe61..ae3205ae7 100644 --- a/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToPDFA.java +++ b/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToPDFA.java @@ -537,6 +537,8 @@ public class ConvertPDFToPDFA { PDResources loRes = loPage.getResources(); PDResources baseRes = basePage.getResources(); + if (loRes == null || baseRes == null) continue; + Set toReplace = detectTransparentXObjects(basePage); for (COSName name : toReplace) { @@ -1189,7 +1191,10 @@ public class ConvertPDFToPDFA { this.suffix = suffix; this.compatibilityLevel = compatibilityLevel; this.preflightFormat = preflightFormat; - this.requestTokens = Arrays.asList(requestTokens); + this.requestTokens = + Arrays.stream(requestTokens) + .map(token -> token.toLowerCase(Locale.ROOT)) + .toList(); } static PdfaProfile fromRequest(String requestToken) { @@ -1199,11 +1204,7 @@ public class ConvertPDFToPDFA { String normalized = requestToken.trim().toLowerCase(Locale.ROOT); Optional match = Arrays.stream(values()) - .filter( - profile -> - profile.requestTokens.stream() - .map(token -> token.toLowerCase(Locale.ROOT)) - .anyMatch(token -> token.equals(normalized))) + .filter(profile -> profile.requestTokens.contains(normalized)) .findFirst(); return match.orElse(PDF_A_2B); @@ -1251,7 +1252,10 @@ public class ConvertPDFToPDFA { this.suffix = suffix; this.compatibilityLevel = compatibilityLevel; this.pdfxVersion = pdfxVersion; - this.requestTokens = Arrays.asList(requestTokens); + this.requestTokens = + Arrays.stream(requestTokens) + .map(token -> token.toLowerCase(Locale.ROOT)) + .toList(); } static PdfXProfile fromRequest(String requestToken) { @@ -1261,11 +1265,7 @@ public class ConvertPDFToPDFA { String normalized = requestToken.trim().toLowerCase(Locale.ROOT); Optional match = Arrays.stream(values()) - .filter( - profile -> - profile.requestTokens.stream() - .map(token -> token.toLowerCase(Locale.ROOT)) - .anyMatch(token -> token.equals(normalized))) + .filter(profile -> profile.requestTokens.contains(normalized)) .findFirst(); return match.orElse(PDF_X_4); diff --git a/app/core/src/main/resources/templates/convert/pdf-to-pdfa.html b/app/core/src/main/resources/templates/convert/pdf-to-pdfa.html index 2e9ed11ef..5b44ff5b6 100644 --- a/app/core/src/main/resources/templates/convert/pdf-to-pdfa.html +++ b/app/core/src/main/resources/templates/convert/pdf-to-pdfa.html @@ -25,7 +25,6 @@