fix(markdown): markdown conversion image handling and zip support (#5677)

2026-04-16 23:08:38 +02:00 · 2026-02-12 00:31:41 +01:00
parent e523190f39
commit f88f1db7e7
4 changed files with 254 additions and 58 deletions
--- a/app/common/src/main/java/stirling/software/common/util/PDFToFile.java
+++ b/app/common/src/main/java/stirling/software/common/util/PDFToFile.java
@@ -107,56 +107,65 @@ public class PDFToFile {
            File[] outputFiles =
                    Objects.requireNonNull(tempOutputDir.getPath().toFile().listFiles());
            List<File> markdownFiles = new ArrayList<>();
+            List<File> imageFiles = new ArrayList<>();

-            // Convert HTML files to Markdown
+            // Convert HTML files to Markdown and collect image files
            for (File outputFile : outputFiles) {
                if (outputFile.getName().endsWith(".html")) {
                    String html = Files.readString(outputFile.toPath());
                    String markdown = htmlToMarkdownConverter.convert(html);

+                    // Update image references to point to images/ folder
+                    markdown = updateImageReferences(markdown);
+
                    String mdFileName = outputFile.getName().replace(".html", ".md");
                    File mdFile = new File(tempOutputDir.getPath().toFile(), mdFileName);
                    Files.writeString(mdFile.toPath(), markdown);
                    markdownFiles.add(mdFile);
+                } else if (!outputFile.getName().endsWith(".md")) {
+                    // Collect non-HTML, non-MD files as images/assets
+                    imageFiles.add(outputFile);
                }
            }

-            // If there's only one markdown file, return it directly
-            if (markdownFiles.size() == 1) {
-                fileName = pdfBaseName + ".md";
-                fileBytes = Files.readAllBytes(markdownFiles.get(0).toPath());
-            } else {
-                // Multiple files - create a zip
-                fileName = pdfBaseName + "ToMarkdown.zip";
-                ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
+            // Always create a ZIP file
+            fileName = pdfBaseName + "ToMarkdown.zip";
+            ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();

-                try (ZipOutputStream zipOutputStream = new ZipOutputStream(byteArrayOutputStream)) {
-                    // Add markdown files
-                    for (File mdFile : markdownFiles) {
-                        ZipEntry mdEntry = new ZipEntry(mdFile.getName());
-                        zipOutputStream.putNextEntry(mdEntry);
-                        Files.copy(mdFile.toPath(), zipOutputStream);
-                        zipOutputStream.closeEntry();
-                    }
-
-                    // Add images and other assets
-                    for (File file : outputFiles) {
-                        if (!file.getName().endsWith(".html") && !file.getName().endsWith(".md")) {
-                            ZipEntry assetEntry = new ZipEntry(file.getName());
-                            zipOutputStream.putNextEntry(assetEntry);
-                            Files.copy(file.toPath(), zipOutputStream);
-                            zipOutputStream.closeEntry();
-                        }
-                    }
+            try (ZipOutputStream zipOutputStream = new ZipOutputStream(byteArrayOutputStream)) {
+                // Add markdown files to root of ZIP
+                for (File mdFile : markdownFiles) {
+                    ZipEntry mdEntry = new ZipEntry(mdFile.getName());
+                    zipOutputStream.putNextEntry(mdEntry);
+                    Files.copy(mdFile.toPath(), zipOutputStream);
+                    zipOutputStream.closeEntry();
                }

-                fileBytes = byteArrayOutputStream.toByteArray();
+                // Add images and other assets to images/ folder
+                for (File imageFile : imageFiles) {
+                    ZipEntry assetEntry = new ZipEntry("images/" + imageFile.getName());
+                    zipOutputStream.putNextEntry(assetEntry);
+                    Files.copy(imageFile.toPath(), zipOutputStream);
+                    zipOutputStream.closeEntry();
+                }
            }
+
+            fileBytes = byteArrayOutputStream.toByteArray();
        }
        return WebResponseUtils.bytesToWebResponse(
                fileBytes, fileName, MediaType.APPLICATION_OCTET_STREAM);
    }

+    /**
+     * Prefixes relative image targets with images/, e.g. ![alt](a.png) to ![alt](images/a.png).
+     * Targets under images/, starting with / or #, or carrying a URI scheme are left unchanged.
+     */
+    private String updateImageReferences(String markdown) {
+        return markdown.replaceAll(
+                "(!\\[.*?\\])\\((?!images/|[a-zA-Z][a-zA-Z0-9+.-]*:|#)([^/)][^)]*?)\\)",
+                "$1(images/$2)");
+    }
+
    public ResponseEntity<byte[]> processPdfToHtml(MultipartFile inputFile)
            throws IOException, InterruptedException {
        if (!MediaType.APPLICATION_PDF_VALUE.equals(inputFile.getContentType())) {
--- a/app/common/src/test/java/stirling/software/common/util/PDFToFileTest.java
+++ b/app/common/src/test/java/stirling/software/common/util/PDFToFileTest.java
@@ -153,11 +153,12 @@ class PDFToFileTest {
                            MediaType.APPLICATION_PDF_VALUE,
                            "Fake PDF content".getBytes());

-            // Create a mock HTML output file
+            // Create a mock HTML output file with image references
            Path htmlOutputFile = tempDir.resolve("test.html");
            Files.write(
                    htmlOutputFile,
-                    "<html><body><h1>Test</h1><p>This is a test.</p></body></html>".getBytes());
+                    "<html><body><h1>Test</h1><p>This is a test.</p><img src=\"image1.png\" /></body></html>"
+                            .getBytes());

            // Setup ProcessExecutor mock
            mockedStaticProcessExecutor
@@ -174,18 +175,61 @@ class PDFToFileTest {
                                Files.copy(
                                        htmlOutputFile, Path.of(outputDir.getPath(), "test.html"));

+                                // Create a mock image file
+                                Files.write(
+                                        Path.of(outputDir.getPath(), "image1.png"),
+                                        "Fake image data".getBytes());
+
                                return mockExecutorResult;
                            });

            // Execute the method
            ResponseEntity<byte[]> response = pdfToFile.processPdfToMarkdown(pdfFile);

-            // Verify
+            // Verify - should now return a ZIP file instead of plain markdown
            assertEquals(HttpStatus.OK, response.getStatusCode());
            assertNotNull(response.getBody());
            assertTrue(response.getBody().length > 0);
+
+            // Verify content disposition indicates a ZIP file
            assertTrue(
-                    response.getHeaders().getContentDisposition().toString().contains("test.md"));
+                    response.getHeaders()
+                            .getContentDisposition()
+                            .toString()
+                            .contains("ToMarkdown.zip"));
+
+            // Verify the content by unzipping it
+            try (ZipInputStream zipStream =
+                    ZipSecurity.createHardenedInputStream(
+                            new java.io.ByteArrayInputStream(response.getBody()))) {
+                ZipEntry entry;
+                boolean foundMdFile = false;
+                boolean foundImageInFolder = false;
+                String markdownContent = null;
+
+                while ((entry = zipStream.getNextEntry()) != null) {
+                    if (entry.getName().endsWith(".md")) {
+                        foundMdFile = true;
+                        // Read markdown content to verify image references
+                        markdownContent =
+                                new String(
+                                        zipStream.readAllBytes(),
+                                        java.nio.charset.StandardCharsets.UTF_8);
+                    } else if (entry.getName().startsWith("images/")
+                            && entry.getName().endsWith(".png")) {
+                        foundImageInFolder = true;
+                    }
+                    zipStream.closeEntry();
+                }
+
+                assertTrue(foundMdFile, "ZIP should contain Markdown file");
+                assertTrue(foundImageInFolder, "ZIP should contain image in images/ folder");
+                assertNotNull(markdownContent, "Markdown content should be present");
+                // Verify markdown references images with images/ prefix
+                assertTrue(
+                        markdownContent.contains("images/"),
+                        "Markdown should reference images with images/ prefix");
+            }
        }
    }

@@ -256,14 +300,15 @@ class PDFToFileTest {
                while ((entry = zipStream.getNextEntry()) != null) {
                    if (entry.getName().endsWith(".md")) {
                        foundMdFiles = true;
-                    } else if (entry.getName().endsWith(".png")) {
+                    } else if (entry.getName().startsWith("images/")
+                            && entry.getName().endsWith(".png")) {
                        foundImage = true;
                    }
                    zipStream.closeEntry();
                }

                assertTrue(foundMdFiles, "ZIP should contain Markdown files");
-                assertTrue(foundImage, "ZIP should contain image files");
+                assertTrue(foundImage, "ZIP should contain image files in images/ folder");
            }
        }
    }
--- a/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertMarkdownToPdf.java
+++ b/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertMarkdownToPdf.java
@@ -44,7 +44,7 @@ public class ConvertMarkdownToPdf {
    @Operation(
            summary = "Convert a Markdown file to PDF",
            description =
-                    "This endpoint takes a Markdown file input, converts it to HTML, and then to"
+                    "This endpoint takes a Markdown file or ZIP (containing Markdown + images) input, converts it to HTML, and then to"
                            + " PDF format. Input:MARKDOWN Output:PDF Type:SISO")
    public ResponseEntity<byte[]> markdownToPdf(@ModelAttribute GeneralFile generalFile)
            throws Exception {
@@ -52,40 +52,181 @@ public class ConvertMarkdownToPdf {

        if (fileInput == null) {
            throw ExceptionUtils.createIllegalArgumentException(
-                    "error.fileFormatRequired", "File must be in {0} format", "Markdown");
+                    "error.fileFormatRequired", "File must be in {0} format", "Markdown or ZIP");
        }

        String originalFilename = Filenames.toSimpleFileName(fileInput.getOriginalFilename());
-        if (originalFilename == null || !originalFilename.endsWith(".md")) {
+        if (originalFilename == null) {
            throw ExceptionUtils.createIllegalArgumentException(
-                    "error.fileFormatRequired", "File must be in {0} format", ".md");
+                    "error.fileFormatRequired", "File must be in {0} format", ".md or .zip");
        }

-        // Convert Markdown to HTML using CommonMark
-        List<Extension> extensions = List.of(TablesExtension.create());
-        Parser parser = Parser.builder().extensions(extensions).build();
+        boolean isZip = originalFilename.toLowerCase().endsWith(".zip");
+        boolean isMarkdown = originalFilename.toLowerCase().endsWith(".md");

-        Node document = parser.parse(new String(fileInput.getBytes()));
-        HtmlRenderer renderer =
-                HtmlRenderer.builder()
-                        .attributeProviderFactory(context -> new TableAttributeProvider())
-                        .extensions(extensions)
-                        .build();
+        if (!isZip && !isMarkdown) {
+            throw ExceptionUtils.createIllegalArgumentException(
+                    "error.fileFormatRequired", "File must be in {0} format", ".md or .zip");
+        }

-        String htmlContent = renderer.render(document);
+        byte[] pdfBytes;
+        String outputFilename;
+
+        if (isZip) {
+            // Handle ZIP file containing markdown + images
+            try (TempDirectory tempDir = new TempDirectory(tempFileManager)) {
+                // Extract ZIP to temp directory
+                java.nio.file.Path tempDirPath = tempDir.getPath();
+                try (java.util.zip.ZipInputStream zipIn =
+                        io.github.pixee.security.ZipSecurity.createHardenedInputStream(
+                                new java.io.ByteArrayInputStream(fileInput.getBytes()))) {
+                    java.util.zip.ZipEntry entry;
+                    while ((entry = zipIn.getNextEntry()) != null) {
+                        if (!entry.isDirectory()) {
+                            java.nio.file.Path filePath = tempDirPath.resolve(entry.getName());
+                            java.nio.file.Files.createDirectories(filePath.getParent());
+                            java.nio.file.Files.copy(zipIn, filePath);
+                        }
+                        zipIn.closeEntry();
+                    }
+                }
+
+                // Find the markdown file (look for .md files, prefer index.md or first one)
+                java.io.File markdownFile = findMarkdownFile(tempDirPath.toFile());
+                if (markdownFile == null) {
+                    throw ExceptionUtils.createIllegalArgumentException(
+                            "error.fileFormatRequired",
+                            "ZIP must contain at least one {0} file",
+                            ".md");
+                }
+
+                // Read and convert markdown to HTML
+                String markdownContent = java.nio.file.Files.readString(markdownFile.toPath());
+                List<Extension> extensions = List.of(TablesExtension.create());
+                Parser parser = Parser.builder().extensions(extensions).build();
+                Node document = parser.parse(markdownContent);
+                HtmlRenderer renderer =
+                        HtmlRenderer.builder()
+                                .attributeProviderFactory(context -> new TableAttributeProvider())
+                                .extensions(extensions)
+                                .build();
+                String htmlContent = renderer.render(document);
+
+                // Create a new ZIP with HTML + images for WeasyPrint
+                byte[] htmlZipBytes = createHtmlZip(htmlContent, tempDirPath.toFile());
+
+                // Use FileToPdf which already supports ZIP files with images
+                pdfBytes =
+                        FileToPdf.convertHtmlToPdf(
+                                runtimePathConfig.getWeasyPrintPath(),
+                                null,
+                                htmlZipBytes,
+                                "package.zip",
+                                tempFileManager,
+                                customHtmlSanitizer);
+
+                outputFilename =
+                        GeneralUtils.generateFilename(
+                                originalFilename.substring(0, originalFilename.lastIndexOf('.')),
+                                ".pdf");
+            }
+        } else {
+            // Handle plain markdown file (no images)
+            List<Extension> extensions = List.of(TablesExtension.create());
+            Parser parser = Parser.builder().extensions(extensions).build();
+
+            Node document = parser.parse(new String(fileInput.getBytes()));
+            HtmlRenderer renderer =
+                    HtmlRenderer.builder()
+                            .attributeProviderFactory(context -> new TableAttributeProvider())
+                            .extensions(extensions)
+                            .build();
+
+            String htmlContent = renderer.render(document);
+
+            pdfBytes =
+                    FileToPdf.convertHtmlToPdf(
+                            runtimePathConfig.getWeasyPrintPath(),
+                            null,
+                            htmlContent.getBytes(),
+                            "converted.html",
+                            tempFileManager,
+                            customHtmlSanitizer);
+
+            outputFilename = GeneralUtils.generateFilename(originalFilename, ".pdf");
+        }

-        byte[] pdfBytes =
-                FileToPdf.convertHtmlToPdf(
-                        runtimePathConfig.getWeasyPrintPath(),
-                        null,
-                        htmlContent.getBytes(),
-                        "converted.html",
-                        tempFileManager,
-                        customHtmlSanitizer);
        pdfBytes = pdfDocumentFactory.createNewBytesBasedOnOldDocument(pdfBytes);
-        String outputFilename = GeneralUtils.generateFilename(originalFilename, ".pdf");
        return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename);
    }
+
+    /**
+     * Finds the markdown file to convert inside the given directory.
+     *
+     * <p>Prefers a root-level index.md. Otherwise the directory tree is searched and the first
+     * regular file whose name ends with .md (case-insensitive), in sorted path order, is used.
+     *
+     * @param directory root of the directory tree to search
+     * @return the selected markdown file, or null when the tree contains none
+     * @throws java.io.IOException if the directory tree cannot be traversed
+     */
+    private java.io.File findMarkdownFile(java.io.File directory) throws java.io.IOException {
+        java.io.File indexMd = new java.io.File(directory, "index.md");
+        if (indexMd.isFile()) {
+            return indexMd;
+        }
+
+        // Directories named *.md must not be picked, and sorting keeps the choice independent of
+        // the order in which the file system happens to list entries.
+        try (java.util.stream.Stream<java.nio.file.Path> paths =
+                java.nio.file.Files.walk(directory.toPath())) {
+            return paths.filter(java.nio.file.Files::isRegularFile)
+                    .filter(p -> p.toString().toLowerCase(java.util.Locale.ROOT).endsWith(".md"))
+                    .sorted()
+                    .findFirst()
+                    .map(java.nio.file.Path::toFile)
+                    .orElse(null);
+        }
+    }
+
+    /**
+     * Creates a ZIP file containing the HTML content and all other files (images) from the
+     * directory.
+     */
+    private byte[] createHtmlZip(String htmlContent, java.io.File sourceDir)
+            throws java.io.IOException {
+        java.io.ByteArrayOutputStream baos = new java.io.ByteArrayOutputStream();
+
+        try (java.util.zip.ZipOutputStream zos = new java.util.zip.ZipOutputStream(baos)) {
+            // Add HTML file to root
+            java.util.zip.ZipEntry htmlEntry = new java.util.zip.ZipEntry("index.html");
+            zos.putNextEntry(htmlEntry);
+            zos.write(htmlContent.getBytes(java.nio.charset.StandardCharsets.UTF_8));
+            zos.closeEntry();
+
+            // Add all other files (images, etc.)
+            addDirectoryToZip(zos, sourceDir.toPath(), sourceDir.toPath());
+        }
+
+        return baos.toByteArray();
+    }
+
+    /** Recursively adds files from a directory to a ZIP, excluding .md files. */
+    private void addDirectoryToZip(
+            java.util.zip.ZipOutputStream zos,
+            java.nio.file.Path sourceDir,
+            java.nio.file.Path rootDir)
+            throws java.io.IOException {
+        try (java.util.stream.Stream<java.nio.file.Path> paths =
+                java.nio.file.Files.walk(sourceDir, 1)) {
+            for (java.nio.file.Path path : paths.toList()) {
+                if (java.nio.file.Files.isDirectory(path)) {
+                    if (!path.equals(sourceDir)) {
+                        addDirectoryToZip(zos, path, rootDir);
+                    }
+                } else if (!path.toString().toLowerCase().endsWith(".md")) {
+                    // Add file to ZIP, maintaining relative path structure
+                    java.nio.file.Path relativePath = rootDir.relativize(path);
+                    java.util.zip.ZipEntry entry =
+                            new java.util.zip.ZipEntry(relativePath.toString());
+                    zos.putNextEntry(entry);
+                    java.nio.file.Files.copy(path, zos);
+                    zos.closeEntry();
+                }
+            }
+        }
+    }
 }

 class TableAttributeProvider implements AttributeProvider {
--- a/testing/cucumber/features/external.feature
+++ b/testing/cucumber/features/external.feature
@@ -233,7 +233,8 @@ Feature: API Validation
        When I send the API request to the endpoint "/api/v1/convert/pdf/markdown"
        Then the response status code should be 200
        And the response file should have size greater than 100
-        And the response file should have extension ".md"
+        And the response file should have extension ".zip"
+        And the response ZIP should contain 4 files


    @positive @pdftocsv