fix(markdown): markdown conversion image handling and zip support (#5677)

This commit is contained in:
Balázs Szücs
2026-02-12 00:31:41 +01:00
committed by GitHub
parent e523190f39
commit f88f1db7e7
4 changed files with 254 additions and 58 deletions

View File

@@ -107,56 +107,65 @@ public class PDFToFile {
File[] outputFiles =
Objects.requireNonNull(tempOutputDir.getPath().toFile().listFiles());
List<File> markdownFiles = new ArrayList<>();
List<File> imageFiles = new ArrayList<>();
// Convert HTML files to Markdown
// Convert HTML files to Markdown and collect image files
for (File outputFile : outputFiles) {
if (outputFile.getName().endsWith(".html")) {
String html = Files.readString(outputFile.toPath());
String markdown = htmlToMarkdownConverter.convert(html);
// Update image references to point to images/ folder
markdown = updateImageReferences(markdown);
String mdFileName = outputFile.getName().replace(".html", ".md");
File mdFile = new File(tempOutputDir.getPath().toFile(), mdFileName);
Files.writeString(mdFile.toPath(), markdown);
markdownFiles.add(mdFile);
} else if (!outputFile.getName().endsWith(".md")) {
// Collect non-HTML, non-MD files as images/assets
imageFiles.add(outputFile);
}
}
// If there's only one markdown file, return it directly
if (markdownFiles.size() == 1) {
fileName = pdfBaseName + ".md";
fileBytes = Files.readAllBytes(markdownFiles.get(0).toPath());
} else {
// Multiple files - create a zip
fileName = pdfBaseName + "ToMarkdown.zip";
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
// Always create a ZIP file
fileName = pdfBaseName + "ToMarkdown.zip";
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
try (ZipOutputStream zipOutputStream = new ZipOutputStream(byteArrayOutputStream)) {
// Add markdown files
for (File mdFile : markdownFiles) {
ZipEntry mdEntry = new ZipEntry(mdFile.getName());
zipOutputStream.putNextEntry(mdEntry);
Files.copy(mdFile.toPath(), zipOutputStream);
zipOutputStream.closeEntry();
}
// Add images and other assets
for (File file : outputFiles) {
if (!file.getName().endsWith(".html") && !file.getName().endsWith(".md")) {
ZipEntry assetEntry = new ZipEntry(file.getName());
zipOutputStream.putNextEntry(assetEntry);
Files.copy(file.toPath(), zipOutputStream);
zipOutputStream.closeEntry();
}
}
try (ZipOutputStream zipOutputStream = new ZipOutputStream(byteArrayOutputStream)) {
// Add markdown files to root of ZIP
for (File mdFile : markdownFiles) {
ZipEntry mdEntry = new ZipEntry(mdFile.getName());
zipOutputStream.putNextEntry(mdEntry);
Files.copy(mdFile.toPath(), zipOutputStream);
zipOutputStream.closeEntry();
}
fileBytes = byteArrayOutputStream.toByteArray();
// Add images and other assets to images/ folder
for (File imageFile : imageFiles) {
ZipEntry assetEntry = new ZipEntry("images/" + imageFile.getName());
zipOutputStream.putNextEntry(assetEntry);
Files.copy(imageFile.toPath(), zipOutputStream);
zipOutputStream.closeEntry();
}
}
fileBytes = byteArrayOutputStream.toByteArray();
}
return WebResponseUtils.bytesToWebResponse(
fileBytes, fileName, MediaType.APPLICATION_OCTET_STREAM);
}
/**
 * Matches a Markdown image whose target is a local, relative path that still needs the
 * {@code images/} prefix. Group 1 is the {@code ![alt]} part, group 2 the target.
 *
 * <p>Targets are left alone when they already start with {@code images/}, start with {@code /},
 * or carry a URI scheme such as {@code https:} or {@code data:}.
 */
private static final java.util.regex.Pattern LOCAL_IMAGE_REFERENCE =
        java.util.regex.Pattern.compile(
                "(!\\[.*?\\])\\((?!images/)(?![A-Za-z][A-Za-z0-9+.-]*:)([^/)][^)]*?)\\)");

/**
 * Rewrites local image references in Markdown so they point into the {@code images/} folder,
 * e.g. {@code ![alt](filename.png)} becomes {@code ![alt](images/filename.png)}.
 *
 * <p>References that already start with {@code images/}, that start with {@code /}, or that
 * are external ({@code http:}, {@code https:}, {@code data:}, ...) are returned unchanged;
 * prefixing those would turn a working link into a broken relative path.
 *
 * @param markdown the Markdown text to rewrite
 * @return the Markdown text with local image targets prefixed by {@code images/}
 */
private String updateImageReferences(String markdown) {
    return LOCAL_IMAGE_REFERENCE.matcher(markdown).replaceAll("$1(images/$2)");
}
public ResponseEntity<byte[]> processPdfToHtml(MultipartFile inputFile)
throws IOException, InterruptedException {
if (!MediaType.APPLICATION_PDF_VALUE.equals(inputFile.getContentType())) {

View File

@@ -153,11 +153,12 @@ class PDFToFileTest {
MediaType.APPLICATION_PDF_VALUE,
"Fake PDF content".getBytes());
// Create a mock HTML output file
// Create a mock HTML output file with image references
Path htmlOutputFile = tempDir.resolve("test.html");
Files.write(
htmlOutputFile,
"<html><body><h1>Test</h1><p>This is a test.</p></body></html>".getBytes());
"<html><body><h1>Test</h1><p>This is a test.</p><img src=\"image1.png\" /></body></html>"
.getBytes());
// Setup ProcessExecutor mock
mockedStaticProcessExecutor
@@ -174,18 +175,61 @@ class PDFToFileTest {
Files.copy(
htmlOutputFile, Path.of(outputDir.getPath(), "test.html"));
// Create a mock image file
Files.write(
Path.of(outputDir.getPath(), "image1.png"),
"Fake image data".getBytes());
return mockExecutorResult;
});
// Execute the method
ResponseEntity<byte[]> response = pdfToFile.processPdfToMarkdown(pdfFile);
// Verify
// Verify - should now return a ZIP file instead of plain markdown
assertEquals(HttpStatus.OK, response.getStatusCode());
assertNotNull(response.getBody());
assertTrue(response.getBody().length > 0);
// Verify content disposition indicates a ZIP file
assertTrue(
response.getHeaders().getContentDisposition().toString().contains("test.md"));
response.getHeaders()
.getContentDisposition()
.toString()
.contains("ToMarkdown.zip"));
// Verify the content by unzipping it
try (ZipInputStream zipStream =
ZipSecurity.createHardenedInputStream(
new java.io.ByteArrayInputStream(response.getBody()))) {
ZipEntry entry;
boolean foundMdFile = false;
boolean foundImageInFolder = false;
String markdownContent = null;
while ((entry = zipStream.getNextEntry()) != null) {
if (entry.getName().endsWith(".md")) {
foundMdFile = true;
// Read markdown content to verify image references
markdownContent =
new String(
zipStream.readAllBytes(),
java.nio.charset.StandardCharsets.UTF_8);
} else if (entry.getName().startsWith("images/")
&& entry.getName().endsWith(".png")) {
foundImageInFolder = true;
}
zipStream.closeEntry();
}
assertTrue(foundMdFile, "ZIP should contain Markdown file");
assertTrue(foundImageInFolder, "ZIP should contain image in images/ folder");
assertNotNull(markdownContent, "Markdown content should be present");
// Verify markdown references images with images/ prefix
assertTrue(
markdownContent.contains("images/"),
"Markdown should reference images with images/ prefix");
}
}
}
@@ -256,14 +300,15 @@ class PDFToFileTest {
while ((entry = zipStream.getNextEntry()) != null) {
if (entry.getName().endsWith(".md")) {
foundMdFiles = true;
} else if (entry.getName().endsWith(".png")) {
} else if (entry.getName().startsWith("images/")
&& entry.getName().endsWith(".png")) {
foundImage = true;
}
zipStream.closeEntry();
}
assertTrue(foundMdFiles, "ZIP should contain Markdown files");
assertTrue(foundImage, "ZIP should contain image files");
assertTrue(foundImage, "ZIP should contain image files in images/ folder");
}
}
}

View File

@@ -44,7 +44,7 @@ public class ConvertMarkdownToPdf {
@Operation(
summary = "Convert a Markdown file to PDF",
description =
"This endpoint takes a Markdown file input, converts it to HTML, and then to"
"This endpoint takes a Markdown file or ZIP (containing Markdown + images) input, converts it to HTML, and then to"
+ " PDF format. Input:MARKDOWN Output:PDF Type:SISO")
public ResponseEntity<byte[]> markdownToPdf(@ModelAttribute GeneralFile generalFile)
throws Exception {
@@ -52,40 +52,181 @@ public class ConvertMarkdownToPdf {
if (fileInput == null) {
throw ExceptionUtils.createIllegalArgumentException(
"error.fileFormatRequired", "File must be in {0} format", "Markdown");
"error.fileFormatRequired", "File must be in {0} format", "Markdown or ZIP");
}
String originalFilename = Filenames.toSimpleFileName(fileInput.getOriginalFilename());
if (originalFilename == null || !originalFilename.endsWith(".md")) {
if (originalFilename == null) {
throw ExceptionUtils.createIllegalArgumentException(
"error.fileFormatRequired", "File must be in {0} format", ".md");
"error.fileFormatRequired", "File must be in {0} format", ".md or .zip");
}
// Convert Markdown to HTML using CommonMark
List<Extension> extensions = List.of(TablesExtension.create());
Parser parser = Parser.builder().extensions(extensions).build();
boolean isZip = originalFilename.toLowerCase().endsWith(".zip");
boolean isMarkdown = originalFilename.toLowerCase().endsWith(".md");
Node document = parser.parse(new String(fileInput.getBytes()));
HtmlRenderer renderer =
HtmlRenderer.builder()
.attributeProviderFactory(context -> new TableAttributeProvider())
.extensions(extensions)
.build();
if (!isZip && !isMarkdown) {
throw ExceptionUtils.createIllegalArgumentException(
"error.fileFormatRequired", "File must be in {0} format", ".md or .zip");
}
String htmlContent = renderer.render(document);
byte[] pdfBytes;
String outputFilename;
if (isZip) {
// Handle ZIP file containing markdown + images
try (TempDirectory tempDir = new TempDirectory(tempFileManager)) {
// Extract ZIP to temp directory
java.nio.file.Path tempDirPath = tempDir.getPath();
try (java.util.zip.ZipInputStream zipIn =
io.github.pixee.security.ZipSecurity.createHardenedInputStream(
new java.io.ByteArrayInputStream(fileInput.getBytes()))) {
java.util.zip.ZipEntry entry;
while ((entry = zipIn.getNextEntry()) != null) {
if (!entry.isDirectory()) {
java.nio.file.Path filePath = tempDirPath.resolve(entry.getName());
java.nio.file.Files.createDirectories(filePath.getParent());
java.nio.file.Files.copy(zipIn, filePath);
}
zipIn.closeEntry();
}
}
// Find the markdown file (look for .md files, prefer index.md or first one)
java.io.File markdownFile = findMarkdownFile(tempDirPath.toFile());
if (markdownFile == null) {
throw ExceptionUtils.createIllegalArgumentException(
"error.fileFormatRequired",
"ZIP must contain at least one {0} file",
".md");
}
// Read and convert markdown to HTML
String markdownContent = java.nio.file.Files.readString(markdownFile.toPath());
List<Extension> extensions = List.of(TablesExtension.create());
Parser parser = Parser.builder().extensions(extensions).build();
Node document = parser.parse(markdownContent);
HtmlRenderer renderer =
HtmlRenderer.builder()
.attributeProviderFactory(context -> new TableAttributeProvider())
.extensions(extensions)
.build();
String htmlContent = renderer.render(document);
// Create a new ZIP with HTML + images for WeasyPrint
byte[] htmlZipBytes = createHtmlZip(htmlContent, tempDirPath.toFile());
// Use FileToPdf which already supports ZIP files with images
pdfBytes =
FileToPdf.convertHtmlToPdf(
runtimePathConfig.getWeasyPrintPath(),
null,
htmlZipBytes,
"package.zip",
tempFileManager,
customHtmlSanitizer);
outputFilename =
GeneralUtils.generateFilename(
originalFilename.substring(0, originalFilename.lastIndexOf('.')),
".pdf");
}
} else {
// Handle plain markdown file (no images)
List<Extension> extensions = List.of(TablesExtension.create());
Parser parser = Parser.builder().extensions(extensions).build();
Node document = parser.parse(new String(fileInput.getBytes()));
HtmlRenderer renderer =
HtmlRenderer.builder()
.attributeProviderFactory(context -> new TableAttributeProvider())
.extensions(extensions)
.build();
String htmlContent = renderer.render(document);
pdfBytes =
FileToPdf.convertHtmlToPdf(
runtimePathConfig.getWeasyPrintPath(),
null,
htmlContent.getBytes(),
"converted.html",
tempFileManager,
customHtmlSanitizer);
outputFilename = GeneralUtils.generateFilename(originalFilename, ".pdf");
}
byte[] pdfBytes =
FileToPdf.convertHtmlToPdf(
runtimePathConfig.getWeasyPrintPath(),
null,
htmlContent.getBytes(),
"converted.html",
tempFileManager,
customHtmlSanitizer);
pdfBytes = pdfDocumentFactory.createNewBytesBasedOnOldDocument(pdfBytes);
String outputFilename = GeneralUtils.generateFilename(originalFilename, ".pdf");
return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename);
}
/**
 * Locates the Markdown document to convert inside a directory tree.
 *
 * <p>A regular file named {@code index.md} directly under {@code directory} is preferred.
 * Otherwise the tree is walked and, among regular files whose name ends in {@code .md}
 * (case-insensitive), the shallowest one is chosen, with ties broken by path order. The
 * result therefore does not depend on the order in which the file system lists entries.
 *
 * @param directory root of the directory tree to search
 * @return the selected Markdown file, or {@code null} if the tree contains none
 * @throws java.io.IOException if the directory tree cannot be traversed
 */
private java.io.File findMarkdownFile(java.io.File directory) throws java.io.IOException {
    java.io.File indexMd = new java.io.File(directory, "index.md");
    // isFile() rather than exists(): a directory that happens to be called "index.md"
    // cannot be read as Markdown and must not short-circuit the search.
    if (indexMd.isFile()) {
        return indexMd;
    }

    java.util.Comparator<java.nio.file.Path> shallowestFirst =
            java.util.Comparator.comparingInt(java.nio.file.Path::getNameCount);

    try (java.util.stream.Stream<java.nio.file.Path> paths =
            java.nio.file.Files.walk(directory.toPath())) {
        return paths.filter(java.nio.file.Files::isRegularFile)
                .filter(
                        p ->
                                p.getFileName()
                                        .toString()
                                        .toLowerCase(java.util.Locale.ROOT)
                                        .endsWith(".md"))
                .min(
                        shallowestFirst.thenComparing(
                                java.util.Comparator.<java.nio.file.Path>naturalOrder()))
                .map(java.nio.file.Path::toFile)
                .orElse(null);
    }
}
/**
 * Builds an in-memory ZIP archive whose first entry is {@code index.html} holding the given
 * HTML (encoded as UTF-8), followed by whatever {@code addDirectoryToZip} contributes for
 * {@code sourceDir}.
 *
 * <p>NOTE(review): a non-Markdown file named {@code index.html} at the root of {@code
 * sourceDir} would presumably collide with the entry written here - confirm how that case
 * should be handled.
 *
 * @param htmlContent rendered HTML to store as {@code index.html}
 * @param sourceDir directory whose files accompany the HTML
 * @return the bytes of the finished ZIP archive
 * @throws java.io.IOException if writing an entry or reading a source file fails
 */
private byte[] createHtmlZip(String htmlContent, java.io.File sourceDir)
        throws java.io.IOException {
    java.io.ByteArrayOutputStream buffer = new java.io.ByteArrayOutputStream();

    try (java.util.zip.ZipOutputStream archive = new java.util.zip.ZipOutputStream(buffer)) {
        // The HTML document goes first, at the archive root.
        archive.putNextEntry(new java.util.zip.ZipEntry("index.html"));
        byte[] htmlBytes = htmlContent.getBytes(java.nio.charset.StandardCharsets.UTF_8);
        archive.write(htmlBytes);
        archive.closeEntry();

        // Everything else is taken from the source directory, relative to its root.
        java.nio.file.Path assetRoot = sourceDir.toPath();
        addDirectoryToZip(archive, assetRoot, assetRoot);
    }

    // The stream is closed at this point, so the central directory has been written.
    return buffer.toByteArray();
}
/**
 * Adds every regular file found under {@code sourceDir} (at any depth) to the ZIP, skipping
 * files whose name ends in {@code .md} (case-insensitive).
 *
 * <p>Entry names are the file's path relative to {@code rootDir}, always written with {@code
 * '/'} separators so the archive is laid out the same way regardless of the platform's native
 * separator.
 *
 * @param zos open ZIP stream that receives the entries; it is not closed here
 * @param sourceDir directory whose contents are added
 * @param rootDir directory that entry names are made relative to
 * @throws java.io.IOException if the tree cannot be walked or an entry cannot be written
 */
private void addDirectoryToZip(
        java.util.zip.ZipOutputStream zos,
        java.nio.file.Path sourceDir,
        java.nio.file.Path rootDir)
        throws java.io.IOException {
    // A single walk visits the whole subtree; no manual recursion per directory is needed.
    try (java.util.stream.Stream<java.nio.file.Path> paths =
            java.nio.file.Files.walk(sourceDir)) {
        for (java.nio.file.Path path : paths.toList()) {
            if (!java.nio.file.Files.isRegularFile(path)) {
                continue;
            }
            String fileName = path.getFileName().toString().toLowerCase(java.util.Locale.ROOT);
            if (fileName.endsWith(".md")) {
                continue;
            }
            // Path.toString() uses the platform separator; ZIP entry names must use '/'.
            String entryName =
                    rootDir.relativize(path).toString().replace(java.io.File.separatorChar, '/');
            zos.putNextEntry(new java.util.zip.ZipEntry(entryName));
            java.nio.file.Files.copy(path, zos);
            zos.closeEntry();
        }
    }
}
}
class TableAttributeProvider implements AttributeProvider {

View File

@@ -233,7 +233,8 @@ Feature: API Validation
When I send the API request to the endpoint "/api/v1/convert/pdf/markdown"
Then the response status code should be 200
And the response file should have size greater than 100
And the response file should have extension ".md"
And the response file should have extension ".zip"
And the response ZIP should contain 4 files
@positive @pdftocsv