mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2026-03-04 02:20:19 +01:00
fix(markdown): markdown conversion image handling and zip support (#5677)
This commit is contained in:
@@ -107,56 +107,65 @@ public class PDFToFile {
|
||||
File[] outputFiles =
|
||||
Objects.requireNonNull(tempOutputDir.getPath().toFile().listFiles());
|
||||
List<File> markdownFiles = new ArrayList<>();
|
||||
List<File> imageFiles = new ArrayList<>();
|
||||
|
||||
// Convert HTML files to Markdown
|
||||
// Convert HTML files to Markdown and collect image files
|
||||
for (File outputFile : outputFiles) {
|
||||
if (outputFile.getName().endsWith(".html")) {
|
||||
String html = Files.readString(outputFile.toPath());
|
||||
String markdown = htmlToMarkdownConverter.convert(html);
|
||||
|
||||
// Update image references to point to images/ folder
|
||||
markdown = updateImageReferences(markdown);
|
||||
|
||||
String mdFileName = outputFile.getName().replace(".html", ".md");
|
||||
File mdFile = new File(tempOutputDir.getPath().toFile(), mdFileName);
|
||||
Files.writeString(mdFile.toPath(), markdown);
|
||||
markdownFiles.add(mdFile);
|
||||
} else if (!outputFile.getName().endsWith(".md")) {
|
||||
// Collect non-HTML, non-MD files as images/assets
|
||||
imageFiles.add(outputFile);
|
||||
}
|
||||
}
|
||||
|
||||
// If there's only one markdown file, return it directly
|
||||
if (markdownFiles.size() == 1) {
|
||||
fileName = pdfBaseName + ".md";
|
||||
fileBytes = Files.readAllBytes(markdownFiles.get(0).toPath());
|
||||
} else {
|
||||
// Multiple files - create a zip
|
||||
fileName = pdfBaseName + "ToMarkdown.zip";
|
||||
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
|
||||
// Always create a ZIP file
|
||||
fileName = pdfBaseName + "ToMarkdown.zip";
|
||||
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
|
||||
|
||||
try (ZipOutputStream zipOutputStream = new ZipOutputStream(byteArrayOutputStream)) {
|
||||
// Add markdown files
|
||||
for (File mdFile : markdownFiles) {
|
||||
ZipEntry mdEntry = new ZipEntry(mdFile.getName());
|
||||
zipOutputStream.putNextEntry(mdEntry);
|
||||
Files.copy(mdFile.toPath(), zipOutputStream);
|
||||
zipOutputStream.closeEntry();
|
||||
}
|
||||
|
||||
// Add images and other assets
|
||||
for (File file : outputFiles) {
|
||||
if (!file.getName().endsWith(".html") && !file.getName().endsWith(".md")) {
|
||||
ZipEntry assetEntry = new ZipEntry(file.getName());
|
||||
zipOutputStream.putNextEntry(assetEntry);
|
||||
Files.copy(file.toPath(), zipOutputStream);
|
||||
zipOutputStream.closeEntry();
|
||||
}
|
||||
}
|
||||
try (ZipOutputStream zipOutputStream = new ZipOutputStream(byteArrayOutputStream)) {
|
||||
// Add markdown files to root of ZIP
|
||||
for (File mdFile : markdownFiles) {
|
||||
ZipEntry mdEntry = new ZipEntry(mdFile.getName());
|
||||
zipOutputStream.putNextEntry(mdEntry);
|
||||
Files.copy(mdFile.toPath(), zipOutputStream);
|
||||
zipOutputStream.closeEntry();
|
||||
}
|
||||
|
||||
fileBytes = byteArrayOutputStream.toByteArray();
|
||||
// Add images and other assets to images/ folder
|
||||
for (File imageFile : imageFiles) {
|
||||
ZipEntry assetEntry = new ZipEntry("images/" + imageFile.getName());
|
||||
zipOutputStream.putNextEntry(assetEntry);
|
||||
Files.copy(imageFile.toPath(), zipOutputStream);
|
||||
zipOutputStream.closeEntry();
|
||||
}
|
||||
}
|
||||
|
||||
fileBytes = byteArrayOutputStream.toByteArray();
|
||||
}
|
||||
return WebResponseUtils.bytesToWebResponse(
|
||||
fileBytes, fileName, MediaType.APPLICATION_OCTET_STREAM);
|
||||
}
|
||||
|
||||
/**
|
||||
* Updates image references in markdown to point to the images/ folder. Matches patterns like
|
||||
*  and converts to 
|
||||
*/
|
||||
private String updateImageReferences(String markdown) {
|
||||
// Match markdown image syntax: 
|
||||
// Only update if the path doesn't already start with images/
|
||||
return markdown.replaceAll("(!\\[.*?\\])\\((?!images/)([^/)][^)]*?)\\)", "$1(images/$2)");
|
||||
}
|
||||
|
||||
public ResponseEntity<byte[]> processPdfToHtml(MultipartFile inputFile)
|
||||
throws IOException, InterruptedException {
|
||||
if (!MediaType.APPLICATION_PDF_VALUE.equals(inputFile.getContentType())) {
|
||||
|
||||
@@ -153,11 +153,12 @@ class PDFToFileTest {
|
||||
MediaType.APPLICATION_PDF_VALUE,
|
||||
"Fake PDF content".getBytes());
|
||||
|
||||
// Create a mock HTML output file
|
||||
// Create a mock HTML output file with image references
|
||||
Path htmlOutputFile = tempDir.resolve("test.html");
|
||||
Files.write(
|
||||
htmlOutputFile,
|
||||
"<html><body><h1>Test</h1><p>This is a test.</p></body></html>".getBytes());
|
||||
"<html><body><h1>Test</h1><p>This is a test.</p><img src=\"image1.png\" /></body></html>"
|
||||
.getBytes());
|
||||
|
||||
// Setup ProcessExecutor mock
|
||||
mockedStaticProcessExecutor
|
||||
@@ -174,18 +175,61 @@ class PDFToFileTest {
|
||||
Files.copy(
|
||||
htmlOutputFile, Path.of(outputDir.getPath(), "test.html"));
|
||||
|
||||
// Create a mock image file
|
||||
Files.write(
|
||||
Path.of(outputDir.getPath(), "image1.png"),
|
||||
"Fake image data".getBytes());
|
||||
|
||||
return mockExecutorResult;
|
||||
});
|
||||
|
||||
// Execute the method
|
||||
ResponseEntity<byte[]> response = pdfToFile.processPdfToMarkdown(pdfFile);
|
||||
|
||||
// Verify
|
||||
// Verify - should now return a ZIP file instead of plain markdown
|
||||
assertEquals(HttpStatus.OK, response.getStatusCode());
|
||||
assertNotNull(response.getBody());
|
||||
assertTrue(response.getBody().length > 0);
|
||||
|
||||
// Verify content disposition indicates a ZIP file
|
||||
assertTrue(
|
||||
response.getHeaders().getContentDisposition().toString().contains("test.md"));
|
||||
response.getHeaders()
|
||||
.getContentDisposition()
|
||||
.toString()
|
||||
.contains("ToMarkdown.zip"));
|
||||
|
||||
// Verify the content by unzipping it
|
||||
try (ZipInputStream zipStream =
|
||||
ZipSecurity.createHardenedInputStream(
|
||||
new java.io.ByteArrayInputStream(response.getBody()))) {
|
||||
ZipEntry entry;
|
||||
boolean foundMdFile = false;
|
||||
boolean foundImageInFolder = false;
|
||||
String markdownContent = null;
|
||||
|
||||
while ((entry = zipStream.getNextEntry()) != null) {
|
||||
if (entry.getName().endsWith(".md")) {
|
||||
foundMdFile = true;
|
||||
// Read markdown content to verify image references
|
||||
markdownContent =
|
||||
new String(
|
||||
zipStream.readAllBytes(),
|
||||
java.nio.charset.StandardCharsets.UTF_8);
|
||||
} else if (entry.getName().startsWith("images/")
|
||||
&& entry.getName().endsWith(".png")) {
|
||||
foundImageInFolder = true;
|
||||
}
|
||||
zipStream.closeEntry();
|
||||
}
|
||||
|
||||
assertTrue(foundMdFile, "ZIP should contain Markdown file");
|
||||
assertTrue(foundImageInFolder, "ZIP should contain image in images/ folder");
|
||||
assertNotNull(markdownContent, "Markdown content should be present");
|
||||
// Verify markdown references images with images/ prefix
|
||||
assertTrue(
|
||||
markdownContent.contains("images/"),
|
||||
"Markdown should reference images with images/ prefix");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -256,14 +300,15 @@ class PDFToFileTest {
|
||||
while ((entry = zipStream.getNextEntry()) != null) {
|
||||
if (entry.getName().endsWith(".md")) {
|
||||
foundMdFiles = true;
|
||||
} else if (entry.getName().endsWith(".png")) {
|
||||
} else if (entry.getName().startsWith("images/")
|
||||
&& entry.getName().endsWith(".png")) {
|
||||
foundImage = true;
|
||||
}
|
||||
zipStream.closeEntry();
|
||||
}
|
||||
|
||||
assertTrue(foundMdFiles, "ZIP should contain Markdown files");
|
||||
assertTrue(foundImage, "ZIP should contain image files");
|
||||
assertTrue(foundImage, "ZIP should contain image files in images/ folder");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -44,7 +44,7 @@ public class ConvertMarkdownToPdf {
|
||||
@Operation(
|
||||
summary = "Convert a Markdown file to PDF",
|
||||
description =
|
||||
"This endpoint takes a Markdown file input, converts it to HTML, and then to"
|
||||
"This endpoint takes a Markdown file or ZIP (containing Markdown + images) input, converts it to HTML, and then to"
|
||||
+ " PDF format. Input:MARKDOWN Output:PDF Type:SISO")
|
||||
public ResponseEntity<byte[]> markdownToPdf(@ModelAttribute GeneralFile generalFile)
|
||||
throws Exception {
|
||||
@@ -52,40 +52,181 @@ public class ConvertMarkdownToPdf {
|
||||
|
||||
if (fileInput == null) {
|
||||
throw ExceptionUtils.createIllegalArgumentException(
|
||||
"error.fileFormatRequired", "File must be in {0} format", "Markdown");
|
||||
"error.fileFormatRequired", "File must be in {0} format", "Markdown or ZIP");
|
||||
}
|
||||
|
||||
String originalFilename = Filenames.toSimpleFileName(fileInput.getOriginalFilename());
|
||||
if (originalFilename == null || !originalFilename.endsWith(".md")) {
|
||||
if (originalFilename == null) {
|
||||
throw ExceptionUtils.createIllegalArgumentException(
|
||||
"error.fileFormatRequired", "File must be in {0} format", ".md");
|
||||
"error.fileFormatRequired", "File must be in {0} format", ".md or .zip");
|
||||
}
|
||||
|
||||
// Convert Markdown to HTML using CommonMark
|
||||
List<Extension> extensions = List.of(TablesExtension.create());
|
||||
Parser parser = Parser.builder().extensions(extensions).build();
|
||||
boolean isZip = originalFilename.toLowerCase().endsWith(".zip");
|
||||
boolean isMarkdown = originalFilename.toLowerCase().endsWith(".md");
|
||||
|
||||
Node document = parser.parse(new String(fileInput.getBytes()));
|
||||
HtmlRenderer renderer =
|
||||
HtmlRenderer.builder()
|
||||
.attributeProviderFactory(context -> new TableAttributeProvider())
|
||||
.extensions(extensions)
|
||||
.build();
|
||||
if (!isZip && !isMarkdown) {
|
||||
throw ExceptionUtils.createIllegalArgumentException(
|
||||
"error.fileFormatRequired", "File must be in {0} format", ".md or .zip");
|
||||
}
|
||||
|
||||
String htmlContent = renderer.render(document);
|
||||
byte[] pdfBytes;
|
||||
String outputFilename;
|
||||
|
||||
if (isZip) {
|
||||
// Handle ZIP file containing markdown + images
|
||||
try (TempDirectory tempDir = new TempDirectory(tempFileManager)) {
|
||||
// Extract ZIP to temp directory
|
||||
java.nio.file.Path tempDirPath = tempDir.getPath();
|
||||
try (java.util.zip.ZipInputStream zipIn =
|
||||
io.github.pixee.security.ZipSecurity.createHardenedInputStream(
|
||||
new java.io.ByteArrayInputStream(fileInput.getBytes()))) {
|
||||
java.util.zip.ZipEntry entry;
|
||||
while ((entry = zipIn.getNextEntry()) != null) {
|
||||
if (!entry.isDirectory()) {
|
||||
java.nio.file.Path filePath = tempDirPath.resolve(entry.getName());
|
||||
java.nio.file.Files.createDirectories(filePath.getParent());
|
||||
java.nio.file.Files.copy(zipIn, filePath);
|
||||
}
|
||||
zipIn.closeEntry();
|
||||
}
|
||||
}
|
||||
|
||||
// Find the markdown file (look for .md files, prefer index.md or first one)
|
||||
java.io.File markdownFile = findMarkdownFile(tempDirPath.toFile());
|
||||
if (markdownFile == null) {
|
||||
throw ExceptionUtils.createIllegalArgumentException(
|
||||
"error.fileFormatRequired",
|
||||
"ZIP must contain at least one {0} file",
|
||||
".md");
|
||||
}
|
||||
|
||||
// Read and convert markdown to HTML
|
||||
String markdownContent = java.nio.file.Files.readString(markdownFile.toPath());
|
||||
List<Extension> extensions = List.of(TablesExtension.create());
|
||||
Parser parser = Parser.builder().extensions(extensions).build();
|
||||
Node document = parser.parse(markdownContent);
|
||||
HtmlRenderer renderer =
|
||||
HtmlRenderer.builder()
|
||||
.attributeProviderFactory(context -> new TableAttributeProvider())
|
||||
.extensions(extensions)
|
||||
.build();
|
||||
String htmlContent = renderer.render(document);
|
||||
|
||||
// Create a new ZIP with HTML + images for WeasyPrint
|
||||
byte[] htmlZipBytes = createHtmlZip(htmlContent, tempDirPath.toFile());
|
||||
|
||||
// Use FileToPdf which already supports ZIP files with images
|
||||
pdfBytes =
|
||||
FileToPdf.convertHtmlToPdf(
|
||||
runtimePathConfig.getWeasyPrintPath(),
|
||||
null,
|
||||
htmlZipBytes,
|
||||
"package.zip",
|
||||
tempFileManager,
|
||||
customHtmlSanitizer);
|
||||
|
||||
outputFilename =
|
||||
GeneralUtils.generateFilename(
|
||||
originalFilename.substring(0, originalFilename.lastIndexOf('.')),
|
||||
".pdf");
|
||||
}
|
||||
} else {
|
||||
// Handle plain markdown file (no images)
|
||||
List<Extension> extensions = List.of(TablesExtension.create());
|
||||
Parser parser = Parser.builder().extensions(extensions).build();
|
||||
|
||||
Node document = parser.parse(new String(fileInput.getBytes()));
|
||||
HtmlRenderer renderer =
|
||||
HtmlRenderer.builder()
|
||||
.attributeProviderFactory(context -> new TableAttributeProvider())
|
||||
.extensions(extensions)
|
||||
.build();
|
||||
|
||||
String htmlContent = renderer.render(document);
|
||||
|
||||
pdfBytes =
|
||||
FileToPdf.convertHtmlToPdf(
|
||||
runtimePathConfig.getWeasyPrintPath(),
|
||||
null,
|
||||
htmlContent.getBytes(),
|
||||
"converted.html",
|
||||
tempFileManager,
|
||||
customHtmlSanitizer);
|
||||
|
||||
outputFilename = GeneralUtils.generateFilename(originalFilename, ".pdf");
|
||||
}
|
||||
|
||||
byte[] pdfBytes =
|
||||
FileToPdf.convertHtmlToPdf(
|
||||
runtimePathConfig.getWeasyPrintPath(),
|
||||
null,
|
||||
htmlContent.getBytes(),
|
||||
"converted.html",
|
||||
tempFileManager,
|
||||
customHtmlSanitizer);
|
||||
pdfBytes = pdfDocumentFactory.createNewBytesBasedOnOldDocument(pdfBytes);
|
||||
String outputFilename = GeneralUtils.generateFilename(originalFilename, ".pdf");
|
||||
return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename);
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds a markdown file in the directory. Prefers index.md, otherwise returns the first .md
|
||||
* file found.
|
||||
*/
|
||||
private java.io.File findMarkdownFile(java.io.File directory) throws java.io.IOException {
|
||||
java.io.File indexMd = new java.io.File(directory, "index.md");
|
||||
if (indexMd.exists()) {
|
||||
return indexMd;
|
||||
}
|
||||
|
||||
// Search for any .md file
|
||||
try (java.util.stream.Stream<java.nio.file.Path> paths =
|
||||
java.nio.file.Files.walk(directory.toPath())) {
|
||||
return paths.filter(p -> p.toString().toLowerCase().endsWith(".md"))
|
||||
.findFirst()
|
||||
.map(java.nio.file.Path::toFile)
|
||||
.orElse(null);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a ZIP file containing the HTML content and all other files (images) from the
|
||||
* directory.
|
||||
*/
|
||||
private byte[] createHtmlZip(String htmlContent, java.io.File sourceDir)
|
||||
throws java.io.IOException {
|
||||
java.io.ByteArrayOutputStream baos = new java.io.ByteArrayOutputStream();
|
||||
|
||||
try (java.util.zip.ZipOutputStream zos = new java.util.zip.ZipOutputStream(baos)) {
|
||||
// Add HTML file to root
|
||||
java.util.zip.ZipEntry htmlEntry = new java.util.zip.ZipEntry("index.html");
|
||||
zos.putNextEntry(htmlEntry);
|
||||
zos.write(htmlContent.getBytes(java.nio.charset.StandardCharsets.UTF_8));
|
||||
zos.closeEntry();
|
||||
|
||||
// Add all other files (images, etc.)
|
||||
addDirectoryToZip(zos, sourceDir.toPath(), sourceDir.toPath());
|
||||
}
|
||||
|
||||
return baos.toByteArray();
|
||||
}
|
||||
|
||||
/** Recursively adds files from a directory to a ZIP, excluding .md files. */
|
||||
private void addDirectoryToZip(
|
||||
java.util.zip.ZipOutputStream zos,
|
||||
java.nio.file.Path sourceDir,
|
||||
java.nio.file.Path rootDir)
|
||||
throws java.io.IOException {
|
||||
try (java.util.stream.Stream<java.nio.file.Path> paths =
|
||||
java.nio.file.Files.walk(sourceDir, 1)) {
|
||||
for (java.nio.file.Path path : paths.toList()) {
|
||||
if (java.nio.file.Files.isDirectory(path)) {
|
||||
if (!path.equals(sourceDir)) {
|
||||
addDirectoryToZip(zos, path, rootDir);
|
||||
}
|
||||
} else if (!path.toString().toLowerCase().endsWith(".md")) {
|
||||
// Add file to ZIP, maintaining relative path structure
|
||||
java.nio.file.Path relativePath = rootDir.relativize(path);
|
||||
java.util.zip.ZipEntry entry =
|
||||
new java.util.zip.ZipEntry(relativePath.toString());
|
||||
zos.putNextEntry(entry);
|
||||
java.nio.file.Files.copy(path, zos);
|
||||
zos.closeEntry();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
class TableAttributeProvider implements AttributeProvider {
|
||||
|
||||
@@ -233,7 +233,8 @@ Feature: API Validation
|
||||
When I send the API request to the endpoint "/api/v1/convert/pdf/markdown"
|
||||
Then the response status code should be 200
|
||||
And the response file should have size greater than 100
|
||||
And the response file should have extension ".md"
|
||||
And the response file should have extension ".zip"
|
||||
And the response ZIP should contain 4 files
|
||||
|
||||
|
||||
@positive @pdftocsv
|
||||
|
||||
Reference in New Issue
Block a user