Cleanup work + stream endpoints to reduce memory usage (#6106)

This commit is contained in:
Anthony Stirling
2026-04-15 15:34:17 +01:00
committed by GitHub
parent 702f4e5c2c
commit cc5a0b8def
116 changed files with 3005 additions and 1512 deletions

View File

@@ -1,8 +1,10 @@
package stirling.software.common.service;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.UUID;
@@ -10,6 +12,7 @@ import java.util.UUID;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile;
import org.springframework.web.servlet.mvc.method.annotation.StreamingResponseBody;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@@ -143,6 +146,24 @@ public class FileStorage {
return new StoredFile(fileId, size);
}
/**
 * Write a {@link StreamingResponseBody} to a file resolved from a freshly generated file ID.
 *
 * <p>The body is streamed through a buffered output stream to disk. If anything goes wrong
 * before the write completes, the partially written file is removed on a best-effort basis
 * and the original failure is propagated to the caller.
 *
 * @param body the streaming body whose output is persisted
 * @param originalName the original file name; currently not used by this method
 * @return the generated ID under which the content was stored
 * @throws IOException if the parent directory cannot be created or writing the body fails
 */
public String storeFromStreamingBody(StreamingResponseBody body, String originalName)
        throws IOException {
    String fileId = generateFileId();
    Path filePath = getFilePath(fileId);
    Files.createDirectories(filePath.getParent());
    boolean success = false;
    try (OutputStream os = new BufferedOutputStream(Files.newOutputStream(filePath))) {
        body.writeTo(os);
        success = true;
    } finally {
        if (!success) {
            // An exception is already propagating here. A failure while deleting the
            // partial file must not replace it, so cleanup is best-effort and only logged.
            try {
                Files.deleteIfExists(filePath);
            } catch (IOException | RuntimeException cleanupEx) {
                log.warn("Failed to delete partially written file {}", filePath, cleanupEx);
            }
        }
    }
    log.debug("Stored StreamingResponseBody with ID: {}", fileId);
    return fileId;
}
/**
* Delete a file by its ID
*

View File

@@ -16,6 +16,7 @@ import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile;
import org.springframework.web.servlet.mvc.method.annotation.StreamingResponseBody;
import jakarta.servlet.http.HttpServletRequest;
@@ -305,33 +306,21 @@ public class JobExecutorService {
Object body = response.getBody();
if (body instanceof byte[]) {
// Extract filename from content-disposition header if available
String filename = "result.pdf";
String contentType = MediaType.APPLICATION_PDF_VALUE;
String filename = extractResponseFilename(response);
String contentType = extractResponseContentType(response);
if (response.getHeaders().getContentDisposition() != null) {
String disposition =
response.getHeaders().getContentDisposition().toString();
if (disposition.contains("filename=")) {
filename =
disposition.substring(
disposition.indexOf("filename=") + 9,
disposition.lastIndexOf('"'));
}
}
MediaType mediaType = response.getHeaders().getContentType();
if (mediaType != null) {
contentType = mediaType.toString();
}
// Store byte array directly to disk
String fileId = fileStorage.storeBytes((byte[]) body, filename);
taskManager.setFileResult(jobId, fileId, filename, contentType);
log.debug("Stored ResponseEntity<byte[]> result with fileId: {}", fileId);
} else if (body instanceof StreamingResponseBody streamingBody) {
String filename = extractResponseFilename(response);
String contentType = extractResponseContentType(response);
// Let the GC handle the memory naturally
String fileId = fileStorage.storeFromStreamingBody(streamingBody, filename);
taskManager.setFileResult(jobId, fileId, filename, contentType);
log.debug(
"Stored ResponseEntity<StreamingResponseBody> result with fileId: {}",
fileId);
} else {
// Check if the response body contains a fileId
if (body != null && body.toString().contains("fileId")) {
@@ -481,6 +470,21 @@ public class JobExecutorService {
}
}
/**
 * Resolve the file name to use for a response.
 *
 * @param response the response whose Content-Disposition header is inspected
 * @return the header's filename when present and non-empty, otherwise {@code "result.pdf"}
 */
private static String extractResponseFilename(ResponseEntity<?> response) {
    var disposition = response.getHeaders().getContentDisposition();
    String candidate = (disposition == null) ? null : disposition.getFilename();
    if (candidate == null || candidate.isEmpty()) {
        // No usable filename in the header: fall back to the default name.
        return "result.pdf";
    }
    return candidate;
}
/**
 * Resolve the content type to record for a response.
 *
 * @param response the response whose Content-Type header is inspected
 * @return the header's media type as a string, or the PDF media type when the header is absent
 */
private static String extractResponseContentType(ResponseEntity<?> response) {
    MediaType declaredType = response.getHeaders().getContentType();
    if (declaredType == null) {
        return MediaType.APPLICATION_PDF_VALUE;
    }
    return declaredType.toString();
}
/**
* Parse session timeout string (e.g., "30m", "1h") to milliseconds
*

View File

@@ -401,7 +401,7 @@ public class JobQueue implements SmartLifecycle {
* @throws Exception If there is an execution error
*/
private <T> T executeWithTimeout(Supplier<T> supplier, long timeoutMs) throws Exception {
CompletableFuture<T> future = CompletableFuture.supplyAsync(supplier);
CompletableFuture<T> future = CompletableFuture.supplyAsync(supplier, jobExecutor);
try {
if (timeoutMs <= 0) {

View File

@@ -7,11 +7,8 @@ import java.lang.management.MemoryMXBean;
import java.lang.management.OperatingSystemMXBean;
import java.lang.management.RuntimeMXBean;
import java.lang.management.ThreadMXBean;
import java.net.InetAddress;
import java.net.NetworkInterface;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
@@ -94,21 +91,12 @@ public class PostHogService {
metrics.put("os_name", System.getProperty("os.name"));
metrics.put("os_version", System.getProperty("os.version"));
metrics.put("java_version", System.getProperty("java.version"));
metrics.put("user_name", System.getProperty("user.name"));
metrics.put("user_home", System.getProperty("user.home"));
metrics.put("user_dir", System.getProperty("user.dir"));
// CPU and Memory
metrics.put("cpu_cores", Runtime.getRuntime().availableProcessors());
metrics.put("total_memory", Runtime.getRuntime().totalMemory());
metrics.put("free_memory", Runtime.getRuntime().freeMemory());
// Network and Server Identity
InetAddress localHost = InetAddress.getLocalHost();
metrics.put("ip_address", localHost.getHostAddress());
metrics.put("hostname", localHost.getHostName());
metrics.put("mac_address", getMacAddress());
// JVM info
metrics.put("jvm_vendor", System.getProperty("java.vendor"));
metrics.put("jvm_version", System.getProperty("java.vm.version"));
@@ -153,9 +141,6 @@ public class PostHogService {
metrics.put("gc_" + gcBean.getName() + "_time", gcBean.getCollectionTime());
}
// Network interfaces
metrics.put("network_interfaces", getNetworkInterfacesInfo());
// Docker detection and stats
boolean isDocker = isRunningInDocker();
if (isDocker) {
@@ -353,30 +338,6 @@ public class PostHogService {
.getProFeatures()
.getCustomMetadata()
.isAutoUpdateMetadata());
addIfNotEmpty(
properties,
"enterpriseEdition_customMetadata_author",
applicationProperties
.getPremium()
.getProFeatures()
.getCustomMetadata()
.getAuthor());
addIfNotEmpty(
properties,
"enterpriseEdition_customMetadata_creator",
applicationProperties
.getPremium()
.getProFeatures()
.getCustomMetadata()
.getCreator());
addIfNotEmpty(
properties,
"enterpriseEdition_customMetadata_producer",
applicationProperties
.getPremium()
.getProFeatures()
.getCustomMetadata()
.getProducer());
}
// Capture AutoPipeline properties
addIfNotEmpty(
@@ -386,39 +347,4 @@ public class PostHogService {
return properties;
}
/**
 * Format the hardware address of the first network interface that reports one.
 *
 * @return the address as upper-case hex octets joined by {@code '-'}, or {@code "Unknown"}
 *     when no interface exposes a hardware address or the lookup throws
 */
private String getMacAddress() {
    try {
        Enumeration<NetworkInterface> candidates = NetworkInterface.getNetworkInterfaces();
        while (candidates.hasMoreElements()) {
            byte[] mac = candidates.nextElement().getHardwareAddress();
            if (mac == null) {
                continue;
            }
            StringBuilder formatted = new StringBuilder();
            for (byte octet : mac) {
                if (formatted.length() > 0) {
                    formatted.append("-");
                }
                formatted.append(String.format("%02X", octet));
            }
            return formatted.toString();
        }
    } catch (Exception ignored) {
        // Lookup failed; fall through to the fallback value below.
    }
    return "Unknown";
}
/**
 * Collect the name and display name of every network interface.
 *
 * @return a map from interface name to display name; if enumeration throws, the map
 *     additionally contains an {@code "error"} entry holding the exception message
 */
private Map<String, String> getNetworkInterfacesInfo() {
    Map<String, String> displayNamesByInterface = new HashMap<>();
    try {
        for (Enumeration<NetworkInterface> it = NetworkInterface.getNetworkInterfaces();
                it.hasMoreElements(); ) {
            NetworkInterface current = it.nextElement();
            displayNamesByInterface.put(current.getName(), current.getDisplayName());
        }
    } catch (Exception e) {
        // Entries gathered before the failure are kept alongside the error message.
        displayNamesByInterface.put("error", e.getMessage());
    }
    return displayNamesByInterface;
}
}

View File

@@ -1,6 +1,5 @@
package stirling.software.common.util;
import java.io.ByteArrayInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.UncheckedIOException;
@@ -66,16 +65,7 @@ public class FileToPdf {
ProcessExecutor.getInstance(ProcessExecutor.Processes.WEASYPRINT)
.runCommandWithOutputHandling(command);
byte[] pdfBytes = Files.readAllBytes(tempOutputFile.getPath());
try {
return pdfBytes;
} catch (Exception e) {
pdfBytes = Files.readAllBytes(tempOutputFile.getPath());
if (pdfBytes.length < 1) {
throw e;
}
return pdfBytes;
}
return Files.readAllBytes(tempOutputFile.getPath());
} // tempInputFile auto-closed
} // tempOutputFile auto-closed
}
@@ -92,8 +82,7 @@ public class FileToPdf {
throws IOException {
try (TempDirectory tempUnzippedDir = new TempDirectory(tempFileManager)) {
try (ZipInputStream zipIn =
ZipSecurity.createHardenedInputStream(
new ByteArrayInputStream(Files.readAllBytes(zipFilePath)))) {
ZipSecurity.createHardenedInputStream(Files.newInputStream(zipFilePath))) {
ZipEntry entry = zipIn.getNextEntry();
while (entry != null) {
Path filePath =

View File

@@ -1,9 +1,9 @@
package stirling.software.common.util;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
@@ -20,6 +20,7 @@ import org.springframework.http.HttpStatus;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.web.multipart.MultipartFile;
import org.springframework.web.servlet.mvc.method.annotation.StreamingResponseBody;
import com.vladsch.flexmark.html2md.converter.FlexmarkHtmlConverter;
import com.vladsch.flexmark.util.data.MutableDataSet;
@@ -48,7 +49,7 @@ public class PDFToFile {
this.runtimePathConfig = runtimePathConfig;
}
public ResponseEntity<byte[]> processPdfToMarkdown(MultipartFile inputFile)
public ResponseEntity<StreamingResponseBody> processPdfToMarkdown(MultipartFile inputFile)
throws IOException, InterruptedException {
if (!MediaType.APPLICATION_PDF_VALUE.equals(inputFile.getContentType())) {
return new ResponseEntity<>(HttpStatus.BAD_REQUEST);
@@ -85,78 +86,77 @@ public class PDFToFile {
pdfBaseName = originalPdfFileName.substring(0, originalPdfFileName.lastIndexOf('.'));
}
byte[] fileBytes;
String fileName;
String fileName = pdfBaseName + "ToMarkdown.zip";
TempFile finalOut = tempFileManager.createManagedTempFile(".zip");
try {
try (TempFile tempInputFile = new TempFile(tempFileManager, ".pdf");
TempDirectory tempOutputDir = new TempDirectory(tempFileManager)) {
inputFile.transferTo(tempInputFile.getFile());
try (TempFile tempInputFile = new TempFile(tempFileManager, ".pdf");
TempDirectory tempOutputDir = new TempDirectory(tempFileManager)) {
inputFile.transferTo(tempInputFile.getFile());
List<String> command =
new ArrayList<>(
Arrays.asList(
"pdftohtml",
"-s",
"-noframes",
"-c",
tempInputFile.getAbsolutePath(),
pdfBaseName));
List<String> command =
new ArrayList<>(
Arrays.asList(
"pdftohtml",
"-s",
"-noframes",
"-c",
tempInputFile.getAbsolutePath(),
pdfBaseName));
ProcessExecutorResult returnCode =
ProcessExecutor.getInstance(ProcessExecutor.Processes.PDFTOHTML)
.runCommandWithOutputHandling(
command, tempOutputDir.getPath().toFile());
// Process HTML files to Markdown
File[] outputFiles =
Objects.requireNonNull(tempOutputDir.getPath().toFile().listFiles());
List<File> markdownFiles = new ArrayList<>();
List<File> imageFiles = new ArrayList<>();
ProcessExecutorResult returnCode =
ProcessExecutor.getInstance(ProcessExecutor.Processes.PDFTOHTML)
.runCommandWithOutputHandling(
command, tempOutputDir.getPath().toFile());
// Process HTML files to Markdown
File[] outputFiles =
Objects.requireNonNull(tempOutputDir.getPath().toFile().listFiles());
List<File> markdownFiles = new ArrayList<>();
List<File> imageFiles = new ArrayList<>();
// Convert HTML files to Markdown and collect image files
for (File outputFile : outputFiles) {
if (outputFile.getName().endsWith(".html")) {
String html = Files.readString(outputFile.toPath());
String markdown = htmlToMarkdownConverter.convert(html);
// Convert HTML files to Markdown and collect image files
for (File outputFile : outputFiles) {
if (outputFile.getName().endsWith(".html")) {
String html = Files.readString(outputFile.toPath());
String markdown = htmlToMarkdownConverter.convert(html);
// Update image references to point to images/ folder
markdown = updateImageReferences(markdown);
// Update image references to point to images/ folder
markdown = updateImageReferences(markdown);
String mdFileName = outputFile.getName().replace(".html", ".md");
File mdFile = new File(tempOutputDir.getPath().toFile(), mdFileName);
Files.writeString(mdFile.toPath(), markdown);
markdownFiles.add(mdFile);
} else if (!outputFile.getName().endsWith(".md")) {
// Collect non-HTML, non-MD files as images/assets
imageFiles.add(outputFile);
}
}
String mdFileName = outputFile.getName().replace(".html", ".md");
File mdFile = new File(tempOutputDir.getPath().toFile(), mdFileName);
Files.writeString(mdFile.toPath(), markdown);
markdownFiles.add(mdFile);
} else if (!outputFile.getName().endsWith(".md")) {
// Collect non-HTML, non-MD files as images/assets
imageFiles.add(outputFile);
try (OutputStream fos = Files.newOutputStream(finalOut.getPath());
ZipOutputStream zipOutputStream = new ZipOutputStream(fos)) {
// Add markdown files to root of ZIP
for (File mdFile : markdownFiles) {
ZipEntry mdEntry = new ZipEntry(mdFile.getName());
zipOutputStream.putNextEntry(mdEntry);
Files.copy(mdFile.toPath(), zipOutputStream);
zipOutputStream.closeEntry();
}
// Add images and other assets to images/ folder
for (File imageFile : imageFiles) {
ZipEntry assetEntry = new ZipEntry("images/" + imageFile.getName());
zipOutputStream.putNextEntry(assetEntry);
Files.copy(imageFile.toPath(), zipOutputStream);
zipOutputStream.closeEntry();
}
}
}
// Always create a ZIP file
fileName = pdfBaseName + "ToMarkdown.zip";
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
try (ZipOutputStream zipOutputStream = new ZipOutputStream(byteArrayOutputStream)) {
// Add markdown files to root of ZIP
for (File mdFile : markdownFiles) {
ZipEntry mdEntry = new ZipEntry(mdFile.getName());
zipOutputStream.putNextEntry(mdEntry);
Files.copy(mdFile.toPath(), zipOutputStream);
zipOutputStream.closeEntry();
}
// Add images and other assets to images/ folder
for (File imageFile : imageFiles) {
ZipEntry assetEntry = new ZipEntry("images/" + imageFile.getName());
zipOutputStream.putNextEntry(assetEntry);
Files.copy(imageFile.toPath(), zipOutputStream);
zipOutputStream.closeEntry();
}
}
fileBytes = byteArrayOutputStream.toByteArray();
} catch (Exception e) {
finalOut.close();
throw e;
}
return WebResponseUtils.bytesToWebResponse(
fileBytes, fileName, MediaType.APPLICATION_OCTET_STREAM);
return WebResponseUtils.fileToWebResponse(
finalOut, fileName, MediaType.APPLICATION_OCTET_STREAM);
}
/**
@@ -169,7 +169,7 @@ public class PDFToFile {
return PATTERN.matcher(markdown).replaceAll("$1(images/$2)");
}
public ResponseEntity<byte[]> processPdfToHtml(MultipartFile inputFile)
public ResponseEntity<StreamingResponseBody> processPdfToHtml(MultipartFile inputFile)
throws IOException, InterruptedException {
if (!MediaType.APPLICATION_PDF_VALUE.equals(inputFile.getContentType())) {
return new ResponseEntity<>(HttpStatus.BAD_REQUEST);
@@ -182,56 +182,57 @@ public class PDFToFile {
pdfBaseName = originalPdfFileName.substring(0, originalPdfFileName.lastIndexOf('.'));
}
byte[] fileBytes;
String fileName;
String fileName = pdfBaseName + "ToHtml.zip";
TempFile finalOut = tempFileManager.createManagedTempFile(".zip");
try {
try (TempFile inputFileTemp = new TempFile(tempFileManager, ".pdf");
TempDirectory outputDirTemp = new TempDirectory(tempFileManager)) {
try (TempFile inputFileTemp = new TempFile(tempFileManager, ".pdf");
TempDirectory outputDirTemp = new TempDirectory(tempFileManager)) {
Path tempInputFile = inputFileTemp.getPath();
Path tempOutputDir = outputDirTemp.getPath();
Path tempInputFile = inputFileTemp.getPath();
Path tempOutputDir = outputDirTemp.getPath();
// Save the uploaded file to a temporary location
inputFile.transferTo(tempInputFile);
// Save the uploaded file to a temporary location
inputFile.transferTo(tempInputFile);
// Run the pdftohtml command with complex output
List<String> command =
new ArrayList<>(
Arrays.asList(
"pdftohtml", "-c", tempInputFile.toString(), pdfBaseName));
// Run the pdftohtml command with complex output
List<String> command =
new ArrayList<>(
Arrays.asList(
"pdftohtml", "-c", tempInputFile.toString(), pdfBaseName));
ProcessExecutorResult returnCode =
ProcessExecutor.getInstance(ProcessExecutor.Processes.PDFTOHTML)
.runCommandWithOutputHandling(command, tempOutputDir.toFile());
ProcessExecutorResult returnCode =
ProcessExecutor.getInstance(ProcessExecutor.Processes.PDFTOHTML)
.runCommandWithOutputHandling(command, tempOutputDir.toFile());
// Get output files
File[] outputFiles = Objects.requireNonNull(tempOutputDir.toFile().listFiles());
// Get output files
File[] outputFiles = Objects.requireNonNull(tempOutputDir.toFile().listFiles());
// Return output files in a ZIP archive
fileName = pdfBaseName + "ToHtml.zip";
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
try (ZipOutputStream zipOutputStream = new ZipOutputStream(byteArrayOutputStream)) {
for (File outputFile : outputFiles) {
ZipEntry entry = new ZipEntry(outputFile.getName());
zipOutputStream.putNextEntry(entry);
try (FileInputStream fis = new FileInputStream(outputFile)) {
IOUtils.copy(fis, zipOutputStream);
} catch (IOException e) {
log.error("Exception writing zip entry", e);
try (OutputStream fos = Files.newOutputStream(finalOut.getPath());
ZipOutputStream zipOutputStream = new ZipOutputStream(fos)) {
for (File outputFile : outputFiles) {
ZipEntry entry = new ZipEntry(outputFile.getName());
zipOutputStream.putNextEntry(entry);
try (FileInputStream fis = new FileInputStream(outputFile)) {
IOUtils.copy(fis, zipOutputStream);
} catch (IOException e) {
log.error("Exception writing zip entry", e);
}
zipOutputStream.closeEntry();
}
zipOutputStream.closeEntry();
} catch (IOException e) {
log.error("Exception writing zip", e);
}
} catch (IOException e) {
log.error("Exception writing zip", e);
}
fileBytes = byteArrayOutputStream.toByteArray();
} catch (Exception e) {
finalOut.close();
throw e;
}
return WebResponseUtils.bytesToWebResponse(
fileBytes, fileName, MediaType.APPLICATION_OCTET_STREAM);
return WebResponseUtils.fileToWebResponse(
finalOut, fileName, MediaType.APPLICATION_OCTET_STREAM);
}
public ResponseEntity<byte[]> processPdfToOfficeFormat(
public ResponseEntity<StreamingResponseBody> processPdfToOfficeFormat(
MultipartFile inputFile, String outputFormat, String libreOfficeFilter)
throws IOException, InterruptedException {
@@ -257,109 +258,115 @@ public class PDFToFile {
return new ResponseEntity<>(HttpStatus.BAD_REQUEST);
}
byte[] fileBytes;
String fileName;
TempFile finalOut =
tempFileManager.createManagedTempFile("." + resolvePrimaryExtension(outputFormat));
Path libreOfficeProfile = null;
try (TempFile inputFileTemp = new TempFile(tempFileManager, ".pdf");
TempDirectory outputDirTemp = new TempDirectory(tempFileManager)) {
try {
try (TempFile inputFileTemp = new TempFile(tempFileManager, ".pdf");
TempDirectory outputDirTemp = new TempDirectory(tempFileManager)) {
Path tempInputFile = inputFileTemp.getPath();
Path tempOutputDir = outputDirTemp.getPath();
Path unoOutputFile =
tempOutputDir.resolve(
pdfBaseName + "." + resolvePrimaryExtension(outputFormat));
Path tempInputFile = inputFileTemp.getPath();
Path tempOutputDir = outputDirTemp.getPath();
Path unoOutputFile =
tempOutputDir.resolve(
pdfBaseName + "." + resolvePrimaryExtension(outputFormat));
// Save the uploaded file to a temporary location
inputFile.transferTo(tempInputFile);
// Save the uploaded file to a temporary location
inputFile.transferTo(tempInputFile);
// Run the LibreOffice command
ProcessExecutorResult returnCode = null;
IOException unoconvertException = null;
// Run the LibreOffice command
ProcessExecutorResult returnCode = null;
IOException unoconvertException = null;
if (isUnoConvertEnabled()) {
try {
List<String> unoCommand =
buildUnoConvertCommand(
tempInputFile, unoOutputFile, outputFormat, libreOfficeFilter);
returnCode =
ProcessExecutor.getInstance(ProcessExecutor.Processes.LIBRE_OFFICE)
.runCommandWithOutputHandling(unoCommand);
} catch (IOException e) {
unoconvertException = e;
log.warn(
"Unoconvert command failed ({}). Falling back to soffice command.",
e.getMessage());
}
}
if (returnCode == null) {
// Run the LibreOffice command as a fallback
libreOfficeProfile = Files.createTempDirectory("libreoffice_profile_");
List<String> command = new ArrayList<>();
command.add(runtimePathConfig.getSOfficePath());
command.add("-env:UserInstallation=" + libreOfficeProfile.toUri().toString());
command.add("--headless");
command.add("--nologo");
command.add("--infilter=" + libreOfficeFilter);
command.add("--convert-to");
command.add(outputFormat);
command.add("--outdir");
command.add(tempOutputDir.toString());
command.add(tempInputFile.toString());
try {
returnCode =
ProcessExecutor.getInstance(ProcessExecutor.Processes.LIBRE_OFFICE)
.runCommandWithOutputHandling(command);
} catch (IOException e) {
if (unoconvertException != null) {
e.addSuppressed(unoconvertException);
if (isUnoConvertEnabled()) {
try {
List<String> unoCommand =
buildUnoConvertCommand(
tempInputFile,
unoOutputFile,
outputFormat,
libreOfficeFilter);
returnCode =
ProcessExecutor.getInstance(ProcessExecutor.Processes.LIBRE_OFFICE)
.runCommandWithOutputHandling(unoCommand);
} catch (IOException e) {
unoconvertException = e;
log.warn(
"Unoconvert command failed ({}). Falling back to soffice command.",
e.getMessage());
}
throw e;
}
}
// Get output files
List<File> outputFiles = Arrays.asList(tempOutputDir.toFile().listFiles());
if (returnCode == null) {
// Run the LibreOffice command as a fallback
libreOfficeProfile = Files.createTempDirectory("libreoffice_profile_");
List<String> command = new ArrayList<>();
command.add(runtimePathConfig.getSOfficePath());
command.add("-env:UserInstallation=" + libreOfficeProfile.toUri().toString());
command.add("--headless");
command.add("--nologo");
command.add("--infilter=" + libreOfficeFilter);
command.add("--convert-to");
command.add(outputFormat);
command.add("--outdir");
command.add(tempOutputDir.toString());
command.add(tempInputFile.toString());
if (outputFiles.size() == 1) {
// Return single output file
File outputFile = outputFiles.get(0);
if ("txt:Text".equals(outputFormat)) {
outputFormat = "txt";
}
fileName = pdfBaseName + "." + outputFormat;
fileBytes = FileUtils.readFileToByteArray(outputFile);
} else {
// Return output files in a ZIP archive
fileName = pdfBaseName + "To" + outputFormat + ".zip";
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
try (ZipOutputStream zipOutputStream = new ZipOutputStream(byteArrayOutputStream)) {
for (File outputFile : outputFiles) {
ZipEntry entry = new ZipEntry(outputFile.getName());
zipOutputStream.putNextEntry(entry);
try (FileInputStream fis = new FileInputStream(outputFile)) {
IOUtils.copy(fis, zipOutputStream);
} catch (IOException e) {
log.error("Exception writing zip entry", e);
try {
returnCode =
ProcessExecutor.getInstance(ProcessExecutor.Processes.LIBRE_OFFICE)
.runCommandWithOutputHandling(command);
} catch (IOException e) {
if (unoconvertException != null) {
e.addSuppressed(unoconvertException);
}
zipOutputStream.closeEntry();
throw e;
}
} catch (IOException e) {
log.error("Exception writing zip", e);
}
fileBytes = byteArrayOutputStream.toByteArray();
// Get output files
List<File> outputFiles = Arrays.asList(tempOutputDir.toFile().listFiles());
if (outputFiles.size() == 1) {
// Return single output file
File outputFile = outputFiles.get(0);
if ("txt:Text".equals(outputFormat)) {
outputFormat = "txt";
}
fileName = pdfBaseName + "." + outputFormat;
FileUtils.copyFile(outputFile, finalOut.getFile());
} else {
// Return output files in a ZIP archive
fileName = pdfBaseName + "To" + outputFormat + ".zip";
try (OutputStream fos = Files.newOutputStream(finalOut.getPath());
ZipOutputStream zipOutputStream = new ZipOutputStream(fos)) {
for (File outputFile : outputFiles) {
ZipEntry entry = new ZipEntry(outputFile.getName());
zipOutputStream.putNextEntry(entry);
try (FileInputStream fis = new FileInputStream(outputFile)) {
IOUtils.copy(fis, zipOutputStream);
} catch (IOException e) {
log.error("Exception writing zip entry", e);
}
zipOutputStream.closeEntry();
}
} catch (IOException e) {
log.error("Exception writing zip", e);
}
}
}
} catch (Exception e) {
finalOut.close();
throw e;
} finally {
if (libreOfficeProfile != null) {
FileUtils.deleteQuietly(libreOfficeProfile.toFile());
}
}
return WebResponseUtils.bytesToWebResponse(
fileBytes, fileName, MediaType.APPLICATION_OCTET_STREAM);
return WebResponseUtils.fileToWebResponse(
finalOut, fileName, MediaType.APPLICATION_OCTET_STREAM);
}
private boolean isUnoConvertEnabled() {

View File

@@ -73,6 +73,19 @@ public class WebResponseUtils {
return baosToWebResponse(baos, docName);
}
/**
 * Save a PDF document to a managed temp file and return it as a streaming web response.
 *
 * <p>If saving fails, the temp file is closed before the failure is rethrown. Any error
 * raised while closing is attached to the original exception as suppressed, so the root
 * cause is never masked. On success the temp file is handed to {@code pdfFileToWebResponse}.
 *
 * @param document the document to save; this method does not close it
 * @param docName the file name to advertise for the download
 * @param tempFileManager the manager used to create the backing temp file
 * @return a streaming response produced by {@code pdfFileToWebResponse} for the saved file
 * @throws IOException if the temp file cannot be created, the document cannot be saved, or
 *     building the response fails
 */
public static ResponseEntity<StreamingResponseBody> pdfDocToWebResponse(
        PDDocument document, String docName, TempFileManager tempFileManager)
        throws IOException {
    TempFile tempFile = tempFileManager.createManagedTempFile(".pdf");
    try {
        document.save(tempFile.getFile());
    } catch (IOException | RuntimeException e) {
        // Release the temp file on unchecked failures too, not only on IOException.
        try {
            tempFile.close();
        } catch (Exception closeEx) {
            e.addSuppressed(closeEx);
        }
        throw e;
    }
    return pdfFileToWebResponse(tempFile, docName);
}
/**
* Convert a File to a web response (PDF default).
*
@@ -108,23 +121,37 @@ public class WebResponseUtils {
public static ResponseEntity<StreamingResponseBody> fileToWebResponse(
TempFile outputTempFile, String docName, MediaType mediaType) throws IOException {
Path path = outputTempFile.getFile().toPath().normalize();
long len = Files.size(path);
HttpHeaders headers = new HttpHeaders();
headers.setContentType(mediaType);
headers.setContentLength(len);
headers.add(HttpHeaders.CONTENT_DISPOSITION, "attachment; filename=\"" + docName + "\"");
try {
Path path = outputTempFile.getFile().toPath().normalize();
long len = Files.size(path);
HttpHeaders headers = new HttpHeaders();
headers.setContentType(mediaType);
headers.setContentLength(len);
String encodedDocName =
RegexPatternUtils.getInstance()
.getPlusSignPattern()
.matcher(URLEncoder.encode(docName, StandardCharsets.UTF_8))
.replaceAll("%20");
headers.setContentDispositionFormData("attachment", encodedDocName);
StreamingResponseBody body =
os -> {
try (os) {
Files.copy(path, os);
os.flush();
} finally {
outputTempFile.close();
}
};
StreamingResponseBody body =
os -> {
try (os) {
Files.copy(path, os);
os.flush();
} finally {
outputTempFile.close();
}
};
return new ResponseEntity<>(body, headers, HttpStatus.OK);
return new ResponseEntity<>(body, headers, HttpStatus.OK);
} catch (IOException | RuntimeException e) {
try {
outputTempFile.close();
} catch (Exception closeEx) {
e.addSuppressed(closeEx);
}
throw e;
}
}
}

View File

@@ -9,6 +9,7 @@ import static org.mockito.Mockito.lenient;
import static org.mockito.Mockito.mockStatic;
import static org.mockito.Mockito.when;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
@@ -29,6 +30,7 @@ import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.mock.web.MockMultipartFile;
import org.springframework.web.multipart.MultipartFile;
import org.springframework.web.servlet.mvc.method.annotation.StreamingResponseBody;
import io.github.pixee.security.ZipSecurity;
@@ -59,6 +61,19 @@ class PDFToFileTest {
.thenAnswer(
invocation ->
Files.createTempFile("test", invocation.getArgument(0)).toFile());
lenient()
.when(mockTempFileManager.createManagedTempFile(anyString()))
.thenAnswer(
invocation -> {
File f =
Files.createTempFile("test", invocation.<String>getArgument(0))
.toFile();
TempFile tf = org.mockito.Mockito.mock(TempFile.class);
lenient().when(tf.getFile()).thenReturn(f);
lenient().when(tf.getPath()).thenReturn(f.toPath());
lenient().when(tf.getAbsolutePath()).thenReturn(f.getAbsolutePath());
return tf;
});
lenient()
.when(mockTempFileManager.createTempDirectory())
.thenAnswer(invocation -> Files.createTempDirectory("test"));
@@ -68,6 +83,12 @@ class PDFToFileTest {
pdfToFile = new PDFToFile(mockTempFileManager, mockRuntimePathConfig);
}
/**
 * Run the streaming body of a response into memory.
 *
 * @param response the response whose body is written out
 * @return every byte the body wrote
 * @throws IOException if the body fails while writing
 */
private static byte[] drain(ResponseEntity<StreamingResponseBody> response) throws IOException {
    ByteArrayOutputStream sink = new ByteArrayOutputStream();
    StreamingResponseBody streamingBody = response.getBody();
    streamingBody.writeTo(sink);
    return sink.toByteArray();
}
@Test
void testProcessPdfToMarkdown_InvalidContentType() throws IOException, InterruptedException {
// Prepare
@@ -79,7 +100,7 @@ class PDFToFileTest {
"This is not a PDF".getBytes());
// Execute
ResponseEntity<byte[]> response = pdfToFile.processPdfToMarkdown(nonPdfFile);
ResponseEntity<StreamingResponseBody> response = pdfToFile.processPdfToMarkdown(nonPdfFile);
// Verify
assertEquals(HttpStatus.BAD_REQUEST, response.getStatusCode());
@@ -96,7 +117,7 @@ class PDFToFileTest {
"This is not a PDF".getBytes());
// Execute
ResponseEntity<byte[]> response = pdfToFile.processPdfToHtml(nonPdfFile);
ResponseEntity<StreamingResponseBody> response = pdfToFile.processPdfToHtml(nonPdfFile);
// Verify
assertEquals(HttpStatus.BAD_REQUEST, response.getStatusCode());
@@ -114,7 +135,7 @@ class PDFToFileTest {
"This is not a PDF".getBytes());
// Execute
ResponseEntity<byte[]> response =
ResponseEntity<StreamingResponseBody> response =
pdfToFile.processPdfToOfficeFormat(nonPdfFile, "docx", "draw_pdf_import");
// Verify
@@ -133,7 +154,7 @@ class PDFToFileTest {
"Fake PDF content".getBytes());
// Execute with invalid format
ResponseEntity<byte[]> response =
ResponseEntity<StreamingResponseBody> response =
pdfToFile.processPdfToOfficeFormat(pdfFile, "invalid_format", "draw_pdf_import");
// Verify
@@ -184,12 +205,14 @@ class PDFToFileTest {
});
// Execute the method
ResponseEntity<byte[]> response = pdfToFile.processPdfToMarkdown(pdfFile);
ResponseEntity<StreamingResponseBody> response =
pdfToFile.processPdfToMarkdown(pdfFile);
// Verify - should now return a ZIP file instead of plain markdown
assertEquals(HttpStatus.OK, response.getStatusCode());
assertNotNull(response.getBody());
assertTrue(response.getBody().length > 0);
byte[] bodyBytes = drain(response);
assertNotNull(bodyBytes);
assertTrue(bodyBytes.length > 0);
// Verify content disposition indicates a ZIP file
assertTrue(
@@ -201,7 +224,7 @@ class PDFToFileTest {
// Verify the content by unzipping it
try (ZipInputStream zipStream =
ZipSecurity.createHardenedInputStream(
new java.io.ByteArrayInputStream(response.getBody()))) {
new java.io.ByteArrayInputStream(bodyBytes))) {
ZipEntry entry;
boolean foundMdFile = false;
boolean foundImageInFolder = false;
@@ -275,12 +298,14 @@ class PDFToFileTest {
});
// Execute the method
ResponseEntity<byte[]> response = pdfToFile.processPdfToMarkdown(pdfFile);
ResponseEntity<StreamingResponseBody> response =
pdfToFile.processPdfToMarkdown(pdfFile);
// Verify
assertEquals(HttpStatus.OK, response.getStatusCode());
assertNotNull(response.getBody());
assertTrue(response.getBody().length > 0);
byte[] bodyBytes = drain(response);
assertNotNull(bodyBytes);
assertTrue(bodyBytes.length > 0);
// Verify content disposition indicates a zip file
assertTrue(
@@ -292,7 +317,7 @@ class PDFToFileTest {
// Verify the content by unzipping it
try (ZipInputStream zipStream =
ZipSecurity.createHardenedInputStream(
new java.io.ByteArrayInputStream(response.getBody()))) {
new java.io.ByteArrayInputStream(bodyBytes))) {
ZipEntry entry;
boolean foundMdFiles = false;
boolean foundImage = false;
@@ -352,12 +377,13 @@ class PDFToFileTest {
});
// Execute the method
ResponseEntity<byte[]> response = pdfToFile.processPdfToHtml(pdfFile);
ResponseEntity<StreamingResponseBody> response = pdfToFile.processPdfToHtml(pdfFile);
// Verify
assertEquals(HttpStatus.OK, response.getStatusCode());
assertNotNull(response.getBody());
assertTrue(response.getBody().length > 0);
byte[] bodyBytes = drain(response);
assertNotNull(bodyBytes);
assertTrue(bodyBytes.length > 0);
// Verify content disposition indicates a zip file
assertTrue(
@@ -369,7 +395,7 @@ class PDFToFileTest {
// Verify the content by unzipping it
try (ZipInputStream zipStream =
ZipSecurity.createHardenedInputStream(
new java.io.ByteArrayInputStream(response.getBody()))) {
new java.io.ByteArrayInputStream(bodyBytes))) {
ZipEntry entry;
boolean foundMainHtml = false;
boolean foundIndexHtml = false;
@@ -437,13 +463,14 @@ class PDFToFileTest {
});
// Execute the method with docx format
ResponseEntity<byte[]> response =
ResponseEntity<StreamingResponseBody> response =
pdfToFile.processPdfToOfficeFormat(pdfFile, "docx", "draw_pdf_import");
// Verify
assertEquals(HttpStatus.OK, response.getStatusCode());
assertNotNull(response.getBody());
assertTrue(response.getBody().length > 0);
byte[] bodyBytes = drain(response);
assertNotNull(bodyBytes);
assertTrue(bodyBytes.length > 0);
// Verify content disposition has correct filename
assertTrue(
@@ -508,13 +535,14 @@ class PDFToFileTest {
});
// Execute the method with ODP format
ResponseEntity<byte[]> response =
ResponseEntity<StreamingResponseBody> response =
pdfToFile.processPdfToOfficeFormat(pdfFile, "odp", "draw_pdf_import");
// Verify
assertEquals(HttpStatus.OK, response.getStatusCode());
assertNotNull(response.getBody());
assertTrue(response.getBody().length > 0);
byte[] bodyBytes = drain(response);
assertNotNull(bodyBytes);
assertTrue(bodyBytes.length > 0);
// Verify content disposition for zip file
assertTrue(
@@ -526,7 +554,7 @@ class PDFToFileTest {
// Verify the content by unzipping it
try (ZipInputStream zipStream =
ZipSecurity.createHardenedInputStream(
new java.io.ByteArrayInputStream(response.getBody()))) {
new java.io.ByteArrayInputStream(bodyBytes))) {
ZipEntry entry;
boolean foundMainFile = false;
boolean foundMediaFiles = false;
@@ -592,13 +620,14 @@ class PDFToFileTest {
});
// Execute the method with text format
ResponseEntity<byte[]> response =
ResponseEntity<StreamingResponseBody> response =
pdfToFile.processPdfToOfficeFormat(pdfFile, "txt:Text", "draw_pdf_import");
// Verify
assertEquals(HttpStatus.OK, response.getStatusCode());
assertNotNull(response.getBody());
assertTrue(response.getBody().length > 0);
byte[] bodyBytes = drain(response);
assertNotNull(bodyBytes);
assertTrue(bodyBytes.length > 0);
// Verify content disposition has txt extension
assertTrue(
@@ -650,13 +679,14 @@ class PDFToFileTest {
});
// Execute the method
ResponseEntity<byte[]> response =
ResponseEntity<StreamingResponseBody> response =
pdfToFile.processPdfToOfficeFormat(pdfFile, "docx", "draw_pdf_import");
// Verify
assertEquals(HttpStatus.OK, response.getStatusCode());
assertNotNull(response.getBody());
assertTrue(response.getBody().length > 0);
byte[] bodyBytes = drain(response);
assertNotNull(bodyBytes);
assertTrue(bodyBytes.length > 0);
// Verify content disposition contains output.docx
assertTrue(
@@ -696,12 +726,13 @@ class PDFToFileTest {
return mockExecutorResult;
});
ResponseEntity<byte[]> response =
ResponseEntity<StreamingResponseBody> response =
pdfToFileWithUno.processPdfToOfficeFormat(pdfFile, "docx", "writer_pdf_import");
assertEquals(HttpStatus.OK, response.getStatusCode());
assertNotNull(response.getBody());
assertTrue(response.getBody().length > 0);
byte[] bodyBytes = drain(response);
assertNotNull(bodyBytes);
assertTrue(bodyBytes.length > 0);
assertTrue(
response.getHeaders()
.getContentDisposition()
@@ -759,12 +790,13 @@ class PDFToFileTest {
return mockExecutorResult;
});
ResponseEntity<byte[]> response =
ResponseEntity<StreamingResponseBody> response =
pdfToFileWithUno.processPdfToOfficeFormat(pdfFile, "docx", "writer_pdf_import");
assertEquals(HttpStatus.OK, response.getStatusCode());
assertNotNull(response.getBody());
assertTrue(response.getBody().length > 0);
byte[] bodyBytes = drain(response);
assertNotNull(bodyBytes);
assertTrue(bodyBytes.length > 0);
assertTrue(
response.getHeaders()
.getContentDisposition()