feat(convert): PDF conversion with unoconvert fallback soffice (#4316)

# Description of Changes

- **What was changed**
- Reworked `ConvertOfficeController` to use a dedicated temporary
working directory per request and cleaned up with directory-level
deletion.
- Added detection for converter availability via `EndpointConfiguration`
to choose between **unoconvert** and a **soffice** headless fallback.
- Ensured safe filename handling (sanitization, extension checks,
lowercase normalization) and early validation errors for missing/invalid
filenames.
- Switched raw temp file writes to `Files.copy` / `Files.writeString`
with `StandardCopyOption.REPLACE_EXISTING`.
  - Implemented robust output handling:
    - Verified non-zero exit codes and null results.
    - Checked for missing/empty PDF outputs.
- Added fallback lookup for any produced `.pdf` within the work
directory if the expected name is not present.
  - Introduced `@Slf4j` logging; improved error and cleanup logging.
- Replaced ad-hoc temp cleanup with `FileUtils.deleteDirectory` for full
working-dir removal.
- Minor imports/cleanup: removed unused `Arrays`, added
`StandardCopyOption`, `FileUtils`, and related imports.

- **Why the change was made**
- Increase conversion reliability across environments where either
unoconvert or soffice may be available.
- Harden security and stability through strict input validation and
sanitized HTML processing.
- Prevent orphaned files/directories and ensure consistent cleanup to
reduce disk footprint and operational issues.
- Provide clearer operational signals (logging, explicit exceptions) for
easier troubleshooting.


---

## Checklist

### General

- [x] I have read the [Contribution
Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md)
- [x] I have read the [Stirling-PDF Developer
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md)
(if applicable)
- [ ] I have read the [How to add new languages to
Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md)
(if applicable)
- [x] I have performed a self-review of my own code
- [x] My changes generate no new warnings

### Documentation

- [ ] I have updated relevant docs on [Stirling-PDF's doc
repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/)
(if functionality has heavily changed)
- [ ] I have read the section [Add New Translation
Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md#add-new-translation-tags)
(for new translation tags only)

### UI Changes (if applicable)

- [ ] Screenshots or videos demonstrating the UI changes are attached
(e.g., as comments or direct attachments in the PR)

### Testing (if applicable)

- [ ] I have tested my changes locally. Refer to the [Testing
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md#6-testing)
for more details.
This commit is contained in:
Ludy 2025-09-04 15:33:35 +02:00 committed by GitHub
parent fe84b3ff15
commit c055f9456a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -5,10 +5,11 @@ import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.http.ResponseEntity;
@ -23,7 +24,9 @@ import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.config.EndpointConfiguration;
import stirling.software.common.configuration.RuntimePathConfig;
import stirling.software.common.model.api.GeneralFile;
import stirling.software.common.service.CustomPDFDocumentFactory;
@ -36,59 +39,130 @@ import stirling.software.common.util.WebResponseUtils;
@Tag(name = "Convert", description = "Convert APIs")
@RequestMapping("/api/v1/convert")
@RequiredArgsConstructor
@Slf4j
public class ConvertOfficeController {
private final CustomPDFDocumentFactory pdfDocumentFactory;
private final RuntimePathConfig runtimePathConfig;
private final CustomHtmlSanitizer customHtmlSanitizer;
private final EndpointConfiguration endpointConfiguration;
private boolean isUnoconvertAvailable() {
return endpointConfiguration.isGroupEnabled("Unoconvert")
|| endpointConfiguration.isGroupEnabled("Python");
}
public File convertToPdf(MultipartFile inputFile) throws IOException, InterruptedException {
// Check for valid file extension
// Check for valid file extension and sanitize filename
String originalFilename = Filenames.toSimpleFileName(inputFile.getOriginalFilename());
if (originalFilename == null
|| !isValidFileExtension(FilenameUtils.getExtension(originalFilename))) {
throw new IllegalArgumentException("Invalid file extension");
if (originalFilename == null || originalFilename.isBlank()) {
throw new IllegalArgumentException("Missing original filename");
}
// Save the uploaded file to a temporary location
Path tempInputFile =
Files.createTempFile("input_", "." + FilenameUtils.getExtension(originalFilename));
// Check for valid file extension
String extension = FilenameUtils.getExtension(originalFilename);
if (extension == null || !isValidFileExtension(extension)) {
throw new IllegalArgumentException("Invalid file extension");
}
String extensionLower = extension.toLowerCase();
String baseName = FilenameUtils.getBaseName(originalFilename);
if (baseName == null || baseName.isBlank()) {
baseName = "input";
}
// create temporary working directory
Path workDir = Files.createTempDirectory("office2pdf_");
Path inputPath = workDir.resolve(baseName + "." + extensionLower);
Path outputPath = workDir.resolve(baseName + ".pdf");
// Check if the file is HTML and apply sanitization if needed
String fileExtension = FilenameUtils.getExtension(originalFilename).toLowerCase();
if ("html".equals(fileExtension) || "htm".equals(fileExtension)) {
if ("html".equals(extensionLower) || "htm".equals(extensionLower)) {
// Read and sanitize HTML content
String htmlContent = new String(inputFile.getBytes(), StandardCharsets.UTF_8);
String sanitizedHtml = customHtmlSanitizer.sanitize(htmlContent);
Files.write(tempInputFile, sanitizedHtml.getBytes(StandardCharsets.UTF_8));
Files.writeString(inputPath, sanitizedHtml, StandardCharsets.UTF_8);
} else {
inputFile.transferTo(tempInputFile);
// copy file content
Files.copy(inputFile.getInputStream(), inputPath, StandardCopyOption.REPLACE_EXISTING);
}
// Prepare the output file path
Path tempOutputFile = Files.createTempFile("output_", ".pdf");
try {
// Run the LibreOffice command
List<String> command =
new ArrayList<>(
Arrays.asList(
runtimePathConfig.getUnoConvertPath(),
"--port",
"2003",
"--convert-to",
"pdf",
tempInputFile.toString(),
tempOutputFile.toString()));
ProcessExecutorResult returnCode =
ProcessExecutor.getInstance(ProcessExecutor.Processes.LIBRE_OFFICE)
.runCommandWithOutputHandling(command);
ProcessExecutorResult result;
// Run Unoconvert command
if (isUnoconvertAvailable()) {
// Unoconvert: schreibe direkt in outputPath innerhalb des workDir
List<String> command = new ArrayList<>();
command.add(runtimePathConfig.getUnoConvertPath());
command.add("--port");
command.add("2003");
command.add("--convert-to");
command.add("pdf");
command.add(inputPath.toString());
command.add(outputPath.toString());
// Read the converted PDF file
return tempOutputFile.toFile();
result =
ProcessExecutor.getInstance(ProcessExecutor.Processes.LIBRE_OFFICE)
.runCommandWithOutputHandling(command);
} // Run soffice command
else {
List<String> command = new ArrayList<>();
command.add("soffice");
command.add("--headless");
command.add("--nologo");
command.add("--convert-to");
command.add("pdf:writer_pdf_Export");
command.add("--outdir");
command.add(workDir.toString());
command.add(inputPath.toString());
result =
ProcessExecutor.getInstance(ProcessExecutor.Processes.LIBRE_OFFICE)
.runCommandWithOutputHandling(command);
}
// Check the result
if (result == null) {
throw new IllegalStateException("Converter returned no result");
}
if (result.getRc() != 0) {
throw new IllegalStateException("Conversion failed (exit " + result.getRc() + ")");
}
if (!Files.exists(outputPath)) {
// Some LibreOffice versions may deviate with exotic names as a fallback, we try
// to find any .pdf in the workDir
try (var stream = Files.list(workDir)) {
Path fallback =
stream.filter(
p ->
p.getFileName()
.toString()
.toLowerCase()
.endsWith(".pdf"))
.findFirst()
.orElse(null);
if (fallback == null) {
throw new IllegalStateException("No PDF produced.");
}
// Move the found PDF to the expected outputPath
Files.move(fallback, outputPath, StandardCopyOption.REPLACE_EXISTING);
}
}
// Check if the output file is empty
if (Files.size(outputPath) == 0L) {
throw new IllegalStateException("Produced PDF is empty");
}
return outputPath.toFile();
} finally {
// Clean up the temporary files
if (tempInputFile != null) Files.deleteIfExists(tempInputFile);
try {
Files.deleteIfExists(inputPath);
} catch (IOException e) {
log.warn("Failed to delete temp input file: {}", inputPath, e);
}
}
}
@ -119,7 +193,9 @@ public class ConvertOfficeController {
.replaceFirst("[.][^.]+$", "")
+ "_convertedToPDF.pdf");
} finally {
if (file != null) file.delete();
if (file != null && file.getParent() != null) {
FileUtils.deleteDirectory(file.getParentFile());
}
}
}
}