fix(security): Harden website-to-PDF conversion (#4638)

# Description of Changes

**What was changed**

- Fetch remote HTML content via `HttpClient` before invoking WeasyPrint to inspect and sanitize input.
- Reject conversions when downloaded HTML contains disallowed `file:` scheme references (including encoded/obfuscated variants) using a compiled `Pattern`.
- Write fetched HTML to a secured temporary file and pass that path to WeasyPrint instead of the remote URL.
- Provide `--base-url` to WeasyPrint so relative resources resolve correctly while avoiding direct remote fetching as the primary input.
- Add comprehensive unit tests:
  - Ensure command invocation uses local temp HTML + `--base-url` and cleans up temp files.
  - Verify redirect with error when disallowed content is detected.
  - Cover temp file deletion behavior and error handling paths.
- Improve resource cleanup in `finally` blocks for both temp HTML and output PDF artifacts.

**Why the change was made**

- Prevents traversal/local file exposure risks by blocking `file:` (and encoded equivalents) discovered in fetched HTML.
- Reduces attack surface of URL-to-PDF by no longer handing remote URLs directly to the renderer and by enabling pre-validation.
- Strengthens deterministic behavior of conversions and improves safety against SSRF-like vectors.
---

## Checklist

### General

- [ ] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md)
- [ ] I have read the [Stirling-PDF Developer Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md) (if applicable)
- [ ] I have read the [How to add new languages to Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md) (if applicable)
- [ ] I have performed a self-review of my own code
- [ ] My changes generate no new warnings

### Documentation

- [ ] I have updated relevant docs on [Stirling-PDF's doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) (if functionality has heavily changed)
- [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only)

### UI Changes (if applicable)

- [ ] Screenshots or videos demonstrating the UI changes are attached (e.g., as comments or direct attachments in the PR)

### Testing (if applicable)

- [ ] I have tested my changes locally. Refer to the [Testing Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md#6-testing) for more details.
2025-11-16 01:21:16 +01:00 · 2025-10-16 23:41:04 +02:00 · 2025-10-16 23:41:04 +02:00 · 955a26f32b
commit 955a26f32b
parent e40f41d79a
3 changed files with 216 additions and 28 deletions
--- a/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertWebsiteToPDF.java
+++ b/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertWebsiteToPDF.java
@ -2,10 +2,18 @@ package stirling.software.SPDF.controller.api.converters;
 import java.io.IOException;
 import java.net.URI;
 import java.net.http.HttpClient;
 import java.net.http.HttpRequest;
 import java.net.http.HttpResponse;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.time.Duration;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Locale;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.springframework.http.HttpStatus;
@ -44,6 +52,11 @@ public class ConvertWebsiteToPDF {
    private final RuntimePathConfig runtimePathConfig;
    private final ApplicationProperties applicationProperties;
    // Matches a "file:" URI scheme reference: the literal "file" (not preceded by a lowercase
    // ASCII letter, digit or underscore), optional whitespace, a colon, and then either one to
    // three forward slashes or one of the encoded forms %2f, %5c, %3a, &#x2f; or &#47;.
    // The pattern is case-sensitive and contains only lowercase literals.
    private static final Pattern FILE_SCHEME_PATTERN =
            Pattern.compile("(?<![a-z0-9_])file\\s*:(?:/{1,3}|%2f|%5c|%3a|&#x2f;|&#47;)");
    // Matches a numeric HTML character reference of the form "&#...;" and captures, as group 1,
    // an optional lowercase "x" followed by one or more characters from [0-9a-f]. Uppercase hex
    // digits and references without the trailing semicolon are not matched.
    private static final Pattern NUMERIC_HTML_ENTITY_PATTERN = Pattern.compile("&#(x?[0-9a-f]+);");
    @PostMapping(consumes = MediaType.MULTIPART_FORM_DATA_VALUE, value = "/url/pdf")
    @Operation(
            summary = "Convert a URL to a PDF",
@ -91,14 +104,33 @@ public class ConvertWebsiteToPDF {
        }
        Path tempOutputFile = null;
        Path tempHtmlInput = null;
        PDDocument doc = null;
        try {
            // Download the remote content first to ensure we don't allow dangerous schemes
            String htmlContent = fetchRemoteHtml(URL);
            if (containsDisallowedUriScheme(htmlContent)) {
                URI rejectionLocation =
                        uriComponentsBuilder
                                .queryParam("error", "error.disallowedUrlContent")
                                .build()
                                .toUri();
                log.warn("Rejected URL to PDF conversion due to disallowed content references");
                return ResponseEntity.status(status).location(rejectionLocation).build();
            }
            tempHtmlInput = Files.createTempFile("url_input_", ".html");
            Files.writeString(tempHtmlInput, htmlContent, StandardCharsets.UTF_8);
            // Prepare the output file path
            tempOutputFile = Files.createTempFile("output_", ".pdf");
            // Prepare the WeasyPrint command
            List<String> command = new ArrayList<>();
            command.add(runtimePathConfig.getWeasyPrintPath());
            command.add(tempHtmlInput.toString());
            command.add("--base-url");
            command.add(URL);
            command.add("--pdf-forms");
            command.add(tempOutputFile.toString());
@ -120,6 +152,13 @@ public class ConvertWebsiteToPDF {
            }
            return response;
        } finally {
            if (tempHtmlInput != null) {
                try {
                    Files.deleteIfExists(tempHtmlInput);
                } catch (IOException e) {
                    log.error("Error deleting temporary HTML input file", e);
                }
            }
            if (tempOutputFile != null) {
                try {
@ -131,6 +170,90 @@ public class ConvertWebsiteToPDF {
        }
    }
    /**
     * Downloads the body of {@code url} as UTF-8 text so it can be inspected before rendering.
     *
     * <p>Redirects are followed with {@link HttpClient.Redirect#NORMAL}. That policy does not
     * follow HTTPS to HTTP redirects, so such a redirect is returned to this method as a 3xx
     * response. Only 2xx responses are accepted; anything else (including an unfollowed
     * redirect) is reported as a failure instead of being rendered as if it were the page.
     *
     * @param url absolute URL to fetch
     * @return the response body decoded as UTF-8
     * @throws IOException if the request fails, the status is not 2xx, or the body is null
     * @throws InterruptedException if the calling thread is interrupted while waiting
     */
    private String fetchRemoteHtml(String url) throws IOException, InterruptedException {
        HttpClient client =
                HttpClient.newBuilder()
                        .followRedirects(HttpClient.Redirect.NORMAL)
                        .connectTimeout(Duration.ofSeconds(10))
                        .build();

        HttpRequest request =
                HttpRequest.newBuilder(URI.create(url))
                        .timeout(Duration.ofSeconds(20))
                        .GET()
                        .header("User-Agent", "Stirling-PDF/URL-to-PDF")
                        .build();

        HttpResponse<String> response =
                client.send(request, HttpResponse.BodyHandlers.ofString(StandardCharsets.UTF_8));

        int status = response.statusCode();
        String body = response.body();
        // Accept only 2xx: a 3xx here means the redirect was not followed, so the body is a
        // redirect stub rather than the requested document.
        boolean successful = status >= 200 && status < 300;
        if (!successful || body == null) {
            throw new IOException("Failed to retrieve remote HTML. Status: " + status);
        }
        return body;
    }
    /**
     * Reports whether the given HTML references the {@code file:} scheme, either literally or in
     * one of the encoded forms undone by {@code normalizeForSchemeDetection}.
     *
     * @param htmlContent HTML text to inspect; may be {@code null} or empty
     * @return {@code true} if the normalized content matches {@code FILE_SCHEME_PATTERN};
     *     {@code false} otherwise, including for {@code null} or empty input
     */
    private boolean containsDisallowedUriScheme(String htmlContent) {
        boolean hasContent = htmlContent != null && !htmlContent.isEmpty();
        return hasContent
                && FILE_SCHEME_PATTERN.matcher(normalizeForSchemeDetection(htmlContent)).find();
    }
    /**
     * Produces a canonical form of the HTML for scheme matching.
     *
     * <p>Steps, in order: lowercase with {@link Locale#ROOT}, decode numeric HTML character
     * references, replace the named entities {@code &colon;}, {@code &sol;} and {@code &frasl;}
     * with the characters they stand for here, then decode percent escapes.
     *
     * @param htmlContent non-null HTML text
     * @return the normalized text
     */
    private String normalizeForSchemeDetection(String htmlContent) {
        String normalized = decodeNumericHtmlEntities(htmlContent.toLowerCase(Locale.ROOT));
        normalized = normalized.replace("&colon;", ":");
        normalized = normalized.replace("&sol;", "/");
        normalized = normalized.replace("&frasl;", "/");
        return percentDecode(normalized);
    }
    /**
     * Replaces every {@code %XY} escape in {@code content} with the character whose value is
     * the two-digit hexadecimal number {@code XY}.
     *
     * <p>A {@code %} that is not followed by two hexadecimal digits (as recognized by {@link
     * Character#digit(char, int)}) is copied through unchanged. In particular a sign character
     * is never accepted as part of an escape, so {@code %+1} and {@code %-1} stay literal.
     * Decoding is a single pass: text produced by decoding is not decoded again.
     *
     * @param content non-null text to decode
     * @return the decoded text
     */
    private String percentDecode(String content) {
        int length = content.length();
        StringBuilder result = new StringBuilder(length);
        int i = 0;
        while (i < length) {
            char current = content.charAt(i);
            if (current == '%' && i + 2 < length) {
                // Validate each digit individually; Integer.parseInt would also accept a
                // leading '+' or '-', which is not valid in a percent escape.
                int high = Character.digit(content.charAt(i + 1), 16);
                int low = Character.digit(content.charAt(i + 2), 16);
                if (high >= 0 && low >= 0) {
                    result.append((char) ((high << 4) | low));
                    i += 3;
                    continue;
                }
            }
            result.append(current);
            i++;
        }
        return result.toString();
    }
    /**
     * Replaces numeric HTML character references matched by {@code
     * NUMERIC_HTML_ENTITY_PATTERN} with the characters they denote.
     *
     * <p>A captured body starting with {@code x} is parsed as hexadecimal, otherwise as decimal.
     * The reference is decoded as a full Unicode code point, so values above U+FFFF become a
     * surrogate pair rather than being truncated to 16 bits. References that cannot be parsed
     * or that lie outside the Unicode range are left in the text exactly as written.
     *
     * @param content non-null text to decode
     * @return the text with decodable numeric references replaced
     */
    private String decodeNumericHtmlEntities(String content) {
        Matcher matcher = NUMERIC_HTML_ENTITY_PATTERN.matcher(content);
        StringBuffer decoded = new StringBuffer();
        while (matcher.find()) {
            // Default to the literal entity text; overwritten only on a successful decode.
            String replacement = matcher.group(0);
            String entityBody = matcher.group(1);
            boolean hex = entityBody.startsWith("x");
            try {
                int codePoint =
                        Integer.parseInt(hex ? entityBody.substring(1) : entityBody, hex ? 16 : 10);
                if (Character.isValidCodePoint(codePoint)) {
                    replacement = new String(Character.toChars(codePoint));
                }
            } catch (NumberFormatException ignored) {
                // Not a parsable number (e.g. hex letters without the x prefix, or overflow):
                // keep the literal entity text.
            }
            // Quote so that '$' and '\' in the replacement are never read as group references.
            matcher.appendReplacement(decoded, Matcher.quoteReplacement(replacement));
        }
        matcher.appendTail(decoded);
        return decoded.toString();
    }
    private String convertURLToFileName(String url) {
        String safeName = GeneralUtils.convertToFileName(url);
        if (safeName == null || safeName.isBlank()) {
--- a/app/core/src/main/resources/messages_en_GB.properties
+++ b/app/core/src/main/resources/messages_en_GB.properties
@ -194,6 +194,7 @@ error.fileFormatRequired=File must be in {0} format
 error.invalidFormat=Invalid {0} format: {1}
 error.endpointDisabled=This endpoint has been disabled by the admin
 error.urlNotReachable=URL is not reachable, please provide a valid URL
 error.disallowedUrlContent=URL content references disallowed resources and cannot be converted
 error.invalidUrlFormat=Invalid URL format provided. The provided format is invalid.
 # DPI and image rendering messages - used by frontend for dynamic translation
--- a/app/core/src/test/java/stirling/software/SPDF/controller/api/converters/ConvertWebsiteToPdfTest.java
+++ b/app/core/src/test/java/stirling/software/SPDF/controller/api/converters/ConvertWebsiteToPdfTest.java
@ -3,14 +3,19 @@ package stirling.software.SPDF.controller.api.converters;
 import static org.junit.jupiter.api.Assertions.*;
 import static org.mockito.ArgumentMatchers.any;
 import static org.mockito.ArgumentMatchers.anyString;
 import static org.mockito.ArgumentMatchers.eq;
 import static org.mockito.Mockito.when;
 import java.io.File;
 import java.io.IOException;
 import java.lang.reflect.Method;
 import java.net.URI;
 import java.net.http.HttpClient;
 import java.net.http.HttpRequest;
 import java.net.http.HttpResponse;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.time.Duration;
 import java.util.List;
 import org.apache.pdfbox.pdmodel.PDDocument;
@ -51,18 +56,18 @@ public class ConvertWebsiteToPdfTest {
    void setUp() throws Exception {
        mocks = MockitoAnnotations.openMocks(this);
-        // Feature einschalten (ggf. Struktur an dein Projekt anpassen)
+        // Enable feature (adjust structure for your project if necessary)
        applicationProperties = new ApplicationProperties();
        applicationProperties.getSystem().setEnableUrlToPDF(true);
-        // Stubs, falls der Code weiterlaufen sollte
+        // Stubs in case the code continues to run
        when(runtimePathConfig.getWeasyPrintPath()).thenReturn("/usr/bin/weasyprint");
        when(pdfDocumentFactory.load(any(File.class))).thenReturn(new PDDocument());
-        // SUT bauen
+        // Build SUT
        sut = new ConvertWebsiteToPDF(pdfDocumentFactory, runtimePathConfig, applicationProperties);
-        // RequestContext für ServletUriComponentsBuilder bereitstellen
+        // Provide RequestContext for ServletUriComponentsBuilder
        MockHttpServletRequest req = new MockHttpServletRequest();
        req.setScheme("http");
        req.setServerName("localhost");
@ -94,7 +99,7 @@ public class ConvertWebsiteToPdfTest {
    @Test
    void redirect_with_error_when_url_is_not_reachable() throws Exception {
        UrlToPdfRequest request = new UrlToPdfRequest();
-        // .invalid ist per RFC reserviert und nicht auflösbar
+        // .invalid is reserved by RFC and not resolvable
        request.setUrlInput("https://nonexistent.invalid/");
        ResponseEntity<?> resp = sut.urlToPdf(request);
@ -109,7 +114,7 @@ public class ConvertWebsiteToPdfTest {
    @Test
    void redirect_with_error_when_endpoint_disabled() throws Exception {
-        // Feature deaktivieren
+        // Disable feature
        applicationProperties.getSystem().setEnableUrlToPDF(false);
        UrlToPdfRequest request = new UrlToPdfRequest();
@ -135,9 +140,9 @@ public class ConvertWebsiteToPdfTest {
        String out = (String) m.invoke(sut, in);
        assertTrue(out.endsWith(".pdf"));
-        // Nur A–Z, a–z, 0–9, Unterstrich und Punkt erlaubt
+        // Only A–Z, a–z, 0–9, underscore and dot allowed
        assertTrue(out.matches("[A-Za-z0-9_]+\\.pdf"));
-        // keine Truncation hier (Quelle ist nicht so lang)
+        // no truncation here (source not that long)
        assertTrue(out.length() <= 54);
    }
@ -147,14 +152,14 @@ public class ConvertWebsiteToPdfTest {
                ConvertWebsiteToPDF.class.getDeclaredMethod("convertURLToFileName", String.class);
        m.setAccessible(true);
-        // Sehr lange URL → löst Truncation aus
+        // Very long URL -> triggers truncation
        String longUrl =
                "https://very-very-long-domain.example.com/some/really/long/path/with?many=params&and=chars";
        String out = (String) m.invoke(sut, longUrl);
        assertTrue(out.endsWith(".pdf"));
        assertTrue(out.matches("[A-Za-z0-9_]+\\.pdf"));
-        // safeName ist auf 50 begrenzt → total max 54 inkl. ".pdf"
+        // safeName limited to 50 -> total max 54 including '.pdf'
        assertTrue(out.length() <= 54, "Filename should be truncated to 50 + '.pdf'");
    }
@ -165,25 +170,26 @@ public class ConvertWebsiteToPdfTest {
        try (MockedStatic<ProcessExecutor> pe = Mockito.mockStatic(ProcessExecutor.class);
                MockedStatic<WebResponseUtils> wr = Mockito.mockStatic(WebResponseUtils.class);
-                MockedStatic<GeneralUtils> gu = Mockito.mockStatic(GeneralUtils.class)) {
+                MockedStatic<GeneralUtils> gu = Mockito.mockStatic(GeneralUtils.class);
                MockedStatic<HttpClient> httpClient = mockHttpClientReturning("<html></html>")) {
-            // URL-Checks positiv erzwingen
+            // Force URL checks to be positive
            gu.when(() -> GeneralUtils.isValidURL("https://example.com")).thenReturn(true);
            gu.when(() -> GeneralUtils.isURLReachable("https://example.com")).thenReturn(true);
-            // richtiger ProcessExecutor!
+            // correct ProcessExecutor!
            ProcessExecutor mockExec = Mockito.mock(ProcessExecutor.class);
            pe.when(() -> ProcessExecutor.getInstance(Processes.WEASYPRINT)).thenReturn(mockExec);
            @SuppressWarnings("unchecked")
            ArgumentCaptor<List<String>> cmdCaptor = ArgumentCaptor.forClass(List.class);
-            // Rückgabewert typgerecht
+            // Return value of correct type
            ProcessExecutorResult dummyResult = Mockito.mock(ProcessExecutorResult.class);
            when(mockExec.runCommandWithOutputHandling(cmdCaptor.capture()))
                    .thenReturn(dummyResult);
-            // WebResponseUtils mocken
+            // Mock WebResponseUtils
            ResponseEntity<byte[]> fakeResponse = ResponseEntity.ok(new byte[0]);
            wr.when(() -> WebResponseUtils.pdfDocToWebResponse(any(PDDocument.class), anyString()))
                    .thenReturn(fakeResponse);
@ -194,20 +200,23 @@ public class ConvertWebsiteToPdfTest {
            // Assert – Response OK
            assertEquals(HttpStatus.OK, resp.getStatusCode());
-            // Assert – WeasyPrint-Kommando korrekt
+            // Assert – WeasyPrint command correct
            List<String> cmd = cmdCaptor.getValue();
            assertNotNull(cmd);
            assertEquals("/usr/bin/weasyprint", cmd.get(0));
-            assertEquals("https://example.com", cmd.get(1));
+            assertTrue(cmd.size() >= 6, "WeasyPrint should receive HTML input and output path");
-            assertEquals("--pdf-forms", cmd.get(2));
+            String htmlPathStr = cmd.get(1);
-            assertTrue(cmd.size() >= 4, "WeasyPrint sollte einen Output-Pfad erhalten");
+            assertEquals("--base-url", cmd.get(2));
-            String outPathStr = cmd.get(3);
+            assertEquals("https://example.com", cmd.get(3));
            assertEquals("--pdf-forms", cmd.get(4));
            String outPathStr = cmd.get(5);
            assertNotNull(outPathStr);
-            // Temp-Datei muss im finally gelöscht sein
+            // Temp file must be deleted in finally
            Path outPath = Path.of(outPathStr);
            assertFalse(
-                    Files.exists(outPath), "Temp-Output-Datei sollte nach dem Call gelöscht sein");
+                    Files.exists(Path.of(htmlPathStr)),
                    "Temp HTML file should be deleted after the call");
        }
    }
@ -218,21 +227,32 @@ public class ConvertWebsiteToPdfTest {
        request.setUrlInput("https://example.com");
        Path preCreatedTemp = java.nio.file.Files.createTempFile("test_output_", ".pdf");
        Path htmlTemp = java.nio.file.Files.createTempFile("test_input_", ".html");
        try (MockedStatic<GeneralUtils> gu = Mockito.mockStatic(GeneralUtils.class);
                MockedStatic<ProcessExecutor> pe = Mockito.mockStatic(ProcessExecutor.class);
                MockedStatic<WebResponseUtils> wr = Mockito.mockStatic(WebResponseUtils.class);
-                MockedStatic<Files> files = Mockito.mockStatic(Files.class)) {
+                MockedStatic<Files> files = Mockito.mockStatic(Files.class);
                MockedStatic<HttpClient> httpClient = mockHttpClientReturning("<html></html>")) {
-            // URL-Checks positiv
+            // Force URL checks to be positive
            gu.when(() -> GeneralUtils.isValidURL("https://example.com")).thenReturn(true);
            gu.when(() -> GeneralUtils.isURLReachable("https://example.com")).thenReturn(true);
-            // Temp-Datei erzwingen + Delete-Fehler provozieren
+            // Force temp files + provoke delete error
            files.when(() -> Files.createTempFile("url_input_", ".html")).thenReturn(htmlTemp);
            files.when(() -> Files.createTempFile("output_", ".pdf")).thenReturn(preCreatedTemp);
            files.when(
                            () ->
                                    Files.writeString(
                                            eq(htmlTemp),
                                            anyString(),
                                            eq(java.nio.charset.StandardCharsets.UTF_8)))
                    .thenReturn(htmlTemp);
            files.when(() -> Files.deleteIfExists(htmlTemp)).thenReturn(true);
            files.when(() -> Files.deleteIfExists(preCreatedTemp))
                    .thenThrow(new IOException("fail delete"));
-            files.when(() -> Files.exists(preCreatedTemp)).thenReturn(true); // für den Assert
+            files.when(() -> Files.exists(preCreatedTemp)).thenReturn(true); // for the assert
            // ProcessExecutor
            ProcessExecutor mockExec = Mockito.mock(ProcessExecutor.class);
@ -245,7 +265,7 @@ public class ConvertWebsiteToPdfTest {
            wr.when(() -> WebResponseUtils.pdfDocToWebResponse(any(PDDocument.class), anyString()))
                    .thenReturn(fakeResponse);
-            // Act: darf keine Exception werfen und soll eine Response liefern
+            // Act: should not throw and should return a Response
            ResponseEntity<?> resp = assertDoesNotThrow(() -> sut.urlToPdf(request));
            // Assert
@ -253,12 +273,56 @@ public class ConvertWebsiteToPdfTest {
            assertEquals(HttpStatus.OK, resp.getStatusCode());
            assertTrue(
                    java.nio.file.Files.exists(preCreatedTemp),
-                    "Temp-Datei sollte trotz Lösch-IOException noch existieren");
+                    "Temp file should still exist despite delete IOException");
        } finally {
            try {
                java.nio.file.Files.deleteIfExists(preCreatedTemp);
                java.nio.file.Files.deleteIfExists(htmlTemp);
            } catch (IOException ignore) {
            }
        }
    }
    /**
     * When the fetched page references a {@code file:} URL, the controller must answer with a
     * 303 redirect whose query string carries the {@code error.disallowedUrlContent} key.
     */
    @Test
    void redirect_with_error_when_disallowed_content_detected() throws Exception {
        String maliciousHtml = "<link rel=\"attachment\" href=\"file:///etc/passwd\">";
        UrlToPdfRequest request = new UrlToPdfRequest();
        request.setUrlInput("https://example.com");

        try (MockedStatic<GeneralUtils> generalUtils = Mockito.mockStatic(GeneralUtils.class);
                MockedStatic<HttpClient> httpClient = mockHttpClientReturning(maliciousHtml)) {
            // Let the URL pass the validity and reachability checks so the content check runs
            generalUtils
                    .when(() -> GeneralUtils.isValidURL("https://example.com"))
                    .thenReturn(true);
            generalUtils
                    .when(() -> GeneralUtils.isURLReachable("https://example.com"))
                    .thenReturn(true);

            ResponseEntity<?> resp = sut.urlToPdf(request);

            assertEquals(HttpStatus.SEE_OTHER, resp.getStatusCode());
            URI location = resp.getHeaders().getLocation();
            assertNotNull(location, "Location header expected");
            String query = location.getQuery();
            boolean carriesErrorKey =
                    query != null && query.contains("error=error.disallowedUrlContent");
            assertTrue(carriesErrorKey);
        }
    }
    /**
     * Installs a static mock of {@link HttpClient} so that {@code HttpClient.newBuilder()} yields
     * a builder whose client answers every {@code send} call with status 200 and the given body.
     *
     * <p>Only the builder calls {@code followRedirects(Redirect.NORMAL)}, {@code
     * connectTimeout(...)} and {@code build()} are stubbed. The caller owns the returned {@link
     * MockedStatic} and must close it, typically via try-with-resources.
     *
     * @param body response body the mocked client returns
     * @return the open static mock
     * @throws Exception declared because stubbing {@code HttpClient.send} names checked exceptions
     */
    @SuppressWarnings("unchecked") // Mockito.mock(Class) cannot carry generic type arguments
    private MockedStatic<HttpClient> mockHttpClientReturning(String body) throws Exception {
        // Build all instance mocks before opening the static mock, so that a failure while
        // stubbing them cannot leave a static mock registered on this thread.
        HttpResponse<String> response = Mockito.mock(HttpResponse.class);
        when(response.statusCode()).thenReturn(200);
        when(response.body()).thenReturn(body);

        HttpClient client = Mockito.mock(HttpClient.class);
        when(client.send(any(HttpRequest.class), any(HttpResponse.BodyHandler.class)))
                .thenReturn(response);

        HttpClient.Builder builder = Mockito.mock(HttpClient.Builder.class);
        when(builder.followRedirects(HttpClient.Redirect.NORMAL)).thenReturn(builder);
        when(builder.connectTimeout(any(Duration.class))).thenReturn(builder);
        when(builder.build()).thenReturn(client);

        MockedStatic<HttpClient> httpClientStatic = Mockito.mockStatic(HttpClient.class);
        try {
            httpClientStatic.when(HttpClient::newBuilder).thenReturn(builder);
        } catch (RuntimeException | Error e) {
            // Nobody else holds a reference yet, so release the static mock before propagating.
            httpClientStatic.close();
            throw e;
        }
        return httpClientStatic;
    }
 }