From 955a26f32b7697ceb7e159cc676c8738aa2b5012 Mon Sep 17 00:00:00 2001
From: Ludy <Ludy87@users.noreply.github.com>
Date: Thu, 16 Oct 2025 23:41:04 +0200
Subject: [PATCH] fix(security): Harden website-to-PDF conversion (#4638)

# Description of Changes

**What was changed**
- Fetch remote HTML content via `HttpClient` before invoking WeasyPrint
to inspect and sanitize input.
- Reject conversions when downloaded HTML contains disallowed `file:`
scheme references (including encoded/obfuscated variants) using a
compiled `Pattern`.
- Write fetched HTML to a secured temporary file and pass that path to
WeasyPrint instead of the remote URL.
- Provide `--base-url` to WeasyPrint so relative resources resolve
correctly while avoiding direct remote fetching as the primary input.
- Add comprehensive unit tests:
- Ensure command invocation uses local temp HTML + `--base-url` and
cleans up temp files.
  - Verify redirect with error when disallowed content is detected.
  - Cover temp file deletion behavior and error handling paths.
- Improve resource cleanup in `finally` blocks for both temp HTML and
output PDF artifacts.

**Why the change was made**
- Prevents traversal/local file exposure risks by blocking `file:` (and
encoded equivalents) discovered in fetched HTML.
- Reduces attack surface of URL-to-PDF by avoiding direct handing of
remote URLs to the renderer and enabling pre-validation.
- Strengthens deterministic behavior of conversions and improves safety
against SSRF-like vectors.

---

## Checklist

### General

- [ ] I have read the [Contribution
Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md)
- [ ] I have read the [Stirling-PDF Developer
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md)
(if applicable)
- [ ] I have read the [How to add new languages to
Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md)
(if applicable)
- [ ] I have performed a self-review of my own code
- [ ] My changes generate no new warnings

### Documentation

- [ ] I have updated relevant docs on [Stirling-PDF's doc
repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/)
(if functionality has heavily changed)
- [ ] I have read the section [Add New Translation
Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md#add-new-translation-tags)
(for new translation tags only)

### UI Changes (if applicable)

- [ ] Screenshots or videos demonstrating the UI changes are attached
(e.g., as comments or direct attachments in the PR)

### Testing (if applicable)

- [ ] I have tested my changes locally. Refer to the [Testing
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md#6-testing)
for more details.
---
 .../api/converters/ConvertWebsiteToPDF.java   | 123 ++++++++++++++++++
 .../main/resources/messages_en_GB.properties  |   1 +
 .../converters/ConvertWebsiteToPdfTest.java   | 120 +++++++++++++----
 3 files changed, 216 insertions(+), 28 deletions(-)

diff --git a/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertWebsiteToPDF.java b/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertWebsiteToPDF.java
index 4109db0e7..c35aa0282 100644
--- a/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertWebsiteToPDF.java
+++ b/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertWebsiteToPDF.java
@@ -2,10 +2,18 @@ package stirling.software.SPDF.controller.api.converters;
 
 import java.io.IOException;
 import java.net.URI;
+import java.net.http.HttpClient;
+import java.net.http.HttpRequest;
+import java.net.http.HttpResponse;
+import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.time.Duration;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Locale;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.springframework.http.HttpStatus;
@@ -44,6 +52,11 @@ public class ConvertWebsiteToPDF {
     private final RuntimePathConfig runtimePathConfig;
     private final ApplicationProperties applicationProperties;
 
+    private static final Pattern FILE_SCHEME_PATTERN =
+            Pattern.compile("(?<![a-z0-9_])file\\s*:(?:/{1,3}|%2f|%5c|%3a|&#x2f;|&#47;)");
+
+    private static final Pattern NUMERIC_HTML_ENTITY_PATTERN = Pattern.compile("&#(x?[0-9a-f]+);");
+
     @PostMapping(consumes = MediaType.MULTIPART_FORM_DATA_VALUE, value = "/url/pdf")
     @Operation(
             summary = "Convert a URL to a PDF",
@@ -91,14 +104,33 @@ public class ConvertWebsiteToPDF {
         }
 
         Path tempOutputFile = null;
+        Path tempHtmlInput = null;
         PDDocument doc = null;
         try {
+            // Download the remote content first to ensure we don't allow dangerous schemes
+            String htmlContent = fetchRemoteHtml(URL);
+
+            if (containsDisallowedUriScheme(htmlContent)) {
+                URI rejectionLocation =
+                        uriComponentsBuilder
+                                .queryParam("error", "error.disallowedUrlContent")
+                                .build()
+                                .toUri();
+                log.warn("Rejected URL to PDF conversion due to disallowed content references");
+                return ResponseEntity.status(status).location(rejectionLocation).build();
+            }
+
+            tempHtmlInput = Files.createTempFile("url_input_", ".html");
+            Files.writeString(tempHtmlInput, htmlContent, StandardCharsets.UTF_8);
+
             // Prepare the output file path
             tempOutputFile = Files.createTempFile("output_", ".pdf");
 
             // Prepare the WeasyPrint command
             List<String> command = new ArrayList<>();
             command.add(runtimePathConfig.getWeasyPrintPath());
+            command.add(tempHtmlInput.toString());
+            command.add("--base-url");
             command.add(URL);
             command.add("--pdf-forms");
             command.add(tempOutputFile.toString());
@@ -120,6 +152,13 @@ public class ConvertWebsiteToPDF {
             }
             return response;
         } finally {
+            if (tempHtmlInput != null) {
+                try {
+                    Files.deleteIfExists(tempHtmlInput);
+                } catch (IOException e) {
+                    log.error("Error deleting temporary HTML input file", e);
+                }
+            }
 
             if (tempOutputFile != null) {
                 try {
@@ -131,6 +170,90 @@ public class ConvertWebsiteToPDF {
         }
     }
 
+    private String fetchRemoteHtml(String url) throws IOException, InterruptedException {
+        HttpClient client =
+                HttpClient.newBuilder()
+                        .followRedirects(HttpClient.Redirect.NORMAL)
+                        .connectTimeout(Duration.ofSeconds(10))
+                        .build();
+
+        HttpRequest request =
+                HttpRequest.newBuilder(URI.create(url))
+                        .timeout(Duration.ofSeconds(20))
+                        .GET()
+                        .header("User-Agent", "Stirling-PDF/URL-to-PDF")
+                        .build();
+
+        HttpResponse<String> response =
+                client.send(request, HttpResponse.BodyHandlers.ofString(StandardCharsets.UTF_8));
+
+        if (response.statusCode() >= 400 || response.body() == null) {
+            throw new IOException(
+                    "Failed to retrieve remote HTML. Status: " + response.statusCode());
+        }
+
+        return response.body();
+    }
+
+    private boolean containsDisallowedUriScheme(String htmlContent) {
+        if (htmlContent == null || htmlContent.isEmpty()) {
+            return false;
+        }
+
+        String normalized = normalizeForSchemeDetection(htmlContent);
+        return FILE_SCHEME_PATTERN.matcher(normalized).find();
+    }
+
+    private String normalizeForSchemeDetection(String htmlContent) {
+        String lowerCaseContent = htmlContent.toLowerCase(Locale.ROOT);
+        String decodedHtmlEntities = decodeNumericHtmlEntities(lowerCaseContent);
+        decodedHtmlEntities =
+                decodedHtmlEntities
+                        .replace("&colon;", ":")
+                        .replace("&sol;", "/")
+                        .replace("&frasl;", "/");
+        return percentDecode(decodedHtmlEntities);
+    }
+
+    private String percentDecode(String content) {
+        StringBuilder result = new StringBuilder(content.length());
+        for (int i = 0; i < content.length(); i++) {
+            char current = content.charAt(i);
+            if (current == '%' && i + 2 < content.length()) {
+                String hex = content.substring(i + 1, i + 3);
+                try {
+                    int value = Integer.parseInt(hex, 16);
+                    result.append((char) value);
+                    i += 2;
+                    continue;
+                } catch (NumberFormatException ignored) {
+                    // Fall through to append the literal characters when parsing fails
+                }
+            }
+            result.append(current);
+        }
+        return result.toString();
+    }
+
+    private String decodeNumericHtmlEntities(String content) {
+        Matcher matcher = NUMERIC_HTML_ENTITY_PATTERN.matcher(content);
+        StringBuffer decoded = new StringBuffer();
+        while (matcher.find()) {
+            String entityBody = matcher.group(1);
+            try {
+                int radix = entityBody.startsWith("x") ? 16 : 10;
+                int codePoint =
+                        Integer.parseInt(radix == 16 ? entityBody.substring(1) : entityBody, radix);
+                matcher.appendReplacement(
+                        decoded, Matcher.quoteReplacement(Character.toString((char) codePoint)));
+            } catch (NumberFormatException ex) {
+                matcher.appendReplacement(decoded, matcher.group(0));
+            }
+        }
+        matcher.appendTail(decoded);
+        return decoded.toString();
+    }
+
     private String convertURLToFileName(String url) {
         String safeName = GeneralUtils.convertToFileName(url);
         if (safeName == null || safeName.isBlank()) {
diff --git a/app/core/src/main/resources/messages_en_GB.properties b/app/core/src/main/resources/messages_en_GB.properties
index 1eee0ab11..86f6e928a 100644
--- a/app/core/src/main/resources/messages_en_GB.properties
+++ b/app/core/src/main/resources/messages_en_GB.properties
@@ -194,6 +194,7 @@ error.fileFormatRequired=File must be in {0} format
 error.invalidFormat=Invalid {0} format: {1}
 error.endpointDisabled=This endpoint has been disabled by the admin
 error.urlNotReachable=URL is not reachable, please provide a valid URL
+error.disallowedUrlContent=URL content references disallowed resources and cannot be converted
 error.invalidUrlFormat=Invalid URL format provided. The provided format is invalid.
 
 # DPI and image rendering messages - used by frontend for dynamic translation
diff --git a/app/core/src/test/java/stirling/software/SPDF/controller/api/converters/ConvertWebsiteToPdfTest.java b/app/core/src/test/java/stirling/software/SPDF/controller/api/converters/ConvertWebsiteToPdfTest.java
index a51dad6a7..9abcbae92 100644
--- a/app/core/src/test/java/stirling/software/SPDF/controller/api/converters/ConvertWebsiteToPdfTest.java
+++ b/app/core/src/test/java/stirling/software/SPDF/controller/api/converters/ConvertWebsiteToPdfTest.java
@@ -3,14 +3,19 @@ package stirling.software.SPDF.controller.api.converters;
 import static org.junit.jupiter.api.Assertions.*;
 import static org.mockito.ArgumentMatchers.any;
 import static org.mockito.ArgumentMatchers.anyString;
+import static org.mockito.ArgumentMatchers.eq;
 import static org.mockito.Mockito.when;
 
 import java.io.File;
 import java.io.IOException;
 import java.lang.reflect.Method;
 import java.net.URI;
+import java.net.http.HttpClient;
+import java.net.http.HttpRequest;
+import java.net.http.HttpResponse;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.time.Duration;
 import java.util.List;
 
 import org.apache.pdfbox.pdmodel.PDDocument;
@@ -51,18 +56,18 @@ public class ConvertWebsiteToPdfTest {
     void setUp() throws Exception {
         mocks = MockitoAnnotations.openMocks(this);
 
-        // Feature einschalten (ggf. Struktur an dein Projekt anpassen)
+        // Enable feature (adjust structure for your project if necessary)
         applicationProperties = new ApplicationProperties();
         applicationProperties.getSystem().setEnableUrlToPDF(true);
 
-        // Stubs, falls der Code weiterlaufen sollte
+        // Stubs in case the code continues to run
         when(runtimePathConfig.getWeasyPrintPath()).thenReturn("/usr/bin/weasyprint");
         when(pdfDocumentFactory.load(any(File.class))).thenReturn(new PDDocument());
 
-        // SUT bauen
+        // Build SUT
         sut = new ConvertWebsiteToPDF(pdfDocumentFactory, runtimePathConfig, applicationProperties);
 
-        // RequestContext für ServletUriComponentsBuilder bereitstellen
+        // Provide RequestContext for ServletUriComponentsBuilder
         MockHttpServletRequest req = new MockHttpServletRequest();
         req.setScheme("http");
         req.setServerName("localhost");
@@ -94,7 +99,7 @@ public class ConvertWebsiteToPdfTest {
     @Test
     void redirect_with_error_when_url_is_not_reachable() throws Exception {
         UrlToPdfRequest request = new UrlToPdfRequest();
-        // .invalid ist per RFC reserviert und nicht auflösbar
+        // .invalid is reserved by RFC and not resolvable
         request.setUrlInput("https://nonexistent.invalid/");
 
         ResponseEntity<?> resp = sut.urlToPdf(request);
@@ -109,7 +114,7 @@ public class ConvertWebsiteToPdfTest {
 
     @Test
     void redirect_with_error_when_endpoint_disabled() throws Exception {
-        // Feature deaktivieren
+        // Disable feature
         applicationProperties.getSystem().setEnableUrlToPDF(false);
 
         UrlToPdfRequest request = new UrlToPdfRequest();
@@ -135,9 +140,9 @@ public class ConvertWebsiteToPdfTest {
         String out = (String) m.invoke(sut, in);
 
         assertTrue(out.endsWith(".pdf"));
-        // Nur A–Z, a–z, 0–9, Unterstrich und Punkt erlaubt
+        // Only A–Z, a–z, 0–9, underscore and dot allowed
         assertTrue(out.matches("[A-Za-z0-9_]+\\.pdf"));
-        // keine Truncation hier (Quelle ist nicht so lang)
+        // no truncation here (source not that long)
         assertTrue(out.length() <= 54);
     }
 
@@ -147,14 +152,14 @@ public class ConvertWebsiteToPdfTest {
                 ConvertWebsiteToPDF.class.getDeclaredMethod("convertURLToFileName", String.class);
         m.setAccessible(true);
 
-        // Sehr lange URL → löst Truncation aus
+        // Very long URL -> triggers truncation
         String longUrl =
                 "https://very-very-long-domain.example.com/some/really/long/path/with?many=params&and=chars";
         String out = (String) m.invoke(sut, longUrl);
 
         assertTrue(out.endsWith(".pdf"));
         assertTrue(out.matches("[A-Za-z0-9_]+\\.pdf"));
-        // safeName ist auf 50 begrenzt → total max 54 inkl. ".pdf"
+        // safeName limited to 50 -> total max 54 including '.pdf'
         assertTrue(out.length() <= 54, "Filename should be truncated to 50 + '.pdf'");
     }
 
@@ -165,25 +170,26 @@ public class ConvertWebsiteToPdfTest {
 
         try (MockedStatic<ProcessExecutor> pe = Mockito.mockStatic(ProcessExecutor.class);
                 MockedStatic<WebResponseUtils> wr = Mockito.mockStatic(WebResponseUtils.class);
-                MockedStatic<GeneralUtils> gu = Mockito.mockStatic(GeneralUtils.class)) {
+                MockedStatic<GeneralUtils> gu = Mockito.mockStatic(GeneralUtils.class);
+                MockedStatic<HttpClient> httpClient = mockHttpClientReturning("<html></html>")) {
 
-            // URL-Checks positiv erzwingen
+            // Force URL checks to be positive
             gu.when(() -> GeneralUtils.isValidURL("https://example.com")).thenReturn(true);
             gu.when(() -> GeneralUtils.isURLReachable("https://example.com")).thenReturn(true);
 
-            // richtiger ProcessExecutor!
+            // correct ProcessExecutor!
             ProcessExecutor mockExec = Mockito.mock(ProcessExecutor.class);
             pe.when(() -> ProcessExecutor.getInstance(Processes.WEASYPRINT)).thenReturn(mockExec);
 
             @SuppressWarnings("unchecked")
             ArgumentCaptor<List<String>> cmdCaptor = ArgumentCaptor.forClass(List.class);
 
-            // Rückgabewert typgerecht
+            // Return value of correct type
             ProcessExecutorResult dummyResult = Mockito.mock(ProcessExecutorResult.class);
             when(mockExec.runCommandWithOutputHandling(cmdCaptor.capture()))
                     .thenReturn(dummyResult);
 
-            // WebResponseUtils mocken
+            // Mock WebResponseUtils
             ResponseEntity<byte[]> fakeResponse = ResponseEntity.ok(new byte[0]);
             wr.when(() -> WebResponseUtils.pdfDocToWebResponse(any(PDDocument.class), anyString()))
                     .thenReturn(fakeResponse);
@@ -194,20 +200,23 @@ public class ConvertWebsiteToPdfTest {
             // Assert – Response OK
             assertEquals(HttpStatus.OK, resp.getStatusCode());
 
-            // Assert – WeasyPrint-Kommando korrekt
+            // Assert – WeasyPrint command correct
             List<String> cmd = cmdCaptor.getValue();
             assertNotNull(cmd);
             assertEquals("/usr/bin/weasyprint", cmd.get(0));
-            assertEquals("https://example.com", cmd.get(1));
-            assertEquals("--pdf-forms", cmd.get(2));
-            assertTrue(cmd.size() >= 4, "WeasyPrint sollte einen Output-Pfad erhalten");
-            String outPathStr = cmd.get(3);
+            assertTrue(cmd.size() >= 6, "WeasyPrint should receive HTML input and output path");
+            String htmlPathStr = cmd.get(1);
+            assertEquals("--base-url", cmd.get(2));
+            assertEquals("https://example.com", cmd.get(3));
+            assertEquals("--pdf-forms", cmd.get(4));
+            String outPathStr = cmd.get(5);
             assertNotNull(outPathStr);
 
-            // Temp-Datei muss im finally gelöscht sein
+            // Temp file must be deleted in finally
             Path outPath = Path.of(outPathStr);
             assertFalse(
-                    Files.exists(outPath), "Temp-Output-Datei sollte nach dem Call gelöscht sein");
+                    Files.exists(Path.of(htmlPathStr)),
+                    "Temp HTML file should be deleted after the call");
         }
     }
 
@@ -218,21 +227,32 @@ public class ConvertWebsiteToPdfTest {
         request.setUrlInput("https://example.com");
 
         Path preCreatedTemp = java.nio.file.Files.createTempFile("test_output_", ".pdf");
+        Path htmlTemp = java.nio.file.Files.createTempFile("test_input_", ".html");
 
         try (MockedStatic<GeneralUtils> gu = Mockito.mockStatic(GeneralUtils.class);
                 MockedStatic<ProcessExecutor> pe = Mockito.mockStatic(ProcessExecutor.class);
                 MockedStatic<WebResponseUtils> wr = Mockito.mockStatic(WebResponseUtils.class);
-                MockedStatic<Files> files = Mockito.mockStatic(Files.class)) {
+                MockedStatic<Files> files = Mockito.mockStatic(Files.class);
+                MockedStatic<HttpClient> httpClient = mockHttpClientReturning("<html></html>")) {
 
-            // URL-Checks positiv
+            // Force URL checks to be positive
             gu.when(() -> GeneralUtils.isValidURL("https://example.com")).thenReturn(true);
             gu.when(() -> GeneralUtils.isURLReachable("https://example.com")).thenReturn(true);
 
-            // Temp-Datei erzwingen + Delete-Fehler provozieren
+            // Force temp files + provoke delete error
+            files.when(() -> Files.createTempFile("url_input_", ".html")).thenReturn(htmlTemp);
             files.when(() -> Files.createTempFile("output_", ".pdf")).thenReturn(preCreatedTemp);
+            files.when(
+                            () ->
+                                    Files.writeString(
+                                            eq(htmlTemp),
+                                            anyString(),
+                                            eq(java.nio.charset.StandardCharsets.UTF_8)))
+                    .thenReturn(htmlTemp);
+            files.when(() -> Files.deleteIfExists(htmlTemp)).thenReturn(true);
             files.when(() -> Files.deleteIfExists(preCreatedTemp))
                     .thenThrow(new IOException("fail delete"));
-            files.when(() -> Files.exists(preCreatedTemp)).thenReturn(true); // für den Assert
+            files.when(() -> Files.exists(preCreatedTemp)).thenReturn(true); // for the assert
 
             // ProcessExecutor
             ProcessExecutor mockExec = Mockito.mock(ProcessExecutor.class);
@@ -245,7 +265,7 @@ public class ConvertWebsiteToPdfTest {
             wr.when(() -> WebResponseUtils.pdfDocToWebResponse(any(PDDocument.class), anyString()))
                     .thenReturn(fakeResponse);
 
-            // Act: darf keine Exception werfen und soll eine Response liefern
+            // Act: should not throw and should return a Response
             ResponseEntity<?> resp = assertDoesNotThrow(() -> sut.urlToPdf(request));
 
             // Assert
@@ -253,12 +273,56 @@ public class ConvertWebsiteToPdfTest {
             assertEquals(HttpStatus.OK, resp.getStatusCode());
             assertTrue(
                     java.nio.file.Files.exists(preCreatedTemp),
-                    "Temp-Datei sollte trotz Lösch-IOException noch existieren");
+                    "Temp file should still exist despite delete IOException");
         } finally {
             try {
                 java.nio.file.Files.deleteIfExists(preCreatedTemp);
+                java.nio.file.Files.deleteIfExists(htmlTemp);
             } catch (IOException ignore) {
             }
         }
     }
+
+    @Test
+    void redirect_with_error_when_disallowed_content_detected() throws Exception {
+        UrlToPdfRequest request = new UrlToPdfRequest();
+        request.setUrlInput("https://example.com");
+
+        try (MockedStatic<GeneralUtils> gu = Mockito.mockStatic(GeneralUtils.class);
+                MockedStatic<HttpClient> httpClient =
+                        mockHttpClientReturning(
+                                "<link rel=\"attachment\" href=\"file:///etc/passwd\">"); ) {
+
+            gu.when(() -> GeneralUtils.isValidURL("https://example.com")).thenReturn(true);
+            gu.when(() -> GeneralUtils.isURLReachable("https://example.com")).thenReturn(true);
+
+            ResponseEntity<?> resp = sut.urlToPdf(request);
+
+            assertEquals(HttpStatus.SEE_OTHER, resp.getStatusCode());
+            URI location = resp.getHeaders().getLocation();
+            assertNotNull(location, "Location header expected");
+            assertTrue(
+                    location.getQuery() != null
+                            && location.getQuery().contains("error=error.disallowedUrlContent"));
+        }
+    }
+
+    private MockedStatic<HttpClient> mockHttpClientReturning(String body) throws Exception {
+        MockedStatic<HttpClient> httpClientStatic = Mockito.mockStatic(HttpClient.class);
+        HttpClient.Builder builder = Mockito.mock(HttpClient.Builder.class);
+        HttpClient client = Mockito.mock(HttpClient.class);
+        HttpResponse<String> response = Mockito.mock(HttpResponse.class);
+
+        httpClientStatic.when(HttpClient::newBuilder).thenReturn(builder);
+        when(builder.followRedirects(HttpClient.Redirect.NORMAL)).thenReturn(builder);
+        when(builder.connectTimeout(any(Duration.class))).thenReturn(builder);
+        when(builder.build()).thenReturn(client);
+
+        when(client.send(any(HttpRequest.class), any(HttpResponse.BodyHandler.class)))
+                .thenReturn(response);
+        when(response.statusCode()).thenReturn(200);
+        when(response.body()).thenReturn(body);
+
+        return httpClientStatic;
+    }
 }