From 955a26f32b7697ceb7e159cc676c8738aa2b5012 Mon Sep 17 00:00:00 2001 From: Ludy Date: Thu, 16 Oct 2025 23:41:04 +0200 Subject: [PATCH] fix(security): Harden website-to-PDF conversion (#4638) # Description of Changes **What was changed** - Fetch remote HTML content via `HttpClient` before invoking WeasyPrint to inspect and sanitize input. - Reject conversions when downloaded HTML contains disallowed `file:` scheme references (including encoded/obfuscated variants) using a compiled `Pattern`. - Write fetched HTML to a secured temporary file and pass that path to WeasyPrint instead of the remote URL. - Provide `--base-url` to WeasyPrint so relative resources resolve correctly while avoiding direct remote fetching as the primary input. - Add comprehensive unit tests: - Ensure command invocation uses local temp HTML + `--base-url` and cleans up temp files. - Verify redirect with error when disallowed content is detected. - Cover temp file deletion behavior and error handling paths. - Improve resource cleanup in `finally` blocks for both temp HTML and output PDF artifacts. **Why the change was made** - Prevents traversal/local file exposure risks by blocking `file:` (and encoded equivalents) discovered in fetched HTML. - Reduces attack surface of URL-to-PDF by avoiding direct handing of remote URLs to the renderer and enabling pre-validation. - Strengthens deterministic behavior of conversions and improves safety against SSRF-like vectors. --- ## Checklist ### General - [ ] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md) - [ ] I have read the [Stirling-PDF Developer Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md) (if applicable) - [ ] I have read the [How to add new languages to Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md) (if applicable) - [ ] I have performed a self-review of my own code - [ ] My changes generate no new warnings ### Documentation - [ ] I have updated relevant docs on [Stirling-PDF's doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) (if functionality has heavily changed) - [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only) ### UI Changes (if applicable) - [ ] Screenshots or videos demonstrating the UI changes are attached (e.g., as comments or direct attachments in the PR) ### Testing (if applicable) - [ ] I have tested my changes locally. Refer to the [Testing Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md#6-testing) for more details. --- .../api/converters/ConvertWebsiteToPDF.java | 123 ++++++++++++++++++ .../main/resources/messages_en_GB.properties | 1 + .../converters/ConvertWebsiteToPdfTest.java | 120 +++++++++++++---- 3 files changed, 216 insertions(+), 28 deletions(-) diff --git a/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertWebsiteToPDF.java b/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertWebsiteToPDF.java index 4109db0e7..c35aa0282 100644 --- a/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertWebsiteToPDF.java +++ b/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertWebsiteToPDF.java @@ -2,10 +2,18 @@ package stirling.software.SPDF.controller.api.converters; import java.io.IOException; import java.net.URI; +import java.net.http.HttpClient; +import java.net.http.HttpRequest; +import java.net.http.HttpResponse; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; +import java.time.Duration; import java.util.ArrayList; import java.util.List; +import java.util.Locale; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.pdfbox.pdmodel.PDDocument; import org.springframework.http.HttpStatus; @@ -44,6 +52,11 @@ public class ConvertWebsiteToPDF { private final RuntimePathConfig runtimePathConfig; private final ApplicationProperties applicationProperties; + private static final Pattern FILE_SCHEME_PATTERN = + Pattern.compile("(? command = new ArrayList<>(); command.add(runtimePathConfig.getWeasyPrintPath()); + command.add(tempHtmlInput.toString()); + command.add("--base-url"); command.add(URL); command.add("--pdf-forms"); command.add(tempOutputFile.toString()); @@ -120,6 +152,13 @@ public class ConvertWebsiteToPDF { } return response; } finally { + if (tempHtmlInput != null) { + try { + Files.deleteIfExists(tempHtmlInput); + } catch (IOException e) { + log.error("Error deleting temporary HTML input file", e); + } + } if (tempOutputFile != null) { try { @@ -131,6 +170,90 @@ public class ConvertWebsiteToPDF { } } + private String fetchRemoteHtml(String url) throws IOException, InterruptedException { + HttpClient client = + HttpClient.newBuilder() + .followRedirects(HttpClient.Redirect.NORMAL) + .connectTimeout(Duration.ofSeconds(10)) + .build(); + + HttpRequest request = + HttpRequest.newBuilder(URI.create(url)) + .timeout(Duration.ofSeconds(20)) + .GET() + .header("User-Agent", "Stirling-PDF/URL-to-PDF") + .build(); + + HttpResponse response = + client.send(request, HttpResponse.BodyHandlers.ofString(StandardCharsets.UTF_8)); + + if (response.statusCode() >= 400 || response.body() == null) { + throw new IOException( + "Failed to retrieve remote HTML. Status: " + response.statusCode()); + } + + return response.body(); + } + + private boolean containsDisallowedUriScheme(String htmlContent) { + if (htmlContent == null || htmlContent.isEmpty()) { + return false; + } + + String normalized = normalizeForSchemeDetection(htmlContent); + return FILE_SCHEME_PATTERN.matcher(normalized).find(); + } + + private String normalizeForSchemeDetection(String htmlContent) { + String lowerCaseContent = htmlContent.toLowerCase(Locale.ROOT); + String decodedHtmlEntities = decodeNumericHtmlEntities(lowerCaseContent); + decodedHtmlEntities = + decodedHtmlEntities + .replace(":", ":") + .replace("/", "/") + .replace("⁄", "/"); + return percentDecode(decodedHtmlEntities); + } + + private String percentDecode(String content) { + StringBuilder result = new StringBuilder(content.length()); + for (int i = 0; i < content.length(); i++) { + char current = content.charAt(i); + if (current == '%' && i + 2 < content.length()) { + String hex = content.substring(i + 1, i + 3); + try { + int value = Integer.parseInt(hex, 16); + result.append((char) value); + i += 2; + continue; + } catch (NumberFormatException ignored) { + // Fall through to append the literal characters when parsing fails + } + } + result.append(current); + } + return result.toString(); + } + + private String decodeNumericHtmlEntities(String content) { + Matcher matcher = NUMERIC_HTML_ENTITY_PATTERN.matcher(content); + StringBuffer decoded = new StringBuffer(); + while (matcher.find()) { + String entityBody = matcher.group(1); + try { + int radix = entityBody.startsWith("x") ? 16 : 10; + int codePoint = + Integer.parseInt(radix == 16 ? entityBody.substring(1) : entityBody, radix); + matcher.appendReplacement( + decoded, Matcher.quoteReplacement(Character.toString((char) codePoint))); + } catch (NumberFormatException ex) { + matcher.appendReplacement(decoded, matcher.group(0)); + } + } + matcher.appendTail(decoded); + return decoded.toString(); + } + private String convertURLToFileName(String url) { String safeName = GeneralUtils.convertToFileName(url); if (safeName == null || safeName.isBlank()) { diff --git a/app/core/src/main/resources/messages_en_GB.properties b/app/core/src/main/resources/messages_en_GB.properties index 1eee0ab11..86f6e928a 100644 --- a/app/core/src/main/resources/messages_en_GB.properties +++ b/app/core/src/main/resources/messages_en_GB.properties @@ -194,6 +194,7 @@ error.fileFormatRequired=File must be in {0} format error.invalidFormat=Invalid {0} format: {1} error.endpointDisabled=This endpoint has been disabled by the admin error.urlNotReachable=URL is not reachable, please provide a valid URL +error.disallowedUrlContent=URL content references disallowed resources and cannot be converted error.invalidUrlFormat=Invalid URL format provided. The provided format is invalid. # DPI and image rendering messages - used by frontend for dynamic translation diff --git a/app/core/src/test/java/stirling/software/SPDF/controller/api/converters/ConvertWebsiteToPdfTest.java b/app/core/src/test/java/stirling/software/SPDF/controller/api/converters/ConvertWebsiteToPdfTest.java index a51dad6a7..9abcbae92 100644 --- a/app/core/src/test/java/stirling/software/SPDF/controller/api/converters/ConvertWebsiteToPdfTest.java +++ b/app/core/src/test/java/stirling/software/SPDF/controller/api/converters/ConvertWebsiteToPdfTest.java @@ -3,14 +3,19 @@ package stirling.software.SPDF.controller.api.converters; import static org.junit.jupiter.api.Assertions.*; import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Mockito.when; import java.io.File; import java.io.IOException; import java.lang.reflect.Method; import java.net.URI; +import java.net.http.HttpClient; +import java.net.http.HttpRequest; +import java.net.http.HttpResponse; import java.nio.file.Files; import java.nio.file.Path; +import java.time.Duration; import java.util.List; import org.apache.pdfbox.pdmodel.PDDocument; @@ -51,18 +56,18 @@ public class ConvertWebsiteToPdfTest { void setUp() throws Exception { mocks = MockitoAnnotations.openMocks(this); - // Feature einschalten (ggf. Struktur an dein Projekt anpassen) + // Enable feature (adjust structure for your project if necessary) applicationProperties = new ApplicationProperties(); applicationProperties.getSystem().setEnableUrlToPDF(true); - // Stubs, falls der Code weiterlaufen sollte + // Stubs in case the code continues to run when(runtimePathConfig.getWeasyPrintPath()).thenReturn("/usr/bin/weasyprint"); when(pdfDocumentFactory.load(any(File.class))).thenReturn(new PDDocument()); - // SUT bauen + // Build SUT sut = new ConvertWebsiteToPDF(pdfDocumentFactory, runtimePathConfig, applicationProperties); - // RequestContext für ServletUriComponentsBuilder bereitstellen + // Provide RequestContext for ServletUriComponentsBuilder MockHttpServletRequest req = new MockHttpServletRequest(); req.setScheme("http"); req.setServerName("localhost"); @@ -94,7 +99,7 @@ public class ConvertWebsiteToPdfTest { @Test void redirect_with_error_when_url_is_not_reachable() throws Exception { UrlToPdfRequest request = new UrlToPdfRequest(); - // .invalid ist per RFC reserviert und nicht auflösbar + // .invalid is reserved by RFC and not resolvable request.setUrlInput("https://nonexistent.invalid/"); ResponseEntity resp = sut.urlToPdf(request); @@ -109,7 +114,7 @@ public class ConvertWebsiteToPdfTest { @Test void redirect_with_error_when_endpoint_disabled() throws Exception { - // Feature deaktivieren + // Disable feature applicationProperties.getSystem().setEnableUrlToPDF(false); UrlToPdfRequest request = new UrlToPdfRequest(); @@ -135,9 +140,9 @@ public class ConvertWebsiteToPdfTest { String out = (String) m.invoke(sut, in); assertTrue(out.endsWith(".pdf")); - // Nur A–Z, a–z, 0–9, Unterstrich und Punkt erlaubt + // Only A–Z, a–z, 0–9, underscore and dot allowed assertTrue(out.matches("[A-Za-z0-9_]+\\.pdf")); - // keine Truncation hier (Quelle ist nicht so lang) + // no truncation here (source not that long) assertTrue(out.length() <= 54); } @@ -147,14 +152,14 @@ public class ConvertWebsiteToPdfTest { ConvertWebsiteToPDF.class.getDeclaredMethod("convertURLToFileName", String.class); m.setAccessible(true); - // Sehr lange URL → löst Truncation aus + // Very long URL -> triggers truncation String longUrl = "https://very-very-long-domain.example.com/some/really/long/path/with?many=params&and=chars"; String out = (String) m.invoke(sut, longUrl); assertTrue(out.endsWith(".pdf")); assertTrue(out.matches("[A-Za-z0-9_]+\\.pdf")); - // safeName ist auf 50 begrenzt → total max 54 inkl. ".pdf" + // safeName limited to 50 -> total max 54 including '.pdf' assertTrue(out.length() <= 54, "Filename should be truncated to 50 + '.pdf'"); } @@ -165,25 +170,26 @@ public class ConvertWebsiteToPdfTest { try (MockedStatic pe = Mockito.mockStatic(ProcessExecutor.class); MockedStatic wr = Mockito.mockStatic(WebResponseUtils.class); - MockedStatic gu = Mockito.mockStatic(GeneralUtils.class)) { + MockedStatic gu = Mockito.mockStatic(GeneralUtils.class); + MockedStatic httpClient = mockHttpClientReturning("")) { - // URL-Checks positiv erzwingen + // Force URL checks to be positive gu.when(() -> GeneralUtils.isValidURL("https://example.com")).thenReturn(true); gu.when(() -> GeneralUtils.isURLReachable("https://example.com")).thenReturn(true); - // richtiger ProcessExecutor! + // correct ProcessExecutor! ProcessExecutor mockExec = Mockito.mock(ProcessExecutor.class); pe.when(() -> ProcessExecutor.getInstance(Processes.WEASYPRINT)).thenReturn(mockExec); @SuppressWarnings("unchecked") ArgumentCaptor> cmdCaptor = ArgumentCaptor.forClass(List.class); - // Rückgabewert typgerecht + // Return value of correct type ProcessExecutorResult dummyResult = Mockito.mock(ProcessExecutorResult.class); when(mockExec.runCommandWithOutputHandling(cmdCaptor.capture())) .thenReturn(dummyResult); - // WebResponseUtils mocken + // Mock WebResponseUtils ResponseEntity fakeResponse = ResponseEntity.ok(new byte[0]); wr.when(() -> WebResponseUtils.pdfDocToWebResponse(any(PDDocument.class), anyString())) .thenReturn(fakeResponse); @@ -194,20 +200,23 @@ public class ConvertWebsiteToPdfTest { // Assert – Response OK assertEquals(HttpStatus.OK, resp.getStatusCode()); - // Assert – WeasyPrint-Kommando korrekt + // Assert – WeasyPrint command correct List cmd = cmdCaptor.getValue(); assertNotNull(cmd); assertEquals("/usr/bin/weasyprint", cmd.get(0)); - assertEquals("https://example.com", cmd.get(1)); - assertEquals("--pdf-forms", cmd.get(2)); - assertTrue(cmd.size() >= 4, "WeasyPrint sollte einen Output-Pfad erhalten"); - String outPathStr = cmd.get(3); + assertTrue(cmd.size() >= 6, "WeasyPrint should receive HTML input and output path"); + String htmlPathStr = cmd.get(1); + assertEquals("--base-url", cmd.get(2)); + assertEquals("https://example.com", cmd.get(3)); + assertEquals("--pdf-forms", cmd.get(4)); + String outPathStr = cmd.get(5); assertNotNull(outPathStr); - // Temp-Datei muss im finally gelöscht sein + // Temp file must be deleted in finally Path outPath = Path.of(outPathStr); assertFalse( - Files.exists(outPath), "Temp-Output-Datei sollte nach dem Call gelöscht sein"); + Files.exists(Path.of(htmlPathStr)), + "Temp HTML file should be deleted after the call"); } } @@ -218,21 +227,32 @@ public class ConvertWebsiteToPdfTest { request.setUrlInput("https://example.com"); Path preCreatedTemp = java.nio.file.Files.createTempFile("test_output_", ".pdf"); + Path htmlTemp = java.nio.file.Files.createTempFile("test_input_", ".html"); try (MockedStatic gu = Mockito.mockStatic(GeneralUtils.class); MockedStatic pe = Mockito.mockStatic(ProcessExecutor.class); MockedStatic wr = Mockito.mockStatic(WebResponseUtils.class); - MockedStatic files = Mockito.mockStatic(Files.class)) { + MockedStatic files = Mockito.mockStatic(Files.class); + MockedStatic httpClient = mockHttpClientReturning("")) { - // URL-Checks positiv + // Force URL checks to be positive gu.when(() -> GeneralUtils.isValidURL("https://example.com")).thenReturn(true); gu.when(() -> GeneralUtils.isURLReachable("https://example.com")).thenReturn(true); - // Temp-Datei erzwingen + Delete-Fehler provozieren + // Force temp files + provoke delete error + files.when(() -> Files.createTempFile("url_input_", ".html")).thenReturn(htmlTemp); files.when(() -> Files.createTempFile("output_", ".pdf")).thenReturn(preCreatedTemp); + files.when( + () -> + Files.writeString( + eq(htmlTemp), + anyString(), + eq(java.nio.charset.StandardCharsets.UTF_8))) + .thenReturn(htmlTemp); + files.when(() -> Files.deleteIfExists(htmlTemp)).thenReturn(true); files.when(() -> Files.deleteIfExists(preCreatedTemp)) .thenThrow(new IOException("fail delete")); - files.when(() -> Files.exists(preCreatedTemp)).thenReturn(true); // für den Assert + files.when(() -> Files.exists(preCreatedTemp)).thenReturn(true); // for the assert // ProcessExecutor ProcessExecutor mockExec = Mockito.mock(ProcessExecutor.class); @@ -245,7 +265,7 @@ public class ConvertWebsiteToPdfTest { wr.when(() -> WebResponseUtils.pdfDocToWebResponse(any(PDDocument.class), anyString())) .thenReturn(fakeResponse); - // Act: darf keine Exception werfen und soll eine Response liefern + // Act: should not throw and should return a Response ResponseEntity resp = assertDoesNotThrow(() -> sut.urlToPdf(request)); // Assert @@ -253,12 +273,56 @@ public class ConvertWebsiteToPdfTest { assertEquals(HttpStatus.OK, resp.getStatusCode()); assertTrue( java.nio.file.Files.exists(preCreatedTemp), - "Temp-Datei sollte trotz Lösch-IOException noch existieren"); + "Temp file should still exist despite delete IOException"); } finally { try { java.nio.file.Files.deleteIfExists(preCreatedTemp); + java.nio.file.Files.deleteIfExists(htmlTemp); } catch (IOException ignore) { } } } + + @Test + void redirect_with_error_when_disallowed_content_detected() throws Exception { + UrlToPdfRequest request = new UrlToPdfRequest(); + request.setUrlInput("https://example.com"); + + try (MockedStatic gu = Mockito.mockStatic(GeneralUtils.class); + MockedStatic httpClient = + mockHttpClientReturning( + ""); ) { + + gu.when(() -> GeneralUtils.isValidURL("https://example.com")).thenReturn(true); + gu.when(() -> GeneralUtils.isURLReachable("https://example.com")).thenReturn(true); + + ResponseEntity resp = sut.urlToPdf(request); + + assertEquals(HttpStatus.SEE_OTHER, resp.getStatusCode()); + URI location = resp.getHeaders().getLocation(); + assertNotNull(location, "Location header expected"); + assertTrue( + location.getQuery() != null + && location.getQuery().contains("error=error.disallowedUrlContent")); + } + } + + private MockedStatic mockHttpClientReturning(String body) throws Exception { + MockedStatic httpClientStatic = Mockito.mockStatic(HttpClient.class); + HttpClient.Builder builder = Mockito.mock(HttpClient.Builder.class); + HttpClient client = Mockito.mock(HttpClient.class); + HttpResponse response = Mockito.mock(HttpResponse.class); + + httpClientStatic.when(HttpClient::newBuilder).thenReturn(builder); + when(builder.followRedirects(HttpClient.Redirect.NORMAL)).thenReturn(builder); + when(builder.connectTimeout(any(Duration.class))).thenReturn(builder); + when(builder.build()).thenReturn(client); + + when(client.send(any(HttpRequest.class), any(HttpResponse.BodyHandler.class))) + .thenReturn(response); + when(response.statusCode()).thenReturn(200); + when(response.body()).thenReturn(body); + + return httpClientStatic; + } }