From 085b8795d58504656a77773b76c9a59f28a71a5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20Sz=C3=BCcs?= <127139797+balazs-szucs@users.noreply.github.com> Date: Sat, 11 Oct 2025 19:35:24 +0200 Subject: [PATCH] feat(crop): Crop remove outside text (#4499) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Description of Changes This PR adds an option to remove text outside the crop area via Ghostscript. ### Crop feature enhancements - Added a checkbox to the `crop.html` template and a corresponding label in the English properties file to allow users to select "Remove text outside crop (retains images)" when cropping PDFs. - Updated the `CropPdfForm` model to include a new boolean property `removeDataOutsideCrop` to capture the user's selection. image ### Backend logic changes - Modified the `CropController` so that if `removeDataOutsideCrop` is true, cropping is performed using a two-step process: first setting the crop box with PDFBox, then using Ghostscript to remove data outside the crop box. Otherwise, the crop is performed using only PDFBox. - Added necessary imports for handling files, paths, and process execution to support the new Ghostscript-based cropping workflow. ### Endpoint configuration - Registered the new "crop" endpoint under the "Ghostscript" group in the endpoint configuration, enabling routing for the enhanced cropping feature. 
### UI image ### Sample files/Verification Before: image After: image See for yourself with: [true-pdf-sample-1_cropped.pdf](https://github.com/user-attachments/files/22546716/true-pdf-sample-1_cropped.pdf) other sample PDF: [output.pdf](https://github.com/user-attachments/files/22546785/output.pdf) Closes #2652 --- ## Checklist ### General - [x] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md) - [x] I have read the [Stirling-PDF Developer Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md) (if applicable) - [ ] I have read the [How to add new languages to Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md) (if applicable) - [x] I have performed a self-review of my own code - [x] My changes generate no new warnings ### Documentation - [ ] I have updated relevant docs on [Stirling-PDF's doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) (if functionality has heavily changed) - [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only) ### UI Changes (if applicable) - [x] Screenshots or videos demonstrating the UI changes are attached (e.g., as comments or direct attachments in the PR) ### Testing (if applicable) - [x] I have tested my changes locally. Refer to the [Testing Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md#6-testing) for more details. 
--------- Signed-off-by: Balázs Szücs --- .../SPDF/config/EndpointConfiguration.java | 1 + .../SPDF/controller/api/CropController.java | 70 +++++++++++++++++++ .../SPDF/model/api/general/CropPdfForm.java | 5 ++ .../src/main/resources/templates/crop.html | 1 + testing/webpage_urls.txt | 1 - 5 files changed, 77 insertions(+), 1 deletion(-) diff --git a/app/core/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java b/app/core/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java index 2b074640d..02eb82163 100644 --- a/app/core/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java +++ b/app/core/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java @@ -401,6 +401,7 @@ public class EndpointConfiguration { /* Ghostscript */ addEndpointToGroup("Ghostscript", "repair"); addEndpointToGroup("Ghostscript", "compress-pdf"); + addEndpointToGroup("Ghostscript", "crop"); addEndpointToGroup("Ghostscript", "replace-invert-pdf"); /* tesseract */ diff --git a/app/core/src/main/java/stirling/software/SPDF/controller/api/CropController.java b/app/core/src/main/java/stirling/software/SPDF/controller/api/CropController.java index 2fbbadf5e..8ca9604ce 100644 --- a/app/core/src/main/java/stirling/software/SPDF/controller/api/CropController.java +++ b/app/core/src/main/java/stirling/software/SPDF/controller/api/CropController.java @@ -2,6 +2,9 @@ package stirling.software.SPDF.controller.api; import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; import org.apache.pdfbox.multipdf.LayerUtility; import org.apache.pdfbox.pdmodel.PDDocument; @@ -21,16 +24,19 @@ import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.tags.Tag; import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; import stirling.software.SPDF.model.api.general.CropPdfForm; import 
stirling.software.common.service.CustomPDFDocumentFactory; import stirling.software.common.util.GeneralUtils; +import stirling.software.common.util.ProcessExecutor; import stirling.software.common.util.WebResponseUtils; @RestController @RequestMapping("/api/v1/general") @Tag(name = "General", description = "General APIs") @RequiredArgsConstructor +@Slf4j public class CropController { private final CustomPDFDocumentFactory pdfDocumentFactory; @@ -42,6 +48,15 @@ public class CropController { "This operation takes an input PDF file and crops it according to the given" + " coordinates. Input:PDF Output:PDF Type:SISO") public ResponseEntity cropPdf(@ModelAttribute CropPdfForm request) throws IOException { + if (request.isRemoveDataOutsideCrop()) { + return cropWithGhostscript(request); + } else { + return cropWithPDFBox(request); + } + } + + private ResponseEntity cropWithPDFBox(@ModelAttribute CropPdfForm request) + throws IOException { PDDocument sourceDocument = pdfDocumentFactory.load(request); PDDocument newDocument = @@ -97,4 +112,59 @@ public class CropController { GeneralUtils.generateFilename( request.getFileInput().getOriginalFilename(), "_cropped.pdf")); } + + private ResponseEntity cropWithGhostscript(@ModelAttribute CropPdfForm request) + throws IOException { + PDDocument sourceDocument = pdfDocumentFactory.load(request); + + for (int i = 0; i < sourceDocument.getNumberOfPages(); i++) { + PDPage page = sourceDocument.getPage(i); + PDRectangle cropBox = + new PDRectangle( + request.getX(), + request.getY(), + request.getWidth(), + request.getHeight()); + page.setCropBox(cropBox); + } + + Path tempInputFile = Files.createTempFile("crop_input", ".pdf"); + Path tempOutputFile = Files.createTempFile("crop_output", ".pdf"); + + try { + sourceDocument.save(tempInputFile.toFile()); + sourceDocument.close(); + + ProcessExecutor processExecutor = + ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT); + List command = + List.of( + "gs", + 
"-sDEVICE=pdfwrite", + "-dUseCropBox", + "-o", + tempOutputFile.toString(), + tempInputFile.toString()); + + processExecutor.runCommandWithOutputHandling(command); + + byte[] pdfContent = Files.readAllBytes(tempOutputFile); + + return WebResponseUtils.bytesToWebResponse( + pdfContent, + request.getFileInput().getOriginalFilename().replaceFirst("[.][^.]+$", "") + + "_cropped.pdf"); + + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new IOException("Ghostscript processing was interrupted", e); + } finally { + try { + Files.deleteIfExists(tempInputFile); + Files.deleteIfExists(tempOutputFile); + } catch (IOException e) { + log.debug("Failed to delete temporary files", e); + } + } + } } diff --git a/app/core/src/main/java/stirling/software/SPDF/model/api/general/CropPdfForm.java b/app/core/src/main/java/stirling/software/SPDF/model/api/general/CropPdfForm.java index 913f94a10..480169468 100644 --- a/app/core/src/main/java/stirling/software/SPDF/model/api/general/CropPdfForm.java +++ b/app/core/src/main/java/stirling/software/SPDF/model/api/general/CropPdfForm.java @@ -26,4 +26,9 @@ public class CropPdfForm extends PDFFile { @Schema(description = "The height of the crop area", type = "number") private float height; + + @Schema( + description = "Whether to remove text outside the crop area (keeps images)", + type = "boolean") + private boolean removeDataOutsideCrop = true; } diff --git a/app/core/src/main/resources/templates/crop.html b/app/core/src/main/resources/templates/crop.html index 0617bf9b6..e91c481c3 100644 --- a/app/core/src/main/resources/templates/crop.html +++ b/app/core/src/main/resources/templates/crop.html @@ -22,6 +22,7 @@ +
diff --git a/testing/webpage_urls.txt b/testing/webpage_urls.txt index c6c713dd0..6e7874eca 100644 --- a/testing/webpage_urls.txt +++ b/testing/webpage_urls.txt @@ -8,7 +8,6 @@ /pdf-organizer /multi-page-layout /scale-pages -/crop /extract-page /pdf-to-single-page /img-to-pdf