feat(crop): Crop remove outside text (#4499)

# Description of Changes

This PR adds an option to remove text outside the crop area via Ghostscript.
### Crop feature enhancements

- Added a checkbox to the `crop.html` template and a corresponding label
in the English properties file to allow users to select "Remove text
outside crop (retains images)" when cropping PDFs.
- Updated the `CropPdfForm` model to include a new boolean property
`removeDataOutsideCrop` to capture the user's selection.
<img width="1418" height="815" alt="image"
src="https://github.com/user-attachments/assets/47785372-7609-4637-ab3b-f05ab6d95957"
/>


### Backend logic changes

- Modified the `CropController` so that if `removeDataOutsideCrop` is
true, cropping is performed using a two-step process: first setting the
crop box with PDFBox, then using Ghostscript to remove data outside the
crop box. Otherwise, the crop is performed using only PDFBox.
- Added the necessary imports for handling files, paths, and process
execution to support the new Ghostscript-based cropping workflow.

### Endpoint configuration

- Registered the new "crop" endpoint under the "Ghostscript" group in
the endpoint configuration, enabling routing for the enhanced cropping
feature.

### UI
<img width="671" height="867" alt="image"
src="https://github.com/user-attachments/assets/ad01fcd4-343b-40e8-9345-135bdf746ca4"
/>

### Sample files/Verification
Before:
<img width="969" height="747" alt="image"
src="https://github.com/user-attachments/assets/d5a205f7-0aaf-4990-9b24-43e9ef9cf716"
/>

After:
<img width="1165" height="690" alt="image"
src="https://github.com/user-attachments/assets/7c73e35b-c52b-4be2-a892-72f0125f66b4"
/>

See for yourself with:

[true-pdf-sample-1_cropped.pdf](https://github.com/user-attachments/files/22546716/true-pdf-sample-1_cropped.pdf)
Another sample PDF:

[output.pdf](https://github.com/user-attachments/files/22546785/output.pdf)

Closes #2652

<!--
Please provide a summary of the changes, including:

- What was changed
- Why the change was made
- Any challenges encountered

Closes #(issue_number)
-->

---

## Checklist

### General

- [x] I have read the [Contribution
Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md)
- [x] I have read the [Stirling-PDF Developer
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md)
(if applicable)
- [ ] I have read the [How to add new languages to
Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md)
(if applicable)
- [x] I have performed a self-review of my own code
- [x] My changes generate no new warnings

### Documentation

- [ ] I have updated relevant docs on [Stirling-PDF's doc
repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/)
(if functionality has heavily changed)
- [ ] I have read the section [Add New Translation
Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md#add-new-translation-tags)
(for new translation tags only)

### UI Changes (if applicable)

- [x] Screenshots or videos demonstrating the UI changes are attached
(e.g., as comments or direct attachments in the PR)

### Testing (if applicable)

- [x] I have tested my changes locally. Refer to the [Testing
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md#6-testing)
for more details.

---------

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
Balázs Szücs 2025-10-11 19:35:24 +02:00 committed by GitHub
parent fda1d6bc73
commit 085b8795d5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 77 additions and 1 deletions

View File

@ -401,6 +401,7 @@ public class EndpointConfiguration {
/* Ghostscript */
addEndpointToGroup("Ghostscript", "repair");
addEndpointToGroup("Ghostscript", "compress-pdf");
addEndpointToGroup("Ghostscript", "crop");
addEndpointToGroup("Ghostscript", "replace-invert-pdf");
/* tesseract */

View File

@ -2,6 +2,9 @@ package stirling.software.SPDF.controller.api;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import org.apache.pdfbox.multipdf.LayerUtility;
import org.apache.pdfbox.pdmodel.PDDocument;
@ -21,16 +24,19 @@ import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.model.api.general.CropPdfForm;
import stirling.software.common.service.CustomPDFDocumentFactory;
import stirling.software.common.util.GeneralUtils;
import stirling.software.common.util.ProcessExecutor;
import stirling.software.common.util.WebResponseUtils;
@RestController
@RequestMapping("/api/v1/general")
@Tag(name = "General", description = "General APIs")
@RequiredArgsConstructor
@Slf4j
public class CropController {
private final CustomPDFDocumentFactory pdfDocumentFactory;
@ -42,6 +48,15 @@ public class CropController {
"This operation takes an input PDF file and crops it according to the given"
+ " coordinates. Input:PDF Output:PDF Type:SISO")
public ResponseEntity<byte[]> cropPdf(@ModelAttribute CropPdfForm request) throws IOException {
    // Dispatch on the request flag: a PDFBox-only crop unless the caller asked for data
    // outside the crop area to be removed, in which case the Ghostscript-backed crop runs.
    if (!request.isRemoveDataOutsideCrop()) {
        return cropWithPDFBox(request);
    }
    return cropWithGhostscript(request);
}
private ResponseEntity<byte[]> cropWithPDFBox(@ModelAttribute CropPdfForm request)
throws IOException {
PDDocument sourceDocument = pdfDocumentFactory.load(request);
PDDocument newDocument =
@ -97,4 +112,59 @@ public class CropController {
GeneralUtils.generateFilename(
request.getFileInput().getOriginalFilename(), "_cropped.pdf"));
}
/**
 * Crops the PDF in two steps: PDFBox sets the crop box on every page, then Ghostscript
 * rewrites the document with {@code -dUseCropBox}.
 *
 * <p>The source document is closed even when saving fails, and both temporary files are
 * removed independently of each other on every exit path.
 *
 * @param request crop form carrying the input file and the crop rectangle (x, y, width, height)
 * @return the Ghostscript output as a downloadable PDF whose name ends in {@code _cropped.pdf}
 * @throws IOException if loading, saving, running Ghostscript or reading its output fails, or
 *     if the thread is interrupted while Ghostscript is running
 */
private ResponseEntity<byte[]> cropWithGhostscript(@ModelAttribute CropPdfForm request)
        throws IOException {
    Path tempInputFile = null;
    Path tempOutputFile = null;
    try {
        // Created inside the try so that a failure on the second file still cleans up the first.
        tempInputFile = Files.createTempFile("crop_input", ".pdf");
        tempOutputFile = Files.createTempFile("crop_output", ".pdf");

        // try-with-resources closes the document even if setCropBox/save throws.
        try (PDDocument sourceDocument = pdfDocumentFactory.load(request)) {
            for (int i = 0; i < sourceDocument.getNumberOfPages(); i++) {
                PDPage page = sourceDocument.getPage(i);
                PDRectangle cropBox =
                        new PDRectangle(
                                request.getX(),
                                request.getY(),
                                request.getWidth(),
                                request.getHeight());
                page.setCropBox(cropBox);
            }
            sourceDocument.save(tempInputFile.toFile());
        }

        ProcessExecutor processExecutor =
                ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT);
        List<String> command =
                List.of(
                        "gs",
                        "-sDEVICE=pdfwrite",
                        "-dUseCropBox",
                        "-o",
                        tempOutputFile.toString(),
                        tempInputFile.toString());
        processExecutor.runCommandWithOutputHandling(command);

        byte[] pdfContent = Files.readAllBytes(tempOutputFile);
        // Same filename helper as the PDFBox crop path, so both paths name their output alike.
        return WebResponseUtils.bytesToWebResponse(
                pdfContent,
                GeneralUtils.generateFilename(
                        request.getFileInput().getOriginalFilename(), "_cropped.pdf"));
    } catch (InterruptedException e) {
        // Restore the interrupt flag before translating to the declared exception type.
        Thread.currentThread().interrupt();
        throw new IOException("Ghostscript processing was interrupted", e);
    } finally {
        deleteTempFileQuietly(tempInputFile);
        deleteTempFileQuietly(tempOutputFile);
    }
}

/** Best-effort removal of one temporary file; a failure is logged and never propagated. */
private void deleteTempFileQuietly(Path file) {
    if (file == null) {
        return;
    }
    try {
        Files.deleteIfExists(file);
    } catch (IOException e) {
        log.debug("Failed to delete temporary files", e);
    }
}
}

View File

@ -26,4 +26,9 @@ public class CropPdfForm extends PDFFile {
@Schema(description = "The height of the crop area", type = "number")
private float height;
// When true, CropController.cropPdf dispatches to its Ghostscript-based crop; when false it
// uses the PDFBox-only crop.
// NOTE(review): the field defaults to true, so a request that omits this parameter presumably
// takes the Ghostscript path — confirm this is intended for existing API clients and that an
// unchecked checkbox in the form actually binds to false.
@Schema(
description = "Whether to remove text outside the crop area (keeps images)",
type = "boolean")
private boolean removeDataOutsideCrop = true;
}

View File

@ -22,6 +22,7 @@
<input id="y" type="hidden" name="y">
<input id="width" type="hidden" name="width">
<input id="height" type="hidden" name="height">
<button type="submit" id="submitBtn" class="btn btn-primary" th:text="#{crop.submit}"></button>
</form>
<div id="canvasesContainer" style="position: relative; margin: 20px 0; width: auto;">

View File

@ -8,7 +8,6 @@
/pdf-organizer
/multi-page-layout
/scale-pages
/crop
/extract-page
/pdf-to-single-page
/img-to-pdf