mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-02-21 00:17:05 +01:00
pipeline bug, doc bugs, auto split new URL and doc (#2906)
# Description of Changes Please provide a summary of the changes, including: - What was changed Pipeline bug where files would be processed even when incorrect format some API docs had spaces causing format issues Auto split doc now links to [stirlingpdf.com](http://stirlingpdf.com/) not github + updated old logo removed old docs not used - Why the change was made - Any challenges encountered Closes #(issue_number) --- ## Checklist ### General - [ ] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md) - [ ] I have read the [Stirling-PDF Developer Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md) (if applicable) - [ ] I have read the [How to add new languages to Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md) (if applicable) - [ ] I have performed a self-review of my own code - [ ] My changes generate no new warnings ### Documentation - [ ] I have updated relevant docs on [Stirling-PDF's doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) (if functionality has heavily changed) - [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only) ### UI Changes (if applicable) - [ ] Screenshots or videos demonstrating the UI changes are attached (e.g., as comments or direct attachments in the PR) ### Testing (if applicable) - [ ] I have tested my changes locally. Refer to the [Testing Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md#6-testing) for more details. --------- Co-authored-by: a <a>
This commit is contained in:
parent
242aa5eae1
commit
0233086487
@ -26,7 +26,7 @@ ext {
|
||||
}
|
||||
|
||||
group = "stirling.software"
|
||||
version = "0.40.2"
|
||||
version = "0.41.0"
|
||||
|
||||
java {
|
||||
// 17 is lowest but we support and recommend 21
|
||||
|
Binary file not shown.
Before Width: | Height: | Size: 50 KiB |
File diff suppressed because one or more lines are too long
Before Width: | Height: | Size: 9.4 KiB |
@ -8,7 +8,9 @@ import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipOutputStream;
|
||||
|
||||
@ -41,8 +43,12 @@ import stirling.software.SPDF.utils.WebResponseUtils;
|
||||
@Tag(name = "Misc", description = "Miscellaneous APIs")
|
||||
public class AutoSplitPdfController {
|
||||
|
||||
private static final String QR_CONTENT = "https://github.com/Stirling-Tools/Stirling-PDF";
|
||||
private static final String QR_CONTENT_OLD = "https://github.com/Frooodle/Stirling-PDF";
|
||||
private static final Set<String> VALID_QR_CONTENTS =
|
||||
new HashSet<>(
|
||||
Set.of(
|
||||
"https://github.com/Stirling-Tools/Stirling-PDF",
|
||||
"https://github.com/Frooodle/Stirling-PDF",
|
||||
"https://stirlingpdf.com"));
|
||||
|
||||
private final CustomPDDocumentFactory pdfDocumentFactory;
|
||||
|
||||
@ -120,13 +126,14 @@ public class AutoSplitPdfController {
|
||||
for (int page = 0; page < document.getNumberOfPages(); ++page) {
|
||||
BufferedImage bim = pdfRenderer.renderImageWithDPI(page, 150);
|
||||
String result = decodeQRCode(bim);
|
||||
if ((QR_CONTENT.equals(result) || QR_CONTENT_OLD.equals(result)) && page != 0) {
|
||||
|
||||
boolean isValidQrCode = VALID_QR_CONTENTS.contains(result);
|
||||
log.debug("detected qr code {}, code is vale={}", result, isValidQrCode);
|
||||
if (isValidQrCode && page != 0) {
|
||||
splitDocuments.add(new PDDocument());
|
||||
}
|
||||
|
||||
if (!splitDocuments.isEmpty()
|
||||
&& !QR_CONTENT.equals(result)
|
||||
&& !QR_CONTENT_OLD.equals(result)) {
|
||||
if (!splitDocuments.isEmpty() && !isValidQrCode) {
|
||||
splitDocuments.get(splitDocuments.size() - 1).addPage(document.getPage(page));
|
||||
} else if (page == 0) {
|
||||
PDDocument firstDocument = new PDDocument();
|
||||
@ -135,7 +142,7 @@ public class AutoSplitPdfController {
|
||||
}
|
||||
|
||||
// If duplexMode is true and current page is a divider, then skip next page
|
||||
if (duplexMode && (QR_CONTENT.equals(result) || QR_CONTENT_OLD.equals(result))) {
|
||||
if (duplexMode && isValidQrCode) {
|
||||
page++;
|
||||
}
|
||||
}
|
||||
@ -168,6 +175,9 @@ public class AutoSplitPdfController {
|
||||
|
||||
return WebResponseUtils.bytesToWebResponse(
|
||||
data, filename + ".zip", MediaType.APPLICATION_OCTET_STREAM);
|
||||
} catch (Exception e) {
|
||||
log.error("Error in auto split", e);
|
||||
throw e;
|
||||
} finally {
|
||||
// Clean up resources
|
||||
if (document != null) {
|
||||
|
@ -52,7 +52,7 @@ public class ExtractImagesController {
|
||||
@Operation(
|
||||
summary = "Extract images from a PDF file",
|
||||
description =
|
||||
"This endpoint extracts images from a given PDF file and returns them in a zip file. Users can specify the output image format. Input: PDF Output: IMAGE/ZIP Type: SIMO")
|
||||
"This endpoint extracts images from a given PDF file and returns them in a zip file. Users can specify the output image format. Input:PDF Output:IMAGE/ZIP Type:SIMO")
|
||||
public ResponseEntity<byte[]> extractImages(@ModelAttribute PDFExtractImagesRequest request)
|
||||
throws IOException, InterruptedException, ExecutionException {
|
||||
MultipartFile file = request.getFileInput();
|
||||
|
@ -46,7 +46,7 @@ public class FlattenController {
|
||||
@Operation(
|
||||
summary = "Flatten PDF form fields or full page",
|
||||
description =
|
||||
"Flattening just PDF form fields or converting each page to images to make text unselectable. Input: PDF, Output: PDF. Type: SISO")
|
||||
"Flattening just PDF form fields or converting each page to images to make text unselectable. Input:PDF, Output:PDF. Type:SISO")
|
||||
public ResponseEntity<byte[]> flatten(@ModelAttribute FlattenRequest request) throws Exception {
|
||||
MultipartFile file = request.getFileInput();
|
||||
|
||||
|
@ -8,7 +8,7 @@ import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipOutputStream;
|
||||
import io.swagger.v3.oas.annotations.Operation;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
|
||||
import org.apache.pdfbox.multipdf.PDFMergerUtility;
|
||||
@ -26,6 +26,7 @@ import org.springframework.web.multipart.MultipartFile;
|
||||
|
||||
import io.github.pixee.security.BoundedLineReader;
|
||||
import io.github.pixee.security.Filenames;
|
||||
import io.swagger.v3.oas.annotations.Operation;
|
||||
import io.swagger.v3.oas.annotations.tags.Tag;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
@ -65,9 +66,10 @@ public class OCRController {
|
||||
}
|
||||
|
||||
@PostMapping(consumes = "multipart/form-data", value = "/ocr-pdf")
|
||||
@Operation(
|
||||
summary = "Process PDF files with OCR using Tesseract",
|
||||
description = "Takes a PDF file as input, performs OCR using specified languages and OCR type (skip-text/force-ocr), and returns the processed PDF. Input:PDF Output:PDF Type:SISO")
|
||||
@Operation(
|
||||
summary = "Process PDF files with OCR using Tesseract",
|
||||
description =
|
||||
"Takes a PDF file as input, performs OCR using specified languages and OCR type (skip-text/force-ocr), and returns the processed PDF. Input:PDF Output:PDF Type:SISO")
|
||||
public ResponseEntity<byte[]> processPdfWithOCR(
|
||||
@ModelAttribute ProcessPdfWithOcrRequest request)
|
||||
throws IOException, InterruptedException {
|
||||
|
@ -25,6 +25,7 @@ import io.swagger.v3.oas.annotations.tags.Tag;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import stirling.software.SPDF.model.PipelineConfig;
|
||||
import stirling.software.SPDF.model.PipelineResult;
|
||||
import stirling.software.SPDF.model.api.HandleDataRequest;
|
||||
import stirling.software.SPDF.utils.WebResponseUtils;
|
||||
|
||||
@ -58,7 +59,8 @@ public class PipelineController {
|
||||
if (inputFiles == null || inputFiles.size() == 0) {
|
||||
return null;
|
||||
}
|
||||
List<Resource> outputFiles = processor.runPipelineAgainstFiles(inputFiles, config);
|
||||
PipelineResult result = processor.runPipelineAgainstFiles(inputFiles, config);
|
||||
List<Resource> outputFiles = result.getOutputFiles();
|
||||
if (outputFiles != null && outputFiles.size() == 1) {
|
||||
// If there is only one file, return it directly
|
||||
Resource singleFile = outputFiles.get(0);
|
||||
|
@ -27,6 +27,7 @@ import lombok.extern.slf4j.Slf4j;
|
||||
import stirling.software.SPDF.config.InstallationPathConfig;
|
||||
import stirling.software.SPDF.model.PipelineConfig;
|
||||
import stirling.software.SPDF.model.PipelineOperation;
|
||||
import stirling.software.SPDF.model.PipelineResult;
|
||||
import stirling.software.SPDF.utils.FileMonitor;
|
||||
|
||||
@Service
|
||||
@ -143,19 +144,64 @@ public class PipelineDirectoryProcessor {
|
||||
|
||||
private File[] collectFilesForProcessing(Path dir, Path jsonFile, PipelineOperation operation)
|
||||
throws IOException {
|
||||
|
||||
List<String> inputExtensions =
|
||||
apiDocService.getExtensionTypes(false, operation.getOperation());
|
||||
log.info(
|
||||
"Allowed extensions for operation {}: {}",
|
||||
operation.getOperation(),
|
||||
inputExtensions);
|
||||
|
||||
boolean allowAllFiles = inputExtensions.contains("ALL");
|
||||
|
||||
try (Stream<Path> paths = Files.list(dir)) {
|
||||
if ("automated".equals(operation.getParameters().get("fileInput"))) {
|
||||
return paths.filter(
|
||||
path ->
|
||||
!Files.isDirectory(path)
|
||||
&& !path.equals(jsonFile)
|
||||
&& fileMonitor.isFileReadyForProcessing(path))
|
||||
.map(Path::toFile)
|
||||
.toArray(File[]::new);
|
||||
} else {
|
||||
String fileInput = (String) operation.getParameters().get("fileInput");
|
||||
return new File[] {new File(fileInput)};
|
||||
}
|
||||
File[] files =
|
||||
paths.filter(
|
||||
path -> {
|
||||
if (Files.isDirectory(path)) {
|
||||
return false;
|
||||
}
|
||||
if (path.equals(jsonFile)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Get file extension
|
||||
String filename = path.getFileName().toString();
|
||||
String extension =
|
||||
filename.contains(".")
|
||||
? filename.substring(
|
||||
filename.lastIndexOf(".")
|
||||
+ 1)
|
||||
.toLowerCase()
|
||||
: "";
|
||||
|
||||
// Check against allowed extensions
|
||||
boolean isAllowed =
|
||||
allowAllFiles
|
||||
|| inputExtensions.contains(extension);
|
||||
if (!isAllowed) {
|
||||
log.info(
|
||||
"Skipping file with unsupported extension: {} ({})",
|
||||
filename,
|
||||
extension);
|
||||
}
|
||||
return isAllowed;
|
||||
})
|
||||
.filter(
|
||||
path -> {
|
||||
boolean isReady =
|
||||
fileMonitor.isFileReadyForProcessing(path);
|
||||
if (!isReady) {
|
||||
log.info(
|
||||
"File not ready for processing (locked/created last 5s): {}",
|
||||
path);
|
||||
}
|
||||
return isReady;
|
||||
})
|
||||
.map(Path::toFile)
|
||||
.toArray(File[]::new);
|
||||
log.info("Collected {} files for processing", files.length);
|
||||
return files;
|
||||
}
|
||||
}
|
||||
|
||||
@ -198,19 +244,37 @@ public class PipelineDirectoryProcessor {
|
||||
try {
|
||||
List<Resource> inputFiles =
|
||||
processor.generateInputFiles(filesToProcess.toArray(new File[0]));
|
||||
if (inputFiles == null || inputFiles.size() == 0) {
|
||||
if (inputFiles == null || inputFiles.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
List<Resource> outputFiles = processor.runPipelineAgainstFiles(inputFiles, config);
|
||||
if (outputFiles == null) return;
|
||||
moveAndRenameFiles(outputFiles, config, dir);
|
||||
deleteOriginalFiles(filesToProcess, processingDir);
|
||||
PipelineResult result = processor.runPipelineAgainstFiles(inputFiles, config);
|
||||
|
||||
if (result.isHasErrors()) {
|
||||
log.error("Errors occurred during processing, retaining original files");
|
||||
moveToErrorDirectory(filesToProcess, dir);
|
||||
} else {
|
||||
moveAndRenameFiles(result.getOutputFiles(), config, dir);
|
||||
deleteOriginalFiles(filesToProcess, processingDir);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.error("error during processing", e);
|
||||
log.error("Error during processing", e);
|
||||
moveFilesBack(filesToProcess, processingDir);
|
||||
}
|
||||
}
|
||||
|
||||
private void moveToErrorDirectory(List<File> files, Path originalDir) throws IOException {
|
||||
Path errorDir = originalDir.resolve("error");
|
||||
if (!Files.exists(errorDir)) {
|
||||
Files.createDirectories(errorDir);
|
||||
}
|
||||
|
||||
for (File file : files) {
|
||||
Path target = errorDir.resolve(file.getName());
|
||||
Files.move(file.toPath(), target);
|
||||
log.info("Moved failed file to error directory for investigation: {}", target);
|
||||
}
|
||||
}
|
||||
|
||||
private void moveAndRenameFiles(List<Resource> resources, PipelineConfig config, Path dir)
|
||||
throws IOException {
|
||||
for (Resource resource : resources) {
|
||||
|
@ -33,6 +33,7 @@ import lombok.extern.slf4j.Slf4j;
|
||||
import stirling.software.SPDF.SPDFApplication;
|
||||
import stirling.software.SPDF.model.PipelineConfig;
|
||||
import stirling.software.SPDF.model.PipelineOperation;
|
||||
import stirling.software.SPDF.model.PipelineResult;
|
||||
import stirling.software.SPDF.model.Role;
|
||||
|
||||
@Service
|
||||
@ -84,8 +85,10 @@ public class PipelineProcessor {
|
||||
return "http://localhost:" + port + contextPath + "/";
|
||||
}
|
||||
|
||||
List<Resource> runPipelineAgainstFiles(List<Resource> outputFiles, PipelineConfig config)
|
||||
PipelineResult runPipelineAgainstFiles(List<Resource> outputFiles, PipelineConfig config)
|
||||
throws Exception {
|
||||
PipelineResult result = new PipelineResult();
|
||||
|
||||
ByteArrayOutputStream logStream = new ByteArrayOutputStream();
|
||||
PrintStream logPrintStream = new PrintStream(logStream);
|
||||
boolean hasErrors = false;
|
||||
@ -130,7 +133,8 @@ public class PipelineProcessor {
|
||||
if (operation.startsWith("filter-")
|
||||
&& (response.getBody() == null
|
||||
|| response.getBody().length == 0)) {
|
||||
log.info("Skipping file due to failing {}", operation);
|
||||
result.setFiltersApplied(true);
|
||||
log.info("Skipping file due to filtering {}", operation);
|
||||
continue;
|
||||
}
|
||||
if (!response.getStatusCode().equals(HttpStatus.OK)) {
|
||||
@ -208,7 +212,10 @@ public class PipelineProcessor {
|
||||
if (hasErrors) {
|
||||
log.error("Errors occurred during processing. Log: {}", logStream.toString());
|
||||
}
|
||||
return outputFiles;
|
||||
result.setHasErrors(hasErrors);
|
||||
result.setFiltersApplied(hasErrors);
|
||||
result.setOutputFiles(outputFiles);
|
||||
return result;
|
||||
}
|
||||
|
||||
private ResponseEntity<byte[]> sendWebRequest(String url, MultiValueMap<String, Object> body) {
|
||||
|
@ -40,8 +40,7 @@ public class RemoveCertSignController {
|
||||
@Operation(
|
||||
summary = "Remove digital signature from PDF",
|
||||
description =
|
||||
"This endpoint accepts a PDF file and returns the PDF file without the digital signature."
|
||||
+ " Input: PDF, Output: PDF")
|
||||
"This endpoint accepts a PDF file and returns the PDF file without the digital signature. Input:PDF, Output:PDF Type:SISO")
|
||||
public ResponseEntity<byte[]> removeCertSignPDF(@ModelAttribute PDFFile request)
|
||||
throws Exception {
|
||||
MultipartFile pdf = request.getFileInput();
|
||||
|
@ -0,0 +1,14 @@
|
||||
package stirling.software.SPDF.model;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.core.io.Resource;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class PipelineResult {
|
||||
private List<Resource> outputFiles;
|
||||
private boolean hasErrors;
|
||||
private boolean filtersApplied;
|
||||
}
|
@ -768,7 +768,6 @@ autoSplitPDF.selectText.3=Upload the single large scanned PDF file and let Stirl
|
||||
autoSplitPDF.selectText.4=Divider pages are automatically detected and removed, guaranteeing a neat final document.
|
||||
autoSplitPDF.formPrompt=Submit PDF containing Stirling-PDF Page dividers:
|
||||
autoSplitPDF.duplexMode=Duplex Mode (Front and back scanning)
|
||||
autoSplitPDF.dividerDownload1=Download 'Auto Splitter Divider (minimal).pdf'
|
||||
autoSplitPDF.dividerDownload2=Download 'Auto Splitter Divider (with instructions).pdf'
|
||||
autoSplitPDF.submit=Submit
|
||||
|
||||
|
Binary file not shown.
Binary file not shown.
@ -45,8 +45,6 @@
|
||||
<li th:text="#{autoSplitPDF.selectText.3}"></li>
|
||||
<li th:text="#{autoSplitPDF.selectText.4}"></li>
|
||||
</ul>
|
||||
<p><a th:href="@{'/files/Auto%20Splitter%20Divider%20(minimal).pdf'}" download
|
||||
th:text="#{autoSplitPDF.dividerDownload1}"></a></p>
|
||||
<p><a th:href="@{'/files/Auto%20Splitter%20Divider%20(with%20instructions).pdf'}" download
|
||||
th:text="#{autoSplitPDF.dividerDownload2}"></a></p>
|
||||
</div>
|
||||
|
Loading…
Reference in New Issue
Block a user