Major changes, use libre

This commit is contained in:
Anthony Stirling 2023-03-19 14:45:07 +00:00
parent 946df5b545
commit cbfa70d851
19 changed files with 650 additions and 627 deletions

View File

@ -1,5 +1,54 @@
# Two-stage image: the first stage compiles the jbig2enc tool from source, the
# second stage is the runtime image that receives only the resulting binary
# plus LibreOffice, unoconv, OCRmyPDF and the application JAR.

# Build jbig2enc in a separate stage
FROM debian:bullseye-slim as jbig2enc_builder
RUN apt-get update && \
apt-get install -y --no-install-recommends \
git \
automake \
autoconf \
libtool \
libleptonica-dev \
pkg-config \
ca-certificates \
zlib1g-dev \
make \
g++
# Clone and build jbig2enc with autotools; `make install` places the binary
# that is copied out of this stage further below.
# NOTE(review): the clone is not pinned to a tag or commit, so the build uses
# whatever the default branch contains at build time.
RUN git clone https://github.com/agl/jbig2enc && \
cd jbig2enc && \
./autogen.sh && \
./configure && \
make && \
make install
# Main stage
FROM openjdk:17-jdk-slim
# Install necessary dependencies
# (LibreOffice components + unoconv for office conversion, OCRmyPDF for OCR;
# ocrmypdf is installed via apt and then upgraded via pip.)
# NOTE(review): `pip install --user` installs into the build user's home
# directory — confirm the upgraded ocrmypdf is the one found on PATH at runtime.
RUN apt-get update && \
apt-get install -y --no-install-recommends \
libreoffice-core \
libreoffice-common \
libreoffice-writer \
libreoffice-calc \
libreoffice-impress \
python3-uno \
python3-pip \
unoconv \
ocrmypdf && \
pip install --user --upgrade ocrmypdf
# Copy the jbig2enc binary from the builder stage
COPY --from=jbig2enc_builder /usr/local/bin/jbig2 /usr/local/bin/jbig2
# Copy the application JAR file
COPY build/libs/*.jar app.jar
# Expose the application port
EXPOSE 8080
# Set environment variables
ENV LOG_LEVEL=INFO
# NOTE(review): the same ENTRYPOINT instruction appears twice in this span;
# when a Dockerfile has several ENTRYPOINTs only the last one takes effect.
# NOTE(review): this is an exec-form ENTRYPOINT (no shell), so ${LOG_LEVEL} is
# presumably passed through literally, and because the -D option follows
# "-jar /app.jar" it is handed to the application as an argument rather than
# to the JVM — confirm whether LOG_LEVEL is meant to have any effect.
ENTRYPOINT ["java","-jar","/app.jar","-Dlogging.level=${LOG_LEVEL}"]
# Run the application
ENTRYPOINT ["java","-jar","/app.jar","-Dlogging.level=${LOG_LEVEL}"]

49
HowToUseOCR.md Normal file
View File

@ -0,0 +1,49 @@
# OCR Language Packs and Setup
This document provides instructions on how to add additional language packs for the OCR tab in Stirling-PDF, both inside and outside of Docker.
## How the OCR works
Stirling-PDF uses OCRmyPDF, which in turn uses Tesseract for its text recognition.
All credit goes to them for this awesome work!
## Language Packs
Tesseract OCR supports a variety of languages. You can find additional language packs in the Tesseract GitHub repositories:
- [tessdata_fast](https://github.com/tesseract-ocr/tessdata_fast): These language packs are smaller and faster to load, but may provide lower recognition accuracy.
- [tessdata](https://github.com/tesseract-ocr/tessdata): These language packs are larger and provide better recognition accuracy, but may take longer to load.
Depending on your requirements, you can choose the appropriate language pack for your use case. By default, Stirling-PDF uses the `eng` language pack from tessdata_fast, but this can be replaced.
### Installing Language Packs
1. Download the desired language pack(s) by selecting the `.traineddata` file(s) for the language(s) you need.
2. Place the `.traineddata` files in the Tesseract tessdata directory: `/usr/share/tesseract-ocr/4.00/tessdata`
#### Docker
If you are using Docker, you need to expose the Tesseract tessdata directory as a volume in order to use the additional language packs.
#### Docker Compose
Modify your `docker-compose.yml` file to include the following volume configuration:
```yaml
services:
your_service_name:
image: your_docker_image_name
volumes:
- /location/of/trainingData:/usr/share/tesseract-ocr/4.00/tessdata
```
#### Docker run
Add the following to your existing `docker run` command:
```bash
-v /location/of/trainingData:/usr/share/tesseract-ocr/4.00/tessdata
```
#### Non-Docker
If you are not using Docker, you need to install the OCR components, including the ocrmypdf app.
See the [OCRmyPDF installation guide](https://ocrmypdf.readthedocs.io/en/latest/installation.html) for instructions.

View File

@ -21,24 +21,12 @@ dependencies {
implementation 'org.springframework.boot:spring-boot-starter-thymeleaf'
testImplementation 'org.springframework.boot:spring-boot-starter-test'
implementation 'org.apache.xmlgraphics:batik-transcoder:1.14'
implementation 'org.apache.logging.log4j:log4j-core:2.20.0'
//general PDF
implementation 'org.apache.pdfbox:pdfbox:2.0.27'
implementation 'com.itextpdf:itextpdf:5.5.13.3'
//xml conversions and others
implementation 'org.apache.poi:poi:5.2.3'
implementation 'org.apache.poi:poi-scratchpad:5.2.3'
implementation 'org.apache.poi:poi-ooxml:5.2.3'
implementation 'com.itextpdf.tool:xmlworker:5.5.13.3'
//docx conversions
implementation('org.docx4j:docx4j:6.1.2') {
exclude group: 'org.slf4j', module: 'slf4j-reload4j'
}
implementation 'org.docx4j:docx4j-export-fo:11.2.9'
}

View File

@ -0,0 +1,94 @@
package stirling.software.SPDF;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
/**
 * Singleton that launches and supervises a background {@code unoconv --listener}
 * process.
 *
 * <p>{@link #start()} spawns the process (unless one is already alive), starts a
 * monitor task that destroys the process once no activity has been recorded for
 * {@code ACTIVITY_TIMEOUT} milliseconds, and then waits up to
 * {@code STARTUP_TIMEOUT} milliseconds for a TCP connection to
 * {@code localhost:LISTENER_PORT} to succeed. {@link #stop()} cancels the monitor
 * and destroys the process.
 *
 * <p>{@code start()} and {@code stop()} are synchronized on the singleton, so
 * concurrent callers cannot spawn two listener processes.
 */
public class LibreOfficeListener {

    private static final LibreOfficeListener INSTANCE = new LibreOfficeListener();

    private static final long ACTIVITY_TIMEOUT = 20 * 60 * 1000; // 20 minutes
    private static final long STARTUP_TIMEOUT = 30000; // 30 seconds
    private static final long MONITOR_INTERVAL = 5000; // inactivity check every 5 seconds
    private static final long STARTUP_POLL_INTERVAL = 1000; // port probe every 1 second
    private static final int LISTENER_PORT = 2002;
    private static final int CONNECT_TIMEOUT = 1000; // 1 second

    private ExecutorService executorService;
    private Process process;

    // Written by the caller of start() and read by the monitor thread.
    private volatile long lastActivityTime;

    private LibreOfficeListener() {}

    /**
     * @return the single shared instance
     */
    public static LibreOfficeListener getInstance() {
        return INSTANCE;
    }

    /**
     * Starts the listener process if it is not already running and waits for its
     * port to accept connections.
     *
     * <p>Returns normally both when the port became reachable and when the startup
     * timeout elapsed without a successful connection; it also returns early (with
     * the interrupt flag restored) if the calling thread is interrupted while
     * waiting.
     *
     * @throws IOException if the process cannot be launched
     */
    public synchronized void start() throws IOException {
        // Check if the listener is already running
        if (process != null && process.isAlive()) {
            return;
        }

        // A monitor left over from an earlier (now dead) process is no longer needed.
        if (executorService != null) {
            executorService.shutdownNow();
        }

        // Start the listener process (argument list instead of a command string).
        final Process listener = new ProcessBuilder("unoconv", "--listener").start();
        process = listener;
        lastActivityTime = System.currentTimeMillis();

        // Start a background thread to monitor the activity timeout
        executorService = Executors.newSingleThreadExecutor();
        executorService.submit(() -> {
            while (listener.isAlive()) {
                long idleTime = System.currentTimeMillis() - lastActivityTime;
                if (idleTime >= ACTIVITY_TIMEOUT) {
                    // No activity for too long: tear down the listener
                    listener.destroy();
                    break;
                }
                try {
                    Thread.sleep(MONITOR_INTERVAL);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    break;
                }
            }
        });
        // No further tasks will be submitted; this lets the worker thread end as
        // soon as the monitor task finishes instead of lingering forever.
        executorService.shutdown();

        // Wait for the listener to start up
        long deadline = System.currentTimeMillis() + STARTUP_TIMEOUT;
        while (System.currentTimeMillis() < deadline) {
            if (isListenerRunning()) {
                lastActivityTime = System.currentTimeMillis();
                return;
            }
            try {
                Thread.sleep(STARTUP_POLL_INTERVAL);
            } catch (InterruptedException e) {
                // Preserve the interrupt for the caller and stop waiting.
                Thread.currentThread().interrupt();
                return;
            }
        }
    }

    /**
     * Probes the listener port with a short-lived TCP connection.
     *
     * @return {@code true} if the connection succeeded, {@code false} on any
     *         {@link IOException}
     */
    private boolean isListenerRunning() {
        System.out.println("waiting for listener to start");
        // try-with-resources closes the socket on both success and failure
        try (Socket socket = new Socket()) {
            socket.connect(new InetSocketAddress("localhost", LISTENER_PORT), CONNECT_TIMEOUT);
            return true;
        } catch (IOException e) {
            return false;
        }
    }

    /**
     * Stops the inactivity monitor and destroys the listener process. Safe to call
     * when {@link #start()} has never been called or the listener has already
     * stopped.
     */
    public synchronized void stop() {
        // Stop the activity timeout monitor thread
        if (executorService != null) {
            executorService.shutdownNow();
            executorService = null;
        }
        // Stop the listener process
        if (process != null && process.isAlive()) {
            process.destroy();
        }
    }
}

View File

@ -31,8 +31,26 @@ import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.PdfStamper;
import stirling.software.SPDF.utils.PdfUtils;
import stirling.software.SPDF.utils.ProcessExecutor;
import org.springframework.http.HttpHeaders;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.multipart.MultipartFile;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
//import com.spire.pdf.*;
@Controller
public class CompressController {
@ -45,80 +63,55 @@ public class CompressController {
}
@PostMapping("/compress-pdf")
public ResponseEntity<byte[]> compressPDF(
@RequestParam("fileInput") MultipartFile pdfFile,
@RequestParam(value = "compressPDF", defaultValue = "false") boolean compressPDF,
@RequestParam(value = "compressImages", defaultValue = "false") boolean compressImages,
@RequestParam(value = "useLossyCompression", defaultValue = "false") boolean useLossyCompression,
@RequestParam(value = "resolutionPercentage", defaultValue = "50") int resolutionPercentage) {
public ResponseEntity<byte[]> optimizePdf(
@RequestParam("fileInput") MultipartFile inputFile,
@RequestParam("optimizeLevel") int optimizeLevel,
@RequestParam(name = "fastWebView", required = false) Boolean fastWebView,
@RequestParam(name = "jbig2Lossy", required = false) Boolean jbig2Lossy) throws IOException, InterruptedException {
ByteArrayOutputStream baosPDFBox = new ByteArrayOutputStream();
// Save the uploaded file to a temporary location
Path tempInputFile = Files.createTempFile("input_", ".pdf");
inputFile.transferTo(tempInputFile.toFile());
// Prepare the output file path
Path tempOutputFile = Files.createTempFile("output_", ".pdf");
// Prepare the OCRmyPDF command
List<String> command = new ArrayList<>();
command.add("ocrmypdf");
command.add("--optimize");
command.add(String.valueOf(optimizeLevel));
if (fastWebView != null && fastWebView) {
long fileSize = inputFile.getSize();
long fastWebViewSize = (long) (fileSize * 1.25); // 25% higher than file size
command.add("--fast-web-view");
command.add(String.valueOf(fastWebViewSize));
}
if (jbig2Lossy != null && jbig2Lossy) {
command.add("--jbig2-lossy");
}
command.add(tempInputFile.toString());
command.add(tempOutputFile.toString());
int returnCode = ProcessExecutor.runCommandWithOutputHandling(command);
try (InputStream is = pdfFile.getInputStream();
PDDocument document = PDDocument.load(is)) {
// Read the optimized PDF file
byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
if (compressImages) {
for (PDPage page : document.getPages()) {
PDResources resources = page.getResources();
for (COSName cosName : resources.getXObjectNames()) {
if (resources.isImageXObject(cosName)) {
PDImageXObject image = (PDImageXObject) resources.getXObject(cosName);
BufferedImage bufferedImage = image.getImage();
BufferedImage resizedImage = resizeImage(bufferedImage, resolutionPercentage);
// Clean up the temporary files
Files.delete(tempInputFile);
Files.delete(tempOutputFile);
// Return the optimized PDF as a response
String outputFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_Optimized.pdf";
HttpHeaders headers = new HttpHeaders();
headers.setContentType(MediaType.APPLICATION_PDF);
headers.setContentDispositionFormData("attachment", outputFilename);
return ResponseEntity.ok().headers(headers).body(pdfBytes);
}
if (useLossyCompression) {
File tempFile = File.createTempFile("pdfbox", ".jpg");
ImageIO.write(resizedImage, "jpg", tempFile);
PDImageXObject newImage = PDImageXObject.createFromFile(tempFile.getAbsolutePath(), document);
resources.put(cosName, newImage);
} else {
File tempFile = File.createTempFile("pdfbox", ".png");
ImageIO.write(resizedImage, "png", tempFile);
PDImageXObject newImage = PDImageXObject.createFromFile(tempFile.getAbsolutePath(), document);
resources.put(cosName, newImage);
}
}
}
}
}
document.save(baosPDFBox);
} catch (IOException e) {
e.printStackTrace();
return new ResponseEntity<>(HttpStatus.INTERNAL_SERVER_ERROR);
}
try (ByteArrayInputStream baisPDFBox = new ByteArrayInputStream(baosPDFBox.toByteArray());
ByteArrayOutputStream baosFinal = new ByteArrayOutputStream()) {
PdfReader reader = new PdfReader(baisPDFBox);
PdfStamper stamper = new PdfStamper(reader, baosFinal);
if (compressPDF) {
stamper.setFullCompression();
}
stamper.close();
reader.close();
return PdfUtils.boasToWebResponse(baosFinal, pdfFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_compressed.pdf");
} catch (IOException | DocumentException e) {
e.printStackTrace();
return new ResponseEntity<>(HttpStatus.INTERNAL_SERVER_ERROR);
}
}
private BufferedImage resizeImage(BufferedImage originalImage, int resolutionPercentage) {
int newWidth = originalImage.getWidth() * resolutionPercentage / 100;
int newHeight = originalImage.getHeight() * resolutionPercentage / 100;
BufferedImage resizedImage = new BufferedImage(newWidth, newHeight, originalImage.getType());
Graphics2D g = resizedImage.createGraphics();
g.drawImage(originalImage, 0, 0, newWidth, newHeight, null);
g.dispose();
return resizedImage;
}
}

View File

@ -0,0 +1,143 @@
package stirling.software.SPDF.controller;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.http.HttpHeaders;
import org.springframework.http.HttpStatus;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.multipart.MultipartFile;
import org.springframework.web.servlet.ModelAndView;
import stirling.software.SPDF.utils.ProcessExecutor;
import java.io.FileOutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
//import com.spire.pdf.*;
/**
 * Serves the OCR page and runs the external {@code ocrmypdf} command on uploaded
 * PDFs.
 */
@Controller
public class OCRController {

    private static final Logger logger = LoggerFactory.getLogger(OCRController.class);

    /** Directory scanned for installed Tesseract {@code .traineddata} files. */
    private static final String TESSDATA_DIR = "/usr/share/tesseract-ocr/4.00/tessdata";

    private static final String TRAINEDDATA_SUFFIX = ".traineddata";

    /**
     * Renders the OCR form, exposing the installed language packs as the
     * {@code languages} model attribute.
     */
    @GetMapping("/ocr-pdf")
    public ModelAndView ocrPdfPage() {
        ModelAndView modelAndView = new ModelAndView("ocr-pdf");
        modelAndView.addObject("languages", getAvailableTesseractLanguages());
        modelAndView.addObject("currentPage", "ocr-pdf");
        return modelAndView;
    }

    /**
     * Runs {@code ocrmypdf} on the uploaded file and returns the result.
     *
     * @param inputFile         uploaded PDF
     * @param selectedLanguages language codes, joined with {@code +} for the
     *                          {@code --language} option; must not be empty
     * @param sidecar           when {@code true}, a text sidecar is requested and
     *                          the response is a zip holding the PDF and the text
     * @return the OCR-processed PDF, or a zip of PDF + text when a sidecar was requested
     * @throws IOException          if no language was selected, a file operation
     *                              fails, or the command left the output file empty
     * @throws InterruptedException declared by the process runner
     */
    @PostMapping("/ocr-pdf")
    public ResponseEntity<byte[]> processPdfWithOCR(@RequestParam("fileInput") MultipartFile inputFile,
            @RequestParam("languages") List<String> selectedLanguages,
            @RequestParam(name = "sidecar", required = false) Boolean sidecar) throws IOException, InterruptedException {

        //--output-type pdfa
        if (selectedLanguages == null || selectedLanguages.isEmpty()) {
            throw new IOException("Please select at least one language.");
        }

        boolean withSidecar = Boolean.TRUE.equals(sidecar);
        String baseName = baseNameOf(inputFile.getOriginalFilename());
        String outputFilename = baseName + "_OCR.pdf";

        Path tempInputFile = null;
        Path tempOutputFile = null;
        Path sidecarFile = null;
        Path tempZipFile = null;
        try {
            // Save the uploaded file to a temporary location
            tempInputFile = Files.createTempFile("input_", ".pdf");
            inputFile.transferTo(tempInputFile.toFile());

            // Prepare the output file path
            tempOutputFile = Files.createTempFile("output_", ".pdf");

            // Build the OCR command: options first, then input and output paths
            List<String> command = new ArrayList<>(Arrays.asList(
                    "ocrmypdf", "--verbose", "2", "--language", String.join("+", selectedLanguages)));
            if (withSidecar) {
                // Sibling of the output file with only the trailing ".pdf" swapped for ".txt"
                String outputName = tempOutputFile.getFileName().toString();
                String sidecarName = outputName.substring(0, outputName.length() - ".pdf".length()) + ".txt";
                sidecarFile = tempOutputFile.resolveSibling(sidecarName);
                command.add("--sidecar");
                command.add(sidecarFile.toString());
            }
            command.add(tempInputFile.toString());
            command.add(tempOutputFile.toString());

            int returnCode = ProcessExecutor.runCommandWithOutputHandling(command);

            // Read the OCR processed PDF file. The output file is pre-created empty,
            // so an empty result means the command did not write anything.
            byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
            if (pdfBytes.length == 0) {
                throw new IOException("ocrmypdf produced no output (exit code " + returnCode + ")");
            }

            HttpHeaders headers = new HttpHeaders();
            if (!withSidecar) {
                // Return the OCR processed PDF as a response
                headers.setContentType(MediaType.APPLICATION_PDF);
                headers.setContentDispositionFormData("attachment", outputFilename);
                return ResponseEntity.ok().headers(headers).body(pdfBytes);
            }

            // Create a zip file containing both the PDF and the text file
            tempZipFile = Files.createTempFile("output_", ".zip");
            try (ZipOutputStream zipOut = new ZipOutputStream(new FileOutputStream(tempZipFile.toFile()))) {
                zipOut.putNextEntry(new ZipEntry(outputFilename));
                zipOut.write(pdfBytes);
                zipOut.closeEntry();

                // Name the entry after the upload, not after the server-side temp path
                zipOut.putNextEntry(new ZipEntry(baseName + "_OCR.txt"));
                Files.copy(sidecarFile, zipOut);
                zipOut.closeEntry();
            }
            byte[] zipBytes = Files.readAllBytes(tempZipFile);

            headers.setContentType(MediaType.APPLICATION_OCTET_STREAM);
            headers.setContentDispositionFormData("attachment", baseName + "_OCR.zip");
            return ResponseEntity.ok().headers(headers).body(zipBytes);
        } finally {
            // Clean up the temporary files on success and on failure alike
            deleteQuietly(tempInputFile);
            deleteQuietly(tempOutputFile);
            deleteQuietly(sidecarFile);
            deleteQuietly(tempZipFile);
        }
    }

    /**
     * Lists the language codes for which a {@code .traineddata} file exists in the
     * tessdata directory, excluding {@code osd}.
     *
     * @return the language codes, or an empty list if the directory cannot be listed
     */
    public List<String> getAvailableTesseractLanguages() {
        File[] files = new File(TESSDATA_DIR).listFiles();
        if (files == null) {
            return Collections.emptyList();
        }
        return Arrays.stream(files)
                .map(File::getName)
                .filter(name -> name.endsWith(TRAINEDDATA_SUFFIX))
                .map(name -> name.replace(TRAINEDDATA_SUFFIX, ""))
                .filter(lang -> !lang.equalsIgnoreCase("osd"))
                .collect(Collectors.toList());
    }

    /**
     * Derives a download base name from the client-supplied file name: drops any
     * directory part and the last extension. Falls back to {@code "document"} when
     * no usable name was supplied.
     */
    private static String baseNameOf(String originalFilename) {
        if (originalFilename == null) {
            return "document";
        }
        int lastSeparator = Math.max(originalFilename.lastIndexOf('/'), originalFilename.lastIndexOf('\\'));
        String stripped = originalFilename.substring(lastSeparator + 1).replaceFirst("[.][^.]+$", "");
        return stripped.isEmpty() ? "document" : stripped;
    }

    /** Best-effort delete that never masks the exception already in flight. */
    private static void deleteQuietly(Path path) {
        if (path == null) {
            return;
        }
        try {
            Files.deleteIfExists(path);
        } catch (IOException e) {
            logger.warn("Could not delete temporary file {}", path, e);
        }
    }
}

View File

@ -1,79 +0,0 @@
package stirling.software.SPDF.controller.converters;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import org.springframework.http.ResponseEntity;
import org.springframework.stereotype.Controller;
import org.springframework.ui.Model;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.multipart.MultipartFile;
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.Paragraph;
import com.itextpdf.text.pdf.PdfPCell;
import com.itextpdf.text.pdf.PdfPTable;
import com.itextpdf.text.pdf.PdfWriter;
import stirling.software.SPDF.utils.PdfUtils;
/**
 * Converts an uploaded CSV file into a PDF containing a single table.
 */
@Controller
public class ConvertCsvController {

    /**
     * Serves the conversion page. Sets {@code currentPage} to {@code xlsx-to-pdf}
     * and returns the {@code convert/xlsx-to-pdf} view.
     */
    @GetMapping("/csv-to-pdf")
    public String cinvertToPDF(Model model) {
        model.addAttribute("currentPage", "xlsx-to-pdf");
        return "convert/xlsx-to-pdf";
    }

    /**
     * Reads the upload line by line as UTF-8, splits each line on commas and adds
     * every value as a table cell. The column count is taken from the first line.
     *
     * @param csvFile uploaded CSV file
     * @return the generated PDF as a download response
     * @throws IOException       if the upload cannot be read
     * @throws DocumentException if the PDF cannot be built
     */
    @PostMapping("/csv-to-pdf")
    public ResponseEntity<byte[]> convertCsvToPdf(@RequestParam("fileInput") MultipartFile csvFile) throws IOException, DocumentException {

        // Create PDF document
        Document document = new Document();
        ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
        PdfWriter.getInstance(document, outputStream);
        document.open();

        // Read CSV file; the reader is closed even if table building fails
        try (BufferedReader bufferedReader = new BufferedReader(
                new InputStreamReader(csvFile.getInputStream(), StandardCharsets.UTF_8))) {

            // Create PDF table from CSV content
            PdfPTable table = null;
            String csvRow;
            while ((csvRow = bufferedReader.readLine()) != null) {
                // Limit -1 keeps trailing empty values so columns stay aligned.
                // Assuming comma as a delimiter.
                String[] csvRowCells = csvRow.split(",", -1);
                if (table == null) {
                    table = new PdfPTable(csvRowCells.length);
                }
                for (String cellValue : csvRowCells) {
                    table.addCell(new PdfPCell(new Paragraph(cellValue)));
                }
                // Pad short lines so every CSV line ends on a table-row boundary;
                // otherwise the next line's values would slide into this row and a
                // trailing partial row would be left out of the table.
                table.completeRow();
            }

            if (table != null) {
                document.add(table);
            }
        }

        document.close();

        return PdfUtils.boasToWebResponse(outputStream, csvFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_convertedToPDF.pdf");
    }
}

View File

@ -1,43 +0,0 @@
package stirling.software.SPDF.controller.converters;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.docx4j.Docx4J;
import org.docx4j.openpackaging.exceptions.Docx4JException;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.springframework.http.ResponseEntity;
import org.springframework.stereotype.Controller;
import org.springframework.ui.Model;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.multipart.MultipartFile;
import stirling.software.SPDF.utils.PdfUtils;
/**
 * DOCX-to-PDF endpoints backed by docx4j.
 */
@Controller
public class ConvertDocController {

    private static final String PAGE_NAME = "xlsx-to-pdf";
    private static final String VIEW_NAME = "convert/xlsx-to-pdf";
    private static final String EXTENSION_PATTERN = "[.][^.]+$";
    private static final String OUTPUT_SUFFIX = "_convertedToPDF.pdf";

    /**
     * Serves the conversion page after recording the current page in the model.
     */
    @GetMapping("/docx-to-pdf")
    public String cinvertToPDF(Model model) {
        model.addAttribute("currentPage", PAGE_NAME);
        return VIEW_NAME;
    }

    /**
     * Loads the uploaded document, renders it to PDF in memory and returns it as
     * a web response named after the upload.
     *
     * @param docxFile uploaded DOCX file
     * @throws IOException     if the upload cannot be read
     * @throws Docx4JException if loading or converting the document fails
     */
    @PostMapping("/docx-to-pdf")
    public ResponseEntity<byte[]> convertDocxToPdf(@RequestParam("fileInput") MultipartFile docxFile) throws IOException, Docx4JException {
        WordprocessingMLPackage loadedPackage = WordprocessingMLPackage.load(docxFile.getInputStream());

        ByteArrayOutputStream pdfBuffer = new ByteArrayOutputStream();
        Docx4J.toPDF(loadedPackage, pdfBuffer);

        String uploadName = docxFile.getOriginalFilename();
        String downloadName = uploadName.replaceFirst(EXTENSION_PATTERN, "") + OUTPUT_SUFFIX;
        return PdfUtils.boasToWebResponse(pdfBuffer, downloadName);
    }
}

View File

@ -1,54 +0,0 @@
package stirling.software.SPDF.controller.converters;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import org.springframework.http.ResponseEntity;
import org.springframework.stereotype.Controller;
import org.springframework.ui.Model;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.multipart.MultipartFile;
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.pdf.PdfWriter;
import com.itextpdf.tool.xml.XMLWorkerHelper;
import stirling.software.SPDF.utils.PdfUtils;
/**
 * Converts an uploaded (X)HTML file into a PDF using iText's XML worker.
 */
@Controller
public class ConvertHtmlController {

    /**
     * Serves the conversion page. Sets {@code currentPage} to {@code xlsx-to-pdf}
     * and returns the {@code convert/xlsx-to-pdf} view.
     */
    // The mapping previously read "//html-to-pdf"; it now matches the POST path.
    @GetMapping("/html-to-pdf")
    public String cinvertToPDF(Model model) {
        model.addAttribute("currentPage", "xlsx-to-pdf");
        return "convert/xlsx-to-pdf";
    }

    /**
     * Parses the uploaded markup into a new PDF document and returns it as a
     * download named {@code <upload name>_convertedToPDF.pdf}.
     *
     * @param htmlFile uploaded HTML file
     * @throws IOException       if the upload cannot be read or parsed
     * @throws DocumentException if the PDF cannot be built
     */
    @PostMapping("/html-to-pdf")
    public ResponseEntity<byte[]> convertHtmlToPdf(@RequestParam("fileInput") MultipartFile htmlFile) throws IOException, DocumentException {

        // Create PDF document
        Document document = new Document();
        ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
        PdfWriter writer = PdfWriter.getInstance(document, outputStream);
        document.open();

        // Stream the HTML straight from the upload (no intermediate byte[] copy)
        // and make sure the stream is closed whatever the parser does.
        try (InputStream htmlInputStream = htmlFile.getInputStream()) {
            // Convert HTML content to PDF
            XMLWorkerHelper.getInstance().parseXHtml(writer, document, htmlInputStream);
        }

        document.close();

        // The filename was previously passed as an empty string; build it the same
        // way the sibling converters do.
        String originalName = htmlFile.getOriginalFilename();
        String baseName = originalName == null ? "" : originalName.replaceFirst("[.][^.]+$", "");
        return PdfUtils.boasToWebResponse(outputStream, baseName + "_convertedToPDF.pdf");
    }
}

View File

@ -0,0 +1,82 @@
package stirling.software.SPDF.controller.converters;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.springframework.http.ResponseEntity;
import org.springframework.stereotype.Controller;
import org.springframework.ui.Model;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.multipart.MultipartFile;
import org.springframework.web.servlet.ModelAndView;
import stirling.software.SPDF.LibreOfficeListener;
import stirling.software.SPDF.utils.PdfUtils;
import stirling.software.SPDF.utils.ProcessExecutor;
/**
 * Converts uploaded office documents to PDF by invoking the external
 * {@code unoconv} command.
 */
@Controller
public class ConvertOfficeController {

    /**
     * Serves the conversion form.
     */
    @GetMapping("/file-to-pdf")
    public String convertToPdfForm(Model model) {
        model.addAttribute("currentPage", "file-to-pdf");
        return "convert/file-to-pdf";
    }

    /**
     * Converts the upload and returns the PDF as a download named
     * {@code <upload name>_convertedToPDF.pdf}.
     *
     * @param inputFile uploaded document
     * @throws IOException          if a file operation fails or no PDF was produced
     * @throws InterruptedException declared by the process runner
     */
    @PostMapping("/file-to-pdf")
    public ResponseEntity<byte[]> processPdfWithOCR(@RequestParam("fileInput") MultipartFile inputFile) throws IOException, InterruptedException {

        //unused but can start server instance if startup time is to long
        //LibreOfficeListener.getInstance().start();

        byte[] pdfByteArray = convertToPdf(inputFile);
        String originalName = inputFile.getOriginalFilename();
        String baseName = originalName == null ? "" : originalName.replaceFirst("[.][^.]+$", "");
        return PdfUtils.bytesToWebResponse(pdfByteArray, baseName + "_convertedToPDF.pdf");
    }

    /**
     * Writes the upload to a temporary file, runs {@code unoconv -f pdf} on it and
     * returns the bytes of the resulting PDF. Both temporary files are removed
     * whether or not the conversion succeeds.
     *
     * @param inputFile uploaded document
     * @return the converted PDF
     * @throws IOException          if a file operation fails or the output file is
     *                              still empty after the command has run
     * @throws InterruptedException declared by the process runner
     */
    public byte[] convertToPdf(MultipartFile inputFile) throws IOException, InterruptedException {
        Path tempInputFile = null;
        Path tempOutputFile = null;
        try {
            // Save the uploaded file to a temporary location
            tempInputFile = Files.createTempFile("input_", "." + getFileExtension(inputFile.getOriginalFilename()));
            inputFile.transferTo(tempInputFile.toFile());

            // Prepare the output file path
            tempOutputFile = Files.createTempFile("output_", ".pdf");

            // Run the LibreOffice command
            List<String> command = new ArrayList<>(Arrays.asList("unoconv", "-vvv",
                    "-f",
                    "pdf",
                    "-o",
                    tempOutputFile.toString(),
                    tempInputFile.toString()));
            int returnCode = ProcessExecutor.runCommandWithOutputHandling(command);

            // Read the converted PDF file. The output file is pre-created empty, so
            // an empty result means the conversion wrote nothing.
            byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
            if (pdfBytes.length == 0) {
                throw new IOException("unoconv produced no output (exit code " + returnCode + ")");
            }
            return pdfBytes;
        } finally {
            // Clean up the temporary files
            deleteQuietly(tempInputFile);
            deleteQuietly(tempOutputFile);
        }
    }

    /**
     * Returns the text after the last dot of {@code fileName}, reduced to ASCII
     * letters and digits so it is always a valid temp-file suffix (the name is
     * supplied by the client). Returns an empty string when there is no name or
     * no dot.
     */
    private String getFileExtension(String fileName) {
        if (fileName == null) {
            return "";
        }
        int dotIndex = fileName.lastIndexOf('.');
        if (dotIndex == -1) {
            return "";
        }
        return fileName.substring(dotIndex + 1).replaceAll("[^A-Za-z0-9]", "");
    }

    /** Best-effort delete; a cleanup failure must not mask the conversion result or error. */
    private static void deleteQuietly(Path path) {
        if (path == null) {
            return;
        }
        try {
            Files.deleteIfExists(path);
        } catch (IOException ignored) {
            // Intentionally ignored: the temp file is left for the OS temp cleaner.
        }
    }
}

View File

@ -1,79 +0,0 @@
package stirling.software.SPDF.controller.converters;
import java.awt.Color;
import java.awt.Graphics2D;
import java.awt.RenderingHints;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFSlide;
import org.springframework.http.ResponseEntity;
import org.springframework.stereotype.Controller;
import org.springframework.ui.Model;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.multipart.MultipartFile;
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.Image;
import com.itextpdf.text.PageSize;
import com.itextpdf.text.pdf.PdfWriter;
import stirling.software.SPDF.utils.PdfUtils;
/**
 * Converts an uploaded PPTX presentation into a PDF by rendering every slide to
 * an image and placing the images in a landscape A4 document.
 */
@Controller
public class ConvertPPTController {

    /**
     * Serves the conversion page. Sets {@code currentPage} to {@code xlsx-to-pdf}
     * and returns the {@code convert/xlsx-to-pdf} view.
     */
    @GetMapping("/pptx-to-pdf")
    public String cinvertToPDF(Model model) {
        model.addAttribute("currentPage", "xlsx-to-pdf");
        return "convert/xlsx-to-pdf";
    }

    /**
     * Renders each slide at the presentation's page size onto a white RGB image,
     * scales it to fit inside the PDF page margins and adds it to the document.
     *
     * @param pptxFile uploaded PPTX file
     * @return the generated PDF as a download response
     * @throws IOException       if the upload cannot be read
     * @throws DocumentException if the PDF cannot be built
     */
    @PostMapping("/pptx-to-pdf")
    public ResponseEntity<byte[]> convertPptxToPdf(@RequestParam("fileInput") MultipartFile pptxFile) throws IOException, DocumentException {
        ByteArrayOutputStream outputStream = new ByteArrayOutputStream();

        // Read PowerPoint presentation; closed even if rendering fails
        try (XMLSlideShow ppt = new XMLSlideShow(pptxFile.getInputStream())) {

            // Create PDF document
            Document pdfDocument = new Document(PageSize.A4.rotate());
            PdfWriter.getInstance(pdfDocument, outputStream);
            pdfDocument.open();

            // All slides share the presentation's page size
            int slideWidth = (int) Math.ceil(ppt.getPageSize().getWidth());
            int slideHeight = (int) Math.ceil(ppt.getPageSize().getHeight());

            // Fit box = the actual (landscape) page minus its margins. The box was
            // previously derived from portrait A4, which does not match this page.
            float maxWidth = pdfDocument.getPageSize().getWidth() - pdfDocument.leftMargin() - pdfDocument.rightMargin();
            float maxHeight = pdfDocument.getPageSize().getHeight() - pdfDocument.topMargin() - pdfDocument.bottomMargin();

            // Convert PowerPoint slides to images, then add them to the PDF
            for (XSLFSlide slide : ppt.getSlides()) {
                BufferedImage slideImage = new BufferedImage(slideWidth, slideHeight, BufferedImage.TYPE_INT_RGB);
                Graphics2D graphics = slideImage.createGraphics();
                try {
                    // Set graphics rendering hints for better quality
                    graphics.setRenderingHint(RenderingHints.KEY_ANTIALIASING, RenderingHints.VALUE_ANTIALIAS_ON);
                    graphics.setRenderingHint(RenderingHints.KEY_RENDERING, RenderingHints.VALUE_RENDER_QUALITY);
                    graphics.setRenderingHint(RenderingHints.KEY_INTERPOLATION, RenderingHints.VALUE_INTERPOLATION_BICUBIC);

                    // Draw the slide on a white background
                    graphics.setPaint(Color.white);
                    graphics.fill(new Rectangle2D.Float(0, 0, slideImage.getWidth(), slideImage.getHeight()));
                    slide.draw(graphics);
                } finally {
                    // Release the native graphics resources for every slide
                    graphics.dispose();
                }

                // Add the slide image to the PDF document
                Image image = Image.getInstance(slideImage, null);
                image.scaleToFit(maxWidth, maxHeight);
                pdfDocument.add(image);
            }

            pdfDocument.close();
        }

        return PdfUtils.boasToWebResponse(outputStream, pptxFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_convertedToPDF.pdf");
    }
}

View File

@ -1,63 +0,0 @@
package stirling.software.SPDF.controller.converters;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.commons.io.FilenameUtils;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.springframework.http.ResponseEntity;
import org.springframework.stereotype.Controller;
import org.springframework.ui.Model;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.multipart.MultipartFile;
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.Paragraph;
import com.itextpdf.text.pdf.PdfWriter;
import stirling.software.SPDF.utils.PdfUtils;
/**
 * Converts an uploaded TXT or RTF file into a PDF containing its text as a
 * single paragraph.
 */
@Controller
public class ConvertTextController {

    /**
     * Serves the conversion page. Sets {@code currentPage} to {@code xlsx-to-pdf}
     * and returns the {@code convert/xlsx-to-pdf} view.
     */
    @GetMapping("/txt-rtf-to-pdf")
    public String cinvertToPDF(Model model) {
        model.addAttribute("currentPage", "xlsx-to-pdf");
        return "convert/xlsx-to-pdf";
    }

    /**
     * Extracts the text of the upload and writes it into a new PDF. Files whose
     * extension is {@code rtf} (case-insensitive) are read through
     * {@link HWPFDocument}; everything else is decoded as UTF-8 text.
     *
     * @param txtRtfFile uploaded text or RTF file
     * @return the generated PDF as a download response
     * @throws IOException       if the upload cannot be read
     * @throws DocumentException if the PDF cannot be built
     */
    @PostMapping("/txt-rtf-to-pdf")
    public ResponseEntity<byte[]> convertTxtRtfToPdf(@RequestParam("fileInput") MultipartFile txtRtfFile) throws IOException, DocumentException {

        // Read TXT/RTF file content
        String fileContent;
        String fileExtension = FilenameUtils.getExtension(txtRtfFile.getOriginalFilename());
        // Constant-first comparison: the extension is null when no filename was sent
        if ("rtf".equalsIgnoreCase(fileExtension)) {
            // NOTE(review): HWPFDocument reads the OLE2 container via POIFSFileSystem —
            // confirm that real .rtf uploads are actually parseable this way.
            try (HWPFDocument hwpfDocument = new HWPFDocument(new POIFSFileSystem(txtRtfFile.getInputStream()))) {
                fileContent = hwpfDocument.getText().toString();
            }
        } else {
            fileContent = new String(txtRtfFile.getBytes(), StandardCharsets.UTF_8);
        }

        // Create PDF document
        Document document = new Document();
        ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
        PdfWriter.getInstance(document, outputStream);
        document.open();

        // Add content to PDF
        document.add(new Paragraph(fileContent));

        document.close();

        return PdfUtils.boasToWebResponse(outputStream, txtRtfFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_convertedToPDF.pdf");
    }
}

View File

@ -1,152 +0,0 @@
package stirling.software.SPDF.controller.converters;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.usermodel.WorkbookFactory;
import org.apache.poi.xssf.usermodel.XSSFCellStyle;
import org.apache.poi.xssf.usermodel.XSSFColor;
import org.apache.poi.xssf.usermodel.XSSFFont;
import org.springframework.http.ResponseEntity;
import org.springframework.stereotype.Controller;
import org.springframework.ui.Model;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.multipart.MultipartFile;
import com.itextpdf.text.BaseColor;
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.Font;
import com.itextpdf.text.Paragraph;
import com.itextpdf.text.pdf.PdfPCell;
import com.itextpdf.text.pdf.PdfPTable;
import com.itextpdf.text.pdf.PdfWriter;
import stirling.software.SPDF.utils.PdfUtils;
/**
 * Spring MVC controller that converts an uploaded Excel workbook into a PDF.
 *
 * <p>Each sheet is rendered as one iText table: cell text is formatted with
 * POI's {@link DataFormatter}, and the cell font, border colour and fill
 * colour are copied when the workbook is an XSSF (.xlsx) workbook.
 */
@Controller
public class ConvertXlsxController {

    /** Fixed height, in points, applied to every generated PDF cell. */
    private static final float PDF_CELL_HEIGHT = 30f;

    /**
     * Serves the upload form.
     *
     * @param model view model; receives the {@code currentPage} attribute
     * @return the name of the view to render
     */
    @GetMapping("/xlsx-to-pdf")
    public String cinvertToPDF(Model model) {
        model.addAttribute("currentPage", "xlsx-to-pdf");
        return "convert/xlsx-to-pdf";
    }

    /**
     * Converts the uploaded workbook to a PDF, one sheet after another, with a
     * page break between sheets.
     *
     * <p>Sheets whose first row is missing or has no cells are skipped instead of
     * failing the whole request with a {@link NullPointerException}.
     *
     * @param xlsx the uploaded workbook
     * @return the generated PDF, named after the upload with its extension
     *         replaced by {@code _convertedToPDF.pdf}
     * @throws IOException       if the upload cannot be read as a workbook
     * @throws DocumentException if iText rejects the generated content
     */
    @PostMapping("/xlsx-to-pdf")
    public ResponseEntity<byte[]> convertToPDF(@RequestParam("fileInput") MultipartFile xlsx) throws IOException, DocumentException {
        ByteArrayOutputStream outputStream = new ByteArrayOutputStream();

        // try-with-resources: the workbook holds parser resources that must be released.
        try (Workbook workbook = WorkbookFactory.create(xlsx.getInputStream())) {
            Document document = new Document();
            PdfWriter.getInstance(document, outputStream);
            document.open();

            // One formatter is enough for the whole workbook.
            DataFormatter dataFormatter = new DataFormatter();
            int sheetCount = workbook.getNumberOfSheets();

            for (int i = 0; i < sheetCount; i++) {
                Sheet sheet = workbook.getSheetAt(i);

                // The column count is taken from the first row; an empty sheet has none.
                Row firstRow = sheet.getRow(0);
                int numOfColumns = firstRow == null ? 0 : firstRow.getPhysicalNumberOfCells();
                if (numOfColumns > 0) {
                    document.add(buildTable(workbook, sheet, numOfColumns, dataFormatter));
                }

                // Add page break if there are more sheets
                if (i < sheetCount - 1) {
                    document.newPage();
                }
            }

            // NOTE(review): if every sheet was empty nothing was added to the document;
            // iText is expected to reject closing a document without pages - confirm the
            // desired response for a completely empty workbook.
            document.close();
        }

        // Return PDF as response
        return PdfUtils.boasToWebResponse(outputStream, xlsx.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_convertedToPDF.pdf");
    }

    /**
     * Builds the PDF table for one sheet.
     *
     * <p>NOTE(review): rows and cells are walked using the <em>physical</em> counts as
     * index bounds, exactly as before; for sparse sheets this can miss trailing
     * rows/cells - confirm whether {@code getLastRowNum()/getLastCellNum()} is wanted.
     */
    private PdfPTable buildTable(Workbook workbook, Sheet sheet, int numOfColumns, DataFormatter dataFormatter) {
        PdfPTable table = new PdfPTable(numOfColumns);

        for (int row = 0; row < sheet.getPhysicalNumberOfRows(); row++) {
            Row excelRow = sheet.getRow(row);
            if (excelRow == null) {
                continue; // Skip this row if it's null
            }

            for (int cell = 0; cell < excelRow.getPhysicalNumberOfCells(); cell++) {
                Cell excelCell = excelRow.getCell(cell);
                if (excelCell == null) {
                    table.addCell(""); // Keep the column position with an empty cell
                    continue;
                }
                table.addCell(toPdfCell(workbook, excelCell, dataFormatter));
            }

            // Pad short rows so the next Excel row starts on a fresh table row and a
            // trailing incomplete row is not dropped by iText.
            table.completeRow();
        }
        return table;
    }

    /** Converts one Excel cell into a PDF cell, copying its text, font and colours. */
    private PdfPCell toPdfCell(Workbook workbook, Cell excelCell, DataFormatter dataFormatter) {
        String cellValue = dataFormatter.formatCellValue(excelCell);
        Font cellFont = getFontFromExcelCell(workbook, excelCell);

        PdfPCell pdfCell = new PdfPCell(new Paragraph(cellValue, cellFont));
        pdfCell.setFixedHeight(PDF_CELL_HEIGHT);

        // Colours are only available through the XSSF (.xlsx) style API.
        if (excelCell.getCellStyle() instanceof XSSFCellStyle) {
            XSSFCellStyle cellStyle = (XSSFCellStyle) excelCell.getCellStyle();

            // setBorderColor applies to the whole cell, so the last non-null side
            // (bottom, top, left, right - in that order) wins, as it always did.
            XSSFColor[] borderColors = {
                cellStyle.getBottomBorderXSSFColor(),
                cellStyle.getTopBorderXSSFColor(),
                cellStyle.getLeftBorderXSSFColor(),
                cellStyle.getRightBorderXSSFColor()
            };
            for (XSSFColor borderColor : borderColors) {
                BaseColor converted = toBaseColor(borderColor);
                if (converted != null) {
                    pdfCell.setBorderColor(converted);
                }
            }

            BaseColor fill = toBaseColor(cellStyle.getFillForegroundXSSFColor());
            if (fill != null) {
                pdfCell.setBackgroundColor(fill);
            }
        }
        return pdfCell;
    }

    /**
     * Converts a POI colour to an iText colour.
     *
     * @return the converted colour, or {@code null} when the colour is absent or
     *         carries no RGB bytes
     */
    private static BaseColor toBaseColor(XSSFColor color) {
        if (color == null) {
            return null;
        }
        byte[] rgb = color.getRGB();
        if (rgb == null || rgb.length < 3) {
            return null;
        }
        // Bytes are signed in Java; mask to get the 0-255 channel value.
        return new BaseColor(rgb[0] & 0xFF, rgb[1] & 0xFF, rgb[2] & 0xFF);
    }

    /**
     * Derives the iText font (family, size, bold/italic) from the Excel cell's font.
     * Falls back to iText's default font when the cell style is not an XSSF style,
     * instead of failing with a {@link ClassCastException}.
     */
    private Font getFontFromExcelCell(Workbook workbook, Cell excelCell) {
        if (!(excelCell.getCellStyle() instanceof XSSFCellStyle)) {
            return new Font();
        }
        XSSFFont excelFont = ((XSSFCellStyle) excelCell.getCellStyle()).getFont();
        Font.FontFamily fontFamily = Font.getFamily(excelFont.getFontName());
        float fontSize = excelFont.getFontHeightInPoints();
        int fontStyle = (excelFont.getBold() ? Font.BOLD : Font.NORMAL) | (excelFont.getItalic() ? Font.ITALIC : Font.NORMAL);
        return new Font(fontFamily, fontSize, fontStyle);
    }
}

View File

@ -0,0 +1,69 @@
package stirling.software.SPDF.utils;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.Arrays;
import java.util.List;
import java.io.BufferedReader;
import java.util.ArrayList;
/**
 * Helper for running external commands while draining their output, so the
 * child process cannot block on a full stdout/stderr pipe.
 */
public class ProcessExecutor {

    /**
     * Runs {@code command}, waits for it to finish and echoes everything it wrote
     * to standard output and standard error.
     *
     * <p>Both streams are read on separate threads while the process runs. If the
     * calling thread is interrupted while waiting, the child process is forcibly
     * terminated before the interrupt is propagated, so no orphan process or
     * reader thread is left behind.
     *
     * @param command the program and its arguments, passed to {@link ProcessBuilder}
     * @return the process exit code; note that a non-zero code is <em>returned</em>
     *         (not thrown) when the process wrote nothing to standard error
     * @throws IOException          if the process cannot be started, or if it exits
     *                              with a non-zero code after writing to standard error
     * @throws InterruptedException if the calling thread is interrupted while waiting
     */
    public static int runCommandWithOutputHandling(List<String> command) throws IOException, InterruptedException {
        ProcessBuilder processBuilder = new ProcessBuilder(command);
        Process process = processBuilder.start();

        // Read the error stream and standard output stream concurrently
        List<String> errorLines = new ArrayList<>();
        List<String> outputLines = new ArrayList<>();
        Thread errorReaderThread = startLineCollector(process.getErrorStream(), errorLines);
        Thread outputReaderThread = startLineCollector(process.getInputStream(), outputLines);

        int exitCode;
        try {
            // Wait for the process to complete, then for the readers to hit end-of-stream.
            // join() also makes the collected lines safely visible to this thread.
            exitCode = process.waitFor();
            errorReaderThread.join();
            outputReaderThread.join();
        } catch (InterruptedException e) {
            // Do not leave the child running; killing it also ends the reader threads.
            process.destroyForcibly();
            throw e;
        }

        if (!outputLines.isEmpty()) {
            String outputMessage = String.join("\n", outputLines);
            System.out.println("Command output:\n" + outputMessage);
        }

        if (!errorLines.isEmpty()) {
            String errorMessage = String.join("\n", errorLines);
            System.out.println("Command error output:\n" + errorMessage);
            if (exitCode != 0) {
                throw new IOException("Command process failed with exit code " + exitCode + ". Error message: " + errorMessage);
            }
        }

        return exitCode;
    }

    /** Starts a thread that appends every UTF-8 line read from {@code stream} to {@code sink}. */
    private static Thread startLineCollector(java.io.InputStream stream, List<String> sink) {
        Thread collector = new Thread(() -> {
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    sink.add(line);
                }
            } catch (IOException e) {
                // Best effort: a read failure must not take down the caller.
                e.printStackTrace();
            }
        });
        collector.start();
        return collector;
    }
}

View File

@ -0,0 +1,11 @@
fileToPDF.fileTypesList=Microsoft Word: (DOC, DOCX, DOT, DOTX) \
Microsoft Excel: (CSV, XLS, XLSX, XLT, XLTX, SLK, DIF) \
Microsoft PowerPoint: (PPT, PPTX) \
OpenDocument Formats: (ODT, OTT, ODS, OTS, ODP, OTP, ODG, OTG) \
Plain Text: (TXT, TEXT, XML) \
Rich Text Format: (RTF) \
Images: (BMP, GIF, JPEG, PNG, TIF, PBM, PGM, PPM, RAS, XBM, XPM, SVG, SVM, WMF) \
HTML: (HTML) \
Lotus Word Pro: (LWP) \
StarOffice formats: (SDA, SDC, SDD, SDW, STC, STD, STI, STW, SXD, SXG, SXI, SXW) \
Other formats: (DBF, FODS, VSD, VOR, VOR3, VOR4, UOP, PCT, PS, PDF)

View File

@ -224,19 +224,8 @@ changeMetadata.selectText.5=Add Custom Metadata Entry
changeMetadata.submit=Change
xlsToPdf.title=Excel to PDF
xlsToPdf.header=Excel to PDF
xlsToPdf.selectText.1=Select XLS or XLSX Excel sheet to convert
xlsToPdf.convert=convert
fileToPDF.credit=This service uses LibreOffice and Unoconv for file conversion.
fileToPDF.supportedFileTypes=Supported file types should include those listed below; however, for a full, up-to-date list of supported formats, please refer to the LibreOffice documentation.

View File

@ -14,60 +14,30 @@
<div class="row justify-content-center">
<div class="col-md-6">
<h2 th:text="#{compress.header}"></h2>
<form method="post" enctype="multipart/form-data" th:action="@{/compress-pdf}">
<div class="form-group">
<label for="fileInput">Select a PDF file to compress:</label>
<input type="file" class="form-control-file" id="fileInput" name="fileInput" accept=".pdf">
</div>
<div class="form-group">
<div class="form-check">
<input class="form-check-input" type="checkbox" id="compressPDF" name="compressPDF" checked>
<label class="form-check-label" for="compressPDF">Compress PDF?</label>
</div>
<div class="form-group">
<label for="pdfCompressionLevel">PDF Compression Level:</label>
<select class="form-control" id="pdfCompressionLevel" name="pdfCompressionLevel">
<option value="0">0 (No compression)</option>
<option value="1">1 (Lowest compression)</option>
<option value="2">2</option>
<option value="3">3</option>
<option value="4">4</option>
<option value="5" selected>5 (Default compression)</option>
<option value="6">6</option>
<option value="7">7</option>
<option value="8">8</option>
<option value="9">9 (Maximum compression)</option>
</select>
</div>
</div>
<div class="form-group">
<div class="form-check">
<input class="form-check-input" type="checkbox" id="compressImages" name="compressImages" checked>
<label class="form-check-label" for="compressImages">Compress Images?</label>
</div>
</div>
<div class="form-group">
<div class="form-check">
<input class="form-check-input" type="checkbox" id="useLossyCompression" name="useLossyCompression">
<label class="form-check-label" for="useLossyCompression">Use Lossy Compression for Images?</label>
</div>
</div>
<div class="form-group">
<label for="imageCompressionLevel">Image Compression Level:</label>
<select class="form-control" id="imageCompressionLevel" name="imageCompressionLevel">
<option value="0">0 (No compression)</option>
<option value="10">10 (Lowest quality)</option>
<option value="25">25</option>
<option value="50" selected>50 (Default quality)</option>
<option value="75">75</option>
<option value="90">90 (High quality)</option>
<option value="100">100 (Best quality)</option>
</select>
</div>
<div class="form-group">
<button type="submit" class="btn btn-primary">Compress PDF</button>
</div>
</form>
<form action="#" th:action="@{/compress-pdf}" method="post" enctype="multipart/form-data">
<div>
<label for="fileInput">Choose a PDF file</label>
<input type="file" name="fileInput" id="fileInput" accept="application/pdf" required>
</div>
<div>
<label for="optimizeLevel">Optimization level:</label>
<select name="optimizeLevel" id="optimizeLevel">
<option value="0">-O0 (No optimization)</option>
<option value="1" selected>-O1 (Default, lossless optimization)</option>
<option value="2">-O2 (Lossy optimization)</option>
<option value="3">-O3 (Lossy optimization, more aggressive)</option>
</select>
</div>
<div>
<input type="checkbox" name="fastWebView" id="fastWebView" checked>
<label for="fastWebView">Enable fast web view (linearize PDF)</label>
</div>
<div>
<input type="checkbox" name="jbig2Lossy" id="jbig2Lossy">
<label for="jbig2Lossy">Enable lossy JBIG2 encoding</label>
</div>
<button type="submit">Optimize PDF</button>
</form>
<th:block th:insert="~{fragments/common :: filelist}"></th:block>

View File

@ -1,7 +1,7 @@
<!DOCTYPE html>
<html th:lang="${#locale.language}" th:lang-direction="#{language.direction}" xmlns:th="http://www.thymeleaf.org">
<th:block th:insert="~{fragments/common :: head(title=#{xlsToPdf.title})}"></th:block>
<th:block th:insert="~{fragments/common :: head(title=#{fileToPDF.title})}"></th:block>
<body>
@ -12,18 +12,23 @@
<div class="container">
<div class="row justify-content-center">
<div class="col-md-6">
<h2 th:text="#{xlsToPdf.header}"></h2>
<h2 th:text="#{fileToPDF.header}"></h2>
<form method="post" enctype="multipart/form-data" th:action="@{xlsx-to-pdf}">
<form method="post" enctype="multipart/form-data" th:action="@{file-to-pdf}">
<div class="custom-file">
<input type="file" class="custom-file-input" id="fileInput" name="fileInput" required>
<label class="custom-file-label" for="fileInput" th:text="#{xlsToPdf.selectText.1}"></label>
<label class="custom-file-label" for="fileInput" th:text="#{filePrompt}"></label>
</div>
<br> <br>
<button type="submit" class="btn btn-primary" th:text="#{imageToPDF.submit}"></button>
<button type="submit" class="btn btn-primary" th:text="#{fileToPDF.submit}"></button>
</form>
<th:block th:insert="~{fragments/common :: filelist}"></th:block>
<p class="mt-3" th:text="#{fileToPDF.credit}"></p>
<p class="mt-3" th:text="#{fileToPDF.supportedFileTypes}"></p>
<p th:utext="#{fileToPDF.fileTypesList}"></p>
<a href="https://help.libreoffice.org/latest/en-US/text/shared/guide/supported_formats.html">https://help.libreoffice.org/latest/en-US/text/shared/guide/supported_formats.html</a>
</div>
</div>
</div>

View File

@ -0,0 +1,51 @@
<!DOCTYPE html>
<html th:lang="${#locale.language}" th:lang-direction="#{language.direction}" xmlns:th="http://www.thymeleaf.org">
<th:block th:insert="~{fragments/common :: head(title=#{addImage.title})}"></th:block>
<body>
<div id="page-container">
<div id="content-wrap">
<div th:insert="~{fragments/navbar.html :: navbar}"></div>
<br> <br>
<div class="container">
<div class="row justify-content-center">
<div class="col-md-6">
<h2 th:text="#{ocrPDF.header}"></h2>
<form action="#" th:action="@{/ocr-pdf}" method="post" enctype="multipart/form-data" class="mb-3">
<div class="mb-3">
<label for="fileInput" class="form-label">Choose a PDF file</label>
<input type="file" name="fileInput" id="fileInput" accept="application/pdf" required class="form-control">
</div>
<div class="mb-3">
<label for="languages" class="form-label">Select languages that are to be detected within the PDF (Ones listed are the ones currently detected):</label>
<div id="languages">
<div th:each="language: ${languages}">
<input type="checkbox" class="form-check-input" th:name="languages" th:value="${language}" th:id="${'language-' + language}" />
<label class="form-check-label" th:for="${'language-' + language}" th:text="${language}"></label>
</div>
</div>
</div>
<div class="mb-3">
<input type="checkbox" class="form-check-input" name="sidecar" id="sidecar" />
<label class="form-check-label" for="sidecar">Produce text file containing OCR text alongside the OCR'ed PDF</label>
</div>
<button type="submit" class="btn btn-primary">Process PDF with OCR</button>
</form>
<p>
Please read the HowToUseOCR.md documentation on how to add other OCR languages and/or how to set up OCR when not running in Docker.
</p>
<th:block th:insert="~{fragments/common :: filelist}"></th:block>
</div>
</div>
</div>
</div>
<div th:insert="~{fragments/footer.html :: footer}"></div>
</div>
</body>
</html>