fix: switch to pdftohtml for pdf to html conversions (#998)

* fix: switch to pdftohtml for pdf to html conversions

* build: include poppler-utils in dockerfile for pdftohtml
This commit is contained in:
Eric 2024-03-29 17:02:33 -04:00 committed by GitHub
parent 27bbf7a513
commit dfb8c64f5a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
37 changed files with 101 additions and 58 deletions

View File

@ -36,6 +36,8 @@ RUN echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /et
shadow \ shadow \
# Doc conversion # Doc conversion
libreoffice@testing \ libreoffice@testing \
# pdftohtml
poppler-utils \
# OCR MY PDF (unpaper for deskew and other advanced features) # OCR MY PDF (unpaper for deskew and other advanced features)
ocrmypdf \ ocrmypdf \
tesseract-ocr-data-eng \ tesseract-ocr-data-eng \

View File

@ -244,6 +244,6 @@ public class EndpointConfiguration {
} }
} }
} }
private static final String REMOVE_BLANKS = "remove-blanks"; private static final String REMOVE_BLANKS = "remove-blanks";
} }

View File

@ -291,6 +291,6 @@ public class UserController {
} }
return ResponseEntity.ok(apiKey); return ResponseEntity.ok(apiKey);
} }
private static final String LOGIN_MESSAGETYPE_CREDSUPDATED = "/login?messageType=credsUpdated"; private static final String LOGIN_MESSAGETYPE_CREDSUPDATED = "/login?messageType=credsUpdated";
} }

View File

@ -29,18 +29,6 @@ import stirling.software.SPDF.utils.WebResponseUtils;
@Tag(name = "Convert", description = "Convert APIs") @Tag(name = "Convert", description = "Convert APIs")
public class ConvertPDFToOffice { public class ConvertPDFToOffice {
/**
 * Handles {@code POST /pdf/html}: hands the uploaded file of the request to
 * {@link PDFToFile#processPdfToOfficeFormat} with output format {@code "html"} and the
 * {@code "writer_pdf_import"} filter, and returns whatever response that call produces.
 *
 * @param request the multipart form model carrying the uploaded file
 * @return the response built by {@code PDFToFile.processPdfToOfficeFormat}
 * @throws Exception if the delegated conversion fails
 */
@PostMapping(consumes = "multipart/form-data", value = "/pdf/html")
@Operation(
        summary = "Convert PDF to HTML",
        description =
                "This endpoint converts a PDF file to HTML format. Input:PDF Output:HTML Type:SISO")
public ResponseEntity<byte[]> processPdfToHTML(@ModelAttribute PDFFile request)
        throws Exception {
    // Same call as before, expressed as a single delegation without intermediate locals.
    return new PDFToFile()
            .processPdfToOfficeFormat(request.getFileInput(), "html", "writer_pdf_import");
}
@PostMapping(consumes = "multipart/form-data", value = "/pdf/presentation") @PostMapping(consumes = "multipart/form-data", value = "/pdf/presentation")
@Operation( @Operation(
summary = "Convert PDF to Presentation format", summary = "Convert PDF to Presentation format",

View File

@ -6,8 +6,6 @@ import java.nio.file.Path;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.http.ResponseEntity; import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.ModelAttribute; import org.springframework.web.bind.annotation.ModelAttribute;
import org.springframework.web.bind.annotation.PostMapping; import org.springframework.web.bind.annotation.PostMapping;

View File

@ -219,6 +219,6 @@ public class ExtractImageScansController {
}); });
} }
} }
private static final String REPLACEFIRST = "[.][^.]+$"; private static final String REPLACEFIRST = "[.][^.]+$";
} }

View File

@ -26,7 +26,6 @@ import org.springframework.stereotype.Service;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import stirling.software.SPDF.model.ApplicationProperties;
import stirling.software.SPDF.model.PipelineConfig; import stirling.software.SPDF.model.PipelineConfig;
import stirling.software.SPDF.model.PipelineOperation; import stirling.software.SPDF.model.PipelineOperation;

View File

@ -3,8 +3,6 @@ package stirling.software.SPDF.repository;
import java.util.Optional; import java.util.Optional;
import org.springframework.data.jpa.repository.JpaRepository; import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.data.jpa.repository.Query;
import org.springframework.data.repository.query.Param;
import stirling.software.SPDF.model.User; import stirling.software.SPDF.model.User;

View File

@ -25,6 +25,71 @@ import io.github.pixee.security.Filenames;
import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult; import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult;
public class PDFToFile { public class PDFToFile {
/**
 * Converts an uploaded PDF to HTML by running the external {@code pdftohtml} command and
 * returns every regular file the command left in its working directory, bundled into one
 * ZIP archive.
 *
 * <p>The upload is copied to a temporary file, {@code pdftohtml -c} is run with a fresh
 * temporary directory as its working directory, and both temporary locations are removed
 * before the method returns or throws.
 *
 * @param inputFile the uploaded file; its content type must be {@code application/pdf}
 * @return the response produced by {@code WebResponseUtils.bytesToWebResponse} for the ZIP
 *     bytes under the name {@code <base name>ToHtml.zip}, or an empty {@code 400 Bad Request}
 *     response when the content type is not {@code application/pdf}
 * @throws IOException if copying the upload, listing or zipping the output, or removing the
 *     temporary files fails
 * @throws InterruptedException declared for the external command execution
 */
public ResponseEntity<byte[]> processPdfToHtml(MultipartFile inputFile)
        throws IOException, InterruptedException {

    if (!"application/pdf".equals(inputFile.getContentType())) {
        return new ResponseEntity<>(HttpStatus.BAD_REQUEST);
    }

    // Derive the output base name from the uploaded file name. The name can be missing or
    // have no extension, so a '.' must not be assumed (substring(0, -1) would throw).
    String uploadedName = inputFile.getOriginalFilename();
    String originalPdfFileName =
            uploadedName == null ? "" : Filenames.toSimpleFileName(uploadedName);
    if (originalPdfFileName == null) {
        originalPdfFileName = "";
    }
    int extensionStart = originalPdfFileName.lastIndexOf('.');
    String pdfBaseName =
            extensionStart >= 0
                    ? originalPdfFileName.substring(0, extensionStart)
                    : originalPdfFileName;
    if (pdfBaseName.isEmpty()) {
        // Fall back to a fixed name so pdftohtml always receives a usable output prefix.
        pdfBaseName = "document";
    }
    // NOTE(review): a base name starting with '-' is handed to pdftohtml as a plain argument
    // and could be read as an option -- confirm whether that needs guarding.

    String fileName = pdfBaseName + "ToHtml.zip";
    Path tempInputFile = null;
    Path tempOutputDir = null;
    byte[] fileBytes;

    try {
        // Save the uploaded file to a temporary location, closing the upload stream afterwards
        tempInputFile = Files.createTempFile("input_", ".pdf");
        try (java.io.InputStream uploadStream = inputFile.getInputStream()) {
            Files.copy(uploadStream, tempInputFile, StandardCopyOption.REPLACE_EXISTING);
        }

        // Prepare the output directory; it is used as the command's working directory
        tempOutputDir = Files.createTempDirectory("output_");

        // Run the pdftohtml command with complex output
        List<String> command =
                new ArrayList<>(
                        Arrays.asList(
                                "pdftohtml", "-c", tempInputFile.toString(), pdfBaseName));
        ProcessExecutor.getInstance(ProcessExecutor.Processes.PDFTOHTML)
                .runCommandWithOutputHandling(command, tempOutputDir.toFile());

        // listFiles() returns null when the directory cannot be read
        File[] outputFiles = tempOutputDir.toFile().listFiles();
        if (outputFiles == null) {
            throw new IOException("Unable to list pdftohtml output in " + tempOutputDir);
        }

        // Return output files in a ZIP archive; try-with-resources closes the archive even
        // when copying one of the entries fails
        ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
        try (ZipOutputStream zipOutputStream = new ZipOutputStream(byteArrayOutputStream)) {
            for (File outputFile : outputFiles) {
                if (!outputFile.isFile()) {
                    continue; // only regular files can be streamed into an entry
                }
                zipOutputStream.putNextEntry(new ZipEntry(outputFile.getName()));
                Files.copy(outputFile.toPath(), zipOutputStream);
                zipOutputStream.closeEntry();
            }
        }
        fileBytes = byteArrayOutputStream.toByteArray();
    } finally {
        // Clean up the temporary files; the nested finally makes sure the output directory
        // is still removed when deleting the input file fails
        try {
            if (tempInputFile != null) {
                Files.deleteIfExists(tempInputFile);
            }
        } finally {
            if (tempOutputDir != null) {
                FileUtils.deleteDirectory(tempOutputDir.toFile());
            }
        }
    }

    return WebResponseUtils.bytesToWebResponse(
            fileBytes, fileName, MediaType.APPLICATION_OCTET_STREAM);
}
public ResponseEntity<byte[]> processPdfToOfficeFormat( public ResponseEntity<byte[]> processPdfToOfficeFormat(
MultipartFile inputFile, String outputFormat, String libreOfficeFilter) MultipartFile inputFile, String outputFormat, String libreOfficeFilter)
throws IOException, InterruptedException { throws IOException, InterruptedException {
@ -39,17 +104,7 @@ public class PDFToFile {
// Validate output format // Validate output format
List<String> allowedFormats = List<String> allowedFormats =
Arrays.asList( Arrays.asList("doc", "docx", "odt", "ppt", "pptx", "odp", "rtf", "xml", "txt:Text");
"doc",
"docx",
"odt",
"ppt",
"pptx",
"odp",
"rtf",
"html",
"xml",
"txt:Text");
if (!allowedFormats.contains(outputFormat)) { if (!allowedFormats.contains(outputFormat)) {
return new ResponseEntity<>(HttpStatus.BAD_REQUEST); return new ResponseEntity<>(HttpStatus.BAD_REQUEST);
} }

View File

@ -24,6 +24,7 @@ public class ProcessExecutor {
public enum Processes { public enum Processes {
LIBRE_OFFICE, LIBRE_OFFICE,
PDFTOHTML,
OCR_MY_PDF, OCR_MY_PDF,
PYTHON_OPENCV, PYTHON_OPENCV,
GHOSTSCRIPT, GHOSTSCRIPT,
@ -45,6 +46,7 @@ public class ProcessExecutor {
int semaphoreLimit = int semaphoreLimit =
switch (key) { switch (key) {
case LIBRE_OFFICE -> 1; case LIBRE_OFFICE -> 1;
case PDFTOHTML -> 1;
case OCR_MY_PDF -> 2; case OCR_MY_PDF -> 2;
case PYTHON_OPENCV -> 8; case PYTHON_OPENCV -> 8;
case GHOSTSCRIPT -> 16; case GHOSTSCRIPT -> 16;
@ -56,6 +58,7 @@ public class ProcessExecutor {
long timeoutMinutes = long timeoutMinutes =
switch (key) { switch (key) {
case LIBRE_OFFICE -> 30; case LIBRE_OFFICE -> 30;
case PDFTOHTML -> 5;
case OCR_MY_PDF -> 30; case OCR_MY_PDF -> 30;
case PYTHON_OPENCV -> 30; case PYTHON_OPENCV -> 30;
case GHOSTSCRIPT -> 5; case GHOSTSCRIPT -> 5;

View File

@ -957,7 +957,7 @@ PDFToText.submit=تحويل
#PDFToHTML #PDFToHTML
PDFToHTML.title=PDF إلى HTML PDFToHTML.title=PDF إلى HTML
PDFToHTML.header=PDF إلى HTML PDFToHTML.header=PDF إلى HTML
PDFToHTML.credit=تستخدم هذه الخدمة LibreOffice لتحويل الملفات. PDFToHTML.credit=تستخدم هذه الخدمة pdftohtml لتحويل الملفات.
PDFToHTML.submit=تحويل PDFToHTML.submit=تحويل

View File

@ -957,7 +957,7 @@ PDFToText.submit=Преобразуване
#PDFToHTML #PDFToHTML
PDFToHTML.title=PDF към HTML PDFToHTML.title=PDF към HTML
PDFToHTML.header=PDF към HTML PDFToHTML.header=PDF към HTML
PDFToHTML.credit=Тази услуга използва LibreOffice за преобразуване на файлове. PDFToHTML.credit=Тази услуга използва pdftohtml за преобразуване на файлове.
PDFToHTML.submit=Преобразуване PDFToHTML.submit=Преобразуване

View File

@ -957,7 +957,7 @@ PDFToText.submit=Converteix
#PDFToHTML #PDFToHTML
PDFToHTML.title=PDF a HTML PDFToHTML.title=PDF a HTML
PDFToHTML.header=PDF a HTML PDFToHTML.header=PDF a HTML
PDFToHTML.credit=Utilitza LibreOffice per a la conversió d'Arxius. PDFToHTML.credit=Utilitza pdftohtml per a la conversió d'Arxius.
PDFToHTML.submit=Converteix PDFToHTML.submit=Converteix

View File

@ -957,7 +957,7 @@ PDFToText.submit=Konvertieren
#PDFToHTML #PDFToHTML
PDFToHTML.title=PDF zu HTML PDFToHTML.title=PDF zu HTML
PDFToHTML.header=PDF zu HTML PDFToHTML.header=PDF zu HTML
PDFToHTML.credit=Dieser Dienst verwendet LibreOffice für die Dateikonvertierung. PDFToHTML.credit=Dieser Dienst verwendet pdftohtml für die Dateikonvertierung.
PDFToHTML.submit=Konvertieren PDFToHTML.submit=Konvertieren

View File

@ -957,7 +957,7 @@ PDFToText.submit=Μετατροπή
#PDFToHTML #PDFToHTML
PDFToHTML.title=PDF σε HTML PDFToHTML.title=PDF σε HTML
PDFToHTML.header=PDF σε HTML PDFToHTML.header=PDF σε HTML
PDFToHTML.credit=Αυτή η υπηρεσία χρησιμοποιεί LibreOffice για τη μετατροπή των αρχείων. PDFToHTML.credit=Αυτή η υπηρεσία χρησιμοποιεί pdftohtml για τη μετατροπή των αρχείων.
PDFToHTML.submit=Μετατροπή PDFToHTML.submit=Μετατροπή

View File

@ -957,7 +957,7 @@ PDFToText.submit=Convert
#PDFToHTML #PDFToHTML
PDFToHTML.title=PDF to HTML PDFToHTML.title=PDF to HTML
PDFToHTML.header=PDF to HTML PDFToHTML.header=PDF to HTML
PDFToHTML.credit=This service uses LibreOffice for file conversion. PDFToHTML.credit=This service uses pdftohtml for file conversion.
PDFToHTML.submit=Convert PDFToHTML.submit=Convert

View File

@ -957,7 +957,7 @@ PDFToText.submit=Convert
#PDFToHTML #PDFToHTML
PDFToHTML.title=PDF to HTML PDFToHTML.title=PDF to HTML
PDFToHTML.header=PDF to HTML PDFToHTML.header=PDF to HTML
PDFToHTML.credit=This service uses LibreOffice for file conversion. PDFToHTML.credit=This service uses pdftohtml for file conversion.
PDFToHTML.submit=Convert PDFToHTML.submit=Convert

View File

@ -957,7 +957,7 @@ PDFToText.submit=Convertir
#PDFToHTML #PDFToHTML
PDFToHTML.title=PDF a HTML PDFToHTML.title=PDF a HTML
PDFToHTML.header=PDF a HTML PDFToHTML.header=PDF a HTML
PDFToHTML.credit=Este servicio utiliza LibreOffice para la conversión de archivos PDFToHTML.credit=Este servicio utiliza pdftohtml para la conversión de archivos
PDFToHTML.submit=Convertir PDFToHTML.submit=Convertir

View File

@ -957,7 +957,7 @@ PDFToText.submit=Bihurtu
#PDFToHTML #PDFToHTML
PDFToHTML.title=PDFa HTML bihurtu PDFToHTML.title=PDFa HTML bihurtu
PDFToHTML.header=PDFa HTML bihurtu PDFToHTML.header=PDFa HTML bihurtu
PDFToHTML.credit=Zerbitzu honek LibreOffice erabiltzen du fitxategiak bihurtzeko PDFToHTML.credit=Zerbitzu honek pdftohtml erabiltzen du fitxategiak bihurtzeko
PDFToHTML.submit=Bihurtu PDFToHTML.submit=Bihurtu

View File

@ -957,7 +957,7 @@ PDFToText.submit=Convertir
#PDFToHTML #PDFToHTML
PDFToHTML.title=PDF en HTML PDFToHTML.title=PDF en HTML
PDFToHTML.header=PDF en HTML PDFToHTML.header=PDF en HTML
PDFToHTML.credit=Ce service utilise LibreOffice pour la conversion de fichiers. PDFToHTML.credit=Ce service utilise pdftohtml pour la conversion de fichiers.
PDFToHTML.submit=Convertir PDFToHTML.submit=Convertir

View File

@ -957,7 +957,7 @@ PDFToText.submit=परिवर्तित करें
#PDFToHTML #PDFToHTML
PDFToHTML.title=PDF से HTML PDFToHTML.title=PDF से HTML
PDFToHTML.header=PDF से HTML PDFToHTML.header=PDF से HTML
PDFToHTML.credit=यह सेवा फ़ाइल परिवर्तन के लिए LibreOffice का उपयोग करती है। PDFToHTML.credit=यह सेवा फ़ाइल परिवर्तन के लिए pdftohtml का उपयोग करती है।
PDFToHTML.submit=परिवर्तित करें PDFToHTML.submit=परिवर्तित करें

View File

@ -957,7 +957,7 @@ PDFToText.submit=Konvertálás
#PDFToHTML #PDFToHTML
PDFToHTML.title=PDF >> HTML PDFToHTML.title=PDF >> HTML
PDFToHTML.header=PDF >> HTML PDFToHTML.header=PDF >> HTML
PDFToHTML.credit=Ez a szolgáltatás a LibreOffice-t használja a fájlkonverzióhoz. PDFToHTML.credit=Ez a szolgáltatás a pdftohtml-t használja a fájlkonverzióhoz.
PDFToHTML.submit=Konvertálás PDFToHTML.submit=Konvertálás

View File

@ -957,7 +957,7 @@ PDFToText.submit=Konversi
#PDFToHTML #PDFToHTML
PDFToHTML.title=PDF Ke HTML PDFToHTML.title=PDF Ke HTML
PDFToHTML.header=PDF ke HTML PDFToHTML.header=PDF ke HTML
PDFToHTML.credit=Layanan ini menggunakan LibreOffice untuk konversi berkas. PDFToHTML.credit=Layanan ini menggunakan pdftohtml untuk konversi berkas.
PDFToHTML.submit=Konversi PDFToHTML.submit=Konversi

View File

@ -957,7 +957,7 @@ PDFToText.submit=Converti
#PDFToHTML #PDFToHTML
PDFToHTML.title=Da PDF a HTML PDFToHTML.title=Da PDF a HTML
PDFToHTML.header=Da PDF a HTML PDFToHTML.header=Da PDF a HTML
PDFToHTML.credit=Questo servizio utilizza LibreOffice per la conversione. PDFToHTML.credit=Questo servizio utilizza pdftohtml per la conversione.
PDFToHTML.submit=Converti PDFToHTML.submit=Converti

View File

@ -957,7 +957,7 @@ PDFToText.submit=変換
#PDFToHTML #PDFToHTML
PDFToHTML.title=PDFをHTMLに変換 PDFToHTML.title=PDFをHTMLに変換
PDFToHTML.header=PDFをHTMLに変換 PDFToHTML.header=PDFをHTMLに変換
PDFToHTML.credit=本サービスはファイル変換にLibreOfficeを使用しています。 PDFToHTML.credit=本サービスはファイル変換にpdftohtmlを使用しています。
PDFToHTML.submit=変換 PDFToHTML.submit=変換

View File

@ -957,7 +957,7 @@ PDFToText.submit=변환
#PDFToHTML #PDFToHTML
PDFToHTML.title=PDF를 HTML로 PDFToHTML.title=PDF를 HTML로
PDFToHTML.header=PDF 문서를 HTML로 변환 PDFToHTML.header=PDF 문서를 HTML로 변환
PDFToHTML.credit=이 서비스는 파일 변환을 위해 LibreOffice를 사용합니다. PDFToHTML.credit=이 서비스는 파일 변환을 위해 pdftohtml을 사용합니다.
PDFToHTML.submit=변환 PDFToHTML.submit=변환

View File

@ -957,7 +957,7 @@ PDFToText.submit=Converteren
#PDFToHTML #PDFToHTML
PDFToHTML.title=PDF naar HTML PDFToHTML.title=PDF naar HTML
PDFToHTML.header=PDF naar HTML PDFToHTML.header=PDF naar HTML
PDFToHTML.credit=Deze service gebruikt LibreOffice voor bestandsconversie. PDFToHTML.credit=Deze service gebruikt pdftohtml voor bestandsconversie.
PDFToHTML.submit=Converteren PDFToHTML.submit=Converteren

View File

@ -957,7 +957,7 @@ PDFToText.submit=Konwertuj
#PDFToHTML #PDFToHTML
PDFToHTML.title=PDF na HTML PDFToHTML.title=PDF na HTML
PDFToHTML.header=PDF na HTML PDFToHTML.header=PDF na HTML
PDFToHTML.credit=Ta usługa używa LibreOffice do konwersji plików. PDFToHTML.credit=Ta usługa używa pdftohtml do konwersji plików.
PDFToHTML.submit=Konwertuj PDFToHTML.submit=Konwertuj

View File

@ -957,7 +957,7 @@ PDFToText.submit=Converter
#PDFToHTML #PDFToHTML
PDFToHTML.title=PDF para HTML PDFToHTML.title=PDF para HTML
PDFToHTML.header=PDF para HTML PDFToHTML.header=PDF para HTML
PDFToHTML.credit=Este serviço usa o LibreOffice para Conversão de Arquivos. PDFToHTML.credit=Este serviço usa o pdftohtml para Conversão de Arquivos.
PDFToHTML.submit=Converter PDFToHTML.submit=Converter

View File

@ -957,7 +957,7 @@ PDFToText.submit=Converter
#PDFToHTML #PDFToHTML
PDFToHTML.title=PDF para HTML PDFToHTML.title=PDF para HTML
PDFToHTML.header=PDF para HTML PDFToHTML.header=PDF para HTML
PDFToHTML.credit=Este serviço usa o LibreOffice para Conversão de ficheiros. PDFToHTML.credit=Este serviço usa o pdftohtml para Conversão de ficheiros.
PDFToHTML.submit=Converter PDFToHTML.submit=Converter

View File

@ -957,7 +957,7 @@ PDFToText.submit=Convert
#PDFToHTML #PDFToHTML
PDFToHTML.title=PDF către HTML PDFToHTML.title=PDF către HTML
PDFToHTML.header=PDF către HTML PDFToHTML.header=PDF către HTML
PDFToHTML.credit=Acest serviciu utilizează LibreOffice pentru conversia fișierului. PDFToHTML.credit=Acest serviciu utilizează pdftohtml pentru conversia fișierului.
PDFToHTML.submit=Convert PDFToHTML.submit=Convert

View File

@ -957,7 +957,7 @@ PDFToText.submit=Конвертировать
#PDFToHTML #PDFToHTML
PDFToHTML.title=PDF в HTML PDFToHTML.title=PDF в HTML
PDFToHTML.header=PDF в HTML PDFToHTML.header=PDF в HTML
PDFToHTML.credit=Этот сервис использует LibreOffice для преобразования файлов. PDFToHTML.credit=Этот сервис использует pdftohtml для преобразования файлов.
PDFToHTML.submit=Конвертировать PDFToHTML.submit=Конвертировать

View File

@ -957,7 +957,7 @@ PDFToText.submit=Konvertuj
#PDFToHTML #PDFToHTML
PDFToHTML.title=PDF u HTML PDFToHTML.title=PDF u HTML
PDFToHTML.header=PDF u HTML PDFToHTML.header=PDF u HTML
PDFToHTML.credit=Ova usluga koristi LibreOffice za konverziju fajlova. PDFToHTML.credit=Ova usluga koristi pdftohtml za konverziju fajlova.
PDFToHTML.submit=Konvertuj PDFToHTML.submit=Konvertuj

View File

@ -957,7 +957,7 @@ PDFToText.submit=Konvertera
#PDFToHTML #PDFToHTML
PDFToHTML.title=PDF till HTML PDFToHTML.title=PDF till HTML
PDFToHTML.header=PDF till HTML PDFToHTML.header=PDF till HTML
PDFToHTML.credit=Denna tjänst använder LibreOffice för filkonvertering. PDFToHTML.credit=Denna tjänst använder pdftohtml för filkonvertering.
PDFToHTML.submit=Konvertera PDFToHTML.submit=Konvertera

View File

@ -957,7 +957,7 @@ PDFToText.submit=Dönüştür
#PDFToHTML #PDFToHTML
PDFToHTML.title=PDF'den HTML'e PDFToHTML.title=PDF'den HTML'e
PDFToHTML.header=PDF'den HTML'e PDFToHTML.header=PDF'den HTML'e
PDFToHTML.credit=Bu hizmet dosya dönüşümü için LibreOffice kullanır. PDFToHTML.credit=Bu hizmet dosya dönüşümü için pdftohtml kullanır.
PDFToHTML.submit=Dönüştür PDFToHTML.submit=Dönüştür

View File

@ -957,7 +957,7 @@ PDFToText.submit=转换
#PDFToHTML #PDFToHTML
PDFToHTML.title=PDF To HTML PDFToHTML.title=PDF To HTML
PDFToHTML.header=将PDF转换成HTML PDFToHTML.header=将PDF转换成HTML
PDFToHTML.credit=此服务使用LibreOffice进行文件转换。 PDFToHTML.credit=此服务使用pdftohtml进行文件转换。
PDFToHTML.submit=转换 PDFToHTML.submit=转换

View File

@ -957,7 +957,7 @@ PDFToText.submit=轉換
#PDFToHTML #PDFToHTML
PDFToHTML.title=PDF 轉 HTML PDFToHTML.title=PDF 轉 HTML
PDFToHTML.header=PDF 轉 HTML PDFToHTML.header=PDF 轉 HTML
PDFToHTML.credit=此服務使用 LibreOffice 進行檔案轉換。 PDFToHTML.credit=此服務使用 pdftohtml 進行檔案轉換。
PDFToHTML.submit=轉換 PDFToHTML.submit=轉換