diff --git a/app/core/build.gradle b/app/core/build.gradle
index 6e952f59e..b75eaeea0 100644
--- a/app/core/build.gradle
+++ b/app/core/build.gradle
@@ -66,6 +66,7 @@ dependencies {
     implementation 'javax.xml.bind:jaxb-api:2.3.1'
     implementation 'com.sun.xml.bind:jaxb-impl:2.3.9'
     implementation 'com.sun.xml.bind:jaxb-core:2.3.0.1'
+    implementation 'org.apache.poi:poi-ooxml:5.5.1'
 
     // https://mvnrepository.com/artifact/technology.tabula/tabula
     implementation ('technology.tabula:tabula:1.0.5') {
diff --git a/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToExcelController.java b/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToExcelController.java
new file mode 100644
index 000000000..60b5679c9
--- /dev/null
+++ b/app/core/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToExcelController.java
@@ -0,0 +1,165 @@
+package stirling.software.SPDF.controller.api.converters;
+
+import java.io.ByteArrayOutputStream;
+import java.util.List;
+import java.util.Locale;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.poi.ss.SpreadsheetVersion;
+import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.ss.usermodel.Row;
+import org.apache.poi.ss.usermodel.Sheet;
+import org.apache.poi.ss.usermodel.Workbook;
+import org.apache.poi.ss.util.WorkbookUtil;
+import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+import org.springframework.http.ContentDisposition;
+import org.springframework.http.HttpHeaders;
+import org.springframework.http.MediaType;
+import org.springframework.http.ResponseEntity;
+import org.springframework.web.bind.annotation.ModelAttribute;
+
+import io.swagger.v3.oas.annotations.Operation;
+
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+
+import stirling.software.SPDF.model.api.PDFWithPageNums;
+import stirling.software.common.annotations.AutoJobPostMapping;
+import stirling.software.common.annotations.api.ConvertApi;
+import stirling.software.common.service.CustomPDFDocumentFactory;
+import stirling.software.common.util.GeneralUtils;
+
+import technology.tabula.ObjectExtractor;
+import technology.tabula.Page;
+import technology.tabula.RectangularTextContainer;
+import technology.tabula.Table;
+import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
+
+/**
+ * Converts the tables that Tabula detects in a PDF into an Excel (XLSX) workbook, writing each
+ * detected table to its own sheet.
+ */
+@ConvertApi
+@Slf4j
+@RequiredArgsConstructor
+public class ConvertPDFToExcelController {
+
+    /** Excel rejects sheet names longer than this many characters. */
+    private static final int MAX_SHEET_NAME_LENGTH = 31;
+
+    /** Longest string a single XLSX cell accepts; POI throws on anything longer. */
+    private static final int MAX_CELL_TEXT_LENGTH =
+            SpreadsheetVersion.EXCEL2007.getMaxTextLength();
+
+    private final CustomPDFDocumentFactory pdfDocumentFactory;
+
+    /**
+     * Extracts the tables on the requested pages and returns them as an XLSX download.
+     *
+     * @param request the uploaded PDF together with the page selection to scan
+     * @return the workbook as an attachment, or an empty 204 response when no table was found
+     * @throws Exception if the PDF cannot be loaded or the workbook cannot be written
+     */
+    @AutoJobPostMapping(value = "/pdf/xlsx", consumes = MediaType.MULTIPART_FORM_DATA_VALUE)
+    @Operation(
+            summary = "Convert a PDF to an Excel spreadsheet (XLSX)",
+            description =
+                    "Extracts tabular data from each page of a PDF and writes it into an Excel"
+                            + " workbook, one sheet per table. Input:PDF Output:XLSX Type:SISO")
+    public ResponseEntity<byte[]> pdfToExcel(@ModelAttribute PDFWithPageNums request)
+            throws Exception {
+        String baseName =
+                GeneralUtils.removeExtension(request.getFileInput().getOriginalFilename());
+
+        try (PDDocument document = pdfDocumentFactory.load(request);
+                XSSFWorkbook workbook = new XSSFWorkbook();
+                ObjectExtractor extractor = new ObjectExtractor(document)) {
+
+            List<Integer> pages = request.getPageNumbersList(document, true);
+            SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
+            int sheetCount = 0;
+
+            for (int pageNum : pages) {
+                Page page = extractor.extract(pageNum);
+                List<Table> tables = sea.extract(page);
+
+                for (int tableIdx = 0; tableIdx < tables.size(); tableIdx++) {
+                    // Single-table pages get the short name; otherwise tables are numbered.
+                    String sheetName =
+                            tables.size() == 1
+                                    ? String.format(Locale.ROOT, "Page %d", pageNum)
+                                    : String.format(
+                                            Locale.ROOT,
+                                            "Page %d Table %d",
+                                            pageNum,
+                                            tableIdx + 1);
+
+                    Sheet sheet = workbook.createSheet(getUniqueSheetName(workbook, sheetName));
+                    writeTable(sheet, tables.get(tableIdx));
+                    sheetCount++;
+                }
+            }
+
+            // Nothing was extracted: answer 204 rather than sending an empty workbook.
+            if (sheetCount == 0) {
+                return ResponseEntity.noContent().build();
+            }
+
+            ByteArrayOutputStream baos = new ByteArrayOutputStream();
+            workbook.write(baos);
+
+            HttpHeaders headers = new HttpHeaders();
+            headers.setContentDisposition(
+                    ContentDisposition.builder("attachment").filename(baseName + ".xlsx").build());
+            headers.setContentType(
+                    MediaType.parseMediaType(
+                            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"));
+
+            return ResponseEntity.ok().headers(headers).body(baos.toByteArray());
+        }
+    }
+
+    /** Writes each cell of {@code table} into {@code sheet} at the same row and column. */
+    @SuppressWarnings("rawtypes") // Tabula's Table#getRows() exposes the raw container type
+    private void writeTable(Sheet sheet, Table table) {
+        List<List<RectangularTextContainer>> rows = table.getRows();
+        for (int rowIdx = 0; rowIdx < rows.size(); rowIdx++) {
+            Row excelRow = sheet.createRow(rowIdx);
+            List<RectangularTextContainer> cells = rows.get(rowIdx);
+            for (int cellIdx = 0; cellIdx < cells.size(); cellIdx++) {
+                Cell excelCell = excelRow.createCell(cellIdx);
+                excelCell.setCellValue(toCellText(cells.get(cellIdx).getText()));
+            }
+        }
+    }
+
+    /** Trims text to the XLSX per-cell limit so one oversized cell cannot fail the export. */
+    private static String toCellText(String text) {
+        if (text == null || text.length() <= MAX_CELL_TEXT_LENGTH) {
+            return text;
+        }
+        int end = MAX_CELL_TEXT_LENGTH;
+        // Do not cut a surrogate pair in half.
+        if (Character.isHighSurrogate(text.charAt(end - 1))) {
+            end--;
+        }
+        return text.substring(0, end);
+    }
+
+    /**
+     * Returns a sheet name that is valid for Excel and not yet used in {@code workbook}. On a
+     * clash, " (n)" is appended and the base name shortened so the result fits the length limit.
+     */
+    private String getUniqueSheetName(Workbook workbook, String baseName) {
+        String safeName = WorkbookUtil.createSafeSheetName(baseName);
+        String uniqueName = safeName;
+        int count = 1;
+        while (workbook.getSheet(uniqueName) != null) {
+            String suffix = " (" + count + ")";
+            int keep = Math.min(safeName.length(), MAX_SHEET_NAME_LENGTH - suffix.length());
+            uniqueName = safeName.substring(0, keep) + suffix;
+            count++;
+        }
+        return uniqueName;
+    }
+}
diff --git a/frontend/public/locales/en-GB/translation.toml b/frontend/public/locales/en-GB/translation.toml
index 02f168117..36a823739 100644
--- a/frontend/public/locales/en-GB/translation.toml
+++ b/frontend/public/locales/en-GB/translation.toml
@@ -4649,6 +4649,13 @@ prompt = "Choose page to extract table"
 submit = "Extract"
 title = "PDF to CSV"
 
+[PDFToXLSX]
+header = "PDF
to Excel" +prompt = "Choose pages to extract tables" +submit = "Convert" +title = "PDF to Excel (XLSX)" +tags = "spreadsheet,excel,xlsx,table,extract,convert" + [PDFToHTML] credit = "This service uses pdftohtml for file conversion." header = "PDF to HTML" diff --git a/frontend/src/core/constants/convertConstants.ts b/frontend/src/core/constants/convertConstants.ts index bdb9caa6b..2c351753b 100644 --- a/frontend/src/core/constants/convertConstants.ts +++ b/frontend/src/core/constants/convertConstants.ts @@ -28,6 +28,7 @@ export const CONVERSION_ENDPOINTS = { 'pdf-office-presentation': '/api/v1/convert/pdf/presentation', 'pdf-office-text': '/api/v1/convert/pdf/text', 'pdf-csv': '/api/v1/convert/pdf/csv', + 'pdf-xlsx': '/api/v1/convert/pdf/xlsx', 'pdf-markdown': '/api/v1/convert/pdf/markdown', 'pdf-html': '/api/v1/convert/pdf/html', 'pdf-xml': '/api/v1/convert/pdf/xml', @@ -54,6 +55,7 @@ export const ENDPOINT_NAMES = { 'pdf-office-presentation': 'pdf-to-presentation', 'pdf-office-text': 'pdf-to-text', 'pdf-csv': 'pdf-to-csv', + 'pdf-xlsx': 'pdf-to-xlsx', 'pdf-markdown': 'pdf-to-markdown', 'pdf-html': 'pdf-to-html', 'pdf-xml': 'pdf-to-xml', @@ -116,6 +118,7 @@ export const TO_FORMAT_OPTIONS = [ { value: 'cbz', label: 'CBZ', group: 'Archive' }, { value: 'cbr', label: 'CBR', group: 'Archive' }, { value: 'csv', label: 'CSV', group: 'Spreadsheet' }, + { value: 'xlsx', label: 'XLSX', group: 'Spreadsheet' }, { value: 'pptx', label: 'PPTX', group: 'Presentation' }, { value: 'odp', label: 'ODP', group: 'Presentation' }, { value: 'txt', label: 'TXT', group: 'Text' }, @@ -137,7 +140,7 @@ export const TO_FORMAT_OPTIONS = [ export const CONVERSION_MATRIX: Record = { 'any': ['pdf'], // Mixed files always convert to PDF 'image': ['pdf'], // Multiple images always convert to PDF - 'pdf': ['png', 'jpg', 'gif', 'tiff', 'bmp', 'webp', 'docx', 'odt', 'pptx', 'odp', 'csv', 'txt', 'rtf', 'md', 'html', 'xml', 'pdfa', 'pdfx', 'cbz', 'cbr', 'epub', 'azw3'], + 'pdf': ['png', 'jpg', 'gif', 
'tiff', 'bmp', 'webp', 'docx', 'odt', 'pptx', 'odp', 'csv', 'xlsx', 'txt', 'rtf', 'md', 'html', 'xml', 'pdfa', 'pdfx', 'cbz', 'cbr', 'epub', 'azw3'], 'cbz': ['pdf'], 'docx': ['pdf'], 'doc': ['pdf'], 'odt': ['pdf'], 'xlsx': ['pdf'], 'xls': ['pdf'], 'ods': ['pdf'], @@ -162,6 +165,7 @@ export const EXTENSION_TO_ENDPOINT: Record> = { 'docx': 'pdf-to-word', 'odt': 'pdf-to-word', 'pptx': 'pdf-to-presentation', 'odp': 'pdf-to-presentation', 'csv': 'pdf-to-csv', + 'xlsx': 'pdf-to-xlsx', 'txt': 'pdf-to-text', 'rtf': 'pdf-to-text', 'md': 'pdf-to-markdown', 'html': 'pdf-to-html', 'xml': 'pdf-to-xml', 'pdfa': 'pdf-to-pdfa', diff --git a/frontend/src/core/data/useTranslatedToolRegistry.tsx b/frontend/src/core/data/useTranslatedToolRegistry.tsx index f055f9c2c..9ae96bcb8 100644 --- a/frontend/src/core/data/useTranslatedToolRegistry.tsx +++ b/frontend/src/core/data/useTranslatedToolRegistry.tsx @@ -916,6 +916,7 @@ export function useTranslatedToolCatalog(): TranslatedToolCatalog { "markdown-to-pdf", "file-to-pdf", "pdf-to-csv", + "pdf-to-xlsx", "pdf-to-markdown", "pdf-to-pdfa", "eml-to-pdf", diff --git a/frontend/src/core/hooks/tools/convert/useConvertOperation.ts b/frontend/src/core/hooks/tools/convert/useConvertOperation.ts index 2399ceaaf..cf805fc9c 100644 --- a/frontend/src/core/hooks/tools/convert/useConvertOperation.ts +++ b/frontend/src/core/hooks/tools/convert/useConvertOperation.ts @@ -21,8 +21,8 @@ export const shouldProcessFilesSeparately = ( (parameters.fromExtension === 'pdf' && isImageFormat(parameters.toExtension)) || // PDF to PDF/A and PDF/X conversions (each PDF should be processed separately) (parameters.fromExtension === 'pdf' && (parameters.toExtension === 'pdfa' || parameters.toExtension === 'pdfx')) || - // PDF to text-like formats should be one output per input - (parameters.fromExtension === 'pdf' && ['txt', 'rtf', 'csv'].includes(parameters.toExtension)) || + // PDF to text-like/spreadsheet formats should be one output per input + 
(parameters.fromExtension === 'pdf' && ['txt', 'rtf', 'csv', 'xlsx'].includes(parameters.toExtension)) || // PDF to CBR conversions (each PDF should generate its own archive) (parameters.fromExtension === 'pdf' && parameters.toExtension === 'cbr') || // PDF to EPUB/AZW3 conversions (each PDF should generate its own ebook) @@ -85,6 +85,8 @@ export const buildConvertFormData = (parameters: ConvertParameters, selectedFile formData.append("outputFormat", pdfxOptions?.outputFormat || 'pdfx'); } else if (fromExtension === 'pdf' && toExtension === 'csv') { formData.append("pageNumbers", "all"); + } else if (fromExtension === 'pdf' && toExtension === 'xlsx') { + formData.append("pageNumbers", "all"); } else if (fromExtension === 'cbr' && toExtension === 'pdf') { formData.append("optimizeForEbook", cbrOptions.optimizeForEbook.toString()); } else if (fromExtension === 'pdf' && toExtension === 'cbr') { diff --git a/frontend/src/core/tests/helpers/conversionEndpointDiscovery.ts b/frontend/src/core/tests/helpers/conversionEndpointDiscovery.ts index 5bb26b48f..6ff8d5d7c 100644 --- a/frontend/src/core/tests/helpers/conversionEndpointDiscovery.ts +++ b/frontend/src/core/tests/helpers/conversionEndpointDiscovery.ts @@ -107,6 +107,13 @@ const ALL_CONVERSION_ENDPOINTS: ConversionEndpoint[] = [ description: 'Extract CSV data from PDF', apiPath: '/api/v1/convert/pdf/csv' }, + { + endpoint: 'pdf-to-xlsx', + fromFormat: 'pdf', + toFormat: 'xlsx', + description: 'Extract Excel spreadsheet from PDF', + apiPath: '/api/v1/convert/pdf/xlsx' + }, { endpoint: 'pdf-to-markdown', fromFormat: 'pdf', diff --git a/frontend/src/core/utils/urlMapping.ts b/frontend/src/core/utils/urlMapping.ts index f7bb0d666..118426ddf 100644 --- a/frontend/src/core/utils/urlMapping.ts +++ b/frontend/src/core/utils/urlMapping.ts @@ -23,6 +23,7 @@ export const URL_TO_TOOL_MAP: Record = { '/html-to-pdf': 'convert', '/markdown-to-pdf': 'convert', '/pdf-to-csv': 'convert', + '/pdf-to-xlsx': 'convert', '/pdf-to-img': 
'convert', '/pdf-to-markdown': 'convert', '/pdf-to-pdfa': 'convert', diff --git a/frontend/src/proprietary/components/shared/config/configSections/AdminEndpointsSection.tsx b/frontend/src/proprietary/components/shared/config/configSections/AdminEndpointsSection.tsx index df0e5412e..5a593a989 100644 --- a/frontend/src/proprietary/components/shared/config/configSections/AdminEndpointsSection.tsx +++ b/frontend/src/proprietary/components/shared/config/configSections/AdminEndpointsSection.tsx @@ -143,6 +143,7 @@ export default function AdminEndpointsSection() { 'ocr-pdf', 'overlay-pdf', 'pdf-to-csv', + 'pdf-to-xlsx', 'pdf-to-epub', 'pdf-to-html', 'pdf-to-img',