From afad06bed4e29b0fc1ed4d33aac4234311edce81 Mon Sep 17 00:00:00 2001 From: Omar Ahmed Hassan <98468609+omar-ahmed42@users.noreply.github.com> Date: Sun, 24 Nov 2024 01:28:44 +0200 Subject: [PATCH] Extract tables from PDF to CSV using Tabula (#2312) * Add Tabula dependency and exclude slf4j-simple - Add tabula-java dependency to extract tables into CSV. - Exclude slf4j-simple due to Logback * Add a flexible CSVWriter - Add FlexibleCSVWriter, which extends CSVWriter to pass a custom CSVFormat, as CSVWriter's parameterized constructor (that allows changing CSVFormat) is protected. * Use Tabula in extracting tables from PDF - Use Tabula to extract tables from PDF instead of the existing implementation * Delete PDFTableStripper as it is unneeded - Delete PDFTableStripper; it is unneeded because Tabula-Java is used instead. * Use correct class in ExtractCSVController logger * Exclude gson and bcprov-jdk15on dependencies from tabula - Exclude gson and bcprov-jdk15on from tabula-java due to detected security vulnerabilities. 
--- build.gradle | 7 + .../api/converters/ExtractCSVController.java | 112 ++---- .../api/strippers/PDFTableStripper.java | 327 ------------------ .../software/SPDF/pdf/FlexibleCSVWriter.java | 16 + 4 files changed, 43 insertions(+), 419 deletions(-) delete mode 100644 src/main/java/stirling/software/SPDF/controller/api/strippers/PDFTableStripper.java create mode 100644 src/main/java/stirling/software/SPDF/pdf/FlexibleCSVWriter.java diff --git a/build.gradle b/build.gradle index d64bd28b..7f19329b 100644 --- a/build.gradle +++ b/build.gradle @@ -203,6 +203,13 @@ dependencies { exclude group: "commons-logging", module: "commons-logging" } + // https://mvnrepository.com/artifact/technology.tabula/tabula + implementation ('technology.tabula:tabula:1.0.5') { + exclude group: "org.slf4j", module: "slf4j-simple" + exclude group: "org.bouncycastle", module: "bcprov-jdk15on" + exclude group: "com.google.code.gson", module: "gson" + } + implementation 'org.apache.pdfbox:jbig2-imageio:3.0.4' implementation "org.bouncycastle:bcprov-jdk18on:$bouncycastleVersion" diff --git a/src/main/java/stirling/software/SPDF/controller/api/converters/ExtractCSVController.java b/src/main/java/stirling/software/SPDF/controller/api/converters/ExtractCSVController.java index 8cf8aa4f..a6415bfc 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/converters/ExtractCSVController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/converters/ExtractCSVController.java @@ -1,12 +1,12 @@ package stirling.software.SPDF.controller.api.converters; import java.io.StringWriter; -import java.util.ArrayList; import java.util.List; +import org.apache.commons.csv.CSVFormat; +import org.apache.commons.csv.QuoteMode; import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.PDPage; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.http.ContentDisposition; @@ -18,79 +18,36 @@ import 
org.springframework.web.bind.annotation.PostMapping; import org.springframework.web.bind.annotation.RequestMapping; import org.springframework.web.bind.annotation.RestController; -import com.opencsv.CSVWriter; - import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.tags.Tag; - import stirling.software.SPDF.controller.api.CropController; -import stirling.software.SPDF.controller.api.strippers.PDFTableStripper; import stirling.software.SPDF.model.api.extract.PDFFilePage; +import stirling.software.SPDF.pdf.FlexibleCSVWriter; +import technology.tabula.ObjectExtractor; +import technology.tabula.Page; +import technology.tabula.Table; +import technology.tabula.extractors.SpreadsheetExtractionAlgorithm; +import technology.tabula.writers.Writer; @RestController @RequestMapping("/api/v1/convert") @Tag(name = "Convert", description = "Convert APIs") public class ExtractCSVController { - private static final Logger logger = LoggerFactory.getLogger(CropController.class); + private static final Logger logger = LoggerFactory.getLogger(ExtractCSVController.class); @PostMapping(value = "/pdf/csv", consumes = "multipart/form-data") - @Operation( - summary = "Extracts a CSV document from a PDF", - description = - "This operation takes an input PDF file and returns CSV file of whole page. Input:PDF Output:CSV Type:SISO") + @Operation(summary = "Extracts a CSV document from a PDF", description = "This operation takes an input PDF file and returns CSV file of whole page. 
Input:PDF Output:CSV Type:SISO") public ResponseEntity PdfToCsv(@ModelAttribute PDFFilePage form) throws Exception { - - ArrayList tableData = new ArrayList<>(); - int columnsCount = 0; - - try (PDDocument document = Loader.loadPDF(form.getFileInput().getBytes())) { - final double res = 72; // PDF units are at 72 DPI - PDFTableStripper stripper = new PDFTableStripper(); - PDPage pdPage = document.getPage(form.getPageId() - 1); - stripper.extractTable(pdPage); - columnsCount = stripper.getColumns(); - for (int c = 0; c < columnsCount; ++c) { - for (int r = 0; r < stripper.getRows(); ++r) { - tableData.add(stripper.getText(r, c)); - } - } - } - - ArrayList notEmptyColumns = new ArrayList<>(); - - for (String item : tableData) { - if (!item.trim().isEmpty()) { - notEmptyColumns.add(item); - } else { - columnsCount--; - } - } - - List fullTable = - notEmptyColumns.stream() - .map( - (entity) -> - entity.replace('\n', ' ') - .replace('\r', ' ') - .trim() - .replaceAll("\\s{2,}", "|")) - .toList(); - - int rowsCount = fullTable.get(0).split("\\|").length; - - ArrayList headersList = getTableHeaders(columnsCount, fullTable); - ArrayList recordList = getRecordsList(rowsCount, fullTable); - - if (headersList.size() == 0 && recordList.size() == 0) { - throw new Exception("No table detected, no headers or records found"); - } - StringWriter writer = new StringWriter(); - try (CSVWriter csvWriter = new CSVWriter(writer)) { - csvWriter.writeNext(headersList.toArray(new String[0])); - for (String record : recordList) { - csvWriter.writeNext(record.split("\\|")); + try (PDDocument document = Loader.loadPDF(form.getFileInput().getBytes())) { + CSVFormat format = CSVFormat.EXCEL.builder().setEscape('"').setQuoteMode(QuoteMode.ALL).build(); + Writer csvWriter = new FlexibleCSVWriter(format); + SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm(); + try (ObjectExtractor extractor = new ObjectExtractor(document)) { + Page page = 
extractor.extract(form.getPageId()); + List tables = sea.extract(page); + csvWriter.write(writer, tables); } } @@ -99,41 +56,12 @@ public class ExtractCSVController { ContentDisposition.builder("attachment") .filename( form.getFileInput() - .getOriginalFilename() - .replaceFirst("[.][^.]+$", "") + .getOriginalFilename() + .replaceFirst("[.][^.]+$", "") + "_extracted.csv") .build()); headers.setContentType(MediaType.parseMediaType("text/csv")); return ResponseEntity.ok().headers(headers).body(writer.toString()); } - - private ArrayList getRecordsList(int rowsCounts, List items) { - ArrayList recordsList = new ArrayList<>(); - - for (int b = 1; b < rowsCounts; b++) { - StringBuilder strbldr = new StringBuilder(); - - for (int i = 0; i < items.size(); i++) { - String[] parts = items.get(i).split("\\|"); - strbldr.append(parts[b]); - if (i != items.size() - 1) { - strbldr.append("|"); - } - } - recordsList.add(strbldr.toString()); - } - - return recordsList; - } - - private ArrayList getTableHeaders(int columnsCount, List items) { - ArrayList resultList = new ArrayList<>(); - for (int i = 0; i < columnsCount; i++) { - String[] parts = items.get(i).split("\\|"); - resultList.add(parts[0]); - } - - return resultList; - } } diff --git a/src/main/java/stirling/software/SPDF/controller/api/strippers/PDFTableStripper.java b/src/main/java/stirling/software/SPDF/controller/api/strippers/PDFTableStripper.java deleted file mode 100644 index 0ea3e131..00000000 --- a/src/main/java/stirling/software/SPDF/controller/api/strippers/PDFTableStripper.java +++ /dev/null @@ -1,327 +0,0 @@ -package stirling.software.SPDF.controller.api.strippers; - -import java.awt.Shape; -import java.awt.geom.AffineTransform; -import java.awt.geom.Rectangle2D; -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.io.OutputStreamWriter; -import java.io.Writer; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.Iterator; -import java.util.LinkedList; -import 
java.util.List; -import java.util.Set; - -import org.apache.fontbox.util.BoundingBox; -import org.apache.pdfbox.pdmodel.PDPage; -import org.apache.pdfbox.pdmodel.common.PDRectangle; -import org.apache.pdfbox.pdmodel.font.PDFont; -import org.apache.pdfbox.pdmodel.font.PDType3Font; -import org.apache.pdfbox.text.PDFTextStripper; -import org.apache.pdfbox.text.PDFTextStripperByArea; -import org.apache.pdfbox.text.TextPosition; - -/** - * Class to extract tabular data from a PDF. Works by making a first pass of the page to group all - * nearby text items together, and then inferring a 2D grid from these regions. Each table cell is - * then extracted using a PDFTextStripperByArea object. - * - *

Works best when headers are included in the detected region, to ensure representative text in - * every column. - * - *

Based upon DrawPrintTextLocations PDFBox example - * (https://svn.apache.org/viewvc/pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/util/DrawPrintTextLocations.java) - * - * @author Beldaz - */ -public class PDFTableStripper extends PDFTextStripper { - - /** - * This will print the documents data, for each table cell. - * - * @param args The command line arguments. - * @throws IOException If there is an error parsing the document. - */ - /* - * Used in methods derived from DrawPrintTextLocations - */ - private AffineTransform flipAT; - - private AffineTransform rotateAT; - - /** Regions updated by calls to writeString */ - private Set boxes; - - // Border to allow when finding intersections - private double dx = 1.0; // This value works for me, feel free to tweak (or add setter) - private double dy = 0.000; // Rows of text tend to overlap, so need to extend - - /** Region in which to find table (otherwise whole page) */ - private Rectangle2D regionArea; - - /** Number of rows in inferred table */ - private int nRows = 0; - - /** Number of columns in inferred table */ - private int nCols = 0; - - /** This is the object that does the text extraction */ - private PDFTextStripperByArea regionStripper; - - /** - * 1D intervals - used for calculateTableRegions() - * - * @author Beldaz - */ - public static class Interval { - double start; - double end; - - public Interval(double start, double end) { - this.start = start; - this.end = end; - } - - public void add(Interval col) { - if (col.start < start) start = col.start; - if (col.end > end) end = col.end; - } - - public static void addTo(Interval x, LinkedList columns) { - int p = 0; - Iterator it = columns.iterator(); - // Find where x should go - while (it.hasNext()) { - Interval col = it.next(); - if (x.end >= col.start) { - if (x.start <= col.end) { // overlaps - x.add(col); - it.remove(); - } - break; - } - ++p; - } - while (it.hasNext()) { - Interval col = it.next(); - if (x.start > col.end) break; - 
x.add(col); - it.remove(); - } - columns.add(p, x); - } - } - - /** - * Instantiate a new PDFTableStripper object. - * - * @throws IOException If there is an error loading the properties. - */ - public PDFTableStripper() throws IOException { - super.setShouldSeparateByBeads(false); - regionStripper = new PDFTextStripperByArea(); - regionStripper.setSortByPosition(true); - } - - /** - * Define the region to group text by. - * - * @param rect The rectangle area to retrieve the text from. - */ - public void setRegion(Rectangle2D rect) { - regionArea = rect; - } - - public int getRows() { - return nRows; - } - - public int getColumns() { - return nCols; - } - - /** - * Get the text for the region, this should be called after extractTable(). - * - * @return The text that was identified in that region. - */ - public String getText(int row, int col) { - return regionStripper.getTextForRegion("el" + col + "x" + row); - } - - public void extractTable(PDPage pdPage) throws IOException { - setStartPage(getCurrentPageNo()); - setEndPage(getCurrentPageNo()); - - boxes = new HashSet(); - // flip y-axis - flipAT = new AffineTransform(); - flipAT.translate(0, pdPage.getBBox().getHeight()); - flipAT.scale(1, -1); - - // page may be rotated - rotateAT = new AffineTransform(); - int rotation = pdPage.getRotation(); - if (rotation != 0) { - PDRectangle mediaBox = pdPage.getMediaBox(); - switch (rotation) { - case 90: - rotateAT.translate(mediaBox.getHeight(), 0); - break; - case 270: - rotateAT.translate(0, mediaBox.getWidth()); - break; - case 180: - rotateAT.translate(mediaBox.getWidth(), mediaBox.getHeight()); - break; - default: - break; - } - rotateAT.rotate(Math.toRadians(rotation)); - } - // Trigger processing of the document so that writeString is called. 
- try (Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream())) { - super.output = dummy; - super.processPage(pdPage); - } - - Rectangle2D[][] regions = calculateTableRegions(); - - // System.err.println("Drawing " + nCols + "x" + nRows + "="+ nRows*nCols + " - // regions"); - for (int i = 0; i < nCols; ++i) { - for (int j = 0; j < nRows; ++j) { - final Rectangle2D region = regions[i][j]; - regionStripper.addRegion("el" + i + "x" + j, region); - } - } - - regionStripper.extractRegions(pdPage); - } - - /** - * Infer a rectangular grid of regions from the boxes field. - * - * @return 2D array of table regions (as Rectangle2D objects). Note that some of these regions - * may have no content. - */ - private Rectangle2D[][] calculateTableRegions() { - - // Build up a list of all table regions, based upon the populated - // regions of boxes field. Treats the horizontal and vertical extents - // of each box as distinct - LinkedList columns = new LinkedList(); - LinkedList rows = new LinkedList(); - - for (Rectangle2D box : boxes) { - Interval x = new Interval(box.getMinX(), box.getMaxX()); - Interval y = new Interval(box.getMinY(), box.getMaxY()); - - Interval.addTo(x, columns); - Interval.addTo(y, rows); - } - - nRows = rows.size(); - nCols = columns.size(); - Rectangle2D[][] regions = new Rectangle2D[nCols][nRows]; - int i = 0; - // Label regions from top left, rather than the transformed orientation - for (Interval column : columns) { - int j = 0; - for (Interval row : rows) { - regions[nCols - i - 1][nRows - j - 1] = - new Rectangle2D.Double( - column.start, - row.start, - column.end - column.start, - row.end - row.start); - ++j; - } - ++i; - } - - return regions; - } - - /** - * Register each character's bounding box, updating boxes field to maintain a list of all - * distinct groups of characters. - * - *

Overrides the default functionality of PDFTextStripper. Most of this is taken from - * DrawPrintTextLocations.java, with extra steps at end of main loop - */ - @Override - protected void writeString(String string, List textPositions) throws IOException { - for (TextPosition text : textPositions) { - // glyph space -> user space - // note: text.getTextMatrix() is *not* the Text Matrix, it's the Text Rendering Matrix - AffineTransform at = text.getTextMatrix().createAffineTransform(); - PDFont font = text.getFont(); - BoundingBox bbox = font.getBoundingBox(); - - // advance width, bbox height (glyph space) - float xadvance = - font.getWidth(text.getCharacterCodes()[0]); // todo: should iterate all chars - Rectangle2D.Float rect = - new Rectangle2D.Float(0, bbox.getLowerLeftY(), xadvance, bbox.getHeight()); - - if (font instanceof PDType3Font) { - // bbox and font matrix are unscaled - at.concatenate(font.getFontMatrix().createAffineTransform()); - } else { - // bbox and font matrix are already scaled to 1000 - at.scale(1 / 1000f, 1 / 1000f); - } - Shape s = at.createTransformedShape(rect); - s = flipAT.createTransformedShape(s); - s = rotateAT.createTransformedShape(s); - - // - // Merge character's bounding box with boxes field - // - Rectangle2D bounds = s.getBounds2D(); - // Pad sides to detect almost touching boxes - Rectangle2D hitbox = bounds.getBounds2D(); - hitbox.add(bounds.getMinX() - dx, bounds.getMinY() - dy); - hitbox.add(bounds.getMaxX() + dx, bounds.getMaxY() + dy); - - // Find all overlapping boxes - List intersectList = new ArrayList(); - for (Rectangle2D box : boxes) { - if (box.intersects(hitbox)) { - intersectList.add(box); - } - } - - // Combine all touching boxes and update - // (NOTE: Potentially this could leave some overlapping boxes un-merged, - // but it's sufficient for now and get's fixed up in calculateTableRegions) - for (Rectangle2D box : intersectList) { - bounds.add(box); - boxes.remove(box); - } - boxes.add(bounds); - } - } - - /** 
- * This method does nothing in this derived class, because beads and regions are incompatible. - * Beads are ignored when stripping by area. - * - * @param aShouldSeparateByBeads The new grouping of beads. - */ - @Override - public final void setShouldSeparateByBeads(boolean aShouldSeparateByBeads) {} - - /** Adapted from PDFTextStripperByArea {@inheritDoc} */ - @Override - protected void processTextPosition(TextPosition text) { - if (regionArea != null && !regionArea.contains(text.getX(), text.getY())) { - // skip character - } else { - super.processTextPosition(text); - } - } -} diff --git a/src/main/java/stirling/software/SPDF/pdf/FlexibleCSVWriter.java b/src/main/java/stirling/software/SPDF/pdf/FlexibleCSVWriter.java new file mode 100644 index 00000000..94a48d93 --- /dev/null +++ b/src/main/java/stirling/software/SPDF/pdf/FlexibleCSVWriter.java @@ -0,0 +1,16 @@ +package stirling.software.SPDF.pdf; + +import org.apache.commons.csv.CSVFormat; + +import technology.tabula.writers.CSVWriter; + +public class FlexibleCSVWriter extends CSVWriter { + + public FlexibleCSVWriter() { + super(); + } + + public FlexibleCSVWriter(CSVFormat csvFormat) { + super(csvFormat); + } +}