From afad06bed4e29b0fc1ed4d33aac4234311edce81 Mon Sep 17 00:00:00 2001
From: Omar Ahmed Hassan <98468609+omar-ahmed42@users.noreply.github.com>
Date: Sun, 24 Nov 2024 01:28:44 +0200
Subject: [PATCH] Extract tables from PDF to CSV using Tabula (#2312)

* Add Tabula dependency and exclude slf4j-simple

- Add tabula-java dependency to extract tables into CSV.
- Exclude slf4j-simple due to Logback

* Add a flexible CSVWriter

- Add FlexibleCSVWriter, which extends Tabula's CSVWriter so that a custom CSVFormat can be passed in; the CSVWriter constructor that accepts a CSVFormat is protected.

* Use Tabula to extract tables from PDFs

- Use Tabula to extract tables from PDFs instead of the existing implementation

* Delete PDFTableStripper as it is no longer needed

- Delete PDFTableStripper, which is no longer needed now that tabula-java is used instead.

* Use correct class in ExtractCSVController logger

* Exclude gson and bcprov-jdk15on dependencies from tabula

- Exclude gson and bcprov-jdk15on from tabula-java due to detected security vulnerabilities.
---
 build.gradle                                  |   7 +
 .../api/converters/ExtractCSVController.java  | 112 ++----
 .../api/strippers/PDFTableStripper.java       | 327 ------------------
 .../software/SPDF/pdf/FlexibleCSVWriter.java  |  16 +
 4 files changed, 43 insertions(+), 419 deletions(-)
 delete mode 100644 src/main/java/stirling/software/SPDF/controller/api/strippers/PDFTableStripper.java
 create mode 100644 src/main/java/stirling/software/SPDF/pdf/FlexibleCSVWriter.java

diff --git a/build.gradle b/build.gradle
index d64bd28b..7f19329b 100644
--- a/build.gradle
+++ b/build.gradle
@@ -203,6 +203,13 @@ dependencies {
         exclude group: "commons-logging", module: "commons-logging"
     }
 
+    // Tabula (PDF table extraction): https://mvnrepository.com/artifact/technology.tabula/tabula
+    implementation ('technology.tabula:tabula:1.0.5')  {
+        exclude group: "org.slf4j", module: "slf4j-simple"
+        exclude group: "org.bouncycastle", module: "bcprov-jdk15on"
+        exclude group: "com.google.code.gson", module: "gson"
+    }
+
     implementation 'org.apache.pdfbox:jbig2-imageio:3.0.4'
 
     implementation "org.bouncycastle:bcprov-jdk18on:$bouncycastleVersion"
diff --git a/src/main/java/stirling/software/SPDF/controller/api/converters/ExtractCSVController.java b/src/main/java/stirling/software/SPDF/controller/api/converters/ExtractCSVController.java
index 8cf8aa4f..a6415bfc 100644
--- a/src/main/java/stirling/software/SPDF/controller/api/converters/ExtractCSVController.java
+++ b/src/main/java/stirling/software/SPDF/controller/api/converters/ExtractCSVController.java
@@ -1,12 +1,12 @@
 package stirling.software.SPDF.controller.api.converters;
 
 import java.io.StringWriter;
-import java.util.ArrayList;
 import java.util.List;
 
+import org.apache.commons.csv.CSVFormat;
+import org.apache.commons.csv.QuoteMode;
 import org.apache.pdfbox.Loader;
 import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.pdmodel.PDPage;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.springframework.http.ContentDisposition;
@@ -18,79 +18,36 @@ import org.springframework.web.bind.annotation.PostMapping;
 import org.springframework.web.bind.annotation.RequestMapping;
 import org.springframework.web.bind.annotation.RestController;
 
-import com.opencsv.CSVWriter;
-
 import io.swagger.v3.oas.annotations.Operation;
 import io.swagger.v3.oas.annotations.tags.Tag;
-
 import stirling.software.SPDF.controller.api.CropController;
-import stirling.software.SPDF.controller.api.strippers.PDFTableStripper;
 import stirling.software.SPDF.model.api.extract.PDFFilePage;
+import stirling.software.SPDF.pdf.FlexibleCSVWriter;
+import technology.tabula.ObjectExtractor;
+import technology.tabula.Page;
+import technology.tabula.Table;
+import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
+import technology.tabula.writers.Writer;
 
 @RestController
 @RequestMapping("/api/v1/convert")
 @Tag(name = "Convert", description = "Convert APIs")
 public class ExtractCSVController {
 
-    private static final Logger logger = LoggerFactory.getLogger(CropController.class);
+    private static final Logger logger = LoggerFactory.getLogger(ExtractCSVController.class);
 
     @PostMapping(value = "/pdf/csv", consumes = "multipart/form-data")
-    @Operation(
-            summary = "Extracts a CSV document from a PDF",
-            description =
-                    "This operation takes an input PDF file and returns CSV file of whole page. Input:PDF Output:CSV Type:SISO")
+    @Operation(summary = "Extracts a CSV document from a PDF", description = "This operation takes an input PDF file and returns CSV file of whole page. Input:PDF Output:CSV Type:SISO")
     public ResponseEntity<String> PdfToCsv(@ModelAttribute PDFFilePage form) throws Exception {
-
-        ArrayList<String> tableData = new ArrayList<>();
-        int columnsCount = 0;
-
-        try (PDDocument document = Loader.loadPDF(form.getFileInput().getBytes())) {
-            final double res = 72; // PDF units are at 72 DPI
-            PDFTableStripper stripper = new PDFTableStripper();
-            PDPage pdPage = document.getPage(form.getPageId() - 1);
-            stripper.extractTable(pdPage);
-            columnsCount = stripper.getColumns();
-            for (int c = 0; c < columnsCount; ++c) {
-                for (int r = 0; r < stripper.getRows(); ++r) {
-                    tableData.add(stripper.getText(r, c));
-                }
-            }
-        }
-
-        ArrayList<String> notEmptyColumns = new ArrayList<>();
-
-        for (String item : tableData) {
-            if (!item.trim().isEmpty()) {
-                notEmptyColumns.add(item);
-            } else {
-                columnsCount--;
-            }
-        }
-
-        List<String> fullTable =
-                notEmptyColumns.stream()
-                        .map(
-                                (entity) ->
-                                        entity.replace('\n', ' ')
-                                                .replace('\r', ' ')
-                                                .trim()
-                                                .replaceAll("\\s{2,}", "|"))
-                        .toList();
-
-        int rowsCount = fullTable.get(0).split("\\|").length;
-
-        ArrayList<String> headersList = getTableHeaders(columnsCount, fullTable);
-        ArrayList<String> recordList = getRecordsList(rowsCount, fullTable);
-
-        if (headersList.size() == 0 && recordList.size() == 0) {
-            throw new Exception("No table detected, no headers or records found");
-        }
-
         StringWriter writer = new StringWriter();
-        try (CSVWriter csvWriter = new CSVWriter(writer)) {
-            csvWriter.writeNext(headersList.toArray(new String[0]));
-            for (String record : recordList) {
-                csvWriter.writeNext(record.split("\\|"));
+        try (PDDocument document = Loader.loadPDF(form.getFileInput().getBytes())) {
+            CSVFormat format = CSVFormat.EXCEL.builder().setEscape('"').setQuoteMode(QuoteMode.ALL).build();
+            Writer csvWriter = new FlexibleCSVWriter(format);
+            SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
+            try (ObjectExtractor extractor = new ObjectExtractor(document)) {
+                Page page = extractor.extract(form.getPageId());
+                List<Table> tables = sea.extract(page);
+                csvWriter.write(writer, tables);
             }
         }
 
@@ -99,41 +56,12 @@ public class ExtractCSVController {
                 ContentDisposition.builder("attachment")
                         .filename(
                                 form.getFileInput()
-                                                .getOriginalFilename()
-                                                .replaceFirst("[.][^.]+$", "")
+                                        .getOriginalFilename()
+                                        .replaceFirst("[.][^.]+$", "")
                                         + "_extracted.csv")
                         .build());
         headers.setContentType(MediaType.parseMediaType("text/csv"));
 
         return ResponseEntity.ok().headers(headers).body(writer.toString());
     }
-
-    private ArrayList<String> getRecordsList(int rowsCounts, List<String> items) {
-        ArrayList<String> recordsList = new ArrayList<>();
-
-        for (int b = 1; b < rowsCounts; b++) {
-            StringBuilder strbldr = new StringBuilder();
-
-            for (int i = 0; i < items.size(); i++) {
-                String[] parts = items.get(i).split("\\|");
-                strbldr.append(parts[b]);
-                if (i != items.size() - 1) {
-                    strbldr.append("|");
-                }
-            }
-            recordsList.add(strbldr.toString());
-        }
-
-        return recordsList;
-    }
-
-    private ArrayList<String> getTableHeaders(int columnsCount, List<String> items) {
-        ArrayList<String> resultList = new ArrayList<>();
-        for (int i = 0; i < columnsCount; i++) {
-            String[] parts = items.get(i).split("\\|");
-            resultList.add(parts[0]);
-        }
-
-        return resultList;
-    }
 }
diff --git a/src/main/java/stirling/software/SPDF/controller/api/strippers/PDFTableStripper.java b/src/main/java/stirling/software/SPDF/controller/api/strippers/PDFTableStripper.java
deleted file mode 100644
index 0ea3e131..00000000
--- a/src/main/java/stirling/software/SPDF/controller/api/strippers/PDFTableStripper.java
+++ /dev/null
@@ -1,327 +0,0 @@
-package stirling.software.SPDF.controller.api.strippers;
-
-import java.awt.Shape;
-import java.awt.geom.AffineTransform;
-import java.awt.geom.Rectangle2D;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Set;
-
-import org.apache.fontbox.util.BoundingBox;
-import org.apache.pdfbox.pdmodel.PDPage;
-import org.apache.pdfbox.pdmodel.common.PDRectangle;
-import org.apache.pdfbox.pdmodel.font.PDFont;
-import org.apache.pdfbox.pdmodel.font.PDType3Font;
-import org.apache.pdfbox.text.PDFTextStripper;
-import org.apache.pdfbox.text.PDFTextStripperByArea;
-import org.apache.pdfbox.text.TextPosition;
-
-/**
- * Class to extract tabular data from a PDF. Works by making a first pass of the page to group all
- * nearby text items together, and then inferring a 2D grid from these regions. Each table cell is
- * then extracted using a PDFTextStripperByArea object.
- *
- * <p>Works best when headers are included in the detected region, to ensure representative text in
- * every column.
- *
- * <p>Based upon DrawPrintTextLocations PDFBox example
- * (https://svn.apache.org/viewvc/pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/util/DrawPrintTextLocations.java)
- *
- * @author Beldaz
- */
-public class PDFTableStripper extends PDFTextStripper {
-
-    /**
-     * This will print the documents data, for each table cell.
-     *
-     * @param args The command line arguments.
-     * @throws IOException If there is an error parsing the document.
-     */
-    /*
-     *  Used in methods derived from DrawPrintTextLocations
-     */
-    private AffineTransform flipAT;
-
-    private AffineTransform rotateAT;
-
-    /** Regions updated by calls to writeString */
-    private Set<Rectangle2D> boxes;
-
-    // Border to allow when finding intersections
-    private double dx = 1.0; // This value works for me, feel free to tweak (or add setter)
-    private double dy = 0.000; // Rows of text tend to overlap, so need to extend
-
-    /** Region in which to find table (otherwise whole page) */
-    private Rectangle2D regionArea;
-
-    /** Number of rows in inferred table */
-    private int nRows = 0;
-
-    /** Number of columns in inferred table */
-    private int nCols = 0;
-
-    /** This is the object that does the text extraction */
-    private PDFTextStripperByArea regionStripper;
-
-    /**
-     * 1D intervals - used for calculateTableRegions()
-     *
-     * @author Beldaz
-     */
-    public static class Interval {
-        double start;
-        double end;
-
-        public Interval(double start, double end) {
-            this.start = start;
-            this.end = end;
-        }
-
-        public void add(Interval col) {
-            if (col.start < start) start = col.start;
-            if (col.end > end) end = col.end;
-        }
-
-        public static void addTo(Interval x, LinkedList<Interval> columns) {
-            int p = 0;
-            Iterator<Interval> it = columns.iterator();
-            // Find where x should go
-            while (it.hasNext()) {
-                Interval col = it.next();
-                if (x.end >= col.start) {
-                    if (x.start <= col.end) { // overlaps
-                        x.add(col);
-                        it.remove();
-                    }
-                    break;
-                }
-                ++p;
-            }
-            while (it.hasNext()) {
-                Interval col = it.next();
-                if (x.start > col.end) break;
-                x.add(col);
-                it.remove();
-            }
-            columns.add(p, x);
-        }
-    }
-
-    /**
-     * Instantiate a new PDFTableStripper object.
-     *
-     * @throws IOException If there is an error loading the properties.
-     */
-    public PDFTableStripper() throws IOException {
-        super.setShouldSeparateByBeads(false);
-        regionStripper = new PDFTextStripperByArea();
-        regionStripper.setSortByPosition(true);
-    }
-
-    /**
-     * Define the region to group text by.
-     *
-     * @param rect The rectangle area to retrieve the text from.
-     */
-    public void setRegion(Rectangle2D rect) {
-        regionArea = rect;
-    }
-
-    public int getRows() {
-        return nRows;
-    }
-
-    public int getColumns() {
-        return nCols;
-    }
-
-    /**
-     * Get the text for the region, this should be called after extractTable().
-     *
-     * @return The text that was identified in that region.
-     */
-    public String getText(int row, int col) {
-        return regionStripper.getTextForRegion("el" + col + "x" + row);
-    }
-
-    public void extractTable(PDPage pdPage) throws IOException {
-        setStartPage(getCurrentPageNo());
-        setEndPage(getCurrentPageNo());
-
-        boxes = new HashSet<Rectangle2D>();
-        // flip y-axis
-        flipAT = new AffineTransform();
-        flipAT.translate(0, pdPage.getBBox().getHeight());
-        flipAT.scale(1, -1);
-
-        // page may be rotated
-        rotateAT = new AffineTransform();
-        int rotation = pdPage.getRotation();
-        if (rotation != 0) {
-            PDRectangle mediaBox = pdPage.getMediaBox();
-            switch (rotation) {
-                case 90:
-                    rotateAT.translate(mediaBox.getHeight(), 0);
-                    break;
-                case 270:
-                    rotateAT.translate(0, mediaBox.getWidth());
-                    break;
-                case 180:
-                    rotateAT.translate(mediaBox.getWidth(), mediaBox.getHeight());
-                    break;
-                default:
-                    break;
-            }
-            rotateAT.rotate(Math.toRadians(rotation));
-        }
-        // Trigger processing of the document so that writeString is called.
-        try (Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream())) {
-            super.output = dummy;
-            super.processPage(pdPage);
-        }
-
-        Rectangle2D[][] regions = calculateTableRegions();
-
-        //        System.err.println("Drawing " + nCols + "x" + nRows + "="+ nRows*nCols + "
-        // regions");
-        for (int i = 0; i < nCols; ++i) {
-            for (int j = 0; j < nRows; ++j) {
-                final Rectangle2D region = regions[i][j];
-                regionStripper.addRegion("el" + i + "x" + j, region);
-            }
-        }
-
-        regionStripper.extractRegions(pdPage);
-    }
-
-    /**
-     * Infer a rectangular grid of regions from the boxes field.
-     *
-     * @return 2D array of table regions (as Rectangle2D objects). Note that some of these regions
-     *     may have no content.
-     */
-    private Rectangle2D[][] calculateTableRegions() {
-
-        // Build up a list of all table regions, based upon the populated
-        // regions of boxes field. Treats the horizontal and vertical extents
-        // of each box as distinct
-        LinkedList<Interval> columns = new LinkedList<Interval>();
-        LinkedList<Interval> rows = new LinkedList<Interval>();
-
-        for (Rectangle2D box : boxes) {
-            Interval x = new Interval(box.getMinX(), box.getMaxX());
-            Interval y = new Interval(box.getMinY(), box.getMaxY());
-
-            Interval.addTo(x, columns);
-            Interval.addTo(y, rows);
-        }
-
-        nRows = rows.size();
-        nCols = columns.size();
-        Rectangle2D[][] regions = new Rectangle2D[nCols][nRows];
-        int i = 0;
-        // Label regions from top left, rather than the transformed orientation
-        for (Interval column : columns) {
-            int j = 0;
-            for (Interval row : rows) {
-                regions[nCols - i - 1][nRows - j - 1] =
-                        new Rectangle2D.Double(
-                                column.start,
-                                row.start,
-                                column.end - column.start,
-                                row.end - row.start);
-                ++j;
-            }
-            ++i;
-        }
-
-        return regions;
-    }
-
-    /**
-     * Register each character's bounding box, updating boxes field to maintain a list of all
-     * distinct groups of characters.
-     *
-     * <p>Overrides the default functionality of PDFTextStripper. Most of this is taken from
-     * DrawPrintTextLocations.java, with extra steps at end of main loop
-     */
-    @Override
-    protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
-        for (TextPosition text : textPositions) {
-            // glyph space -> user space
-            // note: text.getTextMatrix() is *not* the Text Matrix, it's the Text Rendering Matrix
-            AffineTransform at = text.getTextMatrix().createAffineTransform();
-            PDFont font = text.getFont();
-            BoundingBox bbox = font.getBoundingBox();
-
-            // advance width, bbox height (glyph space)
-            float xadvance =
-                    font.getWidth(text.getCharacterCodes()[0]); // todo: should iterate all chars
-            Rectangle2D.Float rect =
-                    new Rectangle2D.Float(0, bbox.getLowerLeftY(), xadvance, bbox.getHeight());
-
-            if (font instanceof PDType3Font) {
-                // bbox and font matrix are unscaled
-                at.concatenate(font.getFontMatrix().createAffineTransform());
-            } else {
-                // bbox and font matrix are already scaled to 1000
-                at.scale(1 / 1000f, 1 / 1000f);
-            }
-            Shape s = at.createTransformedShape(rect);
-            s = flipAT.createTransformedShape(s);
-            s = rotateAT.createTransformedShape(s);
-
-            //
-            // Merge character's bounding box with boxes field
-            //
-            Rectangle2D bounds = s.getBounds2D();
-            // Pad sides to detect almost touching boxes
-            Rectangle2D hitbox = bounds.getBounds2D();
-            hitbox.add(bounds.getMinX() - dx, bounds.getMinY() - dy);
-            hitbox.add(bounds.getMaxX() + dx, bounds.getMaxY() + dy);
-
-            // Find all overlapping boxes
-            List<Rectangle2D> intersectList = new ArrayList<Rectangle2D>();
-            for (Rectangle2D box : boxes) {
-                if (box.intersects(hitbox)) {
-                    intersectList.add(box);
-                }
-            }
-
-            // Combine all touching boxes and update
-            // (NOTE: Potentially this could leave some overlapping boxes un-merged,
-            // but it's sufficient for now and get's fixed up in calculateTableRegions)
-            for (Rectangle2D box : intersectList) {
-                bounds.add(box);
-                boxes.remove(box);
-            }
-            boxes.add(bounds);
-        }
-    }
-
-    /**
-     * This method does nothing in this derived class, because beads and regions are incompatible.
-     * Beads are ignored when stripping by area.
-     *
-     * @param aShouldSeparateByBeads The new grouping of beads.
-     */
-    @Override
-    public final void setShouldSeparateByBeads(boolean aShouldSeparateByBeads) {}
-
-    /** Adapted from PDFTextStripperByArea {@inheritDoc} */
-    @Override
-    protected void processTextPosition(TextPosition text) {
-        if (regionArea != null && !regionArea.contains(text.getX(), text.getY())) {
-            // skip character
-        } else {
-            super.processTextPosition(text);
-        }
-    }
-}
diff --git a/src/main/java/stirling/software/SPDF/pdf/FlexibleCSVWriter.java b/src/main/java/stirling/software/SPDF/pdf/FlexibleCSVWriter.java
new file mode 100644
index 00000000..94a48d93
--- /dev/null
+++ b/src/main/java/stirling/software/SPDF/pdf/FlexibleCSVWriter.java
@@ -0,0 +1,16 @@
+package stirling.software.SPDF.pdf;
+
+import org.apache.commons.csv.CSVFormat;
+
+import technology.tabula.writers.CSVWriter;
+
+public class FlexibleCSVWriter extends CSVWriter {
+
+    public FlexibleCSVWriter() {
+        super();
+    }
+
+    public FlexibleCSVWriter(CSVFormat csvFormat) {
+        super(csvFormat);
+    }
+}