mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2024-12-21 19:08:24 +01:00
Extract tables from PDF to CSV using Tabula (#2312)
* Add Tabula dependency and exclude slf4j-simple
  - Add the tabula-java dependency to extract tables into CSV.
  - Exclude slf4j-simple because of Logback.
* Add a flexible CSVWriter
  - Add FlexibleCSVWriter, which extends CSVWriter so that a custom CSVFormat can be passed; CSVWriter's parameterized constructor (the one that allows changing the CSVFormat) is protected.
* Use Tabula to extract tables from PDF
  - Use Tabula to extract tables from PDF instead of the existing implementation.
* Delete PDFTableStripper as it is unneeded
  - Delete PDFTableStripper; it is unneeded now that tabula-java is used instead.
* Use the correct class in the ExtractCSVController logger
* Exclude gson and bcprov-jdk15on dependencies from tabula
  - Exclude gson and bcprov-jdk15on from tabula-java due to detected security vulnerabilities.
This commit is contained in:
parent
faa8a9752c
commit
afad06bed4
@ -203,6 +203,13 @@ dependencies {
|
|||||||
exclude group: "commons-logging", module: "commons-logging"
|
exclude group: "commons-logging", module: "commons-logging"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// https://mvnrepository.com/artifact/technology.tabula/tabula
|
||||||
|
implementation ('technology.tabula:tabula:1.0.5') {
|
||||||
|
exclude group: "org.slf4j", module: "slf4j-simple"
|
||||||
|
exclude group: "org.bouncycastle", module: "bcprov-jdk15on"
|
||||||
|
exclude group: "com.google.code.gson", module: "gson"
|
||||||
|
}
|
||||||
|
|
||||||
implementation 'org.apache.pdfbox:jbig2-imageio:3.0.4'
|
implementation 'org.apache.pdfbox:jbig2-imageio:3.0.4'
|
||||||
|
|
||||||
implementation "org.bouncycastle:bcprov-jdk18on:$bouncycastleVersion"
|
implementation "org.bouncycastle:bcprov-jdk18on:$bouncycastleVersion"
|
||||||
|
@ -1,12 +1,12 @@
|
|||||||
package stirling.software.SPDF.controller.api.converters;
|
package stirling.software.SPDF.controller.api.converters;
|
||||||
|
|
||||||
import java.io.StringWriter;
|
import java.io.StringWriter;
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.commons.csv.CSVFormat;
|
||||||
|
import org.apache.commons.csv.QuoteMode;
|
||||||
import org.apache.pdfbox.Loader;
|
import org.apache.pdfbox.Loader;
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
import org.apache.pdfbox.pdmodel.PDPage;
|
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
import org.springframework.http.ContentDisposition;
|
import org.springframework.http.ContentDisposition;
|
||||||
@ -18,79 +18,36 @@ import org.springframework.web.bind.annotation.PostMapping;
|
|||||||
import org.springframework.web.bind.annotation.RequestMapping;
|
import org.springframework.web.bind.annotation.RequestMapping;
|
||||||
import org.springframework.web.bind.annotation.RestController;
|
import org.springframework.web.bind.annotation.RestController;
|
||||||
|
|
||||||
import com.opencsv.CSVWriter;
|
|
||||||
|
|
||||||
import io.swagger.v3.oas.annotations.Operation;
|
import io.swagger.v3.oas.annotations.Operation;
|
||||||
import io.swagger.v3.oas.annotations.tags.Tag;
|
import io.swagger.v3.oas.annotations.tags.Tag;
|
||||||
|
|
||||||
import stirling.software.SPDF.controller.api.CropController;
|
import stirling.software.SPDF.controller.api.CropController;
|
||||||
import stirling.software.SPDF.controller.api.strippers.PDFTableStripper;
|
|
||||||
import stirling.software.SPDF.model.api.extract.PDFFilePage;
|
import stirling.software.SPDF.model.api.extract.PDFFilePage;
|
||||||
|
import stirling.software.SPDF.pdf.FlexibleCSVWriter;
|
||||||
|
import technology.tabula.ObjectExtractor;
|
||||||
|
import technology.tabula.Page;
|
||||||
|
import technology.tabula.Table;
|
||||||
|
import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
|
||||||
|
import technology.tabula.writers.Writer;
|
||||||
|
|
||||||
@RestController
|
@RestController
|
||||||
@RequestMapping("/api/v1/convert")
|
@RequestMapping("/api/v1/convert")
|
||||||
@Tag(name = "Convert", description = "Convert APIs")
|
@Tag(name = "Convert", description = "Convert APIs")
|
||||||
public class ExtractCSVController {
|
public class ExtractCSVController {
|
||||||
|
|
||||||
private static final Logger logger = LoggerFactory.getLogger(CropController.class);
|
private static final Logger logger = LoggerFactory.getLogger(ExtractCSVController.class);
|
||||||
|
|
||||||
@PostMapping(value = "/pdf/csv", consumes = "multipart/form-data")
|
@PostMapping(value = "/pdf/csv", consumes = "multipart/form-data")
|
||||||
@Operation(
|
@Operation(summary = "Extracts a CSV document from a PDF", description = "This operation takes an input PDF file and returns CSV file of whole page. Input:PDF Output:CSV Type:SISO")
|
||||||
summary = "Extracts a CSV document from a PDF",
|
|
||||||
description =
|
|
||||||
"This operation takes an input PDF file and returns CSV file of whole page. Input:PDF Output:CSV Type:SISO")
|
|
||||||
public ResponseEntity<String> PdfToCsv(@ModelAttribute PDFFilePage form) throws Exception {
|
public ResponseEntity<String> PdfToCsv(@ModelAttribute PDFFilePage form) throws Exception {
|
||||||
|
|
||||||
ArrayList<String> tableData = new ArrayList<>();
|
|
||||||
int columnsCount = 0;
|
|
||||||
|
|
||||||
try (PDDocument document = Loader.loadPDF(form.getFileInput().getBytes())) {
|
|
||||||
final double res = 72; // PDF units are at 72 DPI
|
|
||||||
PDFTableStripper stripper = new PDFTableStripper();
|
|
||||||
PDPage pdPage = document.getPage(form.getPageId() - 1);
|
|
||||||
stripper.extractTable(pdPage);
|
|
||||||
columnsCount = stripper.getColumns();
|
|
||||||
for (int c = 0; c < columnsCount; ++c) {
|
|
||||||
for (int r = 0; r < stripper.getRows(); ++r) {
|
|
||||||
tableData.add(stripper.getText(r, c));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
ArrayList<String> notEmptyColumns = new ArrayList<>();
|
|
||||||
|
|
||||||
for (String item : tableData) {
|
|
||||||
if (!item.trim().isEmpty()) {
|
|
||||||
notEmptyColumns.add(item);
|
|
||||||
} else {
|
|
||||||
columnsCount--;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
List<String> fullTable =
|
|
||||||
notEmptyColumns.stream()
|
|
||||||
.map(
|
|
||||||
(entity) ->
|
|
||||||
entity.replace('\n', ' ')
|
|
||||||
.replace('\r', ' ')
|
|
||||||
.trim()
|
|
||||||
.replaceAll("\\s{2,}", "|"))
|
|
||||||
.toList();
|
|
||||||
|
|
||||||
int rowsCount = fullTable.get(0).split("\\|").length;
|
|
||||||
|
|
||||||
ArrayList<String> headersList = getTableHeaders(columnsCount, fullTable);
|
|
||||||
ArrayList<String> recordList = getRecordsList(rowsCount, fullTable);
|
|
||||||
|
|
||||||
if (headersList.size() == 0 && recordList.size() == 0) {
|
|
||||||
throw new Exception("No table detected, no headers or records found");
|
|
||||||
}
|
|
||||||
|
|
||||||
StringWriter writer = new StringWriter();
|
StringWriter writer = new StringWriter();
|
||||||
try (CSVWriter csvWriter = new CSVWriter(writer)) {
|
try (PDDocument document = Loader.loadPDF(form.getFileInput().getBytes())) {
|
||||||
csvWriter.writeNext(headersList.toArray(new String[0]));
|
CSVFormat format = CSVFormat.EXCEL.builder().setEscape('"').setQuoteMode(QuoteMode.ALL).build();
|
||||||
for (String record : recordList) {
|
Writer csvWriter = new FlexibleCSVWriter(format);
|
||||||
csvWriter.writeNext(record.split("\\|"));
|
SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
|
||||||
|
try (ObjectExtractor extractor = new ObjectExtractor(document)) {
|
||||||
|
Page page = extractor.extract(form.getPageId());
|
||||||
|
List<Table> tables = sea.extract(page);
|
||||||
|
csvWriter.write(writer, tables);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -107,33 +64,4 @@ public class ExtractCSVController {
|
|||||||
|
|
||||||
return ResponseEntity.ok().headers(headers).body(writer.toString());
|
return ResponseEntity.ok().headers(headers).body(writer.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
private ArrayList<String> getRecordsList(int rowsCounts, List<String> items) {
|
|
||||||
ArrayList<String> recordsList = new ArrayList<>();
|
|
||||||
|
|
||||||
for (int b = 1; b < rowsCounts; b++) {
|
|
||||||
StringBuilder strbldr = new StringBuilder();
|
|
||||||
|
|
||||||
for (int i = 0; i < items.size(); i++) {
|
|
||||||
String[] parts = items.get(i).split("\\|");
|
|
||||||
strbldr.append(parts[b]);
|
|
||||||
if (i != items.size() - 1) {
|
|
||||||
strbldr.append("|");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
recordsList.add(strbldr.toString());
|
|
||||||
}
|
|
||||||
|
|
||||||
return recordsList;
|
|
||||||
}
|
|
||||||
|
|
||||||
private ArrayList<String> getTableHeaders(int columnsCount, List<String> items) {
|
|
||||||
ArrayList<String> resultList = new ArrayList<>();
|
|
||||||
for (int i = 0; i < columnsCount; i++) {
|
|
||||||
String[] parts = items.get(i).split("\\|");
|
|
||||||
resultList.add(parts[0]);
|
|
||||||
}
|
|
||||||
|
|
||||||
return resultList;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -1,327 +0,0 @@
|
|||||||
package stirling.software.SPDF.controller.api.strippers;
|
|
||||||
|
|
||||||
import java.awt.Shape;
|
|
||||||
import java.awt.geom.AffineTransform;
|
|
||||||
import java.awt.geom.Rectangle2D;
|
|
||||||
import java.io.ByteArrayOutputStream;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.OutputStreamWriter;
|
|
||||||
import java.io.Writer;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.LinkedList;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.fontbox.util.BoundingBox;
|
|
||||||
import org.apache.pdfbox.pdmodel.PDPage;
|
|
||||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
|
||||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
|
||||||
import org.apache.pdfbox.pdmodel.font.PDType3Font;
|
|
||||||
import org.apache.pdfbox.text.PDFTextStripper;
|
|
||||||
import org.apache.pdfbox.text.PDFTextStripperByArea;
|
|
||||||
import org.apache.pdfbox.text.TextPosition;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Class to extract tabular data from a PDF. Works by making a first pass of the page to group all
|
|
||||||
* nearby text items together, and then inferring a 2D grid from these regions. Each table cell is
|
|
||||||
* then extracted using a PDFTextStripperByArea object.
|
|
||||||
*
|
|
||||||
* <p>Works best when headers are included in the detected region, to ensure representative text in
|
|
||||||
* every column.
|
|
||||||
*
|
|
||||||
* <p>Based upon DrawPrintTextLocations PDFBox example
|
|
||||||
* (https://svn.apache.org/viewvc/pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/util/DrawPrintTextLocations.java)
|
|
||||||
*
|
|
||||||
* @author Beldaz
|
|
||||||
*/
|
|
||||||
public class PDFTableStripper extends PDFTextStripper {
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This will print the documents data, for each table cell.
|
|
||||||
*
|
|
||||||
* @param args The command line arguments.
|
|
||||||
* @throws IOException If there is an error parsing the document.
|
|
||||||
*/
|
|
||||||
/*
|
|
||||||
* Used in methods derived from DrawPrintTextLocations
|
|
||||||
*/
|
|
||||||
private AffineTransform flipAT;
|
|
||||||
|
|
||||||
private AffineTransform rotateAT;
|
|
||||||
|
|
||||||
/** Regions updated by calls to writeString */
|
|
||||||
private Set<Rectangle2D> boxes;
|
|
||||||
|
|
||||||
// Border to allow when finding intersections
|
|
||||||
private double dx = 1.0; // This value works for me, feel free to tweak (or add setter)
|
|
||||||
private double dy = 0.000; // Rows of text tend to overlap, so need to extend
|
|
||||||
|
|
||||||
/** Region in which to find table (otherwise whole page) */
|
|
||||||
private Rectangle2D regionArea;
|
|
||||||
|
|
||||||
/** Number of rows in inferred table */
|
|
||||||
private int nRows = 0;
|
|
||||||
|
|
||||||
/** Number of columns in inferred table */
|
|
||||||
private int nCols = 0;
|
|
||||||
|
|
||||||
/** This is the object that does the text extraction */
|
|
||||||
private PDFTextStripperByArea regionStripper;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* 1D intervals - used for calculateTableRegions()
|
|
||||||
*
|
|
||||||
* @author Beldaz
|
|
||||||
*/
|
|
||||||
public static class Interval {
|
|
||||||
double start;
|
|
||||||
double end;
|
|
||||||
|
|
||||||
public Interval(double start, double end) {
|
|
||||||
this.start = start;
|
|
||||||
this.end = end;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void add(Interval col) {
|
|
||||||
if (col.start < start) start = col.start;
|
|
||||||
if (col.end > end) end = col.end;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static void addTo(Interval x, LinkedList<Interval> columns) {
|
|
||||||
int p = 0;
|
|
||||||
Iterator<Interval> it = columns.iterator();
|
|
||||||
// Find where x should go
|
|
||||||
while (it.hasNext()) {
|
|
||||||
Interval col = it.next();
|
|
||||||
if (x.end >= col.start) {
|
|
||||||
if (x.start <= col.end) { // overlaps
|
|
||||||
x.add(col);
|
|
||||||
it.remove();
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
++p;
|
|
||||||
}
|
|
||||||
while (it.hasNext()) {
|
|
||||||
Interval col = it.next();
|
|
||||||
if (x.start > col.end) break;
|
|
||||||
x.add(col);
|
|
||||||
it.remove();
|
|
||||||
}
|
|
||||||
columns.add(p, x);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Instantiate a new PDFTableStripper object.
|
|
||||||
*
|
|
||||||
* @throws IOException If there is an error loading the properties.
|
|
||||||
*/
|
|
||||||
public PDFTableStripper() throws IOException {
|
|
||||||
super.setShouldSeparateByBeads(false);
|
|
||||||
regionStripper = new PDFTextStripperByArea();
|
|
||||||
regionStripper.setSortByPosition(true);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Define the region to group text by.
|
|
||||||
*
|
|
||||||
* @param rect The rectangle area to retrieve the text from.
|
|
||||||
*/
|
|
||||||
public void setRegion(Rectangle2D rect) {
|
|
||||||
regionArea = rect;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int getRows() {
|
|
||||||
return nRows;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int getColumns() {
|
|
||||||
return nCols;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get the text for the region, this should be called after extractTable().
|
|
||||||
*
|
|
||||||
* @return The text that was identified in that region.
|
|
||||||
*/
|
|
||||||
public String getText(int row, int col) {
|
|
||||||
return regionStripper.getTextForRegion("el" + col + "x" + row);
|
|
||||||
}
|
|
||||||
|
|
||||||
public void extractTable(PDPage pdPage) throws IOException {
|
|
||||||
setStartPage(getCurrentPageNo());
|
|
||||||
setEndPage(getCurrentPageNo());
|
|
||||||
|
|
||||||
boxes = new HashSet<Rectangle2D>();
|
|
||||||
// flip y-axis
|
|
||||||
flipAT = new AffineTransform();
|
|
||||||
flipAT.translate(0, pdPage.getBBox().getHeight());
|
|
||||||
flipAT.scale(1, -1);
|
|
||||||
|
|
||||||
// page may be rotated
|
|
||||||
rotateAT = new AffineTransform();
|
|
||||||
int rotation = pdPage.getRotation();
|
|
||||||
if (rotation != 0) {
|
|
||||||
PDRectangle mediaBox = pdPage.getMediaBox();
|
|
||||||
switch (rotation) {
|
|
||||||
case 90:
|
|
||||||
rotateAT.translate(mediaBox.getHeight(), 0);
|
|
||||||
break;
|
|
||||||
case 270:
|
|
||||||
rotateAT.translate(0, mediaBox.getWidth());
|
|
||||||
break;
|
|
||||||
case 180:
|
|
||||||
rotateAT.translate(mediaBox.getWidth(), mediaBox.getHeight());
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
rotateAT.rotate(Math.toRadians(rotation));
|
|
||||||
}
|
|
||||||
// Trigger processing of the document so that writeString is called.
|
|
||||||
try (Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream())) {
|
|
||||||
super.output = dummy;
|
|
||||||
super.processPage(pdPage);
|
|
||||||
}
|
|
||||||
|
|
||||||
Rectangle2D[][] regions = calculateTableRegions();
|
|
||||||
|
|
||||||
// System.err.println("Drawing " + nCols + "x" + nRows + "="+ nRows*nCols + "
|
|
||||||
// regions");
|
|
||||||
for (int i = 0; i < nCols; ++i) {
|
|
||||||
for (int j = 0; j < nRows; ++j) {
|
|
||||||
final Rectangle2D region = regions[i][j];
|
|
||||||
regionStripper.addRegion("el" + i + "x" + j, region);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
regionStripper.extractRegions(pdPage);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Infer a rectangular grid of regions from the boxes field.
|
|
||||||
*
|
|
||||||
* @return 2D array of table regions (as Rectangle2D objects). Note that some of these regions
|
|
||||||
* may have no content.
|
|
||||||
*/
|
|
||||||
private Rectangle2D[][] calculateTableRegions() {
|
|
||||||
|
|
||||||
// Build up a list of all table regions, based upon the populated
|
|
||||||
// regions of boxes field. Treats the horizontal and vertical extents
|
|
||||||
// of each box as distinct
|
|
||||||
LinkedList<Interval> columns = new LinkedList<Interval>();
|
|
||||||
LinkedList<Interval> rows = new LinkedList<Interval>();
|
|
||||||
|
|
||||||
for (Rectangle2D box : boxes) {
|
|
||||||
Interval x = new Interval(box.getMinX(), box.getMaxX());
|
|
||||||
Interval y = new Interval(box.getMinY(), box.getMaxY());
|
|
||||||
|
|
||||||
Interval.addTo(x, columns);
|
|
||||||
Interval.addTo(y, rows);
|
|
||||||
}
|
|
||||||
|
|
||||||
nRows = rows.size();
|
|
||||||
nCols = columns.size();
|
|
||||||
Rectangle2D[][] regions = new Rectangle2D[nCols][nRows];
|
|
||||||
int i = 0;
|
|
||||||
// Label regions from top left, rather than the transformed orientation
|
|
||||||
for (Interval column : columns) {
|
|
||||||
int j = 0;
|
|
||||||
for (Interval row : rows) {
|
|
||||||
regions[nCols - i - 1][nRows - j - 1] =
|
|
||||||
new Rectangle2D.Double(
|
|
||||||
column.start,
|
|
||||||
row.start,
|
|
||||||
column.end - column.start,
|
|
||||||
row.end - row.start);
|
|
||||||
++j;
|
|
||||||
}
|
|
||||||
++i;
|
|
||||||
}
|
|
||||||
|
|
||||||
return regions;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Register each character's bounding box, updating boxes field to maintain a list of all
|
|
||||||
* distinct groups of characters.
|
|
||||||
*
|
|
||||||
* <p>Overrides the default functionality of PDFTextStripper. Most of this is taken from
|
|
||||||
* DrawPrintTextLocations.java, with extra steps at end of main loop
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
|
|
||||||
for (TextPosition text : textPositions) {
|
|
||||||
// glyph space -> user space
|
|
||||||
// note: text.getTextMatrix() is *not* the Text Matrix, it's the Text Rendering Matrix
|
|
||||||
AffineTransform at = text.getTextMatrix().createAffineTransform();
|
|
||||||
PDFont font = text.getFont();
|
|
||||||
BoundingBox bbox = font.getBoundingBox();
|
|
||||||
|
|
||||||
// advance width, bbox height (glyph space)
|
|
||||||
float xadvance =
|
|
||||||
font.getWidth(text.getCharacterCodes()[0]); // todo: should iterate all chars
|
|
||||||
Rectangle2D.Float rect =
|
|
||||||
new Rectangle2D.Float(0, bbox.getLowerLeftY(), xadvance, bbox.getHeight());
|
|
||||||
|
|
||||||
if (font instanceof PDType3Font) {
|
|
||||||
// bbox and font matrix are unscaled
|
|
||||||
at.concatenate(font.getFontMatrix().createAffineTransform());
|
|
||||||
} else {
|
|
||||||
// bbox and font matrix are already scaled to 1000
|
|
||||||
at.scale(1 / 1000f, 1 / 1000f);
|
|
||||||
}
|
|
||||||
Shape s = at.createTransformedShape(rect);
|
|
||||||
s = flipAT.createTransformedShape(s);
|
|
||||||
s = rotateAT.createTransformedShape(s);
|
|
||||||
|
|
||||||
//
|
|
||||||
// Merge character's bounding box with boxes field
|
|
||||||
//
|
|
||||||
Rectangle2D bounds = s.getBounds2D();
|
|
||||||
// Pad sides to detect almost touching boxes
|
|
||||||
Rectangle2D hitbox = bounds.getBounds2D();
|
|
||||||
hitbox.add(bounds.getMinX() - dx, bounds.getMinY() - dy);
|
|
||||||
hitbox.add(bounds.getMaxX() + dx, bounds.getMaxY() + dy);
|
|
||||||
|
|
||||||
// Find all overlapping boxes
|
|
||||||
List<Rectangle2D> intersectList = new ArrayList<Rectangle2D>();
|
|
||||||
for (Rectangle2D box : boxes) {
|
|
||||||
if (box.intersects(hitbox)) {
|
|
||||||
intersectList.add(box);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Combine all touching boxes and update
|
|
||||||
// (NOTE: Potentially this could leave some overlapping boxes un-merged,
|
|
||||||
// but it's sufficient for now and get's fixed up in calculateTableRegions)
|
|
||||||
for (Rectangle2D box : intersectList) {
|
|
||||||
bounds.add(box);
|
|
||||||
boxes.remove(box);
|
|
||||||
}
|
|
||||||
boxes.add(bounds);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This method does nothing in this derived class, because beads and regions are incompatible.
|
|
||||||
* Beads are ignored when stripping by area.
|
|
||||||
*
|
|
||||||
* @param aShouldSeparateByBeads The new grouping of beads.
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public final void setShouldSeparateByBeads(boolean aShouldSeparateByBeads) {}
|
|
||||||
|
|
||||||
/** Adapted from PDFTextStripperByArea {@inheritDoc} */
|
|
||||||
@Override
|
|
||||||
protected void processTextPosition(TextPosition text) {
|
|
||||||
if (regionArea != null && !regionArea.contains(text.getX(), text.getY())) {
|
|
||||||
// skip character
|
|
||||||
} else {
|
|
||||||
super.processTextPosition(text);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
@ -0,0 +1,16 @@
|
|||||||
|
package stirling.software.SPDF.pdf;
|
||||||
|
|
||||||
|
import org.apache.commons.csv.CSVFormat;
|
||||||
|
|
||||||
|
import technology.tabula.writers.CSVWriter;
|
||||||
|
|
||||||
|
public class FlexibleCSVWriter extends CSVWriter {
|
||||||
|
|
||||||
|
public FlexibleCSVWriter() {
|
||||||
|
super();
|
||||||
|
}
|
||||||
|
|
||||||
|
public FlexibleCSVWriter(CSVFormat csvFormat) {
|
||||||
|
super(csvFormat);
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user