Started working on splitOn empty/qr-/barcode

This commit is contained in:
Felix Kaspar 2023-10-26 19:56:23 +02:00
parent f78a64d545
commit 4e8d8e3d53
6 changed files with 135 additions and 85 deletions

View File

@ -1,23 +1,8 @@
import { createSubDocument } from "./shared/createSubDocument.js";
/**
 * Extracts the given pages of a PDF into a new document.
 *
 * @param snapshot - PDF data, passed unchanged to `PDFLib.PDFDocument.load`.
 * @param {number[]} pagesToExtractArray - Page indices, forwarded unchanged to `createSubDocument`.
 * @param PDFLib - The pdf-lib module (injected by the caller).
 * @returns Whatever `createSubDocument` resolves to for the selected pages.
 */
export async function extractPages(snapshot, pagesToExtractArray, PDFLib) {
    const pdfDoc = await PDFLib.PDFDocument.load(snapshot);

    // TODO: invent a better format for pagesToExtractArray and convert it.
    return createSubDocument(pdfDoc, pagesToExtractArray, PDFLib);
}
/**
 * Builds a new PDF that contains only the requested pages of `pdfDoc`.
 *
 * @param pdfDoc - Source document; must provide `getPageCount()` and be accepted by `copyPages`.
 * @param {number[]} pagesToExtractArray - Zero-based page indices, copied in the given order.
 * @param PDFLib - The pdf-lib module (injected by the caller).
 * @returns The result of `save()` on the newly created document.
 * @throws {Error} If an index is not a non-negative integer, or lies beyond the last page.
 */
export async function createSubDocument(pdfDoc, pagesToExtractArray, PDFLib) {
    const pageCount = pdfDoc.getPageCount();

    // Validate every index up front. Looping (rather than Math.max(...array))
    // also avoids spreading a potentially huge array into call arguments.
    let highestIndex = -Infinity;
    for (const pageIndex of pagesToExtractArray) {
        if (!Number.isInteger(pageIndex) || pageIndex < 0) {
            throw new Error(`Invalid page index ${pageIndex}: page indices must be non-negative integers`);
        }
        if (pageIndex > highestIndex) {
            highestIndex = pageIndex;
        }
    }

    // Check that the highest requested index is not beyond the last page
    if (highestIndex >= pageCount) {
        throw new Error(`The PDF document only has ${pageCount} pages and you tried to extract page ${highestIndex}`);
    }

    const subDocument = await PDFLib.PDFDocument.create();
    const copiedPages = await subDocument.copyPages(pdfDoc, pagesToExtractArray);
    for (const copiedPage of copiedPages) {
        subDocument.addPage(copiedPage);
    }

    return subDocument.save();
}

View File

@ -1,6 +1,8 @@
import { detectEmptyPages } from "./shared/detectEmptyPages.js";
export async function removeBlankPages(snapshot, whiteThreashold, PDFJS, OpenCV, PDFLib) { export async function removeBlankPages(snapshot, whiteThreashold, PDFJS, OpenCV, PDFLib) {
const emptyPages = await findEmptyPages(snapshot); const emptyPages = await detectEmptyPages(snapshot, whiteThreashold, PDFJS, OpenCV);
console.log("Empty Pages: ", emptyPages); console.log("Empty Pages: ", emptyPages);
@ -12,69 +14,4 @@ export async function removeBlankPages(snapshot, whiteThreashold, PDFJS, OpenCV,
}) })
return pdfDoc.save(); return pdfDoc.save();
// Walks through every page of the snapshot and collects the zero-based
// indices of the pages that have no text items and no non-blank image.
async function findEmptyPages(snapshot) {
    const loadedPdf = await PDFJS.getDocument(snapshot).promise;
    const blankPageIndices = [];

    let pageNumber = 0;
    while (pageNumber < loadedPdf.numPages) {
        pageNumber += 1; // pages are requested by their 1-based number
        const currentPage = await loadedPdf.getPage(pageNumber);
        console.log(`Checking page ${pageNumber}`);

        // NOTE: despite its name, hasText() resolves to true when the page
        // has NO text items, hence the negation.
        const isTextFree = await hasText(currentPage);
        if (!isTextFree) {
            console.log(`Found text on Page ${pageNumber}, page is not empty`);
            continue;
        }

        const onlyBlankImages = await areImagesBlank(currentPage, whiteThreashold);
        if (!onlyBlankImages) {
            console.log(`Found non white image on Page ${pageNumber}, page is not empty`);
            continue;
        }

        console.log(`Page ${pageNumber} is empty.`);
        blankPageIndices.push(pageNumber - 1);
    }

    return blankPageIndices;
}
// Resolves to true when every painted image on the page that exposes pixel
// data is blank (or when there is no such image at all).
// Previously the verdict of the FIRST image was returned, so a non-blank
// image following a blank one was never inspected.
async function areImagesBlank(page, whiteThreashold) {
    const ops = await page.getOperatorList();

    for (let j = 0; j < ops.fnArray.length; j++) {
        const operator = ops.fnArray[j];
        const paintsImage = operator === PDFJS.OPS.paintJpegXObject || operator === PDFJS.OPS.paintImageXObject;
        if (!paintsImage) {
            continue;
        }

        // The first operator argument is the key of the image in page.objs
        const image = page.objs.get(ops.argsArray[j][0]);

        // Images without pixel data cannot be analysed and are skipped
        if (image.data && !(await isImageBlank(image, whiteThreashold))) {
            return false;
        }
    }

    return true;
}
// NOTE: despite its name, this resolves to true when the page has NO text
// items and to false when it has at least one; the caller negates the result.
async function hasText(page) {
    const { items } = await page.getTextContent();
    const itemCount = items.length;
    return itemCount === 0;
}
// Resolves to true when the mean grayscale value of the image is at or below
// the threshold.
// NOTE(review): a low mean corresponds to a dark image, while the caller's
// parameter is called "whiteThreashold" - confirm the comparison direction.
// NOTE(review): assumes image.data holds 4 bytes per pixel (RGBA) - confirm.
async function isImageBlank(image, threshold) {
    // cv.Mat takes (rows, cols, type): rows is the height, cols the width
    const src = new OpenCV.cv.Mat(image.height, image.width, OpenCV.cv.CV_8UC4);
    let gray = null;

    try {
        src.data.set(image.data);

        // Convert the image to grayscale
        gray = new OpenCV.cv.Mat();
        OpenCV.cv.cvtColor(src, gray, OpenCV.cv.COLOR_RGBA2GRAY);

        // Calculate the mean value of the grayscale image
        const meanValue = OpenCV.cv.mean(gray);

        // Check if the mean value is at or below the threshold
        return meanValue[0] <= threshold;
    } finally {
        // Free memory even when one of the OpenCV calls throws
        src.delete();
        if (gray) {
            gray.delete();
        }
    }
}
}; };

View File

@ -0,0 +1,16 @@
/**
 * Builds a new PDF that contains only the requested pages of `pdfDoc`.
 *
 * @param pdfDoc - Source document; must provide `getPageCount()` and be accepted by `copyPages`.
 * @param {number[]} pagesToExtractArray - Zero-based page indices, copied in the given order.
 * @param PDFLib - The pdf-lib module (injected by the caller).
 * @returns The result of `save()` on the newly created document.
 * @throws {Error} If an index is not a non-negative integer, or lies beyond the last page.
 */
export async function createSubDocument(pdfDoc, pagesToExtractArray, PDFLib) {
    const pageCount = pdfDoc.getPageCount();

    // Validate every index up front. Looping (rather than Math.max(...array))
    // also avoids spreading a potentially huge array into call arguments.
    let highestIndex = -Infinity;
    for (const pageIndex of pagesToExtractArray) {
        if (!Number.isInteger(pageIndex) || pageIndex < 0) {
            throw new Error(`Invalid page index ${pageIndex}: page indices must be non-negative integers`);
        }
        if (pageIndex > highestIndex) {
            highestIndex = pageIndex;
        }
    }

    // Check that the highest requested index is not beyond the last page
    if (highestIndex >= pageCount) {
        throw new Error(`The PDF document only has ${pageCount} pages and you tried to extract page ${highestIndex}`);
    }

    const subDocument = await PDFLib.PDFDocument.create();
    const copiedPages = await subDocument.copyPages(pdfDoc, pagesToExtractArray);
    for (const copiedPage of copiedPages) {
        subDocument.addPage(copiedPage);
    }

    return subDocument.save();
}

View File

@ -0,0 +1,64 @@
/**
 * Finds the pages of a PDF that contain no text items and no non-blank image.
 *
 * @param snapshot - PDF data, passed unchanged to `PDFJS.getDocument`.
 * @param {number} whiteThreashold - An image counts as blank when its mean grayscale value is at or below this value.
 * @param PDFJS - The PDF.js module (injected by the caller).
 * @param OpenCV - Object exposing the OpenCV.js API as `OpenCV.cv` (injected by the caller).
 * @returns {Promise<number[]>} Zero-based indices of the pages considered empty, in page order.
 */
export async function detectEmptyPages(snapshot, whiteThreashold, PDFJS, OpenCV) {
    const pdfDoc = await PDFJS.getDocument(snapshot).promise;

    const emptyPages = [];
    for (let i = 1; i <= pdfDoc.numPages; i++) {
        const page = await pdfDoc.getPage(i);
        console.log(`Checking page ${i}`);

        if (await hasText(page)) {
            console.log(`Found text on Page ${i}, page is not empty`);
            continue;
        }

        if (!(await areImagesBlank(page, whiteThreashold))) {
            console.log(`Found non white image on Page ${i}, page is not empty`);
            continue;
        }

        console.log(`Page ${i} is empty.`);
        // Pages are requested by 1-based number but reported as 0-based index
        emptyPages.push(i - 1);
    }
    return emptyPages;

    // Resolves to true when the page has at least one text item.
    async function hasText(page) {
        const textContent = await page.getTextContent();
        return textContent.items.length > 0;
    }

    // Resolves to true when every painted image that exposes pixel data is
    // blank. All images are inspected, not just the first one.
    async function areImagesBlank(page, threshold) {
        const ops = await page.getOperatorList();

        for (let j = 0; j < ops.fnArray.length; j++) {
            const operator = ops.fnArray[j];
            const paintsImage = operator === PDFJS.OPS.paintJpegXObject || operator === PDFJS.OPS.paintImageXObject;
            if (!paintsImage) {
                continue;
            }

            // The first operator argument is the key of the image in page.objs
            const image = page.objs.get(ops.argsArray[j][0]);

            // Images without pixel data cannot be analysed and are skipped
            if (image.data && !(await isImageBlank(image, threshold))) {
                return false;
            }
        }
        return true;
    }

    // Resolves to true when the mean grayscale value is at or below threshold.
    // NOTE(review): a low mean corresponds to a dark image, while the
    // parameter is called "whiteThreashold" - confirm the comparison direction.
    // NOTE(review): assumes image.data holds 4 bytes per pixel (RGBA) - confirm.
    async function isImageBlank(image, threshold) {
        // cv.Mat takes (rows, cols, type): rows is the height, cols the width
        const src = new OpenCV.cv.Mat(image.height, image.width, OpenCV.cv.CV_8UC4);
        let gray = null;

        try {
            src.data.set(image.data);

            // Convert the image to grayscale
            gray = new OpenCV.cv.Mat();
            OpenCV.cv.cvtColor(src, gray, OpenCV.cv.COLOR_RGBA2GRAY);

            // Calculate the mean value of the grayscale image
            const meanValue = OpenCV.cv.mean(gray);

            return meanValue[0] <= threshold;
        } finally {
            // Free memory even when one of the OpenCV calls throws
            src.delete();
            if (gray) {
                gray.delete();
            }
        }
    }
}

View File

@ -0,0 +1,48 @@
import { detectEmptyPages } from "./shared/detectEmptyPages.js";
/**
 * @typedef {"BAR_CODE"|"QR_CODE"|"BLANK_PAGE"} SplitType
 */

/**
 * Determines the pages at which a PDF should be split, based on the split type.
 *
 * Work in progress: only "BLANK_PAGE" is detected so far, and the detected
 * pages are merely logged - the document is loaded and saved without being split.
 *
 * @param {Uint16Array} snapshot - PDF data, handed to `detectEmptyPages` and `PDFLib.PDFDocument.load`.
 *   NOTE(review): type carried over from the original comment - confirm.
 * @param {SplitType} type - What kind of page marks a split position.
 * @param {number} whiteThreashold - Forwarded to `detectEmptyPages` for "BLANK_PAGE".
 * @param PDFJS - Forwarded to `detectEmptyPages`.
 * @param OpenCV - Forwarded to `detectEmptyPages`.
 * @param PDFLib - The pdf-lib module (injected by the caller).
 * @param QRCode - Currently unused.
 * @returns The result of `save()` on the loaded document.
 * @throws {Error} For "BAR_CODE" and "QR_CODE" (not implemented yet) and for any unknown type.
 */
export async function splitOn(snapshot, type, whiteThreashold, PDFJS, OpenCV, PDFLib, QRCode) {
    let splitAtPages = [];

    switch (type) {
        case "BAR_CODE":
        case "QR_CODE":
            // TODO: Implement
            throw new Error("This split-type has not been implemented yet");
        case "BLANK_PAGE":
            splitAtPages = await detectEmptyPages(snapshot, whiteThreashold, PDFJS, OpenCV);
            break;
        default:
            throw new Error("An invalid split-type was provided.");
    }

    console.log("Split At Pages: ", splitAtPages);

    const pdfDoc = await PDFLib.PDFDocument.load(snapshot);

    // TODO: Remove detected Pages & Split

    return pdfDoc.save();
}

View File

@ -1,4 +1,4 @@
import { createSubDocument } from "./extractPages.js"; import { createSubDocument } from "./shared/createSubDocument.js";
export async function splitPDF(snapshot, splitAfterPageArray, PDFLib) { export async function splitPDF(snapshot, splitAfterPageArray, PDFLib) {
const pdfDoc = await PDFLib.PDFDocument.load(snapshot) const pdfDoc = await PDFLib.PDFDocument.load(snapshot)