mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-01-10 00:06:51 +01:00
Started working on splitOn empty/qr-/barcode
This commit is contained in:
parent
f78a64d545
commit
4e8d8e3d53
@ -1,23 +1,8 @@
|
|||||||
|
import { createSubDocument } from "./shared/createSubDocument.js";
|
||||||
|
|
||||||
/**
 * Loads a PDF and produces a new document made up of only the requested pages.
 *
 * @param {*} snapshot - PDF data handed to PDFLib.PDFDocument.load().
 * @param {number[]} pagesToExtractArray - Page indices forwarded unchanged to createSubDocument().
 * @param {object} PDFLib - Library object exposing PDFDocument.load().
 * @returns {Promise<*>} Whatever createSubDocument() resolves to.
 */
export async function extractPages(snapshot, pagesToExtractArray, PDFLib) {
    // TODO: invent a better format for pagesToExtractArray and convert it.
    const sourceDocument = await PDFLib.PDFDocument.load(snapshot);
    const extractedDocument = createSubDocument(sourceDocument, pagesToExtractArray, PDFLib);
    return extractedDocument;
}
|
||||||
|
|
||||||
/**
 * Creates a new PDF that contains only the selected pages of an already loaded document.
 *
 * @param {object} pdfDoc - Loaded source document; must provide getPageCount().
 * @param {number[]} pagesToExtractArray - Zero-based page indices, copied in the given order.
 * @param {object} PDFLib - Library object exposing PDFDocument.create().
 * @returns {Promise<*>} Whatever subDocument.save() resolves to.
 * @throws {Error} If an index is negative, not a number, or beyond the last page.
 */
export async function createSubDocument(pdfDoc, pagesToExtractArray, PDFLib) {
    const pageCount = pdfDoc.getPageCount();

    // Validate before allocating the target document. A plain loop replaces
    // Math.max(...array) so that very long index lists cannot exceed the
    // engine's argument limit, and so the maximum is only computed once.
    let largestIndex = -1;
    for (const rawIndex of pagesToExtractArray) {
        const pageIndex = Number(rawIndex);
        if (Number.isNaN(pageIndex) || pageIndex < 0) {
            throw new Error(`Invalid page index "${rawIndex}": page indices must be numbers greater than or equal to 0`);
        }
        if (pageIndex > largestIndex) {
            largestIndex = pageIndex;
        }
    }

    // Check that array max number is not larger pdf pages number
    if (largestIndex >= pageCount) {
        throw new Error(`The PDF document only has ${pageCount} pages and you tried to extract page ${largestIndex}`);
    }

    const subDocument = await PDFLib.PDFDocument.create();
    const copiedPages = await subDocument.copyPages(pdfDoc, pagesToExtractArray);

    for (const copiedPage of copiedPages) {
        subDocument.addPage(copiedPage);
    }

    return subDocument.save();
}
|
|
@ -1,6 +1,8 @@
|
|||||||
|
import { detectEmptyPages } from "./shared/detectEmptyPages.js";
|
||||||
|
|
||||||
export async function removeBlankPages(snapshot, whiteThreashold, PDFJS, OpenCV, PDFLib) {
|
export async function removeBlankPages(snapshot, whiteThreashold, PDFJS, OpenCV, PDFLib) {
|
||||||
|
|
||||||
const emptyPages = await findEmptyPages(snapshot);
|
const emptyPages = await detectEmptyPages(snapshot, whiteThreashold, PDFJS, OpenCV);
|
||||||
|
|
||||||
console.log("Empty Pages: ", emptyPages);
|
console.log("Empty Pages: ", emptyPages);
|
||||||
|
|
||||||
@ -12,69 +14,4 @@ export async function removeBlankPages(snapshot, whiteThreashold, PDFJS, OpenCV,
|
|||||||
})
|
})
|
||||||
|
|
||||||
return pdfDoc.save();
|
return pdfDoc.save();
|
||||||
|
|
||||||
/**
 * Collects the zero-based indices of all pages that contain no text and no
 * non-blank image.
 *
 * Uses PDFJS, OpenCV and whiteThreashold from the enclosing scope.
 *
 * @param {*} snapshot - PDF data handed to PDFJS.getDocument().
 * @returns {Promise<number[]>} Zero-based indices of the pages considered empty.
 */
async function findEmptyPages(snapshot) {
    const pdfDoc = await PDFJS.getDocument(snapshot).promise;

    const emptyPages = [];
    // PDF.js pages are requested 1-based; the result list is 0-based.
    for (let i = 1; i <= pdfDoc.numPages; i++) {
        const page = await pdfDoc.getPage(i);
        console.log("Checking page " + i);

        if (await hasText(page)) {
            console.log(`Found text on Page ${i}, page is not empty`);
            continue;
        }

        if (!await areImagesBlank(page, whiteThreashold)) {
            console.log(`Found non white image on Page ${i}, page is not empty`);
            continue;
        }

        console.log(`Page ${i} is empty.`);
        emptyPages.push(i - 1);
    }
    return emptyPages;
}

/**
 * Checks every image painted on the page, not just the first one.
 *
 * @param {object} page - PDF.js page.
 * @param {number} whiteThreashold - Threshold forwarded to isImageBlank().
 * @returns {Promise<boolean>} true when the page paints no image with pixel
 *     data that isImageBlank() rejects.
 */
async function areImagesBlank(page, whiteThreashold) {
    const ops = await page.getOperatorList();

    for (let j = 0; j < ops.fnArray.length; j++) {
        const fn = ops.fnArray[j];
        if (fn !== PDFJS.OPS.paintJpegXObject && fn !== PDFJS.OPS.paintImageXObject) {
            continue;
        }
        const image = page.objs.get(ops.argsArray[j][0]);
        // Images without pixel data cannot be analysed and are skipped.
        if (image && image.data && !await isImageBlank(image, whiteThreashold)) {
            return false;
        }
    }
    return true;
}

/**
 * @param {object} page - PDF.js page.
 * @returns {Promise<boolean>} true when the page has at least one text item.
 */
async function hasText(page) {
    const textContent = await page.getTextContent();
    return textContent.items.length > 0;
}

/**
 * Decides whether an image counts as blank by comparing the mean of its
 * grayscale version with the threshold.
 *
 * NOTE(review): the image is reported blank when the mean is at or BELOW the
 * threshold, although callers name the value "whiteThreashold" - confirm the
 * intended direction of the comparison.
 * NOTE(review): the pixel buffer is assumed to be 4 bytes per pixel (RGBA) -
 * confirm for images delivered in other layouts.
 *
 * @param {{width: number, height: number, data: ArrayLike<number>}} image
 * @param {number} threshold
 * @returns {Promise<boolean>}
 */
async function isImageBlank(image, threshold) {
    // cv.Mat expects (rows, cols, type), i.e. the height comes first.
    const src = new OpenCV.cv.Mat(image.height, image.width, OpenCV.cv.CV_8UC4);
    const gray = new OpenCV.cv.Mat();
    try {
        src.data.set(image.data);
        // Convert the image to grayscale
        OpenCV.cv.cvtColor(src, gray, OpenCV.cv.COLOR_RGBA2GRAY);

        // Calculate the mean value of the grayscale image
        const meanValue = OpenCV.cv.mean(gray);

        // Check if the mean value is below the threshold
        return meanValue[0] <= threshold;
    } finally {
        // Free memory even when one of the OpenCV calls throws.
        src.delete();
        gray.delete();
    }
}
|
|
||||||
};
|
};
|
16
public/functions/shared/createSubDocument.js
Normal file
16
public/functions/shared/createSubDocument.js
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
/**
 * Creates a new PDF that contains only the selected pages of an already loaded document.
 *
 * @param {object} pdfDoc - Loaded source document; must provide getPageCount().
 * @param {number[]} pagesToExtractArray - Zero-based page indices, copied in the given order.
 * @param {object} PDFLib - Library object exposing PDFDocument.create().
 * @returns {Promise<*>} Whatever subDocument.save() resolves to.
 * @throws {Error} If an index is negative, not a number, or beyond the last page.
 */
export async function createSubDocument(pdfDoc, pagesToExtractArray, PDFLib) {
    const pageCount = pdfDoc.getPageCount();

    // Validate before allocating the target document. A plain loop replaces
    // Math.max(...array) so that very long index lists cannot exceed the
    // engine's argument limit, and so the maximum is only computed once.
    let largestIndex = -1;
    for (const rawIndex of pagesToExtractArray) {
        const pageIndex = Number(rawIndex);
        if (Number.isNaN(pageIndex) || pageIndex < 0) {
            throw new Error(`Invalid page index "${rawIndex}": page indices must be numbers greater than or equal to 0`);
        }
        if (pageIndex > largestIndex) {
            largestIndex = pageIndex;
        }
    }

    // Check that array max number is not larger pdf pages number
    if (largestIndex >= pageCount) {
        throw new Error(`The PDF document only has ${pageCount} pages and you tried to extract page ${largestIndex}`);
    }

    const subDocument = await PDFLib.PDFDocument.create();
    const copiedPages = await subDocument.copyPages(pdfDoc, pagesToExtractArray);

    for (const copiedPage of copiedPages) {
        subDocument.addPage(copiedPage);
    }

    return subDocument.save();
}
|
64
public/functions/shared/detectEmptyPages.js
Normal file
64
public/functions/shared/detectEmptyPages.js
Normal file
@ -0,0 +1,64 @@
|
|||||||
|
/**
 * Finds the pages of a PDF that contain no text and no non-blank image.
 *
 * @param {*} snapshot - PDF data handed to PDFJS.getDocument().
 * @param {number} whiteThreashold - Threshold the mean gray value of each image is compared with.
 * @param {object} PDFJS - PDF.js object; getDocument() and OPS are used.
 * @param {object} OpenCV - Object exposing the OpenCV.js API as `cv`.
 * @returns {Promise<number[]>} Zero-based indices of the pages considered empty.
 */
export async function detectEmptyPages(snapshot, whiteThreashold, PDFJS, OpenCV) {
    const pdfDoc = await PDFJS.getDocument(snapshot).promise;

    const emptyPages = [];
    // PDF.js pages are requested 1-based; the result list is 0-based.
    for (let i = 1; i <= pdfDoc.numPages; i++) {
        const page = await pdfDoc.getPage(i);
        console.log("Checking page " + i);

        if (await hasText(page)) {
            console.log(`Found text on Page ${i}, page is not empty`);
            continue;
        }

        if (!await areImagesBlank(page, whiteThreashold)) {
            console.log(`Found non white image on Page ${i}, page is not empty`);
            continue;
        }

        console.log(`Page ${i} is empty.`);
        emptyPages.push(i - 1);
    }
    return emptyPages;

    /** Resolves to true when the page has at least one text item. */
    async function hasText(page) {
        const textContent = await page.getTextContent();
        return textContent.items.length > 0;
    }

    /**
     * Resolves to true when no painted image with pixel data is rejected by
     * isImageBlank(). Every image is inspected, not only the first one.
     */
    async function areImagesBlank(page, threshold) {
        const ops = await page.getOperatorList();

        for (let j = 0; j < ops.fnArray.length; j++) {
            const fn = ops.fnArray[j];
            if (fn !== PDFJS.OPS.paintJpegXObject && fn !== PDFJS.OPS.paintImageXObject) {
                continue;
            }
            const image = page.objs.get(ops.argsArray[j][0]);
            // Images without pixel data cannot be analysed and are skipped.
            if (image && image.data && !await isImageBlank(image, threshold)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Compares the mean of the grayscale image with the threshold.
     *
     * NOTE(review): the image is reported blank when the mean is at or BELOW
     * the threshold, although the value is named "whiteThreashold" - confirm
     * the intended direction of the comparison.
     * NOTE(review): the pixel buffer is assumed to be 4 bytes per pixel
     * (RGBA) - confirm for images delivered in other layouts.
     */
    async function isImageBlank(image, threshold) {
        // cv.Mat expects (rows, cols, type), i.e. the height comes first.
        const src = new OpenCV.cv.Mat(image.height, image.width, OpenCV.cv.CV_8UC4);
        const gray = new OpenCV.cv.Mat();
        try {
            src.data.set(image.data);
            // Convert the image to grayscale
            OpenCV.cv.cvtColor(src, gray, OpenCV.cv.COLOR_RGBA2GRAY);

            // Calculate the mean value of the grayscale image
            const meanValue = OpenCV.cv.mean(gray);

            // Check if the mean value is below the threshold
            return meanValue[0] <= threshold;
        } finally {
            // Free memory even when one of the OpenCV calls throws.
            src.delete();
            gray.delete();
        }
    }
}
|
48
public/functions/splitOn.js
Normal file
48
public/functions/splitOn.js
Normal file
@ -0,0 +1,48 @@
|
|||||||
|
import { detectEmptyPages } from "./shared/detectEmptyPages.js";
|
||||||
|
|
||||||
|
/**
 * @typedef {"BAR_CODE"|"QR_CODE"|"BLANK_PAGE"} SplitType
 */

/**
 * Determines the pages at which a PDF should be split, based on the given split type.
 *
 * Only "BLANK_PAGE" detection is implemented so far. The actual removal and
 * splitting is still a TODO, so the loaded document is currently saved again
 * without modification.
 *
 * @param {Uint8Array} snapshot - Raw PDF bytes (assumed; confirm against callers). Handed to
 *     detectEmptyPages() and PDFLib.PDFDocument.load().
 * @param {SplitType} type - Kind of separator page to look for.
 * @param {number} whiteThreashold - Threshold forwarded to detectEmptyPages().
 * @param {object} PDFJS - Forwarded to detectEmptyPages().
 * @param {object} OpenCV - Forwarded to detectEmptyPages().
 * @param {object} PDFLib - Library object exposing PDFDocument.load().
 * @param {object} QRCode - Currently unused.
 * @returns {Promise<*>} Whatever pdfDoc.save() resolves to.
 * @throws {Error} If the split type is not implemented yet or is not a valid SplitType.
 */
export async function splitOn(snapshot, type, whiteThreashold, PDFJS, OpenCV, PDFLib, QRCode) {

    let splitAtPages = [];

    switch (type) {
        case "BAR_CODE":
        case "QR_CODE":
            // TODO: Implement
            throw new Error("This split-type has not been implemented yet");

        case "BLANK_PAGE":
            splitAtPages = await detectEmptyPages(snapshot, whiteThreashold, PDFJS, OpenCV);
            break;

        default:
            throw new Error("An invalid split-type was provided.");
    }

    console.log("Split At Pages: ", splitAtPages);

    const pdfDoc = await PDFLib.PDFDocument.load(snapshot);

    // TODO: Remove detected Pages & Split

    return pdfDoc.save();
}
|
@ -1,4 +1,4 @@
|
|||||||
import { createSubDocument } from "./extractPages.js";
|
import { createSubDocument } from "./shared/createSubDocument.js";
|
||||||
|
|
||||||
export async function splitPDF(snapshot, splitAfterPageArray, PDFLib) {
|
export async function splitPDF(snapshot, splitAfterPageArray, PDFLib) {
|
||||||
const pdfDoc = await PDFLib.PDFDocument.load(snapshot)
|
const pdfDoc = await PDFLib.PDFDocument.load(snapshot)
|
||||||
|
Loading…
Reference in New Issue
Block a user