Started working on splitOn empty/qr-/barcode

2025-11-01 01:21:18 +01:00 · 2023-10-26 19:56:23 +02:00 · 2023-10-26 19:56:23 +02:00 · 4e8d8e3d53
commit 4e8d8e3d53
parent f78a64d545
6 changed files with 135 additions and 85 deletions
--- a/public/functions/extractPages.js
+++ b/public/functions/extractPages.js
@ -1,23 +1,8 @@
 import { createSubDocument } from "./shared/createSubDocument.js";
 /**
  * Builds a new PDF that contains only the requested pages of the input PDF.
  *
  * @param {*} snapshot - Serialized PDF, passed unchanged to `PDFLib.PDFDocument.load`.
  * @param {number[]} pagesToExtractArray - Page selection, handed on unchanged to `createSubDocument`.
  * @param {*} PDFLib - Injected pdf-lib module; `PDFDocument.load` is used here and the module is passed on.
  * @returns {Promise<*>} Whatever `createSubDocument` resolves to.
  */
 export async function extractPages(snapshot, pagesToExtractArray, PDFLib) {
    // TODO: invent a better format for pagesToExtractArray and convert it.
    const sourceDocument = await PDFLib.PDFDocument.load(snapshot);
    const extracted = await createSubDocument(sourceDocument, pagesToExtractArray, PDFLib);
    return extracted;
 }
 /**
  * Creates a new PDF made up of the selected pages of `pdfDoc`, in the order given.
  *
  * All indices are validated before the new document is created, so an invalid
  * request fails with a descriptive error instead of failing inside `copyPages`.
  *
  * @param {*} pdfDoc - Loaded source document; must provide `getPageCount()` and be accepted by `copyPages`.
  * @param {number[]} pagesToExtractArray - Zero-based indices of the pages to copy. An empty array yields a document without pages.
  * @param {*} PDFLib - Injected pdf-lib module (only `PDFDocument.create` is used).
  * @returns {Promise<*>} The result of `save()` on the newly created document.
  * @throws {Error} If the highest index is not smaller than the page count, or if any index is not a non-negative integer.
  */
 export async function createSubDocument(pdfDoc, pagesToExtractArray, PDFLib) {
    const pageCount = pdfDoc.getPageCount();

    // Find the highest requested index without spreading the array into Math.max,
    // which would hit the argument-count limit for very large selections.
    let highestIndex = -Infinity;
    for (const pageIndex of pagesToExtractArray) {
        if (pageIndex > highestIndex) {
            highestIndex = pageIndex;
        }
    }
    if (highestIndex >= pageCount) {
        throw new Error(`The PDF document only has ${pageCount} pages and you tried to extract page ${highestIndex}`);
    }

    // The upper-bound check alone lets negative, fractional and NaN indices through.
    for (const pageIndex of pagesToExtractArray) {
        if (!Number.isInteger(pageIndex) || pageIndex < 0) {
            throw new Error(`Invalid page index ${String(pageIndex)}: expected a non-negative integer`);
        }
    }

    const subDocument = await PDFLib.PDFDocument.create();
    const copiedPages = await subDocument.copyPages(pdfDoc, pagesToExtractArray);
    for (const copiedPage of copiedPages) {
        subDocument.addPage(copiedPage);
    }

    return subDocument.save();
 }
--- a/public/functions/removeBlankPages.js
+++ b/public/functions/removeBlankPages.js
@ -1,6 +1,8 @@
 import { detectEmptyPages } from "./shared/detectEmptyPages.js";
 export async function removeBlankPages(snapshot, whiteThreashold, PDFJS, OpenCV, PDFLib) {
-    const emptyPages = await findEmptyPages(snapshot);
+    const emptyPages = await detectEmptyPages(snapshot, whiteThreashold, PDFJS, OpenCV);
    console.log("Empty Pages: ", emptyPages);
@ -12,69 +14,4 @@ export async function removeBlankPages(snapshot, whiteThreashold, PDFJS, OpenCV,
    })
    return pdfDoc.save();
    /**
     * Collects the zero-based indices of all pages that are considered empty.
     *
     * A page is recorded when `hasText` resolves truthy for it (in this file that
     * helper resolves to `true` when the page has NO text items) and
     * `areImagesBlank` resolves truthy as well.
     *
     * @param {*} snapshot - PDF data handed to `PDFJS.getDocument`.
     * @returns {Promise<number[]>} Zero-based indices of the empty pages, in ascending order.
     */
    async function findEmptyPages(snapshot) {
        const loadedPdf = await PDFJS.getDocument(snapshot).promise;
        const blankPageIndices = [];

        // Pages are requested by number 1..numPages; results are stored zero-based.
        let pageNumber = 1;
        while (pageNumber <= loadedPdf.numPages) {
            const currentPage = await loadedPdf.getPage(pageNumber);
            console.log("Checking page " + pageNumber);

            if (!(await hasText(currentPage))) {
                console.log(`Found text on Page ${pageNumber}, page is not empty`);
            } else if (!(await areImagesBlank(currentPage, whiteThreashold))) {
                console.log(`Found non white image on Page ${pageNumber}, page is not empty`);
            } else {
                console.log(`Page ${pageNumber} is empty.`);
                blankPageIndices.push(pageNumber - 1);
            }

            pageNumber += 1;
        }

        return blankPageIndices;
    }
    /**
     * Checks whether every image painted on the page is blank.
     *
     * Walks the page's operator list and passes each painted image object that
     * carries pixel data to `isImageBlank`. Previously only the first such image
     * was inspected, so a blank image followed by a non-blank one made the whole
     * page count as blank.
     *
     * @param {*} page - PDF.js page; `getOperatorList()` and `objs.get()` are used.
     * @param {number} whiteThreashold - Threshold forwarded to `isImageBlank`.
     * @returns {Promise<boolean>} `false` as soon as one image is not blank; `true` if all
     *     inspected images are blank or the page paints no image with pixel data.
     */
    async function areImagesBlank(page, whiteThreashold) {
        const ops = await page.getOperatorList();

        for (let j = 0; j < ops.fnArray.length; j++) {
            const operator = ops.fnArray[j];
            const paintsImage = operator === PDFJS.OPS.paintJpegXObject || operator === PDFJS.OPS.paintImageXObject;
            if (!paintsImage) {
                continue;
            }

            // The first operator argument is the name under which the image object is stored.
            const image = page.objs.get(ops.argsArray[j][0]);

            // Image objects without a pixel buffer cannot be analysed and are skipped.
            if (image.data && !(await isImageBlank(image, whiteThreashold))) {
                return false;
            }
        }

        return true;
    }
    /**
     * Reports whether the page is free of text items.
     *
     * NOTE: despite its name, this resolves to `true` when the page has NO text
     * items and to `false` when it has some. The caller in this file negates the
     * result before logging "Found text", so the combined behaviour is correct.
     *
     * @param {*} page - PDF.js page; only `getTextContent()` is used.
     * @returns {Promise<boolean>} `true` if `getTextContent()` yields zero items.
     */
    async function hasText(page) {
        const { items } = await page.getTextContent();
        const itemCount = items.length;
        return itemCount === 0;
    }
    /**
     * Classifies an image as blank by comparing its mean gray value with a threshold.
     *
     * The pixel buffer is copied into a 4-channel 8-bit matrix and converted with
     * `COLOR_RGBA2GRAY`, so `image.data` is assumed to be RGBA with
     * width * height * 4 bytes — TODO confirm for all image kinds PDF.js delivers.
     *
     * NOTE(review): the image counts as blank when the mean is at or BELOW the
     * threshold. Callers name this value `whiteThreashold`; confirm that this
     * comparison direction is the intended one.
     *
     * @param {{width: number, height: number, data: ArrayLike<number>}} image - Decoded image object.
     * @param {number} threshold - Upper bound (inclusive) for the mean gray value.
     * @returns {Promise<boolean>} `true` if the mean gray value is `<= threshold`.
     */
    async function isImageBlank(image, threshold) {
        // cv.Mat takes (rows, cols, type): rows is the image height, cols its width.
        const src = new OpenCV.cv.Mat(image.height, image.width, OpenCV.cv.CV_8UC4);
        let gray;
        try {
            src.data.set(image.data);

            // Convert the image to grayscale
            gray = new OpenCV.cv.Mat();
            OpenCV.cv.cvtColor(src, gray, OpenCV.cv.COLOR_RGBA2GRAY);

            // Mean of the single gray channel is the first entry of the result
            const meanValue = OpenCV.cv.mean(gray);
            return meanValue[0] <= threshold;
        } finally {
            // Mats must be released explicitly, also when a step above throws.
            src.delete();
            if (gray !== undefined) {
                gray.delete();
            }
        }
    }
 };
--- a/public/functions/shared/createSubDocument.js
+++ b/public/functions/shared/createSubDocument.js
@ -0,0 +1,16 @@
 /**
  * Creates a new PDF made up of the selected pages of `pdfDoc`, in the order given.
  *
  * All indices are validated before the new document is created, so an invalid
  * request fails with a descriptive error instead of failing inside `copyPages`.
  *
  * @param {*} pdfDoc - Loaded source document; must provide `getPageCount()` and be accepted by `copyPages`.
  * @param {number[]} pagesToExtractArray - Zero-based indices of the pages to copy. An empty array yields a document without pages.
  * @param {*} PDFLib - Injected pdf-lib module (only `PDFDocument.create` is used).
  * @returns {Promise<*>} The result of `save()` on the newly created document.
  * @throws {Error} If the highest index is not smaller than the page count, or if any index is not a non-negative integer.
  */
 export async function createSubDocument(pdfDoc, pagesToExtractArray, PDFLib) {
    const pageCount = pdfDoc.getPageCount();

    // Find the highest requested index without spreading the array into Math.max,
    // which would hit the argument-count limit for very large selections.
    let highestIndex = -Infinity;
    for (const pageIndex of pagesToExtractArray) {
        if (pageIndex > highestIndex) {
            highestIndex = pageIndex;
        }
    }
    if (highestIndex >= pageCount) {
        throw new Error(`The PDF document only has ${pageCount} pages and you tried to extract page ${highestIndex}`);
    }

    // The upper-bound check alone lets negative, fractional and NaN indices through.
    for (const pageIndex of pagesToExtractArray) {
        if (!Number.isInteger(pageIndex) || pageIndex < 0) {
            throw new Error(`Invalid page index ${String(pageIndex)}: expected a non-negative integer`);
        }
    }

    const subDocument = await PDFLib.PDFDocument.create();
    const copiedPages = await subDocument.copyPages(pdfDoc, pagesToExtractArray);
    for (const copiedPage of copiedPages) {
        subDocument.addPage(copiedPage);
    }

    return subDocument.save();
 }
--- a/public/functions/shared/detectEmptyPages.js
+++ b/public/functions/shared/detectEmptyPages.js
@ -0,0 +1,64 @@
 /**
  * Finds the pages of a PDF that are considered empty: pages without any text
  * items whose painted images (if any) are all classified as blank.
  *
  * @param {*} snapshot - PDF data, passed unchanged to `PDFJS.getDocument`.
  * @param {number} whiteThreashold - Inclusive upper bound for the mean gray value of an
  *     image to be classified as blank. NOTE(review): given the name, confirm that
  *     "mean <= threshold" is the intended comparison direction.
  * @param {*} PDFJS - Injected PDF.js module (`getDocument` and `OPS` are used).
  * @param {*} OpenCV - Injected OpenCV wrapper exposing the OpenCV.js API under `cv`.
  * @returns {Promise<number[]>} Zero-based indices of the empty pages, in ascending order.
  */
 export async function detectEmptyPages(snapshot, whiteThreashold, PDFJS, OpenCV) {
    const pdfDoc = await PDFJS.getDocument(snapshot).promise;

    const emptyPages = [];
    // Pages are requested by number 1..numPages; results are stored zero-based.
    for (let i = 1; i <= pdfDoc.numPages; i++) {
        const page = await pdfDoc.getPage(i);
        console.log("Checking page " + i);

        if (await hasText(page)) {
            console.log(`Found text on Page ${i}, page is not empty`);
            continue;
        }

        if (!(await areImagesBlank(page, whiteThreashold))) {
            console.log(`Found non white image on Page ${i}, page is not empty`);
            continue;
        }

        console.log(`Page ${i} is empty.`);
        emptyPages.push(i - 1);
    }
    return emptyPages;

    // The helpers below are hoisted function declarations that close over PDFJS and OpenCV.

    /** Resolves to `true` if the page has at least one text item. */
    async function hasText(page) {
        const textContent = await page.getTextContent();
        return textContent.items.length !== 0;
    }

    /**
     * Resolves to `true` only if EVERY painted image with pixel data is blank
     * (or there is none). Inspecting just the first image would let a blank image
     * hide a non-blank one later on the page.
     */
    async function areImagesBlank(page, threshold) {
        const ops = await page.getOperatorList();

        for (let j = 0; j < ops.fnArray.length; j++) {
            const operator = ops.fnArray[j];
            const paintsImage = operator === PDFJS.OPS.paintJpegXObject || operator === PDFJS.OPS.paintImageXObject;
            if (!paintsImage) {
                continue;
            }

            // The first operator argument is the name under which the image object is stored.
            const image = page.objs.get(ops.argsArray[j][0]);

            // Image objects without a pixel buffer cannot be analysed and are skipped.
            if (image.data && !(await isImageBlank(image, threshold))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Resolves to `true` if the mean gray value of the image is `<= threshold`.
     * Assumes `image.data` is RGBA (width * height * 4 bytes) — TODO confirm for
     * all image kinds PDF.js delivers.
     */
    async function isImageBlank(image, threshold) {
        // cv.Mat takes (rows, cols, type): rows is the image height, cols its width.
        const src = new OpenCV.cv.Mat(image.height, image.width, OpenCV.cv.CV_8UC4);
        let gray;
        try {
            src.data.set(image.data);

            // Convert the image to grayscale
            gray = new OpenCV.cv.Mat();
            OpenCV.cv.cvtColor(src, gray, OpenCV.cv.COLOR_RGBA2GRAY);

            // Mean of the single gray channel is the first entry of the result
            const meanValue = OpenCV.cv.mean(gray);
            return meanValue[0] <= threshold;
        } finally {
            // Mats must be released explicitly, also when a step above throws.
            src.delete();
            if (gray !== undefined) {
                gray.delete();
            }
        }
    }
 }
--- a/public/functions/splitOn.js
+++ b/public/functions/splitOn.js
@ -0,0 +1,48 @@
 import { detectEmptyPages } from "./shared/detectEmptyPages";
 /**
  * @typedef {"BAR_CODE"|"QR_CODE"|"BLANK_PAGE"} SplitType
  */

 /**
  * Determines the pages at which a PDF should be split.
  *
  * Work in progress: only "BLANK_PAGE" detection is implemented, and the actual
  * removal/splitting is still a TODO, so the document is currently loaded and
  * saved again without modification.
  *
  * @param {Uint16Array} snapshot - PDF data, passed to `detectEmptyPages` and `PDFLib.PDFDocument.load`.
  *     NOTE(review): pdf-lib documents `string | Uint8Array | ArrayBuffer` for `load` — confirm the declared type.
  * @param {SplitType} type - Kind of separator page to split on.
  * @param {number} whiteThreashold - Threshold forwarded to `detectEmptyPages` (used for "BLANK_PAGE" only).
  * @param {*} PDFJS - Injected PDF.js module, forwarded to `detectEmptyPages`.
  * @param {*} OpenCV - Injected OpenCV wrapper, forwarded to `detectEmptyPages`.
  * @param {*} PDFLib - Injected pdf-lib module (only `PDFDocument.load` is used).
  * @param {*} QRCode - Currently unused; reserved for the QR-code split type.
  * @returns {Promise<*>} The result of `save()` on the reloaded document.
  * @throws {Error} If `type` is "BAR_CODE" or "QR_CODE" (not implemented yet) or not a valid split type.
  */
 export async function splitOn(snapshot, type, whiteThreashold, PDFJS, OpenCV, PDFLib, QRCode) {
    let splitAtPages = [];

    // No `break` after a `throw`: it would be unreachable.
    switch (type) {
        case "BAR_CODE":
            // TODO: Implement
            throw new Error("This split-type has not been implemented yet");
        case "QR_CODE":
            // TODO: Implement
            throw new Error("This split-type has not been implemented yet");
        case "BLANK_PAGE":
            splitAtPages = await detectEmptyPages(snapshot, whiteThreashold, PDFJS, OpenCV);
            break;
        default:
            throw new Error("An invalid split-type was provided.");
    }

    console.log("Split At Pages: ", splitAtPages);

    const pdfDoc = await PDFLib.PDFDocument.load(snapshot);

    // TODO: Remove detected Pages & Split

    return pdfDoc.save();
 }
--- a/public/functions/splitPDF.js
+++ b/public/functions/splitPDF.js
@ -1,4 +1,4 @@
-import { createSubDocument } from "./extractPages.js";
+import { createSubDocument } from "./shared/createSubDocument.js";
 export async function splitPDF(snapshot, splitAfterPageArray, PDFLib) {
    const pdfDoc = await PDFLib.PDFDocument.load(snapshot)