Started working on splitOn empty/qr-/barcode

2025-11-01 01:21:18 +01:00 · 2023-10-26 19:56:23 +02:00 · 2023-10-26 19:56:23 +02:00 · 4e8d8e3d53
commit 4e8d8e3d53
parent f78a64d545
6 changed files with 135 additions and 85 deletions
--- a/public/functions/extractPages.js
+++ b/public/functions/extractPages.js
@ -1,23 +1,8 @@
+import { createSubDocument } from "./shared/createSubDocument.js";
+
 export async function extractPages(snapshot, pagesToExtractArray, PDFLib) {
    const pdfDoc = await PDFLib.PDFDocument.load(snapshot)

    // TODO: invent a better format for pagesToExtractArray and convert it.
    return createSubDocument(pdfDoc, pagesToExtractArray, PDFLib);
-};
-
-export async function createSubDocument(pdfDoc, pagesToExtractArray, PDFLib) {
-    const subDocument = await PDFLib.PDFDocument.create();
-
-    // Check that array max number is not larger pdf pages number
-    if(Math.max(...pagesToExtractArray) >= pdfDoc.getPageCount()) {
-        throw new Error(`The PDF document only has ${pdfDoc.getPageCount()} pages and you tried to extract page ${Math.max(...pagesToExtractArray)}`);
-    }
-
-    const copiedPages = await subDocument.copyPages(pdfDoc, pagesToExtractArray);
-
-    for (let i = 0; i < copiedPages.length; i++) {
-        subDocument.addPage(copiedPages[i]);
-    }
-
-    return subDocument.save();
-}
+};
--- a/public/functions/removeBlankPages.js
+++ b/public/functions/removeBlankPages.js
@ -1,6 +1,8 @@
+import { detectEmptyPages } from "./shared/detectEmptyPages.js";
+
 export async function removeBlankPages(snapshot, whiteThreashold, PDFJS, OpenCV, PDFLib) {
    
-    const emptyPages = await findEmptyPages(snapshot);
+    const emptyPages = await detectEmptyPages(snapshot, whiteThreashold, PDFJS, OpenCV);

    console.log("Empty Pages: ", emptyPages);

@ -12,69 +14,4 @@ export async function removeBlankPages(snapshot, whiteThreashold, PDFJS, OpenCV,
    })

    return pdfDoc.save();
-
-    async function findEmptyPages(snapshot) {
-        const pdfDoc = await PDFJS.getDocument(snapshot).promise;
-
-        const emptyPages = [];
-        for (let i = 1; i <= pdfDoc.numPages; i++) {
-            const page = await pdfDoc.getPage(i);
-            console.log("Checking page " + i);
-    
-            if(!await hasText(page)) {
-                console.log(`Found text on Page ${i}, page is not empty`);
-                continue;
-            }
-    
-            if(!await areImagesBlank(page, whiteThreashold)) {
-                console.log(`Found non white image on Page ${i}, page is not empty`);
-                continue;
-            }
-    
-            console.log(`Page ${i} is empty.`);
-            emptyPages.push(i - 1);
-        }
-        return emptyPages;
-    }
-
-    async function areImagesBlank(page, whiteThreashold) {
-        const ops = await page.getOperatorList();
-    
-        for (var j=0; j < ops.fnArray.length; j++) {
-            if (ops.fnArray[j] == PDFJS.OPS.paintJpegXObject || ops.fnArray[j] == PDFJS.OPS.paintImageXObject) {
-                const image = page.objs.get(ops.argsArray[j][0]);
-                if(image.data) {
-                    return isImageBlank(image, whiteThreashold);
-                }
-            }
-        }
-        return true;
-    }
-    
-    async function hasText(page) {
-        const textContent = await page.getTextContent();
-        return textContent.items.length === 0;
-    }
-    
-    async function isImageBlank(image, threshold) {
-        const src = new OpenCV.cv.Mat(image.width, image.height, OpenCV.cv.CV_8UC4);
-        src.data.set(image.data);
-        // Convert the image to grayscale
-        const gray = new OpenCV.cv.Mat();
-        OpenCV.cv.cvtColor(src, gray, OpenCV.cv.COLOR_RGBA2GRAY);
-    
-        // Calculate the mean value of the grayscale image
-        const meanValue = OpenCV.cv.mean(gray);
-    
-        // Free memory
-        src.delete();
-        gray.delete();
-    
-        // Check if the mean value is below the threshold
-        if (meanValue[0] <= threshold) {
-            return true;
-        } else {
-            return false;
-        }
-    }
 };
--- a/public/functions/shared/createSubDocument.js
+++ b/public/functions/shared/createSubDocument.js
@ -0,0 +1,16 @@
+export async function createSubDocument(pdfDoc, pagesToExtractArray, PDFLib) { // Copies the listed pages of pdfDoc into a newly created document and resolves to the result of its save().
+    const subDocument = await PDFLib.PDFDocument.create();
+
+    // Reject requests whose largest entry is >= the page count, i.e. pagesToExtractArray holds 0-based page indices.
+    if(Math.max(...pagesToExtractArray) >= pdfDoc.getPageCount()) {
+        throw new Error(`The PDF document only has ${pdfDoc.getPageCount()} pages and you tried to extract page ${Math.max(...pagesToExtractArray)}`);
+    }
+    // NOTE(review): Math.max() of an empty array is -Infinity, so an empty pagesToExtractArray passes the check above; negative entries are not checked either.
+    const copiedPages = await subDocument.copyPages(pdfDoc, pagesToExtractArray);
+    // Append every copied page in the order copyPages returned it.
+    for (let i = 0; i < copiedPages.length; i++) {
+        subDocument.addPage(copiedPages[i]);
+    }
+    // presumably resolves to the serialized PDF bytes — confirm against the PDFLib documentation
+    return subDocument.save();
+}
--- a/public/functions/shared/detectEmptyPages.js
+++ b/public/functions/shared/detectEmptyPages.js
@ -0,0 +1,64 @@
+export async function detectEmptyPages(snapshot, whiteThreashold, PDFJS, OpenCV) { // Resolves to the 0-based indices of the pages judged empty: no text items and areImagesBlank() resolved to true.
+    const pdfDoc = await PDFJS.getDocument(snapshot).promise;
+
+    const emptyPages = [];
+    for (let i = 1; i <= pdfDoc.numPages; i++) { // getPage is called with page numbers 1..numPages
+        const page = await pdfDoc.getPage(i);
+        console.log("Checking page " + i);
+        // hasText() resolves to true when the page has NO text items (see its definition below), so the negation means text was found.
+        if(!await hasText(page)) {
+            console.log(`Found text on Page ${i}, page is not empty`);
+            continue;
+        }
+
+        if(!await areImagesBlank(page, whiteThreashold)) {
+            console.log(`Found non white image on Page ${i}, page is not empty`);
+            continue;
+        }
+
+        console.log(`Page ${i} is empty.`);
+        emptyPages.push(i - 1); // convert the 1-based loop counter to a 0-based index
+    }
+    return emptyPages;
+    // Despite its name, resolves to true when getTextContent() yields zero items, i.e. when the page has no text.
+    async function hasText(page) {
+        const textContent = await page.getTextContent();
+        return textContent.items.length === 0;
+    }
+    // Resolves to the isImageBlank() verdict of the first painted image whose object has .data; resolves to true when no such image exists.
+    async function areImagesBlank(page, threshold) {
+        const ops = await page.getOperatorList();
+    
+        for (var j=0; j < ops.fnArray.length; j++) {
+            if (ops.fnArray[j] == PDFJS.OPS.paintJpegXObject || ops.fnArray[j] == PDFJS.OPS.paintImageXObject) {
+                const image = page.objs.get(ops.argsArray[j][0]); // the first operator argument is used as the key into page.objs
+                if(image.data) {
+                    return isImageBlank(image, threshold); // returns from inside the loop, so later images are never examined
+                }
+            }
+        }
+        return true;
+    }
+    // Resolves to true when the mean of the grayscale version of the image is at or below threshold.
+    async function isImageBlank(image, threshold) {
+        const src = new OpenCV.cv.Mat(image.width, image.height, OpenCV.cv.CV_8UC4); // NOTE(review): the OpenCV Mat constructor is usually (rows, cols, type), so width and height may be swapped here — confirm
+        src.data.set(image.data); // assumes image.data is 4-channel RGBA matching CV_8UC4 — TODO confirm
+        // Convert the image to grayscale
+        const gray = new OpenCV.cv.Mat();
+        OpenCV.cv.cvtColor(src, gray, OpenCV.cv.COLOR_RGBA2GRAY);
+    
+        // Calculate the mean value of the grayscale image
+        const meanValue = OpenCV.cv.mean(gray);
+    
+        // Free memory
+        src.delete();
+        gray.delete();
+    
+        // Blank when the first mean component is at or below the threshold. NOTE(review): for a white threshold one might expect >= instead — confirm the intended direction.
+        if (meanValue[0] <= threshold) {
+            return true;
+        } else {
+            return false;
+        }
+    }
+}
--- a/public/functions/splitOn.js
+++ b/public/functions/splitOn.js
@ -0,0 +1,48 @@
+import { detectEmptyPages } from "./shared/detectEmptyPages"; // NOTE(review): the other imports in this commit end in .js — confirm this extensionless specifier resolves
+
+/**
+ * @typedef {"BAR_CODE"|"QR_CODE"|"BLANK_PAGE"} SplitType
+ */
+
+/** Work in progress: determines the pages at which a PDF should be split; the split itself is not implemented yet, so the loaded document is saved as-is.
+ * @param {Uint16Array} snapshot - PDF data handed to detectEmptyPages and PDFLib.PDFDocument.load. NOTE(review): confirm the typed-array type.
+ * @param {SplitType} type - Split criterion; only "BLANK_PAGE" is handled, "BAR_CODE" and "QR_CODE" throw, any other value throws as invalid.
+ * @param {number} whiteThreashold - Threshold forwarded to detectEmptyPages (presumably numeric — confirm).
+ * @param {*} PDFJS - Forwarded to detectEmptyPages.
+ * @param {*} OpenCV - Forwarded to detectEmptyPages.
+ * @param {*} PDFLib - Provides PDFDocument.load.
+ * @param {*} QRCode - Not referenced by the function body yet.
+ * @returns {Promise<*>} Resolves to the result of pdfDoc.save().
+ */
+export async function splitOn(snapshot, type, whiteThreashold, PDFJS, OpenCV, PDFLib, QRCode) {
+    
+    let splitAtPages = []; // only assigned in the BLANK_PAGE case, from detectEmptyPages
+
+    switch (type) {
+        case "BAR_CODE":
+            // TODO: Implement
+            throw new Error("This split-type has not been implemented yet")
+            break; // unreachable after the throw above
+
+        case "QR_CODE":
+            // TODO: Implement
+            throw new Error("This split-type has not been implemented yet")
+            break; // unreachable after the throw above
+
+        case "BLANK_PAGE":
+            splitAtPages = await detectEmptyPages(snapshot, whiteThreashold, PDFJS, OpenCV);
+            break;
+    
+        default:
+            throw new Error("An invalid split-type was provided.")
+            break; // unreachable after the throw above
+    }
+
+    console.log("Split At Pages: ", splitAtPages);
+
+    const pdfDoc = await PDFLib.PDFDocument.load(snapshot);
+
+    // TODO: Remove detected Pages & Split
+
+    return pdfDoc.save(); // splitAtPages is only logged so far; the loaded document is saved without being split
+};
--- a/public/functions/splitPDF.js
+++ b/public/functions/splitPDF.js
@ -1,4 +1,4 @@
-import { createSubDocument } from "./extractPages.js";
+import { createSubDocument } from "./shared/createSubDocument.js";

 export async function splitPDF(snapshot, splitAfterPageArray, PDFLib) {
    const pdfDoc = await PDFLib.PDFDocument.load(snapshot)