From f78a64d5455ac53a4621cd3c128ea5fbd27daea0 Mon Sep 17 00:00:00 2001 From: Felix Kaspar Date: Tue, 24 Oct 2023 19:31:14 +0200 Subject: [PATCH] Remove blank pages done, Updated README.md --- README.md | 2 +- public/functions/removeBlankPages.js | 52 +++++++++++++++++----------- 2 files changed, 33 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 6ad5dae6..47aa7aac 100644 --- a/README.md +++ b/README.md @@ -105,7 +105,7 @@ Current functions of spdf and their progress in this repo. | Status | Feature | Description | | ------ | ------------------ | ----------- | -| 🚧 | Remove Blank Pages | | +| ✔️ | Remove Blank Pages | | | 🚧 | Auto Split Pages | | | Status | Feature | Description | diff --git a/public/functions/removeBlankPages.js b/public/functions/removeBlankPages.js index a7a9c729..7e9fb939 100644 --- a/public/functions/removeBlankPages.js +++ b/public/functions/removeBlankPages.js @@ -1,30 +1,42 @@ export async function removeBlankPages(snapshot, whiteThreashold, PDFJS, OpenCV, PDFLib) { - const pdfDoc = await PDFJS.getDocument(snapshot).promise; + const emptyPages = await findEmptyPages(snapshot); - const emptyPages = []; - for (let i = 1; i <= pdfDoc.numPages; i++) { - const page = await pdfDoc.getPage(i); - console.log("Checking images"); + console.log("Empty Pages: ", emptyPages); - if(!await hasText(page)) { - console.log("Found text on Page, page is not empty"); - continue; + const pdfDoc = await PDFLib.PDFDocument.load(snapshot); + + // Reverse the array before looping in order to keep the indecies at the right pages. E.g. if you delete page 5 page 7 becomes page 6, if you delete page 7 page 5 remains page 5 + emptyPages.reverse().forEach(pageIndex => { + pdfDoc.removePage(pageIndex); + }) + + return pdfDoc.save(); + + async function findEmptyPages(snapshot) { + const pdfDoc = await PDFJS.getDocument(snapshot).promise; + + const emptyPages = []; + for (let i = 1; i <= pdfDoc.numPages; i++) { + const page = await pdfDoc.getPage(i); + console.log("Checking page " + i); + + if(!await hasText(page)) { + console.log(`Found text on Page ${i}, page is not empty`); + continue; + } + + if(!await areImagesBlank(page, whiteThreashold)) { + console.log(`Found non white image on Page ${i}, page is not empty`); + continue; + } + + console.log(`Page ${i} is empty.`); + emptyPages.push(i - 1); } - - if(!await areImagesBlank(page, whiteThreashold)) { - console.log("Found image on Page, page is not empty"); - continue; - } - - emptyPages.push[i]; + return emptyPages; } - console.log(emptyPages); - - // TODO: Remove emptyPages using pdflib - // return pdf; - async function areImagesBlank(page, whiteThreashold) { const ops = await page.getOperatorList();