diff --git a/functions.js b/functions.js index 5fc7f486a..db99a76f7 100644 --- a/functions.js +++ b/functions.js @@ -1,4 +1,6 @@ import PDFLib from 'pdf-lib'; +import OpenCV from 'opencv-wasm'; +import PDFJS from "pdfjs-dist"; import * as pdfcpuWraopper from "./public/wasm/pdfcpu-wrapper-node.js"; import { extractPages as dependantExtractPages } from "./public/functions/extractPages.js"; @@ -10,6 +12,7 @@ import { scalePage as dependantScalePage } from './public/functions/scalePage.js import { splitPDF as dependantSplitPDF } from './public/functions/splitPDF.js'; import { editMetadata as dependantEditMetadata } from './public/functions/editMetadata.js'; import { organizePages as dependantOrganizePages } from './public/functions/organizePages.js'; +import { removeBlankPages as dependantRemoveBlankPages} from './public/functions/removeBlankPages.js'; export async function extractPages(snapshot, pagesToExtractArray) { return dependantExtractPages(snapshot, pagesToExtractArray, PDFLib); @@ -45,4 +48,8 @@ export async function editMetadata(snapshot, metadata) { export async function organizePages(snapshot, operation, customOrderString) { return dependantOrganizePages(snapshot, operation, customOrderString, PDFLib); +} + +export async function removeBlankPages(snapshot, whiteThreashold) { + return dependantRemoveBlankPages(snapshot, whiteThreashold, PDFJS, OpenCV); } \ No newline at end of file diff --git a/package-lock.json b/package-lock.json index cee0c2a9b..57c6dceba 100644 --- a/package-lock.json +++ b/package-lock.json @@ -39,6 +39,22 @@ "negotiator": "0.6.3" } }, + "ajv": { + "version": "6.12.6", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz", + "integrity": "sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g==", + "requires": { + "fast-deep-equal": "^3.1.1", + "fast-json-stable-stringify": "^2.0.0", + "json-schema-traverse": "^0.4.1", + "uri-js": "^4.2.2" + } + }, + "ajv-keywords": { + "version": 
"3.5.2", + "resolved": "https://registry.npmjs.org/ajv-keywords/-/ajv-keywords-3.5.2.tgz", + "integrity": "sha512-5p6WTN0DdTGVQk6VjcEju19IgaHudalcfabD7yhDGeA6bcQnmL+CpveLJq/3hvfwd1aof6L386Ougkx6RfyMIQ==" + }, "archiver": { "version": "6.0.1", "resolved": "https://registry.npmjs.org/archiver/-/archiver-6.0.1.tgz", @@ -103,6 +119,11 @@ "resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz", "integrity": "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==" }, + "big.js": { + "version": "5.2.2", + "resolved": "https://registry.npmjs.org/big.js/-/big.js-5.2.2.tgz", + "integrity": "sha512-vyL2OymJxmarO8gxMr0mhChsO9QGwhynfuu4+MHTAW6czfq9humCB7rKpUjDd9YUiDPU4mzpyupFSvOClAwbmQ==" + }, "bl": { "version": "4.1.0", "resolved": "https://registry.npmjs.org/bl/-/bl-4.1.0.tgz", @@ -252,6 +273,11 @@ "resolved": "https://registry.npmjs.org/ee-first/-/ee-first-1.1.1.tgz", "integrity": "sha512-WMwm9LhRUo+WUaRN+vRuETqG89IgZphVSNkdFgeb6sS/E4OrDIN7t48CAewSHXc6C8lefD8KKfr5vY61brQlow==" }, + "emojis-list": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/emojis-list/-/emojis-list-3.0.0.tgz", + "integrity": "sha512-/kyM18EfinwXZbno9FyUGeFh87KC8HRQBQGildHZbEuRyWFOmv1U10o9BBp8XVZDVNNuQKyIGIu5ZYAAXJ0V2Q==" + }, "encodeurl": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-1.0.2.tgz", @@ -321,6 +347,11 @@ "busboy": "^1.6.0" } }, + "fast-deep-equal": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-3.1.3.tgz", + "integrity": "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q==" + }, "fast-extend": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/fast-extend/-/fast-extend-1.0.2.tgz", @@ -331,6 +362,11 @@ "resolved": "https://registry.npmjs.org/fast-fifo/-/fast-fifo-1.3.2.tgz", "integrity": 
"sha512-/d9sfos4yxzpwkDkuN7k2SqFKtYNmCTzgfEpz82x34IM9/zc8KGxQoXg1liNC/izpRM/MBdt44Nmx41ZWqk+FQ==" }, + "fast-json-stable-stringify": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/fast-json-stable-stringify/-/fast-json-stable-stringify-2.1.0.tgz", + "integrity": "sha512-lhd/wF+Lk98HZoTCtlVraHtfh5XYijIjalXck7saUtuanSDyLMxnHhSXEDJqHxD7msR8D0uCmqlkwjCV8xvwHw==" + }, "finalhandler": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-1.2.0.tgz", @@ -467,6 +503,19 @@ "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", "integrity": "sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==" }, + "json-schema-traverse": { + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz", + "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==" + }, + "json5": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/json5/-/json5-1.0.2.tgz", + "integrity": "sha512-g1MWMLBiz8FKi1e4w0UyVL3w+iJceWAFBAaBnnGKOpNa5f8TLktkbre1+s6oICydWAm+HRUGTmI+//xv2hvXYA==", + "requires": { + "minimist": "^1.2.0" + } + }, "lazystream": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/lazystream/-/lazystream-1.0.1.tgz", @@ -504,6 +553,16 @@ } } }, + "loader-utils": { + "version": "1.4.2", + "resolved": "https://registry.npmjs.org/loader-utils/-/loader-utils-1.4.2.tgz", + "integrity": "sha512-I5d00Pd/jwMD2QCduo657+YM/6L3KZu++pmX9VFncxaxvHcru9jx1lBaFft+r4Mt2jK0Yhp41XlRAihzPxHNCg==", + "requires": { + "big.js": "^5.2.2", + "emojis-list": "^3.0.0", + "json5": "^1.0.1" + } + }, "lodash": { "version": "4.17.21", "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz", @@ -559,6 +618,11 @@ "brace-expansion": "^2.0.1" } }, + "minimist": { + "version": "1.2.8", + "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz", + "integrity": 
"sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==" + }, "ms": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", @@ -569,6 +633,11 @@ "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.3.tgz", "integrity": "sha512-+EUsqGPLsM+j/zdChZjsnX51g4XrHFOIXwfnCVPGlQk/k5giakcKsuxCObBRu6DSm9opw/O6slWbJdghQM4bBg==" }, + "node-ensure": { + "version": "0.0.0", + "resolved": "https://registry.npmjs.org/node-ensure/-/node-ensure-0.0.0.tgz", + "integrity": "sha512-DRI60hzo2oKN1ma0ckc6nQWlHU69RH6xN0sjQTjMpChPfTYvKZdcQFfdYK2RWbJcKyUizSIy/l8OTGxMAM1QDw==" + }, "normalize-path": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/normalize-path/-/normalize-path-3.0.0.tgz", @@ -595,6 +664,11 @@ "wrappy": "1" } }, + "opencv-wasm": { + "version": "4.3.0-10", + "resolved": "https://registry.npmjs.org/opencv-wasm/-/opencv-wasm-4.3.0-10.tgz", + "integrity": "sha512-EWmWLUzp2suoc6N44Y4ouWT85QwvShx23Q430R+lp6NyS828bjQn6mCgA3NJ6Z/S59aaTeeu+RhqPQIJIYld1w==" + }, "pako": { "version": "1.0.11", "resolved": "https://registry.npmjs.org/pako/-/pako-1.0.11.tgz", @@ -621,6 +695,15 @@ "tslib": "^1.11.1" } }, + "pdfjs-dist": { + "version": "2.0.943", + "resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-2.0.943.tgz", + "integrity": "sha512-iLhNcm4XceTHRaSU5o22ZGCm4YpuW5+rf4+BJFH/feBhMQLbCGBry+Jet8Q419QDI4qgARaIQzXuiNrsNWS8Yw==", + "requires": { + "node-ensure": "^0.0.0", + "worker-loader": "^2.0.0" + } + }, "process-nextick-args": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.1.tgz", @@ -635,6 +718,11 @@ "ipaddr.js": "1.9.1" } }, + "punycode": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.0.tgz", + "integrity": "sha512-rRV+zQD8tVFys26lAGR9WUuS4iUAngJScM+ZRSKtvl5tKeZ2t5bvdNFdNHBW9FWR4guGHlgmsZ1G7BSm2wTbuA==" + }, "qs": { "version": "6.11.0", "resolved": 
"https://registry.npmjs.org/qs/-/qs-6.11.0.tgz", @@ -692,6 +780,15 @@ "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==" }, + "schema-utils": { + "version": "0.4.7", + "resolved": "https://registry.npmjs.org/schema-utils/-/schema-utils-0.4.7.tgz", + "integrity": "sha512-v/iwU6wvwGK8HbU9yi3/nhGzP0yGSuhQMzL6ySiec1FSrZZDkhm4noOSWzrNFo/jEc+SJY6jRTwuwbSXJPDUnQ==", + "requires": { + "ajv": "^6.1.0", + "ajv-keywords": "^3.1.0" + } + }, "send": { "version": "0.18.0", "resolved": "https://registry.npmjs.org/send/-/send-0.18.0.tgz", @@ -808,6 +905,14 @@ "resolved": "https://registry.npmjs.org/unpipe/-/unpipe-1.0.0.tgz", "integrity": "sha512-pjy2bYhSsufwWlKwPc+l3cN7+wuJlK6uz0YdJEOlQDbl6jo/YlPi4mb8agUkVC8BF7V8NuzeyPNqRksA3hztKQ==" }, + "uri-js": { + "version": "4.4.1", + "resolved": "https://registry.npmjs.org/uri-js/-/uri-js-4.4.1.tgz", + "integrity": "sha512-7rKUyy33Q1yc98pQ1DAmLtwX109F7TIfWlW1Ydo8Wl1ii1SeHieeh0HHfPeL2fMXK6z0s8ecKs9frCuLJvndBg==", + "requires": { + "punycode": "^2.1.0" + } + }, "util-deprecate": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", @@ -823,6 +928,15 @@ "resolved": "https://registry.npmjs.org/vary/-/vary-1.1.2.tgz", "integrity": "sha512-BNGbWLfd0eUPabhkXUVm0j8uuvREyTh5ovRa/dyow/BqAbZJyC+5fU+IzQOzmAKzYqYRAISoRhdQr3eIZ/PXqg==" }, + "worker-loader": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/worker-loader/-/worker-loader-2.0.0.tgz", + "integrity": "sha512-tnvNp4K3KQOpfRnD20m8xltE3eWh89Ye+5oj7wXEEHKac1P4oZ6p9oTj8/8ExqoSBnk9nu5Pr4nKfQ1hn2APJw==", + "requires": { + "loader-utils": "^1.0.0", + "schema-utils": "^0.4.0" + } + }, "wrappy": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", diff --git a/package.json b/package.json index 0c554ac6a..a2f2e75e6 100644 --- a/package.json +++ b/package.json @@ 
/**
 * Scans a PDF for blank pages.
 *
 * A page counts as blank when it has no text items and every painted image on
 * it is classified as blank by `isImageBlank`. The 1-based numbers of the
 * blank pages are collected and logged.
 *
 * NOTE: removing the pages is not implemented yet (see TODO below), so this
 * function currently resolves to `undefined`.
 *
 * @param {*} snapshot - PDF source, handed unchanged to `PDFJS.getDocument`.
 * @param {number} whiteThreashold - Limit compared against the mean grayscale
 *   value of each image (`mean <= whiteThreashold` is treated as blank).
 * @param {object} PDFJS - Library object providing `getDocument` and `OPS`.
 * @param {object} OpenCV - Library object exposing the OpenCV API under `.cv`.
 * @returns {Promise<undefined>} Nothing yet; see the note above.
 */
export async function removeBlankPages(snapshot, whiteThreashold, PDFJS, OpenCV) {

    const pdfDoc = await PDFJS.getDocument(snapshot).promise;

    const emptyPages = [];
    // Pages are requested by their 1-based page number.
    for (let i = 1; i <= pdfDoc.numPages; i++) {
        const page = await pdfDoc.getPage(i);

        if (await hasText(page)) {
            console.log("Found text on Page, page is not empty");
            continue;
        }

        if (!await areImagesBlank(page, whiteThreashold)) {
            console.log("Found image on Page, page is not empty");
            continue;
        }

        // Must be a call: `push[i]` only reads a property and records nothing.
        emptyPages.push(i);
    }

    console.log(emptyPages);

    // TODO: Remove emptyPages using pdflib
    // return pdf;

    /**
     * Checks every image painted on the page.
     * Resolves to false as soon as one image with pixel data is not blank,
     * and to true when no such image exists (including pages without images).
     */
    async function areImagesBlank(page, whiteThreashold) {
        const ops = await page.getOperatorList();

        for (let j = 0; j < ops.fnArray.length; j++) {
            const fn = ops.fnArray[j];
            if (fn !== PDFJS.OPS.paintJpegXObject && fn !== PDFJS.OPS.paintImageXObject) {
                continue;
            }

            // The first operator argument is the key of the image object.
            const image = page.objs.get(ops.argsArray[j][0]);
            // Keep scanning after a blank image: a later image may have content.
            if (image && image.data && !await isImageBlank(image, whiteThreashold)) {
                return false;
            }
        }
        return true;
    }

    /** Resolves to true when the page contains at least one text item. */
    async function hasText(page) {
        const textContent = await page.getTextContent();
        return textContent.items.length > 0;
    }

    /**
     * Classifies one image by the mean of its grayscale conversion.
     * Resolves to true when that mean is less than or equal to `threshold`.
     */
    async function isImageBlank(image, threshold) {
        // cv.Mat takes (rows, cols, type): rows is the image height, cols its width.
        const src = new OpenCV.cv.Mat(image.height, image.width, OpenCV.cv.CV_8UC4);
        const gray = new OpenCV.cv.Mat();
        try {
            // NOTE(review): assumes image.data holds 4 bytes per pixel (RGBA) - confirm.
            src.data.set(image.data);
            // Convert the image to grayscale
            OpenCV.cv.cvtColor(src, gray, OpenCV.cv.COLOR_RGBA2GRAY);

            // Calculate the mean value of the grayscale image
            const meanValue = OpenCV.cv.mean(gray);

            // NOTE(review): a white image has a HIGH mean, so `<=` looks inverted
            // for a "white" threshold - confirm the intended direction.
            return meanValue[0] <= threshold;
        } finally {
            // Free the Mats even when the conversion throws.
            src.delete();
            gray.delete();
        }
    }
}
+ yield* nToN(input, operation, async (input) => { + input.fileName += "_removedBlanks"; + input.buffer = await Functions.removeBlankPages(input.buffer, operation.values["whiteThreashold"]); + }); + break; default: throw new Error(`${operation.type} not implemented yet.`); break;