import { DocumentInitParameters, PDFPageProxy } from "pdfjs-dist/types/src/display/api.js";
import PDFJS from 'pdfjs-dist';
import { Image } from 'image-js';
import { getImagesOnPage } from "./getImagesOnPage.js";

/**
 * Scans every page of a PDF and reports the pages considered empty.
 *
 * A page is treated as empty when it has no text content items and every
 * image found on it is classified as blank by {@link isImageBlank}.
 *
 * @param snapshot - PDF source handed straight to `PDFJS.getDocument`.
 * @param whiteThreashold - Threshold forwarded to the blank-image check.
 * @returns Zero-based indices of the empty pages, in ascending order.
 */
export async function detectEmptyPages(
  snapshot: string | URL | ArrayBuffer | DocumentInitParameters,
  whiteThreashold: number,
): Promise<number[]> {
  const pdfDoc = await PDFJS.getDocument(snapshot).promise;
  const emptyPages: number[] = [];

  try {
    // pdf.js page numbers are 1-based.
    for (let i = 1; i <= pdfDoc.numPages; i++) {
      const page = await pdfDoc.getPage(i);
      console.log("Checking page " + i);

      if (await hasText(page)) {
        console.log(`Found text on Page ${i}, page is not empty`);
        continue;
      }

      if (!(await areImagesBlank(page, whiteThreashold))) {
        console.log(`Found non white image on Page ${i}, page is not empty`);
        continue;
      }

      console.log(`Page ${i} is empty.`);
      // Results are reported as zero-based indices.
      emptyPages.push(i - 1);
    }
  } finally {
    // Release the document (and any resources it owns) even if a page check throws.
    await pdfDoc.destroy();
  }

  return emptyPages;
}

/**
 * Tells whether the page exposes at least one text content item.
 *
 * @param page - Page to inspect.
 * @returns `true` when `getTextContent()` yields one or more items.
 */
async function hasText(page: PDFPageProxy): Promise<boolean> {
  const textContent = await page.getTextContent();
  return textContent.items.length > 0;
}

/**
 * Checks whether every image on the page is blank.
 *
 * @param page - Page whose images are inspected.
 * @param threshold - Threshold forwarded to {@link isImageBlank}.
 * @returns `true` when the page has no images or all of them are blank;
 *          `false` as soon as one non-blank image is found.
 */
async function areImagesBlank(page: PDFPageProxy, threshold: number): Promise<boolean> {
  const images = await getImagesOnPage(page);
  for (const image of images) {
    if (!(await isImageBlank(image, threshold))) {
      return false;
    }
  }
  return true;
}

/**
 * Classifies a single image as blank based on the mean of its greyscale
 * version.
 *
 * @param image - Image data or location accepted by `Image.load`.
 * @param threshold - Value the greyscale mean is compared against.
 * @returns `true` when the greyscale mean is less than or equal to `threshold`.
 */
async function isImageBlank(
  image: string | Uint8Array | ArrayBuffer,
  threshold: number,
): Promise<boolean> {
  const img = await Image.load(image);
  const grey = img.grey();
  const mean = grey.getMean();
  // NOTE(review): for a "white" threshold one would expect `mean >= threshold`
  // (white pixels have a high grey value). The original comparison is kept
  // as-is; confirm the intended direction against getImagesOnPage's output.
  return mean[0] <= threshold;
}