2023-10-24 19:03:43 +02:00
/**
 * Removes every page that has no text and no non-blank image from a PDF.
 *
 * A page is considered empty when its text content has no items and every
 * painted image that exposes pixel data is classified as blank.
 *
 * @param {*} snapshot - The PDF input; handed unchanged to both
 *     `PDFJS.getDocument` and `PDFLib.PDFDocument.load`.
 * @param {number} whiteThreashold - An image is treated as blank when the mean
 *     of its grayscale values is less than or equal to this number.
 * @param {object} PDFJS - pdf.js entry point; `getDocument` and `OPS` are used.
 * @param {object} OpenCV - Object exposing the OpenCV.js API as `OpenCV.cv`.
 * @param {object} PDFLib - pdf-lib entry point; `PDFDocument.load` is used.
 * @returns {Promise<*>} Resolves to whatever `pdfDoc.save()` produces for the
 *     document with the empty pages removed.
 */
export async function removeBlankPages(snapshot, whiteThreashold, PDFJS, OpenCV, PDFLib) {
    const emptyPages = await findEmptyPages(snapshot);
    console.log("Empty Pages: ", emptyPages);

    const pdfDoc = await PDFLib.PDFDocument.load(snapshot);

    // Remove from the highest index down so the indices of the pages that are
    // still to be removed stay valid. E.g. deleting page 5 turns page 7 into
    // page 6, whereas deleting page 7 first leaves page 5 where it is.
    for (let k = emptyPages.length - 1; k >= 0; k--) {
        pdfDoc.removePage(emptyPages[k]);
    }

    return pdfDoc.save();

    /**
     * Scans the document and collects the zero-based indices of empty pages.
     *
     * @param {*} snapshot - The PDF input for `PDFJS.getDocument`.
     * @returns {Promise<number[]>} Zero-based page indices in ascending order.
     */
    async function findEmptyPages(snapshot) {
        const pdfDoc = await PDFJS.getDocument(snapshot).promise;

        const emptyPages = [];
        // pdf.js page numbers are 1-based; the collected indices are 0-based.
        for (let i = 1; i <= pdfDoc.numPages; i++) {
            const page = await pdfDoc.getPage(i);
            console.log("Checking page " + i);

            if (await hasText(page)) {
                console.log(` Found text on Page ${i} , page is not empty `);
                continue;
            }

            if (!(await areImagesBlank(page, whiteThreashold))) {
                console.log(` Found non white image on Page ${i} , page is not empty `);
                continue;
            }

            console.log(` Page ${i} is empty. `);
            emptyPages.push(i - 1);
        }
        return emptyPages;
    }

    /**
     * Checks whether every image painted on the page is blank.
     *
     * @param {object} page - A pdf.js page.
     * @param {number} whiteThreashold - Threshold forwarded to `isImageBlank`.
     * @returns {Promise<boolean>} `false` as soon as one non-blank image is
     *     found, `true` otherwise (including when the page paints no images).
     */
    async function areImagesBlank(page, whiteThreashold) {
        const ops = await page.getOperatorList();

        for (let j = 0; j < ops.fnArray.length; j++) {
            const fn = ops.fnArray[j];
            if (fn !== PDFJS.OPS.paintJpegXObject && fn !== PDFJS.OPS.paintImageXObject) {
                continue;
            }

            const image = page.objs.get(ops.argsArray[j][0]);
            // NOTE(review): images without a `data` buffer are skipped and thus
            // count as blank - confirm this is intended.
            if (!image.data) {
                continue;
            }

            // Every image has to be inspected; a blank first image must not
            // hide a later image that carries real content.
            if (!isImageBlank(image, whiteThreashold)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Tells whether the page contains any text items.
     *
     * @param {object} page - A pdf.js page.
     * @returns {Promise<boolean>} `true` when at least one text item exists.
     */
    async function hasText(page) {
        const textContent = await page.getTextContent();
        return textContent.items.length > 0;
    }

    /**
     * Classifies a decoded image as blank based on its mean grayscale value.
     *
     * @param {{width: number, height: number, data: ArrayLike<number>}} image
     *     Decoded image; `data` is copied into a 4-channel 8-bit matrix, so it
     *     is presumably RGBA with 4 bytes per pixel - TODO confirm for images
     *     that pdf.js delivers in other layouts.
     * @param {number} threshold - Upper bound (inclusive) of the mean gray
     *     value for the image to count as blank.
     * @returns {boolean} `true` when the mean gray value is `<= threshold`.
     */
    function isImageBlank(image, threshold) {
        const cv = OpenCV.cv;
        // cv.Mat takes (rows, cols, type): rows is the height, cols the width.
        const src = new cv.Mat(image.height, image.width, cv.CV_8UC4);
        let gray = null;
        try {
            src.data.set(image.data);

            // Convert the image to grayscale
            gray = new cv.Mat();
            cv.cvtColor(src, gray, cv.COLOR_RGBA2GRAY);

            // Calculate the mean value of the grayscale image
            const meanValue = cv.mean(gray);

            // NOTE(review): a white image has a HIGH mean; with a "white"
            // threshold this comparison looks inverted - confirm with callers.
            return meanValue[0] <= threshold;
        } finally {
            // Free the matrices even when one of the OpenCV calls throws.
            src.delete();
            if (gray !== null) {
                gray.delete();
            }
        }
    }
}