Add:Epub metadata parser and cover extractor #1479

This commit is contained in:
advplyr 2024-01-07 17:51:07 -06:00
parent 48a08e9659
commit 69e23ef9f2
8 changed files with 284 additions and 35 deletions

View File

@ -63,7 +63,7 @@ export default {
},
audioMetatags: {
id: 'audioMetatags',
name: 'Audio file meta tags',
name: 'Audio file meta tags OR ebook metadata',
include: true
},
nfoFile: {

View File

@ -7,6 +7,8 @@ const imageType = require('../libs/imageType')
const globals = require('../utils/globals')
const { downloadImageFile, filePathToPOSIX, checkPathIsFile } = require('../utils/fileUtils')
const { extractCoverArt } = require('../utils/ffmpegHelpers')
const parseEbookMetadata = require('../utils/parsers/parseEbookMetadata')
const CacheManager = require('../managers/CacheManager')
class CoverManager {
@ -234,6 +236,7 @@ class CoverManager {
/**
* Extract cover art from audio file and save for library item
*
* @param {import('../models/Book').AudioFileObject[]} audioFiles
* @param {string} libraryItemId
* @param {string} [libraryItemPath] null for isFile library items
@ -268,6 +271,44 @@ class CoverManager {
return null
}
/**
* Extract cover art from ebook and save for library item
*
* @param {import('../utils/parsers/parseEbookMetadata').EBookFileScanData} ebookFileScanData
* @param {string} libraryItemId
* @param {string} [libraryItemPath] null for isFile library items
* @returns {Promise<string>} returns cover path
*/
async saveEbookCoverArt(ebookFileScanData, libraryItemId, libraryItemPath) {
if (!ebookFileScanData?.ebookCoverPath) return null
let coverDirPath = null
if (global.ServerSettings.storeCoverWithItem && libraryItemPath) {
coverDirPath = libraryItemPath
} else {
coverDirPath = Path.posix.join(global.MetadataPath, 'items', libraryItemId)
}
await fs.ensureDir(coverDirPath)
let extname = Path.extname(ebookFileScanData.ebookCoverPath) || '.jpg'
if (extname === '.jpeg') extname = '.jpg'
const coverFilename = `cover${extname}`
const coverFilePath = Path.join(coverDirPath, coverFilename)
// TODO: Overwrite if exists?
const coverAlreadyExists = await fs.pathExists(coverFilePath)
if (coverAlreadyExists) {
Logger.warn(`[CoverManager] Extract embedded cover art but cover already exists for "${coverFilePath}" - overwriting`)
}
const success = await parseEbookMetadata.extractCoverImage(ebookFileScanData, coverFilePath)
if (success) {
await CacheManager.purgeCoverCache(libraryItemId)
return coverFilePath
}
return null
}
/**
*
* @param {string} url

View File

@ -36,6 +36,8 @@ class AbsMetadataFileScanner {
for (const key in abMetadata) {
// TODO: When to override with null or empty arrays?
if (abMetadata[key] === undefined || abMetadata[key] === null) continue
if (key === 'authors' && !abMetadata.authors?.length) continue
if (key === 'genres' && !abMetadata.genres?.length) continue
if (key === 'tags' && !abMetadata.tags?.length) continue
if (key === 'chapters' && !abMetadata.chapters?.length) continue

View File

@ -3,8 +3,8 @@ const Path = require('path')
const sequelize = require('sequelize')
const { LogLevel } = require('../utils/constants')
const { getTitleIgnorePrefix, areEquivalent } = require('../utils/index')
const abmetadataGenerator = require('../utils/generators/abmetadataGenerator')
const parseNameString = require('../utils/parsers/parseNameString')
const parseEbookMetadata = require('../utils/parsers/parseEbookMetadata')
const globals = require('../utils/globals')
const AudioFileScanner = require('./AudioFileScanner')
const Database = require('../Database')
@ -170,7 +170,9 @@ class BookScanner {
hasMediaChanges = true
}
const bookMetadata = await this.getBookMetadataFromScanData(media.audioFiles, libraryItemData, libraryScan, librarySettings, existingLibraryItem.id)
const ebookFileScanData = await parseEbookMetadata.parse(media.ebookFile)
const bookMetadata = await this.getBookMetadataFromScanData(media.audioFiles, ebookFileScanData, libraryItemData, libraryScan, librarySettings, existingLibraryItem.id)
let authorsUpdated = false
const bookAuthorsRemoved = []
let seriesUpdated = false
@ -317,24 +319,34 @@ class BookScanner {
})
}
// If no cover then extract cover from audio file if available OR search for cover if enabled in server settings
// If no cover then extract cover from audio file OR from ebook
const libraryItemDir = existingLibraryItem.isFile ? null : existingLibraryItem.path
if (!media.coverPath) {
const libraryItemDir = existingLibraryItem.isFile ? null : existingLibraryItem.path
const extractedCoverPath = await CoverManager.saveEmbeddedCoverArt(media.audioFiles, existingLibraryItem.id, libraryItemDir)
let extractedCoverPath = await CoverManager.saveEmbeddedCoverArt(media.audioFiles, existingLibraryItem.id, libraryItemDir)
if (extractedCoverPath) {
libraryScan.addLog(LogLevel.DEBUG, `Updating book "${bookMetadata.title}" extracted embedded cover art from audio file to path "${extractedCoverPath}"`)
media.coverPath = extractedCoverPath
hasMediaChanges = true
} else if (Database.serverSettings.scannerFindCovers) {
const authorName = media.authors.map(au => au.name).filter(au => au).join(', ')
const coverPath = await this.searchForCover(existingLibraryItem.id, libraryItemDir, media.title, authorName, libraryScan)
if (coverPath) {
media.coverPath = coverPath
} else if (ebookFileScanData?.ebookCoverPath) {
extractedCoverPath = await CoverManager.saveEbookCoverArt(ebookFileScanData, existingLibraryItem.id, libraryItemDir)
if (extractedCoverPath) {
libraryScan.addLog(LogLevel.DEBUG, `Updating book "${bookMetadata.title}" extracted embedded cover art from ebook file to path "${extractedCoverPath}"`)
media.coverPath = extractedCoverPath
hasMediaChanges = true
}
}
}
// If no cover then search for cover if enabled in server settings
if (!media.coverPath && Database.serverSettings.scannerFindCovers) {
const authorName = media.authors.map(au => au.name).filter(au => au).join(', ')
const coverPath = await this.searchForCover(existingLibraryItem.id, libraryItemDir, media.title, authorName, libraryScan)
if (coverPath) {
media.coverPath = coverPath
hasMediaChanges = true
}
}
existingLibraryItem.media = media
let libraryItemUpdated = false
@ -408,12 +420,14 @@ class BookScanner {
return null
}
let ebookFileScanData = null
if (ebookLibraryFile) {
ebookLibraryFile = ebookLibraryFile.toJSON()
ebookLibraryFile.ebookFormat = ebookLibraryFile.metadata.ext.slice(1).toLowerCase()
ebookFileScanData = await parseEbookMetadata.parse(ebookLibraryFile)
}
const bookMetadata = await this.getBookMetadataFromScanData(scannedAudioFiles, libraryItemData, libraryScan, librarySettings)
const bookMetadata = await this.getBookMetadataFromScanData(scannedAudioFiles, ebookFileScanData, libraryItemData, libraryScan, librarySettings)
bookMetadata.explicit = !!bookMetadata.explicit // Ensure boolean
bookMetadata.abridged = !!bookMetadata.abridged // Ensure boolean
@ -481,19 +495,28 @@ class BookScanner {
}
}
// If cover was not found in folder then check embedded covers in audio files OR search for cover
// If cover was not found in folder then check embedded covers in audio files OR ebook file
const libraryItemDir = libraryItemObj.isFile ? null : libraryItemObj.path
if (!bookObject.coverPath) {
const libraryItemDir = libraryItemObj.isFile ? null : libraryItemObj.path
// Extract and save embedded cover art
const extractedCoverPath = await CoverManager.saveEmbeddedCoverArt(scannedAudioFiles, libraryItemObj.id, libraryItemDir)
let extractedCoverPath = await CoverManager.saveEmbeddedCoverArt(scannedAudioFiles, libraryItemObj.id, libraryItemDir)
if (extractedCoverPath) {
libraryScan.addLog(LogLevel.DEBUG, `Extracted embedded cover from audio file at "${extractedCoverPath}" for book "${bookObject.title}"`)
bookObject.coverPath = extractedCoverPath
} else if (Database.serverSettings.scannerFindCovers) {
const authorName = bookMetadata.authors.join(', ')
bookObject.coverPath = await this.searchForCover(libraryItemObj.id, libraryItemDir, bookObject.title, authorName, libraryScan)
} else if (ebookFileScanData?.ebookCoverPath) {
extractedCoverPath = await CoverManager.saveEbookCoverArt(ebookFileScanData, libraryItemObj.id, libraryItemDir)
if (extractedCoverPath) {
libraryScan.addLog(LogLevel.DEBUG, `Extracted embedded cover from ebook file at "${extractedCoverPath}" for book "${bookObject.title}"`)
bookObject.coverPath = extractedCoverPath
}
}
}
// If cover not found then search for cover if enabled in settings
if (!bookObject.coverPath && Database.serverSettings.scannerFindCovers) {
const authorName = bookMetadata.authors.join(', ')
bookObject.coverPath = await this.searchForCover(libraryItemObj.id, libraryItemDir, bookObject.title, authorName, libraryScan)
}
libraryItemObj.book = bookObject
const libraryItem = await Database.libraryItemModel.create(libraryItemObj, {
include: {
@ -570,13 +593,14 @@ class BookScanner {
/**
*
* @param {import('../models/Book').AudioFileObject[]} audioFiles
* @param {import('../utils/parsers/parseEbookMetadata').EBookFileScanData} ebookFileScanData
* @param {import('./LibraryItemScanData')} libraryItemData
* @param {LibraryScan} libraryScan
* @param {import('../models/Library').LibrarySettingsObject} librarySettings
* @param {string} [existingLibraryItemId]
* @returns {Promise<BookMetadataObject>}
*/
async getBookMetadataFromScanData(audioFiles, libraryItemData, libraryScan, librarySettings, existingLibraryItemId = null) {
async getBookMetadataFromScanData(audioFiles, ebookFileScanData, libraryItemData, libraryScan, librarySettings, existingLibraryItemId = null) {
// First set book metadata from folder/file names
const bookMetadata = {
title: libraryItemData.mediaMetadata.title, // required
@ -599,7 +623,7 @@ class BookScanner {
coverPath: undefined
}
const bookMetadataSourceHandler = new BookScanner.BookMetadataSourceHandler(bookMetadata, audioFiles, libraryItemData, libraryScan, existingLibraryItemId)
const bookMetadataSourceHandler = new BookScanner.BookMetadataSourceHandler(bookMetadata, audioFiles, ebookFileScanData, libraryItemData, libraryScan, existingLibraryItemId)
const metadataPrecedence = librarySettings.metadataPrecedence || ['folderStructure', 'audioMetatags', 'nfoFile', 'txtFiles', 'opfFile', 'absMetadata']
libraryScan.addLog(LogLevel.DEBUG, `"${bookMetadata.title}" Getting metadata with precedence [${metadataPrecedence.join(', ')}]`)
for (const metadataSource of metadataPrecedence) {
@ -627,13 +651,15 @@ class BookScanner {
*
* @param {Object} bookMetadata
* @param {import('../models/Book').AudioFileObject[]} audioFiles
* @param {import('../utils/parsers/parseEbookMetadata').EBookFileScanData} ebookFileScanData
* @param {import('./LibraryItemScanData')} libraryItemData
* @param {LibraryScan} libraryScan
* @param {string} existingLibraryItemId
*/
constructor(bookMetadata, audioFiles, libraryItemData, libraryScan, existingLibraryItemId) {
constructor(bookMetadata, audioFiles, ebookFileScanData, libraryItemData, libraryScan, existingLibraryItemId) {
this.bookMetadata = bookMetadata
this.audioFiles = audioFiles
this.ebookFileScanData = ebookFileScanData
this.libraryItemData = libraryItemData
this.libraryScan = libraryScan
this.existingLibraryItemId = existingLibraryItemId
@ -647,13 +673,42 @@ class BookScanner {
}
/**
* Metadata from audio file meta tags
* Metadata from audio file meta tags OR metadata from ebook file
*/
audioMetatags() {
if (!this.audioFiles.length) return
// Modifies bookMetadata with metadata mapped from audio file meta tags
const bookTitle = this.bookMetadata.title || this.libraryItemData.mediaMetadata.title
AudioFileScanner.setBookMetadataFromAudioMetaTags(bookTitle, this.audioFiles, this.bookMetadata, this.libraryScan)
if (this.audioFiles.length) {
// Modifies bookMetadata with metadata mapped from audio file meta tags
const bookTitle = this.bookMetadata.title || this.libraryItemData.mediaMetadata.title
AudioFileScanner.setBookMetadataFromAudioMetaTags(bookTitle, this.audioFiles, this.bookMetadata, this.libraryScan)
} else if (this.ebookFileScanData) {
const ebookMetdataObject = this.ebookFileScanData.metadata
for (const key in ebookMetdataObject) {
if (key === 'tags') {
if (ebookMetdataObject.tags.length) {
this.bookMetadata.tags = ebookMetdataObject.tags
}
} else if (key === 'genres') {
if (ebookMetdataObject.genres.length) {
this.bookMetadata.genres = ebookMetdataObject.genres
}
} else if (key === 'authors') {
if (ebookMetdataObject.authors?.length) {
this.bookMetadata.authors = ebookMetdataObject.authors
}
} else if (key === 'narrators') {
if (ebookMetdataObject.narrators?.length) {
this.bookMetadata.narrators = ebookMetdataObject.narrators
}
} else if (key === 'series') {
if (ebookMetdataObject.series?.length) {
this.bookMetadata.series = ebookMetdataObject.series
}
} else if (ebookMetdataObject[key] && key !== 'sequence') {
this.bookMetadata[key] = ebookMetdataObject[key]
}
}
}
return null
}
/**

View File

@ -2,7 +2,6 @@ const uuidv4 = require("uuid").v4
const Path = require('path')
const { LogLevel } = require('../utils/constants')
const { getTitleIgnorePrefix } = require('../utils/index')
const abmetadataGenerator = require('../utils/generators/abmetadataGenerator')
const AudioFileScanner = require('./AudioFileScanner')
const Database = require('../Database')
const { filePathToPOSIX, getFileTimestampsWithIno } = require('../utils/fileUtils')

View File

@ -0,0 +1,42 @@
const parseEpubMetadata = require('./parseEpubMetadata')
/**
* @typedef EBookFileScanData
* @property {string} path
* @property {string} ebookFormat
* @property {string} ebookCoverPath internal image path
* @property {import('../../scanner/BookScanner').BookMetadataObject} metadata
*/
/**
 * Parse metadata from an ebook file
 *
 * Only the "epub" format is handled; a missing ebook file or any other format yields null.
 *
 * @param {import('../../models/Book').EBookFileObject} ebookFile
 * @returns {Promise<EBookFileScanData>} scan data, or null when nothing could be parsed
 */
async function parse(ebookFile) {
  const isEpub = ebookFile?.ebookFormat === 'epub'
  if (!isEpub) return null

  return parseEpubMetadata.parse(ebookFile.metadata.path)
}
module.exports.parse = parse
/**
 * Extract the cover image of an ebook to a file
 *
 * Returns false without doing anything when the scan data has no internal cover path
 * or the ebook format is not "epub".
 *
 * @param {EBookFileScanData} ebookFileScanData
 * @param {string} outputCoverPath
 * @returns {Promise<boolean>} true when the cover was extracted
 */
async function extractCoverImage(ebookFileScanData, outputCoverPath) {
  const { path, ebookFormat, ebookCoverPath } = ebookFileScanData ?? {}
  if (!ebookCoverPath || ebookFormat !== 'epub') return false

  return parseEpubMetadata.extractCoverImage(path, ebookCoverPath, outputCoverPath)
}
module.exports.extractCoverImage = extractCoverImage

View File

@ -0,0 +1,109 @@
const Path = require('path')
const Logger = require('../../Logger')
const StreamZip = require('../../libs/nodeStreamZip')
const parseOpfMetadata = require('./parseOpfMetadata')
const { xmlToJSON } = require('../index')
/**
 * Extract file from epub and return string content
 *
 * Read failures are logged and reported as `undefined` instead of being thrown.
 * The archive is always closed afterwards; a failing close is logged and ignored so
 * that a broken archive cannot abort the caller after the read error was already handled.
 *
 * @param {string} epubPath
 * @param {string} filepath path of the entry inside the epub archive
 * @returns {Promise<string|undefined>} utf8 content of the entry, undefined when it could not be read
 */
async function extractFileFromEpub(epubPath, filepath) {
  const zip = new StreamZip.async({ file: epubPath })
  try {
    const data = await zip.entryData(filepath)
    return data?.toString('utf8')
  } catch (error) {
    Logger.error(`[parseEpubMetadata] Failed to extract ${filepath} from epub at "${epubPath}"`, error)
    return undefined
  } finally {
    // NOTE(review): close() presumably rejects as well when the archive never opened - confirm against libs/nodeStreamZip
    await zip.close().catch((error) => {
      Logger.warn(`[parseEpubMetadata] Failed to close epub at "${epubPath}"`, error)
    })
  }
}
/**
 * Read an XML entry out of an epub and convert it to JSON
 *
 * @param {string} epubPath
 * @param {string} xmlFilepath path of the XML entry inside the epub archive
 * @returns {Promise<Object>} result of xmlToJSON, or null when the entry has no content
 */
async function extractXmlToJson(epubPath, xmlFilepath) {
  const xmlContent = await extractFileFromEpub(epubPath, xmlFilepath)
  return xmlContent ? xmlToJSON(xmlContent) : null
}
/**
 * Extract cover image from epub return true if success
 *
 * Never rejects: an extraction failure is logged and reported as `false`, and a failure
 * while closing the archive is logged and ignored so the boolean contract always holds.
 *
 * @param {string} epubPath
 * @param {string} epubImageFilepath path of the image entry inside the epub archive
 * @param {string} outputCoverPath file path the image is written to
 * @returns {Promise<boolean>}
 */
async function extractCoverImage(epubPath, epubImageFilepath, outputCoverPath) {
  const zip = new StreamZip.async({ file: epubPath })
  try {
    await zip.extract(epubImageFilepath, outputCoverPath)
    return true
  } catch (error) {
    Logger.error(`[parseEpubMetadata] Failed to extract image ${epubImageFilepath} from epub at "${epubPath}"`, error)
    return false
  } finally {
    // NOTE(review): close() presumably rejects as well when the archive never opened - confirm against libs/nodeStreamZip
    await zip.close().catch((error) => {
      Logger.warn(`[parseEpubMetadata] Failed to close epub at "${epubPath}"`, error)
    })
  }
}
module.exports.extractCoverImage = extractCoverImage
/**
 * Find the manifest item of the cover image in a package document
 *
 * Order of preference:
 *  1. image item flagged with the EPUB 3 "cover-image" manifest property
 *  2. image item referenced by the EPUB 2 style <meta name="cover" content="item-id"/>
 *  3. first image item in the manifest
 *
 * @param {Object} packageJson package document (opf) as JSON
 * @returns {Object|null} manifest item or null when the manifest has no image
 */
function findCoverManifestItem(packageJson) {
  const manifestItems = packageJson.package?.manifest?.[0]?.item
  if (!Array.isArray(manifestItems)) return null

  const isImage = (item) => item?.$?.['media-type']?.startsWith('image/')

  const flaggedCover = manifestItems.find((item) => {
    const properties = item?.$?.properties
    return isImage(item) && typeof properties === 'string' && properties.split(/\s+/).includes('cover-image')
  })
  if (flaggedCover) return flaggedCover

  const metaEntries = packageJson.package?.metadata?.[0]?.meta
  const declaredCoverId = Array.isArray(metaEntries) ? metaEntries.find((meta) => meta?.$?.name === 'cover')?.$?.content : null
  if (declaredCoverId) {
    const declaredCover = manifestItems.find((item) => isImage(item) && item.$.id === declaredCoverId)
    if (declaredCover) return declaredCover
  }

  return manifestItems.find(isImage) || null
}

/**
 * Parse metadata from epub
 *
 * Reads META-INF/container.xml to locate the package document (opf file), parses the
 * metadata from it and resolves the archive-internal path of the cover image.
 *
 * @param {string} epubPath
 * @returns {Promise<import('./parseEbookMetadata').EBookFileScanData>} null when the container or
 *   package document cannot be read or parsed; ebookCoverPath is only set when a cover image was found
 */
async function parse(epubPath) {
  Logger.debug(`Parsing metadata from epub at "${epubPath}"`)

  // Entrypoint of the epub that contains the filepath to the package document (opf file)
  const containerJson = await extractXmlToJson(epubPath, 'META-INF/container.xml')
  if (!containerJson) {
    // extractXmlToJson returns null when the entry is missing or unreadable
    Logger.error(`Failed to read META-INF/container.xml from epub at "${epubPath}"`)
    return null
  }

  // Get package document opf filepath from container.xml
  const packageDocPath = containerJson.container?.rootfiles?.[0]?.rootfile?.[0]?.$?.['full-path']
  if (!packageDocPath) {
    Logger.error(`Failed to get package doc path in Container.xml`, JSON.stringify(containerJson, null, 2))
    return null
  }

  // Extract package document to JSON
  const packageJson = await extractXmlToJson(epubPath, packageDocPath)
  if (!packageJson) {
    return null
  }

  // Parse metadata from package document opf file
  const opfMetadata = parseOpfMetadata.parseOpfMetadataJson(packageJson)
  if (!opfMetadata) {
    Logger.error(`Unable to parse metadata in package doc with json`, JSON.stringify(packageJson, null, 2))
    return null
  }

  const payload = {
    path: epubPath,
    ebookFormat: 'epub',
    metadata: opfMetadata
  }

  // Attempt to find filepath to cover image
  const coverImagePath = findCoverManifestItem(packageJson)?.$?.href
  if (coverImagePath) {
    // Manifest hrefs are relative to the package document; paths inside the archive use forward slashes
    const packageDirname = Path.posix.dirname(packageDocPath)
    payload.ebookCoverPath = Path.posix.join(packageDirname, coverImagePath)
  } else {
    Logger.warn(`Cover image not found in manifest for epub at "${epubPath}"`)
  }

  return payload
}
module.exports.parse = parse

View File

@ -136,11 +136,7 @@ function stripPrefix(str) {
return str.split(':').pop()
}
module.exports.parseOpfMetadataXML = async (xml) => {
const json = await xmlToJSON(xml)
if (!json) return null
module.exports.parseOpfMetadataJson = (json) => {
// Handle <package ...> or with prefix <ns0:package ...>
const packageKey = Object.keys(json).find(key => stripPrefix(key) === 'package')
if (!packageKey) return null
@ -167,7 +163,7 @@ module.exports.parseOpfMetadataXML = async (xml) => {
const creators = parseCreators(metadata)
const authors = (fetchCreators(creators, 'aut') || []).map(au => au?.trim()).filter(au => au)
const narrators = (fetchNarrators(creators, metadata) || []).map(nrt => nrt?.trim()).filter(nrt => nrt)
const data = {
return {
title: fetchTitle(metadata),
subtitle: fetchSubtitle(metadata),
authors,
@ -182,5 +178,10 @@ module.exports.parseOpfMetadataXML = async (xml) => {
series: fetchSeries(metadataMeta),
tags: fetchTags(metadata)
}
return data
}
/**
 * Parse metadata from the XML of an OPF package document
 *
 * Converts the XML with xmlToJSON and hands the result to parseOpfMetadataJson.
 *
 * @param {string} xml
 * @returns {Promise<Object>} result of parseOpfMetadataJson, or null when the XML conversion yields nothing
 */
module.exports.parseOpfMetadataXML = async (xml) => {
  const json = await xmlToJSON(xml)
  if (!json) return null
  // Reference the export explicitly: `this` inside an arrow function is whatever `this` is at module scope
  return module.exports.parseOpfMetadataJson(json)
}