audiobookshelf/server/utils/podcastUtils.js

388 lines
12 KiB
JavaScript

const axios = require('axios')
const ssrfFilter = require('ssrf-req-filter')
const Logger = require('../Logger')
const { xmlToJSON, levenshteinDistance } = require('./index')
const htmlSanitizer = require('../utils/htmlSanitizer')
/**
* @typedef RssPodcastEpisode
* @property {string} title
* @property {string} subtitle
* @property {string} description
* @property {string} descriptionPlain
* @property {string} pubDate
* @property {string} episodeType
* @property {string} season
* @property {string} episode
* @property {string} author
* @property {string} duration
* @property {string} explicit
* @property {number} publishedAt - Unix timestamp
* @property {{ url: string, type?: string, length?: string }} enclosure
* @property {string} guid
* @property {string} chaptersUrl
* @property {string} chaptersType
*/
/**
* @typedef RssPodcastMetadata
* @property {string} title
* @property {string} language
* @property {string} explicit
* @property {string} author
* @property {string} pubDate
* @property {string} link
* @property {string} image
* @property {string[]} categories
* @property {string} feedUrl
* @property {string} description
* @property {string} descriptionPlain
* @property {string} type
*/
/**
* @typedef RssPodcast
* @property {RssPodcastMetadata} metadata
* @property {RssPodcastEpisode[]} episodes
* @property {number} numEpisodes
*/
function extractFirstArrayItem(json, key) {
if (!json[key]?.length) return null
return json[key][0]
}
function extractStringOrStringify(json) {
try {
if (typeof json[Object.keys(json)[0]]?.[0] === 'string') {
return json[Object.keys(json)[0]][0]
}
// Handles case where html was included without being wrapped in CDATA
return JSON.stringify(value)
} catch {
return ''
}
}
function extractFirstArrayItemString(json, key) {
const item = extractFirstArrayItem(json, key)
if (!item) return ''
if (typeof item === 'object') {
if (item?.['_'] && typeof item['_'] === 'string') return item['_']
return extractStringOrStringify(item)
}
return typeof item === 'string' ? item : ''
}
function extractImage(channel) {
if (!channel.image || !channel.image.url || !channel.image.url.length) {
if (!channel['itunes:image'] || !channel['itunes:image'].length || !channel['itunes:image'][0]['$']) {
return null
}
var itunesImage = channel['itunes:image'][0]['$']
return itunesImage.href || null
}
return channel.image.url[0] || null
}
function extractCategories(channel) {
if (!channel['itunes:category'] || !channel['itunes:category'].length) return []
var categories = channel['itunes:category']
var cleanedCats = []
categories.forEach((cat) => {
if (!cat['$'] || !cat['$'].text) return
var cattext = cat['$'].text
if (cat['itunes:category']) {
var subcats = extractCategories(cat)
if (subcats.length) {
cleanedCats = cleanedCats.concat(subcats.map((subcat) => `${cattext}:${subcat}`))
} else {
cleanedCats.push(cattext)
}
} else {
cleanedCats.push(cattext)
}
})
return cleanedCats
}
function extractPodcastMetadata(channel) {
const metadata = {
image: extractImage(channel),
categories: extractCategories(channel),
feedUrl: null,
description: null,
descriptionPlain: null,
type: null
}
if (channel['itunes:new-feed-url']) {
metadata.feedUrl = extractFirstArrayItem(channel, 'itunes:new-feed-url')
} else if (channel['atom:link'] && channel['atom:link'].length && channel['atom:link'][0]['$']) {
metadata.feedUrl = channel['atom:link'][0]['$'].href || null
}
if (channel['description']) {
const rawDescription = extractFirstArrayItemString(channel, 'description')
metadata.description = htmlSanitizer.sanitize(rawDescription.trim())
metadata.descriptionPlain = htmlSanitizer.stripAllTags(rawDescription.trim())
}
const arrayFields = ['title', 'language', 'itunes:explicit', 'itunes:author', 'pubDate', 'link', 'itunes:type']
arrayFields.forEach((key) => {
const cleanKey = key.split(':').pop()
let value = extractFirstArrayItem(channel, key)
if (value?.['_']) value = value['_']
metadata[cleanKey] = value
})
return metadata
}
function extractEpisodeData(item) {
// Episode must have url
if (!item.enclosure?.[0]?.['$']?.url) {
Logger.error(`[podcastUtils] Invalid podcast episode data`)
return null
}
const episode = {
enclosure: {
...item.enclosure[0]['$']
}
}
episode.enclosure.url = episode.enclosure.url.trim()
// Full description with html
if (item['content:encoded']) {
const rawDescription = (extractFirstArrayItem(item, 'content:encoded') || '').trim()
episode.description = htmlSanitizer.sanitize(rawDescription)
}
// Extract chapters
if (item['podcast:chapters']?.[0]?.['$']?.url) {
episode.chaptersUrl = item['podcast:chapters'][0]['$'].url
episode.chaptersType = item['podcast:chapters'][0]['$'].type || 'application/json'
}
// Supposed to be the plaintext description but not always followed
if (item['description']) {
const rawDescription = extractFirstArrayItemString(item, 'description')
if (!episode.description) episode.description = htmlSanitizer.sanitize(rawDescription.trim())
episode.descriptionPlain = htmlSanitizer.stripAllTags(rawDescription.trim())
}
if (item['pubDate']) {
const pubDate = extractFirstArrayItem(item, 'pubDate')
if (typeof pubDate === 'string') {
episode.pubDate = pubDate
} else if (typeof pubDate?._ === 'string') {
episode.pubDate = pubDate._
} else {
Logger.error(`[podcastUtils] Invalid pubDate ${item['pubDate']} for ${episode.enclosure.url}`)
}
}
if (item['guid']) {
const guidItem = extractFirstArrayItem(item, 'guid')
if (typeof guidItem === 'string') {
episode.guid = guidItem
} else if (typeof guidItem?._ === 'string') {
episode.guid = guidItem._
} else {
Logger.error(`[podcastUtils] Invalid guid ${item['guid']} for ${episode.enclosure.url}`)
}
}
const arrayFields = ['title', 'itunes:episodeType', 'itunes:season', 'itunes:episode', 'itunes:author', 'itunes:duration', 'itunes:explicit', 'itunes:subtitle']
arrayFields.forEach((key) => {
const cleanKey = key.split(':').pop()
episode[cleanKey] = extractFirstArrayItemString(item, key)
})
return episode
}
function cleanEpisodeData(data) {
const pubJsDate = data.pubDate ? new Date(data.pubDate) : null
const publishedAt = pubJsDate && !isNaN(pubJsDate) ? pubJsDate.valueOf() : null
return {
title: data.title,
subtitle: data.subtitle || '',
description: data.description || '',
descriptionPlain: data.descriptionPlain || '',
pubDate: data.pubDate || '',
episodeType: data.episodeType || '',
season: data.season || '',
episode: data.episode || '',
author: data.author || '',
duration: data.duration || '',
explicit: data.explicit || '',
publishedAt,
enclosure: data.enclosure,
guid: data.guid || null,
chaptersUrl: data.chaptersUrl || null,
chaptersType: data.chaptersType || null
}
}
function extractPodcastEpisodes(items) {
const episodes = []
items.forEach((item) => {
const extracted = extractEpisodeData(item)
if (extracted) {
episodes.push(cleanEpisodeData(extracted))
}
})
return episodes
}
function cleanPodcastJson(rssJson, excludeEpisodeMetadata) {
if (!rssJson.channel?.length) {
Logger.error(`[podcastUtil] Invalid podcast no channel object`)
return null
}
const channel = rssJson.channel[0]
if (!channel.item?.length) {
Logger.error(`[podcastUtil] Invalid podcast no episodes`)
return null
}
const podcast = {
metadata: extractPodcastMetadata(channel)
}
if (!excludeEpisodeMetadata) {
podcast.episodes = extractPodcastEpisodes(channel.item)
} else {
podcast.numEpisodes = channel.item.length
}
return podcast
}
module.exports.parsePodcastRssFeedXml = async (xml, excludeEpisodeMetadata = false, includeRaw = false) => {
if (!xml) return null
const json = await xmlToJSON(xml)
if (!json?.rss) {
Logger.error('[podcastUtils] Invalid XML or RSS feed')
return null
}
const podcast = cleanPodcastJson(json.rss, excludeEpisodeMetadata)
if (!podcast) return null
if (includeRaw) {
return {
podcast,
rawJson: json
}
} else {
return {
podcast
}
}
}
/**
* Get podcast RSS feed as JSON
* Uses SSRF filter to prevent internal URLs
*
* @param {string} feedUrl
* @param {boolean} [excludeEpisodeMetadata=false]
* @returns {Promise<RssPodcast|null>}
*/
module.exports.getPodcastFeed = (feedUrl, excludeEpisodeMetadata = false) => {
Logger.debug(`[podcastUtils] getPodcastFeed for "${feedUrl}"`)
let userAgent = 'audiobookshelf (+https://audiobookshelf.org; like iTMS)'
// Workaround for CBC RSS feeds rejecting our user agent string
// See: https://github.com/advplyr/audiobookshelf/issues/3322
if (feedUrl.startsWith('https://www.cbc.ca')) {
userAgent = 'audiobookshelf (+https://audiobookshelf.org; like iTMS) - CBC'
}
return axios({
url: feedUrl,
method: 'GET',
timeout: global.PodcastDownloadTimeout,
responseType: 'arraybuffer',
headers: {
Accept: 'application/rss+xml, application/xhtml+xml, application/xml, */*;q=0.8',
'User-Agent': userAgent
},
httpAgent: global.DisableSsrfRequestFilter?.(feedUrl) ? null : ssrfFilter(feedUrl),
httpsAgent: global.DisableSsrfRequestFilter?.(feedUrl) ? null : ssrfFilter(feedUrl)
})
.then(async (data) => {
// Adding support for ios-8859-1 encoded RSS feeds.
// See: https://github.com/advplyr/audiobookshelf/issues/1489
const contentType = data.headers?.['content-type'] || '' // e.g. text/xml; charset=iso-8859-1
if (contentType.toLowerCase().includes('iso-8859-1')) {
data.data = data.data.toString('latin1')
} else {
data.data = data.data.toString()
}
if (!data?.data) {
Logger.error(`[podcastUtils] getPodcastFeed: Invalid podcast feed request response (${feedUrl})`)
return null
}
Logger.debug(`[podcastUtils] getPodcastFeed for "${feedUrl}" success - parsing xml`)
const payload = await this.parsePodcastRssFeedXml(data.data, excludeEpisodeMetadata)
if (!payload) {
return null
}
// RSS feed may be a private RSS feed
payload.podcast.metadata.feedUrl = feedUrl
return payload.podcast
})
.catch((error) => {
Logger.error('[podcastUtils] getPodcastFeed Error', error)
return null
})
}
// Return array of episodes ordered by closest match (Levenshtein distance of 6 or less)
module.exports.findMatchingEpisodes = async (feedUrl, searchTitle) => {
const feed = await this.getPodcastFeed(feedUrl).catch(() => {
return null
})
return this.findMatchingEpisodesInFeed(feed, searchTitle)
}
/**
*
* @param {RssPodcast} feed
* @param {string} searchTitle
* @returns {Array<{ episode: RssPodcastEpisode, levenshtein: number }>}
*/
module.exports.findMatchingEpisodesInFeed = (feed, searchTitle) => {
searchTitle = searchTitle.toLowerCase().trim()
if (!feed?.episodes) {
return null
}
const matches = []
feed.episodes.forEach((ep) => {
if (!ep.title) return
const epTitle = ep.title.toLowerCase().trim()
if (epTitle === searchTitle) {
matches.push({
episode: ep,
levenshtein: 0
})
} else {
const levenshtein = levenshteinDistance(searchTitle, epTitle, true)
if (levenshtein <= 6 && epTitle.length > levenshtein) {
matches.push({
episode: ep,
levenshtein
})
}
}
})
return matches.sort((a, b) => a.levenshtein - b.levenshtein)
}