2022-09-16 01:35:56 +02:00
|
|
|
const axios = require('axios')
|
2023-12-17 19:00:11 +01:00
|
|
|
const ssrfFilter = require('ssrf-req-filter')
|
|
|
|
const Logger = require('../Logger')
|
2022-09-16 01:35:56 +02:00
|
|
|
const { xmlToJSON, levenshteinDistance } = require('./index')
|
2022-05-28 02:41:40 +02:00
|
|
|
const htmlSanitizer = require('../utils/htmlSanitizer')
|
2022-03-06 01:54:24 +01:00
|
|
|
|
2025-01-04 19:41:09 +01:00
|
|
|
/**
|
|
|
|
* @typedef RssPodcastEpisode
|
|
|
|
* @property {string} title
|
|
|
|
* @property {string} subtitle
|
|
|
|
* @property {string} description
|
|
|
|
* @property {string} descriptionPlain
|
|
|
|
* @property {string} pubDate
|
|
|
|
* @property {string} episodeType
|
|
|
|
* @property {string} season
|
|
|
|
* @property {string} episode
|
|
|
|
* @property {string} author
|
|
|
|
* @property {string} duration
|
|
|
|
* @property {string} explicit
|
|
|
|
* @property {number} publishedAt - Unix timestamp
|
|
|
|
* @property {{ url: string, type?: string, length?: string }} enclosure
|
|
|
|
* @property {string} guid
|
|
|
|
* @property {string} chaptersUrl
|
|
|
|
* @property {string} chaptersType
|
|
|
|
*/
|
|
|
|
|
|
|
|
/**
|
|
|
|
* @typedef RssPodcastMetadata
|
|
|
|
* @property {string} title
|
|
|
|
* @property {string} language
|
|
|
|
* @property {string} explicit
|
|
|
|
* @property {string} author
|
|
|
|
* @property {string} pubDate
|
|
|
|
* @property {string} link
|
|
|
|
* @property {string} image
|
|
|
|
* @property {string[]} categories
|
|
|
|
* @property {string} feedUrl
|
|
|
|
* @property {string} description
|
|
|
|
* @property {string} descriptionPlain
|
|
|
|
* @property {string} type
|
|
|
|
*/
|
|
|
|
|
|
|
|
/**
|
|
|
|
* @typedef RssPodcast
|
|
|
|
* @property {RssPodcastMetadata} metadata
|
|
|
|
* @property {RssPodcastEpisode[]} episodes
|
|
|
|
* @property {number} numEpisodes
|
|
|
|
*/
|
|
|
|
|
2022-03-06 01:54:24 +01:00
|
|
|
/**
 * Return the first element stored under `key`, or null when the value is
 * missing or has no length.
 *
 * @param {Object} json - parsed XML node
 * @param {string} key - property to read
 * @returns {*} first element or null
 */
function extractFirstArrayItem(json, key) {
  const values = json[key]
  return values?.length ? values[0] : null
}
|
|
|
|
|
|
|
|
/**
 * Resolve the podcast cover image URL from a parsed RSS channel.
 *
 * Resolution order:
 *  1. `channel.image.url[0]` when `image` is a plain object holding a `url` array
 *     (the only shape the previous implementation resolved from `<image>`).
 *  2. The `href` attribute of the first `itunes:image` element.
 *  3. `<image><url>` when `image` was parsed as an array of nodes (the shape every
 *     other element in this file is read with) or when `url` is a plain string.
 *     Previously these feeds resolved to null.
 *
 * `itunes:image` is deliberately kept ahead of step 3 so feeds that already
 * resolve an image keep the same artwork.
 *
 * @param {Object} channel - parsed RSS channel node
 * @returns {string|null} image URL or null when none is found
 */
function extractImage(channel) {
  const rssImage = channel.image

  if (rssImage && !Array.isArray(rssImage) && Array.isArray(rssImage.url) && rssImage.url.length) {
    return rssImage.url[0] || null
  }

  const itunesHref = channel['itunes:image']?.[0]?.['$']?.href
  if (itunesHref) return itunesHref

  // Fallback: standard RSS <image><url> parsed as arrays (or a bare string url)
  const imageNode = Array.isArray(rssImage) ? rssImage[0] : rssImage
  const rawUrl = Array.isArray(imageNode?.url) ? imageNode.url[0] : imageNode?.url
  if (typeof rawUrl === 'string' && rawUrl.trim()) {
    return rawUrl.trim()
  }
  return null
}
|
|
|
|
|
|
|
|
/**
 * Flatten (possibly nested) `itunes:category` elements into a list of names.
 * A category with named sub-categories yields one `Parent:Child` entry per
 * sub-category; otherwise it yields its own name. Entries without a `text`
 * attribute are skipped.
 *
 * @param {Object} channel - node that may contain `itunes:category` children
 * @returns {string[]} category names, empty when there are none
 */
function extractCategories(channel) {
  const categoryNodes = channel['itunes:category']
  if (!categoryNodes || !categoryNodes.length) return []

  return categoryNodes.flatMap((node) => {
    const label = node['$']?.text
    if (!label) return []

    // Recurse only when the node actually nests further categories
    const children = node['itunes:category'] ? extractCategories(node) : []
    return children.length ? children.map((child) => `${label}:${child}`) : [label]
  })
}
|
|
|
|
|
|
|
|
/**
 * Build the podcast-level metadata object from a parsed RSS channel.
 *
 * - `feedUrl` comes from `itunes:new-feed-url`, falling back to the `href` of the first `atom:link`.
 * - `description` is the sanitized HTML description, `descriptionPlain` has all tags stripped.
 * - The remaining fields are copied from the first array item of each source key, with any
 *   namespace prefix removed from the property name (e.g. `itunes:author` -> `author`).
 *
 * @param {Object} channel - parsed RSS channel node
 * @returns {Object} metadata object
 */
function extractPodcastMetadata(channel) {
  const metadata = {
    image: extractImage(channel),
    categories: extractCategories(channel),
    feedUrl: null,
    description: null,
    descriptionPlain: null,
    type: null
  }

  if (channel['itunes:new-feed-url']) {
    metadata.feedUrl = extractFirstArrayItem(channel, 'itunes:new-feed-url')
  } else if (channel['atom:link'] && channel['atom:link'].length && channel['atom:link'][0]['$']) {
    metadata.feedUrl = channel['atom:link'][0]['$'].href || null
  }

  if (channel['description']) {
    let rawDescription = extractFirstArrayItem(channel, 'description') || ''
    // An element carrying attributes is parsed as an object with its text under `_`;
    // unwrap it (same convention as the fields below) so `.trim()` cannot throw.
    if (typeof rawDescription !== 'string') {
      rawDescription = typeof rawDescription._ === 'string' ? rawDescription._ : ''
    }
    rawDescription = rawDescription.trim()
    metadata.description = htmlSanitizer.sanitize(rawDescription)
    metadata.descriptionPlain = htmlSanitizer.stripAllTags(rawDescription)
  }

  const arrayFields = ['title', 'language', 'itunes:explicit', 'itunes:author', 'pubDate', 'link', 'itunes:type']
  arrayFields.forEach((key) => {
    // Drop the namespace prefix for the metadata property name
    const cleanKey = key.split(':').pop()
    let value = extractFirstArrayItem(channel, key)
    if (value?.['_']) value = value['_']
    metadata[cleanKey] = value
  })
  return metadata
}
|
|
|
|
|
|
|
|
/**
 * Build a raw episode object from one parsed RSS `<item>`.
 * Items without an enclosure url are rejected (logged, returns null).
 *
 * Text elements that carry attributes are parsed as objects with the text under `_`;
 * description fields are unwrapped the same way `pubDate` and `guid` are, so a
 * non-string value can no longer make `.trim()` throw.
 *
 * @param {Object} item - parsed RSS item node
 * @returns {Object|null} episode data, or null when the item has no enclosure url
 */
function extractEpisodeData(item) {
  // Episode must have url
  if (!item.enclosure?.[0]?.['$']?.url) {
    Logger.error(`[podcastUtils] Invalid podcast episode data`)
    return null
  }

  // Normalize a parsed text node (string, or object with text under `_`) to a string
  const toText = (node) => {
    if (typeof node === 'string') return node
    return typeof node?._ === 'string' ? node._ : ''
  }

  const episode = {
    enclosure: {
      ...item.enclosure[0]['$']
    }
  }

  episode.enclosure.url = episode.enclosure.url.trim()

  // Full description with html
  if (item['content:encoded']) {
    const rawDescription = toText(extractFirstArrayItem(item, 'content:encoded')).trim()
    episode.description = htmlSanitizer.sanitize(rawDescription)
  }

  // Extract chapters
  if (item['podcast:chapters']?.[0]?.['$']?.url) {
    episode.chaptersUrl = item['podcast:chapters'][0]['$'].url
    episode.chaptersType = item['podcast:chapters'][0]['$'].type || 'application/json'
  }

  // Supposed to be the plaintext description but not always followed
  if (item['description']) {
    const rawDescription = toText(extractFirstArrayItem(item, 'description')).trim()
    // content:encoded takes precedence for the html description when it produced one
    if (!episode.description) episode.description = htmlSanitizer.sanitize(rawDescription)
    episode.descriptionPlain = htmlSanitizer.stripAllTags(rawDescription)
  }

  if (item['pubDate']) {
    const pubDate = extractFirstArrayItem(item, 'pubDate')
    if (typeof pubDate === 'string') {
      episode.pubDate = pubDate
    } else if (typeof pubDate?._ === 'string') {
      episode.pubDate = pubDate._
    } else {
      Logger.error(`[podcastUtils] Invalid pubDate ${item['pubDate']} for ${episode.enclosure.url}`)
    }
  }

  if (item['guid']) {
    const guidItem = extractFirstArrayItem(item, 'guid')
    if (typeof guidItem === 'string') {
      episode.guid = guidItem
    } else if (typeof guidItem?._ === 'string') {
      episode.guid = guidItem._
    } else {
      Logger.error(`[podcastUtils] Invalid guid ${item['guid']} for ${episode.enclosure.url}`)
    }
  }

  const arrayFields = ['title', 'itunes:episodeType', 'itunes:season', 'itunes:episode', 'itunes:author', 'itunes:duration', 'itunes:explicit', 'itunes:subtitle']
  arrayFields.forEach((key) => {
    // Drop the namespace prefix for the episode property name
    const cleanKey = key.split(':').pop()
    let value = extractFirstArrayItem(item, key)
    if (value?.['_']) value = value['_']
    episode[cleanKey] = value
  })
  return episode
}
|
|
|
|
|
2022-03-19 12:41:54 +01:00
|
|
|
/**
 * Normalize raw episode data into the episode shape returned to callers.
 * Missing text fields become empty strings, missing guid/chapter fields become
 * null, and `publishedAt` is the millisecond timestamp parsed from `pubDate`
 * (null when `pubDate` is absent or cannot be parsed).
 *
 * @param {Object} data - raw episode data
 * @returns {RssPodcastEpisode}
 */
function cleanEpisodeData(data) {
  let publishedAt = null
  if (data.pubDate) {
    const timestamp = new Date(data.pubDate).valueOf()
    if (!Number.isNaN(timestamp)) publishedAt = timestamp
  }

  const orEmpty = (value) => value || ''
  const orNull = (value) => value || null

  return {
    title: data.title,
    subtitle: orEmpty(data.subtitle),
    description: orEmpty(data.description),
    descriptionPlain: orEmpty(data.descriptionPlain),
    pubDate: orEmpty(data.pubDate),
    episodeType: orEmpty(data.episodeType),
    season: orEmpty(data.season),
    episode: orEmpty(data.episode),
    author: orEmpty(data.author),
    duration: orEmpty(data.duration),
    explicit: orEmpty(data.explicit),
    publishedAt,
    enclosure: data.enclosure,
    guid: orNull(data.guid),
    chaptersUrl: orNull(data.chaptersUrl),
    chaptersType: orNull(data.chaptersType)
  }
}
|
|
|
|
|
2022-03-06 01:54:24 +01:00
|
|
|
/**
 * Convert parsed RSS items into cleaned episodes, dropping every item that
 * `extractEpisodeData` rejects.
 *
 * @param {Object[]} items - parsed RSS item nodes
 * @returns {RssPodcastEpisode[]}
 */
function extractPodcastEpisodes(items) {
  return items.flatMap((item) => {
    const rawEpisode = extractEpisodeData(item)
    return rawEpisode ? [cleanEpisodeData(rawEpisode)] : []
  })
}
|
|
|
|
|
2022-05-29 18:46:45 +02:00
|
|
|
/**
 * Turn the parsed `rss` node into a podcast object.
 * Returns null (after logging) when there is no channel or the channel has no items.
 *
 * @param {Object} rssJson - parsed `rss` root node
 * @param {boolean} excludeEpisodeMetadata - when truthy only `numEpisodes` is set instead of `episodes`
 * @returns {RssPodcast|null}
 */
function cleanPodcastJson(rssJson, excludeEpisodeMetadata) {
  const channels = rssJson.channel
  if (!channels?.length) {
    Logger.error(`[podcastUtil] Invalid podcast no channel object`)
    return null
  }

  const channel = channels[0]
  const items = channel.item
  if (!items?.length) {
    Logger.error(`[podcastUtil] Invalid podcast no episodes`)
    return null
  }

  const metadata = extractPodcastMetadata(channel)
  if (excludeEpisodeMetadata) {
    return { metadata, numEpisodes: items.length }
  }
  return { metadata, episodes: extractPodcastEpisodes(items) }
}
|
|
|
|
|
2022-05-29 18:46:45 +02:00
|
|
|
/**
 * Parse podcast RSS feed XML into a podcast object.
 *
 * @param {string} xml - raw feed XML
 * @param {boolean} [excludeEpisodeMetadata=false] - forwarded to cleanPodcastJson
 * @param {boolean} [includeRaw=false] - also return the full parsed XML as `rawJson`
 * @returns {Promise<{ podcast: RssPodcast, rawJson?: Object }|null>} null when the xml is empty,
 *   has no `rss` root, or yields no podcast
 */
module.exports.parsePodcastRssFeedXml = async (xml, excludeEpisodeMetadata = false, includeRaw = false) => {
  if (!xml) return null

  const parsedXml = await xmlToJSON(xml)
  if (!parsedXml?.rss) {
    Logger.error('[podcastUtils] Invalid XML or RSS feed')
    return null
  }

  const podcast = cleanPodcastJson(parsedXml.rss, excludeEpisodeMetadata)
  if (!podcast) return null

  return includeRaw ? { podcast, rawJson: parsedXml } : { podcast }
}
|
|
|
|
|
2023-12-17 19:00:11 +01:00
|
|
|
/**
 * Get podcast RSS feed as JSON
 * Requests go through the SSRF filter agents unless `global.DisableSsrfRequestFilter`
 * returns a truthy value for the feed url.
 *
 * Resolves to null (after logging) when the request fails, the response body is
 * empty, or the feed cannot be parsed.
 *
 * @param {string} feedUrl
 * @param {boolean} [excludeEpisodeMetadata=false]
 * @returns {Promise<RssPodcast|null>}
 */
module.exports.getPodcastFeed = (feedUrl, excludeEpisodeMetadata = false) => {
  Logger.debug(`[podcastUtils] getPodcastFeed for "${feedUrl}"`)

  // Workaround for CBC RSS feeds rejecting our user agent string
  // See: https://github.com/advplyr/audiobookshelf/issues/3322
  const isCbcFeed = feedUrl.startsWith('https://www.cbc.ca')
  const userAgent = isCbcFeed ? 'audiobookshelf (+https://audiobookshelf.org; like iTMS) - CBC' : 'audiobookshelf (+https://audiobookshelf.org; like iTMS)'

  // Invoked once per agent option so each option gets its own filter result
  const createAgent = () => (global.DisableSsrfRequestFilter?.(feedUrl) ? null : ssrfFilter(feedUrl))

  const requestOptions = {
    url: feedUrl,
    method: 'GET',
    timeout: 12000,
    responseType: 'arraybuffer',
    headers: {
      Accept: 'application/rss+xml, application/xhtml+xml, application/xml, */*;q=0.8',
      'User-Agent': userAgent
    },
    httpAgent: createAgent(),
    httpsAgent: createAgent()
  }

  return axios(requestOptions)
    .then(async (response) => {
      // Adding support for ISO-8859-1 encoded RSS feeds.
      // See: https://github.com/advplyr/audiobookshelf/issues/1489
      const contentType = response.headers?.['content-type'] || '' // e.g. text/xml; charset=iso-8859-1
      const isLatin1 = contentType.toLowerCase().includes('iso-8859-1')
      response.data = isLatin1 ? response.data.toString('latin1') : response.data.toString()

      if (!response?.data) {
        Logger.error(`[podcastUtils] getPodcastFeed: Invalid podcast feed request response (${feedUrl})`)
        return null
      }

      Logger.debug(`[podcastUtils] getPodcastFeed for "${feedUrl}" success - parsing xml`)
      const payload = await this.parsePodcastRssFeedXml(response.data, excludeEpisodeMetadata)
      if (!payload) return null

      // RSS feed may be a private RSS feed
      payload.podcast.metadata.feedUrl = feedUrl

      return payload.podcast
    })
    .catch((error) => {
      Logger.error('[podcastUtils] getPodcastFeed Error', error)
      return null
    })
}
|
|
|
|
|
|
|
|
/**
 * Fetch the feed at `feedUrl` and return its episodes matching `searchTitle`,
 * ordered by closest match (Levenshtein distance of 6 or less).
 * A rejected feed request is treated as "no feed".
 *
 * @param {string} feedUrl
 * @param {string} searchTitle
 * @returns {Promise<{ episode: RssPodcastEpisode, levenshtein: number }[]|null>} null when the feed has no episodes
 */
module.exports.findMatchingEpisodes = async (feedUrl, searchTitle) => {
  const feed = await this.getPodcastFeed(feedUrl).catch(() => null)
  return this.findMatchingEpisodesInFeed(feed, searchTitle)
}
|
|
|
|
|
|
|
|
/**
 * Find episodes in an already-fetched feed whose title matches `searchTitle`.
 * Titles are compared lowercased and trimmed. An exact match scores 0; otherwise
 * an episode is kept when its Levenshtein distance is 6 or less and smaller than
 * the episode title length. Results are sorted by ascending distance.
 *
 * @param {RssPodcast|null} feed
 * @param {string} searchTitle
 * @returns {{ episode: RssPodcastEpisode, levenshtein: number }[]|null} null when the feed has no episodes
 */
module.exports.findMatchingEpisodesInFeed = (feed, searchTitle) => {
  const normalizedSearch = searchTitle.toLowerCase().trim()
  if (!feed?.episodes) return null

  const scored = feed.episodes.flatMap((episode) => {
    // Episodes without a title cannot be matched
    if (!episode.title) return []

    const normalizedTitle = episode.title.toLowerCase().trim()
    if (normalizedTitle === normalizedSearch) {
      return [{ episode, levenshtein: 0 }]
    }

    const levenshtein = levenshteinDistance(normalizedSearch, normalizedTitle, true)
    const isCloseEnough = levenshtein <= 6 && normalizedTitle.length > levenshtein
    return isCloseEnough ? [{ episode, levenshtein }] : []
  })

  return scored.sort((first, second) => first.levenshtein - second.levenshtein)
}
|