audiobookshelf/server/utils/podcastUtils.js
Conner McCall f460297daf
fix: allow upgrading HTTP to HTTPS for redirects
Re: #3142 and #3658

When adding certain podcasts, the server encountered a redirect from an HTTP URL to an HTTPS domain, causing an error that was difficult for end users to diagnose without inspecting logs or HTML.

This issue arose due to SSRF security measures that blocked such redirects. Instead of failing in these cases, we now detect when the error is caused by an HTTP-to-HTTPS upgrade. If confirmed, we upgrade the initial URL to HTTPS and resend the request.

Since this change does not allow cross-protocol or cross-domain redirections, it remains secure while resolving most of the reported issues.

Affected podcasts that are now fixed:

- D&D is for Nerds
- The New Yorker: The Writer's Voice - New Fiction from The New Yorker
- Radiolab
2025-02-13 09:19:02 -06:00

401 lines
12 KiB
JavaScript

const axios = require('axios')
const ssrfFilter = require('ssrf-req-filter')
const Logger = require('../Logger')
const { xmlToJSON, levenshteinDistance } = require('./index')
const htmlSanitizer = require('../utils/htmlSanitizer')
/**
* @typedef RssPodcastEpisode
* @property {string} title
* @property {string} subtitle
* @property {string} description
* @property {string} descriptionPlain
* @property {string} pubDate
* @property {string} episodeType
* @property {string} season
* @property {string} episode
* @property {string} author
* @property {string} duration
* @property {string} explicit
* @property {number} publishedAt - Unix timestamp
* @property {{ url: string, type?: string, length?: string }} enclosure
* @property {string} guid
* @property {string} chaptersUrl
* @property {string} chaptersType
*/
/**
* @typedef RssPodcastMetadata
* @property {string} title
* @property {string} language
* @property {string} explicit
* @property {string} author
* @property {string} pubDate
* @property {string} link
* @property {string} image
* @property {string[]} categories
* @property {string} feedUrl
* @property {string} description
* @property {string} descriptionPlain
* @property {string} type
*/
/**
* @typedef RssPodcast
* @property {RssPodcastMetadata} metadata
* @property {RssPodcastEpisode[]} episodes
* @property {number} numEpisodes
*/
/**
 * Return the first entry stored under `key`, or null when the entry list is missing or empty.
 *
 * @param {Object} json - parsed node to read from
 * @param {string} key - property name to look up
 * @returns {*} first element of `json[key]`, or null
 */
function extractFirstArrayItem(json, key) {
  const entries = json[key]
  return entries?.length ? entries[0] : null
}
/**
 * Reduce a parsed node to a string.
 *
 * If the first element of the node's first property is a string, that string is returned.
 * Otherwise the whole node is serialized with JSON.stringify.
 *
 * @param {Object} json - parsed node
 * @returns {string} extracted or serialized text, or '' when the node cannot be read or serialized
 */
function extractStringOrStringify(json) {
  try {
    const firstValue = json[Object.keys(json)[0]]
    if (typeof firstValue?.[0] === 'string') {
      return firstValue[0]
    }
    // Handles case where html was included without being wrapped in CDATA
    // JSON.stringify may yield undefined for unserializable input, so fall back to ''
    return JSON.stringify(json) ?? ''
  } catch {
    return ''
  }
}
/**
 * Read the first entry under `key` and coerce it to a string.
 *
 * - string entry: returned as-is
 * - object entry with a non-empty string `_` property: that property
 * - any other object entry: result of extractStringOrStringify
 * - missing, falsy or other primitive entry: ''
 *
 * @param {Object} json - parsed node to read from
 * @param {string} key - property name to look up
 * @returns {string}
 */
function extractFirstArrayItemString(json, key) {
  const first = extractFirstArrayItem(json, key)
  if (!first) return ''
  if (typeof first === 'string') return first
  if (typeof first !== 'object') return ''

  const text = first['_']
  if (text && typeof text === 'string') return text
  return extractStringOrStringify(first)
}
/**
 * Pick the podcast cover image url from a parsed channel.
 *
 * Uses the first entry of `channel.image.url` when that list is non-empty, otherwise the
 * `href` attribute of the first `itunes:image` element.
 *
 * @param {Object} channel - parsed channel node
 * @returns {string|null} image url, or null when none is available
 */
function extractImage(channel) {
  const rssImageUrls = channel.image?.url
  if (rssImageUrls?.length) {
    return rssImageUrls[0] || null
  }

  const itunesImages = channel['itunes:image']
  if (!itunesImages?.length || !itunesImages[0]['$']) {
    return null
  }
  return itunesImages[0]['$'].href || null
}
/**
 * Collect `itunes:category` names from a node, recursing into nested categories.
 *
 * A category with sub-categories yields one "Parent:Child" entry per sub-category;
 * a category without usable sub-categories yields its own name. Entries without a
 * `text` attribute are skipped.
 *
 * @param {Object} channel - parsed channel (or category) node
 * @returns {string[]} category names
 */
function extractCategories(channel) {
  const categoryNodes = channel['itunes:category']
  if (!categoryNodes?.length) return []

  return categoryNodes.flatMap((node) => {
    const label = node['$']?.text
    if (!label) return []

    const nested = node['itunes:category'] ? extractCategories(node) : []
    return nested.length ? nested.map((child) => `${label}:${child}`) : [label]
  })
}
/**
 * Build the podcast-level metadata object from a parsed channel.
 *
 * The feed url comes from `itunes:new-feed-url` when present, otherwise from the `href`
 * of the first `atom:link`. The description is stored both sanitized and with all tags stripped.
 * The remaining simple fields are copied under their un-prefixed names
 * (e.g. `itunes:author` -> `author`).
 *
 * @param {Object} channel - parsed channel node
 * @returns {Object} metadata object
 */
function extractPodcastMetadata(channel) {
  const metadata = {
    image: extractImage(channel),
    categories: extractCategories(channel),
    feedUrl: null,
    description: null,
    descriptionPlain: null,
    type: null
  }

  const atomLinks = channel['atom:link']
  if (channel['itunes:new-feed-url']) {
    metadata.feedUrl = extractFirstArrayItem(channel, 'itunes:new-feed-url')
  } else if (atomLinks?.length && atomLinks[0]['$']) {
    metadata.feedUrl = atomLinks[0]['$'].href || null
  }

  if (channel['description']) {
    const trimmedDescription = extractFirstArrayItemString(channel, 'description').trim()
    metadata.description = htmlSanitizer.sanitize(trimmedDescription)
    metadata.descriptionPlain = htmlSanitizer.stripAllTags(trimmedDescription)
  }

  const simpleFields = ['title', 'language', 'itunes:explicit', 'itunes:author', 'pubDate', 'link', 'itunes:type']
  for (const field of simpleFields) {
    // Drop the namespace prefix so "itunes:author" is stored as "author"
    const targetKey = field.split(':').pop()
    const first = extractFirstArrayItem(channel, field)
    // Elements carrying attributes keep their text under "_"
    metadata[targetKey] = first?.['_'] ? first['_'] : first
  }
  return metadata
}
/**
 * Build a raw episode object from a single parsed RSS item.
 *
 * The audio source is taken from the first `enclosure` when it has a url, otherwise from
 * the first `media:content` entry that has BOTH a url and a type starting with "audio".
 * Items without any such source are rejected.
 *
 * @param {Object} item - parsed RSS item
 * @returns {Object|null} extracted episode fields, or null when no audio url is found
 */
function extractEpisodeData(item) {
  // Episode must have url
  let enclosure = null
  if (item.enclosure?.[0]?.['$']?.url) {
    enclosure = item.enclosure[0]['$']
  } else {
    // Look the entry up once so the entry that passed the url check is the one that is used.
    // (Searching a second time by type only could select an audio entry that has no url.)
    const audioContent = item['media:content']?.find((c) => c?.['$']?.url && (c['$'].type ?? '').startsWith('audio'))
    enclosure = audioContent?.['$'] || null
  }
  if (!enclosure) {
    Logger.error(`[podcastUtils] Invalid podcast episode data`)
    return null
  }

  const episode = {
    enclosure: enclosure
  }

  // The enclosure object is shared with the parsed item, so the url is trimmed in place
  episode.enclosure.url = episode.enclosure.url.trim()

  // Full description with html
  if (item['content:encoded']) {
    const rawDescription = (extractFirstArrayItem(item, 'content:encoded') || '').trim()
    episode.description = htmlSanitizer.sanitize(rawDescription)
  }

  // Extract chapters
  if (item['podcast:chapters']?.[0]?.['$']?.url) {
    episode.chaptersUrl = item['podcast:chapters'][0]['$'].url
    episode.chaptersType = item['podcast:chapters'][0]['$'].type || 'application/json'
  }

  // Supposed to be the plaintext description but not always followed
  if (item['description']) {
    const rawDescription = extractFirstArrayItemString(item, 'description')
    // Only used as the html description when content:encoded did not provide one
    if (!episode.description) episode.description = htmlSanitizer.sanitize(rawDescription.trim())
    episode.descriptionPlain = htmlSanitizer.stripAllTags(rawDescription.trim())
  }

  if (item['pubDate']) {
    const pubDate = extractFirstArrayItem(item, 'pubDate')
    if (typeof pubDate === 'string') {
      episode.pubDate = pubDate
    } else if (typeof pubDate?._ === 'string') {
      // Element with attributes: text content is stored under "_"
      episode.pubDate = pubDate._
    } else {
      Logger.error(`[podcastUtils] Invalid pubDate ${item['pubDate']} for ${episode.enclosure.url}`)
    }
  }

  if (item['guid']) {
    const guidItem = extractFirstArrayItem(item, 'guid')
    if (typeof guidItem === 'string') {
      episode.guid = guidItem
    } else if (typeof guidItem?._ === 'string') {
      // Element with attributes: text content is stored under "_"
      episode.guid = guidItem._
    } else {
      Logger.error(`[podcastUtils] Invalid guid ${item['guid']} for ${episode.enclosure.url}`)
    }
  }

  const arrayFields = ['title', 'itunes:episodeType', 'itunes:season', 'itunes:episode', 'itunes:author', 'itunes:duration', 'itunes:explicit', 'itunes:subtitle']
  arrayFields.forEach((key) => {
    // Drop the namespace prefix so "itunes:season" is stored as "season"
    const cleanKey = key.split(':').pop()
    episode[cleanKey] = extractFirstArrayItemString(item, key)
  })
  return episode
}
/**
 * Normalize extracted episode data into the RssPodcastEpisode shape.
 *
 * Missing text fields become '', missing guid/chapter fields become null, and
 * `publishedAt` is the millisecond timestamp parsed from `pubDate` (null when
 * `pubDate` is absent or cannot be parsed by Date).
 *
 * @param {Object} data - output of extractEpisodeData
 * @returns {RssPodcastEpisode}
 */
function cleanEpisodeData(data) {
  let publishedAt = null
  if (data.pubDate) {
    const timestamp = new Date(data.pubDate).valueOf()
    if (!Number.isNaN(timestamp)) publishedAt = timestamp
  }

  const textOrEmpty = (value) => value || ''
  const valueOrNull = (value) => value || null

  return {
    title: data.title,
    subtitle: textOrEmpty(data.subtitle),
    description: textOrEmpty(data.description),
    descriptionPlain: textOrEmpty(data.descriptionPlain),
    pubDate: textOrEmpty(data.pubDate),
    episodeType: textOrEmpty(data.episodeType),
    season: textOrEmpty(data.season),
    episode: textOrEmpty(data.episode),
    author: textOrEmpty(data.author),
    duration: textOrEmpty(data.duration),
    explicit: textOrEmpty(data.explicit),
    publishedAt,
    enclosure: data.enclosure,
    guid: valueOrNull(data.guid),
    chaptersUrl: valueOrNull(data.chaptersUrl),
    chaptersType: valueOrNull(data.chaptersType)
  }
}
/**
 * Convert parsed RSS items into cleaned episodes, dropping items that
 * extractEpisodeData rejects.
 *
 * @param {Object[]} items - parsed RSS items
 * @returns {RssPodcastEpisode[]} cleaned episodes in feed order
 */
function extractPodcastEpisodes(items) {
  return items.reduce((episodes, item) => {
    const rawEpisode = extractEpisodeData(item)
    if (rawEpisode) {
      episodes.push(cleanEpisodeData(rawEpisode))
    }
    return episodes
  }, [])
}
/**
 * Turn the parsed `rss` node into a podcast object.
 *
 * @param {Object} rssJson - parsed `rss` node; its first `channel` entry is used
 * @param {boolean} excludeEpisodeMetadata - when truthy only `numEpisodes` is set, otherwise `episodes` is populated
 * @returns {Object|null} podcast with `metadata` plus `episodes` or `numEpisodes`; null when the channel or its items are missing
 */
function cleanPodcastJson(rssJson, excludeEpisodeMetadata) {
  const hasChannel = Boolean(rssJson.channel?.length)
  if (!hasChannel) {
    Logger.error(`[podcastUtil] Invalid podcast no channel object`)
    return null
  }

  const channel = rssJson.channel[0]
  const hasItems = Boolean(channel.item?.length)
  if (!hasItems) {
    Logger.error(`[podcastUtil] Invalid podcast no episodes`)
    return null
  }

  const podcast = { metadata: extractPodcastMetadata(channel) }
  if (excludeEpisodeMetadata) {
    podcast.numEpisodes = channel.item.length
  } else {
    podcast.episodes = extractPodcastEpisodes(channel.item)
  }
  return podcast
}
/**
 * Parse podcast RSS XML into a podcast object.
 *
 * @param {string} xml - RSS feed XML
 * @param {boolean} [excludeEpisodeMetadata=false] - passed through to cleanPodcastJson
 * @param {boolean} [includeRaw=false] - when true the parsed JSON is returned as `rawJson` as well
 * @returns {Promise<{ podcast: Object, rawJson?: Object }|null>} null when the xml is empty or is not a usable RSS feed
 */
module.exports.parsePodcastRssFeedXml = async (xml, excludeEpisodeMetadata = false, includeRaw = false) => {
  if (!xml) return null

  const parsed = await xmlToJSON(xml)
  const rss = parsed?.rss
  if (!rss) {
    Logger.error('[podcastUtils] Invalid XML or RSS feed')
    return null
  }

  const podcast = cleanPodcastJson(rss, excludeEpisodeMetadata)
  if (!podcast) return null

  return includeRaw ? { podcast, rawJson: parsed } : { podcast }
}
/**
 * Get podcast RSS feed as JSON
 * Uses SSRF filter to prevent internal URLs
 *
 * When the request fails because an http:// feed url was redirected to https, the
 * request is retried once against the same url upgraded to https.
 * Request and parse failures are logged and resolve to null.
 *
 * @param {string} feedUrl
 * @param {boolean} [excludeEpisodeMetadata=false]
 * @returns {Promise<RssPodcast|null>}
 */
module.exports.getPodcastFeed = (feedUrl, excludeEpisodeMetadata = false) => {
  Logger.debug(`[podcastUtils] getPodcastFeed for "${feedUrl}"`)

  let userAgent = 'audiobookshelf (+https://audiobookshelf.org; like iTMS)'
  // Workaround for CBC RSS feeds rejecting our user agent string
  // See: https://github.com/advplyr/audiobookshelf/issues/3322
  if (feedUrl.startsWith('https://www.cbc.ca')) {
    userAgent = 'audiobookshelf (+https://audiobookshelf.org; like iTMS) - CBC'
  }

  const disableSsrfFilter = global.DisableSsrfRequestFilter?.(feedUrl)

  return axios({
    url: feedUrl,
    method: 'GET',
    timeout: global.PodcastDownloadTimeout,
    responseType: 'arraybuffer',
    headers: {
      Accept: 'application/rss+xml, application/xhtml+xml, application/xml, */*;q=0.8',
      'Accept-Encoding': 'gzip, compress, deflate',
      'User-Agent': userAgent
    },
    httpAgent: disableSsrfFilter ? null : ssrfFilter(feedUrl),
    httpsAgent: disableSsrfFilter ? null : ssrfFilter(feedUrl)
  })
    .then(async (response) => {
      // Validate the body before decoding it so a missing body is reported as an invalid response
      const rawBody = response?.data
      if (!rawBody) {
        Logger.error(`[podcastUtils] getPodcastFeed: Invalid podcast feed request response (${feedUrl})`)
        return null
      }

      // Adding support for iso-8859-1 encoded RSS feeds.
      // See: https://github.com/advplyr/audiobookshelf/issues/1489
      const contentType = response.headers?.['content-type'] || '' // e.g. text/xml; charset=iso-8859-1
      const xml = contentType.toLowerCase().includes('iso-8859-1') ? rawBody.toString('latin1') : rawBody.toString()

      // The body may decode to an empty string
      if (!xml) {
        Logger.error(`[podcastUtils] getPodcastFeed: Invalid podcast feed request response (${feedUrl})`)
        return null
      }

      Logger.debug(`[podcastUtils] getPodcastFeed for "${feedUrl}" success - parsing xml`)
      const payload = await this.parsePodcastRssFeedXml(xml, excludeEpisodeMetadata)
      if (!payload) {
        return null
      }

      // RSS feed may be a private RSS feed
      payload.podcast.metadata.feedUrl = feedUrl
      return payload.podcast
    })
    .catch((error) => {
      // Check for failures due to redirecting from http to https. If original url was http, upgrade to https and try again
      // Optional chaining keeps this handler from throwing (and rejecting the promise) when the
      // error carries no `cause` or `request`.
      const isProtocolRedirectFailure = error?.code === 'ERR_FR_REDIRECTION_FAILURE' && error.cause?.code === 'ERR_INVALID_PROTOCOL'
      const redirectOptions = error?.request?._options
      if (isProtocolRedirectFailure && feedUrl.startsWith('http://') && redirectOptions?.protocol === 'https:') {
        Logger.info('Redirection from http to https detected. Upgrading Request', redirectOptions.href)
        // The upgraded url no longer starts with http:// so the retry cannot recurse again
        const upgradedFeedUrl = feedUrl.replace('http://', 'https://')
        return this.getPodcastFeed(upgradedFeedUrl, excludeEpisodeMetadata)
      }

      Logger.error('[podcastUtils] getPodcastFeed Error', error)
      return null
    })
}
/**
 * Fetch the feed at `feedUrl` and return its episodes ordered by closest title match
 * (Levenshtein distance of 6 or less). A rejected feed request is treated as a missing feed.
 *
 * @param {string} feedUrl
 * @param {string} searchTitle
 * @returns {Promise<Array<{ episode: RssPodcastEpisode, levenshtein: number }>|null>}
 */
module.exports.findMatchingEpisodes = async (feedUrl, searchTitle) => {
  const feedRequest = this.getPodcastFeed(feedUrl)
  const feed = await feedRequest.catch(() => null)
  return this.findMatchingEpisodesInFeed(feed, searchTitle)
}
/**
 * Find episodes in a feed whose title is close to `searchTitle`.
 *
 * Titles are compared lower-cased and trimmed. Exact matches get distance 0; other
 * episodes are kept when their Levenshtein distance is 6 or less and smaller than the
 * episode title length. Results are ordered by ascending distance.
 *
 * @param {RssPodcast} feed
 * @param {string} searchTitle
 * @returns {Array<{ episode: RssPodcastEpisode, levenshtein: number }>|null} null when the feed has no episodes list
 */
module.exports.findMatchingEpisodesInFeed = (feed, searchTitle) => {
  const needle = searchTitle.toLowerCase().trim()
  if (!feed?.episodes) {
    return null
  }

  const scored = feed.episodes.flatMap((ep) => {
    if (!ep.title) return []

    const candidate = ep.title.toLowerCase().trim()
    if (candidate === needle) {
      return [{ episode: ep, levenshtein: 0 }]
    }

    const levenshtein = levenshteinDistance(needle, candidate, true)
    const isCloseEnough = levenshtein <= 6 && candidate.length > levenshtein
    return isCloseEnough ? [{ episode: ep, levenshtein }] : []
  })

  return scored.sort((a, b) => a.levenshtein - b.levenshtein)
}