mirror of
https://github.com/advplyr/audiobookshelf.git
synced 2025-03-19 00:18:34 +01:00
Merge pull request #2186 from mikiher/Fuzzy-Matching-Continued
Fuzzy matching continued
This commit is contained in:
commit
5ad9f507ba
@ -59,12 +59,17 @@ class BookFinder {
|
|||||||
|
|
||||||
// Remove single quotes (i.e. "Ender's Game" becomes "Enders Game")
|
// Remove single quotes (i.e. "Ender's Game" becomes "Enders Game")
|
||||||
cleaned = cleaned.replace(/'/g, '')
|
cleaned = cleaned.replace(/'/g, '')
|
||||||
return this.replaceAccentedChars(cleaned)
|
return this.replaceAccentedChars(cleaned).toLowerCase()
|
||||||
}
|
}
|
||||||
|
|
||||||
cleanAuthorForCompares(author) {
|
cleanAuthorForCompares(author) {
|
||||||
if (!author) return ''
|
if (!author) return ''
|
||||||
return this.replaceAccentedChars(author)
|
let cleanAuthor = this.replaceAccentedChars(author).toLowerCase()
|
||||||
|
// separate initials
|
||||||
|
cleanAuthor = cleanAuthor.replace(/([a-z])\.([a-z])/g, '$1. $2')
|
||||||
|
// remove middle initials
|
||||||
|
cleanAuthor = cleanAuthor.replace(/(?<=\w\w)(\s+[a-z]\.?)+(?=\s+\w\w)/g, '')
|
||||||
|
return cleanAuthor
|
||||||
}
|
}
|
||||||
|
|
||||||
filterSearchResults(books, title, author, maxTitleDistance, maxAuthorDistance) {
|
filterSearchResults(books, title, author, maxTitleDistance, maxAuthorDistance) {
|
||||||
@ -136,6 +141,10 @@ class BookFinder {
|
|||||||
if (!booksFiltered.length && books.length) {
|
if (!booksFiltered.length && books.length) {
|
||||||
if (this.verbose) Logger.debug(`Search has ${books.length} matches, but no close title matches`)
|
if (this.verbose) Logger.debug(`Search has ${books.length} matches, but no close title matches`)
|
||||||
}
|
}
|
||||||
|
booksFiltered.sort((a, b) => {
|
||||||
|
return a.totalDistance - b.totalDistance
|
||||||
|
})
|
||||||
|
|
||||||
return booksFiltered
|
return booksFiltered
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -179,35 +188,152 @@ class BookFinder {
|
|||||||
return books
|
return books
|
||||||
}
|
}
|
||||||
|
|
||||||
addTitleCandidate(title, candidates) {
|
static TitleCandidates = class {
|
||||||
// Main variant
|
|
||||||
const cleanTitle = this.cleanTitleForCompares(title).trim()
|
|
||||||
if (!cleanTitle) return
|
|
||||||
candidates.add(cleanTitle)
|
|
||||||
|
|
||||||
let candidate = cleanTitle
|
constructor(bookFinder, cleanAuthor) {
|
||||||
|
this.bookFinder = bookFinder
|
||||||
|
this.candidates = new Set()
|
||||||
|
this.cleanAuthor = cleanAuthor
|
||||||
|
this.priorities = {}
|
||||||
|
this.positions = {}
|
||||||
|
}
|
||||||
|
|
||||||
// Remove subtitle
|
add(title, position = 0) {
|
||||||
candidate = candidate.replace(/([,:;_]| by ).*/g, "").trim()
|
// if title contains the author, remove it
|
||||||
if (candidate)
|
if (this.cleanAuthor) {
|
||||||
candidates.add(candidate)
|
const authorRe = new RegExp(`(^| | by |)${this.cleanAuthor}(?= |$)`, "g")
|
||||||
|
title = this.bookFinder.cleanAuthorForCompares(title).replace(authorRe, '').trim()
|
||||||
|
}
|
||||||
|
|
||||||
// Remove preceding/trailing numbers
|
const titleTransformers = [
|
||||||
candidate = candidate.replace(/^\d+ | \d+$/g, "").trim()
|
[/([,:;_]| by ).*/g, ''], // Remove subtitle
|
||||||
if (candidate)
|
[/(^| )\d+k(bps)?( |$)/, ' '], // Remove bitrate
|
||||||
candidates.add(candidate)
|
[/ (2nd|3rd|\d+th)\s+ed(\.|ition)?/g, ''], // Remove edition
|
||||||
|
[/(^| |\.)(m4b|m4a|mp3)( |$)/g, ''], // Remove file-type
|
||||||
|
[/ a novel.*$/g, ''], // Remove "a novel"
|
||||||
|
[/^\d+ | \d+$/g, ''], // Remove preceding/trailing numbers
|
||||||
|
]
|
||||||
|
|
||||||
// Remove bitrate
|
// Main variant
|
||||||
candidate = candidate.replace(/(^| )\d+k(bps)?( |$)/, " ").trim()
|
const cleanTitle = this.bookFinder.cleanTitleForCompares(title).trim()
|
||||||
if (candidate)
|
if (!cleanTitle) return
|
||||||
candidates.add(candidate)
|
this.candidates.add(cleanTitle)
|
||||||
|
this.priorities[cleanTitle] = 0
|
||||||
|
this.positions[cleanTitle] = position
|
||||||
|
|
||||||
// Remove edition
|
let candidate = cleanTitle
|
||||||
candidate = candidate.replace(/ (2nd|3rd|\d+th)\s+ed(\.|ition)?/, "").trim()
|
|
||||||
if (candidate)
|
for (const transformer of titleTransformers)
|
||||||
candidates.add(candidate)
|
candidate = candidate.replace(transformer[0], transformer[1]).trim()
|
||||||
|
|
||||||
|
if (candidate != cleanTitle) {
|
||||||
|
if (candidate) {
|
||||||
|
this.candidates.add(candidate)
|
||||||
|
this.priorities[candidate] = 0
|
||||||
|
this.positions[candidate] = position
|
||||||
|
}
|
||||||
|
this.priorities[cleanTitle] = 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
get size() {
|
||||||
|
return this.candidates.size
|
||||||
|
}
|
||||||
|
|
||||||
|
getCandidates() {
|
||||||
|
var candidates = [...this.candidates]
|
||||||
|
candidates.sort((a, b) => {
|
||||||
|
// Candidates that include the author are likely low quality
|
||||||
|
const includesAuthorDiff = !b.includes(this.cleanAuthor) - !a.includes(this.cleanAuthor)
|
||||||
|
if (includesAuthorDiff) return includesAuthorDiff
|
||||||
|
// Candidates that include only digits are also likely low quality
|
||||||
|
const onlyDigits = /^\d+$/
|
||||||
|
const includesOnlyDigitsDiff = !onlyDigits.test(b) - !onlyDigits.test(a)
|
||||||
|
if (includesOnlyDigitsDiff) return includesOnlyDigitsDiff
|
||||||
|
// transformed candidates receive higher priority
|
||||||
|
const priorityDiff = this.priorities[a] - this.priorities[b]
|
||||||
|
if (priorityDiff) return priorityDiff
|
||||||
|
// if same priorirty, prefer candidates that are closer to the beginning (e.g. titles before subtitles)
|
||||||
|
const positionDiff = this.positions[a] - this.positions[b]
|
||||||
|
if (positionDiff) return positionDiff
|
||||||
|
// Start with longer candidaets, as they are likely more specific
|
||||||
|
const lengthDiff = b.length - a.length
|
||||||
|
if (lengthDiff) return lengthDiff
|
||||||
|
return b.localeCompare(a)
|
||||||
|
})
|
||||||
|
Logger.debug(`[${this.constructor.name}] Found ${candidates.length} fuzzy title candidates`)
|
||||||
|
Logger.debug(candidates)
|
||||||
|
return candidates
|
||||||
|
}
|
||||||
|
|
||||||
|
delete(title) {
|
||||||
|
return this.candidates.delete(title)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static AuthorCandidates = class {
|
||||||
|
constructor(bookFinder, cleanAuthor) {
|
||||||
|
this.bookFinder = bookFinder
|
||||||
|
this.candidates = new Set()
|
||||||
|
this.cleanAuthor = cleanAuthor
|
||||||
|
if (cleanAuthor) this.candidates.add(cleanAuthor)
|
||||||
|
}
|
||||||
|
|
||||||
|
validateAuthor(name, region = '', maxLevenshtein = 2) {
|
||||||
|
return this.bookFinder.audnexus.authorASINsRequest(name, region).then((asins) => {
|
||||||
|
for (const [i, asin] of asins.entries()) {
|
||||||
|
if (i > 10) break
|
||||||
|
let cleanName = this.bookFinder.cleanAuthorForCompares(asin.name)
|
||||||
|
if (!cleanName) continue
|
||||||
|
if (cleanName.includes(name)) return name
|
||||||
|
if (name.includes(cleanName)) return cleanName
|
||||||
|
if (levenshteinDistance(cleanName, name) <= maxLevenshtein) return cleanName
|
||||||
|
}
|
||||||
|
return ''
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
add(author) {
|
||||||
|
const cleanAuthor = this.bookFinder.cleanAuthorForCompares(author).trim()
|
||||||
|
if (!cleanAuthor) return
|
||||||
|
this.candidates.add(cleanAuthor)
|
||||||
|
}
|
||||||
|
|
||||||
|
get size() {
|
||||||
|
return this.candidates.size
|
||||||
|
}
|
||||||
|
|
||||||
|
get agressivelyCleanAuthor() {
|
||||||
|
if (this.cleanAuthor) {
|
||||||
|
const agressivelyCleanAuthor = this.cleanAuthor.replace(/[,/-].*$/, '').trim()
|
||||||
|
return agressivelyCleanAuthor ? agressivelyCleanAuthor : this.cleanAuthor
|
||||||
|
}
|
||||||
|
return ''
|
||||||
|
}
|
||||||
|
|
||||||
|
async getCandidates() {
|
||||||
|
var filteredCandidates = []
|
||||||
|
var promises = []
|
||||||
|
for (const candidate of this.candidates) {
|
||||||
|
promises.push(this.validateAuthor(candidate))
|
||||||
|
}
|
||||||
|
const results = [...new Set(await Promise.all(promises))]
|
||||||
|
filteredCandidates = results.filter(author => author)
|
||||||
|
// If no valid candidates were found, add back an aggresively cleaned author version
|
||||||
|
if (!filteredCandidates.length && this.cleanAuthor) filteredCandidates.push(this.agressivelyCleanAuthor)
|
||||||
|
// Always add an empty author candidate
|
||||||
|
filteredCandidates.push('')
|
||||||
|
Logger.debug(`[${this.constructor.name}] Found ${filteredCandidates.length} fuzzy author candidates`)
|
||||||
|
Logger.debug(filteredCandidates)
|
||||||
|
return filteredCandidates
|
||||||
|
}
|
||||||
|
|
||||||
|
delete(author) {
|
||||||
|
return this.candidates.delete(author)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Search for books including fuzzy searches
|
* Search for books including fuzzy searches
|
||||||
*
|
*
|
||||||
@ -232,62 +358,36 @@ class BookFinder {
|
|||||||
books = await this.runSearch(title, author, provider, asin, maxTitleDistance, maxAuthorDistance)
|
books = await this.runSearch(title, author, provider, asin, maxTitleDistance, maxAuthorDistance)
|
||||||
|
|
||||||
if (!books.length && maxFuzzySearches > 0) {
|
if (!books.length && maxFuzzySearches > 0) {
|
||||||
// normalize title and author
|
// Normalize title and author
|
||||||
title = title.trim().toLowerCase()
|
title = title.trim().toLowerCase()
|
||||||
author = author?.trim().toLowerCase() || ''
|
author = author?.trim().toLowerCase() || ''
|
||||||
|
|
||||||
// Now run up to maxFuzzySearches fuzzy searches
|
const cleanAuthor = this.cleanAuthorForCompares(author)
|
||||||
let candidates = new Set()
|
|
||||||
let cleanedAuthor = this.cleanAuthorForCompares(author)
|
|
||||||
this.addTitleCandidate(title, candidates)
|
|
||||||
|
|
||||||
// remove parentheses and their contents, and replace with a separator
|
// Now run up to maxFuzzySearches fuzzy searches
|
||||||
const cleanTitle = title.replace(/\[.*?\]|\(.*?\)|{.*?}/g, " - ")
|
let authorCandidates = new BookFinder.AuthorCandidates(this, cleanAuthor)
|
||||||
|
|
||||||
|
// Remove underscores and parentheses with their contents, and replace with a separator
|
||||||
|
const cleanTitle = title.replace(/\[.*?\]|\(.*?\)|{.*?}|_/g, " - ")
|
||||||
// Split title into hypen-separated parts
|
// Split title into hypen-separated parts
|
||||||
const titleParts = cleanTitle.split(/ - | -|- /)
|
const titleParts = cleanTitle.split(/ - | -|- /)
|
||||||
for (const titlePart of titleParts) {
|
for (const titlePart of titleParts)
|
||||||
this.addTitleCandidate(titlePart, candidates)
|
authorCandidates.add(titlePart)
|
||||||
}
|
authorCandidates = await authorCandidates.getCandidates()
|
||||||
// We already searched for original title
|
for (const authorCandidate of authorCandidates) {
|
||||||
if (author == cleanedAuthor) candidates.delete(title)
|
let titleCandidates = new BookFinder.TitleCandidates(this, authorCandidate)
|
||||||
if (candidates.size > 0) {
|
for (const [position, titlePart] of titleParts.entries())
|
||||||
candidates = [...candidates]
|
titleCandidates.add(titlePart, position)
|
||||||
candidates.sort((a, b) => {
|
titleCandidates = titleCandidates.getCandidates()
|
||||||
// Candidates that include the author are likely low quality
|
for (const titleCandidate of titleCandidates) {
|
||||||
const includesAuthorDiff = !b.includes(cleanedAuthor) - !a.includes(cleanedAuthor)
|
if (titleCandidate == title && authorCandidate == author) continue // We already tried this
|
||||||
if (includesAuthorDiff) return includesAuthorDiff
|
|
||||||
// Candidates that include only digits are also likely low quality
|
|
||||||
const onlyDigits = /^\d+$/
|
|
||||||
const includesOnlyDigitsDiff = !onlyDigits.test(b) - !onlyDigits.test(a)
|
|
||||||
if (includesOnlyDigitsDiff) return includesOnlyDigitsDiff
|
|
||||||
// Start with longer candidaets, as they are likely more specific
|
|
||||||
const lengthDiff = b.length - a.length
|
|
||||||
if (lengthDiff) return lengthDiff
|
|
||||||
return b.localeCompare(a)
|
|
||||||
})
|
|
||||||
Logger.debug(`[BookFinder] Found ${candidates.length} fuzzy title candidates`, candidates)
|
|
||||||
for (const candidate of candidates) {
|
|
||||||
if (++numFuzzySearches > maxFuzzySearches) return books
|
if (++numFuzzySearches > maxFuzzySearches) return books
|
||||||
books = await this.runSearch(candidate, cleanedAuthor, provider, asin, maxTitleDistance, maxAuthorDistance)
|
books = await this.runSearch(titleCandidate, authorCandidate, provider, asin, maxTitleDistance, maxAuthorDistance)
|
||||||
if (books.length) break
|
if (books.length) return books
|
||||||
}
|
|
||||||
if (!books.length) {
|
|
||||||
// Now try searching without the author
|
|
||||||
for (const candidate of candidates) {
|
|
||||||
if (++numFuzzySearches > maxFuzzySearches) return books
|
|
||||||
books = await this.runSearch(candidate, '', provider, asin, maxTitleDistance, maxAuthorDistance)
|
|
||||||
if (books.length) break
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (provider === 'openlibrary') {
|
|
||||||
books.sort((a, b) => {
|
|
||||||
return a.totalDistance - b.totalDistance
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
return books
|
return books
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user