Fuzzy Matching V1

This commit is contained in:
mikiher 2023-09-14 21:32:20 +00:00
parent fea28351f9
commit ac746f199b

View File

@ -60,13 +60,13 @@ class BookFinder {
// Remove single quotes (i.e. "Ender's Game" becomes "Enders Game")
cleaned = cleaned.replace(/'/g, '')
cleaned = this.replaceAccentedChars(cleaned)
return cleaned.toLowerCase()
return cleaned
}
cleanAuthorForCompares(author) {
if (!author) return ''
var cleaned = this.replaceAccentedChars(author)
return cleaned.toLowerCase()
return cleaned
}
filterSearchResults(books, title, author, maxTitleDistance, maxAuthorDistance) {
@ -181,12 +181,113 @@ class BookFinder {
return books
}
addTitleCandidate(title, candidates) {
// Main variant
const cleanTitle = this.cleanTitleForCompares(title).trim()
if (!cleanTitle) return
candidates.add(cleanTitle)
let candidate = cleanTitle
// Remove subtitle
candidate = candidate.replace(/([,:;_]| by ).*/g, "").trim()
if (candidate)
candidates.add(candidate)
// Remove preceding/trailing numbers
candidate = candidate.replace(/^\d+ | \d+$/g, "").trim()
if (candidate)
candidates.add(candidate)
// Remove bitrate
candidate = candidate.replace(/(^| )\d+k(bps)?( |$)/, " ").trim()
if (candidate)
candidates.add(candidate)
// Remove edition
candidate = candidate.replace(/ (2nd|3rd|\d+th)\s+ed(\.|ition)?/, "").trim()
if (candidate)
candidates.add(candidate)
}
async search(provider, title, author, isbn, asin, options = {}) {
var books = []
var maxTitleDistance = !isNaN(options.titleDistance) ? Number(options.titleDistance) : 4
var maxAuthorDistance = !isNaN(options.authorDistance) ? Number(options.authorDistance) : 4
const maxTitleDistance = !isNaN(options.titleDistance) ? Number(options.titleDistance) : 4
const maxAuthorDistance = !isNaN(options.authorDistance) ? Number(options.authorDistance) : 4
const maxFuzzySearches = 5
var numFuzzySearches = 0
if (!title)
return books
books = await this.runSearch(title, author, provider, asin, maxTitleDistance, maxAuthorDistance)
if (!books.length && maxFuzzySearches > 0) {
// normalize title and author
title = title.trim().toLowerCase()
author = author.trim().toLowerCase()
// Now run up to maxFuzzySearches fuzzy searches
var candidates = new Set()
var cleanedAuthor = this.cleanAuthorForCompares(author)
this.addTitleCandidate(title, candidates)
// remove parentheses and their contents, and replace with a separator
const cleanTitle = title.replace(/\[.*?\]|\(.*?\)|{.*?}/g, " - ")
// Split title into hypen-separated parts
const titleParts = cleanTitle.split(/ - | -|- /)
for (const titlePart of titleParts) {
this.addTitleCandidate(titlePart, candidates)
}
// We already searched for original title
if (author == cleanedAuthor) candidates.delete(title)
if (candidates.size > 0) {
candidates = [...candidates]
candidates.sort((a, b) => {
// Candidates that include the author are likely low quality
const includesAuthorDiff = !b.includes(cleanedAuthor) - !a.includes(cleanedAuthor)
if (includesAuthorDiff) return includesAuthorDiff
// Candidates that include only digits are also likely low quality
const onlyDigits = /^\d+$/
const includesOnlyDigitsDiff = !onlyDigits.test(b) - !onlyDigits.test(a)
if (includesOnlyDigitsDiff) return includesOnlyDigitsDiff
// Start with longer candidaets, as they are likely more specific
const lengthDiff = b.length - a.length
if (lengthDiff) return lengthDiff
return b.localeCompare(a)
})
Logger.debug(`[BookFinder] Found ${candidates.length} fuzzy title candidates`)
Logger.debug(candidates)
for (const candidate of candidates) {
if (++numFuzzySearches > maxFuzzySearches) return books
books = await this.runSearch(candidate, cleanedAuthor, provider, asin, maxTitleDistance, maxAuthorDistance)
if (books.length) break
}
if (!books.length) {
// Now try searching without the author
for (const candidate of candidates) {
if (++numFuzzySearches > maxFuzzySearches) return books
books = await this.runSearch(candidate, '', provider, asin, maxTitleDistance, maxAuthorDistance)
if (books.length) break
}
}
}
}
if (provider === 'openlibrary') {
books.sort((a, b) => {
return a.totalDistance - b.totalDistance
})
}
return books
}
async runSearch(title, author, provider, asin, maxTitleDistance, maxAuthorDistance) {
Logger.debug(`Book Search: title: "${title}", author: "${author || ''}", provider: ${provider}`)
var books = []
if (provider === 'google') {
books = await this.getGoogleBooksResults(title, author)
} else if (provider.startsWith('audible')) {
@ -203,23 +304,6 @@ class BookFinder {
else {
books = await this.getGoogleBooksResults(title, author)
}
if (!books.length && !options.currentlyTryingCleaned) {
var cleanedTitle = this.cleanTitleForCompares(title)
var cleanedAuthor = this.cleanAuthorForCompares(author)
if (cleanedTitle == title && cleanedAuthor == author) return books
Logger.debug(`Book Search, no matches.. checking cleaned title and author`)
options.currentlyTryingCleaned = true
return this.search(provider, cleanedTitle, cleanedAuthor, isbn, asin, options)
}
if (provider === 'openlibrary') {
books.sort((a, b) => {
return a.totalDistance - b.totalDistance
})
}
return books
}