Merge pull request #2186 from mikiher/Fuzzy-Matching-Continued

Fuzzy matching continued
2025-10-27 11:18:14 +01:00 · 2023-10-08 10:39:11 -05:00 · 2023-10-08 10:39:11 -05:00 · 5ad9f507ba
commit 5ad9f507ba
parent db9d5c9d43 f8f555b4b6
1 changed files with 169 additions and 69 deletions
--- a/server/finders/BookFinder.js
+++ b/server/finders/BookFinder.js
@ -59,12 +59,17 @@ class BookFinder {
    // Remove single quotes (i.e. "Ender's Game" becomes "Enders Game")
    cleaned = cleaned.replace(/'/g, '')
-    return this.replaceAccentedChars(cleaned)
+    return this.replaceAccentedChars(cleaned).toLowerCase()
  }
  /**
   * Normalize an author string for comparisons.
   *
   * Returns '' for a falsy author. Otherwise the value is passed through
   * this.replaceAccentedChars(), lower-cased, then dotted initials are
   * spaced out and single-letter middle initials are dropped.
   *
   * @param {string} author
   * @returns {string}
   */
  cleanAuthorForCompares(author) {
    if (!author) return ''
-    return this.replaceAccentedChars(author)
+    let cleanAuthor = this.replaceAccentedChars(author).toLowerCase()
    // separate initials
    // NOTE(review): the match consumes the letter after the dot, so in a run such as
    // "j.r.r." only the first pair is separated ("j. r.r.") - confirm whether that is intended
    cleanAuthor = cleanAuthor.replace(/([a-z])\.([a-z])/g, '$1. $2')
    // remove middle initials
    // Only runs of single letters (optionally followed by a dot) that sit between two
    // tokens of at least two word characters are removed; leading initials are kept
    cleanAuthor = cleanAuthor.replace(/(?<=\w\w)(\s+[a-z]\.?)+(?=\s+\w\w)/g, '')
    return cleanAuthor
  }
  filterSearchResults(books, title, author, maxTitleDistance, maxAuthorDistance) {
@ -136,6 +141,10 @@ class BookFinder {
    if (!booksFiltered.length && books.length) {
      if (this.verbose) Logger.debug(`Search has ${books.length} matches, but no close title matches`)
    }
    booksFiltered.sort((a, b) => {
      return a.totalDistance - b.totalDistance
    })
    return booksFiltered
  }
@ -179,35 +188,152 @@ class BookFinder {
    return books
  }
-  addTitleCandidate(title, candidates) {
+  static TitleCandidates = class {
    // Main variant
    const cleanTitle = this.cleanTitleForCompares(title).trim()
    if (!cleanTitle) return
    candidates.add(cleanTitle)
-    let candidate = cleanTitle
+    constructor(bookFinder, cleanAuthor) {
      // Object whose cleanTitleForCompares()/cleanAuthorForCompares() are called by add()
      this.bookFinder = bookFinder
      // Unique candidate titles; being a Set, a title added twice is kept once
      this.candidates = new Set()
      // Author string that add() strips from titles and getCandidates() uses to demote titles containing it
      this.cleanAuthor = cleanAuthor
      // candidate title -> 0 or 1 as assigned by add(); getCandidates() sorts lower values first
      this.priorities = {}
      // candidate title -> the `position` argument that was passed to add()
      this.positions = {}
    }
-    // Remove subtitle
+    add(title, position = 0) {
      // Registers `title` as a candidate, plus a stripped-down variant produced by
      // running every transformer below in sequence. `position` is stored per candidate
      // and read back by getCandidates() as one of its sort keys.
-    candidate = candidate.replace(/([,:;_]| by ).*/g, "").trim()
+      // if title contains the author, remove it
-    if (candidate)
+      if (this.cleanAuthor) {
-      candidates.add(candidate)
        // NOTE(review): this.cleanAuthor is interpolated into the pattern without escaping, so
        // regex metacharacters in it (e.g. the "." that follows initials) act as pattern syntax
        // and an unbalanced "(" or "[" would make the RegExp constructor throw - confirm/escape
+        const authorRe = new RegExp(`(^| | by |)${this.cleanAuthor}(?= |$)`, "g")
        // The title is first normalized with cleanAuthorForCompares() so it is compared in the same form as the author
        title = this.bookFinder.cleanAuthorForCompares(title).replace(authorRe, '').trim()
      }
-    // Remove preceding/trailing numbers
      // [pattern, replacement] pairs, applied cumulatively in this order
+      const titleTransformers = [
-    candidate = candidate.replace(/^\d+ | \d+$/g, "").trim()
+        [/([,:;_]| by ).*/g, ''],                  // Remove subtitle
-    if (candidate)
+        [/(^| )\d+k(bps)?( |$)/, ' '],             // Remove bitrate
-      candidates.add(candidate)
+        [/ (2nd|3rd|\d+th)\s+ed(\.|ition)?/g, ''], // Remove edition
        [/(^| |\.)(m4b|m4a|mp3)( |$)/g, ''],       // Remove file-type
        [/ a novel.*$/g, ''],                      // Remove "a novel"
        [/^\d+ | \d+$/g, ''],                      // Remove preceding/trailing numbers
      ]
-    // Remove bitrate
+      // Main variant
-    candidate = candidate.replace(/(^| )\d+k(bps)?( |$)/, " ").trim()
+      const cleanTitle = this.bookFinder.cleanTitleForCompares(title).trim()
-    if (candidate)
      // Nothing left after cleaning: record no candidate at all
+      if (!cleanTitle) return
-      candidates.add(candidate)
+      this.candidates.add(cleanTitle)
      this.priorities[cleanTitle] = 0
      this.positions[cleanTitle] = position
-    // Remove edition
+      let candidate = cleanTitle
-    candidate = candidate.replace(/ (2nd|3rd|\d+th)\s+ed(\.|ition)?/, "").trim()
+
-    if (candidate)
+      for (const transformer of titleTransformers)
-      candidates.add(candidate)
+        candidate = candidate.replace(transformer[0], transformer[1]).trim()
      // If the transformers changed anything, the transformed title (when non-empty) is stored
      // with priority 0 and the untransformed one is demoted to priority 1
      if (candidate != cleanTitle) {
        if (candidate) {
          this.candidates.add(candidate)
          this.priorities[candidate] = 0
          this.positions[candidate] = position
        }
        this.priorities[cleanTitle] = 1
      }
    }
    /**
     * Number of distinct title candidates currently held.
     * @returns {number}
     */
    get size() {
      return this.candidates.size
    }
    getCandidates() {
      var candidates = [...this.candidates]
      candidates.sort((a, b) => {
        // Candidates that include the author are likely low quality
        const includesAuthorDiff = !b.includes(this.cleanAuthor) - !a.includes(this.cleanAuthor)
        if (includesAuthorDiff) return includesAuthorDiff
        // Candidates that include only digits are also likely low quality
        const onlyDigits = /^\d+$/
        const includesOnlyDigitsDiff = !onlyDigits.test(b) - !onlyDigits.test(a)
        if (includesOnlyDigitsDiff) return includesOnlyDigitsDiff
        // transformed candidates receive higher priority
        const priorityDiff = this.priorities[a] - this.priorities[b]
        if (priorityDiff) return priorityDiff
        // if same priorirty, prefer candidates that are closer to the beginning (e.g. titles before subtitles)
        const positionDiff = this.positions[a] - this.positions[b]
        if (positionDiff) return positionDiff
        // Start with longer candidaets, as they are likely more specific
        const lengthDiff = b.length - a.length
        if (lengthDiff) return lengthDiff
        return b.localeCompare(a)
      })
      Logger.debug(`[${this.constructor.name}] Found ${candidates.length} fuzzy title candidates`)
      Logger.debug(candidates)
      return candidates
    }
    /**
     * Remove a title from the candidate set.
     * Only the set is touched; any entries for the title in this.priorities / this.positions stay in place.
     *
     * @param {string} title
     * @returns {boolean} result of Set.prototype.delete (true if the title was present)
     */
    delete(title) {
      return this.candidates.delete(title)
    }
  }
  static AuthorCandidates = class {
    constructor(bookFinder, cleanAuthor) {
      this.bookFinder = bookFinder
      this.candidates = new Set()
      this.cleanAuthor = cleanAuthor
      if (cleanAuthor) this.candidates.add(cleanAuthor)
    }
    validateAuthor(name, region = '', maxLevenshtein = 2) {
      return this.bookFinder.audnexus.authorASINsRequest(name, region).then((asins) => {
        for (const [i, asin] of asins.entries()) {
          if (i > 10) break
          let cleanName = this.bookFinder.cleanAuthorForCompares(asin.name)
          if (!cleanName) continue
          if (cleanName.includes(name)) return name
          if (name.includes(cleanName)) return cleanName
          if (levenshteinDistance(cleanName, name) <= maxLevenshtein) return cleanName
        }
        return ''
      })
    }
    add(author) {
      const cleanAuthor = this.bookFinder.cleanAuthorForCompares(author).trim()
      if (!cleanAuthor) return
      this.candidates.add(cleanAuthor)
    }
    get size() {
      return this.candidates.size
    }
    get agressivelyCleanAuthor() {
      if (this.cleanAuthor) {
        const agressivelyCleanAuthor = this.cleanAuthor.replace(/[,/-].*$/, '').trim()
        return agressivelyCleanAuthor ? agressivelyCleanAuthor : this.cleanAuthor
      }
      return ''
    }
    async getCandidates() {
      var filteredCandidates = []
      var promises = []
      for (const candidate of this.candidates) {
        promises.push(this.validateAuthor(candidate))
      }
      const results = [...new Set(await Promise.all(promises))]
      filteredCandidates = results.filter(author => author)
      // If no valid candidates were found, add back an aggresively cleaned author version
      if (!filteredCandidates.length && this.cleanAuthor) filteredCandidates.push(this.agressivelyCleanAuthor)
      // Always add an empty author candidate
      filteredCandidates.push('')
      Logger.debug(`[${this.constructor.name}] Found ${filteredCandidates.length} fuzzy author candidates`)
      Logger.debug(filteredCandidates)
      return filteredCandidates
    }
    delete(author) {
      return this.candidates.delete(author)
    }
  }
  /**
   * Search for books including fuzzy searches
   * 
@ -232,62 +358,36 @@ class BookFinder {
    books = await this.runSearch(title, author, provider, asin, maxTitleDistance, maxAuthorDistance)
    if (!books.length && maxFuzzySearches > 0) {
-      // normalize title and author
+      // Normalize title and author
      title = title.trim().toLowerCase()
      author = author?.trim().toLowerCase() || ''
-      // Now run up to maxFuzzySearches fuzzy searches
+      const cleanAuthor = this.cleanAuthorForCompares(author)
      let candidates = new Set()
      let cleanedAuthor = this.cleanAuthorForCompares(author)
      this.addTitleCandidate(title, candidates)
-      // remove parentheses and their contents, and replace with a separator
+      // Now run up to maxFuzzySearches fuzzy searches
-      const cleanTitle = title.replace(/\[.*?\]|\(.*?\)|{.*?}/g, " - ")
+      let authorCandidates = new BookFinder.AuthorCandidates(this, cleanAuthor)
      // Remove underscores and parentheses with their contents, and replace with a separator
      const cleanTitle = title.replace(/\[.*?\]|\(.*?\)|{.*?}|_/g, " - ")
      // Split title into hyphen-separated parts
      const titleParts = cleanTitle.split(/ - | -|- /)
-      for (const titlePart of titleParts) {
+      for (const titlePart of titleParts)
-        this.addTitleCandidate(titlePart, candidates)
+        authorCandidates.add(titlePart)
-      }
+      authorCandidates = await authorCandidates.getCandidates()
-      // We already searched for original title
+      for (const authorCandidate of authorCandidates) {
-      if (author == cleanedAuthor) candidates.delete(title)
+        let titleCandidates = new BookFinder.TitleCandidates(this, authorCandidate)
-      if (candidates.size > 0) {
+        for (const [position, titlePart] of titleParts.entries())
-        candidates = [...candidates]
+          titleCandidates.add(titlePart, position)
-        candidates.sort((a, b) => {
+        titleCandidates = titleCandidates.getCandidates()
-          // Candidates that include the author are likely low quality
+        for (const titleCandidate of titleCandidates) {
-          const includesAuthorDiff = !b.includes(cleanedAuthor) - !a.includes(cleanedAuthor)
+          if (titleCandidate == title && authorCandidate == author) continue // We already tried this
          if (includesAuthorDiff) return includesAuthorDiff
          // Candidates that include only digits are also likely low quality
          const onlyDigits = /^\d+$/
          const includesOnlyDigitsDiff = !onlyDigits.test(b) - !onlyDigits.test(a)
          if (includesOnlyDigitsDiff) return includesOnlyDigitsDiff
          // Start with longer candidates, as they are likely more specific
          const lengthDiff = b.length - a.length
          if (lengthDiff) return lengthDiff
          return b.localeCompare(a)
        })
        Logger.debug(`[BookFinder] Found ${candidates.length} fuzzy title candidates`, candidates)
        for (const candidate of candidates) {
          if (++numFuzzySearches > maxFuzzySearches) return books
-          books = await this.runSearch(candidate, cleanedAuthor, provider, asin, maxTitleDistance, maxAuthorDistance)
+          books = await this.runSearch(titleCandidate, authorCandidate, provider, asin, maxTitleDistance, maxAuthorDistance)
-          if (books.length) break
+          if (books.length) return books
        }
        if (!books.length) {
          // Now try searching without the author
          for (const candidate of candidates) {
            if (++numFuzzySearches > maxFuzzySearches) return books
            books = await this.runSearch(candidate, '', provider, asin, maxTitleDistance, maxAuthorDistance)
            if (books.length) break
          }
        }
      }
    }
    if (provider === 'openlibrary') {
      books.sort((a, b) => {
        return a.totalDistance - b.totalDistance
      })
    }
    return books
  }