audiobookshelf/server/utils/parsers/parseFullName.js
2023-09-14 22:12:22 -07:00

347 lines
15 KiB
JavaScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// https://github.com/RateGravity/parse-full-name/blob/master/index.js
module.exports = (nameToParse, partToReturn, fixCase, stopOnError, useLongLists) => {
var i, j, k, l, m, n, part, comma, titleList, suffixList, prefixList, regex,
partToCheck, partFound, partsFoundCount, firstComma, remainingCommas,
nameParts = [], nameCommas = [null], partsFound = [],
conjunctionList = ['&', 'and', 'et', 'e', 'of', 'the', 'und', 'y'],
parsedName = {
title: '', first: '', middle: '', last: '', nick: '', suffix: '', error: []
};
// Validate inputs, or set to defaults
partToReturn = partToReturn && ['title', 'first', 'middle', 'last', 'nick',
'suffix', 'error'].indexOf(partToReturn.toLowerCase()) > -1 ?
partToReturn.toLowerCase() : 'all';
// 'all' = return object with all parts, others return single part
if (fixCase === false) fixCase = 0;
if (fixCase === true) fixCase = 1;
fixCase = fixCase !== 'undefined' && (fixCase === 0 || fixCase === 1) ?
fixCase : -1; // -1 = fix case only if input is all upper or lowercase
if (stopOnError === true) stopOnError = 1;
stopOnError = stopOnError && stopOnError === 1 ? 1 : 0;
// false = output warnings on parse error, but don't stop
if (useLongLists === true) useLongLists = 1;
useLongLists = useLongLists && useLongLists === 1 ? 1 : 0; // 0 = short lists
// If stopOnError = 1, throw error, otherwise return error messages in array
function handleError(errorMessage) {
if (stopOnError) {
throw 'Error: ' + errorMessage;
} else {
parsedName.error.push('Error: ' + errorMessage);
}
}
// If fixCase = 1, fix case of parsedName parts before returning
function fixParsedNameCase(fixedCaseName, fixCaseNow) {
var forceCaseList = ['e', 'y', 'av', 'af', 'da', 'dal', 'de', 'del', 'der', 'di',
'la', 'le', 'van', 'der', 'den', 'vel', 'von', 'II', 'III', 'IV', 'J.D.', 'LL.M.',
'M.D.', 'D.O.', 'D.C.', 'Ph.D.'];
var forceCaseListIndex;
var namePartLabels = [];
var namePartWords;
if (fixCaseNow) {
namePartLabels = Object.keys(parsedName)
.filter(function (v) { return v !== 'error'; });
for (i = 0, l = namePartLabels.length; i < l; i++) {
if (fixedCaseName[namePartLabels[i]]) {
namePartWords = (fixedCaseName[namePartLabels[i]] + '').split(' ');
for (j = 0, m = namePartWords.length; j < m; j++) {
forceCaseListIndex = forceCaseList
.map(function (v) { return v.toLowerCase(); })
.indexOf(namePartWords[j].toLowerCase());
if (forceCaseListIndex > -1) { // Set case of words in forceCaseList
namePartWords[j] = forceCaseList[forceCaseListIndex];
} else if (namePartWords[j].length === 1) { // Uppercase initials
namePartWords[j] = namePartWords[j].toUpperCase();
} else if (
namePartWords[j].length > 2 &&
namePartWords[j].slice(0, 1) ===
namePartWords[j].slice(0, 1).toUpperCase() &&
namePartWords[j].slice(1, 2) ===
namePartWords[j].slice(1, 2).toLowerCase() &&
namePartWords[j].slice(2) ===
namePartWords[j].slice(2).toUpperCase()
) { // Detect McCASE and convert to McCase
namePartWords[j] = namePartWords[j].slice(0, 3) +
namePartWords[j].slice(3).toLowerCase();
} else if (
namePartLabels[j] === 'suffix' &&
namePartWords[j].slice(-1) !== '.' &&
!suffixList.indexOf(namePartWords[j].toLowerCase())
) { // Convert suffix abbreviations to UPPER CASE
if (namePartWords[j] === namePartWords[j].toLowerCase()) {
namePartWords[j] = namePartWords[j].toUpperCase();
}
} else { // Convert to Title Case
namePartWords[j] = namePartWords[j].slice(0, 1).toUpperCase() +
namePartWords[j].slice(1).toLowerCase();
}
}
fixedCaseName[namePartLabels[i]] = namePartWords.join(' ');
}
}
}
return fixedCaseName;
}
// If no input name, or input name is not a string, abort
if (!nameToParse || typeof nameToParse !== 'string') {
handleError('No input');
parsedName = fixParsedNameCase(parsedName, fixCase);
return partToReturn === 'all' ? parsedName : parsedName[partToReturn];
} else {
nameToParse = nameToParse.trim();
}
// Auto-detect fixCase: fix if nameToParse is all upper or all lowercase
if (fixCase === -1) {
fixCase = (
nameToParse === nameToParse.toUpperCase() ||
nameToParse === nameToParse.toLowerCase() ? 1 : 0
);
}
// Initilize lists of prefixs, suffixs, and titles to detect
// Note: These list entries must be all lowercase
if (useLongLists) {
suffixList = ['esq', 'esquire', 'jr', 'jnr', 'sr', 'snr', '2', 'ii', 'iii', 'iv',
'v', 'clu', 'chfc', 'cfp', 'md', 'phd', 'j.d.', 'll.m.', 'm.d.', 'd.o.', 'd.c.',
'p.c.', 'ph.d.'];
prefixList = ['a', 'ab', 'antune', 'ap', 'abu', 'al', 'alm', 'alt', 'bab', 'bäck',
'bar', 'bath', 'bat', 'beau', 'beck', 'ben', 'berg', 'bet', 'bin', 'bint', 'birch',
'björk', 'björn', 'bjur', 'da', 'dahl', 'dal', 'de', 'degli', 'dele', 'del',
'della', 'der', 'di', 'dos', 'du', 'e', 'ek', 'el', 'escob', 'esch', 'fleisch',
'fitz', 'fors', 'gott', 'griff', 'haj', 'haug', 'holm', 'ibn', 'kauf', 'kil',
'koop', 'kvarn', 'la', 'le', 'lind', 'lönn', 'lund', 'mac', 'mhic', 'mic', 'mir',
'na', 'naka', 'neder', 'nic', 'ni', 'nin', 'nord', 'norr', 'ny', 'o', 'ua', 'ui\'',
'öfver', 'ost', 'över', 'öz', 'papa', 'pour', 'quarn', 'skog', 'skoog', 'sten',
'stor', 'ström', 'söder', 'ter', 'ter', 'tre', 'türk', 'van', 'väst', 'väster',
'vest', 'von'];
titleList = ['mr', 'mrs', 'ms', 'miss', 'dr', 'herr', 'monsieur', 'hr', 'frau',
'a v m', 'admiraal', 'admiral', 'air cdre', 'air commodore', 'air marshal',
'air vice marshal', 'alderman', 'alhaji', 'ambassador', 'baron', 'barones',
'brig', 'brig gen', 'brig general', 'brigadier', 'brigadier general',
'brother', 'canon', 'capt', 'captain', 'cardinal', 'cdr', 'chief', 'cik', 'cmdr',
'coach', 'col', 'col dr', 'colonel', 'commandant', 'commander', 'commissioner',
'commodore', 'comte', 'comtessa', 'congressman', 'conseiller', 'consul',
'conte', 'contessa', 'corporal', 'councillor', 'count', 'countess',
'crown prince', 'crown princess', 'dame', 'datin', 'dato', 'datuk',
'datuk seri', 'deacon', 'deaconess', 'dean', 'dhr', 'dipl ing', 'doctor',
'dott', 'dott sa', 'dr', 'dr ing', 'dra', 'drs', 'embajador', 'embajadora', 'en',
'encik', 'eng', 'eur ing', 'exma sra', 'exmo sr', 'f o', 'father',
'first lieutient', 'first officer', 'flt lieut', 'flying officer', 'fr',
'frau', 'fraulein', 'fru', 'gen', 'generaal', 'general', 'governor', 'graaf',
'gravin', 'group captain', 'grp capt', 'h e dr', 'h h', 'h m', 'h r h', 'hajah',
'haji', 'hajim', 'her highness', 'her majesty', 'herr', 'high chief',
'his highness', 'his holiness', 'his majesty', 'hon', 'hr', 'hra', 'ing', 'ir',
'jonkheer', 'judge', 'justice', 'khun ying', 'kolonel', 'lady', 'lcda', 'lic',
'lieut', 'lieut cdr', 'lieut col', 'lieut gen', 'lord', 'm', 'm l', 'm r',
'madame', 'mademoiselle', 'maj gen', 'major', 'master', 'mevrouw', 'miss',
'mlle', 'mme', 'monsieur', 'monsignor', 'mr', 'mrs', 'ms', 'mstr', 'nti', 'pastor',
'president', 'prince', 'princess', 'princesse', 'prinses', 'prof', 'prof dr',
'prof sir', 'professor', 'puan', 'puan sri', 'rabbi', 'rear admiral', 'rev',
'rev canon', 'rev dr', 'rev mother', 'reverend', 'rva', 'senator', 'sergeant',
'sheikh', 'sheikha', 'sig', 'sig na', 'sig ra', 'sir', 'sister', 'sqn ldr', 'sr',
'sr d', 'sra', 'srta', 'sultan', 'tan sri', 'tan sri dato', 'tengku', 'teuku',
'than puying', 'the hon dr', 'the hon justice', 'the hon miss', 'the hon mr',
'the hon mrs', 'the hon ms', 'the hon sir', 'the very rev', 'toh puan', 'tun',
'vice admiral', 'viscount', 'viscountess', 'wg cdr', 'ind', 'misc', 'mx'];
} else {
suffixList = ['esq', 'esquire', 'jr', 'jnr', 'sr', 'snr', '2', 'ii', 'iii', 'iv',
'md', 'phd', 'j.d.', 'll.m.', 'm.d.', 'd.o.', 'd.c.', 'p.c.', 'ph.d.'];
prefixList = ['ab', 'bar', 'bin', 'da', 'dal', 'de', 'de la', 'del', 'della', 'der',
'di', 'du', 'ibn', 'l\'', 'la', 'le', 'san', 'st', 'st.', 'ste', 'ter', 'van',
'van de', 'van der', 'van den', 'vel', 'ver', 'vere', 'von'];
titleList = ['dr', 'miss', 'mr', 'mrs', 'ms', 'prof', 'sir', 'frau', 'herr', 'hr',
'monsieur', 'captain', 'doctor', 'judge', 'officer', 'professor', 'ind', 'misc',
'mx'];
}
// Nickname: remove and store parts with surrounding punctuation as nicknames
regex = /\s(?:[']([^']+)[']|[“”"]([^“”"]+)[“”"]|\[([^\]]+)\]|\(([^\)]+)\)),?\s/g;
partFound = (' ' + nameToParse + ' ').match(regex);
if (partFound) partsFound = partsFound.concat(partFound);
partsFoundCount = partsFound.length;
if (partsFoundCount === 1) {
parsedName.nick = partsFound[0].slice(2).slice(0, -2);
if (parsedName.nick.slice(-1) === ',') {
parsedName.nick = parsedName.nick.slice(0, -1);
}
nameToParse = (' ' + nameToParse + ' ').replace(partsFound[0], ' ').trim();
partsFound = [];
} else if (partsFoundCount > 1) {
handleError(partsFoundCount + ' nicknames found');
for (i = 0; i < partsFoundCount; i++) {
nameToParse = (' ' + nameToParse + ' ')
.replace(partsFound[i], ' ').trim();
partsFound[i] = partsFound[i].slice(2).slice(0, -2);
if (partsFound[i].slice(-1) === ',') {
partsFound[i] = partsFound[i].slice(0, -1);
}
}
parsedName.nick = partsFound.join(', ');
partsFound = [];
}
if (!nameToParse.trim().length) {
parsedName = fixParsedNameCase(parsedName, fixCase);
return partToReturn === 'all' ? parsedName : parsedName[partToReturn];
}
// Split remaining nameToParse into parts, remove and store preceding commas
for (i = 0, n = nameToParse.split(' '), l = n.length; i < l; i++) {
part = n[i];
comma = null;
if (part.slice(-1) === ',') {
comma = ',';
part = part.slice(0, -1);
}
nameParts.push(part);
nameCommas.push(comma);
}
// Suffix: remove and store matching parts as suffixes
for (l = nameParts.length, i = l - 1; i > 0; i--) {
partToCheck = (nameParts[i].slice(-1) === '.' ?
nameParts[i].slice(0, -1).toLowerCase() : nameParts[i].toLowerCase());
if (
suffixList.indexOf(partToCheck) > -1 ||
suffixList.indexOf(partToCheck + '.') > -1
) {
partsFound = nameParts.splice(i, 1).concat(partsFound);
if (nameCommas[i] === ',') { // Keep comma, either before or after
nameCommas.splice(i + 1, 1);
} else {
nameCommas.splice(i, 1);
}
}
}
partsFoundCount = partsFound.length;
if (partsFoundCount === 1) {
parsedName.suffix = partsFound[0];
partsFound = [];
} else if (partsFoundCount > 1) {
handleError(partsFoundCount + ' suffixes found');
parsedName.suffix = partsFound.join(', ');
partsFound = [];
}
if (!nameParts.length) {
parsedName = fixParsedNameCase(parsedName, fixCase);
return partToReturn === 'all' ? parsedName : parsedName[partToReturn];
}
// Title: remove and store matching parts as titles
for (l = nameParts.length, i = l - 1; i >= 0; i--) {
partToCheck = (nameParts[i].slice(-1) === '.' ?
nameParts[i].slice(0, -1).toLowerCase() : nameParts[i].toLowerCase());
if (
titleList.indexOf(partToCheck) > -1 ||
titleList.indexOf(partToCheck + '.') > -1
) {
partsFound = nameParts.splice(i, 1).concat(partsFound);
if (nameCommas[i] === ',') { // Keep comma, either before or after
nameCommas.splice(i + 1, 1);
} else {
nameCommas.splice(i, 1);
}
}
}
partsFoundCount = partsFound.length;
if (partsFoundCount === 1) {
parsedName.title = partsFound[0];
partsFound = [];
} else if (partsFoundCount > 1) {
handleError(partsFoundCount + ' titles found');
parsedName.title = partsFound.join(', ');
partsFound = [];
}
if (!nameParts.length) {
parsedName = fixParsedNameCase(parsedName, fixCase);
return partToReturn === 'all' ? parsedName : parsedName[partToReturn];
}
// Join name prefixes to following names
if (nameParts.length > 1) {
for (i = nameParts.length - 2; i >= 0; i--) {
if (prefixList.indexOf(nameParts[i].toLowerCase()) > -1) {
nameParts[i] = nameParts[i] + ' ' + nameParts[i + 1];
nameParts.splice(i + 1, 1);
nameCommas.splice(i + 1, 1);
}
}
}
// Join conjunctions to surrounding names
if (nameParts.length > 2) {
for (i = nameParts.length - 3; i >= 0; i--) {
if (conjunctionList.indexOf(nameParts[i + 1].toLowerCase()) > -1) {
nameParts[i] = nameParts[i] + ' ' + nameParts[i + 1] + ' ' + nameParts[i + 2];
nameParts.splice(i + 1, 2);
nameCommas.splice(i + 1, 2);
i--;
}
}
}
// Suffix: remove and store items after extra commas as suffixes
nameCommas.pop();
firstComma = nameCommas.indexOf(',');
remainingCommas = nameCommas.filter(function (v) { return v !== null; }).length;
if (firstComma > 1 || remainingCommas > 1) {
for (i = nameParts.length - 1; i >= 2; i--) {
if (nameCommas[i] === ',') {
partsFound = nameParts.splice(i, 1).concat(partsFound);
nameCommas.splice(i, 1);
remainingCommas--;
} else {
break;
}
}
}
if (partsFound.length) {
if (parsedName.suffix) {
partsFound = [parsedName.suffix].concat(partsFound);
}
parsedName.suffix = partsFound.join(', ');
partsFound = [];
}
// Last name: remove and store last name
if (remainingCommas > 0) {
if (remainingCommas > 1) {
handleError((remainingCommas - 1) + ' extra commas found');
}
// Remove and store all parts before first comma as last name
if (nameCommas.indexOf(',')) {
parsedName.last = nameParts.splice(0, nameCommas.indexOf(',')).join(' ');
nameCommas.splice(0, nameCommas.indexOf(','));
}
} else {
// Remove and store last part as last name
parsedName.last = nameParts.pop();
}
if (!nameParts.length) {
parsedName = fixParsedNameCase(parsedName, fixCase);
return partToReturn === 'all' ? parsedName : parsedName[partToReturn];
}
// First name: remove and store first part as first name
parsedName.first = nameParts.shift();
if (!nameParts.length) {
parsedName = fixParsedNameCase(parsedName, fixCase);
return partToReturn === 'all' ? parsedName : parsedName[partToReturn];
}
// Middle name: store all remaining parts as middle name
if (nameParts.length > 2) {
handleError(nameParts.length + ' middle names');
}
parsedName.middle = nameParts.join(' ');
parsedName = fixParsedNameCase(parsedName, fixCase);
return partToReturn === 'all' ? parsedName : parsedName[partToReturn];
};