In Files
Information
Metaphone
Implementation of Lawrence Philips’ Metaphone and Double Metaphone algorithms.
Metaphone encodes names into a phonetic form such that similar-sounding names have the same or similar Metaphone encodings.
The original system was described by Lawrence Philips in Computer Language Vol. 7 No. 12, December 1990, pp 39-43.
There are multiple implementations of Metaphone, each with their own quirks, I have based this on my interpretation of the algorithm specification. Even LP’s original BASIC implementation appears to contain bugs (specifically with the handling of CC and MB), when compared to his explanation of the algorithm.
I have also compared this implementation with that found in PHP’s standard library, which appears to mimic the behaviour of LP’s original BASIC implementation. For compatibility, these rules can also be used by passing :alternate=>true to the methods.
Double Metaphone algoirthm originally published in the June 2000 issue of C/C++ Users Journal. Ruby version based on Stephen Woodbridge’s PHP version - swoodbridge.com/DoubleMetaPhone/
Ruby implementation of Double Metaphone based on work by Paul Battley (pbattley@gmail.com).
Constants
- METAPHONE_RULES
Metaphone rules. These are simply applied in order.
- METAPHONE_RULES_LP
The rules for the ‘buggy’ alternate implementation used by PHP etc.
Public Class Methods
Returns the primary and secondary double metaphone tokens (the secondary will be nil if equal to the primary).
# File lib/english/metaphone.rb, line 99 99: def self.double_metaphone(string) 100: string = string.to_s 101: @db_memo ||= Hash.new 102: @db_memo[string] = calculate_double_metaphone(string) 103: end
Returns the Metaphone representation of a string. If the string contains multiple words, each word in turn is converted into its Metaphone representation. Note that only the letters A-Z are supported, so any language-specific processing should be done beforehand.
If alt is set to true, alternate ‘buggy’ rules are used.
# File lib/english/metaphone.rb, line 80 80: def self.metaphone(string, alt=nil) 81: string.to_s.strip.split(/\s+/).map{ |w| metaphone_word(w, alt) }.join(' ') 82: end
Private Class Methods
# File lib/english/metaphone.rb, line 107 107: def self.calculate_double_metaphone(str) 108: primary, secondary, current = '', '', 0 109: original, length, last = "#{str} ".upcase, str.length, str.length - 1 110: if /^GN|KN|PN|WR|PS$/ =~ original[0, 2] 111: current += 1 112: end 113: if 'X' == original[0, 1] 114: primary << 'S' 115: secondary << 'S' 116: current += 1 117: end 118: while primary.length < 4 || secondary.length < 4 119: break if current > str.length 120: a, b, c = lookup(original, current, length, last) 121: primary << a if a 122: secondary << b if b 123: current += c if c 124: end 125: primary, secondary = primary[0, 4], secondary[0, 4] 126: return primary, (primary == secondary ? nil : secondary) 127: end
# File lib/english/metaphone.rb, line 137 137: def self.lookup(str, pos, length, last) 138: case str[pos, 1] 139: when /^A|E|I|O|U|Y$/ 140: if 0 == pos 141: return 'A', 'A', 1 142: else 143: return nil, nil, 1 144: end 145: when 'B' 146: return 'P', 'P', ('B' == str[pos + 1, 1] ? 2 : 1) 147: when 'Ç' 148: return 'S', 'S', 1 149: when 'C' 150: if pos > 1 && 151: !vowel?(str[pos - 2, 1]) && 152: 'ACH' == str[pos - 1, 3] && 153: str[pos + 2, 1] != 'I' && ( 154: str[pos + 2, 1] != 'E' || 155: str[pos - 2, 6] =~ /^(B|M)ACHER$/ 156: ) then 157: return 'K', 'K', 2 158: elsif 0 == pos && 'CAESAR' == str[pos, 6] 159: return 'S', 'S', 2 160: elsif 'CHIA' == str[pos, 4] 161: return 'K', 'K', 2 162: elsif 'CH' == str[pos, 2] 163: if 0 == pos && 'CHAE' == str[pos, 4] 164: return 'K', 'X', 2 165: elsif 0 == pos && ( 166: ['HARAC', 'HARIS'].include?(str[pos + 1, 5]) || 167: ['HOR', 'HYM', 'HIA', 'HEM'].include?(str[pos + 1, 3]) 168: ) && str[0, 5] != 'CHORE' then 169: return 'K', 'K', 2 170: elsif ['VAN ','VON '].include?(str[0, 4]) || 171: 'SCH' == str[0, 3] || 172: ['ORCHES','ARCHIT','ORCHID'].include?(str[pos - 2, 6]) || 173: ['T','S'].include?(str[pos + 2, 1]) || ( 174: ((0 == pos) || ['A','O','U','E'].include?(str[pos - 1, 1])) && 175: ['L','R','N','M','B','H','F','V','W',' '].include?(str[pos + 2, 1]) 176: ) then 177: return 'K', 'K', 2 178: elsif pos > 0 179: return ('MC' == str[0, 2] ? 'K' : 'X'), 'K', 2 180: else 181: return 'X', 'X', 2 182: end 183: elsif 'CZ' == str[pos, 2] && 'WICZ' != str[pos - 2, 4] 184: return 'S', 'X', 2 185: elsif 'CIA' == str[pos + 1, 3] 186: return 'X', 'X', 3 187: elsif 'CC' == str[pos, 2] && !(1 == pos && 'M' == str[0, 1]) 188: if /^I|E|H$/ =~ str[pos + 2, 1] && 'HU' != str[pos + 2, 2] 189: if (1 == pos && 'A' == str[pos - 1, 1]) || 190: /^UCCE(E|S)$/ =~ str[pos - 1, 5] then 191: return 'KS', 'KS', 3 192: else 193: return 'X', 'X', 3 194: end 195: else 196: return 'K', 'K', 2 197: end 198: elsif /^C(K|G|Q)$/ =~ str[pos, 2] 199: return 'K', 'K', 2 200: elsif /^C(I|E|Y)$/ =~ str[pos, 2] 201: return 'S', (/^CI(O|E|A)$/ =~ str[pos, 3] ? 'X' : 'S'), 2 202: else 203: if /^ (C|Q|G)$/ =~ str[pos + 1, 2] 204: return 'K', 'K', 3 205: else 206: return 'K', 'K', (/^C|K|Q$/ =~ str[pos + 1, 1] && !(['CE','CI'].include?(str[pos + 1, 2])) ? 2 : 1) 207: end 208: end 209: when 'D' 210: if 'DG' == str[pos, 2] 211: if /^I|E|Y$/ =~ str[pos + 2, 1] 212: return 'J', 'J', 3 213: else 214: return 'TK', 'TK', 2 215: end 216: else 217: return 'T', 'T', (/^D(T|D)$/ =~ str[pos, 2] ? 2 : 1) 218: end 219: when 'F' 220: return 'F', 'F', ('F' == str[pos + 1, 1] ? 2 : 1) 221: when 'G' 222: if 'H' == str[pos + 1, 1] 223: if pos > 0 && !vowel?(str[pos - 1, 1]) 224: return 'K', 'K', 2 225: elsif 0 == pos 226: if 'I' == str[pos + 2, 1] 227: return 'J', 'J', 2 228: else 229: return 'K', 'K', 2 230: end 231: elsif (pos > 1 && /^B|H|D$/ =~ str[pos - 2, 1]) || 232: (pos > 2 && /^B|H|D$/ =~ str[pos - 3, 1]) || 233: (pos > 3 && /^B|H$/ =~ str[pos - 4, 1]) 234: return nil, nil, 2 235: else 236: if (pos > 2 && 'U' == str[pos - 1, 1] && /^C|G|L|R|T$/ =~ str[pos - 3, 1]) 237: return 'F', 'F', 2 238: elsif pos > 0 && 'I' != str[pos - 1, 1] 239: return 'K', 'K', 2 240: else 241: return nil, nil, 2 242: end 243: end 244: elsif 'N' == str[pos + 1, 1] 245: if 1 == pos && vowel?(str[0, 1]) && !slavo_germanic?(str) 246: return 'KN', 'N', 2 247: else 248: if 'EY' != str[pos + 2, 2] && 'Y' != str[pos + 1, 1] && !slavo_germanic?(str) 249: return 'N', 'KN', 2 250: else 251: return 'KN', 'KN', 2 252: end 253: end 254: elsif 'LI' == str[pos + 1, 2] && !slavo_germanic?(str) 255: return 'KL', 'L', 2 256: elsif 0 == pos && ('Y' == str[pos + 1, 1] || /^(E(S|P|B|L|Y|I|R)|I(B|L|N|E))$/ =~ str[pos + 1, 2]) 257: return 'K', 'J', 2 258: elsif (('ER' == str[pos + 1, 2] || 'Y' == str[pos + 1, 1]) && 259: /^(D|R|M)ANGER$/ !~ str[0, 6] && 260: /^E|I$/ !~ str[pos - 1, 1] && 261: /^(R|O)GY$/ !~ str[pos - 1, 3]) 262: return 'K', 'J', 2 263: elsif /^E|I|Y$/ =~ str[pos + 1, 1] || /^(A|O)GGI$/ =~ str[pos - 1, 4] 264: if (/^V(A|O)N $/ =~ str[0, 4] || 'SCH' == str[0, 3]) || 'ET' == str[pos + 1, 2] 265: return 'K', 'K', 2 266: else 267: if 'IER ' == str[pos + 1, 4] 268: return 'J', 'J', 2 269: else 270: return 'J', 'K', 2 271: end 272: end 273: elsif 'G' == str[pos + 1, 1] 274: return 'K', 'K', 2 275: else 276: return 'K', 'K', 1 277: end 278: when 'H' 279: if (0 == pos || vowel?(str[pos - 1, 1])) && vowel?(str[pos + 1, 1]) 280: return 'H', 'H', 2 281: else 282: return nil, nil, 1 283: end 284: when 'J' 285: if 'JOSE' == str[pos, 4] || 'SAN ' == str[0, 4] 286: if (0 == pos && ' ' == str[pos + 4, 1]) || 'SAN ' == str[0, 4] 287: return 'H', 'H', 1 288: else 289: return 'J', 'H', 1 290: end 291: else 292: current = ('J' == str[pos + 1, 1] ? 2 : 1) 293: 294: if 0 == pos && 'JOSE' != str[pos, 4] 295: return 'J', 'A', current 296: else 297: if vowel?(str[pos - 1, 1]) && !slavo_germanic?(str) && /^A|O$/ =~ str[pos + 1, 1] 298: return 'J', 'H', current 299: else 300: if last == pos 301: return 'J', nil, current 302: else 303: if /^L|T|K|S|N|M|B|Z$/ !~ str[pos + 1, 1] && /^S|K|L$/ !~ str[pos - 1, 1] 304: return 'J', 'J', current 305: else 306: return nil, nil, current 307: end 308: end 309: end 310: end 311: end 312: when 'K' 313: return 'K', 'K', ('K' == str[pos + 1, 1] ? 2 : 1) 314: when 'L' 315: if 'L' == str[pos + 1, 1] 316: if (((length - 3) == pos && /^(ILL(O|A)|ALLE)$/ =~ str[pos - 1, 4]) || 317: ((/^(A|O)S$/ =~ str[last - 1, 2] || /^A|O$/ =~ str[last, 1]) && 'ALLE' == str[pos - 1, 4])) 318: return 'L', nil, 2 319: else 320: return 'L', 'L', 2 321: end 322: else 323: return 'L', 'L', 1 324: end 325: when 'M' 326: if ('UMB' == str[pos - 1, 3] && 327: ((last - 1) == pos || 'ER' == str[pos + 2, 2])) || 'M' == str[pos + 1, 1] 328: return 'M', 'M', 2 329: else 330: return 'M', 'M', 1 331: end 332: when 'N' 333: return 'N', 'N', ('N' == str[pos + 1, 1] ? 2 : 1) 334: when 'Ñ' 335: return 'N', 'N', 1 336: when 'P' 337: if 'H' == str[pos + 1, 1] 338: return 'F', 'F', 2 339: else 340: return 'P', 'P', (/^P|B$/ =~ str[pos + 1, 1] ? 2 : 1) 341: end 342: when 'Q' 343: return 'K', 'K', ('Q' == str[pos + 1, 1] ? 2 : 1) 344: when 'R' 345: current = ('R' == str[pos + 1, 1] ? 2 : 1) 346: 347: if last == pos && !slavo_germanic?(str) && 'IE' == str[pos - 2, 2] && /^M(E|A)$/ !~ str[pos - 4, 2] 348: return nil, 'R', current 349: else 350: return 'R', 'R', current 351: end 352: when 'S' 353: if /^(I|Y)SL$/ =~ str[pos - 1, 3] 354: return nil, nil, 1 355: elsif 0 == pos && 'SUGAR' == str[pos, 5] 356: return 'X', 'S', 1 357: elsif 'SH' == str[pos, 2] 358: if /^H(EIM|OEK|OLM|OLZ)$/ =~ str[pos + 1, 4] 359: return 'S', 'S', 2 360: else 361: return 'X', 'X', 2 362: end 363: elsif /^SI(O|A)$/ =~ str[pos, 3] || 'SIAN' == str[pos, 4] 364: return 'S', (slavo_germanic?(str) ? 'S' : 'X'), 3 365: elsif (0 == pos && /^M|N|L|W$/ =~ str[pos + 1, 1]) || 'Z' == str[pos + 1, 1] 366: return 'S', 'X', ('Z' == str[pos + 1, 1] ? 2 : 1) 367: elsif 'SC' == str[pos, 2] 368: if 'H' == str[pos + 2, 1] 369: if /^OO|ER|EN|UY|ED|EM$/ =~ str[pos + 3, 2] 370: return (/^E(R|N)$/ =~ str[pos + 3, 2] ? 'X' : 'SK'), 'SK', 3 371: else 372: return 'X', ((0 == pos && !vowel?(str[3, 1]) && ('W' != str[pos + 3, 1])) ? 'S' : 'X'), 3 373: end 374: elsif /^I|E|Y$/ =~ str[pos + 2, 1] 375: return 'S', 'S', 3 376: else 377: return 'SK', 'SK', 3 378: end 379: else 380: return (last == pos && /^(A|O)I$/ =~ str[pos - 2, 2] ? nil : 'S'), 'S', (/^S|Z$/ =~ str[pos + 1, 1] ? 2 : 1) 381: end 382: when 'T' 383: if 'TION' == str[pos, 4] 384: return 'X', 'X', 3 385: elsif /^T(IA|CH)$/ =~ str[pos, 3] 386: return 'X', 'X', 3 387: elsif 'TH' == str[pos, 2] || 'TTH' == str[pos, 3] 388: if /^(O|A)M$/ =~ str[pos + 2, 2] || /^V(A|O)N $/ =~ str[0, 4] || 'SCH' == str[0, 3] 389: return 'T', 'T', 2 390: else 391: return '0', 'T', 2 392: end 393: else 394: return 'T', 'T', (/^T|D$/ =~ str[pos + 1, 1] ? 2 : 1) 395: end 396: when 'V' 397: return 'F', 'F', ('V' == str[pos + 1, 1] ? 2 : 1) 398: when 'W' 399: if 'WR' == str[pos, 2] 400: return 'R', 'R', 2 401: end 402: pri, sec = nil, nil 403: if 0 == pos && (vowel?(str[pos + 1, 1]) || 'WH' == str[pos, 2]) 404: pri = 'A' 405: sec = vowel?(str[pos + 1, 1]) ? 'F' : 'A' 406: end 407: if (last == pos && vowel?(str[pos - 1, 1])) || 'SCH' == str[0, 3] || 408: /^EWSKI|EWSKY|OWSKI|OWSKY$/ =~ str[pos - 1, 5] 409: return pri, "#{sec}F", 1 410: elsif /^WI(C|T)Z$/ =~ str[pos, 4] 411: return "#{pri}TS", "#{sec}FX", 4 412: else 413: return pri, sec, 1 414: end 415: when 'X' 416: current = (/^C|X$/ =~ str[pos + 1, 1] ? 2 : 1) 417: if !(last == pos && (/^(I|E)AU$/ =~ str[pos - 3, 3] || /^(A|O)U$/ =~ str[pos - 2, 2])) 418: return 'KS', 'KS', current 419: else 420: return nil, nil, current 421: end 422: when 'Z' 423: if 'H' == str[pos + 1, 1] 424: return 'J', 'J', 2 425: else 426: current = ('Z' == str[pos + 1, 1] ? 2 : 1) 427: if /^Z(O|I|A)$/ =~ str[pos + 1, 2] || (slavo_germanic?(str) && (pos > 0 && 'T' != str[pos - 1, 1])) 428: return 'S', 'TS', current 429: else 430: return 'S', 'S', current 431: end 432: end 433: else 434: return nil, nil, 1 435: end 436: end
# File lib/english/metaphone.rb, line 86 86: def self.metaphone_word(w, alt=nil) 87: # Normalise case and remove non-ASCII 88: s = w.downcase.gsub(/[^a-z]/, '') 89: # Apply the Metaphone rules 90: rules = alt ? METAPHONE_RULES_LP : METAPHONE_RULES 91: rules.each { |rx, rep| s.gsub!(rx, rep) } 92: return s.upcase 93: end
Disabled; run with --debug to generate this.