/[wait]/tags/WAIT_1_900/metaphone.c
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /tags/WAIT_1_900/metaphone.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 107 - (show annotations)
Tue Jul 13 12:45:55 2004 UTC (19 years, 8 months ago) by dpavlin
File MIME type: text/plain
File size: 19223 byte(s)
tag for version 1.900

1 /* Metaphone Conversion Notes
2
3 When I found this Algorithm, in article, there were discrepancies between
4 the BASIC code and the verbal description. The discrepances look like they
5 could have been caused by typing errors in the article.
6
7 I have included the BASIC code from this article for the specific purpose
8 of presenting the Algorithm the way it was originally described.
9 I have tried to reproduce the BASIC EXACTLY the way it appeared. So when
10 you see "ENAM" with nothing behind it, that is how it was presented.
11
12 Lawrence Philips has no doubt spent a lot of time in the development of
13 this algorithm. I am trusting that the algorithm described has been
14 throughly tested to the best of his ability.
15
16 It was my intention to reproduce it using his rules as best as I could
17 discern them.
18
19 It looks like it works better than Soundex. Thank You Lawrence.
20
21 To anyone passing this along. Please include all of the notes they
22 are part of the documentation and credits. Thanks
23
24 Mike Kuhn (mkuhn@rhlab.com)
25
26 Michael J. Kuhn Computer Systems Consultant
27 5916 Glenoak Ave.
28 Baltimore, MD 21214-2009
29 410-254-7060
30
31 P.S.
32 A version of this routine in the Informix Archive was done by:
33
34 Sadru Fidai Munics Information Systems
35 50 Mount Prospect Ave
36 Clifton NJ 07013 (201)778-7753
37 aol.com!SFidai
38
39 Sadru called me to discuss this and said the following:
40
41 His routine was NOT done from the article published in "Computer Language".
42 He started with a working version from a PICK system that was using this.
43 He had 2,000+ names with metaphone from the PICK system that he used
44 to test the C code with.
45
46 You might want to check this routine out.
47
48 I did not use his routine at the time because there was no verbal
49 explanation of the transformations. Also my intent was to be able
50 to easily modify the transformation rules with some of my own.
51
52 I did a mod 100 of my 20,726 test names and got 221 scattered names.
53 I then computed Metaphone for Sadru version and mine. There were 14
54 differences. Excluding the trailing S's in his, which I eliminated.
55 I also changed his code so that O was a ZERO. The differences account
56 for changes I MADE and interpretation of transformation rules.
57
58 At this point I have no need to do a more comprehensive analysis.
59
60 lastname Mike Kuhn Sadru Fidai
61
62 ANASTHA ANS0 ANSX
63 DAVIS-CARTER TFSKRTR TFXKRTR
64 ESCARMANT ESKRMNT EXKRMNT
65 MCCALL MCL MKKL
66 MCCROREY MCRR MKKRR
67 MERSEAL MRSL MRXL
68 PIEURISSAINT PRSNT PRXNT
69 ROTMAN RTMN RXMN
70 SCHEVEL SXFL SKFL
71 SCHROM SXRM SKRM
72 SEAL SL XL
73 SPARR SPR XPR
74 STARLEPER STRLPR XTRLPR
75 THRASH TRX 0RX
76 */
77
78
79 /***************************************************************
80
81 Metaphone Algorithm
82
83 Created by Lawrence Philips (location unknown). Metaphone presented
84 in article in "Computer Language" December 1990 issue.
85
86 Converted from Pick BASIC, as demonstrated in article, to C by
87 Michael J. Kuhn (Baltimore, Maryland)
88
89 My original intention was to replace SOUNDEX with METAPHONE in
90 order to get lists of similar sounding names that were more precise.
91 SOUNDEX maps "William" and "Williams" to the same values. METAPHONE
92 as it turns out DOES THE SAME. There are going to be problems
93 that you need to resolve with your own set of data.
94
95 Basically, for my problem with S's I think that if
96
97 IF metaphone[strlen(metaphone)] == "S"
98 AND strlen(metaphone) >= 4 THEN
99
100 metaphone[strlen(metaphone)] = ""
101
102 You can add you own rules as required.
103
104 Also, Lawrence Philips suggests that for practical reasons only the
105 first 4 characters of the metaphone be used. This happens to be the
106 number of characters that Soundex produces. This is indeed practical
107 if you already have reserved exactly 4 characters in your database.
108
109 In addition an analysis of your data may show that names are split
110 into undesirable "metaphone groups" as the number of metaphone characters
111 increases.
112
113 *********** BEGIN METAPHONE RULES ***********
114
115 Lawrence Philips' RULES follow:
116
117 The 16 consonant sounds:
118 |--- ZERO represents "th"
119 |
120 B X S K J T F H L M N P R 0 W Y
121
122 Exceptions:
123
124 Beginning of word: "ae-", "gn", "kn-", "pn-", "wr-" ----> drop first letter
125 "Aebersold", "Gnagy", "Knuth", "Pniewski", "Wright"
126
127 Beginning of word: "x" ----> change to "s"
128 as in "Deng Xiaopeng"
129
130 Beginning of word: "wh-" ----> change to "w"
131 as in "Whalen"
132
133 Transformations:
134
135 B ----> B unless at the end of word after "m", as in "dumb", "McComb"
136
137 C ----> X (sh) if "-cia-" or "-ch-"
138 S if "-ci-", "-ce-", or "-cy-"
139 SILENT if "-sci-", "-sce-", or "-scy-"
140 K otherwise, including in "-sch-"
141
142 D ----> J if in "-dge-", "-dgy-", or "-dgi-"
143 T otherwise
144
145 F ----> F
146
147 G ----> SILENT if in "-gh-" and not at end or before a vowel
148 in "-gn" or "-gned"
149 in "-dge-" etc., as in above rule
150 J if before "i", or "e", or "y" if not double "gg"
151 K otherwise
152
153 H ----> SILENT if after vowel and no vowel follows
154 or after "-ch-", "-sh-", "-ph-", "-th-", "-gh-"
155 H otherwise
156
157 J ----> J
158
159 K ----> SILENT if after "c"
160 K otherwise
161
162 L ----> L
163
164 M ----> M
165
166 N ----> N
167
168 P ----> F if before "h"
169 P otherwise
170
171 Q ----> K
172
173 R ----> R
174
175 S ----> X (sh) if before "h" or in "-sio-" or "-sia-"
176 S otherwise
177
178 T ----> X (sh) if "-tia-" or "-tio-"
179 0 (th) if before "h"
180 silent if in "-tch-"
181 T otherwise
182
183 V ----> F
184
185 W ----> SILENT if not followed by a vowel
186 W if followed by a vowel
187
188 X ----> KS
189
190 Y ----> SILENT if not followed by a vowel
191 Y if followed by a vowel
192
193 Z ----> S
194
195 **************************************************************/
196
197 /*
198
199 NOTE: This list turned out to be various issues that I passed over
200 while trying to discern this algorithm. The final outcome
201 of these items may or may not be reflected in the code.
202
203 There where some discrepancies between the Pick BASIC code in the
204 original article and the verbal discription of the transformations:
205
206 1. CASE SYMB = "G"
207
208 AND ENAME[N +3] = "D" AND (N + 3) = L)) and ENAM
209 ^
210 this was cut off in the magazine listing |
211
212 I used the verbal discription in the transformation list
213 to add the appropriate code.
214
215 2. H ----> SILENT if after vowel and no vowel follows
216 H otherwise
217
218 This is the transformation description, however, the BASIC
219 routine HAS code do this:
220
221 SILENT if after "ch-", "sh-", "ph-", "th", "gh"
222
223 which is the correct behaviour if you look at c,s,p,t,g
224
225 If did not, however, have "after vowel" coded even though this
226 was in the description. I added it.
227
228 3. The BASIC code appears to skip double letters except "C" yet
229 the transformation code for "G" looks at previous letter to
230 see if we have "GG". This is inconsistent.
231
232 I am making the assumption that "C" was a typo in the BASIC
233 code. It should have been "G".
234
235 4. Transformation notation. "-..-" where .. are letters; means that
236 the letters indicated are bounded by other letters. So "-gned"
237 means at the end and "ch-" means at the beginning. I have noticed
238 that the later is not explicity stated in the verbal description
239 but it is coded in the BASIC.
240
241 5. case 'C' K otherwise, including in "-sch-"
242 this implies that "sch" be bounded by other letters. The BASIC
243 code, however, has: N > 1
244 It should have N > 2 for this to be correct.
245 SCH-
246 123 greater than 1 means that C can be 2nd letter
247
248 I coded it as per the verbal description and not what was in
249 the code.
250
251 6. as of 11-20-95 I am still trying to understand "H". The BASIC
252 code seems to indicate that if "-.h" is at the end it is not
253 silent. But if it is at the end there is no way a vowel could
254 follow the "h". I am looking for examples.
255
256 7. ok now I am really confused. Case "T". There is code in BASIC
257 that says if next = "H" and previous != "T" . There is no
258 way that a double T goes through the code. Double letters
259 are dumped in the beginning.
260
261 MATTHEW, MATTHIES, etc
262
263 The first T goes through the second is skipped so the
264 "th" is never detected.
265
266 Modified routine to allow "G,T" duplicates through the switch.
267
268 8. case "D" -dge- is indicated in transformation
269 -dge- or -dge is coded.
270
271 STEMBRIDGE should have "j" on end and not "t"
272
273 I am leaving the code as is, verbal must be wrong.
274
275 9. Regarding duplicate letters. "C" must be allowed through
276 as in all of the McC... names.
277
278 The way to handle "GG and "TT" I think is to pass over the
279 first duplicate. The transformation rules would then handle
280 duplicates of themselves by looking at the PREVIOUS letter.
281
282 This solves the problems of "TTH" where you want the "th"
283 sound.
284
285 10. Change "CC" so that the metaphone character is "C", they
286 way it is now for McComb and such you get "MKK", which
287 unnecessiarly eats up and extra metaphone character.
288
289 11. "TH" at the beginning as in Thomas. The verbal was not
290 clear about this. I think is should be "T" and not "0"
291 so I am changing code.
292
293 After the first test I think that "THvowel" should be
294 "0" and "TH(!vowel)" should be "T"
295
296 12. I think throwing away 1 "S" and the end would be good.
297 Since I am doing this anyway after the fact. If I
298 do it before then names like. ..
299 BURROUGHS & BURROUGH would be the same
300 because the GH would map to the same value in
301 both cases.
302
303 13. Case "Y", Brian and Bryan give different codes
304 Don't know how to handle this yet.
305
306 14. Comments on metaphone groups. Metaphone actually
307 makes groups bigger. Names like:
308
309 C...R... G...R... K...R... Q...R...
310
311 will map to "KR". Soundex would have produced for example
312
313 C600,C620,G600,G620,K600,K620,Q600,Q620
314
315 the names from these 8 groups would have been collapsed into 1.
316
317 Another way to look at this is for a more exact initial
318 guess of a name Soundex would give you a smaller list of
319 posibilities. If you don't know how to spell it at all
320 however, your success at finding the right match with
321 Metaphone is much greater than with Soundex.
322
323 15. After some tests decided to leave S's at the end of the
324 Metaphone. #12 takes care of my problems with plurals and
325 then S gets used to help make distinct metaphone.
326
327 Lawrence Philips is no longer at the company indicated in the
328 article. So I was unable to verify these items.
329 */
330
331
332 #ifdef __cplusplus
333 extern "C" {
334 #endif
335 #include "EXTERN.h"
336 #include "perl.h"
337 #include "XSUB.h"
338 #ifdef __cplusplus
339 }
340 #endif
341 #include "WAIT.h"
342 #define TRUE (1)
343 #define FALSE (0)
344 #define NULLCHAR (char *) 0
345
346 char *FRONTV = "EIY"; /* special cases for
347 letters in FRONT of
348 these */
349 char *VARSON = "CSPTG"; /* variable sound--those
350 modified by adding an "h" */
351 char *DOUBLE = "."; /* let these double
352 letters through */
353 char *excpPAIR = "AGKPW"; /* exceptions "ae-",
354 "gn-", "kn-", "pn-",
355 "wr-" */
356 char *nextLTR = "ENNNR";
357 char *chrptr, *chrptr1;
358
359 void
360 phonetic(name, metaph, metalen)
361 char *name, *metaph;
362 int metalen;
363 {
364
365 int ii, jj, silent, hard, Lng, lastChr;
366 char curLtr, prevLtr, nextLtr, nextLtr2, nextLtr3;
367 int vowelAfter, vowelBefore, frontvAfter;
368 char wname[60];
369 char *ename = wname;
370
371 jj = 0;
372 for (ii = 0; name[ii] != '\0'; ii++) {
373 ename[jj] = ToUpper(name[ii]);
374 jj++;
375 }
376 ename[jj] = '\0';
377
378 if (!*ename) return;
379
380 /* if ae, gn, kn, pn, wr then drop the first letter */
381 if ((chrptr = strchr(excpPAIR, ename[0])) != NULLCHAR) {
382 chrptr1 = nextLTR + (chrptr - excpPAIR);
383 if (*chrptr1 == ename[1])
384 strcpy(ename, &ename[1]);
385 }
386 /* change x to s */
387 if (ename[0] == 'X')
388 ename[0] = 'S';
389 /* get rid of the "h" in "wh" */
390 if ((ename[0] == 'W') && (ename[1] == 'H'))
391 strcpy(&ename[1], &ename[2]);
392
393 Lng = strlen(ename);
394 lastChr = Lng - 1; /* index to last character in string makes code easier */
395
396 /* Remove an S from the end of the string */
397 if ((ename[lastChr] == 'S') || (ename[lastChr] == 'ß')) {
398 ename[lastChr] = '\0';
399 Lng = strlen(ename);
400 lastChr = Lng - 1;
401 }
402 for (ii = 0; ((strlen(metaph) < metalen) && (ii < Lng)); ii++) {
403
404 curLtr = ename[ii];
405
406 vowelBefore = FALSE;
407 prevLtr = ' ';
408 if (ii > 0) {
409 prevLtr = ename[ii - 1];
410 if (IsVowel(prevLtr))
411 vowelBefore = TRUE;
412 }
413 /* if first letter is a vowel KEEP it */
414 if (ii == 0 && (IsVowel(curLtr))) {
415 strncat(metaph, &curLtr, 1);
416 continue;
417 }
418 vowelAfter = FALSE;
419 frontvAfter = FALSE;
420 nextLtr = ' ';
421 if (ii < lastChr) {
422 nextLtr = ename[ii + 1];
423 if (IsVowel(nextLtr))
424 vowelAfter = TRUE;
425 if (strchr(FRONTV, nextLtr) != NULLCHAR)
426 frontvAfter = TRUE;
427 }
428 /* skip double letters except ones in list */
429 if (curLtr == nextLtr && (strchr(DOUBLE, nextLtr) == NULLCHAR))
430 continue;
431
432 nextLtr2 = ' ';
433 if (ii < (lastChr - 1))
434 nextLtr2 = ename[ii + 2];
435
436 nextLtr3 = ' ';
437 if (ii < (lastChr - 2))
438 nextLtr3 = ename[ii + 3];
439
440 switch (curLtr) {
441
442 case 'B':
443 silent = FALSE;
444 if (ii == lastChr && prevLtr == 'M')
445 silent = TRUE;
446 if (!silent)
447 strncat(metaph, &curLtr, 1);
448 break;
449
450 /*silent -sci-,-sce-,-scy-; sci-, etc OK */
451 case 'C':
452 if (!(ii > 1 && prevLtr == 'S' && frontvAfter))
453 if (ii > 0 && nextLtr == 'I' && nextLtr2 == 'A')
454 strncat(metaph, "X", 1);
455 else if (frontvAfter)
456 strncat(metaph, "S", 1);
457 else if (ii > 1 && prevLtr == 'S' && nextLtr == 'H')
458 strncat(metaph, "K", 1);
459 else if (nextLtr == 'H')
460 if (ii == 0 && (!IsVowel(nextLtr2)))
461 strncat(metaph, "K", 1);
462 else
463 strncat(metaph, "X", 1);
464 else if (prevLtr == 'C')
465 strncat(metaph, "C", 1);
466 else
467 strncat(metaph, "K", 1);
468 break;
469
470 case 'D':
471 if (nextLtr == 'G' && (strchr(FRONTV, nextLtr2) != NULLCHAR))
472 strncat(metaph, "J", 1);
473 else
474 strncat(metaph, "T", 1);
475 break;
476
477 case 'G':
478 silent = FALSE;
479 /* SILENT -gh- except for -gh and no vowel after h */
480 if ((ii < (lastChr - 1) && nextLtr == 'H')
481 && (!IsVowel(nextLtr2)))
482 silent = TRUE;
483
484 if ((ii == (lastChr - 3))
485 && nextLtr == 'N' && nextLtr2 == 'E' && nextLtr3 == 'D')
486 silent = TRUE;
487 else if ((ii == (lastChr - 1)) && nextLtr == 'N')
488 silent = TRUE;
489
490 if (prevLtr == 'D' && frontvAfter)
491 silent = TRUE;
492
493 if (prevLtr == 'G')
494 hard = TRUE;
495 else
496 hard = FALSE;
497
498 if (!silent)
499 if (frontvAfter && (!hard))
500 strncat(metaph, "J", 1);
501 else
502 strncat(metaph, "K", 1);
503 break;
504
505 case 'H':
506 silent = FALSE;
507 if (strchr(VARSON, prevLtr) != NULLCHAR)
508 silent = TRUE;
509
510 if (vowelBefore && !vowelAfter)
511 silent = TRUE;
512
513 if (!silent)
514 strncat(metaph, &curLtr, 1);
515 break;
516
517 case 'F':
518 case 'J':
519 case 'L':
520 case 'M':
521 case 'N':
522 case 'R':
523 strncat(metaph, &curLtr, 1);
524 break;
525
526 case 'K':
527 if (prevLtr != 'C')
528 strncat(metaph, &curLtr, 1);
529 break;
530
531 case 'P':
532 if (nextLtr == 'H')
533 strncat(metaph, "F", 1);
534 else
535 strncat(metaph, "P", 1);
536 break;
537
538 case 'Q':
539 strncat(metaph, "K", 1);
540 break;
541
542 case 'S':
543 if (ii > 1 && nextLtr == 'I'
544 && (nextLtr2 == 'O' || nextLtr2 == 'A'))
545 strncat(metaph, "X", 1);
546 else if (nextLtr == 'H')
547 strncat(metaph, "X", 1);
548 else
549 strncat(metaph, "S", 1);
550 break;
551
552 case 'T':
553 if (ii > 1 && nextLtr == 'I'
554 && (nextLtr2 == 'O' || nextLtr2 == 'A'))
555 strncat(metaph, "X", 1);
556 else if (nextLtr == 'H') /* The=0, Tho=T, Withrow=0 */
557 if (ii > 0 || (IsVowel(nextLtr2)))
558 strncat(metaph, "0", 1);
559 else
560 strncat(metaph, "T", 1);
561 else if (!(ii < (lastChr - 2) && nextLtr == 'C' && nextLtr2 == 'H'))
562 strncat(metaph, "T", 1);
563 break;
564
565 case 'V':
566 strncat(metaph, "F", 1);
567 break;
568
569 case 'W':
570 case 'Y':
571 if (ii < lastChr && vowelAfter)
572 strncat(metaph, &curLtr, 1);
573 break;
574
575 case 'X':
576 strncat(metaph, "KS", 2);
577 break;
578
579 case 'Z':
580 strncat(metaph, "S", 1);
581 break;
582 }
583
584 }
585
586 /* DON'T DO THIS NOW, REMOVING "S" IN BEGINNING HAS the same effect
587 with plurals, in addition imbedded S's in the Metaphone are included
588 Lng = strlen(metaph);
589 lastChr = Lng -1;
590 if ( metaph[lastChr] == 'S' && Lng >= 3 ) metaph[lastChr] = '\0';
591 */
592
593 return;
594 }
595
596 #if 0
597 int
598 metaphone(argc)
599 int argc;
600 {
601
602 char name[128];
603 char metaph[50];
604
605 if (argc != 1) {
606 fprintf(stderr, "metaphone: argc != 1\n");
607 retquote("");
608 return (1);
609 }
610 popquote(name, sizeof(name));
611
612 metaph[0] = '\0';
613 phonetic(name, metaph, 20);
614
615 retquote(metaph);
616 return (1);
617 }
618
619 int
620 main(argc, argv)
621 int argc;
622 char *argv[];
623 {
624 char metaph[50];
625
626 if (argc != 2) {
627 fprintf(stderr, "metaphone: argc == %d\n", argc);
628 exit(1);
629 }
630 metaph[0] = '\0';
631 fprintf(stderr, "%s\n", argv[1]);
632 phonetic(argv[1], metaph, 20);
633 fprintf(stderr, "%s => %s\n", argv[1], metaph);
634 exit(0);
635 }
636 #endif

Properties

Name Value
cvs2svn:cvs-rev 1.1

  ViewVC Help
Powered by ViewVC 1.1.26