/[wait]/trunk/stemmer.c
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/stemmer.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 88 - (hide annotations)
Mon May 24 13:44:01 2004 UTC (19 years, 10 months ago) by dpavlin
File MIME type: text/plain
File size: 17701 byte(s)
move cvs-head to trunk

1 ulpfr 10 /* WIDE AREA INFORMATION SERVER SOFTWARE:
2     No guarantees or restrictions. See the readme file for the full standard
3     disclaimer.
4    
5     francois@welchgate.welch.jhu.edu
6     */
7    
8     /*
9     Title: COPYRIGHT freeWAIS-0.2
10     Author: Jane Smith
11     Copyright: Copyright 1993 CNIDR
12     Last update: 10.01.93
13     Abstract: This file contains the copyright statement for freeWAIS 0.2
14    
15     Copyright Statement for freeWAIS 0.2 and subsquent freeWAIS
16     releases:
17    
18     Copyright (c) MCNC, Clearinghouse for Networked Information Discovery
19     and Retrieval, 1993.
20    
21     Permission to use, copy, modify, distribute, and sell this software
22     and its documentation, in whole or in part, for any purpose is hereby
23     granted without fee, provided that
24    
25     1. The above copyright notice and this permission notice appear in all
26     copies of the software and related documentation. Notices of copyright
27     and/or attribution which appear at the beginning of any file included
28     in this distribution must remain intact.
29    
30     2. Users of this software agree to make their best efforts (a) to
31     return to MCNC any improvements or extensions that they make, so that
32     these may be included in future releases; and (b) to inform MCNC/CNIDR
33     of noteworthy uses of this software.
34    
35     3. The names of MCNC and Clearinghouse for Networked Information
36     Discovery and Retrieval may not be used in any advertising or publicity
37     relating to the software without the specific, prior written permission
38     of MCNC/CNIDR.
39    
40     THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
41     EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
42     WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
43    
44     IN NO EVENT SHALL MCNC/CNIDR BE LIABLE FOR ANY SPECIAL, INCIDENTAL,
45     INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND, OR ANY DAMAGES WHATSOEVER
46     RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER OR NOT ADVISED OF
47     THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF LIABILITY, ARISING OUT
48     OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
49     */
50    
51     #ifdef __cplusplus
52     extern "C" {
53     #endif
54     #include "EXTERN.h"
55     #include "perl.h"
56     #include "XSUB.h"
57     #ifdef __cplusplus
58     }
59    
60     #endif
61     #include "stemmer.h"
62    
63     /*
64    
65     Purpose: Implementation of the Porter stemming algorithm documented
66     in: Porter, M.F., "An Algorithm For Suffix Stripping,"
67     Program 14 (3), July 1980, pp. 130-137.
68    
69     Provenance: Written by B. Frakes and C. Cox, 1986.
70     Changed by C. Fox, 1990.
71     - made measure function a DFA
72     - restructured structs
73     - renamed functions and variables
74     - restricted function and variable scopes
75     Changed by C. Fox, July, 1991.
76     - added ANSI C declarations
77     - branch tested to 90% coverage
78    
79     Notes: This code will make little sense without the the Porter
80     article. The stemming function converts its input to
81     lower case.
82     */
83    
84     #define EOS '\0'
85     #define IsVowel(c) ('a'==(c)||'e'==(c)||'i'==(c)||'o'==(c)||'u'==(c))
86    
87     typedef struct {
88     int id; /* returned if rule fired */
89     char *old_end; /* suffix replaced */
90     char *new_end; /* suffix replacement */
91     int old_offset; /* from end of word to start of suffix */
92     int new_offset; /* from beginning to end of new suffix */
93     int min_root_size; /* min root word size for replacement */
94     int (*condition) (); /* the replacement test function */
95     } RuleList;
96    
97     static char LAMBDA[1] = ""; /* the constant empty string */
98     static char *end; /* pointer to the end of the word */
99    
100     /*****************************************************************************/
101     /******************** Private Function Declarations **********************/
102    
103     static int WordSize _((char *word));
104     static int ContainsVowel _((char *word));
105     static int EndsWithCVC _((char *word));
106     static int AddAnE _((char *word));
107     static int RemoveAnE _((char *word));
108     static int ReplaceEnd _((char *word, RuleList * rule));
109    
110     /******************************************************************************/
111     /***************** Initialized Private Data Structures ********************/
112    
113     static RuleList step1a_rules[] =
114     {
115     101, "sses", "ss", 3, 1, -1, NULL,
116     102, "ies", "i", 2, 0, -1, NULL,
117     103, "ss", "ss", 1, 1, -1, NULL,
118     104, "s", LAMBDA, 0, -1, -1, NULL,
119     000, NULL, NULL, 0, 0, 0, NULL,
120     };
121    
122     static RuleList step1b_rules[] =
123     {
124     105, "eed", "ee", 2, 1, 0, NULL,
125     106, "ed", LAMBDA, 1, -1, -1, ContainsVowel,
126     107, "ing", LAMBDA, 2, -1, -1, ContainsVowel,
127     000, NULL, NULL, 0, 0, 0, NULL,
128     };
129    
130     static RuleList step1b1_rules[] =
131     {
132     108, "at", "ate", 1, 2, -1, NULL,
133     109, "bl", "ble", 1, 2, -1, NULL,
134     110, "iz", "ize", 1, 2, -1, NULL,
135     111, "bb", "b", 1, 0, -1, NULL,
136     112, "dd", "d", 1, 0, -1, NULL,
137     113, "ff", "f", 1, 0, -1, NULL,
138     114, "gg", "g", 1, 0, -1, NULL,
139     115, "mm", "m", 1, 0, -1, NULL,
140     116, "nn", "n", 1, 0, -1, NULL,
141     117, "pp", "p", 1, 0, -1, NULL,
142     118, "rr", "r", 1, 0, -1, NULL,
143     119, "tt", "t", 1, 0, -1, NULL,
144     120, "ww", "w", 1, 0, -1, NULL,
145     121, "xx", "x", 1, 0, -1, NULL,
146     122, LAMBDA, "e", -1, 0, -1, AddAnE,
147     000, NULL, NULL, 0, 0, 0, NULL,
148     };
149    
150     static RuleList step1c_rules[] =
151     {
152     123, "y", "i", 0, 0, -1, ContainsVowel,
153     000, NULL, NULL, 0, 0, 0, NULL,
154     };
155    
156     static RuleList step2_rules[] =
157     {
158     203, "ational", "ate", 6, 2, 0, NULL,
159     204, "tional", "tion", 5, 3, 0, NULL,
160     205, "enci", "ence", 3, 3, 0, NULL,
161     206, "anci", "ance", 3, 3, 0, NULL,
162     207, "izer", "ize", 3, 2, 0, NULL,
163     208, "abli", "able", 3, 3, 0, NULL,
164     209, "alli", "al", 3, 1, 0, NULL,
165     210, "entli", "ent", 4, 2, 0, NULL,
166     211, "eli", "e", 2, 0, 0, NULL,
167     213, "ousli", "ous", 4, 2, 0, NULL,
168     214, "ization", "ize", 6, 2, 0, NULL,
169     215, "ation", "ate", 4, 2, 0, NULL,
170     216, "ator", "ate", 3, 2, 0, NULL,
171     217, "alism", "al", 4, 1, 0, NULL,
172     218, "iveness", "ive", 6, 2, 0, NULL,
173     219, "fulnes", "ful", 5, 2, 0, NULL,
174     220, "ousness", "ous", 6, 2, 0, NULL,
175     221, "aliti", "al", 4, 1, 0, NULL,
176     222, "iviti", "ive", 4, 2, 0, NULL,
177     223, "biliti", "ble", 5, 2, 0, NULL,
178     000, NULL, NULL, 0, 0, 0, NULL,
179     };
180    
181     static RuleList step3_rules[] =
182     {
183     301, "icate", "ic", 4, 1, 0, NULL,
184     302, "ative", LAMBDA, 4, -1, 0, NULL,
185     303, "alize", "al", 4, 1, 0, NULL,
186     304, "iciti", "ic", 4, 1, 0, NULL,
187     305, "ical", "ic", 3, 1, 0, NULL,
188     308, "ful", LAMBDA, 2, -1, 0, NULL,
189     309, "ness", LAMBDA, 3, -1, 0, NULL,
190     000, NULL, NULL, 0, 0, 0, NULL,
191     };
192    
193     static RuleList step4_rules[] =
194     {
195     401, "al", LAMBDA, 1, -1, 1, NULL,
196     402, "ance", LAMBDA, 3, -1, 1, NULL,
197     403, "ence", LAMBDA, 3, -1, 1, NULL,
198     405, "er", LAMBDA, 1, -1, 1, NULL,
199     406, "ic", LAMBDA, 1, -1, 1, NULL,
200     407, "able", LAMBDA, 3, -1, 1, NULL,
201     408, "ible", LAMBDA, 3, -1, 1, NULL,
202     409, "ant", LAMBDA, 2, -1, 1, NULL,
203     410, "ement", LAMBDA, 4, -1, 1, NULL,
204     411, "ment", LAMBDA, 3, -1, 1, NULL,
205     412, "ent", LAMBDA, 2, -1, 1, NULL,
206     423, "sion", "s", 3, 0, 1, NULL,
207     424, "tion", "t", 3, 0, 1, NULL,
208     415, "ou", LAMBDA, 1, -1, 1, NULL,
209     416, "ism", LAMBDA, 2, -1, 1, NULL,
210     417, "ate", LAMBDA, 2, -1, 1, NULL,
211     418, "iti", LAMBDA, 2, -1, 1, NULL,
212     419, "ous", LAMBDA, 2, -1, 1, NULL,
213     420, "ive", LAMBDA, 2, -1, 1, NULL,
214     421, "ize", LAMBDA, 2, -1, 1, NULL,
215     000, NULL, NULL, 0, 0, 0, NULL,
216     };
217    
218     static RuleList step5a_rules[] =
219     {
220     501, "e", LAMBDA, 0, -1, 1, NULL,
221     502, "e", LAMBDA, 0, -1, -1, RemoveAnE,
222     000, NULL, NULL, 0, 0, 0, NULL,
223     };
224    
225     static RuleList step5b_rules[] =
226     {
227     503, "ll", "l", 1, 0, 1, NULL,
228     000, NULL, NULL, 0, 0, 0, NULL,
229     };
230    
231    
232     /*
233    
234     WordSize( word )
235    
236     Returns: int -- a weird count of word size in adjusted syllables
237    
238     Purpose: Count syllables in a special way: count the number
239     vowel-consonant pairs in a word, disregarding initial
240     consonants and final vowels. The letter "y" counts as a
241     consonant at the beginning of a word and when it has a vowel
242     in front of it; otherwise (when it follows a consonant) it
243     is treated as a vowel. For example, the WordSize of "cat"
244     is 1, of "any" is 1, of "amount" is 2, of "anything" is 3.
245    
246     Plan: Run a DFA to compute the word size
247    
248     Notes: The easiest and fastest way to compute this funny measure is
249     with a finite state machine. The initial state 0 checks
250     the first letter. If it is a vowel, then the machine changes
251     to state 1, which is the "last letter was a vowel" state.
252     If the first letter is a consonant or y, then it changes
253     to state 2, the "last letter was a consonant state". In
254     state 1, a y is treated as a consonant (since it follows
255     a vowel), but in state 2, y is treated as a vowel (since
256     it follows a consonant. The result counter is incremented
257     on the transition from state 1 to state 2, since this
258     transition only occurs after a vowel-consonant pair, which
259     is what we are counting.
260     */
261    
262     static int
263     WordSize(word)
264     char *word; /* in: word having its WordSize taken */
265     {
266     register int result; /* WordSize of the word */
267     register int state; /* current state in machine */
268    
269     result = 0;
270     state = 0;
271    
272     /* Run a DFA to compute the word size */
273     while (EOS != *word) {
274     switch (state) {
275     case 0:
276     state = (IsVowel(*word)) ? 1 : 2;
277     break;
278     case 1:
279     state = (IsVowel(*word)) ? 1 : 2;
280     if (2 == state)
281     result++;
282     break;
283     case 2:
284     state = (IsVowel(*word) || ('y' == *word)) ? 1 : 2;
285     break;
286     }
287     word++;
288     }
289    
290     return (result);
291    
292     }
293    
294     /*
295     ContainsVowel( word )
296    
297     Returns: int -- TRUE (1) if the word parameter contains a vowel,
298     FALSE (0) otherwise.
299    
300     Purpose: Some of the rewrite rules apply only to a root containing
301     a vowel, where a vowel is one of "aeiou" or y with a
302     consonant in front of it.
303    
304     Plan: Obviously, under the definition of a vowel, a word contains
305     a vowel iff either its first letter is one of "aeiou", or
306     any of its other letters are "aeiouy". The plan is to
307     test this condition.
308    
309     Notes: None
310     */
311    
312     static int
313     ContainsVowel(word)
314     char *word; /* in: buffer with word checked */
315     {
316    
317     if (EOS == *word)
318     return (FALSE);
319     else
320     return (IsVowel(*word) || (NULL != strpbrk(word + 1, "aeiouy")));
321    
322     } /* ContainsVowel */
323    
324     /*
325    
326     EndsWithCVC( word )
327    
328     Returns: int -- TRUE (1) if the current word ends with a
329     consonant-vowel-consonant combination, and the second
330     consonant is not w, x, or y, FALSE (0) otherwise.
331    
332     Purpose: Some of the rewrite rules apply only to a root with
333     this characteristic.
334    
335     Plan: Look at the last three characters.
336    
337     Notes: None
338     */
339    
340     static int
341     EndsWithCVC(word)
342     char *word; /* in: buffer with the word checked */
343     {
344     int length; /* for finding the last three characters */
345    
346     if ((length = strlen(word)) < 2)
347     return (FALSE);
348     else {
349     end = word + length - 1;
350     return ((NULL == strchr("aeiouwxy", *end--)) /* consonant */
351     &&(NULL != strchr("aeiouy", *end--)) /* vowel */
352     &&(NULL == strchr("aeiou", *end))); /* consonant */
353     }
354    
355     }
356    
357     /*
358     AddAnE( word )
359    
360     Returns: int -- TRUE (1) if the current word meets special conditions
361     for adding an e.
362    
363     Purpose: Rule 122 applies only to a root with this characteristic.
364    
365     Plan: Check for size of 1 and a consonant-vowel-consonant ending.
366    
367     Notes: None
368     */
369    
370     static int
371     AddAnE(word)
372     char *word;
373     {
374     return ((1 == WordSize(word)) && EndsWithCVC(word));
375     }
376    
377     /*
378     RemoveAnE( word )
379    
380     Returns: int -- TRUE (1) if the current word meets special conditions
381     for removing an e.
382    
383     Purpose: Rule 502 applies only to a root with this characteristic.
384    
385     Plan: Check for size of 1 and no consonant-vowel-consonant ending.
386    
387     Notes: None
388     */
389    
390     static int
391     RemoveAnE(word)
392     char *word;
393     {
394     return ((1 == WordSize(word)) && !EndsWithCVC(word));
395     } /* RemoveAnE */
396    
397     /*
398     ReplaceEnd( word, rule )
399    
400     Returns: int -- the id for the rule fired, 0 is none is fired
401    
402     Purpose: Apply a set of rules to replace the suffix of a word
403    
404     Plan: Loop through the rule set until a match meeting all conditions
405     is found. If a rule fires, return its id, otherwise return 0.
406     Connditions on the length of the root are checked as part of this
407     function's processing because this check is so often made.
408    
409     Notes: This is the main routine driving the stemmer. It goes through
410     a set of suffix replacement rules looking for a match on the
411     current suffix. When it finds one, if the root of the word
412     is long enough, and it meets whatever other conditions are
413     required, then the suffix is replaced, and the function returns.
414     * */
415    
416     static int
417     ReplaceEnd(word, rule)
418     char *word; /* in/out: buffer with the stemmed word */
419     RuleList *rule; /* in: data structure with replacement rules */
420     {
421     register char *ending; /* set to start of possible stemmed suffix */
422     char tmp_ch; /* save replaced character when testing */
423    
424     while (0 != rule->id) {
425     ending = end - rule->old_offset;
426     if (word <= ending)
427     if ((*ending == *rule->old_end) && /* for speed */
428     !strcmp(ending, rule->old_end)) {
429     tmp_ch = *ending;
430     *ending = EOS;
431     if (rule->min_root_size < WordSize(word))
432     if (!rule->condition || (*rule->condition) (word)) {
433     (void) strcat(word, rule->new_end);
434     end = ending + rule->new_offset;
435     break;
436     }
437     *ending = tmp_ch;
438     }
439     rule++;
440     }
441    
442     return (rule->id);
443     }
444    
445     /*
446     Stem( word )
447    
448     Returns: int -- FALSE (0) if the word contains non-alphabetic characters
449     and hence is not stemmed, TRUE (1) otherwise
450    
451     Purpose: Stem a word
452    
453     Plan: Part 1: Check to ensure the word is all alphabetic
454     Part 2: Run through the Porter algorithm
455     Part 3: Return an indication of successful stemming
456    
457     Notes: This function implements the Porter stemming algorithm, with
458     a few additions here and there. See:
459    
460     Porter, M.F., "An Algorithm For Suffix Stripping,"
461     Program 14 (3), July 1980, pp. 130-137.
462    
463     Porter's algorithm is an ad hoc set of rewrite rules with
464     various conditions on rule firing. The terminology of
465     "step 1a" and so on, is taken directly from Porter's
466     article, which unfortunately gives almost no justification
467     for the various steps. Thus this function more or less
468     faithfully refects the opaque presentation in the article.
469     Changes from the article amount to a few additions to the
470     rewrite rules; these are marked in the RuleList data
471     structures with comments.
472     */
473    
474     int
475     Stem(word)
476     char *word; /* in/out: the word stemmed */
477     {
478     int rule; /* which rule is fired in replacing an end */
479    
480     /* Part 1: Check to ensure the word is all alphabetic */
481     for (end = word; *end != EOS; end++)
482     if (!isalpha(*end))
483     return (FALSE);
484     else
485     *end = tolower(*end);
486     end--;
487    
488     /* Part 2: Run through the Porter algorithm */
489     (void) ReplaceEnd(word, step1a_rules);
490     rule = ReplaceEnd(word, step1b_rules);
491     if ((106 == rule) || (107 == rule))
492     (void) ReplaceEnd(word, step1b1_rules);
493     (void) ReplaceEnd(word, step1c_rules);
494    
495     (void) ReplaceEnd(word, step2_rules);
496    
497     (void) ReplaceEnd(word, step3_rules);
498    
499     (void) ReplaceEnd(word, step4_rules);
500    
501     (void) ReplaceEnd(word, step5a_rules);
502     (void) ReplaceEnd(word, step5b_rules);
503    
504     /* Part 3: Return an indication of successful stemming */
505     return (TRUE);
506     }

Properties

Name Value
cvs2svn:cvs-rev 1.1

  ViewVC Help
Powered by ViewVC 1.1.26