1 |
ulpfr |
10 |
/* WIDE AREA INFORMATION SERVER SOFTWARE: |
2 |
|
|
No guarantees or restrictions. See the readme file for the full standard |
3 |
|
|
disclaimer. |
4 |
|
|
|
5 |
|
|
francois@welchgate.welch.jhu.edu |
6 |
|
|
*/ |
7 |
|
|
|
8 |
|
|
/* |
9 |
|
|
Title: COPYRIGHT freeWAIS-0.2 |
10 |
|
|
Author: Jane Smith |
11 |
|
|
Copyright: Copyright 1993 CNIDR |
12 |
|
|
Last update: 10.01.93 |
13 |
|
|
Abstract: This file contains the copyright statement for freeWAIS 0.2 |
14 |
|
|
|
15 |
|
|
Copyright Statement for freeWAIS 0.2 and subsquent freeWAIS |
16 |
|
|
releases: |
17 |
|
|
|
18 |
|
|
Copyright (c) MCNC, Clearinghouse for Networked Information Discovery |
19 |
|
|
and Retrieval, 1993. |
20 |
|
|
|
21 |
|
|
Permission to use, copy, modify, distribute, and sell this software |
22 |
|
|
and its documentation, in whole or in part, for any purpose is hereby |
23 |
|
|
granted without fee, provided that |
24 |
|
|
|
25 |
|
|
1. The above copyright notice and this permission notice appear in all |
26 |
|
|
copies of the software and related documentation. Notices of copyright |
27 |
|
|
and/or attribution which appear at the beginning of any file included |
28 |
|
|
in this distribution must remain intact. |
29 |
|
|
|
30 |
|
|
2. Users of this software agree to make their best efforts (a) to |
31 |
|
|
return to MCNC any improvements or extensions that they make, so that |
32 |
|
|
these may be included in future releases; and (b) to inform MCNC/CNIDR |
33 |
|
|
of noteworthy uses of this software. |
34 |
|
|
|
35 |
|
|
3. The names of MCNC and Clearinghouse for Networked Information |
36 |
|
|
Discovery and Retrieval may not be used in any advertising or publicity |
37 |
|
|
relating to the software without the specific, prior written permission |
38 |
|
|
of MCNC/CNIDR. |
39 |
|
|
|
40 |
|
|
THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, |
41 |
|
|
EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY |
42 |
|
|
WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. |
43 |
|
|
|
44 |
|
|
IN NO EVENT SHALL MCNC/CNIDR BE LIABLE FOR ANY SPECIAL, INCIDENTAL, |
45 |
|
|
INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND, OR ANY DAMAGES WHATSOEVER |
46 |
|
|
RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER OR NOT ADVISED OF |
47 |
|
|
THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF LIABILITY, ARISING OUT |
48 |
|
|
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
49 |
|
|
*/ |
50 |
|
|
|
51 |
|
|
#ifdef __cplusplus |
52 |
|
|
extern "C" { |
53 |
|
|
#endif |
54 |
|
|
#include "EXTERN.h" |
55 |
|
|
#include "perl.h" |
56 |
|
|
#include "XSUB.h" |
57 |
|
|
#ifdef __cplusplus |
58 |
|
|
} |
59 |
|
|
|
60 |
|
|
#endif |
61 |
|
|
#include "stemmer.h" |
62 |
|
|
|
63 |
|
|
/* |
64 |
|
|
|
65 |
|
|
Purpose: Implementation of the Porter stemming algorithm documented |
66 |
|
|
in: Porter, M.F., "An Algorithm For Suffix Stripping," |
67 |
|
|
Program 14 (3), July 1980, pp. 130-137. |
68 |
|
|
|
69 |
|
|
Provenance: Written by B. Frakes and C. Cox, 1986. |
70 |
|
|
Changed by C. Fox, 1990. |
71 |
|
|
- made measure function a DFA |
72 |
|
|
- restructured structs |
73 |
|
|
- renamed functions and variables |
74 |
|
|
- restricted function and variable scopes |
75 |
|
|
Changed by C. Fox, July, 1991. |
76 |
|
|
- added ANSI C declarations |
77 |
|
|
- branch tested to 90% coverage |
78 |
|
|
|
79 |
|
|
Notes: This code will make little sense without the the Porter |
80 |
|
|
article. The stemming function converts its input to |
81 |
|
|
lower case. |
82 |
|
|
*/ |
83 |
|
|
|
84 |
|
|
#define EOS '\0' |
85 |
|
|
#define IsVowel(c) ('a'==(c)||'e'==(c)||'i'==(c)||'o'==(c)||'u'==(c)) |
86 |
|
|
|
87 |
|
|
typedef struct { |
88 |
|
|
int id; /* returned if rule fired */ |
89 |
|
|
char *old_end; /* suffix replaced */ |
90 |
|
|
char *new_end; /* suffix replacement */ |
91 |
|
|
int old_offset; /* from end of word to start of suffix */ |
92 |
|
|
int new_offset; /* from beginning to end of new suffix */ |
93 |
|
|
int min_root_size; /* min root word size for replacement */ |
94 |
|
|
int (*condition) (); /* the replacement test function */ |
95 |
|
|
} RuleList; |
96 |
|
|
|
97 |
|
|
static char LAMBDA[1] = ""; /* the constant empty string */ |
98 |
|
|
static char *end; /* pointer to the end of the word */ |
99 |
|
|
|
100 |
|
|
/*****************************************************************************/ |
101 |
|
|
/******************** Private Function Declarations **********************/ |
102 |
|
|
|
103 |
|
|
static int WordSize _((char *word)); |
104 |
|
|
static int ContainsVowel _((char *word)); |
105 |
|
|
static int EndsWithCVC _((char *word)); |
106 |
|
|
static int AddAnE _((char *word)); |
107 |
|
|
static int RemoveAnE _((char *word)); |
108 |
|
|
static int ReplaceEnd _((char *word, RuleList * rule)); |
109 |
|
|
|
110 |
|
|
/******************************************************************************/ |
111 |
|
|
/***************** Initialized Private Data Structures ********************/ |
112 |
|
|
|
113 |
|
|
static RuleList step1a_rules[] = |
114 |
|
|
{ |
115 |
|
|
101, "sses", "ss", 3, 1, -1, NULL, |
116 |
|
|
102, "ies", "i", 2, 0, -1, NULL, |
117 |
|
|
103, "ss", "ss", 1, 1, -1, NULL, |
118 |
|
|
104, "s", LAMBDA, 0, -1, -1, NULL, |
119 |
|
|
000, NULL, NULL, 0, 0, 0, NULL, |
120 |
|
|
}; |
121 |
|
|
|
122 |
|
|
static RuleList step1b_rules[] = |
123 |
|
|
{ |
124 |
|
|
105, "eed", "ee", 2, 1, 0, NULL, |
125 |
|
|
106, "ed", LAMBDA, 1, -1, -1, ContainsVowel, |
126 |
|
|
107, "ing", LAMBDA, 2, -1, -1, ContainsVowel, |
127 |
|
|
000, NULL, NULL, 0, 0, 0, NULL, |
128 |
|
|
}; |
129 |
|
|
|
130 |
|
|
static RuleList step1b1_rules[] = |
131 |
|
|
{ |
132 |
|
|
108, "at", "ate", 1, 2, -1, NULL, |
133 |
|
|
109, "bl", "ble", 1, 2, -1, NULL, |
134 |
|
|
110, "iz", "ize", 1, 2, -1, NULL, |
135 |
|
|
111, "bb", "b", 1, 0, -1, NULL, |
136 |
|
|
112, "dd", "d", 1, 0, -1, NULL, |
137 |
|
|
113, "ff", "f", 1, 0, -1, NULL, |
138 |
|
|
114, "gg", "g", 1, 0, -1, NULL, |
139 |
|
|
115, "mm", "m", 1, 0, -1, NULL, |
140 |
|
|
116, "nn", "n", 1, 0, -1, NULL, |
141 |
|
|
117, "pp", "p", 1, 0, -1, NULL, |
142 |
|
|
118, "rr", "r", 1, 0, -1, NULL, |
143 |
|
|
119, "tt", "t", 1, 0, -1, NULL, |
144 |
|
|
120, "ww", "w", 1, 0, -1, NULL, |
145 |
|
|
121, "xx", "x", 1, 0, -1, NULL, |
146 |
|
|
122, LAMBDA, "e", -1, 0, -1, AddAnE, |
147 |
|
|
000, NULL, NULL, 0, 0, 0, NULL, |
148 |
|
|
}; |
149 |
|
|
|
150 |
|
|
static RuleList step1c_rules[] = |
151 |
|
|
{ |
152 |
|
|
123, "y", "i", 0, 0, -1, ContainsVowel, |
153 |
|
|
000, NULL, NULL, 0, 0, 0, NULL, |
154 |
|
|
}; |
155 |
|
|
|
156 |
|
|
static RuleList step2_rules[] = |
157 |
|
|
{ |
158 |
|
|
203, "ational", "ate", 6, 2, 0, NULL, |
159 |
|
|
204, "tional", "tion", 5, 3, 0, NULL, |
160 |
|
|
205, "enci", "ence", 3, 3, 0, NULL, |
161 |
|
|
206, "anci", "ance", 3, 3, 0, NULL, |
162 |
|
|
207, "izer", "ize", 3, 2, 0, NULL, |
163 |
|
|
208, "abli", "able", 3, 3, 0, NULL, |
164 |
|
|
209, "alli", "al", 3, 1, 0, NULL, |
165 |
|
|
210, "entli", "ent", 4, 2, 0, NULL, |
166 |
|
|
211, "eli", "e", 2, 0, 0, NULL, |
167 |
|
|
213, "ousli", "ous", 4, 2, 0, NULL, |
168 |
|
|
214, "ization", "ize", 6, 2, 0, NULL, |
169 |
|
|
215, "ation", "ate", 4, 2, 0, NULL, |
170 |
|
|
216, "ator", "ate", 3, 2, 0, NULL, |
171 |
|
|
217, "alism", "al", 4, 1, 0, NULL, |
172 |
|
|
218, "iveness", "ive", 6, 2, 0, NULL, |
173 |
|
|
219, "fulnes", "ful", 5, 2, 0, NULL, |
174 |
|
|
220, "ousness", "ous", 6, 2, 0, NULL, |
175 |
|
|
221, "aliti", "al", 4, 1, 0, NULL, |
176 |
|
|
222, "iviti", "ive", 4, 2, 0, NULL, |
177 |
|
|
223, "biliti", "ble", 5, 2, 0, NULL, |
178 |
|
|
000, NULL, NULL, 0, 0, 0, NULL, |
179 |
|
|
}; |
180 |
|
|
|
181 |
|
|
static RuleList step3_rules[] = |
182 |
|
|
{ |
183 |
|
|
301, "icate", "ic", 4, 1, 0, NULL, |
184 |
|
|
302, "ative", LAMBDA, 4, -1, 0, NULL, |
185 |
|
|
303, "alize", "al", 4, 1, 0, NULL, |
186 |
|
|
304, "iciti", "ic", 4, 1, 0, NULL, |
187 |
|
|
305, "ical", "ic", 3, 1, 0, NULL, |
188 |
|
|
308, "ful", LAMBDA, 2, -1, 0, NULL, |
189 |
|
|
309, "ness", LAMBDA, 3, -1, 0, NULL, |
190 |
|
|
000, NULL, NULL, 0, 0, 0, NULL, |
191 |
|
|
}; |
192 |
|
|
|
193 |
|
|
static RuleList step4_rules[] = |
194 |
|
|
{ |
195 |
|
|
401, "al", LAMBDA, 1, -1, 1, NULL, |
196 |
|
|
402, "ance", LAMBDA, 3, -1, 1, NULL, |
197 |
|
|
403, "ence", LAMBDA, 3, -1, 1, NULL, |
198 |
|
|
405, "er", LAMBDA, 1, -1, 1, NULL, |
199 |
|
|
406, "ic", LAMBDA, 1, -1, 1, NULL, |
200 |
|
|
407, "able", LAMBDA, 3, -1, 1, NULL, |
201 |
|
|
408, "ible", LAMBDA, 3, -1, 1, NULL, |
202 |
|
|
409, "ant", LAMBDA, 2, -1, 1, NULL, |
203 |
|
|
410, "ement", LAMBDA, 4, -1, 1, NULL, |
204 |
|
|
411, "ment", LAMBDA, 3, -1, 1, NULL, |
205 |
|
|
412, "ent", LAMBDA, 2, -1, 1, NULL, |
206 |
|
|
423, "sion", "s", 3, 0, 1, NULL, |
207 |
|
|
424, "tion", "t", 3, 0, 1, NULL, |
208 |
|
|
415, "ou", LAMBDA, 1, -1, 1, NULL, |
209 |
|
|
416, "ism", LAMBDA, 2, -1, 1, NULL, |
210 |
|
|
417, "ate", LAMBDA, 2, -1, 1, NULL, |
211 |
|
|
418, "iti", LAMBDA, 2, -1, 1, NULL, |
212 |
|
|
419, "ous", LAMBDA, 2, -1, 1, NULL, |
213 |
|
|
420, "ive", LAMBDA, 2, -1, 1, NULL, |
214 |
|
|
421, "ize", LAMBDA, 2, -1, 1, NULL, |
215 |
|
|
000, NULL, NULL, 0, 0, 0, NULL, |
216 |
|
|
}; |
217 |
|
|
|
218 |
|
|
static RuleList step5a_rules[] = |
219 |
|
|
{ |
220 |
|
|
501, "e", LAMBDA, 0, -1, 1, NULL, |
221 |
|
|
502, "e", LAMBDA, 0, -1, -1, RemoveAnE, |
222 |
|
|
000, NULL, NULL, 0, 0, 0, NULL, |
223 |
|
|
}; |
224 |
|
|
|
225 |
|
|
static RuleList step5b_rules[] = |
226 |
|
|
{ |
227 |
|
|
503, "ll", "l", 1, 0, 1, NULL, |
228 |
|
|
000, NULL, NULL, 0, 0, 0, NULL, |
229 |
|
|
}; |
230 |
|
|
|
231 |
|
|
|
232 |
|
|
/* |
233 |
|
|
|
234 |
|
|
WordSize( word ) |
235 |
|
|
|
236 |
|
|
Returns: int -- a weird count of word size in adjusted syllables |
237 |
|
|
|
238 |
|
|
Purpose: Count syllables in a special way: count the number |
239 |
|
|
vowel-consonant pairs in a word, disregarding initial |
240 |
|
|
consonants and final vowels. The letter "y" counts as a |
241 |
|
|
consonant at the beginning of a word and when it has a vowel |
242 |
|
|
in front of it; otherwise (when it follows a consonant) it |
243 |
|
|
is treated as a vowel. For example, the WordSize of "cat" |
244 |
|
|
is 1, of "any" is 1, of "amount" is 2, of "anything" is 3. |
245 |
|
|
|
246 |
|
|
Plan: Run a DFA to compute the word size |
247 |
|
|
|
248 |
|
|
Notes: The easiest and fastest way to compute this funny measure is |
249 |
|
|
with a finite state machine. The initial state 0 checks |
250 |
|
|
the first letter. If it is a vowel, then the machine changes |
251 |
|
|
to state 1, which is the "last letter was a vowel" state. |
252 |
|
|
If the first letter is a consonant or y, then it changes |
253 |
|
|
to state 2, the "last letter was a consonant state". In |
254 |
|
|
state 1, a y is treated as a consonant (since it follows |
255 |
|
|
a vowel), but in state 2, y is treated as a vowel (since |
256 |
|
|
it follows a consonant. The result counter is incremented |
257 |
|
|
on the transition from state 1 to state 2, since this |
258 |
|
|
transition only occurs after a vowel-consonant pair, which |
259 |
|
|
is what we are counting. |
260 |
|
|
*/ |
261 |
|
|
|
262 |
|
|
static int |
263 |
|
|
WordSize(word) |
264 |
|
|
char *word; /* in: word having its WordSize taken */ |
265 |
|
|
{ |
266 |
|
|
register int result; /* WordSize of the word */ |
267 |
|
|
register int state; /* current state in machine */ |
268 |
|
|
|
269 |
|
|
result = 0; |
270 |
|
|
state = 0; |
271 |
|
|
|
272 |
|
|
/* Run a DFA to compute the word size */ |
273 |
|
|
while (EOS != *word) { |
274 |
|
|
switch (state) { |
275 |
|
|
case 0: |
276 |
|
|
state = (IsVowel(*word)) ? 1 : 2; |
277 |
|
|
break; |
278 |
|
|
case 1: |
279 |
|
|
state = (IsVowel(*word)) ? 1 : 2; |
280 |
|
|
if (2 == state) |
281 |
|
|
result++; |
282 |
|
|
break; |
283 |
|
|
case 2: |
284 |
|
|
state = (IsVowel(*word) || ('y' == *word)) ? 1 : 2; |
285 |
|
|
break; |
286 |
|
|
} |
287 |
|
|
word++; |
288 |
|
|
} |
289 |
|
|
|
290 |
|
|
return (result); |
291 |
|
|
|
292 |
|
|
} |
293 |
|
|
|
294 |
|
|
/* |
295 |
|
|
ContainsVowel( word ) |
296 |
|
|
|
297 |
|
|
Returns: int -- TRUE (1) if the word parameter contains a vowel, |
298 |
|
|
FALSE (0) otherwise. |
299 |
|
|
|
300 |
|
|
Purpose: Some of the rewrite rules apply only to a root containing |
301 |
|
|
a vowel, where a vowel is one of "aeiou" or y with a |
302 |
|
|
consonant in front of it. |
303 |
|
|
|
304 |
|
|
Plan: Obviously, under the definition of a vowel, a word contains |
305 |
|
|
a vowel iff either its first letter is one of "aeiou", or |
306 |
|
|
any of its other letters are "aeiouy". The plan is to |
307 |
|
|
test this condition. |
308 |
|
|
|
309 |
|
|
Notes: None |
310 |
|
|
*/ |
311 |
|
|
|
312 |
|
|
static int |
313 |
|
|
ContainsVowel(word) |
314 |
|
|
char *word; /* in: buffer with word checked */ |
315 |
|
|
{ |
316 |
|
|
|
317 |
|
|
if (EOS == *word) |
318 |
|
|
return (FALSE); |
319 |
|
|
else |
320 |
|
|
return (IsVowel(*word) || (NULL != strpbrk(word + 1, "aeiouy"))); |
321 |
|
|
|
322 |
|
|
} /* ContainsVowel */ |
323 |
|
|
|
324 |
|
|
/* |
325 |
|
|
|
326 |
|
|
EndsWithCVC( word ) |
327 |
|
|
|
328 |
|
|
Returns: int -- TRUE (1) if the current word ends with a |
329 |
|
|
consonant-vowel-consonant combination, and the second |
330 |
|
|
consonant is not w, x, or y, FALSE (0) otherwise. |
331 |
|
|
|
332 |
|
|
Purpose: Some of the rewrite rules apply only to a root with |
333 |
|
|
this characteristic. |
334 |
|
|
|
335 |
|
|
Plan: Look at the last three characters. |
336 |
|
|
|
337 |
|
|
Notes: None |
338 |
|
|
*/ |
339 |
|
|
|
340 |
|
|
static int |
341 |
|
|
EndsWithCVC(word) |
342 |
|
|
char *word; /* in: buffer with the word checked */ |
343 |
|
|
{ |
344 |
|
|
int length; /* for finding the last three characters */ |
345 |
|
|
|
346 |
|
|
if ((length = strlen(word)) < 2) |
347 |
|
|
return (FALSE); |
348 |
|
|
else { |
349 |
|
|
end = word + length - 1; |
350 |
|
|
return ((NULL == strchr("aeiouwxy", *end--)) /* consonant */ |
351 |
|
|
&&(NULL != strchr("aeiouy", *end--)) /* vowel */ |
352 |
|
|
&&(NULL == strchr("aeiou", *end))); /* consonant */ |
353 |
|
|
} |
354 |
|
|
|
355 |
|
|
} |
356 |
|
|
|
357 |
|
|
/* |
358 |
|
|
AddAnE( word ) |
359 |
|
|
|
360 |
|
|
Returns: int -- TRUE (1) if the current word meets special conditions |
361 |
|
|
for adding an e. |
362 |
|
|
|
363 |
|
|
Purpose: Rule 122 applies only to a root with this characteristic. |
364 |
|
|
|
365 |
|
|
Plan: Check for size of 1 and a consonant-vowel-consonant ending. |
366 |
|
|
|
367 |
|
|
Notes: None |
368 |
|
|
*/ |
369 |
|
|
|
370 |
|
|
static int |
371 |
|
|
AddAnE(word) |
372 |
|
|
char *word; |
373 |
|
|
{ |
374 |
|
|
return ((1 == WordSize(word)) && EndsWithCVC(word)); |
375 |
|
|
} |
376 |
|
|
|
377 |
|
|
/* |
378 |
|
|
RemoveAnE( word ) |
379 |
|
|
|
380 |
|
|
Returns: int -- TRUE (1) if the current word meets special conditions |
381 |
|
|
for removing an e. |
382 |
|
|
|
383 |
|
|
Purpose: Rule 502 applies only to a root with this characteristic. |
384 |
|
|
|
385 |
|
|
Plan: Check for size of 1 and no consonant-vowel-consonant ending. |
386 |
|
|
|
387 |
|
|
Notes: None |
388 |
|
|
*/ |
389 |
|
|
|
390 |
|
|
static int |
391 |
|
|
RemoveAnE(word) |
392 |
|
|
char *word; |
393 |
|
|
{ |
394 |
|
|
return ((1 == WordSize(word)) && !EndsWithCVC(word)); |
395 |
|
|
} /* RemoveAnE */ |
396 |
|
|
|
397 |
|
|
/* |
398 |
|
|
ReplaceEnd( word, rule ) |
399 |
|
|
|
400 |
|
|
Returns: int -- the id for the rule fired, 0 is none is fired |
401 |
|
|
|
402 |
|
|
Purpose: Apply a set of rules to replace the suffix of a word |
403 |
|
|
|
404 |
|
|
Plan: Loop through the rule set until a match meeting all conditions |
405 |
|
|
is found. If a rule fires, return its id, otherwise return 0. |
406 |
|
|
Connditions on the length of the root are checked as part of this |
407 |
|
|
function's processing because this check is so often made. |
408 |
|
|
|
409 |
|
|
Notes: This is the main routine driving the stemmer. It goes through |
410 |
|
|
a set of suffix replacement rules looking for a match on the |
411 |
|
|
current suffix. When it finds one, if the root of the word |
412 |
|
|
is long enough, and it meets whatever other conditions are |
413 |
|
|
required, then the suffix is replaced, and the function returns. |
414 |
|
|
* */ |
415 |
|
|
|
416 |
|
|
static int |
417 |
|
|
ReplaceEnd(word, rule) |
418 |
|
|
char *word; /* in/out: buffer with the stemmed word */ |
419 |
|
|
RuleList *rule; /* in: data structure with replacement rules */ |
420 |
|
|
{ |
421 |
|
|
register char *ending; /* set to start of possible stemmed suffix */ |
422 |
|
|
char tmp_ch; /* save replaced character when testing */ |
423 |
|
|
|
424 |
|
|
while (0 != rule->id) { |
425 |
|
|
ending = end - rule->old_offset; |
426 |
|
|
if (word <= ending) |
427 |
|
|
if ((*ending == *rule->old_end) && /* for speed */ |
428 |
|
|
!strcmp(ending, rule->old_end)) { |
429 |
|
|
tmp_ch = *ending; |
430 |
|
|
*ending = EOS; |
431 |
|
|
if (rule->min_root_size < WordSize(word)) |
432 |
|
|
if (!rule->condition || (*rule->condition) (word)) { |
433 |
|
|
(void) strcat(word, rule->new_end); |
434 |
|
|
end = ending + rule->new_offset; |
435 |
|
|
break; |
436 |
|
|
} |
437 |
|
|
*ending = tmp_ch; |
438 |
|
|
} |
439 |
|
|
rule++; |
440 |
|
|
} |
441 |
|
|
|
442 |
|
|
return (rule->id); |
443 |
|
|
} |
444 |
|
|
|
445 |
|
|
/* |
446 |
|
|
Stem( word ) |
447 |
|
|
|
448 |
|
|
Returns: int -- FALSE (0) if the word contains non-alphabetic characters |
449 |
|
|
and hence is not stemmed, TRUE (1) otherwise |
450 |
|
|
|
451 |
|
|
Purpose: Stem a word |
452 |
|
|
|
453 |
|
|
Plan: Part 1: Check to ensure the word is all alphabetic |
454 |
|
|
Part 2: Run through the Porter algorithm |
455 |
|
|
Part 3: Return an indication of successful stemming |
456 |
|
|
|
457 |
|
|
Notes: This function implements the Porter stemming algorithm, with |
458 |
|
|
a few additions here and there. See: |
459 |
|
|
|
460 |
|
|
Porter, M.F., "An Algorithm For Suffix Stripping," |
461 |
|
|
Program 14 (3), July 1980, pp. 130-137. |
462 |
|
|
|
463 |
|
|
Porter's algorithm is an ad hoc set of rewrite rules with |
464 |
|
|
various conditions on rule firing. The terminology of |
465 |
|
|
"step 1a" and so on, is taken directly from Porter's |
466 |
|
|
article, which unfortunately gives almost no justification |
467 |
|
|
for the various steps. Thus this function more or less |
468 |
|
|
faithfully refects the opaque presentation in the article. |
469 |
|
|
Changes from the article amount to a few additions to the |
470 |
|
|
rewrite rules; these are marked in the RuleList data |
471 |
|
|
structures with comments. |
472 |
|
|
*/ |
473 |
|
|
|
474 |
|
|
int |
475 |
|
|
Stem(word) |
476 |
|
|
char *word; /* in/out: the word stemmed */ |
477 |
|
|
{ |
478 |
|
|
int rule; /* which rule is fired in replacing an end */ |
479 |
|
|
|
480 |
|
|
/* Part 1: Check to ensure the word is all alphabetic */ |
481 |
|
|
for (end = word; *end != EOS; end++) |
482 |
|
|
if (!isalpha(*end)) |
483 |
|
|
return (FALSE); |
484 |
|
|
else |
485 |
|
|
*end = tolower(*end); |
486 |
|
|
end--; |
487 |
|
|
|
488 |
|
|
/* Part 2: Run through the Porter algorithm */ |
489 |
|
|
(void) ReplaceEnd(word, step1a_rules); |
490 |
|
|
rule = ReplaceEnd(word, step1b_rules); |
491 |
|
|
if ((106 == rule) || (107 == rule)) |
492 |
|
|
(void) ReplaceEnd(word, step1b1_rules); |
493 |
|
|
(void) ReplaceEnd(word, step1c_rules); |
494 |
|
|
|
495 |
|
|
(void) ReplaceEnd(word, step2_rules); |
496 |
|
|
|
497 |
|
|
(void) ReplaceEnd(word, step3_rules); |
498 |
|
|
|
499 |
|
|
(void) ReplaceEnd(word, step4_rules); |
500 |
|
|
|
501 |
|
|
(void) ReplaceEnd(word, step5a_rules); |
502 |
|
|
(void) ReplaceEnd(word, step5b_rules); |
503 |
|
|
|
504 |
|
|
/* Part 3: Return an indication of successful stemming */ |
505 |
|
|
return (TRUE); |
506 |
|
|
} |