1 |
# -*- Mode: Cperl -*- |
2 |
# $Basename: Filter.pm $ |
3 |
# $Revision: 1.9 $ |
4 |
# ITIID : $ITI$ $Header $__Header$ |
5 |
# Author : Ulrich Pfeifer |
6 |
# Created On : Thu Aug 15 18:09:51 1996 |
7 |
# Last Modified By: Ulrich Pfeifer |
8 |
# Last Modified On: Sun Nov 22 18:44:46 1998 |
9 |
# Language : CPerl |
10 |
# Update Count : 105 |
11 |
# Status : Unknown, Use with caution! |
12 |
# |
13 |
# Copyright (c) 1996-1997, Ulrich Pfeifer |
14 |
# |
15 |
package WAIT::Filter; |
16 |
require WAIT; |
17 |
use strict; |
18 |
use Carp; |
19 |
use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $AUTOLOAD); |
20 |
use subs qw(grundform); |
21 |
|
22 |
require Exporter; |
23 |
|
24 |
@ISA = qw(Exporter); |
25 |
@EXPORT_OK = qw( |
26 |
Stem |
27 |
Soundex |
28 |
Phonix |
29 |
Metaphone |
30 |
isolc disolc |
31 |
isouc disouc |
32 |
isotr disotr |
33 |
stop grundform |
34 |
utf8iso |
35 |
); |
36 |
# (most implemented in WAIT.xs) |
37 |
|
38 |
$VERSION = substr q$Revision: 1.9 $, 10; |
39 |
|
40 |
sub split { |
41 |
map split(' ', $_), @_; |
42 |
} |
43 |
|
44 |
$SPLIT = q[ |
45 |
sub splitXXX { |
46 |
grep length($_)>=XXX, map split(' ', $_), @_; |
47 |
} |
48 |
]; |
49 |
|
50 |
sub AUTOLOAD { |
51 |
my $func = $AUTOLOAD; $func =~ s/.*:://; |
52 |
|
53 |
if ($func =~ /split(\d+)/) { |
54 |
my $num = $1; |
55 |
my $split = $SPLIT; |
56 |
|
57 |
$split =~ s/XXX/$num/g; |
58 |
eval $split; |
59 |
if ($@ eq '') { |
60 |
goto &$AUTOLOAD; |
61 |
} |
62 |
} elsif ($func eq 'grundform') { |
63 |
eval {require Text::German;}; |
64 |
croak "You must have Text::German to use 'grundform'" |
65 |
if $@ ne ''; |
66 |
*grundform = Text::German->can('reduce'); |
67 |
goto &grundform; |
68 |
} elsif ($func eq 'date') { |
69 |
eval {require Time::ParseDate;}; |
70 |
croak "You must have Time::ParseDate to use 'date'" |
71 |
if $@ ne ''; |
72 |
*date = Time::ParseDate->can('parsedate'); |
73 |
goto \&date; |
74 |
} elsif ($func eq 'decode_entities') { |
75 |
eval {require HTML::Entities;}; |
76 |
croak "You must have HTML::Entities to use 'date'" |
77 |
if $@ ne ''; |
78 |
*decode_entities = HTML::Entities->can('decode_entities'); |
79 |
goto &decode_entities; |
80 |
} elsif ($func =~ /^d?utf8iso$/) { |
81 |
no strict 'refs'; |
82 |
*$func = sub { |
83 |
# Courtesy JHI |
84 |
my $s = shift; |
85 |
$s =~ s{([\xC0-\xDF])([\x80-\xBF])} |
86 |
{chr(ord($1)<<6&0xC0|ord($2)&0x3F)}eg; |
87 |
$s; |
88 |
}; |
89 |
goto \&$func; |
90 |
} |
91 |
Carp::confess "Class WAIT::Filter::$func not found"; |
92 |
} |
93 |
|
94 |
while (<DATA>) { |
95 |
chomp; |
96 |
last if /__END__/; |
97 |
next if /^\s*#/; # there's a comment |
98 |
$STOP{$_}++; |
99 |
} |
100 |
close DATA; |
101 |
|
102 |
sub stop { |
103 |
if (exists $STOP{$_[0]}) { |
104 |
'' |
105 |
} else { |
106 |
$_[0]; |
107 |
} |
108 |
} |
109 |
|
110 |
sub gdate { |
111 |
my $date = shift; |
112 |
|
113 |
$date =~ s:(\d+)\.(\d+)\.(d+):$2/$1/$3:; |
114 |
date($date); |
115 |
} |
116 |
|
117 |
1; |
118 |
|
119 |
__DATA__ |
120 |
a |
121 |
about |
122 |
above |
123 |
according |
124 |
across |
125 |
actually |
126 |
adj |
127 |
after |
128 |
afterwards |
129 |
again |
130 |
against |
131 |
all |
132 |
almost |
133 |
alone |
134 |
along |
135 |
already |
136 |
also |
137 |
although |
138 |
always |
139 |
among |
140 |
amongst |
141 |
an |
142 |
and |
143 |
another |
144 |
any |
145 |
anyhow |
146 |
anyone |
147 |
anything |
148 |
anywhere |
149 |
are |
150 |
aren't |
151 |
around |
152 |
as |
153 |
at |
154 |
b |
155 |
be |
156 |
became |
157 |
because |
158 |
become |
159 |
becomes |
160 |
becoming |
161 |
been |
162 |
before |
163 |
beforehand |
164 |
begin |
165 |
beginning |
166 |
behind |
167 |
being |
168 |
below |
169 |
beside |
170 |
besides |
171 |
between |
172 |
beyond |
173 |
billion |
174 |
both |
175 |
but |
176 |
by |
177 |
c |
178 |
can |
179 |
can't |
180 |
cannot |
181 |
caption |
182 |
co |
183 |
co. |
184 |
could |
185 |
couldn't |
186 |
d |
187 |
did |
188 |
didn't |
189 |
do |
190 |
does |
191 |
doesn't |
192 |
don't |
193 |
down |
194 |
during |
195 |
e |
196 |
eg |
197 |
eight |
198 |
eighty |
199 |
either |
200 |
else |
201 |
elsewhere |
202 |
end |
203 |
ending |
204 |
enough |
205 |
etc |
206 |
even |
207 |
ever |
208 |
every |
209 |
everyone |
210 |
everything |
211 |
everywhere |
212 |
except |
213 |
f |
214 |
few |
215 |
fifty |
216 |
first |
217 |
five |
218 |
vfor |
219 |
former |
220 |
formerly |
221 |
forty |
222 |
found |
223 |
four |
224 |
from |
225 |
further |
226 |
g |
227 |
h |
228 |
had |
229 |
has |
230 |
hasn't |
231 |
have |
232 |
haven't |
233 |
he |
234 |
he'd |
235 |
he'll |
236 |
he's |
237 |
hence |
238 |
her |
239 |
here |
240 |
here's |
241 |
hereafter |
242 |
hereby |
243 |
herein |
244 |
hereupon |
245 |
hers |
246 |
herself |
247 |
him |
248 |
himself |
249 |
his |
250 |
how |
251 |
however |
252 |
hundred |
253 |
i |
254 |
i'd |
255 |
i'll |
256 |
i'm |
257 |
i've |
258 |
ie |
259 |
if |
260 |
in |
261 |
inc. |
262 |
indeed |
263 |
instead |
264 |
into |
265 |
is |
266 |
isn't |
267 |
it |
268 |
it's |
269 |
its |
270 |
itself |
271 |
j |
272 |
k |
273 |
l |
274 |
last |
275 |
later |
276 |
latter |
277 |
latterly |
278 |
least |
279 |
less |
280 |
let |
281 |
let's |
282 |
like |
283 |
likely |
284 |
ltd |
285 |
m |
286 |
made |
287 |
make |
288 |
makes |
289 |
many |
290 |
maybe |
291 |
me |
292 |
meantime |
293 |
meanwhile |
294 |
might |
295 |
million |
296 |
miss |
297 |
more |
298 |
moreover |
299 |
most |
300 |
mostly |
301 |
mr |
302 |
mrs |
303 |
much |
304 |
must |
305 |
my |
306 |
myself |
307 |
n |
308 |
namely |
309 |
neither |
310 |
never |
311 |
nevertheless |
312 |
next |
313 |
nine |
314 |
ninety |
315 |
no |
316 |
nobody |
317 |
none |
318 |
nonetheless |
319 |
noone |
320 |
nor |
321 |
not |
322 |
nothing |
323 |
now |
324 |
nowhere |
325 |
o |
326 |
of |
327 |
off |
328 |
often |
329 |
on |
330 |
once |
331 |
one |
332 |
one's |
333 |
only |
334 |
onto |
335 |
or |
336 |
other |
337 |
others |
338 |
otherwise |
339 |
our |
340 |
ours |
341 |
ourselves |
342 |
out |
343 |
over |
344 |
overall |
345 |
own |
346 |
p |
347 |
per |
348 |
perhaps |
349 |
q |
350 |
r |
351 |
rather |
352 |
recent |
353 |
recently |
354 |
s |
355 |
same |
356 |
seem |
357 |
seemed |
358 |
seeming |
359 |
seems |
360 |
seven |
361 |
seventy |
362 |
several |
363 |
she |
364 |
she'd |
365 |
she'll |
366 |
she's |
367 |
should |
368 |
shouldn't |
369 |
since |
370 |
six |
371 |
sixty |
372 |
so |
373 |
some |
374 |
somehow |
375 |
someone |
376 |
something |
377 |
sometime |
378 |
sometimes |
379 |
somewhere |
380 |
still |
381 |
stop |
382 |
such |
383 |
t |
384 |
taking |
385 |
ten |
386 |
than |
387 |
that |
388 |
that'll |
389 |
that's |
390 |
that've |
391 |
the |
392 |
their |
393 |
them |
394 |
themselves |
395 |
then |
396 |
thence |
397 |
there |
398 |
there'd |
399 |
there'll |
400 |
there're |
401 |
there's |
402 |
there've |
403 |
thereafter |
404 |
thereby |
405 |
therefore |
406 |
therein |
407 |
thereupon |
408 |
these |
409 |
they |
410 |
they'd |
411 |
they'll |
412 |
they're |
413 |
they've |
414 |
thirty |
415 |
this |
416 |
those |
417 |
though |
418 |
thousand |
419 |
three |
420 |
through |
421 |
throughout |
422 |
thru |
423 |
thus |
424 |
to |
425 |
together |
426 |
too |
427 |
toward |
428 |
towards |
429 |
trillion |
430 |
twenty |
431 |
two |
432 |
u |
433 |
under |
434 |
unless |
435 |
unlike |
436 |
unlikely |
437 |
until |
438 |
up |
439 |
upon |
440 |
us |
441 |
used |
442 |
using |
443 |
v |
444 |
very |
445 |
via |
446 |
w |
447 |
was |
448 |
wasn't |
449 |
we |
450 |
we'd |
451 |
we'll |
452 |
we're |
453 |
we've |
454 |
well |
455 |
were |
456 |
weren't |
457 |
what |
458 |
what'll |
459 |
what's |
460 |
what've |
461 |
whatever |
462 |
when |
463 |
whence |
464 |
whenever |
465 |
where |
466 |
where's |
467 |
whereafter |
468 |
whereas |
469 |
whereby |
470 |
wherein |
471 |
whereupon |
472 |
wherever |
473 |
whether |
474 |
which |
475 |
while |
476 |
whither |
477 |
who |
478 |
who'd |
479 |
who'll |
480 |
who's |
481 |
whoever |
482 |
whole |
483 |
whom |
484 |
whomever |
485 |
whose |
486 |
why |
487 |
will |
488 |
with |
489 |
within |
490 |
without |
491 |
won't |
492 |
would |
493 |
wouldn't |
494 |
x |
495 |
y |
496 |
yes |
497 |
yet |
498 |
you |
499 |
you'd |
500 |
you'll |
501 |
you're |
502 |
you've |
503 |
your |
504 |
yours |
505 |
yourself |
506 |
yourselves |
507 |
z |
508 |
# occuring in more than 100 files |
509 |
acc |
510 |
accent |
511 |
accents |
512 |
and |
513 |
are |
514 |
bell |
515 |
can |
516 |
character |
517 |
corrections |
518 |
crt |
519 |
daisy |
520 |
dash |
521 |
date |
522 |
defined |
523 |
definitions |
524 |
description |
525 |
devices |
526 |
diablo |
527 |
dummy |
528 |
factors |
529 |
following |
530 |
font |
531 |
for |
532 |
from |
533 |
fudge |
534 |
give |
535 |
have |
536 |
header |
537 |
holds |
538 |
log |
539 |
logo |
540 |
low |
541 |
lpr |
542 |
mark |
543 |
name |
544 |
nroff |
545 |
out |
546 |
output |
547 |
pitch |
548 |
put |
549 |
rcsfile |
550 |
reference |
551 |
resolution |
552 |
revision |
553 |
see |
554 |
set |
555 |
simple |
556 |
smi |
557 |
some |
558 |
string |
559 |
synopsis |
560 |
system |
561 |
that |
562 |
the |
563 |
this |
564 |
translation |
565 |
troff |
566 |
typewriter |
567 |
ucb |
568 |
unbreakable |
569 |
use |
570 |
used |
571 |
user |
572 |
vroff |
573 |
wheel |
574 |
will |
575 |
with |
576 |
you |
577 |
__END__ |
578 |
|
579 |
=head1 NAME |
580 |
|
581 |
WAIT::Filter - Perl extension providing the basic freeWAIS-sf reduction functions |
582 |
|
583 |
=head1 SYNOPSIS |
584 |
|
585 |
use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc |
586 |
isotr disotr stop grundform); |
587 |
|
588 |
$stem = Stem($word); |
589 |
$scode = Soundex($word); |
590 |
$pcode = Phonix($word); |
591 |
$lword = isolc($word); |
592 |
disolc($word); |
593 |
$uword = isouc($word); |
594 |
disouc($word); |
595 |
$trword = isotr($word); |
596 |
disotr($word); |
597 |
$word = stop($word); |
598 |
$word = grundform($word); |
599 |
|
600 |
@words = WAIT::Filter::split($word); |
601 |
@words = WAIT::Filter::split2($word); |
602 |
@words = WAIT::Filter::split3($word); |
603 |
@words = WAIT::Filter::split4($word); # arbitrary numbers allowed |
604 |
|
605 |
=head1 DESCRIPTION |
606 |
|
607 |
This tiny modules gives access to the basic reduction functions build |
608 |
in B<freeWAIS-sf>. |
609 |
|
610 |
=over 5 |
611 |
|
612 |
=item B<Stem>(I<word>) |
613 |
|
614 |
reduces I<word> using the well know Porter algorithm. |
615 |
|
616 |
AU: Porter, M.F. |
617 |
TI: An Algorithm for Suffix Stripping |
618 |
JT: Program |
619 |
VO: 14 |
620 |
PP: 130-137 |
621 |
PY: 1980 |
622 |
PM: JUL |
623 |
|
624 |
=item B<Soundex>(I<word>) |
625 |
|
626 |
|
627 |
computes the 4 byte B<Soundex> code for I<word>. |
628 |
|
629 |
AU: Gadd, T.N. |
630 |
TI: 'Fisching for Werds'. Phonetic Retrieval of written text in |
631 |
Information Retrieval Systems |
632 |
JT: Program |
633 |
VO: 22 |
634 |
NO: 3 |
635 |
PP: 222-237 |
636 |
PY: 1988 |
637 |
|
638 |
|
639 |
=item B<Phonix>(I<word>) |
640 |
|
641 |
computes the 8 byte B<Phonix> code for I<word>. |
642 |
|
643 |
AU: Gadd, T.N. |
644 |
TI: PHONIX: The Algorithm |
645 |
JT: Program |
646 |
VO: 24 |
647 |
NO: 4 |
648 |
PP: 363-366 |
649 |
PY: 1990 |
650 |
PM: OCT |
651 |
|
652 |
=back |
653 |
|
654 |
=head1 ISO charcater case functions |
655 |
|
656 |
There are some additional function which transpose some/most ISOlatin1 |
657 |
characters to upper and lower case. To allow for maximum speed there |
658 |
are also I<destructive> versions which change the argument instead of |
659 |
allocating a copy which is returned. For convenience, the destructive |
660 |
version also B<returns> the argument. So all of the following is |
661 |
valid and C<$word> will contain the lowercased string. |
662 |
|
663 |
$word = isolc($word); |
664 |
$word = disolc($word); |
665 |
disolc($word); |
666 |
|
667 |
Here are the hardcoded characters which are recognized: |
668 |
|
669 |
abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß |
670 |
ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß |
671 |
|
672 |
=over 5 |
673 |
|
674 |
=item C<$new = >B<isolc>C<($word)> |
675 |
|
676 |
=item B<disolc>C<($word)> |
677 |
|
678 |
transposes to lower case. |
679 |
|
680 |
=item C<$new = >B<isouc>C<($word)> |
681 |
|
682 |
=item B<disouc>C<($word)> |
683 |
|
684 |
transposes to upper case. |
685 |
|
686 |
=item C<$new = >B<isotr>C<($word)> |
687 |
|
688 |
=item B<disotr>C<($word)> |
689 |
|
690 |
Remove non-letters according to the above table. |
691 |
|
692 |
=item C<$new = >B<stop>C<($word)> |
693 |
|
694 |
Returns an empty string if $word is a stopword. |
695 |
|
696 |
=item C<$new = >B<grundform>C<($word)> |
697 |
|
698 |
Calls Text::German::reduce |
699 |
|
700 |
=item C<$new = >B<utf8iso>C<($word)> |
701 |
|
702 |
Deprecated due to flux in perl versions between 5.005 and 5.8. The |
703 |
function converts UTF8 encoded strings to ISO-8859-1. WAIT is |
704 |
internally still based on the Latin1 character set, so if you process |
705 |
anything in a different encoding, you should convert to Latin1 as the |
706 |
first filter or refrain from using the iso-latin-1 based filter |
707 |
functions. It is recommended that you use your own converter based on |
708 |
the perl version you're using. |
709 |
|
710 |
=item split, split2, split3, ... |
711 |
|
712 |
The splitN funtions all take a scalar as input and return a list of |
713 |
words. Split acts just like the perl split(' '). Split2 eliminates all |
714 |
words from the list that are shorter than 2 characters (bytes), split3 |
715 |
eliminates those shorter than 3 characters (bytes) and so on. |
716 |
|
717 |
=back |
718 |
|
719 |
=head1 AUTHOR |
720 |
|
721 |
Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt> |
722 |
|
723 |
=head1 SEE ALSO |
724 |
|
725 |
perl(1). |
726 |
|
727 |
=cut |
728 |
|