/[wait]/trunk/lib/WAIT/Filter.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WAIT/Filter.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 109 - (show annotations)
Tue Jul 13 17:50:27 2004 UTC (19 years, 9 months ago) by dpavlin
File size: 8957 byte(s)
pod fixes

1 # -*- Mode: Cperl -*-
2 # $Basename: Filter.pm $
3 # $Revision: 1.9 $
4 # ITIID : $ITI$ $Header $__Header$
5 # Author : Ulrich Pfeifer
6 # Created On : Thu Aug 15 18:09:51 1996
7 # Last Modified By: Ulrich Pfeifer
8 # Last Modified On: Sun Nov 22 18:44:46 1998
9 # Language : CPerl
10 # Update Count : 105
11 # Status : Unknown, Use with caution!
12 #
13 # Copyright (c) 1996-1997, Ulrich Pfeifer
14 #
15 package WAIT::Filter;
16 require WAIT;
17 use strict;
18 use Carp;
19 use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $AUTOLOAD);
20 use subs qw(grundform);
21
22 require Exporter;
23
24 @ISA = qw(Exporter);
25 @EXPORT_OK = qw(
26 Stem
27 Soundex
28 Phonix
29 Metaphone
30 isolc disolc
31 isouc disouc
32 isotr disotr
33 stop grundform
34 utf8iso
35 );
36 # (most implemented in WAIT.xs)
37
38 $VERSION = substr q$Revision: 1.9 $, 10;
39
40 sub split {
41 map split(' ', $_), @_;
42 }
43
44 $SPLIT = q[
45 sub splitXXX {
46 grep length($_)>=XXX, map split(' ', $_), @_;
47 }
48 ];
49
50 sub AUTOLOAD {
51 my $func = $AUTOLOAD; $func =~ s/.*:://;
52
53 if ($func =~ /split(\d+)/) {
54 my $num = $1;
55 my $split = $SPLIT;
56
57 $split =~ s/XXX/$num/g;
58 eval $split;
59 if ($@ eq '') {
60 goto &$AUTOLOAD;
61 }
62 } elsif ($func eq 'grundform') {
63 eval {require Text::German;};
64 croak "You must have Text::German to use 'grundform'"
65 if $@ ne '';
66 *grundform = Text::German->can('reduce');
67 goto &grundform;
68 } elsif ($func eq 'date') {
69 eval {require Time::ParseDate;};
70 croak "You must have Time::ParseDate to use 'date'"
71 if $@ ne '';
72 *date = Time::ParseDate->can('parsedate');
73 goto \&date;
74 } elsif ($func eq 'decode_entities') {
75 eval {require HTML::Entities;};
76 croak "You must have HTML::Entities to use 'date'"
77 if $@ ne '';
78 *decode_entities = HTML::Entities->can('decode_entities');
79 goto &decode_entities;
80 } elsif ($func =~ /^d?utf8iso$/) {
81 no strict 'refs';
82 *$func = sub {
83 # Courtesy JHI
84 my $s = shift;
85 $s =~ s{([\xC0-\xDF])([\x80-\xBF])}
86 {chr(ord($1)<<6&0xC0|ord($2)&0x3F)}eg;
87 $s;
88 };
89 goto \&$func;
90 }
91 Carp::confess "Class WAIT::Filter::$func not found";
92 }
93
94 while (<DATA>) {
95 chomp;
96 last if /__END__/;
97 next if /^\s*#/; # there's a comment
98 $STOP{$_}++;
99 }
100 close DATA;
101
102 sub stop {
103 if (exists $STOP{$_[0]}) {
104 ''
105 } else {
106 $_[0];
107 }
108 }
109
110 sub gdate {
111 my $date = shift;
112
113 $date =~ s:(\d+)\.(\d+)\.(d+):$2/$1/$3:;
114 date($date);
115 }
116
117 1;
118
119 __DATA__
120 a
121 about
122 above
123 according
124 across
125 actually
126 adj
127 after
128 afterwards
129 again
130 against
131 all
132 almost
133 alone
134 along
135 already
136 also
137 although
138 always
139 among
140 amongst
141 an
142 and
143 another
144 any
145 anyhow
146 anyone
147 anything
148 anywhere
149 are
150 aren't
151 around
152 as
153 at
154 b
155 be
156 became
157 because
158 become
159 becomes
160 becoming
161 been
162 before
163 beforehand
164 begin
165 beginning
166 behind
167 being
168 below
169 beside
170 besides
171 between
172 beyond
173 billion
174 both
175 but
176 by
177 c
178 can
179 can't
180 cannot
181 caption
182 co
183 co.
184 could
185 couldn't
186 d
187 did
188 didn't
189 do
190 does
191 doesn't
192 don't
193 down
194 during
195 e
196 eg
197 eight
198 eighty
199 either
200 else
201 elsewhere
202 end
203 ending
204 enough
205 etc
206 even
207 ever
208 every
209 everyone
210 everything
211 everywhere
212 except
213 f
214 few
215 fifty
216 first
217 five
218 vfor
219 former
220 formerly
221 forty
222 found
223 four
224 from
225 further
226 g
227 h
228 had
229 has
230 hasn't
231 have
232 haven't
233 he
234 he'd
235 he'll
236 he's
237 hence
238 her
239 here
240 here's
241 hereafter
242 hereby
243 herein
244 hereupon
245 hers
246 herself
247 him
248 himself
249 his
250 how
251 however
252 hundred
253 i
254 i'd
255 i'll
256 i'm
257 i've
258 ie
259 if
260 in
261 inc.
262 indeed
263 instead
264 into
265 is
266 isn't
267 it
268 it's
269 its
270 itself
271 j
272 k
273 l
274 last
275 later
276 latter
277 latterly
278 least
279 less
280 let
281 let's
282 like
283 likely
284 ltd
285 m
286 made
287 make
288 makes
289 many
290 maybe
291 me
292 meantime
293 meanwhile
294 might
295 million
296 miss
297 more
298 moreover
299 most
300 mostly
301 mr
302 mrs
303 much
304 must
305 my
306 myself
307 n
308 namely
309 neither
310 never
311 nevertheless
312 next
313 nine
314 ninety
315 no
316 nobody
317 none
318 nonetheless
319 noone
320 nor
321 not
322 nothing
323 now
324 nowhere
325 o
326 of
327 off
328 often
329 on
330 once
331 one
332 one's
333 only
334 onto
335 or
336 other
337 others
338 otherwise
339 our
340 ours
341 ourselves
342 out
343 over
344 overall
345 own
346 p
347 per
348 perhaps
349 q
350 r
351 rather
352 recent
353 recently
354 s
355 same
356 seem
357 seemed
358 seeming
359 seems
360 seven
361 seventy
362 several
363 she
364 she'd
365 she'll
366 she's
367 should
368 shouldn't
369 since
370 six
371 sixty
372 so
373 some
374 somehow
375 someone
376 something
377 sometime
378 sometimes
379 somewhere
380 still
381 stop
382 such
383 t
384 taking
385 ten
386 than
387 that
388 that'll
389 that's
390 that've
391 the
392 their
393 them
394 themselves
395 then
396 thence
397 there
398 there'd
399 there'll
400 there're
401 there's
402 there've
403 thereafter
404 thereby
405 therefore
406 therein
407 thereupon
408 these
409 they
410 they'd
411 they'll
412 they're
413 they've
414 thirty
415 this
416 those
417 though
418 thousand
419 three
420 through
421 throughout
422 thru
423 thus
424 to
425 together
426 too
427 toward
428 towards
429 trillion
430 twenty
431 two
432 u
433 under
434 unless
435 unlike
436 unlikely
437 until
438 up
439 upon
440 us
441 used
442 using
443 v
444 very
445 via
446 w
447 was
448 wasn't
449 we
450 we'd
451 we'll
452 we're
453 we've
454 well
455 were
456 weren't
457 what
458 what'll
459 what's
460 what've
461 whatever
462 when
463 whence
464 whenever
465 where
466 where's
467 whereafter
468 whereas
469 whereby
470 wherein
471 whereupon
472 wherever
473 whether
474 which
475 while
476 whither
477 who
478 who'd
479 who'll
480 who's
481 whoever
482 whole
483 whom
484 whomever
485 whose
486 why
487 will
488 with
489 within
490 without
491 won't
492 would
493 wouldn't
494 x
495 y
496 yes
497 yet
498 you
499 you'd
500 you'll
501 you're
502 you've
503 your
504 yours
505 yourself
506 yourselves
507 z
508 # occuring in more than 100 files
509 acc
510 accent
511 accents
512 and
513 are
514 bell
515 can
516 character
517 corrections
518 crt
519 daisy
520 dash
521 date
522 defined
523 definitions
524 description
525 devices
526 diablo
527 dummy
528 factors
529 following
530 font
531 for
532 from
533 fudge
534 give
535 have
536 header
537 holds
538 log
539 logo
540 low
541 lpr
542 mark
543 name
544 nroff
545 out
546 output
547 pitch
548 put
549 rcsfile
550 reference
551 resolution
552 revision
553 see
554 set
555 simple
556 smi
557 some
558 string
559 synopsis
560 system
561 that
562 the
563 this
564 translation
565 troff
566 typewriter
567 ucb
568 unbreakable
569 use
570 used
571 user
572 vroff
573 wheel
574 will
575 with
576 you
577 __END__
578
579 =head1 NAME
580
581 WAIT::Filter - Perl extension providing the basic freeWAIS-sf reduction functions
582
583 =head1 SYNOPSIS
584
585 use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc
586 isotr disotr stop grundform);
587
588 $stem = Stem($word);
589 $scode = Soundex($word);
590 $pcode = Phonix($word);
591 $lword = isolc($word);
592 disolc($word);
593 $uword = isouc($word);
594 disouc($word);
595 $trword = isotr($word);
596 disotr($word);
597 $word = stop($word);
598 $word = grundform($word);
599
600 @words = WAIT::Filter::split($word);
601 @words = WAIT::Filter::split2($word);
602 @words = WAIT::Filter::split3($word);
603 @words = WAIT::Filter::split4($word); # arbitrary numbers allowed
604
605 =head1 DESCRIPTION
606
607 This tiny modules gives access to the basic reduction functions build
608 in B<freeWAIS-sf>.
609
610 =over 5
611
612 =item B<Stem>(I<word>)
613
614 reduces I<word> using the well know Porter algorithm.
615
616 AU: Porter, M.F.
617 TI: An Algorithm for Suffix Stripping
618 JT: Program
619 VO: 14
620 PP: 130-137
621 PY: 1980
622 PM: JUL
623
624 =item B<Soundex>(I<word>)
625
626
627 computes the 4 byte B<Soundex> code for I<word>.
628
629 AU: Gadd, T.N.
630 TI: 'Fisching for Werds'. Phonetic Retrieval of written text in
631 Information Retrieval Systems
632 JT: Program
633 VO: 22
634 NO: 3
635 PP: 222-237
636 PY: 1988
637
638
639 =item B<Phonix>(I<word>)
640
641 computes the 8 byte B<Phonix> code for I<word>.
642
643 AU: Gadd, T.N.
644 TI: PHONIX: The Algorithm
645 JT: Program
646 VO: 24
647 NO: 4
648 PP: 363-366
649 PY: 1990
650 PM: OCT
651
652 =back
653
654 =head1 ISO charcater case functions
655
656 There are some additional function which transpose some/most ISOlatin1
657 characters to upper and lower case. To allow for maximum speed there
658 are also I<destructive> versions which change the argument instead of
659 allocating a copy which is returned. For convenience, the destructive
660 version also B<returns> the argument. So all of the following is
661 valid and C<$word> will contain the lowercased string.
662
663 $word = isolc($word);
664 $word = disolc($word);
665 disolc($word);
666
667 Here are the hardcoded characters which are recognized:
668
669 abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß
670 ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß
671
672 =over 5
673
674 =item C<$new = >B<isolc>C<($word)>
675
676 =item B<disolc>C<($word)>
677
678 transposes to lower case.
679
680 =item C<$new = >B<isouc>C<($word)>
681
682 =item B<disouc>C<($word)>
683
684 transposes to upper case.
685
686 =item C<$new = >B<isotr>C<($word)>
687
688 =item B<disotr>C<($word)>
689
690 Remove non-letters according to the above table.
691
692 =item C<$new = >B<stop>C<($word)>
693
694 Returns an empty string if $word is a stopword.
695
696 =item C<$new = >B<grundform>C<($word)>
697
698 Calls Text::German::reduce
699
700 =item C<$new = >B<utf8iso>C<($word)>
701
702 Deprecated due to flux in perl versions between 5.005 and 5.8. The
703 function converts UTF8 encoded strings to ISO-8859-1. WAIT is
704 internally still based on the Latin1 character set, so if you process
705 anything in a different encoding, you should convert to Latin1 as the
706 first filter or refrain from using the iso-latin-1 based filter
707 functions. It is recommended that you use your own converter based on
708 the perl version you're using.
709
710 =item split, split2, split3, ...
711
712 The splitN funtions all take a scalar as input and return a list of
713 words. Split acts just like the perl split(' '). Split2 eliminates all
714 words from the list that are shorter than 2 characters (bytes), split3
715 eliminates those shorter than 3 characters (bytes) and so on.
716
717 =back
718
719 =head1 AUTHOR
720
721 Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>
722
723 =head1 SEE ALSO
724
725 perl(1).
726
727 =cut
728

Properties

Name Value
cvs2svn:cvs-rev 1.4

  ViewVC Help
Powered by ViewVC 1.1.26