/[wait]/branches/unido/lib/WAIT/Filter.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /branches/unido/lib/WAIT/Filter.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 106 - (show annotations)
Tue Jul 13 12:22:09 2004 UTC (19 years, 9 months ago) by dpavlin
File size: 9007 byte(s)
Changes made by Andreas J. Koenig <andreas.koenig(at)anima.de> for Unido project

1 # -*- Mode: Cperl -*-
2 # $Basename: Filter.pm $
3 # $Revision: 1.9 $
4 # ITIID : $ITI$ $Header $__Header$
5 # Author : Ulrich Pfeifer
6 # Created On : Thu Aug 15 18:09:51 1996
7 # Last Modified By: Ulrich Pfeifer
8 # Last Modified On: Sun Nov 22 18:44:46 1998
9 # Language : CPerl
10 # Update Count : 105
11 # Status : Unknown, Use with caution!
12 #
13 # Copyright (c) 1996-1997, Ulrich Pfeifer
14 #
15 package WAIT::Filter;
16 require WAIT;
17 use strict;
18 use Carp;
19 use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $AUTOLOAD);
20 use subs qw(grundform);
21
22 require Exporter;
23
24 @ISA = qw(Exporter);
25 @EXPORT_OK = qw(
26 Stem
27 Soundex
28 Phonix
29 Metaphone
30 isolc disolc
31 isouc disouc
32 isotr disotr
33 stop grundform
34 utf8iso
35 );
36 # (most implemented in WAIT.xs)
37
38 $VERSION = substr q$Revision: 1.9 $, 10;
39
40 sub split {
41 map split(' ', $_), @_;
42 }
43
44 $SPLIT = q[
45 sub splitXXX {
46 grep length($_)>=XXX, map split(' ', $_), @_;
47 }
48 ];
49
50 sub AUTOLOAD {
51 my $func = $AUTOLOAD; $func =~ s/.*:://;
52
53 if ($func =~ /split(\d+)/) {
54 my $num = $1;
55 my $split = $SPLIT;
56
57 $split =~ s/XXX/$num/g;
58 eval $split;
59 if ($@ eq '') {
60 goto &$AUTOLOAD;
61 }
62 } elsif ($func eq 'grundform') {
63 eval {require Text::German;};
64 croak "You must have Text::German to use 'grundform'"
65 if $@ ne '';
66 *grundform = Text::German->can('reduce');
67 goto &grundform;
68 } elsif ($func eq 'date') {
69 eval {require Time::ParseDate;};
70 croak "You must have Time::ParseDate to use 'date'"
71 if $@ ne '';
72 *date = Time::ParseDate->can('parsedate');
73 goto \&date;
74 } elsif ($func eq 'decode_entities') {
75 eval {require HTML::Entities;};
76 croak "You must have HTML::Entities to use 'date'"
77 if $@ ne '';
78 *decode_entities = HTML::Entities->can('decode_entities');
79 goto &decode_entities;
80 } elsif ($func =~ /^d?utf8iso$/) {
81 no strict 'refs';
82 *$func = sub {
83 # Courtesy JHI
84 my $s = shift;
85 $s =~ s{([\xC0-\xDF])([\x80-\xBF])}
86 {chr(ord($1)<<6&0xC0|ord($2)&0x3F)}eg;
87 $s;
88 };
89 goto \&$func;
90 }
91 Carp::confess "Class WAIT::Filter::$func not found";
92 }
93
94 while (<DATA>) {
95 chomp;
96 last if /__END__/;
97 next if /^\s*#/; # there's a comment
98 $STOP{$_}++;
99 }
100 close DATA;
101
102 sub stop {
103 if (exists $STOP{$_[0]}) {
104 ''
105 } else {
106 $_[0];
107 }
108 }
109
110 sub gdate {
111 my $date = shift;
112
113 $date =~ s:(\d+)\.(\d+)\.(d+):$2/$1/$3:;
114 date($date);
115 }
116
117 1;
118 __DATA__
119 a
120 about
121 above
122 according
123 across
124 actually
125 adj
126 after
127 afterwards
128 again
129 against
130 all
131 almost
132 alone
133 along
134 already
135 also
136 although
137 always
138 among
139 amongst
140 an
141 and
142 another
143 any
144 anyhow
145 anyone
146 anything
147 anywhere
148 are
149 aren't
150 around
151 as
152 at
153 b
154 be
155 became
156 because
157 become
158 becomes
159 becoming
160 been
161 before
162 beforehand
163 begin
164 beginning
165 behind
166 being
167 below
168 beside
169 besides
170 between
171 beyond
172 billion
173 both
174 but
175 by
176 c
177 can
178 can't
179 cannot
180 caption
181 co
182 co.
183 could
184 couldn't
185 d
186 did
187 didn't
188 do
189 does
190 doesn't
191 don't
192 down
193 during
194 e
195 eg
196 eight
197 eighty
198 either
199 else
200 elsewhere
201 end
202 ending
203 enough
204 etc
205 even
206 ever
207 every
208 everyone
209 everything
210 everywhere
211 except
212 f
213 few
214 fifty
215 first
216 five
217 vfor
218 former
219 formerly
220 forty
221 found
222 four
223 from
224 further
225 g
226 h
227 had
228 has
229 hasn't
230 have
231 haven't
232 he
233 he'd
234 he'll
235 he's
236 hence
237 her
238 here
239 here's
240 hereafter
241 hereby
242 herein
243 hereupon
244 hers
245 herself
246 him
247 himself
248 his
249 how
250 however
251 hundred
252 i
253 i'd
254 i'll
255 i'm
256 i've
257 ie
258 if
259 in
260 inc.
261 indeed
262 instead
263 into
264 is
265 isn't
266 it
267 it's
268 its
269 itself
270 j
271 k
272 l
273 last
274 later
275 latter
276 latterly
277 least
278 less
279 let
280 let's
281 like
282 likely
283 ltd
284 m
285 made
286 make
287 makes
288 many
289 maybe
290 me
291 meantime
292 meanwhile
293 might
294 million
295 miss
296 more
297 moreover
298 most
299 mostly
300 mr
301 mrs
302 much
303 must
304 my
305 myself
306 n
307 namely
308 neither
309 never
310 nevertheless
311 next
312 nine
313 ninety
314 no
315 nobody
316 none
317 nonetheless
318 noone
319 nor
320 not
321 nothing
322 now
323 nowhere
324 o
325 of
326 off
327 often
328 on
329 once
330 one
331 one's
332 only
333 onto
334 or
335 other
336 others
337 otherwise
338 our
339 ours
340 ourselves
341 out
342 over
343 overall
344 own
345 p
346 per
347 perhaps
348 q
349 r
350 rather
351 recent
352 recently
353 s
354 same
355 seem
356 seemed
357 seeming
358 seems
359 seven
360 seventy
361 several
362 she
363 she'd
364 she'll
365 she's
366 should
367 shouldn't
368 since
369 six
370 sixty
371 so
372 some
373 somehow
374 someone
375 something
376 sometime
377 sometimes
378 somewhere
379 still
380 stop
381 such
382 t
383 taking
384 ten
385 than
386 that
387 that'll
388 that's
389 that've
390 the
391 their
392 them
393 themselves
394 then
395 thence
396 there
397 there'd
398 there'll
399 there're
400 there's
401 there've
402 thereafter
403 thereby
404 therefore
405 therein
406 thereupon
407 these
408 they
409 they'd
410 they'll
411 they're
412 they've
413 thirty
414 this
415 those
416 though
417 thousand
418 three
419 through
420 throughout
421 thru
422 thus
423 to
424 together
425 too
426 toward
427 towards
428 trillion
429 twenty
430 two
431 u
432 under
433 unless
434 unlike
435 unlikely
436 until
437 up
438 upon
439 us
440 used
441 using
442 v
443 very
444 via
445 w
446 was
447 wasn't
448 we
449 we'd
450 we'll
451 we're
452 we've
453 well
454 were
455 weren't
456 what
457 what'll
458 what's
459 what've
460 whatever
461 when
462 whence
463 whenever
464 where
465 where's
466 whereafter
467 whereas
468 whereby
469 wherein
470 whereupon
471 wherever
472 whether
473 which
474 while
475 whither
476 who
477 who'd
478 who'll
479 who's
480 whoever
481 whole
482 whom
483 whomever
484 whose
485 why
486 will
487 with
488 within
489 without
490 won't
491 would
492 wouldn't
493 x
494 y
495 yes
496 yet
497 you
498 you'd
499 you'll
500 you're
501 you've
502 your
503 yours
504 yourself
505 yourselves
506 z
507 # occuring in more than 100 files
508 acc
509 accent
510 accents
511 and
512 are
513 bell
514 can
515 character
516 corrections
517 crt
518 daisy
519 dash
520 date
521 defined
522 definitions
523 description
524 devices
525 diablo
526 dummy
527 factors
528 following
529 font
530 for
531 from
532 fudge
533 give
534 have
535 header
536 holds
537 log
538 logo
539 low
540 lpr
541 mark
542 name
543 nroff
544 out
545 output
546 pitch
547 put
548 rcsfile
549 reference
550 resolution
551 revision
552 see
553 set
554 simple
555 smi
556 some
557 string
558 synopsis
559 system
560 that
561 the
562 this
563 translation
564 troff
565 typewriter
566 ucb
567 unbreakable
568 use
569 used
570 user
571 vroff
572 wheel
573 will
574 with
575 you
576 __END__
577 # Below is the stub of documentation for your module. You better edit it!
578
579 =head1 NAME
580
581 WAIT::Filter - Perl extension providing the basic freeWAIS-sf reduction functions
582
583 =head1 SYNOPSIS
584
585 use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc
586 isotr disotr stop grundform);
587
588 $stem = Stem($word);
589 $scode = Soundex($word);
590 $pcode = Phonix($word);
591 $lword = isolc($word);
592 disolc($word);
593 $uword = isouc($word);
594 disouc($word);
595 $trword = isotr($word);
596 disotr($word);
597 $word = stop($word);
598 $word = grundform($word);
599
600 @words = WAIT::Filter::split($word);
601 @words = WAIT::Filter::split2($word);
602 @words = WAIT::Filter::split3($word);
603 @words = WAIT::Filter::split4($word); # arbitrary numbers allowed
604
605 =head1 DESCRIPTION
606
607 This tiny modules gives access to the basic reduction functions build
608 in B<freeWAIS-sf>.
609
610 =over 5
611
612 =item B<Stem>(I<word>)
613
614 reduces I<word> using the well know Porter algorithm.
615
616 AU: Porter, M.F.
617 TI: An Algorithm for Suffix Stripping
618 JT: Program
619 VO: 14
620 PP: 130-137
621 PY: 1980
622 PM: JUL
623
624 =item B<Soundex>(I<word>)
625
626
627 computes the 4 byte B<Soundex> code for I<word>.
628
629 AU: Gadd, T.N.
630 TI: 'Fisching for Werds'. Phonetic Retrieval of written text in
631 Information Retrieval Systems
632 JT: Program
633 VO: 22
634 NO: 3
635 PP: 222-237
636 PY: 1988
637
638
639 =item B<Phonix>(I<word>)
640
641 computes the 8 byte B<Phonix> code for I<word>.
642
643 AU: Gadd, T.N.
644 TI: PHONIX: The Algorithm
645 JT: Program
646 VO: 24
647 NO: 4
648 PP: 363-366
649 PY: 1990
650 PM: OCT
651
652 =head1 ISO charcater case functions
653
654 There are some additional function which transpose some/most ISOlatin1
655 characters to upper and lower case. To allow for maximum speed there
656 are also I<destructive> versions which change the argument instead of
657 allocating a copy which is returned. For convenience, the destructive
658 version also B<returns> the argument. So all of the following is
659 valid and C<$word> will contain the lowercased string.
660
661 $word = isolc($word);
662 $word = disolc($word);
663 disolc($word);
664
665 Here are the hardcoded characters which are recognized:
666
667 abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß
668 ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß
669
670 =item C<$new = >B<isolc>C<($word)>
671
672 =item B<disolc>C<($word)>
673
674 transposes to lower case.
675
676 =item C<$new = >B<isouc>C<($word)>
677
678 =item B<disouc>C<($word)>
679
680 transposes to upper case.
681
682 =item C<$new = >B<isotr>C<($word)>
683
684 =item B<disotr>C<($word)>
685
686 Remove non-letters according to the above table.
687
688 =item C<$new = >B<stop>C<($word)>
689
690 Returns an empty string if $word is a stopword.
691
692 =item C<$new = >B<grundform>C<($word)>
693
694 Calls Text::German::reduce
695
696 =item C<$new = >B<utf8iso>C<($word)>
697
698 Deprecated due to flux in perl versions between 5.005 and 5.8. The
699 function converts UTF8 encoded strings to ISO-8859-1. WAIT is
700 internally still based on the Latin1 character set, so if you process
701 anything in a different encoding, you should convert to Latin1 as the
702 first filter or refrain from using the iso-latin-1 based filter
703 functions. It is recommended that you use your own converter based on
704 the perl version you're using.
705
706 =item split, split2, split3, ...
707
708 The splitN funtions all take a scalar as input and return a list of
709 words. Split acts just like the perl split(' '). Split2 eliminates all
710 words from the list that are shorter than 2 characters (bytes), split3
711 eliminates those shorter than 3 characters (bytes) and so on.
712
713 =head1 AUTHOR
714
715 Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>
716
717 =head1 SEE ALSO
718
719 perl(1).
720
721 =cut
722

  ViewVC Help
Powered by ViewVC 1.1.26