/[wait]/trunk/lib/WAIT/Filter.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WAIT/Filter.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 118 - (show annotations)
Fri Jul 15 18:59:10 2005 UTC (18 years, 9 months ago) by dpavlin
File size: 9845 byte(s)
some rather old changes from 2004-05-28

1 # -*- Mode: Cperl -*-
2 # $Basename: Filter.pm $
3 # $Revision: 1.9 $
4 # ITIID : $ITI$ $Header $__Header$
5 # Author : Ulrich Pfeifer
6 # Created On : Thu Aug 15 18:09:51 1996
7 # Last Modified By: Ulrich Pfeifer
8 # Last Modified On: Sun Nov 22 18:44:46 1998
9 # Language : CPerl
10 # Update Count : 105
11 # Status : Unknown, Use with caution!
12 #
13 # Copyright (c) 1996-1997, Ulrich Pfeifer
14 #
15 package WAIT::Filter;
16 require WAIT;
17 use strict;
18 use Carp;
19 use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $UNAC $ICONV $AUTOLOAD);
20 use subs qw(grundform);
21
22 use Text::Unaccent;
23 use Text::Iconv;
24
25 require Exporter;
26
27 @ISA = qw(Exporter);
28 @EXPORT_OK = qw(
29 Stem
30 Soundex
31 Phonix
32 Metaphone
33 isolc disolc
34 isouc disouc
35 isotr disotr
36 stop grundform
37 utf8iso
38 );
39 # (most implemented in WAIT.xs)
40
41 $VERSION = substr q$Revision: 1.9 $, 10;
42
43 sub split {
44 map split(' ', $_), @_;
45 }
46
47 $SPLIT = q[
48 sub splitXXX {
49 grep length($_)>=XXX, map split(' ', $_), @_;
50 }
51 ];
52
53 $UNAC = q[
54 sub unac_CHARSET {
55 map split(' ',unac_string('CHARSET', $_) || $_), @_;
56 }
57 ];
58
59 my $iconv;
60
61 $ICONV = q[
62 sub iconv_CHARSETfrom_CHARSETto {
63 my $ic = $iconv->{'CHARSETfrom_CHARSETto'});
64 $ic ||= $iconv->{'CHARSETfrom_CHARSETto'} = Text::Iconv->new('CHARSETfrom','CHARSETto');
65 map split(' ',$ic->convert($_) || $_), @_;
66 }
67 ];
68
69
70 sub AUTOLOAD {
71 my $func = $AUTOLOAD; $func =~ s/.*:://;
72
73 if ($func =~ /split(\d+)/) {
74 my $num = $1;
75 my $split = $SPLIT;
76
77 $split =~ s/XXX/$num/g;
78 eval $split;
79 if ($@ eq '') {
80 goto &$AUTOLOAD;
81 }
82 } elsif ($func eq 'grundform') {
83 eval {require Text::German;};
84 croak "You must have Text::German to use 'grundform'"
85 if $@ ne '';
86 *grundform = Text::German->can('reduce');
87 goto &grundform;
88 } elsif ($func eq 'date') {
89 eval {require Time::ParseDate;};
90 croak "You must have Time::ParseDate to use 'date'"
91 if $@ ne '';
92 *date = Time::ParseDate->can('parsedate');
93 goto \&date;
94 } elsif ($func eq 'decode_entities') {
95 eval {require HTML::Entities;};
96 croak "You must have HTML::Entities to use 'decode_entities'"
97 if $@ ne '';
98 *decode_entities = HTML::Entities->can('decode_entities');
99 goto &decode_entities;
100 } elsif ($func =~ /^d?utf8iso$/) {
101 no strict 'refs';
102 *$func = sub {
103 # Courtesy JHI
104 my $s = shift;
105 $s =~ s{([\xC0-\xDF])([\x80-\xBF])}
106 {chr(ord($1)<<6&0xC0|ord($2)&0x3F)}eg;
107 $s;
108 };
109 goto \&$func;
110 } elsif ($func =~ /unac_(.+)/) {
111 my $charset = $1;
112 my $unac = $UNAC;
113 $unac =~ s/CHARSET/$charset/g;
114 print "### $unac ###\n";
115 eval $unac;
116 if ($@ eq '') {
117 goto &$func;
118 }
119 } elsif ($func =~ /iconv_([^_]+)_([^_]+)/) {
120 my ($cf,$ct) = ($1,$2);
121 my $iconv = $ICONV;
122 print "### $cf -> $ct\n";
123 $iconv =~ s/CHARSETfrom/$cf/gs;
124 $iconv =~ s/CHARSETto/$ct/gs;
125 print "### $iconv ###\n";
126 eval $iconv;
127 if ($@ eq '') {
128 goto &$func;
129 }
130 }
131 Carp::confess "Class WAIT::Filter::$func not found";
132 }
133
134 while (<DATA>) {
135 chomp;
136 last if /__END__/;
137 next if /^\s*#/; # there's a comment
138 $STOP{$_}++;
139 }
140 close DATA;
141
142 sub stop {
143 if (exists $STOP{$_[0]}) {
144 ''
145 } else {
146 $_[0];
147 }
148 }
149
150 sub gdate {
151 my $date = shift;
152
153 $date =~ s:(\d+)\.(\d+)\.(d+):$2/$1/$3:;
154 date($date);
155 }
156
157 1;
158
159 __DATA__
160 a
161 about
162 above
163 according
164 across
165 actually
166 adj
167 after
168 afterwards
169 again
170 against
171 all
172 almost
173 alone
174 along
175 already
176 also
177 although
178 always
179 among
180 amongst
181 an
182 and
183 another
184 any
185 anyhow
186 anyone
187 anything
188 anywhere
189 are
190 aren't
191 around
192 as
193 at
194 b
195 be
196 became
197 because
198 become
199 becomes
200 becoming
201 been
202 before
203 beforehand
204 begin
205 beginning
206 behind
207 being
208 below
209 beside
210 besides
211 between
212 beyond
213 billion
214 both
215 but
216 by
217 c
218 can
219 can't
220 cannot
221 caption
222 co
223 co.
224 could
225 couldn't
226 d
227 did
228 didn't
229 do
230 does
231 doesn't
232 don't
233 down
234 during
235 e
236 eg
237 eight
238 eighty
239 either
240 else
241 elsewhere
242 end
243 ending
244 enough
245 etc
246 even
247 ever
248 every
249 everyone
250 everything
251 everywhere
252 except
253 f
254 few
255 fifty
256 first
257 five
258 vfor
259 former
260 formerly
261 forty
262 found
263 four
264 from
265 further
266 g
267 h
268 had
269 has
270 hasn't
271 have
272 haven't
273 he
274 he'd
275 he'll
276 he's
277 hence
278 her
279 here
280 here's
281 hereafter
282 hereby
283 herein
284 hereupon
285 hers
286 herself
287 him
288 himself
289 his
290 how
291 however
292 hundred
293 i
294 i'd
295 i'll
296 i'm
297 i've
298 ie
299 if
300 in
301 inc.
302 indeed
303 instead
304 into
305 is
306 isn't
307 it
308 it's
309 its
310 itself
311 j
312 k
313 l
314 last
315 later
316 latter
317 latterly
318 least
319 less
320 let
321 let's
322 like
323 likely
324 ltd
325 m
326 made
327 make
328 makes
329 many
330 maybe
331 me
332 meantime
333 meanwhile
334 might
335 million
336 miss
337 more
338 moreover
339 most
340 mostly
341 mr
342 mrs
343 much
344 must
345 my
346 myself
347 n
348 namely
349 neither
350 never
351 nevertheless
352 next
353 nine
354 ninety
355 no
356 nobody
357 none
358 nonetheless
359 noone
360 nor
361 not
362 nothing
363 now
364 nowhere
365 o
366 of
367 off
368 often
369 on
370 once
371 one
372 one's
373 only
374 onto
375 or
376 other
377 others
378 otherwise
379 our
380 ours
381 ourselves
382 out
383 over
384 overall
385 own
386 p
387 per
388 perhaps
389 q
390 r
391 rather
392 recent
393 recently
394 s
395 same
396 seem
397 seemed
398 seeming
399 seems
400 seven
401 seventy
402 several
403 she
404 she'd
405 she'll
406 she's
407 should
408 shouldn't
409 since
410 six
411 sixty
412 so
413 some
414 somehow
415 someone
416 something
417 sometime
418 sometimes
419 somewhere
420 still
421 stop
422 such
423 t
424 taking
425 ten
426 than
427 that
428 that'll
429 that's
430 that've
431 the
432 their
433 them
434 themselves
435 then
436 thence
437 there
438 there'd
439 there'll
440 there're
441 there's
442 there've
443 thereafter
444 thereby
445 therefore
446 therein
447 thereupon
448 these
449 they
450 they'd
451 they'll
452 they're
453 they've
454 thirty
455 this
456 those
457 though
458 thousand
459 three
460 through
461 throughout
462 thru
463 thus
464 to
465 together
466 too
467 toward
468 towards
469 trillion
470 twenty
471 two
472 u
473 under
474 unless
475 unlike
476 unlikely
477 until
478 up
479 upon
480 us
481 used
482 using
483 v
484 very
485 via
486 w
487 was
488 wasn't
489 we
490 we'd
491 we'll
492 we're
493 we've
494 well
495 were
496 weren't
497 what
498 what'll
499 what's
500 what've
501 whatever
502 when
503 whence
504 whenever
505 where
506 where's
507 whereafter
508 whereas
509 whereby
510 wherein
511 whereupon
512 wherever
513 whether
514 which
515 while
516 whither
517 who
518 who'd
519 who'll
520 who's
521 whoever
522 whole
523 whom
524 whomever
525 whose
526 why
527 will
528 with
529 within
530 without
531 won't
532 would
533 wouldn't
534 x
535 y
536 yes
537 yet
538 you
539 you'd
540 you'll
541 you're
542 you've
543 your
544 yours
545 yourself
546 yourselves
547 z
548 # occuring in more than 100 files
549 acc
550 accent
551 accents
552 and
553 are
554 bell
555 can
556 character
557 corrections
558 crt
559 daisy
560 dash
561 date
562 defined
563 definitions
564 description
565 devices
566 diablo
567 dummy
568 factors
569 following
570 font
571 for
572 from
573 fudge
574 give
575 have
576 header
577 holds
578 log
579 logo
580 low
581 lpr
582 mark
583 name
584 nroff
585 out
586 output
587 pitch
588 put
589 rcsfile
590 reference
591 resolution
592 revision
593 see
594 set
595 simple
596 smi
597 some
598 string
599 synopsis
600 system
601 that
602 the
603 this
604 translation
605 troff
606 typewriter
607 ucb
608 unbreakable
609 use
610 used
611 user
612 vroff
613 wheel
614 will
615 with
616 you
617 __END__
618
619 =head1 NAME
620
621 WAIT::Filter - Perl extension providing the basic freeWAIS-sf reduction functions
622
623 =head1 SYNOPSIS
624
625 use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc
626 isotr disotr stop grundform);
627
628 $stem = Stem($word);
629 $scode = Soundex($word);
630 $pcode = Phonix($word);
631 $lword = isolc($word);
632 disolc($word);
633 $uword = isouc($word);
634 disouc($word);
635 $trword = isotr($word);
636 disotr($word);
637 $word = stop($word);
638 $word = grundform($word);
639
640 @words = WAIT::Filter::split($word);
641 @words = WAIT::Filter::split2($word);
642 @words = WAIT::Filter::split3($word);
643 @words = WAIT::Filter::split4($word); # arbitrary numbers allowed
644
645 =head1 DESCRIPTION
646
647 This tiny modules gives access to the basic reduction functions build
648 in B<freeWAIS-sf>.
649
650 =over 5
651
652 =item B<Stem>(I<word>)
653
654 reduces I<word> using the well know Porter algorithm.
655
656 AU: Porter, M.F.
657 TI: An Algorithm for Suffix Stripping
658 JT: Program
659 VO: 14
660 PP: 130-137
661 PY: 1980
662 PM: JUL
663
664 =item B<Soundex>(I<word>)
665
666
667 computes the 4 byte B<Soundex> code for I<word>.
668
669 AU: Gadd, T.N.
670 TI: 'Fisching for Werds'. Phonetic Retrieval of written text in
671 Information Retrieval Systems
672 JT: Program
673 VO: 22
674 NO: 3
675 PP: 222-237
676 PY: 1988
677
678
679 =item B<Phonix>(I<word>)
680
681 computes the 8 byte B<Phonix> code for I<word>.
682
683 AU: Gadd, T.N.
684 TI: PHONIX: The Algorithm
685 JT: Program
686 VO: 24
687 NO: 4
688 PP: 363-366
689 PY: 1990
690 PM: OCT
691
692 =back
693
694 =head1 ISO charcater case functions
695
696 There are some additional function which transpose some/most ISOlatin1
697 characters to upper and lower case. To allow for maximum speed there
698 are also I<destructive> versions which change the argument instead of
699 allocating a copy which is returned. For convenience, the destructive
700 version also B<returns> the argument. So all of the following is
701 valid and C<$word> will contain the lowercased string.
702
703 $word = isolc($word);
704 $word = disolc($word);
705 disolc($word);
706
707 Here are the hardcoded characters which are recognized:
708
709 abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß
710 ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß
711
712 =over 5
713
714 =item C<$new = >B<isolc>C<($word)>
715
716 =item B<disolc>C<($word)>
717
718 transposes to lower case.
719
720 =item C<$new = >B<isouc>C<($word)>
721
722 =item B<disouc>C<($word)>
723
724 transposes to upper case.
725
726 =item C<$new = >B<isotr>C<($word)>
727
728 =item B<disotr>C<($word)>
729
730 Remove non-letters according to the above table.
731
732 =item C<$new = >B<stop>C<($word)>
733
734 Returns an empty string if $word is a stopword.
735
736 =item C<$new = >B<grundform>C<($word)>
737
738 Calls Text::German::reduce
739
740 =item C<$new = >B<utf8iso>C<($word)>
741
742 Deprecated due to flux in perl versions between 5.005 and 5.8. The
743 function converts UTF8 encoded strings to ISO-8859-1. WAIT is
744 internally still based on the Latin1 character set, so if you process
745 anything in a different encoding, you should convert to Latin1 as the
746 first filter or refrain from using the iso-latin-1 based filter
747 functions. It is recommended that you use your own converter based on
748 the perl version you're using.
749
750 =item split, split2, split3, ...
751
752 The splitN funtions all take a scalar as input and return a list of
753 words. Split acts just like the perl split(' '). Split2 eliminates all
754 words from the list that are shorter than 2 characters (bytes), split3
755 eliminates those shorter than 3 characters (bytes) and so on.
756
757 =back
758
759 =head1 AUTHOR
760
761 Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>
762
763 =head1 SEE ALSO
764
765 perl(1).
766
767 =cut
768

Properties

Name Value
cvs2svn:cvs-rev 1.4

  ViewVC Help
Powered by ViewVC 1.1.26