/[wait]/trunk/lib/WAIT/Filter.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WAIT/Filter.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 109 - (hide annotations)
Tue Jul 13 17:50:27 2004 UTC (19 years, 9 months ago) by dpavlin
File size: 8957 byte(s)
pod fixes

1 ulpfr 13 # -*- Mode: Cperl -*-
2 ulpfr 10 # $Basename: Filter.pm $
3 ulpfr 19 # $Revision: 1.9 $
4 ulpfr 10 # ITIID : $ITI$ $Header $__Header$
5     # Author : Ulrich Pfeifer
6     # Created On : Thu Aug 15 18:09:51 1996
7     # Last Modified By: Ulrich Pfeifer
8     # Last Modified On: Sun Nov 22 18:44:46 1998
9     # Language : CPerl
10     # Update Count : 105
11     # Status : Unknown, Use with caution!
12 ulpfr 13 #
13 ulpfr 10 # Copyright (c) 1996-1997, Ulrich Pfeifer
14 ulpfr 13 #
15 ulpfr 10 package WAIT::Filter;
16     require WAIT;
17     use strict;
18     use Carp;
19     use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $AUTOLOAD);
20     use subs qw(grundform);
21    
22     require Exporter;
23    
24     @ISA = qw(Exporter);
25     @EXPORT_OK = qw(
26     Stem
27     Soundex
28     Phonix
29     Metaphone
30     isolc disolc
31     isouc disouc
32     isotr disotr
33     stop grundform
34 laperla 32 utf8iso
35 ulpfr 10 );
36 ulpfr 13 # (most implemented in WAIT.xs)
37 ulpfr 10
38 ulpfr 19 $VERSION = substr q$Revision: 1.9 $, 10;
39 ulpfr 10
40     sub split {
41     map split(' ', $_), @_;
42     }
43    
44     $SPLIT = q[
45     sub splitXXX {
46     grep length($_)>=XXX, map split(' ', $_), @_;
47     }
48     ];
49    
50     sub AUTOLOAD {
51     my $func = $AUTOLOAD; $func =~ s/.*:://;
52    
53     if ($func =~ /split(\d+)/) {
54     my $num = $1;
55     my $split = $SPLIT;
56    
57     $split =~ s/XXX/$num/g;
58     eval $split;
59     if ($@ eq '') {
60     goto &$AUTOLOAD;
61     }
62     } elsif ($func eq 'grundform') {
63     eval {require Text::German;};
64     croak "You must have Text::German to use 'grundform'"
65     if $@ ne '';
66     *grundform = Text::German->can('reduce');
67     goto &grundform;
68     } elsif ($func eq 'date') {
69     eval {require Time::ParseDate;};
70     croak "You must have Time::ParseDate to use 'date'"
71     if $@ ne '';
72     *date = Time::ParseDate->can('parsedate');
73     goto \&date;
74     } elsif ($func eq 'decode_entities') {
75     eval {require HTML::Entities;};
76     croak "You must have HTML::Entities to use 'date'"
77     if $@ ne '';
78     *decode_entities = HTML::Entities->can('decode_entities');
79     goto &decode_entities;
80 ulpfr 13 } elsif ($func =~ /^d?utf8iso$/) {
81     no strict 'refs';
82 laperla 32 *$func = sub {
83     # Courtesy JHI
84 laperla 41 my $s = shift;
85     $s =~ s{([\xC0-\xDF])([\x80-\xBF])}
86     {chr(ord($1)<<6&0xC0|ord($2)&0x3F)}eg;
87     $s;
88 laperla 32 };
89     goto \&$func;
90 ulpfr 10 }
91 ulpfr 13 Carp::confess "Class WAIT::Filter::$func not found";
92 ulpfr 10 }
93    
94     while (<DATA>) {
95     chomp;
96     last if /__END__/;
97 ulpfr 19 next if /^\s*#/; # there's a comment
98 ulpfr 10 $STOP{$_}++;
99     }
100 laperla 50 close DATA;
101 ulpfr 10
102     sub stop {
103     if (exists $STOP{$_[0]}) {
104     ''
105     } else {
106     $_[0];
107     }
108     }
109    
110     sub gdate {
111     my $date = shift;
112    
113     $date =~ s:(\d+)\.(\d+)\.(d+):$2/$1/$3:;
114     date($date);
115     }
116    
117     1;
118 dpavlin 109
119 ulpfr 10 __DATA__
120     a
121     about
122     above
123     according
124     across
125     actually
126     adj
127     after
128     afterwards
129     again
130     against
131     all
132     almost
133     alone
134     along
135     already
136     also
137     although
138     always
139     among
140     amongst
141     an
142     and
143     another
144     any
145     anyhow
146     anyone
147     anything
148     anywhere
149     are
150     aren't
151     around
152     as
153     at
154     b
155     be
156     became
157     because
158     become
159     becomes
160     becoming
161     been
162     before
163     beforehand
164     begin
165     beginning
166     behind
167     being
168     below
169     beside
170     besides
171     between
172     beyond
173     billion
174     both
175     but
176     by
177     c
178     can
179     can't
180     cannot
181     caption
182     co
183     co.
184     could
185     couldn't
186     d
187     did
188     didn't
189     do
190     does
191     doesn't
192     don't
193     down
194     during
195     e
196     eg
197     eight
198     eighty
199     either
200     else
201     elsewhere
202     end
203     ending
204     enough
205     etc
206     even
207     ever
208     every
209     everyone
210     everything
211     everywhere
212     except
213     f
214     few
215     fifty
216     first
217     five
218     vfor
219     former
220     formerly
221     forty
222 ulpfr 13 found
223 ulpfr 10 four
224     from
225     further
226     g
227     h
228     had
229     has
230     hasn't
231     have
232     haven't
233     he
234     he'd
235     he'll
236     he's
237     hence
238     her
239     here
240     here's
241     hereafter
242     hereby
243     herein
244     hereupon
245     hers
246     herself
247     him
248     himself
249     his
250     how
251     however
252     hundred
253     i
254     i'd
255     i'll
256     i'm
257     i've
258     ie
259     if
260     in
261     inc.
262     indeed
263     instead
264     into
265     is
266     isn't
267     it
268     it's
269     its
270     itself
271     j
272     k
273     l
274     last
275     later
276     latter
277     latterly
278     least
279     less
280     let
281     let's
282     like
283     likely
284     ltd
285     m
286     made
287     make
288     makes
289     many
290     maybe
291     me
292     meantime
293     meanwhile
294     might
295     million
296     miss
297     more
298     moreover
299     most
300     mostly
301     mr
302     mrs
303     much
304     must
305     my
306     myself
307     n
308     namely
309     neither
310     never
311     nevertheless
312     next
313     nine
314     ninety
315     no
316     nobody
317     none
318     nonetheless
319     noone
320     nor
321     not
322     nothing
323     now
324     nowhere
325     o
326     of
327     off
328     often
329     on
330     once
331     one
332     one's
333     only
334     onto
335     or
336     other
337     others
338     otherwise
339     our
340     ours
341     ourselves
342     out
343     over
344     overall
345     own
346     p
347     per
348     perhaps
349     q
350     r
351     rather
352     recent
353     recently
354     s
355     same
356     seem
357     seemed
358     seeming
359     seems
360     seven
361     seventy
362     several
363     she
364     she'd
365     she'll
366     she's
367     should
368     shouldn't
369     since
370     six
371     sixty
372     so
373     some
374     somehow
375     someone
376     something
377     sometime
378     sometimes
379     somewhere
380     still
381     stop
382     such
383     t
384     taking
385     ten
386     than
387     that
388     that'll
389     that's
390     that've
391     the
392     their
393     them
394     themselves
395     then
396     thence
397     there
398     there'd
399     there'll
400     there're
401     there's
402     there've
403     thereafter
404     thereby
405     therefore
406     therein
407     thereupon
408     these
409     they
410     they'd
411     they'll
412     they're
413     they've
414     thirty
415     this
416     those
417     though
418     thousand
419     three
420     through
421     throughout
422     thru
423     thus
424     to
425     together
426     too
427     toward
428     towards
429     trillion
430     twenty
431     two
432     u
433     under
434     unless
435     unlike
436     unlikely
437     until
438     up
439     upon
440     us
441     used
442     using
443     v
444     very
445     via
446     w
447     was
448     wasn't
449     we
450     we'd
451     we'll
452     we're
453     we've
454     well
455     were
456     weren't
457     what
458     what'll
459     what's
460     what've
461     whatever
462     when
463     whence
464     whenever
465     where
466     where's
467     whereafter
468     whereas
469     whereby
470     wherein
471     whereupon
472     wherever
473     whether
474     which
475     while
476     whither
477     who
478     who'd
479     who'll
480     who's
481     whoever
482     whole
483     whom
484     whomever
485     whose
486     why
487     will
488     with
489     within
490     without
491     won't
492     would
493     wouldn't
494     x
495     y
496     yes
497     yet
498     you
499     you'd
500     you'll
501     you're
502     you've
503     your
504     yours
505     yourself
506     yourselves
507     z
508     # occuring in more than 100 files
509     acc
510     accent
511     accents
512     and
513     are
514     bell
515     can
516     character
517     corrections
518     crt
519     daisy
520     dash
521     date
522     defined
523     definitions
524     description
525     devices
526     diablo
527     dummy
528     factors
529     following
530     font
531     for
532     from
533     fudge
534     give
535     have
536     header
537     holds
538     log
539     logo
540     low
541     lpr
542     mark
543     name
544     nroff
545     out
546     output
547     pitch
548     put
549     rcsfile
550     reference
551     resolution
552     revision
553     see
554     set
555     simple
556     smi
557     some
558     string
559     synopsis
560     system
561     that
562     the
563     this
564     translation
565     troff
566     typewriter
567     ucb
568     unbreakable
569     use
570     used
571     user
572     vroff
573     wheel
574     will
575     with
576     you
577     __END__
578    
579     =head1 NAME
580    
581     WAIT::Filter - Perl extension providing the basic freeWAIS-sf reduction functions
582    
583     =head1 SYNOPSIS
584    
585 ulpfr 13 use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc
586 laperla 32 isotr disotr stop grundform);
587 ulpfr 10
588 ulpfr 13 $stem = Stem($word);
589     $scode = Soundex($word);
590     $pcode = Phonix($word);
591     $lword = isolc($word);
592 ulpfr 10 disolc($word);
593 ulpfr 13 $uword = isouc($word);
594 ulpfr 10 disouc($word);
595 ulpfr 13 $trword = isotr($word);
596     disotr($word);
597     $word = stop($word);
598     $word = grundform($word);
599 ulpfr 10
600 ulpfr 13 @words = WAIT::Filter::split($word);
601     @words = WAIT::Filter::split2($word);
602     @words = WAIT::Filter::split3($word);
603     @words = WAIT::Filter::split4($word); # arbitrary numbers allowed
604    
605 ulpfr 10 =head1 DESCRIPTION
606    
607     This tiny modules gives access to the basic reduction functions build
608     in B<freeWAIS-sf>.
609    
610     =over 5
611    
612     =item B<Stem>(I<word>)
613    
614     reduces I<word> using the well know Porter algorithm.
615    
616     AU: Porter, M.F.
617     TI: An Algorithm for Suffix Stripping
618     JT: Program
619     VO: 14
620     PP: 130-137
621     PY: 1980
622     PM: JUL
623    
624     =item B<Soundex>(I<word>)
625    
626    
627     computes the 4 byte B<Soundex> code for I<word>.
628    
629     AU: Gadd, T.N.
630     TI: 'Fisching for Werds'. Phonetic Retrieval of written text in
631     Information Retrieval Systems
632     JT: Program
633     VO: 22
634     NO: 3
635     PP: 222-237
636     PY: 1988
637    
638    
639     =item B<Phonix>(I<word>)
640    
641     computes the 8 byte B<Phonix> code for I<word>.
642    
643     AU: Gadd, T.N.
644     TI: PHONIX: The Algorithm
645     JT: Program
646     VO: 24
647     NO: 4
648     PP: 363-366
649     PY: 1990
650     PM: OCT
651    
652 dpavlin 109 =back
653    
654 ulpfr 10 =head1 ISO charcater case functions
655    
656     There are some additional function which transpose some/most ISOlatin1
657     characters to upper and lower case. To allow for maximum speed there
658     are also I<destructive> versions which change the argument instead of
659     allocating a copy which is returned. For convenience, the destructive
660 ulpfr 13 version also B<returns> the argument. So all of the following is
661 ulpfr 10 valid and C<$word> will contain the lowercased string.
662    
663 ulpfr 13 $word = isolc($word);
664 ulpfr 10 $word = disolc($word);
665     disolc($word);
666    
667     Here are the hardcoded characters which are recognized:
668    
669     abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß
670     ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß
671    
672 dpavlin 109 =over 5
673    
674 ulpfr 10 =item C<$new = >B<isolc>C<($word)>
675    
676     =item B<disolc>C<($word)>
677    
678     transposes to lower case.
679    
680     =item C<$new = >B<isouc>C<($word)>
681    
682     =item B<disouc>C<($word)>
683    
684     transposes to upper case.
685    
686 ulpfr 13 =item C<$new = >B<isotr>C<($word)>
687    
688     =item B<disotr>C<($word)>
689    
690     Remove non-letters according to the above table.
691    
692     =item C<$new = >B<stop>C<($word)>
693    
694     Returns an empty string if $word is a stopword.
695    
696     =item C<$new = >B<grundform>C<($word)>
697    
698     Calls Text::German::reduce
699    
700     =item C<$new = >B<utf8iso>C<($word)>
701    
702 laperla 32 Deprecated due to flux in perl versions between 5.005 and 5.8. The
703     function converts UTF8 encoded strings to ISO-8859-1. WAIT is
704     internally still based on the Latin1 character set, so if you process
705 ulpfr 13 anything in a different encoding, you should convert to Latin1 as the
706 laperla 32 first filter or refrain from using the iso-latin-1 based filter
707     functions. It is recommended that you use your own converter based on
708     the perl version you're using.
709 ulpfr 13
710     =item split, split2, split3, ...
711    
712     The splitN funtions all take a scalar as input and return a list of
713     words. Split acts just like the perl split(' '). Split2 eliminates all
714     words from the list that are shorter than 2 characters (bytes), split3
715     eliminates those shorter than 3 characters (bytes) and so on.
716    
717 dpavlin 109 =back
718    
719 ulpfr 10 =head1 AUTHOR
720    
721     Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>
722    
723     =head1 SEE ALSO
724    
725     perl(1).
726    
727     =cut
728    

Properties

Name Value
cvs2svn:cvs-rev 1.4

  ViewVC Help
Powered by ViewVC 1.1.26