/[wait]/trunk/lib/WAIT/Filter.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WAIT/Filter.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 20 - (hide annotations)
Tue May 9 11:29:45 2000 UTC (23 years, 11 months ago) by cvs2svn
Original Path: cvs-head/lib/WAIT/Filter.pm
File size: 8794 byte(s)
This commit was generated by cvs2svn to compensate for changes in r10,
which included commits to RCS files with non-trunk default branches.

1 ulpfr 13 # -*- Mode: Cperl -*-
2 ulpfr 10 # $Basename: Filter.pm $
3 ulpfr 19 # $Revision: 1.9 $
4 ulpfr 10 # ITIID : $ITI$ $Header $__Header$
5     # Author : Ulrich Pfeifer
6     # Created On : Thu Aug 15 18:09:51 1996
7     # Last Modified By: Ulrich Pfeifer
8     # Last Modified On: Sun Nov 22 18:44:46 1998
9     # Language : CPerl
10     # Update Count : 105
11     # Status : Unknown, Use with caution!
12 ulpfr 13 #
13 ulpfr 10 # Copyright (c) 1996-1997, Ulrich Pfeifer
14 ulpfr 13 #
15 ulpfr 10 package WAIT::Filter;
16     require WAIT;
17     use strict;
18     use Carp;
19     use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $AUTOLOAD);
20     use subs qw(grundform);
21    
22     require Exporter;
23    
24     @ISA = qw(Exporter);
25     @EXPORT_OK = qw(
26     Stem
27     Soundex
28     Phonix
29     Metaphone
30     isolc disolc
31     isouc disouc
32     isotr disotr
33     stop grundform
34 ulpfr 13 utf8iso
35 ulpfr 10 );
36 ulpfr 13 # (most implemented in WAIT.xs)
37 ulpfr 10
38 ulpfr 19 $VERSION = substr q$Revision: 1.9 $, 10;
39 ulpfr 10
40     sub split {
41     map split(' ', $_), @_;
42     }
43    
44     $SPLIT = q[
45     sub splitXXX {
46     grep length($_)>=XXX, map split(' ', $_), @_;
47     }
48     ];
49    
50     sub AUTOLOAD {
51     my $func = $AUTOLOAD; $func =~ s/.*:://;
52    
53     if ($func =~ /split(\d+)/) {
54     my $num = $1;
55     my $split = $SPLIT;
56    
57     $split =~ s/XXX/$num/g;
58     eval $split;
59     if ($@ eq '') {
60     goto &$AUTOLOAD;
61     }
62     } elsif ($func eq 'grundform') {
63     eval {require Text::German;};
64     croak "You must have Text::German to use 'grundform'"
65     if $@ ne '';
66     *grundform = Text::German->can('reduce');
67     goto &grundform;
68     } elsif ($func eq 'date') {
69     eval {require Time::ParseDate;};
70     croak "You must have Time::ParseDate to use 'date'"
71     if $@ ne '';
72     *date = Time::ParseDate->can('parsedate');
73     goto \&date;
74     } elsif ($func eq 'decode_entities') {
75     eval {require HTML::Entities;};
76     croak "You must have HTML::Entities to use 'date'"
77     if $@ ne '';
78     *decode_entities = HTML::Entities->can('decode_entities');
79     goto &decode_entities;
80 ulpfr 13 } elsif ($func =~ /^d?utf8iso$/) {
81     require WAIT::Filter::utf8iso;
82     croak "Your perl version must at least be 5.00556 to use '$func'"
83     if $] < 5.00556;
84     no strict 'refs';
85     *$func = \&{"WAIT::Filter::utf8iso::$func"};
86     goto &utf8iso;
87 ulpfr 10 }
88 ulpfr 13 Carp::confess "Class WAIT::Filter::$func not found";
89 ulpfr 10 }
90    
91     while (<DATA>) {
92     chomp;
93     last if /__END__/;
94 ulpfr 19 next if /^\s*#/; # there's a comment
95 ulpfr 10 $STOP{$_}++;
96     }
97    
98     sub stop {
99     if (exists $STOP{$_[0]}) {
100     ''
101     } else {
102     $_[0];
103     }
104     }
105    
106     sub gdate {
107     my $date = shift;
108    
109     $date =~ s:(\d+)\.(\d+)\.(d+):$2/$1/$3:;
110     date($date);
111     }
112    
113     1;
114     __DATA__
115     a
116     about
117     above
118     according
119     across
120     actually
121     adj
122     after
123     afterwards
124     again
125     against
126     all
127     almost
128     alone
129     along
130     already
131     also
132     although
133     always
134     among
135     amongst
136     an
137     and
138     another
139     any
140     anyhow
141     anyone
142     anything
143     anywhere
144     are
145     aren't
146     around
147     as
148     at
149     b
150     be
151     became
152     because
153     become
154     becomes
155     becoming
156     been
157     before
158     beforehand
159     begin
160     beginning
161     behind
162     being
163     below
164     beside
165     besides
166     between
167     beyond
168     billion
169     both
170     but
171     by
172     c
173     can
174     can't
175     cannot
176     caption
177     co
178     co.
179     could
180     couldn't
181     d
182     did
183     didn't
184     do
185     does
186     doesn't
187     don't
188     down
189     during
190     e
191     eg
192     eight
193     eighty
194     either
195     else
196     elsewhere
197     end
198     ending
199     enough
200     etc
201     even
202     ever
203     every
204     everyone
205     everything
206     everywhere
207     except
208     f
209     few
210     fifty
211     first
212     five
213     vfor
214     former
215     formerly
216     forty
217 ulpfr 13 found
218 ulpfr 10 four
219     from
220     further
221     g
222     h
223     had
224     has
225     hasn't
226     have
227     haven't
228     he
229     he'd
230     he'll
231     he's
232     hence
233     her
234     here
235     here's
236     hereafter
237     hereby
238     herein
239     hereupon
240     hers
241     herself
242     him
243     himself
244     his
245     how
246     however
247     hundred
248     i
249     i'd
250     i'll
251     i'm
252     i've
253     ie
254     if
255     in
256     inc.
257     indeed
258     instead
259     into
260     is
261     isn't
262     it
263     it's
264     its
265     itself
266     j
267     k
268     l
269     last
270     later
271     latter
272     latterly
273     least
274     less
275     let
276     let's
277     like
278     likely
279     ltd
280     m
281     made
282     make
283     makes
284     many
285     maybe
286     me
287     meantime
288     meanwhile
289     might
290     million
291     miss
292     more
293     moreover
294     most
295     mostly
296     mr
297     mrs
298     much
299     must
300     my
301     myself
302     n
303     namely
304     neither
305     never
306     nevertheless
307     next
308     nine
309     ninety
310     no
311     nobody
312     none
313     nonetheless
314     noone
315     nor
316     not
317     nothing
318     now
319     nowhere
320     o
321     of
322     off
323     often
324     on
325     once
326     one
327     one's
328     only
329     onto
330     or
331     other
332     others
333     otherwise
334     our
335     ours
336     ourselves
337     out
338     over
339     overall
340     own
341     p
342     per
343     perhaps
344     q
345     r
346     rather
347     recent
348     recently
349     s
350     same
351     seem
352     seemed
353     seeming
354     seems
355     seven
356     seventy
357     several
358     she
359     she'd
360     she'll
361     she's
362     should
363     shouldn't
364     since
365     six
366     sixty
367     so
368     some
369     somehow
370     someone
371     something
372     sometime
373     sometimes
374     somewhere
375     still
376     stop
377     such
378     t
379     taking
380     ten
381     than
382     that
383     that'll
384     that's
385     that've
386     the
387     their
388     them
389     themselves
390     then
391     thence
392     there
393     there'd
394     there'll
395     there're
396     there's
397     there've
398     thereafter
399     thereby
400     therefore
401     therein
402     thereupon
403     these
404     they
405     they'd
406     they'll
407     they're
408     they've
409     thirty
410     this
411     those
412     though
413     thousand
414     three
415     through
416     throughout
417     thru
418     thus
419     to
420     together
421     too
422     toward
423     towards
424     trillion
425     twenty
426     two
427     u
428     under
429     unless
430     unlike
431     unlikely
432     until
433     up
434     upon
435     us
436     used
437     using
438     v
439     very
440     via
441     w
442     was
443     wasn't
444     we
445     we'd
446     we'll
447     we're
448     we've
449     well
450     were
451     weren't
452     what
453     what'll
454     what's
455     what've
456     whatever
457     when
458     whence
459     whenever
460     where
461     where's
462     whereafter
463     whereas
464     whereby
465     wherein
466     whereupon
467     wherever
468     whether
469     which
470     while
471     whither
472     who
473     who'd
474     who'll
475     who's
476     whoever
477     whole
478     whom
479     whomever
480     whose
481     why
482     will
483     with
484     within
485     without
486     won't
487     would
488     wouldn't
489     x
490     y
491     yes
492     yet
493     you
494     you'd
495     you'll
496     you're
497     you've
498     your
499     yours
500     yourself
501     yourselves
502     z
503     # occuring in more than 100 files
504     acc
505     accent
506     accents
507     and
508     are
509     bell
510     can
511     character
512     corrections
513     crt
514     daisy
515     dash
516     date
517     defined
518     definitions
519     description
520     devices
521     diablo
522     dummy
523     factors
524     following
525     font
526     for
527     from
528     fudge
529     give
530     have
531     header
532     holds
533     log
534     logo
535     low
536     lpr
537     mark
538     name
539     nroff
540     out
541     output
542     pitch
543     put
544     rcsfile
545     reference
546     resolution
547     revision
548     see
549     set
550     simple
551     smi
552     some
553     string
554     synopsis
555     system
556     that
557     the
558     this
559     translation
560     troff
561     typewriter
562     ucb
563     unbreakable
564     use
565     used
566     user
567     vroff
568     wheel
569     will
570     with
571     you
572     __END__
573     # Below is the stub of documentation for your module. You better edit it!
574    
575     =head1 NAME
576    
577     WAIT::Filter - Perl extension providing the basic freeWAIS-sf reduction functions
578    
579     =head1 SYNOPSIS
580    
581 ulpfr 13 use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc
582     isotr disotr stop grundform utf8iso);
583 ulpfr 10
584 ulpfr 13 $stem = Stem($word);
585     $scode = Soundex($word);
586     $pcode = Phonix($word);
587     $lword = isolc($word);
588 ulpfr 10 disolc($word);
589 ulpfr 13 $uword = isouc($word);
590 ulpfr 10 disouc($word);
591 ulpfr 13 $trword = isotr($word);
592     disotr($word);
593     $word = stop($word);
594     $word = grundform($word);
595 ulpfr 10
596 ulpfr 13 @words = WAIT::Filter::split($word);
597     @words = WAIT::Filter::split2($word);
598     @words = WAIT::Filter::split3($word);
599     @words = WAIT::Filter::split4($word); # arbitrary numbers allowed
600    
601 ulpfr 10 =head1 DESCRIPTION
602    
603     This tiny modules gives access to the basic reduction functions build
604     in B<freeWAIS-sf>.
605    
606     =over 5
607    
608     =item B<Stem>(I<word>)
609    
610     reduces I<word> using the well know Porter algorithm.
611    
612     AU: Porter, M.F.
613     TI: An Algorithm for Suffix Stripping
614     JT: Program
615     VO: 14
616     PP: 130-137
617     PY: 1980
618     PM: JUL
619    
620     =item B<Soundex>(I<word>)
621    
622    
623     computes the 4 byte B<Soundex> code for I<word>.
624    
625     AU: Gadd, T.N.
626     TI: 'Fisching for Werds'. Phonetic Retrieval of written text in
627     Information Retrieval Systems
628     JT: Program
629     VO: 22
630     NO: 3
631     PP: 222-237
632     PY: 1988
633    
634    
635     =item B<Phonix>(I<word>)
636    
637     computes the 8 byte B<Phonix> code for I<word>.
638    
639     AU: Gadd, T.N.
640     TI: PHONIX: The Algorithm
641     JT: Program
642     VO: 24
643     NO: 4
644     PP: 363-366
645     PY: 1990
646     PM: OCT
647    
648     =head1 ISO charcater case functions
649    
650     There are some additional function which transpose some/most ISOlatin1
651     characters to upper and lower case. To allow for maximum speed there
652     are also I<destructive> versions which change the argument instead of
653     allocating a copy which is returned. For convenience, the destructive
654 ulpfr 13 version also B<returns> the argument. So all of the following is
655 ulpfr 10 valid and C<$word> will contain the lowercased string.
656    
657 ulpfr 13 $word = isolc($word);
658 ulpfr 10 $word = disolc($word);
659     disolc($word);
660    
661     Here are the hardcoded characters which are recognized:
662    
663     abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß
664     ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß
665    
666     =item C<$new = >B<isolc>C<($word)>
667    
668     =item B<disolc>C<($word)>
669    
670     transposes to lower case.
671    
672     =item C<$new = >B<isouc>C<($word)>
673    
674     =item B<disouc>C<($word)>
675    
676     transposes to upper case.
677    
678 ulpfr 13 =item C<$new = >B<isotr>C<($word)>
679    
680     =item B<disotr>C<($word)>
681    
682     Remove non-letters according to the above table.
683    
684     =item C<$new = >B<stop>C<($word)>
685    
686     Returns an empty string if $word is a stopword.
687    
688     =item C<$new = >B<grundform>C<($word)>
689    
690     Calls Text::German::reduce
691    
692     =item C<$new = >B<utf8iso>C<($word)>
693    
694     Convert UTF8 encoded strings to ISO-8859-1. WAIT currently is
695     internally based on the Latin1 character set, so if you process
696     anything in a different encoding, you should convert to Latin1 as the
697     first filter.
698    
699     =item split, split2, split3, ...
700    
701     The splitN funtions all take a scalar as input and return a list of
702     words. Split acts just like the perl split(' '). Split2 eliminates all
703     words from the list that are shorter than 2 characters (bytes), split3
704     eliminates those shorter than 3 characters (bytes) and so on.
705    
706 ulpfr 10 =head1 AUTHOR
707    
708     Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>
709    
710     =head1 SEE ALSO
711    
712     perl(1).
713    
714     =cut
715    

Properties

Name Value
cvs2svn:cvs-rev 1.1.1.3

  ViewVC Help
Powered by ViewVC 1.1.26