/[wait]/trunk/lib/WAIT/Filter.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WAIT/Filter.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 11 - (hide annotations)
Fri Apr 28 15:41:10 2000 UTC (24 years ago) by unknown
Original Path: branches/CPAN/lib/WAIT/Filter.pm
File size: 7265 byte(s)
This commit was manufactured by cvs2svn to create branch 'CPAN'.
1 ulpfr 10 # -*- Mode: Perl -*-
2     # $Basename: Filter.pm $
3     # $Revision: 1.7 $
4     # ITIID : $ITI$ $Header $__Header$
5     # Author : Ulrich Pfeifer
6     # Created On : Thu Aug 15 18:09:51 1996
7     # Last Modified By: Ulrich Pfeifer
8     # Last Modified On: Sun Nov 22 18:44:46 1998
9     # Language : CPerl
10     # Update Count : 105
11     # Status : Unknown, Use with caution!
12     #
13     # Copyright (c) 1996-1997, Ulrich Pfeifer
14     #
15     package WAIT::Filter;
16     require WAIT;
17     use strict;
18     use Carp;
19     use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $AUTOLOAD);
20     use subs qw(grundform);
21    
22     require Exporter;
23    
24     @ISA = qw(Exporter);
25     @EXPORT_OK = qw(
26     Stem
27     Soundex
28     Phonix
29     Metaphone
30     isolc disolc
31     isouc disouc
32     isotr disotr
33     stop grundform
34     );
35    
36     $VERSION = substr q$Revision: 1.7 $, 10;
37    
38     sub split {
39     map split(' ', $_), @_;
40     }
41    
42     $SPLIT = q[
43     sub splitXXX {
44     grep length($_)>=XXX, map split(' ', $_), @_;
45     }
46     ];
47    
48     sub AUTOLOAD {
49     my $func = $AUTOLOAD; $func =~ s/.*:://;
50    
51     if ($func =~ /split(\d+)/) {
52     my $num = $1;
53     my $split = $SPLIT;
54    
55     $split =~ s/XXX/$num/g;
56     eval $split;
57     if ($@ eq '') {
58     goto &$AUTOLOAD;
59     }
60     } elsif ($func eq 'grundform') {
61     eval {require Text::German;};
62     croak "You must have Text::German to use 'grundform'"
63     if $@ ne '';
64     *grundform = Text::German->can('reduce');
65     goto &grundform;
66     } elsif ($func eq 'date') {
67     eval {require Time::ParseDate;};
68     croak "You must have Time::ParseDate to use 'date'"
69     if $@ ne '';
70     *date = Time::ParseDate->can('parsedate');
71     goto \&date;
72     } elsif ($func eq 'decode_entities') {
73     eval {require HTML::Entities;};
74     croak "You must have HTML::Entities to use 'date'"
75     if $@ ne '';
76     *decode_entities = HTML::Entities->can('decode_entities');
77     goto &decode_entities;
78     }
79     croak "Your vendor has not defined WAIT::Filter::$func";
80     }
81    
82     while (<DATA>) {
83     chomp;
84     last if /__END__/;
85     $STOP{$_}++;
86     }
87    
88     sub stop {
89     if (exists $STOP{$_[0]}) {
90     ''
91     } else {
92     $_[0];
93     }
94     }
95    
96     sub gdate {
97     my $date = shift;
98    
99     $date =~ s:(\d+)\.(\d+)\.(d+):$2/$1/$3:;
100     date($date);
101     }
102    
103     1;
104     __DATA__
105     a
106     about
107     above
108     according
109     across
110     actually
111     adj
112     after
113     afterwards
114     again
115     against
116     all
117     almost
118     alone
119     along
120     already
121     also
122     although
123     always
124     among
125     amongst
126     an
127     and
128     another
129     any
130     anyhow
131     anyone
132     anything
133     anywhere
134     are
135     aren't
136     around
137     as
138     at
139     b
140     be
141     became
142     because
143     become
144     becomes
145     becoming
146     been
147     before
148     beforehand
149     begin
150     beginning
151     behind
152     being
153     below
154     beside
155     besides
156     between
157     beyond
158     billion
159     both
160     but
161     by
162     c
163     can
164     can't
165     cannot
166     caption
167     co
168     co.
169     could
170     couldn't
171     d
172     did
173     didn't
174     do
175     does
176     doesn't
177     don't
178     down
179     during
180     e
181     eg
182     eight
183     eighty
184     either
185     else
186     elsewhere
187     end
188     ending
189     enough
190     etc
191     even
192     ever
193     every
194     everyone
195     everything
196     everywhere
197     except
198     f
199     few
200     fifty
201     first
202     five
203     vfor
204     former
205     formerly
206     forty
207     found "
208     four
209     from
210     further
211     g
212     h
213     had
214     has
215     hasn't
216     have
217     haven't
218     he
219     he'd
220     he'll
221     he's
222     hence
223     her
224     here
225     here's
226     hereafter
227     hereby
228     herein
229     hereupon
230     hers
231     herself
232     him
233     himself
234     his
235     how
236     however
237     hundred
238     i
239     i'd
240     i'll
241     i'm
242     i've
243     ie
244     if
245     in
246     inc.
247     indeed
248     instead
249     into
250     is
251     isn't
252     it
253     it's
254     its
255     itself
256     j
257     k
258     l
259     last
260     later
261     latter
262     latterly
263     least
264     less
265     let
266     let's
267     like
268     likely
269     ltd
270     m
271     made
272     make
273     makes
274     many
275     maybe
276     me
277     meantime
278     meanwhile
279     might
280     million
281     miss
282     more
283     moreover
284     most
285     mostly
286     mr
287     mrs
288     much
289     must
290     my
291     myself
292     n
293     namely
294     neither
295     never
296     nevertheless
297     next
298     nine
299     ninety
300     no
301     nobody
302     none
303     nonetheless
304     noone
305     nor
306     not
307     nothing
308     now
309     nowhere
310     o
311     of
312     off
313     often
314     on
315     once
316     one
317     one's
318     only
319     onto
320     or
321     other
322     others
323     otherwise
324     our
325     ours
326     ourselves
327     out
328     over
329     overall
330     own
331     p
332     per
333     perhaps
334     q
335     r
336     rather
337     recent
338     recently
339     s
340     same
341     seem
342     seemed
343     seeming
344     seems
345     seven
346     seventy
347     several
348     she
349     she'd
350     she'll
351     she's
352     should
353     shouldn't
354     since
355     six
356     sixty
357     so
358     some
359     somehow
360     someone
361     something
362     sometime
363     sometimes
364     somewhere
365     still
366     stop
367     such
368     t
369     taking
370     ten
371     than
372     that
373     that'll
374     that's
375     that've
376     the
377     their
378     them
379     themselves
380     then
381     thence
382     there
383     there'd
384     there'll
385     there're
386     there's
387     there've
388     thereafter
389     thereby
390     therefore
391     therein
392     thereupon
393     these
394     they
395     they'd
396     they'll
397     they're
398     they've
399     thirty
400     this
401     those
402     though
403     thousand
404     three
405     through
406     throughout
407     thru
408     thus
409     to
410     together
411     too
412     toward
413     towards
414     trillion
415     twenty
416     two
417     u
418     under
419     unless
420     unlike
421     unlikely
422     until
423     up
424     upon
425     us
426     used
427     using
428     v
429     very
430     via
431     w
432     was
433     wasn't
434     we
435     we'd
436     we'll
437     we're
438     we've
439     well
440     were
441     weren't
442     what
443     what'll
444     what's
445     what've
446     whatever
447     when
448     whence
449     whenever
450     where
451     where's
452     whereafter
453     whereas
454     whereby
455     wherein
456     whereupon
457     wherever
458     whether
459     which
460     while
461     whither
462     who
463     who'd
464     who'll
465     who's
466     whoever
467     whole
468     whom
469     whomever
470     whose
471     why
472     will
473     with
474     within
475     without
476     won't
477     would
478     wouldn't
479     x
480     y
481     yes
482     yet
483     you
484     you'd
485     you'll
486     you're
487     you've
488     your
489     yours
490     yourself
491     yourselves
492     z
493     # occuring in more than 100 files
494     acc
495     accent
496     accents
497     and
498     are
499     bell
500     can
501     character
502     corrections
503     crt
504     daisy
505     dash
506     date
507     defined
508     definitions
509     description
510     devices
511     diablo
512     dummy
513     factors
514     following
515     font
516     for
517     from
518     fudge
519     give
520     have
521     header
522     holds
523     log
524     logo
525     low
526     lpr
527     mark
528     name
529     nroff
530     out
531     output
532     pitch
533     put
534     rcsfile
535     reference
536     resolution
537     revision
538     see
539     set
540     simple
541     smi
542     some
543     string
544     synopsis
545     system
546     that
547     the
548     this
549     translation
550     troff
551     typewriter
552     ucb
553     unbreakable
554     use
555     used
556     user
557     vroff
558     wheel
559     will
560     with
561     you
562     __END__
563     # Below is the stub of documentation for your module. You better edit it!
564    
565     =head1 NAME
566    
567     WAIT::Filter - Perl extension providing the basic freeWAIS-sf reduction functions
568    
569     =head1 SYNOPSIS
570    
571     use WAIT::Filter qw(Stem Soundex Phonix isolc isouc disolc disouc);
572    
573     $stem = Stem($word);
574     $scode = Soundex($word);
575     $pcode = Phonix($word);
576     $lword = isolc($word);
577     $uword = isouc($word);
578     disolc($word);
579     disouc($word);
580    
581     =head1 DESCRIPTION
582    
583     This tiny modules gives access to the basic reduction functions build
584     in B<freeWAIS-sf>.
585    
586     =over 5
587    
588     =item B<Stem>(I<word>)
589    
590     reduces I<word> using the well know Porter algorithm.
591    
592     AU: Porter, M.F.
593     TI: An Algorithm for Suffix Stripping
594     JT: Program
595     VO: 14
596     PP: 130-137
597     PY: 1980
598     PM: JUL
599    
600     =item B<Soundex>(I<word>)
601    
602    
603     computes the 4 byte B<Soundex> code for I<word>.
604    
605     AU: Gadd, T.N.
606     TI: 'Fisching for Werds'. Phonetic Retrieval of written text in
607     Information Retrieval Systems
608     JT: Program
609     VO: 22
610     NO: 3
611     PP: 222-237
612     PY: 1988
613    
614    
615     =item B<Phonix>(I<word>)
616    
617     computes the 8 byte B<Phonix> code for I<word>.
618    
619     AU: Gadd, T.N.
620     TI: PHONIX: The Algorithm
621     JT: Program
622     VO: 24
623     NO: 4
624     PP: 363-366
625     PY: 1990
626     PM: OCT
627    
628     =head1 ISO charcater case functions
629    
630     There are some additional function which transpose some/most ISOlatin1
631     characters to upper and lower case. To allow for maximum speed there
632     are also I<destructive> versions which change the argument instead of
633     allocating a copy which is returned. For convenience, the destructive
634     version also B<returns> the argument. So both of the following is
635     valid and C<$word> will contain the lowercased string.
636    
637     $word = disolc($word);
638     disolc($word);
639    
640    
641     Here are the hardcoded characters which are recognized:
642    
643     abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß
644     ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß
645    
646     =item C<$new = >B<isolc>C<($word)>
647    
648     =item B<disolc>C<($word)>
649    
650     transposes to lower case.
651    
652     =item C<$new = >B<isouc>C<($word)>
653    
654     =item B<disouc>C<($word)>
655    
656     transposes to upper case.
657    
658     =head1 AUTHOR
659    
660     Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>
661    
662     =head1 SEE ALSO
663    
664     perl(1).
665    
666     =cut
667    

Properties

Name Value
cvs2svn:cvs-rev 1.1

  ViewVC Help
Powered by ViewVC 1.1.26