/[wait]/trunk/lib/WAIT/Filter.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WAIT/Filter.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 13 - (hide annotations)
Fri Apr 28 15:42:44 2000 UTC (24 years ago) by ulpfr
Original Path: branches/CPAN/lib/WAIT/Filter.pm
File size: 8755 byte(s)
Import of WAIT-1.710

1 ulpfr 13 # -*- Mode: Cperl -*-
2 ulpfr 10 # $Basename: Filter.pm $
3 ulpfr 13 # $Revision: 1.8 $
4 ulpfr 10 # ITIID : $ITI$ $Header $__Header$
5     # Author : Ulrich Pfeifer
6     # Created On : Thu Aug 15 18:09:51 1996
7     # Last Modified By: Ulrich Pfeifer
8     # Last Modified On: Sun Nov 22 18:44:46 1998
9     # Language : CPerl
10     # Update Count : 105
11     # Status : Unknown, Use with caution!
12 ulpfr 13 #
13 ulpfr 10 # Copyright (c) 1996-1997, Ulrich Pfeifer
14 ulpfr 13 #
15 ulpfr 10 package WAIT::Filter;
16     require WAIT;
17     use strict;
18     use Carp;
19     use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $AUTOLOAD);
20     use subs qw(grundform);
21    
22     require Exporter;
23    
24     @ISA = qw(Exporter);
25     @EXPORT_OK = qw(
26     Stem
27     Soundex
28     Phonix
29     Metaphone
30     isolc disolc
31     isouc disouc
32     isotr disotr
33     stop grundform
34 ulpfr 13 utf8iso
35 ulpfr 10 );
36 ulpfr 13 # (most implemented in WAIT.xs)
37 ulpfr 10
38 ulpfr 13 $VERSION = substr q$Revision: 1.8 $, 10;
39 ulpfr 10
40     sub split {
41     map split(' ', $_), @_;
42     }
43    
44     $SPLIT = q[
45     sub splitXXX {
46     grep length($_)>=XXX, map split(' ', $_), @_;
47     }
48     ];
49    
50     sub AUTOLOAD {
51     my $func = $AUTOLOAD; $func =~ s/.*:://;
52    
53     if ($func =~ /split(\d+)/) {
54     my $num = $1;
55     my $split = $SPLIT;
56    
57     $split =~ s/XXX/$num/g;
58     eval $split;
59     if ($@ eq '') {
60     goto &$AUTOLOAD;
61     }
62     } elsif ($func eq 'grundform') {
63     eval {require Text::German;};
64     croak "You must have Text::German to use 'grundform'"
65     if $@ ne '';
66     *grundform = Text::German->can('reduce');
67     goto &grundform;
68     } elsif ($func eq 'date') {
69     eval {require Time::ParseDate;};
70     croak "You must have Time::ParseDate to use 'date'"
71     if $@ ne '';
72     *date = Time::ParseDate->can('parsedate');
73     goto \&date;
74     } elsif ($func eq 'decode_entities') {
75     eval {require HTML::Entities;};
76     croak "You must have HTML::Entities to use 'date'"
77     if $@ ne '';
78     *decode_entities = HTML::Entities->can('decode_entities');
79     goto &decode_entities;
80 ulpfr 13 } elsif ($func =~ /^d?utf8iso$/) {
81     require WAIT::Filter::utf8iso;
82     croak "Your perl version must at least be 5.00556 to use '$func'"
83     if $] < 5.00556;
84     no strict 'refs';
85     *$func = \&{"WAIT::Filter::utf8iso::$func"};
86     goto &utf8iso;
87 ulpfr 10 }
88 ulpfr 13 Carp::confess "Class WAIT::Filter::$func not found";
89 ulpfr 10 }
90    
91     while (<DATA>) {
92     chomp;
93     last if /__END__/;
94     $STOP{$_}++;
95     }
96    
97     sub stop {
98     if (exists $STOP{$_[0]}) {
99     ''
100     } else {
101     $_[0];
102     }
103     }
104    
105     sub gdate {
106     my $date = shift;
107    
108     $date =~ s:(\d+)\.(\d+)\.(d+):$2/$1/$3:;
109     date($date);
110     }
111    
112     1;
113     __DATA__
114     a
115     about
116     above
117     according
118     across
119     actually
120     adj
121     after
122     afterwards
123     again
124     against
125     all
126     almost
127     alone
128     along
129     already
130     also
131     although
132     always
133     among
134     amongst
135     an
136     and
137     another
138     any
139     anyhow
140     anyone
141     anything
142     anywhere
143     are
144     aren't
145     around
146     as
147     at
148     b
149     be
150     became
151     because
152     become
153     becomes
154     becoming
155     been
156     before
157     beforehand
158     begin
159     beginning
160     behind
161     being
162     below
163     beside
164     besides
165     between
166     beyond
167     billion
168     both
169     but
170     by
171     c
172     can
173     can't
174     cannot
175     caption
176     co
177     co.
178     could
179     couldn't
180     d
181     did
182     didn't
183     do
184     does
185     doesn't
186     don't
187     down
188     during
189     e
190     eg
191     eight
192     eighty
193     either
194     else
195     elsewhere
196     end
197     ending
198     enough
199     etc
200     even
201     ever
202     every
203     everyone
204     everything
205     everywhere
206     except
207     f
208     few
209     fifty
210     first
211     five
212     vfor
213     former
214     formerly
215     forty
216 ulpfr 13 found
217 ulpfr 10 four
218     from
219     further
220     g
221     h
222     had
223     has
224     hasn't
225     have
226     haven't
227     he
228     he'd
229     he'll
230     he's
231     hence
232     her
233     here
234     here's
235     hereafter
236     hereby
237     herein
238     hereupon
239     hers
240     herself
241     him
242     himself
243     his
244     how
245     however
246     hundred
247     i
248     i'd
249     i'll
250     i'm
251     i've
252     ie
253     if
254     in
255     inc.
256     indeed
257     instead
258     into
259     is
260     isn't
261     it
262     it's
263     its
264     itself
265     j
266     k
267     l
268     last
269     later
270     latter
271     latterly
272     least
273     less
274     let
275     let's
276     like
277     likely
278     ltd
279     m
280     made
281     make
282     makes
283     many
284     maybe
285     me
286     meantime
287     meanwhile
288     might
289     million
290     miss
291     more
292     moreover
293     most
294     mostly
295     mr
296     mrs
297     much
298     must
299     my
300     myself
301     n
302     namely
303     neither
304     never
305     nevertheless
306     next
307     nine
308     ninety
309     no
310     nobody
311     none
312     nonetheless
313     noone
314     nor
315     not
316     nothing
317     now
318     nowhere
319     o
320     of
321     off
322     often
323     on
324     once
325     one
326     one's
327     only
328     onto
329     or
330     other
331     others
332     otherwise
333     our
334     ours
335     ourselves
336     out
337     over
338     overall
339     own
340     p
341     per
342     perhaps
343     q
344     r
345     rather
346     recent
347     recently
348     s
349     same
350     seem
351     seemed
352     seeming
353     seems
354     seven
355     seventy
356     several
357     she
358     she'd
359     she'll
360     she's
361     should
362     shouldn't
363     since
364     six
365     sixty
366     so
367     some
368     somehow
369     someone
370     something
371     sometime
372     sometimes
373     somewhere
374     still
375     stop
376     such
377     t
378     taking
379     ten
380     than
381     that
382     that'll
383     that's
384     that've
385     the
386     their
387     them
388     themselves
389     then
390     thence
391     there
392     there'd
393     there'll
394     there're
395     there's
396     there've
397     thereafter
398     thereby
399     therefore
400     therein
401     thereupon
402     these
403     they
404     they'd
405     they'll
406     they're
407     they've
408     thirty
409     this
410     those
411     though
412     thousand
413     three
414     through
415     throughout
416     thru
417     thus
418     to
419     together
420     too
421     toward
422     towards
423     trillion
424     twenty
425     two
426     u
427     under
428     unless
429     unlike
430     unlikely
431     until
432     up
433     upon
434     us
435     used
436     using
437     v
438     very
439     via
440     w
441     was
442     wasn't
443     we
444     we'd
445     we'll
446     we're
447     we've
448     well
449     were
450     weren't
451     what
452     what'll
453     what's
454     what've
455     whatever
456     when
457     whence
458     whenever
459     where
460     where's
461     whereafter
462     whereas
463     whereby
464     wherein
465     whereupon
466     wherever
467     whether
468     which
469     while
470     whither
471     who
472     who'd
473     who'll
474     who's
475     whoever
476     whole
477     whom
478     whomever
479     whose
480     why
481     will
482     with
483     within
484     without
485     won't
486     would
487     wouldn't
488     x
489     y
490     yes
491     yet
492     you
493     you'd
494     you'll
495     you're
496     you've
497     your
498     yours
499     yourself
500     yourselves
501     z
502     # occuring in more than 100 files
503     acc
504     accent
505     accents
506     and
507     are
508     bell
509     can
510     character
511     corrections
512     crt
513     daisy
514     dash
515     date
516     defined
517     definitions
518     description
519     devices
520     diablo
521     dummy
522     factors
523     following
524     font
525     for
526     from
527     fudge
528     give
529     have
530     header
531     holds
532     log
533     logo
534     low
535     lpr
536     mark
537     name
538     nroff
539     out
540     output
541     pitch
542     put
543     rcsfile
544     reference
545     resolution
546     revision
547     see
548     set
549     simple
550     smi
551     some
552     string
553     synopsis
554     system
555     that
556     the
557     this
558     translation
559     troff
560     typewriter
561     ucb
562     unbreakable
563     use
564     used
565     user
566     vroff
567     wheel
568     will
569     with
570     you
571     __END__
572     # Below is the stub of documentation for your module. You better edit it!
573    
574     =head1 NAME
575    
576     WAIT::Filter - Perl extension providing the basic freeWAIS-sf reduction functions
577    
578     =head1 SYNOPSIS
579    
580 ulpfr 13 use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc
581     isotr disotr stop grundform utf8iso);
582 ulpfr 10
583 ulpfr 13 $stem = Stem($word);
584     $scode = Soundex($word);
585     $pcode = Phonix($word);
586     $lword = isolc($word);
587 ulpfr 10 disolc($word);
588 ulpfr 13 $uword = isouc($word);
589 ulpfr 10 disouc($word);
590 ulpfr 13 $trword = isotr($word);
591     disotr($word);
592     $word = stop($word);
593     $word = grundform($word);
594 ulpfr 10
595 ulpfr 13 @words = WAIT::Filter::split($word);
596     @words = WAIT::Filter::split2($word);
597     @words = WAIT::Filter::split3($word);
598     @words = WAIT::Filter::split4($word); # arbitrary numbers allowed
599    
600 ulpfr 10 =head1 DESCRIPTION
601    
602     This tiny modules gives access to the basic reduction functions build
603     in B<freeWAIS-sf>.
604    
605     =over 5
606    
607     =item B<Stem>(I<word>)
608    
609     reduces I<word> using the well know Porter algorithm.
610    
611     AU: Porter, M.F.
612     TI: An Algorithm for Suffix Stripping
613     JT: Program
614     VO: 14
615     PP: 130-137
616     PY: 1980
617     PM: JUL
618    
619     =item B<Soundex>(I<word>)
620    
621    
622     computes the 4 byte B<Soundex> code for I<word>.
623    
624     AU: Gadd, T.N.
625     TI: 'Fisching for Werds'. Phonetic Retrieval of written text in
626     Information Retrieval Systems
627     JT: Program
628     VO: 22
629     NO: 3
630     PP: 222-237
631     PY: 1988
632    
633    
634     =item B<Phonix>(I<word>)
635    
636     computes the 8 byte B<Phonix> code for I<word>.
637    
638     AU: Gadd, T.N.
639     TI: PHONIX: The Algorithm
640     JT: Program
641     VO: 24
642     NO: 4
643     PP: 363-366
644     PY: 1990
645     PM: OCT
646    
647     =head1 ISO charcater case functions
648    
649     There are some additional function which transpose some/most ISOlatin1
650     characters to upper and lower case. To allow for maximum speed there
651     are also I<destructive> versions which change the argument instead of
652     allocating a copy which is returned. For convenience, the destructive
653 ulpfr 13 version also B<returns> the argument. So all of the following is
654 ulpfr 10 valid and C<$word> will contain the lowercased string.
655    
656 ulpfr 13 $word = isolc($word);
657 ulpfr 10 $word = disolc($word);
658     disolc($word);
659    
660     Here are the hardcoded characters which are recognized:
661    
662     abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß
663     ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß
664    
665     =item C<$new = >B<isolc>C<($word)>
666    
667     =item B<disolc>C<($word)>
668    
669     transposes to lower case.
670    
671     =item C<$new = >B<isouc>C<($word)>
672    
673     =item B<disouc>C<($word)>
674    
675     transposes to upper case.
676    
677 ulpfr 13 =item C<$new = >B<isotr>C<($word)>
678    
679     =item B<disotr>C<($word)>
680    
681     Remove non-letters according to the above table.
682    
683     =item C<$new = >B<stop>C<($word)>
684    
685     Returns an empty string if $word is a stopword.
686    
687     =item C<$new = >B<grundform>C<($word)>
688    
689     Calls Text::German::reduce
690    
691     =item C<$new = >B<utf8iso>C<($word)>
692    
693     Convert UTF8 encoded strings to ISO-8859-1. WAIT currently is
694     internally based on the Latin1 character set, so if you process
695     anything in a different encoding, you should convert to Latin1 as the
696     first filter.
697    
698     =item split, split2, split3, ...
699    
700     The splitN funtions all take a scalar as input and return a list of
701     words. Split acts just like the perl split(' '). Split2 eliminates all
702     words from the list that are shorter than 2 characters (bytes), split3
703     eliminates those shorter than 3 characters (bytes) and so on.
704    
705 ulpfr 10 =head1 AUTHOR
706    
707     Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>
708    
709     =head1 SEE ALSO
710    
711     perl(1).
712    
713     =cut
714    

Properties

Name Value
cvs2svn:cvs-rev 1.1.1.2

  ViewVC Help
Powered by ViewVC 1.1.26