/[wait]/trunk/lib/WAIT/Filter.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WAIT/Filter.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 50 - (hide annotations)
Sat Mar 3 11:24:59 2001 UTC (23 years, 2 months ago) by laperla
Original Path: cvs-head/lib/WAIT/Filter.pm
File size: 9007 byte(s)
close DATA to free filehandle

1 ulpfr 13 # -*- Mode: Cperl -*-
2 ulpfr 10 # $Basename: Filter.pm $
3 ulpfr 19 # $Revision: 1.9 $
4 ulpfr 10 # ITIID : $ITI$ $Header $__Header$
5     # Author : Ulrich Pfeifer
6     # Created On : Thu Aug 15 18:09:51 1996
7     # Last Modified By: Ulrich Pfeifer
8     # Last Modified On: Sun Nov 22 18:44:46 1998
9     # Language : CPerl
10     # Update Count : 105
11     # Status : Unknown, Use with caution!
12 ulpfr 13 #
13 ulpfr 10 # Copyright (c) 1996-1997, Ulrich Pfeifer
14 ulpfr 13 #
15 ulpfr 10 package WAIT::Filter;
16     require WAIT;
17     use strict;
18     use Carp;
19     use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $AUTOLOAD);
20     use subs qw(grundform);
21    
22     require Exporter;
23    
24     @ISA = qw(Exporter);
25     @EXPORT_OK = qw(
26     Stem
27     Soundex
28     Phonix
29     Metaphone
30     isolc disolc
31     isouc disouc
32     isotr disotr
33     stop grundform
34 laperla 32 utf8iso
35 ulpfr 10 );
36 ulpfr 13 # (most implemented in WAIT.xs)
37 ulpfr 10
38 ulpfr 19 $VERSION = substr q$Revision: 1.9 $, 10;
39 ulpfr 10
40     sub split {
41     map split(' ', $_), @_;
42     }
43    
44     $SPLIT = q[
45     sub splitXXX {
46     grep length($_)>=XXX, map split(' ', $_), @_;
47     }
48     ];
49    
50     sub AUTOLOAD {
51     my $func = $AUTOLOAD; $func =~ s/.*:://;
52    
53     if ($func =~ /split(\d+)/) {
54     my $num = $1;
55     my $split = $SPLIT;
56    
57     $split =~ s/XXX/$num/g;
58     eval $split;
59     if ($@ eq '') {
60     goto &$AUTOLOAD;
61     }
62     } elsif ($func eq 'grundform') {
63     eval {require Text::German;};
64     croak "You must have Text::German to use 'grundform'"
65     if $@ ne '';
66     *grundform = Text::German->can('reduce');
67     goto &grundform;
68     } elsif ($func eq 'date') {
69     eval {require Time::ParseDate;};
70     croak "You must have Time::ParseDate to use 'date'"
71     if $@ ne '';
72     *date = Time::ParseDate->can('parsedate');
73     goto \&date;
74     } elsif ($func eq 'decode_entities') {
75     eval {require HTML::Entities;};
76     croak "You must have HTML::Entities to use 'date'"
77     if $@ ne '';
78     *decode_entities = HTML::Entities->can('decode_entities');
79     goto &decode_entities;
80 ulpfr 13 } elsif ($func =~ /^d?utf8iso$/) {
81     no strict 'refs';
82 laperla 32 *$func = sub {
83     # Courtesy JHI
84 laperla 41 my $s = shift;
85     $s =~ s{([\xC0-\xDF])([\x80-\xBF])}
86     {chr(ord($1)<<6&0xC0|ord($2)&0x3F)}eg;
87     $s;
88 laperla 32 };
89     goto \&$func;
90 ulpfr 10 }
91 ulpfr 13 Carp::confess "Class WAIT::Filter::$func not found";
92 ulpfr 10 }
93    
94     while (<DATA>) {
95     chomp;
96     last if /__END__/;
97 ulpfr 19 next if /^\s*#/; # there's a comment
98 ulpfr 10 $STOP{$_}++;
99     }
100 laperla 50 close DATA;
101 ulpfr 10
102     sub stop {
103     if (exists $STOP{$_[0]}) {
104     ''
105     } else {
106     $_[0];
107     }
108     }
109    
110     sub gdate {
111     my $date = shift;
112    
113     $date =~ s:(\d+)\.(\d+)\.(d+):$2/$1/$3:;
114     date($date);
115     }
116    
117     1;
118     __DATA__
119     a
120     about
121     above
122     according
123     across
124     actually
125     adj
126     after
127     afterwards
128     again
129     against
130     all
131     almost
132     alone
133     along
134     already
135     also
136     although
137     always
138     among
139     amongst
140     an
141     and
142     another
143     any
144     anyhow
145     anyone
146     anything
147     anywhere
148     are
149     aren't
150     around
151     as
152     at
153     b
154     be
155     became
156     because
157     become
158     becomes
159     becoming
160     been
161     before
162     beforehand
163     begin
164     beginning
165     behind
166     being
167     below
168     beside
169     besides
170     between
171     beyond
172     billion
173     both
174     but
175     by
176     c
177     can
178     can't
179     cannot
180     caption
181     co
182     co.
183     could
184     couldn't
185     d
186     did
187     didn't
188     do
189     does
190     doesn't
191     don't
192     down
193     during
194     e
195     eg
196     eight
197     eighty
198     either
199     else
200     elsewhere
201     end
202     ending
203     enough
204     etc
205     even
206     ever
207     every
208     everyone
209     everything
210     everywhere
211     except
212     f
213     few
214     fifty
215     first
216     five
217     vfor
218     former
219     formerly
220     forty
221 ulpfr 13 found
222 ulpfr 10 four
223     from
224     further
225     g
226     h
227     had
228     has
229     hasn't
230     have
231     haven't
232     he
233     he'd
234     he'll
235     he's
236     hence
237     her
238     here
239     here's
240     hereafter
241     hereby
242     herein
243     hereupon
244     hers
245     herself
246     him
247     himself
248     his
249     how
250     however
251     hundred
252     i
253     i'd
254     i'll
255     i'm
256     i've
257     ie
258     if
259     in
260     inc.
261     indeed
262     instead
263     into
264     is
265     isn't
266     it
267     it's
268     its
269     itself
270     j
271     k
272     l
273     last
274     later
275     latter
276     latterly
277     least
278     less
279     let
280     let's
281     like
282     likely
283     ltd
284     m
285     made
286     make
287     makes
288     many
289     maybe
290     me
291     meantime
292     meanwhile
293     might
294     million
295     miss
296     more
297     moreover
298     most
299     mostly
300     mr
301     mrs
302     much
303     must
304     my
305     myself
306     n
307     namely
308     neither
309     never
310     nevertheless
311     next
312     nine
313     ninety
314     no
315     nobody
316     none
317     nonetheless
318     noone
319     nor
320     not
321     nothing
322     now
323     nowhere
324     o
325     of
326     off
327     often
328     on
329     once
330     one
331     one's
332     only
333     onto
334     or
335     other
336     others
337     otherwise
338     our
339     ours
340     ourselves
341     out
342     over
343     overall
344     own
345     p
346     per
347     perhaps
348     q
349     r
350     rather
351     recent
352     recently
353     s
354     same
355     seem
356     seemed
357     seeming
358     seems
359     seven
360     seventy
361     several
362     she
363     she'd
364     she'll
365     she's
366     should
367     shouldn't
368     since
369     six
370     sixty
371     so
372     some
373     somehow
374     someone
375     something
376     sometime
377     sometimes
378     somewhere
379     still
380     stop
381     such
382     t
383     taking
384     ten
385     than
386     that
387     that'll
388     that's
389     that've
390     the
391     their
392     them
393     themselves
394     then
395     thence
396     there
397     there'd
398     there'll
399     there're
400     there's
401     there've
402     thereafter
403     thereby
404     therefore
405     therein
406     thereupon
407     these
408     they
409     they'd
410     they'll
411     they're
412     they've
413     thirty
414     this
415     those
416     though
417     thousand
418     three
419     through
420     throughout
421     thru
422     thus
423     to
424     together
425     too
426     toward
427     towards
428     trillion
429     twenty
430     two
431     u
432     under
433     unless
434     unlike
435     unlikely
436     until
437     up
438     upon
439     us
440     used
441     using
442     v
443     very
444     via
445     w
446     was
447     wasn't
448     we
449     we'd
450     we'll
451     we're
452     we've
453     well
454     were
455     weren't
456     what
457     what'll
458     what's
459     what've
460     whatever
461     when
462     whence
463     whenever
464     where
465     where's
466     whereafter
467     whereas
468     whereby
469     wherein
470     whereupon
471     wherever
472     whether
473     which
474     while
475     whither
476     who
477     who'd
478     who'll
479     who's
480     whoever
481     whole
482     whom
483     whomever
484     whose
485     why
486     will
487     with
488     within
489     without
490     won't
491     would
492     wouldn't
493     x
494     y
495     yes
496     yet
497     you
498     you'd
499     you'll
500     you're
501     you've
502     your
503     yours
504     yourself
505     yourselves
506     z
507     # occuring in more than 100 files
508     acc
509     accent
510     accents
511     and
512     are
513     bell
514     can
515     character
516     corrections
517     crt
518     daisy
519     dash
520     date
521     defined
522     definitions
523     description
524     devices
525     diablo
526     dummy
527     factors
528     following
529     font
530     for
531     from
532     fudge
533     give
534     have
535     header
536     holds
537     log
538     logo
539     low
540     lpr
541     mark
542     name
543     nroff
544     out
545     output
546     pitch
547     put
548     rcsfile
549     reference
550     resolution
551     revision
552     see
553     set
554     simple
555     smi
556     some
557     string
558     synopsis
559     system
560     that
561     the
562     this
563     translation
564     troff
565     typewriter
566     ucb
567     unbreakable
568     use
569     used
570     user
571     vroff
572     wheel
573     will
574     with
575     you
576     __END__
577     # Below is the stub of documentation for your module. You better edit it!
578    
579     =head1 NAME
580    
581     WAIT::Filter - Perl extension providing the basic freeWAIS-sf reduction functions
582    
583     =head1 SYNOPSIS
584    
585 ulpfr 13 use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc
586 laperla 32 isotr disotr stop grundform);
587 ulpfr 10
588 ulpfr 13 $stem = Stem($word);
589     $scode = Soundex($word);
590     $pcode = Phonix($word);
591     $lword = isolc($word);
592 ulpfr 10 disolc($word);
593 ulpfr 13 $uword = isouc($word);
594 ulpfr 10 disouc($word);
595 ulpfr 13 $trword = isotr($word);
596     disotr($word);
597     $word = stop($word);
598     $word = grundform($word);
599 ulpfr 10
600 ulpfr 13 @words = WAIT::Filter::split($word);
601     @words = WAIT::Filter::split2($word);
602     @words = WAIT::Filter::split3($word);
603     @words = WAIT::Filter::split4($word); # arbitrary numbers allowed
604    
605 ulpfr 10 =head1 DESCRIPTION
606    
607     This tiny modules gives access to the basic reduction functions build
608     in B<freeWAIS-sf>.
609    
610     =over 5
611    
612     =item B<Stem>(I<word>)
613    
614     reduces I<word> using the well know Porter algorithm.
615    
616     AU: Porter, M.F.
617     TI: An Algorithm for Suffix Stripping
618     JT: Program
619     VO: 14
620     PP: 130-137
621     PY: 1980
622     PM: JUL
623    
624     =item B<Soundex>(I<word>)
625    
626    
627     computes the 4 byte B<Soundex> code for I<word>.
628    
629     AU: Gadd, T.N.
630     TI: 'Fisching for Werds'. Phonetic Retrieval of written text in
631     Information Retrieval Systems
632     JT: Program
633     VO: 22
634     NO: 3
635     PP: 222-237
636     PY: 1988
637    
638    
639     =item B<Phonix>(I<word>)
640    
641     computes the 8 byte B<Phonix> code for I<word>.
642    
643     AU: Gadd, T.N.
644     TI: PHONIX: The Algorithm
645     JT: Program
646     VO: 24
647     NO: 4
648     PP: 363-366
649     PY: 1990
650     PM: OCT
651    
652     =head1 ISO charcater case functions
653    
654     There are some additional function which transpose some/most ISOlatin1
655     characters to upper and lower case. To allow for maximum speed there
656     are also I<destructive> versions which change the argument instead of
657     allocating a copy which is returned. For convenience, the destructive
658 ulpfr 13 version also B<returns> the argument. So all of the following is
659 ulpfr 10 valid and C<$word> will contain the lowercased string.
660    
661 ulpfr 13 $word = isolc($word);
662 ulpfr 10 $word = disolc($word);
663     disolc($word);
664    
665     Here are the hardcoded characters which are recognized:
666    
667     abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß
668     ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß
669    
670     =item C<$new = >B<isolc>C<($word)>
671    
672     =item B<disolc>C<($word)>
673    
674     transposes to lower case.
675    
676     =item C<$new = >B<isouc>C<($word)>
677    
678     =item B<disouc>C<($word)>
679    
680     transposes to upper case.
681    
682 ulpfr 13 =item C<$new = >B<isotr>C<($word)>
683    
684     =item B<disotr>C<($word)>
685    
686     Remove non-letters according to the above table.
687    
688     =item C<$new = >B<stop>C<($word)>
689    
690     Returns an empty string if $word is a stopword.
691    
692     =item C<$new = >B<grundform>C<($word)>
693    
694     Calls Text::German::reduce
695    
696     =item C<$new = >B<utf8iso>C<($word)>
697    
698 laperla 32 Deprecated due to flux in perl versions between 5.005 and 5.8. The
699     function converts UTF8 encoded strings to ISO-8859-1. WAIT is
700     internally still based on the Latin1 character set, so if you process
701 ulpfr 13 anything in a different encoding, you should convert to Latin1 as the
702 laperla 32 first filter or refrain from using the iso-latin-1 based filter
703     functions. It is recommended that you use your own converter based on
704     the perl version you're using.
705 ulpfr 13
706     =item split, split2, split3, ...
707    
708     The splitN funtions all take a scalar as input and return a list of
709     words. Split acts just like the perl split(' '). Split2 eliminates all
710     words from the list that are shorter than 2 characters (bytes), split3
711     eliminates those shorter than 3 characters (bytes) and so on.
712    
713 ulpfr 10 =head1 AUTHOR
714    
715     Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>
716    
717     =head1 SEE ALSO
718    
719     perl(1).
720    
721     =cut
722    

Properties

Name Value
cvs2svn:cvs-rev 1.4

  ViewVC Help
Powered by ViewVC 1.1.26