/[wait]/trunk/lib/WAIT/Filter.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WAIT/Filter.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 118 - (hide annotations)
Fri Jul 15 18:59:10 2005 UTC (18 years, 9 months ago) by dpavlin
File size: 9845 byte(s)
some rather old changes from 2004-05-28

1 ulpfr 13 # -*- Mode: Cperl -*-
2 ulpfr 10 # $Basename: Filter.pm $
3 ulpfr 19 # $Revision: 1.9 $
4 ulpfr 10 # ITIID : $ITI$ $Header $__Header$
5     # Author : Ulrich Pfeifer
6     # Created On : Thu Aug 15 18:09:51 1996
7     # Last Modified By: Ulrich Pfeifer
8     # Last Modified On: Sun Nov 22 18:44:46 1998
9     # Language : CPerl
10     # Update Count : 105
11     # Status : Unknown, Use with caution!
12 ulpfr 13 #
13 ulpfr 10 # Copyright (c) 1996-1997, Ulrich Pfeifer
14 ulpfr 13 #
15 ulpfr 10 package WAIT::Filter;
16     require WAIT;
17     use strict;
18     use Carp;
19 dpavlin 118 use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $UNAC $ICONV $AUTOLOAD);
20 ulpfr 10 use subs qw(grundform);
21    
22 dpavlin 118 use Text::Unaccent;
23     use Text::Iconv;
24    
25 ulpfr 10 require Exporter;
26    
27     @ISA = qw(Exporter);
28     @EXPORT_OK = qw(
29     Stem
30     Soundex
31     Phonix
32     Metaphone
33     isolc disolc
34     isouc disouc
35     isotr disotr
36     stop grundform
37 laperla 32 utf8iso
38 ulpfr 10 );
39 ulpfr 13 # (most implemented in WAIT.xs)
40 ulpfr 10
41 ulpfr 19 $VERSION = substr q$Revision: 1.9 $, 10;
42 ulpfr 10
43     sub split {
44     map split(' ', $_), @_;
45     }
46    
47     $SPLIT = q[
48     sub splitXXX {
49     grep length($_)>=XXX, map split(' ', $_), @_;
50     }
51     ];
52    
53 dpavlin 118 $UNAC = q[
54     sub unac_CHARSET {
55     map split(' ',unac_string('CHARSET', $_) || $_), @_;
56     }
57     ];
58    
59     my $iconv;
60    
61     $ICONV = q[
62     sub iconv_CHARSETfrom_CHARSETto {
63     my $ic = $iconv->{'CHARSETfrom_CHARSETto'});
64     $ic ||= $iconv->{'CHARSETfrom_CHARSETto'} = Text::Iconv->new('CHARSETfrom','CHARSETto');
65     map split(' ',$ic->convert($_) || $_), @_;
66     }
67     ];
68    
69    
70 ulpfr 10 sub AUTOLOAD {
71     my $func = $AUTOLOAD; $func =~ s/.*:://;
72    
73     if ($func =~ /split(\d+)/) {
74     my $num = $1;
75     my $split = $SPLIT;
76    
77     $split =~ s/XXX/$num/g;
78     eval $split;
79     if ($@ eq '') {
80     goto &$AUTOLOAD;
81     }
82     } elsif ($func eq 'grundform') {
83     eval {require Text::German;};
84     croak "You must have Text::German to use 'grundform'"
85     if $@ ne '';
86     *grundform = Text::German->can('reduce');
87     goto &grundform;
88     } elsif ($func eq 'date') {
89     eval {require Time::ParseDate;};
90     croak "You must have Time::ParseDate to use 'date'"
91     if $@ ne '';
92     *date = Time::ParseDate->can('parsedate');
93     goto \&date;
94     } elsif ($func eq 'decode_entities') {
95     eval {require HTML::Entities;};
96 dpavlin 118 croak "You must have HTML::Entities to use 'decode_entities'"
97 ulpfr 10 if $@ ne '';
98     *decode_entities = HTML::Entities->can('decode_entities');
99     goto &decode_entities;
100 ulpfr 13 } elsif ($func =~ /^d?utf8iso$/) {
101     no strict 'refs';
102 laperla 32 *$func = sub {
103     # Courtesy JHI
104 laperla 41 my $s = shift;
105     $s =~ s{([\xC0-\xDF])([\x80-\xBF])}
106     {chr(ord($1)<<6&0xC0|ord($2)&0x3F)}eg;
107     $s;
108 laperla 32 };
109     goto \&$func;
110 dpavlin 118 } elsif ($func =~ /unac_(.+)/) {
111     my $charset = $1;
112     my $unac = $UNAC;
113     $unac =~ s/CHARSET/$charset/g;
114     print "### $unac ###\n";
115     eval $unac;
116     if ($@ eq '') {
117     goto &$func;
118     }
119     } elsif ($func =~ /iconv_([^_]+)_([^_]+)/) {
120     my ($cf,$ct) = ($1,$2);
121     my $iconv = $ICONV;
122     print "### $cf -> $ct\n";
123     $iconv =~ s/CHARSETfrom/$cf/gs;
124     $iconv =~ s/CHARSETto/$ct/gs;
125     print "### $iconv ###\n";
126     eval $iconv;
127     if ($@ eq '') {
128     goto &$func;
129     }
130 ulpfr 10 }
131 ulpfr 13 Carp::confess "Class WAIT::Filter::$func not found";
132 ulpfr 10 }
133    
134     while (<DATA>) {
135     chomp;
136     last if /__END__/;
137 ulpfr 19 next if /^\s*#/; # there's a comment
138 ulpfr 10 $STOP{$_}++;
139     }
140 laperla 50 close DATA;
141 ulpfr 10
142     sub stop {
143     if (exists $STOP{$_[0]}) {
144     ''
145     } else {
146     $_[0];
147     }
148     }
149    
150     sub gdate {
151     my $date = shift;
152    
153     $date =~ s:(\d+)\.(\d+)\.(d+):$2/$1/$3:;
154     date($date);
155     }
156    
157     1;
158 dpavlin 109
159 ulpfr 10 __DATA__
160     a
161     about
162     above
163     according
164     across
165     actually
166     adj
167     after
168     afterwards
169     again
170     against
171     all
172     almost
173     alone
174     along
175     already
176     also
177     although
178     always
179     among
180     amongst
181     an
182     and
183     another
184     any
185     anyhow
186     anyone
187     anything
188     anywhere
189     are
190     aren't
191     around
192     as
193     at
194     b
195     be
196     became
197     because
198     become
199     becomes
200     becoming
201     been
202     before
203     beforehand
204     begin
205     beginning
206     behind
207     being
208     below
209     beside
210     besides
211     between
212     beyond
213     billion
214     both
215     but
216     by
217     c
218     can
219     can't
220     cannot
221     caption
222     co
223     co.
224     could
225     couldn't
226     d
227     did
228     didn't
229     do
230     does
231     doesn't
232     don't
233     down
234     during
235     e
236     eg
237     eight
238     eighty
239     either
240     else
241     elsewhere
242     end
243     ending
244     enough
245     etc
246     even
247     ever
248     every
249     everyone
250     everything
251     everywhere
252     except
253     f
254     few
255     fifty
256     first
257     five
258     vfor
259     former
260     formerly
261     forty
262 ulpfr 13 found
263 ulpfr 10 four
264     from
265     further
266     g
267     h
268     had
269     has
270     hasn't
271     have
272     haven't
273     he
274     he'd
275     he'll
276     he's
277     hence
278     her
279     here
280     here's
281     hereafter
282     hereby
283     herein
284     hereupon
285     hers
286     herself
287     him
288     himself
289     his
290     how
291     however
292     hundred
293     i
294     i'd
295     i'll
296     i'm
297     i've
298     ie
299     if
300     in
301     inc.
302     indeed
303     instead
304     into
305     is
306     isn't
307     it
308     it's
309     its
310     itself
311     j
312     k
313     l
314     last
315     later
316     latter
317     latterly
318     least
319     less
320     let
321     let's
322     like
323     likely
324     ltd
325     m
326     made
327     make
328     makes
329     many
330     maybe
331     me
332     meantime
333     meanwhile
334     might
335     million
336     miss
337     more
338     moreover
339     most
340     mostly
341     mr
342     mrs
343     much
344     must
345     my
346     myself
347     n
348     namely
349     neither
350     never
351     nevertheless
352     next
353     nine
354     ninety
355     no
356     nobody
357     none
358     nonetheless
359     noone
360     nor
361     not
362     nothing
363     now
364     nowhere
365     o
366     of
367     off
368     often
369     on
370     once
371     one
372     one's
373     only
374     onto
375     or
376     other
377     others
378     otherwise
379     our
380     ours
381     ourselves
382     out
383     over
384     overall
385     own
386     p
387     per
388     perhaps
389     q
390     r
391     rather
392     recent
393     recently
394     s
395     same
396     seem
397     seemed
398     seeming
399     seems
400     seven
401     seventy
402     several
403     she
404     she'd
405     she'll
406     she's
407     should
408     shouldn't
409     since
410     six
411     sixty
412     so
413     some
414     somehow
415     someone
416     something
417     sometime
418     sometimes
419     somewhere
420     still
421     stop
422     such
423     t
424     taking
425     ten
426     than
427     that
428     that'll
429     that's
430     that've
431     the
432     their
433     them
434     themselves
435     then
436     thence
437     there
438     there'd
439     there'll
440     there're
441     there's
442     there've
443     thereafter
444     thereby
445     therefore
446     therein
447     thereupon
448     these
449     they
450     they'd
451     they'll
452     they're
453     they've
454     thirty
455     this
456     those
457     though
458     thousand
459     three
460     through
461     throughout
462     thru
463     thus
464     to
465     together
466     too
467     toward
468     towards
469     trillion
470     twenty
471     two
472     u
473     under
474     unless
475     unlike
476     unlikely
477     until
478     up
479     upon
480     us
481     used
482     using
483     v
484     very
485     via
486     w
487     was
488     wasn't
489     we
490     we'd
491     we'll
492     we're
493     we've
494     well
495     were
496     weren't
497     what
498     what'll
499     what's
500     what've
501     whatever
502     when
503     whence
504     whenever
505     where
506     where's
507     whereafter
508     whereas
509     whereby
510     wherein
511     whereupon
512     wherever
513     whether
514     which
515     while
516     whither
517     who
518     who'd
519     who'll
520     who's
521     whoever
522     whole
523     whom
524     whomever
525     whose
526     why
527     will
528     with
529     within
530     without
531     won't
532     would
533     wouldn't
534     x
535     y
536     yes
537     yet
538     you
539     you'd
540     you'll
541     you're
542     you've
543     your
544     yours
545     yourself
546     yourselves
547     z
548     # occuring in more than 100 files
549     acc
550     accent
551     accents
552     and
553     are
554     bell
555     can
556     character
557     corrections
558     crt
559     daisy
560     dash
561     date
562     defined
563     definitions
564     description
565     devices
566     diablo
567     dummy
568     factors
569     following
570     font
571     for
572     from
573     fudge
574     give
575     have
576     header
577     holds
578     log
579     logo
580     low
581     lpr
582     mark
583     name
584     nroff
585     out
586     output
587     pitch
588     put
589     rcsfile
590     reference
591     resolution
592     revision
593     see
594     set
595     simple
596     smi
597     some
598     string
599     synopsis
600     system
601     that
602     the
603     this
604     translation
605     troff
606     typewriter
607     ucb
608     unbreakable
609     use
610     used
611     user
612     vroff
613     wheel
614     will
615     with
616     you
617     __END__
618    
619     =head1 NAME
620    
621     WAIT::Filter - Perl extension providing the basic freeWAIS-sf reduction functions
622    
623     =head1 SYNOPSIS
624    
625 ulpfr 13 use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc
626 laperla 32 isotr disotr stop grundform);
627 ulpfr 10
628 ulpfr 13 $stem = Stem($word);
629     $scode = Soundex($word);
630     $pcode = Phonix($word);
631     $lword = isolc($word);
632 ulpfr 10 disolc($word);
633 ulpfr 13 $uword = isouc($word);
634 ulpfr 10 disouc($word);
635 ulpfr 13 $trword = isotr($word);
636     disotr($word);
637     $word = stop($word);
638     $word = grundform($word);
639 ulpfr 10
640 ulpfr 13 @words = WAIT::Filter::split($word);
641     @words = WAIT::Filter::split2($word);
642     @words = WAIT::Filter::split3($word);
643     @words = WAIT::Filter::split4($word); # arbitrary numbers allowed
644    
645 ulpfr 10 =head1 DESCRIPTION
646    
647     This tiny modules gives access to the basic reduction functions build
648     in B<freeWAIS-sf>.
649    
650     =over 5
651    
652     =item B<Stem>(I<word>)
653    
654     reduces I<word> using the well know Porter algorithm.
655    
656     AU: Porter, M.F.
657     TI: An Algorithm for Suffix Stripping
658     JT: Program
659     VO: 14
660     PP: 130-137
661     PY: 1980
662     PM: JUL
663    
664     =item B<Soundex>(I<word>)
665    
666    
667     computes the 4 byte B<Soundex> code for I<word>.
668    
669     AU: Gadd, T.N.
670     TI: 'Fisching for Werds'. Phonetic Retrieval of written text in
671     Information Retrieval Systems
672     JT: Program
673     VO: 22
674     NO: 3
675     PP: 222-237
676     PY: 1988
677    
678    
679     =item B<Phonix>(I<word>)
680    
681     computes the 8 byte B<Phonix> code for I<word>.
682    
683     AU: Gadd, T.N.
684     TI: PHONIX: The Algorithm
685     JT: Program
686     VO: 24
687     NO: 4
688     PP: 363-366
689     PY: 1990
690     PM: OCT
691    
692 dpavlin 109 =back
693    
694 ulpfr 10 =head1 ISO charcater case functions
695    
696     There are some additional function which transpose some/most ISOlatin1
697     characters to upper and lower case. To allow for maximum speed there
698     are also I<destructive> versions which change the argument instead of
699     allocating a copy which is returned. For convenience, the destructive
700 ulpfr 13 version also B<returns> the argument. So all of the following is
701 ulpfr 10 valid and C<$word> will contain the lowercased string.
702    
703 ulpfr 13 $word = isolc($word);
704 ulpfr 10 $word = disolc($word);
705     disolc($word);
706    
707     Here are the hardcoded characters which are recognized:
708    
709     abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß
710     ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß
711    
712 dpavlin 109 =over 5
713    
714 ulpfr 10 =item C<$new = >B<isolc>C<($word)>
715    
716     =item B<disolc>C<($word)>
717    
718     transposes to lower case.
719    
720     =item C<$new = >B<isouc>C<($word)>
721    
722     =item B<disouc>C<($word)>
723    
724     transposes to upper case.
725    
726 ulpfr 13 =item C<$new = >B<isotr>C<($word)>
727    
728     =item B<disotr>C<($word)>
729    
730     Remove non-letters according to the above table.
731    
732     =item C<$new = >B<stop>C<($word)>
733    
734     Returns an empty string if $word is a stopword.
735    
736     =item C<$new = >B<grundform>C<($word)>
737    
738     Calls Text::German::reduce
739    
740     =item C<$new = >B<utf8iso>C<($word)>
741    
742 laperla 32 Deprecated due to flux in perl versions between 5.005 and 5.8. The
743     function converts UTF8 encoded strings to ISO-8859-1. WAIT is
744     internally still based on the Latin1 character set, so if you process
745 ulpfr 13 anything in a different encoding, you should convert to Latin1 as the
746 laperla 32 first filter or refrain from using the iso-latin-1 based filter
747     functions. It is recommended that you use your own converter based on
748     the perl version you're using.
749 ulpfr 13
750     =item split, split2, split3, ...
751    
752     The splitN funtions all take a scalar as input and return a list of
753     words. Split acts just like the perl split(' '). Split2 eliminates all
754     words from the list that are shorter than 2 characters (bytes), split3
755     eliminates those shorter than 3 characters (bytes) and so on.
756    
757 dpavlin 109 =back
758    
759 ulpfr 10 =head1 AUTHOR
760    
761     Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>
762    
763     =head1 SEE ALSO
764    
765     perl(1).
766    
767     =cut
768    

Properties

Name Value
cvs2svn:cvs-rev 1.4

  ViewVC Help
Powered by ViewVC 1.1.26