/[wait]/trunk/lib/WAIT/Filter.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WAIT/Filter.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 41 - (hide annotations)
Mon Nov 13 20:25:49 2000 UTC (23 years, 5 months ago) by laperla
Original Path: cvs-head/lib/WAIT/Filter.pm
File size: 8995 byte(s)
utf8iso was completely nonsense, returning only true and false.

LockFile::Simple now default-configured to remove stale locks.

1 ulpfr 13 # -*- Mode: Cperl -*-
2 ulpfr 10 # $Basename: Filter.pm $
3 ulpfr 19 # $Revision: 1.9 $
4 ulpfr 10 # ITIID : $ITI$ $Header $__Header$
5     # Author : Ulrich Pfeifer
6     # Created On : Thu Aug 15 18:09:51 1996
7     # Last Modified By: Ulrich Pfeifer
8     # Last Modified On: Sun Nov 22 18:44:46 1998
9     # Language : CPerl
10     # Update Count : 105
11     # Status : Unknown, Use with caution!
12 ulpfr 13 #
13 ulpfr 10 # Copyright (c) 1996-1997, Ulrich Pfeifer
14 ulpfr 13 #
15 ulpfr 10 package WAIT::Filter;
16     require WAIT;
17     use strict;
18     use Carp;
19     use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $AUTOLOAD);
20     use subs qw(grundform);
21    
22     require Exporter;
23    
24     @ISA = qw(Exporter);
25     @EXPORT_OK = qw(
26     Stem
27     Soundex
28     Phonix
29     Metaphone
30     isolc disolc
31     isouc disouc
32     isotr disotr
33     stop grundform
34 laperla 32 utf8iso
35 ulpfr 10 );
36 ulpfr 13 # (most implemented in WAIT.xs)
37 ulpfr 10
38 ulpfr 19 $VERSION = substr q$Revision: 1.9 $, 10;
39 ulpfr 10
40     sub split {
41     map split(' ', $_), @_;
42     }
43    
44     $SPLIT = q[
45     sub splitXXX {
46     grep length($_)>=XXX, map split(' ', $_), @_;
47     }
48     ];
49    
50     sub AUTOLOAD {
51     my $func = $AUTOLOAD; $func =~ s/.*:://;
52    
53     if ($func =~ /split(\d+)/) {
54     my $num = $1;
55     my $split = $SPLIT;
56    
57     $split =~ s/XXX/$num/g;
58     eval $split;
59     if ($@ eq '') {
60     goto &$AUTOLOAD;
61     }
62     } elsif ($func eq 'grundform') {
63     eval {require Text::German;};
64     croak "You must have Text::German to use 'grundform'"
65     if $@ ne '';
66     *grundform = Text::German->can('reduce');
67     goto &grundform;
68     } elsif ($func eq 'date') {
69     eval {require Time::ParseDate;};
70     croak "You must have Time::ParseDate to use 'date'"
71     if $@ ne '';
72     *date = Time::ParseDate->can('parsedate');
73     goto \&date;
74     } elsif ($func eq 'decode_entities') {
75     eval {require HTML::Entities;};
76     croak "You must have HTML::Entities to use 'date'"
77     if $@ ne '';
78     *decode_entities = HTML::Entities->can('decode_entities');
79     goto &decode_entities;
80 ulpfr 13 } elsif ($func =~ /^d?utf8iso$/) {
81     no strict 'refs';
82 laperla 32 *$func = sub {
83     # Courtesy JHI
84 laperla 41 my $s = shift;
85     $s =~ s{([\xC0-\xDF])([\x80-\xBF])}
86     {chr(ord($1)<<6&0xC0|ord($2)&0x3F)}eg;
87     $s;
88 laperla 32 };
89     goto \&$func;
90 ulpfr 10 }
91 ulpfr 13 Carp::confess "Class WAIT::Filter::$func not found";
92 ulpfr 10 }
93    
94     while (<DATA>) {
95     chomp;
96     last if /__END__/;
97 ulpfr 19 next if /^\s*#/; # there's a comment
98 ulpfr 10 $STOP{$_}++;
99     }
100    
101     sub stop {
102     if (exists $STOP{$_[0]}) {
103     ''
104     } else {
105     $_[0];
106     }
107     }
108    
109     sub gdate {
110     my $date = shift;
111    
112     $date =~ s:(\d+)\.(\d+)\.(d+):$2/$1/$3:;
113     date($date);
114     }
115    
116     1;
117     __DATA__
118     a
119     about
120     above
121     according
122     across
123     actually
124     adj
125     after
126     afterwards
127     again
128     against
129     all
130     almost
131     alone
132     along
133     already
134     also
135     although
136     always
137     among
138     amongst
139     an
140     and
141     another
142     any
143     anyhow
144     anyone
145     anything
146     anywhere
147     are
148     aren't
149     around
150     as
151     at
152     b
153     be
154     became
155     because
156     become
157     becomes
158     becoming
159     been
160     before
161     beforehand
162     begin
163     beginning
164     behind
165     being
166     below
167     beside
168     besides
169     between
170     beyond
171     billion
172     both
173     but
174     by
175     c
176     can
177     can't
178     cannot
179     caption
180     co
181     co.
182     could
183     couldn't
184     d
185     did
186     didn't
187     do
188     does
189     doesn't
190     don't
191     down
192     during
193     e
194     eg
195     eight
196     eighty
197     either
198     else
199     elsewhere
200     end
201     ending
202     enough
203     etc
204     even
205     ever
206     every
207     everyone
208     everything
209     everywhere
210     except
211     f
212     few
213     fifty
214     first
215     five
216     vfor
217     former
218     formerly
219     forty
220 ulpfr 13 found
221 ulpfr 10 four
222     from
223     further
224     g
225     h
226     had
227     has
228     hasn't
229     have
230     haven't
231     he
232     he'd
233     he'll
234     he's
235     hence
236     her
237     here
238     here's
239     hereafter
240     hereby
241     herein
242     hereupon
243     hers
244     herself
245     him
246     himself
247     his
248     how
249     however
250     hundred
251     i
252     i'd
253     i'll
254     i'm
255     i've
256     ie
257     if
258     in
259     inc.
260     indeed
261     instead
262     into
263     is
264     isn't
265     it
266     it's
267     its
268     itself
269     j
270     k
271     l
272     last
273     later
274     latter
275     latterly
276     least
277     less
278     let
279     let's
280     like
281     likely
282     ltd
283     m
284     made
285     make
286     makes
287     many
288     maybe
289     me
290     meantime
291     meanwhile
292     might
293     million
294     miss
295     more
296     moreover
297     most
298     mostly
299     mr
300     mrs
301     much
302     must
303     my
304     myself
305     n
306     namely
307     neither
308     never
309     nevertheless
310     next
311     nine
312     ninety
313     no
314     nobody
315     none
316     nonetheless
317     noone
318     nor
319     not
320     nothing
321     now
322     nowhere
323     o
324     of
325     off
326     often
327     on
328     once
329     one
330     one's
331     only
332     onto
333     or
334     other
335     others
336     otherwise
337     our
338     ours
339     ourselves
340     out
341     over
342     overall
343     own
344     p
345     per
346     perhaps
347     q
348     r
349     rather
350     recent
351     recently
352     s
353     same
354     seem
355     seemed
356     seeming
357     seems
358     seven
359     seventy
360     several
361     she
362     she'd
363     she'll
364     she's
365     should
366     shouldn't
367     since
368     six
369     sixty
370     so
371     some
372     somehow
373     someone
374     something
375     sometime
376     sometimes
377     somewhere
378     still
379     stop
380     such
381     t
382     taking
383     ten
384     than
385     that
386     that'll
387     that's
388     that've
389     the
390     their
391     them
392     themselves
393     then
394     thence
395     there
396     there'd
397     there'll
398     there're
399     there's
400     there've
401     thereafter
402     thereby
403     therefore
404     therein
405     thereupon
406     these
407     they
408     they'd
409     they'll
410     they're
411     they've
412     thirty
413     this
414     those
415     though
416     thousand
417     three
418     through
419     throughout
420     thru
421     thus
422     to
423     together
424     too
425     toward
426     towards
427     trillion
428     twenty
429     two
430     u
431     under
432     unless
433     unlike
434     unlikely
435     until
436     up
437     upon
438     us
439     used
440     using
441     v
442     very
443     via
444     w
445     was
446     wasn't
447     we
448     we'd
449     we'll
450     we're
451     we've
452     well
453     were
454     weren't
455     what
456     what'll
457     what's
458     what've
459     whatever
460     when
461     whence
462     whenever
463     where
464     where's
465     whereafter
466     whereas
467     whereby
468     wherein
469     whereupon
470     wherever
471     whether
472     which
473     while
474     whither
475     who
476     who'd
477     who'll
478     who's
479     whoever
480     whole
481     whom
482     whomever
483     whose
484     why
485     will
486     with
487     within
488     without
489     won't
490     would
491     wouldn't
492     x
493     y
494     yes
495     yet
496     you
497     you'd
498     you'll
499     you're
500     you've
501     your
502     yours
503     yourself
504     yourselves
505     z
506     # occuring in more than 100 files
507     acc
508     accent
509     accents
510     and
511     are
512     bell
513     can
514     character
515     corrections
516     crt
517     daisy
518     dash
519     date
520     defined
521     definitions
522     description
523     devices
524     diablo
525     dummy
526     factors
527     following
528     font
529     for
530     from
531     fudge
532     give
533     have
534     header
535     holds
536     log
537     logo
538     low
539     lpr
540     mark
541     name
542     nroff
543     out
544     output
545     pitch
546     put
547     rcsfile
548     reference
549     resolution
550     revision
551     see
552     set
553     simple
554     smi
555     some
556     string
557     synopsis
558     system
559     that
560     the
561     this
562     translation
563     troff
564     typewriter
565     ucb
566     unbreakable
567     use
568     used
569     user
570     vroff
571     wheel
572     will
573     with
574     you
575     __END__
576     # Below is the stub of documentation for your module. You better edit it!
577    
578     =head1 NAME
579    
580     WAIT::Filter - Perl extension providing the basic freeWAIS-sf reduction functions
581    
582     =head1 SYNOPSIS
583    
584 ulpfr 13 use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc
585 laperla 32 isotr disotr stop grundform);
586 ulpfr 10
587 ulpfr 13 $stem = Stem($word);
588     $scode = Soundex($word);
589     $pcode = Phonix($word);
590     $lword = isolc($word);
591 ulpfr 10 disolc($word);
592 ulpfr 13 $uword = isouc($word);
593 ulpfr 10 disouc($word);
594 ulpfr 13 $trword = isotr($word);
595     disotr($word);
596     $word = stop($word);
597     $word = grundform($word);
598 ulpfr 10
599 ulpfr 13 @words = WAIT::Filter::split($word);
600     @words = WAIT::Filter::split2($word);
601     @words = WAIT::Filter::split3($word);
602     @words = WAIT::Filter::split4($word); # arbitrary numbers allowed
603    
604 ulpfr 10 =head1 DESCRIPTION
605    
606     This tiny modules gives access to the basic reduction functions build
607     in B<freeWAIS-sf>.
608    
609     =over 5
610    
611     =item B<Stem>(I<word>)
612    
613     reduces I<word> using the well know Porter algorithm.
614    
615     AU: Porter, M.F.
616     TI: An Algorithm for Suffix Stripping
617     JT: Program
618     VO: 14
619     PP: 130-137
620     PY: 1980
621     PM: JUL
622    
623     =item B<Soundex>(I<word>)
624    
625    
626     computes the 4 byte B<Soundex> code for I<word>.
627    
628     AU: Gadd, T.N.
629     TI: 'Fisching for Werds'. Phonetic Retrieval of written text in
630     Information Retrieval Systems
631     JT: Program
632     VO: 22
633     NO: 3
634     PP: 222-237
635     PY: 1988
636    
637    
638     =item B<Phonix>(I<word>)
639    
640     computes the 8 byte B<Phonix> code for I<word>.
641    
642     AU: Gadd, T.N.
643     TI: PHONIX: The Algorithm
644     JT: Program
645     VO: 24
646     NO: 4
647     PP: 363-366
648     PY: 1990
649     PM: OCT
650    
651     =head1 ISO charcater case functions
652    
653     There are some additional function which transpose some/most ISOlatin1
654     characters to upper and lower case. To allow for maximum speed there
655     are also I<destructive> versions which change the argument instead of
656     allocating a copy which is returned. For convenience, the destructive
657 ulpfr 13 version also B<returns> the argument. So all of the following is
658 ulpfr 10 valid and C<$word> will contain the lowercased string.
659    
660 ulpfr 13 $word = isolc($word);
661 ulpfr 10 $word = disolc($word);
662     disolc($word);
663    
664     Here are the hardcoded characters which are recognized:
665    
666     abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß
667     ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß
668    
669     =item C<$new = >B<isolc>C<($word)>
670    
671     =item B<disolc>C<($word)>
672    
673     transposes to lower case.
674    
675     =item C<$new = >B<isouc>C<($word)>
676    
677     =item B<disouc>C<($word)>
678    
679     transposes to upper case.
680    
681 ulpfr 13 =item C<$new = >B<isotr>C<($word)>
682    
683     =item B<disotr>C<($word)>
684    
685     Remove non-letters according to the above table.
686    
687     =item C<$new = >B<stop>C<($word)>
688    
689     Returns an empty string if $word is a stopword.
690    
691     =item C<$new = >B<grundform>C<($word)>
692    
693     Calls Text::German::reduce
694    
695     =item C<$new = >B<utf8iso>C<($word)>
696    
697 laperla 32 Deprecated due to flux in perl versions between 5.005 and 5.8. The
698     function converts UTF8 encoded strings to ISO-8859-1. WAIT is
699     internally still based on the Latin1 character set, so if you process
700 ulpfr 13 anything in a different encoding, you should convert to Latin1 as the
701 laperla 32 first filter or refrain from using the iso-latin-1 based filter
702     functions. It is recommended that you use your own converter based on
703     the perl version you're using.
704 ulpfr 13
705     =item split, split2, split3, ...
706    
707     The splitN funtions all take a scalar as input and return a list of
708     words. Split acts just like the perl split(' '). Split2 eliminates all
709     words from the list that are shorter than 2 characters (bytes), split3
710     eliminates those shorter than 3 characters (bytes) and so on.
711    
712 ulpfr 10 =head1 AUTHOR
713    
714     Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>
715    
716     =head1 SEE ALSO
717    
718     perl(1).
719    
720     =cut
721    

Properties

Name Value
cvs2svn:cvs-rev 1.3

  ViewVC Help
Powered by ViewVC 1.1.26