/[wait]/trunk/lib/WAIT/Filter.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/WAIT/Filter.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 10 by ulpfr, Fri Apr 28 15:40:52 2000 UTC revision 41 by laperla, Mon Nov 13 20:25:49 2000 UTC
# Line 1  Line 1 
1  #                              -*- Mode: Perl -*-  #                              -*- Mode: Cperl -*-
2  # $Basename: Filter.pm $  # $Basename: Filter.pm $
3  # $Revision: 1.7 $  # $Revision: 1.9 $
4  # ITIID           : $ITI$ $Header $__Header$  # ITIID           : $ITI$ $Header $__Header$
5  # Author          : Ulrich Pfeifer  # Author          : Ulrich Pfeifer
6  # Created On      : Thu Aug 15 18:09:51 1996  # Created On      : Thu Aug 15 18:09:51 1996
# Line 9  Line 9 
9  # Language        : CPerl  # Language        : CPerl
10  # Update Count    : 105  # Update Count    : 105
11  # Status          : Unknown, Use with caution!  # Status          : Unknown, Use with caution!
12  #  #
13  # Copyright (c) 1996-1997, Ulrich Pfeifer  # Copyright (c) 1996-1997, Ulrich Pfeifer
14  #  #
15  package WAIT::Filter;  package WAIT::Filter;
16  require WAIT;  require WAIT;
17  use strict;  use strict;
# Line 31  require Exporter; Line 31  require Exporter;
31                  isouc disouc                  isouc disouc
32                  isotr disotr                  isotr disotr
33                  stop grundform                  stop grundform
34                    utf8iso
35                 );                 );
36    # (most implemented in WAIT.xs)
37    
38  $VERSION = substr q$Revision: 1.7 $, 10;  $VERSION = substr q$Revision: 1.9 $, 10;
39    
40  sub split {  sub split {
41    map split(' ', $_), @_;    map split(' ', $_), @_;
# Line 75  sub AUTOLOAD { Line 77  sub AUTOLOAD {
77        if $@ ne '';        if $@ ne '';
78      *decode_entities = HTML::Entities->can('decode_entities');      *decode_entities = HTML::Entities->can('decode_entities');
79      goto &decode_entities;      goto &decode_entities;
80      } elsif ($func =~ /^d?utf8iso$/) {
81        no strict 'refs';
82        *$func = sub {
83          # Courtesy JHI
84          my $s = shift;
85          $s =~ s{([\xC0-\xDF])([\x80-\xBF])}
86                 {chr(ord($1)<<6&0xC0|ord($2)&0x3F)}eg;
87          $s;
88        };
89        goto \&$func;
90    }    }
91    croak "Your vendor has not defined WAIT::Filter::$func";    Carp::confess "Class WAIT::Filter::$func not found";
92  }  }
93    
94  while (<DATA>) {  while (<DATA>) {
95    chomp;    chomp;
96    last if /__END__/;    last if /__END__/;
97      next if /^\s*#/; # there's a comment
98    $STOP{$_}++;    $STOP{$_}++;
99  }  }
100    
# Line 204  vfor Line 217  vfor
217  former  former
218  formerly  formerly
219  forty  forty
220  found "  found
221  four  four
222  from  from
223  further  further
# Line 568  WAIT::Filter - Perl extension providing Line 581  WAIT::Filter - Perl extension providing
581    
582  =head1 SYNOPSIS  =head1 SYNOPSIS
583    
584    use WAIT::Filter qw(Stem Soundex Phonix isolc isouc disolc disouc);    use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc
585                          isotr disotr stop grundform);
586    
587    $stem  = Stem($word);    $stem   = Stem($word);
588    $scode = Soundex($word);    $scode  = Soundex($word);
589    $pcode = Phonix($word);    $pcode  = Phonix($word);
590    $lword = isolc($word);    $lword  = isolc($word);
   $uword = isouc($word);  
591    disolc($word);    disolc($word);
592      $uword  = isouc($word);
593    disouc($word);    disouc($word);
594      $trword = isotr($word);
595      disotr($word);
596      $word   = stop($word);
597      $word   = grundform($word);
598    
599      @words = WAIT::Filter::split($word);
600      @words = WAIT::Filter::split2($word);
601      @words = WAIT::Filter::split3($word);
602      @words = WAIT::Filter::split4($word); # arbitrary numbers allowed
603    
604  =head1 DESCRIPTION  =head1 DESCRIPTION
605    
# Line 631  There are some additional function which Line 654  There are some additional function which
654  characters to upper and lower case. To allow for maximum speed there  characters to upper and lower case. To allow for maximum speed there
655  are also I<destructive> versions which change the argument instead of  are also I<destructive> versions which change the argument instead of
656  allocating a copy which is returned. For convenience, the destructive  allocating a copy which is returned. For convenience, the destructive
657  version also B<returns> the argument. So both of the following is  version also B<returns> the argument. So all of the following is
658  valid and C<$word> will contain the lowercased string.  valid and C<$word> will contain the lowercased string.
659    
660      $word = isolc($word);
661    $word = disolc($word);    $word = disolc($word);
662    disolc($word);    disolc($word);
     
663    
664  Here are the hardcoded characters which are recognized:  Here are the hardcoded characters which are recognized:
665    
# Line 655  transposes to lower case. Line 678  transposes to lower case.
678    
679  transposes to upper case.  transposes to upper case.
680    
681    =item C<$new = >B<isotr>C<($word)>
682    
683    =item  B<disotr>C<($word)>
684    
685    Remove non-letters according to the above table.
686    
687    =item C<$new = >B<stop>C<($word)>
688    
689    Returns an empty string if $word is a stopword.
690    
691    =item C<$new = >B<grundform>C<($word)>
692    
693    Calls Text::German::reduce
694    
695    =item C<$new = >B<utf8iso>C<($word)>
696    
697    Deprecated due to flux in perl versions between 5.005 and 5.8. The
698    function converts UTF8 encoded strings to ISO-8859-1. WAIT is
699    internally still based on the Latin1 character set, so if you process
700    anything in a different encoding, you should convert to Latin1 as the
701    first filter or refrain from using the iso-latin-1 based filter
702    functions. It is recommended that you use your own converter based on
703    the perl version you're using.
704    
705    =item split, split2, split3, ...
706    
707    The splitN funtions all take a scalar as input and return a list of
708    words. Split acts just like the perl split(' '). Split2 eliminates all
709    words from the list that are shorter than 2 characters (bytes), split3
710    eliminates those shorter than 3 characters (bytes) and so on.
711    
712  =head1 AUTHOR  =head1 AUTHOR
713    
714  Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>  Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>

Legend:
Removed from v.10  
changed lines
  Added in v.41

  ViewVC Help
Powered by ViewVC 1.1.26