/[wait]/branches/CPAN/lib/WAIT/Filter.pm
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /branches/CPAN/lib/WAIT/Filter.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 12 by unknown, Fri Apr 28 15:41:10 2000 UTC revision 13 by ulpfr, Fri Apr 28 15:42:44 2000 UTC
# Line 1  Line 1 
1  #                              -*- Mode: Perl -*-  #                              -*- Mode: Cperl -*-
2  # $Basename: Filter.pm $  # $Basename: Filter.pm $
3  # $Revision: 1.7 $  # $Revision: 1.8 $
4  # ITIID           : $ITI$ $Header $__Header$  # ITIID           : $ITI$ $Header $__Header$
5  # Author          : Ulrich Pfeifer  # Author          : Ulrich Pfeifer
6  # Created On      : Thu Aug 15 18:09:51 1996  # Created On      : Thu Aug 15 18:09:51 1996
# Line 9  Line 9 
9  # Language        : CPerl  # Language        : CPerl
10  # Update Count    : 105  # Update Count    : 105
11  # Status          : Unknown, Use with caution!  # Status          : Unknown, Use with caution!
12  #  #
13  # Copyright (c) 1996-1997, Ulrich Pfeifer  # Copyright (c) 1996-1997, Ulrich Pfeifer
14  #  #
15  package WAIT::Filter;  package WAIT::Filter;
16  require WAIT;  require WAIT;
17  use strict;  use strict;
# Line 31  require Exporter; Line 31  require Exporter;
31                  isouc disouc                  isouc disouc
32                  isotr disotr                  isotr disotr
33                  stop grundform                  stop grundform
34                    utf8iso
35                 );                 );
36    # (most implemented in WAIT.xs)
37    
38  $VERSION = substr q$Revision: 1.7 $, 10;  $VERSION = substr q$Revision: 1.8 $, 10;
39    
40  sub split {  sub split {
41    map split(' ', $_), @_;    map split(' ', $_), @_;
# Line 75  sub AUTOLOAD { Line 77  sub AUTOLOAD {
77        if $@ ne '';        if $@ ne '';
78      *decode_entities = HTML::Entities->can('decode_entities');      *decode_entities = HTML::Entities->can('decode_entities');
79      goto &decode_entities;      goto &decode_entities;
80      } elsif ($func =~ /^d?utf8iso$/) {
81        require WAIT::Filter::utf8iso;
82        croak "Your perl version must at least be 5.00556 to use '$func'"
83            if $] < 5.00556;
84        no strict 'refs';
85        *$func = \&{"WAIT::Filter::utf8iso::$func"};
86        goto &utf8iso;
87    }    }
88    croak "Your vendor has not defined WAIT::Filter::$func";    Carp::confess "Class WAIT::Filter::$func not found";
89  }  }
90    
91  while (<DATA>) {  while (<DATA>) {
# Line 204  vfor Line 213  vfor
213  former  former
214  formerly  formerly
215  forty  forty
216  found "  found
217  four  four
218  from  from
219  further  further
# Line 568  WAIT::Filter - Perl extension providing Line 577  WAIT::Filter - Perl extension providing
577    
578  =head1 SYNOPSIS  =head1 SYNOPSIS
579    
580    use WAIT::Filter qw(Stem Soundex Phonix isolc isouc disolc disouc);    use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc
581                          isotr disotr stop grundform utf8iso);
582    
583    $stem  = Stem($word);    $stem   = Stem($word);
584    $scode = Soundex($word);    $scode  = Soundex($word);
585    $pcode = Phonix($word);    $pcode  = Phonix($word);
586    $lword = isolc($word);    $lword  = isolc($word);
   $uword = isouc($word);  
587    disolc($word);    disolc($word);
588      $uword  = isouc($word);
589    disouc($word);    disouc($word);
590      $trword = isotr($word);
591      disotr($word);
592      $word   = stop($word);
593      $word   = grundform($word);
594    
595      @words = WAIT::Filter::split($word);
596      @words = WAIT::Filter::split2($word);
597      @words = WAIT::Filter::split3($word);
598      @words = WAIT::Filter::split4($word); # arbitrary numbers allowed
599    
600  =head1 DESCRIPTION  =head1 DESCRIPTION
601    
# Line 631  There are some additional function which Line 650  There are some additional function which
650  characters to upper and lower case. To allow for maximum speed there  characters to upper and lower case. To allow for maximum speed there
651  are also I<destructive> versions which change the argument instead of  are also I<destructive> versions which change the argument instead of
652  allocating a copy which is returned. For convenience, the destructive  allocating a copy which is returned. For convenience, the destructive
653  version also B<returns> the argument. So both of the following are  version also B<returns> the argument. So all of the following are
654  valid and C<$word> will contain the lowercased string.  valid and C<$word> will contain the lowercased string.
655    
656      $word = isolc($word);
657    $word = disolc($word);    $word = disolc($word);
658    disolc($word);    disolc($word);
     
659    
660  Here are the hardcoded characters which are recognized:  Here are the hardcoded characters which are recognized:
661    
# Line 655  transposes to lower case. Line 674  transposes to lower case.
674    
675  transposes to upper case.  transposes to upper case.
676    
677    =item C<$new = >B<isotr>C<($word)>
678    
679    =item  B<disotr>C<($word)>
680    
681    Remove non-letters according to the above table.
682    
683    =item C<$new = >B<stop>C<($word)>
684    
685    Returns an empty string if $word is a stopword.
686    
687    =item C<$new = >B<grundform>C<($word)>
688    
689    Calls Text::German::reduce
690    
691    =item C<$new = >B<utf8iso>C<($word)>
692    
693    Convert UTF-8 encoded strings to ISO-8859-1. WAIT is currently
694    based internally on the Latin1 character set, so if you process
695    anything in a different encoding, you should convert it to Latin1 as
696    the first filter.
697    
698    =item split, split2, split3, ...
699    
700    The splitN functions all take a scalar as input and return a list of
701    words. Split acts just like Perl's split(' '). Split2 eliminates all
702    words from the list that are shorter than 2 characters (bytes), split3
703    eliminates those shorter than 3 characters (bytes) and so on.
704    
705  =head1 AUTHOR  =head1 AUTHOR
706    
707  Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>  Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>

Legend:
Removed from v.12  
changed lines
  Added in v.13

  ViewVC Help
Powered by ViewVC 1.1.26