1 |
# -*- Mode: Perl -*- |
# -*- Mode: Cperl -*- |
2 |
# $Basename: Filter.pm $ |
# $Basename: Filter.pm $ |
3 |
# $Revision: 1.7 $ |
# $Revision: 1.9 $ |
4 |
# ITIID : $ITI$ $Header $__Header$ |
# ITIID : $ITI$ $Header $__Header$ |
5 |
# Author : Ulrich Pfeifer |
# Author : Ulrich Pfeifer |
6 |
# Created On : Thu Aug 15 18:09:51 1996 |
# Created On : Thu Aug 15 18:09:51 1996 |
9 |
# Language : CPerl |
# Language : CPerl |
10 |
# Update Count : 105 |
# Update Count : 105 |
11 |
# Status : Unknown, Use with caution! |
# Status : Unknown, Use with caution! |
12 |
# |
# |
13 |
# Copyright (c) 1996-1997, Ulrich Pfeifer |
# Copyright (c) 1996-1997, Ulrich Pfeifer |
14 |
# |
# |
15 |
package WAIT::Filter; |
package WAIT::Filter; |
16 |
require WAIT; |
require WAIT; |
17 |
use strict; |
use strict; |
31 |
isouc disouc |
isouc disouc |
32 |
isotr disotr |
isotr disotr |
33 |
stop grundform |
stop grundform |
34 |
|
utf8iso |
35 |
); |
); |
36 |
|
# (most implemented in WAIT.xs) |
37 |
|
|
38 |
$VERSION = substr q$Revision: 1.7 $, 10; |
$VERSION = substr q$Revision: 1.9 $, 10; |
39 |
|
|
40 |
sub split { |
sub split { |
41 |
map split(' ', $_), @_; |
map split(' ', $_), @_; |
77 |
if $@ ne ''; |
if $@ ne ''; |
78 |
*decode_entities = HTML::Entities->can('decode_entities'); |
*decode_entities = HTML::Entities->can('decode_entities'); |
79 |
goto &decode_entities; |
goto &decode_entities; |
80 |
|
} elsif ($func =~ /^d?utf8iso$/) { |
81 |
|
no strict 'refs'; |
82 |
|
*$func = sub { |
83 |
|
# Courtesy JHI |
84 |
|
my $s = shift; |
85 |
|
$s =~ s{([\xC0-\xDF])([\x80-\xBF])} |
86 |
|
{chr(ord($1)<<6&0xC0|ord($2)&0x3F)}eg; |
87 |
|
$s; |
88 |
|
}; |
89 |
|
goto \&$func; |
90 |
} |
} |
91 |
croak "Your vendor has not defined WAIT::Filter::$func"; |
Carp::confess "Class WAIT::Filter::$func not found"; |
92 |
} |
} |
93 |
|
|
94 |
while (<DATA>) { |
while (<DATA>) { |
95 |
chomp; |
chomp; |
96 |
last if /__END__/; |
last if /__END__/; |
97 |
|
next if /^\s*#/; # there's a comment |
98 |
$STOP{$_}++; |
$STOP{$_}++; |
99 |
} |
} |
100 |
|
|
217 |
former |
former |
218 |
formerly |
formerly |
219 |
forty |
forty |
220 |
found " |
found |
221 |
four |
four |
222 |
from |
from |
223 |
further |
further |
581 |
|
|
582 |
=head1 SYNOPSIS |
=head1 SYNOPSIS |
583 |
|
|
584 |
use WAIT::Filter qw(Stem Soundex Phonix isolc isouc disolc disouc); |
use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc |
585 |
|
isotr disotr stop grundform); |
586 |
|
|
587 |
$stem = Stem($word); |
$stem = Stem($word); |
588 |
$scode = Soundex($word); |
$scode = Soundex($word); |
589 |
$pcode = Phonix($word); |
$pcode = Phonix($word); |
590 |
$lword = isolc($word); |
$lword = isolc($word); |
|
$uword = isouc($word); |
|
591 |
disolc($word); |
disolc($word); |
592 |
|
$uword = isouc($word); |
593 |
disouc($word); |
disouc($word); |
594 |
|
$trword = isotr($word); |
595 |
|
disotr($word); |
596 |
|
$word = stop($word); |
597 |
|
$word = grundform($word); |
598 |
|
|
599 |
|
@words = WAIT::Filter::split($word); |
600 |
|
@words = WAIT::Filter::split2($word); |
601 |
|
@words = WAIT::Filter::split3($word); |
602 |
|
@words = WAIT::Filter::split4($word); # arbitrary numbers allowed |
603 |
|
|
604 |
=head1 DESCRIPTION |
=head1 DESCRIPTION |
605 |
|
|
654 |
characters to upper and lower case. To allow for maximum speed there |
characters to upper and lower case. To allow for maximum speed there |
655 |
are also I<destructive> versions which change the argument instead of |
are also I<destructive> versions which change the argument instead of |
656 |
allocating a copy which is returned. For convenience, the destructive |
allocating a copy which is returned. For convenience, the destructive |
657 |
version also B<returns> the argument. So both of the following is |
version also B<returns> the argument. So all of the following is |
658 |
valid and C<$word> will contain the lowercased string. |
valid and C<$word> will contain the lowercased string. |
659 |
|
|
660 |
|
$word = isolc($word); |
661 |
$word = disolc($word); |
$word = disolc($word); |
662 |
disolc($word); |
disolc($word); |
|
|
|
663 |
|
|
664 |
Here are the hardcoded characters which are recognized: |
Here are the hardcoded characters which are recognized: |
665 |
|
|
678 |
|
|
679 |
transposes to upper case. |
transposes to upper case. |
680 |
|
|
681 |
|
=item C<$new = >B<isotr>C<($word)> |
682 |
|
|
683 |
|
=item B<disotr>C<($word)> |
684 |
|
|
685 |
|
Remove non-letters according to the above table. |
686 |
|
|
687 |
|
=item C<$new = >B<stop>C<($word)> |
688 |
|
|
689 |
|
Returns an empty string if $word is a stopword. |
690 |
|
|
691 |
|
=item C<$new = >B<grundform>C<($word)> |
692 |
|
|
693 |
|
Calls Text::German::reduce |
694 |
|
|
695 |
|
=item C<$new = >B<utf8iso>C<($word)> |
696 |
|
|
697 |
|
Deprecated due to flux in perl versions between 5.005 and 5.8. The |
698 |
|
function converts UTF8 encoded strings to ISO-8859-1. WAIT is |
699 |
|
internally still based on the Latin1 character set, so if you process |
700 |
|
anything in a different encoding, you should convert to Latin1 as the |
701 |
|
first filter or refrain from using the iso-latin-1 based filter |
702 |
|
functions. It is recommended that you use your own converter based on |
703 |
|
the perl version you're using. |
704 |
|
|
705 |
|
=item split, split2, split3, ... |
706 |
|
|
707 |
|
The splitN funtions all take a scalar as input and return a list of |
708 |
|
words. Split acts just like the perl split(' '). Split2 eliminates all |
709 |
|
words from the list that are shorter than 2 characters (bytes), split3 |
710 |
|
eliminates those shorter than 3 characters (bytes) and so on. |
711 |
|
|
712 |
=head1 AUTHOR |
=head1 AUTHOR |
713 |
|
|
714 |
Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt> |
Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt> |