lib/WAIT/Filter.pm

#                              -*- Mode: Cperl -*- 
# $Basename: Filter.pm $
# $Revision: 1.9 $
# ITIID           : $ITI$ $Header $__Header$
# Author          : Ulrich Pfeifer
# Created On      : Thu Aug 15 18:09:51 1996
# Last Modified By: Ulrich Pfeifer
# Last Modified On: Sun Nov 22 18:44:46 1998
# Language        : CPerl
# Update Count    : 105
# Status          : Unknown, Use with caution!
#
# Copyright (c) 1996-1997, Ulrich Pfeifer
#
package WAIT::Filter;
require WAIT;
use strict;
use Carp;
use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $AUTOLOAD);
use subs qw(grundform);

require Exporter;

@ISA = qw(Exporter);
@EXPORT_OK = qw(
                Stem
                Soundex
                Phonix
                Metaphone
                isolc disolc
                isouc disouc
                isotr disotr
                stop grundform
                utf8iso
               );
# (most implemented in WAIT.xs)

$VERSION = substr q$Revision: 1.9 $, 10;

sub split {
  map split(' ', $_), @_;
}

$SPLIT = q[
           sub splitXXX {
                          grep length($_)>=XXX, map split(' ', $_), @_;
                         }
          ];

sub AUTOLOAD {
  my $func = $AUTOLOAD; $func =~ s/.*:://;

  if ($func =~ /split(\d+)/) {
    my $num = $1;
    my $split = $SPLIT;

    $split =~ s/XXX/$num/g;
    eval $split;
    if ($@ eq '') {
      goto &$AUTOLOAD;
    }
  } elsif ($func eq 'grundform') {
    eval {require Text::German;};
    croak "You must have Text::German to use 'grundform'"
      if $@ ne '';
    *grundform = Text::German->can('reduce');
    goto &grundform;
  } elsif ($func eq 'date') {
    eval {require Time::ParseDate;};
    croak "You must have Time::ParseDate to use 'date'"
      if $@ ne '';
    *date = Time::ParseDate->can('parsedate');
    goto \&date;
  } elsif ($func eq 'decode_entities') {
    eval {require HTML::Entities;};
    croak "You must have HTML::Entities to use 'date'"
      if $@ ne '';
    *decode_entities = HTML::Entities->can('decode_entities');
    goto &decode_entities;
  } elsif ($func =~ /^d?utf8iso$/) {
    no strict 'refs';
    *$func = sub {
      # Courtesy JHI
      my $s = shift;
      $s =~ s{([\xC0-\xDF])([\x80-\xBF])}
             {chr(ord($1)<<6&0xC0|ord($2)&0x3F)}eg;
      $s;
    };
    goto \&$func;
  }
  Carp::confess "Class WAIT::Filter::$func not found";
}

while (<DATA>) {
  chomp;
  last if /__END__/;
  next if /^\s*#/; # there's a comment
  $STOP{$_}++;
}

sub stop {
  if (exists $STOP{$_[0]}) {
    ''
  } else {
    $_[0];
  }
}

sub gdate {
  my $date = shift;

  $date =~ s:(\d+)\.(\d+)\.(d+):$2/$1/$3:;
  date($date);
}

1;
__DATA__
a
about
above
according
across
actually
adj
after
afterwards
again
against
all
almost
alone
along
already
also
although
always
among
amongst
an
and
another
any
anyhow
anyone
anything
anywhere
are
aren't
around
as
at
b
be
became
because
become
becomes
becoming
been
before
beforehand
begin
beginning
behind
being
below
beside
besides
between
beyond
billion
both
but
by
c
can
can't
cannot
caption
co
co.
could
couldn't
d
did
didn't
do
does
doesn't
don't
down
during
e
eg
eight
eighty
either
else
elsewhere
end
ending
enough
etc
even
ever
every
everyone
everything
everywhere
except
f
few
fifty
first
five
vfor
former
formerly
forty
found
four
from
further
g
h
had
has
hasn't
have
haven't
he
he'd
he'll
he's
hence
her
here
here's
hereafter
hereby
herein
hereupon
hers
herself
him
himself
his
how
however
hundred
i
i'd
i'll
i'm
i've
ie
if
in
inc.
indeed
instead
into
is
isn't
it
it's
its
itself
j
k
l
last
later
latter
latterly
least
less
let
let's
like
likely
ltd
m
made
make
makes
many
maybe
me
meantime
meanwhile
might
million
miss
more
moreover
most
mostly
mr
mrs
much
must
my
myself
n
namely
neither
never
nevertheless
next
nine
ninety
no
nobody
none
nonetheless
noone
nor
not
nothing
now
nowhere
o
of
off
often
on
once
one
one's
only
onto
or
other
others
otherwise
our
ours
ourselves
out
over
overall
own
p
per
perhaps
q
r
rather
recent
recently
s
same
seem
seemed
seeming
seems
seven
seventy
several
she
she'd
she'll
she's
should
shouldn't
since
six
sixty
so
some
somehow
someone
something
sometime
sometimes
somewhere
still
stop
such
t
taking
ten
than
that
that'll
that's
that've
the
their
them
themselves
then
thence
there
there'd
there'll
there're
there's
there've
thereafter
thereby
therefore
therein
thereupon
these
they
they'd
they'll
they're
they've
thirty
this
those
though
thousand
three
through
throughout
thru
thus
to
together
too
toward
towards
trillion
twenty
two
u
under
unless
unlike
unlikely
until
up
upon
us
used
using
v
very
via
w
was
wasn't
we
we'd
we'll
we're
we've
well
were
weren't
what
what'll
what's
what've
whatever
when
whence
whenever
where
where's
whereafter
whereas
whereby
wherein
whereupon
wherever
whether
which
while
whither
who
who'd
who'll
who's
whoever
whole
whom
whomever
whose
why
will
with
within
without
won't
would
wouldn't
x
y
yes
yet
you
you'd
you'll
you're
you've
your
yours
yourself
yourselves
z
# occuring in more than 100 files
acc
accent
accents
and
are
bell
can
character
corrections
crt
daisy
dash
date
defined
definitions
description
devices
diablo
dummy
factors
following
font
for
from
fudge
give
have
header
holds
log
logo
low
lpr
mark
name
nroff
out
output
pitch
put
rcsfile
reference
resolution
revision
see
set
simple
smi
some
string
synopsis
system
that
the
this
translation
troff
typewriter
ucb
unbreakable
use
used
user
vroff
wheel
will
with
you
__END__
# Below is the stub of documentation for your module. You better edit it!

=head1 NAME

WAIT::Filter - Perl extension providing the basic freeWAIS-sf reduction functions

=head1 SYNOPSIS

  use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc
                      isotr disotr stop grundform);

  $stem   = Stem($word);
  $scode  = Soundex($word);
  $pcode  = Phonix($word);
  $lword  = isolc($word);
  disolc($word);
  $uword  = isouc($word);
  disouc($word);
  $trword = isotr($word);
  disotr($word);
  $word   = stop($word);
  $word   = grundform($word);

  @words = WAIT::Filter::split($word);
  @words = WAIT::Filter::split2($word);
  @words = WAIT::Filter::split3($word);
  @words = WAIT::Filter::split4($word); # arbitrary numbers allowed

=head1 DESCRIPTION

This tiny modules gives access to the basic reduction functions build
in B<freeWAIS-sf>.

=over 5

=item B<Stem>(I<word>)

reduces I<word> using the well know Porter algorithm.

  AU: Porter, M.F.
  TI: An Algorithm for Suffix Stripping
  JT: Program
  VO: 14
  PP: 130-137
  PY: 1980
  PM: JUL

=item B<Soundex>(I<word>)


computes the 4 byte B<Soundex> code for I<word>.

  AU: Gadd, T.N.
  TI: 'Fisching for Werds'. Phonetic Retrieval of written text in
      Information Retrieval Systems
  JT: Program
  VO: 22
  NO: 3
  PP: 222-237
  PY: 1988


=item B<Phonix>(I<word>)

computes the 8 byte B<Phonix> code for I<word>.

  AU: Gadd, T.N.
  TI: PHONIX: The Algorithm
  JT: Program
  VO: 24
  NO: 4
  PP: 363-366
  PY: 1990
  PM: OCT

=head1 ISO charcater case functions

There are some additional function which transpose some/most ISOlatin1
characters to upper and lower case. To allow for maximum speed there
are also I<destructive> versions which change the argument instead of
allocating a copy which is returned. For convenience, the destructive
version also B<returns> the argument. So all of the following is
valid and C<$word> will contain the lowercased string.

  $word = isolc($word);
  $word = disolc($word);
  disolc($word);

Here are the hardcoded characters which are recognized:

  abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß
  ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß

=item C<$new = >B<isolc>C<($word)>

=item B<disolc>C<($word)>

transposes to lower case.

=item C<$new = >B<isouc>C<($word)>

=item  B<disouc>C<($word)>

transposes to upper case.

=item C<$new = >B<isotr>C<($word)>

=item  B<disotr>C<($word)>

Remove non-letters according to the above table.

=item C<$new = >B<stop>C<($word)>

Returns an empty string if $word is a stopword.

=item C<$new = >B<grundform>C<($word)>

Calls Text::German::reduce

=item C<$new = >B<utf8iso>C<($word)>

Deprecated due to flux in perl versions between 5.005 and 5.8. The
function converts UTF8 encoded strings to ISO-8859-1. WAIT is
internally still based on the Latin1 character set, so if you process
anything in a different encoding, you should convert to Latin1 as the
first filter or refrain from using the iso-latin-1 based filter
functions. It is recommended that you use your own converter based on
the perl version you're using.

=item split, split2, split3, ...

The splitN funtions all take a scalar as input and return a list of
words. Split acts just like the perl split(' '). Split2 eliminates all
words from the list that are shorter than 2 characters (bytes), split3
eliminates those shorter than 3 characters (bytes) and so on.

=head1 AUTHOR

Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>

=head1 SEE ALSO

perl(1).

=cut

1	ulpfr	13	# -- Mode: Cperl --
2	ulpfr	10	# $Basename: Filter.pm $
3	ulpfr	19	# $Revision: 1.9 $
4	ulpfr	10	# ITIID : $ITI$ $Header $__Header$
5			# Author : Ulrich Pfeifer
6			# Created On : Thu Aug 15 18:09:51 1996
7			# Last Modified By: Ulrich Pfeifer
8			# Last Modified On: Sun Nov 22 18:44:46 1998
9			# Language : CPerl
10			# Update Count : 105
11			# Status : Unknown, Use with caution!
12	ulpfr	13	#
13	ulpfr	10	# Copyright (c) 1996-1997, Ulrich Pfeifer
14	ulpfr	13	#
15	ulpfr	10	package WAIT::Filter;
16			require WAIT;
17			use strict;
18			use Carp;
19			use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $AUTOLOAD);
20			use subs qw(grundform);
21
22			require Exporter;
23
24			@ISA = qw(Exporter);
25			@EXPORT_OK = qw(
26			Stem
27			Soundex
28			Phonix
29			Metaphone
30			isolc disolc
31			isouc disouc
32			isotr disotr
33			stop grundform
34	laperla	32	utf8iso
35	ulpfr	10	);
36	ulpfr	13	# (most implemented in WAIT.xs)
37	ulpfr	10
38	ulpfr	19	$VERSION = substr q$Revision: 1.9 $, 10;
39	ulpfr	10
40			sub split {
41			map split(' ', $_), @_;
42			}
43
44			$SPLIT = q[
45			sub splitXXX {
46			grep length($_)>=XXX, map split(' ', $_), @_;
47			}
48			];
49
50			sub AUTOLOAD {
51			my $func = $AUTOLOAD; $func =~ s/.*:://;
52
53			if ($func =~ /split(\d+)/) {
54			my $num = $1;
55			my $split = $SPLIT;
56
57			$split =~ s/XXX/$num/g;
58			eval $split;
59			if ($@ eq '') {
60			goto &$AUTOLOAD;
61			}
62			} elsif ($func eq 'grundform') {
63			eval {require Text::German;};
64			croak "You must have Text::German to use 'grundform'"
65			if $@ ne '';
66			*grundform = Text::German->can('reduce');
67			goto &grundform;
68			} elsif ($func eq 'date') {
69			eval {require Time::ParseDate;};
70			croak "You must have Time::ParseDate to use 'date'"
71			if $@ ne '';
72			*date = Time::ParseDate->can('parsedate');
73			goto \&date;
74			} elsif ($func eq 'decode_entities') {
75			eval {require HTML::Entities;};
76			croak "You must have HTML::Entities to use 'date'"
77			if $@ ne '';
78			*decode_entities = HTML::Entities->can('decode_entities');
79			goto &decode_entities;
80	ulpfr	13	} elsif ($func =~ /^d?utf8iso$/) {
81			no strict 'refs';
82	laperla	32	*$func = sub {
83			# Courtesy JHI
84	laperla	41	my $s = shift;
85			$s =~ s{([\xC0-\xDF])([\x80-\xBF])}
86			{chr(ord($1)<<6&0xC0\|ord($2)&0x3F)}eg;
87			$s;
88	laperla	32	};
89			goto \&$func;
90	ulpfr	10	}
91	ulpfr	13	Carp::confess "Class WAIT::Filter::$func not found";
92	ulpfr	10	}
93
94			while (<DATA>) {
95			chomp;
96			last if /__END__/;
97	ulpfr	19	next if /^\s*#/; # there's a comment
98	ulpfr	10	$STOP{$_}++;
99			}
100
101			sub stop {
102			if (exists $STOP{$_[0]}) {
103			''
104			} else {
105			$_[0];
106			}
107			}
108
109			sub gdate {
110			my $date = shift;
111
112			$date =~ s:(\d+)\.(\d+)\.(d+):$2/$1/$3:;
113			date($date);
114			}
115
116			1;
117			__DATA__
118			a
119			about
120			above
121			according
122			across
123			actually
124			adj
125			after
126			afterwards
127			again
128			against
129			all
130			almost
131			alone
132			along
133			already
134			also
135			although
136			always
137			among
138			amongst
139			an
140			and
141			another
142			any
143			anyhow
144			anyone
145			anything
146			anywhere
147			are
148			aren't
149			around
150			as
151			at
152			b
153			be
154			became
155			because
156			become
157			becomes
158			becoming
159			been
160			before
161			beforehand
162			begin
163			beginning
164			behind
165			being
166			below
167			beside
168			besides
169			between
170			beyond
171			billion
172			both
173			but
174			by
175			c
176			can
177			can't
178			cannot
179			caption
180			co
181			co.
182			could
183			couldn't
184			d
185			did
186			didn't
187			do
188			does
189			doesn't
190			don't
191			down
192			during
193			e
194			eg
195			eight
196			eighty
197			either
198			else
199			elsewhere
200			end
201			ending
202			enough
203			etc
204			even
205			ever
206			every
207			everyone
208			everything
209			everywhere
210			except
211			f
212			few
213			fifty
214			first
215			five
216			vfor
217			former
218			formerly
219			forty
220	ulpfr	13	found
221	ulpfr	10	four
222			from
223			further
224			g
225			h
226			had
227			has
228			hasn't
229			have
230			haven't
231			he
232			he'd
233			he'll
234			he's
235			hence
236			her
237			here
238			here's
239			hereafter
240			hereby
241			herein
242			hereupon
243			hers
244			herself
245			him
246			himself
247			his
248			how
249			however
250			hundred
251			i
252			i'd
253			i'll
254			i'm
255			i've
256			ie
257			if
258			in
259			inc.
260			indeed
261			instead
262			into
263			is
264			isn't
265			it
266			it's
267			its
268			itself
269			j
270			k
271			l
272			last
273			later
274			latter
275			latterly
276			least
277			less
278			let
279			let's
280			like
281			likely
282			ltd
283			m
284			made
285			make
286			makes
287			many
288			maybe
289			me
290			meantime
291			meanwhile
292			might
293			million
294			miss
295			more
296			moreover
297			most
298			mostly
299			mr
300			mrs
301			much
302			must
303			my
304			myself
305			n
306			namely
307			neither
308			never
309			nevertheless
310			next
311			nine
312			ninety
313			no
314			nobody
315			none
316			nonetheless
317			noone
318			nor
319			not
320			nothing
321			now
322			nowhere
323			o
324			of
325			off
326			often
327			on
328			once
329			one
330			one's
331			only
332			onto
333			or
334			other
335			others
336			otherwise
337			our
338			ours
339			ourselves
340			out
341			over
342			overall
343			own
344			p
345			per
346			perhaps
347			q
348			r
349			rather
350			recent
351			recently
352			s
353			same
354			seem
355			seemed
356			seeming
357			seems
358			seven
359			seventy
360			several
361			she
362			she'd
363			she'll
364			she's
365			should
366			shouldn't
367			since
368			six
369			sixty
370			so
371			some
372			somehow
373			someone
374			something
375			sometime
376			sometimes
377			somewhere
378			still
379			stop
380			such
381			t
382			taking
383			ten
384			than
385			that
386			that'll
387			that's
388			that've
389			the
390			their
391			them
392			themselves
393			then
394			thence
395			there
396			there'd
397			there'll
398			there're
399			there's
400			there've
401			thereafter
402			thereby
403			therefore
404			therein
405			thereupon
406			these
407			they
408			they'd
409			they'll
410			they're
411			they've
412			thirty
413			this
414			those
415			though
416			thousand
417			three
418			through
419			throughout
420			thru
421			thus
422			to
423			together
424			too
425			toward
426			towards
427			trillion
428			twenty
429			two
430			u
431			under
432			unless
433			unlike
434			unlikely
435			until
436			up
437			upon
438			us
439			used
440			using
441			v
442			very
443			via
444			w
445			was
446			wasn't
447			we
448			we'd
449			we'll
450			we're
451			we've
452			well
453			were
454			weren't
455			what
456			what'll
457			what's
458			what've
459			whatever
460			when
461			whence
462			whenever
463			where
464			where's
465			whereafter
466			whereas
467			whereby
468			wherein
469			whereupon
470			wherever
471			whether
472			which
473			while
474			whither
475			who
476			who'd
477			who'll
478			who's
479			whoever
480			whole
481			whom
482			whomever
483			whose
484			why
485			will
486			with
487			within
488			without
489			won't
490			would
491			wouldn't
492			x
493			y
494			yes
495			yet
496			you
497			you'd
498			you'll
499			you're
500			you've
501			your
502			yours
503			yourself
504			yourselves
505			z
506			# occuring in more than 100 files
507			acc
508			accent
509			accents
510			and
511			are
512			bell
513			can
514			character
515			corrections
516			crt
517			daisy
518			dash
519			date
520			defined
521			definitions
522			description
523			devices
524			diablo
525			dummy
526			factors
527			following
528			font
529			for
530			from
531			fudge
532			give
533			have
534			header
535			holds
536			log
537			logo
538			low
539			lpr
540			mark
541			name
542			nroff
543			out
544			output
545			pitch
546			put
547			rcsfile
548			reference
549			resolution
550			revision
551			see
552			set
553			simple
554			smi
555			some
556			string
557			synopsis
558			system
559			that
560			the
561			this
562			translation
563			troff
564			typewriter
565			ucb
566			unbreakable
567			use
568			used
569			user
570			vroff
571			wheel
572			will
573			with
574			you
575			__END__
576			# Below is the stub of documentation for your module. You better edit it!
577
578			=head1 NAME
579
580			WAIT::Filter - Perl extension providing the basic freeWAIS-sf reduction functions
581
582			=head1 SYNOPSIS
583
584	ulpfr	13	use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc
585	laperla	32	isotr disotr stop grundform);
586	ulpfr	10
587	ulpfr	13	$stem = Stem($word);
588			$scode = Soundex($word);
589			$pcode = Phonix($word);
590			$lword = isolc($word);
591	ulpfr	10	disolc($word);
592	ulpfr	13	$uword = isouc($word);
593	ulpfr	10	disouc($word);
594	ulpfr	13	$trword = isotr($word);
595			disotr($word);
596			$word = stop($word);
597			$word = grundform($word);
598	ulpfr	10
599	ulpfr	13	@words = WAIT::Filter::split($word);
600			@words = WAIT::Filter::split2($word);
601			@words = WAIT::Filter::split3($word);
602			@words = WAIT::Filter::split4($word); # arbitrary numbers allowed
603
604	ulpfr	10	=head1 DESCRIPTION
605
606			This tiny modules gives access to the basic reduction functions build
607			in B<freeWAIS-sf>.
608
609			=over 5
610
611			=item B<Stem>(I<word>)
612
613			reduces I<word> using the well know Porter algorithm.
614
615			AU: Porter, M.F.
616			TI: An Algorithm for Suffix Stripping
617			JT: Program
618			VO: 14
619			PP: 130-137
620			PY: 1980
621			PM: JUL
622
623			=item B<Soundex>(I<word>)
624
625
626			computes the 4 byte B<Soundex> code for I<word>.
627
628			AU: Gadd, T.N.
629			TI: 'Fisching for Werds'. Phonetic Retrieval of written text in
630			Information Retrieval Systems
631			JT: Program
632			VO: 22
633			NO: 3
634			PP: 222-237
635			PY: 1988
636
637
638			=item B<Phonix>(I<word>)
639
640			computes the 8 byte B<Phonix> code for I<word>.
641
642			AU: Gadd, T.N.
643			TI: PHONIX: The Algorithm
644			JT: Program
645			VO: 24
646			NO: 4
647			PP: 363-366
648			PY: 1990
649			PM: OCT
650
651			=head1 ISO charcater case functions
652
653			There are some additional function which transpose some/most ISOlatin1
654			characters to upper and lower case. To allow for maximum speed there
655			are also I<destructive> versions which change the argument instead of
656			allocating a copy which is returned. For convenience, the destructive
657	ulpfr	13	version also B<returns> the argument. So all of the following is
658	ulpfr	10	valid and C<$word> will contain the lowercased string.
659
660	ulpfr	13	$word = isolc($word);
661	ulpfr	10	$word = disolc($word);
662			disolc($word);
663
664			Here are the hardcoded characters which are recognized:
665
666			abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß
667			ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß
668
669			=item C<$new = >B<isolc>C<($word)>
670
671			=item B<disolc>C<($word)>
672
673			transposes to lower case.
674
675			=item C<$new = >B<isouc>C<($word)>
676
677			=item B<disouc>C<($word)>
678
679			transposes to upper case.
680
681	ulpfr	13	=item C<$new = >B<isotr>C<($word)>
682
683			=item B<disotr>C<($word)>
684
685			Remove non-letters according to the above table.
686
687			=item C<$new = >B<stop>C<($word)>
688
689			Returns an empty string if $word is a stopword.
690
691			=item C<$new = >B<grundform>C<($word)>
692
693			Calls Text::German::reduce
694
695			=item C<$new = >B<utf8iso>C<($word)>
696
697	laperla	32	Deprecated due to flux in perl versions between 5.005 and 5.8. The
698			function converts UTF8 encoded strings to ISO-8859-1. WAIT is
699			internally still based on the Latin1 character set, so if you process
700	ulpfr	13	anything in a different encoding, you should convert to Latin1 as the
701	laperla	32	first filter or refrain from using the iso-latin-1 based filter
702			functions. It is recommended that you use your own converter based on
703			the perl version you're using.
704	ulpfr	13
705			=item split, split2, split3, ...
706
707			The splitN funtions all take a scalar as input and return a list of
708			words. Split acts just like the perl split(' '). Split2 eliminates all
709			words from the list that are shorter than 2 characters (bytes), split3
710			eliminates those shorter than 3 characters (bytes) and so on.
711
712	ulpfr	10	=head1 AUTHOR
713
714			Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>
715
716			=head1 SEE ALSO
717
718			perl(1).
719
720			=cut
721