lib/WAIT/Filter.pm

#                              -*- Mode: Cperl -*- 
# $Basename: Filter.pm $
# $Revision: 1.9 $
# ITIID           : $ITI$ $Header $__Header$
# Author          : Ulrich Pfeifer
# Created On      : Thu Aug 15 18:09:51 1996
# Last Modified By: Ulrich Pfeifer
# Last Modified On: Sun Nov 22 18:44:46 1998
# Language        : CPerl
# Update Count    : 105
# Status          : Unknown, Use with caution!
#
# Copyright (c) 1996-1997, Ulrich Pfeifer
#
package WAIT::Filter;
require WAIT;
use strict;
use Carp;
use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $AUTOLOAD);
use subs qw(grundform);

require Exporter;

@ISA = qw(Exporter);
@EXPORT_OK = qw(
                Stem
                Soundex
                Phonix
                Metaphone
                isolc disolc
                isouc disouc
                isotr disotr
                stop grundform
                utf8iso
               );
# (most implemented in WAIT.xs)

$VERSION = substr q$Revision: 1.9 $, 10;

sub split {
  map split(' ', $_), @_;
}

$SPLIT = q[
           sub splitXXX {
                          grep length($_)>=XXX, map split(' ', $_), @_;
                         }
          ];

sub AUTOLOAD {
  my $func = $AUTOLOAD; $func =~ s/.*:://;

  if ($func =~ /split(\d+)/) {
    my $num = $1;
    my $split = $SPLIT;

    $split =~ s/XXX/$num/g;
    eval $split;
    if ($@ eq '') {
      goto &$AUTOLOAD;
    }
  } elsif ($func eq 'grundform') {
    eval {require Text::German;};
    croak "You must have Text::German to use 'grundform'"
      if $@ ne '';
    *grundform = Text::German->can('reduce');
    goto &grundform;
  } elsif ($func eq 'date') {
    eval {require Time::ParseDate;};
    croak "You must have Time::ParseDate to use 'date'"
      if $@ ne '';
    *date = Time::ParseDate->can('parsedate');
    goto \&date;
  } elsif ($func eq 'decode_entities') {
    eval {require HTML::Entities;};
    croak "You must have HTML::Entities to use 'date'"
      if $@ ne '';
    *decode_entities = HTML::Entities->can('decode_entities');
    goto &decode_entities;
  } elsif ($func =~ /^d?utf8iso$/) {
    no strict 'refs';
    *$func = sub {
      # Courtesy JHI
      my $s = shift;
      $s =~ s{([\xC0-\xDF])([\x80-\xBF])}
             {chr(ord($1)<<6&0xC0|ord($2)&0x3F)}eg;
      $s;
    };
    goto \&$func;
  }
  Carp::confess "Class WAIT::Filter::$func not found";
}

while (<DATA>) {
  chomp;
  last if /__END__/;
  next if /^\s*#/; # there's a comment
  $STOP{$_}++;
}
close DATA;

sub stop {
  if (exists $STOP{$_[0]}) {
    ''
  } else {
    $_[0];
  }
}

sub gdate {
  my $date = shift;

  $date =~ s:(\d+)\.(\d+)\.(d+):$2/$1/$3:;
  date($date);
}

1;

__DATA__
a
about
above
according
across
actually
adj
after
afterwards
again
against
all
almost
alone
along
already
also
although
always
among
amongst
an
and
another
any
anyhow
anyone
anything
anywhere
are
aren't
around
as
at
b
be
became
because
become
becomes
becoming
been
before
beforehand
begin
beginning
behind
being
below
beside
besides
between
beyond
billion
both
but
by
c
can
can't
cannot
caption
co
co.
could
couldn't
d
did
didn't
do
does
doesn't
don't
down
during
e
eg
eight
eighty
either
else
elsewhere
end
ending
enough
etc
even
ever
every
everyone
everything
everywhere
except
f
few
fifty
first
five
vfor
former
formerly
forty
found
four
from
further
g
h
had
has
hasn't
have
haven't
he
he'd
he'll
he's
hence
her
here
here's
hereafter
hereby
herein
hereupon
hers
herself
him
himself
his
how
however
hundred
i
i'd
i'll
i'm
i've
ie
if
in
inc.
indeed
instead
into
is
isn't
it
it's
its
itself
j
k
l
last
later
latter
latterly
least
less
let
let's
like
likely
ltd
m
made
make
makes
many
maybe
me
meantime
meanwhile
might
million
miss
more
moreover
most
mostly
mr
mrs
much
must
my
myself
n
namely
neither
never
nevertheless
next
nine
ninety
no
nobody
none
nonetheless
noone
nor
not
nothing
now
nowhere
o
of
off
often
on
once
one
one's
only
onto
or
other
others
otherwise
our
ours
ourselves
out
over
overall
own
p
per
perhaps
q
r
rather
recent
recently
s
same
seem
seemed
seeming
seems
seven
seventy
several
she
she'd
she'll
she's
should
shouldn't
since
six
sixty
so
some
somehow
someone
something
sometime
sometimes
somewhere
still
stop
such
t
taking
ten
than
that
that'll
that's
that've
the
their
them
themselves
then
thence
there
there'd
there'll
there're
there's
there've
thereafter
thereby
therefore
therein
thereupon
these
they
they'd
they'll
they're
they've
thirty
this
those
though
thousand
three
through
throughout
thru
thus
to
together
too
toward
towards
trillion
twenty
two
u
under
unless
unlike
unlikely
until
up
upon
us
used
using
v
very
via
w
was
wasn't
we
we'd
we'll
we're
we've
well
were
weren't
what
what'll
what's
what've
whatever
when
whence
whenever
where
where's
whereafter
whereas
whereby
wherein
whereupon
wherever
whether
which
while
whither
who
who'd
who'll
who's
whoever
whole
whom
whomever
whose
why
will
with
within
without
won't
would
wouldn't
x
y
yes
yet
you
you'd
you'll
you're
you've
your
yours
yourself
yourselves
z
# occuring in more than 100 files
acc
accent
accents
and
are
bell
can
character
corrections
crt
daisy
dash
date
defined
definitions
description
devices
diablo
dummy
factors
following
font
for
from
fudge
give
have
header
holds
log
logo
low
lpr
mark
name
nroff
out
output
pitch
put
rcsfile
reference
resolution
revision
see
set
simple
smi
some
string
synopsis
system
that
the
this
translation
troff
typewriter
ucb
unbreakable
use
used
user
vroff
wheel
will
with
you
__END__

=head1 NAME

WAIT::Filter - Perl extension providing the basic freeWAIS-sf reduction functions

=head1 SYNOPSIS

  use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc
                      isotr disotr stop grundform);

  $stem   = Stem($word);
  $scode  = Soundex($word);
  $pcode  = Phonix($word);
  $lword  = isolc($word);
  disolc($word);
  $uword  = isouc($word);
  disouc($word);
  $trword = isotr($word);
  disotr($word);
  $word   = stop($word);
  $word   = grundform($word);

  @words = WAIT::Filter::split($word);
  @words = WAIT::Filter::split2($word);
  @words = WAIT::Filter::split3($word);
  @words = WAIT::Filter::split4($word); # arbitrary numbers allowed

=head1 DESCRIPTION

This tiny modules gives access to the basic reduction functions build
in B<freeWAIS-sf>.

=over 5

=item B<Stem>(I<word>)

reduces I<word> using the well know Porter algorithm.

  AU: Porter, M.F.
  TI: An Algorithm for Suffix Stripping
  JT: Program
  VO: 14
  PP: 130-137
  PY: 1980
  PM: JUL

=item B<Soundex>(I<word>)


computes the 4 byte B<Soundex> code for I<word>.

  AU: Gadd, T.N.
  TI: 'Fisching for Werds'. Phonetic Retrieval of written text in
      Information Retrieval Systems
  JT: Program
  VO: 22
  NO: 3
  PP: 222-237
  PY: 1988


=item B<Phonix>(I<word>)

computes the 8 byte B<Phonix> code for I<word>.

  AU: Gadd, T.N.
  TI: PHONIX: The Algorithm
  JT: Program
  VO: 24
  NO: 4
  PP: 363-366
  PY: 1990
  PM: OCT

=back

=head1 ISO charcater case functions

There are some additional function which transpose some/most ISOlatin1
characters to upper and lower case. To allow for maximum speed there
are also I<destructive> versions which change the argument instead of
allocating a copy which is returned. For convenience, the destructive
version also B<returns> the argument. So all of the following is
valid and C<$word> will contain the lowercased string.

  $word = isolc($word);
  $word = disolc($word);
  disolc($word);

Here are the hardcoded characters which are recognized:

  abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß
  ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß

=over 5

=item C<$new = >B<isolc>C<($word)>

=item B<disolc>C<($word)>

transposes to lower case.

=item C<$new = >B<isouc>C<($word)>

=item  B<disouc>C<($word)>

transposes to upper case.

=item C<$new = >B<isotr>C<($word)>

=item  B<disotr>C<($word)>

Remove non-letters according to the above table.

=item C<$new = >B<stop>C<($word)>

Returns an empty string if $word is a stopword.

=item C<$new = >B<grundform>C<($word)>

Calls Text::German::reduce

=item C<$new = >B<utf8iso>C<($word)>

Deprecated due to flux in perl versions between 5.005 and 5.8. The
function converts UTF8 encoded strings to ISO-8859-1. WAIT is
internally still based on the Latin1 character set, so if you process
anything in a different encoding, you should convert to Latin1 as the
first filter or refrain from using the iso-latin-1 based filter
functions. It is recommended that you use your own converter based on
the perl version you're using.

=item split, split2, split3, ...

The splitN funtions all take a scalar as input and return a list of
words. Split acts just like the perl split(' '). Split2 eliminates all
words from the list that are shorter than 2 characters (bytes), split3
eliminates those shorter than 3 characters (bytes) and so on.

=back

=head1 AUTHOR

Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>

=head1 SEE ALSO

perl(1).

=cut

1	# -- Mode: Cperl --
2	# $Basename: Filter.pm $
3	# $Revision: 1.9 $
4	# ITIID : $ITI$ $Header $__Header$
5	# Author : Ulrich Pfeifer
6	# Created On : Thu Aug 15 18:09:51 1996
7	# Last Modified By: Ulrich Pfeifer
8	# Last Modified On: Sun Nov 22 18:44:46 1998
9	# Language : CPerl
10	# Update Count : 105
11	# Status : Unknown, Use with caution!
12	#
13	# Copyright (c) 1996-1997, Ulrich Pfeifer
14	#
15	package WAIT::Filter;
16	require WAIT;
17	use strict;
18	use Carp;
19	use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $AUTOLOAD);
20	use subs qw(grundform);
21
22	require Exporter;
23
24	@ISA = qw(Exporter);
25	@EXPORT_OK = qw(
26	Stem
27	Soundex
28	Phonix
29	Metaphone
30	isolc disolc
31	isouc disouc
32	isotr disotr
33	stop grundform
34	utf8iso
35	);
36	# (most implemented in WAIT.xs)
37
38	$VERSION = substr q$Revision: 1.9 $, 10;
39
40	sub split {
41	map split(' ', $_), @_;
42	}
43
44	$SPLIT = q[
45	sub splitXXX {
46	grep length($_)>=XXX, map split(' ', $_), @_;
47	}
48	];
49
50	sub AUTOLOAD {
51	my $func = $AUTOLOAD; $func =~ s/.*:://;
52
53	if ($func =~ /split(\d+)/) {
54	my $num = $1;
55	my $split = $SPLIT;
56
57	$split =~ s/XXX/$num/g;
58	eval $split;
59	if ($@ eq '') {
60	goto &$AUTOLOAD;
61	}
62	} elsif ($func eq 'grundform') {
63	eval {require Text::German;};
64	croak "You must have Text::German to use 'grundform'"
65	if $@ ne '';
66	*grundform = Text::German->can('reduce');
67	goto &grundform;
68	} elsif ($func eq 'date') {
69	eval {require Time::ParseDate;};
70	croak "You must have Time::ParseDate to use 'date'"
71	if $@ ne '';
72	*date = Time::ParseDate->can('parsedate');
73	goto \&date;
74	} elsif ($func eq 'decode_entities') {
75	eval {require HTML::Entities;};
76	croak "You must have HTML::Entities to use 'date'"
77	if $@ ne '';
78	*decode_entities = HTML::Entities->can('decode_entities');
79	goto &decode_entities;
80	} elsif ($func =~ /^d?utf8iso$/) {
81	no strict 'refs';
82	*$func = sub {
83	# Courtesy JHI
84	my $s = shift;
85	$s =~ s{([\xC0-\xDF])([\x80-\xBF])}
86	{chr(ord($1)<<6&0xC0\|ord($2)&0x3F)}eg;
87	$s;
88	};
89	goto \&$func;
90	}
91	Carp::confess "Class WAIT::Filter::$func not found";
92	}
93
94	while (<DATA>) {
95	chomp;
96	last if /__END__/;
97	next if /^\s*#/; # there's a comment
98	$STOP{$_}++;
99	}
100	close DATA;
101
102	sub stop {
103	if (exists $STOP{$_[0]}) {
104	''
105	} else {
106	$_[0];
107	}
108	}
109
110	sub gdate {
111	my $date = shift;
112
113	$date =~ s:(\d+)\.(\d+)\.(d+):$2/$1/$3:;
114	date($date);
115	}
116
117	1;
118
119	__DATA__
120	a
121	about
122	above
123	according
124	across
125	actually
126	adj
127	after
128	afterwards
129	again
130	against
131	all
132	almost
133	alone
134	along
135	already
136	also
137	although
138	always
139	among
140	amongst
141	an
142	and
143	another
144	any
145	anyhow
146	anyone
147	anything
148	anywhere
149	are
150	aren't
151	around
152	as
153	at
154	b
155	be
156	became
157	because
158	become
159	becomes
160	becoming
161	been
162	before
163	beforehand
164	begin
165	beginning
166	behind
167	being
168	below
169	beside
170	besides
171	between
172	beyond
173	billion
174	both
175	but
176	by
177	c
178	can
179	can't
180	cannot
181	caption
182	co
183	co.
184	could
185	couldn't
186	d
187	did
188	didn't
189	do
190	does
191	doesn't
192	don't
193	down
194	during
195	e
196	eg
197	eight
198	eighty
199	either
200	else
201	elsewhere
202	end
203	ending
204	enough
205	etc
206	even
207	ever
208	every
209	everyone
210	everything
211	everywhere
212	except
213	f
214	few
215	fifty
216	first
217	five
218	vfor
219	former
220	formerly
221	forty
222	found
223	four
224	from
225	further
226	g
227	h
228	had
229	has
230	hasn't
231	have
232	haven't
233	he
234	he'd
235	he'll
236	he's
237	hence
238	her
239	here
240	here's
241	hereafter
242	hereby
243	herein
244	hereupon
245	hers
246	herself
247	him
248	himself
249	his
250	how
251	however
252	hundred
253	i
254	i'd
255	i'll
256	i'm
257	i've
258	ie
259	if
260	in
261	inc.
262	indeed
263	instead
264	into
265	is
266	isn't
267	it
268	it's
269	its
270	itself
271	j
272	k
273	l
274	last
275	later
276	latter
277	latterly
278	least
279	less
280	let
281	let's
282	like
283	likely
284	ltd
285	m
286	made
287	make
288	makes
289	many
290	maybe
291	me
292	meantime
293	meanwhile
294	might
295	million
296	miss
297	more
298	moreover
299	most
300	mostly
301	mr
302	mrs
303	much
304	must
305	my
306	myself
307	n
308	namely
309	neither
310	never
311	nevertheless
312	next
313	nine
314	ninety
315	no
316	nobody
317	none
318	nonetheless
319	noone
320	nor
321	not
322	nothing
323	now
324	nowhere
325	o
326	of
327	off
328	often
329	on
330	once
331	one
332	one's
333	only
334	onto
335	or
336	other
337	others
338	otherwise
339	our
340	ours
341	ourselves
342	out
343	over
344	overall
345	own
346	p
347	per
348	perhaps
349	q
350	r
351	rather
352	recent
353	recently
354	s
355	same
356	seem
357	seemed
358	seeming
359	seems
360	seven
361	seventy
362	several
363	she
364	she'd
365	she'll
366	she's
367	should
368	shouldn't
369	since
370	six
371	sixty
372	so
373	some
374	somehow
375	someone
376	something
377	sometime
378	sometimes
379	somewhere
380	still
381	stop
382	such
383	t
384	taking
385	ten
386	than
387	that
388	that'll
389	that's
390	that've
391	the
392	their
393	them
394	themselves
395	then
396	thence
397	there
398	there'd
399	there'll
400	there're
401	there's
402	there've
403	thereafter
404	thereby
405	therefore
406	therein
407	thereupon
408	these
409	they
410	they'd
411	they'll
412	they're
413	they've
414	thirty
415	this
416	those
417	though
418	thousand
419	three
420	through
421	throughout
422	thru
423	thus
424	to
425	together
426	too
427	toward
428	towards
429	trillion
430	twenty
431	two
432	u
433	under
434	unless
435	unlike
436	unlikely
437	until
438	up
439	upon
440	us
441	used
442	using
443	v
444	very
445	via
446	w
447	was
448	wasn't
449	we
450	we'd
451	we'll
452	we're
453	we've
454	well
455	were
456	weren't
457	what
458	what'll
459	what's
460	what've
461	whatever
462	when
463	whence
464	whenever
465	where
466	where's
467	whereafter
468	whereas
469	whereby
470	wherein
471	whereupon
472	wherever
473	whether
474	which
475	while
476	whither
477	who
478	who'd
479	who'll
480	who's
481	whoever
482	whole
483	whom
484	whomever
485	whose
486	why
487	will
488	with
489	within
490	without
491	won't
492	would
493	wouldn't
494	x
495	y
496	yes
497	yet
498	you
499	you'd
500	you'll
501	you're
502	you've
503	your
504	yours
505	yourself
506	yourselves
507	z
508	# occuring in more than 100 files
509	acc
510	accent
511	accents
512	and
513	are
514	bell
515	can
516	character
517	corrections
518	crt
519	daisy
520	dash
521	date
522	defined
523	definitions
524	description
525	devices
526	diablo
527	dummy
528	factors
529	following
530	font
531	for
532	from
533	fudge
534	give
535	have
536	header
537	holds
538	log
539	logo
540	low
541	lpr
542	mark
543	name
544	nroff
545	out
546	output
547	pitch
548	put
549	rcsfile
550	reference
551	resolution
552	revision
553	see
554	set
555	simple
556	smi
557	some
558	string
559	synopsis
560	system
561	that
562	the
563	this
564	translation
565	troff
566	typewriter
567	ucb
568	unbreakable
569	use
570	used
571	user
572	vroff
573	wheel
574	will
575	with
576	you
577	__END__
578
579	=head1 NAME
580
581	WAIT::Filter - Perl extension providing the basic freeWAIS-sf reduction functions
582
583	=head1 SYNOPSIS
584
585	use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc
586	isotr disotr stop grundform);
587
588	$stem = Stem($word);
589	$scode = Soundex($word);
590	$pcode = Phonix($word);
591	$lword = isolc($word);
592	disolc($word);
593	$uword = isouc($word);
594	disouc($word);
595	$trword = isotr($word);
596	disotr($word);
597	$word = stop($word);
598	$word = grundform($word);
599
600	@words = WAIT::Filter::split($word);
601	@words = WAIT::Filter::split2($word);
602	@words = WAIT::Filter::split3($word);
603	@words = WAIT::Filter::split4($word); # arbitrary numbers allowed
604
605	=head1 DESCRIPTION
606
607	This tiny modules gives access to the basic reduction functions build
608	in B<freeWAIS-sf>.
609
610	=over 5
611
612	=item B<Stem>(I<word>)
613
614	reduces I<word> using the well know Porter algorithm.
615
616	AU: Porter, M.F.
617	TI: An Algorithm for Suffix Stripping
618	JT: Program
619	VO: 14
620	PP: 130-137
621	PY: 1980
622	PM: JUL
623
624	=item B<Soundex>(I<word>)
625
626
627	computes the 4 byte B<Soundex> code for I<word>.
628
629	AU: Gadd, T.N.
630	TI: 'Fisching for Werds'. Phonetic Retrieval of written text in
631	Information Retrieval Systems
632	JT: Program
633	VO: 22
634	NO: 3
635	PP: 222-237
636	PY: 1988
637
638
639	=item B<Phonix>(I<word>)
640
641	computes the 8 byte B<Phonix> code for I<word>.
642
643	AU: Gadd, T.N.
644	TI: PHONIX: The Algorithm
645	JT: Program
646	VO: 24
647	NO: 4
648	PP: 363-366
649	PY: 1990
650	PM: OCT
651
652	=back
653
654	=head1 ISO charcater case functions
655
656	There are some additional function which transpose some/most ISOlatin1
657	characters to upper and lower case. To allow for maximum speed there
658	are also I<destructive> versions which change the argument instead of
659	allocating a copy which is returned. For convenience, the destructive
660	version also B<returns> the argument. So all of the following is
661	valid and C<$word> will contain the lowercased string.
662
663	$word = isolc($word);
664	$word = disolc($word);
665	disolc($word);
666
667	Here are the hardcoded characters which are recognized:
668
669	abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß
670	ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß
671
672	=over 5
673
674	=item C<$new = >B<isolc>C<($word)>
675
676	=item B<disolc>C<($word)>
677
678	transposes to lower case.
679
680	=item C<$new = >B<isouc>C<($word)>
681
682	=item B<disouc>C<($word)>
683
684	transposes to upper case.
685
686	=item C<$new = >B<isotr>C<($word)>
687
688	=item B<disotr>C<($word)>
689
690	Remove non-letters according to the above table.
691
692	=item C<$new = >B<stop>C<($word)>
693
694	Returns an empty string if $word is a stopword.
695
696	=item C<$new = >B<grundform>C<($word)>
697
698	Calls Text::German::reduce
699
700	=item C<$new = >B<utf8iso>C<($word)>
701
702	Deprecated due to flux in perl versions between 5.005 and 5.8. The
703	function converts UTF8 encoded strings to ISO-8859-1. WAIT is
704	internally still based on the Latin1 character set, so if you process
705	anything in a different encoding, you should convert to Latin1 as the
706	first filter or refrain from using the iso-latin-1 based filter
707	functions. It is recommended that you use your own converter based on
708	the perl version you're using.
709
710	=item split, split2, split3, ...
711
712	The splitN funtions all take a scalar as input and return a list of
713	words. Split acts just like the perl split(' '). Split2 eliminates all
714	words from the list that are shorter than 2 characters (bytes), split3
715	eliminates those shorter than 3 characters (bytes) and so on.
716
717	=back
718
719	=head1 AUTHOR
720
721	Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>
722
723	=head1 SEE ALSO
724
725	perl(1).
726
727	=cut
728