lib/WAIT/Filter.pm

#                              -*- Mode: Cperl -*- 
# $Basename: Filter.pm $
# $Revision: 1.9 $
# ITIID           : $ITI$ $Header $__Header$
# Author          : Ulrich Pfeifer
# Created On      : Thu Aug 15 18:09:51 1996
# Last Modified By: Ulrich Pfeifer
# Last Modified On: Sun Nov 22 18:44:46 1998
# Language        : CPerl
# Update Count    : 105
# Status          : Unknown, Use with caution!
#
# Copyright (c) 1996-1997, Ulrich Pfeifer
#
package WAIT::Filter;
require WAIT;
use strict;
use Carp;
use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $AUTOLOAD);
use subs qw(grundform);

require Exporter;

# Inherit import() from Exporter.
@ISA = qw(Exporter);
# Functions that callers may import by name.  No @EXPORT list is set here,
# so nothing is imported unless it is requested explicitly.
@EXPORT_OK = qw(
                Stem
                Soundex
                Phonix
                Metaphone
                isolc disolc
                isouc disouc
                isotr disotr
                stop grundform
                utf8iso
               );
# (most of these are implemented in WAIT.xs)

# Derive the version from the RCS keyword: the quoted text is
# "Revision: 1.9 " and offset 10 skips the "Revision: " prefix.
$VERSION = substr q$Revision: 1.9 $, 10;

# Break every argument into its whitespace-separated words and return the
# words of all arguments as one flat list.
sub split {
  my @words;
  for my $text (@_) {
    # CORE:: makes explicit that the builtin is meant, not this sub.
    push @words, CORE::split(' ', $text);
  }
  return @words;
}

# Source template for the splitN family (split2, split3, ...).  AUTOLOAD
# replaces every XXX with the requested number and evals the result, which
# defines a sub that splits like split() above but keeps only the words
# whose length is at least that number.
$SPLIT = q[
           sub splitXXX {
                          grep length($_)>=XXX, map split(' ', $_), @_;
                         }
          ];

# AUTOLOAD - define rarely used filter functions on first use.
#
# Perl calls this for any function of this package that is not defined yet;
# the fully qualified name of the requested function is in $AUTOLOAD.
# Supported names:
#
#   splitN            (N a number) compiled from the $SPLIT template
#   grundform         bound to what Text::German->can('reduce') returns
#   date              bound to what Time::ParseDate->can('parsedate') returns
#   decode_entities   bound to HTML::Entities->can('decode_entities')
#   utf8iso, dutf8iso replace two-byte UTF-8 sequences by single characters
#
# After the function has been installed, control is transferred to it with
# the original arguments via goto.  Croaks when a required optional module
# cannot be loaded and confesses for every other name.
sub AUTOLOAD {
  my $func = $AUTOLOAD; $func =~ s/.*:://;

  # The pattern is anchored on both ends so that only exact splitN names
  # take this branch.  An unanchored match would also accept names such as
  # 'xsplit3': split3 would get defined, but the goto below would target the
  # still undefined requested name and re-enter AUTOLOAD.
  if ($func =~ /^split(\d+)$/) {
    my $num = $1;
    my $split = $SPLIT;

    $split =~ s/XXX/$num/g;
    eval $split;
    if ($@ eq '') {
      goto &$AUTOLOAD;
    }
    # A template that fails to compile falls through to the confess below.
  } elsif ($func eq 'grundform') {
    eval {require Text::German;};
    croak "You must have Text::German to use 'grundform'"
      if $@ ne '';
    *grundform = Text::German->can('reduce');
    goto &grundform;
  } elsif ($func eq 'date') {
    eval {require Time::ParseDate;};
    croak "You must have Time::ParseDate to use 'date'"
      if $@ ne '';
    *date = Time::ParseDate->can('parsedate');
    goto \&date;
  } elsif ($func eq 'decode_entities') {
    eval {require HTML::Entities;};
    croak "You must have HTML::Entities to use 'decode_entities'"
      if $@ ne '';
    *decode_entities = HTML::Entities->can('decode_entities');
    goto &decode_entities;
  } elsif ($func =~ /^d?utf8iso$/) {
    # Both names get the same implementation; it works on a copy of its
    # argument and returns the converted copy.
    no strict 'refs';
    *$func = sub {
      # Courtesy JHI
      my $s = shift;
      # Combine the low two bits of the lead byte with the six payload bits
      # of the continuation byte; the result is always below 0x100.
      $s =~ s{([\xC0-\xDF])([\x80-\xBF])}
             {chr(ord($1)<<6&0xC0|ord($2)&0x3F)}eg;
      $s;
    };
    goto \&$func;
  }
  Carp::confess "Class WAIT::Filter::$func not found";
}

# Load the stop word list from the DATA section into %STOP: one word per
# line, lines starting with '#' are comments, and the __END__ marker ends
# the list (what follows it is documentation).  A lexical line variable is
# used because a bare while (<DATA>) assigns to the global $_ without
# localizing it, which would clobber the $_ of whoever loads this module.
while (defined(my $line = <DATA>)) {
  chomp $line;
  last if $line =~ /^__END__\s*$/;
  next if $line =~ /^\s*#/;       # comment line inside the list
  $STOP{$line}++;
}

# Stop word filter: yields the empty string when the word is a key of
# %STOP and the unchanged word otherwise.
sub stop {
  my $is_stopword = exists $STOP{$_[0]};
  return $is_stopword ? '' : $_[0];
}

# gdate - parse a date that may be written as day.month.year.
#
# The first "D.M.Y" group of digits in the argument is rewritten to "M/D/Y"
# and the result is handed to date(), which AUTOLOAD binds to the parsedate
# function of Time::ParseDate on first use.  Input without such a group is
# passed on unchanged.  Returns whatever date() returns.
sub gdate {
  my $date = shift;

  # All three fields are digits.  The year has to be \d+ as well; a bare
  # "d+" would only match literal 'd' characters and never a real date.
  $date =~ s:(\d+)\.(\d+)\.(\d+):$2/$1/$3:;
  date($date);
}

1;
__DATA__
a
about
above
according
across
actually
adj
after
afterwards
again
against
all
almost
alone
along
already
also
although
always
among
amongst
an
and
another
any
anyhow
anyone
anything
anywhere
are
aren't
around
as
at
b
be
became
because
become
becomes
becoming
been
before
beforehand
begin
beginning
behind
being
below
beside
besides
between
beyond
billion
both
but
by
c
can
can't
cannot
caption
co
co.
could
couldn't
d
did
didn't
do
does
doesn't
don't
down
during
e
eg
eight
eighty
either
else
elsewhere
end
ending
enough
etc
even
ever
every
everyone
everything
everywhere
except
f
few
fifty
first
five
for
former
formerly
forty
found
four
from
further
g
h
had
has
hasn't
have
haven't
he
he'd
he'll
he's
hence
her
here
here's
hereafter
hereby
herein
hereupon
hers
herself
him
himself
his
how
however
hundred
i
i'd
i'll
i'm
i've
ie
if
in
inc.
indeed
instead
into
is
isn't
it
it's
its
itself
j
k
l
last
later
latter
latterly
least
less
let
let's
like
likely
ltd
m
made
make
makes
many
maybe
me
meantime
meanwhile
might
million
miss
more
moreover
most
mostly
mr
mrs
much
must
my
myself
n
namely
neither
never
nevertheless
next
nine
ninety
no
nobody
none
nonetheless
noone
nor
not
nothing
now
nowhere
o
of
off
often
on
once
one
one's
only
onto
or
other
others
otherwise
our
ours
ourselves
out
over
overall
own
p
per
perhaps
q
r
rather
recent
recently
s
same
seem
seemed
seeming
seems
seven
seventy
several
she
she'd
she'll
she's
should
shouldn't
since
six
sixty
so
some
somehow
someone
something
sometime
sometimes
somewhere
still
stop
such
t
taking
ten
than
that
that'll
that's
that've
the
their
them
themselves
then
thence
there
there'd
there'll
there're
there's
there've
thereafter
thereby
therefore
therein
thereupon
these
they
they'd
they'll
they're
they've
thirty
this
those
though
thousand
three
through
throughout
thru
thus
to
together
too
toward
towards
trillion
twenty
two
u
under
unless
unlike
unlikely
until
up
upon
us
used
using
v
very
via
w
was
wasn't
we
we'd
we'll
we're
we've
well
were
weren't
what
what'll
what's
what've
whatever
when
whence
whenever
where
where's
whereafter
whereas
whereby
wherein
whereupon
wherever
whether
which
while
whither
who
who'd
who'll
who's
whoever
whole
whom
whomever
whose
why
will
with
within
without
won't
would
wouldn't
x
y
yes
yet
you
you'd
you'll
you're
you've
your
yours
yourself
yourselves
z
# occurring in more than 100 files
acc
accent
accents
and
are
bell
can
character
corrections
crt
daisy
dash
date
defined
definitions
description
devices
diablo
dummy
factors
following
font
for
from
fudge
give
have
header
holds
log
logo
low
lpr
mark
name
nroff
out
output
pitch
put
rcsfile
reference
resolution
revision
see
set
simple
smi
some
string
synopsis
system
that
the
this
translation
troff
typewriter
ucb
unbreakable
use
used
user
vroff
wheel
will
with
you
__END__
# Below is the stub of documentation for your module. You better edit it!

=head1 NAME

WAIT::Filter - Perl extension providing the basic freeWAIS-sf reduction functions

=head1 SYNOPSIS

  use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc
                      isotr disotr stop grundform);

  $stem   = Stem($word);
  $scode  = Soundex($word);
  $pcode  = Phonix($word);
  $lword  = isolc($word);
  disolc($word);
  $uword  = isouc($word);
  disouc($word);
  $trword = isotr($word);
  disotr($word);
  $word   = stop($word);
  $word   = grundform($word);

  @words = WAIT::Filter::split($word);
  @words = WAIT::Filter::split2($word);
  @words = WAIT::Filter::split3($word);
  @words = WAIT::Filter::split4($word); # arbitrary numbers allowed

=head1 DESCRIPTION

This tiny module gives access to the basic reduction functions built
into B<freeWAIS-sf>.

=over 5

=item B<Stem>(I<word>)

reduces I<word> using the well-known Porter algorithm.

  AU: Porter, M.F.
  TI: An Algorithm for Suffix Stripping
  JT: Program
  VO: 14
  PP: 130-137
  PY: 1980
  PM: JUL

=item B<Soundex>(I<word>)


computes the 4 byte B<Soundex> code for I<word>.

  AU: Gadd, T.N.
  TI: 'Fisching for Werds'. Phonetic Retrieval of written text in
      Information Retrieval Systems
  JT: Program
  VO: 22
  NO: 3
  PP: 222-237
  PY: 1988


=item B<Phonix>(I<word>)

computes the 8 byte B<Phonix> code for I<word>.

  AU: Gadd, T.N.
  TI: PHONIX: The Algorithm
  JT: Program
  VO: 24
  NO: 4
  PP: 363-366
  PY: 1990
  PM: OCT

=head1 ISO character case functions

There are some additional functions which transpose some/most ISOlatin1
characters to upper and lower case. To allow for maximum speed there
are also I<destructive> versions which change the argument instead of
allocating a copy which is returned. For convenience, the destructive
versions also B<return> the argument. So all of the following are
valid and C<$word> will contain the lowercased string.

  $word = isolc($word);
  $word = disolc($word);
  disolc($word);

Here are the hardcoded characters which are recognized:

  abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß
  ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß

=item C<$new = >B<isolc>C<($word)>

=item B<disolc>C<($word)>

transposes to lower case.

=item C<$new = >B<isouc>C<($word)>

=item  B<disouc>C<($word)>

transposes to upper case.

=item C<$new = >B<isotr>C<($word)>

=item  B<disotr>C<($word)>

Remove non-letters according to the above table.

=item C<$new = >B<stop>C<($word)>

Returns an empty string if $word is a stopword.

=item C<$new = >B<grundform>C<($word)>

Calls Text::German::reduce

=item C<$new = >B<utf8iso>C<($word)>

Deprecated due to flux in perl versions between 5.005 and 5.8. The
function converts UTF8 encoded strings to ISO-8859-1. WAIT is
internally still based on the Latin1 character set, so if you process
anything in a different encoding, you should convert to Latin1 as the
first filter or refrain from using the iso-latin-1 based filter
functions. It is recommended that you use your own converter based on
the perl version you're using.

=item split, split2, split3, ...

The splitN functions all take a scalar as input and return a list of
words. Split acts just like the perl split(' '). Split2 eliminates all
words from the list that are shorter than 2 characters (bytes), split3
eliminates those shorter than 3 characters (bytes) and so on.

=back

=head1 AUTHOR

Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>

=head1 SEE ALSO

perl(1).

=cut

1	# -- Mode: Cperl --
2	# $Basename: Filter.pm $
3	# $Revision: 1.9 $
4	# ITIID : $ITI$ $Header $__Header$
5	# Author : Ulrich Pfeifer
6	# Created On : Thu Aug 15 18:09:51 1996
7	# Last Modified By: Ulrich Pfeifer
8	# Last Modified On: Sun Nov 22 18:44:46 1998
9	# Language : CPerl
10	# Update Count : 105
11	# Status : Unknown, Use with caution!
12	#
13	# Copyright (c) 1996-1997, Ulrich Pfeifer
14	#
15	package WAIT::Filter;
16	require WAIT;
17	use strict;
18	use Carp;
19	use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $AUTOLOAD);
20	use subs qw(grundform);
21
22	require Exporter;
23
24	@ISA = qw(Exporter);
25	@EXPORT_OK = qw(
26	Stem
27	Soundex
28	Phonix
29	Metaphone
30	isolc disolc
31	isouc disouc
32	isotr disotr
33	stop grundform
34	utf8iso
35	);
36	# (most implemented in WAIT.xs)
37
38	$VERSION = substr q$Revision: 1.9 $, 10;
39
40	sub split {
41	map split(' ', $_), @_;
42	}
43
44	$SPLIT = q[
45	sub splitXXX {
46	grep length($_)>=XXX, map split(' ', $_), @_;
47	}
48	];
49
50	sub AUTOLOAD {
51	my $func = $AUTOLOAD; $func =~ s/.*:://;
52
53	if ($func =~ /split(\d+)/) {
54	my $num = $1;
55	my $split = $SPLIT;
56
57	$split =~ s/XXX/$num/g;
58	eval $split;
59	if ($@ eq '') {
60	goto &$AUTOLOAD;
61	}
62	} elsif ($func eq 'grundform') {
63	eval {require Text::German;};
64	croak "You must have Text::German to use 'grundform'"
65	if $@ ne '';
66	*grundform = Text::German->can('reduce');
67	goto &grundform;
68	} elsif ($func eq 'date') {
69	eval {require Time::ParseDate;};
70	croak "You must have Time::ParseDate to use 'date'"
71	if $@ ne '';
72	*date = Time::ParseDate->can('parsedate');
73	goto \&date;
74	} elsif ($func eq 'decode_entities') {
75	eval {require HTML::Entities;};
76	croak "You must have HTML::Entities to use 'date'"
77	if $@ ne '';
78	*decode_entities = HTML::Entities->can('decode_entities');
79	goto &decode_entities;
80	} elsif ($func =~ /^d?utf8iso$/) {
81	no strict 'refs';
82	*$func = sub {
83	# Courtesy JHI
84	my $s = shift;
85	$s =~ s{([\xC0-\xDF])([\x80-\xBF])}
86	{chr(ord($1)<<6&0xC0\|ord($2)&0x3F)}eg;
87	$s;
88	};
89	goto \&$func;
90	}
91	Carp::confess "Class WAIT::Filter::$func not found";
92	}
93
94	while (<DATA>) {
95	chomp;
96	last if /__END__/;
97	next if /^\s*#/; # there's a comment
98	$STOP{$_}++;
99	}
100
101	sub stop {
102	if (exists $STOP{$_[0]}) {
103	''
104	} else {
105	$_[0];
106	}
107	}
108
109	sub gdate {
110	my $date = shift;
111
112	$date =~ s:(\d+)\.(\d+)\.(d+):$2/$1/$3:;
113	date($date);
114	}
115
116	1;
117	__DATA__
118	a
119	about
120	above
121	according
122	across
123	actually
124	adj
125	after
126	afterwards
127	again
128	against
129	all
130	almost
131	alone
132	along
133	already
134	also
135	although
136	always
137	among
138	amongst
139	an
140	and
141	another
142	any
143	anyhow
144	anyone
145	anything
146	anywhere
147	are
148	aren't
149	around
150	as
151	at
152	b
153	be
154	became
155	because
156	become
157	becomes
158	becoming
159	been
160	before
161	beforehand
162	begin
163	beginning
164	behind
165	being
166	below
167	beside
168	besides
169	between
170	beyond
171	billion
172	both
173	but
174	by
175	c
176	can
177	can't
178	cannot
179	caption
180	co
181	co.
182	could
183	couldn't
184	d
185	did
186	didn't
187	do
188	does
189	doesn't
190	don't
191	down
192	during
193	e
194	eg
195	eight
196	eighty
197	either
198	else
199	elsewhere
200	end
201	ending
202	enough
203	etc
204	even
205	ever
206	every
207	everyone
208	everything
209	everywhere
210	except
211	f
212	few
213	fifty
214	first
215	five
216	vfor
217	former
218	formerly
219	forty
220	found
221	four
222	from
223	further
224	g
225	h
226	had
227	has
228	hasn't
229	have
230	haven't
231	he
232	he'd
233	he'll
234	he's
235	hence
236	her
237	here
238	here's
239	hereafter
240	hereby
241	herein
242	hereupon
243	hers
244	herself
245	him
246	himself
247	his
248	how
249	however
250	hundred
251	i
252	i'd
253	i'll
254	i'm
255	i've
256	ie
257	if
258	in
259	inc.
260	indeed
261	instead
262	into
263	is
264	isn't
265	it
266	it's
267	its
268	itself
269	j
270	k
271	l
272	last
273	later
274	latter
275	latterly
276	least
277	less
278	let
279	let's
280	like
281	likely
282	ltd
283	m
284	made
285	make
286	makes
287	many
288	maybe
289	me
290	meantime
291	meanwhile
292	might
293	million
294	miss
295	more
296	moreover
297	most
298	mostly
299	mr
300	mrs
301	much
302	must
303	my
304	myself
305	n
306	namely
307	neither
308	never
309	nevertheless
310	next
311	nine
312	ninety
313	no
314	nobody
315	none
316	nonetheless
317	noone
318	nor
319	not
320	nothing
321	now
322	nowhere
323	o
324	of
325	off
326	often
327	on
328	once
329	one
330	one's
331	only
332	onto
333	or
334	other
335	others
336	otherwise
337	our
338	ours
339	ourselves
340	out
341	over
342	overall
343	own
344	p
345	per
346	perhaps
347	q
348	r
349	rather
350	recent
351	recently
352	s
353	same
354	seem
355	seemed
356	seeming
357	seems
358	seven
359	seventy
360	several
361	she
362	she'd
363	she'll
364	she's
365	should
366	shouldn't
367	since
368	six
369	sixty
370	so
371	some
372	somehow
373	someone
374	something
375	sometime
376	sometimes
377	somewhere
378	still
379	stop
380	such
381	t
382	taking
383	ten
384	than
385	that
386	that'll
387	that's
388	that've
389	the
390	their
391	them
392	themselves
393	then
394	thence
395	there
396	there'd
397	there'll
398	there're
399	there's
400	there've
401	thereafter
402	thereby
403	therefore
404	therein
405	thereupon
406	these
407	they
408	they'd
409	they'll
410	they're
411	they've
412	thirty
413	this
414	those
415	though
416	thousand
417	three
418	through
419	throughout
420	thru
421	thus
422	to
423	together
424	too
425	toward
426	towards
427	trillion
428	twenty
429	two
430	u
431	under
432	unless
433	unlike
434	unlikely
435	until
436	up
437	upon
438	us
439	used
440	using
441	v
442	very
443	via
444	w
445	was
446	wasn't
447	we
448	we'd
449	we'll
450	we're
451	we've
452	well
453	were
454	weren't
455	what
456	what'll
457	what's
458	what've
459	whatever
460	when
461	whence
462	whenever
463	where
464	where's
465	whereafter
466	whereas
467	whereby
468	wherein
469	whereupon
470	wherever
471	whether
472	which
473	while
474	whither
475	who
476	who'd
477	who'll
478	who's
479	whoever
480	whole
481	whom
482	whomever
483	whose
484	why
485	will
486	with
487	within
488	without
489	won't
490	would
491	wouldn't
492	x
493	y
494	yes
495	yet
496	you
497	you'd
498	you'll
499	you're
500	you've
501	your
502	yours
503	yourself
504	yourselves
505	z
506	# occuring in more than 100 files
507	acc
508	accent
509	accents
510	and
511	are
512	bell
513	can
514	character
515	corrections
516	crt
517	daisy
518	dash
519	date
520	defined
521	definitions
522	description
523	devices
524	diablo
525	dummy
526	factors
527	following
528	font
529	for
530	from
531	fudge
532	give
533	have
534	header
535	holds
536	log
537	logo
538	low
539	lpr
540	mark
541	name
542	nroff
543	out
544	output
545	pitch
546	put
547	rcsfile
548	reference
549	resolution
550	revision
551	see
552	set
553	simple
554	smi
555	some
556	string
557	synopsis
558	system
559	that
560	the
561	this
562	translation
563	troff
564	typewriter
565	ucb
566	unbreakable
567	use
568	used
569	user
570	vroff
571	wheel
572	will
573	with
574	you
575	__END__
576	# Below is the stub of documentation for your module. You better edit it!
577
578	=head1 NAME
579
580	WAIT::Filter - Perl extension providing the basic freeWAIS-sf reduction functions
581
582	=head1 SYNOPSIS
583
584	use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc
585	isotr disotr stop grundform);
586
587	$stem = Stem($word);
588	$scode = Soundex($word);
589	$pcode = Phonix($word);
590	$lword = isolc($word);
591	disolc($word);
592	$uword = isouc($word);
593	disouc($word);
594	$trword = isotr($word);
595	disotr($word);
596	$word = stop($word);
597	$word = grundform($word);
598
599	@words = WAIT::Filter::split($word);
600	@words = WAIT::Filter::split2($word);
601	@words = WAIT::Filter::split3($word);
602	@words = WAIT::Filter::split4($word); # arbitrary numbers allowed
603
604	=head1 DESCRIPTION
605
606	This tiny modules gives access to the basic reduction functions build
607	in B<freeWAIS-sf>.
608
609	=over 5
610
611	=item B<Stem>(I<word>)
612
613	reduces I<word> using the well know Porter algorithm.
614
615	AU: Porter, M.F.
616	TI: An Algorithm for Suffix Stripping
617	JT: Program
618	VO: 14
619	PP: 130-137
620	PY: 1980
621	PM: JUL
622
623	=item B<Soundex>(I<word>)
624
625
626	computes the 4 byte B<Soundex> code for I<word>.
627
628	AU: Gadd, T.N.
629	TI: 'Fisching for Werds'. Phonetic Retrieval of written text in
630	Information Retrieval Systems
631	JT: Program
632	VO: 22
633	NO: 3
634	PP: 222-237
635	PY: 1988
636
637
638	=item B<Phonix>(I<word>)
639
640	computes the 8 byte B<Phonix> code for I<word>.
641
642	AU: Gadd, T.N.
643	TI: PHONIX: The Algorithm
644	JT: Program
645	VO: 24
646	NO: 4
647	PP: 363-366
648	PY: 1990
649	PM: OCT
650
651	=head1 ISO charcater case functions
652
653	There are some additional function which transpose some/most ISOlatin1
654	characters to upper and lower case. To allow for maximum speed there
655	are also I<destructive> versions which change the argument instead of
656	allocating a copy which is returned. For convenience, the destructive
657	version also B<returns> the argument. So all of the following is
658	valid and C<$word> will contain the lowercased string.
659
660	$word = isolc($word);
661	$word = disolc($word);
662	disolc($word);
663
664	Here are the hardcoded characters which are recognized:
665
666	abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß
667	ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß
668
669	=item C<$new = >B<isolc>C<($word)>
670
671	=item B<disolc>C<($word)>
672
673	transposes to lower case.
674
675	=item C<$new = >B<isouc>C<($word)>
676
677	=item B<disouc>C<($word)>
678
679	transposes to upper case.
680
681	=item C<$new = >B<isotr>C<($word)>
682
683	=item B<disotr>C<($word)>
684
685	Remove non-letters according to the above table.
686
687	=item C<$new = >B<stop>C<($word)>
688
689	Returns an empty string if $word is a stopword.
690
691	=item C<$new = >B<grundform>C<($word)>
692
693	Calls Text::German::reduce
694
695	=item C<$new = >B<utf8iso>C<($word)>
696
697	Deprecated due to flux in perl versions between 5.005 and 5.8. The
698	function converts UTF8 encoded strings to ISO-8859-1. WAIT is
699	internally still based on the Latin1 character set, so if you process
700	anything in a different encoding, you should convert to Latin1 as the
701	first filter or refrain from using the iso-latin-1 based filter
702	functions. It is recommended that you use your own converter based on
703	the perl version you're using.
704
705	=item split, split2, split3, ...
706
707	The splitN funtions all take a scalar as input and return a list of
708	words. Split acts just like the perl split(' '). Split2 eliminates all
709	words from the list that are shorter than 2 characters (bytes), split3
710	eliminates those shorter than 3 characters (bytes) and so on.
711
712	=head1 AUTHOR
713
714	Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>
715
716	=head1 SEE ALSO
717
718	perl(1).
719
720	=cut
721