lib/WAIT/Filter.pm

#                              -*- Mode: Cperl -*- 
# $Basename: Filter.pm $
# $Revision: 1.9 $
# ITIID           : $ITI$ $Header $__Header$
# Author          : Ulrich Pfeifer
# Created On      : Thu Aug 15 18:09:51 1996
# Last Modified By: Ulrich Pfeifer
# Last Modified On: Sun Nov 22 18:44:46 1998
# Language        : CPerl
# Update Count    : 105
# Status          : Unknown, Use with caution!
#
# Copyright (c) 1996-1997, Ulrich Pfeifer
#
package WAIT::Filter;
require WAIT;
use strict;
use Carp;
use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $UNAC $ICONV $AUTOLOAD);
use subs qw(grundform);

use Text::Unaccent;
use Text::Iconv;

require Exporter;

@ISA = qw(Exporter);
@EXPORT_OK = qw(
                Stem
                Soundex
                Phonix
                Metaphone
                isolc disolc
                isouc disouc
                isotr disotr
                stop grundform
                utf8iso
               );
# (most implemented in WAIT.xs)

$VERSION = substr q$Revision: 1.9 $, 10;

sub split {
  map split(' ', $_), @_;
}

$SPLIT = q[
           sub splitXXX {
                          grep length($_)>=XXX, map split(' ', $_), @_;
                         }
          ];

$UNAC = q[
        sub unac_CHARSET {
                map split(' ',unac_string('CHARSET', $_) || $_), @_;
        }
];

my $iconv;

$ICONV = q[
        sub iconv_CHARSETfrom_CHARSETto {
                my $ic = $iconv->{'CHARSETfrom_CHARSETto'});
                $ic ||= $iconv->{'CHARSETfrom_CHARSETto'} = Text::Iconv->new('CHARSETfrom','CHARSETto');
                map split(' ',$ic->convert($_) || $_), @_;
        }
];
                

sub AUTOLOAD {
  my $func = $AUTOLOAD; $func =~ s/.*:://;

  if ($func =~ /split(\d+)/) {
    my $num = $1;
    my $split = $SPLIT;

    $split =~ s/XXX/$num/g;
    eval $split;
    if ($@ eq '') {
      goto &$AUTOLOAD;
    }
  } elsif ($func eq 'grundform') {
    eval {require Text::German;};
    croak "You must have Text::German to use 'grundform'"
      if $@ ne '';
    *grundform = Text::German->can('reduce');
    goto &grundform;
  } elsif ($func eq 'date') {
    eval {require Time::ParseDate;};
    croak "You must have Time::ParseDate to use 'date'"
      if $@ ne '';
    *date = Time::ParseDate->can('parsedate');
    goto \&date;
  } elsif ($func eq 'decode_entities') {
    eval {require HTML::Entities;};
    croak "You must have HTML::Entities to use 'decode_entities'"
      if $@ ne '';
    *decode_entities = HTML::Entities->can('decode_entities');
    goto &decode_entities;
  } elsif ($func =~ /^d?utf8iso$/) {
    no strict 'refs';
    *$func = sub {
      # Courtesy JHI
      my $s = shift;
      $s =~ s{([\xC0-\xDF])([\x80-\xBF])}
             {chr(ord($1)<<6&0xC0|ord($2)&0x3F)}eg;
      $s;
    };
    goto \&$func;
  } elsif ($func =~ /unac_(.+)/) {
    my $charset = $1;
    my $unac = $UNAC;
    $unac =~ s/CHARSET/$charset/g;
print "### $unac ###\n";
    eval $unac;
    if ($@ eq '') {
     goto &$func;
    }
  } elsif ($func =~ /iconv_([^_]+)_([^_]+)/) {
    my ($cf,$ct) = ($1,$2);
    my $iconv = $ICONV;
print "### $cf -> $ct\n";
    $iconv =~ s/CHARSETfrom/$cf/gs;
    $iconv =~ s/CHARSETto/$ct/gs;
print "### $iconv ###\n";
    eval $iconv;
    if ($@ eq '') {
     goto &$func;
    }
  }
  Carp::confess "Class WAIT::Filter::$func not found";
}

while (<DATA>) {
  chomp;
  last if /__END__/;
  next if /^\s*#/; # there's a comment
  $STOP{$_}++;
}
close DATA;

sub stop {
  if (exists $STOP{$_[0]}) {
    ''
  } else {
    $_[0];
  }
}

sub gdate {
  my $date = shift;

  $date =~ s:(\d+)\.(\d+)\.(d+):$2/$1/$3:;
  date($date);
}

1;

__DATA__
a
about
above
according
across
actually
adj
after
afterwards
again
against
all
almost
alone
along
already
also
although
always
among
amongst
an
and
another
any
anyhow
anyone
anything
anywhere
are
aren't
around
as
at
b
be
became
because
become
becomes
becoming
been
before
beforehand
begin
beginning
behind
being
below
beside
besides
between
beyond
billion
both
but
by
c
can
can't
cannot
caption
co
co.
could
couldn't
d
did
didn't
do
does
doesn't
don't
down
during
e
eg
eight
eighty
either
else
elsewhere
end
ending
enough
etc
even
ever
every
everyone
everything
everywhere
except
f
few
fifty
first
five
vfor
former
formerly
forty
found
four
from
further
g
h
had
has
hasn't
have
haven't
he
he'd
he'll
he's
hence
her
here
here's
hereafter
hereby
herein
hereupon
hers
herself
him
himself
his
how
however
hundred
i
i'd
i'll
i'm
i've
ie
if
in
inc.
indeed
instead
into
is
isn't
it
it's
its
itself
j
k
l
last
later
latter
latterly
least
less
let
let's
like
likely
ltd
m
made
make
makes
many
maybe
me
meantime
meanwhile
might
million
miss
more
moreover
most
mostly
mr
mrs
much
must
my
myself
n
namely
neither
never
nevertheless
next
nine
ninety
no
nobody
none
nonetheless
noone
nor
not
nothing
now
nowhere
o
of
off
often
on
once
one
one's
only
onto
or
other
others
otherwise
our
ours
ourselves
out
over
overall
own
p
per
perhaps
q
r
rather
recent
recently
s
same
seem
seemed
seeming
seems
seven
seventy
several
she
she'd
she'll
she's
should
shouldn't
since
six
sixty
so
some
somehow
someone
something
sometime
sometimes
somewhere
still
stop
such
t
taking
ten
than
that
that'll
that's
that've
the
their
them
themselves
then
thence
there
there'd
there'll
there're
there's
there've
thereafter
thereby
therefore
therein
thereupon
these
they
they'd
they'll
they're
they've
thirty
this
those
though
thousand
three
through
throughout
thru
thus
to
together
too
toward
towards
trillion
twenty
two
u
under
unless
unlike
unlikely
until
up
upon
us
used
using
v
very
via
w
was
wasn't
we
we'd
we'll
we're
we've
well
were
weren't
what
what'll
what's
what've
whatever
when
whence
whenever
where
where's
whereafter
whereas
whereby
wherein
whereupon
wherever
whether
which
while
whither
who
who'd
who'll
who's
whoever
whole
whom
whomever
whose
why
will
with
within
without
won't
would
wouldn't
x
y
yes
yet
you
you'd
you'll
you're
you've
your
yours
yourself
yourselves
z
# occuring in more than 100 files
acc
accent
accents
and
are
bell
can
character
corrections
crt
daisy
dash
date
defined
definitions
description
devices
diablo
dummy
factors
following
font
for
from
fudge
give
have
header
holds
log
logo
low
lpr
mark
name
nroff
out
output
pitch
put
rcsfile
reference
resolution
revision
see
set
simple
smi
some
string
synopsis
system
that
the
this
translation
troff
typewriter
ucb
unbreakable
use
used
user
vroff
wheel
will
with
you
__END__

=head1 NAME

WAIT::Filter - Perl extension providing the basic freeWAIS-sf reduction functions

=head1 SYNOPSIS

  use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc
                      isotr disotr stop grundform);

  $stem   = Stem($word);
  $scode  = Soundex($word);
  $pcode  = Phonix($word);
  $lword  = isolc($word);
  disolc($word);
  $uword  = isouc($word);
  disouc($word);
  $trword = isotr($word);
  disotr($word);
  $word   = stop($word);
  $word   = grundform($word);

  @words = WAIT::Filter::split($word);
  @words = WAIT::Filter::split2($word);
  @words = WAIT::Filter::split3($word);
  @words = WAIT::Filter::split4($word); # arbitrary numbers allowed

=head1 DESCRIPTION

This tiny modules gives access to the basic reduction functions build
in B<freeWAIS-sf>.

=over 5

=item B<Stem>(I<word>)

reduces I<word> using the well know Porter algorithm.

  AU: Porter, M.F.
  TI: An Algorithm for Suffix Stripping
  JT: Program
  VO: 14
  PP: 130-137
  PY: 1980
  PM: JUL

=item B<Soundex>(I<word>)


computes the 4 byte B<Soundex> code for I<word>.

  AU: Gadd, T.N.
  TI: 'Fisching for Werds'. Phonetic Retrieval of written text in
      Information Retrieval Systems
  JT: Program
  VO: 22
  NO: 3
  PP: 222-237
  PY: 1988


=item B<Phonix>(I<word>)

computes the 8 byte B<Phonix> code for I<word>.

  AU: Gadd, T.N.
  TI: PHONIX: The Algorithm
  JT: Program
  VO: 24
  NO: 4
  PP: 363-366
  PY: 1990
  PM: OCT

=back

=head1 ISO charcater case functions

There are some additional function which transpose some/most ISOlatin1
characters to upper and lower case. To allow for maximum speed there
are also I<destructive> versions which change the argument instead of
allocating a copy which is returned. For convenience, the destructive
version also B<returns> the argument. So all of the following is
valid and C<$word> will contain the lowercased string.

  $word = isolc($word);
  $word = disolc($word);
  disolc($word);

Here are the hardcoded characters which are recognized:

  abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß
  ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß

=over 5

=item C<$new = >B<isolc>C<($word)>

=item B<disolc>C<($word)>

transposes to lower case.

=item C<$new = >B<isouc>C<($word)>

=item  B<disouc>C<($word)>

transposes to upper case.

=item C<$new = >B<isotr>C<($word)>

=item  B<disotr>C<($word)>

Remove non-letters according to the above table.

=item C<$new = >B<stop>C<($word)>

Returns an empty string if $word is a stopword.

=item C<$new = >B<grundform>C<($word)>

Calls Text::German::reduce

=item C<$new = >B<utf8iso>C<($word)>

Deprecated due to flux in perl versions between 5.005 and 5.8. The
function converts UTF8 encoded strings to ISO-8859-1. WAIT is
internally still based on the Latin1 character set, so if you process
anything in a different encoding, you should convert to Latin1 as the
first filter or refrain from using the iso-latin-1 based filter
functions. It is recommended that you use your own converter based on
the perl version you're using.

=item split, split2, split3, ...

The splitN funtions all take a scalar as input and return a list of
words. Split acts just like the perl split(' '). Split2 eliminates all
words from the list that are shorter than 2 characters (bytes), split3
eliminates those shorter than 3 characters (bytes) and so on.

=back

=head1 AUTHOR

Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>

=head1 SEE ALSO

perl(1).

=cut

1	# -- Mode: Cperl --
2	# $Basename: Filter.pm $
3	# $Revision: 1.9 $
4	# ITIID : $ITI$ $Header $__Header$
5	# Author : Ulrich Pfeifer
6	# Created On : Thu Aug 15 18:09:51 1996
7	# Last Modified By: Ulrich Pfeifer
8	# Last Modified On: Sun Nov 22 18:44:46 1998
9	# Language : CPerl
10	# Update Count : 105
11	# Status : Unknown, Use with caution!
12	#
13	# Copyright (c) 1996-1997, Ulrich Pfeifer
14	#
15	package WAIT::Filter;
16	require WAIT;
17	use strict;
18	use Carp;
19	use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $UNAC $ICONV $AUTOLOAD);
20	use subs qw(grundform);
21
22	use Text::Unaccent;
23	use Text::Iconv;
24
25	require Exporter;
26
27	@ISA = qw(Exporter);
28	@EXPORT_OK = qw(
29	Stem
30	Soundex
31	Phonix
32	Metaphone
33	isolc disolc
34	isouc disouc
35	isotr disotr
36	stop grundform
37	utf8iso
38	);
39	# (most implemented in WAIT.xs)
40
41	$VERSION = substr q$Revision: 1.9 $, 10;
42
43	sub split {
44	map split(' ', $_), @_;
45	}
46
47	$SPLIT = q[
48	sub splitXXX {
49	grep length($_)>=XXX, map split(' ', $_), @_;
50	}
51	];
52
53	$UNAC = q[
54	sub unac_CHARSET {
55	map split(' ',unac_string('CHARSET', $_) \|\| $_), @_;
56	}
57	];
58
59	my $iconv;
60
61	$ICONV = q[
62	sub iconv_CHARSETfrom_CHARSETto {
63	my $ic = $iconv->{'CHARSETfrom_CHARSETto'});
64	$ic \|\|= $iconv->{'CHARSETfrom_CHARSETto'} = Text::Iconv->new('CHARSETfrom','CHARSETto');
65	map split(' ',$ic->convert($_) \|\| $_), @_;
66	}
67	];
68
69
70	sub AUTOLOAD {
71	my $func = $AUTOLOAD; $func =~ s/.*:://;
72
73	if ($func =~ /split(\d+)/) {
74	my $num = $1;
75	my $split = $SPLIT;
76
77	$split =~ s/XXX/$num/g;
78	eval $split;
79	if ($@ eq '') {
80	goto &$AUTOLOAD;
81	}
82	} elsif ($func eq 'grundform') {
83	eval {require Text::German;};
84	croak "You must have Text::German to use 'grundform'"
85	if $@ ne '';
86	*grundform = Text::German->can('reduce');
87	goto &grundform;
88	} elsif ($func eq 'date') {
89	eval {require Time::ParseDate;};
90	croak "You must have Time::ParseDate to use 'date'"
91	if $@ ne '';
92	*date = Time::ParseDate->can('parsedate');
93	goto \&date;
94	} elsif ($func eq 'decode_entities') {
95	eval {require HTML::Entities;};
96	croak "You must have HTML::Entities to use 'decode_entities'"
97	if $@ ne '';
98	*decode_entities = HTML::Entities->can('decode_entities');
99	goto &decode_entities;
100	} elsif ($func =~ /^d?utf8iso$/) {
101	no strict 'refs';
102	*$func = sub {
103	# Courtesy JHI
104	my $s = shift;
105	$s =~ s{([\xC0-\xDF])([\x80-\xBF])}
106	{chr(ord($1)<<6&0xC0\|ord($2)&0x3F)}eg;
107	$s;
108	};
109	goto \&$func;
110	} elsif ($func =~ /unac_(.+)/) {
111	my $charset = $1;
112	my $unac = $UNAC;
113	$unac =~ s/CHARSET/$charset/g;
114	print "### $unac ###\n";
115	eval $unac;
116	if ($@ eq '') {
117	goto &$func;
118	}
119	} elsif ($func =~ /iconv_([^_]+)_([^_]+)/) {
120	my ($cf,$ct) = ($1,$2);
121	my $iconv = $ICONV;
122	print "### $cf -> $ct\n";
123	$iconv =~ s/CHARSETfrom/$cf/gs;
124	$iconv =~ s/CHARSETto/$ct/gs;
125	print "### $iconv ###\n";
126	eval $iconv;
127	if ($@ eq '') {
128	goto &$func;
129	}
130	}
131	Carp::confess "Class WAIT::Filter::$func not found";
132	}
133
134	while (<DATA>) {
135	chomp;
136	last if /__END__/;
137	next if /^\s*#/; # there's a comment
138	$STOP{$_}++;
139	}
140	close DATA;
141
142	sub stop {
143	if (exists $STOP{$_[0]}) {
144	''
145	} else {
146	$_[0];
147	}
148	}
149
150	sub gdate {
151	my $date = shift;
152
153	$date =~ s:(\d+)\.(\d+)\.(d+):$2/$1/$3:;
154	date($date);
155	}
156
157	1;
158
159	__DATA__
160	a
161	about
162	above
163	according
164	across
165	actually
166	adj
167	after
168	afterwards
169	again
170	against
171	all
172	almost
173	alone
174	along
175	already
176	also
177	although
178	always
179	among
180	amongst
181	an
182	and
183	another
184	any
185	anyhow
186	anyone
187	anything
188	anywhere
189	are
190	aren't
191	around
192	as
193	at
194	b
195	be
196	became
197	because
198	become
199	becomes
200	becoming
201	been
202	before
203	beforehand
204	begin
205	beginning
206	behind
207	being
208	below
209	beside
210	besides
211	between
212	beyond
213	billion
214	both
215	but
216	by
217	c
218	can
219	can't
220	cannot
221	caption
222	co
223	co.
224	could
225	couldn't
226	d
227	did
228	didn't
229	do
230	does
231	doesn't
232	don't
233	down
234	during
235	e
236	eg
237	eight
238	eighty
239	either
240	else
241	elsewhere
242	end
243	ending
244	enough
245	etc
246	even
247	ever
248	every
249	everyone
250	everything
251	everywhere
252	except
253	f
254	few
255	fifty
256	first
257	five
258	vfor
259	former
260	formerly
261	forty
262	found
263	four
264	from
265	further
266	g
267	h
268	had
269	has
270	hasn't
271	have
272	haven't
273	he
274	he'd
275	he'll
276	he's
277	hence
278	her
279	here
280	here's
281	hereafter
282	hereby
283	herein
284	hereupon
285	hers
286	herself
287	him
288	himself
289	his
290	how
291	however
292	hundred
293	i
294	i'd
295	i'll
296	i'm
297	i've
298	ie
299	if
300	in
301	inc.
302	indeed
303	instead
304	into
305	is
306	isn't
307	it
308	it's
309	its
310	itself
311	j
312	k
313	l
314	last
315	later
316	latter
317	latterly
318	least
319	less
320	let
321	let's
322	like
323	likely
324	ltd
325	m
326	made
327	make
328	makes
329	many
330	maybe
331	me
332	meantime
333	meanwhile
334	might
335	million
336	miss
337	more
338	moreover
339	most
340	mostly
341	mr
342	mrs
343	much
344	must
345	my
346	myself
347	n
348	namely
349	neither
350	never
351	nevertheless
352	next
353	nine
354	ninety
355	no
356	nobody
357	none
358	nonetheless
359	noone
360	nor
361	not
362	nothing
363	now
364	nowhere
365	o
366	of
367	off
368	often
369	on
370	once
371	one
372	one's
373	only
374	onto
375	or
376	other
377	others
378	otherwise
379	our
380	ours
381	ourselves
382	out
383	over
384	overall
385	own
386	p
387	per
388	perhaps
389	q
390	r
391	rather
392	recent
393	recently
394	s
395	same
396	seem
397	seemed
398	seeming
399	seems
400	seven
401	seventy
402	several
403	she
404	she'd
405	she'll
406	she's
407	should
408	shouldn't
409	since
410	six
411	sixty
412	so
413	some
414	somehow
415	someone
416	something
417	sometime
418	sometimes
419	somewhere
420	still
421	stop
422	such
423	t
424	taking
425	ten
426	than
427	that
428	that'll
429	that's
430	that've
431	the
432	their
433	them
434	themselves
435	then
436	thence
437	there
438	there'd
439	there'll
440	there're
441	there's
442	there've
443	thereafter
444	thereby
445	therefore
446	therein
447	thereupon
448	these
449	they
450	they'd
451	they'll
452	they're
453	they've
454	thirty
455	this
456	those
457	though
458	thousand
459	three
460	through
461	throughout
462	thru
463	thus
464	to
465	together
466	too
467	toward
468	towards
469	trillion
470	twenty
471	two
472	u
473	under
474	unless
475	unlike
476	unlikely
477	until
478	up
479	upon
480	us
481	used
482	using
483	v
484	very
485	via
486	w
487	was
488	wasn't
489	we
490	we'd
491	we'll
492	we're
493	we've
494	well
495	were
496	weren't
497	what
498	what'll
499	what's
500	what've
501	whatever
502	when
503	whence
504	whenever
505	where
506	where's
507	whereafter
508	whereas
509	whereby
510	wherein
511	whereupon
512	wherever
513	whether
514	which
515	while
516	whither
517	who
518	who'd
519	who'll
520	who's
521	whoever
522	whole
523	whom
524	whomever
525	whose
526	why
527	will
528	with
529	within
530	without
531	won't
532	would
533	wouldn't
534	x
535	y
536	yes
537	yet
538	you
539	you'd
540	you'll
541	you're
542	you've
543	your
544	yours
545	yourself
546	yourselves
547	z
548	# occuring in more than 100 files
549	acc
550	accent
551	accents
552	and
553	are
554	bell
555	can
556	character
557	corrections
558	crt
559	daisy
560	dash
561	date
562	defined
563	definitions
564	description
565	devices
566	diablo
567	dummy
568	factors
569	following
570	font
571	for
572	from
573	fudge
574	give
575	have
576	header
577	holds
578	log
579	logo
580	low
581	lpr
582	mark
583	name
584	nroff
585	out
586	output
587	pitch
588	put
589	rcsfile
590	reference
591	resolution
592	revision
593	see
594	set
595	simple
596	smi
597	some
598	string
599	synopsis
600	system
601	that
602	the
603	this
604	translation
605	troff
606	typewriter
607	ucb
608	unbreakable
609	use
610	used
611	user
612	vroff
613	wheel
614	will
615	with
616	you
617	__END__
618
619	=head1 NAME
620
621	WAIT::Filter - Perl extension providing the basic freeWAIS-sf reduction functions
622
623	=head1 SYNOPSIS
624
625	use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc
626	isotr disotr stop grundform);
627
628	$stem = Stem($word);
629	$scode = Soundex($word);
630	$pcode = Phonix($word);
631	$lword = isolc($word);
632	disolc($word);
633	$uword = isouc($word);
634	disouc($word);
635	$trword = isotr($word);
636	disotr($word);
637	$word = stop($word);
638	$word = grundform($word);
639
640	@words = WAIT::Filter::split($word);
641	@words = WAIT::Filter::split2($word);
642	@words = WAIT::Filter::split3($word);
643	@words = WAIT::Filter::split4($word); # arbitrary numbers allowed
644
645	=head1 DESCRIPTION
646
647	This tiny modules gives access to the basic reduction functions build
648	in B<freeWAIS-sf>.
649
650	=over 5
651
652	=item B<Stem>(I<word>)
653
654	reduces I<word> using the well know Porter algorithm.
655
656	AU: Porter, M.F.
657	TI: An Algorithm for Suffix Stripping
658	JT: Program
659	VO: 14
660	PP: 130-137
661	PY: 1980
662	PM: JUL
663
664	=item B<Soundex>(I<word>)
665
666
667	computes the 4 byte B<Soundex> code for I<word>.
668
669	AU: Gadd, T.N.
670	TI: 'Fisching for Werds'. Phonetic Retrieval of written text in
671	Information Retrieval Systems
672	JT: Program
673	VO: 22
674	NO: 3
675	PP: 222-237
676	PY: 1988
677
678
679	=item B<Phonix>(I<word>)
680
681	computes the 8 byte B<Phonix> code for I<word>.
682
683	AU: Gadd, T.N.
684	TI: PHONIX: The Algorithm
685	JT: Program
686	VO: 24
687	NO: 4
688	PP: 363-366
689	PY: 1990
690	PM: OCT
691
692	=back
693
694	=head1 ISO charcater case functions
695
696	There are some additional function which transpose some/most ISOlatin1
697	characters to upper and lower case. To allow for maximum speed there
698	are also I<destructive> versions which change the argument instead of
699	allocating a copy which is returned. For convenience, the destructive
700	version also B<returns> the argument. So all of the following is
701	valid and C<$word> will contain the lowercased string.
702
703	$word = isolc($word);
704	$word = disolc($word);
705	disolc($word);
706
707	Here are the hardcoded characters which are recognized:
708
709	abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß
710	ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß
711
712	=over 5
713
714	=item C<$new = >B<isolc>C<($word)>
715
716	=item B<disolc>C<($word)>
717
718	transposes to lower case.
719
720	=item C<$new = >B<isouc>C<($word)>
721
722	=item B<disouc>C<($word)>
723
724	transposes to upper case.
725
726	=item C<$new = >B<isotr>C<($word)>
727
728	=item B<disotr>C<($word)>
729
730	Remove non-letters according to the above table.
731
732	=item C<$new = >B<stop>C<($word)>
733
734	Returns an empty string if $word is a stopword.
735
736	=item C<$new = >B<grundform>C<($word)>
737
738	Calls Text::German::reduce
739
740	=item C<$new = >B<utf8iso>C<($word)>
741
742	Deprecated due to flux in perl versions between 5.005 and 5.8. The
743	function converts UTF8 encoded strings to ISO-8859-1. WAIT is
744	internally still based on the Latin1 character set, so if you process
745	anything in a different encoding, you should convert to Latin1 as the
746	first filter or refrain from using the iso-latin-1 based filter
747	functions. It is recommended that you use your own converter based on
748	the perl version you're using.
749
750	=item split, split2, split3, ...
751
752	The splitN funtions all take a scalar as input and return a list of
753	words. Split acts just like the perl split(' '). Split2 eliminates all
754	words from the list that are shorter than 2 characters (bytes), split3
755	eliminates those shorter than 3 characters (bytes) and so on.
756
757	=back
758
759	=head1 AUTHOR
760
761	Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>
762
763	=head1 SEE ALSO
764
765	perl(1).
766
767	=cut
768