lib/WAIT/Filter.pm

#                              -*- Mode: Cperl -*- 
# $Basename: Filter.pm $
# $Revision: 1.9 $
# ITIID           : $ITI$ $Header $__Header$
# Author          : Ulrich Pfeifer
# Created On      : Thu Aug 15 18:09:51 1996
# Last Modified By: Ulrich Pfeifer
# Last Modified On: Sun Nov 22 18:44:46 1998
# Language        : CPerl
# Update Count    : 105
# Status          : Unknown, Use with caution!
#
# Copyright (c) 1996-1997, Ulrich Pfeifer
#
package WAIT::Filter;
require WAIT;
use strict;
use Carp;
use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $UNAC $ICONV $AUTOLOAD);
use subs qw(grundform);

use Text::Unaccent;
use Text::Iconv;

require Exporter;

# Only @EXPORT_OK is populated here, so each of the names below has to be
# requested explicitly in the caller's import list.
@ISA = qw(Exporter);
@EXPORT_OK = qw(
                Stem
                Soundex
                Phonix
                Metaphone
                isolc disolc
                isouc disouc
                isotr disotr
                stop grundform
                utf8iso
               );
# (most implemented in WAIT.xs)

# Version string taken from the RCS keyword: everything from offset 10
# onwards, i.e. the text that follows "Revision: ".
$VERSION = substr q$Revision: 1.9 $, 10;

# Break every argument into its whitespace-separated words and return all
# of them as one flat list.  The work is done by Perl's built-in split
# with the special ' ' separator, so leading whitespace never yields an
# empty leading word.
sub split {
  my @words;
  for my $text (@_) {
    push @words, CORE::split(' ', $text);
  }
  return @words;
}

# Source template for the splitN family of filters.  AUTOLOAD replaces
# every XXX with the number taken from the called name and evals the
# result.  The generated sub splits its arguments on whitespace and keeps
# only the words whose length is at least that number.
$SPLIT = q[
           sub splitXXX {
                          grep length($_)>=XXX, map split(' ', $_), @_;
                         }
          ];

# Source template for the unac_CHARSET family of filters.  AUTOLOAD
# replaces every CHARSET with the charset taken from the called name and
# evals the result.  The generated sub passes each argument through
# unac_string() for that charset, falls back to the original text when the
# result is false, and splits on whitespace.  Because the charset becomes
# part of a sub name it has to be usable inside a Perl identifier.
$UNAC = q[
        sub unac_CHARSET {
                map split(' ',unac_string('CHARSET', $_) || $_), @_;
        }
];

# Source template for the iconv_FROM_TO family of filters.  AUTOLOAD
# replaces CHARSETfrom and CHARSETto with the two charset names taken from
# the called name and evals the result.  The generated sub converts each
# argument with a Text::Iconv converter, falls back to the original text
# when the conversion yields a false value, and splits on whitespace.
#
# The converter is created on the first call and kept in a lexical that is
# private to the generated sub, so the template does not depend on any
# variable in the scope where it is eval'ed.  (The previous version had a
# stray ')' that kept the generated code from compiling at all, and it
# looked up its cache through an outer $iconv that the dispatcher shadows
# with the template text.)  Because both charset names become part of a
# sub name they have to be usable inside a Perl identifier.
$ICONV = q[
        {
                my $converter;
                sub iconv_CHARSETfrom_CHARSETto {
                        $converter ||= Text::Iconv->new('CHARSETfrom','CHARSETto');
                        map split(' ',$converter->convert($_) || $_), @_;
                }
        }
];
                

# Build filter functions on demand.  The unqualified name of the function
# that was called selects what gets installed:
#
#   splitN             compiled from the $SPLIT template
#   grundform          alias for Text::German's reduce
#   date               alias for Time::ParseDate's parsedate
#   decode_entities    alias for HTML::Entities' decode_entities
#   utf8iso, dutf8iso  two-byte UTF-8 sequences folded to single characters
#   unac_CHARSET       compiled from the $UNAC template
#   iconv_FROM_TO      compiled from the $ICONV template
#
# Once installed, the function is entered with goto so that it receives
# the caller's original @_.  Croaks when a required optional module cannot
# be loaded; confesses for every other name and when a template fails to
# compile (the compile error is appended to the message).
#
# The template patterns are anchored and restricted to word characters:
# a partial match would install a sub under a different name than the one
# that was called, sending goto straight back into AUTOLOAD, and anything
# else would be pasted unchecked into a string eval.
sub AUTOLOAD {
  my $func = $AUTOLOAD; $func =~ s/.*:://;
  my $error = '';               # compile error of a template eval, if any

  if ($func =~ /^split(\d+)$/) {
    my $num = $1;
    my $split = $SPLIT;

    $split =~ s/XXX/$num/g;
    eval $split;
    if ($@ eq '') {
      goto &$AUTOLOAD;
    }
    $error = $@;
  } elsif ($func eq 'grundform') {
    eval {require Text::German;};
    croak "You must have Text::German to use 'grundform'"
      if $@ ne '';
    *grundform = Text::German->can('reduce');
    goto &grundform;
  } elsif ($func eq 'date') {
    eval {require Time::ParseDate;};
    croak "You must have Time::ParseDate to use 'date'"
      if $@ ne '';
    *date = Time::ParseDate->can('parsedate');
    goto \&date;
  } elsif ($func eq 'decode_entities') {
    eval {require HTML::Entities;};
    croak "You must have HTML::Entities to use 'decode_entities'"
      if $@ ne '';
    *decode_entities = HTML::Entities->can('decode_entities');
    goto &decode_entities;
  } elsif ($func =~ /^d?utf8iso$/) {
    no strict 'refs';
    # Both names get the same implementation; it works on a copy of its
    # argument and returns the converted copy.
    *$func = sub {
      # Courtesy JHI
      my $s = shift;
      $s =~ s{([\xC0-\xDF])([\x80-\xBF])}
             {chr(ord($1)<<6&0xC0|ord($2)&0x3F)}eg;
      $s;
    };
    goto \&$func;
  } elsif ($func =~ /^unac_(\w+)$/) {
    my $charset = $1;
    my $unac = $UNAC;

    $unac =~ s/CHARSET/$charset/g;
    eval $unac;
    if ($@ eq '') {
      goto &$func;
    }
    $error = $@;
  } elsif ($func =~ /^iconv_([^\W_]+)_([^\W_]+)$/) {
    my ($cf,$ct) = ($1,$2);
    # Not called $iconv: the eval'ed code must not see the template text
    # under that name.
    my $code = $ICONV;

    $code =~ s/CHARSETfrom/$cf/gs;
    $code =~ s/CHARSETto/$ct/gs;
    eval $code;
    if ($@ eq '') {
      goto &$func;
    }
    $error = $@;
  }
  Carp::confess "Class WAIT::Filter::$func not found"
    . ($error ne '' ? ": $error" : '');
}

# Fill %STOP from the DATA section: one stop word per line, lines starting
# with '#' are comments, and the __END__ line terminates the list (the POD
# follows it).  A lexical line variable is used instead of the implicit
# while (<DATA>) form because that form assigns to the global $_ without
# localising it, which would clobber the $_ of whatever code is loading
# this module.
while (defined(my $word = <DATA>)) {
  chomp $word;
  last if $word =~ /__END__/;
  next if $word =~ /^\s*#/; # there's a comment
  $STOP{$word}++;
}
close DATA;

# Stop word filter: yields the empty string when the word is a key of
# %STOP and hands the word back unchanged otherwise.
sub stop {
  my ($word) = @_;
  return exists $STOP{$word} ? '' : $word;
}

# Date filter for German style dates.  The first day.month.year group in
# the argument is rewritten to month/day/year and the result is passed to
# date(), which AUTOLOAD aliases to Time::ParseDate's parsedate.
sub gdate {
  my $date = shift;

  # The year used to be matched with (d+), i.e. literal 'd' characters, so
  # a numeric date was never rewritten.
  $date =~ s:(\d+)\.(\d+)\.(\d+):$2/$1/$3:;
  date($date);
}

1;

__DATA__
a
about
above
according
across
actually
adj
after
afterwards
again
against
all
almost
alone
along
already
also
although
always
among
amongst
an
and
another
any
anyhow
anyone
anything
anywhere
are
aren't
around
as
at
b
be
became
because
become
becomes
becoming
been
before
beforehand
begin
beginning
behind
being
below
beside
besides
between
beyond
billion
both
but
by
c
can
can't
cannot
caption
co
co.
could
couldn't
d
did
didn't
do
does
doesn't
don't
down
during
e
eg
eight
eighty
either
else
elsewhere
end
ending
enough
etc
even
ever
every
everyone
everything
everywhere
except
f
few
fifty
first
five
for
former
formerly
forty
found
four
from
further
g
h
had
has
hasn't
have
haven't
he
he'd
he'll
he's
hence
her
here
here's
hereafter
hereby
herein
hereupon
hers
herself
him
himself
his
how
however
hundred
i
i'd
i'll
i'm
i've
ie
if
in
inc.
indeed
instead
into
is
isn't
it
it's
its
itself
j
k
l
last
later
latter
latterly
least
less
let
let's
like
likely
ltd
m
made
make
makes
many
maybe
me
meantime
meanwhile
might
million
miss
more
moreover
most
mostly
mr
mrs
much
must
my
myself
n
namely
neither
never
nevertheless
next
nine
ninety
no
nobody
none
nonetheless
noone
nor
not
nothing
now
nowhere
o
of
off
often
on
once
one
one's
only
onto
or
other
others
otherwise
our
ours
ourselves
out
over
overall
own
p
per
perhaps
q
r
rather
recent
recently
s
same
seem
seemed
seeming
seems
seven
seventy
several
she
she'd
she'll
she's
should
shouldn't
since
six
sixty
so
some
somehow
someone
something
sometime
sometimes
somewhere
still
stop
such
t
taking
ten
than
that
that'll
that's
that've
the
their
them
themselves
then
thence
there
there'd
there'll
there're
there's
there've
thereafter
thereby
therefore
therein
thereupon
these
they
they'd
they'll
they're
they've
thirty
this
those
though
thousand
three
through
throughout
thru
thus
to
together
too
toward
towards
trillion
twenty
two
u
under
unless
unlike
unlikely
until
up
upon
us
used
using
v
very
via
w
was
wasn't
we
we'd
we'll
we're
we've
well
were
weren't
what
what'll
what's
what've
whatever
when
whence
whenever
where
where's
whereafter
whereas
whereby
wherein
whereupon
wherever
whether
which
while
whither
who
who'd
who'll
who's
whoever
whole
whom
whomever
whose
why
will
with
within
without
won't
would
wouldn't
x
y
yes
yet
you
you'd
you'll
you're
you've
your
yours
yourself
yourselves
z
# occurring in more than 100 files
acc
accent
accents
and
are
bell
can
character
corrections
crt
daisy
dash
date
defined
definitions
description
devices
diablo
dummy
factors
following
font
for
from
fudge
give
have
header
holds
log
logo
low
lpr
mark
name
nroff
out
output
pitch
put
rcsfile
reference
resolution
revision
see
set
simple
smi
some
string
synopsis
system
that
the
this
translation
troff
typewriter
ucb
unbreakable
use
used
user
vroff
wheel
will
with
you
__END__

=head1 NAME

WAIT::Filter - Perl extension providing the basic freeWAIS-sf reduction functions

=head1 SYNOPSIS

  use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc
                      isotr disotr stop grundform);

  $stem   = Stem($word);
  $scode  = Soundex($word);
  $pcode  = Phonix($word);
  $lword  = isolc($word);
  disolc($word);
  $uword  = isouc($word);
  disouc($word);
  $trword = isotr($word);
  disotr($word);
  $word   = stop($word);
  $word   = grundform($word);

  @words = WAIT::Filter::split($word);
  @words = WAIT::Filter::split2($word);
  @words = WAIT::Filter::split3($word);
  @words = WAIT::Filter::split4($word); # arbitrary numbers allowed

=head1 DESCRIPTION

This tiny module gives access to the basic reduction functions built
into B<freeWAIS-sf>.

=over 5

=item B<Stem>(I<word>)

reduces I<word> using the well-known Porter algorithm.

  AU: Porter, M.F.
  TI: An Algorithm for Suffix Stripping
  JT: Program
  VO: 14
  PP: 130-137
  PY: 1980
  PM: JUL

=item B<Soundex>(I<word>)


computes the 4 byte B<Soundex> code for I<word>.

  AU: Gadd, T.N.
  TI: 'Fisching for Werds'. Phonetic Retrieval of written text in
      Information Retrieval Systems
  JT: Program
  VO: 22
  NO: 3
  PP: 222-237
  PY: 1988


=item B<Phonix>(I<word>)

computes the 8 byte B<Phonix> code for I<word>.

  AU: Gadd, T.N.
  TI: PHONIX: The Algorithm
  JT: Program
  VO: 24
  NO: 4
  PP: 363-366
  PY: 1990
  PM: OCT

=back

=head1 ISO character case functions

There are some additional functions which transpose some/most ISOlatin1
characters to upper and lower case. To allow for maximum speed there
are also I<destructive> versions which change the argument instead of
allocating a copy which is returned. For convenience, the destructive
version also B<returns> the argument. So all of the following is
valid and C<$word> will contain the lowercased string.

  $word = isolc($word);
  $word = disolc($word);
  disolc($word);

Here are the hardcoded characters which are recognized:

  abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß
  ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß

=over 5

=item C<$new = >B<isolc>C<($word)>

=item B<disolc>C<($word)>

transposes to lower case.

=item C<$new = >B<isouc>C<($word)>

=item  B<disouc>C<($word)>

transposes to upper case.

=item C<$new = >B<isotr>C<($word)>

=item  B<disotr>C<($word)>

Remove non-letters according to the above table.

=item C<$new = >B<stop>C<($word)>

Returns an empty string if $word is a stopword.

=item C<$new = >B<grundform>C<($word)>

Calls Text::German::reduce

=item C<$new = >B<utf8iso>C<($word)>

Deprecated due to flux in perl versions between 5.005 and 5.8. The
function converts UTF8 encoded strings to ISO-8859-1. WAIT is
internally still based on the Latin1 character set, so if you process
anything in a different encoding, you should convert to Latin1 as the
first filter or refrain from using the iso-latin-1 based filter
functions. It is recommended that you use your own converter based on
the perl version you're using.

=item split, split2, split3, ...

The splitN functions all take a scalar as input and return a list of
words. Split acts just like the perl split(' '). Split2 eliminates all
words from the list that are shorter than 2 characters (bytes), split3
eliminates those shorter than 3 characters (bytes) and so on.

=back

=head1 AUTHOR

Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>

=head1 SEE ALSO

perl(1).

=cut

1	ulpfr	13	# -- Mode: Cperl --
2	ulpfr	10	# $Basename: Filter.pm $
3	ulpfr	19	# $Revision: 1.9 $
4	ulpfr	10	# ITIID : $ITI$ $Header $__Header$
5			# Author : Ulrich Pfeifer
6			# Created On : Thu Aug 15 18:09:51 1996
7			# Last Modified By: Ulrich Pfeifer
8			# Last Modified On: Sun Nov 22 18:44:46 1998
9			# Language : CPerl
10			# Update Count : 105
11			# Status : Unknown, Use with caution!
12	ulpfr	13	#
13	ulpfr	10	# Copyright (c) 1996-1997, Ulrich Pfeifer
14	ulpfr	13	#
15	ulpfr	10	package WAIT::Filter;
16			require WAIT;
17			use strict;
18			use Carp;
19	dpavlin	118	use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $UNAC $ICONV $AUTOLOAD);
20	ulpfr	10	use subs qw(grundform);
21
22	dpavlin	118	use Text::Unaccent;
23			use Text::Iconv;
24
25	ulpfr	10	require Exporter;
26
27			@ISA = qw(Exporter);
28			@EXPORT_OK = qw(
29			Stem
30			Soundex
31			Phonix
32			Metaphone
33			isolc disolc
34			isouc disouc
35			isotr disotr
36			stop grundform
37	laperla	32	utf8iso
38	ulpfr	10	);
39	ulpfr	13	# (most implemented in WAIT.xs)
40	ulpfr	10
41	ulpfr	19	$VERSION = substr q$Revision: 1.9 $, 10;
42	ulpfr	10
43			sub split {
44			map split(' ', $_), @_;
45			}
46
47			$SPLIT = q[
48			sub splitXXX {
49			grep length($_)>=XXX, map split(' ', $_), @_;
50			}
51			];
52
53	dpavlin	118	$UNAC = q[
54			sub unac_CHARSET {
55			map split(' ',unac_string('CHARSET', $_) \|\| $_), @_;
56			}
57			];
58
59			my $iconv;
60
61			$ICONV = q[
62			sub iconv_CHARSETfrom_CHARSETto {
63			my $ic = $iconv->{'CHARSETfrom_CHARSETto'});
64			$ic \|\|= $iconv->{'CHARSETfrom_CHARSETto'} = Text::Iconv->new('CHARSETfrom','CHARSETto');
65			map split(' ',$ic->convert($_) \|\| $_), @_;
66			}
67			];
68
69
70	ulpfr	10	sub AUTOLOAD {
71			my $func = $AUTOLOAD; $func =~ s/.*:://;
72
73			if ($func =~ /split(\d+)/) {
74			my $num = $1;
75			my $split = $SPLIT;
76
77			$split =~ s/XXX/$num/g;
78			eval $split;
79			if ($@ eq '') {
80			goto &$AUTOLOAD;
81			}
82			} elsif ($func eq 'grundform') {
83			eval {require Text::German;};
84			croak "You must have Text::German to use 'grundform'"
85			if $@ ne '';
86			*grundform = Text::German->can('reduce');
87			goto &grundform;
88			} elsif ($func eq 'date') {
89			eval {require Time::ParseDate;};
90			croak "You must have Time::ParseDate to use 'date'"
91			if $@ ne '';
92			*date = Time::ParseDate->can('parsedate');
93			goto \&date;
94			} elsif ($func eq 'decode_entities') {
95			eval {require HTML::Entities;};
96	dpavlin	118	croak "You must have HTML::Entities to use 'decode_entities'"
97	ulpfr	10	if $@ ne '';
98			*decode_entities = HTML::Entities->can('decode_entities');
99			goto &decode_entities;
100	ulpfr	13	} elsif ($func =~ /^d?utf8iso$/) {
101			no strict 'refs';
102	laperla	32	*$func = sub {
103			# Courtesy JHI
104	laperla	41	my $s = shift;
105			$s =~ s{([\xC0-\xDF])([\x80-\xBF])}
106			{chr(ord($1)<<6&0xC0\|ord($2)&0x3F)}eg;
107			$s;
108	laperla	32	};
109			goto \&$func;
110	dpavlin	118	} elsif ($func =~ /unac_(.+)/) {
111			my $charset = $1;
112			my $unac = $UNAC;
113			$unac =~ s/CHARSET/$charset/g;
114			print "### $unac ###\n";
115			eval $unac;
116			if ($@ eq '') {
117			goto &$func;
118			}
119			} elsif ($func =~ /iconv_([^_]+)_([^_]+)/) {
120			my ($cf,$ct) = ($1,$2);
121			my $iconv = $ICONV;
122			print "### $cf -> $ct\n";
123			$iconv =~ s/CHARSETfrom/$cf/gs;
124			$iconv =~ s/CHARSETto/$ct/gs;
125			print "### $iconv ###\n";
126			eval $iconv;
127			if ($@ eq '') {
128			goto &$func;
129			}
130	ulpfr	10	}
131	ulpfr	13	Carp::confess "Class WAIT::Filter::$func not found";
132	ulpfr	10	}
133
134			while (<DATA>) {
135			chomp;
136			last if /__END__/;
137	ulpfr	19	next if /^\s*#/; # there's a comment
138	ulpfr	10	$STOP{$_}++;
139			}
140	laperla	50	close DATA;
141	ulpfr	10
142			sub stop {
143			if (exists $STOP{$_[0]}) {
144			''
145			} else {
146			$_[0];
147			}
148			}
149
150			sub gdate {
151			my $date = shift;
152
153			$date =~ s:(\d+)\.(\d+)\.(d+):$2/$1/$3:;
154			date($date);
155			}
156
157			1;
158	dpavlin	109
159	ulpfr	10	__DATA__
160			a
161			about
162			above
163			according
164			across
165			actually
166			adj
167			after
168			afterwards
169			again
170			against
171			all
172			almost
173			alone
174			along
175			already
176			also
177			although
178			always
179			among
180			amongst
181			an
182			and
183			another
184			any
185			anyhow
186			anyone
187			anything
188			anywhere
189			are
190			aren't
191			around
192			as
193			at
194			b
195			be
196			became
197			because
198			become
199			becomes
200			becoming
201			been
202			before
203			beforehand
204			begin
205			beginning
206			behind
207			being
208			below
209			beside
210			besides
211			between
212			beyond
213			billion
214			both
215			but
216			by
217			c
218			can
219			can't
220			cannot
221			caption
222			co
223			co.
224			could
225			couldn't
226			d
227			did
228			didn't
229			do
230			does
231			doesn't
232			don't
233			down
234			during
235			e
236			eg
237			eight
238			eighty
239			either
240			else
241			elsewhere
242			end
243			ending
244			enough
245			etc
246			even
247			ever
248			every
249			everyone
250			everything
251			everywhere
252			except
253			f
254			few
255			fifty
256			first
257			five
258			vfor
259			former
260			formerly
261			forty
262	ulpfr	13	found
263	ulpfr	10	four
264			from
265			further
266			g
267			h
268			had
269			has
270			hasn't
271			have
272			haven't
273			he
274			he'd
275			he'll
276			he's
277			hence
278			her
279			here
280			here's
281			hereafter
282			hereby
283			herein
284			hereupon
285			hers
286			herself
287			him
288			himself
289			his
290			how
291			however
292			hundred
293			i
294			i'd
295			i'll
296			i'm
297			i've
298			ie
299			if
300			in
301			inc.
302			indeed
303			instead
304			into
305			is
306			isn't
307			it
308			it's
309			its
310			itself
311			j
312			k
313			l
314			last
315			later
316			latter
317			latterly
318			least
319			less
320			let
321			let's
322			like
323			likely
324			ltd
325			m
326			made
327			make
328			makes
329			many
330			maybe
331			me
332			meantime
333			meanwhile
334			might
335			million
336			miss
337			more
338			moreover
339			most
340			mostly
341			mr
342			mrs
343			much
344			must
345			my
346			myself
347			n
348			namely
349			neither
350			never
351			nevertheless
352			next
353			nine
354			ninety
355			no
356			nobody
357			none
358			nonetheless
359			noone
360			nor
361			not
362			nothing
363			now
364			nowhere
365			o
366			of
367			off
368			often
369			on
370			once
371			one
372			one's
373			only
374			onto
375			or
376			other
377			others
378			otherwise
379			our
380			ours
381			ourselves
382			out
383			over
384			overall
385			own
386			p
387			per
388			perhaps
389			q
390			r
391			rather
392			recent
393			recently
394			s
395			same
396			seem
397			seemed
398			seeming
399			seems
400			seven
401			seventy
402			several
403			she
404			she'd
405			she'll
406			she's
407			should
408			shouldn't
409			since
410			six
411			sixty
412			so
413			some
414			somehow
415			someone
416			something
417			sometime
418			sometimes
419			somewhere
420			still
421			stop
422			such
423			t
424			taking
425			ten
426			than
427			that
428			that'll
429			that's
430			that've
431			the
432			their
433			them
434			themselves
435			then
436			thence
437			there
438			there'd
439			there'll
440			there're
441			there's
442			there've
443			thereafter
444			thereby
445			therefore
446			therein
447			thereupon
448			these
449			they
450			they'd
451			they'll
452			they're
453			they've
454			thirty
455			this
456			those
457			though
458			thousand
459			three
460			through
461			throughout
462			thru
463			thus
464			to
465			together
466			too
467			toward
468			towards
469			trillion
470			twenty
471			two
472			u
473			under
474			unless
475			unlike
476			unlikely
477			until
478			up
479			upon
480			us
481			used
482			using
483			v
484			very
485			via
486			w
487			was
488			wasn't
489			we
490			we'd
491			we'll
492			we're
493			we've
494			well
495			were
496			weren't
497			what
498			what'll
499			what's
500			what've
501			whatever
502			when
503			whence
504			whenever
505			where
506			where's
507			whereafter
508			whereas
509			whereby
510			wherein
511			whereupon
512			wherever
513			whether
514			which
515			while
516			whither
517			who
518			who'd
519			who'll
520			who's
521			whoever
522			whole
523			whom
524			whomever
525			whose
526			why
527			will
528			with
529			within
530			without
531			won't
532			would
533			wouldn't
534			x
535			y
536			yes
537			yet
538			you
539			you'd
540			you'll
541			you're
542			you've
543			your
544			yours
545			yourself
546			yourselves
547			z
548			# occuring in more than 100 files
549			acc
550			accent
551			accents
552			and
553			are
554			bell
555			can
556			character
557			corrections
558			crt
559			daisy
560			dash
561			date
562			defined
563			definitions
564			description
565			devices
566			diablo
567			dummy
568			factors
569			following
570			font
571			for
572			from
573			fudge
574			give
575			have
576			header
577			holds
578			log
579			logo
580			low
581			lpr
582			mark
583			name
584			nroff
585			out
586			output
587			pitch
588			put
589			rcsfile
590			reference
591			resolution
592			revision
593			see
594			set
595			simple
596			smi
597			some
598			string
599			synopsis
600			system
601			that
602			the
603			this
604			translation
605			troff
606			typewriter
607			ucb
608			unbreakable
609			use
610			used
611			user
612			vroff
613			wheel
614			will
615			with
616			you
617			__END__
618
619			=head1 NAME
620
621			WAIT::Filter - Perl extension providing the basic freeWAIS-sf reduction functions
622
623			=head1 SYNOPSIS
624
625	ulpfr	13	use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc
626	laperla	32	isotr disotr stop grundform);
627	ulpfr	10
628	ulpfr	13	$stem = Stem($word);
629			$scode = Soundex($word);
630			$pcode = Phonix($word);
631			$lword = isolc($word);
632	ulpfr	10	disolc($word);
633	ulpfr	13	$uword = isouc($word);
634	ulpfr	10	disouc($word);
635	ulpfr	13	$trword = isotr($word);
636			disotr($word);
637			$word = stop($word);
638			$word = grundform($word);
639	ulpfr	10
640	ulpfr	13	@words = WAIT::Filter::split($word);
641			@words = WAIT::Filter::split2($word);
642			@words = WAIT::Filter::split3($word);
643			@words = WAIT::Filter::split4($word); # arbitrary numbers allowed
644
645	ulpfr	10	=head1 DESCRIPTION
646
647			This tiny modules gives access to the basic reduction functions build
648			in B<freeWAIS-sf>.
649
650			=over 5
651
652			=item B<Stem>(I<word>)
653
654			reduces I<word> using the well know Porter algorithm.
655
656			AU: Porter, M.F.
657			TI: An Algorithm for Suffix Stripping
658			JT: Program
659			VO: 14
660			PP: 130-137
661			PY: 1980
662			PM: JUL
663
664			=item B<Soundex>(I<word>)
665
666
667			computes the 4 byte B<Soundex> code for I<word>.
668
669			AU: Gadd, T.N.
670			TI: 'Fisching for Werds'. Phonetic Retrieval of written text in
671			Information Retrieval Systems
672			JT: Program
673			VO: 22
674			NO: 3
675			PP: 222-237
676			PY: 1988
677
678
679			=item B<Phonix>(I<word>)
680
681			computes the 8 byte B<Phonix> code for I<word>.
682
683			AU: Gadd, T.N.
684			TI: PHONIX: The Algorithm
685			JT: Program
686			VO: 24
687			NO: 4
688			PP: 363-366
689			PY: 1990
690			PM: OCT
691
692	dpavlin	109	=back
693
694	ulpfr	10	=head1 ISO charcater case functions
695
696			There are some additional function which transpose some/most ISOlatin1
697			characters to upper and lower case. To allow for maximum speed there
698			are also I<destructive> versions which change the argument instead of
699			allocating a copy which is returned. For convenience, the destructive
700	ulpfr	13	version also B<returns> the argument. So all of the following is
701	ulpfr	10	valid and C<$word> will contain the lowercased string.
702
703	ulpfr	13	$word = isolc($word);
704	ulpfr	10	$word = disolc($word);
705			disolc($word);
706
707			Here are the hardcoded characters which are recognized:
708
709			abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß
710			ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß
711
712	dpavlin	109	=over 5
713
714	ulpfr	10	=item C<$new = >B<isolc>C<($word)>
715
716			=item B<disolc>C<($word)>
717
718			transposes to lower case.
719
720			=item C<$new = >B<isouc>C<($word)>
721
722			=item B<disouc>C<($word)>
723
724			transposes to upper case.
725
726	ulpfr	13	=item C<$new = >B<isotr>C<($word)>
727
728			=item B<disotr>C<($word)>
729
730			Remove non-letters according to the above table.
731
732			=item C<$new = >B<stop>C<($word)>
733
734			Returns an empty string if $word is a stopword.
735
736			=item C<$new = >B<grundform>C<($word)>
737
738			Calls Text::German::reduce
739
740			=item C<$new = >B<utf8iso>C<($word)>
741
742	laperla	32	Deprecated due to flux in perl versions between 5.005 and 5.8. The
743			function converts UTF8 encoded strings to ISO-8859-1. WAIT is
744			internally still based on the Latin1 character set, so if you process
745	ulpfr	13	anything in a different encoding, you should convert to Latin1 as the
746	laperla	32	first filter or refrain from using the iso-latin-1 based filter
747			functions. It is recommended that you use your own converter based on
748			the perl version you're using.
749	ulpfr	13
750			=item split, split2, split3, ...
751
752			The splitN funtions all take a scalar as input and return a list of
753			words. Split acts just like the perl split(' '). Split2 eliminates all
754			words from the list that are shorter than 2 characters (bytes), split3
755			eliminates those shorter than 3 characters (bytes) and so on.
756
757	dpavlin	109	=back
758
759	ulpfr	10	=head1 AUTHOR
760
761			Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>
762
763			=head1 SEE ALSO
764
765			perl(1).
766
767			=cut
768