1 |
ulpfr |
13 |
# -*- Mode: Cperl -*- |
2 |
ulpfr |
10 |
# $Basename: Filter.pm $ |
3 |
ulpfr |
19 |
# $Revision: 1.9 $ |
4 |
ulpfr |
10 |
# ITIID : $ITI$ $Header $__Header$ |
5 |
|
|
# Author : Ulrich Pfeifer |
6 |
|
|
# Created On : Thu Aug 15 18:09:51 1996 |
7 |
|
|
# Last Modified By: Ulrich Pfeifer |
8 |
|
|
# Last Modified On: Sun Nov 22 18:44:46 1998 |
9 |
|
|
# Language : CPerl |
10 |
|
|
# Update Count : 105 |
11 |
|
|
# Status : Unknown, Use with caution! |
12 |
ulpfr |
13 |
# |
13 |
ulpfr |
10 |
# Copyright (c) 1996-1997, Ulrich Pfeifer |
14 |
ulpfr |
13 |
# |
15 |
ulpfr |
10 |
package WAIT::Filter; |
16 |
|
|
require WAIT; |
17 |
|
|
use strict; |
18 |
|
|
use Carp; |
19 |
dpavlin |
118 |
use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $UNAC $ICONV $AUTOLOAD); |
20 |
ulpfr |
10 |
use subs qw(grundform); |
21 |
|
|
|
22 |
dpavlin |
118 |
use Text::Unaccent; |
23 |
|
|
use Text::Iconv; |
24 |
|
|
|
25 |
ulpfr |
10 |
require Exporter; |
26 |
|
|
|
27 |
|
|
@ISA = qw(Exporter); |
28 |
|
|
@EXPORT_OK = qw( |
29 |
|
|
Stem |
30 |
|
|
Soundex |
31 |
|
|
Phonix |
32 |
|
|
Metaphone |
33 |
|
|
isolc disolc |
34 |
|
|
isouc disouc |
35 |
|
|
isotr disotr |
36 |
|
|
stop grundform |
37 |
laperla |
32 |
utf8iso |
38 |
ulpfr |
10 |
); |
39 |
ulpfr |
13 |
# (most implemented in WAIT.xs) |
40 |
ulpfr |
10 |
|
41 |
ulpfr |
19 |
$VERSION = substr q$Revision: 1.9 $, 10; |
42 |
ulpfr |
10 |
|
43 |
|
|
sub split { |
44 |
|
|
map split(' ', $_), @_; |
45 |
|
|
} |
46 |
|
|
|
47 |
|
|
$SPLIT = q[ |
48 |
|
|
sub splitXXX { |
49 |
|
|
grep length($_)>=XXX, map split(' ', $_), @_; |
50 |
|
|
} |
51 |
|
|
]; |
52 |
|
|
|
53 |
dpavlin |
118 |
$UNAC = q[ |
54 |
|
|
sub unac_CHARSET { |
55 |
|
|
map split(' ',unac_string('CHARSET', $_) || $_), @_; |
56 |
|
|
} |
57 |
|
|
]; |
58 |
|
|
|
59 |
|
|
my $iconv; |
60 |
|
|
|
61 |
|
|
$ICONV = q[ |
62 |
|
|
sub iconv_CHARSETfrom_CHARSETto { |
63 |
|
|
my $ic = $iconv->{'CHARSETfrom_CHARSETto'}); |
64 |
|
|
$ic ||= $iconv->{'CHARSETfrom_CHARSETto'} = Text::Iconv->new('CHARSETfrom','CHARSETto'); |
65 |
|
|
map split(' ',$ic->convert($_) || $_), @_; |
66 |
|
|
} |
67 |
|
|
]; |
68 |
|
|
|
69 |
|
|
|
70 |
ulpfr |
10 |
sub AUTOLOAD { |
71 |
|
|
my $func = $AUTOLOAD; $func =~ s/.*:://; |
72 |
|
|
|
73 |
|
|
if ($func =~ /split(\d+)/) { |
74 |
|
|
my $num = $1; |
75 |
|
|
my $split = $SPLIT; |
76 |
|
|
|
77 |
|
|
$split =~ s/XXX/$num/g; |
78 |
|
|
eval $split; |
79 |
|
|
if ($@ eq '') { |
80 |
|
|
goto &$AUTOLOAD; |
81 |
|
|
} |
82 |
|
|
} elsif ($func eq 'grundform') { |
83 |
|
|
eval {require Text::German;}; |
84 |
|
|
croak "You must have Text::German to use 'grundform'" |
85 |
|
|
if $@ ne ''; |
86 |
|
|
*grundform = Text::German->can('reduce'); |
87 |
|
|
goto &grundform; |
88 |
|
|
} elsif ($func eq 'date') { |
89 |
|
|
eval {require Time::ParseDate;}; |
90 |
|
|
croak "You must have Time::ParseDate to use 'date'" |
91 |
|
|
if $@ ne ''; |
92 |
|
|
*date = Time::ParseDate->can('parsedate'); |
93 |
|
|
goto \&date; |
94 |
|
|
} elsif ($func eq 'decode_entities') { |
95 |
|
|
eval {require HTML::Entities;}; |
96 |
dpavlin |
118 |
croak "You must have HTML::Entities to use 'decode_entities'" |
97 |
ulpfr |
10 |
if $@ ne ''; |
98 |
|
|
*decode_entities = HTML::Entities->can('decode_entities'); |
99 |
|
|
goto &decode_entities; |
100 |
ulpfr |
13 |
} elsif ($func =~ /^d?utf8iso$/) { |
101 |
|
|
no strict 'refs'; |
102 |
laperla |
32 |
*$func = sub { |
103 |
|
|
# Courtesy JHI |
104 |
laperla |
41 |
my $s = shift; |
105 |
|
|
$s =~ s{([\xC0-\xDF])([\x80-\xBF])} |
106 |
|
|
{chr(ord($1)<<6&0xC0|ord($2)&0x3F)}eg; |
107 |
|
|
$s; |
108 |
laperla |
32 |
}; |
109 |
|
|
goto \&$func; |
110 |
dpavlin |
118 |
} elsif ($func =~ /unac_(.+)/) { |
111 |
|
|
my $charset = $1; |
112 |
|
|
my $unac = $UNAC; |
113 |
|
|
$unac =~ s/CHARSET/$charset/g; |
114 |
|
|
print "### $unac ###\n"; |
115 |
|
|
eval $unac; |
116 |
|
|
if ($@ eq '') { |
117 |
|
|
goto &$func; |
118 |
|
|
} |
119 |
|
|
} elsif ($func =~ /iconv_([^_]+)_([^_]+)/) { |
120 |
|
|
my ($cf,$ct) = ($1,$2); |
121 |
|
|
my $iconv = $ICONV; |
122 |
|
|
print "### $cf -> $ct\n"; |
123 |
|
|
$iconv =~ s/CHARSETfrom/$cf/gs; |
124 |
|
|
$iconv =~ s/CHARSETto/$ct/gs; |
125 |
|
|
print "### $iconv ###\n"; |
126 |
|
|
eval $iconv; |
127 |
|
|
if ($@ eq '') { |
128 |
|
|
goto &$func; |
129 |
|
|
} |
130 |
ulpfr |
10 |
} |
131 |
ulpfr |
13 |
Carp::confess "Class WAIT::Filter::$func not found"; |
132 |
ulpfr |
10 |
} |
133 |
|
|
|
134 |
|
|
while (<DATA>) { |
135 |
|
|
chomp; |
136 |
|
|
last if /__END__/; |
137 |
ulpfr |
19 |
next if /^\s*#/; # there's a comment |
138 |
ulpfr |
10 |
$STOP{$_}++; |
139 |
|
|
} |
140 |
laperla |
50 |
close DATA; |
141 |
ulpfr |
10 |
|
142 |
|
|
sub stop { |
143 |
|
|
if (exists $STOP{$_[0]}) { |
144 |
|
|
'' |
145 |
|
|
} else { |
146 |
|
|
$_[0]; |
147 |
|
|
} |
148 |
|
|
} |
149 |
|
|
|
150 |
|
|
sub gdate { |
151 |
|
|
my $date = shift; |
152 |
|
|
|
153 |
|
|
$date =~ s:(\d+)\.(\d+)\.(d+):$2/$1/$3:; |
154 |
|
|
date($date); |
155 |
|
|
} |
156 |
|
|
|
157 |
|
|
1; |
158 |
dpavlin |
109 |
|
159 |
ulpfr |
10 |
__DATA__ |
160 |
|
|
a |
161 |
|
|
about |
162 |
|
|
above |
163 |
|
|
according |
164 |
|
|
across |
165 |
|
|
actually |
166 |
|
|
adj |
167 |
|
|
after |
168 |
|
|
afterwards |
169 |
|
|
again |
170 |
|
|
against |
171 |
|
|
all |
172 |
|
|
almost |
173 |
|
|
alone |
174 |
|
|
along |
175 |
|
|
already |
176 |
|
|
also |
177 |
|
|
although |
178 |
|
|
always |
179 |
|
|
among |
180 |
|
|
amongst |
181 |
|
|
an |
182 |
|
|
and |
183 |
|
|
another |
184 |
|
|
any |
185 |
|
|
anyhow |
186 |
|
|
anyone |
187 |
|
|
anything |
188 |
|
|
anywhere |
189 |
|
|
are |
190 |
|
|
aren't |
191 |
|
|
around |
192 |
|
|
as |
193 |
|
|
at |
194 |
|
|
b |
195 |
|
|
be |
196 |
|
|
became |
197 |
|
|
because |
198 |
|
|
become |
199 |
|
|
becomes |
200 |
|
|
becoming |
201 |
|
|
been |
202 |
|
|
before |
203 |
|
|
beforehand |
204 |
|
|
begin |
205 |
|
|
beginning |
206 |
|
|
behind |
207 |
|
|
being |
208 |
|
|
below |
209 |
|
|
beside |
210 |
|
|
besides |
211 |
|
|
between |
212 |
|
|
beyond |
213 |
|
|
billion |
214 |
|
|
both |
215 |
|
|
but |
216 |
|
|
by |
217 |
|
|
c |
218 |
|
|
can |
219 |
|
|
can't |
220 |
|
|
cannot |
221 |
|
|
caption |
222 |
|
|
co |
223 |
|
|
co. |
224 |
|
|
could |
225 |
|
|
couldn't |
226 |
|
|
d |
227 |
|
|
did |
228 |
|
|
didn't |
229 |
|
|
do |
230 |
|
|
does |
231 |
|
|
doesn't |
232 |
|
|
don't |
233 |
|
|
down |
234 |
|
|
during |
235 |
|
|
e |
236 |
|
|
eg |
237 |
|
|
eight |
238 |
|
|
eighty |
239 |
|
|
either |
240 |
|
|
else |
241 |
|
|
elsewhere |
242 |
|
|
end |
243 |
|
|
ending |
244 |
|
|
enough |
245 |
|
|
etc |
246 |
|
|
even |
247 |
|
|
ever |
248 |
|
|
every |
249 |
|
|
everyone |
250 |
|
|
everything |
251 |
|
|
everywhere |
252 |
|
|
except |
253 |
|
|
f |
254 |
|
|
few |
255 |
|
|
fifty |
256 |
|
|
first |
257 |
|
|
five |
258 |
|
|
vfor |
259 |
|
|
former |
260 |
|
|
formerly |
261 |
|
|
forty |
262 |
ulpfr |
13 |
found |
263 |
ulpfr |
10 |
four |
264 |
|
|
from |
265 |
|
|
further |
266 |
|
|
g |
267 |
|
|
h |
268 |
|
|
had |
269 |
|
|
has |
270 |
|
|
hasn't |
271 |
|
|
have |
272 |
|
|
haven't |
273 |
|
|
he |
274 |
|
|
he'd |
275 |
|
|
he'll |
276 |
|
|
he's |
277 |
|
|
hence |
278 |
|
|
her |
279 |
|
|
here |
280 |
|
|
here's |
281 |
|
|
hereafter |
282 |
|
|
hereby |
283 |
|
|
herein |
284 |
|
|
hereupon |
285 |
|
|
hers |
286 |
|
|
herself |
287 |
|
|
him |
288 |
|
|
himself |
289 |
|
|
his |
290 |
|
|
how |
291 |
|
|
however |
292 |
|
|
hundred |
293 |
|
|
i |
294 |
|
|
i'd |
295 |
|
|
i'll |
296 |
|
|
i'm |
297 |
|
|
i've |
298 |
|
|
ie |
299 |
|
|
if |
300 |
|
|
in |
301 |
|
|
inc. |
302 |
|
|
indeed |
303 |
|
|
instead |
304 |
|
|
into |
305 |
|
|
is |
306 |
|
|
isn't |
307 |
|
|
it |
308 |
|
|
it's |
309 |
|
|
its |
310 |
|
|
itself |
311 |
|
|
j |
312 |
|
|
k |
313 |
|
|
l |
314 |
|
|
last |
315 |
|
|
later |
316 |
|
|
latter |
317 |
|
|
latterly |
318 |
|
|
least |
319 |
|
|
less |
320 |
|
|
let |
321 |
|
|
let's |
322 |
|
|
like |
323 |
|
|
likely |
324 |
|
|
ltd |
325 |
|
|
m |
326 |
|
|
made |
327 |
|
|
make |
328 |
|
|
makes |
329 |
|
|
many |
330 |
|
|
maybe |
331 |
|
|
me |
332 |
|
|
meantime |
333 |
|
|
meanwhile |
334 |
|
|
might |
335 |
|
|
million |
336 |
|
|
miss |
337 |
|
|
more |
338 |
|
|
moreover |
339 |
|
|
most |
340 |
|
|
mostly |
341 |
|
|
mr |
342 |
|
|
mrs |
343 |
|
|
much |
344 |
|
|
must |
345 |
|
|
my |
346 |
|
|
myself |
347 |
|
|
n |
348 |
|
|
namely |
349 |
|
|
neither |
350 |
|
|
never |
351 |
|
|
nevertheless |
352 |
|
|
next |
353 |
|
|
nine |
354 |
|
|
ninety |
355 |
|
|
no |
356 |
|
|
nobody |
357 |
|
|
none |
358 |
|
|
nonetheless |
359 |
|
|
noone |
360 |
|
|
nor |
361 |
|
|
not |
362 |
|
|
nothing |
363 |
|
|
now |
364 |
|
|
nowhere |
365 |
|
|
o |
366 |
|
|
of |
367 |
|
|
off |
368 |
|
|
often |
369 |
|
|
on |
370 |
|
|
once |
371 |
|
|
one |
372 |
|
|
one's |
373 |
|
|
only |
374 |
|
|
onto |
375 |
|
|
or |
376 |
|
|
other |
377 |
|
|
others |
378 |
|
|
otherwise |
379 |
|
|
our |
380 |
|
|
ours |
381 |
|
|
ourselves |
382 |
|
|
out |
383 |
|
|
over |
384 |
|
|
overall |
385 |
|
|
own |
386 |
|
|
p |
387 |
|
|
per |
388 |
|
|
perhaps |
389 |
|
|
q |
390 |
|
|
r |
391 |
|
|
rather |
392 |
|
|
recent |
393 |
|
|
recently |
394 |
|
|
s |
395 |
|
|
same |
396 |
|
|
seem |
397 |
|
|
seemed |
398 |
|
|
seeming |
399 |
|
|
seems |
400 |
|
|
seven |
401 |
|
|
seventy |
402 |
|
|
several |
403 |
|
|
she |
404 |
|
|
she'd |
405 |
|
|
she'll |
406 |
|
|
she's |
407 |
|
|
should |
408 |
|
|
shouldn't |
409 |
|
|
since |
410 |
|
|
six |
411 |
|
|
sixty |
412 |
|
|
so |
413 |
|
|
some |
414 |
|
|
somehow |
415 |
|
|
someone |
416 |
|
|
something |
417 |
|
|
sometime |
418 |
|
|
sometimes |
419 |
|
|
somewhere |
420 |
|
|
still |
421 |
|
|
stop |
422 |
|
|
such |
423 |
|
|
t |
424 |
|
|
taking |
425 |
|
|
ten |
426 |
|
|
than |
427 |
|
|
that |
428 |
|
|
that'll |
429 |
|
|
that's |
430 |
|
|
that've |
431 |
|
|
the |
432 |
|
|
their |
433 |
|
|
them |
434 |
|
|
themselves |
435 |
|
|
then |
436 |
|
|
thence |
437 |
|
|
there |
438 |
|
|
there'd |
439 |
|
|
there'll |
440 |
|
|
there're |
441 |
|
|
there's |
442 |
|
|
there've |
443 |
|
|
thereafter |
444 |
|
|
thereby |
445 |
|
|
therefore |
446 |
|
|
therein |
447 |
|
|
thereupon |
448 |
|
|
these |
449 |
|
|
they |
450 |
|
|
they'd |
451 |
|
|
they'll |
452 |
|
|
they're |
453 |
|
|
they've |
454 |
|
|
thirty |
455 |
|
|
this |
456 |
|
|
those |
457 |
|
|
though |
458 |
|
|
thousand |
459 |
|
|
three |
460 |
|
|
through |
461 |
|
|
throughout |
462 |
|
|
thru |
463 |
|
|
thus |
464 |
|
|
to |
465 |
|
|
together |
466 |
|
|
too |
467 |
|
|
toward |
468 |
|
|
towards |
469 |
|
|
trillion |
470 |
|
|
twenty |
471 |
|
|
two |
472 |
|
|
u |
473 |
|
|
under |
474 |
|
|
unless |
475 |
|
|
unlike |
476 |
|
|
unlikely |
477 |
|
|
until |
478 |
|
|
up |
479 |
|
|
upon |
480 |
|
|
us |
481 |
|
|
used |
482 |
|
|
using |
483 |
|
|
v |
484 |
|
|
very |
485 |
|
|
via |
486 |
|
|
w |
487 |
|
|
was |
488 |
|
|
wasn't |
489 |
|
|
we |
490 |
|
|
we'd |
491 |
|
|
we'll |
492 |
|
|
we're |
493 |
|
|
we've |
494 |
|
|
well |
495 |
|
|
were |
496 |
|
|
weren't |
497 |
|
|
what |
498 |
|
|
what'll |
499 |
|
|
what's |
500 |
|
|
what've |
501 |
|
|
whatever |
502 |
|
|
when |
503 |
|
|
whence |
504 |
|
|
whenever |
505 |
|
|
where |
506 |
|
|
where's |
507 |
|
|
whereafter |
508 |
|
|
whereas |
509 |
|
|
whereby |
510 |
|
|
wherein |
511 |
|
|
whereupon |
512 |
|
|
wherever |
513 |
|
|
whether |
514 |
|
|
which |
515 |
|
|
while |
516 |
|
|
whither |
517 |
|
|
who |
518 |
|
|
who'd |
519 |
|
|
who'll |
520 |
|
|
who's |
521 |
|
|
whoever |
522 |
|
|
whole |
523 |
|
|
whom |
524 |
|
|
whomever |
525 |
|
|
whose |
526 |
|
|
why |
527 |
|
|
will |
528 |
|
|
with |
529 |
|
|
within |
530 |
|
|
without |
531 |
|
|
won't |
532 |
|
|
would |
533 |
|
|
wouldn't |
534 |
|
|
x |
535 |
|
|
y |
536 |
|
|
yes |
537 |
|
|
yet |
538 |
|
|
you |
539 |
|
|
you'd |
540 |
|
|
you'll |
541 |
|
|
you're |
542 |
|
|
you've |
543 |
|
|
your |
544 |
|
|
yours |
545 |
|
|
yourself |
546 |
|
|
yourselves |
547 |
|
|
z |
548 |
|
|
# occuring in more than 100 files |
549 |
|
|
acc |
550 |
|
|
accent |
551 |
|
|
accents |
552 |
|
|
and |
553 |
|
|
are |
554 |
|
|
bell |
555 |
|
|
can |
556 |
|
|
character |
557 |
|
|
corrections |
558 |
|
|
crt |
559 |
|
|
daisy |
560 |
|
|
dash |
561 |
|
|
date |
562 |
|
|
defined |
563 |
|
|
definitions |
564 |
|
|
description |
565 |
|
|
devices |
566 |
|
|
diablo |
567 |
|
|
dummy |
568 |
|
|
factors |
569 |
|
|
following |
570 |
|
|
font |
571 |
|
|
for |
572 |
|
|
from |
573 |
|
|
fudge |
574 |
|
|
give |
575 |
|
|
have |
576 |
|
|
header |
577 |
|
|
holds |
578 |
|
|
log |
579 |
|
|
logo |
580 |
|
|
low |
581 |
|
|
lpr |
582 |
|
|
mark |
583 |
|
|
name |
584 |
|
|
nroff |
585 |
|
|
out |
586 |
|
|
output |
587 |
|
|
pitch |
588 |
|
|
put |
589 |
|
|
rcsfile |
590 |
|
|
reference |
591 |
|
|
resolution |
592 |
|
|
revision |
593 |
|
|
see |
594 |
|
|
set |
595 |
|
|
simple |
596 |
|
|
smi |
597 |
|
|
some |
598 |
|
|
string |
599 |
|
|
synopsis |
600 |
|
|
system |
601 |
|
|
that |
602 |
|
|
the |
603 |
|
|
this |
604 |
|
|
translation |
605 |
|
|
troff |
606 |
|
|
typewriter |
607 |
|
|
ucb |
608 |
|
|
unbreakable |
609 |
|
|
use |
610 |
|
|
used |
611 |
|
|
user |
612 |
|
|
vroff |
613 |
|
|
wheel |
614 |
|
|
will |
615 |
|
|
with |
616 |
|
|
you |
617 |
|
|
__END__ |
618 |
|
|
|
619 |
|
|
=head1 NAME |
620 |
|
|
|
621 |
|
|
WAIT::Filter - Perl extension providing the basic freeWAIS-sf reduction functions |
622 |
|
|
|
623 |
|
|
=head1 SYNOPSIS |
624 |
|
|
|
625 |
ulpfr |
13 |
use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc |
626 |
laperla |
32 |
isotr disotr stop grundform); |
627 |
ulpfr |
10 |
|
628 |
ulpfr |
13 |
$stem = Stem($word); |
629 |
|
|
$scode = Soundex($word); |
630 |
|
|
$pcode = Phonix($word); |
631 |
|
|
$lword = isolc($word); |
632 |
ulpfr |
10 |
disolc($word); |
633 |
ulpfr |
13 |
$uword = isouc($word); |
634 |
ulpfr |
10 |
disouc($word); |
635 |
ulpfr |
13 |
$trword = isotr($word); |
636 |
|
|
disotr($word); |
637 |
|
|
$word = stop($word); |
638 |
|
|
$word = grundform($word); |
639 |
ulpfr |
10 |
|
640 |
ulpfr |
13 |
@words = WAIT::Filter::split($word); |
641 |
|
|
@words = WAIT::Filter::split2($word); |
642 |
|
|
@words = WAIT::Filter::split3($word); |
643 |
|
|
@words = WAIT::Filter::split4($word); # arbitrary numbers allowed |
644 |
|
|
|
645 |
ulpfr |
10 |
=head1 DESCRIPTION |
646 |
|
|
|
647 |
|
|
This tiny modules gives access to the basic reduction functions build |
648 |
|
|
in B<freeWAIS-sf>. |
649 |
|
|
|
650 |
|
|
=over 5 |
651 |
|
|
|
652 |
|
|
=item B<Stem>(I<word>) |
653 |
|
|
|
654 |
|
|
reduces I<word> using the well know Porter algorithm. |
655 |
|
|
|
656 |
|
|
AU: Porter, M.F. |
657 |
|
|
TI: An Algorithm for Suffix Stripping |
658 |
|
|
JT: Program |
659 |
|
|
VO: 14 |
660 |
|
|
PP: 130-137 |
661 |
|
|
PY: 1980 |
662 |
|
|
PM: JUL |
663 |
|
|
|
664 |
|
|
=item B<Soundex>(I<word>) |
665 |
|
|
|
666 |
|
|
|
667 |
|
|
computes the 4 byte B<Soundex> code for I<word>. |
668 |
|
|
|
669 |
|
|
AU: Gadd, T.N. |
670 |
|
|
TI: 'Fisching for Werds'. Phonetic Retrieval of written text in |
671 |
|
|
Information Retrieval Systems |
672 |
|
|
JT: Program |
673 |
|
|
VO: 22 |
674 |
|
|
NO: 3 |
675 |
|
|
PP: 222-237 |
676 |
|
|
PY: 1988 |
677 |
|
|
|
678 |
|
|
|
679 |
|
|
=item B<Phonix>(I<word>) |
680 |
|
|
|
681 |
|
|
computes the 8 byte B<Phonix> code for I<word>. |
682 |
|
|
|
683 |
|
|
AU: Gadd, T.N. |
684 |
|
|
TI: PHONIX: The Algorithm |
685 |
|
|
JT: Program |
686 |
|
|
VO: 24 |
687 |
|
|
NO: 4 |
688 |
|
|
PP: 363-366 |
689 |
|
|
PY: 1990 |
690 |
|
|
PM: OCT |
691 |
|
|
|
692 |
dpavlin |
109 |
=back |
693 |
|
|
|
694 |
ulpfr |
10 |
=head1 ISO charcater case functions |
695 |
|
|
|
696 |
|
|
There are some additional function which transpose some/most ISOlatin1 |
697 |
|
|
characters to upper and lower case. To allow for maximum speed there |
698 |
|
|
are also I<destructive> versions which change the argument instead of |
699 |
|
|
allocating a copy which is returned. For convenience, the destructive |
700 |
ulpfr |
13 |
version also B<returns> the argument. So all of the following is |
701 |
ulpfr |
10 |
valid and C<$word> will contain the lowercased string. |
702 |
|
|
|
703 |
ulpfr |
13 |
$word = isolc($word); |
704 |
ulpfr |
10 |
$word = disolc($word); |
705 |
|
|
disolc($word); |
706 |
|
|
|
707 |
|
|
Here are the hardcoded characters which are recognized: |
708 |
|
|
|
709 |
|
|
abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß |
710 |
|
|
ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß |
711 |
|
|
|
712 |
dpavlin |
109 |
=over 5 |
713 |
|
|
|
714 |
ulpfr |
10 |
=item C<$new = >B<isolc>C<($word)> |
715 |
|
|
|
716 |
|
|
=item B<disolc>C<($word)> |
717 |
|
|
|
718 |
|
|
transposes to lower case. |
719 |
|
|
|
720 |
|
|
=item C<$new = >B<isouc>C<($word)> |
721 |
|
|
|
722 |
|
|
=item B<disouc>C<($word)> |
723 |
|
|
|
724 |
|
|
transposes to upper case. |
725 |
|
|
|
726 |
ulpfr |
13 |
=item C<$new = >B<isotr>C<($word)> |
727 |
|
|
|
728 |
|
|
=item B<disotr>C<($word)> |
729 |
|
|
|
730 |
|
|
Remove non-letters according to the above table. |
731 |
|
|
|
732 |
|
|
=item C<$new = >B<stop>C<($word)> |
733 |
|
|
|
734 |
|
|
Returns an empty string if $word is a stopword. |
735 |
|
|
|
736 |
|
|
=item C<$new = >B<grundform>C<($word)> |
737 |
|
|
|
738 |
|
|
Calls Text::German::reduce |
739 |
|
|
|
740 |
|
|
=item C<$new = >B<utf8iso>C<($word)> |
741 |
|
|
|
742 |
laperla |
32 |
Deprecated due to flux in perl versions between 5.005 and 5.8. The |
743 |
|
|
function converts UTF8 encoded strings to ISO-8859-1. WAIT is |
744 |
|
|
internally still based on the Latin1 character set, so if you process |
745 |
ulpfr |
13 |
anything in a different encoding, you should convert to Latin1 as the |
746 |
laperla |
32 |
first filter or refrain from using the iso-latin-1 based filter |
747 |
|
|
functions. It is recommended that you use your own converter based on |
748 |
|
|
the perl version you're using. |
749 |
ulpfr |
13 |
|
750 |
|
|
=item split, split2, split3, ... |
751 |
|
|
|
752 |
|
|
The splitN funtions all take a scalar as input and return a list of |
753 |
|
|
words. Split acts just like the perl split(' '). Split2 eliminates all |
754 |
|
|
words from the list that are shorter than 2 characters (bytes), split3 |
755 |
|
|
eliminates those shorter than 3 characters (bytes) and so on. |
756 |
|
|
|
757 |
dpavlin |
109 |
=back |
758 |
|
|
|
759 |
ulpfr |
10 |
=head1 AUTHOR |
760 |
|
|
|
761 |
|
|
Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt> |
762 |
|
|
|
763 |
|
|
=head1 SEE ALSO |
764 |
|
|
|
765 |
|
|
perl(1). |
766 |
|
|
|
767 |
|
|
=cut |
768 |
|
|
|