/[wait]/branches/CPAN/lib/WAIT/Filter.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /branches/CPAN/lib/WAIT/Filter.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 13 - (show annotations)
Fri Apr 28 15:42:44 2000 UTC (24 years ago) by ulpfr
File size: 8755 byte(s)
Import of WAIT-1.710

1 # -*- Mode: Cperl -*-
2 # $Basename: Filter.pm $
3 # $Revision: 1.8 $
4 # ITIID : $ITI$ $Header $__Header$
5 # Author : Ulrich Pfeifer
6 # Created On : Thu Aug 15 18:09:51 1996
7 # Last Modified By: Ulrich Pfeifer
8 # Last Modified On: Sun Nov 22 18:44:46 1998
9 # Language : CPerl
10 # Update Count : 105
11 # Status : Unknown, Use with caution!
12 #
13 # Copyright (c) 1996-1997, Ulrich Pfeifer
14 #
15 package WAIT::Filter;
16 require WAIT;
17 use strict;
18 use Carp;
19 use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $AUTOLOAD);
20 use subs qw(grundform);
21
22 require Exporter;
23
24 @ISA = qw(Exporter);
25 @EXPORT_OK = qw(
26 Stem
27 Soundex
28 Phonix
29 Metaphone
30 isolc disolc
31 isouc disouc
32 isotr disotr
33 stop grundform
34 utf8iso
35 );
36 # (most implemented in WAIT.xs)
37
38 $VERSION = substr q$Revision: 1.8 $, 10;
39
40 sub split {
41 map split(' ', $_), @_;
42 }
43
44 $SPLIT = q[
45 sub splitXXX {
46 grep length($_)>=XXX, map split(' ', $_), @_;
47 }
48 ];
49
50 sub AUTOLOAD {
51 my $func = $AUTOLOAD; $func =~ s/.*:://;
52
53 if ($func =~ /split(\d+)/) {
54 my $num = $1;
55 my $split = $SPLIT;
56
57 $split =~ s/XXX/$num/g;
58 eval $split;
59 if ($@ eq '') {
60 goto &$AUTOLOAD;
61 }
62 } elsif ($func eq 'grundform') {
63 eval {require Text::German;};
64 croak "You must have Text::German to use 'grundform'"
65 if $@ ne '';
66 *grundform = Text::German->can('reduce');
67 goto &grundform;
68 } elsif ($func eq 'date') {
69 eval {require Time::ParseDate;};
70 croak "You must have Time::ParseDate to use 'date'"
71 if $@ ne '';
72 *date = Time::ParseDate->can('parsedate');
73 goto \&date;
74 } elsif ($func eq 'decode_entities') {
75 eval {require HTML::Entities;};
76 croak "You must have HTML::Entities to use 'date'"
77 if $@ ne '';
78 *decode_entities = HTML::Entities->can('decode_entities');
79 goto &decode_entities;
80 } elsif ($func =~ /^d?utf8iso$/) {
81 require WAIT::Filter::utf8iso;
82 croak "Your perl version must at least be 5.00556 to use '$func'"
83 if $] < 5.00556;
84 no strict 'refs';
85 *$func = \&{"WAIT::Filter::utf8iso::$func"};
86 goto &utf8iso;
87 }
88 Carp::confess "Class WAIT::Filter::$func not found";
89 }
90
91 while (<DATA>) {
92 chomp;
93 last if /__END__/;
94 $STOP{$_}++;
95 }
96
97 sub stop {
98 if (exists $STOP{$_[0]}) {
99 ''
100 } else {
101 $_[0];
102 }
103 }
104
105 sub gdate {
106 my $date = shift;
107
108 $date =~ s:(\d+)\.(\d+)\.(d+):$2/$1/$3:;
109 date($date);
110 }
111
112 1;
113 __DATA__
114 a
115 about
116 above
117 according
118 across
119 actually
120 adj
121 after
122 afterwards
123 again
124 against
125 all
126 almost
127 alone
128 along
129 already
130 also
131 although
132 always
133 among
134 amongst
135 an
136 and
137 another
138 any
139 anyhow
140 anyone
141 anything
142 anywhere
143 are
144 aren't
145 around
146 as
147 at
148 b
149 be
150 became
151 because
152 become
153 becomes
154 becoming
155 been
156 before
157 beforehand
158 begin
159 beginning
160 behind
161 being
162 below
163 beside
164 besides
165 between
166 beyond
167 billion
168 both
169 but
170 by
171 c
172 can
173 can't
174 cannot
175 caption
176 co
177 co.
178 could
179 couldn't
180 d
181 did
182 didn't
183 do
184 does
185 doesn't
186 don't
187 down
188 during
189 e
190 eg
191 eight
192 eighty
193 either
194 else
195 elsewhere
196 end
197 ending
198 enough
199 etc
200 even
201 ever
202 every
203 everyone
204 everything
205 everywhere
206 except
207 f
208 few
209 fifty
210 first
211 five
212 vfor
213 former
214 formerly
215 forty
216 found
217 four
218 from
219 further
220 g
221 h
222 had
223 has
224 hasn't
225 have
226 haven't
227 he
228 he'd
229 he'll
230 he's
231 hence
232 her
233 here
234 here's
235 hereafter
236 hereby
237 herein
238 hereupon
239 hers
240 herself
241 him
242 himself
243 his
244 how
245 however
246 hundred
247 i
248 i'd
249 i'll
250 i'm
251 i've
252 ie
253 if
254 in
255 inc.
256 indeed
257 instead
258 into
259 is
260 isn't
261 it
262 it's
263 its
264 itself
265 j
266 k
267 l
268 last
269 later
270 latter
271 latterly
272 least
273 less
274 let
275 let's
276 like
277 likely
278 ltd
279 m
280 made
281 make
282 makes
283 many
284 maybe
285 me
286 meantime
287 meanwhile
288 might
289 million
290 miss
291 more
292 moreover
293 most
294 mostly
295 mr
296 mrs
297 much
298 must
299 my
300 myself
301 n
302 namely
303 neither
304 never
305 nevertheless
306 next
307 nine
308 ninety
309 no
310 nobody
311 none
312 nonetheless
313 noone
314 nor
315 not
316 nothing
317 now
318 nowhere
319 o
320 of
321 off
322 often
323 on
324 once
325 one
326 one's
327 only
328 onto
329 or
330 other
331 others
332 otherwise
333 our
334 ours
335 ourselves
336 out
337 over
338 overall
339 own
340 p
341 per
342 perhaps
343 q
344 r
345 rather
346 recent
347 recently
348 s
349 same
350 seem
351 seemed
352 seeming
353 seems
354 seven
355 seventy
356 several
357 she
358 she'd
359 she'll
360 she's
361 should
362 shouldn't
363 since
364 six
365 sixty
366 so
367 some
368 somehow
369 someone
370 something
371 sometime
372 sometimes
373 somewhere
374 still
375 stop
376 such
377 t
378 taking
379 ten
380 than
381 that
382 that'll
383 that's
384 that've
385 the
386 their
387 them
388 themselves
389 then
390 thence
391 there
392 there'd
393 there'll
394 there're
395 there's
396 there've
397 thereafter
398 thereby
399 therefore
400 therein
401 thereupon
402 these
403 they
404 they'd
405 they'll
406 they're
407 they've
408 thirty
409 this
410 those
411 though
412 thousand
413 three
414 through
415 throughout
416 thru
417 thus
418 to
419 together
420 too
421 toward
422 towards
423 trillion
424 twenty
425 two
426 u
427 under
428 unless
429 unlike
430 unlikely
431 until
432 up
433 upon
434 us
435 used
436 using
437 v
438 very
439 via
440 w
441 was
442 wasn't
443 we
444 we'd
445 we'll
446 we're
447 we've
448 well
449 were
450 weren't
451 what
452 what'll
453 what's
454 what've
455 whatever
456 when
457 whence
458 whenever
459 where
460 where's
461 whereafter
462 whereas
463 whereby
464 wherein
465 whereupon
466 wherever
467 whether
468 which
469 while
470 whither
471 who
472 who'd
473 who'll
474 who's
475 whoever
476 whole
477 whom
478 whomever
479 whose
480 why
481 will
482 with
483 within
484 without
485 won't
486 would
487 wouldn't
488 x
489 y
490 yes
491 yet
492 you
493 you'd
494 you'll
495 you're
496 you've
497 your
498 yours
499 yourself
500 yourselves
501 z
502 # occuring in more than 100 files
503 acc
504 accent
505 accents
506 and
507 are
508 bell
509 can
510 character
511 corrections
512 crt
513 daisy
514 dash
515 date
516 defined
517 definitions
518 description
519 devices
520 diablo
521 dummy
522 factors
523 following
524 font
525 for
526 from
527 fudge
528 give
529 have
530 header
531 holds
532 log
533 logo
534 low
535 lpr
536 mark
537 name
538 nroff
539 out
540 output
541 pitch
542 put
543 rcsfile
544 reference
545 resolution
546 revision
547 see
548 set
549 simple
550 smi
551 some
552 string
553 synopsis
554 system
555 that
556 the
557 this
558 translation
559 troff
560 typewriter
561 ucb
562 unbreakable
563 use
564 used
565 user
566 vroff
567 wheel
568 will
569 with
570 you
571 __END__
572 # Below is the stub of documentation for your module. You better edit it!
573
574 =head1 NAME
575
576 WAIT::Filter - Perl extension providing the basic freeWAIS-sf reduction functions
577
578 =head1 SYNOPSIS
579
580 use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc
581 isotr disotr stop grundform utf8iso);
582
583 $stem = Stem($word);
584 $scode = Soundex($word);
585 $pcode = Phonix($word);
586 $lword = isolc($word);
587 disolc($word);
588 $uword = isouc($word);
589 disouc($word);
590 $trword = isotr($word);
591 disotr($word);
592 $word = stop($word);
593 $word = grundform($word);
594
595 @words = WAIT::Filter::split($word);
596 @words = WAIT::Filter::split2($word);
597 @words = WAIT::Filter::split3($word);
598 @words = WAIT::Filter::split4($word); # arbitrary numbers allowed
599
600 =head1 DESCRIPTION
601
602 This tiny modules gives access to the basic reduction functions build
603 in B<freeWAIS-sf>.
604
605 =over 5
606
607 =item B<Stem>(I<word>)
608
609 reduces I<word> using the well know Porter algorithm.
610
611 AU: Porter, M.F.
612 TI: An Algorithm for Suffix Stripping
613 JT: Program
614 VO: 14
615 PP: 130-137
616 PY: 1980
617 PM: JUL
618
619 =item B<Soundex>(I<word>)
620
621
622 computes the 4 byte B<Soundex> code for I<word>.
623
624 AU: Gadd, T.N.
625 TI: 'Fisching for Werds'. Phonetic Retrieval of written text in
626 Information Retrieval Systems
627 JT: Program
628 VO: 22
629 NO: 3
630 PP: 222-237
631 PY: 1988
632
633
634 =item B<Phonix>(I<word>)
635
636 computes the 8 byte B<Phonix> code for I<word>.
637
638 AU: Gadd, T.N.
639 TI: PHONIX: The Algorithm
640 JT: Program
641 VO: 24
642 NO: 4
643 PP: 363-366
644 PY: 1990
645 PM: OCT
646
647 =head1 ISO charcater case functions
648
649 There are some additional function which transpose some/most ISOlatin1
650 characters to upper and lower case. To allow for maximum speed there
651 are also I<destructive> versions which change the argument instead of
652 allocating a copy which is returned. For convenience, the destructive
653 version also B<returns> the argument. So all of the following is
654 valid and C<$word> will contain the lowercased string.
655
656 $word = isolc($word);
657 $word = disolc($word);
658 disolc($word);
659
660 Here are the hardcoded characters which are recognized:
661
662 abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß
663 ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß
664
665 =item C<$new = >B<isolc>C<($word)>
666
667 =item B<disolc>C<($word)>
668
669 transposes to lower case.
670
671 =item C<$new = >B<isouc>C<($word)>
672
673 =item B<disouc>C<($word)>
674
675 transposes to upper case.
676
677 =item C<$new = >B<isotr>C<($word)>
678
679 =item B<disotr>C<($word)>
680
681 Remove non-letters according to the above table.
682
683 =item C<$new = >B<stop>C<($word)>
684
685 Returns an empty string if $word is a stopword.
686
687 =item C<$new = >B<grundform>C<($word)>
688
689 Calls Text::German::reduce
690
691 =item C<$new = >B<utf8iso>C<($word)>
692
693 Convert UTF8 encoded strings to ISO-8859-1. WAIT currently is
694 internally based on the Latin1 character set, so if you process
695 anything in a different encoding, you should convert to Latin1 as the
696 first filter.
697
698 =item split, split2, split3, ...
699
700 The splitN funtions all take a scalar as input and return a list of
701 words. Split acts just like the perl split(' '). Split2 eliminates all
702 words from the list that are shorter than 2 characters (bytes), split3
703 eliminates those shorter than 3 characters (bytes) and so on.
704
705 =head1 AUTHOR
706
707 Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>
708
709 =head1 SEE ALSO
710
711 perl(1).
712
713 =cut
714

Properties

Name Value
cvs2svn:cvs-rev 1.1.1.2

  ViewVC Help
Powered by ViewVC 1.1.26