/[wait]/trunk/lib/WAIT/Filter.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WAIT/Filter.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 10 - (show annotations)
Fri Apr 28 15:40:52 2000 UTC (24 years ago) by ulpfr
Original Path: cvs-head/lib/WAIT/Filter.pm
File size: 7265 byte(s)
Initial revision

1 # -*- Mode: Perl -*-
2 # $Basename: Filter.pm $
3 # $Revision: 1.7 $
4 # ITIID : $ITI$ $Header $__Header$
5 # Author : Ulrich Pfeifer
6 # Created On : Thu Aug 15 18:09:51 1996
7 # Last Modified By: Ulrich Pfeifer
8 # Last Modified On: Sun Nov 22 18:44:46 1998
9 # Language : CPerl
10 # Update Count : 105
11 # Status : Unknown, Use with caution!
12 #
13 # Copyright (c) 1996-1997, Ulrich Pfeifer
14 #
15 package WAIT::Filter;
16 require WAIT;
17 use strict;
18 use Carp;
19 use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $AUTOLOAD);
20 use subs qw(grundform);
21
22 require Exporter;
23
24 @ISA = qw(Exporter);
25 @EXPORT_OK = qw(
26 Stem
27 Soundex
28 Phonix
29 Metaphone
30 isolc disolc
31 isouc disouc
32 isotr disotr
33 stop grundform
34 );
35
36 $VERSION = substr q$Revision: 1.7 $, 10;
37
38 sub split {
39 map split(' ', $_), @_;
40 }
41
42 $SPLIT = q[
43 sub splitXXX {
44 grep length($_)>=XXX, map split(' ', $_), @_;
45 }
46 ];
47
48 sub AUTOLOAD {
49 my $func = $AUTOLOAD; $func =~ s/.*:://;
50
51 if ($func =~ /split(\d+)/) {
52 my $num = $1;
53 my $split = $SPLIT;
54
55 $split =~ s/XXX/$num/g;
56 eval $split;
57 if ($@ eq '') {
58 goto &$AUTOLOAD;
59 }
60 } elsif ($func eq 'grundform') {
61 eval {require Text::German;};
62 croak "You must have Text::German to use 'grundform'"
63 if $@ ne '';
64 *grundform = Text::German->can('reduce');
65 goto &grundform;
66 } elsif ($func eq 'date') {
67 eval {require Time::ParseDate;};
68 croak "You must have Time::ParseDate to use 'date'"
69 if $@ ne '';
70 *date = Time::ParseDate->can('parsedate');
71 goto \&date;
72 } elsif ($func eq 'decode_entities') {
73 eval {require HTML::Entities;};
74 croak "You must have HTML::Entities to use 'date'"
75 if $@ ne '';
76 *decode_entities = HTML::Entities->can('decode_entities');
77 goto &decode_entities;
78 }
79 croak "Your vendor has not defined WAIT::Filter::$func";
80 }
81
82 while (<DATA>) {
83 chomp;
84 last if /__END__/;
85 $STOP{$_}++;
86 }
87
88 sub stop {
89 if (exists $STOP{$_[0]}) {
90 ''
91 } else {
92 $_[0];
93 }
94 }
95
96 sub gdate {
97 my $date = shift;
98
99 $date =~ s:(\d+)\.(\d+)\.(d+):$2/$1/$3:;
100 date($date);
101 }
102
103 1;
104 __DATA__
105 a
106 about
107 above
108 according
109 across
110 actually
111 adj
112 after
113 afterwards
114 again
115 against
116 all
117 almost
118 alone
119 along
120 already
121 also
122 although
123 always
124 among
125 amongst
126 an
127 and
128 another
129 any
130 anyhow
131 anyone
132 anything
133 anywhere
134 are
135 aren't
136 around
137 as
138 at
139 b
140 be
141 became
142 because
143 become
144 becomes
145 becoming
146 been
147 before
148 beforehand
149 begin
150 beginning
151 behind
152 being
153 below
154 beside
155 besides
156 between
157 beyond
158 billion
159 both
160 but
161 by
162 c
163 can
164 can't
165 cannot
166 caption
167 co
168 co.
169 could
170 couldn't
171 d
172 did
173 didn't
174 do
175 does
176 doesn't
177 don't
178 down
179 during
180 e
181 eg
182 eight
183 eighty
184 either
185 else
186 elsewhere
187 end
188 ending
189 enough
190 etc
191 even
192 ever
193 every
194 everyone
195 everything
196 everywhere
197 except
198 f
199 few
200 fifty
201 first
202 five
203 vfor
204 former
205 formerly
206 forty
207 found "
208 four
209 from
210 further
211 g
212 h
213 had
214 has
215 hasn't
216 have
217 haven't
218 he
219 he'd
220 he'll
221 he's
222 hence
223 her
224 here
225 here's
226 hereafter
227 hereby
228 herein
229 hereupon
230 hers
231 herself
232 him
233 himself
234 his
235 how
236 however
237 hundred
238 i
239 i'd
240 i'll
241 i'm
242 i've
243 ie
244 if
245 in
246 inc.
247 indeed
248 instead
249 into
250 is
251 isn't
252 it
253 it's
254 its
255 itself
256 j
257 k
258 l
259 last
260 later
261 latter
262 latterly
263 least
264 less
265 let
266 let's
267 like
268 likely
269 ltd
270 m
271 made
272 make
273 makes
274 many
275 maybe
276 me
277 meantime
278 meanwhile
279 might
280 million
281 miss
282 more
283 moreover
284 most
285 mostly
286 mr
287 mrs
288 much
289 must
290 my
291 myself
292 n
293 namely
294 neither
295 never
296 nevertheless
297 next
298 nine
299 ninety
300 no
301 nobody
302 none
303 nonetheless
304 noone
305 nor
306 not
307 nothing
308 now
309 nowhere
310 o
311 of
312 off
313 often
314 on
315 once
316 one
317 one's
318 only
319 onto
320 or
321 other
322 others
323 otherwise
324 our
325 ours
326 ourselves
327 out
328 over
329 overall
330 own
331 p
332 per
333 perhaps
334 q
335 r
336 rather
337 recent
338 recently
339 s
340 same
341 seem
342 seemed
343 seeming
344 seems
345 seven
346 seventy
347 several
348 she
349 she'd
350 she'll
351 she's
352 should
353 shouldn't
354 since
355 six
356 sixty
357 so
358 some
359 somehow
360 someone
361 something
362 sometime
363 sometimes
364 somewhere
365 still
366 stop
367 such
368 t
369 taking
370 ten
371 than
372 that
373 that'll
374 that's
375 that've
376 the
377 their
378 them
379 themselves
380 then
381 thence
382 there
383 there'd
384 there'll
385 there're
386 there's
387 there've
388 thereafter
389 thereby
390 therefore
391 therein
392 thereupon
393 these
394 they
395 they'd
396 they'll
397 they're
398 they've
399 thirty
400 this
401 those
402 though
403 thousand
404 three
405 through
406 throughout
407 thru
408 thus
409 to
410 together
411 too
412 toward
413 towards
414 trillion
415 twenty
416 two
417 u
418 under
419 unless
420 unlike
421 unlikely
422 until
423 up
424 upon
425 us
426 used
427 using
428 v
429 very
430 via
431 w
432 was
433 wasn't
434 we
435 we'd
436 we'll
437 we're
438 we've
439 well
440 were
441 weren't
442 what
443 what'll
444 what's
445 what've
446 whatever
447 when
448 whence
449 whenever
450 where
451 where's
452 whereafter
453 whereas
454 whereby
455 wherein
456 whereupon
457 wherever
458 whether
459 which
460 while
461 whither
462 who
463 who'd
464 who'll
465 who's
466 whoever
467 whole
468 whom
469 whomever
470 whose
471 why
472 will
473 with
474 within
475 without
476 won't
477 would
478 wouldn't
479 x
480 y
481 yes
482 yet
483 you
484 you'd
485 you'll
486 you're
487 you've
488 your
489 yours
490 yourself
491 yourselves
492 z
493 # occuring in more than 100 files
494 acc
495 accent
496 accents
497 and
498 are
499 bell
500 can
501 character
502 corrections
503 crt
504 daisy
505 dash
506 date
507 defined
508 definitions
509 description
510 devices
511 diablo
512 dummy
513 factors
514 following
515 font
516 for
517 from
518 fudge
519 give
520 have
521 header
522 holds
523 log
524 logo
525 low
526 lpr
527 mark
528 name
529 nroff
530 out
531 output
532 pitch
533 put
534 rcsfile
535 reference
536 resolution
537 revision
538 see
539 set
540 simple
541 smi
542 some
543 string
544 synopsis
545 system
546 that
547 the
548 this
549 translation
550 troff
551 typewriter
552 ucb
553 unbreakable
554 use
555 used
556 user
557 vroff
558 wheel
559 will
560 with
561 you
562 __END__
563 # Below is the stub of documentation for your module. You better edit it!
564
565 =head1 NAME
566
567 WAIT::Filter - Perl extension providing the basic freeWAIS-sf reduction functions
568
569 =head1 SYNOPSIS
570
571 use WAIT::Filter qw(Stem Soundex Phonix isolc isouc disolc disouc);
572
573 $stem = Stem($word);
574 $scode = Soundex($word);
575 $pcode = Phonix($word);
576 $lword = isolc($word);
577 $uword = isouc($word);
578 disolc($word);
579 disouc($word);
580
581 =head1 DESCRIPTION
582
583 This tiny modules gives access to the basic reduction functions build
584 in B<freeWAIS-sf>.
585
586 =over 5
587
588 =item B<Stem>(I<word>)
589
590 reduces I<word> using the well know Porter algorithm.
591
592 AU: Porter, M.F.
593 TI: An Algorithm for Suffix Stripping
594 JT: Program
595 VO: 14
596 PP: 130-137
597 PY: 1980
598 PM: JUL
599
600 =item B<Soundex>(I<word>)
601
602
603 computes the 4 byte B<Soundex> code for I<word>.
604
605 AU: Gadd, T.N.
606 TI: 'Fisching for Werds'. Phonetic Retrieval of written text in
607 Information Retrieval Systems
608 JT: Program
609 VO: 22
610 NO: 3
611 PP: 222-237
612 PY: 1988
613
614
615 =item B<Phonix>(I<word>)
616
617 computes the 8 byte B<Phonix> code for I<word>.
618
619 AU: Gadd, T.N.
620 TI: PHONIX: The Algorithm
621 JT: Program
622 VO: 24
623 NO: 4
624 PP: 363-366
625 PY: 1990
626 PM: OCT
627
628 =head1 ISO charcater case functions
629
630 There are some additional function which transpose some/most ISOlatin1
631 characters to upper and lower case. To allow for maximum speed there
632 are also I<destructive> versions which change the argument instead of
633 allocating a copy which is returned. For convenience, the destructive
634 version also B<returns> the argument. So both of the following is
635 valid and C<$word> will contain the lowercased string.
636
637 $word = disolc($word);
638 disolc($word);
639
640
641 Here are the hardcoded characters which are recognized:
642
643 abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß
644 ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß
645
646 =item C<$new = >B<isolc>C<($word)>
647
648 =item B<disolc>C<($word)>
649
650 transposes to lower case.
651
652 =item C<$new = >B<isouc>C<($word)>
653
654 =item B<disouc>C<($word)>
655
656 transposes to upper case.
657
658 =head1 AUTHOR
659
660 Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>
661
662 =head1 SEE ALSO
663
664 perl(1).
665
666 =cut
667

Properties

Name Value
cvs2svn:cvs-rev 1.1

  ViewVC Help
Powered by ViewVC 1.1.26