/[wait]/trunk/lib/WAIT/Filter.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WAIT/Filter.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 41 - (show annotations)
Mon Nov 13 20:25:49 2000 UTC (23 years, 5 months ago) by laperla
Original Path: cvs-head/lib/WAIT/Filter.pm
File size: 8995 byte(s)
utf8iso was completely nonsense, returning only true and false.

LockFile::Simple now default-configured to remove stale locks.

1 # -*- Mode: Cperl -*-
2 # $Basename: Filter.pm $
3 # $Revision: 1.9 $
4 # ITIID : $ITI$ $Header $__Header$
5 # Author : Ulrich Pfeifer
6 # Created On : Thu Aug 15 18:09:51 1996
7 # Last Modified By: Ulrich Pfeifer
8 # Last Modified On: Sun Nov 22 18:44:46 1998
9 # Language : CPerl
10 # Update Count : 105
11 # Status : Unknown, Use with caution!
12 #
13 # Copyright (c) 1996-1997, Ulrich Pfeifer
14 #
15 package WAIT::Filter;
16 require WAIT;
17 use strict;
18 use Carp;
19 use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $AUTOLOAD);
20 use subs qw(grundform);
21
22 require Exporter;
23
24 @ISA = qw(Exporter);
25 @EXPORT_OK = qw(
26 Stem
27 Soundex
28 Phonix
29 Metaphone
30 isolc disolc
31 isouc disouc
32 isotr disotr
33 stop grundform
34 utf8iso
35 );
36 # (most implemented in WAIT.xs)
37
38 $VERSION = substr q$Revision: 1.9 $, 10;
39
40 sub split {
41 map split(' ', $_), @_;
42 }
43
44 $SPLIT = q[
45 sub splitXXX {
46 grep length($_)>=XXX, map split(' ', $_), @_;
47 }
48 ];
49
50 sub AUTOLOAD {
51 my $func = $AUTOLOAD; $func =~ s/.*:://;
52
53 if ($func =~ /split(\d+)/) {
54 my $num = $1;
55 my $split = $SPLIT;
56
57 $split =~ s/XXX/$num/g;
58 eval $split;
59 if ($@ eq '') {
60 goto &$AUTOLOAD;
61 }
62 } elsif ($func eq 'grundform') {
63 eval {require Text::German;};
64 croak "You must have Text::German to use 'grundform'"
65 if $@ ne '';
66 *grundform = Text::German->can('reduce');
67 goto &grundform;
68 } elsif ($func eq 'date') {
69 eval {require Time::ParseDate;};
70 croak "You must have Time::ParseDate to use 'date'"
71 if $@ ne '';
72 *date = Time::ParseDate->can('parsedate');
73 goto \&date;
74 } elsif ($func eq 'decode_entities') {
75 eval {require HTML::Entities;};
76 croak "You must have HTML::Entities to use 'date'"
77 if $@ ne '';
78 *decode_entities = HTML::Entities->can('decode_entities');
79 goto &decode_entities;
80 } elsif ($func =~ /^d?utf8iso$/) {
81 no strict 'refs';
82 *$func = sub {
83 # Courtesy JHI
84 my $s = shift;
85 $s =~ s{([\xC0-\xDF])([\x80-\xBF])}
86 {chr(ord($1)<<6&0xC0|ord($2)&0x3F)}eg;
87 $s;
88 };
89 goto \&$func;
90 }
91 Carp::confess "Class WAIT::Filter::$func not found";
92 }
93
94 while (<DATA>) {
95 chomp;
96 last if /__END__/;
97 next if /^\s*#/; # there's a comment
98 $STOP{$_}++;
99 }
100
101 sub stop {
102 if (exists $STOP{$_[0]}) {
103 ''
104 } else {
105 $_[0];
106 }
107 }
108
109 sub gdate {
110 my $date = shift;
111
112 $date =~ s:(\d+)\.(\d+)\.(d+):$2/$1/$3:;
113 date($date);
114 }
115
116 1;
117 __DATA__
118 a
119 about
120 above
121 according
122 across
123 actually
124 adj
125 after
126 afterwards
127 again
128 against
129 all
130 almost
131 alone
132 along
133 already
134 also
135 although
136 always
137 among
138 amongst
139 an
140 and
141 another
142 any
143 anyhow
144 anyone
145 anything
146 anywhere
147 are
148 aren't
149 around
150 as
151 at
152 b
153 be
154 became
155 because
156 become
157 becomes
158 becoming
159 been
160 before
161 beforehand
162 begin
163 beginning
164 behind
165 being
166 below
167 beside
168 besides
169 between
170 beyond
171 billion
172 both
173 but
174 by
175 c
176 can
177 can't
178 cannot
179 caption
180 co
181 co.
182 could
183 couldn't
184 d
185 did
186 didn't
187 do
188 does
189 doesn't
190 don't
191 down
192 during
193 e
194 eg
195 eight
196 eighty
197 either
198 else
199 elsewhere
200 end
201 ending
202 enough
203 etc
204 even
205 ever
206 every
207 everyone
208 everything
209 everywhere
210 except
211 f
212 few
213 fifty
214 first
215 five
216 vfor
217 former
218 formerly
219 forty
220 found
221 four
222 from
223 further
224 g
225 h
226 had
227 has
228 hasn't
229 have
230 haven't
231 he
232 he'd
233 he'll
234 he's
235 hence
236 her
237 here
238 here's
239 hereafter
240 hereby
241 herein
242 hereupon
243 hers
244 herself
245 him
246 himself
247 his
248 how
249 however
250 hundred
251 i
252 i'd
253 i'll
254 i'm
255 i've
256 ie
257 if
258 in
259 inc.
260 indeed
261 instead
262 into
263 is
264 isn't
265 it
266 it's
267 its
268 itself
269 j
270 k
271 l
272 last
273 later
274 latter
275 latterly
276 least
277 less
278 let
279 let's
280 like
281 likely
282 ltd
283 m
284 made
285 make
286 makes
287 many
288 maybe
289 me
290 meantime
291 meanwhile
292 might
293 million
294 miss
295 more
296 moreover
297 most
298 mostly
299 mr
300 mrs
301 much
302 must
303 my
304 myself
305 n
306 namely
307 neither
308 never
309 nevertheless
310 next
311 nine
312 ninety
313 no
314 nobody
315 none
316 nonetheless
317 noone
318 nor
319 not
320 nothing
321 now
322 nowhere
323 o
324 of
325 off
326 often
327 on
328 once
329 one
330 one's
331 only
332 onto
333 or
334 other
335 others
336 otherwise
337 our
338 ours
339 ourselves
340 out
341 over
342 overall
343 own
344 p
345 per
346 perhaps
347 q
348 r
349 rather
350 recent
351 recently
352 s
353 same
354 seem
355 seemed
356 seeming
357 seems
358 seven
359 seventy
360 several
361 she
362 she'd
363 she'll
364 she's
365 should
366 shouldn't
367 since
368 six
369 sixty
370 so
371 some
372 somehow
373 someone
374 something
375 sometime
376 sometimes
377 somewhere
378 still
379 stop
380 such
381 t
382 taking
383 ten
384 than
385 that
386 that'll
387 that's
388 that've
389 the
390 their
391 them
392 themselves
393 then
394 thence
395 there
396 there'd
397 there'll
398 there're
399 there's
400 there've
401 thereafter
402 thereby
403 therefore
404 therein
405 thereupon
406 these
407 they
408 they'd
409 they'll
410 they're
411 they've
412 thirty
413 this
414 those
415 though
416 thousand
417 three
418 through
419 throughout
420 thru
421 thus
422 to
423 together
424 too
425 toward
426 towards
427 trillion
428 twenty
429 two
430 u
431 under
432 unless
433 unlike
434 unlikely
435 until
436 up
437 upon
438 us
439 used
440 using
441 v
442 very
443 via
444 w
445 was
446 wasn't
447 we
448 we'd
449 we'll
450 we're
451 we've
452 well
453 were
454 weren't
455 what
456 what'll
457 what's
458 what've
459 whatever
460 when
461 whence
462 whenever
463 where
464 where's
465 whereafter
466 whereas
467 whereby
468 wherein
469 whereupon
470 wherever
471 whether
472 which
473 while
474 whither
475 who
476 who'd
477 who'll
478 who's
479 whoever
480 whole
481 whom
482 whomever
483 whose
484 why
485 will
486 with
487 within
488 without
489 won't
490 would
491 wouldn't
492 x
493 y
494 yes
495 yet
496 you
497 you'd
498 you'll
499 you're
500 you've
501 your
502 yours
503 yourself
504 yourselves
505 z
506 # occuring in more than 100 files
507 acc
508 accent
509 accents
510 and
511 are
512 bell
513 can
514 character
515 corrections
516 crt
517 daisy
518 dash
519 date
520 defined
521 definitions
522 description
523 devices
524 diablo
525 dummy
526 factors
527 following
528 font
529 for
530 from
531 fudge
532 give
533 have
534 header
535 holds
536 log
537 logo
538 low
539 lpr
540 mark
541 name
542 nroff
543 out
544 output
545 pitch
546 put
547 rcsfile
548 reference
549 resolution
550 revision
551 see
552 set
553 simple
554 smi
555 some
556 string
557 synopsis
558 system
559 that
560 the
561 this
562 translation
563 troff
564 typewriter
565 ucb
566 unbreakable
567 use
568 used
569 user
570 vroff
571 wheel
572 will
573 with
574 you
575 __END__
576 # Below is the stub of documentation for your module. You better edit it!
577
578 =head1 NAME
579
580 WAIT::Filter - Perl extension providing the basic freeWAIS-sf reduction functions
581
582 =head1 SYNOPSIS
583
584 use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc
585 isotr disotr stop grundform);
586
587 $stem = Stem($word);
588 $scode = Soundex($word);
589 $pcode = Phonix($word);
590 $lword = isolc($word);
591 disolc($word);
592 $uword = isouc($word);
593 disouc($word);
594 $trword = isotr($word);
595 disotr($word);
596 $word = stop($word);
597 $word = grundform($word);
598
599 @words = WAIT::Filter::split($word);
600 @words = WAIT::Filter::split2($word);
601 @words = WAIT::Filter::split3($word);
602 @words = WAIT::Filter::split4($word); # arbitrary numbers allowed
603
604 =head1 DESCRIPTION
605
606 This tiny modules gives access to the basic reduction functions build
607 in B<freeWAIS-sf>.
608
609 =over 5
610
611 =item B<Stem>(I<word>)
612
613 reduces I<word> using the well know Porter algorithm.
614
615 AU: Porter, M.F.
616 TI: An Algorithm for Suffix Stripping
617 JT: Program
618 VO: 14
619 PP: 130-137
620 PY: 1980
621 PM: JUL
622
623 =item B<Soundex>(I<word>)
624
625
626 computes the 4 byte B<Soundex> code for I<word>.
627
628 AU: Gadd, T.N.
629 TI: 'Fisching for Werds'. Phonetic Retrieval of written text in
630 Information Retrieval Systems
631 JT: Program
632 VO: 22
633 NO: 3
634 PP: 222-237
635 PY: 1988
636
637
638 =item B<Phonix>(I<word>)
639
640 computes the 8 byte B<Phonix> code for I<word>.
641
642 AU: Gadd, T.N.
643 TI: PHONIX: The Algorithm
644 JT: Program
645 VO: 24
646 NO: 4
647 PP: 363-366
648 PY: 1990
649 PM: OCT
650
651 =head1 ISO charcater case functions
652
653 There are some additional function which transpose some/most ISOlatin1
654 characters to upper and lower case. To allow for maximum speed there
655 are also I<destructive> versions which change the argument instead of
656 allocating a copy which is returned. For convenience, the destructive
657 version also B<returns> the argument. So all of the following is
658 valid and C<$word> will contain the lowercased string.
659
660 $word = isolc($word);
661 $word = disolc($word);
662 disolc($word);
663
664 Here are the hardcoded characters which are recognized:
665
666 abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß
667 ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß
668
669 =item C<$new = >B<isolc>C<($word)>
670
671 =item B<disolc>C<($word)>
672
673 transposes to lower case.
674
675 =item C<$new = >B<isouc>C<($word)>
676
677 =item B<disouc>C<($word)>
678
679 transposes to upper case.
680
681 =item C<$new = >B<isotr>C<($word)>
682
683 =item B<disotr>C<($word)>
684
685 Remove non-letters according to the above table.
686
687 =item C<$new = >B<stop>C<($word)>
688
689 Returns an empty string if $word is a stopword.
690
691 =item C<$new = >B<grundform>C<($word)>
692
693 Calls Text::German::reduce
694
695 =item C<$new = >B<utf8iso>C<($word)>
696
697 Deprecated due to flux in perl versions between 5.005 and 5.8. The
698 function converts UTF8 encoded strings to ISO-8859-1. WAIT is
699 internally still based on the Latin1 character set, so if you process
700 anything in a different encoding, you should convert to Latin1 as the
701 first filter or refrain from using the iso-latin-1 based filter
702 functions. It is recommended that you use your own converter based on
703 the perl version you're using.
704
705 =item split, split2, split3, ...
706
707 The splitN funtions all take a scalar as input and return a list of
708 words. Split acts just like the perl split(' '). Split2 eliminates all
709 words from the list that are shorter than 2 characters (bytes), split3
710 eliminates those shorter than 3 characters (bytes) and so on.
711
712 =head1 AUTHOR
713
714 Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>
715
716 =head1 SEE ALSO
717
718 perl(1).
719
720 =cut
721

Properties

Name Value
cvs2svn:cvs-rev 1.3

  ViewVC Help
Powered by ViewVC 1.1.26