/[wait]/branches/CPAN/lib/WAIT.pm
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /branches/CPAN/lib/WAIT.pm

Parent Directory | Revision Log | View Patch

revision 12 by unknown, Fri Apr 28 15:41:10 2000 UTC revision 13 by ulpfr, Fri Apr 28 15:42:44 2000 UTC
# Line 1  Line 1 
1  #!/usr/bin/perl  #!/usr/bin/perl
2  #                              -*- Mode: Perl -*-  #                              -*- Mode: Cperl -*-
3  # $Basename: WAIT.pm $  # $Basename: WAIT.pm $
4  # $Revision: 1.4 $  # $Revision: 1.6 $
5  # Author          : Ulrich Pfeifer  # Author          : Ulrich Pfeifer
6  # Created On      : Wed Nov  5 16:59:32 1997  # Created On      : Wed Nov  5 16:59:32 1997
7  # Last Modified By: Ulrich Pfeifer  # Last Modified By: Ulrich Pfeifer
# Line 9  Line 9 
9  # Language        : CPerl  # Language        : CPerl
10  # Update Count    : 4  # Update Count    : 4
11  # Status          : Unknown, Use with caution!  # Status          : Unknown, Use with caution!
12  #  #
13  # (C) Copyright 1997, Ulrich Pfeifer, all rights reserved.  # (C) Copyright 1997, Ulrich Pfeifer, all rights reserved.
14  #  #
15  #  #
16    
17  package WAIT;  package WAIT;
18  require DynaLoader;  require DynaLoader;
19  use vars qw($VERSION @ISA);  use vars qw($VERSION @ISA);
20  @ISA = qw(DynaLoader);  @ISA = qw(DynaLoader);
21    
22  $VERSION = sprintf '%5.3f', map $_/10,'$ProjectVersion: 16.2 $ ' =~ /([\d.]+)/;  $VERSION = sprintf '%.4f', map $_/10,'$ProjectVersion: 17.1 $ ' =~ /([\d.]+)/;
23    
24  bootstrap WAIT $VERSION;  bootstrap WAIT $VERSION;
25    
# Line 27  __END__ Line 27  __END__
27    
28  =head1 NAME  =head1 NAME
29    
30  WAIT - a rewrite of the freeWAIS-sf engine in Perl  WAIT - a rewrite of the freeWAIS-sf engine in Perl and XS
31    
32    =head1 SYNOPSIS
33    
34    A Synopsis is not yet available.
35    
36  =head1 Status of this document  =head1 Status of this document
37    
# Line 75  time in addition a B<query> and a B<disp Line 79  time in addition a B<query> and a B<disp
79    
80  =head2 Access  =head2 Access
81    
82  The access module defines which documents where members of a  The access module defines which documents are members of a database.
83  database. Usually an access module is a tied hash, whose keys are the  Usually an access module is a tied hash, whose keys are the Ids of the
84  Ids of the documents (did = document id) and whose values are the  documents (did = document id) and whose values are the documents
85  documents themselves. The indexing process loops over the keys using  themselves. The indexing process loops over the keys using C<FIRSTKEY>
86  C<FIRSTKEY> and C<NEXTKEY>. Documents are retrieved with C<FETCH>.  and C<NEXTKEY>. Documents are retrieved with C<FETCH>.
87    
88  By convention access modules should be members of the  By convention access modules should be members of the
89  C<WAIT::Document> hierarchy. Have a look at the  C<WAIT::Document> hierarchy. Have a look at the
# Line 88  C<WAIT::Document::Split> module to get t Line 92  C<WAIT::Document::Split> module to get t
92    
93  =head2 Parse  =head2 Parse
94    
95  The task parse module is to split the documents into logical parts  The task of the parse module is to split the documents into logical
96  via the C<split> method.  E.g. the C<WAIT::Parse::Nroff> splits  parts via the C<split> method. E.g. the C<WAIT::Parse::Nroff> splits
97  manuals piped through B<nroff>(1) into the sections I<name>,  manuals piped through B<nroff>(1) into the sections I<name>,
98  I<synopsis>, I<options>, I<description>, I<author>, I<example>,  I<synopsis>, I<options>, I<description>, I<author>, I<example>,
99  I<bugs>, I<text>, I<see>, and I<environment>. Here is the  I<bugs>, I<text>, I<see>, and I<environment>. Here is the
100  implementation of C<WAIT::Parse::Base> which handes documents with a  implementation of C<WAIT::Parse::Base> which handles documents with a
101  pretty simple tagged format:  pretty simple tagged format:
102    
103    AU: Pfeifer, U.; Fuhr, N.; Huynh, T.    AU: Pfeifer, U.; Fuhr, N.; Huynh, T.
# Line 110  pretty simple tagged format: Line 114  pretty simple tagged format:
114    sub split {                     # called as method    sub split {                     # called as method
115      my %result;      my %result;
116      my $fld;      my $fld;
117      
118      for (split /\n/, $_[1]) {      for (split /\n/, $_[1]) {
119        if (s/^(\S+):\s*//) {        if (s/^(\S+):\s*//) {
120          $fld = lc $1;          $fld = lc $1;
# Line 118  pretty simple tagged format: Line 122  pretty simple tagged format:
122        $result{$fld} .= $_ if defined $fld;        $result{$fld} .= $_ if defined $fld;
123      }      }
124      return \%result;      return \%result;
125    }    }
126    
127  Since the original document cannot be reconstructed from its  Since the original document cannot be reconstructed from its
128  attributes, we need a second method (I<tag>) which marks the regions  attributes, we need a second method (I<tag>) which marks the regions
# Line 131  regions. Line 135  regions.
135    sub tag {    sub tag {
136      my @result;      my @result;
137      my $tag;      my $tag;
138        
139      for (split /\n/, $_[1]) {      for (split /\n/, $_[1]) {
140        next if /^\w\w:\s*$/;        next if /^\w\w:\s*$/;
141        if (s/^(\S+)://) {        if (s/^(\S+)://) {
# Line 145  regions. Line 149  regions.
149        }        }
150      }      }
151      return @result;               # we don't go for speed      return @result;               # we don't go for speed
152    }    }
153    
154  Obviously one could implement C<split> via C<tag>. The reason for  Obviously one could implement C<split> via C<tag>. The reason for
155  having two functions is speed. We need to call C<split> for each  having two functions is speed. We need to call C<split> for each
# Line 179  words shorter than two characters. C<sto Line 183  words shorter than two characters. C<sto
183  stopwords and C<Stem> applies the Porter algorithm for computing the  stopwords and C<Stem> applies the Porter algorithm for computing the
184  stem of the words.  stem of the words.
185    
186  The filter definition for a collection defines a set of piplines for  The filter definition for a collection defines a set of pipelines for
187  the attributes and modifies the pipelines which should be used for  the attributes and modifies the pipelines which should be used for
188  prefix and interval searches.  prefix and interval searches.
189    
190  Here is a complete example:  Several complete working examples come with WAIT in the script
191    directory. It is recommended to follow the pattern of the scripts
192    smakewhatis and sman.
193    
194    my $stem  = [{  =cut
                 'prefix'    => ['unroff', 'isotr', 'isolc'],  
                 'intervall' => ['unroff', 'isotr', 'isolc'],  
                },'unroff', 'isotr', 'isolc', 'split2', 'stop', 'Stem'];  
   my $text  = [{  
                 'prefix'    => ['unroff', 'isotr', 'isolc'],  
                 'intervall' => ['unroff', 'isotr', 'isolc'],  
                },  
                 'unroff', 'isotr', 'isolc', 'split2', 'stop'];  
   my $sound = ['unroff', 'isotr', 'isolc', 'split2', 'Soundex'];  
     
   my $spec  = [  
       'name'         => $stem,  
       'synopsis'     => $stem,  
       'bugs'         => $stem,  
       'description'  => $stem,  
       'text'         => $stem,  
       'environment'  => $text,  
       'example'      => $text,  'example' => $stem,  
       'author'       => $sound, 'author'  => $stem,  
      ]  
195    

Legend:
Removed from v.12  
changed lines
  Added in v.13

  ViewVC Help
Powered by ViewVC 1.1.26