/[wait]/cvs-head/lib/WAIT.pm
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /cvs-head/lib/WAIT.pm

Parent Directory | Revision Log | View Patch

revision 11 by unknown, Fri Apr 28 15:41:10 2000 UTC revision 19 by ulpfr, Tue May 9 11:29:45 2000 UTC
# Line 1  Line 1 
1  #!/usr/bin/perl  #!/usr/bin/perl
2  #                              -*- Mode: Perl -*-  #                              -*- Mode: Cperl -*-
3  # $Basename: WAIT.pm $  # $Basename: WAIT.pm $
4  # $Revision: 1.4 $  # $Revision: 1.7 $
5  # Author          : Ulrich Pfeifer  # Author          : Ulrich Pfeifer
6  # Created On      : Wed Nov  5 16:59:32 1997  # Created On      : Wed Nov  5 16:59:32 1997
7  # Last Modified By: Ulrich Pfeifer  # Last Modified By: Ulrich Pfeifer
8  # Last Modified On: Wed Nov 12 18:26:44 1997  # Last Modified On: Mon May 31 22:34:35 1999
9  # Language        : CPerl  # Language        : CPerl
10  # Update Count    : 4  # Update Count    : 5
11  # Status          : Unknown, Use with caution!  # Status          : Unknown, Use with caution!
12  #  #
13  # (C) Copyright 1997, Ulrich Pfeifer, all rights reserved.  # (C) Copyright 1997, Ulrich Pfeifer, all rights reserved.
14  #  #
15  #  #
16    
17  package WAIT;  package WAIT;
18  require DynaLoader;  require DynaLoader;
19  use vars qw($VERSION @ISA);  use vars qw($VERSION @ISA);
20  @ISA = qw(DynaLoader);  @ISA = qw(DynaLoader);
21    
22  $VERSION = sprintf '%5.3f', map $_/10,'$ProjectVersion: 16.2 $ ' =~ /([\d.]+)/;  # $Format: "$\VERSION = sprintf '%5.3f', ($ProjectMajorVersion$ * 100 + ($ProjectMinorVersion$-1))/1000;"$
23    $VERSION = sprintf '%5.3f', (18 * 100 + (1-1))/1000;
24    
25    
26  bootstrap WAIT $VERSION;  bootstrap WAIT $VERSION;
27    
# Line 27  __END__ Line 29  __END__
29    
30  =head1 NAME  =head1 NAME
31    
32  WAIT - a rewrite of the freeWAIS-sf engine in Perl  WAIT - a rewrite of the freeWAIS-sf engine in Perl and XS
33    
34    =head1 SYNOPSIS
35    
36    A Synopsis is not yet available.
37    
38  =head1 Status of this document  =head1 Status of this document
39    
# Line 75  time in addition a B<query> and a B<disp Line 81  time in addition a B<query> and a B<disp
81    
82  =head2 Access  =head2 Access
83    
84  The access module defines which documents where members of a  The access module defines which documents are members of a database.
85  database. Usually an access module is a tied hash, whose keys are the  Usually an access module is a tied hash, whose keys are the Ids of the
86  Ids of the documents (did = document id) and whose values are the  documents (did = document id) and whose values are the documents
87  documents themselves. The indexing process loops over the keys using  themselves. The indexing process loops over the keys using C<FIRSTKEY>
88  C<FIRSTKEY> and C<NEXTKEY>. Documents are retrieved with C<FETCH>.  and C<NEXTKEY>. Documents are retrieved with C<FETCH>.
89    
90  By convention access modules should be members of the  By convention access modules should be members of the
91  C<WAIT::Document> hierarchy. Have a look at the  C<WAIT::Document> hierarchy. Have a look at the
# Line 88  C<WAIT::Document::Split> module to get t Line 94  C<WAIT::Document::Split> module to get t
94    
95  =head2 Parse  =head2 Parse
96    
97  The task parse module is to split the documents into logical parts  The task of the parse module is to split the documents into logical
98  via the C<split> method.  E.g. the C<WAIT::Parse::Nroff> splits  parts via the C<split> method. E.g. the C<WAIT::Parse::Nroff> splits
99  manuals piped through B<nroff>(1) into the sections I<name>,  manuals piped through B<nroff>(1) into the sections I<name>,
100  I<synopsis>, I<options>, I<description>, I<author>, I<example>,  I<synopsis>, I<options>, I<description>, I<author>, I<example>,
101  I<bugs>, I<text>, I<see>, and I<environment>. Here is the  I<bugs>, I<text>, I<see>, and I<environment>. Here is the
102  implementation of C<WAIT::Parse::Base> which handes documents with a  implementation of C<WAIT::Parse::Base> which handles documents with a
103  pretty simple tagged format:  pretty simple tagged format:
104    
105    AU: Pfeifer, U.; Fuhr, N.; Huynh, T.    AU: Pfeifer, U.; Fuhr, N.; Huynh, T.
# Line 110  pretty simple tagged format: Line 116  pretty simple tagged format:
116    sub split {                     # called as method    sub split {                     # called as method
117      my %result;      my %result;
118      my $fld;      my $fld;
119      
120      for (split /\n/, $_[1]) {      for (split /\n/, $_[1]) {
121        if (s/^(\S+):\s*//) {        if (s/^(\S+):\s*//) {
122          $fld = lc $1;          $fld = lc $1;
# Line 118  pretty simple tagged format: Line 124  pretty simple tagged format:
124        $result{$fld} .= $_ if defined $fld;        $result{$fld} .= $_ if defined $fld;
125      }      }
126      return \%result;      return \%result;
127    }    }
128    
129  Since the original document cannot be reconstructed from its  Since the original document cannot be reconstructed from its
130  attributes, we need a second method (I<tag>) which marks the regions  attributes, we need a second method (I<tag>) which marks the regions
# Line 131  regions. Line 137  regions.
137    sub tag {    sub tag {
138      my @result;      my @result;
139      my $tag;      my $tag;
140        
141      for (split /\n/, $_[1]) {      for (split /\n/, $_[1]) {
142        next if /^\w\w:\s*$/;        next if /^\w\w:\s*$/;
143        if (s/^(\S+)://) {        if (s/^(\S+)://) {
# Line 145  regions. Line 151  regions.
151        }        }
152      }      }
153      return @result;               # we don't go for speed      return @result;               # we don't go for speed
154    }    }
155    
156  Obviously one could implement C<split> via C<tag>. The reason for  Obviously one could implement C<split> via C<tag>. The reason for
157  having two functions is speed. We need to call C<split> for each  having two functions is speed. We need to call C<split> for each
# Line 179  words shorter than two characters. C<sto Line 185  words shorter than two characters. C<sto
185  stopwords and C<Stem> applies the Porter algorithm for computing the  stopwords and C<Stem> applies the Porter algorithm for computing the
186  stem of the words.  stem of the words.
187    
188  The filter definition for a collection defines a set of piplines for  The filter definition for a collection defines a set of pipelines for
189  the attributes and modifies the pipelines which should be used for  the attributes and modifies the pipelines which should be used for
190  prefix and interval searches.  prefix and interval searches.
191    
192  Here is a complete example:  Several complete working examples come with WAIT in the script
193    directory. It is recommended to follow the pattern of the scripts
194    smakewhatis and sman.
195    
196    my $stem  = [{  =cut
                 'prefix'    => ['unroff', 'isotr', 'isolc'],  
                 'intervall' => ['unroff', 'isotr', 'isolc'],  
                },'unroff', 'isotr', 'isolc', 'split2', 'stop', 'Stem'];  
   my $text  = [{  
                 'prefix'    => ['unroff', 'isotr', 'isolc'],  
                 'intervall' => ['unroff', 'isotr', 'isolc'],  
                },  
                 'unroff', 'isotr', 'isolc', 'split2', 'stop'];  
   my $sound = ['unroff', 'isotr', 'isolc', 'split2', 'Soundex'];  
     
   my $spec  = [  
       'name'         => $stem,  
       'synopsis'     => $stem,  
       'bugs'         => $stem,  
       'description'  => $stem,  
       'text'         => $stem,  
       'environment'  => $text,  
       'example'      => $text,  'example' => $stem,  
       'author'       => $sound, 'author'  => $stem,  
      ]  
197    

Legend:
Removed from v.11  
changed lines
  Added in v.19

  ViewVC Help
Powered by ViewVC 1.1.26