1 |
#!/usr/bin/perl |
#!/usr/bin/perl |
2 |
# -*- Mode: Perl -*- |
# -*- Mode: Cperl -*- |
3 |
# $Basename: WAIT.pm $ |
# $Basename: WAIT.pm $ |
4 |
# $Revision: 1.4 $ |
# $Revision: 1.6 $ |
5 |
# Author : Ulrich Pfeifer |
# Author : Ulrich Pfeifer |
6 |
# Created On : Wed Nov 5 16:59:32 1997 |
# Created On : Wed Nov 5 16:59:32 1997 |
7 |
# Last Modified By: Ulrich Pfeifer |
# Last Modified By: Ulrich Pfeifer |
9 |
# Language : CPerl |
# Language : CPerl |
10 |
# Update Count : 4 |
# Update Count : 4 |
11 |
# Status : Unknown, Use with caution! |
# Status : Unknown, Use with caution! |
12 |
# |
# |
13 |
# (C) Copyright 1997, Ulrich Pfeifer, all rights reserved. |
# (C) Copyright 1997, Ulrich Pfeifer, all rights reserved. |
14 |
# |
# |
15 |
# |
# |
16 |
|
|
17 |
package WAIT; |
package WAIT; |
18 |
require DynaLoader; |
require DynaLoader; |
19 |
use vars qw($VERSION @ISA); |
use vars qw($VERSION @ISA); |
20 |
@ISA = qw(DynaLoader); |
@ISA = qw(DynaLoader); |
21 |
|
|
22 |
$VERSION = sprintf '%5.3f', map $_/10,'$ProjectVersion: 16.2 $ ' =~ /([\d.]+)/; |
$VERSION = sprintf '%.4f', map $_/10,'$ProjectVersion: 17.1 $ ' =~ /([\d.]+)/; |
23 |
|
|
24 |
bootstrap WAIT $VERSION; |
bootstrap WAIT $VERSION; |
25 |
|
|
27 |
|
|
28 |
=head1 NAME |
=head1 NAME |
29 |
|
|
30 |
WAIT - a rewrite of the freeWAIS-sf engine in Perl |
WAIT - a rewrite of the freeWAIS-sf engine in Perl and XS |
31 |
|
|
32 |
|
=head1 SYNOPSIS |
33 |
|
|
34 |
|
A Synopsis is not yet available. |
35 |
|
|
36 |
=head1 Status of this document |
=head1 Status of this document |
37 |
|
|
79 |
|
|
80 |
=head2 Access |
=head2 Access |
81 |
|
|
82 |
The access module defines which documents are members of a |
The access module defines which documents are members of a database. |
83 |
database. Usually an access module is a tied hash, whose keys are the |
Usually an access module is a tied hash, whose keys are the Ids of the |
84 |
Ids of the documents (did = document id) and whose values are the |
documents (did = document id) and whose values are the documents |
85 |
documents themselves. The indexing process loops over the keys using |
themselves. The indexing process loops over the keys using C<FIRSTKEY> |
86 |
C<FIRSTKEY> and C<NEXTKEY>. Documents are retrieved with C<FETCH>. |
and C<NEXTKEY>. Documents are retrieved with C<FETCH>. |
87 |
|
|
88 |
By convention access modules should be members of the |
By convention access modules should be members of the |
89 |
C<WAIT::Document> hierarchy. Have a look at the |
C<WAIT::Document> hierarchy. Have a look at the |
92 |
|
|
93 |
=head2 Parse |
=head2 Parse |
94 |
|
|
95 |
The task of the parse module is to split the documents into logical parts |
The task of the parse module is to split the documents into logical |
96 |
via the C<split> method. E.g. the C<WAIT::Parse::Nroff> splits |
parts via the C<split> method. E.g. the C<WAIT::Parse::Nroff> splits |
97 |
manuals piped through B<nroff>(1) into the sections I<name>, |
manuals piped through B<nroff>(1) into the sections I<name>, |
98 |
I<synopsis>, I<options>, I<description>, I<author>, I<example>, |
I<synopsis>, I<options>, I<description>, I<author>, I<example>, |
99 |
I<bugs>, I<text>, I<see>, and I<environment>. Here is the |
I<bugs>, I<text>, I<see>, and I<environment>. Here is the |
100 |
implementation of C<WAIT::Parse::Base> which handles documents with a |
implementation of C<WAIT::Parse::Base> which handles documents with a |
101 |
pretty simple tagged format: |
pretty simple tagged format: |
102 |
|
|
103 |
AU: Pfeifer, U.; Fuhr, N.; Huynh, T. |
AU: Pfeifer, U.; Fuhr, N.; Huynh, T. |
114 |
sub split { # called as method |
sub split { # called as method |
115 |
my %result; |
my %result; |
116 |
my $fld; |
my $fld; |
117 |
|
|
118 |
for (split /\n/, $_[1]) { |
for (split /\n/, $_[1]) { |
119 |
if (s/^(\S+):\s*//) { |
if (s/^(\S+):\s*//) { |
120 |
$fld = lc $1; |
$fld = lc $1; |
122 |
$result{$fld} .= $_ if defined $fld; |
$result{$fld} .= $_ if defined $fld; |
123 |
} |
} |
124 |
return \%result; |
return \%result; |
125 |
} |
} |
126 |
|
|
127 |
Since the original document cannot be reconstructed from its |
Since the original document cannot be reconstructed from its |
128 |
attributes, we need a second method (I<tag>) which marks the regions |
attributes, we need a second method (I<tag>) which marks the regions |
135 |
sub tag { |
sub tag { |
136 |
my @result; |
my @result; |
137 |
my $tag; |
my $tag; |
138 |
|
|
139 |
for (split /\n/, $_[1]) { |
for (split /\n/, $_[1]) { |
140 |
next if /^\w\w:\s*$/; |
next if /^\w\w:\s*$/; |
141 |
if (s/^(\S+)://) { |
if (s/^(\S+)://) { |
149 |
} |
} |
150 |
} |
} |
151 |
return @result; # we don't go for speed |
return @result; # we don't go for speed |
152 |
} |
} |
153 |
|
|
154 |
Obviously one could implement C<split> via C<tag>. The reason for |
Obviously one could implement C<split> via C<tag>. The reason for |
155 |
having two functions is speed. We need to call C<split> for each |
having two functions is speed. We need to call C<split> for each |
183 |
stopwords and C<Stem> applies the Porter algorithm for computing the |
stopwords and C<Stem> applies the Porter algorithm for computing the |
184 |
stem of the words. |
stem of the words. |
185 |
|
|
186 |
The filter definition for a collection defines a set of pipelines for |
The filter definition for a collection defines a set of pipelines for |
187 |
the attributes and modifies the pipelines which should be used for |
the attributes and modifies the pipelines which should be used for |
188 |
prefix and interval searches. |
prefix and interval searches. |
189 |
|
|
190 |
Here is a complete example: |
Several complete working examples come with WAIT in the script |
191 |
|
directory. It is recommended to follow the pattern of the scripts |
192 |
|
smakewhatis and sman. |
193 |
|
|
194 |
my $stem = [{ |
=cut |
|
'prefix' => ['unroff', 'isotr', 'isolc'], |
|
|
'intervall' => ['unroff', 'isotr', 'isolc'], |
|
|
},'unroff', 'isotr', 'isolc', 'split2', 'stop', 'Stem']; |
|
|
my $text = [{ |
|
|
'prefix' => ['unroff', 'isotr', 'isolc'], |
|
|
'intervall' => ['unroff', 'isotr', 'isolc'], |
|
|
}, |
|
|
'unroff', 'isotr', 'isolc', 'split2', 'stop']; |
|
|
my $sound = ['unroff', 'isotr', 'isolc', 'split2', 'Soundex']; |
|
|
|
|
|
my $spec = [ |
|
|
'name' => $stem, |
|
|
'synopsis' => $stem, |
|
|
'bugs' => $stem, |
|
|
'description' => $stem, |
|
|
'text' => $stem, |
|
|
'environment' => $text, |
|
|
'example' => $text, 'example' => $stem, |
|
|
'author' => $sound, 'author' => $stem, |
|
|
] |
|
195 |
|
|