1 |
# -*- Mode: Perl -*- |
2 |
# InvertedIndex.pm -- |
3 |
# ITIID : $ITI$ $Header $__Header$ |
4 |
# Author : Ulrich Pfeifer |
5 |
# Created On : Thu Aug 8 13:05:10 1996 |
6 |
# Last Modified By: Ulrich Pfeifer |
7 |
# Last Modified On: Sun Nov 22 18:44:42 1998 |
8 |
# Language : CPerl |
9 |
# Status : Unknown, Use with caution! |
10 |
# |
11 |
# Copyright (c) 1996-1997, Ulrich Pfeifer |
12 |
# |
13 |
|
14 |
package WAIT::InvertedIndex; |
15 |
use strict; |
16 |
use DB_File; |
17 |
use Fcntl; |
18 |
use WAIT::Filter; |
19 |
use Carp; |
20 |
use vars qw(%FUNC); |
21 |
|
22 |
my $O = pack('C', 0xff)."o"; # occurances |
23 |
my $M = pack('C', 0xff)."m"; # maxtf |
24 |
|
25 |
sub new { |
26 |
my $type = shift; |
27 |
my %parm = @_; |
28 |
my $self = {}; |
29 |
|
30 |
$self->{file} = $parm{file} or croak "No file specified"; |
31 |
$self->{attr} = $parm{attr} or croak "No attributes specified"; |
32 |
$self->{filter} = $parm{filter}; |
33 |
$self->{'name'} = $parm{'name'}; |
34 |
$self->{records} = 0; |
35 |
for (qw(intervall prefix)) { |
36 |
if (exists $parm{$_}) { |
37 |
if (ref $parm{$_}) { |
38 |
$self->{$_} = [@{$parm{$_}}] # clone |
39 |
} else { |
40 |
$self->{$_} = $parm{$_} |
41 |
} |
42 |
} |
43 |
} |
44 |
bless $self, ref($type) || $type; |
45 |
} |
46 |
|
47 |
sub name {$_[0]->{'name'}} |
48 |
|
49 |
sub _split_pos { |
50 |
my ($text, $pos) = @{$_[0]}; |
51 |
my @result; |
52 |
|
53 |
$text =~ s/(^\s+)// and $pos += length($1); |
54 |
while ($text =~ s/(^\S+)//) { |
55 |
my $word = $1; |
56 |
push @result, [$word, $pos]; |
57 |
$pos += length($word); |
58 |
$text =~ s/(^\s+)// and $pos += length($1); |
59 |
} |
60 |
@result; |
61 |
} |
62 |
|
63 |
sub _xfiltergen { |
64 |
my $filter = pop @_; |
65 |
|
66 |
if ($filter eq 'stop') { # avoid the slow stopword elimination |
67 |
return _xfiltergen(@_); # it's cheaper to look them up afterwards |
68 |
} |
69 |
if (@_) { |
70 |
if ($filter =~ /^split(\d*)/) { |
71 |
if ($1) { |
72 |
"grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .'))' ; |
73 |
} else { |
74 |
"map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .')' ; |
75 |
} |
76 |
} else { |
77 |
"map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]]," ._xfiltergen(@_) .')'; |
78 |
} |
79 |
} else { |
80 |
if ($filter =~ /^split(\d*)/) { |
81 |
if ($1) { |
82 |
"grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0]))" ; |
83 |
} else { |
84 |
"map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0])" ; |
85 |
} |
86 |
} else { |
87 |
"map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]], [\$_[0], 0])"; |
88 |
} |
89 |
} |
90 |
} |
91 |
|
92 |
sub parse_pos { |
93 |
my $self = shift; |
94 |
|
95 |
unless (exists $self->{xfunc}) { |
96 |
$self->{xfunc} = |
97 |
eval sprintf("sub {%s}", _xfiltergen(@{$self->{filter}})); |
98 |
#printf "\nsub{%s}$@\n", _xfiltergen(@{$self->{filter}}); |
99 |
} |
100 |
&{$self->{xfunc}}($_[0]); |
101 |
} |
102 |
|
103 |
sub _filtergen { |
104 |
my $filter = pop @_; |
105 |
|
106 |
if (@_) { |
107 |
"map(&WAIT::Filter::$filter(\$_), " . _filtergen(@_) . ')'; |
108 |
} else { |
109 |
"map(&WAIT::Filter::$filter(\$_), \@_)"; |
110 |
} |
111 |
} |
112 |
|
113 |
sub drop { |
114 |
my $self = shift; |
115 |
if ((caller)[0] eq 'WAIT::Table') { # Table knows about this |
116 |
my $file = $self->{file}; |
117 |
|
118 |
! (!-e $file or unlink $file); |
119 |
} else { # notify our database |
120 |
croak ref($self)."::drop called directly"; |
121 |
} |
122 |
} |
123 |
|
124 |
sub open { |
125 |
my $self = shift; |
126 |
my $file = $self->{file}; |
127 |
|
128 |
if (defined $self->{dbh}) { |
129 |
$self->{dbh}; |
130 |
} else { |
131 |
$self->{func} = |
132 |
eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}})); |
133 |
$self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file, |
134 |
$self->{mode}, 0664, $DB_BTREE); |
135 |
# tie(%{$self->{cache}}, 'DB_File', undef, |
136 |
# $self->{mode}, 0664, $DB_BTREE) |
137 |
$self->{cache} = {} |
138 |
if $self->{mode} & O_RDWR; |
139 |
# tie(%{$self->{cdict}}, 'DB_File', undef, |
140 |
# $self->{mode}, 0664, $DB_BTREE) |
141 |
$self->{cdict} = {} |
142 |
if $self->{mode} & O_RDWR; |
143 |
$self->{cached} = 0; |
144 |
} |
145 |
} |
146 |
|
147 |
sub insert { |
148 |
my $self = shift; |
149 |
my $key = shift; |
150 |
my %occ; |
151 |
|
152 |
defined $self->{db} or $self->open; |
153 |
grep $occ{$_}++, &{$self->{func}}(@_); |
154 |
my ($word, $noc); |
155 |
$self->{records}++; |
156 |
while (($word, $noc) = each %occ) { |
157 |
if (defined $self->{cache}->{$word}) { |
158 |
$self->{cdict}->{$O,$word}++; |
159 |
$self->{cache}->{$word} .= pack 'w2', $key, $noc; |
160 |
} else { |
161 |
$self->{cdict}->{$O,$word} = 1; |
162 |
$self->{cache}->{$word} = pack 'w2', $key, $noc; |
163 |
} |
164 |
$self->{cached}++; |
165 |
} |
166 |
$self->sync if $self->{cached} > 100_000; |
167 |
my $maxtf = 0; |
168 |
for (values %occ) { |
169 |
$maxtf = $_ if $_ > $maxtf; |
170 |
} |
171 |
$self->{db}->{$M, $key} = $maxtf; |
172 |
} |
173 |
|
174 |
sub delete { |
175 |
my $self = shift; |
176 |
my $key = shift; |
177 |
my %occ; |
178 |
|
179 |
defined $self->{db} or $self->open; |
180 |
$self->sync; |
181 |
$self->{records}--; |
182 |
grep $occ{$_}++, &{$self->{func}}(@_); |
183 |
for (keys %occ) { |
184 |
# may reorder posting list |
185 |
my %post = unpack 'w*', $self->{db}->{$_}; |
186 |
$self->{db}->{$O,$_}--; |
187 |
delete $post{$key}; |
188 |
$self->{db}->{$_} = pack 'w*', %post; |
189 |
} |
190 |
delete $self->{db}->{$M, $key}; |
191 |
} |
192 |
|
193 |
sub intervall { |
194 |
my ($self, $first, $last) = @_; |
195 |
my $value = ''; |
196 |
my $word = ''; |
197 |
my @result; |
198 |
|
199 |
return unless exists $self->{'intervall'}; |
200 |
|
201 |
defined $self->{db} or $self->open; |
202 |
$self->sync; |
203 |
my $dbh = $self->{dbh}; # for convenience |
204 |
|
205 |
if (ref $self->{'intervall'}) { |
206 |
unless (exists $self->{'ifunc'}) { |
207 |
$self->{'ifunc'} = |
208 |
eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}})); |
209 |
} |
210 |
($first) = &{$self->{'ifunc'}}($first) if $first; |
211 |
($last) = &{$self->{'ifunc'}}($last) if $last; |
212 |
} |
213 |
if (defined $first and $first ne '') { # set the cursor to $first |
214 |
$dbh->seq($first, $value, R_CURSOR); |
215 |
} else { |
216 |
$dbh->seq($first, $value, R_FIRST); |
217 |
} |
218 |
# We assume that word do not start with the character \377 |
219 |
# $last = pack 'C', 0xff unless defined $last and $last ne ''; |
220 |
return () if defined $last and $first gt $last; # $first would be after the last word |
221 |
|
222 |
push @result, $first; |
223 |
while (!$dbh->seq($word, $value, R_NEXT)) { |
224 |
# We should limit this to a "resonable" number of words |
225 |
last if (defined $last and $word gt $last) or $word =~ /^($M|$O)/o; |
226 |
push @result, $word; |
227 |
} |
228 |
\@result; # speed |
229 |
} |
230 |
|
231 |
sub prefix { |
232 |
my ($self, $prefix) = @_; |
233 |
my $value = ''; |
234 |
my $word = ''; |
235 |
my @result; |
236 |
|
237 |
return () unless defined $prefix; # Full dictionary requested !! |
238 |
return unless exists $self->{'prefix'}; |
239 |
defined $self->{db} or $self->open; |
240 |
$self->sync; |
241 |
my $dbh = $self->{dbh}; |
242 |
|
243 |
if (ref $self->{'prefix'}) { |
244 |
unless (exists $self->{'pfunc'}) { |
245 |
$self->{'pfunc'} = |
246 |
eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}})); |
247 |
} |
248 |
($prefix) = &{$self->{'pfunc'}}($prefix); |
249 |
} |
250 |
|
251 |
if ($dbh->seq($word = $prefix, $value, R_CURSOR)) { |
252 |
return (); |
253 |
} |
254 |
return () if $word !~ /^$prefix/; |
255 |
push @result, $word; |
256 |
|
257 |
while (!$dbh->seq($word, $value, R_NEXT)) { |
258 |
# We should limit this to a "resonable" number of words |
259 |
last if $word !~ /^$prefix/; |
260 |
push @result, $word; |
261 |
} |
262 |
\@result; # speed |
263 |
} |
264 |
|
265 |
sub search { |
266 |
my $self = shift; |
267 |
|
268 |
defined $self->{db} or $self->open; |
269 |
$self->sync; |
270 |
$self->search_raw(&{$self->{func}}(@_)); # No call to parse() here |
271 |
} |
272 |
|
273 |
sub parse { |
274 |
my $self = shift; |
275 |
|
276 |
defined $self->{db} or $self->open; |
277 |
&{$self->{func}}(@_); |
278 |
} |
279 |
|
280 |
sub search_prefix { |
281 |
my $self = shift; |
282 |
|
283 |
# print "search_prefix(@_)\n"; |
284 |
defined $self->{db} or $self->open; |
285 |
$self->search_raw(map($self->prefix($_), @_)); |
286 |
} |
287 |
|
288 |
sub search_raw { |
289 |
my $self = shift; |
290 |
my %occ; |
291 |
my %score; |
292 |
|
293 |
return () unless @_; |
294 |
|
295 |
defined $self->{db} or $self->open; |
296 |
$self->sync; |
297 |
grep $occ{$_}++, @_; |
298 |
for (keys %occ) { |
299 |
if (defined $self->{db}->{$_}) { |
300 |
my %post = unpack 'w*', $self->{db}->{$_}; |
301 |
my $idf = log($self->{records}/$self->{db}->{$O,$_}); |
302 |
my $did; |
303 |
for $did (keys %post) { |
304 |
$score{$did} = 0 unless defined $score{$did}; # perl -w |
305 |
$score{$did} += $post{$did} / $self->{db}->{$M, $did} * $idf |
306 |
if $self->{db}->{$M, $did}; # db may be broken |
307 |
} |
308 |
} |
309 |
} |
310 |
%score; |
311 |
} |
312 |
|
313 |
sub sync { |
314 |
my $self = shift; |
315 |
|
316 |
if ($self->{mode} & O_RDWR) { |
317 |
print STDERR "\aFlushing $self->{cached} postings\n"; |
318 |
while (my($key, $value) = each %{$self->{cache}}) { |
319 |
$self->{db}->{$key} .= $value; |
320 |
#delete $self->{cache}->{$key}; |
321 |
} |
322 |
while (my($key, $value) = each %{$self->{cdict}}) { |
323 |
$self->{db}->{$key} = 0 unless $self->{db}->{$key}; |
324 |
$self->{db}->{$key} += $value; |
325 |
#delete $self->{cdict}->{$key}; |
326 |
} |
327 |
$self->{cache} = {}; |
328 |
$self->{cdict} = {}; |
329 |
# print STDERR "*** $self->{cache} ", tied(%{$self->{cache}}), "==\n"; |
330 |
$self->{cached} = 0; |
331 |
# $self->{dbh}->sync if $self->{dbh}; |
332 |
} |
333 |
} |
334 |
|
335 |
sub close { |
336 |
my $self = shift; |
337 |
|
338 |
if ($self->{dbh}) { |
339 |
$self->sync; |
340 |
delete $self->{dbh}; |
341 |
untie %{$self->{db}}; |
342 |
delete $self->{db}; |
343 |
delete $self->{func}; |
344 |
delete $self->{cache}; |
345 |
delete $self->{cached}; |
346 |
delete $self->{cdict}; |
347 |
delete $self->{pfunc} if defined $self->{pfunc}; |
348 |
delete $self->{ifunc} if defined $self->{ifunc}; |
349 |
delete $self->{xfunc} if defined $self->{xfunc}; |
350 |
} |
351 |
} |
352 |
|
353 |
1; |
354 |
|