1 |
dpavlin |
108 |
# -*- Mode: cperl; fill-column: 79 -*- |
2 |
ulpfr |
19 |
# $Basename: InvertedIndex.pm $ |
3 |
|
|
# $Revision: 1.30 $ |
4 |
ulpfr |
10 |
# Author : Ulrich Pfeifer |
5 |
|
|
# Created On : Thu Aug 8 13:05:10 1996 |
6 |
|
|
# Last Modified By: Ulrich Pfeifer |
7 |
dpavlin |
89 |
# Last Modified On: Mon Apr 22 16:52:01 2002 |
8 |
ulpfr |
10 |
# Language : CPerl |
9 |
ulpfr |
19 |
# |
10 |
ulpfr |
80 |
# (C) Copyright 1996-2002, Ulrich Pfeifer |
11 |
ulpfr |
19 |
# |
12 |
ulpfr |
10 |
|
13 |
|
|
package WAIT::InvertedIndex; |
14 |
|
|
use strict; |
15 |
dpavlin |
108 |
use BerkeleyDB; |
16 |
ulpfr |
10 |
use Fcntl; |
17 |
|
|
use WAIT::Filter; |
18 |
|
|
use Carp; |
19 |
laperla |
30 |
use vars qw(%FUNC $VERSION); |
20 |
dpavlin |
108 |
use Time::HiRes qw(time); |
21 |
ulpfr |
10 |
|
22 |
dpavlin |
108 |
$VERSION = "2.000"; # others test if we are loaded by checking $VERSION |
23 |
laperla |
30 |
|
24 |
dpavlin |
108 |
use constant DOCFREQ_O => "o"; |
25 |
|
|
use constant MAXTF_M => "m"; |
26 |
|
|
use constant POSTINGLIST_P => "p"; |
27 |
|
|
use constant PMATCH => qr/^(??{POSTINGLIST_P()})/; |
28 |
|
|
|
29 |
ulpfr |
22 |
# The dictionary has three different key types: |
30 |
dpavlin |
108 |
|
31 |
ulpfr |
22 |
# 'o'.$word |
32 |
|
|
# |
33 |
|
|
# The document frequency is the number of documents a term occurs |
34 |
ulpfr |
80 |
# in. The idea is that a term occuring in a significant portion of the |
35 |
ulpfr |
22 |
# documents is not too significant. |
36 |
|
|
# |
37 |
|
|
# 'm'.$word |
38 |
|
|
# |
39 |
|
|
# The maximum term frequency of a document is the frequency of the |
40 |
|
|
# most frequent term in the document. It is related to the document |
41 |
laperla |
71 |
# length obviously. A document in which the most frequent term occurs |
42 |
ulpfr |
22 |
# 100 times is probably much longer than a document whichs most |
43 |
|
|
# frequent term occurs five time. |
44 |
|
|
# |
45 |
|
|
# 'p'.$word |
46 |
|
|
# |
47 |
|
|
# Under this key we store the actual posting list as pairs of |
48 |
|
|
# packed integers. |
49 |
ulpfr |
10 |
|
50 |
|
|
sub new { |
51 |
|
|
my $type = shift; |
52 |
|
|
my %parm = @_; |
53 |
|
|
my $self = {}; |
54 |
|
|
|
55 |
dpavlin |
116 |
for my $x (qw(path attr subname env maindbfile tablename)) { |
56 |
|
|
$self->{$x} = $parm{$x} or confess "No $x specified"; |
57 |
dpavlin |
108 |
} |
58 |
|
|
|
59 |
ulpfr |
10 |
$self->{filter} = $parm{filter}; |
60 |
|
|
$self->{'name'} = $parm{'name'}; |
61 |
|
|
$self->{records} = 0; |
62 |
|
|
for (qw(intervall prefix)) { |
63 |
|
|
if (exists $parm{$_}) { |
64 |
|
|
if (ref $parm{$_}) { |
65 |
|
|
$self->{$_} = [@{$parm{$_}}] # clone |
66 |
|
|
} else { |
67 |
|
|
$self->{$_} = $parm{$_} |
68 |
|
|
} |
69 |
|
|
} |
70 |
|
|
} |
71 |
|
|
bless $self, ref($type) || $type; |
72 |
|
|
} |
73 |
|
|
|
74 |
dpavlin |
108 |
for my $accessor (qw(name maindbfile tablename subname)) { |
75 |
|
|
no strict 'refs'; |
76 |
|
|
*{$accessor} = sub { |
77 |
|
|
my($self) = @_; |
78 |
|
|
return $self->{$accessor} if $self->{$accessor}; |
79 |
|
|
require Carp; |
80 |
|
|
Carp::confess("accessor $accessor not there"); |
81 |
|
|
} |
82 |
|
|
} |
83 |
ulpfr |
10 |
|
84 |
|
|
sub _split_pos { |
85 |
|
|
my ($text, $pos) = @{$_[0]}; |
86 |
|
|
my @result; |
87 |
|
|
|
88 |
|
|
$text =~ s/(^\s+)// and $pos += length($1); |
89 |
|
|
while ($text =~ s/(^\S+)//) { |
90 |
|
|
my $word = $1; |
91 |
|
|
push @result, [$word, $pos]; |
92 |
|
|
$pos += length($word); |
93 |
|
|
$text =~ s/(^\s+)// and $pos += length($1); |
94 |
|
|
} |
95 |
|
|
@result; |
96 |
|
|
} |
97 |
|
|
|
98 |
|
|
sub _xfiltergen { |
99 |
|
|
my $filter = pop @_; |
100 |
|
|
|
101 |
ulpfr |
13 |
# Oops, we cannot overrule the user's choice. Other filters may kill |
102 |
|
|
# stopwords, such as isotr clobbers "isn't" to "isnt". |
103 |
|
|
|
104 |
|
|
# if ($filter eq 'stop') { # avoid the slow stopword elimination |
105 |
|
|
# return _xfiltergen(@_); # it's cheaper to look them up afterwards |
106 |
|
|
# } |
107 |
ulpfr |
10 |
if (@_) { |
108 |
|
|
if ($filter =~ /^split(\d*)/) { |
109 |
|
|
if ($1) { |
110 |
|
|
"grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .'))' ; |
111 |
|
|
} else { |
112 |
|
|
"map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .')' ; |
113 |
|
|
} |
114 |
|
|
} else { |
115 |
|
|
"map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]]," ._xfiltergen(@_) .')'; |
116 |
|
|
} |
117 |
|
|
} else { |
118 |
|
|
if ($filter =~ /^split(\d*)/) { |
119 |
|
|
if ($1) { |
120 |
|
|
"grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0]))" ; |
121 |
|
|
} else { |
122 |
|
|
"map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0])" ; |
123 |
|
|
} |
124 |
|
|
} else { |
125 |
|
|
"map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]], [\$_[0], 0])"; |
126 |
|
|
} |
127 |
|
|
} |
128 |
|
|
} |
129 |
|
|
|
130 |
|
|
sub parse_pos { |
131 |
|
|
my $self = shift; |
132 |
|
|
|
133 |
|
|
unless (exists $self->{xfunc}) { |
134 |
|
|
$self->{xfunc} = |
135 |
|
|
eval sprintf("sub {%s}", _xfiltergen(@{$self->{filter}})); |
136 |
|
|
#printf "\nsub{%s}$@\n", _xfiltergen(@{$self->{filter}}); |
137 |
|
|
} |
138 |
|
|
&{$self->{xfunc}}($_[0]); |
139 |
|
|
} |
140 |
|
|
|
141 |
|
|
sub _filtergen { |
142 |
|
|
my $filter = pop @_; |
143 |
|
|
|
144 |
|
|
if (@_) { |
145 |
|
|
"map(&WAIT::Filter::$filter(\$_), " . _filtergen(@_) . ')'; |
146 |
|
|
} else { |
147 |
|
|
"map(&WAIT::Filter::$filter(\$_), \@_)"; |
148 |
|
|
} |
149 |
|
|
} |
150 |
|
|
|
151 |
|
|
sub drop { |
152 |
|
|
my $self = shift; |
153 |
|
|
if ((caller)[0] eq 'WAIT::Table') { # Table knows about this |
154 |
dpavlin |
116 |
my $path = $self->{path}; |
155 |
ulpfr |
10 |
|
156 |
dpavlin |
116 |
# ! (!-e $path or unlink $path); |
157 |
|
|
warn "DEBUG: fix drop index!"; |
158 |
ulpfr |
10 |
} else { # notify our database |
159 |
dpavlin |
116 |
confess ref($self)."::drop called directly"; |
160 |
ulpfr |
10 |
} |
161 |
|
|
} |
162 |
|
|
|
163 |
|
|
sub open { |
164 |
|
|
my $self = shift; |
165 |
dpavlin |
116 |
my $path = $self->{path}; |
166 |
ulpfr |
10 |
|
167 |
|
|
if (defined $self->{dbh}) { |
168 |
|
|
$self->{dbh}; |
169 |
|
|
} else { |
170 |
|
|
$self->{func} = |
171 |
|
|
eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}})); |
172 |
dpavlin |
108 |
my $flags; |
173 |
|
|
if ($self->{mode} & O_RDWR) { |
174 |
|
|
$flags = DB_CREATE; # | DB_INIT_MPOOL | DB_PRIVATE | DB_INIT_CDB; |
175 |
dpavlin |
116 |
#warn "DEBUG: Flags on inverted $path set to 'writing'\n"; |
176 |
dpavlin |
108 |
} else { |
177 |
|
|
$flags = DB_RDONLY; |
178 |
dpavlin |
116 |
#warn "DEBUG: Flags on inverted $path set to 'readonly'\n"; |
179 |
dpavlin |
108 |
} |
180 |
|
|
my $filename = $self->maindbfile or die; |
181 |
|
|
my $subname = join("/",$self->tablename || die,$self->subname || die); |
182 |
|
|
my $env = $self->{env} || "[undef]"; |
183 |
|
|
$self->{dbh} = tie(%{$self->{db}}, 'BerkeleyDB::Btree', |
184 |
dpavlin |
116 |
# Filename => $path, |
185 |
dpavlin |
108 |
Filename => $filename, |
186 |
|
|
$self->{env} ? (Env => $self->{env}) : (), |
187 |
|
|
Subname => $subname, |
188 |
|
|
Mode => 0664, |
189 |
|
|
Flags => $flags, |
190 |
|
|
$WAIT::Database::Cachesize?(Cachesize => $WAIT::Database::Cachesize):(), |
191 |
|
|
$WAIT::Database::Pagesize?(Pagesize => $WAIT::Database::Pagesize):(), |
192 |
|
|
) or die "Couldn't tie: $BerkeleyDB::Error; filename=>'$filename', env=>'$env',subname=>'$subname',flags=>'$flags'"; |
193 |
ulpfr |
10 |
$self->{cache} = {} |
194 |
|
|
if $self->{mode} & O_RDWR; |
195 |
|
|
$self->{cdict} = {} |
196 |
|
|
if $self->{mode} & O_RDWR; |
197 |
|
|
$self->{cached} = 0; |
198 |
|
|
} |
199 |
|
|
} |
200 |
|
|
|
201 |
|
|
sub insert { |
202 |
|
|
my $self = shift; |
203 |
|
|
my $key = shift; |
204 |
|
|
my %occ; |
205 |
ulpfr |
13 |
|
206 |
ulpfr |
10 |
defined $self->{db} or $self->open; |
207 |
dpavlin |
108 |
defined $self->{db} or die "open didn't help!!!"; |
208 |
ulpfr |
10 |
grep $occ{$_}++, &{$self->{func}}(@_); |
209 |
|
|
my ($word, $noc); |
210 |
|
|
$self->{records}++; |
211 |
|
|
while (($word, $noc) = each %occ) { |
212 |
|
|
if (defined $self->{cache}->{$word}) { |
213 |
ulpfr |
22 |
$self->{cdict}->{$word}++; |
214 |
ulpfr |
10 |
$self->{cache}->{$word} .= pack 'w2', $key, $noc; |
215 |
dpavlin |
108 |
} else { |
216 |
ulpfr |
22 |
$self->{cdict}->{$word} = 1; |
217 |
ulpfr |
10 |
$self->{cache}->{$word} = pack 'w2', $key, $noc; |
218 |
ulpfr |
13 |
} |
219 |
ulpfr |
10 |
$self->{cached}++; |
220 |
|
|
} |
221 |
ulpfr |
19 |
# This cache limit should be configurable |
222 |
ulpfr |
10 |
$self->sync if $self->{cached} > 100_000; |
223 |
|
|
my $maxtf = 0; |
224 |
|
|
for (values %occ) { |
225 |
|
|
$maxtf = $_ if $_ > $maxtf; |
226 |
|
|
} |
227 |
dpavlin |
108 |
$self->{db}->{MAXTF_M . $key} = $maxtf; |
228 |
ulpfr |
10 |
} |
229 |
|
|
|
230 |
ulpfr |
19 |
# We sort postings by increasing max term frequency (~ by increasing |
231 |
|
|
# document length. This reduces the quality degradation if we process |
232 |
|
|
# only the first part of a posting list. |
233 |
|
|
|
234 |
|
|
sub sort_postings { |
235 |
|
|
my $self = shift; |
236 |
|
|
my $post = shift; # reference to a hash or packed string |
237 |
|
|
|
238 |
|
|
if (ref $post) { |
239 |
|
|
# we skip the sort part, if the index is not sorted |
240 |
|
|
return pack('w*', %$post) unless $self->{reorg}; |
241 |
|
|
} else { |
242 |
|
|
$post = { unpack 'w*', $post }; |
243 |
|
|
} |
244 |
|
|
|
245 |
|
|
my $r = ''; |
246 |
|
|
|
247 |
|
|
# Sort posting list by increasing ratio of maximum term frequency (~ |
248 |
ulpfr |
80 |
# "document length") and term frequency. This ratio multipied by the |
249 |
ulpfr |
19 |
# inverse document frequence gives the score for a term. This sort |
250 |
|
|
# order can be exploited for tuning of single term queries. |
251 |
|
|
|
252 |
laperla |
30 |
for my $did (keys %$post) { # sanity check |
253 |
dpavlin |
108 |
unless ($self->{db}->{MAXTF_M . $did}) { |
254 |
|
|
warn "WAIT Warning: DIVZERO threat from did[$did]post[$post]post{did}[$post->{$did}]"; |
255 |
|
|
$self->{db}->{MAXTF_M . $did} = 1; # fails if we have not opened for writing |
256 |
laperla |
30 |
} |
257 |
|
|
} |
258 |
dpavlin |
108 |
for my $did (sort { $post->{$b} / $self->{db}->{MAXTF_M . $b} |
259 |
ulpfr |
19 |
<=> |
260 |
dpavlin |
108 |
$post->{$a} / $self->{db}->{MAXTF_M . $a} |
261 |
ulpfr |
19 |
} keys %$post) { |
262 |
|
|
$r .= pack 'w2', $did, $post->{$did}; |
263 |
|
|
} |
264 |
|
|
#warn sprintf "reorg %d %s\n", scalar keys %$post, join ' ', unpack 'w*', $r; |
265 |
|
|
$r; |
266 |
|
|
} |
267 |
|
|
|
268 |
ulpfr |
10 |
sub delete { |
269 |
|
|
my $self = shift; |
270 |
|
|
my $key = shift; |
271 |
|
|
my %occ; |
272 |
|
|
|
273 |
ulpfr |
19 |
my $db; |
274 |
ulpfr |
10 |
defined $self->{db} or $self->open; |
275 |
ulpfr |
19 |
$db = $self->{db}; |
276 |
ulpfr |
10 |
$self->sync; |
277 |
|
|
$self->{records}--; |
278 |
ulpfr |
19 |
|
279 |
|
|
# less than zero documents in database? |
280 |
|
|
_complain('delete of document', $key) and $self->{records} = 0 |
281 |
|
|
if $self->{records} < 0; |
282 |
|
|
|
283 |
ulpfr |
10 |
grep $occ{$_}++, &{$self->{func}}(@_); |
284 |
ulpfr |
19 |
|
285 |
laperla |
30 |
# Be prepared for "Odd number of elements in hash assignment" |
286 |
|
|
local $SIG{__WARN__} = sub { |
287 |
|
|
my $warning = shift; |
288 |
|
|
chomp $warning; |
289 |
|
|
warn "Catching warning[$warning] during delete of key[$key]"; |
290 |
|
|
}; |
291 |
ulpfr |
19 |
for (keys %occ) {# may reorder posting list |
292 |
dpavlin |
108 |
my %post = unpack 'w*', $db->{POSTINGLIST_P . $_}; |
293 |
ulpfr |
10 |
delete $post{$key}; |
294 |
dpavlin |
108 |
$db->{POSTINGLIST_P . $_} = $self->sort_postings(\%post); |
295 |
|
|
_complain('delete of term', $_) if $db->{DOCFREQ_O . $_}-1 != keys %post; |
296 |
|
|
$db->{DOCFREQ_O . $_} = scalar keys %post; |
297 |
ulpfr |
10 |
} |
298 |
dpavlin |
108 |
delete $db->{MAXTF_M . $key}; |
299 |
ulpfr |
10 |
} |
300 |
|
|
|
301 |
|
|
sub intervall { |
302 |
|
|
my ($self, $first, $last) = @_; |
303 |
|
|
|
304 |
dpavlin |
108 |
die "intervall broken in this version of WAIT: need to fix the |
305 |
|
|
R_CURSOR and R_NEXT lines"; |
306 |
ulpfr |
10 |
|
307 |
dpavlin |
108 |
#### my $value = ''; |
308 |
|
|
#### my $word = ''; |
309 |
|
|
#### my @result; |
310 |
|
|
#### |
311 |
|
|
#### return unless exists $self->{'intervall'}; |
312 |
|
|
#### |
313 |
|
|
#### defined $self->{db} or $self->open; |
314 |
|
|
#### $self->sync; |
315 |
|
|
#### my $dbh = $self->{dbh}; # for convenience |
316 |
|
|
#### |
317 |
|
|
#### if (ref $self->{'intervall'}) { |
318 |
|
|
#### unless (exists $self->{'ifunc'}) { |
319 |
|
|
#### $self->{'ifunc'} = |
320 |
|
|
#### eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}})); |
321 |
|
|
#### } |
322 |
|
|
#### ($first) = &{$self->{'ifunc'}}($first) if $first; |
323 |
|
|
#### ($last) = &{$self->{'ifunc'}}($last) if $last; |
324 |
|
|
#### } |
325 |
|
|
#### $first = POSTINGLIST_P . ($first||''); |
326 |
|
|
#### $last = (defined $last)?POSTINGLIST_P . $last:'q'; |
327 |
|
|
#### |
328 |
|
|
#### # set the cursor to $first |
329 |
|
|
#### $dbh->seq($first, $value, R_CURSOR); |
330 |
|
|
#### |
331 |
|
|
#### # $first would be after the last word |
332 |
|
|
#### return () if $first gt $last; |
333 |
|
|
#### |
334 |
|
|
#### push @result, substr($first,1); |
335 |
|
|
#### while (!$dbh->seq($word, $value, R_NEXT)) { |
336 |
|
|
#### # We should limit this to a "resonable" number of words |
337 |
|
|
#### last if $word gt $last; |
338 |
|
|
#### push @result, substr($word,1); |
339 |
|
|
#### } |
340 |
|
|
#### \@result; # speed |
341 |
ulpfr |
10 |
} |
342 |
|
|
|
343 |
|
|
sub prefix { |
344 |
|
|
my ($self, $prefix) = @_; |
345 |
|
|
|
346 |
dpavlin |
108 |
die "prefix not supported in this version of WAIT: need to fix the R_CURSOR"; |
347 |
ulpfr |
10 |
|
348 |
|
|
|
349 |
dpavlin |
108 |
#### my $value = ''; |
350 |
|
|
#### my $word = ''; |
351 |
|
|
#### my @result; |
352 |
|
|
#### |
353 |
|
|
#### return () unless defined $prefix; # Full dictionary requested !! |
354 |
|
|
#### return unless exists $self->{'prefix'}; |
355 |
|
|
#### defined $self->{db} or $self->open; |
356 |
|
|
#### $self->sync; |
357 |
|
|
#### my $dbh = $self->{dbh}; |
358 |
|
|
#### |
359 |
|
|
#### if (ref $self->{'prefix'}) { |
360 |
|
|
#### unless (exists $self->{'pfunc'}) { |
361 |
|
|
#### $self->{'pfunc'} = |
362 |
|
|
#### eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}})); |
363 |
|
|
#### } |
364 |
|
|
#### ($prefix) = &{$self->{'pfunc'}}($prefix); |
365 |
|
|
#### } |
366 |
|
|
#### |
367 |
|
|
#### if ($dbh->seq($word = POSTINGLIST_P . $prefix, $value, R_CURSOR)) { |
368 |
|
|
#### return (); |
369 |
|
|
#### } |
370 |
|
|
#### return () if $word !~ /^p$prefix/; |
371 |
|
|
#### push @result, substr($word,1); |
372 |
|
|
#### |
373 |
|
|
#### while (!$dbh->seq($word, $value, R_NEXT)) { |
374 |
|
|
#### # We should limit this to a "resonable" number of words |
375 |
|
|
#### last if $word !~ /^p$prefix/; |
376 |
|
|
#### push @result, substr($word,1); |
377 |
|
|
#### } |
378 |
|
|
#### \@result; # speed |
379 |
ulpfr |
10 |
} |
380 |
|
|
|
381 |
ulpfr |
19 |
=head2 search($query) |
382 |
|
|
|
383 |
|
|
The search method supports a range of search algorithms. It is |
384 |
|
|
recommended to tune the index by calling |
385 |
|
|
C<$table-E<gt>set(top=E<gt>1)> B<after> bulk inserting the documents |
386 |
|
|
into the table. This is a computing intense operation and all inserts |
387 |
|
|
and deletes after this optimization are slightly more expensive. Once |
388 |
|
|
reorganized, the index is kept sorted automatically until you switch |
389 |
|
|
the optimization off by calling C<$table-E<gt>set(top=E<gt>0)>. |
390 |
|
|
|
391 |
|
|
When searching a tuned index, a query can be processed faster if the |
392 |
|
|
caller requests only the topmost documents. This can be done by |
393 |
|
|
passing a C<top =E<gt>> I<n> parameter to the search method. |
394 |
|
|
|
395 |
|
|
For single term queries, the method returns only the I<n> top ranking |
396 |
|
|
documents. For multi term queries two optimized algorithms are |
397 |
|
|
available. The first algorithm computes the top n documents |
398 |
|
|
approximately but very fast, sacrificing a little bit of precision for |
399 |
|
|
speed. The second algorithm computes the topmost I<n> documents |
400 |
|
|
precisely. This algorithm is slower and should be used only for small |
401 |
|
|
values of I<n>. It can be requested by passing the query attribute |
402 |
|
|
C<picky =E<gt> 1>. Both algorithms may return more than I<n> hits. |
403 |
|
|
While the picky version might not be faster than the brute force |
404 |
|
|
version on average for modest size databases it uses less memory and |
405 |
|
|
the processing time is almost linear in the number of query terms, not |
406 |
|
|
in the size of the lists. |
407 |
|
|
|
408 |
|
|
=cut |
409 |
|
|
|
410 |
dpavlin |
108 |
sub search_ref { |
411 |
ulpfr |
10 |
my $self = shift; |
412 |
ulpfr |
19 |
my $query = shift; |
413 |
ulpfr |
10 |
|
414 |
dpavlin |
108 |
my $debugtime = 0; |
415 |
|
|
my($time,$entertime); |
416 |
|
|
our $STARTTIME; |
417 |
|
|
if ($debugtime) { |
418 |
|
|
$time = time; |
419 |
|
|
$STARTTIME ||= $time; |
420 |
|
|
if ($time-$STARTTIME > 5) { |
421 |
|
|
$STARTTIME = $time; |
422 |
|
|
warn "STARTTIME: $STARTTIME\n"; |
423 |
|
|
} |
424 |
|
|
$entertime = time-$STARTTIME; |
425 |
|
|
warn sprintf "ENTER TIME: %.4f\n", $entertime; |
426 |
|
|
} |
427 |
ulpfr |
10 |
defined $self->{db} or $self->open; |
428 |
|
|
$self->sync; |
429 |
dpavlin |
108 |
my $ref = $self->search_raw_ref($query, &{$self->{func}}(@_)); # No call to parse() there |
430 |
|
|
if ($debugtime) { |
431 |
|
|
my $leavetime = time-$STARTTIME; |
432 |
|
|
warn sprintf "LEAVE TIME: %.4f\n", $leavetime; |
433 |
|
|
if ($leavetime-$entertime > .4) { |
434 |
|
|
require Data::Dumper; |
435 |
|
|
print STDERR "Line " . __LINE__ . ", File: " . __FILE__ . "\n" . |
436 |
|
|
Data::Dumper->new([$query,\@_],[qw(query at_)])->Indent(1)->Useqq(1)->Dump; # XXX |
437 |
|
|
} |
438 |
|
|
} |
439 |
|
|
$ref; |
440 |
ulpfr |
10 |
} |
441 |
|
|
|
442 |
|
|
sub parse { |
443 |
|
|
my $self = shift; |
444 |
|
|
|
445 |
|
|
defined $self->{db} or $self->open; |
446 |
|
|
&{$self->{func}}(@_); |
447 |
|
|
} |
448 |
|
|
|
449 |
|
|
sub search_prefix { |
450 |
|
|
my $self = shift; |
451 |
|
|
|
452 |
|
|
# print "search_prefix(@_)\n"; |
453 |
|
|
defined $self->{db} or $self->open; |
454 |
dpavlin |
108 |
$self->search_raw_ref(map($self->prefix($_), @_)); |
455 |
ulpfr |
10 |
} |
456 |
|
|
|
457 |
ulpfr |
19 |
sub _complain ($$) { |
458 |
|
|
my ($action, $term) = @_; |
459 |
|
|
|
460 |
|
|
require Carp; |
461 |
|
|
Carp::cluck |
462 |
|
|
(sprintf("WAIT database inconsistency during $action [%s]: ". |
463 |
|
|
"Please rebuild index\n", |
464 |
|
|
$term,)); |
465 |
|
|
} |
466 |
|
|
|
467 |
dpavlin |
108 |
sub search_raw_ref { |
468 |
ulpfr |
10 |
my $self = shift; |
469 |
ulpfr |
19 |
my $query = shift; |
470 |
dpavlin |
108 |
# warn "DEBUG WAIT: search_raw_ref args 2..[@_]"; |
471 |
ulpfr |
10 |
my %score; |
472 |
|
|
|
473 |
dpavlin |
108 |
# Top $top_wanted documents must be correct. Zero means all matching documents. |
474 |
|
|
my $top_wanted = $query->{top}; |
475 |
|
|
my $picky_strict = $query->{picky}; |
476 |
|
|
# the option is really ignore_excess |
477 |
|
|
my $ignore_excess = $query->{ignore_excess}; |
478 |
ulpfr |
10 |
|
479 |
dpavlin |
108 |
# Return at least $minacc documents. Zero means all matching documents. |
480 |
ulpfr |
19 |
|
481 |
dpavlin |
108 |
# my $minacc = $query->{accus} || $top_wanted; |
482 |
|
|
|
483 |
ulpfr |
19 |
# Open index and flush cache if necessary |
484 |
ulpfr |
10 |
defined $self->{db} or $self->open; |
485 |
|
|
$self->sync; |
486 |
ulpfr |
19 |
|
487 |
|
|
# We keep duplicates |
488 |
|
|
my @terms = |
489 |
|
|
# Sort words by decreasing document frequency |
490 |
dpavlin |
108 |
sort { $self->{db}->{DOCFREQ_O . $a} <=> $self->{db}->{DOCFREQ_O . $b} } |
491 |
ulpfr |
19 |
# check which words occur in the index. |
492 |
dpavlin |
108 |
grep { $self->{db}->{DOCFREQ_O . $_} } @_; |
493 |
ulpfr |
19 |
|
494 |
dpavlin |
108 |
# warn "DEBUG WAIT: wanted[$top_wanted]terms[@terms]"; |
495 |
laperla |
40 |
return unless @terms; |
496 |
ulpfr |
19 |
|
497 |
|
|
# We special-case one term queries here. If the index was sorted, |
498 |
|
|
# choping off the rest of the list will return the same ranking. |
499 |
dpavlin |
108 |
if ($top_wanted and @terms == 1) { |
500 |
ulpfr |
19 |
my $term = shift @terms; |
501 |
dpavlin |
108 |
my $idf = log($self->{records}/$self->{db}->{DOCFREQ_O . $term}); |
502 |
ulpfr |
19 |
my @res; |
503 |
|
|
|
504 |
|
|
if ($self->{reorg}) { # or not $query->{picky} |
505 |
dpavlin |
108 |
@res = unpack "w". int(2*$top_wanted), $self->{db}->{POSTINGLIST_P . $term}; |
506 |
|
|
# warn sprintf "DEBUG WAIT: scalar(\@res)[%d]", scalar(@res); |
507 |
ulpfr |
19 |
} else { |
508 |
dpavlin |
108 |
@res = unpack 'w*', $self->{db}->{POSTINGLIST_P . $term}; |
509 |
ulpfr |
19 |
} |
510 |
|
|
|
511 |
|
|
for (my $i=1; $i<@res; $i+=2) { |
512 |
dpavlin |
108 |
# $res[$i] /= $self->{db}->{MAXTF_M . $res[$i-1]} / $idf; |
513 |
laperla |
30 |
# above was written badly, allows two DIV_ZERO problems. |
514 |
dpavlin |
108 |
my $maxtf = $self->{db}->{MAXTF_M . $res[$i-1]}; |
515 |
laperla |
30 |
unless ($maxtf) { |
516 |
|
|
warn "WAIT-Warning: Averting DIVZERO for i[$i] \$res[\$i-1][$res[$i-1]] term[$term]"; |
517 |
|
|
$maxtf = 1; |
518 |
|
|
} |
519 |
|
|
$res[$i] = ($res[$i] / $maxtf) * $idf; |
520 |
ulpfr |
19 |
} |
521 |
|
|
|
522 |
dpavlin |
108 |
my %res = @res; # bloed: @res waere schon sortiert gewesen |
523 |
|
|
return \%res; |
524 |
ulpfr |
19 |
} |
525 |
|
|
|
526 |
|
|
# We separate exhaustive search here to avoid overhead and make the |
527 |
|
|
# code more readable. The block can be removed without changing the |
528 |
|
|
# result. |
529 |
dpavlin |
108 |
unless ($top_wanted) { |
530 |
ulpfr |
19 |
for (@terms) { |
531 |
dpavlin |
108 |
my $df = $self->{db}->{DOCFREQ_O . $_}; |
532 |
ulpfr |
19 |
|
533 |
|
|
# The frequency *must* be 1 at least since the posting list is nonempty |
534 |
|
|
_complain('search for term', $_) and $df = 1 if $df < 1; |
535 |
|
|
|
536 |
|
|
# Unpack posting list for current query term $_ |
537 |
dpavlin |
108 |
my %post = unpack 'w*', $self->{db}->{POSTINGLIST_P . $_}; |
538 |
ulpfr |
19 |
|
539 |
dpavlin |
108 |
_complain('search for term', $_) if $self->{db}->{DOCFREQ_O . $_} != keys %post; |
540 |
ulpfr |
19 |
# This is the inverse document frequency. The log of the inverse |
541 |
|
|
# fraction of documents the term occurs in. |
542 |
|
|
my $idf = log($self->{records}/$df); |
543 |
|
|
for my $did (keys %post) { |
544 |
dpavlin |
108 |
if (my $freq = $self->{db}->{MAXTF_M . $did}) { |
545 |
ulpfr |
19 |
$score{$did} += $post{$did} / $freq * $idf; |
546 |
|
|
} |
547 |
ulpfr |
10 |
} |
548 |
|
|
} |
549 |
ulpfr |
19 |
# warn sprintf "Used %d accumulators\n", scalar keys %score; |
550 |
dpavlin |
108 |
return \%score; |
551 |
ulpfr |
10 |
} |
552 |
ulpfr |
19 |
|
553 |
|
|
# A sloppy but fast algorithm for multiple term queries. |
554 |
dpavlin |
108 |
unless ($picky_strict) { |
555 |
ulpfr |
19 |
for (@terms) { |
556 |
|
|
# Unpack posting list for current query term $_ |
557 |
dpavlin |
108 |
my %post; |
558 |
|
|
if ($self->{reorg} && $top_wanted && $ignore_excess) { |
559 |
|
|
%post = unpack 'w'. int(2*$ignore_excess) , $self->{db}->{POSTINGLIST_P . $_}; |
560 |
|
|
} else { |
561 |
|
|
%post = unpack 'w*', $self->{db}->{POSTINGLIST_P . $_}; |
562 |
|
|
} |
563 |
|
|
# warn sprintf "DEBUG WAIT: term[%s] keys %%post[%s]", $_, scalar keys %post; |
564 |
ulpfr |
19 |
|
565 |
|
|
# Lookup the number of documents the term occurs in (document frequency) |
566 |
dpavlin |
108 |
my $occ = $self->{db}->{DOCFREQ_O . $_}; |
567 |
ulpfr |
19 |
|
568 |
dpavlin |
108 |
_complain('search for term', $_) if !$ignore_excess && $occ != keys %post; |
569 |
ulpfr |
19 |
# The frequency *must* be 1 at least since the posting list is nonempty |
570 |
|
|
_complain('search for term', $_) and $occ = 1 if $occ < 1; |
571 |
|
|
|
572 |
dpavlin |
108 |
# This is the inverse document frequency. The log of the inverse fraction |
573 |
|
|
# of documents the term occurs in. |
574 |
ulpfr |
19 |
my $idf = log($self->{records}/$occ); |
575 |
|
|
|
576 |
|
|
# If we have a reasonable number of accumulators, change the |
577 |
|
|
# loop to iterate over the accumulators. This will compromise |
578 |
|
|
# quality for better speed. The algorithm still computes the |
579 |
|
|
# exact weights, but the result is not guaranteed to contain the |
580 |
|
|
# *best* results. The database might contain documents better |
581 |
|
|
# than the worst returned document. |
582 |
|
|
|
583 |
|
|
# We process the lists in order of increasing length. When the |
584 |
|
|
# number of accumulators exceeds $wanted, no new documents are |
585 |
|
|
# added, only the ranking/weighting of the seen documents is |
586 |
|
|
# improved. The resulting ranking list must be pruned, since only |
587 |
|
|
# the top most documents end up near their "optimal" rank. |
588 |
|
|
|
589 |
dpavlin |
108 |
if (keys %score < $top_wanted) { |
590 |
|
|
|
591 |
|
|
# Diese folgende Schleife ist (WAR!) der Hammer fuer die Suche "mysql |
592 |
|
|
# für dummies bellomo". Sie frisst 3.1+1.7 Sekunden. |
593 |
|
|
|
594 |
|
|
# Der erste Grund ist, dass 3 Begriffe noch nicht genug gebracht haben, |
595 |
|
|
# aber der vierte viel zu viel bringt. Der zweite Grund ist, dass wir |
596 |
|
|
# so viele Lookups in $self->{db} machen. Das Rechnen hingegen ist |
597 |
|
|
# vermutlich billig. |
598 |
|
|
|
599 |
ulpfr |
19 |
for my $did (keys %post) { |
600 |
dpavlin |
108 |
if (my $freq = $self->{db}->{MAXTF_M . $did}) { |
601 |
ulpfr |
19 |
$score{$did} += $post{$did} / $freq * $idf; |
602 |
|
|
} |
603 |
|
|
} |
604 |
|
|
} else { |
605 |
|
|
for my $did (keys %score) { |
606 |
|
|
next unless exists $post{$did}; |
607 |
dpavlin |
108 |
if (my $freq = $self->{db}->{MAXTF_M . $did}) { |
608 |
ulpfr |
19 |
$score{$did} += $post{$did} / $freq * $idf; |
609 |
|
|
} |
610 |
|
|
} |
611 |
|
|
} |
612 |
|
|
} |
613 |
dpavlin |
108 |
warn sprintf("DEBUG WAIT: returning from search_raw_ref at [%.3f] after terms[%s] with keys[%d]", |
614 |
|
|
time, |
615 |
|
|
join(":",@terms), |
616 |
|
|
scalar keys %score, |
617 |
|
|
); |
618 |
|
|
return \%score; |
619 |
ulpfr |
19 |
} |
620 |
|
|
my @max; $max[$#terms+1]=0; |
621 |
|
|
my @idf; |
622 |
|
|
|
623 |
|
|
# Preparation loop. This extra loop makes sense only when "reorg" |
624 |
|
|
# and "wanted" are true. But at the time beeing, keeping the code |
625 |
|
|
# for the different search algorithms in one place seems more |
626 |
|
|
# desirable than some minor speedup of the brute force version. We |
627 |
|
|
# do cache $idf though. |
628 |
|
|
|
629 |
|
|
for (my $i = $#terms; $i >=0; $i--) { |
630 |
|
|
local $_ = $terms[$i]; |
631 |
|
|
# Lookup the number of documents the term occurs in (document frequency) |
632 |
dpavlin |
108 |
my $df = $self->{db}->{DOCFREQ_O . $_}; |
633 |
ulpfr |
19 |
|
634 |
|
|
# The frequency *must* be 1 at least since the posting list is nonempty |
635 |
|
|
_complain('search for term', $_) and $df = 1 if $df < 1; |
636 |
|
|
|
637 |
|
|
# This is the inverse document frequency. The log of the inverse |
638 |
|
|
# fraction of documents the term occurs in. |
639 |
|
|
$idf[$i] = log($self->{records}/$df); |
640 |
|
|
|
641 |
|
|
my ($did,$occ); |
642 |
|
|
if ($self->{reorg}) { |
643 |
dpavlin |
108 |
($did,$occ) = unpack 'w2', $self->{db}->{POSTINGLIST_P . $_}; |
644 |
ulpfr |
19 |
} else { # Maybe this costs more than it helps |
645 |
dpavlin |
108 |
($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{POSTINGLIST_P . $_}); |
646 |
ulpfr |
19 |
} |
647 |
dpavlin |
108 |
my $freq = $self->{db}->{MAXTF_M . $did}; |
648 |
ulpfr |
19 |
my $max = $occ/$freq*$idf[$i]; |
649 |
|
|
$max[$i] = $max + $max[$i+1]; |
650 |
|
|
} |
651 |
|
|
|
652 |
dpavlin |
108 |
# Main loop |
653 |
ulpfr |
19 |
for my $i (0 .. $#terms) { |
654 |
|
|
my $term = $terms[$i]; |
655 |
|
|
# Unpack posting list for current query term $term. We loose the |
656 |
|
|
# sorting order because the assignment to a hash. |
657 |
dpavlin |
108 |
my %post = unpack 'w*', $self->{db}->{POSTINGLIST_P . $term}; |
658 |
ulpfr |
19 |
|
659 |
|
|
_complain('search for term', $term) |
660 |
dpavlin |
108 |
if $self->{db}->{DOCFREQ_O . $term} != keys %post; |
661 |
ulpfr |
19 |
|
662 |
|
|
my $idf = $idf[$i]; |
663 |
|
|
my $full; # Need to process all postings |
664 |
|
|
my $chop; # Score necessary to enter the ranking list |
665 |
|
|
|
666 |
ulpfr |
80 |
if (# We know that wanted is true since we special cased the |
667 |
ulpfr |
19 |
# exhaustive search. |
668 |
|
|
|
669 |
dpavlin |
108 |
$top_wanted and |
670 |
ulpfr |
19 |
|
671 |
dpavlin |
108 |
# We did sort here if necessary in the preparation loop: |
672 |
ulpfr |
19 |
# $self->{reorg} and |
673 |
|
|
|
674 |
dpavlin |
108 |
scalar keys %score > $top_wanted) { |
675 |
|
|
$chop = (sort { $b <=> $a } values %score)[$top_wanted]; |
676 |
ulpfr |
19 |
$full = $max[$i] > $chop; |
677 |
|
|
} else { |
678 |
|
|
$full = 1; |
679 |
|
|
} |
680 |
|
|
|
681 |
|
|
if ($full) { |
682 |
dpavlin |
108 |
# We need to inspect the full list. Either $top_wanted is not given, |
683 |
ulpfr |
19 |
# the index is not sorted, or we don't have enough accumulators |
684 |
|
|
# yet. |
685 |
|
|
if (defined $chop) { |
686 |
|
|
# We might be able to avoid allocating accumulators |
687 |
|
|
for my $did (keys %post) { |
688 |
dpavlin |
108 |
if (my $freq = $self->{db}->{MAXTF_M . $did}) { |
689 |
ulpfr |
19 |
my $wgt = $post{$did} / $freq * $idf; |
690 |
|
|
# We add an accumulator if $wgt exeeds $chop |
691 |
|
|
if (exists $score{$did} or $wgt > $chop) { |
692 |
|
|
$score{$did} += $wgt; |
693 |
|
|
} |
694 |
|
|
} |
695 |
|
|
} |
696 |
|
|
} else { |
697 |
|
|
# Allocate acumulators for each seen document. |
698 |
|
|
for my $did (keys %post) { |
699 |
dpavlin |
108 |
if (my $freq = $self->{db}->{MAXTF_M . $did}) { |
700 |
ulpfr |
19 |
$score{$did} += $post{$did} / $freq * $idf; |
701 |
|
|
} |
702 |
|
|
} |
703 |
|
|
} |
704 |
|
|
} else { |
705 |
|
|
# Update existing accumulators |
706 |
|
|
for my $did (keys %score) { |
707 |
|
|
next unless exists $post{$did}; |
708 |
dpavlin |
108 |
if (my $freq = $self->{db}->{MAXTF_M . $did}) { |
709 |
ulpfr |
19 |
$score{$did} += $post{$did} / $freq * $idf; |
710 |
|
|
} |
711 |
|
|
} |
712 |
|
|
} |
713 |
|
|
} |
714 |
|
|
#warn sprintf "Used %d accumulators\n", scalar keys %score; |
715 |
dpavlin |
108 |
\%score; |
716 |
ulpfr |
10 |
} |
717 |
|
|
|
718 |
ulpfr |
19 |
sub set { |
719 |
|
|
my ($self, $attr, $value) = @_; |
720 |
|
|
|
721 |
ulpfr |
83 |
die "No such index attribute: '$attr'" unless $attr eq 'top'; |
722 |
ulpfr |
19 |
|
723 |
|
|
return delete $self->{reorg} if $value == 0; |
724 |
|
|
|
725 |
|
|
return if $self->{reorg}; # we are sorted already |
726 |
|
|
return unless $self->{mode} & O_RDWR; |
727 |
|
|
defined $self->{db} or $self->open; |
728 |
|
|
|
729 |
|
|
$self->sync; |
730 |
|
|
while (my($key, $value) = each %{$self->{db}}) { |
731 |
dpavlin |
108 |
next if $key !~ /^p/; # some day use PMATCH |
732 |
|
|
$self->{db}{$key} = $self->sort_postings($value); |
733 |
ulpfr |
19 |
} |
734 |
|
|
$self->{reorg} = 1; |
735 |
|
|
} |
736 |
|
|
|
737 |
ulpfr |
10 |
sub sync { |
738 |
|
|
my $self = shift; |
739 |
dpavlin |
108 |
return unless $self->{mode} & O_RDWR; |
740 |
|
|
Carp::carp(sprintf "[%s] Flushing %d postings", scalar(localtime), $self->{cached}) |
741 |
|
|
if $self->{cached}; |
742 |
|
|
while (my($key, $value) = each %{$self->{cache}}) { |
743 |
|
|
$self->{db}{POSTINGLIST_P . $key} ||= ""; |
744 |
|
|
if ($self->{reorg}) { |
745 |
|
|
$self->{db}->{POSTINGLIST_P . $key} = |
746 |
|
|
$self->sort_postings($self->{db}->{POSTINGLIST_P . $key} |
747 |
|
|
. $value); |
748 |
|
|
} else { |
749 |
|
|
$self->{db}->{POSTINGLIST_P . $key} .= $value; |
750 |
ulpfr |
10 |
} |
751 |
|
|
} |
752 |
dpavlin |
108 |
while (my($key, $value) = each %{$self->{cdict}}) { |
753 |
|
|
$self->{db}->{DOCFREQ_O . $key} = 0 unless $self->{db}->{DOCFREQ_O . $key}; |
754 |
|
|
$self->{db}->{DOCFREQ_O . $key} += $value; |
755 |
|
|
} |
756 |
|
|
$self->{cache} = {}; |
757 |
|
|
$self->{cdict} = {}; |
758 |
|
|
$self->{cached} = 0; |
759 |
ulpfr |
10 |
} |
760 |
|
|
|
761 |
|
|
sub close { |
762 |
|
|
my $self = shift; |
763 |
|
|
|
764 |
dpavlin |
108 |
delete $self->{env}; |
765 |
ulpfr |
10 |
if ($self->{dbh}) { |
766 |
|
|
$self->sync; |
767 |
|
|
delete $self->{dbh}; |
768 |
|
|
untie %{$self->{db}}; |
769 |
dpavlin |
116 |
for my $att (qw(db func cache cached cdict path maindbfile)) { |
770 |
dpavlin |
108 |
delete $self->{$att}; |
771 |
|
|
} |
772 |
|
|
for my $att (qw(pfunc ifunc xfunc)) { |
773 |
|
|
delete $self->{$att} if defined $self->{$att}; |
774 |
|
|
} |
775 |
ulpfr |
10 |
} |
776 |
|
|
} |
777 |
|
|
|
778 |
ulpfr |
51 |
sub keys { |
779 |
|
|
my $self = shift; |
780 |
|
|
|
781 |
|
|
defined $self->{db} or $self->open; |
782 |
|
|
keys %{$self->{db}}; |
783 |
|
|
} |
784 |
|
|
|
785 |
ulpfr |
10 |
1; |
786 |
|
|
|