1 |
# -*- Mode: Perl -*- |
# -*- Mode: cperl; fill-column: 79 -*- |
2 |
# $Basename: InvertedIndex.pm $ |
# $Basename: InvertedIndex.pm $ |
3 |
# $Revision: 1.30 $ |
# $Revision: 1.30 $ |
4 |
# Author : Ulrich Pfeifer |
# Author : Ulrich Pfeifer |
12 |
|
|
13 |
package WAIT::InvertedIndex; |
package WAIT::InvertedIndex; |
14 |
use strict; |
use strict; |
15 |
use DB_File; |
use BerkeleyDB; |
16 |
use Fcntl; |
use Fcntl; |
17 |
use WAIT::Filter; |
use WAIT::Filter; |
18 |
use Carp; |
use Carp; |
19 |
use vars qw(%FUNC $VERSION); |
use vars qw(%FUNC $VERSION); |
20 |
|
use Time::HiRes qw(time); |
21 |
|
|
22 |
$VERSION = "1.900"; # others test if we are loaded by checking $VERSION |
$VERSION = "2.000"; # others test if we are loaded by checking $VERSION |
23 |
|
|
24 |
|
use constant DOCFREQ_O => "o"; |
25 |
|
use constant MAXTF_M => "m"; |
26 |
|
use constant POSTINGLIST_P => "p"; |
27 |
|
use constant PMATCH => qr/^(??{POSTINGLIST_P()})/; |
28 |
|
|
29 |
# The dictionary has three different key types: |
# The dictionary has three different key types: |
30 |
|
|
31 |
# 'o'.$word |
# 'o'.$word |
32 |
# |
# |
33 |
# The document frequency is the number of documents a term occurs |
# The document frequency is the number of documents a term occurs |
47 |
# Under this key we store the actual posting list as pairs of |
# Under this key we store the actual posting list as pairs of |
48 |
# packed integers. |
# packed integers. |
49 |
|
|
|
my $no_old_index_support = 0; # do not check for old indices if set |
|
|
|
|
50 |
sub new { |
sub new { |
51 |
my $type = shift; |
my $type = shift; |
52 |
my %parm = @_; |
my %parm = @_; |
53 |
my $self = {}; |
my $self = {}; |
54 |
|
|
55 |
$self->{file} = $parm{file} or croak "No file specified"; |
for my $x (qw(file attr subname env maindbfile tablename)) { |
56 |
$self->{attr} = $parm{attr} or croak "No attributes specified"; |
$self->{$x} = $parm{$x} or croak "No $x specified"; |
57 |
|
} |
58 |
|
|
59 |
$self->{filter} = $parm{filter}; |
$self->{filter} = $parm{filter}; |
60 |
$self->{'name'} = $parm{'name'}; |
$self->{'name'} = $parm{'name'}; |
61 |
$self->{records} = 0; |
$self->{records} = 0; |
71 |
bless $self, ref($type) || $type; |
bless $self, ref($type) || $type; |
72 |
} |
} |
73 |
|
|
74 |
sub name {$_[0]->{'name'}} |
for my $accessor (qw(name maindbfile tablename subname)) { |
75 |
|
no strict 'refs'; |
76 |
|
*{$accessor} = sub { |
77 |
|
my($self) = @_; |
78 |
|
return $self->{$accessor} if $self->{$accessor}; |
79 |
|
require Carp; |
80 |
|
Carp::confess("accessor $accessor not there"); |
81 |
|
} |
82 |
|
} |
83 |
|
|
84 |
sub _split_pos { |
sub _split_pos { |
85 |
my ($text, $pos) = @{$_[0]}; |
my ($text, $pos) = @{$_[0]}; |
159 |
} |
} |
160 |
} |
} |
161 |
|
|
|
sub is_an_old_index { |
|
|
my $self = shift; |
|
|
|
|
|
return 0 if $no_old_index_support; |
|
|
return $self->{old_index} if exists $self->{old_index}; |
|
|
|
|
|
# We can only guess if this is an old index. We lookup the first 10 |
|
|
# $O entries. If all values are integers, we assume that the index |
|
|
# is an old one. |
|
|
|
|
|
defined $self->{db} or $self->open; |
|
|
$self->sync; |
|
|
my $dbh = $self->{dbh} or return $self->{old_index} = 0; # for convenience |
|
|
|
|
|
my $O = pack('C', 0xff)."o"; |
|
|
my ($word, $value) = ($O.$;); # $word and $value are modified by seq! |
|
|
if ( my $ret = $dbh->seq($word, $value, R_CURSOR) ) { |
|
|
# warn "DEBUG: ret[$ret], not an old index, either empty or no \$^O"; |
|
|
return $self->{old_index} = 0; |
|
|
} |
|
|
for (my $i=0; $i<10;$i++) { |
|
|
if ($value !~ /^\d+$/) { |
|
|
# warn "DEBUG: word[$word]value[$value], not an old index"; |
|
|
return $self->{old_index} = 0; |
|
|
} |
|
|
if (my $ret = $dbh->seq($word, $value, R_NEXT) or # no values left |
|
|
$word !~ /^$O$;/o # no $O values left |
|
|
) { |
|
|
# we are not sure enough that this is an old index |
|
|
# warn "DEBUG: ret[$ret]word[$word]value[$value], not an old index"; |
|
|
return $self->{old_index} = 0; |
|
|
} |
|
|
} |
|
|
# warn "DEBUG: old index"; |
|
|
return $self->{old_index} = 1; |
|
|
} |
|
|
|
|
162 |
sub open { |
sub open { |
163 |
my $self = shift; |
my $self = shift; |
164 |
my $file = $self->{file}; |
my $file = $self->{file}; |
168 |
} else { |
} else { |
169 |
$self->{func} = |
$self->{func} = |
170 |
eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}})); |
eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}})); |
171 |
$self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file, |
my $flags; |
172 |
$self->{mode}, 0664, $DB_BTREE); |
if ($self->{mode} & O_RDWR) { |
173 |
|
$flags = DB_CREATE; # | DB_INIT_MPOOL | DB_PRIVATE | DB_INIT_CDB; |
174 |
|
warn "Flags on inverted $file set to 'writing'"; |
175 |
|
} else { |
176 |
|
$flags = DB_RDONLY; |
177 |
|
# warn "Flags on inverted $file set to 'readonly'"; |
178 |
|
} |
179 |
|
my $filename = $self->maindbfile or die; |
180 |
|
my $subname = join("/",$self->tablename || die,$self->subname || die); |
181 |
|
my $env = $self->{env} || "[undef]"; |
182 |
|
$self->{dbh} = tie(%{$self->{db}}, 'BerkeleyDB::Btree', |
183 |
|
# Filename => $file, |
184 |
|
Filename => $filename, |
185 |
|
$self->{env} ? (Env => $self->{env}) : (), |
186 |
|
Subname => $subname, |
187 |
|
Mode => 0664, |
188 |
|
Flags => $flags, |
189 |
|
$WAIT::Database::Cachesize?(Cachesize => $WAIT::Database::Cachesize):(), |
190 |
|
$WAIT::Database::Pagesize?(Pagesize => $WAIT::Database::Pagesize):(), |
191 |
|
) or die "Couldn't tie: $BerkeleyDB::Error; filename=>'$filename', env=>'$env',subname=>'$subname',flags=>'$flags'"; |
192 |
$self->{cache} = {} |
$self->{cache} = {} |
193 |
if $self->{mode} & O_RDWR; |
if $self->{mode} & O_RDWR; |
194 |
$self->{cdict} = {} |
$self->{cdict} = {} |
195 |
if $self->{mode} & O_RDWR; |
if $self->{mode} & O_RDWR; |
196 |
$self->{cached} = 0; |
$self->{cached} = 0; |
|
if (!$no_old_index_support and $self->is_an_old_index()) { |
|
|
warn "This is an old index, upgrade you database"; |
|
|
require WAIT::InvertedIndexOld; |
|
|
bless $self, 'WAIT::InvertedIndexOld'; |
|
|
} |
|
197 |
} |
} |
198 |
} |
} |
199 |
|
|
203 |
my %occ; |
my %occ; |
204 |
|
|
205 |
defined $self->{db} or $self->open; |
defined $self->{db} or $self->open; |
206 |
|
defined $self->{db} or die "open didn't help!!!"; |
207 |
grep $occ{$_}++, &{$self->{func}}(@_); |
grep $occ{$_}++, &{$self->{func}}(@_); |
208 |
my ($word, $noc); |
my ($word, $noc); |
209 |
$self->{records}++; |
$self->{records}++; |
211 |
if (defined $self->{cache}->{$word}) { |
if (defined $self->{cache}->{$word}) { |
212 |
$self->{cdict}->{$word}++; |
$self->{cdict}->{$word}++; |
213 |
$self->{cache}->{$word} .= pack 'w2', $key, $noc; |
$self->{cache}->{$word} .= pack 'w2', $key, $noc; |
214 |
} else { |
} else { |
215 |
$self->{cdict}->{$word} = 1; |
$self->{cdict}->{$word} = 1; |
216 |
$self->{cache}->{$word} = pack 'w2', $key, $noc; |
$self->{cache}->{$word} = pack 'w2', $key, $noc; |
217 |
} |
} |
223 |
for (values %occ) { |
for (values %occ) { |
224 |
$maxtf = $_ if $_ > $maxtf; |
$maxtf = $_ if $_ > $maxtf; |
225 |
} |
} |
226 |
$self->{db}->{'m'. $key} = $maxtf; |
$self->{db}->{MAXTF_M . $key} = $maxtf; |
227 |
} |
} |
228 |
|
|
229 |
# We sort postings by increasing max term frequency (~ by increasing |
# We sort postings by increasing max term frequency (~ by increasing |
249 |
# order can be exploited for tuning of single term queries. |
# order can be exploited for tuning of single term queries. |
250 |
|
|
251 |
for my $did (keys %$post) { # sanity check |
for my $did (keys %$post) { # sanity check |
252 |
unless ($self->{db}->{"m". $did}) { |
unless ($self->{db}->{MAXTF_M . $did}) { |
253 |
warn "Warning from WAIT: DIVZERO threat from did[$did] post[$post->{$did}]"; |
warn "WAIT Warning: DIVZERO threat from did[$did]post[$post]post{did}[$post->{$did}]"; |
254 |
$self->{db}->{"m". $did} = 1; # fails if we have not opened for writing |
$self->{db}->{MAXTF_M . $did} = 1; # fails if we have not opened for writing |
255 |
} |
} |
256 |
} |
} |
257 |
for my $did (sort { $post->{$b} / $self->{db}->{'m'. $b} |
for my $did (sort { $post->{$b} / $self->{db}->{MAXTF_M . $b} |
258 |
<=> |
<=> |
259 |
$post->{$a} / $self->{db}->{'m'. $a} |
$post->{$a} / $self->{db}->{MAXTF_M . $a} |
260 |
} keys %$post) { |
} keys %$post) { |
261 |
$r .= pack 'w2', $did, $post->{$did}; |
$r .= pack 'w2', $did, $post->{$did}; |
262 |
} |
} |
288 |
warn "Catching warning[$warning] during delete of key[$key]"; |
warn "Catching warning[$warning] during delete of key[$key]"; |
289 |
}; |
}; |
290 |
for (keys %occ) {# may reorder posting list |
for (keys %occ) {# may reorder posting list |
291 |
my %post = unpack 'w*', $db->{'p'.$_}; |
my %post = unpack 'w*', $db->{POSTINGLIST_P . $_}; |
292 |
delete $post{$key}; |
delete $post{$key}; |
293 |
$db->{'p'.$_} = $self->sort_postings(\%post); |
$db->{POSTINGLIST_P . $_} = $self->sort_postings(\%post); |
294 |
_complain('delete of term', $_) if $db->{'o'.$_}-1 != keys %post; |
_complain('delete of term', $_) if $db->{DOCFREQ_O . $_}-1 != keys %post; |
295 |
$db->{'o'.$_} = scalar keys %post; |
$db->{DOCFREQ_O . $_} = scalar keys %post; |
296 |
} |
} |
297 |
delete $db->{'m'. $key}; |
delete $db->{MAXTF_M . $key}; |
298 |
} |
} |
299 |
|
|
300 |
sub intervall { |
sub intervall { |
301 |
my ($self, $first, $last) = @_; |
my ($self, $first, $last) = @_; |
|
my $value = ''; |
|
|
my $word = ''; |
|
|
my @result; |
|
|
|
|
|
return unless exists $self->{'intervall'}; |
|
302 |
|
|
303 |
defined $self->{db} or $self->open; |
die "intervall broken in this version of WAIT: need to fix the |
304 |
$self->sync; |
R_CURSOR and R_NEXT lines"; |
|
my $dbh = $self->{dbh}; # for convenience |
|
305 |
|
|
306 |
if (ref $self->{'intervall'}) { |
#### my $value = ''; |
307 |
unless (exists $self->{'ifunc'}) { |
#### my $word = ''; |
308 |
$self->{'ifunc'} = |
#### my @result; |
309 |
eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}})); |
#### |
310 |
} |
#### return unless exists $self->{'intervall'}; |
311 |
($first) = &{$self->{'ifunc'}}($first) if $first; |
#### |
312 |
($last) = &{$self->{'ifunc'}}($last) if $last; |
#### defined $self->{db} or $self->open; |
313 |
} |
#### $self->sync; |
314 |
$first = 'p'.($first||''); |
#### my $dbh = $self->{dbh}; # for convenience |
315 |
$last = (defined $last)?'p'.$last:'q'; |
#### |
316 |
|
#### if (ref $self->{'intervall'}) { |
317 |
# set the cursor to $first |
#### unless (exists $self->{'ifunc'}) { |
318 |
$dbh->seq($first, $value, R_CURSOR); |
#### $self->{'ifunc'} = |
319 |
|
#### eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}})); |
320 |
# $first would be after the last word |
#### } |
321 |
return () if $first gt $last; |
#### ($first) = &{$self->{'ifunc'}}($first) if $first; |
322 |
|
#### ($last) = &{$self->{'ifunc'}}($last) if $last; |
323 |
push @result, substr($first,1); |
#### } |
324 |
while (!$dbh->seq($word, $value, R_NEXT)) { |
#### $first = POSTINGLIST_P . ($first||''); |
325 |
# We should limit this to a "resonable" number of words |
#### $last = (defined $last)?POSTINGLIST_P . $last:'q'; |
326 |
last if $word gt $last; |
#### |
327 |
push @result, substr($word,1); |
#### # set the cursor to $first |
328 |
} |
#### $dbh->seq($first, $value, R_CURSOR); |
329 |
\@result; # speed |
#### |
330 |
|
#### # $first would be after the last word |
331 |
|
#### return () if $first gt $last; |
332 |
|
#### |
333 |
|
#### push @result, substr($first,1); |
334 |
|
#### while (!$dbh->seq($word, $value, R_NEXT)) { |
335 |
|
#### # We should limit this to a "resonable" number of words |
336 |
|
#### last if $word gt $last; |
337 |
|
#### push @result, substr($word,1); |
338 |
|
#### } |
339 |
|
#### \@result; # speed |
340 |
} |
} |
341 |
|
|
342 |
sub prefix { |
sub prefix { |
343 |
my ($self, $prefix) = @_; |
my ($self, $prefix) = @_; |
|
my $value = ''; |
|
|
my $word = ''; |
|
|
my @result; |
|
344 |
|
|
345 |
return () unless defined $prefix; # Full dictionary requested !! |
die "prefix not supported in this version of WAIT: need to fix the R_CURSOR"; |
|
return unless exists $self->{'prefix'}; |
|
|
defined $self->{db} or $self->open; |
|
|
$self->sync; |
|
|
my $dbh = $self->{dbh}; |
|
|
|
|
|
if (ref $self->{'prefix'}) { |
|
|
unless (exists $self->{'pfunc'}) { |
|
|
$self->{'pfunc'} = |
|
|
eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}})); |
|
|
} |
|
|
($prefix) = &{$self->{'pfunc'}}($prefix); |
|
|
} |
|
346 |
|
|
|
if ($dbh->seq($word = 'p'.$prefix, $value, R_CURSOR)) { |
|
|
return (); |
|
|
} |
|
|
return () if $word !~ /^p$prefix/; |
|
|
push @result, substr($word,1); |
|
347 |
|
|
348 |
while (!$dbh->seq($word, $value, R_NEXT)) { |
#### my $value = ''; |
349 |
# We should limit this to a "resonable" number of words |
#### my $word = ''; |
350 |
last if $word !~ /^p$prefix/; |
#### my @result; |
351 |
push @result, substr($word,1); |
#### |
352 |
} |
#### return () unless defined $prefix; # Full dictionary requested !! |
353 |
\@result; # speed |
#### return unless exists $self->{'prefix'}; |
354 |
|
#### defined $self->{db} or $self->open; |
355 |
|
#### $self->sync; |
356 |
|
#### my $dbh = $self->{dbh}; |
357 |
|
#### |
358 |
|
#### if (ref $self->{'prefix'}) { |
359 |
|
#### unless (exists $self->{'pfunc'}) { |
360 |
|
#### $self->{'pfunc'} = |
361 |
|
#### eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}})); |
362 |
|
#### } |
363 |
|
#### ($prefix) = &{$self->{'pfunc'}}($prefix); |
364 |
|
#### } |
365 |
|
#### |
366 |
|
#### if ($dbh->seq($word = POSTINGLIST_P . $prefix, $value, R_CURSOR)) { |
367 |
|
#### return (); |
368 |
|
#### } |
369 |
|
#### return () if $word !~ /^p$prefix/; |
370 |
|
#### push @result, substr($word,1); |
371 |
|
#### |
372 |
|
#### while (!$dbh->seq($word, $value, R_NEXT)) { |
373 |
|
#### # We should limit this to a "resonable" number of words |
374 |
|
#### last if $word !~ /^p$prefix/; |
375 |
|
#### push @result, substr($word,1); |
376 |
|
#### } |
377 |
|
#### \@result; # speed |
378 |
} |
} |
379 |
|
|
380 |
=head2 search($query) |
=head2 search($query) |
406 |
|
|
407 |
=cut |
=cut |
408 |
|
|
409 |
sub search { |
sub search_ref { |
410 |
my $self = shift; |
my $self = shift; |
411 |
my $query = shift; |
my $query = shift; |
412 |
|
|
413 |
|
my $debugtime = 0; |
414 |
|
my($time,$entertime); |
415 |
|
our $STARTTIME; |
416 |
|
if ($debugtime) { |
417 |
|
$time = time; |
418 |
|
$STARTTIME ||= $time; |
419 |
|
if ($time-$STARTTIME > 5) { |
420 |
|
$STARTTIME = $time; |
421 |
|
warn "STARTTIME: $STARTTIME\n"; |
422 |
|
} |
423 |
|
$entertime = time-$STARTTIME; |
424 |
|
warn sprintf "ENTER TIME: %.4f\n", $entertime; |
425 |
|
} |
426 |
defined $self->{db} or $self->open; |
defined $self->{db} or $self->open; |
427 |
$self->sync; |
$self->sync; |
428 |
$self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() there |
my $ref = $self->search_raw_ref($query, &{$self->{func}}(@_)); # No call to parse() there |
429 |
|
if ($debugtime) { |
430 |
|
my $leavetime = time-$STARTTIME; |
431 |
|
warn sprintf "LEAVE TIME: %.4f\n", $leavetime; |
432 |
|
if ($leavetime-$entertime > .4) { |
433 |
|
require Data::Dumper; |
434 |
|
print STDERR "Line " . __LINE__ . ", File: " . __FILE__ . "\n" . |
435 |
|
Data::Dumper->new([$query,\@_],[qw(query at_)])->Indent(1)->Useqq(1)->Dump; # XXX |
436 |
|
} |
437 |
|
} |
438 |
|
$ref; |
439 |
} |
} |
440 |
|
|
441 |
sub parse { |
sub parse { |
450 |
|
|
451 |
# print "search_prefix(@_)\n"; |
# print "search_prefix(@_)\n"; |
452 |
defined $self->{db} or $self->open; |
defined $self->{db} or $self->open; |
453 |
$self->search_raw(map($self->prefix($_), @_)); |
$self->search_raw_ref(map($self->prefix($_), @_)); |
454 |
} |
} |
455 |
|
|
456 |
sub _complain ($$) { |
sub _complain ($$) { |
463 |
$term,)); |
$term,)); |
464 |
} |
} |
465 |
|
|
466 |
sub search_raw { |
sub search_raw_ref { |
467 |
my $self = shift; |
my $self = shift; |
468 |
my $query = shift; |
my $query = shift; |
469 |
|
# warn "DEBUG WAIT: search_raw_ref args 2..[@_]"; |
470 |
my %score; |
my %score; |
471 |
|
|
472 |
# Top $wanted documents must be correct. Zero means all matching |
# Top $top_wanted documents must be correct. Zero means all matching documents. |
473 |
# documents. |
my $top_wanted = $query->{top}; |
474 |
my $wanted = $query->{top}; |
my $picky_strict = $query->{picky}; |
475 |
my $strict = $query->{picky}; |
# the option is really ignore_excess |
476 |
|
my $ignore_excess = $query->{ignore_excess}; |
477 |
# Return at least $minacc documents. Zero means all matching |
|
478 |
# documents. |
# Return at least $minacc documents. Zero means all matching documents. |
479 |
# my $minacc = $query->{accus} || $wanted; |
|
480 |
|
# my $minacc = $query->{accus} || $top_wanted; |
481 |
|
|
482 |
# Open index and flush cache if necessary |
# Open index and flush cache if necessary |
483 |
defined $self->{db} or $self->open; |
defined $self->{db} or $self->open; |
486 |
# We keep duplicates |
# We keep duplicates |
487 |
my @terms = |
my @terms = |
488 |
# Sort words by decreasing document frequency |
# Sort words by decreasing document frequency |
489 |
sort { $self->{db}->{'o'.$a} <=> $self->{db}->{'o'.$b} } |
sort { $self->{db}->{DOCFREQ_O . $a} <=> $self->{db}->{DOCFREQ_O . $b} } |
490 |
# check which words occur in the index. |
# check which words occur in the index. |
491 |
grep { $self->{db}->{'o'.$_} } @_; |
grep { $self->{db}->{DOCFREQ_O . $_} } @_; |
492 |
|
|
493 |
|
# warn "DEBUG WAIT: wanted[$top_wanted]terms[@terms]"; |
494 |
return unless @terms; |
return unless @terms; |
495 |
|
|
496 |
# We special-case one term queries here. If the index was sorted, |
# We special-case one term queries here. If the index was sorted, |
497 |
# choping off the rest of the list will return the same ranking. |
# choping off the rest of the list will return the same ranking. |
498 |
if ($wanted and @terms == 1) { |
if ($top_wanted and @terms == 1) { |
499 |
my $term = shift @terms; |
my $term = shift @terms; |
500 |
my $idf = log($self->{records}/$self->{db}->{'o'.$term}); |
my $idf = log($self->{records}/$self->{db}->{DOCFREQ_O . $term}); |
501 |
my @res; |
my @res; |
502 |
|
|
503 |
if ($self->{reorg}) { # or not $query->{picky} |
if ($self->{reorg}) { # or not $query->{picky} |
504 |
@res = unpack "w". int(2*$wanted), $self->{db}->{'p'.$term}; |
@res = unpack "w". int(2*$top_wanted), $self->{db}->{POSTINGLIST_P . $term}; |
505 |
|
# warn sprintf "DEBUG WAIT: scalar(\@res)[%d]", scalar(@res); |
506 |
} else { |
} else { |
507 |
@res = unpack 'w*', $self->{db}->{'p'.$term}; |
@res = unpack 'w*', $self->{db}->{POSTINGLIST_P . $term}; |
508 |
} |
} |
509 |
|
|
510 |
for (my $i=1; $i<@res; $i+=2) { |
for (my $i=1; $i<@res; $i+=2) { |
511 |
# $res[$i] /= $self->{db}->{'m'. $res[$i-1]} / $idf; |
# $res[$i] /= $self->{db}->{MAXTF_M . $res[$i-1]} / $idf; |
512 |
# above was written badly, allows two DIV_ZERO problems. |
# above was written badly, allows two DIV_ZERO problems. |
513 |
my $maxtf = $self->{db}->{"m". $res[$i-1]}; |
my $maxtf = $self->{db}->{MAXTF_M . $res[$i-1]}; |
514 |
unless ($maxtf) { |
unless ($maxtf) { |
515 |
warn "WAIT-Warning: Averting DIVZERO for i[$i] \$res[\$i-1][$res[$i-1]] term[$term]"; |
warn "WAIT-Warning: Averting DIVZERO for i[$i] \$res[\$i-1][$res[$i-1]] term[$term]"; |
516 |
$maxtf = 1; |
$maxtf = 1; |
518 |
$res[$i] = ($res[$i] / $maxtf) * $idf; |
$res[$i] = ($res[$i] / $maxtf) * $idf; |
519 |
} |
} |
520 |
|
|
521 |
return @res |
my %res = @res; # bloed: @res waere schon sortiert gewesen |
522 |
|
return \%res; |
523 |
} |
} |
524 |
|
|
525 |
# We separate exhaustive search here to avoid overhead and make the |
# We separate exhaustive search here to avoid overhead and make the |
526 |
# code more readable. The block can be removed without changing the |
# code more readable. The block can be removed without changing the |
527 |
# result. |
# result. |
528 |
unless ($wanted) { |
unless ($top_wanted) { |
529 |
for (@terms) { |
for (@terms) { |
530 |
my $df = $self->{db}->{'o'.$_}; |
my $df = $self->{db}->{DOCFREQ_O . $_}; |
531 |
|
|
532 |
# The frequency *must* be 1 at least since the posting list is nonempty |
# The frequency *must* be 1 at least since the posting list is nonempty |
533 |
_complain('search for term', $_) and $df = 1 if $df < 1; |
_complain('search for term', $_) and $df = 1 if $df < 1; |
534 |
|
|
535 |
# Unpack posting list for current query term $_ |
# Unpack posting list for current query term $_ |
536 |
my %post = unpack 'w*', $self->{db}->{'p'.$_}; |
my %post = unpack 'w*', $self->{db}->{POSTINGLIST_P . $_}; |
537 |
|
|
538 |
_complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post; |
_complain('search for term', $_) if $self->{db}->{DOCFREQ_O . $_} != keys %post; |
539 |
# This is the inverse document frequency. The log of the inverse |
# This is the inverse document frequency. The log of the inverse |
540 |
# fraction of documents the term occurs in. |
# fraction of documents the term occurs in. |
541 |
my $idf = log($self->{records}/$df); |
my $idf = log($self->{records}/$df); |
542 |
for my $did (keys %post) { |
for my $did (keys %post) { |
543 |
if (my $freq = $self->{db}->{'m'. $did}) { |
if (my $freq = $self->{db}->{MAXTF_M . $did}) { |
544 |
$score{$did} += $post{$did} / $freq * $idf; |
$score{$did} += $post{$did} / $freq * $idf; |
545 |
} |
} |
546 |
} |
} |
547 |
} |
} |
548 |
# warn sprintf "Used %d accumulators\n", scalar keys %score; |
# warn sprintf "Used %d accumulators\n", scalar keys %score; |
549 |
return %score; |
return \%score; |
550 |
} |
} |
551 |
|
|
552 |
# A sloppy but fast algorithm for multiple term queries. |
# A sloppy but fast algorithm for multiple term queries. |
553 |
unless ($strict) { |
unless ($picky_strict) { |
554 |
for (@terms) { |
for (@terms) { |
555 |
# Unpack posting list for current query term $_ |
# Unpack posting list for current query term $_ |
556 |
my %post = unpack 'w*', $self->{db}->{'p'.$_}; |
my %post; |
557 |
|
if ($self->{reorg} && $top_wanted && $ignore_excess) { |
558 |
|
%post = unpack 'w'. int(2*$ignore_excess) , $self->{db}->{POSTINGLIST_P . $_}; |
559 |
|
} else { |
560 |
|
%post = unpack 'w*', $self->{db}->{POSTINGLIST_P . $_}; |
561 |
|
} |
562 |
|
# warn sprintf "DEBUG WAIT: term[%s] keys %%post[%s]", $_, scalar keys %post; |
563 |
|
|
564 |
# Lookup the number of documents the term occurs in (document frequency) |
# Lookup the number of documents the term occurs in (document frequency) |
565 |
my $occ = $self->{db}->{'o'.$_}; |
my $occ = $self->{db}->{DOCFREQ_O . $_}; |
566 |
|
|
567 |
_complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post; |
_complain('search for term', $_) if !$ignore_excess && $occ != keys %post; |
568 |
# The frequency *must* be 1 at least since the posting list is nonempty |
# The frequency *must* be 1 at least since the posting list is nonempty |
569 |
_complain('search for term', $_) and $occ = 1 if $occ < 1; |
_complain('search for term', $_) and $occ = 1 if $occ < 1; |
570 |
|
|
571 |
# This is the inverse document frequency. The log of the inverse |
# This is the inverse document frequency. The log of the inverse fraction |
572 |
# fraction of documents the term occurs in. |
# of documents the term occurs in. |
573 |
my $idf = log($self->{records}/$occ); |
my $idf = log($self->{records}/$occ); |
574 |
|
|
575 |
# If we have a reasonable number of accumulators, change the |
# If we have a reasonable number of accumulators, change the |
585 |
# improved. The resulting ranking list must be pruned, since only |
# improved. The resulting ranking list must be pruned, since only |
586 |
# the top most documents end up near their "optimal" rank. |
# the top most documents end up near their "optimal" rank. |
587 |
|
|
588 |
if (keys %score < $wanted) { |
if (keys %score < $top_wanted) { |
589 |
|
|
590 |
|
# Diese folgende Schleife ist (WAR!) der Hammer fuer die Suche "mysql |
591 |
|
# für dummies bellomo". Sie frisst 3.1+1.7 Sekunden. |
592 |
|
|
593 |
|
# Der erste Grund ist, dass 3 Begriffe noch nicht genug gebracht haben, |
594 |
|
# aber der vierte viel zu viel bringt. Der zweite Grund ist, dass wir |
595 |
|
# so viele Lookups in $self->{db} machen. Das Rechnen hingegen ist |
596 |
|
# vermutlich billig. |
597 |
|
|
598 |
for my $did (keys %post) { |
for my $did (keys %post) { |
599 |
if (my $freq = $self->{db}->{'m'. $did}) { |
if (my $freq = $self->{db}->{MAXTF_M . $did}) { |
600 |
$score{$did} += $post{$did} / $freq * $idf; |
$score{$did} += $post{$did} / $freq * $idf; |
601 |
} |
} |
602 |
} |
} |
603 |
} else { |
} else { |
604 |
for my $did (keys %score) { |
for my $did (keys %score) { |
605 |
next unless exists $post{$did}; |
next unless exists $post{$did}; |
606 |
if (my $freq = $self->{db}->{'m'. $did}) { |
if (my $freq = $self->{db}->{MAXTF_M . $did}) { |
607 |
$score{$did} += $post{$did} / $freq * $idf; |
$score{$did} += $post{$did} / $freq * $idf; |
608 |
} |
} |
609 |
} |
} |
610 |
} |
} |
611 |
} |
} |
612 |
return %score; |
warn sprintf("DEBUG WAIT: returning from search_raw_ref at [%.3f] after terms[%s] with keys[%d]", |
613 |
|
time, |
614 |
|
join(":",@terms), |
615 |
|
scalar keys %score, |
616 |
|
); |
617 |
|
return \%score; |
618 |
} |
} |
619 |
my @max; $max[$#terms+1]=0; |
my @max; $max[$#terms+1]=0; |
620 |
my @idf; |
my @idf; |
628 |
for (my $i = $#terms; $i >=0; $i--) { |
for (my $i = $#terms; $i >=0; $i--) { |
629 |
local $_ = $terms[$i]; |
local $_ = $terms[$i]; |
630 |
# Lookup the number of documents the term occurs in (document frequency) |
# Lookup the number of documents the term occurs in (document frequency) |
631 |
my $df = $self->{db}->{'o'.$_}; |
my $df = $self->{db}->{DOCFREQ_O . $_}; |
632 |
|
|
633 |
# The frequency *must* be 1 at least since the posting list is nonempty |
# The frequency *must* be 1 at least since the posting list is nonempty |
634 |
_complain('search for term', $_) and $df = 1 if $df < 1; |
_complain('search for term', $_) and $df = 1 if $df < 1; |
639 |
|
|
640 |
my ($did,$occ); |
my ($did,$occ); |
641 |
if ($self->{reorg}) { |
if ($self->{reorg}) { |
642 |
($did,$occ) = unpack 'w2', $self->{db}->{'p'.$_}; |
($did,$occ) = unpack 'w2', $self->{db}->{POSTINGLIST_P . $_}; |
643 |
} else { # Maybe this costs more than it helps |
} else { # Maybe this costs more than it helps |
644 |
($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{'p'.$_}); |
($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{POSTINGLIST_P . $_}); |
645 |
} |
} |
646 |
my $freq = $self->{db}->{'m'. $did}; |
my $freq = $self->{db}->{MAXTF_M . $did}; |
647 |
my $max = $occ/$freq*$idf[$i]; |
my $max = $occ/$freq*$idf[$i]; |
648 |
$max[$i] = $max + $max[$i+1]; |
$max[$i] = $max + $max[$i+1]; |
649 |
} |
} |
650 |
|
|
651 |
# Main loop |
# Main loop |
652 |
for my $i (0 .. $#terms) { |
for my $i (0 .. $#terms) { |
653 |
my $term = $terms[$i]; |
my $term = $terms[$i]; |
654 |
# Unpack posting list for current query term $term. We loose the |
# Unpack posting list for current query term $term. We loose the |
655 |
# sorting order because the assignment to a hash. |
# sorting order because the assignment to a hash. |
656 |
my %post = unpack 'w*', $self->{db}->{'p'.$term}; |
my %post = unpack 'w*', $self->{db}->{POSTINGLIST_P . $term}; |
657 |
|
|
658 |
_complain('search for term', $term) |
_complain('search for term', $term) |
659 |
if $self->{db}->{'o'.$term} != keys %post; |
if $self->{db}->{DOCFREQ_O . $term} != keys %post; |
660 |
|
|
661 |
my $idf = $idf[$i]; |
my $idf = $idf[$i]; |
662 |
my $full; # Need to process all postings |
my $full; # Need to process all postings |
665 |
if (# We know that wanted is true since we special cased the |
if (# We know that wanted is true since we special cased the |
666 |
# exhaustive search. |
# exhaustive search. |
667 |
|
|
668 |
$wanted and |
$top_wanted and |
669 |
|
|
670 |
# We did sort here if necessary in |
# We did sort here if necessary in the preparation loop: |
|
# the preparation loop |
|
671 |
# $self->{reorg} and |
# $self->{reorg} and |
672 |
|
|
673 |
scalar keys %score > $wanted) { |
scalar keys %score > $top_wanted) { |
674 |
$chop = (sort { $b <=> $a } values %score)[$wanted]; |
$chop = (sort { $b <=> $a } values %score)[$top_wanted]; |
675 |
$full = $max[$i] > $chop; |
$full = $max[$i] > $chop; |
676 |
} else { |
} else { |
677 |
$full = 1; |
$full = 1; |
678 |
} |
} |
679 |
|
|
680 |
if ($full) { |
if ($full) { |
681 |
# We need to inspect the full list. Either $wanted is not given, |
# We need to inspect the full list. Either $top_wanted is not given, |
682 |
# the index is not sorted, or we don't have enough accumulators |
# the index is not sorted, or we don't have enough accumulators |
683 |
# yet. |
# yet. |
684 |
if (defined $chop) { |
if (defined $chop) { |
685 |
# We might be able to avoid allocating accumulators |
# We might be able to avoid allocating accumulators |
686 |
for my $did (keys %post) { |
for my $did (keys %post) { |
687 |
if (my $freq = $self->{db}->{'m'. $did}) { |
if (my $freq = $self->{db}->{MAXTF_M . $did}) { |
688 |
my $wgt = $post{$did} / $freq * $idf; |
my $wgt = $post{$did} / $freq * $idf; |
689 |
# We add an accumulator if $wgt exeeds $chop |
# We add an accumulator if $wgt exeeds $chop |
690 |
if (exists $score{$did} or $wgt > $chop) { |
if (exists $score{$did} or $wgt > $chop) { |
695 |
} else { |
} else { |
696 |
# Allocate acumulators for each seen document. |
# Allocate acumulators for each seen document. |
697 |
for my $did (keys %post) { |
for my $did (keys %post) { |
698 |
if (my $freq = $self->{db}->{'m'. $did}) { |
if (my $freq = $self->{db}->{MAXTF_M . $did}) { |
699 |
$score{$did} += $post{$did} / $freq * $idf; |
$score{$did} += $post{$did} / $freq * $idf; |
700 |
} |
} |
701 |
} |
} |
704 |
# Update existing accumulators |
# Update existing accumulators |
705 |
for my $did (keys %score) { |
for my $did (keys %score) { |
706 |
next unless exists $post{$did}; |
next unless exists $post{$did}; |
707 |
if (my $freq = $self->{db}->{'m'. $did}) { |
if (my $freq = $self->{db}->{MAXTF_M . $did}) { |
708 |
$score{$did} += $post{$did} / $freq * $idf; |
$score{$did} += $post{$did} / $freq * $idf; |
709 |
} |
} |
710 |
} |
} |
711 |
} |
} |
712 |
} |
} |
713 |
#warn sprintf "Used %d accumulators\n", scalar keys %score; |
#warn sprintf "Used %d accumulators\n", scalar keys %score; |
714 |
%score; |
\%score; |
715 |
} |
} |
716 |
|
|
717 |
sub set { |
sub set { |
727 |
|
|
728 |
$self->sync; |
$self->sync; |
729 |
while (my($key, $value) = each %{$self->{db}}) { |
while (my($key, $value) = each %{$self->{db}}) { |
730 |
next if $key !~ /^p/; |
next if $key !~ /^p/; # some day use PMATCH |
731 |
$self->{db}->{$key} = $self->sort_postings($value); |
$self->{db}{$key} = $self->sort_postings($value); |
732 |
} |
} |
733 |
$self->{reorg} = 1; |
$self->{reorg} = 1; |
734 |
} |
} |
735 |
|
|
736 |
sub sync { |
sub sync { |
737 |
my $self = shift; |
my $self = shift; |
738 |
|
return unless $self->{mode} & O_RDWR; |
739 |
if ($self->{mode} & O_RDWR) { |
Carp::carp(sprintf "[%s] Flushing %d postings", scalar(localtime), $self->{cached}) |
740 |
print STDERR "Flushing $self->{cached} postings\n" if $self->{cached}; |
if $self->{cached}; |
741 |
while (my($key, $value) = each %{$self->{cache}}) { |
while (my($key, $value) = each %{$self->{cache}}) { |
742 |
$self->{db}->{"p". $key} ||= ""; |
$self->{db}{POSTINGLIST_P . $key} ||= ""; |
743 |
if ($self->{reorg}) { |
if ($self->{reorg}) { |
744 |
$self->{db}->{'p'.$key} = $self->sort_postings($self->{db}->{'p'.$key} |
$self->{db}->{POSTINGLIST_P . $key} = |
745 |
. $value); |
$self->sort_postings($self->{db}->{POSTINGLIST_P . $key} |
746 |
} else { |
. $value); |
747 |
$self->{db}->{'p'.$key} .= $value; |
} else { |
748 |
} |
$self->{db}->{POSTINGLIST_P . $key} .= $value; |
|
} |
|
|
while (my($key, $value) = each %{$self->{cdict}}) { |
|
|
$self->{db}->{'o'.$key} = 0 unless $self->{db}->{'o'.$key}; |
|
|
$self->{db}->{'o'.$key} += $value; |
|
749 |
} |
} |
|
$self->{cache} = {}; |
|
|
$self->{cdict} = {}; |
|
|
$self->{cached} = 0; |
|
750 |
} |
} |
751 |
|
while (my($key, $value) = each %{$self->{cdict}}) { |
752 |
|
$self->{db}->{DOCFREQ_O . $key} = 0 unless $self->{db}->{DOCFREQ_O . $key}; |
753 |
|
$self->{db}->{DOCFREQ_O . $key} += $value; |
754 |
|
} |
755 |
|
$self->{cache} = {}; |
756 |
|
$self->{cdict} = {}; |
757 |
|
$self->{cached} = 0; |
758 |
} |
} |
759 |
|
|
760 |
sub close { |
sub close { |
761 |
my $self = shift; |
my $self = shift; |
762 |
|
|
763 |
|
delete $self->{env}; |
764 |
if ($self->{dbh}) { |
if ($self->{dbh}) { |
765 |
$self->sync; |
$self->sync; |
766 |
delete $self->{dbh}; |
delete $self->{dbh}; |
767 |
untie %{$self->{db}}; |
untie %{$self->{db}}; |
768 |
delete $self->{db}; |
for my $att (qw(db func cache cached cdict file maindbfile)) { |
769 |
delete $self->{func}; |
delete $self->{$att}; |
770 |
delete $self->{cache}; |
} |
771 |
delete $self->{cached}; |
for my $att (qw(pfunc ifunc xfunc)) { |
772 |
delete $self->{cdict}; |
delete $self->{$att} if defined $self->{$att}; |
773 |
delete $self->{pfunc} if defined $self->{pfunc}; |
} |
|
delete $self->{ifunc} if defined $self->{ifunc}; |
|
|
delete $self->{xfunc} if defined $self->{xfunc}; |
|
774 |
} |
} |
775 |
} |
} |
776 |
|
|