This is a repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!

Contents of /trunk/lib/WAIT/InvertedIndex.pm



Revision 116
Wed Jul 14 09:48:26 2004 UTC by dpavlin
File size: 25238 byte(s)
more fixes, more debug

1 # -*- Mode: cperl; fill-column: 79 -*-
2 # $Basename: InvertedIndex.pm $
3 # $Revision: 1.30 $
4 # Author : Ulrich Pfeifer
5 # Created On : Thu Aug 8 13:05:10 1996
6 # Last Modified By: Ulrich Pfeifer
7 # Last Modified On: Mon Apr 22 16:52:01 2002
8 # Language : CPerl
9 #
10 # (C) Copyright 1996-2002, Ulrich Pfeifer
11 #
12
13 package WAIT::InvertedIndex;
14 use strict;
15 use BerkeleyDB;
16 use Fcntl;
17 use WAIT::Filter;
18 use Carp;
19 use vars qw(%FUNC $VERSION);
20 use Time::HiRes qw(time);
21
22 $VERSION = "2.000"; # others test if we are loaded by checking $VERSION
23
24 use constant DOCFREQ_O => "o";
25 use constant MAXTF_M => "m";
26 use constant POSTINGLIST_P => "p";
27 use constant PMATCH => qr/^(??{POSTINGLIST_P()})/;
28
29 # The dictionary has three different key types:
30
31 # 'o'.$word
32 #
33 # The document frequency is the number of documents a term occurs
34 # in. The idea is that a term occurring in a significant portion of the
35 # documents is not too significant.
36 #
37 # 'm'.$word
38 #
39 # The maximum term frequency of a document is the frequency of the
40 # most frequent term in the document. It is related to the document
41 # length obviously. A document in which the most frequent term occurs
42 # 100 times is probably much longer than a document whose most
43 # frequent term occurs five times.
44 #
45 # 'p'.$word
46 #
47 # Under this key we store the actual posting list as pairs of
48 # packed integers.
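#
# For illustration, a rough sketch of what the dictionary holds for a
# term "foo" that occurs twice in document 7 and once in document 12,
# assuming "foo" is also the most frequent term of document 7 (%db
# stands here for the tied dictionary hash):
#
#   $db{'o' . 'foo'} = 2;                     # document frequency
#   $db{'m' . 7}     = 2;                     # max term frequency of doc 7
#   $db{'p' . 'foo'} = pack('w2', 7, 2)       # (docid, tf) pairs stored as
#                    . pack('w2', 12, 1);     # BER-compressed integers
#
#   my %post = unpack 'w*', $db{'p' . 'foo'}; # => (7 => 2, 12 => 1)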
49
50 sub new {
51 my $type = shift;
52 my %parm = @_;
53 my $self = {};
54
55 for my $x (qw(path attr subname env maindbfile tablename)) {
56 $self->{$x} = $parm{$x} or confess "No $x specified";
57 }
58
59 $self->{filter} = $parm{filter};
60 $self->{'name'} = $parm{'name'};
61 $self->{records} = 0;
62 for (qw(intervall prefix)) {
63 if (exists $parm{$_}) {
64 if (ref $parm{$_}) {
65 $self->{$_} = [@{$parm{$_}}] # clone
66 } else {
67 $self->{$_} = $parm{$_}
68 }
69 }
70 }
71 bless $self, ref($type) || $type;
72 }
73
74 for my $accessor (qw(name maindbfile tablename subname)) {
75 no strict 'refs';
76 *{$accessor} = sub {
77 my($self) = @_;
78 return $self->{$accessor} if $self->{$accessor};
79 require Carp;
80 Carp::confess("accessor $accessor not there");
81 }
82 }
83
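# _split_pos takes a [$text, $offset] pair and splits $text on whitespace,
# returning one [$word, $position] pair per word, where the position is
# the offset of the word within the original text. For example:
#
#   _split_pos(["foo bar", 0]);   # => (['foo', 0], ['bar', 4])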
84 sub _split_pos {
85 my ($text, $pos) = @{$_[0]};
86 my @result;
87
88 $text =~ s/(^\s+)// and $pos += length($1);
89 while ($text =~ s/(^\S+)//) {
90 my $word = $1;
91 push @result, [$word, $pos];
92 $pos += length($word);
93 $text =~ s/(^\s+)// and $pos += length($1);
94 }
95 @result;
96 }
97
98 sub _xfiltergen {
99 my $filter = pop @_;
100
101 # Oops, we cannot overrule the user's choice. Other filters may mangle
102 # stopwords; the isotr filter, for example, clobbers "isn't" to "isnt".
103
104 # if ($filter eq 'stop') { # avoid the slow stopword elimination
105 # return _xfiltergen(@_); # it's cheaper to look them up afterwards
106 # }
107 if (@_) {
108 if ($filter =~ /^split(\d*)/) {
109 if ($1) {
110 "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .'))' ;
111 } else {
112 "map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .')' ;
113 }
114 } else {
115 "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]]," ._xfiltergen(@_) .')';
116 }
117 } else {
118 if ($filter =~ /^split(\d*)/) {
119 if ($1) {
120 "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0]))" ;
121 } else {
122 "map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0])" ;
123 }
124 } else {
125 "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]], [\$_[0], 0])";
126 }
127 }
128 }
129
130 sub parse_pos {
131 my $self = shift;
132
133 unless (exists $self->{xfunc}) {
134 $self->{xfunc} =
135 eval sprintf("sub {%s}", _xfiltergen(@{$self->{filter}}));
136 #printf "\nsub{%s}$@\n", _xfiltergen(@{$self->{filter}});
137 }
138 &{$self->{xfunc}}($_[0]);
139 }
140
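# _filtergen builds the body of the tokenizer/normalizer sub from the
# configured filter chain. As a sketch, for a hypothetical chain
# qw(isotr isolc) it would generate something like
#
#   map(&WAIT::Filter::isolc($_), map(&WAIT::Filter::isotr($_), @_))
#
# which open() then wraps as sub {grep /./, ...}, so the filters are
# applied left to right to each argument.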
141 sub _filtergen {
142 my $filter = pop @_;
143
144 if (@_) {
145 "map(&WAIT::Filter::$filter(\$_), " . _filtergen(@_) . ')';
146 } else {
147 "map(&WAIT::Filter::$filter(\$_), \@_)";
148 }
149 }
150
151 sub drop {
152 my $self = shift;
153 if ((caller)[0] eq 'WAIT::Table') { # Table knows about this
154 my $path = $self->{path};
155
156 # ! (!-e $path or unlink $path);
157 warn "DEBUG: fix drop index!";
158 } else { # notify our database
159 confess ref($self)."::drop called directly";
160 }
161 }
162
163 sub open {
164 my $self = shift;
165 my $path = $self->{path};
166
167 if (defined $self->{dbh}) {
168 $self->{dbh};
169 } else {
170 $self->{func} =
171 eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
172 my $flags;
173 if ($self->{mode} & O_RDWR) {
174 $flags = DB_CREATE; # | DB_INIT_MPOOL | DB_PRIVATE | DB_INIT_CDB;
175 #warn "DEBUG: Flags on inverted $path set to 'writing'\n";
176 } else {
177 $flags = DB_RDONLY;
178 #warn "DEBUG: Flags on inverted $path set to 'readonly'\n";
179 }
180 my $filename = $self->maindbfile or die;
181 my $subname = join("/",$self->tablename || die,$self->subname || die);
182 my $env = $self->{env} || "[undef]";
183 $self->{dbh} = tie(%{$self->{db}}, 'BerkeleyDB::Btree',
184 # Filename => $path,
185 Filename => $filename,
186 $self->{env} ? (Env => $self->{env}) : (),
187 Subname => $subname,
188 Mode => 0664,
189 Flags => $flags,
190 $WAIT::Database::Cachesize?(Cachesize => $WAIT::Database::Cachesize):(),
191 $WAIT::Database::Pagesize?(Pagesize => $WAIT::Database::Pagesize):(),
192 ) or die "Couldn't tie: $BerkeleyDB::Error; filename=>'$filename', env=>'$env',subname=>'$subname',flags=>'$flags'";
193 $self->{cache} = {}
194 if $self->{mode} & O_RDWR;
195 $self->{cdict} = {}
196 if $self->{mode} & O_RDWR;
197 $self->{cached} = 0;
198 }
199 }
200
201 sub insert {
202 my $self = shift;
203 my $key = shift;
204 my %occ;
205
206 defined $self->{db} or $self->open;
207 defined $self->{db} or die "open didn't help!!!";
208 grep $occ{$_}++, &{$self->{func}}(@_);
209 my ($word, $noc);
210 $self->{records}++;
211 while (($word, $noc) = each %occ) {
212 if (defined $self->{cache}->{$word}) {
213 $self->{cdict}->{$word}++;
214 $self->{cache}->{$word} .= pack 'w2', $key, $noc;
215 } else {
216 $self->{cdict}->{$word} = 1;
217 $self->{cache}->{$word} = pack 'w2', $key, $noc;
218 }
219 $self->{cached}++;
220 }
221 # This cache limit should be configurable
222 $self->sync if $self->{cached} > 100_000;
223 my $maxtf = 0;
224 for (values %occ) {
225 $maxtf = $_ if $_ > $maxtf;
226 }
227 $self->{db}->{MAXTF_M . $key} = $maxtf;
228 }
229
230 # We sort postings by increasing max term frequency (~ by increasing
231 # document length). This reduces the quality degradation if we process
232 # only the first part of a posting list.
233
234 sub sort_postings {
235 my $self = shift;
236 my $post = shift; # reference to a hash or packed string
237
238 if (ref $post) {
239 # we skip the sort part, if the index is not sorted
240 return pack('w*', %$post) unless $self->{reorg};
241 } else {
242 $post = { unpack 'w*', $post };
243 }
244
245 my $r = '';
246
247 # Sort posting list by increasing ratio of maximum term frequency (~
248 # "document length") and term frequency. This ratio multipied by the
249 # inverse document frequence gives the score for a term. This sort
250 # order can be exploited for tuning of single term queries.
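#
# In other words, the contribution of term t to document d's score is
# (log() being the natural logarithm, as in Perl):
#
#   score(d, t) = tf(d, t) / maxtf(d) * log(N / df(t))
#
# e.g. a term occurring 4 times in a document whose most frequent term
# occurs 20 times, and appearing in 10 of 1000 documents, scores
# 4/20 * log(1000/10) ~= 0.92.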
251
252 for my $did (keys %$post) { # sanity check
253 unless ($self->{db}->{MAXTF_M . $did}) {
254 warn "WAIT Warning: DIVZERO threat from did[$did]post[$post]post{did}[$post->{$did}]";
255 $self->{db}->{MAXTF_M . $did} = 1; # fails if we have not opened for writing
256 }
257 }
258 for my $did (sort { $post->{$b} / $self->{db}->{MAXTF_M . $b}
259 <=>
260 $post->{$a} / $self->{db}->{MAXTF_M . $a}
261 } keys %$post) {
262 $r .= pack 'w2', $did, $post->{$did};
263 }
264 #warn sprintf "reorg %d %s\n", scalar keys %$post, join ' ', unpack 'w*', $r;
265 $r;
266 }
267
268 sub delete {
269 my $self = shift;
270 my $key = shift;
271 my %occ;
272
273 my $db;
274 defined $self->{db} or $self->open;
275 $db = $self->{db};
276 $self->sync;
277 $self->{records}--;
278
279 # less than zero documents in database?
280 _complain('delete of document', $key) and $self->{records} = 0
281 if $self->{records} < 0;
282
283 grep $occ{$_}++, &{$self->{func}}(@_);
284
285 # Be prepared for "Odd number of elements in hash assignment"
286 local $SIG{__WARN__} = sub {
287 my $warning = shift;
288 chomp $warning;
289 warn "Catching warning[$warning] during delete of key[$key]";
290 };
291 for (keys %occ) {# may reorder posting list
292 my %post = unpack 'w*', $db->{POSTINGLIST_P . $_};
293 delete $post{$key};
294 $db->{POSTINGLIST_P . $_} = $self->sort_postings(\%post);
295 _complain('delete of term', $_) if $db->{DOCFREQ_O . $_}-1 != keys %post;
296 $db->{DOCFREQ_O . $_} = scalar keys %post;
297 }
298 delete $db->{MAXTF_M . $key};
299 }
300
301 sub intervall {
302 my ($self, $first, $last) = @_;
303
304 die "intervall broken in this version of WAIT: need to fix the
305 R_CURSOR and R_NEXT lines";
306
307 #### my $value = '';
308 #### my $word = '';
309 #### my @result;
310 ####
311 #### return unless exists $self->{'intervall'};
312 ####
313 #### defined $self->{db} or $self->open;
314 #### $self->sync;
315 #### my $dbh = $self->{dbh}; # for convenience
316 ####
317 #### if (ref $self->{'intervall'}) {
318 #### unless (exists $self->{'ifunc'}) {
319 #### $self->{'ifunc'} =
320 #### eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}}));
321 #### }
322 #### ($first) = &{$self->{'ifunc'}}($first) if $first;
323 #### ($last) = &{$self->{'ifunc'}}($last) if $last;
324 #### }
325 #### $first = POSTINGLIST_P . ($first||'');
326 #### $last = (defined $last)?POSTINGLIST_P . $last:'q';
327 ####
328 #### # set the cursor to $first
329 #### $dbh->seq($first, $value, R_CURSOR);
330 ####
331 #### # $first would be after the last word
332 #### return () if $first gt $last;
333 ####
334 #### push @result, substr($first,1);
335 #### while (!$dbh->seq($word, $value, R_NEXT)) {
336 #### # We should limit this to a "reasonable" number of words
337 #### last if $word gt $last;
338 #### push @result, substr($word,1);
339 #### }
340 #### \@result; # speed
341 }
342
343 sub prefix {
344 my ($self, $prefix) = @_;
345
346 die "prefix not supported in this version of WAIT: need to fix the R_CURSOR";
347
348
349 #### my $value = '';
350 #### my $word = '';
351 #### my @result;
352 ####
353 #### return () unless defined $prefix; # Full dictionary requested !!
354 #### return unless exists $self->{'prefix'};
355 #### defined $self->{db} or $self->open;
356 #### $self->sync;
357 #### my $dbh = $self->{dbh};
358 ####
359 #### if (ref $self->{'prefix'}) {
360 #### unless (exists $self->{'pfunc'}) {
361 #### $self->{'pfunc'} =
362 #### eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}}));
363 #### }
364 #### ($prefix) = &{$self->{'pfunc'}}($prefix);
365 #### }
366 ####
367 #### if ($dbh->seq($word = POSTINGLIST_P . $prefix, $value, R_CURSOR)) {
368 #### return ();
369 #### }
370 #### return () if $word !~ /^p$prefix/;
371 #### push @result, substr($word,1);
372 ####
373 #### while (!$dbh->seq($word, $value, R_NEXT)) {
374 #### # We should limit this to a "reasonable" number of words
375 #### last if $word !~ /^p$prefix/;
376 #### push @result, substr($word,1);
377 #### }
378 #### \@result; # speed
379 }
380
381 =head2 search($query)
382
383 The search method supports a range of search algorithms. It is
384 recommended to tune the index by calling
385 C<$table-E<gt>set(top=E<gt>1)> B<after> bulk inserting the documents
386 into the table. This is a compute-intensive operation, and all inserts
387 and deletes after this optimization are slightly more expensive. Once
388 reorganized, the index is kept sorted automatically until you switch
389 the optimization off by calling C<$table-E<gt>set(top=E<gt>0)>.
390
391 When searching a tuned index, a query can be processed faster if the
392 caller requests only the topmost documents. This can be done by
393 passing a C<top =E<gt>> I<n> parameter to the search method.
394
395 For single term queries, the method returns only the I<n> top ranking
396 documents. For multi term queries two optimized algorithms are
397 available. The first algorithm computes the top I<n> documents
398 approximately but very fast, sacrificing a little bit of precision for
399 speed. The second algorithm computes the topmost I<n> documents
400 precisely. This algorithm is slower and should be used only for small
401 values of I<n>. It can be requested by passing the query attribute
402 C<picky =E<gt> 1>. Both algorithms may return more than I<n> hits.
403 While the picky version might not be faster than the brute force
404 version on average for modest-size databases, it uses less memory and
405 its processing time is almost linear in the number of query terms, not
406 in the size of the posting lists.
407
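A minimal sketch of the low-level interface (here C<$idx> is assumed to
be an opened index object and C<@terms> to contain words already run
through the index filters; applications normally go through
C<WAIT::Table> instead):

  my $scores = $idx->search_raw_ref({ top => 10 }, @terms);
  my $picky  = $idx->search_raw_ref({ top => 10, picky => 1 }, @terms);
  # each call returns a hash ref mapping document id => accumulated score
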
408 =cut
409
410 sub search_ref {
411 my $self = shift;
412 my $query = shift;
413
414 my $debugtime = 0;
415 my($time,$entertime);
416 our $STARTTIME;
417 if ($debugtime) {
418 $time = time;
419 $STARTTIME ||= $time;
420 if ($time-$STARTTIME > 5) {
421 $STARTTIME = $time;
422 warn "STARTTIME: $STARTTIME\n";
423 }
424 $entertime = time-$STARTTIME;
425 warn sprintf "ENTER TIME: %.4f\n", $entertime;
426 }
427 defined $self->{db} or $self->open;
428 $self->sync;
429 my $ref = $self->search_raw_ref($query, &{$self->{func}}(@_)); # No call to parse() there
430 if ($debugtime) {
431 my $leavetime = time-$STARTTIME;
432 warn sprintf "LEAVE TIME: %.4f\n", $leavetime;
433 if ($leavetime-$entertime > .4) {
434 require Data::Dumper;
435 print STDERR "Line " . __LINE__ . ", File: " . __FILE__ . "\n" .
436 Data::Dumper->new([$query,\@_],[qw(query at_)])->Indent(1)->Useqq(1)->Dump; # XXX
437 }
438 }
439 $ref;
440 }
441
442 sub parse {
443 my $self = shift;
444
445 defined $self->{db} or $self->open;
446 &{$self->{func}}(@_);
447 }
448
449 sub search_prefix {
450 my $self = shift;
451
452 # print "search_prefix(@_)\n";
453 defined $self->{db} or $self->open;
454 $self->search_raw_ref(map($self->prefix($_), @_));
455 }
456
457 sub _complain ($$) {
458 my ($action, $term) = @_;
459
460 require Carp;
461 Carp::cluck
462 (sprintf("WAIT database inconsistency during $action [%s]: ".
463 "Please rebuild index\n",
464 $term,));
465 }
466
467 sub search_raw_ref {
468 my $self = shift;
469 my $query = shift;
470 # warn "DEBUG WAIT: search_raw_ref args 2..[@_]";
471 my %score;
472
473 # Top $top_wanted documents must be correct. Zero means all matching documents.
474 my $top_wanted = $query->{top};
475 my $picky_strict = $query->{picky};
476 # the option is really ignore_excess
477 my $ignore_excess = $query->{ignore_excess};
478
479 # Return at least $minacc documents. Zero means all matching documents.
480
481 # my $minacc = $query->{accus} || $top_wanted;
482
483 # Open index and flush cache if necessary
484 defined $self->{db} or $self->open;
485 $self->sync;
486
487 # We keep duplicates
488 my @terms =
489 # Sort words by decreasing document frequency
490 sort { $self->{db}->{DOCFREQ_O . $a} <=> $self->{db}->{DOCFREQ_O . $b} }
491 # check which words occur in the index.
492 grep { $self->{db}->{DOCFREQ_O . $_} } @_;
493
494 # warn "DEBUG WAIT: wanted[$top_wanted]terms[@terms]";
495 return unless @terms;
496
497 # We special-case one-term queries here. If the index was sorted,
498 # chopping off the rest of the list will return the same ranking.
499 if ($top_wanted and @terms == 1) {
500 my $term = shift @terms;
501 my $idf = log($self->{records}/$self->{db}->{DOCFREQ_O . $term});
502 my @res;
503
504 if ($self->{reorg}) { # or not $query->{picky}
505 @res = unpack "w". int(2*$top_wanted), $self->{db}->{POSTINGLIST_P . $term};
506 # warn sprintf "DEBUG WAIT: scalar(\@res)[%d]", scalar(@res);
507 } else {
508 @res = unpack 'w*', $self->{db}->{POSTINGLIST_P . $term};
509 }
510
511 for (my $i=1; $i<@res; $i+=2) {
512 # $res[$i] /= $self->{db}->{MAXTF_M . $res[$i-1]} / $idf;
513 # above was written badly, allows two DIV_ZERO problems.
514 my $maxtf = $self->{db}->{MAXTF_M . $res[$i-1]};
515 unless ($maxtf) {
516 warn "WAIT-Warning: Averting DIVZERO for i[$i] \$res[\$i-1][$res[$i-1]] term[$term]";
517 $maxtf = 1;
518 }
519 $res[$i] = ($res[$i] / $maxtf) * $idf;
520 }
521
522 my %res = @res; # silly: @res was already sorted anyway
523 return \%res;
524 }
525
526 # We separate exhaustive search here to avoid overhead and make the
527 # code more readable. The block can be removed without changing the
528 # result.
529 unless ($top_wanted) {
530 for (@terms) {
531 my $df = $self->{db}->{DOCFREQ_O . $_};
532
533 # The frequency *must* be at least 1 since the posting list is nonempty
534 _complain('search for term', $_) and $df = 1 if $df < 1;
535
536 # Unpack posting list for current query term $_
537 my %post = unpack 'w*', $self->{db}->{POSTINGLIST_P . $_};
538
539 _complain('search for term', $_) if $self->{db}->{DOCFREQ_O . $_} != keys %post;
540 # This is the inverse document frequency. The log of the inverse
541 # fraction of documents the term occurs in.
542 my $idf = log($self->{records}/$df);
543 for my $did (keys %post) {
544 if (my $freq = $self->{db}->{MAXTF_M . $did}) {
545 $score{$did} += $post{$did} / $freq * $idf;
546 }
547 }
548 }
549 # warn sprintf "Used %d accumulators\n", scalar keys %score;
550 return \%score;
551 }
552
553 # A sloppy but fast algorithm for multiple term queries.
554 unless ($picky_strict) {
555 for (@terms) {
556 # Unpack posting list for current query term $_
557 my %post;
558 if ($self->{reorg} && $top_wanted && $ignore_excess) {
559 %post = unpack 'w'. int(2*$ignore_excess) , $self->{db}->{POSTINGLIST_P . $_};
560 } else {
561 %post = unpack 'w*', $self->{db}->{POSTINGLIST_P . $_};
562 }
563 # warn sprintf "DEBUG WAIT: term[%s] keys %%post[%s]", $_, scalar keys %post;
564
565 # Lookup the number of documents the term occurs in (document frequency)
566 my $occ = $self->{db}->{DOCFREQ_O . $_};
567
568 _complain('search for term', $_) if !$ignore_excess && $occ != keys %post;
569 # The frequency *must* be at least 1 since the posting list is nonempty
570 _complain('search for term', $_) and $occ = 1 if $occ < 1;
571
572 # This is the inverse document frequency. The log of the inverse fraction
573 # of documents the term occurs in.
574 my $idf = log($self->{records}/$occ);
575
576 # If we have a reasonable number of accumulators, change the
577 # loop to iterate over the accumulators. This will compromise
578 # quality for better speed. The algorithm still computes the
579 # exact weights, but the result is not guaranteed to contain the
580 # *best* results. The database might contain documents better
581 # than the worst returned document.
582
583 # We process the lists in order of increasing length. When the
584 # number of accumulators exceeds $wanted, no new documents are
585 # added, only the ranking/weighting of the seen documents is
586 # improved. The resulting ranking list must be pruned, since only
587 # the top most documents end up near their "optimal" rank.
588
589 if (keys %score < $top_wanted) {
590
591 # The following loop was (WAS!) the killer for the query "mysql
592 # für dummies bellomo". It eats 3.1+1.7 seconds.
593
594 # The first reason is that 3 terms did not yet yield enough, but the
595 # fourth yields far too much. The second reason is that we do so
596 # many lookups in $self->{db}. The arithmetic itself is probably
597 # cheap.
598
599 for my $did (keys %post) {
600 if (my $freq = $self->{db}->{MAXTF_M . $did}) {
601 $score{$did} += $post{$did} / $freq * $idf;
602 }
603 }
604 } else {
605 for my $did (keys %score) {
606 next unless exists $post{$did};
607 if (my $freq = $self->{db}->{MAXTF_M . $did}) {
608 $score{$did} += $post{$did} / $freq * $idf;
609 }
610 }
611 }
612 }
613 warn sprintf("DEBUG WAIT: returning from search_raw_ref at [%.3f] after terms[%s] with keys[%d]",
614 time,
615 join(":",@terms),
616 scalar keys %score,
617 );
618 return \%score;
619 }
620 my @max; $max[$#terms+1]=0;
621 my @idf;
622
623 # Preparation loop. This extra loop makes sense only when "reorg"
624 # and "wanted" are true. But for the time being, keeping the code
625 # for the different search algorithms in one place seems more
626 # desirable than some minor speedup of the brute force version. We
627 # do cache $idf though.
628
629 for (my $i = $#terms; $i >=0; $i--) {
630 local $_ = $terms[$i];
631 # Lookup the number of documents the term occurs in (document frequency)
632 my $df = $self->{db}->{DOCFREQ_O . $_};
633
634 # The frequency *must* be at least 1 since the posting list is nonempty
635 _complain('search for term', $_) and $df = 1 if $df < 1;
636
637 # This is the inverse document frequency. The log of the inverse
638 # fraction of documents the term occurs in.
639 $idf[$i] = log($self->{records}/$df);
640
641 my ($did,$occ);
642 if ($self->{reorg}) {
643 ($did,$occ) = unpack 'w2', $self->{db}->{POSTINGLIST_P . $_};
644 } else { # Maybe this costs more than it helps
645 ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{POSTINGLIST_P . $_});
646 }
647 my $freq = $self->{db}->{MAXTF_M . $did};
648 my $max = $occ/$freq*$idf[$i];
649 $max[$i] = $max + $max[$i+1];
650 }
651
652 # Main loop
653 for my $i (0 .. $#terms) {
654 my $term = $terms[$i];
655 # Unpack posting list for current query term $term. We lose the
656 # sorting order because of the assignment to a hash.
657 my %post = unpack 'w*', $self->{db}->{POSTINGLIST_P . $term};
658
659 _complain('search for term', $term)
660 if $self->{db}->{DOCFREQ_O . $term} != keys %post;
661
662 my $idf = $idf[$i];
663 my $full; # Need to process all postings
664 my $chop; # Score necessary to enter the ranking list
665
666 if (# We know that wanted is true since we special-cased the
667 # exhaustive search.
668
669 $top_wanted and
670
671 # We did sort here if necessary in the preparation loop:
672 # $self->{reorg} and
673
674 scalar keys %score > $top_wanted) {
675 $chop = (sort { $b <=> $a } values %score)[$top_wanted];
676 $full = $max[$i] > $chop;
677 } else {
678 $full = 1;
679 }
680
681 if ($full) {
682 # We need to inspect the full list. Either $top_wanted is not given,
683 # the index is not sorted, or we don't have enough accumulators
684 # yet.
685 if (defined $chop) {
686 # We might be able to avoid allocating accumulators
687 for my $did (keys %post) {
688 if (my $freq = $self->{db}->{MAXTF_M . $did}) {
689 my $wgt = $post{$did} / $freq * $idf;
690 # We add an accumulator if $wgt exceeds $chop
691 if (exists $score{$did} or $wgt > $chop) {
692 $score{$did} += $wgt;
693 }
694 }
695 }
696 } else {
697 # Allocate accumulators for each seen document.
698 for my $did (keys %post) {
699 if (my $freq = $self->{db}->{MAXTF_M . $did}) {
700 $score{$did} += $post{$did} / $freq * $idf;
701 }
702 }
703 }
704 } else {
705 # Update existing accumulators
706 for my $did (keys %score) {
707 next unless exists $post{$did};
708 if (my $freq = $self->{db}->{MAXTF_M . $did}) {
709 $score{$did} += $post{$did} / $freq * $idf;
710 }
711 }
712 }
713 }
714 #warn sprintf "Used %d accumulators\n", scalar keys %score;
715 \%score;
716 }
717
718 sub set {
719 my ($self, $attr, $value) = @_;
720
721 die "No such index attribute: '$attr'" unless $attr eq 'top';
722
723 return delete $self->{reorg} if $value == 0;
724
725 return if $self->{reorg}; # we are sorted already
726 return unless $self->{mode} & O_RDWR;
727 defined $self->{db} or $self->open;
728
729 $self->sync;
730 while (my($key, $value) = each %{$self->{db}}) {
731 next if $key !~ /^p/; # some day use PMATCH
732 $self->{db}{$key} = $self->sort_postings($value);
733 }
734 $self->{reorg} = 1;
735 }
736
737 sub sync {
738 my $self = shift;
739 return unless $self->{mode} & O_RDWR;
740 Carp::carp(sprintf "[%s] Flushing %d postings", scalar(localtime), $self->{cached})
741 if $self->{cached};
742 while (my($key, $value) = each %{$self->{cache}}) {
743 $self->{db}{POSTINGLIST_P . $key} ||= "";
744 if ($self->{reorg}) {
745 $self->{db}->{POSTINGLIST_P . $key} =
746 $self->sort_postings($self->{db}->{POSTINGLIST_P . $key}
747 . $value);
748 } else {
749 $self->{db}->{POSTINGLIST_P . $key} .= $value;
750 }
751 }
752 while (my($key, $value) = each %{$self->{cdict}}) {
753 $self->{db}->{DOCFREQ_O . $key} = 0 unless $self->{db}->{DOCFREQ_O . $key};
754 $self->{db}->{DOCFREQ_O . $key} += $value;
755 }
756 $self->{cache} = {};
757 $self->{cdict} = {};
758 $self->{cached} = 0;
759 }
760
761 sub close {
762 my $self = shift;
763
764 delete $self->{env};
765 if ($self->{dbh}) {
766 $self->sync;
767 delete $self->{dbh};
768 untie %{$self->{db}};
769 for my $att (qw(db func cache cached cdict path maindbfile)) {
770 delete $self->{$att};
771 }
772 for my $att (qw(pfunc ifunc xfunc)) {
773 delete $self->{$att} if defined $self->{$att};
774 }
775 }
776 }
777
778 sub keys {
779 my $self = shift;
780
781 defined $self->{db} or $self->open;
782 keys %{$self->{db}};
783 }
784
785 1;
786

Properties

cvs2svn:cvs-rev: 1.12
