/[wait]/trunk/lib/WAIT/InvertedIndex.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WAIT/InvertedIndex.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 116 - (hide annotations)
Wed Jul 14 09:48:26 2004 UTC (19 years, 10 months ago) by dpavlin
File size: 25238 byte(s)
more fixes, more debug

1 dpavlin 108 # -*- Mode: cperl; fill-column: 79 -*-
2 ulpfr 19 # $Basename: InvertedIndex.pm $
3     # $Revision: 1.30 $
4 ulpfr 10 # Author : Ulrich Pfeifer
5     # Created On : Thu Aug 8 13:05:10 1996
6     # Last Modified By: Ulrich Pfeifer
7 dpavlin 89 # Last Modified On: Mon Apr 22 16:52:01 2002
8 ulpfr 10 # Language : CPerl
9 ulpfr 19 #
10 ulpfr 80 # (C) Copyright 1996-2002, Ulrich Pfeifer
11 ulpfr 19 #
12 ulpfr 10
13     package WAIT::InvertedIndex;
14     use strict;
15 dpavlin 108 use BerkeleyDB;
16 ulpfr 10 use Fcntl;
17     use WAIT::Filter;
18     use Carp;
19 laperla 30 use vars qw(%FUNC $VERSION);
20 dpavlin 108 use Time::HiRes qw(time);
21 ulpfr 10
22 dpavlin 108 $VERSION = "2.000"; # others test if we are loaded by checking $VERSION
23 laperla 30
24 dpavlin 108 use constant DOCFREQ_O => "o";
25     use constant MAXTF_M => "m";
26     use constant POSTINGLIST_P => "p";
27     use constant PMATCH => qr/^(??{POSTINGLIST_P()})/;
28    
29 ulpfr 22 # The dictionary has three different key types:
30 dpavlin 108
31 ulpfr 22 # 'o'.$word
32     #
33     # The document frequency is the number of documents a term occurs
34 ulpfr 80 # in. The idea is that a term occuring in a significant portion of the
35 ulpfr 22 # documents is not too significant.
36     #
37     # 'm'.$word
38     #
39     # The maximum term frequency of a document is the frequency of the
40     # most frequent term in the document. It is related to the document
41 laperla 71 # length obviously. A document in which the most frequent term occurs
42 ulpfr 22 # 100 times is probably much longer than a document whichs most
43     # frequent term occurs five time.
44     #
45     # 'p'.$word
46     #
47     # Under this key we store the actual posting list as pairs of
48     # packed integers.
49 ulpfr 10
50     sub new {
51     my $type = shift;
52     my %parm = @_;
53     my $self = {};
54    
55 dpavlin 116 for my $x (qw(path attr subname env maindbfile tablename)) {
56     $self->{$x} = $parm{$x} or confess "No $x specified";
57 dpavlin 108 }
58    
59 ulpfr 10 $self->{filter} = $parm{filter};
60     $self->{'name'} = $parm{'name'};
61     $self->{records} = 0;
62     for (qw(intervall prefix)) {
63     if (exists $parm{$_}) {
64     if (ref $parm{$_}) {
65     $self->{$_} = [@{$parm{$_}}] # clone
66     } else {
67     $self->{$_} = $parm{$_}
68     }
69     }
70     }
71     bless $self, ref($type) || $type;
72     }
73    
74 dpavlin 108 for my $accessor (qw(name maindbfile tablename subname)) {
75     no strict 'refs';
76     *{$accessor} = sub {
77     my($self) = @_;
78     return $self->{$accessor} if $self->{$accessor};
79     require Carp;
80     Carp::confess("accessor $accessor not there");
81     }
82     }
83 ulpfr 10
84     sub _split_pos {
85     my ($text, $pos) = @{$_[0]};
86     my @result;
87    
88     $text =~ s/(^\s+)// and $pos += length($1);
89     while ($text =~ s/(^\S+)//) {
90     my $word = $1;
91     push @result, [$word, $pos];
92     $pos += length($word);
93     $text =~ s/(^\s+)// and $pos += length($1);
94     }
95     @result;
96     }
97    
98     sub _xfiltergen {
99     my $filter = pop @_;
100    
101 ulpfr 13 # Oops, we cannot overrule the user's choice. Other filters may kill
102     # stopwords, such as isotr clobbers "isn't" to "isnt".
103    
104     # if ($filter eq 'stop') { # avoid the slow stopword elimination
105     # return _xfiltergen(@_); # it's cheaper to look them up afterwards
106     # }
107 ulpfr 10 if (@_) {
108     if ($filter =~ /^split(\d*)/) {
109     if ($1) {
110     "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .'))' ;
111     } else {
112     "map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .')' ;
113     }
114     } else {
115     "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]]," ._xfiltergen(@_) .')';
116     }
117     } else {
118     if ($filter =~ /^split(\d*)/) {
119     if ($1) {
120     "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0]))" ;
121     } else {
122     "map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0])" ;
123     }
124     } else {
125     "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]], [\$_[0], 0])";
126     }
127     }
128     }
129    
130     sub parse_pos {
131     my $self = shift;
132    
133     unless (exists $self->{xfunc}) {
134     $self->{xfunc} =
135     eval sprintf("sub {%s}", _xfiltergen(@{$self->{filter}}));
136     #printf "\nsub{%s}$@\n", _xfiltergen(@{$self->{filter}});
137     }
138     &{$self->{xfunc}}($_[0]);
139     }
140    
141     sub _filtergen {
142     my $filter = pop @_;
143    
144     if (@_) {
145     "map(&WAIT::Filter::$filter(\$_), " . _filtergen(@_) . ')';
146     } else {
147     "map(&WAIT::Filter::$filter(\$_), \@_)";
148     }
149     }
150    
151     sub drop {
152     my $self = shift;
153     if ((caller)[0] eq 'WAIT::Table') { # Table knows about this
154 dpavlin 116 my $path = $self->{path};
155 ulpfr 10
156 dpavlin 116 # ! (!-e $path or unlink $path);
157     warn "DEBUG: fix drop index!";
158 ulpfr 10 } else { # notify our database
159 dpavlin 116 confess ref($self)."::drop called directly";
160 ulpfr 10 }
161     }
162    
163     sub open {
164     my $self = shift;
165 dpavlin 116 my $path = $self->{path};
166 ulpfr 10
167     if (defined $self->{dbh}) {
168     $self->{dbh};
169     } else {
170     $self->{func} =
171     eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
172 dpavlin 108 my $flags;
173     if ($self->{mode} & O_RDWR) {
174     $flags = DB_CREATE; # | DB_INIT_MPOOL | DB_PRIVATE | DB_INIT_CDB;
175 dpavlin 116 #warn "DEBUG: Flags on inverted $path set to 'writing'\n";
176 dpavlin 108 } else {
177     $flags = DB_RDONLY;
178 dpavlin 116 #warn "DEBUG: Flags on inverted $path set to 'readonly'\n";
179 dpavlin 108 }
180     my $filename = $self->maindbfile or die;
181     my $subname = join("/",$self->tablename || die,$self->subname || die);
182     my $env = $self->{env} || "[undef]";
183     $self->{dbh} = tie(%{$self->{db}}, 'BerkeleyDB::Btree',
184 dpavlin 116 # Filename => $path,
185 dpavlin 108 Filename => $filename,
186     $self->{env} ? (Env => $self->{env}) : (),
187     Subname => $subname,
188     Mode => 0664,
189     Flags => $flags,
190     $WAIT::Database::Cachesize?(Cachesize => $WAIT::Database::Cachesize):(),
191     $WAIT::Database::Pagesize?(Pagesize => $WAIT::Database::Pagesize):(),
192     ) or die "Couldn't tie: $BerkeleyDB::Error; filename=>'$filename', env=>'$env',subname=>'$subname',flags=>'$flags'";
193 ulpfr 10 $self->{cache} = {}
194     if $self->{mode} & O_RDWR;
195     $self->{cdict} = {}
196     if $self->{mode} & O_RDWR;
197     $self->{cached} = 0;
198     }
199     }
200    
201     sub insert {
202     my $self = shift;
203     my $key = shift;
204     my %occ;
205 ulpfr 13
206 ulpfr 10 defined $self->{db} or $self->open;
207 dpavlin 108 defined $self->{db} or die "open didn't help!!!";
208 ulpfr 10 grep $occ{$_}++, &{$self->{func}}(@_);
209     my ($word, $noc);
210     $self->{records}++;
211     while (($word, $noc) = each %occ) {
212     if (defined $self->{cache}->{$word}) {
213 ulpfr 22 $self->{cdict}->{$word}++;
214 ulpfr 10 $self->{cache}->{$word} .= pack 'w2', $key, $noc;
215 dpavlin 108 } else {
216 ulpfr 22 $self->{cdict}->{$word} = 1;
217 ulpfr 10 $self->{cache}->{$word} = pack 'w2', $key, $noc;
218 ulpfr 13 }
219 ulpfr 10 $self->{cached}++;
220     }
221 ulpfr 19 # This cache limit should be configurable
222 ulpfr 10 $self->sync if $self->{cached} > 100_000;
223     my $maxtf = 0;
224     for (values %occ) {
225     $maxtf = $_ if $_ > $maxtf;
226     }
227 dpavlin 108 $self->{db}->{MAXTF_M . $key} = $maxtf;
228 ulpfr 10 }
229    
230 ulpfr 19 # We sort postings by increasing max term frequency (~ by increasing
231     # document length. This reduces the quality degradation if we process
232     # only the first part of a posting list.
233    
234     sub sort_postings {
235     my $self = shift;
236     my $post = shift; # reference to a hash or packed string
237    
238     if (ref $post) {
239     # we skip the sort part, if the index is not sorted
240     return pack('w*', %$post) unless $self->{reorg};
241     } else {
242     $post = { unpack 'w*', $post };
243     }
244    
245     my $r = '';
246    
247     # Sort posting list by increasing ratio of maximum term frequency (~
248 ulpfr 80 # "document length") and term frequency. This ratio multipied by the
249 ulpfr 19 # inverse document frequence gives the score for a term. This sort
250     # order can be exploited for tuning of single term queries.
251    
252 laperla 30 for my $did (keys %$post) { # sanity check
253 dpavlin 108 unless ($self->{db}->{MAXTF_M . $did}) {
254     warn "WAIT Warning: DIVZERO threat from did[$did]post[$post]post{did}[$post->{$did}]";
255     $self->{db}->{MAXTF_M . $did} = 1; # fails if we have not opened for writing
256 laperla 30 }
257     }
258 dpavlin 108 for my $did (sort { $post->{$b} / $self->{db}->{MAXTF_M . $b}
259 ulpfr 19 <=>
260 dpavlin 108 $post->{$a} / $self->{db}->{MAXTF_M . $a}
261 ulpfr 19 } keys %$post) {
262     $r .= pack 'w2', $did, $post->{$did};
263     }
264     #warn sprintf "reorg %d %s\n", scalar keys %$post, join ' ', unpack 'w*', $r;
265     $r;
266     }
267    
268 ulpfr 10 sub delete {
269     my $self = shift;
270     my $key = shift;
271     my %occ;
272    
273 ulpfr 19 my $db;
274 ulpfr 10 defined $self->{db} or $self->open;
275 ulpfr 19 $db = $self->{db};
276 ulpfr 10 $self->sync;
277     $self->{records}--;
278 ulpfr 19
279     # less than zero documents in database?
280     _complain('delete of document', $key) and $self->{records} = 0
281     if $self->{records} < 0;
282    
283 ulpfr 10 grep $occ{$_}++, &{$self->{func}}(@_);
284 ulpfr 19
285 laperla 30 # Be prepared for "Odd number of elements in hash assignment"
286     local $SIG{__WARN__} = sub {
287     my $warning = shift;
288     chomp $warning;
289     warn "Catching warning[$warning] during delete of key[$key]";
290     };
291 ulpfr 19 for (keys %occ) {# may reorder posting list
292 dpavlin 108 my %post = unpack 'w*', $db->{POSTINGLIST_P . $_};
293 ulpfr 10 delete $post{$key};
294 dpavlin 108 $db->{POSTINGLIST_P . $_} = $self->sort_postings(\%post);
295     _complain('delete of term', $_) if $db->{DOCFREQ_O . $_}-1 != keys %post;
296     $db->{DOCFREQ_O . $_} = scalar keys %post;
297 ulpfr 10 }
298 dpavlin 108 delete $db->{MAXTF_M . $key};
299 ulpfr 10 }
300    
301     sub intervall {
302     my ($self, $first, $last) = @_;
303    
304 dpavlin 108 die "intervall broken in this version of WAIT: need to fix the
305     R_CURSOR and R_NEXT lines";
306 ulpfr 10
307 dpavlin 108 #### my $value = '';
308     #### my $word = '';
309     #### my @result;
310     ####
311     #### return unless exists $self->{'intervall'};
312     ####
313     #### defined $self->{db} or $self->open;
314     #### $self->sync;
315     #### my $dbh = $self->{dbh}; # for convenience
316     ####
317     #### if (ref $self->{'intervall'}) {
318     #### unless (exists $self->{'ifunc'}) {
319     #### $self->{'ifunc'} =
320     #### eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}}));
321     #### }
322     #### ($first) = &{$self->{'ifunc'}}($first) if $first;
323     #### ($last) = &{$self->{'ifunc'}}($last) if $last;
324     #### }
325     #### $first = POSTINGLIST_P . ($first||'');
326     #### $last = (defined $last)?POSTINGLIST_P . $last:'q';
327     ####
328     #### # set the cursor to $first
329     #### $dbh->seq($first, $value, R_CURSOR);
330     ####
331     #### # $first would be after the last word
332     #### return () if $first gt $last;
333     ####
334     #### push @result, substr($first,1);
335     #### while (!$dbh->seq($word, $value, R_NEXT)) {
336     #### # We should limit this to a "resonable" number of words
337     #### last if $word gt $last;
338     #### push @result, substr($word,1);
339     #### }
340     #### \@result; # speed
341 ulpfr 10 }
342    
343     sub prefix {
344     my ($self, $prefix) = @_;
345    
346 dpavlin 108 die "prefix not supported in this version of WAIT: need to fix the R_CURSOR";
347 ulpfr 10
348    
349 dpavlin 108 #### my $value = '';
350     #### my $word = '';
351     #### my @result;
352     ####
353     #### return () unless defined $prefix; # Full dictionary requested !!
354     #### return unless exists $self->{'prefix'};
355     #### defined $self->{db} or $self->open;
356     #### $self->sync;
357     #### my $dbh = $self->{dbh};
358     ####
359     #### if (ref $self->{'prefix'}) {
360     #### unless (exists $self->{'pfunc'}) {
361     #### $self->{'pfunc'} =
362     #### eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}}));
363     #### }
364     #### ($prefix) = &{$self->{'pfunc'}}($prefix);
365     #### }
366     ####
367     #### if ($dbh->seq($word = POSTINGLIST_P . $prefix, $value, R_CURSOR)) {
368     #### return ();
369     #### }
370     #### return () if $word !~ /^p$prefix/;
371     #### push @result, substr($word,1);
372     ####
373     #### while (!$dbh->seq($word, $value, R_NEXT)) {
374     #### # We should limit this to a "resonable" number of words
375     #### last if $word !~ /^p$prefix/;
376     #### push @result, substr($word,1);
377     #### }
378     #### \@result; # speed
379 ulpfr 10 }
380    
381 ulpfr 19 =head2 search($query)
382    
383     The search method supports a range of search algorithms. It is
384     recommended to tune the index by calling
385     C<$table-E<gt>set(top=E<gt>1)> B<after> bulk inserting the documents
386     into the table. This is a computing intense operation and all inserts
387     and deletes after this optimization are slightly more expensive. Once
388     reorganized, the index is kept sorted automatically until you switch
389     the optimization off by calling C<$table-E<gt>set(top=E<gt>0)>.
390    
391     When searching a tuned index, a query can be processed faster if the
392     caller requests only the topmost documents. This can be done by
393     passing a C<top =E<gt>> I<n> parameter to the search method.
394    
395     For single term queries, the method returns only the I<n> top ranking
396     documents. For multi term queries two optimized algorithms are
397     available. The first algorithm computes the top n documents
398     approximately but very fast, sacrificing a little bit of precision for
399     speed. The second algorithm computes the topmost I<n> documents
400     precisely. This algorithm is slower and should be used only for small
401     values of I<n>. It can be requested by passing the query attribute
402     C<picky =E<gt> 1>. Both algorithms may return more than I<n> hits.
403     While the picky version might not be faster than the brute force
404     version on average for modest size databases it uses less memory and
405     the processing time is almost linear in the number of query terms, not
406     in the size of the lists.
407    
408     =cut
409    
410 dpavlin 108 sub search_ref {
411 ulpfr 10 my $self = shift;
412 ulpfr 19 my $query = shift;
413 ulpfr 10
414 dpavlin 108 my $debugtime = 0;
415     my($time,$entertime);
416     our $STARTTIME;
417     if ($debugtime) {
418     $time = time;
419     $STARTTIME ||= $time;
420     if ($time-$STARTTIME > 5) {
421     $STARTTIME = $time;
422     warn "STARTTIME: $STARTTIME\n";
423     }
424     $entertime = time-$STARTTIME;
425     warn sprintf "ENTER TIME: %.4f\n", $entertime;
426     }
427 ulpfr 10 defined $self->{db} or $self->open;
428     $self->sync;
429 dpavlin 108 my $ref = $self->search_raw_ref($query, &{$self->{func}}(@_)); # No call to parse() there
430     if ($debugtime) {
431     my $leavetime = time-$STARTTIME;
432     warn sprintf "LEAVE TIME: %.4f\n", $leavetime;
433     if ($leavetime-$entertime > .4) {
434     require Data::Dumper;
435     print STDERR "Line " . __LINE__ . ", File: " . __FILE__ . "\n" .
436     Data::Dumper->new([$query,\@_],[qw(query at_)])->Indent(1)->Useqq(1)->Dump; # XXX
437     }
438     }
439     $ref;
440 ulpfr 10 }
441    
442     sub parse {
443     my $self = shift;
444    
445     defined $self->{db} or $self->open;
446     &{$self->{func}}(@_);
447     }
448    
449     sub search_prefix {
450     my $self = shift;
451    
452     # print "search_prefix(@_)\n";
453     defined $self->{db} or $self->open;
454 dpavlin 108 $self->search_raw_ref(map($self->prefix($_), @_));
455 ulpfr 10 }
456    
457 ulpfr 19 sub _complain ($$) {
458     my ($action, $term) = @_;
459    
460     require Carp;
461     Carp::cluck
462     (sprintf("WAIT database inconsistency during $action [%s]: ".
463     "Please rebuild index\n",
464     $term,));
465     }
466    
467 dpavlin 108 sub search_raw_ref {
468 ulpfr 10 my $self = shift;
469 ulpfr 19 my $query = shift;
470 dpavlin 108 # warn "DEBUG WAIT: search_raw_ref args 2..[@_]";
471 ulpfr 10 my %score;
472    
473 dpavlin 108 # Top $top_wanted documents must be correct. Zero means all matching documents.
474     my $top_wanted = $query->{top};
475     my $picky_strict = $query->{picky};
476     # the option is really ignore_excess
477     my $ignore_excess = $query->{ignore_excess};
478 ulpfr 10
479 dpavlin 108 # Return at least $minacc documents. Zero means all matching documents.
480 ulpfr 19
481 dpavlin 108 # my $minacc = $query->{accus} || $top_wanted;
482    
483 ulpfr 19 # Open index and flush cache if necessary
484 ulpfr 10 defined $self->{db} or $self->open;
485     $self->sync;
486 ulpfr 19
487     # We keep duplicates
488     my @terms =
489     # Sort words by decreasing document frequency
490 dpavlin 108 sort { $self->{db}->{DOCFREQ_O . $a} <=> $self->{db}->{DOCFREQ_O . $b} }
491 ulpfr 19 # check which words occur in the index.
492 dpavlin 108 grep { $self->{db}->{DOCFREQ_O . $_} } @_;
493 ulpfr 19
494 dpavlin 108 # warn "DEBUG WAIT: wanted[$top_wanted]terms[@terms]";
495 laperla 40 return unless @terms;
496 ulpfr 19
497     # We special-case one term queries here. If the index was sorted,
498     # choping off the rest of the list will return the same ranking.
499 dpavlin 108 if ($top_wanted and @terms == 1) {
500 ulpfr 19 my $term = shift @terms;
501 dpavlin 108 my $idf = log($self->{records}/$self->{db}->{DOCFREQ_O . $term});
502 ulpfr 19 my @res;
503    
504     if ($self->{reorg}) { # or not $query->{picky}
505 dpavlin 108 @res = unpack "w". int(2*$top_wanted), $self->{db}->{POSTINGLIST_P . $term};
506     # warn sprintf "DEBUG WAIT: scalar(\@res)[%d]", scalar(@res);
507 ulpfr 19 } else {
508 dpavlin 108 @res = unpack 'w*', $self->{db}->{POSTINGLIST_P . $term};
509 ulpfr 19 }
510    
511     for (my $i=1; $i<@res; $i+=2) {
512 dpavlin 108 # $res[$i] /= $self->{db}->{MAXTF_M . $res[$i-1]} / $idf;
513 laperla 30 # above was written badly, allows two DIV_ZERO problems.
514 dpavlin 108 my $maxtf = $self->{db}->{MAXTF_M . $res[$i-1]};
515 laperla 30 unless ($maxtf) {
516     warn "WAIT-Warning: Averting DIVZERO for i[$i] \$res[\$i-1][$res[$i-1]] term[$term]";
517     $maxtf = 1;
518     }
519     $res[$i] = ($res[$i] / $maxtf) * $idf;
520 ulpfr 19 }
521    
522 dpavlin 108 my %res = @res; # bloed: @res waere schon sortiert gewesen
523     return \%res;
524 ulpfr 19 }
525    
526     # We separate exhaustive search here to avoid overhead and make the
527     # code more readable. The block can be removed without changing the
528     # result.
529 dpavlin 108 unless ($top_wanted) {
530 ulpfr 19 for (@terms) {
531 dpavlin 108 my $df = $self->{db}->{DOCFREQ_O . $_};
532 ulpfr 19
533     # The frequency *must* be 1 at least since the posting list is nonempty
534     _complain('search for term', $_) and $df = 1 if $df < 1;
535    
536     # Unpack posting list for current query term $_
537 dpavlin 108 my %post = unpack 'w*', $self->{db}->{POSTINGLIST_P . $_};
538 ulpfr 19
539 dpavlin 108 _complain('search for term', $_) if $self->{db}->{DOCFREQ_O . $_} != keys %post;
540 ulpfr 19 # This is the inverse document frequency. The log of the inverse
541     # fraction of documents the term occurs in.
542     my $idf = log($self->{records}/$df);
543     for my $did (keys %post) {
544 dpavlin 108 if (my $freq = $self->{db}->{MAXTF_M . $did}) {
545 ulpfr 19 $score{$did} += $post{$did} / $freq * $idf;
546     }
547 ulpfr 10 }
548     }
549 ulpfr 19 # warn sprintf "Used %d accumulators\n", scalar keys %score;
550 dpavlin 108 return \%score;
551 ulpfr 10 }
552 ulpfr 19
553     # A sloppy but fast algorithm for multiple term queries.
554 dpavlin 108 unless ($picky_strict) {
555 ulpfr 19 for (@terms) {
556     # Unpack posting list for current query term $_
557 dpavlin 108 my %post;
558     if ($self->{reorg} && $top_wanted && $ignore_excess) {
559     %post = unpack 'w'. int(2*$ignore_excess) , $self->{db}->{POSTINGLIST_P . $_};
560     } else {
561     %post = unpack 'w*', $self->{db}->{POSTINGLIST_P . $_};
562     }
563     # warn sprintf "DEBUG WAIT: term[%s] keys %%post[%s]", $_, scalar keys %post;
564 ulpfr 19
565     # Lookup the number of documents the term occurs in (document frequency)
566 dpavlin 108 my $occ = $self->{db}->{DOCFREQ_O . $_};
567 ulpfr 19
568 dpavlin 108 _complain('search for term', $_) if !$ignore_excess && $occ != keys %post;
569 ulpfr 19 # The frequency *must* be 1 at least since the posting list is nonempty
570     _complain('search for term', $_) and $occ = 1 if $occ < 1;
571    
572 dpavlin 108 # This is the inverse document frequency. The log of the inverse fraction
573     # of documents the term occurs in.
574 ulpfr 19 my $idf = log($self->{records}/$occ);
575    
576     # If we have a reasonable number of accumulators, change the
577     # loop to iterate over the accumulators. This will compromise
578     # quality for better speed. The algorithm still computes the
579     # exact weights, but the result is not guaranteed to contain the
580     # *best* results. The database might contain documents better
581     # than the worst returned document.
582    
583     # We process the lists in order of increasing length. When the
584     # number of accumulators exceeds $wanted, no new documents are
585     # added, only the ranking/weighting of the seen documents is
586     # improved. The resulting ranking list must be pruned, since only
587     # the top most documents end up near their "optimal" rank.
588    
589 dpavlin 108 if (keys %score < $top_wanted) {
590    
591     # Diese folgende Schleife ist (WAR!) der Hammer fuer die Suche "mysql
592     # für dummies bellomo". Sie frisst 3.1+1.7 Sekunden.
593    
594     # Der erste Grund ist, dass 3 Begriffe noch nicht genug gebracht haben,
595     # aber der vierte viel zu viel bringt. Der zweite Grund ist, dass wir
596     # so viele Lookups in $self->{db} machen. Das Rechnen hingegen ist
597     # vermutlich billig.
598    
599 ulpfr 19 for my $did (keys %post) {
600 dpavlin 108 if (my $freq = $self->{db}->{MAXTF_M . $did}) {
601 ulpfr 19 $score{$did} += $post{$did} / $freq * $idf;
602     }
603     }
604     } else {
605     for my $did (keys %score) {
606     next unless exists $post{$did};
607 dpavlin 108 if (my $freq = $self->{db}->{MAXTF_M . $did}) {
608 ulpfr 19 $score{$did} += $post{$did} / $freq * $idf;
609     }
610     }
611     }
612     }
613 dpavlin 108 warn sprintf("DEBUG WAIT: returning from search_raw_ref at [%.3f] after terms[%s] with keys[%d]",
614     time,
615     join(":",@terms),
616     scalar keys %score,
617     );
618     return \%score;
619 ulpfr 19 }
620     my @max; $max[$#terms+1]=0;
621     my @idf;
622    
623     # Preparation loop. This extra loop makes sense only when "reorg"
624     # and "wanted" are true. But at the time beeing, keeping the code
625     # for the different search algorithms in one place seems more
626     # desirable than some minor speedup of the brute force version. We
627     # do cache $idf though.
628    
629     for (my $i = $#terms; $i >=0; $i--) {
630     local $_ = $terms[$i];
631     # Lookup the number of documents the term occurs in (document frequency)
632 dpavlin 108 my $df = $self->{db}->{DOCFREQ_O . $_};
633 ulpfr 19
634     # The frequency *must* be 1 at least since the posting list is nonempty
635     _complain('search for term', $_) and $df = 1 if $df < 1;
636    
637     # This is the inverse document frequency. The log of the inverse
638     # fraction of documents the term occurs in.
639     $idf[$i] = log($self->{records}/$df);
640    
641     my ($did,$occ);
642     if ($self->{reorg}) {
643 dpavlin 108 ($did,$occ) = unpack 'w2', $self->{db}->{POSTINGLIST_P . $_};
644 ulpfr 19 } else { # Maybe this costs more than it helps
645 dpavlin 108 ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{POSTINGLIST_P . $_});
646 ulpfr 19 }
647 dpavlin 108 my $freq = $self->{db}->{MAXTF_M . $did};
648 ulpfr 19 my $max = $occ/$freq*$idf[$i];
649     $max[$i] = $max + $max[$i+1];
650     }
651    
652 dpavlin 108 # Main loop
653 ulpfr 19 for my $i (0 .. $#terms) {
654     my $term = $terms[$i];
655     # Unpack posting list for current query term $term. We loose the
656     # sorting order because the assignment to a hash.
657 dpavlin 108 my %post = unpack 'w*', $self->{db}->{POSTINGLIST_P . $term};
658 ulpfr 19
659     _complain('search for term', $term)
660 dpavlin 108 if $self->{db}->{DOCFREQ_O . $term} != keys %post;
661 ulpfr 19
662     my $idf = $idf[$i];
663     my $full; # Need to process all postings
664     my $chop; # Score necessary to enter the ranking list
665    
666 ulpfr 80 if (# We know that wanted is true since we special cased the
667 ulpfr 19 # exhaustive search.
668    
669 dpavlin 108 $top_wanted and
670 ulpfr 19
671 dpavlin 108 # We did sort here if necessary in the preparation loop:
672 ulpfr 19 # $self->{reorg} and
673    
674 dpavlin 108 scalar keys %score > $top_wanted) {
675     $chop = (sort { $b <=> $a } values %score)[$top_wanted];
676 ulpfr 19 $full = $max[$i] > $chop;
677     } else {
678     $full = 1;
679     }
680    
681     if ($full) {
682 dpavlin 108 # We need to inspect the full list. Either $top_wanted is not given,
683 ulpfr 19 # the index is not sorted, or we don't have enough accumulators
684     # yet.
685     if (defined $chop) {
686     # We might be able to avoid allocating accumulators
687     for my $did (keys %post) {
688 dpavlin 108 if (my $freq = $self->{db}->{MAXTF_M . $did}) {
689 ulpfr 19 my $wgt = $post{$did} / $freq * $idf;
690     # We add an accumulator if $wgt exeeds $chop
691     if (exists $score{$did} or $wgt > $chop) {
692     $score{$did} += $wgt;
693     }
694     }
695     }
696     } else {
697     # Allocate acumulators for each seen document.
698     for my $did (keys %post) {
699 dpavlin 108 if (my $freq = $self->{db}->{MAXTF_M . $did}) {
700 ulpfr 19 $score{$did} += $post{$did} / $freq * $idf;
701     }
702     }
703     }
704     } else {
705     # Update existing accumulators
706     for my $did (keys %score) {
707     next unless exists $post{$did};
708 dpavlin 108 if (my $freq = $self->{db}->{MAXTF_M . $did}) {
709 ulpfr 19 $score{$did} += $post{$did} / $freq * $idf;
710     }
711     }
712     }
713     }
714     #warn sprintf "Used %d accumulators\n", scalar keys %score;
715 dpavlin 108 \%score;
716 ulpfr 10 }
717    
718 ulpfr 19 sub set {
719     my ($self, $attr, $value) = @_;
720    
721 ulpfr 83 die "No such index attribute: '$attr'" unless $attr eq 'top';
722 ulpfr 19
723     return delete $self->{reorg} if $value == 0;
724    
725     return if $self->{reorg}; # we are sorted already
726     return unless $self->{mode} & O_RDWR;
727     defined $self->{db} or $self->open;
728    
729     $self->sync;
730     while (my($key, $value) = each %{$self->{db}}) {
731 dpavlin 108 next if $key !~ /^p/; # some day use PMATCH
732     $self->{db}{$key} = $self->sort_postings($value);
733 ulpfr 19 }
734     $self->{reorg} = 1;
735     }
736    
737 ulpfr 10 sub sync {
738     my $self = shift;
739 dpavlin 108 return unless $self->{mode} & O_RDWR;
740     Carp::carp(sprintf "[%s] Flushing %d postings", scalar(localtime), $self->{cached})
741     if $self->{cached};
742     while (my($key, $value) = each %{$self->{cache}}) {
743     $self->{db}{POSTINGLIST_P . $key} ||= "";
744     if ($self->{reorg}) {
745     $self->{db}->{POSTINGLIST_P . $key} =
746     $self->sort_postings($self->{db}->{POSTINGLIST_P . $key}
747     . $value);
748     } else {
749     $self->{db}->{POSTINGLIST_P . $key} .= $value;
750 ulpfr 10 }
751     }
752 dpavlin 108 while (my($key, $value) = each %{$self->{cdict}}) {
753     $self->{db}->{DOCFREQ_O . $key} = 0 unless $self->{db}->{DOCFREQ_O . $key};
754     $self->{db}->{DOCFREQ_O . $key} += $value;
755     }
756     $self->{cache} = {};
757     $self->{cdict} = {};
758     $self->{cached} = 0;
759 ulpfr 10 }
760    
761     sub close {
762     my $self = shift;
763    
764 dpavlin 108 delete $self->{env};
765 ulpfr 10 if ($self->{dbh}) {
766     $self->sync;
767     delete $self->{dbh};
768     untie %{$self->{db}};
769 dpavlin 116 for my $att (qw(db func cache cached cdict path maindbfile)) {
770 dpavlin 108 delete $self->{$att};
771     }
772     for my $att (qw(pfunc ifunc xfunc)) {
773     delete $self->{$att} if defined $self->{$att};
774     }
775 ulpfr 10 }
776     }
777    
778 ulpfr 51 sub keys {
779     my $self = shift;
780    
781     defined $self->{db} or $self->open;
782     keys %{$self->{db}};
783     }
784    
785 ulpfr 10 1;
786    

Properties

Name Value
cvs2svn:cvs-rev 1.12

  ViewVC Help
Powered by ViewVC 1.1.26