/[wait]/trunk/lib/WAIT/InvertedIndex.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/WAIT/InvertedIndex.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 107 by dpavlin, Mon May 24 20:57:08 2004 UTC revision 108 by dpavlin, Tue Jul 13 17:41:12 2004 UTC
# Line 1  Line 1 
1  #                              -*- Mode: Perl -*-  # -*- Mode: cperl; fill-column: 79 -*-
2  # $Basename: InvertedIndex.pm $  # $Basename: InvertedIndex.pm $
3  # $Revision: 1.30 $  # $Revision: 1.30 $
4  # Author          : Ulrich Pfeifer  # Author          : Ulrich Pfeifer
# Line 12  Line 12 
12    
13  package WAIT::InvertedIndex;  package WAIT::InvertedIndex;
14  use strict;  use strict;
15  use DB_File;  use BerkeleyDB;
16  use Fcntl;  use Fcntl;
17  use WAIT::Filter;  use WAIT::Filter;
18  use Carp;  use Carp;
19  use vars qw(%FUNC $VERSION);  use vars qw(%FUNC $VERSION);
20    use Time::HiRes qw(time);
21    
22  $VERSION = "1.900"; # others test if we are loaded by checking $VERSION  $VERSION = "2.000"; # others test if we are loaded by checking $VERSION
23    
24    use constant DOCFREQ_O     => "o";
25    use constant MAXTF_M       => "m";
26    use constant POSTINGLIST_P => "p";
27    use constant PMATCH        => qr/^(??{POSTINGLIST_P()})/;
28    
29  # The dictionary has three different key types:  # The dictionary has three different key types:
30    
31  #  'o'.$word  #  'o'.$word
32  #  #
33  #     The document frequency is the number of documents a term occurs  #     The document frequency is the number of documents a term occurs
# Line 40  $VERSION = "1.900"; # others test if we Line 47  $VERSION = "1.900"; # others test if we
47  #     Under this key we store the actual posting list as pairs of  #     Under this key we store the actual posting list as pairs of
48  #     packed integers.  #     packed integers.
49    
 my $no_old_index_support = 0; # do not check for old indices if set  
   
50  sub new {  sub new {
51    my $type = shift;    my $type = shift;
52    my %parm = @_;    my %parm = @_;
53    my $self = {};    my $self = {};
54    
55    $self->{file}     = $parm{file}     or croak "No file specified";    for my $x (qw(file attr subname env maindbfile tablename)) {
56    $self->{attr}     = $parm{attr}     or croak "No attributes specified";      $self->{$x}     = $parm{$x}     or croak "No $x specified";
57      }
58    
59    $self->{filter}   = $parm{filter};    $self->{filter}   = $parm{filter};
60    $self->{'name'}   = $parm{'name'};    $self->{'name'}   = $parm{'name'};
61    $self->{records}  = 0;    $self->{records}  = 0;
# Line 64  sub new { Line 71  sub new {
71    bless $self, ref($type) || $type;    bless $self, ref($type) || $type;
72  }  }
73    
74  sub name {$_[0]->{'name'}}  for my $accessor (qw(name maindbfile tablename subname)) {
75      no strict 'refs';
76      *{$accessor} = sub {
77        my($self) = @_;
78        return $self->{$accessor} if $self->{$accessor};
79        require Carp;
80        Carp::confess("accessor $accessor not there");
81      }
82    }
83    
84  sub _split_pos {  sub _split_pos {
85    my ($text, $pos) = @{$_[0]};    my ($text, $pos) = @{$_[0]};
# Line 144  sub drop { Line 159  sub drop {
159    }    }
160  }  }
161    
 sub is_an_old_index {  
   my $self = shift;  
   
   return 0 if $no_old_index_support;  
   return $self->{old_index} if exists $self->{old_index};  
   
   # We can only guess if this is an old index. We lookup the first 10  
   # $O entries. If all values are integers, we assume that the index  
   # is an old one.  
   
   defined $self->{db} or $self->open;  
   $self->sync;  
   my $dbh = $self->{dbh} or return $self->{old_index} = 0;       # for convenience  
   
   my $O = pack('C', 0xff)."o";  
   my ($word, $value) = ($O.$;);  # $word and $value are modified by seq!  
   if ( my $ret = $dbh->seq($word, $value, R_CURSOR) ) {  
     # warn "DEBUG: ret[$ret], not an old index, either empty or no \$^O";  
     return $self->{old_index} = 0;  
   }  
   for (my $i=0; $i<10;$i++) {  
     if ($value !~ /^\d+$/) {  
       # warn "DEBUG: word[$word]value[$value], not an old index";  
       return $self->{old_index} = 0;  
     }  
     if (my $ret = $dbh->seq($word, $value, R_NEXT) or # no values left  
         $word !~ /^$O$;/o                   # no $O values left  
        ) {  
       # we are not sure enough that this is an old index  
       # warn "DEBUG: ret[$ret]word[$word]value[$value], not an old index";  
       return $self->{old_index} = 0;  
     }  
   }  
   # warn "DEBUG: old index";  
   return $self->{old_index} = 1;  
 }  
   
162  sub open {  sub open {
163    my $self = shift;    my $self = shift;
164    my $file = $self->{file};    my $file = $self->{file};
# Line 190  sub open { Line 168  sub open {
168    } else {    } else {
169      $self->{func}     =      $self->{func}     =
170        eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));        eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
171      $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,      my $flags;
172                         $self->{mode}, 0664, $DB_BTREE);      if ($self->{mode} & O_RDWR) {
173          $flags = DB_CREATE; # | DB_INIT_MPOOL | DB_PRIVATE | DB_INIT_CDB;
174          warn "Flags on inverted $file set to 'writing'";
175        } else {
176          $flags = DB_RDONLY;
177          # warn "Flags on inverted $file set to 'readonly'";
178        }
179        my $filename = $self->maindbfile or die;
180        my $subname  = join("/",$self->tablename || die,$self->subname || die);
181        my $env = $self->{env} || "[undef]";
182        $self->{dbh} = tie(%{$self->{db}}, 'BerkeleyDB::Btree',
183                           # Filename => $file,
184                           Filename => $filename,
185                           $self->{env} ? (Env  => $self->{env}) : (),
186                           Subname => $subname,
187                           Mode => 0664,
188                           Flags => $flags,
189                           $WAIT::Database::Cachesize?(Cachesize => $WAIT::Database::Cachesize):(),
190                           $WAIT::Database::Pagesize?(Pagesize => $WAIT::Database::Pagesize):(),
191                           ) or die "Couldn't tie: $BerkeleyDB::Error; filename=>'$filename', env=>'$env',subname=>'$subname',flags=>'$flags'";
192      $self->{cache} = {}      $self->{cache} = {}
193        if $self->{mode} & O_RDWR;        if $self->{mode} & O_RDWR;
194      $self->{cdict} = {}      $self->{cdict} = {}
195        if $self->{mode} & O_RDWR;        if $self->{mode} & O_RDWR;
196      $self->{cached} = 0;      $self->{cached} = 0;
     if (!$no_old_index_support and $self->is_an_old_index()) {  
       warn "This is an old index, upgrade you database";  
       require WAIT::InvertedIndexOld;  
       bless $self, 'WAIT::InvertedIndexOld';  
     }  
197    }    }
198  }  }
199    
# Line 211  sub insert { Line 203  sub insert {
203    my %occ;    my %occ;
204    
205    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
206      defined $self->{db} or die "open didn't help!!!";
207    grep $occ{$_}++, &{$self->{func}}(@_);    grep $occ{$_}++, &{$self->{func}}(@_);
208    my ($word, $noc);    my ($word, $noc);
209    $self->{records}++;    $self->{records}++;
# Line 218  sub insert { Line 211  sub insert {
211      if (defined $self->{cache}->{$word}) {      if (defined $self->{cache}->{$word}) {
212        $self->{cdict}->{$word}++;        $self->{cdict}->{$word}++;
213        $self->{cache}->{$word} .= pack 'w2', $key, $noc;        $self->{cache}->{$word} .= pack 'w2', $key, $noc;
214      } else {                } else {
215        $self->{cdict}->{$word} = 1;        $self->{cdict}->{$word} = 1;
216        $self->{cache}->{$word}  = pack 'w2', $key, $noc;        $self->{cache}->{$word}  = pack 'w2', $key, $noc;
217      }      }
# Line 230  sub insert { Line 223  sub insert {
223    for (values %occ) {    for (values %occ) {
224      $maxtf = $_ if $_ > $maxtf;      $maxtf = $_ if $_ > $maxtf;
225    }    }
226    $self->{db}->{'m'. $key} = $maxtf;    $self->{db}->{MAXTF_M . $key} = $maxtf;
227  }  }
228    
229  # We sort postings by increasing max term frequency (~ by increasing  # We sort postings by increasing max term frequency (~ by increasing
# Line 256  sub sort_postings { Line 249  sub sort_postings {
249    # order can be exploited for tuning of single term queries.    # order can be exploited for tuning of single term queries.
250    
251    for my $did (keys %$post) { # sanity check    for my $did (keys %$post) { # sanity check
252      unless ($self->{db}->{"m". $did}) {      unless ($self->{db}->{MAXTF_M . $did}) {
253        warn "Warning from WAIT: DIVZERO threat from did[$did] post[$post->{$did}]";        warn "WAIT Warning: DIVZERO threat from did[$did]post[$post]post{did}[$post->{$did}]";
254        $self->{db}->{"m". $did} = 1; # fails if we have not opened for writing        $self->{db}->{MAXTF_M . $did} = 1; # fails if we have not opened for writing
255      }      }
256    }    }
257    for my $did (sort {    $post->{$b} / $self->{db}->{'m'. $b}    for my $did (sort {    $post->{$b} / $self->{db}->{MAXTF_M . $b}
258                                        <=>                                        <=>
259                           $post->{$a} / $self->{db}->{'m'. $a}                           $post->{$a} / $self->{db}->{MAXTF_M . $a}
260                      } keys %$post) {                      } keys %$post) {
261      $r .= pack 'w2', $did, $post->{$did};      $r .= pack 'w2', $did, $post->{$did};
262    }    }
# Line 295  sub delete { Line 288  sub delete {
288      warn "Catching warning[$warning] during delete of key[$key]";      warn "Catching warning[$warning] during delete of key[$key]";
289    };    };
290    for (keys %occ) {# may reorder posting list    for (keys %occ) {# may reorder posting list
291      my %post = unpack 'w*', $db->{'p'.$_};      my %post = unpack 'w*', $db->{POSTINGLIST_P . $_};
292      delete $post{$key};      delete $post{$key};
293      $db->{'p'.$_}    = $self->sort_postings(\%post);      $db->{POSTINGLIST_P . $_}    = $self->sort_postings(\%post);
294      _complain('delete of term', $_) if $db->{'o'.$_}-1 != keys %post;      _complain('delete of term', $_) if $db->{DOCFREQ_O . $_}-1 != keys %post;
295      $db->{'o'.$_} = scalar keys %post;      $db->{DOCFREQ_O . $_} = scalar keys %post;
296    }    }
297    delete $db->{'m'. $key};    delete $db->{MAXTF_M . $key};
298  }  }
299    
300  sub intervall {  sub intervall {
301    my ($self, $first, $last) = @_;    my ($self, $first, $last) = @_;
   my $value = '';  
   my $word  = '';  
   my @result;  
   
   return unless exists $self->{'intervall'};  
302    
303    defined $self->{db} or $self->open;    die "intervall broken in this version of WAIT: need to fix the
304    $self->sync;    R_CURSOR and R_NEXT lines";
   my $dbh = $self->{dbh};       # for convenience  
305    
306    if (ref $self->{'intervall'}) {  ####      my $value = '';
307      unless (exists $self->{'ifunc'}) {  ####      my $word  = '';
308        $self->{'ifunc'} =  ####      my @result;
309          eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}}));  ####    
310      }  ####      return unless exists $self->{'intervall'};
311      ($first) = &{$self->{'ifunc'}}($first) if $first;  ####    
312      ($last)  = &{$self->{'ifunc'}}($last) if $last;  ####      defined $self->{db} or $self->open;
313    }  ####      $self->sync;
314    $first = 'p'.($first||'');  ####      my $dbh = $self->{dbh};       # for convenience
315    $last  = (defined $last)?'p'.$last:'q';  ####    
316    ####      if (ref $self->{'intervall'}) {
317    # set the cursor to $first  ####        unless (exists $self->{'ifunc'}) {
318    $dbh->seq($first, $value, R_CURSOR);  ####          $self->{'ifunc'} =
319    ####            eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}}));
320    # $first would be after the last word  ####        }
321    return () if $first gt $last;  ####        ($first) = &{$self->{'ifunc'}}($first) if $first;
322      ####        ($last)  = &{$self->{'ifunc'}}($last) if $last;
323    push @result, substr($first,1);  ####      }
324    while (!$dbh->seq($word, $value, R_NEXT)) {  ####      $first = POSTINGLIST_P . ($first||'');
325      # We should limit this to a "resonable" number of words  ####      $last  = (defined $last)?POSTINGLIST_P . $last:'q';
326      last if $word gt $last;  ####    
327      push @result, substr($word,1);  ####      # set the cursor to $first
328    }  ####      $dbh->seq($first, $value, R_CURSOR);
329    \@result;                     # speed  ####    
330    ####      # $first would be after the last word
331    ####      return () if $first gt $last;
332    ####      
333    ####      push @result, substr($first,1);
334    ####      while (!$dbh->seq($word, $value, R_NEXT)) {
335    ####        # We should limit this to a "resonable" number of words
336    ####        last if $word gt $last;
337    ####        push @result, substr($word,1);
338    ####      }
339    ####      \@result;                     # speed
340  }  }
341    
342  sub prefix {  sub prefix {
343    my ($self, $prefix) = @_;    my ($self, $prefix) = @_;
   my $value = '';  
   my $word  = '';  
   my @result;  
344    
345    return () unless defined $prefix; # Full dictionary requested !!    die "prefix not supported in this version of WAIT: need to fix the R_CURSOR";
   return unless exists $self->{'prefix'};  
   defined $self->{db} or $self->open;  
   $self->sync;  
   my $dbh = $self->{dbh};  
     
   if (ref $self->{'prefix'}) {  
     unless (exists $self->{'pfunc'}) {  
       $self->{'pfunc'} =  
         eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}}));  
     }  
     ($prefix) = &{$self->{'pfunc'}}($prefix);  
   }  
346    
   if ($dbh->seq($word = 'p'.$prefix, $value, R_CURSOR)) {  
     return ();  
   }  
   return () if $word !~ /^p$prefix/;  
   push @result, substr($word,1);  
347    
348    while (!$dbh->seq($word, $value, R_NEXT)) {  ####      my $value = '';
349      # We should limit this to a "resonable" number of words  ####      my $word  = '';
350      last if $word !~ /^p$prefix/;  ####      my @result;
351      push @result, substr($word,1);  ####    
352    }  ####      return () unless defined $prefix; # Full dictionary requested !!
353    \@result;                     # speed  ####      return unless exists $self->{'prefix'};
354    ####      defined $self->{db} or $self->open;
355    ####      $self->sync;
356    ####      my $dbh = $self->{dbh};
357    ####      
358    ####      if (ref $self->{'prefix'}) {
359    ####        unless (exists $self->{'pfunc'}) {
360    ####          $self->{'pfunc'} =
361    ####            eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}}));
362    ####        }
363    ####        ($prefix) = &{$self->{'pfunc'}}($prefix);
364    ####      }
365    ####    
366    ####      if ($dbh->seq($word = POSTINGLIST_P . $prefix, $value, R_CURSOR)) {
367    ####        return ();
368    ####      }
369    ####      return () if $word !~ /^p$prefix/;
370    ####      push @result, substr($word,1);
371    ####    
372    ####      while (!$dbh->seq($word, $value, R_NEXT)) {
373    ####        # We should limit this to a "resonable" number of words
374    ####        last if $word !~ /^p$prefix/;
375    ####        push @result, substr($word,1);
376    ####      }
377    ####      \@result;                     # speed
378  }  }
379    
380  =head2 search($query)  =head2 search($query)
# Line 405  in the size of the lists. Line 406  in the size of the lists.
406    
407  =cut  =cut
408    
409  sub search {  sub search_ref {
410    my $self  = shift;    my $self  = shift;
411    my $query = shift;    my $query = shift;
412    
413      my $debugtime = 0;
414      my($time,$entertime);
415      our $STARTTIME;
416      if ($debugtime) {
417        $time = time;
418        $STARTTIME ||= $time;
419        if ($time-$STARTTIME > 5) {
420          $STARTTIME = $time;
421          warn "STARTTIME: $STARTTIME\n";
422        }
423        $entertime = time-$STARTTIME;
424        warn sprintf "ENTER TIME: %.4f\n", $entertime;
425      }
426    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
427    $self->sync;    $self->sync;
428    $self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() there    my $ref = $self->search_raw_ref($query, &{$self->{func}}(@_)); # No call to parse() there
429      if ($debugtime) {
430        my $leavetime = time-$STARTTIME;
431        warn sprintf "LEAVE TIME: %.4f\n", $leavetime;
432        if ($leavetime-$entertime > .4) {
433          require Data::Dumper;
434          print STDERR "Line " . __LINE__ . ", File: " . __FILE__ . "\n" .
435              Data::Dumper->new([$query,\@_],[qw(query at_)])->Indent(1)->Useqq(1)->Dump; # XXX
436        }
437      }
438      $ref;
439  }  }
440    
441  sub parse {  sub parse {
# Line 426  sub search_prefix { Line 450  sub search_prefix {
450    
451    # print "search_prefix(@_)\n";    # print "search_prefix(@_)\n";
452    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
453    $self->search_raw(map($self->prefix($_), @_));    $self->search_raw_ref(map($self->prefix($_), @_));
454  }  }
455    
456  sub _complain ($$) {  sub _complain ($$) {
# Line 439  sub _complain ($$) { Line 463  sub _complain ($$) {
463               $term,));               $term,));
464  }  }
465    
466  sub search_raw {  sub search_raw_ref {
467    my $self  = shift;    my $self  = shift;
468    my $query = shift;    my $query = shift;
469      # warn "DEBUG WAIT: search_raw_ref args 2..[@_]";
470    my %score;    my %score;
471    
472    # Top $wanted documents must be correct. Zero means all matching    # Top $top_wanted documents must be correct. Zero means all matching documents.
473    # documents.    my $top_wanted = $query->{top};
474    my $wanted = $query->{top};    my $picky_strict = $query->{picky};
475    my $strict = $query->{picky};    # the option is really ignore_excess
476      my $ignore_excess = $query->{ignore_excess};
477    # Return at least $minacc documents. Zero means all matching  
478    # documents.    # Return at least $minacc documents. Zero means all matching documents.
479    # my $minacc = $query->{accus} || $wanted;  
480      # my $minacc = $query->{accus} || $top_wanted;
481    
482    # Open index and flush cache if necessary    # Open index and flush cache if necessary
483    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
# Line 460  sub search_raw { Line 486  sub search_raw {
486    # We keep duplicates    # We keep duplicates
487    my @terms =    my @terms =
488      # Sort words by decreasing document frequency      # Sort words by decreasing document frequency
489      sort { $self->{db}->{'o'.$a} <=> $self->{db}->{'o'.$b} }      sort { $self->{db}->{DOCFREQ_O . $a} <=> $self->{db}->{DOCFREQ_O . $b} }
490        # check which words occur in the index.        # check which words occur in the index.
491        grep { $self->{db}->{'o'.$_} } @_;        grep { $self->{db}->{DOCFREQ_O . $_} } @_;
492    
493      # warn "DEBUG WAIT: wanted[$top_wanted]terms[@terms]";
494    return unless @terms;    return unless @terms;
495    
496    # We special-case one term queries here.  If the index was sorted,    # We special-case one term queries here.  If the index was sorted,
497    # choping off the rest of the list will return the same ranking.    # choping off the rest of the list will return the same ranking.
498    if ($wanted and @terms == 1) {    if ($top_wanted and @terms == 1) {
499      my $term  = shift @terms;      my $term  = shift @terms;
500      my $idf   = log($self->{records}/$self->{db}->{'o'.$term});      my $idf   = log($self->{records}/$self->{db}->{DOCFREQ_O . $term});
501      my @res;      my @res;
502    
503      if ($self->{reorg}) { # or not $query->{picky}      if ($self->{reorg}) { # or not $query->{picky}
504        @res = unpack "w". int(2*$wanted), $self->{db}->{'p'.$term};        @res = unpack "w". int(2*$top_wanted), $self->{db}->{POSTINGLIST_P . $term};
505          # warn sprintf "DEBUG WAIT: scalar(\@res)[%d]", scalar(@res);
506      } else {      } else {
507        @res = unpack 'w*',                $self->{db}->{'p'.$term};        @res = unpack 'w*',                $self->{db}->{POSTINGLIST_P . $term};
508      }      }
509    
510      for (my $i=1; $i<@res; $i+=2) {      for (my $i=1; $i<@res; $i+=2) {
511        # $res[$i] /= $self->{db}->{'m'. $res[$i-1]} / $idf;        # $res[$i] /= $self->{db}->{MAXTF_M . $res[$i-1]} / $idf;
512        # above was written badly, allows two DIV_ZERO problems.        # above was written badly, allows two DIV_ZERO problems.
513        my $maxtf = $self->{db}->{"m". $res[$i-1]};        my $maxtf = $self->{db}->{MAXTF_M . $res[$i-1]};
514        unless ($maxtf) {        unless ($maxtf) {
515          warn "WAIT-Warning: Averting DIVZERO for i[$i] \$res[\$i-1][$res[$i-1]] term[$term]";          warn "WAIT-Warning: Averting DIVZERO for i[$i] \$res[\$i-1][$res[$i-1]] term[$term]";
516          $maxtf = 1;          $maxtf = 1;
# Line 490  sub search_raw { Line 518  sub search_raw {
518        $res[$i] = ($res[$i] / $maxtf) * $idf;        $res[$i] = ($res[$i] / $maxtf) * $idf;
519      }      }
520    
521      return @res      my %res = @res; # bloed: @res waere schon sortiert gewesen
522        return \%res;
523    }    }
524    
525    # We separate exhaustive search here to avoid overhead and make the    # We separate exhaustive search here to avoid overhead and make the
526    # code more readable. The block can be removed without changing the    # code more readable. The block can be removed without changing the
527    # result.    # result.
528    unless ($wanted) {    unless ($top_wanted) {
529      for (@terms) {      for (@terms) {
530        my $df      = $self->{db}->{'o'.$_};        my $df      = $self->{db}->{DOCFREQ_O . $_};
531    
532        # The frequency *must* be 1 at least since the posting list is nonempty        # The frequency *must* be 1 at least since the posting list is nonempty
533        _complain('search for term', $_) and $df = 1 if $df < 1;        _complain('search for term', $_) and $df = 1 if $df < 1;
534    
535        # Unpack posting list for current query term $_        # Unpack posting list for current query term $_
536        my %post = unpack 'w*', $self->{db}->{'p'.$_};        my %post = unpack 'w*', $self->{db}->{POSTINGLIST_P . $_};
537    
538        _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;        _complain('search for term', $_) if $self->{db}->{DOCFREQ_O . $_} != keys %post;
539        # This is the inverse document frequency. The log of the inverse        # This is the inverse document frequency. The log of the inverse
540        # fraction of documents the term occurs in.        # fraction of documents the term occurs in.
541        my $idf = log($self->{records}/$df);        my $idf = log($self->{records}/$df);
542        for my $did (keys %post) {        for my $did (keys %post) {
543          if (my $freq = $self->{db}->{'m'. $did}) {          if (my $freq = $self->{db}->{MAXTF_M . $did}) {
544            $score{$did} += $post{$did} / $freq * $idf;            $score{$did} += $post{$did} / $freq * $idf;
545          }          }
546        }        }
547      }      }
548      # warn sprintf "Used %d accumulators\n", scalar keys %score;      # warn sprintf "Used %d accumulators\n", scalar keys %score;
549      return %score;      return \%score;
550    }    }
551    
552    # A sloppy but fast algorithm for multiple term queries.    # A sloppy but fast algorithm for multiple term queries.
553    unless ($strict) {    unless ($picky_strict) {
554      for (@terms) {      for (@terms) {
555        # Unpack posting list for current query term $_        # Unpack posting list for current query term $_
556        my %post = unpack 'w*', $self->{db}->{'p'.$_};        my %post;
557          if ($self->{reorg} && $top_wanted && $ignore_excess) {
558            %post = unpack 'w'. int(2*$ignore_excess) , $self->{db}->{POSTINGLIST_P . $_};
559          } else {
560            %post = unpack 'w*',                        $self->{db}->{POSTINGLIST_P . $_};
561          }
562          # warn sprintf "DEBUG WAIT: term[%s] keys %%post[%s]", $_, scalar keys %post;
563    
564        # Lookup the number of documents the term occurs in (document frequency)        # Lookup the number of documents the term occurs in (document frequency)
565        my $occ  = $self->{db}->{'o'.$_};        my $occ  = $self->{db}->{DOCFREQ_O . $_};
566    
567        _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;        _complain('search for term', $_) if !$ignore_excess && $occ != keys %post;
568        # The frequency *must* be 1 at least since the posting list is nonempty        # The frequency *must* be 1 at least since the posting list is nonempty
569        _complain('search for term', $_) and $occ = 1 if $occ < 1;        _complain('search for term', $_) and $occ = 1 if $occ < 1;
570    
571        # This is the inverse document frequency. The log of the inverse        # This is the inverse document frequency. The log of the inverse fraction
572        # fraction of documents the term occurs in.        # of documents the term occurs in.
573        my $idf = log($self->{records}/$occ);        my $idf = log($self->{records}/$occ);
574    
575        # If we have a reasonable number of accumulators, change the        # If we have a reasonable number of accumulators, change the
# Line 550  sub search_raw { Line 585  sub search_raw {
585        # improved.  The resulting ranking list must be pruned, since only        # improved.  The resulting ranking list must be pruned, since only
586        # the top most documents end up near their "optimal" rank.        # the top most documents end up near their "optimal" rank.
587                
588        if (keys %score < $wanted) {        if (keys %score < $top_wanted) {
589    
590            # Diese folgende Schleife ist (WAR!) der Hammer fuer die Suche "mysql
591            # für dummies bellomo". Sie frisst 3.1+1.7 Sekunden.
592    
593            # Der erste Grund ist, dass 3 Begriffe noch nicht genug gebracht haben,
594            # aber der vierte viel zu viel bringt. Der zweite Grund ist, dass wir
595            # so viele Lookups in $self->{db} machen. Das Rechnen hingegen ist
596            # vermutlich billig.
597    
598          for my $did (keys %post) {          for my $did (keys %post) {
599            if (my $freq = $self->{db}->{'m'. $did}) {            if (my $freq = $self->{db}->{MAXTF_M . $did}) {
600              $score{$did} += $post{$did} / $freq * $idf;              $score{$did} += $post{$did} / $freq * $idf;
601            }            }
602          }          }
603        } else {        } else {
604          for my $did (keys %score) {          for my $did (keys %score) {
605            next unless exists $post{$did};            next unless exists $post{$did};
606            if (my $freq = $self->{db}->{'m'. $did}) {            if (my $freq = $self->{db}->{MAXTF_M . $did}) {
607              $score{$did} += $post{$did} / $freq * $idf;              $score{$did} += $post{$did} / $freq * $idf;
608            }            }
609          }          }
610        }        }
611      }      }
612      return %score;      warn sprintf("DEBUG WAIT: returning from search_raw_ref at [%.3f] after terms[%s] with keys[%d]",
613                     time,
614                     join(":",@terms),
615                     scalar keys %score,
616                    );
617        return \%score;
618    }    }
619    my @max; $max[$#terms+1]=0;    my @max; $max[$#terms+1]=0;
620    my @idf;    my @idf;
# Line 579  sub search_raw { Line 628  sub search_raw {
628    for (my $i = $#terms; $i >=0; $i--) {    for (my $i = $#terms; $i >=0; $i--) {
629      local $_ = $terms[$i];      local $_ = $terms[$i];
630      # Lookup the number of documents the term occurs in (document frequency)      # Lookup the number of documents the term occurs in (document frequency)
631      my $df      = $self->{db}->{'o'.$_};      my $df      = $self->{db}->{DOCFREQ_O . $_};
632    
633      # The frequency *must* be 1 at least since the posting list is nonempty      # The frequency *must* be 1 at least since the posting list is nonempty
634      _complain('search for term', $_) and $df = 1 if $df < 1;      _complain('search for term', $_) and $df = 1 if $df < 1;
# Line 590  sub search_raw { Line 639  sub search_raw {
639    
640      my ($did,$occ);      my ($did,$occ);
641      if ($self->{reorg}) {      if ($self->{reorg}) {
642        ($did,$occ) = unpack 'w2', $self->{db}->{'p'.$_};        ($did,$occ) = unpack 'w2', $self->{db}->{POSTINGLIST_P . $_};
643      } else {                    # Maybe this costs more than it helps      } else {                    # Maybe this costs more than it helps
644        ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{'p'.$_});        ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{POSTINGLIST_P . $_});
645      }      }
646      my $freq      = $self->{db}->{'m'. $did};      my $freq      = $self->{db}->{MAXTF_M . $did};
647      my $max       = $occ/$freq*$idf[$i];      my $max       = $occ/$freq*$idf[$i];
648      $max[$i]      = $max + $max[$i+1];      $max[$i]      = $max + $max[$i+1];
649    }    }
650    
651    # Main loop    # Main loop
652    for my $i (0 .. $#terms) {    for my $i (0 .. $#terms) {
653      my $term = $terms[$i];      my $term = $terms[$i];
654      # Unpack posting list for current query term $term. We loose the      # Unpack posting list for current query term $term. We loose the
655      # sorting order because the assignment to a hash.      # sorting order because the assignment to a hash.
656      my %post = unpack 'w*', $self->{db}->{'p'.$term};      my %post = unpack 'w*', $self->{db}->{POSTINGLIST_P . $term};
657    
658      _complain('search for term', $term)      _complain('search for term', $term)
659        if $self->{db}->{'o'.$term} != keys %post;        if $self->{db}->{DOCFREQ_O . $term} != keys %post;
660    
661      my $idf  = $idf[$i];      my $idf  = $idf[$i];
662      my $full;                   # Need to process all postings      my $full;                   # Need to process all postings
# Line 616  sub search_raw { Line 665  sub search_raw {
665      if (# We know that wanted is true since we special cased the      if (# We know that wanted is true since we special cased the
666          # exhaustive search.          # exhaustive search.
667    
668          $wanted and          $top_wanted and
669    
670          # We did sort here if necessary in          # We did sort here if necessary in the preparation loop:
         # the preparation loop  
671          # $self->{reorg} and          # $self->{reorg} and
672    
673          scalar keys %score > $wanted) {          scalar keys %score > $top_wanted) {
674        $chop = (sort { $b <=> $a } values %score)[$wanted];        $chop = (sort { $b <=> $a } values %score)[$top_wanted];
675        $full = $max[$i] > $chop;        $full = $max[$i] > $chop;
676      } else {      } else {
677        $full = 1;        $full = 1;
678      }      }
679    
680      if ($full) {      if ($full) {
681        # We need to inspect the full list. Either $wanted is not given,        # We need to inspect the full list. Either $top_wanted is not given,
682        # the index is not sorted, or we don't have enough accumulators        # the index is not sorted, or we don't have enough accumulators
683        # yet.        # yet.
684        if (defined $chop) {        if (defined $chop) {
685          # We might be able to avoid allocating accumulators          # We might be able to avoid allocating accumulators
686          for my $did (keys %post) {          for my $did (keys %post) {
687            if (my $freq = $self->{db}->{'m'. $did}) {            if (my $freq = $self->{db}->{MAXTF_M . $did}) {
688              my $wgt = $post{$did} / $freq * $idf;              my $wgt = $post{$did} / $freq * $idf;
689              # We add an accumulator if $wgt exeeds $chop              # We add an accumulator if $wgt exeeds $chop
690              if (exists $score{$did} or $wgt > $chop) {              if (exists $score{$did} or $wgt > $chop) {
# Line 647  sub search_raw { Line 695  sub search_raw {
695        } else {        } else {
696          # Allocate acumulators for each seen document.          # Allocate acumulators for each seen document.
697          for my $did (keys %post) {          for my $did (keys %post) {
698            if (my $freq = $self->{db}->{'m'. $did}) {            if (my $freq = $self->{db}->{MAXTF_M . $did}) {
699              $score{$did} += $post{$did} / $freq * $idf;              $score{$did} += $post{$did} / $freq * $idf;
700            }            }
701          }          }
# Line 656  sub search_raw { Line 704  sub search_raw {
704        # Update existing accumulators        # Update existing accumulators
705        for my $did (keys %score) {        for my $did (keys %score) {
706          next unless exists $post{$did};          next unless exists $post{$did};
707          if (my $freq = $self->{db}->{'m'. $did}) {          if (my $freq = $self->{db}->{MAXTF_M . $did}) {
708            $score{$did} += $post{$did} / $freq * $idf;            $score{$did} += $post{$did} / $freq * $idf;
709          }          }
710        }        }
711      }      }
712    }    }
713    #warn sprintf "Used %d accumulators\n", scalar keys %score;    #warn sprintf "Used %d accumulators\n", scalar keys %score;
714    %score;    \%score;
715  }  }
716    
717  sub set {  sub set {
# Line 679  sub set { Line 727  sub set {
727    
728    $self->sync;    $self->sync;
729    while (my($key, $value) = each %{$self->{db}}) {    while (my($key, $value) = each %{$self->{db}}) {
730      next if $key !~ /^p/;      next if $key !~ /^p/; # some day use PMATCH
731      $self->{db}->{$key} = $self->sort_postings($value);      $self->{db}{$key} = $self->sort_postings($value);
732    }    }
733    $self->{reorg} = 1;    $self->{reorg} = 1;
734  }  }
735    
736  sub sync {  sub sync {
737    my $self = shift;    my $self = shift;
738      return unless $self->{mode} & O_RDWR;
739    if ($self->{mode} & O_RDWR) {    Carp::carp(sprintf "[%s] Flushing %d postings", scalar(localtime), $self->{cached})
740      print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};          if $self->{cached};
741      while (my($key, $value) = each %{$self->{cache}}) {    while (my($key, $value) = each %{$self->{cache}}) {
742        $self->{db}->{"p". $key} ||= "";      $self->{db}{POSTINGLIST_P . $key} ||= "";
743        if ($self->{reorg}) {      if ($self->{reorg}) {
744          $self->{db}->{'p'.$key} = $self->sort_postings($self->{db}->{'p'.$key}        $self->{db}->{POSTINGLIST_P . $key} =
745                                                     . $value);            $self->sort_postings($self->{db}->{POSTINGLIST_P . $key}
746        } else {                                 . $value);
747          $self->{db}->{'p'.$key} .= $value;      } else {
748        }        $self->{db}->{POSTINGLIST_P . $key} .= $value;
     }  
     while (my($key, $value) = each %{$self->{cdict}}) {  
       $self->{db}->{'o'.$key} = 0 unless  $self->{db}->{'o'.$key};  
       $self->{db}->{'o'.$key} += $value;  
749      }      }
     $self->{cache}  = {};  
     $self->{cdict}  = {};  
     $self->{cached} = 0;  
750    }    }
751      while (my($key, $value) = each %{$self->{cdict}}) {
752        $self->{db}->{DOCFREQ_O . $key} = 0 unless  $self->{db}->{DOCFREQ_O . $key};
753        $self->{db}->{DOCFREQ_O . $key} += $value;
754      }
755      $self->{cache}  = {};
756      $self->{cdict}  = {};
757      $self->{cached} = 0;
758  }  }
759    
760  sub close {  sub close {
761    my $self = shift;    my $self = shift;
762    
763      delete $self->{env};
764    if ($self->{dbh}) {    if ($self->{dbh}) {
765      $self->sync;      $self->sync;
766      delete $self->{dbh};      delete $self->{dbh};
767      untie %{$self->{db}};      untie %{$self->{db}};
768      delete $self->{db};      for my $att (qw(db func cache cached cdict file maindbfile)) {
769      delete $self->{func};        delete $self->{$att};
770      delete $self->{cache};      }
771      delete $self->{cached};      for my $att (qw(pfunc ifunc xfunc)) {
772      delete $self->{cdict};        delete $self->{$att} if defined $self->{$att};
773      delete $self->{pfunc} if defined $self->{pfunc};      }
     delete $self->{ifunc} if defined $self->{ifunc};  
     delete $self->{xfunc} if defined $self->{xfunc};  
774    }    }
775  }  }
776    

Legend:
Removed from v.107  
changed lines
  Added in v.108

  ViewVC Help
Powered by ViewVC 1.1.26