/[wait]/trunk/lib/WAIT/InvertedIndex.pm

This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!

Diff of /trunk/lib/WAIT/InvertedIndex.pm

Parent Directory | Revision Log | View Patch Patch

-revision 107 by dpavlin,
Mon May 24 20:57:08 2004 UTC
+revision 108 by dpavlin,
Tue Jul 13 17:41:12 2004 UTC
 Line 1
- #                              -*- Mode: Perl -*-
+ # -*- Mode: cperl; fill-column: 79 -*-
  # $Basename: InvertedIndex.pm $
  # $Revision: 1.30 $
  # Author          : Ulrich Pfeifer
 Line 12
  package WAIT::InvertedIndex;
  use strict;
- use DB_File;
+ use BerkeleyDB;
  use Fcntl;
  use WAIT::Filter;
  use Carp;
  use vars qw(%FUNC $VERSION);
+ use Time::HiRes qw(time);
- $VERSION = "1.900"; # others test if we are loaded by checking $VERSION
+ $VERSION = "2.000"; # others test if we are loaded by checking $VERSION
+ use constant DOCFREQ_O     => "o";
+ use constant MAXTF_M       => "m";
+ use constant POSTINGLIST_P => "p";
+ use constant PMATCH        => qr/^(??{POSTINGLIST_P()})/;
  # The dictionary has three different key types:
  #  'o'.$word
  #
  #     The document frequency is the number of documents a term occurs
-Line 40 
 $VERSION = "1.900"; # others test if we
+Line 47 
 $VERSION = "1.900"; # others test if we
  #     Under this key we store the actual posting list as pairs of
  #     packed integers.
- my $no_old_index_support = 0; # do not check for old indices if set
  sub new {
    my $type = shift;
    my %parm = @_;
    my $self = {};
-   $self->{file}     = $parm{file}     or croak "No file specified";
+   for my $x (qw(file attr subname env maindbfile tablename)) {
-   $self->{attr}     = $parm{attr}     or croak "No attributes specified";
+     $self->{$x}     = $parm{$x}     or croak "No $x specified";
+   }
    $self->{filter}   = $parm{filter};
    $self->{'name'}   = $parm{'name'};
    $self->{records}  = 0;
-Line 64 
 sub new {
+Line 71 
 sub new {
    bless $self, ref($type) || $type;
  }
- sub name {$_[0]->{'name'}}
+ for my $accessor (qw(name maindbfile tablename subname)) {
+   no strict 'refs';
+   *{$accessor} = sub {
+     my($self) = @_;
+     return $self->{$accessor} if $self->{$accessor};
+     require Carp;
+     Carp::confess("accessor $accessor not there");
+   }
+ }
  sub _split_pos {
    my ($text, $pos) = @{$_[0]};
-Line 144 
 sub drop {
+Line 159 
 sub drop {
    }
  }
- sub is_an_old_index {
-   my $self = shift;
-   return 0 if $no_old_index_support;
-   return $self->{old_index} if exists $self->{old_index};
-   # We can only guess if this is an old index. We lookup the first 10
-   # $O entries. If all values are integers, we assume that the index
-   # is an old one.
-   defined $self->{db} or $self->open;
-   $self->sync;
-   my $dbh = $self->{dbh} or return $self->{old_index} = 0;       # for convenience
-   my $O = pack('C', 0xff)."o";
-   my ($word, $value) = ($O.$;);  # $word and $value are modified by seq!
-   if ( my $ret = $dbh->seq($word, $value, R_CURSOR) ) {
-     # warn "DEBUG: ret[$ret], not an old index, either empty or no \$^O";
-     return $self->{old_index} = 0;
-   }
-   for (my $i=0; $i<10;$i++) {
-     if ($value !~ /^\d+$/) {
-       # warn "DEBUG: word[$word]value[$value], not an old index";
-       return $self->{old_index} = 0;
-     }
-     if (my $ret = $dbh->seq($word, $value, R_NEXT) or # no values left
-         $word !~ /^$O$;/o                   # no $O values left
-        ) {
-       # we are not sure enough that this is an old index
-       # warn "DEBUG: ret[$ret]word[$word]value[$value], not an old index";
-       return $self->{old_index} = 0;
-     }
-   }
-   # warn "DEBUG: old index";
-   return $self->{old_index} = 1;
- }
  sub open {
    my $self = shift;
    my $file = $self->{file};
-Line 190 
 sub open {
+Line 168 
 sub open {
    } else {
      $self->{func}     =
        eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
-     $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,
+     my $flags;
-                        $self->{mode}, 0664, $DB_BTREE);
+     if ($self->{mode} & O_RDWR) {
+       $flags = DB_CREATE; # | DB_INIT_MPOOL | DB_PRIVATE | DB_INIT_CDB;
+       warn "Flags on inverted $file set to 'writing'";
+     } else {
+       $flags = DB_RDONLY;
+       # warn "Flags on inverted $file set to 'readonly'";
+     }
+     my $filename = $self->maindbfile or die;
+     my $subname  = join("/",$self->tablename || die,$self->subname || die);
+     my $env = $self->{env} || "[undef]";
+     $self->{dbh} = tie(%{$self->{db}}, 'BerkeleyDB::Btree',
+                        # Filename => $file,
+                        Filename => $filename,
+                        $self->{env} ? (Env  => $self->{env}) : (),
+                        Subname => $subname,
+                        Mode => 0664,
+                        Flags => $flags,
+                        $WAIT::Database::Cachesize?(Cachesize => $WAIT::Database::Cachesize):(),
+                        $WAIT::Database::Pagesize?(Pagesize => $WAIT::Database::Pagesize):(),
+                        ) or die "Couldn't tie: $BerkeleyDB::Error; filename=>'$filename', env=>'$env',subname=>'$subname',flags=>'$flags'";
      $self->{cache} = {}
        if $self->{mode} & O_RDWR;
      $self->{cdict} = {}
        if $self->{mode} & O_RDWR;
      $self->{cached} = 0;
-     if (!$no_old_index_support and $self->is_an_old_index()) {
-       warn "This is an old index, upgrade you database";
-       require WAIT::InvertedIndexOld;
-       bless $self, 'WAIT::InvertedIndexOld';
-     }
    }
  }
-Line 211 
 sub insert {
+Line 203 
 sub insert {
    my %occ;
    defined $self->{db} or $self->open;
+   defined $self->{db} or die "open didn't help!!!";
    grep $occ{$_}++, &{$self->{func}}(@_);
    my ($word, $noc);
    $self->{records}++;
-Line 218 
 sub insert {
+Line 211 
 sub insert {
      if (defined $self->{cache}->{$word}) {
        $self->{cdict}->{$word}++;
        $self->{cache}->{$word} .= pack 'w2', $key, $noc;
      } else {
        $self->{cdict}->{$word} = 1;
        $self->{cache}->{$word}  = pack 'w2', $key, $noc;
      }
-Line 230 
 sub insert {
+Line 223 
 sub insert {
    for (values %occ) {
      $maxtf = $_ if $_ > $maxtf;
    }
-   $self->{db}->{'m'. $key} = $maxtf;
+   $self->{db}->{MAXTF_M . $key} = $maxtf;
  }
  # We sort postings by increasing max term frequency (~ by increasing
-Line 256 
 sub sort_postings {
+Line 249 
 sub sort_postings {
    # order can be exploited for tuning of single term queries.
    for my $did (keys %$post) { # sanity check
-     unless ($self->{db}->{"m". $did}) {
+     unless ($self->{db}->{MAXTF_M . $did}) {
-       warn "Warning from WAIT: DIVZERO threat from did[$did] post[$post->{$did}]";
+       warn "WAIT Warning: DIVZERO threat from did[$did]post[$post]post{did}[$post->{$did}]";
-       $self->{db}->{"m". $did} = 1; # fails if we have not opened for writing
+       $self->{db}->{MAXTF_M . $did} = 1; # fails if we have not opened for writing
      }
    }
-   for my $did (sort {    $post->{$b} / $self->{db}->{'m'. $b}
+   for my $did (sort {    $post->{$b} / $self->{db}->{MAXTF_M . $b}
                                        <=>
-                          $post->{$a} / $self->{db}->{'m'. $a}
+                          $post->{$a} / $self->{db}->{MAXTF_M . $a}
                      } keys %$post) {
      $r .= pack 'w2', $did, $post->{$did};
    }
-Line 295 
 sub delete {
+Line 288 
 sub delete {
      warn "Catching warning[$warning] during delete of key[$key]";
    };
    for (keys %occ) {# may reorder posting list
-     my %post = unpack 'w*', $db->{'p'.$_};
+     my %post = unpack 'w*', $db->{POSTINGLIST_P . $_};
      delete $post{$key};
-     $db->{'p'.$_}    = $self->sort_postings(\%post);
+     $db->{POSTINGLIST_P . $_}    = $self->sort_postings(\%post);
-     _complain('delete of term', $_) if $db->{'o'.$_}-1 != keys %post;
+     _complain('delete of term', $_) if $db->{DOCFREQ_O . $_}-1 != keys %post;
-     $db->{'o'.$_} = scalar keys %post;
+     $db->{DOCFREQ_O . $_} = scalar keys %post;
    }
-   delete $db->{'m'. $key};
+   delete $db->{MAXTF_M . $key};
  }
  sub intervall {
    my ($self, $first, $last) = @_;
-   my $value = '';
-   my $word  = '';
-   my @result;
-   return unless exists $self->{'intervall'};
-   defined $self->{db} or $self->open;
+   die "intervall broken in this version of WAIT: need to fix the
-   $self->sync;
+   R_CURSOR and R_NEXT lines";
-   my $dbh = $self->{dbh};       # for convenience
-   if (ref $self->{'intervall'}) {
+ ####      my $value = '';
-     unless (exists $self->{'ifunc'}) {
+ ####      my $word  = '';
-       $self->{'ifunc'} =
+ ####      my @result;
-         eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}}));
+ ####
-     }
+ ####      return unless exists $self->{'intervall'};
-     ($first) = &{$self->{'ifunc'}}($first) if $first;
+ ####
-     ($last)  = &{$self->{'ifunc'}}($last) if $last;
+ ####      defined $self->{db} or $self->open;
-   }
+ ####      $self->sync;
-   $first = 'p'.($first||'');
+ ####      my $dbh = $self->{dbh};       # for convenience
-   $last  = (defined $last)?'p'.$last:'q';
+ ####
+ ####      if (ref $self->{'intervall'}) {
-   # set the cursor to $first
+ ####        unless (exists $self->{'ifunc'}) {
-   $dbh->seq($first, $value, R_CURSOR);
+ ####          $self->{'ifunc'} =
+ ####            eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}}));
-   # $first would be after the last word
+ ####        }
-   return () if $first gt $last;
+ ####        ($first) = &{$self->{'ifunc'}}($first) if $first;
+ ####        ($last)  = &{$self->{'ifunc'}}($last) if $last;
-   push @result, substr($first,1);
+ ####      }
-   while (!$dbh->seq($word, $value, R_NEXT)) {
+ ####      $first = POSTINGLIST_P . ($first||'');
-     # We should limit this to a "resonable" number of words
+ ####      $last  = (defined $last)?POSTINGLIST_P . $last:'q';
-     last if $word gt $last;
+ ####
-     push @result, substr($word,1);
+ ####      # set the cursor to $first
-   }
+ ####      $dbh->seq($first, $value, R_CURSOR);
-   \@result;                     # speed
+ ####
+ ####      # $first would be after the last word
+ ####      return () if $first gt $last;
+ ####
+ ####      push @result, substr($first,1);
+ ####      while (!$dbh->seq($word, $value, R_NEXT)) {
+ ####        # We should limit this to a "resonable" number of words
+ ####        last if $word gt $last;
+ ####        push @result, substr($word,1);
+ ####      }
+ ####      \@result;                     # speed
  }
  sub prefix {
    my ($self, $prefix) = @_;
-   my $value = '';
-   my $word  = '';
-   my @result;
-   return () unless defined $prefix; # Full dictionary requested !!
+   die "prefix not supported in this version of WAIT: need to fix the R_CURSOR";
-   return unless exists $self->{'prefix'};
-   defined $self->{db} or $self->open;
-   $self->sync;
-   my $dbh = $self->{dbh};
-   if (ref $self->{'prefix'}) {
-     unless (exists $self->{'pfunc'}) {
-       $self->{'pfunc'} =
-         eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}}));
-     }
-     ($prefix) = &{$self->{'pfunc'}}($prefix);
-   }
-   if ($dbh->seq($word = 'p'.$prefix, $value, R_CURSOR)) {
-     return ();
-   }
-   return () if $word !~ /^p$prefix/;
-   push @result, substr($word,1);
-   while (!$dbh->seq($word, $value, R_NEXT)) {
+ ####      my $value = '';
-     # We should limit this to a "resonable" number of words
+ ####      my $word  = '';
-     last if $word !~ /^p$prefix/;
+ ####      my @result;
-     push @result, substr($word,1);
+ ####
-   }
+ ####      return () unless defined $prefix; # Full dictionary requested !!
-   \@result;                     # speed
+ ####      return unless exists $self->{'prefix'};
+ ####      defined $self->{db} or $self->open;
+ ####      $self->sync;
+ ####      my $dbh = $self->{dbh};
+ ####
+ ####      if (ref $self->{'prefix'}) {
+ ####        unless (exists $self->{'pfunc'}) {
+ ####          $self->{'pfunc'} =
+ ####            eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}}));
+ ####        }
+ ####        ($prefix) = &{$self->{'pfunc'}}($prefix);
+ ####      }
+ ####
+ ####      if ($dbh->seq($word = POSTINGLIST_P . $prefix, $value, R_CURSOR)) {
+ ####        return ();
+ ####      }
+ ####      return () if $word !~ /^p$prefix/;
+ ####      push @result, substr($word,1);
+ ####
+ ####      while (!$dbh->seq($word, $value, R_NEXT)) {
+ ####        # We should limit this to a "resonable" number of words
+ ####        last if $word !~ /^p$prefix/;
+ ####        push @result, substr($word,1);
+ ####      }
+ ####      \@result;                     # speed
  }
  =head2 search($query)
-Line 405 
 in the size of the lists.
+Line 406 
 in the size of the lists.
  =cut
- sub search {
+ sub search_ref {
    my $self  = shift;
    my $query = shift;
+   my $debugtime = 0;
+   my($time,$entertime);
+   our $STARTTIME;
+   if ($debugtime) {
+     $time = time;
+     $STARTTIME ||= $time;
+     if ($time-$STARTTIME > 5) {
+       $STARTTIME = $time;
+       warn "STARTTIME: $STARTTIME\n";
+     }
+     $entertime = time-$STARTTIME;
+     warn sprintf "ENTER TIME: %.4f\n", $entertime;
+   }
    defined $self->{db} or $self->open;
    $self->sync;
-   $self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() there
+   my $ref = $self->search_raw_ref($query, &{$self->{func}}(@_)); # No call to parse() there
+   if ($debugtime) {
+     my $leavetime = time-$STARTTIME;
+     warn sprintf "LEAVE TIME: %.4f\n", $leavetime;
+     if ($leavetime-$entertime > .4) {
+       require Data::Dumper;
+       print STDERR "Line " . __LINE__ . ", File: " . __FILE__ . "\n" .
+           Data::Dumper->new([$query,\@_],[qw(query at_)])->Indent(1)->Useqq(1)->Dump; # XXX
+     }
+   }
+   $ref;
  }
  sub parse {
-Line 426 
 sub search_prefix {
+Line 450 
 sub search_prefix {
    # print "search_prefix(@_)\n";
    defined $self->{db} or $self->open;
-   $self->search_raw(map($self->prefix($_), @_));
+   $self->search_raw_ref(map($self->prefix($_), @_));
  }
  sub _complain ($$) {
-Line 439 
 sub _complain ($$) {
+Line 463 
 sub _complain ($$) {
               $term,));
  }
- sub search_raw {
+ sub search_raw_ref {
    my $self  = shift;
    my $query = shift;
+   # warn "DEBUG WAIT: search_raw_ref args 2..[@_]";
    my %score;
-   # Top $wanted documents must be correct. Zero means all matching
+   # Top $top_wanted documents must be correct. Zero means all matching documents.
-   # documents.
+   my $top_wanted = $query->{top};
-   my $wanted = $query->{top};
+   my $picky_strict = $query->{picky};
-   my $strict = $query->{picky};
+   # the option is really ignore_excess
+   my $ignore_excess = $query->{ignore_excess};
-   # Return at least $minacc documents. Zero means all matching
-   # documents.
+   # Return at least $minacc documents. Zero means all matching documents.
-   # my $minacc = $query->{accus} || $wanted;
+   # my $minacc = $query->{accus} || $top_wanted;
    # Open index and flush cache if necessary
    defined $self->{db} or $self->open;
-Line 460 
 sub search_raw {
+Line 486 
 sub search_raw {
    # We keep duplicates
    my @terms =
      # Sort words by decreasing document frequency
-     sort { $self->{db}->{'o'.$a} <=> $self->{db}->{'o'.$b} }
+     sort { $self->{db}->{DOCFREQ_O . $a} <=> $self->{db}->{DOCFREQ_O . $b} }
        # check which words occur in the index.
-       grep { $self->{db}->{'o'.$_} } @_;
+       grep { $self->{db}->{DOCFREQ_O . $_} } @_;
+   # warn "DEBUG WAIT: wanted[$top_wanted]terms[@terms]";
    return unless @terms;
    # We special-case one term queries here.  If the index was sorted,
    # choping off the rest of the list will return the same ranking.
-   if ($wanted and @terms == 1) {
+   if ($top_wanted and @terms == 1) {
      my $term  = shift @terms;
-     my $idf   = log($self->{records}/$self->{db}->{'o'.$term});
+     my $idf   = log($self->{records}/$self->{db}->{DOCFREQ_O . $term});
      my @res;
      if ($self->{reorg}) { # or not $query->{picky}
-       @res = unpack "w". int(2*$wanted), $self->{db}->{'p'.$term};
+       @res = unpack "w". int(2*$top_wanted), $self->{db}->{POSTINGLIST_P . $term};
+       # warn sprintf "DEBUG WAIT: scalar(\@res)[%d]", scalar(@res);
      } else {
-       @res = unpack 'w*',                $self->{db}->{'p'.$term};
+       @res = unpack 'w*',                $self->{db}->{POSTINGLIST_P . $term};
      }
      for (my $i=1; $i<@res; $i+=2) {
-       # $res[$i] /= $self->{db}->{'m'. $res[$i-1]} / $idf;
+       # $res[$i] /= $self->{db}->{MAXTF_M . $res[$i-1]} / $idf;
        # above was written badly, allows two DIV_ZERO problems.
-       my $maxtf = $self->{db}->{"m". $res[$i-1]};
+       my $maxtf = $self->{db}->{MAXTF_M . $res[$i-1]};
        unless ($maxtf) {
          warn "WAIT-Warning: Averting DIVZERO for i[$i] \$res[\$i-1][$res[$i-1]] term[$term]";
          $maxtf = 1;
-Line 490 
 sub search_raw {
+Line 518 
 sub search_raw {
        $res[$i] = ($res[$i] / $maxtf) * $idf;
      }
-     return @res
+     my %res = @res; # bloed: @res waere schon sortiert gewesen
+     return \%res;
    }
    # We separate exhaustive search here to avoid overhead and make the
    # code more readable. The block can be removed without changing the
    # result.
-   unless ($wanted) {
+   unless ($top_wanted) {
      for (@terms) {
-       my $df      = $self->{db}->{'o'.$_};
+       my $df      = $self->{db}->{DOCFREQ_O . $_};
        # The frequency *must* be 1 at least since the posting list is nonempty
        _complain('search for term', $_) and $df = 1 if $df < 1;
        # Unpack posting list for current query term $_
-       my %post = unpack 'w*', $self->{db}->{'p'.$_};
+       my %post = unpack 'w*', $self->{db}->{POSTINGLIST_P . $_};
-       _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
+       _complain('search for term', $_) if $self->{db}->{DOCFREQ_O . $_} != keys %post;
        # This is the inverse document frequency. The log of the inverse
        # fraction of documents the term occurs in.
        my $idf = log($self->{records}/$df);
        for my $did (keys %post) {
-         if (my $freq = $self->{db}->{'m'. $did}) {
+         if (my $freq = $self->{db}->{MAXTF_M . $did}) {
            $score{$did} += $post{$did} / $freq * $idf;
          }
        }
      }
      # warn sprintf "Used %d accumulators\n", scalar keys %score;
-     return %score;
+     return \%score;
    }
    # A sloppy but fast algorithm for multiple term queries.
-   unless ($strict) {
+   unless ($picky_strict) {
      for (@terms) {
        # Unpack posting list for current query term $_
-       my %post = unpack 'w*', $self->{db}->{'p'.$_};
+       my %post;
+       if ($self->{reorg} && $top_wanted && $ignore_excess) {
+         %post = unpack 'w'. int(2*$ignore_excess) , $self->{db}->{POSTINGLIST_P . $_};
+       } else {
+         %post = unpack 'w*',                        $self->{db}->{POSTINGLIST_P . $_};
+       }
+       # warn sprintf "DEBUG WAIT: term[%s] keys %%post[%s]", $_, scalar keys %post;
        # Lookup the number of documents the term occurs in (document frequency)
-       my $occ  = $self->{db}->{'o'.$_};
+       my $occ  = $self->{db}->{DOCFREQ_O . $_};
-       _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
+       _complain('search for term', $_) if !$ignore_excess && $occ != keys %post;
        # The frequency *must* be 1 at least since the posting list is nonempty
        _complain('search for term', $_) and $occ = 1 if $occ < 1;
-       # This is the inverse document frequency. The log of the inverse
+       # This is the inverse document frequency. The log of the inverse fraction
-       # fraction of documents the term occurs in.
+       # of documents the term occurs in.
        my $idf = log($self->{records}/$occ);
        # If we have a reasonable number of accumulators, change the
-Line 550 
 sub search_raw {
+Line 585 
 sub search_raw {
        # improved.  The resulting ranking list must be pruned, since only
        # the top most documents end up near their "optimal" rank.
-       if (keys %score < $wanted) {
+       if (keys %score < $top_wanted) {
+         # Diese folgende Schleife ist (WAR!) der Hammer fuer die Suche "mysql
+         # für dummies bellomo". Sie frisst 3.1+1.7 Sekunden.
+         # Der erste Grund ist, dass 3 Begriffe noch nicht genug gebracht haben,
+         # aber der vierte viel zu viel bringt. Der zweite Grund ist, dass wir
+         # so viele Lookups in $self->{db} machen. Das Rechnen hingegen ist
+         # vermutlich billig.
          for my $did (keys %post) {
-           if (my $freq = $self->{db}->{'m'. $did}) {
+           if (my $freq = $self->{db}->{MAXTF_M . $did}) {
              $score{$did} += $post{$did} / $freq * $idf;
            }
          }
        } else {
          for my $did (keys %score) {
            next unless exists $post{$did};
-           if (my $freq = $self->{db}->{'m'. $did}) {
+           if (my $freq = $self->{db}->{MAXTF_M . $did}) {
              $score{$did} += $post{$did} / $freq * $idf;
            }
          }
        }
      }
-     return %score;
+     warn sprintf("DEBUG WAIT: returning from search_raw_ref at [%.3f] after terms[%s] with keys[%d]",
+                  time,
+                  join(":",@terms),
+                  scalar keys %score,
+                 );
+     return \%score;
    }
    my @max; $max[$#terms+1]=0;
    my @idf;
-Line 579 
 sub search_raw {
+Line 628 
 sub search_raw {
    for (my $i = $#terms; $i >=0; $i--) {
      local $_ = $terms[$i];
      # Lookup the number of documents the term occurs in (document frequency)
-     my $df      = $self->{db}->{'o'.$_};
+     my $df      = $self->{db}->{DOCFREQ_O . $_};
      # The frequency *must* be 1 at least since the posting list is nonempty
      _complain('search for term', $_) and $df = 1 if $df < 1;
-Line 590 
 sub search_raw {
+Line 639 
 sub search_raw {
      my ($did,$occ);
      if ($self->{reorg}) {
-       ($did,$occ) = unpack 'w2', $self->{db}->{'p'.$_};
+       ($did,$occ) = unpack 'w2', $self->{db}->{POSTINGLIST_P . $_};
      } else {                    # Maybe this costs more than it helps
-       ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{'p'.$_});
+       ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{POSTINGLIST_P . $_});
      }
-     my $freq      = $self->{db}->{'m'. $did};
+     my $freq      = $self->{db}->{MAXTF_M . $did};
      my $max       = $occ/$freq*$idf[$i];
      $max[$i]      = $max + $max[$i+1];
    }
    # Main loop
    for my $i (0 .. $#terms) {
      my $term = $terms[$i];
      # Unpack posting list for current query term $term. We loose the
      # sorting order because the assignment to a hash.
-     my %post = unpack 'w*', $self->{db}->{'p'.$term};
+     my %post = unpack 'w*', $self->{db}->{POSTINGLIST_P . $term};
      _complain('search for term', $term)
-       if $self->{db}->{'o'.$term} != keys %post;
+       if $self->{db}->{DOCFREQ_O . $term} != keys %post;
      my $idf  = $idf[$i];
      my $full;                   # Need to process all postings
-Line 616 
 sub search_raw {
+Line 665 
 sub search_raw {
      if (# We know that wanted is true since we special cased the
          # exhaustive search.
-         $wanted and
+         $top_wanted and
-         # We did sort here if necessary in
+         # We did sort here if necessary in the preparation loop:
-         # the preparation loop
          # $self->{reorg} and
-         scalar keys %score > $wanted) {
+         scalar keys %score > $top_wanted) {
-       $chop = (sort { $b <=> $a } values %score)[$wanted];
+       $chop = (sort { $b <=> $a } values %score)[$top_wanted];
        $full = $max[$i] > $chop;
      } else {
        $full = 1;
      }
      if ($full) {
-       # We need to inspect the full list. Either $wanted is not given,
+       # We need to inspect the full list. Either $top_wanted is not given,
        # the index is not sorted, or we don't have enough accumulators
        # yet.
        if (defined $chop) {
          # We might be able to avoid allocating accumulators
          for my $did (keys %post) {
-           if (my $freq = $self->{db}->{'m'. $did}) {
+           if (my $freq = $self->{db}->{MAXTF_M . $did}) {
              my $wgt = $post{$did} / $freq * $idf;
              # We add an accumulator if $wgt exeeds $chop
              if (exists $score{$did} or $wgt > $chop) {
-Line 647 
 sub search_raw {
+Line 695 
 sub search_raw {
        } else {
          # Allocate acumulators for each seen document.
          for my $did (keys %post) {
-           if (my $freq = $self->{db}->{'m'. $did}) {
+           if (my $freq = $self->{db}->{MAXTF_M . $did}) {
              $score{$did} += $post{$did} / $freq * $idf;
            }
          }
-Line 656 
 sub search_raw {
+Line 704 
 sub search_raw {
        # Update existing accumulators
        for my $did (keys %score) {
          next unless exists $post{$did};
-         if (my $freq = $self->{db}->{'m'. $did}) {
+         if (my $freq = $self->{db}->{MAXTF_M . $did}) {
            $score{$did} += $post{$did} / $freq * $idf;
          }
        }
      }
    }
    #warn sprintf "Used %d accumulators\n", scalar keys %score;
-   %score;
+   \%score;
  }
  sub set {
-Line 679 
 sub set {
+Line 727 
 sub set {
    $self->sync;
    while (my($key, $value) = each %{$self->{db}}) {
-     next if $key !~ /^p/;
+     next if $key !~ /^p/; # some day use PMATCH
-     $self->{db}->{$key} = $self->sort_postings($value);
+     $self->{db}{$key} = $self->sort_postings($value);
    }
    $self->{reorg} = 1;
  }
  sub sync {
    my $self = shift;
+   return unless $self->{mode} & O_RDWR;
-   if ($self->{mode} & O_RDWR) {
+   Carp::carp(sprintf "[%s] Flushing %d postings", scalar(localtime), $self->{cached})
-     print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};
+         if $self->{cached};
-     while (my($key, $value) = each %{$self->{cache}}) {
+   while (my($key, $value) = each %{$self->{cache}}) {
-       $self->{db}->{"p". $key} ||= "";
+     $self->{db}{POSTINGLIST_P . $key} ||= "";
-       if ($self->{reorg}) {
+     if ($self->{reorg}) {
-         $self->{db}->{'p'.$key} = $self->sort_postings($self->{db}->{'p'.$key}
+       $self->{db}->{POSTINGLIST_P . $key} =
-                                                    . $value);
+           $self->sort_postings($self->{db}->{POSTINGLIST_P . $key}
-       } else {
+                                . $value);
-         $self->{db}->{'p'.$key} .= $value;
+     } else {
-       }
+       $self->{db}->{POSTINGLIST_P . $key} .= $value;
-     }
-     while (my($key, $value) = each %{$self->{cdict}}) {
-       $self->{db}->{'o'.$key} = 0 unless  $self->{db}->{'o'.$key};
-       $self->{db}->{'o'.$key} += $value;
      }
-     $self->{cache}  = {};
-     $self->{cdict}  = {};
-     $self->{cached} = 0;
    }
+   while (my($key, $value) = each %{$self->{cdict}}) {
+     $self->{db}->{DOCFREQ_O . $key} = 0 unless  $self->{db}->{DOCFREQ_O . $key};
+     $self->{db}->{DOCFREQ_O . $key} += $value;
+   }
+   $self->{cache}  = {};
+   $self->{cdict}  = {};
+   $self->{cached} = 0;
  }
  sub close {
    my $self = shift;
+   delete $self->{env};
    if ($self->{dbh}) {
      $self->sync;
      delete $self->{dbh};
      untie %{$self->{db}};
-     delete $self->{db};
+     for my $att (qw(db func cache cached cdict file maindbfile)) {
-     delete $self->{func};
+       delete $self->{$att};
-     delete $self->{cache};
+     }
-     delete $self->{cached};
+     for my $att (qw(pfunc ifunc xfunc)) {
-     delete $self->{cdict};
+       delete $self->{$att} if defined $self->{$att};
-     delete $self->{pfunc} if defined $self->{pfunc};
+     }
-     delete $self->{ifunc} if defined $self->{ifunc};
-     delete $self->{xfunc} if defined $self->{xfunc};
    }
  }

 Legend:



Removed from v.107
 


changed lines


 
Added in v.108
 Legend:



Removed from v.107
 


changed lines


 
Added in v.108
-Removed from v.107
+Added in v.108

	ViewVC Help
Powered by ViewVC 1.1.26