--- cvs-head/lib/WAIT/InvertedIndex.pm	2004/05/24 13:00:15	21
+++ cvs-head/lib/WAIT/InvertedIndex.pm	2000/11/11 16:58:53	22
@@ -4,7 +4,7 @@
 # Author          : Ulrich Pfeifer
 # Created On      : Thu Aug  8 13:05:10 1996
 # Last Modified By: Ulrich Pfeifer
-# Last Modified On: Tue May  9 08:33:28 2000
+# Last Modified On: Sat Nov 11 16:32:38 2000
 # Language        : CPerl
 # 
 # (C) Copyright 1996-2000, Ulrich Pfeifer
@@ -18,19 +18,27 @@
 use Carp;
 use vars qw(%FUNC);
 
-my $O = pack('C', 0xff)."o";                  # occurances (document ferquency)
+# The dictionary has three different key types:
+#  'o'.$word
+#
+#     The document frequency is the number of documents a term occurs
+#     in. The idea is that a term occuring in a significant part of the
+#     documents is not too significant.
+#
+# 'm'.$word
+#
+#     The maximum term frequency of a document is the frequency of the
+#     most frequent term in the document.  It is related to the document
+#     length obviously.  A document in which the most frequnet term occurs
+#     100 times is probably much longer than a document whichs most
+#     frequent term occurs five time.
+#
+# 'p'.$word
+#
+#     Under this key we store the actual posting list as pairs of
+#     packed integers.
 
-# The document frequency is the number of documents a term occurs
-# in. The idea is that a term occuring in a significant part of the
-# documents is not too significant.
-
-my $M = pack('C', 0xff)."m";                  # maxtf (term frequency)
-
-# The maximum term frequency of a document is the frequency of the
-# most frequent term in the document.  It is related to the document
-# length obviously.  A document in which the most frequnet term occurs
-# 100 times is probably much longer than a document whichs most
-# frequent term occurs five time.
+my $no_old_index_support = 0; # do not check for old indices if set
 
 sub new {
   my $type = shift;
@@ -134,6 +142,37 @@
   }
 }
 
+sub is_an_old_index {
+  my $self = shift;
+
+  return 0 if $no_old_index_support;
+  return $self->{old_index} if exists $self->{old_index};
+
+  # We can only guess if this is an old index. We lookup the first 10
+  # $O entries. If all values are integers, we assume that the index
+  # is an old one.
+
+  defined $self->{db} or $self->open;
+  $self->sync;
+  my $dbh = $self->{dbh};       # for convenience
+
+  my $O = pack('C', 0xff)."o";
+  my ($word, $value) = ($O.$;);
+  $dbh->seq($word, $value, R_CURSOR);
+  for (my $i=0; $i<10;$i++) {
+    if ($value !~ /^\d+$/) {
+      return $self->{old_index} = 0;
+    }
+    if ($dbh->seq($word, $value, R_NEXT) or # no values left
+        $word !~ /^$O/o                     # no $O values left
+       ) {
+      # we are not sure enough that this is an old index
+      return $self->{old_index} = 0;
+    }
+  }
+  return $self->{old_index} = 1;
+}
+
 sub open {
   my $self = shift;
   my $file = $self->{file};
@@ -150,6 +189,11 @@
     $self->{cdict} = {}
       if $self->{mode} & O_RDWR;
     $self->{cached} = 0;
+    if (!$no_old_index_support and $self->is_an_old_index()) {
+      warn "This is an old index, upgrade you database";
+      require WAIT::InvertedIndexOld;
+      bless $self, 'WAIT::InvertedIndexOld';
+    }
   }
 }
 
@@ -164,10 +208,10 @@
   $self->{records}++;
   while (($word, $noc) = each %occ) {
     if (defined $self->{cache}->{$word}) {
-      $self->{cdict}->{$O,$word}++;
+      $self->{cdict}->{$word}++;
       $self->{cache}->{$word} .= pack 'w2', $key, $noc;
-    } else {
-      $self->{cdict}->{$O,$word} = 1;
+    } else {           
+      $self->{cdict}->{$word} = 1;
       $self->{cache}->{$word}  = pack 'w2', $key, $noc;
     }
     $self->{cached}++;
@@ -178,7 +222,7 @@
   for (values %occ) {
     $maxtf = $_ if $_ > $maxtf;
   }
-  $self->{db}->{$M, $key} = $maxtf;
+  $self->{db}->{'m'. $key} = $maxtf;
 }
 
 # We sort postings by increasing max term frequency (~ by increasing
@@ -203,9 +247,9 @@
   # inverse document frequence gives the score for a term.  This sort
   # order can be exploited for tuning of single term queries.
 
-  for my $did (sort {    $post->{$b} / $self->{db}->{$M, $b}
+  for my $did (sort {    $post->{$b} / $self->{db}->{'m'. $b}
                                       <=>
-                         $post->{$a} / $self->{db}->{$M, $a}
+                         $post->{$a} / $self->{db}->{'m'. $a}
                     } keys %$post) {
     $r .= pack 'w2', $did, $post->{$did};
   }
@@ -231,13 +275,13 @@
   grep $occ{$_}++, &{$self->{func}}(@_);
 
   for (keys %occ) {# may reorder posting list
-    my %post = unpack 'w*', $db->{$_};
+    my %post = unpack 'w*', $db->{'p'.$_};
     delete $post{$key};
-    $db->{$_}    = $self->sort_postings(\%post);
-    _complain('delete of term', $_) if $db->{$O,$_}-1 != keys %post;
-    $db->{$O,$_} = scalar keys %post;
+    $db->{'p'.$_}    = $self->sort_postings(\%post);
+    _complain('delete of term', $_) if $db->{'o'.$_}-1 != keys %post;
+    $db->{'o'.$_} = scalar keys %post;
   }
-  delete $db->{$M, $key};
+  delete $db->{'m'. $key};
 }
 
 sub intervall {
@@ -260,20 +304,20 @@
     ($first) = &{$self->{'ifunc'}}($first) if $first;
     ($last)  = &{$self->{'ifunc'}}($last) if $last;
   }
-  if (defined $first and $first ne '') {         # set the cursor to $first
-    $dbh->seq($first, $value, R_CURSOR);
-  } else {
-    $dbh->seq($first, $value, R_FIRST);
-  }
-  # We assume that word do not start with the character \377
-  # $last = pack 'C', 0xff unless defined $last and $last ne '';
-  return () if defined $last and $first gt $last; # $first would be after the last word
+  $first = 'p'.($first||'');
+  $last  = (defined $last)?'p'.$last:'q';
+
+  # set the cursor to $first
+  $dbh->seq($first, $value, R_CURSOR);
+
+  # $first would be after the last word
+  return () if $first gt $last;
   
-  push @result, $first;
+  push @result, substr($first,1);
   while (!$dbh->seq($word, $value, R_NEXT)) {
     # We should limit this to a "resonable" number of words
-    last if (defined $last and $word gt $last) or $word =~ /^($M|$O)/o;
-    push @result, $word;
+    last if $word gt $last;
+    push @result, substr($word,1);
   }
   \@result;                     # speed
 }
@@ -298,16 +342,16 @@
     ($prefix) = &{$self->{'pfunc'}}($prefix);
   }
 
-  if ($dbh->seq($word = $prefix, $value, R_CURSOR)) {
+  if ($dbh->seq($word = 'p'.$prefix, $value, R_CURSOR)) {
     return ();
   }
-  return () if $word !~ /^$prefix/;
-  push @result, $word;
+  return () if $word !~ /^p$prefix/;
+  push @result, substr($word,1);
 
   while (!$dbh->seq($word, $value, R_NEXT)) {
     # We should limit this to a "resonable" number of words
-    last if $word !~ /^$prefix/;
-    push @result, $word;
+    last if $word !~ /^p$prefix/;
+    push @result, substr($word,1);
   }
   \@result;                     # speed
 }
@@ -403,9 +447,9 @@
   # We keep duplicates
   my @terms = 
     # Sort words by decreasing document frequency
-    sort { $self->{db}->{$O,$a} <=> $self->{db}->{$O,$b} }
+    sort { $self->{db}->{'o'.$a} <=> $self->{db}->{'o'.$b} }
       # check which words occur in the index. 
-      grep { $self->{db}->{$O,$_} } @_;
+      grep { $self->{db}->{'o'.$_} } @_;
 
   return () unless @terms;                 # nothing to search for
 
@@ -413,17 +457,17 @@
   # choping off the rest of the list will return the same ranking.
   if ($wanted and @terms == 1) {
     my $term  = shift @terms;
-    my $idf   = log($self->{records}/$self->{db}->{$O,$term});
+    my $idf   = log($self->{records}/$self->{db}->{'o'.$term});
     my @res;
 
     if ($self->{reorg}) { # or not $query->{picky}
-      @res = unpack "w". int(2*$wanted), $self->{db}->{$term};
+      @res = unpack "w". int(2*$wanted), $self->{db}->{'p'.$term};
     } else {
-      @res = unpack 'w*',                $self->{db}->{$term};
+      @res = unpack 'w*',                $self->{db}->{'p'.$term};
     }
 
     for (my $i=1; $i<@res; $i+=2) {
-      $res[$i] /= $self->{db}->{$M, $res[$i-1]} / $idf;
+      $res[$i] /= $self->{db}->{'m'. $res[$i-1]} / $idf;
     }
 
     return @res
@@ -434,20 +478,20 @@
   # result.
   unless ($wanted) {
     for (@terms) {
-      my $df      = $self->{db}->{$O,$_};
+      my $df      = $self->{db}->{'o'.$_};
 
       # The frequency *must* be 1 at least since the posting list is nonempty
       _complain('search for term', $_) and $df = 1 if $df < 1;
 
       # Unpack posting list for current query term $_
-      my %post = unpack 'w*', $self->{db}->{$_};
+      my %post = unpack 'w*', $self->{db}->{'p'.$_};
 
-      _complain('search for term', $_) if $self->{db}->{$O,$_} != keys %post;
+      _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
       # This is the inverse document frequency. The log of the inverse
       # fraction of documents the term occurs in.
       my $idf = log($self->{records}/$df);
       for my $did (keys %post) {
-        if (my $freq = $self->{db}->{$M, $did}) {
+        if (my $freq = $self->{db}->{'m'. $did}) {
           $score{$did} += $post{$did} / $freq * $idf;
         }
       }
@@ -460,12 +504,12 @@
   unless ($strict) {
     for (@terms) {
       # Unpack posting list for current query term $_
-      my %post = unpack 'w*', $self->{db}->{$_};
+      my %post = unpack 'w*', $self->{db}->{'p'.$_};
 
       # Lookup the number of documents the term occurs in (document frequency)
-      my $occ  = $self->{db}->{$O,$_};
+      my $occ  = $self->{db}->{'o'.$_};
 
-      _complain('search for term', $_) if $self->{db}->{$O,$_} != keys %post;
+      _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
       # The frequency *must* be 1 at least since the posting list is nonempty
       _complain('search for term', $_) and $occ = 1 if $occ < 1;
 
@@ -488,14 +532,14 @@
       
       if (keys %score < $wanted) {
         for my $did (keys %post) {
-          if (my $freq = $self->{db}->{$M, $did}) {
+          if (my $freq = $self->{db}->{'m'. $did}) {
             $score{$did} += $post{$did} / $freq * $idf;
           }
         }
       } else {
         for my $did (keys %score) {
           next unless exists $post{$did};
-          if (my $freq = $self->{db}->{$M, $did}) {
+          if (my $freq = $self->{db}->{'m'. $did}) {
             $score{$did} += $post{$did} / $freq * $idf;
           }
         }
@@ -515,7 +559,7 @@
   for (my $i = $#terms; $i >=0; $i--) {
     local $_ = $terms[$i];
     # Lookup the number of documents the term occurs in (document frequency)
-    my $df      = $self->{db}->{$O,$_};
+    my $df      = $self->{db}->{'o'.$_};
 
     # The frequency *must* be 1 at least since the posting list is nonempty
     _complain('search for term', $_) and $df = 1 if $df < 1;
@@ -526,11 +570,11 @@
 
     my ($did,$occ);
     if ($self->{reorg}) {
-      ($did,$occ) = unpack 'w2', $self->{db}->{$_};
+      ($did,$occ) = unpack 'w2', $self->{db}->{'p'.$_};
     } else {                    # Maybe this costs more than it helps
-      ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{$_});
+      ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{'p'.$_});
     }
-    my $freq      = $self->{db}->{$M, $did};
+    my $freq      = $self->{db}->{'m'. $did};
     my $max       = $occ/$freq*$idf[$i];
     $max[$i]      = $max + $max[$i+1];
   }
@@ -540,10 +584,10 @@
     my $term = $terms[$i];
     # Unpack posting list for current query term $term. We loose the
     # sorting order because the assignment to a hash.
-    my %post = unpack 'w*', $self->{db}->{$term};
+    my %post = unpack 'w*', $self->{db}->{'p'.$term};
 
     _complain('search for term', $term)
-      if $self->{db}->{$O,$term} != keys %post;
+      if $self->{db}->{'o'.$term} != keys %post;
 
     my $idf  = $idf[$i];
     my $full;                   # Need to process all postings
@@ -572,7 +616,7 @@
       if (defined $chop) {
         # We might be able to avoid allocating accumulators
         for my $did (keys %post) {
-          if (my $freq = $self->{db}->{$M, $did}) {
+          if (my $freq = $self->{db}->{'m'. $did}) {
             my $wgt = $post{$did} / $freq * $idf;
             # We add an accumulator if $wgt exeeds $chop
             if (exists $score{$did} or $wgt > $chop) {
@@ -583,7 +627,7 @@
       } else {
         # Allocate acumulators for each seen document.
         for my $did (keys %post) {
-          if (my $freq = $self->{db}->{$M, $did}) {
+          if (my $freq = $self->{db}->{'m'. $did}) {
             $score{$did} += $post{$did} / $freq * $idf;
           }
         }
@@ -592,7 +636,7 @@
       # Update existing accumulators
       for my $did (keys %score) {
         next unless exists $post{$did};
-        if (my $freq = $self->{db}->{$M, $did}) {
+        if (my $freq = $self->{db}->{'m'. $did}) {
           $score{$did} += $post{$did} / $freq * $idf;
         }
       }
@@ -615,7 +659,7 @@
 
   $self->sync;
   while (my($key, $value) = each %{$self->{db}}) {
-    next if $key =~ /^\377[om]/;
+    next if $key !~ /^p/;
     $self->{db}->{$key} = $self->sort_postings($value);
   }
   $self->{reorg} = 1;
@@ -628,15 +672,15 @@
     print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};
     while (my($key, $value) = each %{$self->{cache}}) {
       if ($self->{reorg}) {
-        $self->{db}->{$key} = $self->sort_postings($self->{db}->{$key}
+        $self->{db}->{'p'.$key} = $self->sort_postings($self->{db}->{'p'.$key}
                                                    . $value);
       } else {
-        $self->{db}->{$key} .= $value;
+        $self->{db}->{'p'.$key} .= $value;
       }
     }
     while (my($key, $value) = each %{$self->{cdict}}) {
-      $self->{db}->{$key} = 0 unless  $self->{db}->{$key};
-      $self->{db}->{$key} += $value;
+      $self->{db}->{'o'.$key} = 0 unless  $self->{db}->{'o'.$key};
+      $self->{db}->{'o'.$key} += $value;
     }
     $self->{cache}  = {};
     $self->{cdict}  = {};