--- cvs-head/lib/WAIT/InvertedIndex.pm 2004/05/24 13:00:15 21 +++ cvs-head/lib/WAIT/InvertedIndex.pm 2000/11/11 16:58:53 22 @@ -4,7 +4,7 @@ # Author : Ulrich Pfeifer # Created On : Thu Aug 8 13:05:10 1996 # Last Modified By: Ulrich Pfeifer -# Last Modified On: Tue May 9 08:33:28 2000 +# Last Modified On: Sat Nov 11 16:32:38 2000 # Language : CPerl # # (C) Copyright 1996-2000, Ulrich Pfeifer @@ -18,19 +18,27 @@ use Carp; use vars qw(%FUNC); -my $O = pack('C', 0xff)."o"; # occurances (document ferquency) +# The dictionary has three different key types: +# 'o'.$word +# +# The document frequency is the number of documents a term occurs +# in. The idea is that a term occuring in a significant part of the +# documents is not too significant. +# +# 'm'.$word +# +# The maximum term frequency of a document is the frequency of the +# most frequent term in the document. It is related to the document +# length obviously. A document in which the most frequnet term occurs +# 100 times is probably much longer than a document whichs most +# frequent term occurs five time. +# +# 'p'.$word +# +# Under this key we store the actual posting list as pairs of +# packed integers. -# The document frequency is the number of documents a term occurs -# in. The idea is that a term occuring in a significant part of the -# documents is not too significant. - -my $M = pack('C', 0xff)."m"; # maxtf (term frequency) - -# The maximum term frequency of a document is the frequency of the -# most frequent term in the document. It is related to the document -# length obviously. A document in which the most frequnet term occurs -# 100 times is probably much longer than a document whichs most -# frequent term occurs five time. +my $no_old_index_support = 0; # do not check for old indices if set sub new { my $type = shift; @@ -134,6 +142,37 @@ } } +sub is_an_old_index { + my $self = shift; + + return 0 if $no_old_index_support; + return $self->{old_index} if exists $self->{old_index}; + + # We can only guess if this is an old index. We lookup the first 10 + # $O entries. If all values are integers, we assume that the index + # is an old one. + + defined $self->{db} or $self->open; + $self->sync; + my $dbh = $self->{dbh}; # for convenience + + my $O = pack('C', 0xff)."o"; + my ($word, $value) = ($O.$;); + $dbh->seq($word, $value, R_CURSOR); + for (my $i=0; $i<10;$i++) { + if ($value !~ /^\d+$/) { + return $self->{old_index} = 0; + } + if ($dbh->seq($word, $value, R_NEXT) or # no values left + $word !~ /^$O/o # no $O values left + ) { + # we are not sure enough that this is an old index + return $self->{old_index} = 0; + } + } + return $self->{old_index} = 1; +} + sub open { my $self = shift; my $file = $self->{file}; @@ -150,6 +189,11 @@ $self->{cdict} = {} if $self->{mode} & O_RDWR; $self->{cached} = 0; + if (!$no_old_index_support and $self->is_an_old_index()) { + warn "This is an old index, upgrade you database"; + require WAIT::InvertedIndexOld; + bless $self, 'WAIT::InvertedIndexOld'; + } } } @@ -164,10 +208,10 @@ $self->{records}++; while (($word, $noc) = each %occ) { if (defined $self->{cache}->{$word}) { - $self->{cdict}->{$O,$word}++; + $self->{cdict}->{$word}++; $self->{cache}->{$word} .= pack 'w2', $key, $noc; - } else { - $self->{cdict}->{$O,$word} = 1; + } else { + $self->{cdict}->{$word} = 1; $self->{cache}->{$word} = pack 'w2', $key, $noc; } $self->{cached}++; @@ -178,7 +222,7 @@ for (values %occ) { $maxtf = $_ if $_ > $maxtf; } - $self->{db}->{$M, $key} = $maxtf; + $self->{db}->{'m'. $key} = $maxtf; } # We sort postings by increasing max term frequency (~ by increasing @@ -203,9 +247,9 @@ # inverse document frequence gives the score for a term. This sort # order can be exploited for tuning of single term queries. - for my $did (sort { $post->{$b} / $self->{db}->{$M, $b} + for my $did (sort { $post->{$b} / $self->{db}->{'m'. $b} <=> - $post->{$a} / $self->{db}->{$M, $a} + $post->{$a} / $self->{db}->{'m'. $a} } keys %$post) { $r .= pack 'w2', $did, $post->{$did}; } @@ -231,13 +275,13 @@ grep $occ{$_}++, &{$self->{func}}(@_); for (keys %occ) {# may reorder posting list - my %post = unpack 'w*', $db->{$_}; + my %post = unpack 'w*', $db->{'p'.$_}; delete $post{$key}; - $db->{$_} = $self->sort_postings(\%post); - _complain('delete of term', $_) if $db->{$O,$_}-1 != keys %post; - $db->{$O,$_} = scalar keys %post; + $db->{'p'.$_} = $self->sort_postings(\%post); + _complain('delete of term', $_) if $db->{'o'.$_}-1 != keys %post; + $db->{'o'.$_} = scalar keys %post; } - delete $db->{$M, $key}; + delete $db->{'m'. $key}; } sub intervall { @@ -260,20 +304,20 @@ ($first) = &{$self->{'ifunc'}}($first) if $first; ($last) = &{$self->{'ifunc'}}($last) if $last; } - if (defined $first and $first ne '') { # set the cursor to $first - $dbh->seq($first, $value, R_CURSOR); - } else { - $dbh->seq($first, $value, R_FIRST); - } - # We assume that word do not start with the character \377 - # $last = pack 'C', 0xff unless defined $last and $last ne ''; - return () if defined $last and $first gt $last; # $first would be after the last word + $first = 'p'.($first||''); + $last = (defined $last)?'p'.$last:'q'; + + # set the cursor to $first + $dbh->seq($first, $value, R_CURSOR); + + # $first would be after the last word + return () if $first gt $last; - push @result, $first; + push @result, substr($first,1); while (!$dbh->seq($word, $value, R_NEXT)) { # We should limit this to a "resonable" number of words - last if (defined $last and $word gt $last) or $word =~ /^($M|$O)/o; - push @result, $word; + last if $word gt $last; + push @result, substr($word,1); } \@result; # speed } @@ -298,16 +342,16 @@ ($prefix) = &{$self->{'pfunc'}}($prefix); } - if ($dbh->seq($word = $prefix, $value, R_CURSOR)) { + if ($dbh->seq($word = 'p'.$prefix, $value, R_CURSOR)) { return (); } - return () if $word !~ /^$prefix/; - push @result, $word; + return () if $word !~ /^p$prefix/; + push @result, substr($word,1); while (!$dbh->seq($word, $value, R_NEXT)) { # We should limit this to a "resonable" number of words - last if $word !~ /^$prefix/; - push @result, $word; + last if $word !~ /^p$prefix/; + push @result, substr($word,1); } \@result; # speed } @@ -403,9 +447,9 @@ # We keep duplicates my @terms = # Sort words by decreasing document frequency - sort { $self->{db}->{$O,$a} <=> $self->{db}->{$O,$b} } + sort { $self->{db}->{'o'.$a} <=> $self->{db}->{'o'.$b} } # check which words occur in the index. - grep { $self->{db}->{$O,$_} } @_; + grep { $self->{db}->{'o'.$_} } @_; return () unless @terms; # nothing to search for @@ -413,17 +457,17 @@ # choping off the rest of the list will return the same ranking. if ($wanted and @terms == 1) { my $term = shift @terms; - my $idf = log($self->{records}/$self->{db}->{$O,$term}); + my $idf = log($self->{records}/$self->{db}->{'o'.$term}); my @res; if ($self->{reorg}) { # or not $query->{picky} - @res = unpack "w". int(2*$wanted), $self->{db}->{$term}; + @res = unpack "w". int(2*$wanted), $self->{db}->{'p'.$term}; } else { - @res = unpack 'w*', $self->{db}->{$term}; + @res = unpack 'w*', $self->{db}->{'p'.$term}; } for (my $i=1; $i<@res; $i+=2) { - $res[$i] /= $self->{db}->{$M, $res[$i-1]} / $idf; + $res[$i] /= $self->{db}->{'m'. $res[$i-1]} / $idf; } return @res @@ -434,20 +478,20 @@ # result. unless ($wanted) { for (@terms) { - my $df = $self->{db}->{$O,$_}; + my $df = $self->{db}->{'o'.$_}; # The frequency *must* be 1 at least since the posting list is nonempty _complain('search for term', $_) and $df = 1 if $df < 1; # Unpack posting list for current query term $_ - my %post = unpack 'w*', $self->{db}->{$_}; + my %post = unpack 'w*', $self->{db}->{'p'.$_}; - _complain('search for term', $_) if $self->{db}->{$O,$_} != keys %post; + _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post; # This is the inverse document frequency. The log of the inverse # fraction of documents the term occurs in. my $idf = log($self->{records}/$df); for my $did (keys %post) { - if (my $freq = $self->{db}->{$M, $did}) { + if (my $freq = $self->{db}->{'m'. $did}) { $score{$did} += $post{$did} / $freq * $idf; } } @@ -460,12 +504,12 @@ unless ($strict) { for (@terms) { # Unpack posting list for current query term $_ - my %post = unpack 'w*', $self->{db}->{$_}; + my %post = unpack 'w*', $self->{db}->{'p'.$_}; # Lookup the number of documents the term occurs in (document frequency) - my $occ = $self->{db}->{$O,$_}; + my $occ = $self->{db}->{'o'.$_}; - _complain('search for term', $_) if $self->{db}->{$O,$_} != keys %post; + _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post; # The frequency *must* be 1 at least since the posting list is nonempty _complain('search for term', $_) and $occ = 1 if $occ < 1; @@ -488,14 +532,14 @@ if (keys %score < $wanted) { for my $did (keys %post) { - if (my $freq = $self->{db}->{$M, $did}) { + if (my $freq = $self->{db}->{'m'. $did}) { $score{$did} += $post{$did} / $freq * $idf; } } } else { for my $did (keys %score) { next unless exists $post{$did}; - if (my $freq = $self->{db}->{$M, $did}) { + if (my $freq = $self->{db}->{'m'. $did}) { $score{$did} += $post{$did} / $freq * $idf; } } @@ -515,7 +559,7 @@ for (my $i = $#terms; $i >=0; $i--) { local $_ = $terms[$i]; # Lookup the number of documents the term occurs in (document frequency) - my $df = $self->{db}->{$O,$_}; + my $df = $self->{db}->{'o'.$_}; # The frequency *must* be 1 at least since the posting list is nonempty _complain('search for term', $_) and $df = 1 if $df < 1; @@ -526,11 +570,11 @@ my ($did,$occ); if ($self->{reorg}) { - ($did,$occ) = unpack 'w2', $self->{db}->{$_}; + ($did,$occ) = unpack 'w2', $self->{db}->{'p'.$_}; } else { # Maybe this costs more than it helps - ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{$_}); + ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{'p'.$_}); } - my $freq = $self->{db}->{$M, $did}; + my $freq = $self->{db}->{'m'. $did}; my $max = $occ/$freq*$idf[$i]; $max[$i] = $max + $max[$i+1]; } @@ -540,10 +584,10 @@ my $term = $terms[$i]; # Unpack posting list for current query term $term. We loose the # sorting order because the assignment to a hash. - my %post = unpack 'w*', $self->{db}->{$term}; + my %post = unpack 'w*', $self->{db}->{'p'.$term}; _complain('search for term', $term) - if $self->{db}->{$O,$term} != keys %post; + if $self->{db}->{'o'.$term} != keys %post; my $idf = $idf[$i]; my $full; # Need to process all postings @@ -572,7 +616,7 @@ if (defined $chop) { # We might be able to avoid allocating accumulators for my $did (keys %post) { - if (my $freq = $self->{db}->{$M, $did}) { + if (my $freq = $self->{db}->{'m'. $did}) { my $wgt = $post{$did} / $freq * $idf; # We add an accumulator if $wgt exeeds $chop if (exists $score{$did} or $wgt > $chop) { @@ -583,7 +627,7 @@ } else { # Allocate acumulators for each seen document. for my $did (keys %post) { - if (my $freq = $self->{db}->{$M, $did}) { + if (my $freq = $self->{db}->{'m'. $did}) { $score{$did} += $post{$did} / $freq * $idf; } } @@ -592,7 +636,7 @@ # Update existing accumulators for my $did (keys %score) { next unless exists $post{$did}; - if (my $freq = $self->{db}->{$M, $did}) { + if (my $freq = $self->{db}->{'m'. $did}) { $score{$did} += $post{$did} / $freq * $idf; } } @@ -615,7 +659,7 @@ $self->sync; while (my($key, $value) = each %{$self->{db}}) { - next if $key =~ /^\377[om]/; + next if $key !~ /^p/; $self->{db}->{$key} = $self->sort_postings($value); } $self->{reorg} = 1; @@ -628,15 +672,15 @@ print STDERR "Flushing $self->{cached} postings\n" if $self->{cached}; while (my($key, $value) = each %{$self->{cache}}) { if ($self->{reorg}) { - $self->{db}->{$key} = $self->sort_postings($self->{db}->{$key} + $self->{db}->{'p'.$key} = $self->sort_postings($self->{db}->{'p'.$key} . $value); } else { - $self->{db}->{$key} .= $value; + $self->{db}->{'p'.$key} .= $value; } } while (my($key, $value) = each %{$self->{cdict}}) { - $self->{db}->{$key} = 0 unless $self->{db}->{$key}; - $self->{db}->{$key} += $value; + $self->{db}->{'o'.$key} = 0 unless $self->{db}->{'o'.$key}; + $self->{db}->{'o'.$key} += $value; } $self->{cache} = {}; $self->{cdict} = {};