--- cvs-head/lib/WAIT/InvertedIndex.pm 2000/04/28 15:40:52 10 +++ branches/CPAN/lib/WAIT/InvertedIndex.pm 2000/04/28 15:42:44 13 @@ -1,5 +1,5 @@ -# -*- Mode: Perl -*- -# InvertedIndex.pm -- +# -*- Mode: Cperl -*- +# InvertedIndex.pm -- # ITIID : $ITI$ $Header $__Header$ # Author : Ulrich Pfeifer # Created On : Thu Aug 8 13:05:10 1996 @@ -7,9 +7,9 @@ # Last Modified On: Sun Nov 22 18:44:42 1998 # Language : CPerl # Status : Unknown, Use with caution! -# +# # Copyright (c) 1996-1997, Ulrich Pfeifer -# +# package WAIT::InvertedIndex; use strict; @@ -63,9 +63,12 @@ sub _xfiltergen { my $filter = pop @_; - if ($filter eq 'stop') { # avoid the slow stopword elimination - return _xfiltergen(@_); # it's cheaper to look them up afterwards - } +# Oops, we cannot overrule the user's choice. Other filters may kill +# stopwords, such as isotr clobbers "isn't" to "isnt". + +# if ($filter eq 'stop') { # avoid the slow stopword elimination +# return _xfiltergen(@_); # it's cheaper to look them up afterwards +# } if (@_) { if ($filter =~ /^split(\d*)/) { if ($1) { @@ -148,7 +151,7 @@ my $self = shift; my $key = shift; my %occ; - + defined $self->{db} or $self->open; grep $occ{$_}++, &{$self->{func}}(@_); my ($word, $noc); @@ -160,7 +163,7 @@ } else { $self->{cdict}->{$O,$word} = 1; $self->{cache}->{$word} = pack 'w2', $key, $noc; - } + } $self->{cached}++; } $self->sync if $self->{cached} > 100_000; @@ -277,6 +280,13 @@ &{$self->{func}}(@_); } +sub keys { + my $self = shift; + + defined $self->{db} or $self->open; + keys %{$self->{db}}; +} + sub search_prefix { my $self = shift; @@ -298,7 +308,7 @@ for (keys %occ) { if (defined $self->{db}->{$_}) { my %post = unpack 'w*', $self->{db}->{$_}; - my $idf = log($self->{records}/$self->{db}->{$O,$_}); + my $idf = log($self->{records}/($self->{db}->{$O,$_} || 1)); my $did; for $did (keys %post) { $score{$did} = 0 unless defined $score{$did}; # perl -w @@ -314,7 +324,7 @@ my $self = shift; if ($self->{mode} & O_RDWR) { - print STDERR "\aFlushing $self->{cached} postings\n"; + print STDERR "Flushing $self->{cached} postings\n"; while (my($key, $value) = each %{$self->{cache}}) { $self->{db}->{$key} .= $value; #delete $self->{cache}->{$key};