/[wait]/trunk/lib/WAIT/InvertedIndex.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/WAIT/InvertedIndex.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

branches/CPAN/lib/WAIT/InvertedIndex.pm revision 13 by ulpfr, Fri Apr 28 15:42:44 2000 UTC cvs-head/lib/WAIT/InvertedIndex.pm revision 85 by ulpfr, Fri May 3 16:16:10 2002 UTC
# Line 1  Line 1 
1  #                              -*- Mode: Cperl -*-  #                              -*- Mode: Perl -*-
2  # InvertedIndex.pm --  # $Basename: InvertedIndex.pm $
3  # ITIID           : $ITI$ $Header $__Header$  # $Revision: 1.30 $
4  # Author          : Ulrich Pfeifer  # Author          : Ulrich Pfeifer
5  # Created On      : Thu Aug  8 13:05:10 1996  # Created On      : Thu Aug  8 13:05:10 1996
6  # Last Modified By: Ulrich Pfeifer  # Last Modified By: Ulrich Pfeifer
7  # Last Modified On: Sun Nov 22 18:44:42 1998  # Last Modified On: Sat Apr 27 16:13:55 2002
8  # Language        : CPerl  # Language        : CPerl
9  # Status          : Unknown, Use with caution!  #
10  #  # (C) Copyright 1996-2002, Ulrich Pfeifer
11  # Copyright (c) 1996-1997, Ulrich Pfeifer  #
 #  
12    
13  package WAIT::InvertedIndex;  package WAIT::InvertedIndex;
14  use strict;  use strict;
15  use DB_File;  use BerkeleyDB;
16  use Fcntl;  use Fcntl;
17  use WAIT::Filter;  use WAIT::Filter;
18  use Carp;  use Carp;
19  use vars qw(%FUNC);  use vars qw(%FUNC $VERSION);
20    
21    $VERSION = "1.900"; # others test if we are loaded by checking $VERSION
22    
23    # The dictionary has three different key types:
24    #  'o'.$word
25    #
26    #     The document frequency is the number of documents a term occurs
27    #     in. The idea is that a term occuring in a significant portion of the
28    #     documents is not too significant.
29    #
30    # 'm'.$word
31    #
32    #     The maximum term frequency of a document is the frequency of the
33    #     most frequent term in the document.  It is related to the document
34    #     length obviously.  A document in which the most frequent term occurs
35    #     100 times is probably much longer than a document whichs most
36    #     frequent term occurs five time.
37    #
38    # 'p'.$word
39    #
40    #     Under this key we store the actual posting list as pairs of
41    #     packed integers.
42    
43  my $O = pack('C', 0xff)."o";                  # occurances  my $no_old_index_support = 0; # do not check for old indices if set
 my $M = pack('C', 0xff)."m";                  # maxtf  
44    
45  sub new {  sub new {
46    my $type = shift;    my $type = shift;
# Line 124  sub drop { Line 144  sub drop {
144    }    }
145  }  }
146    
147    sub is_an_old_index {
148      my $self = shift;
149    
150      return 0 if $no_old_index_support;
151      return $self->{old_index} if exists $self->{old_index};
152    
153      # We can only guess if this is an old index. We lookup the first 10
154      # $O entries. If all values are integers, we assume that the index
155      # is an old one.
156    
157      defined $self->{db} or $self->open;
158      $self->sync;
159      my $dbh = $self->{dbh} or return $self->{old_index} = 0;       # for convenience
160    
161      my $O = pack('C', 0xff)."o";
162      my ($word, $value) = ($O.$;);  # $word and $value are modified by seq!
163      if ( my $ret = $dbh->seq($word, $value, DB_CURSOR) ) {
164        # warn "DEBUG: ret[$ret], not an old index, either empty or no \$^O";
165        return $self->{old_index} = 0;
166      }
167      for (my $i=0; $i<10;$i++) {
168        if ($value !~ /^\d+$/) {
169          # warn "DEBUG: word[$word]value[$value], not an old index";
170          return $self->{old_index} = 0;
171        }
172        if (my $ret = $dbh->seq($word, $value, DB_NEXT) or # no values left
173            $word !~ /^$O$;/o                   # no $O values left
174           ) {
175          # we are not sure enough that this is an old index
176          # warn "DEBUG: ret[$ret]word[$word]value[$value], not an old index";
177          return $self->{old_index} = 0;
178        }
179      }
180      # warn "DEBUG: old index";
181      return $self->{old_index} = 1;
182    }
183    
184  sub open {  sub open {
185    my $self = shift;    my $self = shift;
186    my $file = $self->{file};    my $file = $self->{file};
# Line 133  sub open { Line 190  sub open {
190    } else {    } else {
191      $self->{func}     =      $self->{func}     =
192        eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));        eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
193      $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,      $self->{dbh} = tie(%{$self->{db}}, 'BerkeleyDB::Btree',
194                         $self->{mode}, 0664, $DB_BTREE);                         -Filename => $self->{file},
195  #    tie(%{$self->{cache}}, 'DB_File', undef,                         -Subname  => $self->{name},
196  #        $self->{mode}, 0664, $DB_BTREE)                         -Mode     => $self->{mode};
197      $self->{cache} = {}      $self->{cache} = {}
198        if $self->{mode} & O_RDWR;        if $self->{mode} & O_RDWR;
 #    tie(%{$self->{cdict}}, 'DB_File', undef,  
 #        $self->{mode}, 0664, $DB_BTREE)  
199      $self->{cdict} = {}      $self->{cdict} = {}
200        if $self->{mode} & O_RDWR;        if $self->{mode} & O_RDWR;
201      $self->{cached} = 0;      $self->{cached} = 0;
202        if (!$no_old_index_support and $self->is_an_old_index()) {
203          warn "This is an old index, upgrade you database";
204          require WAIT::InvertedIndexOld;
205          bless $self, 'WAIT::InvertedIndexOld';
206        }
207    }    }
208  }  }
209    
# Line 158  sub insert { Line 218  sub insert {
218    $self->{records}++;    $self->{records}++;
219    while (($word, $noc) = each %occ) {    while (($word, $noc) = each %occ) {
220      if (defined $self->{cache}->{$word}) {      if (defined $self->{cache}->{$word}) {
221        $self->{cdict}->{$O,$word}++;        $self->{cdict}->{$word}++;
222        $self->{cache}->{$word} .= pack 'w2', $key, $noc;        $self->{cache}->{$word} .= pack 'w2', $key, $noc;
223      } else {      } else {          
224        $self->{cdict}->{$O,$word} = 1;        $self->{cdict}->{$word} = 1;
225        $self->{cache}->{$word}  = pack 'w2', $key, $noc;        $self->{cache}->{$word}  = pack 'w2', $key, $noc;
226      }      }
227      $self->{cached}++;      $self->{cached}++;
228    }    }
229      # This cache limit should be configurable
230    $self->sync if $self->{cached} > 100_000;    $self->sync if $self->{cached} > 100_000;
231    my $maxtf = 0;    my $maxtf = 0;
232    for (values %occ) {    for (values %occ) {
233      $maxtf = $_ if $_ > $maxtf;      $maxtf = $_ if $_ > $maxtf;
234    }    }
235    $self->{db}->{$M, $key} = $maxtf;    $self->{db}->{'m'. $key} = $maxtf;
236    }
237    
238    # We sort postings by increasing max term frequency (~ by increasing
239    # document length.  This reduces the quality degradation if we process
240    # only the first part of a posting list.
241    
242    sub sort_postings {
243      my $self = shift;
244      my $post = shift;             # reference to a hash or packed string
245    
246      if (ref $post) {
247        # we skip the sort part, if the index is not sorted
248        return pack('w*', %$post) unless $self->{reorg};
249      } else {
250        $post = { unpack 'w*', $post };
251      }
252    
253      my $r = '';
254    
255      # Sort posting list by increasing ratio of maximum term frequency (~
256      # "document length") and term frequency. This ratio multipied by the
257      # inverse document frequence gives the score for a term.  This sort
258      # order can be exploited for tuning of single term queries.
259    
260      for my $did (keys %$post) { # sanity check
261        unless ($self->{db}->{"m". $did}) {
262          warn "Warning from WAIT: DIVZERO threat from did[$did] post[$post->{$did}]";
263          $self->{db}->{"m". $did} = 1; # fails if we have not opened for writing
264        }
265      }
266      for my $did (sort {    $post->{$b} / $self->{db}->{'m'. $b}
267                                          <=>
268                             $post->{$a} / $self->{db}->{'m'. $a}
269                        } keys %$post) {
270        $r .= pack 'w2', $did, $post->{$did};
271      }
272      #warn sprintf "reorg %d %s\n", scalar keys %$post, join ' ', unpack 'w*', $r;
273      $r;
274  }  }
275    
276  sub delete {  sub delete {
# Line 179  sub delete { Line 278  sub delete {
278    my $key   = shift;    my $key   = shift;
279    my %occ;    my %occ;
280    
281      my $db;
282    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
283      $db = $self->{db};
284    $self->sync;    $self->sync;
285    $self->{records}--;    $self->{records}--;
286    
287      # less than zero documents in database?
288      _complain('delete of document', $key) and $self->{records} = 0
289        if $self->{records} < 0;
290    
291    grep $occ{$_}++, &{$self->{func}}(@_);    grep $occ{$_}++, &{$self->{func}}(@_);
292    for (keys %occ) {  
293      # may reorder posting list    # Be prepared for "Odd number of elements in hash assignment"
294      my %post = unpack 'w*', $self->{db}->{$_};    local $SIG{__WARN__} = sub {
295      $self->{db}->{$O,$_}--;      my $warning = shift;
296        chomp $warning;
297        warn "Catching warning[$warning] during delete of key[$key]";
298      };
299      for (keys %occ) {# may reorder posting list
300        my %post = unpack 'w*', $db->{'p'.$_};
301      delete $post{$key};      delete $post{$key};
302      $self->{db}->{$_} = pack 'w*', %post;      $db->{'p'.$_}    = $self->sort_postings(\%post);
303        _complain('delete of term', $_) if $db->{'o'.$_}-1 != keys %post;
304        $db->{'o'.$_} = scalar keys %post;
305    }    }
306    delete $self->{db}->{$M, $key};    delete $db->{'m'. $key};
307  }  }
308    
309  sub intervall {  sub intervall {
# Line 213  sub intervall { Line 326  sub intervall {
326      ($first) = &{$self->{'ifunc'}}($first) if $first;      ($first) = &{$self->{'ifunc'}}($first) if $first;
327      ($last)  = &{$self->{'ifunc'}}($last) if $last;      ($last)  = &{$self->{'ifunc'}}($last) if $last;
328    }    }
329    if (defined $first and $first ne '') {         # set the cursor to $first    $first = 'p'.($first||'');
330      $dbh->seq($first, $value, R_CURSOR);    $last  = (defined $last)?'p'.$last:'q';
331    } else {  
332      $dbh->seq($first, $value, R_FIRST);    # set the cursor to $first
333    }    $dbh->seq($first, $value, DB_CURSOR);
334    # We assume that word do not start with the character \377  
335    # $last = pack 'C', 0xff unless defined $last and $last ne '';    # $first would be after the last word
336    return () if defined $last and $first gt $last; # $first would be after the last word    return () if $first gt $last;
337        
338    push @result, $first;    push @result, substr($first,1);
339    while (!$dbh->seq($word, $value, R_NEXT)) {    while (!$dbh->seq($word, $value, DB_NEXT)) {
340      # We should limit this to a "resonable" number of words      # We should limit this to a "resonable" number of words
341      last if (defined $last and $word gt $last) or $word =~ /^($M|$O)/o;      last if $word gt $last;
342      push @result, $word;      push @result, substr($word,1);
343    }    }
344    \@result;                     # speed    \@result;                     # speed
345  }  }
# Line 251  sub prefix { Line 364  sub prefix {
364      ($prefix) = &{$self->{'pfunc'}}($prefix);      ($prefix) = &{$self->{'pfunc'}}($prefix);
365    }    }
366    
367    if ($dbh->seq($word = $prefix, $value, R_CURSOR)) {    if ($dbh->seq($word = 'p'.$prefix, $value, DB_CURRENT)) {
368      return ();      return ();
369    }    }
370    return () if $word !~ /^$prefix/;    return () if $word !~ /^p$prefix/;
371    push @result, $word;    push @result, substr($word,1);
372    
373    while (!$dbh->seq($word, $value, R_NEXT)) {    while (!$dbh->seq($word, $value, DB_NEXT)) {
374      # We should limit this to a "resonable" number of words      # We should limit this to a "resonable" number of words
375      last if $word !~ /^$prefix/;      last if $word !~ /^p$prefix/;
376      push @result, $word;      push @result, substr($word,1);
377    }    }
378    \@result;                     # speed    \@result;                     # speed
379  }  }
380    
381    =head2 search($query)
382    
383    The search method supports a range of search algorithms.  It is
384    recommended to tune the index by calling
385    C<$table-E<gt>set(top=E<gt>1)> B<after> bulk inserting the documents
386    into the table.  This is a computing intense operation and all inserts
387    and deletes after this optimization are slightly more expensive.  Once
388    reorganized, the index is kept sorted automatically until you switch
389    the optimization off by calling C<$table-E<gt>set(top=E<gt>0)>.
390    
391    When searching a tuned index, a query can be processed faster if the
392    caller requests only the topmost documents.  This can be done by
393    passing a C<top =E<gt>> I<n> parameter to the search method.
394    
395    For single term queries, the method returns only the I<n> top ranking
396    documents.  For multi term queries two optimized algorithms are
397    available. The first algorithm computes the top n documents
398    approximately but very fast, sacrificing a little bit of precision for
399    speed.  The second algorithm computes the topmost I<n> documents
400    precisely.  This algorithm is slower and should be used only for small
401    values of I<n>.  It can be requested by passing the query attribute
402    C<picky =E<gt> 1>. Both algorithms may return more than I<n> hits.
403    While the picky version might not be faster than the brute force
404    version on average for modest size databases it uses less memory and
405    the processing time is almost linear in the number of query terms, not
406    in the size of the lists.
407    
408    =cut
409    
410  sub search {  sub search {
411    my $self  = shift;    my $self  = shift;
412      my $query = shift;
413    
414    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
415    $self->sync;    $self->sync;
416    $self->search_raw(&{$self->{func}}(@_)); # No call to parse() here    $self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() there
417  }  }
418    
419  sub parse {  sub parse {
# Line 280  sub parse { Line 423  sub parse {
423    &{$self->{func}}(@_);    &{$self->{func}}(@_);
424  }  }
425    
 sub keys {  
   my $self  = shift;  
   
   defined $self->{db} or $self->open;  
   keys %{$self->{db}};  
 }  
   
426  sub search_prefix {  sub search_prefix {
427    my $self  = shift;    my $self  = shift;
428    
# Line 295  sub search_prefix { Line 431  sub search_prefix {
431    $self->search_raw(map($self->prefix($_), @_));    $self->search_raw(map($self->prefix($_), @_));
432  }  }
433    
434    sub _complain ($$) {
435      my ($action, $term) = @_;
436    
437      require Carp;
438      Carp::cluck
439        (sprintf("WAIT database inconsistency during $action [%s]: ".
440                 "Please rebuild index\n",
441                 $term,));
442    }
443    
444  sub search_raw {  sub search_raw {
445    my $self  = shift;    my $self  = shift;
446    my %occ;    my $query = shift;
447    my %score;    my %score;
448    
449    return () unless @_;    # Top $wanted documents must be correct. Zero means all matching
450      # documents.
451      my $wanted = $query->{top};
452      my $strict = $query->{picky};
453    
454      # Return at least $minacc documents. Zero means all matching
455      # documents.
456      # my $minacc = $query->{accus} || $wanted;
457    
458      # Open index and flush cache if necessary
459    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
460    $self->sync;    $self->sync;
461    grep $occ{$_}++, @_;  
462    for (keys %occ) {    # We keep duplicates
463      if (defined $self->{db}->{$_}) {    my @terms =
464        my %post = unpack 'w*', $self->{db}->{$_};      # Sort words by decreasing document frequency
465        my $idf = log($self->{records}/($self->{db}->{$O,$_} || 1));      sort { $self->{db}->{'o'.$a} <=> $self->{db}->{'o'.$b} }
466        my $did;        # check which words occur in the index.
467        for $did (keys %post) {        grep { $self->{db}->{'o'.$_} } @_;
468          $score{$did} = 0 unless defined $score{$did}; # perl -w  
469          $score{$did} += $post{$did} / $self->{db}->{$M, $did} * $idf    return unless @terms;
470            if $self->{db}->{$M, $did}; # db may be broken  
471      # We special-case one term queries here.  If the index was sorted,
472      # choping off the rest of the list will return the same ranking.
473      if ($wanted and @terms == 1) {
474        my $term  = shift @terms;
475        my $idf   = log($self->{records}/$self->{db}->{'o'.$term});
476        my @res;
477    
478        if ($self->{reorg}) { # or not $query->{picky}
479          @res = unpack "w". int(2*$wanted), $self->{db}->{'p'.$term};
480        } else {
481          @res = unpack 'w*',                $self->{db}->{'p'.$term};
482        }
483    
484        for (my $i=1; $i<@res; $i+=2) {
485          # $res[$i] /= $self->{db}->{'m'. $res[$i-1]} / $idf;
486          # above was written badly, allows two DIV_ZERO problems.
487          my $maxtf = $self->{db}->{"m". $res[$i-1]};
488          unless ($maxtf) {
489            warn "WAIT-Warning: Averting DIVZERO for i[$i] \$res[\$i-1][$res[$i-1]] term[$term]";
490            $maxtf = 1;
491          }
492          $res[$i] = ($res[$i] / $maxtf) * $idf;
493        }
494    
495        return @res
496      }
497    
498      # We separate exhaustive search here to avoid overhead and make the
499      # code more readable. The block can be removed without changing the
500      # result.
501      unless ($wanted) {
502        for (@terms) {
503          my $df      = $self->{db}->{'o'.$_};
504    
505          # The frequency *must* be 1 at least since the posting list is nonempty
506          _complain('search for term', $_) and $df = 1 if $df < 1;
507    
508          # Unpack posting list for current query term $_
509          my %post = unpack 'w*', $self->{db}->{'p'.$_};
510    
511          _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
512          # This is the inverse document frequency. The log of the inverse
513          # fraction of documents the term occurs in.
514          my $idf = log($self->{records}/$df);
515          for my $did (keys %post) {
516            if (my $freq = $self->{db}->{'m'. $did}) {
517              $score{$did} += $post{$did} / $freq * $idf;
518            }
519          }
520        }
521        # warn sprintf "Used %d accumulators\n", scalar keys %score;
522        return %score;
523      }
524    
525      # A sloppy but fast algorithm for multiple term queries.
526      unless ($strict) {
527        for (@terms) {
528          # Unpack posting list for current query term $_
529          my %post = unpack 'w*', $self->{db}->{'p'.$_};
530    
531          # Lookup the number of documents the term occurs in (document frequency)
532          my $occ  = $self->{db}->{'o'.$_};
533    
534          _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
535          # The frequency *must* be 1 at least since the posting list is nonempty
536          _complain('search for term', $_) and $occ = 1 if $occ < 1;
537    
538          # This is the inverse document frequency. The log of the inverse
539          # fraction of documents the term occurs in.
540          my $idf = log($self->{records}/$occ);
541    
542          # If we have a reasonable number of accumulators, change the
543          # loop to iterate over the accumulators.  This will compromise
544          # quality for better speed.  The algorithm still computes the
545          # exact weights, but the result is not guaranteed to contain the
546          # *best* results.  The database might contain documents better
547          # than the worst returned document.
548          
549          # We process the lists in order of increasing length.  When the
550          # number of accumulators exceeds $wanted, no new documents are
551          # added, only the ranking/weighting of the seen documents is
552          # improved.  The resulting ranking list must be pruned, since only
553          # the top most documents end up near their "optimal" rank.
554          
555          if (keys %score < $wanted) {
556            for my $did (keys %post) {
557              if (my $freq = $self->{db}->{'m'. $did}) {
558                $score{$did} += $post{$did} / $freq * $idf;
559              }
560            }
561          } else {
562            for my $did (keys %score) {
563              next unless exists $post{$did};
564              if (my $freq = $self->{db}->{'m'. $did}) {
565                $score{$did} += $post{$did} / $freq * $idf;
566              }
567            }
568        }        }
569      }      }
570        return %score;
571    }    }
572      my @max; $max[$#terms+1]=0;
573      my @idf;
574    
575      # Preparation loop.  This extra loop makes sense only when "reorg"
576      # and "wanted" are true.  But at the time beeing, keeping the code
577      # for the different search algorithms in one place seems more
578      # desirable than some minor speedup of the brute force version.  We
579      # do cache $idf though.
580    
581      for (my $i = $#terms; $i >=0; $i--) {
582        local $_ = $terms[$i];
583        # Lookup the number of documents the term occurs in (document frequency)
584        my $df      = $self->{db}->{'o'.$_};
585    
586        # The frequency *must* be 1 at least since the posting list is nonempty
587        _complain('search for term', $_) and $df = 1 if $df < 1;
588    
589        # This is the inverse document frequency. The log of the inverse
590        # fraction of documents the term occurs in.
591        $idf[$i] = log($self->{records}/$df);
592    
593        my ($did,$occ);
594        if ($self->{reorg}) {
595          ($did,$occ) = unpack 'w2', $self->{db}->{'p'.$_};
596        } else {                    # Maybe this costs more than it helps
597          ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{'p'.$_});
598        }
599        my $freq      = $self->{db}->{'m'. $did};
600        my $max       = $occ/$freq*$idf[$i];
601        $max[$i]      = $max + $max[$i+1];
602      }
603    
604      # Main loop
605      for my $i (0 .. $#terms) {
606        my $term = $terms[$i];
607        # Unpack posting list for current query term $term. We loose the
608        # sorting order because the assignment to a hash.
609        my %post = unpack 'w*', $self->{db}->{'p'.$term};
610    
611        _complain('search for term', $term)
612          if $self->{db}->{'o'.$term} != keys %post;
613    
614        my $idf  = $idf[$i];
615        my $full;                   # Need to process all postings
616        my $chop;                   # Score necessary to enter the ranking list
617    
618        if (# We know that wanted is true since we special cased the
619            # exhaustive search.
620    
621            $wanted and
622    
623            # We did sort here if necessary in
624            # the preparation loop
625            # $self->{reorg} and
626    
627            scalar keys %score > $wanted) {
628          $chop = (sort { $b <=> $a } values %score)[$wanted];
629          $full = $max[$i] > $chop;
630        } else {
631          $full = 1;
632        }
633    
634        if ($full) {
635          # We need to inspect the full list. Either $wanted is not given,
636          # the index is not sorted, or we don't have enough accumulators
637          # yet.
638          if (defined $chop) {
639            # We might be able to avoid allocating accumulators
640            for my $did (keys %post) {
641              if (my $freq = $self->{db}->{'m'. $did}) {
642                my $wgt = $post{$did} / $freq * $idf;
643                # We add an accumulator if $wgt exeeds $chop
644                if (exists $score{$did} or $wgt > $chop) {
645                  $score{$did} += $wgt;
646                }
647              }
648            }
649          } else {
650            # Allocate acumulators for each seen document.
651            for my $did (keys %post) {
652              if (my $freq = $self->{db}->{'m'. $did}) {
653                $score{$did} += $post{$did} / $freq * $idf;
654              }
655            }
656          }
657        } else {
658          # Update existing accumulators
659          for my $did (keys %score) {
660            next unless exists $post{$did};
661            if (my $freq = $self->{db}->{'m'. $did}) {
662              $score{$did} += $post{$did} / $freq * $idf;
663            }
664          }
665        }
666      }
667      #warn sprintf "Used %d accumulators\n", scalar keys %score;
668    %score;    %score;
669  }  }
670    
671    sub set {
672      my ($self, $attr, $value) = @_;
673    
674      die "No such index attribute: '$attr'" unless $attr eq 'top';
675    
676      return delete $self->{reorg} if $value == 0;
677    
678      return if     $self->{reorg};     # we are sorted already
679      return unless $self->{mode} & O_RDWR;
680      defined $self->{db} or $self->open;
681    
682      $self->sync;
683      while (my($key, $value) = each %{$self->{db}}) {
684        next if $key !~ /^p/;
685        $self->{db}->{$key} = $self->sort_postings($value);
686      }
687      $self->{reorg} = 1;
688    }
689    
690  sub sync {  sub sync {
691    my $self = shift;    my $self = shift;
692    
693    if ($self->{mode} & O_RDWR) {    if ($self->{mode} & O_RDWR) {
694      print STDERR "Flushing $self->{cached} postings\n";      print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};
695      while (my($key, $value) = each %{$self->{cache}}) {      while (my($key, $value) = each %{$self->{cache}}) {
696        $self->{db}->{$key} .= $value;        $self->{db}->{"p". $key} ||= "";
697        #delete $self->{cache}->{$key};        if ($self->{reorg}) {
698            $self->{db}->{'p'.$key} = $self->sort_postings($self->{db}->{'p'.$key}
699                                                       . $value);
700          } else {
701            $self->{db}->{'p'.$key} .= $value;
702          }
703      }      }
704      while (my($key, $value) = each %{$self->{cdict}}) {      while (my($key, $value) = each %{$self->{cdict}}) {
705        $self->{db}->{$key} = 0 unless  $self->{db}->{$key};        $self->{db}->{'o'.$key} = 0 unless  $self->{db}->{'o'.$key};
706        $self->{db}->{$key} += $value;        $self->{db}->{'o'.$key} += $value;
707        #delete $self->{cdict}->{$key};      }
708      }      $self->{cache}  = {};
709      $self->{cache} = {};      $self->{cdict}  = {};
     $self->{cdict} = {};  
     # print STDERR "*** $self->{cache} ", tied(%{$self->{cache}}), "==\n";  
710      $self->{cached} = 0;      $self->{cached} = 0;
     # $self->{dbh}->sync if $self->{dbh};  
711    }    }
712  }  }
713    
# Line 360  sub close { Line 729  sub close {
729    }    }
730  }  }
731    
732    sub keys {
733      my $self  = shift;
734    
735      defined $self->{db} or $self->open;
736      keys %{$self->{db}};
737    }
738    
739  1;  1;
740    

Legend:
Removed from v.13  
changed lines
  Added in v.85

  ViewVC Help
Powered by ViewVC 1.1.26