/[wait]/trunk/lib/WAIT/InvertedIndex.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/WAIT/InvertedIndex.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

branches/CPAN/lib/WAIT/InvertedIndex.pm revision 13 by ulpfr, Fri Apr 28 15:42:44 2000 UTC cvs-head/lib/WAIT/InvertedIndex.pm revision 71 by laperla, Sun Jan 27 15:27:38 2002 UTC
# Line 1  Line 1 
1  #                              -*- Mode: Cperl -*-  #                              -*- Mode: Perl -*-
2  # InvertedIndex.pm --  # $Basename: InvertedIndex.pm $
3  # ITIID           : $ITI$ $Header $__Header$  # $Revision: 1.30 $
4  # Author          : Ulrich Pfeifer  # Author          : Ulrich Pfeifer
5  # Created On      : Thu Aug  8 13:05:10 1996  # Created On      : Thu Aug  8 13:05:10 1996
6  # Last Modified By: Ulrich Pfeifer  # Last Modified By: Ulrich Pfeifer
7  # Last Modified On: Sun Nov 22 18:44:42 1998  # Last Modified On: Mon Dec 31 14:30:05 2001
8  # Language        : CPerl  # Language        : CPerl
9  # Status          : Unknown, Use with caution!  #
10  #  # (C) Copyright 1996-2000, Ulrich Pfeifer
11  # Copyright (c) 1996-1997, Ulrich Pfeifer  #
 #  
12    
13  package WAIT::InvertedIndex;  package WAIT::InvertedIndex;
14  use strict;  use strict;
# Line 17  use DB_File; Line 16  use DB_File;
16  use Fcntl;  use Fcntl;
17  use WAIT::Filter;  use WAIT::Filter;
18  use Carp;  use Carp;
19  use vars qw(%FUNC);  use vars qw(%FUNC $VERSION);
20    
21    $VERSION = "1.801"; # others test if we are loaded by checking $VERSION
22    
23    # The dictionary has three different key types:
24    #  'o'.$word
25    #
26    #     The document frequency is the number of documents a term occurs
27    #     in. The idea is that a term occuring in a significant part of the
28    #     documents is not too significant.
29    #
30    # 'm'.$word
31    #
32    #     The maximum term frequency of a document is the frequency of the
33    #     most frequent term in the document.  It is related to the document
34    #     length obviously.  A document in which the most frequent term occurs
35    #     100 times is probably much longer than a document whichs most
36    #     frequent term occurs five time.
37    #
38    # 'p'.$word
39    #
40    #     Under this key we store the actual posting list as pairs of
41    #     packed integers.
42    
43  my $O = pack('C', 0xff)."o";                  # occurances  my $no_old_index_support = 0; # do not check for old indices if set
 my $M = pack('C', 0xff)."m";                  # maxtf  
44    
45  sub new {  sub new {
46    my $type = shift;    my $type = shift;
# Line 124  sub drop { Line 144  sub drop {
144    }    }
145  }  }
146    
147    sub is_an_old_index {
148      my $self = shift;
149    
150      return 0 if $no_old_index_support;
151      return $self->{old_index} if exists $self->{old_index};
152    
153      # We can only guess if this is an old index. We lookup the first 10
154      # $O entries. If all values are integers, we assume that the index
155      # is an old one.
156    
157      defined $self->{db} or $self->open;
158      $self->sync;
159      my $dbh = $self->{dbh};       # for convenience
160    
161      my $O = pack('C', 0xff)."o";
162      my ($word, $value) = ($O.$;);  # $word and $value are modified by seq!
163      if ( my $ret = $dbh->seq($word, $value, R_CURSOR) ) {
164        # warn "DEBUG: ret[$ret], not an old index, either empty or no \$^O";
165        return $self->{old_index} = 0;
166      }
167      for (my $i=0; $i<10;$i++) {
168        if ($value !~ /^\d+$/) {
169          # warn "DEBUG: word[$word]value[$value], not an old index";
170          return $self->{old_index} = 0;
171        }
172        if (my $ret = $dbh->seq($word, $value, R_NEXT) or # no values left
173            $word !~ /^$O$;/o                   # no $O values left
174           ) {
175          # we are not sure enough that this is an old index
176          # warn "DEBUG: ret[$ret]word[$word]value[$value], not an old index";
177          return $self->{old_index} = 0;
178        }
179      }
180      # warn "DEBUG: old index";
181      return $self->{old_index} = 1;
182    }
183    
184  sub open {  sub open {
185    my $self = shift;    my $self = shift;
186    my $file = $self->{file};    my $file = $self->{file};
# Line 135  sub open { Line 192  sub open {
192        eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));        eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
193      $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,      $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,
194                         $self->{mode}, 0664, $DB_BTREE);                         $self->{mode}, 0664, $DB_BTREE);
 #    tie(%{$self->{cache}}, 'DB_File', undef,  
 #        $self->{mode}, 0664, $DB_BTREE)  
195      $self->{cache} = {}      $self->{cache} = {}
196        if $self->{mode} & O_RDWR;        if $self->{mode} & O_RDWR;
 #    tie(%{$self->{cdict}}, 'DB_File', undef,  
 #        $self->{mode}, 0664, $DB_BTREE)  
197      $self->{cdict} = {}      $self->{cdict} = {}
198        if $self->{mode} & O_RDWR;        if $self->{mode} & O_RDWR;
199      $self->{cached} = 0;      $self->{cached} = 0;
200        if (!$no_old_index_support and $self->is_an_old_index()) {
201          warn "This is an old index, upgrade you database";
202          require WAIT::InvertedIndexOld;
203          bless $self, 'WAIT::InvertedIndexOld';
204        }
205    }    }
206  }  }
207    
# Line 158  sub insert { Line 216  sub insert {
216    $self->{records}++;    $self->{records}++;
217    while (($word, $noc) = each %occ) {    while (($word, $noc) = each %occ) {
218      if (defined $self->{cache}->{$word}) {      if (defined $self->{cache}->{$word}) {
219        $self->{cdict}->{$O,$word}++;        $self->{cdict}->{$word}++;
220        $self->{cache}->{$word} .= pack 'w2', $key, $noc;        $self->{cache}->{$word} .= pack 'w2', $key, $noc;
221      } else {      } else {          
222        $self->{cdict}->{$O,$word} = 1;        $self->{cdict}->{$word} = 1;
223        $self->{cache}->{$word}  = pack 'w2', $key, $noc;        $self->{cache}->{$word}  = pack 'w2', $key, $noc;
224      }      }
225      $self->{cached}++;      $self->{cached}++;
226    }    }
227      # This cache limit should be configurable
228    $self->sync if $self->{cached} > 100_000;    $self->sync if $self->{cached} > 100_000;
229    my $maxtf = 0;    my $maxtf = 0;
230    for (values %occ) {    for (values %occ) {
231      $maxtf = $_ if $_ > $maxtf;      $maxtf = $_ if $_ > $maxtf;
232    }    }
233    $self->{db}->{$M, $key} = $maxtf;    $self->{db}->{'m'. $key} = $maxtf;
234    }
235    
236    # We sort postings by increasing max term frequency (~ by increasing
237    # document length.  This reduces the quality degradation if we process
238    # only the first part of a posting list.
239    
240    sub sort_postings {
241      my $self = shift;
242      my $post = shift;             # reference to a hash or packed string
243    
244      if (ref $post) {
245        # we skip the sort part, if the index is not sorted
246        return pack('w*', %$post) unless $self->{reorg};
247      } else {
248        $post = { unpack 'w*', $post };
249      }
250    
251      my $r = '';
252    
253      # Sort posting list by increasing ratio of maximum term frequency (~
254      # "document length") and term frequency. This rati multipied by the
255      # inverse document frequence gives the score for a term.  This sort
256      # order can be exploited for tuning of single term queries.
257    
258      for my $did (keys %$post) { # sanity check
259        unless ($self->{db}->{"m". $did}) {
260          warn "Warning from WAIT: DIVZERO threat from did[$did] post[$post->{$did}]";
261          $self->{db}->{"m". $did} = 1; # fails if we have not opened for writing
262        }
263      }
264      for my $did (sort {    $post->{$b} / $self->{db}->{'m'. $b}
265                                          <=>
266                             $post->{$a} / $self->{db}->{'m'. $a}
267                        } keys %$post) {
268        $r .= pack 'w2', $did, $post->{$did};
269      }
270      #warn sprintf "reorg %d %s\n", scalar keys %$post, join ' ', unpack 'w*', $r;
271      $r;
272  }  }
273    
274  sub delete {  sub delete {
# Line 179  sub delete { Line 276  sub delete {
276    my $key   = shift;    my $key   = shift;
277    my %occ;    my %occ;
278    
279      my $db;
280    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
281      $db = $self->{db};
282    $self->sync;    $self->sync;
283    $self->{records}--;    $self->{records}--;
284    
285      # less than zero documents in database?
286      _complain('delete of document', $key) and $self->{records} = 0
287        if $self->{records} < 0;
288    
289    grep $occ{$_}++, &{$self->{func}}(@_);    grep $occ{$_}++, &{$self->{func}}(@_);
290    for (keys %occ) {  
291      # may reorder posting list    # Be prepared for "Odd number of elements in hash assignment"
292      my %post = unpack 'w*', $self->{db}->{$_};    local $SIG{__WARN__} = sub {
293      $self->{db}->{$O,$_}--;      my $warning = shift;
294        chomp $warning;
295        warn "Catching warning[$warning] during delete of key[$key]";
296      };
297      for (keys %occ) {# may reorder posting list
298        my %post = unpack 'w*', $db->{'p'.$_};
299      delete $post{$key};      delete $post{$key};
300      $self->{db}->{$_} = pack 'w*', %post;      $db->{'p'.$_}    = $self->sort_postings(\%post);
301        _complain('delete of term', $_) if $db->{'o'.$_}-1 != keys %post;
302        $db->{'o'.$_} = scalar keys %post;
303    }    }
304    delete $self->{db}->{$M, $key};    delete $db->{'m'. $key};
305  }  }
306    
307  sub intervall {  sub intervall {
# Line 213  sub intervall { Line 324  sub intervall {
324      ($first) = &{$self->{'ifunc'}}($first) if $first;      ($first) = &{$self->{'ifunc'}}($first) if $first;
325      ($last)  = &{$self->{'ifunc'}}($last) if $last;      ($last)  = &{$self->{'ifunc'}}($last) if $last;
326    }    }
327    if (defined $first and $first ne '') {         # set the cursor to $first    $first = 'p'.($first||'');
328      $dbh->seq($first, $value, R_CURSOR);    $last  = (defined $last)?'p'.$last:'q';
329    } else {  
330      $dbh->seq($first, $value, R_FIRST);    # set the cursor to $first
331    }    $dbh->seq($first, $value, R_CURSOR);
332    # We assume that word do not start with the character \377  
333    # $last = pack 'C', 0xff unless defined $last and $last ne '';    # $first would be after the last word
334    return () if defined $last and $first gt $last; # $first would be after the last word    return () if $first gt $last;
335        
336    push @result, $first;    push @result, substr($first,1);
337    while (!$dbh->seq($word, $value, R_NEXT)) {    while (!$dbh->seq($word, $value, R_NEXT)) {
338      # We should limit this to a "resonable" number of words      # We should limit this to a "resonable" number of words
339      last if (defined $last and $word gt $last) or $word =~ /^($M|$O)/o;      last if $word gt $last;
340      push @result, $word;      push @result, substr($word,1);
341    }    }
342    \@result;                     # speed    \@result;                     # speed
343  }  }
# Line 251  sub prefix { Line 362  sub prefix {
362      ($prefix) = &{$self->{'pfunc'}}($prefix);      ($prefix) = &{$self->{'pfunc'}}($prefix);
363    }    }
364    
365    if ($dbh->seq($word = $prefix, $value, R_CURSOR)) {    if ($dbh->seq($word = 'p'.$prefix, $value, R_CURSOR)) {
366      return ();      return ();
367    }    }
368    return () if $word !~ /^$prefix/;    return () if $word !~ /^p$prefix/;
369    push @result, $word;    push @result, substr($word,1);
370    
371    while (!$dbh->seq($word, $value, R_NEXT)) {    while (!$dbh->seq($word, $value, R_NEXT)) {
372      # We should limit this to a "resonable" number of words      # We should limit this to a "resonable" number of words
373      last if $word !~ /^$prefix/;      last if $word !~ /^p$prefix/;
374      push @result, $word;      push @result, substr($word,1);
375    }    }
376    \@result;                     # speed    \@result;                     # speed
377  }  }
378    
379    =head2 search($query)
380    
381    The search method supports a range of search algorithms.  It is
382    recommended to tune the index by calling
383    C<$table-E<gt>set(top=E<gt>1)> B<after> bulk inserting the documents
384    into the table.  This is a computing intense operation and all inserts
385    and deletes after this optimization are slightly more expensive.  Once
386    reorganized, the index is kept sorted automatically until you switch
387    the optimization off by calling C<$table-E<gt>set(top=E<gt>0)>.
388    
389    When searching a tuned index, a query can be processed faster if the
390    caller requests only the topmost documents.  This can be done by
391    passing a C<top =E<gt>> I<n> parameter to the search method.
392    
393    For single term queries, the method returns only the I<n> top ranking
394    documents.  For multi term queries two optimized algorithms are
395    available. The first algorithm computes the top n documents
396    approximately but very fast, sacrificing a little bit of precision for
397    speed.  The second algorithm computes the topmost I<n> documents
398    precisely.  This algorithm is slower and should be used only for small
399    values of I<n>.  It can be requested by passing the query attribute
400    C<picky =E<gt> 1>. Both algorithms may return more than I<n> hits.
401    While the picky version might not be faster than the brute force
402    version on average for modest size databases it uses less memory and
403    the processing time is almost linear in the number of query terms, not
404    in the size of the lists.
405    
406    =cut
407    
408  sub search {  sub search {
409    my $self  = shift;    my $self  = shift;
410      my $query = shift;
411    
412    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
413    $self->sync;    $self->sync;
414    $self->search_raw(&{$self->{func}}(@_)); # No call to parse() here    $self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() here
415  }  }
416    
417  sub parse {  sub parse {
# Line 280  sub parse { Line 421  sub parse {
421    &{$self->{func}}(@_);    &{$self->{func}}(@_);
422  }  }
423    
 sub keys {  
   my $self  = shift;  
   
   defined $self->{db} or $self->open;  
   keys %{$self->{db}};  
 }  
   
424  sub search_prefix {  sub search_prefix {
425    my $self  = shift;    my $self  = shift;
426    
# Line 295  sub search_prefix { Line 429  sub search_prefix {
429    $self->search_raw(map($self->prefix($_), @_));    $self->search_raw(map($self->prefix($_), @_));
430  }  }
431    
432    sub _complain ($$) {
433      my ($action, $term) = @_;
434    
435      require Carp;
436      Carp::cluck
437        (sprintf("WAIT database inconsistency during $action [%s]: ".
438                 "Please rebuild index\n",
439                 $term,));
440    }
441    
442  sub search_raw {  sub search_raw {
443    my $self  = shift;    my $self  = shift;
444    my %occ;    my $query = shift;
445    my %score;    my %score;
446    
447    return () unless @_;    # Top $wanted documents must be correct. Zero means all matching
448      # documents.
449      my $wanted = $query->{top};
450      my $strict = $query->{picky};
451    
452      # Return at least $minacc documents. Zero means all matching
453      # documents.
454      # my $minacc = $query->{accus} || $wanted;
455    
456      # Open index and flush cache if necessary
457    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
458    $self->sync;    $self->sync;
459    grep $occ{$_}++, @_;  
460    for (keys %occ) {    # We keep duplicates
461      if (defined $self->{db}->{$_}) {    my @terms =
462        my %post = unpack 'w*', $self->{db}->{$_};      # Sort words by decreasing document frequency
463        my $idf = log($self->{records}/($self->{db}->{$O,$_} || 1));      sort { $self->{db}->{'o'.$a} <=> $self->{db}->{'o'.$b} }
464        my $did;        # check which words occur in the index.
465        for $did (keys %post) {        grep { $self->{db}->{'o'.$_} } @_;
466          $score{$did} = 0 unless defined $score{$did}; # perl -w  
467          $score{$did} += $post{$did} / $self->{db}->{$M, $did} * $idf    return unless @terms;
468            if $self->{db}->{$M, $did}; # db may be broken  
469      # We special-case one term queries here.  If the index was sorted,
470      # choping off the rest of the list will return the same ranking.
471      if ($wanted and @terms == 1) {
472        my $term  = shift @terms;
473        my $idf   = log($self->{records}/$self->{db}->{'o'.$term});
474        my @res;
475    
476        if ($self->{reorg}) { # or not $query->{picky}
477          @res = unpack "w". int(2*$wanted), $self->{db}->{'p'.$term};
478        } else {
479          @res = unpack 'w*',                $self->{db}->{'p'.$term};
480        }
481    
482        for (my $i=1; $i<@res; $i+=2) {
483          # $res[$i] /= $self->{db}->{'m'. $res[$i-1]} / $idf;
484          # above was written badly, allows two DIV_ZERO problems.
485          my $maxtf = $self->{db}->{"m". $res[$i-1]};
486          unless ($maxtf) {
487            warn "WAIT-Warning: Averting DIVZERO for i[$i] \$res[\$i-1][$res[$i-1]] term[$term]";
488            $maxtf = 1;
489          }
490          $res[$i] = ($res[$i] / $maxtf) * $idf;
491        }
492    
493        return @res
494      }
495    
496      # We separate exhaustive search here to avoid overhead and make the
497      # code more readable. The block can be removed without changing the
498      # result.
499      unless ($wanted) {
500        for (@terms) {
501          my $df      = $self->{db}->{'o'.$_};
502    
503          # The frequency *must* be 1 at least since the posting list is nonempty
504          _complain('search for term', $_) and $df = 1 if $df < 1;
505    
506          # Unpack posting list for current query term $_
507          my %post = unpack 'w*', $self->{db}->{'p'.$_};
508    
509          _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
510          # This is the inverse document frequency. The log of the inverse
511          # fraction of documents the term occurs in.
512          my $idf = log($self->{records}/$df);
513          for my $did (keys %post) {
514            if (my $freq = $self->{db}->{'m'. $did}) {
515              $score{$did} += $post{$did} / $freq * $idf;
516            }
517          }
518        }
519        # warn sprintf "Used %d accumulators\n", scalar keys %score;
520        return %score;
521      }
522    
523      # A sloppy but fast algorithm for multiple term queries.
524      unless ($strict) {
525        for (@terms) {
526          # Unpack posting list for current query term $_
527          my %post = unpack 'w*', $self->{db}->{'p'.$_};
528    
529          # Lookup the number of documents the term occurs in (document frequency)
530          my $occ  = $self->{db}->{'o'.$_};
531    
532          _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
533          # The frequency *must* be 1 at least since the posting list is nonempty
534          _complain('search for term', $_) and $occ = 1 if $occ < 1;
535    
536          # This is the inverse document frequency. The log of the inverse
537          # fraction of documents the term occurs in.
538          my $idf = log($self->{records}/$occ);
539    
540          # If we have a reasonable number of accumulators, change the
541          # loop to iterate over the accumulators.  This will compromise
542          # quality for better speed.  The algorithm still computes the
543          # exact weights, but the result is not guaranteed to contain the
544          # *best* results.  The database might contain documents better
545          # than the worst returned document.
546          
547          # We process the lists in order of increasing length.  When the
548          # number of accumulators exceeds $wanted, no new documents are
549          # added, only the ranking/weighting of the seen documents is
550          # improved.  The resulting ranking list must be pruned, since only
551          # the top most documents end up near their "optimal" rank.
552          
553          if (keys %score < $wanted) {
554            for my $did (keys %post) {
555              if (my $freq = $self->{db}->{'m'. $did}) {
556                $score{$did} += $post{$did} / $freq * $idf;
557              }
558            }
559          } else {
560            for my $did (keys %score) {
561              next unless exists $post{$did};
562              if (my $freq = $self->{db}->{'m'. $did}) {
563                $score{$did} += $post{$did} / $freq * $idf;
564              }
565            }
566        }        }
567      }      }
568        return %score;
569    }    }
570      my @max; $max[$#terms+1]=0;
571      my @idf;
572    
573      # Preparation loop.  This extra loop makes sense only when "reorg"
574      # and "wanted" are true.  But at the time beeing, keeping the code
575      # for the different search algorithms in one place seems more
576      # desirable than some minor speedup of the brute force version.  We
577      # do cache $idf though.
578    
579      for (my $i = $#terms; $i >=0; $i--) {
580        local $_ = $terms[$i];
581        # Lookup the number of documents the term occurs in (document frequency)
582        my $df      = $self->{db}->{'o'.$_};
583    
584        # The frequency *must* be 1 at least since the posting list is nonempty
585        _complain('search for term', $_) and $df = 1 if $df < 1;
586    
587        # This is the inverse document frequency. The log of the inverse
588        # fraction of documents the term occurs in.
589        $idf[$i] = log($self->{records}/$df);
590    
591        my ($did,$occ);
592        if ($self->{reorg}) {
593          ($did,$occ) = unpack 'w2', $self->{db}->{'p'.$_};
594        } else {                    # Maybe this costs more than it helps
595          ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{'p'.$_});
596        }
597        my $freq      = $self->{db}->{'m'. $did};
598        my $max       = $occ/$freq*$idf[$i];
599        $max[$i]      = $max + $max[$i+1];
600      }
601    
602      # Main loop
603      for my $i (0 .. $#terms) {
604        my $term = $terms[$i];
605        # Unpack posting list for current query term $term. We loose the
606        # sorting order because the assignment to a hash.
607        my %post = unpack 'w*', $self->{db}->{'p'.$term};
608    
609        _complain('search for term', $term)
610          if $self->{db}->{'o'.$term} != keys %post;
611    
612        my $idf  = $idf[$i];
613        my $full;                   # Need to process all postings
614        my $chop;                   # Score necessary to enter the ranking list
615    
616        if (# We know that wanted is true since we especial cased the
617            # exhaustive search.
618    
619            $wanted and
620    
621            # We did sort here if necessary in
622            # the preparation loop
623            # $self->{reorg} and
624    
625            scalar keys %score > $wanted) {
626          $chop = (sort { $b <=> $a } values %score)[$wanted];
627          $full = $max[$i] > $chop;
628        } else {
629          $full = 1;
630        }
631    
632        if ($full) {
633          # We need to inspect the full list. Either $wanted is not given,
634          # the index is not sorted, or we don't have enough accumulators
635          # yet.
636          if (defined $chop) {
637            # We might be able to avoid allocating accumulators
638            for my $did (keys %post) {
639              if (my $freq = $self->{db}->{'m'. $did}) {
640                my $wgt = $post{$did} / $freq * $idf;
641                # We add an accumulator if $wgt exeeds $chop
642                if (exists $score{$did} or $wgt > $chop) {
643                  $score{$did} += $wgt;
644                }
645              }
646            }
647          } else {
648            # Allocate acumulators for each seen document.
649            for my $did (keys %post) {
650              if (my $freq = $self->{db}->{'m'. $did}) {
651                $score{$did} += $post{$did} / $freq * $idf;
652              }
653            }
654          }
655        } else {
656          # Update existing accumulators
657          for my $did (keys %score) {
658            next unless exists $post{$did};
659            if (my $freq = $self->{db}->{'m'. $did}) {
660              $score{$did} += $post{$did} / $freq * $idf;
661            }
662          }
663        }
664      }
665      #warn sprintf "Used %d accumulators\n", scalar keys %score;
666    %score;    %score;
667  }  }
668    
669    sub set {
670      my ($self, $attr, $value) = @_;
671    
672      die "No such indexy attribute: '$attr'" unless $attr eq 'top';
673    
674      return delete $self->{reorg} if $value == 0;
675    
676      return if     $self->{reorg};     # we are sorted already
677      return unless $self->{mode} & O_RDWR;
678      defined $self->{db} or $self->open;
679    
680      $self->sync;
681      while (my($key, $value) = each %{$self->{db}}) {
682        next if $key !~ /^p/;
683        $self->{db}->{$key} = $self->sort_postings($value);
684      }
685      $self->{reorg} = 1;
686    }
687    
688  sub sync {  sub sync {
689    my $self = shift;    my $self = shift;
690    
691    if ($self->{mode} & O_RDWR) {    if ($self->{mode} & O_RDWR) {
692      print STDERR "Flushing $self->{cached} postings\n";      print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};
693      while (my($key, $value) = each %{$self->{cache}}) {      while (my($key, $value) = each %{$self->{cache}}) {
694        $self->{db}->{$key} .= $value;        $self->{db}->{"p". $key} ||= "";
695        #delete $self->{cache}->{$key};        if ($self->{reorg}) {
696            $self->{db}->{'p'.$key} = $self->sort_postings($self->{db}->{'p'.$key}
697                                                       . $value);
698          } else {
699            $self->{db}->{'p'.$key} .= $value;
700          }
701      }      }
702      while (my($key, $value) = each %{$self->{cdict}}) {      while (my($key, $value) = each %{$self->{cdict}}) {
703        $self->{db}->{$key} = 0 unless  $self->{db}->{$key};        $self->{db}->{'o'.$key} = 0 unless  $self->{db}->{'o'.$key};
704        $self->{db}->{$key} += $value;        $self->{db}->{'o'.$key} += $value;
705        #delete $self->{cdict}->{$key};      }
706      }      $self->{cache}  = {};
707      $self->{cache} = {};      $self->{cdict}  = {};
     $self->{cdict} = {};  
     # print STDERR "*** $self->{cache} ", tied(%{$self->{cache}}), "==\n";  
708      $self->{cached} = 0;      $self->{cached} = 0;
     # $self->{dbh}->sync if $self->{dbh};  
709    }    }
710  }  }
711    
# Line 360  sub close { Line 727  sub close {
727    }    }
728  }  }
729    
730    sub keys {
731      my $self  = shift;
732    
733      defined $self->{db} or $self->open;
734      keys %{$self->{db}};
735    }
736    
737  1;  1;
738    

Legend:
Removed from v.13  
changed lines
  Added in v.71

  ViewVC Help
Powered by ViewVC 1.1.26