/[wait]/trunk/lib/WAIT/InvertedIndexOld.pm
Revision 89 - Mon May 24 20:57:08 2004 UTC by dpavlin
File size: 19904 byte(s)
revert to DB_File from BerkeleyDB

# -*- Mode: Perl -*-
# $Basename: InvertedIndex.pm $
# $Revision: 1.1 $
# Author : Ulrich Pfeifer
# Created On : Thu Aug 8 13:05:10 1996
# Last Modified By: Ulrich Pfeifer
# Last Modified On: Sat Nov 11 14:36:39 2000
# Language : CPerl
#
# (C) Copyright 1996-2000, Ulrich Pfeifer
#

package WAIT::InvertedIndexOld;
use strict;
use DB_File;
use Fcntl;
use WAIT::Filter;
use Carp;
use vars qw(%FUNC);

my $O = pack('C', 0xff)."o";    # occurrences (document frequency)

# The document frequency is the number of documents a term occurs in.
# The idea is that a term occurring in a significant part of the
# documents is not too significant.

my $M = pack('C', 0xff)."m";    # maxtf (maximum term frequency)

# The maximum term frequency of a document is the frequency of the
# most frequent term in the document.  It is obviously related to the
# document length.  A document whose most frequent term occurs 100
# times is probably much longer than a document whose most frequent
# term occurs five times.

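# Illustrative note (not part of the original module): these two
# statistics combine into the score that search_raw() below assigns to
# a document.  With N indexed records, a term occurring in df documents
# gets idf = log(N/df), and a document with term frequency tf and
# maximum term frequency maxtf collects tf/maxtf * idf for that term,
# e.g.
#
#   N = 1000, df = 10, tf = 4, maxtf = 20
#   score += 4/20 * log(1000/10);   # = 0.2 * log(100) ~ 0.92
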
sub new {
  my $type = shift;
  my %parm = @_;
  my $self = {};

  $self->{file}    = $parm{file} or croak "No file specified";
  $self->{attr}    = $parm{attr} or croak "No attributes specified";
  $self->{filter}  = $parm{filter};
  $self->{'name'}  = $parm{'name'};
  $self->{records} = 0;
  for (qw(intervall prefix)) {
    if (exists $parm{$_}) {
      if (ref $parm{$_}) {
        $self->{$_} = [@{$parm{$_}}]    # clone
      } else {
        $self->{$_} = $parm{$_}
      }
    }
  }
  bless $self, ref($type) || $type;
}

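# Illustrative usage sketch (not from the original source; the file
# name, attribute name and filter chain below are assumed examples):
#
#   my $idx = WAIT::InvertedIndexOld->new
#     (file   => "$database_dir/text_index",
#      name   => 'text',
#      attr   => ['text'],
#      filter => [qw(isotr split2 stop)]);
#
# In practice the index object is created and managed by WAIT::Table
# rather than used directly.
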
sub name {$_[0]->{'name'}}

sub _split_pos {
  my ($text, $pos) = @{$_[0]};
  my @result;

  $text =~ s/(^\s+)// and $pos += length($1);
  while ($text =~ s/(^\S+)//) {
    my $word = $1;
    push @result, [$word, $pos];
    $pos += length($word);
    $text =~ s/(^\s+)// and $pos += length($1);
  }
  @result;
}

sub _xfiltergen {
  my $filter = pop @_;

  # Oops, we cannot overrule the user's choice.  Other filters may kill
  # stopwords, e.g. isotr clobbers "isn't" to "isnt".

  # if ($filter eq 'stop') {    # avoid the slow stopword elimination
  #   return _xfiltergen(@_);   # it's cheaper to look them up afterwards
  # }
  if (@_) {
    if ($filter =~ /^split(\d*)/) {
      if ($1) {
        "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .'))' ;
      } else {
        "map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .')' ;
      }
    } else {
      "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]]," ._xfiltergen(@_) .')';
    }
  } else {
    if ($filter =~ /^split(\d*)/) {
      if ($1) {
        "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0]))" ;
      } else {
        "map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0])" ;
      }
    } else {
      "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]], [\$_[0], 0])";
    }
  }
}

sub parse_pos {
  my $self = shift;

  unless (exists $self->{xfunc}) {
    $self->{xfunc} =
      eval sprintf("sub {%s}", _xfiltergen(@{$self->{filter}}));
    #printf "\nsub{%s}$@\n", _xfiltergen(@{$self->{filter}});
  }
  &{$self->{xfunc}}($_[0]);
}

sub _filtergen {
  my $filter = pop @_;

  if (@_) {
    "map(&WAIT::Filter::$filter(\$_), " . _filtergen(@_) . ')';
  } else {
    "map(&WAIT::Filter::$filter(\$_), \@_)";
  }
}

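# Illustrative sketch (not from the original source): with an assumed
# filter chain of ['isotr', 'stop'], _filtergen() builds the string
#
#   map(&WAIT::Filter::stop($_), map(&WAIT::Filter::isotr($_), @_))
#
# which open() below wraps in "sub {grep /./, ...}" and compiles with
# eval, so the filters are applied in the order they were listed.
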
sub drop {
  my $self = shift;
  if ((caller)[0] eq 'WAIT::Table') {    # Table knows about this
    my $file = $self->{file};

    ! (!-e $file or unlink $file);
  } else {                               # notify our database
    croak ref($self)."::drop called directly";
  }
}

sub open {
  my $self = shift;
  my $file = $self->{file};

  if (defined $self->{dbh}) {
    $self->{dbh};
  } else {
    $self->{func} =
      eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
    $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,
                       $self->{mode}, 0664, $DB_BTREE);
    $self->{cache} = {}
      if $self->{mode} & O_RDWR;
    $self->{cdict} = {}
      if $self->{mode} & O_RDWR;
    $self->{cached} = 0;
  }
}

sub insert {
  my $self = shift;
  my $key  = shift;
  my %occ;

  defined $self->{db} or $self->open;
  grep $occ{$_}++, &{$self->{func}}(@_);
  my ($word, $noc);
  $self->{records}++;
  while (($word, $noc) = each %occ) {
    if (defined $self->{cache}->{$word}) {
      $self->{cdict}->{$O,$word}++;
      $self->{cache}->{$word} .= pack 'w2', $key, $noc;
    } else {
      $self->{cdict}->{$O,$word} = 1;
      $self->{cache}->{$word} = pack 'w2', $key, $noc;
    }
    $self->{cached}++;
  }
  # This cache limit should be configurable
  $self->sync if $self->{cached} > 100_000;
  my $maxtf = 0;
  for (values %occ) {
    $maxtf = $_ if $_ > $maxtf;
  }
  $self->{db}->{$M, $key} = $maxtf;
}

# We sort postings by increasing max term frequency (~ by increasing
# document length).  This reduces the quality degradation if we
# process only the first part of a posting list.

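# Illustrative note (not from the original source): a posting list in
# $self->{db} is a BER-compressed string of (document id, term
# frequency) pairs, so it can be rebuilt as a hash and repacked, for
# example:
#
#   my %post   = unpack 'w*', $packed;      # { $did => $tf, ... }
#   my $again  = pack 'w*', %post;          # pair order may change
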
sub sort_postings {
  my $self = shift;
  my $post = shift;    # reference to a hash or packed string

  if (ref $post) {
    # we skip the sort part, if the index is not sorted
    return pack('w*', %$post) unless $self->{reorg};
  } else {
    $post = { unpack 'w*', $post };
  }

  my $r = '';

  # Sort the posting list by increasing ratio of maximum term frequency
  # (~ "document length") and term frequency.  This ratio multiplied by
  # the inverse document frequency gives the score for a term.  This
  # sort order can be exploited for tuning of single term queries.

  for my $did (sort { $post->{$b} / $self->{db}->{$M, $b}
                      <=>
                      $post->{$a} / $self->{db}->{$M, $a}
                    } keys %$post) {
    $r .= pack 'w2', $did, $post->{$did};
  }
  #warn sprintf "reorg %d %s\n", scalar keys %$post, join ' ', unpack 'w*', $r;
  $r;
}

sub delete {
  my $self = shift;
  my $key  = shift;
  my %occ;

  my $db;
  defined $self->{db} or $self->open;
  $db = $self->{db};
  $self->sync;
  $self->{records}--;

  # less than zero documents in database?
  _complain('delete of document', $key) and $self->{records} = 0
    if $self->{records} < 0;

  grep $occ{$_}++, &{$self->{func}}(@_);

  for (keys %occ) {    # may reorder posting list
    my %post = unpack 'w*', $db->{$_};
    delete $post{$key};
    $db->{$_} = $self->sort_postings(\%post);
    _complain('delete of term', $_) if $db->{$O,$_}-1 != keys %post;
    $db->{$O,$_} = scalar keys %post;
  }
  delete $db->{$M, $key};
}

sub intervall {
  my ($self, $first, $last) = @_;
  my $value = '';
  my $word  = '';
  my @result;

  return unless exists $self->{'intervall'};

  defined $self->{db} or $self->open;
  $self->sync;
  my $dbh = $self->{dbh};    # for convenience

  if (ref $self->{'intervall'}) {
    unless (exists $self->{'ifunc'}) {
      $self->{'ifunc'} =
        eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}}));
    }
    ($first) = &{$self->{'ifunc'}}($first) if $first;
    ($last)  = &{$self->{'ifunc'}}($last)  if $last;
  }
  if (defined $first and $first ne '') {    # set the cursor to $first
    $dbh->seq($first, $value, R_CURSOR);
  } else {
    $dbh->seq($first, $value, R_FIRST);
  }
  # We assume that words do not start with the character \377
  # $last = pack 'C', 0xff unless defined $last and $last ne '';
  return () if defined $last and $first gt $last;    # $first would be after the last word

  push @result, $first;
  while (!$dbh->seq($word, $value, R_NEXT)) {
    # We should limit this to a "reasonable" number of words
    last if (defined $last and $word gt $last) or $word =~ /^($M|$O)/o;
    push @result, $word;
  }
  \@result;    # speed
}

sub prefix {
  my ($self, $prefix) = @_;
  my $value = '';
  my $word  = '';
  my @result;

  return () unless defined $prefix;    # Full dictionary requested !!
  return unless exists $self->{'prefix'};
  defined $self->{db} or $self->open;
  $self->sync;
  my $dbh = $self->{dbh};

  if (ref $self->{'prefix'}) {
    unless (exists $self->{'pfunc'}) {
      $self->{'pfunc'} =
        eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}}));
    }
    ($prefix) = &{$self->{'pfunc'}}($prefix);
  }

  if ($dbh->seq($word = $prefix, $value, R_CURSOR)) {
    return ();
  }
  return () if $word !~ /^$prefix/;
  push @result, $word;

  while (!$dbh->seq($word, $value, R_NEXT)) {
    # We should limit this to a "reasonable" number of words
    last if $word !~ /^$prefix/;
    push @result, $word;
  }
  \@result;    # speed
}

=head2 search($query)

The search method supports a range of search algorithms.  It is
recommended to tune the index by calling
C<$table-E<gt>set(top=E<gt>1)> B<after> bulk inserting the documents
into the table.  This is a compute-intensive operation, and all
inserts and deletes after this optimization are slightly more
expensive.  Once reorganized, the index is kept sorted automatically
until you switch the optimization off by calling
C<$table-E<gt>set(top=E<gt>0)>.

When searching a tuned index, a query can be processed faster if the
caller requests only the topmost documents.  This can be done by
passing a C<top =E<gt>> I<n> parameter to the search method.

For single-term queries, the method returns only the I<n> top-ranking
documents.  For multi-term queries two optimized algorithms are
available.  The first computes the top I<n> documents approximately
but very fast, sacrificing a little precision for speed.  The second
computes the topmost I<n> documents precisely.  It is slower and
should be used only for small values of I<n>; it can be requested by
passing the query attribute C<picky =E<gt> 1>.  Both algorithms may
return more than I<n> hits.  While the picky version might not be
faster than the brute-force version on average for modest-sized
databases, it uses less memory, and its processing time is almost
linear in the number of query terms rather than in the size of the
posting lists.

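A minimal usage sketch (illustrative only, not from the original
documentation; the surrounding WAIT::Table setup is assumed, and
C<$text> stands for the raw attribute value being searched):

  $table->set(top => 1);   # reorganize the index once, after bulk inserts

  # Fast, approximate top-10 ranking
  my %score = $index->search({top => 10}, $text);

  # Precise top-10 ranking (slower, intended for small n)
  my %exact = $index->search({top => 10, picky => 1}, $text);

  # Both return a hash mapping document ids to scores.
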
=cut

sub search {
  my $self  = shift;
  my $query = shift;

  defined $self->{db} or $self->open;
  $self->sync;
  $self->search_raw($query, &{$self->{func}}(@_));    # No call to parse() here
}

sub parse {
  my $self = shift;

  defined $self->{db} or $self->open;
  &{$self->{func}}(@_);
}

sub keys {
  my $self = shift;

  defined $self->{db} or $self->open;
  keys %{$self->{db}};
}

sub search_prefix {
  my $self = shift;

  # print "search_prefix(@_)\n";
  defined $self->{db} or $self->open;
  $self->search_raw(map($self->prefix($_), @_));
}

sub _complain ($$) {
  my ($action, $term) = @_;

  require Carp;
  Carp::cluck
      (sprintf("WAIT database inconsistency during $action [%s]: ".
               "Please rebuild index\n",
               $term,));
}

sub search_raw {
  my $self  = shift;
  my $query = shift;
  my %score;

  # The top $wanted documents must be correct.  Zero means all
  # matching documents.
  my $wanted = $query->{top};
  my $strict = $query->{picky};

  # Return at least $minacc documents.  Zero means all matching
  # documents.
  # my $minacc = $query->{accus} || $wanted;

  # Open index and flush cache if necessary
  defined $self->{db} or $self->open;
  $self->sync;

  # We keep duplicates
  my @terms =
    # Sort words by increasing document frequency
    sort { $self->{db}->{$O,$a} <=> $self->{db}->{$O,$b} }
    # Check which words occur in the index.
    grep { $self->{db}->{$O,$_} } @_;

  return () unless @terms;    # nothing to search for

  # We special-case one-term queries here.  If the index was sorted,
  # chopping off the rest of the list will return the same ranking.
  if ($wanted and @terms == 1) {
    my $term = shift @terms;
    my $idf  = log($self->{records}/$self->{db}->{$O,$term});
    my @res;

    if ($self->{reorg}) {    # or not $query->{picky}
      @res = unpack "w". int(2*$wanted), $self->{db}->{$term};
    } else {
      @res = unpack 'w*', $self->{db}->{$term};
    }

    for (my $i=1; $i<@res; $i+=2) {
      $res[$i] /= $self->{db}->{$M, $res[$i-1]} / $idf;
    }

    return @res
  }

  # We separate exhaustive search here to avoid overhead and make the
  # code more readable.  The block can be removed without changing the
  # result.
  unless ($wanted) {
    for (@terms) {
      my $df = $self->{db}->{$O,$_};

      # The frequency *must* be at least 1 since the posting list is nonempty
      _complain('search for term', $_) and $df = 1 if $df < 1;

      # Unpack posting list for current query term $_
      my %post = unpack 'w*', $self->{db}->{$_};

      _complain('search for term', $_) if $self->{db}->{$O,$_} != keys %post;
      # This is the inverse document frequency: the log of the inverse
      # fraction of documents the term occurs in.
      my $idf = log($self->{records}/$df);
      for my $did (keys %post) {
        if (my $freq = $self->{db}->{$M, $did}) {
          $score{$did} += $post{$did} / $freq * $idf;
        }
      }
    }
    # warn sprintf "Used %d accumulators\n", scalar keys %score;
    return %score;
  }

  # A sloppy but fast algorithm for multiple term queries.
  unless ($strict) {
    for (@terms) {
      # Unpack posting list for current query term $_
      my %post = unpack 'w*', $self->{db}->{$_};

      # Look up the number of documents the term occurs in (document frequency)
      my $occ = $self->{db}->{$O,$_};

      _complain('search for term', $_) if $self->{db}->{$O,$_} != keys %post;
      # The frequency *must* be at least 1 since the posting list is nonempty
      _complain('search for term', $_) and $occ = 1 if $occ < 1;

      # This is the inverse document frequency: the log of the inverse
      # fraction of documents the term occurs in.
      my $idf = log($self->{records}/$occ);

      # If we have a reasonable number of accumulators, change the
      # loop to iterate over the accumulators.  This compromises
      # quality for better speed.  The algorithm still computes the
      # exact weights, but the result is not guaranteed to contain the
      # *best* results.  The database might contain documents better
      # than the worst returned document.

      # We process the lists in order of increasing length.  When the
      # number of accumulators exceeds $wanted, no new documents are
      # added; only the ranking/weighting of the seen documents is
      # improved.  The resulting ranking list must be pruned, since
      # only the topmost documents end up near their "optimal" rank.

      if (keys %score < $wanted) {
        for my $did (keys %post) {
          if (my $freq = $self->{db}->{$M, $did}) {
            $score{$did} += $post{$did} / $freq * $idf;
          }
        }
      } else {
        for my $did (keys %score) {
          next unless exists $post{$did};
          if (my $freq = $self->{db}->{$M, $did}) {
            $score{$did} += $post{$did} / $freq * $idf;
          }
        }
      }
    }
    return %score;
  }
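
  # Illustrative note (not from the original source): $max[$i], computed
  # in the preparation loop below, is an upper bound on the score any
  # still-unseen document could collect from terms $i..$#terms.  In the
  # main loop, once the $wanted-th best accumulator ($chop) reaches that
  # bound, no new document can enter the top $wanted, so only existing
  # accumulators are updated.
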
  my @max; $max[$#terms+1]=0;
  my @idf;

  # Preparation loop.  This extra loop makes sense only when "reorg"
  # and "wanted" are true.  But for the time being, keeping the code
  # for the different search algorithms in one place seems more
  # desirable than some minor speedup of the brute force version.  We
  # do cache $idf though.

  for (my $i = $#terms; $i >= 0; $i--) {
    local $_ = $terms[$i];
    # Look up the number of documents the term occurs in (document frequency)
    my $df = $self->{db}->{$O,$_};

    # The frequency *must* be at least 1 since the posting list is nonempty
    _complain('search for term', $_) and $df = 1 if $df < 1;

    # This is the inverse document frequency: the log of the inverse
    # fraction of documents the term occurs in.
    $idf[$i] = log($self->{records}/$df);

    my ($did,$occ);
    if ($self->{reorg}) {
      ($did,$occ) = unpack 'w2', $self->{db}->{$_};
    } else {    # Maybe this costs more than it helps
      ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{$_});
    }
    my $freq = $self->{db}->{$M, $did};
    my $max  = $occ/$freq*$idf[$i];
    $max[$i] = $max + $max[$i+1];
  }

  # Main loop
  for my $i (0 .. $#terms) {
    my $term = $terms[$i];
    # Unpack posting list for current query term $term.  We lose the
    # sort order because of the assignment to a hash.
    my %post = unpack 'w*', $self->{db}->{$term};

    _complain('search for term', $term)
      if $self->{db}->{$O,$term} != keys %post;

    my $idf = $idf[$i];
    my $full;    # Need to process all postings
    my $chop;    # Score necessary to enter the ranking list

    if (# We know that $wanted is true since we special-cased the
        # exhaustive search.

        $wanted and

        # We did sort here if necessary in
        # the preparation loop
        # $self->{reorg} and

        scalar keys %score > $wanted) {
      $chop = (sort { $b <=> $a } values %score)[$wanted];
      $full = $max[$i] > $chop;
    } else {
      $full = 1;
    }

    if ($full) {
      # We need to inspect the full list.  Either $wanted is not given,
      # the index is not sorted, or we don't have enough accumulators
      # yet.
      if (defined $chop) {
        # We might be able to avoid allocating accumulators
        for my $did (keys %post) {
          if (my $freq = $self->{db}->{$M, $did}) {
            my $wgt = $post{$did} / $freq * $idf;
            # We add an accumulator if $wgt exceeds $chop
            if (exists $score{$did} or $wgt > $chop) {
              $score{$did} += $wgt;
            }
          }
        }
      } else {
        # Allocate accumulators for each seen document.
        for my $did (keys %post) {
          if (my $freq = $self->{db}->{$M, $did}) {
            $score{$did} += $post{$did} / $freq * $idf;
          }
        }
      }
    } else {
      # Update existing accumulators
      for my $did (keys %score) {
        next unless exists $post{$did};
        if (my $freq = $self->{db}->{$M, $did}) {
          $score{$did} += $post{$did} / $freq * $idf;
        }
      }
    }
  }
  #warn sprintf "Used %d accumulators\n", scalar keys %score;
  %score;
}

sub set {
  my ($self, $attr, $value) = @_;

  die "No such index attribute: '$attr'" unless $attr eq 'top';

  return delete $self->{reorg} if $value == 0;

  return if $self->{reorg};    # we are sorted already
  return unless $self->{mode} & O_RDWR;
  defined $self->{db} or $self->open;

  $self->sync;
  while (my($key, $value) = each %{$self->{db}}) {
    next if $key =~ /^\377[om]/;
    $self->{db}->{$key} = $self->sort_postings($value);
  }
  $self->{reorg} = 1;
}

sub sync {
  my $self = shift;

  if ($self->{mode} & O_RDWR) {
    print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};
    while (my($key, $value) = each %{$self->{cache}}) {
      if ($self->{reorg}) {
        $self->{db}->{$key} = $self->sort_postings($self->{db}->{$key}
                                                   . $value);
      } else {
        $self->{db}->{$key} .= $value;
      }
    }
    while (my($key, $value) = each %{$self->{cdict}}) {
      $self->{db}->{$key} = 0 unless $self->{db}->{$key};
      $self->{db}->{$key} += $value;
    }
    $self->{cache}  = {};
    $self->{cdict}  = {};
    $self->{cached} = 0;
  }
}

sub close {
  my $self = shift;

  if ($self->{dbh}) {
    $self->sync;
    delete $self->{dbh};
    untie %{$self->{db}};
    delete $self->{db};
    delete $self->{func};
    delete $self->{cache};
    delete $self->{cached};
    delete $self->{cdict};
    delete $self->{pfunc} if defined $self->{pfunc};
    delete $self->{ifunc} if defined $self->{ifunc};
    delete $self->{xfunc} if defined $self->{xfunc};
  }
}

1;

