/[wait]/branches/CPAN/lib/WAIT/InvertedIndex.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /branches/CPAN/lib/WAIT/InvertedIndex.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 13 - (show annotations)
Fri Apr 28 15:42:44 2000 UTC (24 years ago) by ulpfr
File size: 9201 byte(s)
Import of WAIT-1.710

1 # -*- Mode: Cperl -*-
2 # InvertedIndex.pm --
3 # ITIID : $ITI$ $Header $__Header$
4 # Author : Ulrich Pfeifer
5 # Created On : Thu Aug 8 13:05:10 1996
6 # Last Modified By: Ulrich Pfeifer
7 # Last Modified On: Sun Nov 22 18:44:42 1998
8 # Language : CPerl
9 # Status : Unknown, Use with caution!
10 #
11 # Copyright (c) 1996-1997, Ulrich Pfeifer
12 #
13
14 package WAIT::InvertedIndex;
15 use strict;
16 use DB_File;
17 use Fcntl;
18 use WAIT::Filter;
19 use Carp;
20 use vars qw(%FUNC);
21
22 my $O = pack('C', 0xff)."o"; # occurances
23 my $M = pack('C', 0xff)."m"; # maxtf
24
25 sub new {
26 my $type = shift;
27 my %parm = @_;
28 my $self = {};
29
30 $self->{file} = $parm{file} or croak "No file specified";
31 $self->{attr} = $parm{attr} or croak "No attributes specified";
32 $self->{filter} = $parm{filter};
33 $self->{'name'} = $parm{'name'};
34 $self->{records} = 0;
35 for (qw(intervall prefix)) {
36 if (exists $parm{$_}) {
37 if (ref $parm{$_}) {
38 $self->{$_} = [@{$parm{$_}}] # clone
39 } else {
40 $self->{$_} = $parm{$_}
41 }
42 }
43 }
44 bless $self, ref($type) || $type;
45 }
46
47 sub name {$_[0]->{'name'}}
48
49 sub _split_pos {
50 my ($text, $pos) = @{$_[0]};
51 my @result;
52
53 $text =~ s/(^\s+)// and $pos += length($1);
54 while ($text =~ s/(^\S+)//) {
55 my $word = $1;
56 push @result, [$word, $pos];
57 $pos += length($word);
58 $text =~ s/(^\s+)// and $pos += length($1);
59 }
60 @result;
61 }
62
63 sub _xfiltergen {
64 my $filter = pop @_;
65
66 # Oops, we cannot overrule the user's choice. Other filters may kill
67 # stopwords, such as isotr clobbers "isn't" to "isnt".
68
69 # if ($filter eq 'stop') { # avoid the slow stopword elimination
70 # return _xfiltergen(@_); # it's cheaper to look them up afterwards
71 # }
72 if (@_) {
73 if ($filter =~ /^split(\d*)/) {
74 if ($1) {
75 "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .'))' ;
76 } else {
77 "map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .')' ;
78 }
79 } else {
80 "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]]," ._xfiltergen(@_) .')';
81 }
82 } else {
83 if ($filter =~ /^split(\d*)/) {
84 if ($1) {
85 "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0]))" ;
86 } else {
87 "map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0])" ;
88 }
89 } else {
90 "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]], [\$_[0], 0])";
91 }
92 }
93 }
94
95 sub parse_pos {
96 my $self = shift;
97
98 unless (exists $self->{xfunc}) {
99 $self->{xfunc} =
100 eval sprintf("sub {%s}", _xfiltergen(@{$self->{filter}}));
101 #printf "\nsub{%s}$@\n", _xfiltergen(@{$self->{filter}});
102 }
103 &{$self->{xfunc}}($_[0]);
104 }
105
106 sub _filtergen {
107 my $filter = pop @_;
108
109 if (@_) {
110 "map(&WAIT::Filter::$filter(\$_), " . _filtergen(@_) . ')';
111 } else {
112 "map(&WAIT::Filter::$filter(\$_), \@_)";
113 }
114 }
115
116 sub drop {
117 my $self = shift;
118 if ((caller)[0] eq 'WAIT::Table') { # Table knows about this
119 my $file = $self->{file};
120
121 ! (!-e $file or unlink $file);
122 } else { # notify our database
123 croak ref($self)."::drop called directly";
124 }
125 }
126
127 sub open {
128 my $self = shift;
129 my $file = $self->{file};
130
131 if (defined $self->{dbh}) {
132 $self->{dbh};
133 } else {
134 $self->{func} =
135 eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
136 $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,
137 $self->{mode}, 0664, $DB_BTREE);
138 # tie(%{$self->{cache}}, 'DB_File', undef,
139 # $self->{mode}, 0664, $DB_BTREE)
140 $self->{cache} = {}
141 if $self->{mode} & O_RDWR;
142 # tie(%{$self->{cdict}}, 'DB_File', undef,
143 # $self->{mode}, 0664, $DB_BTREE)
144 $self->{cdict} = {}
145 if $self->{mode} & O_RDWR;
146 $self->{cached} = 0;
147 }
148 }
149
150 sub insert {
151 my $self = shift;
152 my $key = shift;
153 my %occ;
154
155 defined $self->{db} or $self->open;
156 grep $occ{$_}++, &{$self->{func}}(@_);
157 my ($word, $noc);
158 $self->{records}++;
159 while (($word, $noc) = each %occ) {
160 if (defined $self->{cache}->{$word}) {
161 $self->{cdict}->{$O,$word}++;
162 $self->{cache}->{$word} .= pack 'w2', $key, $noc;
163 } else {
164 $self->{cdict}->{$O,$word} = 1;
165 $self->{cache}->{$word} = pack 'w2', $key, $noc;
166 }
167 $self->{cached}++;
168 }
169 $self->sync if $self->{cached} > 100_000;
170 my $maxtf = 0;
171 for (values %occ) {
172 $maxtf = $_ if $_ > $maxtf;
173 }
174 $self->{db}->{$M, $key} = $maxtf;
175 }
176
177 sub delete {
178 my $self = shift;
179 my $key = shift;
180 my %occ;
181
182 defined $self->{db} or $self->open;
183 $self->sync;
184 $self->{records}--;
185 grep $occ{$_}++, &{$self->{func}}(@_);
186 for (keys %occ) {
187 # may reorder posting list
188 my %post = unpack 'w*', $self->{db}->{$_};
189 $self->{db}->{$O,$_}--;
190 delete $post{$key};
191 $self->{db}->{$_} = pack 'w*', %post;
192 }
193 delete $self->{db}->{$M, $key};
194 }
195
196 sub intervall {
197 my ($self, $first, $last) = @_;
198 my $value = '';
199 my $word = '';
200 my @result;
201
202 return unless exists $self->{'intervall'};
203
204 defined $self->{db} or $self->open;
205 $self->sync;
206 my $dbh = $self->{dbh}; # for convenience
207
208 if (ref $self->{'intervall'}) {
209 unless (exists $self->{'ifunc'}) {
210 $self->{'ifunc'} =
211 eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}}));
212 }
213 ($first) = &{$self->{'ifunc'}}($first) if $first;
214 ($last) = &{$self->{'ifunc'}}($last) if $last;
215 }
216 if (defined $first and $first ne '') { # set the cursor to $first
217 $dbh->seq($first, $value, R_CURSOR);
218 } else {
219 $dbh->seq($first, $value, R_FIRST);
220 }
221 # We assume that word do not start with the character \377
222 # $last = pack 'C', 0xff unless defined $last and $last ne '';
223 return () if defined $last and $first gt $last; # $first would be after the last word
224
225 push @result, $first;
226 while (!$dbh->seq($word, $value, R_NEXT)) {
227 # We should limit this to a "resonable" number of words
228 last if (defined $last and $word gt $last) or $word =~ /^($M|$O)/o;
229 push @result, $word;
230 }
231 \@result; # speed
232 }
233
234 sub prefix {
235 my ($self, $prefix) = @_;
236 my $value = '';
237 my $word = '';
238 my @result;
239
240 return () unless defined $prefix; # Full dictionary requested !!
241 return unless exists $self->{'prefix'};
242 defined $self->{db} or $self->open;
243 $self->sync;
244 my $dbh = $self->{dbh};
245
246 if (ref $self->{'prefix'}) {
247 unless (exists $self->{'pfunc'}) {
248 $self->{'pfunc'} =
249 eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}}));
250 }
251 ($prefix) = &{$self->{'pfunc'}}($prefix);
252 }
253
254 if ($dbh->seq($word = $prefix, $value, R_CURSOR)) {
255 return ();
256 }
257 return () if $word !~ /^$prefix/;
258 push @result, $word;
259
260 while (!$dbh->seq($word, $value, R_NEXT)) {
261 # We should limit this to a "resonable" number of words
262 last if $word !~ /^$prefix/;
263 push @result, $word;
264 }
265 \@result; # speed
266 }
267
268 sub search {
269 my $self = shift;
270
271 defined $self->{db} or $self->open;
272 $self->sync;
273 $self->search_raw(&{$self->{func}}(@_)); # No call to parse() here
274 }
275
276 sub parse {
277 my $self = shift;
278
279 defined $self->{db} or $self->open;
280 &{$self->{func}}(@_);
281 }
282
283 sub keys {
284 my $self = shift;
285
286 defined $self->{db} or $self->open;
287 keys %{$self->{db}};
288 }
289
290 sub search_prefix {
291 my $self = shift;
292
293 # print "search_prefix(@_)\n";
294 defined $self->{db} or $self->open;
295 $self->search_raw(map($self->prefix($_), @_));
296 }
297
298 sub search_raw {
299 my $self = shift;
300 my %occ;
301 my %score;
302
303 return () unless @_;
304
305 defined $self->{db} or $self->open;
306 $self->sync;
307 grep $occ{$_}++, @_;
308 for (keys %occ) {
309 if (defined $self->{db}->{$_}) {
310 my %post = unpack 'w*', $self->{db}->{$_};
311 my $idf = log($self->{records}/($self->{db}->{$O,$_} || 1));
312 my $did;
313 for $did (keys %post) {
314 $score{$did} = 0 unless defined $score{$did}; # perl -w
315 $score{$did} += $post{$did} / $self->{db}->{$M, $did} * $idf
316 if $self->{db}->{$M, $did}; # db may be broken
317 }
318 }
319 }
320 %score;
321 }
322
323 sub sync {
324 my $self = shift;
325
326 if ($self->{mode} & O_RDWR) {
327 print STDERR "Flushing $self->{cached} postings\n";
328 while (my($key, $value) = each %{$self->{cache}}) {
329 $self->{db}->{$key} .= $value;
330 #delete $self->{cache}->{$key};
331 }
332 while (my($key, $value) = each %{$self->{cdict}}) {
333 $self->{db}->{$key} = 0 unless $self->{db}->{$key};
334 $self->{db}->{$key} += $value;
335 #delete $self->{cdict}->{$key};
336 }
337 $self->{cache} = {};
338 $self->{cdict} = {};
339 # print STDERR "*** $self->{cache} ", tied(%{$self->{cache}}), "==\n";
340 $self->{cached} = 0;
341 # $self->{dbh}->sync if $self->{dbh};
342 }
343 }
344
345 sub close {
346 my $self = shift;
347
348 if ($self->{dbh}) {
349 $self->sync;
350 delete $self->{dbh};
351 untie %{$self->{db}};
352 delete $self->{db};
353 delete $self->{func};
354 delete $self->{cache};
355 delete $self->{cached};
356 delete $self->{cdict};
357 delete $self->{pfunc} if defined $self->{pfunc};
358 delete $self->{ifunc} if defined $self->{ifunc};
359 delete $self->{xfunc} if defined $self->{xfunc};
360 }
361 }
362
363 1;
364

Properties

Name Value
cvs2svn:cvs-rev 1.1.1.2

  ViewVC Help
Powered by ViewVC 1.1.26