/[wait]/branches/CPAN/lib/WAIT/InvertedIndex.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /branches/CPAN/lib/WAIT/InvertedIndex.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 11 - (show annotations)
Fri Apr 28 15:41:10 2000 UTC (24 years ago) by unknown
File size: 8976 byte(s)
This commit was manufactured by cvs2svn to create branch 'CPAN'.
1 # -*- Mode: Perl -*-
2 # InvertedIndex.pm --
3 # ITIID : $ITI$ $Header $__Header$
4 # Author : Ulrich Pfeifer
5 # Created On : Thu Aug 8 13:05:10 1996
6 # Last Modified By: Ulrich Pfeifer
7 # Last Modified On: Sun Nov 22 18:44:42 1998
8 # Language : CPerl
9 # Status : Unknown, Use with caution!
10 #
11 # Copyright (c) 1996-1997, Ulrich Pfeifer
12 #
13
14 package WAIT::InvertedIndex;
15 use strict;
16 use DB_File;
17 use Fcntl;
18 use WAIT::Filter;
19 use Carp;
20 use vars qw(%FUNC);
21
22 my $O = pack('C', 0xff)."o"; # occurances
23 my $M = pack('C', 0xff)."m"; # maxtf
24
25 sub new {
26 my $type = shift;
27 my %parm = @_;
28 my $self = {};
29
30 $self->{file} = $parm{file} or croak "No file specified";
31 $self->{attr} = $parm{attr} or croak "No attributes specified";
32 $self->{filter} = $parm{filter};
33 $self->{'name'} = $parm{'name'};
34 $self->{records} = 0;
35 for (qw(intervall prefix)) {
36 if (exists $parm{$_}) {
37 if (ref $parm{$_}) {
38 $self->{$_} = [@{$parm{$_}}] # clone
39 } else {
40 $self->{$_} = $parm{$_}
41 }
42 }
43 }
44 bless $self, ref($type) || $type;
45 }
46
47 sub name {$_[0]->{'name'}}
48
49 sub _split_pos {
50 my ($text, $pos) = @{$_[0]};
51 my @result;
52
53 $text =~ s/(^\s+)// and $pos += length($1);
54 while ($text =~ s/(^\S+)//) {
55 my $word = $1;
56 push @result, [$word, $pos];
57 $pos += length($word);
58 $text =~ s/(^\s+)// and $pos += length($1);
59 }
60 @result;
61 }
62
63 sub _xfiltergen {
64 my $filter = pop @_;
65
66 if ($filter eq 'stop') { # avoid the slow stopword elimination
67 return _xfiltergen(@_); # it's cheaper to look them up afterwards
68 }
69 if (@_) {
70 if ($filter =~ /^split(\d*)/) {
71 if ($1) {
72 "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .'))' ;
73 } else {
74 "map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .')' ;
75 }
76 } else {
77 "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]]," ._xfiltergen(@_) .')';
78 }
79 } else {
80 if ($filter =~ /^split(\d*)/) {
81 if ($1) {
82 "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0]))" ;
83 } else {
84 "map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0])" ;
85 }
86 } else {
87 "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]], [\$_[0], 0])";
88 }
89 }
90 }
91
92 sub parse_pos {
93 my $self = shift;
94
95 unless (exists $self->{xfunc}) {
96 $self->{xfunc} =
97 eval sprintf("sub {%s}", _xfiltergen(@{$self->{filter}}));
98 #printf "\nsub{%s}$@\n", _xfiltergen(@{$self->{filter}});
99 }
100 &{$self->{xfunc}}($_[0]);
101 }
102
103 sub _filtergen {
104 my $filter = pop @_;
105
106 if (@_) {
107 "map(&WAIT::Filter::$filter(\$_), " . _filtergen(@_) . ')';
108 } else {
109 "map(&WAIT::Filter::$filter(\$_), \@_)";
110 }
111 }
112
113 sub drop {
114 my $self = shift;
115 if ((caller)[0] eq 'WAIT::Table') { # Table knows about this
116 my $file = $self->{file};
117
118 ! (!-e $file or unlink $file);
119 } else { # notify our database
120 croak ref($self)."::drop called directly";
121 }
122 }
123
124 sub open {
125 my $self = shift;
126 my $file = $self->{file};
127
128 if (defined $self->{dbh}) {
129 $self->{dbh};
130 } else {
131 $self->{func} =
132 eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
133 $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,
134 $self->{mode}, 0664, $DB_BTREE);
135 # tie(%{$self->{cache}}, 'DB_File', undef,
136 # $self->{mode}, 0664, $DB_BTREE)
137 $self->{cache} = {}
138 if $self->{mode} & O_RDWR;
139 # tie(%{$self->{cdict}}, 'DB_File', undef,
140 # $self->{mode}, 0664, $DB_BTREE)
141 $self->{cdict} = {}
142 if $self->{mode} & O_RDWR;
143 $self->{cached} = 0;
144 }
145 }
146
147 sub insert {
148 my $self = shift;
149 my $key = shift;
150 my %occ;
151
152 defined $self->{db} or $self->open;
153 grep $occ{$_}++, &{$self->{func}}(@_);
154 my ($word, $noc);
155 $self->{records}++;
156 while (($word, $noc) = each %occ) {
157 if (defined $self->{cache}->{$word}) {
158 $self->{cdict}->{$O,$word}++;
159 $self->{cache}->{$word} .= pack 'w2', $key, $noc;
160 } else {
161 $self->{cdict}->{$O,$word} = 1;
162 $self->{cache}->{$word} = pack 'w2', $key, $noc;
163 }
164 $self->{cached}++;
165 }
166 $self->sync if $self->{cached} > 100_000;
167 my $maxtf = 0;
168 for (values %occ) {
169 $maxtf = $_ if $_ > $maxtf;
170 }
171 $self->{db}->{$M, $key} = $maxtf;
172 }
173
174 sub delete {
175 my $self = shift;
176 my $key = shift;
177 my %occ;
178
179 defined $self->{db} or $self->open;
180 $self->sync;
181 $self->{records}--;
182 grep $occ{$_}++, &{$self->{func}}(@_);
183 for (keys %occ) {
184 # may reorder posting list
185 my %post = unpack 'w*', $self->{db}->{$_};
186 $self->{db}->{$O,$_}--;
187 delete $post{$key};
188 $self->{db}->{$_} = pack 'w*', %post;
189 }
190 delete $self->{db}->{$M, $key};
191 }
192
193 sub intervall {
194 my ($self, $first, $last) = @_;
195 my $value = '';
196 my $word = '';
197 my @result;
198
199 return unless exists $self->{'intervall'};
200
201 defined $self->{db} or $self->open;
202 $self->sync;
203 my $dbh = $self->{dbh}; # for convenience
204
205 if (ref $self->{'intervall'}) {
206 unless (exists $self->{'ifunc'}) {
207 $self->{'ifunc'} =
208 eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}}));
209 }
210 ($first) = &{$self->{'ifunc'}}($first) if $first;
211 ($last) = &{$self->{'ifunc'}}($last) if $last;
212 }
213 if (defined $first and $first ne '') { # set the cursor to $first
214 $dbh->seq($first, $value, R_CURSOR);
215 } else {
216 $dbh->seq($first, $value, R_FIRST);
217 }
218 # We assume that word do not start with the character \377
219 # $last = pack 'C', 0xff unless defined $last and $last ne '';
220 return () if defined $last and $first gt $last; # $first would be after the last word
221
222 push @result, $first;
223 while (!$dbh->seq($word, $value, R_NEXT)) {
224 # We should limit this to a "resonable" number of words
225 last if (defined $last and $word gt $last) or $word =~ /^($M|$O)/o;
226 push @result, $word;
227 }
228 \@result; # speed
229 }
230
231 sub prefix {
232 my ($self, $prefix) = @_;
233 my $value = '';
234 my $word = '';
235 my @result;
236
237 return () unless defined $prefix; # Full dictionary requested !!
238 return unless exists $self->{'prefix'};
239 defined $self->{db} or $self->open;
240 $self->sync;
241 my $dbh = $self->{dbh};
242
243 if (ref $self->{'prefix'}) {
244 unless (exists $self->{'pfunc'}) {
245 $self->{'pfunc'} =
246 eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}}));
247 }
248 ($prefix) = &{$self->{'pfunc'}}($prefix);
249 }
250
251 if ($dbh->seq($word = $prefix, $value, R_CURSOR)) {
252 return ();
253 }
254 return () if $word !~ /^$prefix/;
255 push @result, $word;
256
257 while (!$dbh->seq($word, $value, R_NEXT)) {
258 # We should limit this to a "resonable" number of words
259 last if $word !~ /^$prefix/;
260 push @result, $word;
261 }
262 \@result; # speed
263 }
264
265 sub search {
266 my $self = shift;
267
268 defined $self->{db} or $self->open;
269 $self->sync;
270 $self->search_raw(&{$self->{func}}(@_)); # No call to parse() here
271 }
272
273 sub parse {
274 my $self = shift;
275
276 defined $self->{db} or $self->open;
277 &{$self->{func}}(@_);
278 }
279
280 sub search_prefix {
281 my $self = shift;
282
283 # print "search_prefix(@_)\n";
284 defined $self->{db} or $self->open;
285 $self->search_raw(map($self->prefix($_), @_));
286 }
287
288 sub search_raw {
289 my $self = shift;
290 my %occ;
291 my %score;
292
293 return () unless @_;
294
295 defined $self->{db} or $self->open;
296 $self->sync;
297 grep $occ{$_}++, @_;
298 for (keys %occ) {
299 if (defined $self->{db}->{$_}) {
300 my %post = unpack 'w*', $self->{db}->{$_};
301 my $idf = log($self->{records}/$self->{db}->{$O,$_});
302 my $did;
303 for $did (keys %post) {
304 $score{$did} = 0 unless defined $score{$did}; # perl -w
305 $score{$did} += $post{$did} / $self->{db}->{$M, $did} * $idf
306 if $self->{db}->{$M, $did}; # db may be broken
307 }
308 }
309 }
310 %score;
311 }
312
313 sub sync {
314 my $self = shift;
315
316 if ($self->{mode} & O_RDWR) {
317 print STDERR "\aFlushing $self->{cached} postings\n";
318 while (my($key, $value) = each %{$self->{cache}}) {
319 $self->{db}->{$key} .= $value;
320 #delete $self->{cache}->{$key};
321 }
322 while (my($key, $value) = each %{$self->{cdict}}) {
323 $self->{db}->{$key} = 0 unless $self->{db}->{$key};
324 $self->{db}->{$key} += $value;
325 #delete $self->{cdict}->{$key};
326 }
327 $self->{cache} = {};
328 $self->{cdict} = {};
329 # print STDERR "*** $self->{cache} ", tied(%{$self->{cache}}), "==\n";
330 $self->{cached} = 0;
331 # $self->{dbh}->sync if $self->{dbh};
332 }
333 }
334
335 sub close {
336 my $self = shift;
337
338 if ($self->{dbh}) {
339 $self->sync;
340 delete $self->{dbh};
341 untie %{$self->{db}};
342 delete $self->{db};
343 delete $self->{func};
344 delete $self->{cache};
345 delete $self->{cached};
346 delete $self->{cdict};
347 delete $self->{pfunc} if defined $self->{pfunc};
348 delete $self->{ifunc} if defined $self->{ifunc};
349 delete $self->{xfunc} if defined $self->{xfunc};
350 }
351 }
352
353 1;
354

Properties

Name Value
cvs2svn:cvs-rev 1.1

  ViewVC Help
Powered by ViewVC 1.1.26