1 |
# -*- Mode: Cperl -*- |
2 |
# InvertedIndex.pm -- |
3 |
# ITIID : $ITI$ $Header $__Header$ |
4 |
# Author : Ulrich Pfeifer |
5 |
# Created On : Thu Aug 8 13:05:10 1996 |
6 |
# Last Modified By: Ulrich Pfeifer |
7 |
# Last Modified On: Sun Nov 22 18:44:42 1998 |
8 |
# Language : CPerl |
9 |
# Status : Unknown, Use with caution! |
10 |
# |
11 |
# Copyright (c) 1996-1997, Ulrich Pfeifer |
12 |
# |
13 |
|
14 |
package WAIT::InvertedIndex; |
15 |
use strict; |
16 |
use DB_File; |
17 |
use Fcntl; |
18 |
use WAIT::Filter; |
19 |
use Carp; |
20 |
use vars qw(%FUNC); |
21 |
|
22 |
my $O = pack('C', 0xff)."o"; # occurances |
23 |
my $M = pack('C', 0xff)."m"; # maxtf |
24 |
|
25 |
sub new { |
26 |
my $type = shift; |
27 |
my %parm = @_; |
28 |
my $self = {}; |
29 |
|
30 |
$self->{file} = $parm{file} or croak "No file specified"; |
31 |
$self->{attr} = $parm{attr} or croak "No attributes specified"; |
32 |
$self->{filter} = $parm{filter}; |
33 |
$self->{'name'} = $parm{'name'}; |
34 |
$self->{records} = 0; |
35 |
for (qw(intervall prefix)) { |
36 |
if (exists $parm{$_}) { |
37 |
if (ref $parm{$_}) { |
38 |
$self->{$_} = [@{$parm{$_}}] # clone |
39 |
} else { |
40 |
$self->{$_} = $parm{$_} |
41 |
} |
42 |
} |
43 |
} |
44 |
bless $self, ref($type) || $type; |
45 |
} |
46 |
|
47 |
sub name {$_[0]->{'name'}} |
48 |
|
49 |
sub _split_pos { |
50 |
my ($text, $pos) = @{$_[0]}; |
51 |
my @result; |
52 |
|
53 |
$text =~ s/(^\s+)// and $pos += length($1); |
54 |
while ($text =~ s/(^\S+)//) { |
55 |
my $word = $1; |
56 |
push @result, [$word, $pos]; |
57 |
$pos += length($word); |
58 |
$text =~ s/(^\s+)// and $pos += length($1); |
59 |
} |
60 |
@result; |
61 |
} |
62 |
|
63 |
sub _xfiltergen { |
64 |
my $filter = pop @_; |
65 |
|
66 |
# Oops, we cannot overrule the user's choice. Other filters may kill |
67 |
# stopwords, such as isotr clobbers "isn't" to "isnt". |
68 |
|
69 |
# if ($filter eq 'stop') { # avoid the slow stopword elimination |
70 |
# return _xfiltergen(@_); # it's cheaper to look them up afterwards |
71 |
# } |
72 |
if (@_) { |
73 |
if ($filter =~ /^split(\d*)/) { |
74 |
if ($1) { |
75 |
"grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .'))' ; |
76 |
} else { |
77 |
"map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .')' ; |
78 |
} |
79 |
} else { |
80 |
"map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]]," ._xfiltergen(@_) .')'; |
81 |
} |
82 |
} else { |
83 |
if ($filter =~ /^split(\d*)/) { |
84 |
if ($1) { |
85 |
"grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0]))" ; |
86 |
} else { |
87 |
"map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0])" ; |
88 |
} |
89 |
} else { |
90 |
"map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]], [\$_[0], 0])"; |
91 |
} |
92 |
} |
93 |
} |
94 |
|
95 |
sub parse_pos { |
96 |
my $self = shift; |
97 |
|
98 |
unless (exists $self->{xfunc}) { |
99 |
$self->{xfunc} = |
100 |
eval sprintf("sub {%s}", _xfiltergen(@{$self->{filter}})); |
101 |
#printf "\nsub{%s}$@\n", _xfiltergen(@{$self->{filter}}); |
102 |
} |
103 |
&{$self->{xfunc}}($_[0]); |
104 |
} |
105 |
|
106 |
sub _filtergen { |
107 |
my $filter = pop @_; |
108 |
|
109 |
if (@_) { |
110 |
"map(&WAIT::Filter::$filter(\$_), " . _filtergen(@_) . ')'; |
111 |
} else { |
112 |
"map(&WAIT::Filter::$filter(\$_), \@_)"; |
113 |
} |
114 |
} |
115 |
|
116 |
sub drop { |
117 |
my $self = shift; |
118 |
if ((caller)[0] eq 'WAIT::Table') { # Table knows about this |
119 |
my $file = $self->{file}; |
120 |
|
121 |
! (!-e $file or unlink $file); |
122 |
} else { # notify our database |
123 |
croak ref($self)."::drop called directly"; |
124 |
} |
125 |
} |
126 |
|
127 |
sub open { |
128 |
my $self = shift; |
129 |
my $file = $self->{file}; |
130 |
|
131 |
if (defined $self->{dbh}) { |
132 |
$self->{dbh}; |
133 |
} else { |
134 |
$self->{func} = |
135 |
eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}})); |
136 |
$self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file, |
137 |
$self->{mode}, 0664, $DB_BTREE); |
138 |
# tie(%{$self->{cache}}, 'DB_File', undef, |
139 |
# $self->{mode}, 0664, $DB_BTREE) |
140 |
$self->{cache} = {} |
141 |
if $self->{mode} & O_RDWR; |
142 |
# tie(%{$self->{cdict}}, 'DB_File', undef, |
143 |
# $self->{mode}, 0664, $DB_BTREE) |
144 |
$self->{cdict} = {} |
145 |
if $self->{mode} & O_RDWR; |
146 |
$self->{cached} = 0; |
147 |
} |
148 |
} |
149 |
|
150 |
sub insert { |
151 |
my $self = shift; |
152 |
my $key = shift; |
153 |
my %occ; |
154 |
|
155 |
defined $self->{db} or $self->open; |
156 |
grep $occ{$_}++, &{$self->{func}}(@_); |
157 |
my ($word, $noc); |
158 |
$self->{records}++; |
159 |
while (($word, $noc) = each %occ) { |
160 |
if (defined $self->{cache}->{$word}) { |
161 |
$self->{cdict}->{$O,$word}++; |
162 |
$self->{cache}->{$word} .= pack 'w2', $key, $noc; |
163 |
} else { |
164 |
$self->{cdict}->{$O,$word} = 1; |
165 |
$self->{cache}->{$word} = pack 'w2', $key, $noc; |
166 |
} |
167 |
$self->{cached}++; |
168 |
} |
169 |
$self->sync if $self->{cached} > 100_000; |
170 |
my $maxtf = 0; |
171 |
for (values %occ) { |
172 |
$maxtf = $_ if $_ > $maxtf; |
173 |
} |
174 |
$self->{db}->{$M, $key} = $maxtf; |
175 |
} |
176 |
|
177 |
sub delete { |
178 |
my $self = shift; |
179 |
my $key = shift; |
180 |
my %occ; |
181 |
|
182 |
defined $self->{db} or $self->open; |
183 |
$self->sync; |
184 |
$self->{records}--; |
185 |
grep $occ{$_}++, &{$self->{func}}(@_); |
186 |
for (keys %occ) { |
187 |
# may reorder posting list |
188 |
my %post = unpack 'w*', $self->{db}->{$_}; |
189 |
$self->{db}->{$O,$_}--; |
190 |
delete $post{$key}; |
191 |
$self->{db}->{$_} = pack 'w*', %post; |
192 |
} |
193 |
delete $self->{db}->{$M, $key}; |
194 |
} |
195 |
|
196 |
sub intervall { |
197 |
my ($self, $first, $last) = @_; |
198 |
my $value = ''; |
199 |
my $word = ''; |
200 |
my @result; |
201 |
|
202 |
return unless exists $self->{'intervall'}; |
203 |
|
204 |
defined $self->{db} or $self->open; |
205 |
$self->sync; |
206 |
my $dbh = $self->{dbh}; # for convenience |
207 |
|
208 |
if (ref $self->{'intervall'}) { |
209 |
unless (exists $self->{'ifunc'}) { |
210 |
$self->{'ifunc'} = |
211 |
eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}})); |
212 |
} |
213 |
($first) = &{$self->{'ifunc'}}($first) if $first; |
214 |
($last) = &{$self->{'ifunc'}}($last) if $last; |
215 |
} |
216 |
if (defined $first and $first ne '') { # set the cursor to $first |
217 |
$dbh->seq($first, $value, R_CURSOR); |
218 |
} else { |
219 |
$dbh->seq($first, $value, R_FIRST); |
220 |
} |
221 |
# We assume that word do not start with the character \377 |
222 |
# $last = pack 'C', 0xff unless defined $last and $last ne ''; |
223 |
return () if defined $last and $first gt $last; # $first would be after the last word |
224 |
|
225 |
push @result, $first; |
226 |
while (!$dbh->seq($word, $value, R_NEXT)) { |
227 |
# We should limit this to a "resonable" number of words |
228 |
last if (defined $last and $word gt $last) or $word =~ /^($M|$O)/o; |
229 |
push @result, $word; |
230 |
} |
231 |
\@result; # speed |
232 |
} |
233 |
|
234 |
sub prefix { |
235 |
my ($self, $prefix) = @_; |
236 |
my $value = ''; |
237 |
my $word = ''; |
238 |
my @result; |
239 |
|
240 |
return () unless defined $prefix; # Full dictionary requested !! |
241 |
return unless exists $self->{'prefix'}; |
242 |
defined $self->{db} or $self->open; |
243 |
$self->sync; |
244 |
my $dbh = $self->{dbh}; |
245 |
|
246 |
if (ref $self->{'prefix'}) { |
247 |
unless (exists $self->{'pfunc'}) { |
248 |
$self->{'pfunc'} = |
249 |
eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}})); |
250 |
} |
251 |
($prefix) = &{$self->{'pfunc'}}($prefix); |
252 |
} |
253 |
|
254 |
if ($dbh->seq($word = $prefix, $value, R_CURSOR)) { |
255 |
return (); |
256 |
} |
257 |
return () if $word !~ /^$prefix/; |
258 |
push @result, $word; |
259 |
|
260 |
while (!$dbh->seq($word, $value, R_NEXT)) { |
261 |
# We should limit this to a "resonable" number of words |
262 |
last if $word !~ /^$prefix/; |
263 |
push @result, $word; |
264 |
} |
265 |
\@result; # speed |
266 |
} |
267 |
|
268 |
sub search { |
269 |
my $self = shift; |
270 |
|
271 |
defined $self->{db} or $self->open; |
272 |
$self->sync; |
273 |
$self->search_raw(&{$self->{func}}(@_)); # No call to parse() here |
274 |
} |
275 |
|
276 |
sub parse { |
277 |
my $self = shift; |
278 |
|
279 |
defined $self->{db} or $self->open; |
280 |
&{$self->{func}}(@_); |
281 |
} |
282 |
|
283 |
sub keys { |
284 |
my $self = shift; |
285 |
|
286 |
defined $self->{db} or $self->open; |
287 |
keys %{$self->{db}}; |
288 |
} |
289 |
|
290 |
sub search_prefix { |
291 |
my $self = shift; |
292 |
|
293 |
# print "search_prefix(@_)\n"; |
294 |
defined $self->{db} or $self->open; |
295 |
$self->search_raw(map($self->prefix($_), @_)); |
296 |
} |
297 |
|
298 |
sub search_raw { |
299 |
my $self = shift; |
300 |
my %occ; |
301 |
my %score; |
302 |
|
303 |
return () unless @_; |
304 |
|
305 |
defined $self->{db} or $self->open; |
306 |
$self->sync; |
307 |
grep $occ{$_}++, @_; |
308 |
for (keys %occ) { |
309 |
if (defined $self->{db}->{$_}) { |
310 |
my %post = unpack 'w*', $self->{db}->{$_}; |
311 |
my $idf = log($self->{records}/($self->{db}->{$O,$_} || 1)); |
312 |
my $did; |
313 |
for $did (keys %post) { |
314 |
$score{$did} = 0 unless defined $score{$did}; # perl -w |
315 |
$score{$did} += $post{$did} / $self->{db}->{$M, $did} * $idf |
316 |
if $self->{db}->{$M, $did}; # db may be broken |
317 |
} |
318 |
} |
319 |
} |
320 |
%score; |
321 |
} |
322 |
|
323 |
sub sync { |
324 |
my $self = shift; |
325 |
|
326 |
if ($self->{mode} & O_RDWR) { |
327 |
print STDERR "Flushing $self->{cached} postings\n"; |
328 |
while (my($key, $value) = each %{$self->{cache}}) { |
329 |
$self->{db}->{$key} .= $value; |
330 |
#delete $self->{cache}->{$key}; |
331 |
} |
332 |
while (my($key, $value) = each %{$self->{cdict}}) { |
333 |
$self->{db}->{$key} = 0 unless $self->{db}->{$key}; |
334 |
$self->{db}->{$key} += $value; |
335 |
#delete $self->{cdict}->{$key}; |
336 |
} |
337 |
$self->{cache} = {}; |
338 |
$self->{cdict} = {}; |
339 |
# print STDERR "*** $self->{cache} ", tied(%{$self->{cache}}), "==\n"; |
340 |
$self->{cached} = 0; |
341 |
# $self->{dbh}->sync if $self->{dbh}; |
342 |
} |
343 |
} |
344 |
|
345 |
sub close { |
346 |
my $self = shift; |
347 |
|
348 |
if ($self->{dbh}) { |
349 |
$self->sync; |
350 |
delete $self->{dbh}; |
351 |
untie %{$self->{db}}; |
352 |
delete $self->{db}; |
353 |
delete $self->{func}; |
354 |
delete $self->{cache}; |
355 |
delete $self->{cached}; |
356 |
delete $self->{cdict}; |
357 |
delete $self->{pfunc} if defined $self->{pfunc}; |
358 |
delete $self->{ifunc} if defined $self->{ifunc}; |
359 |
delete $self->{xfunc} if defined $self->{xfunc}; |
360 |
} |
361 |
} |
362 |
|
363 |
1; |
364 |
|