lib/WAIT/InvertedIndex.pm

#                              -*- Mode: Perl -*- 
# InvertedIndex.pm -- 
# ITIID           : $ITI$ $Header $__Header$
# Author          : Ulrich Pfeifer
# Created On      : Thu Aug  8 13:05:10 1996
# Last Modified By: Ulrich Pfeifer
# Last Modified On: Sun Nov 22 18:44:42 1998
# Language        : CPerl
# Status          : Unknown, Use with caution!
# 
# Copyright (c) 1996-1997, Ulrich Pfeifer
# 

package WAIT::InvertedIndex;
use strict;
use DB_File;
use Fcntl;
use WAIT::Filter;
use Carp;
use vars qw(%FUNC);

my $O = pack('C', 0xff)."o";                  # occurances
my $M = pack('C', 0xff)."m";                  # maxtf

sub new {
  my $type = shift;
  my %parm = @_;
  my $self = {};

  $self->{file}     = $parm{file}     or croak "No file specified";
  $self->{attr}     = $parm{attr}     or croak "No attributes specified";
  $self->{filter}   = $parm{filter};
  $self->{'name'}   = $parm{'name'};
  $self->{records}  = 0;
  for (qw(intervall prefix)) {
    if (exists $parm{$_}) {
      if (ref $parm{$_}) {
        $self->{$_} = [@{$parm{$_}}] # clone
      } else {
        $self->{$_} = $parm{$_}
      }
    }
  }
  bless $self, ref($type) || $type;
}

sub name {$_[0]->{'name'}}

sub _split_pos {
  my ($text, $pos) = @{$_[0]};
  my @result;

  $text =~ s/(^\s+)// and $pos += length($1);
  while ($text =~ s/(^\S+)//) {
    my $word = $1;
    push @result, [$word, $pos];
    $pos += length($word);
    $text =~ s/(^\s+)// and $pos += length($1);
  }
  @result;
}

sub _xfiltergen {
  my $filter = pop @_;

  if ($filter eq 'stop') {      # avoid the slow stopword elimination
    return _xfiltergen(@_);            # it's cheaper to look them up afterwards
  }
  if (@_) {
    if ($filter =~ /^split(\d*)/) {
      if ($1) {
        "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .'))' ;
      } else {
        "map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .')' ;
      }
    } else {
      "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]]," ._xfiltergen(@_) .')';
    }
  } else {
    if ($filter =~ /^split(\d*)/) {
      if ($1) {
        "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0]))" ;
      } else {
        "map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0])" ;
      }
    } else {
      "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]], [\$_[0], 0])";
    }
  }
}

sub parse_pos {
  my $self = shift;

  unless (exists $self->{xfunc}) {
    $self->{xfunc}     =
      eval sprintf("sub {%s}", _xfiltergen(@{$self->{filter}}));
    #printf "\nsub{%s}$@\n", _xfiltergen(@{$self->{filter}});
  }
  &{$self->{xfunc}}($_[0]);
}

sub _filtergen {
  my $filter = pop @_;

  if (@_) {
    "map(&WAIT::Filter::$filter(\$_), " . _filtergen(@_) . ')';
  } else {
    "map(&WAIT::Filter::$filter(\$_), \@_)";
  }
}

sub drop {
  my $self = shift;
  if ((caller)[0] eq 'WAIT::Table') { # Table knows about this
    my $file = $self->{file};

    ! (!-e $file or unlink $file);
  } else {                              # notify our database
    croak ref($self)."::drop called directly";
  }
}

sub open {
  my $self = shift;
  my $file = $self->{file};

  if (defined $self->{dbh}) {
    $self->{dbh};
  } else {
    $self->{func}     =
      eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
    $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,
                       $self->{mode}, 0664, $DB_BTREE);
#    tie(%{$self->{cache}}, 'DB_File', undef,
#        $self->{mode}, 0664, $DB_BTREE)
    $self->{cache} = {}
      if $self->{mode} & O_RDWR;
#    tie(%{$self->{cdict}}, 'DB_File', undef,
#        $self->{mode}, 0664, $DB_BTREE)
    $self->{cdict} = {}
      if $self->{mode} & O_RDWR;
    $self->{cached} = 0;
  }
}

sub insert {
  my $self  = shift;
  my $key   = shift;
  my %occ;
  
  defined $self->{db} or $self->open;
  grep $occ{$_}++, &{$self->{func}}(@_);
  my ($word, $noc);
  $self->{records}++;
  while (($word, $noc) = each %occ) {
    if (defined $self->{cache}->{$word}) {
      $self->{cdict}->{$O,$word}++;
      $self->{cache}->{$word} .= pack 'w2', $key, $noc;
    } else {
      $self->{cdict}->{$O,$word} = 1;
      $self->{cache}->{$word}  = pack 'w2', $key, $noc;
    } 
    $self->{cached}++;
  }
  $self->sync if $self->{cached} > 100_000;
  my $maxtf = 0;
  for (values %occ) {
    $maxtf = $_ if $_ > $maxtf;
  }
  $self->{db}->{$M, $key} = $maxtf;
}

sub delete {
  my $self  = shift;
  my $key   = shift;
  my %occ;

  defined $self->{db} or $self->open;
  $self->sync;
  $self->{records}--;
  grep $occ{$_}++, &{$self->{func}}(@_);
  for (keys %occ) {
    # may reorder posting list
    my %post = unpack 'w*', $self->{db}->{$_};
    $self->{db}->{$O,$_}--;
    delete $post{$key};
    $self->{db}->{$_} = pack 'w*', %post;
  }
  delete $self->{db}->{$M, $key};
}

sub intervall {
  my ($self, $first, $last) = @_;
  my $value = '';
  my $word  = '';
  my @result;

  return unless exists $self->{'intervall'};

  defined $self->{db} or $self->open;
  $self->sync;
  my $dbh = $self->{dbh};       # for convenience

  if (ref $self->{'intervall'}) {
    unless (exists $self->{'ifunc'}) {
      $self->{'ifunc'} =
        eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}}));
    }
    ($first) = &{$self->{'ifunc'}}($first) if $first;
    ($last)  = &{$self->{'ifunc'}}($last) if $last;
  }
  if (defined $first and $first ne '') {         # set the cursor to $first
    $dbh->seq($first, $value, R_CURSOR);
  } else {
    $dbh->seq($first, $value, R_FIRST);
  }
  # We assume that word do not start with the character \377
  # $last = pack 'C', 0xff unless defined $last and $last ne '';
  return () if defined $last and $first gt $last; # $first would be after the last word
  
  push @result, $first;
  while (!$dbh->seq($word, $value, R_NEXT)) {
    # We should limit this to a "resonable" number of words
    last if (defined $last and $word gt $last) or $word =~ /^($M|$O)/o;
    push @result, $word;
  }
  \@result;                     # speed
}

sub prefix {
  my ($self, $prefix) = @_;
  my $value = '';
  my $word  = '';
  my @result;

  return () unless defined $prefix; # Full dictionary requested !!
  return unless exists $self->{'prefix'};
  defined $self->{db} or $self->open;
  $self->sync;
  my $dbh = $self->{dbh};
  
  if (ref $self->{'prefix'}) {
    unless (exists $self->{'pfunc'}) {
      $self->{'pfunc'} =
        eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}}));
    }
    ($prefix) = &{$self->{'pfunc'}}($prefix);
  }

  if ($dbh->seq($word = $prefix, $value, R_CURSOR)) {
    return ();
  }
  return () if $word !~ /^$prefix/;
  push @result, $word;

  while (!$dbh->seq($word, $value, R_NEXT)) {
    # We should limit this to a "resonable" number of words
    last if $word !~ /^$prefix/;
    push @result, $word;
  }
  \@result;                     # speed
}

sub search {
  my $self  = shift;

  defined $self->{db} or $self->open;
  $self->sync;
  $self->search_raw(&{$self->{func}}(@_)); # No call to parse() here
}

sub parse {
  my $self  = shift;

  defined $self->{db} or $self->open;
  &{$self->{func}}(@_);
}

sub search_prefix {
  my $self  = shift;

  # print "search_prefix(@_)\n";
  defined $self->{db} or $self->open;
  $self->search_raw(map($self->prefix($_), @_));
}

sub search_raw {
  my $self  = shift;
  my %occ;
  my %score;

  return () unless @_;

  defined $self->{db} or $self->open;
  $self->sync;
  grep $occ{$_}++, @_;
  for (keys %occ) {
    if (defined $self->{db}->{$_}) {
      my %post = unpack 'w*', $self->{db}->{$_};
      my $idf = log($self->{records}/$self->{db}->{$O,$_});
      my $did;
      for $did (keys %post) {
        $score{$did} = 0 unless defined $score{$did}; # perl -w 
        $score{$did} += $post{$did} / $self->{db}->{$M, $did} * $idf
          if $self->{db}->{$M, $did}; # db may be broken
      }
    }
  }
  %score;
}

sub sync {
  my $self = shift;

  if ($self->{mode} & O_RDWR) {
    print STDERR "\aFlushing $self->{cached} postings\n";
    while (my($key, $value) = each %{$self->{cache}}) {
      $self->{db}->{$key} .= $value;
      #delete $self->{cache}->{$key};
    }
    while (my($key, $value) = each %{$self->{cdict}}) {
      $self->{db}->{$key} = 0 unless  $self->{db}->{$key};
      $self->{db}->{$key} += $value;
      #delete $self->{cdict}->{$key};
    }
    $self->{cache} = {};
    $self->{cdict} = {};
    # print STDERR "*** $self->{cache} ", tied(%{$self->{cache}}), "==\n";
    $self->{cached} = 0;
    # $self->{dbh}->sync if $self->{dbh};
  }
}

sub close {
  my $self = shift;

  if ($self->{dbh}) {
    $self->sync;
    delete $self->{dbh};
    untie %{$self->{db}};
    delete $self->{db};
    delete $self->{func};
    delete $self->{cache};
    delete $self->{cached};
    delete $self->{cdict};
    delete $self->{pfunc} if defined $self->{pfunc};
    delete $self->{ifunc} if defined $self->{ifunc};
    delete $self->{xfunc} if defined $self->{xfunc};
  }
}

1;

1	# -- Mode: Perl --
2	# InvertedIndex.pm --
3	# ITIID : $ITI$ $Header $__Header$
4	# Author : Ulrich Pfeifer
5	# Created On : Thu Aug 8 13:05:10 1996
6	# Last Modified By: Ulrich Pfeifer
7	# Last Modified On: Sun Nov 22 18:44:42 1998
8	# Language : CPerl
9	# Status : Unknown, Use with caution!
10	#
11	# Copyright (c) 1996-1997, Ulrich Pfeifer
12	#
13
14	package WAIT::InvertedIndex;
15	use strict;
16	use DB_File;
17	use Fcntl;
18	use WAIT::Filter;
19	use Carp;
20	use vars qw(%FUNC);
21
22	my $O = pack('C', 0xff)."o"; # occurances
23	my $M = pack('C', 0xff)."m"; # maxtf
24
25	sub new {
26	my $type = shift;
27	my %parm = @_;
28	my $self = {};
29
30	$self->{file} = $parm{file} or croak "No file specified";
31	$self->{attr} = $parm{attr} or croak "No attributes specified";
32	$self->{filter} = $parm{filter};
33	$self->{'name'} = $parm{'name'};
34	$self->{records} = 0;
35	for (qw(intervall prefix)) {
36	if (exists $parm{$_}) {
37	if (ref $parm{$_}) {
38	$self->{$_} = [@{$parm{$_}}] # clone
39	} else {
40	$self->{$_} = $parm{$_}
41	}
42	}
43	}
44	bless $self, ref($type) \|\| $type;
45	}
46
47	sub name {$_[0]->{'name'}}
48
49	sub _split_pos {
50	my ($text, $pos) = @{$_[0]};
51	my @result;
52
53	$text =~ s/(^\s+)// and $pos += length($1);
54	while ($text =~ s/(^\S+)//) {
55	my $word = $1;
56	push @result, [$word, $pos];
57	$pos += length($word);
58	$text =~ s/(^\s+)// and $pos += length($1);
59	}
60	@result;
61	}
62
63	sub _xfiltergen {
64	my $filter = pop @_;
65
66	if ($filter eq 'stop') { # avoid the slow stopword elimination
67	return _xfiltergen(@_); # it's cheaper to look them up afterwards
68	}
69	if (@_) {
70	if ($filter =~ /^split(\d*)/) {
71	if ($1) {
72	"grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .'))' ;
73	} else {
74	"map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .')' ;
75	}
76	} else {
77	"map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]]," ._xfiltergen(@_) .')';
78	}
79	} else {
80	if ($filter =~ /^split(\d*)/) {
81	if ($1) {
82	"grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0]))" ;
83	} else {
84	"map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0])" ;
85	}
86	} else {
87	"map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]], [\$_[0], 0])";
88	}
89	}
90	}
91
92	sub parse_pos {
93	my $self = shift;
94
95	unless (exists $self->{xfunc}) {
96	$self->{xfunc} =
97	eval sprintf("sub {%s}", _xfiltergen(@{$self->{filter}}));
98	#printf "\nsub{%s}$@\n", _xfiltergen(@{$self->{filter}});
99	}
100	&{$self->{xfunc}}($_[0]);
101	}
102
103	sub _filtergen {
104	my $filter = pop @_;
105
106	if (@_) {
107	"map(&WAIT::Filter::$filter(\$_), " . _filtergen(@_) . ')';
108	} else {
109	"map(&WAIT::Filter::$filter(\$_), \@_)";
110	}
111	}
112
113	sub drop {
114	my $self = shift;
115	if ((caller)[0] eq 'WAIT::Table') { # Table knows about this
116	my $file = $self->{file};
117
118	! (!-e $file or unlink $file);
119	} else { # notify our database
120	croak ref($self)."::drop called directly";
121	}
122	}
123
124	sub open {
125	my $self = shift;
126	my $file = $self->{file};
127
128	if (defined $self->{dbh}) {
129	$self->{dbh};
130	} else {
131	$self->{func} =
132	eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
133	$self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,
134	$self->{mode}, 0664, $DB_BTREE);
135	# tie(%{$self->{cache}}, 'DB_File', undef,
136	# $self->{mode}, 0664, $DB_BTREE)
137	$self->{cache} = {}
138	if $self->{mode} & O_RDWR;
139	# tie(%{$self->{cdict}}, 'DB_File', undef,
140	# $self->{mode}, 0664, $DB_BTREE)
141	$self->{cdict} = {}
142	if $self->{mode} & O_RDWR;
143	$self->{cached} = 0;
144	}
145	}
146
147	sub insert {
148	my $self = shift;
149	my $key = shift;
150	my %occ;
151
152	defined $self->{db} or $self->open;
153	grep $occ{$_}++, &{$self->{func}}(@_);
154	my ($word, $noc);
155	$self->{records}++;
156	while (($word, $noc) = each %occ) {
157	if (defined $self->{cache}->{$word}) {
158	$self->{cdict}->{$O,$word}++;
159	$self->{cache}->{$word} .= pack 'w2', $key, $noc;
160	} else {
161	$self->{cdict}->{$O,$word} = 1;
162	$self->{cache}->{$word} = pack 'w2', $key, $noc;
163	}
164	$self->{cached}++;
165	}
166	$self->sync if $self->{cached} > 100_000;
167	my $maxtf = 0;
168	for (values %occ) {
169	$maxtf = $_ if $_ > $maxtf;
170	}
171	$self->{db}->{$M, $key} = $maxtf;
172	}
173
174	sub delete {
175	my $self = shift;
176	my $key = shift;
177	my %occ;
178
179	defined $self->{db} or $self->open;
180	$self->sync;
181	$self->{records}--;
182	grep $occ{$_}++, &{$self->{func}}(@_);
183	for (keys %occ) {
184	# may reorder posting list
185	my %post = unpack 'w*', $self->{db}->{$_};
186	$self->{db}->{$O,$_}--;
187	delete $post{$key};
188	$self->{db}->{$_} = pack 'w*', %post;
189	}
190	delete $self->{db}->{$M, $key};
191	}
192
193	sub intervall {
194	my ($self, $first, $last) = @_;
195	my $value = '';
196	my $word = '';
197	my @result;
198
199	return unless exists $self->{'intervall'};
200
201	defined $self->{db} or $self->open;
202	$self->sync;
203	my $dbh = $self->{dbh}; # for convenience
204
205	if (ref $self->{'intervall'}) {
206	unless (exists $self->{'ifunc'}) {
207	$self->{'ifunc'} =
208	eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}}));
209	}
210	($first) = &{$self->{'ifunc'}}($first) if $first;
211	($last) = &{$self->{'ifunc'}}($last) if $last;
212	}
213	if (defined $first and $first ne '') { # set the cursor to $first
214	$dbh->seq($first, $value, R_CURSOR);
215	} else {
216	$dbh->seq($first, $value, R_FIRST);
217	}
218	# We assume that word do not start with the character \377
219	# $last = pack 'C', 0xff unless defined $last and $last ne '';
220	return () if defined $last and $first gt $last; # $first would be after the last word
221
222	push @result, $first;
223	while (!$dbh->seq($word, $value, R_NEXT)) {
224	# We should limit this to a "resonable" number of words
225	last if (defined $last and $word gt $last) or $word =~ /^($M\|$O)/o;
226	push @result, $word;
227	}
228	\@result; # speed
229	}
230
231	sub prefix {
232	my ($self, $prefix) = @_;
233	my $value = '';
234	my $word = '';
235	my @result;
236
237	return () unless defined $prefix; # Full dictionary requested !!
238	return unless exists $self->{'prefix'};
239	defined $self->{db} or $self->open;
240	$self->sync;
241	my $dbh = $self->{dbh};
242
243	if (ref $self->{'prefix'}) {
244	unless (exists $self->{'pfunc'}) {
245	$self->{'pfunc'} =
246	eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}}));
247	}
248	($prefix) = &{$self->{'pfunc'}}($prefix);
249	}
250
251	if ($dbh->seq($word = $prefix, $value, R_CURSOR)) {
252	return ();
253	}
254	return () if $word !~ /^$prefix/;
255	push @result, $word;
256
257	while (!$dbh->seq($word, $value, R_NEXT)) {
258	# We should limit this to a "resonable" number of words
259	last if $word !~ /^$prefix/;
260	push @result, $word;
261	}
262	\@result; # speed
263	}
264
265	sub search {
266	my $self = shift;
267
268	defined $self->{db} or $self->open;
269	$self->sync;
270	$self->search_raw(&{$self->{func}}(@_)); # No call to parse() here
271	}
272
273	sub parse {
274	my $self = shift;
275
276	defined $self->{db} or $self->open;
277	&{$self->{func}}(@_);
278	}
279
280	sub search_prefix {
281	my $self = shift;
282
283	# print "search_prefix(@_)\n";
284	defined $self->{db} or $self->open;
285	$self->search_raw(map($self->prefix($_), @_));
286	}
287
288	sub search_raw {
289	my $self = shift;
290	my %occ;
291	my %score;
292
293	return () unless @_;
294
295	defined $self->{db} or $self->open;
296	$self->sync;
297	grep $occ{$_}++, @_;
298	for (keys %occ) {
299	if (defined $self->{db}->{$_}) {
300	my %post = unpack 'w*', $self->{db}->{$_};
301	my $idf = log($self->{records}/$self->{db}->{$O,$_});
302	my $did;
303	for $did (keys %post) {
304	$score{$did} = 0 unless defined $score{$did}; # perl -w
305	$score{$did} += $post{$did} / $self->{db}->{$M, $did} * $idf
306	if $self->{db}->{$M, $did}; # db may be broken
307	}
308	}
309	}
310	%score;
311	}
312
313	sub sync {
314	my $self = shift;
315
316	if ($self->{mode} & O_RDWR) {
317	print STDERR "\aFlushing $self->{cached} postings\n";
318	while (my($key, $value) = each %{$self->{cache}}) {
319	$self->{db}->{$key} .= $value;
320	#delete $self->{cache}->{$key};
321	}
322	while (my($key, $value) = each %{$self->{cdict}}) {
323	$self->{db}->{$key} = 0 unless $self->{db}->{$key};
324	$self->{db}->{$key} += $value;
325	#delete $self->{cdict}->{$key};
326	}
327	$self->{cache} = {};
328	$self->{cdict} = {};
329	# print STDERR "*** $self->{cache} ", tied(%{$self->{cache}}), "==\n";
330	$self->{cached} = 0;
331	# $self->{dbh}->sync if $self->{dbh};
332	}
333	}
334
335	sub close {
336	my $self = shift;
337
338	if ($self->{dbh}) {
339	$self->sync;
340	delete $self->{dbh};
341	untie %{$self->{db}};
342	delete $self->{db};
343	delete $self->{func};
344	delete $self->{cache};
345	delete $self->{cached};
346	delete $self->{cdict};
347	delete $self->{pfunc} if defined $self->{pfunc};
348	delete $self->{ifunc} if defined $self->{ifunc};
349	delete $self->{xfunc} if defined $self->{xfunc};
350	}
351	}
352
353	1;
354