lib/WAIT/Table.pm

#                              -*- Mode: Perl -*- 
# Table.pm -- 
# ITIID           : $ITI$ $Header $__Header$
# Author          : Ulrich Pfeifer
# Created On      : Thu Aug  8 13:05:10 1996
# Last Modified By: Ulrich Pfeifer
# Last Modified On: Sun Nov 22 18:44:37 1998
# Language        : CPerl
# Update Count    : 51
# Status          : Unknown, Use with caution!
# 
# Copyright (c) 1996-1997, Ulrich Pfeifer
# 

=head1 NAME

WAIT::Table -- Module for maintaining Tables / Relations

=head1 SYNOPSIS

  require WAIT::Table;

=head1 DESCRIPTION

=cut

package WAIT::Table;
require WAIT::Parse::Base;
use strict;
use Carp;
use DB_File;
use Fcntl;

my $USE_RECNO = 0;

=head2 Creating a Table.

The constructor WAIT::Table-<gt>new is normally called via the
create_table method of a database handle. This is not enforced, but
creating a table doesn not make any sense unless the table is
registered by the database because the latter implements persistence
of the meta data. Registering is done automatically by letting the
database handle create a table.

  my $db = create WAIT::Database name => 'sample';
  my $tb = $db->create_table (name     => 'test',
                              attr     => ['docid', 'headline'],
                              layout   => $layout,
                              access   => $access,
                             );

The constructor returns a handle for the table. This handle is hidden by the
table module, to prevent direct access if called via Table.

=over 10

=item C<access> => I<accesobj>

A reference to a acces object for the external parts (attributes) of
tuples. As you may remember, the WAIT System does not enforce that
objects are completely stored inside the system to avoid duplication.
There is no (strong) point in storing all you HTML-Documents inside
the system when indexing your WWW-Server.

=item C<file> => I<fname>

The filename of the records file. Files for indexes will have I<fname>
as prefix. I<Mandatory>

=item C<name> => I<name>

The name of this table. I<Mandatory>

=item C<attr> => [ I<attr> ... ]

A reference to an array of attribute names. I<Mandatory>

=item C<djk> => [ I<attr> ... ]

A reference to an array of attribute names which make up the
I<disjointness key>. Don't think about it - i's of no use yet;

=item C<layout> => I<layoutobj>

A reference to an external parser object. Defaults to anew instance of
C<WAIT::Parse::Base>

=item C<access> => I<accesobj>

A reference to a acces object for the external parts of tuples.

=back

=cut

sub new {
  my $type = shift;
  my %parm = @_;
  my $self = {};

  # Do that before we eventually add '_weight' to attributes.
  $self->{keyset}   = $parm{keyset}   || [[@{$parm{attr}}]];
  $self->{mode}     = O_CREAT | O_RDWR;
  # Determine and set up subclass
  $type = ref($type) || $type;
  if (defined $parm{djk}) {
    if (@{$parm{djk}} == @{$parm{attr}}) {
      # All attributes in DK (sloppy test here!)
      $type .= '::Independent';
      require WAIT::Table::Independent;
    } else {
      $type .= '::Disjoint';
      require WAIT::Table::Disjoint;
    }
    # Add '_weight' to attributes
    my %attr;
    @attr{@{$parm{attr}}} = (1) x @{$parm{attr}};
    unshift @{$parm{attr}}, '_weight' unless $attr{'_weight'};
  }

  $self->{file}     = $parm{file}     or croak "No file specified";
  if (-d  $self->{file} or !mkdir($self->{file}, 0775)) {
    croak "Could not 'mkdir $self->{file}': $!\n";
  }
  $self->{name}     = $parm{name}     or croak "No name specified";
  $self->{attr}     = $parm{attr}     or croak "No attributes specified";
  $self->{djk}      = $parm{djk}      if defined $parm{djk};
  $self->{layout}   = $parm{layout} || new WAIT::Parse::Base;
  $self->{access}   = $parm{access} if defined $parm{access};
  $self->{nextk}    = 1;        # next record to insert; first record unused
  $self->{deleted}  = {};       # no deleted records yet
  $self->{indexes}  = {};

  bless $self, $type;
  # Call create_index() and create_index() for compatibility
  for (@{$self->{keyset}||[]}) {
    #carp "Specification of indexes at table create time is deprecated";
    $self->create_index(@$_);
  }
  while (@{$parm{invindex}||[]}) {
    # carp "Specification of inverted indexes at table create time is deprecated";
    my $att  = shift @{$parm{invindex}};
    my @spec = @{shift @{$parm{invindex}}};
    my @opt;
    
    if (ref($spec[0])) {
      carp "Secondary pipelines are deprecated\n";
      @opt = %{shift @spec};
    }
    $self->create_inverted_index(attribute => $att, pipeline  => \@spec, @opt);
  }
  $self;
  # end of backwarn compatibility stuff
}

=head2 Creating an index

  $tb->create_index('docid');

=item C<create_index>

must be called with a list of attributes. This must be a subset of the
attributes specified when the table was created. Currently this
method must be called before the first tuple is inserted in the
table!

=cut

sub create_index {
  my $self= shift;
  
  croak "Cannot create index for table aready populated"
    if $self->{nextk} > 1;
  
  require WAIT::Index;
  
  my $name = join '-', @_;
  $self->{indexes}->{$name} =
    new WAIT::Index file => $self->{file}.'/'.$name, attr => $_;
}

=head2 Creating an inverted index

  $tb->create_inverted_index
    (attribute => 'au',
     pipeline  => ['detex', 'isotr', 'isolc', 'split2', 'stop'],
     predicate => 'plain',
    );

=over 5

=item C<attribute>

The attribute to build the index on. This attribute may not be in the
set attributes specified when the table was created.

=item C<pipeline>

A piplines specification is a reference to and array of method names
(from package C<WAIT::Filter>) which are to applied in sequence to the
contents of the named attribute. The attribute name may not be in the
attribute list.

=item C<predicate>

An indication which predicate the index implements. This may be
e.g. 'plain', 'stemming' or 'soundex'. The indicator will be used for
query processing. Currently there is no standard set of predicate
names. The predicate defaults to the last member of the ppline if
omitted.

=back

Currently this method must be called before the first tuple is
inserted in the table!

=cut

sub create_inverted_index {
  my $self  = shift;
  my %parm  = @_;

  croak "No attribute specified" unless $parm{attribute};
  croak "No pipeline specified"  unless $parm{pipeline};

  $parm{predicate} ||= $parm{pipeline}->[-1];
  
  croak "Cannot create index for table aready populated"
    if $self->{nextk} > 1;
  
  require WAIT::InvertedIndex;

  # backward compatibility stuff
  my %opt = %parm;
  for (qw(attribute pipeline predicate)) {
    delete $opt{$_};
  }
  
  my $name = join '_', ($parm{attribute}, @{$parm{pipeline}});
  my $idx = new WAIT::InvertedIndex(file   => $self->{file}.'/'.$name,
                                    filter => [@{$parm{pipeline}}], # clone
                                    name   => $name,
                                    attr   => $parm{attribute},
                                    %opt, # backward compatibility stuff
                                   );
  # We will have to use $parm{predicate} here
  push @{$self->{inverted}->{$parm{attribute}}}, $idx;
}

sub dir {
  $_[0]->{file};
}

=head2 C<$tb-E<gt>layout>

Returns the reference to the associated parser object.

=cut

sub layout { $_[0]->{layout} }

=head2 C<$tb-E<gt>fields>

Returns the array of attribute names.

=cut


sub fields { keys %{$_[0]->{inverted}}}

=head2 C<$tb-E<gt>drop>

Must be called via C<WAIT::Database::drop_table>

=cut

sub drop {
  my $self = shift;
  if ((caller)[0] eq 'WAIT::Database') { # database knows about this
    $self->close;               # just make sure
    my $file = $self->{file};

    for (values %{$self->{indexes}}) {
      $_->drop;
    }
    unlink "$file/records";
    ! (!-e $file or rmdir $file);
  } else {
    croak ref($self)."::drop called directly";
  }
}

sub mrequire ($) {
  my $module = shift;

  $module =~ s{::}{/}g;
  $module .= '.pm';
  require $module;
}

sub open {
  my $self = shift;
  my $file = $self->{file} . '/records';

  mrequire ref($self);           # that's tricky eh?
  if (defined $self->{'layout'}) {
    mrequire ref($self->{'layout'});
  }
  if (defined $self->{'access'}) {
    mrequire ref($self->{'access'});
  }
  if (exists $self->{indexes}) {
    require WAIT::Index;
    for (values %{$self->{indexes}}) {
      $_->{mode} = $self->{mode};
    }
  }
  if (exists $self->{inverted}) {
    my ($att, $idx);
    for $att (keys %{$self->{inverted}}) {
      for $idx (@{$self->{inverted}->{$att}}) {
        $idx->{mode} = $self->{mode};
      }
    }
    require WAIT::InvertedIndex;
  }
  unless (defined $self->{dbh}) {
    if ($USE_RECNO) {
      $self->{dbh} = tie(@{$self->{db}}, 'DB_File', $file,
                         $self->{mode}, 0664, $DB_RECNO);
    } else {
      $self->{dbh} =
        tie(%{$self->{db}}, 'DB_File', $file,
                         $self->{mode}, 0664, $DB_BTREE);
    }
  }
  $self;
}

sub fetch_extern {
  my $self  = shift;

  print "#@_", $self->{'access'}->{Mode}, "\n";
  if (exists $self->{'access'}) {
    mrequire ref($self->{'access'});
    $self->{'access'}->FETCH(@_);
  }
}

sub fetch_extern_by_id {
  my $self  = shift;

  $self->fetch_extern($self->fetch(@_));
}

sub _find_index {
  my $self  = shift;
  my (@att) = @_;
  my %att;
  my $name;
  
  @att{@att} = @att;

  KEY: for $name (keys %{$self->{indexes}}) {
      my @iat = split /-/, $name;
      for (@iat) {
        next KEY unless exists $att{$_};
      }
      return $self->{indexes}->{$name};
    }
  return undef;
}

sub have {
  my $self  = shift;
  my %parm  = @_;

  my $index = $self->_find_index(keys %parm);
  croak "No index found" unless $index;
  defined $self->{db} or $self->open;
  return $index->have(@_);
}

sub insert {
  my $self  = shift;
  my %parm  = @_;

  defined $self->{db} or $self->open;

  my $tuple = join($;, map($parm{$_} || '', @{$self->{attr}}));
  my $key;
  my @deleted = keys %{$self->{deleted}};

  if (@deleted) {
    $key = pop @deleted;
    delete $self->{deleted}->{$key};
  } else {
    $key = $self->{nextk}++;
  }
  if ($USE_RECNO) {
    $self->{db}->[$key] = $tuple;
  } else {
    $self->{db}->{$key} = $tuple;
  }
  for (values %{$self->{indexes}}) {
    unless ($_->insert($key, %parm)) {
      # duplicate key, undo changes
      if ($key == $self->{nextk}-1) {
        $self->{nextk}--;
      } else {
        $self->{deleted}->{$key}=1;
      }
      my $idx;
      for $idx (values %{$self->{indexes}}) {
        last if $idx eq $_;
        $idx->remove($key, %parm);
      }
      return undef;
    } 
  }
  if (defined $self->{inverted}) {
    my $att;
    for $att (keys %{$self->{inverted}}) {
      if (defined $parm{$att}) {
        map $_->insert($key, $parm{$att}), @{$self->{inverted}->{$att}};
        #map $_->sync, @{$self->{inverted}->{$att}}
      }
    }
  }
  $key
}

sub sync {
  my $self  = shift;
  
  for (values %{$self->{indexes}}) {
    map $_->sync, $_;
  }
  if (defined $self->{inverted}) {
    my $att;
    for $att (keys %{$self->{inverted}}) {
      map $_->sync, @{$self->{inverted}->{$att}}
    }
  }
}

sub fetch {
  my $self  = shift;
  my $key   = shift;

  return () if exists $self->{deleted}->{$key};
  
  defined $self->{db} or $self->open;
  if ($USE_RECNO) {
    $self->unpack($self->{db}->[$key]);
  } else {
    $self->unpack($self->{db}->{$key});
  }
}

sub delete_by_key {
  my $self  = shift;
  my $key   = shift;

  return $self->{deleted}->{$key} if defined $self->{deleted}->{$key};
  my %tuple = $self->fetch($key);
  for (values %{$self->{indexes}}) {
    $_->delete($key, %tuple);
  }
  if (defined $self->{inverted}) {
    # User *must* provide the full record for this or the entries
    # in the inverted index will not be removed
    %tuple = (%tuple, @_);
    my $att;
    for $att (keys %{$self->{inverted}}) {
      if (defined $tuple{$att}) {
        map $_->delete($key, $tuple{$att}), @{$self->{inverted}->{$att}}
      }
    }
  }
  ++$self->{deleted}->{$key};
}

sub delete {
  my $self  = shift;
  my $tkey = $self->have(@_);

  defined $tkey && $self->delete_by_key($tkey, @_);
}

sub unpack {
  my $self = shift;
  my $tuple = shift;

  my $att;
  my @result;
  my @tuple = split /$;/, $tuple;

  for $att (@{$self->{attr}}) {
    push @result, $att, shift @tuple;
  }
  @result;
}

sub close {
  my $self = shift;

  if (exists $self->{'access'}) {
    eval {$self->{'access'}->close}; # dont bother if not opened
  }
  for (values %{$self->{indexes}}) {
    $_->close();
  }
  if (defined $self->{inverted}) {
    my $att;
    for $att (keys %{$self->{inverted}}) {
      if ($] > 5.003) {         # avoid bug in perl up to 5.003_05
        my $idx;
        for $idx (@{$self->{inverted}->{$att}}) {
          $idx->close;
        }
      } else {
        map $_->close(), @{$self->{inverted}->{$att}};
      }
    }
  }
  if ($self->{dbh}) {
    delete $self->{dbh};

    if ($USE_RECNO) {
      untie @{$self->{db}};
    } else {
      untie %{$self->{db}};
    }
    delete $self->{db};
  }

  1;
}

sub open_scan {
  my $self = shift;
  my $code = shift;

  $self->{dbh} or $self->open;
  require WAIT::Scan;
  new WAIT::Scan $self, $self->{nextk}-1, $code;
}

sub open_index_scan {
  my $self = shift;
  my $attr = shift;
  my $code = shift;
  my $name = join '-', @$attr;

  if (defined $self->{indexes}->{$name}) {
    $self->{indexes}->{$name}->open_scan($code);
  } else {
    croak "No such index '$name'";
  }
}

eval {sub WAIT::Query::Raw::new} unless defined \&WAIT::Query::Raw::new;

sub prefix {
  my ($self , $attr, $prefix) = @_;
  my %result;

  defined $self->{db} or $self->open; # require layout

  for (@{$self->{inverted}->{$attr}}) {
    my $result = $_->prefix($prefix);
    if (defined $result) {
      $result{$_->name} = $result;
    }
  }
  bless \%result, 'WAIT::Query::Raw';
}

sub intervall {
  my ($self, $attr, $lb, $ub) = @_;
  my %result;

  defined $self->{db} or $self->open; # require layout

  for (@{$self->{inverted}->{$attr}}) {
    my $result = $_->intervall($lb, $ub);
    if (defined $result) {
      $result{$_->name} = $result;
    }
  }
  bless \%result, 'WAIT::Query::Raw';
}

sub search {
  my $self = shift;
  my $attr = shift;
  my $cont = shift;
  my $raw  = shift;
  my %result;

  defined $self->{db} or $self->open; # require layout

  if ($raw) {
    for (@{$self->{inverted}->{$attr}}) {
      my $name = $_->name;
      if (exists $raw->{$name} and @{$raw->{$name}}) {
        my $scale = 1/scalar(@{$raw->{$name}});
        my %r = $_->search_raw(@{$raw->{$name}});
        my ($key, $val);
        while (($key, $val) = each %r) {
          if (exists $result{$key}) {
            $result{$key} += $val*$scale;
          } else {
            $result{$key}  = $val*$scale;
          }
        }
      }
    }
  }
  if (defined $cont and $cont ne '') {
    for (@{$self->{inverted}->{$attr}}) {
      my %r = $_->search($cont);
      my ($key, $val);
      while (($key, $val) = each %r) {
        if (exists $result{$key}) {
          $result{$key} += $val;
        } else {
          $result{$key}  = $val;
        }
      }
    }
  }
  # sanity check for deleted documents.
  # this should not be necessary !@#$
  for (keys %result) {
    delete $result{$_} if $self->{deleted}->{$_}
  }
  %result;
}

sub hilight_positions {
  my ($self, $attr, $text, $query, $raw)  = @_;
  my %pos;

  if (defined $raw) {
    for (@{$self->{inverted}->{$attr}}) {
      my $name = $_->name;
      if (exists $raw->{$name}) {
        my %qt;
        grep $qt{$_}++, @{$raw->{$name}};
        for ($_->parse_pos($text)) {
          if (exists $qt{$_->[0]}) {
            $pos{$_->[1]} = max($pos{$_->[1]}, length($_->[0]));
          }
        }
      }
    }
  }
  if (defined $query) {
    for (@{$self->{inverted}->{$attr}}) {
      my %qt;

      grep $qt{$_}++, $_->parse($query);
      for ($_->parse_pos($text)) {
        if (exists $qt{$_->[0]}) {
          if (exists $pos{$_->[1]}) { # perl -w ;-)
            $pos{$_->[1]} = max($pos{$_->[1]}, length($_->[0]));
          } else {
            $pos{$_->[1]} = length($_->[0]);
          }
        }
      }
    }
  }

  \%pos;
}

sub hilight {
  my ($tb, $text, $query, $raw) = @_;
  my $type = $tb->layout();
  my @result;

  $query ||= {};
  $raw   ||= {};
  my @ttxt = $type->tag($text);
  while (@ttxt) {
    no strict 'refs';
    my %tag = %{shift @ttxt};
    my $txt = shift @ttxt;
    my $fld;

    my %hl;
    for $fld (grep defined $tag{$_}, keys %$query, keys %$raw) {
      my $hp = $tb->hilight_positions($fld, $txt,
                                      $query->{$fld}, $raw->{$fld});
      for (keys %$hp) {
        if (exists $hl{$_}) {   # -w ;-(
          $hl{$_} = max($hl{$_}, $hp->{$_});
        } else {
          $hl{$_} = $hp->{$_};
        }
      }
    }
    my $pos;
    my $qt = {_qt => 1, %tag};
    my $pl = \%tag;
    my $last = length($txt);
    my @tmp;
    for $pos (sort {$b <=> $a} keys %hl) {
      unshift @tmp, $pl, substr($txt,$pos+$hl{$pos},$last-$pos-$hl{$pos});
      unshift @tmp, $qt, substr($txt,$pos,$hl{$pos});
      $last = $pos;
    }
    push @result, $pl, substr($txt,0,$last);
    push @result, @tmp;
  }
  @result;                      # no speed necessary
}

1;

1	ulpfr	10	# -- Mode: Perl --
2			# Table.pm --
3			# ITIID : $ITI$ $Header $__Header$
4			# Author : Ulrich Pfeifer
5			# Created On : Thu Aug 8 13:05:10 1996
6			# Last Modified By: Ulrich Pfeifer
7			# Last Modified On: Sun Nov 22 18:44:37 1998
8			# Language : CPerl
9			# Update Count : 51
10			# Status : Unknown, Use with caution!
11			#
12			# Copyright (c) 1996-1997, Ulrich Pfeifer
13			#
14
15			=head1 NAME
16
17			WAIT::Table -- Module for maintaining Tables / Relations
18
19			=head1 SYNOPSIS
20
21			require WAIT::Table;
22
23			=head1 DESCRIPTION
24
25			=cut
26
27			package WAIT::Table;
28			require WAIT::Parse::Base;
29			use strict;
30			use Carp;
31			use DB_File;
32			use Fcntl;
33
34			my $USE_RECNO = 0;
35
36			=head2 Creating a Table.
37
38			The constructor WAIT::Table-<gt>new is normally called via the
39			create_table method of a database handle. This is not enforced, but
40			creating a table doesn not make any sense unless the table is
41			registered by the database because the latter implements persistence
42			of the meta data. Registering is done automatically by letting the
43			database handle create a table.
44
45			my $db = create WAIT::Database name => 'sample';
46			my $tb = $db->create_table (name => 'test',
47			attr => ['docid', 'headline'],
48			layout => $layout,
49			access => $access,
50			);
51
52			The constructor returns a handle for the table. This handle is hidden by the
53			table module, to prevent direct access if called via Table.
54
55			=over 10
56
57			=item C<access> => I<accesobj>
58
59			A reference to a acces object for the external parts (attributes) of
60			tuples. As you may remember, the WAIT System does not enforce that
61			objects are completely stored inside the system to avoid duplication.
62			There is no (strong) point in storing all you HTML-Documents inside
63			the system when indexing your WWW-Server.
64
65			=item C<file> => I<fname>
66
67			The filename of the records file. Files for indexes will have I<fname>
68			as prefix. I<Mandatory>
69
70			=item C<name> => I<name>
71
72			The name of this table. I<Mandatory>
73
74			=item C<attr> => [ I<attr> ... ]
75
76			A reference to an array of attribute names. I<Mandatory>
77
78			=item C<djk> => [ I<attr> ... ]
79
80			A reference to an array of attribute names which make up the
81			I<disjointness key>. Don't think about it - i's of no use yet;
82
83			=item C<layout> => I<layoutobj>
84
85			A reference to an external parser object. Defaults to anew instance of
86			C<WAIT::Parse::Base>
87
88			=item C<access> => I<accesobj>
89
90			A reference to a acces object for the external parts of tuples.
91
92			=back
93
94			=cut
95
96			sub new {
97			my $type = shift;
98			my %parm = @_;
99			my $self = {};
100
101			# Do that before we eventually add '_weight' to attributes.
102			$self->{keyset} = $parm{keyset} \|\| [[@{$parm{attr}}]];
103			$self->{mode} = O_CREAT \| O_RDWR;
104			# Determine and set up subclass
105			$type = ref($type) \|\| $type;
106			if (defined $parm{djk}) {
107			if (@{$parm{djk}} == @{$parm{attr}}) {
108			# All attributes in DK (sloppy test here!)
109			$type .= '::Independent';
110			require WAIT::Table::Independent;
111			} else {
112			$type .= '::Disjoint';
113			require WAIT::Table::Disjoint;
114			}
115			# Add '_weight' to attributes
116			my %attr;
117			@attr{@{$parm{attr}}} = (1) x @{$parm{attr}};
118			unshift @{$parm{attr}}, '_weight' unless $attr{'_weight'};
119			}
120
121			$self->{file} = $parm{file} or croak "No file specified";
122			if (-d $self->{file} or !mkdir($self->{file}, 0775)) {
123			croak "Could not 'mkdir $self->{file}': $!\n";
124			}
125			$self->{name} = $parm{name} or croak "No name specified";
126			$self->{attr} = $parm{attr} or croak "No attributes specified";
127			$self->{djk} = $parm{djk} if defined $parm{djk};
128			$self->{layout} = $parm{layout} \|\| new WAIT::Parse::Base;
129			$self->{access} = $parm{access} if defined $parm{access};
130			$self->{nextk} = 1; # next record to insert; first record unused
131			$self->{deleted} = {}; # no deleted records yet
132			$self->{indexes} = {};
133
134			bless $self, $type;
135			# Call create_index() and create_index() for compatibility
136			for (@{$self->{keyset}\|\|[]}) {
137			#carp "Specification of indexes at table create time is deprecated";
138			$self->create_index(@$_);
139			}
140			while (@{$parm{invindex}\|\|[]}) {
141			# carp "Specification of inverted indexes at table create time is deprecated";
142			my $att = shift @{$parm{invindex}};
143			my @spec = @{shift @{$parm{invindex}}};
144			my @opt;
145
146			if (ref($spec[0])) {
147			carp "Secondary pipelines are deprecated\n";
148			@opt = %{shift @spec};
149			}
150			$self->create_inverted_index(attribute => $att, pipeline => \@spec, @opt);
151			}
152			$self;
153			# end of backwarn compatibility stuff
154			}
155
156			=head2 Creating an index
157
158			$tb->create_index('docid');
159
160			=item C<create_index>
161
162			must be called with a list of attributes. This must be a subset of the
163			attributes specified when the table was created. Currently this
164			method must be called before the first tuple is inserted in the
165			table!
166
167			=cut
168
169			sub create_index {
170			my $self= shift;
171
172			croak "Cannot create index for table aready populated"
173			if $self->{nextk} > 1;
174
175			require WAIT::Index;
176
177			my $name = join '-', @_;
178			$self->{indexes}->{$name} =
179			new WAIT::Index file => $self->{file}.'/'.$name, attr => $_;
180			}
181
182			=head2 Creating an inverted index
183
184			$tb->create_inverted_index
185			(attribute => 'au',
186			pipeline => ['detex', 'isotr', 'isolc', 'split2', 'stop'],
187			predicate => 'plain',
188			);
189
190			=over 5
191
192			=item C<attribute>
193
194			The attribute to build the index on. This attribute may not be in the
195			set attributes specified when the table was created.
196
197			=item C<pipeline>
198
199			A piplines specification is a reference to and array of method names
200			(from package C<WAIT::Filter>) which are to applied in sequence to the
201			contents of the named attribute. The attribute name may not be in the
202			attribute list.
203
204			=item C<predicate>
205
206			An indication which predicate the index implements. This may be
207			e.g. 'plain', 'stemming' or 'soundex'. The indicator will be used for
208			query processing. Currently there is no standard set of predicate
209			names. The predicate defaults to the last member of the ppline if
210			omitted.
211
212			=back
213
214			Currently this method must be called before the first tuple is
215			inserted in the table!
216
217			=cut
218
219			sub create_inverted_index {
220			my $self = shift;
221			my %parm = @_;
222
223			croak "No attribute specified" unless $parm{attribute};
224			croak "No pipeline specified" unless $parm{pipeline};
225
226			$parm{predicate} \|\|= $parm{pipeline}->[-1];
227
228			croak "Cannot create index for table aready populated"
229			if $self->{nextk} > 1;
230
231			require WAIT::InvertedIndex;
232
233			# backward compatibility stuff
234			my %opt = %parm;
235			for (qw(attribute pipeline predicate)) {
236			delete $opt{$_};
237			}
238
239			my $name = join '_', ($parm{attribute}, @{$parm{pipeline}});
240			my $idx = new WAIT::InvertedIndex(file => $self->{file}.'/'.$name,
241			filter => [@{$parm{pipeline}}], # clone
242			name => $name,
243			attr => $parm{attribute},
244			%opt, # backward compatibility stuff
245			);
246			# We will have to use $parm{predicate} here
247			push @{$self->{inverted}->{$parm{attribute}}}, $idx;
248			}
249
250			sub dir {
251			$_[0]->{file};
252			}
253
254			=head2 C<$tb-E<gt>layout>
255
256			Returns the reference to the associated parser object.
257
258			=cut
259
260			sub layout { $_[0]->{layout} }
261
262			=head2 C<$tb-E<gt>fields>
263
264			Returns the array of attribute names.
265
266			=cut
267
268
269			sub fields { keys %{$_[0]->{inverted}}}
270
271			=head2 C<$tb-E<gt>drop>
272
273			Must be called via C<WAIT::Database::drop_table>
274
275			=cut
276
277			sub drop {
278			my $self = shift;
279			if ((caller)[0] eq 'WAIT::Database') { # database knows about this
280			$self->close; # just make sure
281			my $file = $self->{file};
282
283			for (values %{$self->{indexes}}) {
284			$_->drop;
285			}
286			unlink "$file/records";
287			! (!-e $file or rmdir $file);
288			} else {
289			croak ref($self)."::drop called directly";
290			}
291			}
292
293			sub mrequire ($) {
294			my $module = shift;
295
296			$module =~ s{::}{/}g;
297			$module .= '.pm';
298			require $module;
299			}
300
301			sub open {
302			my $self = shift;
303			my $file = $self->{file} . '/records';
304
305			mrequire ref($self); # that's tricky eh?
306			if (defined $self->{'layout'}) {
307			mrequire ref($self->{'layout'});
308			}
309			if (defined $self->{'access'}) {
310			mrequire ref($self->{'access'});
311			}
312			if (exists $self->{indexes}) {
313			require WAIT::Index;
314			for (values %{$self->{indexes}}) {
315			$_->{mode} = $self->{mode};
316			}
317			}
318			if (exists $self->{inverted}) {
319			my ($att, $idx);
320			for $att (keys %{$self->{inverted}}) {
321			for $idx (@{$self->{inverted}->{$att}}) {
322			$idx->{mode} = $self->{mode};
323			}
324			}
325			require WAIT::InvertedIndex;
326			}
327			unless (defined $self->{dbh}) {
328			if ($USE_RECNO) {
329			$self->{dbh} = tie(@{$self->{db}}, 'DB_File', $file,
330			$self->{mode}, 0664, $DB_RECNO);
331			} else {
332			$self->{dbh} =
333			tie(%{$self->{db}}, 'DB_File', $file,
334			$self->{mode}, 0664, $DB_BTREE);
335			}
336			}
337			$self;
338			}
339
340			sub fetch_extern {
341			my $self = shift;
342
343			print "#@_", $self->{'access'}->{Mode}, "\n";
344			if (exists $self->{'access'}) {
345			mrequire ref($self->{'access'});
346			$self->{'access'}->FETCH(@_);
347			}
348			}
349
350			sub fetch_extern_by_id {
351			my $self = shift;
352
353			$self->fetch_extern($self->fetch(@_));
354			}
355
356			sub _find_index {
357			my $self = shift;
358			my (@att) = @_;
359			my %att;
360			my $name;
361
362			@att{@att} = @att;
363
364			KEY: for $name (keys %{$self->{indexes}}) {
365			my @iat = split /-/, $name;
366			for (@iat) {
367			next KEY unless exists $att{$_};
368			}
369			return $self->{indexes}->{$name};
370			}
371			return undef;
372			}
373
374			sub have {
375			my $self = shift;
376			my %parm = @_;
377
378			my $index = $self->_find_index(keys %parm);
379			croak "No index found" unless $index;
380			defined $self->{db} or $self->open;
381			return $index->have(@_);
382			}
383
384			sub insert {
385			my $self = shift;
386			my %parm = @_;
387
388			defined $self->{db} or $self->open;
389
390			my $tuple = join($;, map($parm{$_} \|\| '', @{$self->{attr}}));
391			my $key;
392			my @deleted = keys %{$self->{deleted}};
393
394			if (@deleted) {
395			$key = pop @deleted;
396			delete $self->{deleted}->{$key};
397			} else {
398			$key = $self->{nextk}++;
399			}
400			if ($USE_RECNO) {
401			$self->{db}->[$key] = $tuple;
402			} else {
403			$self->{db}->{$key} = $tuple;
404			}
405			for (values %{$self->{indexes}}) {
406			unless ($_->insert($key, %parm)) {
407			# duplicate key, undo changes
408			if ($key == $self->{nextk}-1) {
409			$self->{nextk}--;
410			} else {
411			$self->{deleted}->{$key}=1;
412			}
413			my $idx;
414			for $idx (values %{$self->{indexes}}) {
415			last if $idx eq $_;
416			$idx->remove($key, %parm);
417			}
418			return undef;
419			}
420			}
421			if (defined $self->{inverted}) {
422			my $att;
423			for $att (keys %{$self->{inverted}}) {
424			if (defined $parm{$att}) {
425			map $_->insert($key, $parm{$att}), @{$self->{inverted}->{$att}};
426			#map $_->sync, @{$self->{inverted}->{$att}}
427			}
428			}
429			}
430			$key
431			}
432
433			sub sync {
434			my $self = shift;
435
436			for (values %{$self->{indexes}}) {
437			map $_->sync, $_;
438			}
439			if (defined $self->{inverted}) {
440			my $att;
441			for $att (keys %{$self->{inverted}}) {
442			map $_->sync, @{$self->{inverted}->{$att}}
443			}
444			}
445			}
446
447			sub fetch {
448			my $self = shift;
449			my $key = shift;
450
451			return () if exists $self->{deleted}->{$key};
452
453			defined $self->{db} or $self->open;
454			if ($USE_RECNO) {
455			$self->unpack($self->{db}->[$key]);
456			} else {
457			$self->unpack($self->{db}->{$key});
458			}
459			}
460
461			sub delete_by_key {
462			my $self = shift;
463			my $key = shift;
464
465			return $self->{deleted}->{$key} if defined $self->{deleted}->{$key};
466			my %tuple = $self->fetch($key);
467			for (values %{$self->{indexes}}) {
468			$_->delete($key, %tuple);
469			}
470			if (defined $self->{inverted}) {
471			# User must provide the full record for this or the entries
472			# in the inverted index will not be removed
473			%tuple = (%tuple, @_);
474			my $att;
475			for $att (keys %{$self->{inverted}}) {
476			if (defined $tuple{$att}) {
477			map $_->delete($key, $tuple{$att}), @{$self->{inverted}->{$att}}
478			}
479			}
480			}
481			++$self->{deleted}->{$key};
482			}
483
484			sub delete {
485			my $self = shift;
486			my $tkey = $self->have(@_);
487
488			defined $tkey && $self->delete_by_key($tkey, @_);
489			}
490
491			sub unpack {
492			my $self = shift;
493			my $tuple = shift;
494
495			my $att;
496			my @result;
497			my @tuple = split /$;/, $tuple;
498
499			for $att (@{$self->{attr}}) {
500			push @result, $att, shift @tuple;
501			}
502			@result;
503			}
504
505			sub close {
506			my $self = shift;
507
508			if (exists $self->{'access'}) {
509			eval {$self->{'access'}->close}; # dont bother if not opened
510			}
511			for (values %{$self->{indexes}}) {
512			$_->close();
513			}
514			if (defined $self->{inverted}) {
515			my $att;
516			for $att (keys %{$self->{inverted}}) {
517			if ($] > 5.003) { # avoid bug in perl up to 5.003_05
518			my $idx;
519			for $idx (@{$self->{inverted}->{$att}}) {
520			$idx->close;
521			}
522			} else {
523			map $_->close(), @{$self->{inverted}->{$att}};
524			}
525			}
526			}
527			if ($self->{dbh}) {
528			delete $self->{dbh};
529
530			if ($USE_RECNO) {
531			untie @{$self->{db}};
532			} else {
533			untie %{$self->{db}};
534			}
535			delete $self->{db};
536			}
537
538			1;
539			}
540
541			sub open_scan {
542			my $self = shift;
543			my $code = shift;
544
545			$self->{dbh} or $self->open;
546			require WAIT::Scan;
547			new WAIT::Scan $self, $self->{nextk}-1, $code;
548			}
549
550			sub open_index_scan {
551			my $self = shift;
552			my $attr = shift;
553			my $code = shift;
554			my $name = join '-', @$attr;
555
556			if (defined $self->{indexes}->{$name}) {
557			$self->{indexes}->{$name}->open_scan($code);
558			} else {
559			croak "No such index '$name'";
560			}
561			}
562
563			eval {sub WAIT::Query::Raw::new} unless defined \&WAIT::Query::Raw::new;
564
565			sub prefix {
566			my ($self , $attr, $prefix) = @_;
567			my %result;
568
569			defined $self->{db} or $self->open; # require layout
570
571			for (@{$self->{inverted}->{$attr}}) {
572			my $result = $_->prefix($prefix);
573			if (defined $result) {
574			$result{$_->name} = $result;
575			}
576			}
577			bless \%result, 'WAIT::Query::Raw';
578			}
579
580			sub intervall {
581			my ($self, $attr, $lb, $ub) = @_;
582			my %result;
583
584			defined $self->{db} or $self->open; # require layout
585
586			for (@{$self->{inverted}->{$attr}}) {
587			my $result = $_->intervall($lb, $ub);
588			if (defined $result) {
589			$result{$_->name} = $result;
590			}
591			}
592			bless \%result, 'WAIT::Query::Raw';
593			}
594
595			sub search {
596			my $self = shift;
597			my $attr = shift;
598			my $cont = shift;
599			my $raw = shift;
600			my %result;
601
602			defined $self->{db} or $self->open; # require layout
603
604			if ($raw) {
605			for (@{$self->{inverted}->{$attr}}) {
606			my $name = $_->name;
607			if (exists $raw->{$name} and @{$raw->{$name}}) {
608			my $scale = 1/scalar(@{$raw->{$name}});
609			my %r = $_->search_raw(@{$raw->{$name}});
610			my ($key, $val);
611			while (($key, $val) = each %r) {
612			if (exists $result{$key}) {
613			$result{$key} += $val*$scale;
614			} else {
615			$result{$key} = $val*$scale;
616			}
617			}
618			}
619			}
620			}
621			if (defined $cont and $cont ne '') {
622			for (@{$self->{inverted}->{$attr}}) {
623			my %r = $_->search($cont);
624			my ($key, $val);
625			while (($key, $val) = each %r) {
626			if (exists $result{$key}) {
627			$result{$key} += $val;
628			} else {
629			$result{$key} = $val;
630			}
631			}
632			}
633			}
634			# sanity check for deleted documents.
635			# this should not be necessary !@#$
636			for (keys %result) {
637			delete $result{$_} if $self->{deleted}->{$_}
638			}
639			%result;
640			}
641
642			sub hilight_positions {
643			my ($self, $attr, $text, $query, $raw) = @_;
644			my %pos;
645
646			if (defined $raw) {
647			for (@{$self->{inverted}->{$attr}}) {
648			my $name = $_->name;
649			if (exists $raw->{$name}) {
650			my %qt;
651			grep $qt{$_}++, @{$raw->{$name}};
652			for ($_->parse_pos($text)) {
653			if (exists $qt{$_->[0]}) {
654			$pos{$_->[1]} = max($pos{$_->[1]}, length($_->[0]));
655			}
656			}
657			}
658			}
659			}
660			if (defined $query) {
661			for (@{$self->{inverted}->{$attr}}) {
662			my %qt;
663
664			grep $qt{$_}++, $_->parse($query);
665			for ($_->parse_pos($text)) {
666			if (exists $qt{$_->[0]}) {
667			if (exists $pos{$_->[1]}) { # perl -w ;-)
668			$pos{$_->[1]} = max($pos{$_->[1]}, length($_->[0]));
669			} else {
670			$pos{$_->[1]} = length($_->[0]);
671			}
672			}
673			}
674			}
675			}
676
677			\%pos;
678			}
679
680			sub hilight {
681			my ($tb, $text, $query, $raw) = @_;
682			my $type = $tb->layout();
683			my @result;
684
685			$query \|\|= {};
686			$raw \|\|= {};
687			my @ttxt = $type->tag($text);
688			while (@ttxt) {
689			no strict 'refs';
690			my %tag = %{shift @ttxt};
691			my $txt = shift @ttxt;
692			my $fld;
693
694			my %hl;
695			for $fld (grep defined $tag{$_}, keys %$query, keys %$raw) {
696			my $hp = $tb->hilight_positions($fld, $txt,
697			$query->{$fld}, $raw->{$fld});
698			for (keys %$hp) {
699			if (exists $hl{$_}) { # -w ;-(
700			$hl{$_} = max($hl{$_}, $hp->{$_});
701			} else {
702			$hl{$_} = $hp->{$_};
703			}
704			}
705			}
706			my $pos;
707			my $qt = {_qt => 1, %tag};
708			my $pl = \%tag;
709			my $last = length($txt);
710			my @tmp;
711			for $pos (sort {$b <=> $a} keys %hl) {
712			unshift @tmp, $pl, substr($txt,$pos+$hl{$pos},$last-$pos-$hl{$pos});
713			unshift @tmp, $qt, substr($txt,$pos,$hl{$pos});
714			$last = $pos;
715			}
716			push @result, $pl, substr($txt,0,$last);
717			push @result, @tmp;
718			}
719			@result; # no speed necessary
720			}
721
722			1;
723