lib/WAIT/Table.pm

#                              -*- Mode: Cperl -*- 
# Table.pm -- 
# ITIID           : $ITI$ $Header $__Header$
# Author          : Ulrich Pfeifer
# Created On      : Thu Aug  8 13:05:10 1996
# Last Modified By: Ulrich Pfeifer
# Last Modified On: Sun May 30 20:42:30 1999
# Language        : CPerl
# Update Count    : 56
# Status          : Unknown, Use with caution!
#
# Copyright (c) 1996-1997, Ulrich Pfeifer
#

=head1 NAME

WAIT::Table -- Module for maintaining Tables / Relations

=head1 SYNOPSIS

  require WAIT::Table;

=head1 DESCRIPTION

=cut

package WAIT::Table;

use WAIT::Table::Handle ();
require WAIT::Parse::Base;

use strict;
use Carp;
# use autouse Carp => qw( croak($) );
use DB_File;
use Fcntl;

my $USE_RECNO = 0;

=head2 Creating a Table.

The constructor WAIT::Table-E<gt>new is normally called via the
create_table method of a database handle. This is not enforced, but
creating a table does not make any sense unless the table is
registered by the database because the latter implements persistence
of the meta data. Registering is done automatically by letting the
database handle the creation of a table.

  my $db = WAIT::Database->create(name => 'sample');
  my $tb = $db->create_table(name     => 'test',
                             access   => $access,
                             layout   => $layout,
                             attr     => ['docid', 'headline'],
                            );

The constructor returns a handle for the table. This handle is hidden by the
table module, to prevent direct access if called via Table.

=over 10

=item C<access> => I<accessobj>

A reference to an access object for the external parts (attributes) of
tuples. As you may remember, the WAIT System does not enforce that
objects are completely stored inside the system to avoid duplication.
There is no (strong) point in storing all your HTML documents inside
the system when indexing your WWW-Server.

The access object is designed to work like as a tied hash. You pass
the refernce to the object, not the tied hash though. An example
implementation of an access class that works for manpages is
WAIT::Document::Nroff.

The implementation needs to take into account that WAIT will keep this
object in a Data::Dumper or Storable database and re-use it when sman
is run. So it is not good enough if we can produce the index with it
now, when we create or actively access the table, WAIT also must be
able to retrieve documents on its own, when we are in a different
context. This happens specifically in a retrieval. To get this working
seemlessly, the access-defining class must implement a close method.
This method will be called before the Data::Dumper dump takes place.
In that moment the access-defining class must get rid of all data
structures that cannot be reconstructed via the Data::Dumper dump,
such as database handles or C pointers.

=item C<file> => I<fname>

The filename of the records file. Files for indexes will have I<fname>
as prefix. I<Mandatory>, but usually taken care of by the
WAIT::Database handle when the constructor is called via
WAIT::Database::create_table().

=item C<name> => I<name>

The name of this table. I<Mandatory>

=item C<attr> => [ I<attr> ... ]

A reference to an array of attribute names. WAIT will keep the
contents of these attributes in its table. I<Mandatory>

=item C<djk> => [ I<attr> ... ]

A reference to an array of attribute names which make up the
I<disjointness key>. Don't think about it - it's of no use yet;

=item C<layout> => I<layoutobj>

A reference to an external parser object. Defaults to a new instance
of C<WAIT::Parse::Base>. For an example implementation see
WAIT::Parse::Nroff. A layout class can be implemented as a singleton
class if you so like.

=item C<keyset> => I<keyset>

The set of attributes needed to identify a record. Defaults to all
attributes.

=item C<invindex> => I<inverted index>

A reference to an anon array defining attributes of each record that
need to be indexed. See the source of smakewhatis for how to set this
up.

=back

=cut

sub new {
  my $type = shift;
  my %parm = @_;
  my $self = {};

  # Check for mandatory attrs early
  $self->{name}     = $parm{name}     or croak "No name specified";
  $self->{attr}     = $parm{attr}     or croak "No attributes specified";

  # Do that before we eventually add '_weight' to attributes.
  $self->{keyset}   = $parm{keyset}   || [[@{$parm{attr}}]];

  $self->{mode}     = O_CREAT | O_RDWR;

  # Determine and set up subclass
  $type = ref($type) || $type;
  if (defined $parm{djk}) {
    if (@{$parm{djk}} == @{$parm{attr}}) {
      # All attributes in DK (sloppy test here!)
      $type .= '::Independent';
      require WAIT::Table::Independent;
    } else {
      $type .= '::Disjoint';
      require WAIT::Table::Disjoint;
    }
    # Add '_weight' to attributes
    my %attr;
    @attr{@{$parm{attr}}} = (1) x @{$parm{attr}};
    unshift @{$parm{attr}}, '_weight' unless $attr{'_weight'};
  }

  $self->{file}     = $parm{file}     or croak "No file specified";
  if (-d  $self->{file}){
    warn "Warning: Directory '$self->{file}' already exists\n";
  } elsif (!mkdir($self->{file}, 0775)) {
    croak "Could not 'mkdir $self->{file}': $!\n";
  }
  $self->{djk}      = $parm{djk}      if defined $parm{djk};
  $self->{layout}   = $parm{layout} || new WAIT::Parse::Base;
  $self->{access}   = $parm{access} if defined $parm{access};
  $self->{nextk}    = 1;        # next record to insert; first record unused
  $self->{deleted}  = {};       # no deleted records yet
  $self->{indexes}  = {};

  bless $self, $type;
  # Call create_index() and create_index() for compatibility
  for (@{$self->{keyset}||[]}) {
    #carp "Specification of indexes at table create time is deprecated";
    $self->create_index(@$_);
  }
  while (@{$parm{invindex}||[]}) {
    # carp "Specification of inverted indexes at table create time is deprecated";
    my $att  = shift @{$parm{invindex}};
    my @spec = @{shift @{$parm{invindex}}};
    my @opt;

    if (ref($spec[0])) {
      carp "Secondary pipelines are deprecated\n";
      @opt = %{shift @spec};
    }
    $self->create_inverted_index(attribute => $att, pipeline  => \@spec, @opt);
  }
  $self;
  # end of backwarn compatibility stuff
}

=head2 Creating an index

  $tb->create_index('docid');

=item C<create_index>

must be called with a list of attributes. This must be a subset of the
attributes specified when the table was created. Currently this
method must be called before the first tuple is inserted in the
table!

=cut

sub create_index {
  my $self= shift;

  croak "Cannot create index for table aready populated"
    if $self->{nextk} > 1;

  require WAIT::Index;

  my $name = join '-', @_;
  $self->{indexes}->{$name} =
    new WAIT::Index file => $self->{file}.'/'.$name, attr => $_;
}

=head2 Creating an inverted index

  $tb->create_inverted_index
    (attribute => 'au',
     pipeline  => ['detex', 'isotr', 'isolc', 'split2', 'stop'],
     predicate => 'plain',
    );

=over 5

=item C<attribute>

The attribute to build the index on. This attribute may not be in the
set attributes specified when the table was created.

=item C<pipeline>

A piplines specification is a reference to an array of method names
(from package C<WAIT::Filter>) which are to be applied in sequence to
the contents of the named attribute. The attribute name may not be in
the attribute list.

=item C<predicate>

An indication which predicate the index implements. This may be
e.g. 'plain', 'stemming' or 'soundex'. The indicator will be used for
query processing. Currently there is no standard set of predicate
names. The predicate defaults to the last member of the pipeline if
omitted.

=back

Currently this method must be called before the first tuple is
inserted in the table!

=cut

sub create_inverted_index {
  my $self  = shift;
  my %parm  = @_;

  croak "No attribute specified" unless $parm{attribute};
  croak "No pipeline specified"  unless $parm{pipeline};

  $parm{predicate} ||= $parm{pipeline}->[-1];

  croak "Cannot create index for table aready populated"
    if $self->{nextk} > 1;

  require WAIT::InvertedIndex;

  # backward compatibility stuff
  my %opt = %parm;
  for (qw(attribute pipeline predicate)) {
    delete $opt{$_};
  }

  my $name = join '_', ($parm{attribute}, @{$parm{pipeline}});
  my $idx = new WAIT::InvertedIndex(file   => $self->{file}.'/'.$name,
                                    filter => [@{$parm{pipeline}}], # clone
                                    name   => $name,
                                    attr   => $parm{attribute},
                                    %opt, # backward compatibility stuff
                                   );
  # We will have to use $parm{predicate} here
  push @{$self->{inverted}->{$parm{attribute}}}, $idx;
}

sub dir {
  $_[0]->{file};
}

=head2 C<$tb-E<gt>layout>

Returns the reference to the associated parser object.

=cut

sub layout { $_[0]->{layout} }

=head2 C<$tb-E<gt>fields>

Returns the array of attribute names.

=cut


sub fields { keys %{$_[0]->{inverted}}}

=head2 C<$tb-E<gt>drop>

Must be called via C<WAIT::Database::drop_table>

=cut

sub drop {
  my $self = shift;
  if ((caller)[0] eq 'WAIT::Database') { # database knows about this
    $self->close;               # just make sure
    my $file = $self->{file};

    for (values %{$self->{indexes}}) {
      $_->drop;
    }
    unlink "$file/records";
    ! (!-e $file or rmdir $file);
  } else {
    croak ref($self)."::drop called directly";
  }
}

sub mrequire ($) {
  my $module = shift;

  $module =~ s{::}{/}g;
  $module .= '.pm';
  require $module;
}

sub open {
  my $self = shift;
  my $file = $self->{file} . '/records';

  mrequire ref($self);           # that's tricky eh?
  if (defined $self->{'layout'}) {
    mrequire ref($self->{'layout'});
  }
  if (defined $self->{'access'}) {
    mrequire ref($self->{'access'});
  }
  if (exists $self->{indexes}) {
    require WAIT::Index;
    for (values %{$self->{indexes}}) {
      $_->{mode} = $self->{mode};
    }
  }
  if (exists $self->{inverted}) {
    my ($att, $idx);
    for $att (keys %{$self->{inverted}}) {
      for $idx (@{$self->{inverted}->{$att}}) {
        $idx->{mode} = $self->{mode};
      }
    }
    require WAIT::InvertedIndex;
  }
  unless (defined $self->{dbh}) {
    if ($USE_RECNO) {
      $self->{dbh} = tie(@{$self->{db}}, 'DB_File', $file,
                         $self->{mode}, 0664, $DB_RECNO);
    } else {
      $self->{dbh} =
        tie(%{$self->{db}}, 'DB_File', $file,
                         $self->{mode}, 0664, $DB_BTREE);
    }
  }
  $self;
}

sub fetch_extern {
  my $self  = shift;

  # print "#@_", $self->{'access'}->{Mode}, "\n"; # DEBUGGING?
  if (exists $self->{'access'}) {
    mrequire ref($self->{'access'});
    $self->{'access'}->FETCH(@_);
  }
}

sub fetch_extern_by_id {
  my $self  = shift;

  $self->fetch_extern($self->fetch(@_));
}

sub _find_index {
  my $self  = shift;
  my (@att) = @_;
  my %att;
  my $name;

  @att{@att} = @att;

  KEY: for $name (keys %{$self->{indexes}}) {
      my @iat = split /-/, $name;
      for (@iat) {
        next KEY unless exists $att{$_};
      }
      return $self->{indexes}->{$name};
    }
  return undef;
}

sub have {
  my $self  = shift;
  my %parm  = @_;

  my $index = $self->_find_index(keys %parm) or return; # no index-no have

  defined $self->{db} or $self->open;
  return $index->have(@_);
}

sub insert {
  my $self  = shift;
  my %parm  = @_;

  defined $self->{db} or $self->open;

  # We should move all writing methods to a subclass to check only once
  $self->{mode} & O_RDWR or croak "Cannot insert into table opened in RD_ONLY mode";

  my $tuple = join($;, map($parm{$_} || '', @{$self->{attr}}));
  my $key;
  my @deleted = keys %{$self->{deleted}};

  if (@deleted) {
    $key = pop @deleted;
    delete $self->{deleted}->{$key};
  } else {
    $key = $self->{nextk}++;
  }
  if ($USE_RECNO) {
    $self->{db}->[$key] = $tuple;
  } else {
    $self->{db}->{$key} = $tuple;
  }
  for (values %{$self->{indexes}}) {
    unless ($_->insert($key, %parm)) {
      # duplicate key, undo changes
      if ($key == $self->{nextk}-1) {
        $self->{nextk}--;
      } else {
        $self->{deleted}->{$key}=1;
      }
      my $idx;
      for $idx (values %{$self->{indexes}}) {
        last if $idx eq $_;
        $idx->remove($key, %parm);
      }
      return undef;
    }
  }
  if (defined $self->{inverted}) {
    my $att;
    for $att (keys %{$self->{inverted}}) {
      if (defined $parm{$att}) {
        map $_->insert($key, $parm{$att}), @{$self->{inverted}->{$att}};
        #map $_->sync, @{$self->{inverted}->{$att}}
      }
    }
  }
  $key
}

sub sync {
  my $self  = shift;

  for (values %{$self->{indexes}}) {
    map $_->sync, $_;
  }
  if (defined $self->{inverted}) {
    my $att;
    for $att (keys %{$self->{inverted}}) {
      map $_->sync, @{$self->{inverted}->{$att}}
    }
  }
}

sub fetch {
  my $self  = shift;
  my $key   = shift;

  return () if exists $self->{deleted}->{$key};

  defined $self->{db} or $self->open;
  if ($USE_RECNO) {
    $self->unpack($self->{db}->[$key]);
  } else {
    $self->unpack($self->{db}->{$key});
  }
}

sub delete_by_key {
  my $self  = shift;
  my $key   = shift;

  return $self->{deleted}->{$key} if defined $self->{deleted}->{$key};
  my %tuple = $self->fetch($key);
  for (values %{$self->{indexes}}) {
    $_->delete($key, %tuple);
  }
  if (defined $self->{inverted}) {
    # User *must* provide the full record for this or the entries
    # in the inverted index will not be removed
    %tuple = (%tuple, @_);
    my $att;
    for $att (keys %{$self->{inverted}}) {
      if (defined $tuple{$att}) {
        map $_->delete($key, $tuple{$att}), @{$self->{inverted}->{$att}}
      }
    }
  }
  ++$self->{deleted}->{$key};
}

sub delete {
  my $self  = shift;
  my $tkey = $self->have(@_);

  defined $tkey && $self->delete_by_key($tkey, @_);
}

sub unpack {
  my $self = shift;
  my $tuple = shift;

  my $att;
  my @result;
  my @tuple = split /$;/, $tuple;

  for $att (@{$self->{attr}}) {
    push @result, $att, shift @tuple;
  }
  @result;
}

sub close {
  my $self = shift;

  if (exists $self->{'access'}) {
    eval {$self->{'access'}->close}; # dont bother if not opened
  }
  for (values %{$self->{indexes}}) {
    $_->close();
  }
  if (defined $self->{inverted}) {
    my $att;
    for $att (keys %{$self->{inverted}}) {
      if ($] > 5.003) {         # avoid bug in perl up to 5.003_05
        my $idx;
        for $idx (@{$self->{inverted}->{$att}}) {
          $idx->close;
        }
      } else {
        map $_->close(), @{$self->{inverted}->{$att}};
      }
    }
  }
  if ($self->{dbh}) {
    delete $self->{dbh};

    if ($USE_RECNO) {
      untie @{$self->{db}};
    } else {
      untie %{$self->{db}};
    }
    delete $self->{db};
  }

  1;
}

sub DESTROY {
  my $self = shift;

  warn "Table handle destroyed without closing it first"
    if $self->{db} and $self->{mode}&O_RDWR;
}

sub open_scan {
  my $self = shift;
  my $code = shift;

  $self->{dbh} or $self->open;
  require WAIT::Scan;
  new WAIT::Scan $self, $self->{nextk}-1, $code;
}

sub open_index_scan {
  my $self = shift;
  my $attr = shift;
  my $code = shift;
  my $name = join '-', @$attr;

  if (defined $self->{indexes}->{$name}) {
    $self->{indexes}->{$name}->open_scan($code);
  } else {
    croak "No such index '$name'";
  }
}

eval {sub WAIT::Query::Raw::new} unless defined \&WAIT::Query::Raw::new;

sub prefix {
  my ($self , $attr, $prefix) = @_;
  my %result;

  defined $self->{db} or $self->open; # require layout

  for (@{$self->{inverted}->{$attr}}) {
    my $result = $_->prefix($prefix);
    if (defined $result) {
      $result{$_->name} = $result;
    }
  }
  bless \%result, 'WAIT::Query::Raw';
}

sub intervall {
  my ($self, $attr, $lb, $ub) = @_;
  my %result;

  defined $self->{db} or $self->open; # require layout

  for (@{$self->{inverted}->{$attr}}) {
    my $result = $_->intervall($lb, $ub);
    if (defined $result) {
      $result{$_->name} = $result;
    }
  }
  bless \%result, 'WAIT::Query::Raw';
}

sub search {
  my $self = shift;
  my $attr = shift;
  my $cont = shift;
  my $raw  = shift;
  my %result;

  defined $self->{db} or $self->open; # require layout

  if ($raw) {
    for (@{$self->{inverted}->{$attr}}) {
      my $name = $_->name;
      if (exists $raw->{$name} and @{$raw->{$name}}) {
        my $scale = 1/scalar(@{$raw->{$name}});
        my %r = $_->search_raw(@{$raw->{$name}});
        my ($key, $val);
        while (($key, $val) = each %r) {
          if (exists $result{$key}) {
            $result{$key} += $val*$scale;
          } else {
            $result{$key}  = $val*$scale;
          }
        }
      }
    }
  }
  if (defined $cont and $cont ne '') {
    for (@{$self->{inverted}->{$attr}}) {
      my %r = $_->search($cont);
      my ($key, $val);
      while (($key, $val) = each %r) {
        if (exists $result{$key}) {
          $result{$key} += $val;
        } else {
          $result{$key}  = $val;
        }
      }
    }
  }
  # sanity check for deleted documents.
  # this should not be necessary !@#$
  for (keys %result) {
    delete $result{$_} if $self->{deleted}->{$_}
  }
  %result;
}

sub hilight_positions {
  my ($self, $attr, $text, $query, $raw)  = @_;
  my %pos;

  if (defined $raw) {
    for (@{$self->{inverted}->{$attr}}) { # objects of type
                                          # WAIT::InvertedIndex for
                                          # this index field $attr
      my $name = $_->name;
      if (exists $raw->{$name}) {
        my %qt;
        grep $qt{$_}++, @{$raw->{$name}};
        for ($_->parse_pos($text)) {
          if (exists $qt{$_->[0]}) {
            $pos{$_->[1]} = max($pos{$_->[1]}, length($_->[0]));
          }
        }
      }
    }
  }
  if (defined $query) {
    for (@{$self->{inverted}->{$attr}}) {
      my %qt;

      grep $qt{$_}++, $_->parse($query);
      for ($_->parse_pos($text)) {
        if (exists $qt{$_->[0]}) {
          if (exists $pos{$_->[1]}) { # perl -w ;-)
            $pos{$_->[1]} = max($pos{$_->[1]}, length($_->[0]));
          } else {
            $pos{$_->[1]} = length($_->[0]);
          }
        }
      }
    }
  }

  \%pos;
}

sub hilight {
  my ($tb, $buf, $qplain, $qraw) = @_;
  my $layout = $tb->layout();

  my @result;

  $qplain ||= {};
  $qraw   ||= {};
  my @ttxt = $layout->tag($buf);
  while (@ttxt) {
    no strict 'refs';
    my %tag = %{shift @ttxt};
    my $txt = shift @ttxt;
    my $fld;

    my %hl;
    for $fld (grep defined $tag{$_}, keys %$qplain, keys %$qraw) {
      my $hp = $tb->hilight_positions($fld, $txt,
                                      $qplain->{$fld}, $qraw->{$fld});
      for (keys %$hp) {
        if (exists $hl{$_}) {   # -w ;-(
          $hl{$_} = max($hl{$_}, $hp->{$_});
        } else {
          $hl{$_} = $hp->{$_};
        }
      }
    }
    my $pos;
    my $qt = {_qt => 1, %tag};
    my $pl = \%tag;
    my $last = length($txt);
    my @tmp;
    for $pos (sort {$b <=> $a} keys %hl) {
      unshift @tmp, $pl, substr($txt,$pos+$hl{$pos},$last-$pos-$hl{$pos});
      unshift @tmp, $qt, substr($txt,$pos,$hl{$pos});
      $last = $pos;
    }
    push @result, $pl, substr($txt,0,$last);
    push @result, @tmp;
  }
  @result;                      # no speed necessary
}

1;
1	# -- Mode: Cperl --
2	# Table.pm --
3	# ITIID : $ITI$ $Header $__Header$
4	# Author : Ulrich Pfeifer
5	# Created On : Thu Aug 8 13:05:10 1996
6	# Last Modified By: Ulrich Pfeifer
7	# Last Modified On: Sun May 30 20:42:30 1999
8	# Language : CPerl
9	# Update Count : 56
10	# Status : Unknown, Use with caution!
11	#
12	# Copyright (c) 1996-1997, Ulrich Pfeifer
13	#
14
15	=head1 NAME
16
17	WAIT::Table -- Module for maintaining Tables / Relations
18
19	=head1 SYNOPSIS
20
21	require WAIT::Table;
22
23	=head1 DESCRIPTION
24
25	=cut
26
27	package WAIT::Table;
28
29	use WAIT::Table::Handle ();
30	require WAIT::Parse::Base;
31
32	use strict;
33	use Carp;
34	# use autouse Carp => qw( croak($) );
35	use DB_File;
36	use Fcntl;
37
38	my $USE_RECNO = 0;
39
40	=head2 Creating a Table.
41
42	The constructor WAIT::Table-E<gt>new is normally called via the
43	create_table method of a database handle. This is not enforced, but
44	creating a table does not make any sense unless the table is
45	registered by the database because the latter implements persistence
46	of the meta data. Registering is done automatically by letting the
47	database handle the creation of a table.
48
49	my $db = WAIT::Database->create(name => 'sample');
50	my $tb = $db->create_table(name => 'test',
51	access => $access,
52	layout => $layout,
53	attr => ['docid', 'headline'],
54	);
55
56	The constructor returns a handle for the table. This handle is hidden by the
57	table module, to prevent direct access if called via Table.
58
59	=over 10
60
61	=item C<access> => I<accessobj>
62
63	A reference to an access object for the external parts (attributes) of
64	tuples. As you may remember, the WAIT System does not enforce that
65	objects are completely stored inside the system to avoid duplication.
66	There is no (strong) point in storing all your HTML documents inside
67	the system when indexing your WWW-Server.
68
69	The access object is designed to work like as a tied hash. You pass
70	the refernce to the object, not the tied hash though. An example
71	implementation of an access class that works for manpages is
72	WAIT::Document::Nroff.
73
74	The implementation needs to take into account that WAIT will keep this
75	object in a Data::Dumper or Storable database and re-use it when sman
76	is run. So it is not good enough if we can produce the index with it
77	now, when we create or actively access the table, WAIT also must be
78	able to retrieve documents on its own, when we are in a different
79	context. This happens specifically in a retrieval. To get this working
80	seemlessly, the access-defining class must implement a close method.
81	This method will be called before the Data::Dumper dump takes place.
82	In that moment the access-defining class must get rid of all data
83	structures that cannot be reconstructed via the Data::Dumper dump,
84	such as database handles or C pointers.
85
86	=item C<file> => I<fname>
87
88	The filename of the records file. Files for indexes will have I<fname>
89	as prefix. I<Mandatory>, but usually taken care of by the
90	WAIT::Database handle when the constructor is called via
91	WAIT::Database::create_table().
92
93	=item C<name> => I<name>
94
95	The name of this table. I<Mandatory>
96
97	=item C<attr> => [ I<attr> ... ]
98
99	A reference to an array of attribute names. WAIT will keep the
100	contents of these attributes in its table. I<Mandatory>
101
102	=item C<djk> => [ I<attr> ... ]
103
104	A reference to an array of attribute names which make up the
105	I<disjointness key>. Don't think about it - it's of no use yet;
106
107	=item C<layout> => I<layoutobj>
108
109	A reference to an external parser object. Defaults to a new instance
110	of C<WAIT::Parse::Base>. For an example implementation see
111	WAIT::Parse::Nroff. A layout class can be implemented as a singleton
112	class if you so like.
113
114	=item C<keyset> => I<keyset>
115
116	The set of attributes needed to identify a record. Defaults to all
117	attributes.
118
119	=item C<invindex> => I<inverted index>
120
121	A reference to an anon array defining attributes of each record that
122	need to be indexed. See the source of smakewhatis for how to set this
123	up.
124
125	=back
126
127	=cut
128
129	sub new {
130	my $type = shift;
131	my %parm = @_;
132	my $self = {};
133
134	# Check for mandatory attrs early
135	$self->{name} = $parm{name} or croak "No name specified";
136	$self->{attr} = $parm{attr} or croak "No attributes specified";
137
138	# Do that before we eventually add '_weight' to attributes.
139	$self->{keyset} = $parm{keyset} \|\| [[@{$parm{attr}}]];
140
141	$self->{mode} = O_CREAT \| O_RDWR;
142
143	# Determine and set up subclass
144	$type = ref($type) \|\| $type;
145	if (defined $parm{djk}) {
146	if (@{$parm{djk}} == @{$parm{attr}}) {
147	# All attributes in DK (sloppy test here!)
148	$type .= '::Independent';
149	require WAIT::Table::Independent;
150	} else {
151	$type .= '::Disjoint';
152	require WAIT::Table::Disjoint;
153	}
154	# Add '_weight' to attributes
155	my %attr;
156	@attr{@{$parm{attr}}} = (1) x @{$parm{attr}};
157	unshift @{$parm{attr}}, '_weight' unless $attr{'_weight'};
158	}
159
160	$self->{file} = $parm{file} or croak "No file specified";
161	if (-d $self->{file}){
162	warn "Warning: Directory '$self->{file}' already exists\n";
163	} elsif (!mkdir($self->{file}, 0775)) {
164	croak "Could not 'mkdir $self->{file}': $!\n";
165	}
166	$self->{djk} = $parm{djk} if defined $parm{djk};
167	$self->{layout} = $parm{layout} \|\| new WAIT::Parse::Base;
168	$self->{access} = $parm{access} if defined $parm{access};
169	$self->{nextk} = 1; # next record to insert; first record unused
170	$self->{deleted} = {}; # no deleted records yet
171	$self->{indexes} = {};
172
173	bless $self, $type;
174	# Call create_index() and create_index() for compatibility
175	for (@{$self->{keyset}\|\|[]}) {
176	#carp "Specification of indexes at table create time is deprecated";
177	$self->create_index(@$_);
178	}
179	while (@{$parm{invindex}\|\|[]}) {
180	# carp "Specification of inverted indexes at table create time is deprecated";
181	my $att = shift @{$parm{invindex}};
182	my @spec = @{shift @{$parm{invindex}}};
183	my @opt;
184
185	if (ref($spec[0])) {
186	carp "Secondary pipelines are deprecated\n";
187	@opt = %{shift @spec};
188	}
189	$self->create_inverted_index(attribute => $att, pipeline => \@spec, @opt);
190	}
191	$self;
192	# end of backwarn compatibility stuff
193	}
194
195	=head2 Creating an index
196
197	$tb->create_index('docid');
198
199	=item C<create_index>
200
201	must be called with a list of attributes. This must be a subset of the
202	attributes specified when the table was created. Currently this
203	method must be called before the first tuple is inserted in the
204	table!
205
206	=cut
207
208	sub create_index {
209	my $self= shift;
210
211	croak "Cannot create index for table aready populated"
212	if $self->{nextk} > 1;
213
214	require WAIT::Index;
215
216	my $name = join '-', @_;
217	$self->{indexes}->{$name} =
218	new WAIT::Index file => $self->{file}.'/'.$name, attr => $_;
219	}
220
221	=head2 Creating an inverted index
222
223	$tb->create_inverted_index
224	(attribute => 'au',
225	pipeline => ['detex', 'isotr', 'isolc', 'split2', 'stop'],
226	predicate => 'plain',
227	);
228
229	=over 5
230
231	=item C<attribute>
232
233	The attribute to build the index on. This attribute may not be in the
234	set attributes specified when the table was created.
235
236	=item C<pipeline>
237
238	A piplines specification is a reference to an array of method names
239	(from package C<WAIT::Filter>) which are to be applied in sequence to
240	the contents of the named attribute. The attribute name may not be in
241	the attribute list.
242
243	=item C<predicate>
244
245	An indication which predicate the index implements. This may be
246	e.g. 'plain', 'stemming' or 'soundex'. The indicator will be used for
247	query processing. Currently there is no standard set of predicate
248	names. The predicate defaults to the last member of the pipeline if
249	omitted.
250
251	=back
252
253	Currently this method must be called before the first tuple is
254	inserted in the table!
255
256	=cut
257
258	sub create_inverted_index {
259	my $self = shift;
260	my %parm = @_;
261
262	croak "No attribute specified" unless $parm{attribute};
263	croak "No pipeline specified" unless $parm{pipeline};
264
265	$parm{predicate} \|\|= $parm{pipeline}->[-1];
266
267	croak "Cannot create index for table aready populated"
268	if $self->{nextk} > 1;
269
270	require WAIT::InvertedIndex;
271
272	# backward compatibility stuff
273	my %opt = %parm;
274	for (qw(attribute pipeline predicate)) {
275	delete $opt{$_};
276	}
277
278	my $name = join '_', ($parm{attribute}, @{$parm{pipeline}});
279	my $idx = new WAIT::InvertedIndex(file => $self->{file}.'/'.$name,
280	filter => [@{$parm{pipeline}}], # clone
281	name => $name,
282	attr => $parm{attribute},
283	%opt, # backward compatibility stuff
284	);
285	# We will have to use $parm{predicate} here
286	push @{$self->{inverted}->{$parm{attribute}}}, $idx;
287	}
288
289	sub dir {
290	$_[0]->{file};
291	}
292
293	=head2 C<$tb-E<gt>layout>
294
295	Returns the reference to the associated parser object.
296
297	=cut
298
299	sub layout { $_[0]->{layout} }
300
301	=head2 C<$tb-E<gt>fields>
302
303	Returns the array of attribute names.
304
305	=cut
306
307
308	sub fields { keys %{$_[0]->{inverted}}}
309
310	=head2 C<$tb-E<gt>drop>
311
312	Must be called via C<WAIT::Database::drop_table>
313
314	=cut
315
316	sub drop {
317	my $self = shift;
318	if ((caller)[0] eq 'WAIT::Database') { # database knows about this
319	$self->close; # just make sure
320	my $file = $self->{file};
321
322	for (values %{$self->{indexes}}) {
323	$_->drop;
324	}
325	unlink "$file/records";
326	! (!-e $file or rmdir $file);
327	} else {
328	croak ref($self)."::drop called directly";
329	}
330	}
331
332	sub mrequire ($) {
333	my $module = shift;
334
335	$module =~ s{::}{/}g;
336	$module .= '.pm';
337	require $module;
338	}
339
340	sub open {
341	my $self = shift;
342	my $file = $self->{file} . '/records';
343
344	mrequire ref($self); # that's tricky eh?
345	if (defined $self->{'layout'}) {
346	mrequire ref($self->{'layout'});
347	}
348	if (defined $self->{'access'}) {
349	mrequire ref($self->{'access'});
350	}
351	if (exists $self->{indexes}) {
352	require WAIT::Index;
353	for (values %{$self->{indexes}}) {
354	$_->{mode} = $self->{mode};
355	}
356	}
357	if (exists $self->{inverted}) {
358	my ($att, $idx);
359	for $att (keys %{$self->{inverted}}) {
360	for $idx (@{$self->{inverted}->{$att}}) {
361	$idx->{mode} = $self->{mode};
362	}
363	}
364	require WAIT::InvertedIndex;
365	}
366	unless (defined $self->{dbh}) {
367	if ($USE_RECNO) {
368	$self->{dbh} = tie(@{$self->{db}}, 'DB_File', $file,
369	$self->{mode}, 0664, $DB_RECNO);
370	} else {
371	$self->{dbh} =
372	tie(%{$self->{db}}, 'DB_File', $file,
373	$self->{mode}, 0664, $DB_BTREE);
374	}
375	}
376	$self;
377	}
378
379	sub fetch_extern {
380	my $self = shift;
381
382	# print "#@_", $self->{'access'}->{Mode}, "\n"; # DEBUGGING?
383	if (exists $self->{'access'}) {
384	mrequire ref($self->{'access'});
385	$self->{'access'}->FETCH(@_);
386	}
387	}
388
389	sub fetch_extern_by_id {
390	my $self = shift;
391
392	$self->fetch_extern($self->fetch(@_));
393	}
394
395	sub _find_index {
396	my $self = shift;
397	my (@att) = @_;
398	my %att;
399	my $name;
400
401	@att{@att} = @att;
402
403	KEY: for $name (keys %{$self->{indexes}}) {
404	my @iat = split /-/, $name;
405	for (@iat) {
406	next KEY unless exists $att{$_};
407	}
408	return $self->{indexes}->{$name};
409	}
410	return undef;
411	}
412
413	sub have {
414	my $self = shift;
415	my %parm = @_;
416
417	my $index = $self->_find_index(keys %parm) or return; # no index-no have
418
419	defined $self->{db} or $self->open;
420	return $index->have(@_);
421	}
422
423	sub insert {
424	my $self = shift;
425	my %parm = @_;
426
427	defined $self->{db} or $self->open;
428
429	# We should move all writing methods to a subclass to check only once
430	$self->{mode} & O_RDWR or croak "Cannot insert into table opened in RD_ONLY mode";
431
432	my $tuple = join($;, map($parm{$_} \|\| '', @{$self->{attr}}));
433	my $key;
434	my @deleted = keys %{$self->{deleted}};
435
436	if (@deleted) {
437	$key = pop @deleted;
438	delete $self->{deleted}->{$key};
439	} else {
440	$key = $self->{nextk}++;
441	}
442	if ($USE_RECNO) {
443	$self->{db}->[$key] = $tuple;
444	} else {
445	$self->{db}->{$key} = $tuple;
446	}
447	for (values %{$self->{indexes}}) {
448	unless ($_->insert($key, %parm)) {
449	# duplicate key, undo changes
450	if ($key == $self->{nextk}-1) {
451	$self->{nextk}--;
452	} else {
453	$self->{deleted}->{$key}=1;
454	}
455	my $idx;
456	for $idx (values %{$self->{indexes}}) {
457	last if $idx eq $_;
458	$idx->remove($key, %parm);
459	}
460	return undef;
461	}
462	}
463	if (defined $self->{inverted}) {
464	my $att;
465	for $att (keys %{$self->{inverted}}) {
466	if (defined $parm{$att}) {
467	map $_->insert($key, $parm{$att}), @{$self->{inverted}->{$att}};
468	#map $_->sync, @{$self->{inverted}->{$att}}
469	}
470	}
471	}
472	$key
473	}
474
475	sub sync {
476	my $self = shift;
477
478	for (values %{$self->{indexes}}) {
479	map $_->sync, $_;
480	}
481	if (defined $self->{inverted}) {
482	my $att;
483	for $att (keys %{$self->{inverted}}) {
484	map $_->sync, @{$self->{inverted}->{$att}}
485	}
486	}
487	}
488
489	sub fetch {
490	my $self = shift;
491	my $key = shift;
492
493	return () if exists $self->{deleted}->{$key};
494
495	defined $self->{db} or $self->open;
496	if ($USE_RECNO) {
497	$self->unpack($self->{db}->[$key]);
498	} else {
499	$self->unpack($self->{db}->{$key});
500	}
501	}
502
503	sub delete_by_key {
504	my $self = shift;
505	my $key = shift;
506
507	return $self->{deleted}->{$key} if defined $self->{deleted}->{$key};
508	my %tuple = $self->fetch($key);
509	for (values %{$self->{indexes}}) {
510	$_->delete($key, %tuple);
511	}
512	if (defined $self->{inverted}) {
513	# User must provide the full record for this or the entries
514	# in the inverted index will not be removed
515	%tuple = (%tuple, @_);
516	my $att;
517	for $att (keys %{$self->{inverted}}) {
518	if (defined $tuple{$att}) {
519	map $_->delete($key, $tuple{$att}), @{$self->{inverted}->{$att}}
520	}
521	}
522	}
523	++$self->{deleted}->{$key};
524	}
525
526	sub delete {
527	my $self = shift;
528	my $tkey = $self->have(@_);
529
530	defined $tkey && $self->delete_by_key($tkey, @_);
531	}
532
533	sub unpack {
534	my $self = shift;
535	my $tuple = shift;
536
537	my $att;
538	my @result;
539	my @tuple = split /$;/, $tuple;
540
541	for $att (@{$self->{attr}}) {
542	push @result, $att, shift @tuple;
543	}
544	@result;
545	}
546
547	sub close {
548	my $self = shift;
549
550	if (exists $self->{'access'}) {
551	eval {$self->{'access'}->close}; # dont bother if not opened
552	}
553	for (values %{$self->{indexes}}) {
554	$_->close();
555	}
556	if (defined $self->{inverted}) {
557	my $att;
558	for $att (keys %{$self->{inverted}}) {
559	if ($] > 5.003) { # avoid bug in perl up to 5.003_05
560	my $idx;
561	for $idx (@{$self->{inverted}->{$att}}) {
562	$idx->close;
563	}
564	} else {
565	map $_->close(), @{$self->{inverted}->{$att}};
566	}
567	}
568	}
569	if ($self->{dbh}) {
570	delete $self->{dbh};
571
572	if ($USE_RECNO) {
573	untie @{$self->{db}};
574	} else {
575	untie %{$self->{db}};
576	}
577	delete $self->{db};
578	}
579
580	1;
581	}
582
583	sub DESTROY {
584	my $self = shift;
585
586	warn "Table handle destroyed without closing it first"
587	if $self->{db} and $self->{mode}&O_RDWR;
588	}
589
590	sub open_scan {
591	my $self = shift;
592	my $code = shift;
593
594	$self->{dbh} or $self->open;
595	require WAIT::Scan;
596	new WAIT::Scan $self, $self->{nextk}-1, $code;
597	}
598
599	sub open_index_scan {
600	my $self = shift;
601	my $attr = shift;
602	my $code = shift;
603	my $name = join '-', @$attr;
604
605	if (defined $self->{indexes}->{$name}) {
606	$self->{indexes}->{$name}->open_scan($code);
607	} else {
608	croak "No such index '$name'";
609	}
610	}
611
612	eval {sub WAIT::Query::Raw::new} unless defined \&WAIT::Query::Raw::new;
613
614	sub prefix {
615	my ($self , $attr, $prefix) = @_;
616	my %result;
617
618	defined $self->{db} or $self->open; # require layout
619
620	for (@{$self->{inverted}->{$attr}}) {
621	my $result = $_->prefix($prefix);
622	if (defined $result) {
623	$result{$_->name} = $result;
624	}
625	}
626	bless \%result, 'WAIT::Query::Raw';
627	}
628
629	sub intervall {
630	my ($self, $attr, $lb, $ub) = @_;
631	my %result;
632
633	defined $self->{db} or $self->open; # require layout
634
635	for (@{$self->{inverted}->{$attr}}) {
636	my $result = $_->intervall($lb, $ub);
637	if (defined $result) {
638	$result{$_->name} = $result;
639	}
640	}
641	bless \%result, 'WAIT::Query::Raw';
642	}
643
644	sub search {
645	my $self = shift;
646	my $attr = shift;
647	my $cont = shift;
648	my $raw = shift;
649	my %result;
650
651	defined $self->{db} or $self->open; # require layout
652
653	if ($raw) {
654	for (@{$self->{inverted}->{$attr}}) {
655	my $name = $_->name;
656	if (exists $raw->{$name} and @{$raw->{$name}}) {
657	my $scale = 1/scalar(@{$raw->{$name}});
658	my %r = $_->search_raw(@{$raw->{$name}});
659	my ($key, $val);
660	while (($key, $val) = each %r) {
661	if (exists $result{$key}) {
662	$result{$key} += $val*$scale;
663	} else {
664	$result{$key} = $val*$scale;
665	}
666	}
667	}
668	}
669	}
670	if (defined $cont and $cont ne '') {
671	for (@{$self->{inverted}->{$attr}}) {
672	my %r = $_->search($cont);
673	my ($key, $val);
674	while (($key, $val) = each %r) {
675	if (exists $result{$key}) {
676	$result{$key} += $val;
677	} else {
678	$result{$key} = $val;
679	}
680	}
681	}
682	}
683	# sanity check for deleted documents.
684	# this should not be necessary !@#$
685	for (keys %result) {
686	delete $result{$_} if $self->{deleted}->{$_}
687	}
688	%result;
689	}
690
691	sub hilight_positions {
692	my ($self, $attr, $text, $query, $raw) = @_;
693	my %pos;
694
695	if (defined $raw) {
696	for (@{$self->{inverted}->{$attr}}) { # objects of type
697	# WAIT::InvertedIndex for
698	# this index field $attr
699	my $name = $_->name;
700	if (exists $raw->{$name}) {
701	my %qt;
702	grep $qt{$_}++, @{$raw->{$name}};
703	for ($_->parse_pos($text)) {
704	if (exists $qt{$_->[0]}) {
705	$pos{$_->[1]} = max($pos{$_->[1]}, length($_->[0]));
706	}
707	}
708	}
709	}
710	}
711	if (defined $query) {
712	for (@{$self->{inverted}->{$attr}}) {
713	my %qt;
714
715	grep $qt{$_}++, $_->parse($query);
716	for ($_->parse_pos($text)) {
717	if (exists $qt{$_->[0]}) {
718	if (exists $pos{$_->[1]}) { # perl -w ;-)
719	$pos{$_->[1]} = max($pos{$_->[1]}, length($_->[0]));
720	} else {
721	$pos{$_->[1]} = length($_->[0]);
722	}
723	}
724	}
725	}
726	}
727
728	\%pos;
729	}
730
731	sub hilight {
732	my ($tb, $buf, $qplain, $qraw) = @_;
733	my $layout = $tb->layout();
734
735	my @result;
736
737	$qplain \|\|= {};
738	$qraw \|\|= {};
739	my @ttxt = $layout->tag($buf);
740	while (@ttxt) {
741	no strict 'refs';
742	my %tag = %{shift @ttxt};
743	my $txt = shift @ttxt;
744	my $fld;
745
746	my %hl;
747	for $fld (grep defined $tag{$_}, keys %$qplain, keys %$qraw) {
748	my $hp = $tb->hilight_positions($fld, $txt,
749	$qplain->{$fld}, $qraw->{$fld});
750	for (keys %$hp) {
751	if (exists $hl{$_}) { # -w ;-(
752	$hl{$_} = max($hl{$_}, $hp->{$_});
753	} else {
754	$hl{$_} = $hp->{$_};
755	}
756	}
757	}
758	my $pos;
759	my $qt = {_qt => 1, %tag};
760	my $pl = \%tag;
761	my $last = length($txt);
762	my @tmp;
763	for $pos (sort {$b <=> $a} keys %hl) {
764	unshift @tmp, $pl, substr($txt,$pos+$hl{$pos},$last-$pos-$hl{$pos});
765	unshift @tmp, $qt, substr($txt,$pos,$hl{$pos});
766	$last = $pos;
767	}
768	push @result, $pl, substr($txt,0,$last);
769	push @result, @tmp;
770	}
771	@result; # no speed necessary
772	}
773
774	1;