--- branches/CPAN/lib/WAIT/Table.pm 2000/04/28 15:41:10 11 +++ branches/CPAN/lib/WAIT/Table.pm 2000/05/09 11:29:45 19 @@ -1,16 +1,16 @@ -# -*- Mode: Perl -*- +# -*- Mode: Cperl -*- # Table.pm -- # ITIID : $ITI$ $Header $__Header$ # Author : Ulrich Pfeifer # Created On : Thu Aug 8 13:05:10 1996 # Last Modified By: Ulrich Pfeifer -# Last Modified On: Sun Nov 22 18:44:37 1998 +# Last Modified On: Mon May 8 20:20:58 2000 # Language : CPerl -# Update Count : 51 +# Update Count : 131 # Status : Unknown, Use with caution! -# +# # Copyright (c) 1996-1997, Ulrich Pfeifer -# +# =head1 NAME @@ -25,47 +25,71 @@ =cut package WAIT::Table; + +use WAIT::Table::Handle (); require WAIT::Parse::Base; + use strict; use Carp; +# use autouse Carp => qw( croak($) ); use DB_File; use Fcntl; +use LockFile::Simple (); my $USE_RECNO = 0; =head2 Creating a Table. -The constructor WAIT::Table-new is normally called via the +The constructor WAIT::Table-Enew is normally called via the create_table method of a database handle. This is not enforced, but -creating a table doesn not make any sense unless the table is +creating a table does not make any sense unless the table is registered by the database because the latter implements persistence of the meta data. Registering is done automatically by letting the -database handle create a table. +database handle the creation of a table. - my $db = create WAIT::Database name => 'sample'; - my $tb = $db->create_table (name => 'test', - attr => ['docid', 'headline'], - layout => $layout, - access => $access, - ); + my $db = WAIT::Database->create(name => 'sample'); + my $tb = $db->create_table(name => 'test', + access => $access, + layout => $layout, + attr => ['docid', 'headline'], + ); The constructor returns a handle for the table. This handle is hidden by the table module, to prevent direct access if called via Table. =over 10 -=item C => I +=item C => I -A reference to a acces object for the external parts (attributes) of +A reference to an access object for the external parts (attributes) of tuples. As you may remember, the WAIT System does not enforce that objects are completely stored inside the system to avoid duplication. -There is no (strong) point in storing all you HTML-Documents inside +There is no (strong) point in storing all your HTML documents inside the system when indexing your WWW-Server. +The access object is designed to work like as a tied hash. You pass +the refernce to the object, not the tied hash though. An example +implementation of an access class that works for manpages is +WAIT::Document::Nroff. + +The implementation needs to take into account that WAIT will keep this +object in a Data::Dumper or Storable database and re-use it when sman +is run. So it is not good enough if we can produce the index with it +now, when we create or actively access the table, WAIT also must be +able to retrieve documents on its own, when we are in a different +context. This happens specifically in a retrieval. To get this working +seemlessly, the access-defining class must implement a close method. +This method will be called before the Data::Dumper dump takes place. +In that moment the access-defining class must get rid of all data +structures that cannot be reconstructed via the Data::Dumper dump, +such as database handles or C pointers. + =item C => I The filename of the records file. Files for indexes will have I -as prefix. I +as prefix. I, but usually taken care of by the +WAIT::Database handle when the constructor is called via +WAIT::Database::create_table(). =item C => I @@ -73,21 +97,31 @@ =item C => [ I ... ] -A reference to an array of attribute names. I +A reference to an array of attribute names. WAIT will keep the +contents of these attributes in its table. I =item C => [ I ... ] A reference to an array of attribute names which make up the -I. Don't think about it - i's of no use yet; +I. Don't think about it - it's of no use yet; =item C => I -A reference to an external parser object. Defaults to anew instance of -C +A reference to an external parser object. Defaults to a new instance +of C. For an example implementation see +WAIT::Parse::Nroff. A layout class can be implemented as a singleton +class if you so like. -=item C => I +=item C => I -A reference to a acces object for the external parts of tuples. +The set of attributes needed to identify a record. Defaults to all +attributes. + +=item C => I + +A reference to an anon array defining attributes of each record that +need to be indexed. See the source of smakewhatis for how to set this +up. =back @@ -98,9 +132,15 @@ my %parm = @_; my $self = {}; + # Check for mandatory attrs early + $self->{name} = $parm{name} or croak "No name specified"; + $self->{attr} = $parm{attr} or croak "No attributes specified"; + # Do that before we eventually add '_weight' to attributes. $self->{keyset} = $parm{keyset} || [[@{$parm{attr}}]]; + $self->{mode} = O_CREAT | O_RDWR; + # Determine and set up subclass $type = ref($type) || $type; if (defined $parm{djk}) { @@ -119,11 +159,17 @@ } $self->{file} = $parm{file} or croak "No file specified"; - if (-d $self->{file} or !mkdir($self->{file}, 0775)) { + if (-d $self->{file}){ + warn "Warning: Directory '$self->{file}' already exists\n"; + } elsif (!mkdir($self->{file}, 0775)) { croak "Could not 'mkdir $self->{file}': $!\n"; } - $self->{name} = $parm{name} or croak "No name specified"; - $self->{attr} = $parm{attr} or croak "No attributes specified"; + + my $lockmgr = LockFile::Simple->make(-autoclean => 1); + # aquire a write lock + $self->{write_lock} = $lockmgr->lock($self->{file} . '/write') + or die "Can't lock '$self->{file}/write'"; + $self->{djk} = $parm{djk} if defined $parm{djk}; $self->{layout} = $parm{layout} || new WAIT::Parse::Base; $self->{access} = $parm{access} if defined $parm{access}; @@ -142,13 +188,14 @@ my $att = shift @{$parm{invindex}}; my @spec = @{shift @{$parm{invindex}}}; my @opt; - + if (ref($spec[0])) { carp "Secondary pipelines are deprecated\n"; @opt = %{shift @spec}; } $self->create_inverted_index(attribute => $att, pipeline => \@spec, @opt); } + $self; # end of backwarn compatibility stuff } @@ -168,12 +215,12 @@ sub create_index { my $self= shift; - + croak "Cannot create index for table aready populated" if $self->{nextk} > 1; - + require WAIT::Index; - + my $name = join '-', @_; $self->{indexes}->{$name} = new WAIT::Index file => $self->{file}.'/'.$name, attr => $_; @@ -196,17 +243,17 @@ =item C -A piplines specification is a reference to and array of method names -(from package C) which are to applied in sequence to the -contents of the named attribute. The attribute name may not be in the -attribute list. +A piplines specification is a reference to an array of method names +(from package C) which are to be applied in sequence to +the contents of the named attribute. The attribute name may not be in +the attribute list. =item C An indication which predicate the index implements. This may be e.g. 'plain', 'stemming' or 'soundex'. The indicator will be used for query processing. Currently there is no standard set of predicate -names. The predicate defaults to the last member of the ppline if +names. The predicate defaults to the last member of the pipeline if omitted. =back @@ -224,10 +271,10 @@ croak "No pipeline specified" unless $parm{pipeline}; $parm{predicate} ||= $parm{pipeline}->[-1]; - + croak "Cannot create index for table aready populated" if $self->{nextk} > 1; - + require WAIT::InvertedIndex; # backward compatibility stuff @@ -235,7 +282,7 @@ for (qw(attribute pipeline predicate)) { delete $opt{$_}; } - + my $name = join '_', ($parm{attribute}, @{$parm{pipeline}}); my $idx = new WAIT::InvertedIndex(file => $self->{file}.'/'.$name, filter => [@{$parm{pipeline}}], # clone @@ -284,6 +331,7 @@ $_->drop; } unlink "$file/records"; + # $self->unlock; ! (!-e $file or rmdir $file); } else { croak ref($self)."::drop called directly"; @@ -334,13 +382,64 @@ $self->{mode}, 0664, $DB_BTREE); } } + + # Locking + # + # We allow multiple readers to coexists. But write access excludes + # all read access vice versa. In practice read access on tables + # open for writing will mostly work ;-) + + my $lockmgr = LockFile::Simple->make(-autoclean => 1); + + # aquire a write lock. We might hold one acquired in create() already + $self->{write_lock} ||= $lockmgr->lock($self->{file} . '/write') + or die "Can't lock '$self->{file}/write'"; + + my $lockdir = $self->{file} . '/read'; + unless (-d $lockdir) { + mkdir $lockdir, 0755 or die "Could not mkdir $lockdir: $!"; + } + + if ($self->{mode} & O_RDWR) { + # this is a hack. We do not check for reopening ... + return $self if $self->{write_lock}; + + # If we actually want to write we must check if there are any readers + opendir DIR, $lockdir or + die "Could not opendir '$lockdir': $!"; + for my $lockfile (grep { -f "$lockdir/$_" } readdir DIR) { + # check if the locks are still valid. + # Since we are protected by a write lock, we could use a pline file. + # But we want to use the stale testing from LockFile::Simple. + if (my $lck = $lockmgr->trylock("$lockdir/$lockfile")) { + warn "Removing stale lockfile '$lockdir/$lockfile'"; + $lck->release; + } else { + $self->{write_lock}->release; + die "Cannot write table '$file' while it's in use"; + } + } + } else { + # this is a hack. We do not check for reopening ... + return $self if $self->{read_lock}; + + # We are a reader. So we release the write lock + my $id = time; + while (-f "$lockdir/$id.lock") { # here assume ".lock" format! + $id++; + } + $self->{read_lock} = $lockmgr->lock("$lockdir/$id"); + $self->{write_lock}->release; + delete $self->{write_lock}; + } + $self; } sub fetch_extern { my $self = shift; - print "#@_", $self->{'access'}->{Mode}, "\n"; + # print "#@_", $self->{'access'}->{Mode}, "\n"; # DEBUGGING? if (exists $self->{'access'}) { mrequire ref($self->{'access'}); $self->{'access'}->FETCH(@_); @@ -358,7 +457,7 @@ my (@att) = @_; my %att; my $name; - + @att{@att} = @att; KEY: for $name (keys %{$self->{indexes}}) { @@ -375,8 +474,8 @@ my $self = shift; my %parm = @_; - my $index = $self->_find_index(keys %parm); - croak "No index found" unless $index; + my $index = $self->_find_index(keys %parm) or return; # no index-no have + defined $self->{db} or $self->open; return $index->have(@_); } @@ -387,14 +486,28 @@ defined $self->{db} or $self->open; + # We should move all writing methods to a subclass to check only once + $self->{mode} & O_RDWR or croak "Cannot insert into table opened in RD_ONLY mode"; + my $tuple = join($;, map($parm{$_} || '', @{$self->{attr}})); my $key; my @deleted = keys %{$self->{deleted}}; + my $gotkey = 0; if (@deleted) { $key = pop @deleted; delete $self->{deleted}->{$key}; + # Sanity check + if ($key && $key>0) { + $gotkey=1; } else { + warn(sprintf("WAIT database inconsistency during insert ". + "key[%s]: Please rebuild index\n", + $key + )); + } + } + unless ($gotkey) { $key = $self->{nextk}++; } if ($USE_RECNO) { @@ -408,6 +521,7 @@ if ($key == $self->{nextk}-1) { $self->{nextk}--; } else { + # warn "setting key[$key] deleted during insert"; $self->{deleted}->{$key}=1; } my $idx; @@ -416,7 +530,7 @@ $idx->remove($key, %parm); } return undef; - } + } } if (defined $self->{inverted}) { my $att; @@ -432,7 +546,7 @@ sub sync { my $self = shift; - + for (values %{$self->{indexes}}) { map $_->sync, $_; } @@ -449,7 +563,7 @@ my $key = shift; return () if exists $self->{deleted}->{$key}; - + defined $self->{db} or $self->open; if ($USE_RECNO) { $self->unpack($self->{db}->[$key]); @@ -462,6 +576,11 @@ my $self = shift; my $key = shift; + unless ($key) { + Carp::cluck "Warning: delete_by_key called without key. Looks like a bug in WAIT?"; + return; + } + return $self->{deleted}->{$key} if defined $self->{deleted}->{$key}; my %tuple = $self->fetch($key); for (values %{$self->{indexes}}) { @@ -478,19 +597,21 @@ } } } + # warn "setting key[$key] deleted during delete_by_key"; ++$self->{deleted}->{$key}; } sub delete { my $self = shift; my $tkey = $self->have(@_); - + # warn "tkey[$tkey]\@_[@_]"; defined $tkey && $self->delete_by_key($tkey, @_); } sub unpack { my $self = shift; my $tuple = shift; + return unless defined $tuple; my $att; my @result; @@ -502,6 +623,24 @@ @result; } +sub set { + my ($self, $iattr, $value) = @_; + + return unless $self->{write_lock}; + for my $att (keys %{$self->{inverted}}) { + if ($] > 5.003) { # avoid bug in perl up to 5.003_05 + my $idx; + for $idx (@{$self->{inverted}->{$att}}) { + $idx->set($iattr, $value); + } + } else { + map $_->set($iattr, $value), @{$self->{inverted}->{$att}}; + } + } + + 1; +} + sub close { my $self = shift; @@ -509,6 +648,7 @@ eval {$self->{'access'}->close}; # dont bother if not opened } for (values %{$self->{indexes}}) { + require WAIT::Index; $_->close(); } if (defined $self->{inverted}) { @@ -535,9 +675,36 @@ delete $self->{db}; } + $self->unlock; + 1; } +sub unlock { + my $self = shift; + + # Either we have a read or a write lock (or we close the table already) + # unless ($self->{read_lock} || $self->{write_lock}) { + # warn "WAIT::Table::unlock: Table aparently hold's no lock" + # } + if ($self->{write_lock}) { + $self->{write_lock}->release(); + delete $self->{write_lock}; + } + if ($self->{read_lock}) { + $self->{read_lock}->release(); + delete $self->{read_lock}; + } + +} + +sub DESTROY { + my $self = shift; + + warn "Table handle destroyed without closing it first" + if $self->{write_lock} || $self->{read_lock}; +} + sub open_scan { my $self = shift; my $code = shift; @@ -593,10 +760,27 @@ } sub search { - my $self = shift; - my $attr = shift; - my $cont = shift; - my $raw = shift; + my $self = shift; + my ($query, $attr, $cont, $raw); + if (ref $_[0]) { + $query = shift; + + $attr = $query->{attr}; + $cont = $query->{cont}; + $raw = $query->{raw}; + } else { + require Carp; + Carp::cluck("Using three argument search interface is deprecated, use hashref interface instead"); + $attr = shift; + $cont = shift; + $raw = shift; + $query = { + attr => $attr, + cont => $cont, + raw => $raw, + }; + } + my %result; defined $self->{db} or $self->open; # require layout @@ -606,7 +790,7 @@ my $name = $_->name; if (exists $raw->{$name} and @{$raw->{$name}}) { my $scale = 1/scalar(@{$raw->{$name}}); - my %r = $_->search_raw(@{$raw->{$name}}); + my %r = $_->search_raw($query, @{$raw->{$name}}); my ($key, $val); while (($key, $val) = each %r) { if (exists $result{$key}) { @@ -620,7 +804,7 @@ } if (defined $cont and $cont ne '') { for (@{$self->{inverted}->{$attr}}) { - my %r = $_->search($cont); + my %r = $_->search($query, $cont); my ($key, $val); while (($key, $val) = each %r) { if (exists $result{$key}) { @@ -644,7 +828,9 @@ my %pos; if (defined $raw) { - for (@{$self->{inverted}->{$attr}}) { + for (@{$self->{inverted}->{$attr}}) { # objects of type + # WAIT::InvertedIndex for + # this index field $attr my $name = $_->name; if (exists $raw->{$name}) { my %qt; @@ -678,13 +864,14 @@ } sub hilight { - my ($tb, $text, $query, $raw) = @_; - my $type = $tb->layout(); + my ($tb, $buf, $qplain, $qraw) = @_; + my $layout = $tb->layout(); + my @result; - $query ||= {}; - $raw ||= {}; - my @ttxt = $type->tag($text); + $qplain ||= {}; + $qraw ||= {}; + my @ttxt = $layout->tag($buf); while (@ttxt) { no strict 'refs'; my %tag = %{shift @ttxt}; @@ -692,9 +879,9 @@ my $fld; my %hl; - for $fld (grep defined $tag{$_}, keys %$query, keys %$raw) { + for $fld (grep defined $tag{$_}, keys %$qplain, keys %$qraw) { my $hp = $tb->hilight_positions($fld, $txt, - $query->{$fld}, $raw->{$fld}); + $qplain->{$fld}, $qraw->{$fld}); for (keys %$hp) { if (exists $hl{$_}) { # -w ;-( $hl{$_} = max($hl{$_}, $hp->{$_}); @@ -720,4 +907,3 @@ } 1; -