--- trunk/PlusPlus.pm 2004/12/05 22:24:09 21 +++ trunk/PlusPlus.pm 2004/12/07 16:05:43 22 @@ -4,12 +4,13 @@ use strict; use warnings; -our $VERSION = '0.10'; +our $VERSION = '0.20'; use Carp; use File::Temp qw/ tempdir /; use BerkeleyDB; -#use YAML; +use Storable qw(store retrieve freeze thaw); +use YAML; =head1 NAME @@ -18,7 +19,13 @@ =head1 SYNOPSIS use SWISH::PlusPlus; - blah blah blah + + my $i = new SWISH::PlusPlus( + index_dir => '/tmp/foo', + ); + $i->add( 42 => 'meaning of life' ); + + print $i->search("meaning"); # returns 42 =head1 DESCRIPTION @@ -164,7 +171,10 @@ Quick way to add simple data to index. $i->index_document($path, $data); - $i->index_document( 42 => 'meaning of life' ); + $i->index_document( + 42 => 'meaning of life', + 1984 => 'Oh!', + ); C<$path> value is really path, so you don't want to use directory separators (slashes, /) in it probably. @@ -208,11 +218,40 @@ sub add { my $self = shift; - $self->_create_doc(@_); + return $self->_create_doc(@_); +} - return 1; + +=head2 delete + +Delete document from index. + + $i->delete("document/path"); + +If deletion is successful, returns revision of deleted document, otherwise +undef. + +=cut + +sub delete { + my $self = shift; + + my $path = shift || carp "empty path?"; + + print STDERR "## delete: $path\n" if ($self->{'debug'}); + + my $rev = $self->{'meta_db'}->{"R$path"}; + if ($rev) { + $self->{'_deleted'}->{$path} = $rev; + $self->{'_deleted_counter'}++; + print STDERR "## deleted revision $rev, counter: ",$self->{'_deleted_counter'}++,"\n" if ($self->{'debug'}); + return $rev; + } + + return undef; } + =head2 search Search your index using any valid SWISH++ query. 
@@ -249,20 +288,31 @@ ' |'; print STDERR "## search: $open_cmd\n" if ($self->{'debug'}); + my %r; + open(SEARCH, $open_cmd) || confess "can't start $open_cmd: $!"; my $l; while($l = ) { next if ($l =~ /^#/); chomp($l); print STDERR "## $l\n" if ($self->{'debug'}); - my ($rank,$path,$size,$title) = split(/ /,$l,4); + my ($rank,$path,$size,$rev,$title) = split(/ /,$l,5); $path =~ s#^\./##; # strip from path + + # get current revision + $r{$path} = $self->{'meta_db'}->{"R$path"}; + + # skip if old revision + next if ($r{$path} > $rev); + + print STDERR "## current revision $rev\n" if ($self->{'debug'}); + push @results, { rank => $rank, path => $path, size => $size, title => $title, - } + } unless ($self->{'_deleted'}->{$path} && $self->{'_deleted'}->{$path} <= $rev); } close(SEARCH) || confess "can't close search"; @@ -276,23 +326,37 @@ Return stored meta property from result or result path. - print $i->property('path', 'title'); - print $i->property($res->{'path'}, 'title'); + print $i->property('path', 'meta name'); + print $i->property($res->{'path'}, 'meta name'); + print $i->property('path'); + print $i->property($res->{'path'}); + +Returns one meta property (if meta name is specified) or whole hash with +all meta properties. =cut sub property { my $self = shift; - my ($path,$meta) = @_; + my $path = shift || return; + my $meta = shift; if ($path =~ m/^HASH/) { $path = $path->{'path'} || confess "can't find path in input data"; } - my $val = $self->{'meta_db'}->{"$path-$meta"}; + my $val = $self->{'meta_db'}->{"M$path"}; + + # FIXME should we die here like swish-e does? 
+ return unless ($val); + + $val = thaw($val); + + print STDERR "## property $path $meta: ",(Dump($val) || 'undef'),"\n" if ($self->{'debug'}); + + return $val->{$meta} if ($meta); - print STDERR "## property $path-$meta: ",($val || 'undef'),"\n" if ($self->{'debug'}); return $val; } @@ -349,10 +413,21 @@ chdir $tmp_dir || confess "can't chdir to ".$tmp_dir.": $!"; - print STDERR "## tmp_dir: $tmp_dir" if ($self->{'debug'}); + print STDERR "## tmp_dir: $tmp_dir\n" if ($self->{'debug'}); my $opt = "-v " . ($self->{'debug'} || '0'); + my $index_dir = $self->{'index_dir'} || confess "no index_dir?"; + my $index_file = $index_dir . '/index'; + + if (-e $index_file && ! -z $index_file) { + $opt .= ' -I '; + $self->{'_incremental'} = 1; + print STDERR "## using incremental indexing for $index_file\n" if ($self->{'debug'}); + } else { + $self->{'_incremental'} = 0; + } + unless ($self->{'use_stopwrods'}) { open(STOP, '>', "_stopwords_") || carp "can't create empty stopword file, skipping\n"; print STOP " "; @@ -360,9 +435,7 @@ $opt .= " -s _stopwords_"; } - my $index_dir = $self->{'index_dir'} || confess "no index_dir?"; - - my $open_cmd = '| '.$self->{'index'}.' '.$opt.' -e "html:*" -i '.$index_dir.'/index -'; + my $open_cmd = '| '.$self->{'index'}.' '.$opt.' -e "html:*" -i '.$index_file.' -'; print STDERR "## init_indexer: $open_cmd\n" if ($self->{'debug'}); @@ -389,8 +462,6 @@ } ); -To delete document, just omit body and meta data. 
- =cut sub _create_doc { @@ -405,35 +476,53 @@ my $id = $arg->{'path'} || confess "no path?"; $path .= "/$id"; - print STDERR "## _create_doc: $path\n" if ($self->{'debug'}); + my $rev = $self->{'rev'}++; + + print STDERR "## _create_doc: $path [$rev]\n" if ($self->{'debug'}); open(TMP, '>', $path) || die "can't create temp file $path: $!"; print TMP ''; - $arg->{'body'} ||= ''; + my $body = $arg->{'body'}; + + if (defined($body)) { + $self->{'meta_db'}->{"B$id"} = $body; + } else { + $body = ''; + } + + my $title = $arg->{'title'}; if ($arg->{'meta'}) { foreach my $name (keys %{$arg->{'meta'}}) { my $content = $arg->{'meta'}->{$name}; print TMP qq{}; - $arg->{'body'} .= " $content" if ($self->{'meta_in_body'}); - $self->{'meta_db'}->{"$id-$name"} = $content; + $body .= " $content" if ($self->{'meta_in_body'}); } + $arg->{'meta'}->{'title'} = $title; + $self->{'meta_db'}->{"M$id"} = freeze($arg->{'meta'}); } - my $title = $arg->{'title'}; if (defined($title)) { - print TMP "$title"; - $arg->{'body'} .= " $title" if ($self->{'meta_in_body'}); - $self->{'meta_db'}->{"$id-title"} = $title; + $title = "$rev $title"; + $body .= " $title" if ($self->{'meta_in_body'}); + } else { + $title = "$rev $id"; } - print TMP '' . $arg->{'body'} . 
''; + # dump html + print TMP "$title$body"; close(TMP) || confess "can't close tmp file ".$arg->{'path'}.": $!"; - print { $self->{'_index_fh'} } "$id\n"; + print { $self->{'_index_fh'} } "$id\n" || confess "can't pass document $id to indexer: $!"; + + $self->{'meta_db'}->{"R$id"} = $rev; + + # FIXME this is probably not the right place to update global + # maximum revision, but it keeps database in sane state + $self->{'meta_db'}->{"Crev"} = $rev; } =head2 _close_index @@ -449,6 +538,8 @@ sub _close_index { my $self = shift; + $self->_store_deleted; + return unless ($self->{'_index_fh'}); print STDERR "## close index\n" if ($self->{'debug'}); @@ -456,6 +547,11 @@ close($self->{'_index_fh'}) || confess "can't close index: $!"; undef $self->{'_index_fh'}; + if ($self->{'_incremental'}) { + print STDERR "## move new index over old\n" if ($self->{'debug'}); + rename $self->{'index_dir'}.'/index.new',$self->{'index_dir'}.'/index' || die "can't move new index over old one: $!"; + } + return 1; } @@ -489,6 +585,18 @@ -Flags => $flags or confess "cannot open $file: $! $BerkeleyDB::Error\n" ; + $self->{'rev'} = $self->{'meta_db'}->{'Crev'} || 0; + + my $delref = $self->{'meta_db'}->{'Cdeleted'}; + if ($delref) { + $self->{'_deleted'} = thaw($delref); + + print "## deleted ",keys %{$self->{'_deleted'}}," records\n" if ($self->{'debug'}); + } else { + $self->{'_deleted'} = {}; + } + + $self->{'_deleted_counter'} = 0; return 1; } @@ -513,6 +621,33 @@ return 1; } + +=head2 _store_deleted + +Save hash of deleted files using L<Storable>. + + $i->_store_deleted; + +=cut + +sub _store_deleted { + my $self = shift; + + return if (! $self->{'_deleted_counter'}); + + print STDERR "## save deleted ",Dump($self->{'_deleted'}) if ($self->{'debug'}); + + my $d = freeze($self->{'_deleted'}); + + $self->_tie_meta_db(DB_CREATE); + + $self->{'meta_db'}->{'Cdeleted'} = $d || + carp "can't store deleted: $!"; + + # reset counter + $self->{'_deleted_counter'} = 0; +} + 1; __END__