--- trunk/PlusPlus.pm 2004/12/05 15:35:53 14 +++ trunk/PlusPlus.pm 2004/12/05 22:24:09 21 @@ -4,15 +4,16 @@ use strict; use warnings; -our $VERSION = '0.06'; +our $VERSION = '0.10'; use Carp; use File::Temp qw/ tempdir /; +use BerkeleyDB; #use YAML; =head1 NAME -SWISH::PlusPlus - Perl extension SWISH++ +SWISH::PlusPlus - Perl extension for full-text indexer SWISH++ with properties support =head1 SYNOPSIS @@ -22,18 +23,20 @@ =head1 DESCRIPTION This is perl module to use SWISH++ indexer by Paul J. Lucas. SWISH++ is -rewrite of swish-e in C++ which is extremly fast (thank to mmap), but without -support for properties (which this module tries to fix). - -Implementation of this module is crafted after L and it -should be easy to replace Plucene with this module for increased -performance. However, this module is not plug-in replacement. +rewrite of swish-e in C++ which is extremely fast (due to mmap usage and +clever language heuristics), but without support for properties (which this +module tries to fix). + +Implementation of API is something in-between C and +C. It should be easy to replace Plucene or swish-e with +this module for increased performance. However, this module is not plug-in +replacement. =head1 METHODS =head2 new -Create new indexing object. +Create new instance for index. my $i = SWISH::PlusPlus->new( index_dir => '/path/to/index', @@ -44,13 +47,13 @@ use_stopwords => 1, ); -Options to new are following: +Options are described below: =over 5 =item C -Path to directory in which index will be created. +Path to directory in which index and meta database will be created. =item C @@ -70,8 +73,8 @@ =item C This option (off by default) enables to search content of meta fields -without specifing them (like they are in body of document). This will -somewhat increate index size. +without specifying them (like they are in body of document). This will +somewhat increase index size. =item C @@ -118,7 +121,7 @@ =head2 check_bin -Check if swish++ binaries specified in L are available and verify +Check if SWISH++ binaries specified in L are available and verify version signature. if ($i->check_bin) { @@ -129,7 +132,7 @@ $i->{'version'} -which you can examine to see version. +which you can examined to see numeric version (something like C<6.0.4>). =cut @@ -160,9 +163,12 @@ Quick way to add simple data to index. - $i->index_document($key, $data); + $i->index_document($path, $data); $i->index_document( 42 => 'meaning of life' ); +C<$path> value is really path, so you don't want to use directory +separators (slashes, /) in it probably. + =cut sub index_document { @@ -182,7 +188,7 @@ =head2 add -Add document with metadata to index. +Add document with meta-data to index. $i->add( path => 'path/to/document', @@ -206,13 +212,21 @@ return 1; } + =head2 search -Search your index. +Search your index using any valid SWISH++ query. + + my @results = $i->search("swish query"); - my @results = $i->search("swhish query"); +Returns array with elements like this: -Returns array with result IDs. + { + rank => 10, # rank of result + path => 'path to result', # path to result + size => 999, # size in bytes + title => 'title of result' # title meta property + } =cut @@ -222,21 +236,27 @@ my $query = shift || return; $self->finish_update; + $self->_tie_meta_db(DB_RDONLY); my @results; # escape double quotes in query for shell $query =~ s/"/\\"/g; - my $open_cmd = $self->{'search'}." -i ".$self->{'index_dir'}.'/index "'.$query.'" |'; - print STDERR "## search $open_cmd\n" if ($self->{'debug'}); + my $open_cmd = $self->{'search'} . + ' -i ' . $self->{'index_dir'}.'/index' . + ' "' . $query . '"'. + ' |'; + print STDERR "## search: $open_cmd\n" if ($self->{'debug'}); open(SEARCH, $open_cmd) || confess "can't start $open_cmd: $!"; - while() { - next if (/^#/); - chomp; - print STDERR "## $_\n" if ($self->{'debug'}); - my ($rank,$path,$size,$title) = split(/ /,$_,4); + my $l; + while($l = ) { + next if ($l =~ /^#/); + chomp($l); + print STDERR "## $l\n" if ($self->{'debug'}); + my ($rank,$path,$size,$title) = split(/ /,$l,4); + $path =~ s#^\./##; # strip from path push @results, { rank => $rank, path => $path, @@ -252,13 +272,40 @@ return @results; } +=head2 property + +Return stored meta property from result or result path. + + print $i->property('path', 'title'); + print $i->property($res->{'path'}, 'title'); + +=cut + +sub property { + my $self = shift; + + my ($path,$meta) = @_; + + if ($path =~ m/^HASH/) { + $path = $path->{'path'} || confess "can't find path in input data"; + } + + my $val = $self->{'meta_db'}->{"$path-$meta"}; + + print STDERR "## property $path-$meta: ",($val || 'undef'),"\n" if ($self->{'debug'}); + return $val; +} + =head2 finish_update -This method will close index. +This method will close index binary and enable search. Searching is not +available while indexing is in process. $i->finish_update; -It will be called on DESTROY when $i goes out of scope. +Usually, you don't need to call this method directly. It will be called on +DESTROY when $i goes out of scope or when you first call search in session +if indexing was started. =cut @@ -267,7 +314,7 @@ print STDERR "## finish_update\n" if ($self->{'debug'}); - $self->_close_index; + $self->_close_index && $self->_untie_meta_db; } sub DESTROY { @@ -277,8 +324,8 @@ =head1 PRIVATE METHODS -Private methods implement internals for creating temporary file needed for -swish++. You should have no need to call them directly, and they are here +Private methods implement internals for creating temporary files needed for +SWISH++. You should have no need to call them directly, and they are here just to have documentation. =head2 _init_indexer @@ -313,7 +360,9 @@ $opt .= " -s _stopwords_"; } - my $open_cmd = '| '.$self->{'index'}.' '.$opt.' -e "html:*" -i '.$self->{'index_dir'}.'/index -'; + my $index_dir = $self->{'index_dir'} || confess "no index_dir?"; + + my $open_cmd = '| '.$self->{'index'}.' '.$opt.' -e "html:*" -i '.$index_dir.'/index -'; print STDERR "## init_indexer: $open_cmd\n" if ($self->{'debug'}); @@ -321,12 +370,14 @@ chdir $self->{'cwd'} || confess "can't chdir to ".$self->{'cwd'}.": $!"; + $self->_tie_meta_db(DB_CREATE); + return $self->{'_index_fh'}; } =head2 _create_doc -Create temporary file and pass it's name to swish++ +Create temporary file and pass it's name to SWISH++ $i->_create_doc( path => 'path/to/store/in/index', @@ -351,7 +402,8 @@ $self->_init_indexer; my $path = $self->{'tmp_dir'} || confess "no tmp_dir?"; - $path .= '/' . $arg->{'path'}; + my $id = $arg->{'path'} || confess "no path?"; + $path .= "/$id"; print STDERR "## _create_doc: $path\n" if ($self->{'debug'}); @@ -366,19 +418,22 @@ my $content = $arg->{'meta'}->{$name}; print TMP qq{}; $arg->{'body'} .= " $content" if ($self->{'meta_in_body'}); + $self->{'meta_db'}->{"$id-$name"} = $content; } } - if (defined($arg->{'title'})) { - print TMP '' . ($arg->{'title'} || '') . ''; - $arg->{'body'} .= " ".$arg->{'title'} if ($self->{'meta_in_body'}); + my $title = $arg->{'title'}; + if (defined($title)) { + print TMP "$title"; + $arg->{'body'} .= " $title" if ($self->{'meta_in_body'}); + $self->{'meta_db'}->{"$id-title"} = $title; } print TMP '' . $arg->{'body'} . ''; close(TMP) || confess "can't close tmp file ".$arg->{'path'}.": $!"; - print { $self->{'_index_fh'} } $arg->{'path'}."\n"; + print { $self->{'_index_fh'} } "$id\n"; } =head2 _close_index @@ -398,8 +453,64 @@ print STDERR "## close index\n" if ($self->{'debug'}); - close($self->{'_index_fh'}); + close($self->{'_index_fh'}) || confess "can't close index: $!"; undef $self->{'_index_fh'}; + + return 1; +} + +=head2 _tie_meta_db + +Open BerkeleyDB database with meta properties. + + $i->_tie_meta_db(DB_CREATE); + $i->_tie_meta_db(DB_RDONLY); + +} + +=cut + +sub _tie_meta_db { + my $self = shift; + + my $flags = shift || confess "need DB_CREATE or DB_RDONLY"; + + return if ($self->{'_meta_db_flags'} && $self->{'_meta_db_flags'} == $flags); + + print STDERR "## _tie_meta_db($flags)\n" if ($self->{'debug'}); + + $self->_untie_meta_db; + $self->{'_meta_db_flags'} = $flags; + + my $file = $self->{'index_dir'}.'/meta.db'; + + tie %{$self->{'meta_db'}}, "BerkeleyDB::Hash", + -Filename => $file, + -Flags => $flags + or confess "cannot open $file: $! $BerkeleyDB::Error\n" ; + + return 1; +} + +=head2 _untie_meta_db + +Close BerkeleyDB database with meta properties. + + $i->_untie_meta_db; + +=cut + +sub _untie_meta_db { + my $self = shift; + + return unless ($self->{'meta_db'}); + + print STDERR "## _untie_meta_db\n" if ($self->{'debug'}); + untie %{$self->{'meta_db'}} || confess "can't untie!"; + undef $self->{'meta_db'}; + undef $self->{'_meta_db_flags'}; + + return 1; } 1; @@ -413,7 +524,7 @@ =head2 Debian -Debian version of swish++ is often old (version 5 at moment of this writing +Debian version of SWISH++ is often old (version 5 at moment of this writing while version 6 is available in source code), so this module by default uses executable names B and B for self-compiled version instead of one from Debian package. See L how to specify Debian @@ -421,10 +532,10 @@ =head2 SWISH++ -Aside from very good rewrite in C++, SWISH++ is fatster because it has +Aside from very good rewrite in C++, SWISH++ is faster because it uses claver heuristics about which data in input files are words to index and which are not. It's based on English language and might be best choice if -you plan to install large amount of long text documents. +you plan to index large amount of long text documents. However, if you plan to index all data from structured storage (e.g. RDBMS) you might want B words from data to end up in index as opposed to just @@ -432,7 +543,7 @@ don't plan to index English texts with this module. With distribution build versions of SWISH++ you might have problems with -disepearing words. To overcome this problem, you will have to compile and +disapearing words. To overcome this problem, you will have to compile and configure SWISH++ yourself (because language characteristics are compilation-time option). @@ -448,9 +559,20 @@ doesn't recognize 2004-12-05 as date. Have in mind that your index size might explode. +=head1 BUGS + +Currently there is no way to specify which meta data will be stored as +properties. B. + +There is no garbage collection on temporary files created for SWISH++. This +means that one run of indexer will take additional disk space for temporary +files, which will be removed at end. There should be some way to remove +files after they are indexed by SWISH++. However, at this early stage of +development it's just not supported yet. Have plenty of disk space! + =head1 SEE ALSO -C web site L +SWISH++ web site L =head1 AUTHOR