--- trunk/PlusPlus.pm 2004/12/03 19:35:02 4 +++ trunk/PlusPlus.pm 2004/12/05 22:24:09 21 @@ -4,14 +4,16 @@ use strict; use warnings; -our $VERSION = '0.02'; +our $VERSION = '0.10'; use Carp; use File::Temp qw/ tempdir /; +use BerkeleyDB; +#use YAML; =head1 NAME -SWISH::PlusPlus - Perl extension SWISH++ +SWISH::PlusPlus - Perl extension for full-text indexer SWISH++ with properties support =head1 SYNOPSIS @@ -21,32 +23,37 @@ =head1 DESCRIPTION This is perl module to use SWISH++ indexer by Paul J. Lucas. SWISH++ is -rewrite of swish-e in C++ which is extremly fast (thank to mmap), but without -support for properties (which this module tries to fix). - -Implementation of this module is crafted after L and it -should be easy to replace Plucene with this module for increased -performance. However, this module is not plug-in replacement. +rewrite of swish-e in C++ which is extremely fast (due to mmap usage and +clever language heuristics), but without support for properties (which this +module tries to fix). + +Implementation of API is something in-between C and +C. It should be easy to replace Plucene or swish-e with +this module for increased performance. However, this module is not plug-in +replacement. =head1 METHODS -=head2 open +=head2 new -Create new indexing object. +Create new instance for index. - my $i = SWISH::PlusPlus->open( + my $i = SWISH::PlusPlus->new( index_dir => '/path/to/index', index => 'index++', search => 'search++', + debug => 1, + meta_in_body => 1, + use_stopwords => 1, ); -Options to open are following: +Options are described below: =over 5 =item C -Path to directory in which index will be created. +Path to directory in which index and meta database will be created. =item C @@ -58,11 +65,26 @@ Full or partial path to SWISH++ search executable. By default, it's B. +=item C + +This option (off by default) will produce a lot of debugging output on +C prefixed by C<##>. 
+ +=item C + +This option (off by default) enables to search content of meta fields +without specifying them (like they are in body of document). This will +somewhat increase index size. + +=item C + +Use built-in SWISH++ stop words. By default, they are disabled. + =back =cut -sub open { +sub new { my $class = shift; my $self = {@_}; bless($self, $class); @@ -71,21 +93,35 @@ croak "need $_" unless $self->{$_}; } - if (! -e $self->{'index_dir'}) { - mkdir $self->{'index_dir'} || confess "can't create index ",$self->{'index'},": $!"; + my $index_dir = $self->{'index_dir'}; + + my $cwd; + chomp($cwd = `pwd`); + $self->{'cwd'} = $cwd || carp "can't get cwd!"; + + if ($index_dir !~ m#^/#) { + $index_dir = "$cwd/$index_dir"; + print STDERR "## full path to index_dir: $index_dir\n" if ($self->{'debug'}); + $self->{'index_dir'} = $index_dir; + } + + if (! -e $index_dir) { + mkdir $index_dir || confess "can't create index ",$self->{'index'},": $!"; } # default executables $self->{'index'} ||= 'index'; $self->{'search'} ||= 'search'; + print STDERR "## new index_dir: ",$index_dir," index: ",$self->{'index'}, " search: ",$self->{'search'},"\n" if ($self->{'debug'}); + $self ? return $self : return undef; } =head2 check_bin -Check if swish++ binaries specified in L are available and verify +Check if SWISH++ binaries specified in L are available and verify version signature. if ($i->check_bin) { @@ -96,7 +132,7 @@ $i->{'version'} -which you can examine to see version. +which you can examined to see numeric version (something like C<6.0.4>). =cut @@ -113,6 +149,7 @@ confess $self->{'search'}," binary is not SWISH++" unless ($s =~ m/^SWISH\+\+/); if ($i eq $s) { + $i =~ s/^SWISH\+\+\s+// || confess "can't strip SWISH++ from version"; $self->{'version'} = $i; return 1; } else { @@ -126,9 +163,12 @@ Quick way to add simple data to index. 
- $i->index_document($key, $data); + $i->index_document($path, $data); $i->index_document( 42 => 'meaning of life' ); +C<$path> value is really path, so you don't want to use directory +separators (slashes, /) in it probably. + =cut sub index_document { @@ -146,43 +186,202 @@ return 1; } +=head2 add + +Add document with meta-data to index. + + $i->add( + path => 'path/to/document', + title => 'this is result title', + meta => { + description => 'this is description meta tag', + date => '2004-11-04', + author => 'Dobrica Pavlinusic', + } + body => 'this is text without meta data', + ); + +This is thin wrapper round L<_create_doc>. + +=cut + +sub add { + my $self = shift; + + $self->_create_doc(@_); + + return 1; +} + +=head2 search + +Search your index using any valid SWISH++ query. + + my @results = $i->search("swish query"); + +Returns array with elements like this: + + { + rank => 10, # rank of result + path => 'path to result', # path to result + size => 999, # size in bytes + title => 'title of result' # title meta property + } + +=cut + +sub search { + my $self = shift; + + my $query = shift || return; + + $self->finish_update; + $self->_tie_meta_db(DB_RDONLY); + + my @results; + + # escape double quotes in query for shell + $query =~ s/"/\\"/g; + + my $open_cmd = $self->{'search'} . + ' -i ' . $self->{'index_dir'}.'/index' . + ' "' . $query . '"'. 
+ ' |'; + print STDERR "## search: $open_cmd\n" if ($self->{'debug'}); + + open(SEARCH, $open_cmd) || confess "can't start $open_cmd: $!"; + my $l; + while($l = ) { + next if ($l =~ /^#/); + chomp($l); + print STDERR "## $l\n" if ($self->{'debug'}); + my ($rank,$path,$size,$title) = split(/ /,$l,4); + $path =~ s#^\./##; # strip from path + push @results, { + rank => $rank, + path => $path, + size => $size, + title => $title, + } + } + + close(SEARCH) || confess "can't close search"; + + #print STDERR "## results: ",Dump(@results),"\n" if ($self->{'debug'}); + + return @results; +} + +=head2 property + +Return stored meta property from result or result path. + + print $i->property('path', 'title'); + print $i->property($res->{'path'}, 'title'); + +=cut + +sub property { + my $self = shift; + + my ($path,$meta) = @_; + + if ($path =~ m/^HASH/) { + $path = $path->{'path'} || confess "can't find path in input data"; + } + + my $val = $self->{'meta_db'}->{"$path-$meta"}; + + print STDERR "## property $path-$meta: ",($val || 'undef'),"\n" if ($self->{'debug'}); + return $val; +} + +=head2 finish_update + +This method will close index binary and enable search. Searching is not +available while indexing is in process. + + $i->finish_update; + +Usually, you don't need to call this method directly. It will be called on +DESTROY when $i goes out of scope or when you first call search in session +if indexing was started. + +=cut + +sub finish_update { + my $self = shift; + + print STDERR "## finish_update\n" if ($self->{'debug'}); + + $self->_close_index && $self->_untie_meta_db; +} + +sub DESTROY { + my $self = shift; + $self->finish_update; +} + =head1 PRIVATE METHODS -Private methods implement internals for creating temporary file needed for -swish++. You should have no need to call them directly, and they are here +Private methods implement internals for creating temporary files needed for +SWISH++. 
You should have no need to call them directly, and they are here just to have documentation. -=head2 _init_index +=head2 _init_indexer Create temporary directory in which files for indexing will be created and start index process. - my $i->_init_index || die "can't start indexer"; + my $i->_init_indexer || die "can't start indexer"; + +It will also create empty file C<_stopwords_> to disable stop words. =cut -sub _init_index { +sub _init_indexer { my $self = shift; - $self->{'tmp_dir'} = tempdir( CLEANUP => 1 ) || confess "can't create temporary directory: $!"; + return if ($self->{'_index_fh'}); + + my $tmp_dir = tempdir( CLEANUP => 1 ) || confess "can't create temporary directory: $!"; + $self->{'tmp_dir'} = $tmp_dir; - my $opt = "-v 4"; + chdir $tmp_dir || confess "can't chdir to ".$tmp_dir.": $!"; - my $open_cmd = '| index '.$opt.' -e "html:*" -i '.$self->{'index_dir'}.'/index -'; + print STDERR "## tmp_dir: $tmp_dir" if ($self->{'debug'}); - chdir $self->{'tmp_dir'} || confess "can't chdir to ".$self->{'tmp_dir'}.": $!"; + my $opt = "-v " . ($self->{'debug'} || '0'); + + unless ($self->{'use_stopwrods'}) { + open(STOP, '>', "_stopwords_") || carp "can't create empty stopword file, skipping\n"; + print STOP " "; + close(STOP); + $opt .= " -s _stopwords_"; + } - CORE::open($self->{'index_fh'}, $open_cmd) || confess "can't start index with $open_cmd: $!"; + my $index_dir = $self->{'index_dir'} || confess "no index_dir?"; - return $self->{'index_fh'}; + my $open_cmd = '| '.$self->{'index'}.' '.$opt.' 
-e "html:*" -i '.$index_dir.'/index -'; + + print STDERR "## init_indexer: $open_cmd\n" if ($self->{'debug'}); + + open($self->{'_index_fh'}, $open_cmd) || confess "can't start index with $open_cmd: $!"; + + chdir $self->{'cwd'} || confess "can't chdir to ".$self->{'cwd'}.": $!"; + + $self->_tie_meta_db(DB_CREATE); + + return $self->{'_index_fh'}; } =head2 _create_doc -Create temporary file and pass it's name to swish++ +Create temporary file and pass it's name to SWISH++ $i->_create_doc( path => 'path/to/store/in/index', + title => 'this is title in results', body => 'data to story in body tag', meta => { 'meta name' => 'data for this meta', @@ -190,6 +389,8 @@ } ); +To delete document, just omit body and meta data. + =cut sub _create_doc { @@ -198,23 +399,118 @@ my $arg = {@_}; # open indexer if needed - $self->{'index_fh'} ||= $self->_init_index; + $self->_init_indexer; my $path = $self->{'tmp_dir'} || confess "no tmp_dir?"; + my $id = $arg->{'path'} || confess "no path?"; + $path .= "/$id"; + + print STDERR "## _create_doc: $path\n" if ($self->{'debug'}); + + open(TMP, '>', $path) || die "can't create temp file $path: $!"; - CORE::open(TMP, '>', $arg->{'path'}) || die "can't create temp file ".$arg->{'path'}.": $!"; + print TMP ''; - print TMP ''; + $arg->{'body'} ||= ''; if ($arg->{'meta'}) { - confess "not yet implemented"; + foreach my $name (keys %{$arg->{'meta'}}) { + my $content = $arg->{'meta'}->{$name}; + print TMP qq{}; + $arg->{'body'} .= " $content" if ($self->{'meta_in_body'}); + $self->{'meta_db'}->{"$id-$name"} = $content; + } } - - print TMP '' . ($arg->{'body'} || '') . ''; + + my $title = $arg->{'title'}; + if (defined($title)) { + print TMP "$title"; + $arg->{'body'} .= " $title" if ($self->{'meta_in_body'}); + $self->{'meta_db'}->{"$id-title"} = $title; + } + + print TMP '' . $arg->{'body'} . 
''; close(TMP) || confess "can't close tmp file ".$arg->{'path'}.": $!"; - print { $self->{'index_fh'} } $arg->{'path'}."\n"; + print { $self->{'_index_fh'} } "$id\n"; +} + +=head2 _close_index + +Close index after indexing. + + $i->_close_index; + +You have to close index before searching. + +=cut + +sub _close_index { + my $self = shift; + + return unless ($self->{'_index_fh'}); + + print STDERR "## close index\n" if ($self->{'debug'}); + + close($self->{'_index_fh'}) || confess "can't close index: $!"; + undef $self->{'_index_fh'}; + + return 1; +} + +=head2 _tie_meta_db + +Open BerkeleyDB database with meta properties. + + $i->_tie_meta_db(DB_CREATE); + $i->_tie_meta_db(DB_RDONLY); + +} + +=cut + +sub _tie_meta_db { + my $self = shift; + + my $flags = shift || confess "need DB_CREATE or DB_RDONLY"; + + return if ($self->{'_meta_db_flags'} && $self->{'_meta_db_flags'} == $flags); + + print STDERR "## _tie_meta_db($flags)\n" if ($self->{'debug'}); + + $self->_untie_meta_db; + $self->{'_meta_db_flags'} = $flags; + + my $file = $self->{'index_dir'}.'/meta.db'; + + tie %{$self->{'meta_db'}}, "BerkeleyDB::Hash", + -Filename => $file, + -Flags => $flags + or confess "cannot open $file: $! $BerkeleyDB::Error\n" ; + + return 1; +} + +=head2 _untie_meta_db + +Close BerkeleyDB database with meta properties. + + $i->_untie_meta_db; + +=cut + +sub _untie_meta_db { + my $self = shift; + + return unless ($self->{'meta_db'}); + + print STDERR "## _untie_meta_db\n" if ($self->{'debug'}); + untie %{$self->{'meta_db'}} || confess "can't untie!"; + undef $self->{'meta_db'}; + undef $self->{'_meta_db_flags'}; + + return 1; } 1; @@ -228,26 +524,59 @@ =head2 Debian -Debian version of swish++ is often old (version 5 at moment of this writing +Debian version of SWISH++ is often old (version 5 at moment of this writing while version 6 is available in source code), so this module by default uses executable names B and B for self-compiled version -instead of one from Debian package. 
See L<open> how to specify Debian +instead of one from Debian package. See L<new> how to specify Debian default binaries B<index++> and B<search++>. -=head1 SEE ALSO +=head2 SWISH++ -Mention other useful documentation such as the documentation of -related modules or operating system documentation (such as man pages -in UNIX), or any relevant external documentation such as RFCs or -standards. +Aside from very good rewrite in C++, SWISH++ is faster because it uses +clever heuristics about which data in input files are words to index and +which are not. It's based on English language and might be best choice if +you plan to index large amount of long text documents. + +However, if you plan to index all data from structured storage (e.g. RDBMS) +you might want B<all> words from data to end up in index as opposed to just +those which look like English words. This is especially important if you +don't plan to index English texts with this module. + +With distribution build versions of SWISH++ you might have problems with +disappearing words. To overcome this problem, you will have to compile and +configure SWISH++ yourself (because language characteristics are +compilation-time option). + +Compilation of SWISH++ is easy process well described on project's web +pages. To see my very relaxed sample configuration take a look at C +directory included in distribution. + +=head2 SWISH++ config + +C located in C directory of this distribution is relaxed +SWISH++ configuration that will index all words passed to it. This +configuration is needed for B because default configuration +doesn't recognize 2004-12-05 as date. Keep in mind that your index size +might explode. + +=head1 BUGS + +Currently there is no way to specify which meta data will be stored as +properties. B. + +There is no garbage collection on temporary files created for SWISH++. This +means that one run of indexer will take additional disk space for temporary +files, which will be removed at end. 
There should be some way to remove +files after they are indexed by SWISH++. However, at this early stage of +development it's just not supported yet. Have plenty of disk space! -If you have a mailing list set up for your module, mention it here. +=head1 SEE ALSO -If you have a web site set up for your module, mention it here. +SWISH++ web site L =head1 AUTHOR -Dobrica Pavlinusic, Edpavlin@E +Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt> =head1 COPYRIGHT AND LICENSE