--- trunk/PlusPlus.pm 2004/12/03 15:23:23 3 +++ trunk/PlusPlus.pm 2004/12/05 12:48:00 10 @@ -4,9 +4,11 @@ use strict; use warnings; -our $VERSION = '0.02'; +our $VERSION = '0.04'; use Carp; +use File::Temp qw/ tempdir /; +#use YAML; =head1 NAME @@ -29,17 +31,20 @@ =head1 METHODS -=head2 open +=head2 new Create new indexing object. - my $i = SWISH::PlusPlus->open( + my $i = SWISH::PlusPlus->new( index_dir => '/path/to/index', index => 'index++', search => 'search++', + debug => 1, + meta_in_body => 1, + use_stopwords => 1, ); -Options to open are following: +Options to new are following: =over 5 @@ -57,11 +62,26 @@ Full or partial path to SWISH++ search executable. By default, it's B<search>. +=item C<debug> + +This option (off by default) will produce a lot of debugging output on +C<STDERR> prefixed by C<##>. + +=item C<meta_in_body> + +This option (off by default) enables searching the content of meta fields +without specifying them (as if they were in the body of the document). This will +somewhat increase index size. + +=item C<use_stopwords> + +Use built-in SWISH++ stop words. By default, they are disabled. + =back =cut -sub open { +sub new { my $class = shift; my $self = {@_}; bless($self, $class); @@ -78,13 +98,15 @@ $self->{'index'} ||= 'index'; $self->{'search'} ||= 'search'; + print STDERR "## new index_dir: ",$self->{'index_dir'}," index: ",$self->{'index'}, " search: ",$self->{'search'},"\n" if ($self->{'debug'}); + $self ? return $self : return undef; } =head2 check_bin -Check if swish++ binaries specified in L<open> are available and verify +Check if swish++ binaries specified in L<new> are available and verify version signature. if ($i->check_bin) { @@ -121,6 +143,215 @@ } +=head2 index_document + +Quick way to add simple data to index. 
=pod

    $i->index_document($key, $data);
    $i->index_document( 42 => 'meaning of life' );

Every key/value pair is handed to L<_create_doc> as one document: the key
is used as document path and the value as its body. Returns 1.

=cut

sub index_document {
    my ($self, %body_for) = @_;

    # one document per id => text pair
    foreach my $id (keys %body_for) {
        $self->_create_doc(
            path => $id,
            body => $body_for{$id},
        );
    }

    return 1;
}

=head2 add

Add document with metadata to index.

    $i->add(
        path => 'path/to/document',
        title => 'this is result title',
        meta => {
            description => 'this is description meta tag',
            date => '2004-11-04',
            author => 'Dobrica Pavlinusic',
        },
        body => 'this is text without meta data',
    );

This is thin wrapper round L<_create_doc>. Returns 1.

=cut

sub add {
    my ($self, @doc) = @_;

    $self->_create_doc(@doc);

    return 1;
}

=head2 search

Search your index.

    my @results = $i->search("swish query");

Returns list of hash references, one for each result line printed by the
search executable, with keys C<rank>, C<path>, C<size> and C<title>.
Returns nothing if query is missing or false.

Index is closed (see L<_close_index>) before search executable is started.
Dies with stack trace if search executable can't be started or its pipe
can't be closed.

=cut

sub search {
    my $self = shift;

    my $query = shift || return;

    $self->_close_index;

    # Run search executable without shell (list form of pipe open), so
    # query is passed as single argument exactly as given. Building a
    # shell command string and escaping only double quotes would still let
    # shell expand $, backticks and backslashes found in query.
    my @cmd = (
        $self->{'search'},
        '-i', $self->{'index_dir'}.'/index',
        $query,
    );
    print STDERR "## search @cmd\n" if ($self->{'debug'});

    open(my $search_fh, '-|', @cmd) || confess("can't start @cmd: $!");

    my @results;

    while (my $line = <$search_fh>) {
        # lines beginning with # are not results
        next if ($line =~ /^#/);
        chomp($line);
        print STDERR "## $line\n" if ($self->{'debug'});
        # title is last field and may contain spaces, so split in 4 at most
        my ($rank, $path, $size, $title) = split(/ /, $line, 4);
        push @results, {
            rank => $rank,
            path => $path,
            size => $size,
            title => $title,
        };
    }

    close($search_fh) || confess("can't close search");

    return @results;
}

=head1 PRIVATE METHODS

Private methods implement internals for creating temporary file needed for
swish++. You should have no need to call them directly, and they are here
just to have documentation.

=head2 _init_indexer

Create temporary directory in which files for indexing will be created and
start index process.
=pod

    $i->_init_indexer || die "can't start indexer";

It will also create empty file C<_stopwords_> to disable stop words, unless
C<use_stopwords> option is set.

Current working directory of process is changed to temporary directory.
Returns file handle of pipe to index process (also kept in
C<< $self->{'index_fh'} >>) and dies with stack trace if temporary
directory or index process can't be set up.

=cut

sub _init_indexer {
    my $self = shift;

    $self->{'tmp_dir'} = tempdir( CLEANUP => 1 ) || confess("can't create temporary directory: $!");

    chdir($self->{'tmp_dir'}) || confess("can't chdir to ".$self->{'tmp_dir'}.": $!");

    # NOTE(review): index_dir is used as-is after chdir, so relative
    # index_dir would now point inside temporary directory -- confirm that
    # it is made absolute elsewhere.
    my @cmd = ($self->{'index'}, '-v', '4');

    # Documented option is use_stopwords. Misspelled key use_stopwrods is
    # the one this sub used to test, so it is still honoured.
    unless ($self->{'use_stopwords'} || $self->{'use_stopwrods'}) {
        # pass stopword file to indexer only if it was really created
        if (open(my $stop_fh, '>', '_stopwords_')) {
            print {$stop_fh} " ";
            if (close($stop_fh)) {
                push @cmd, '-s', '_stopwords_';
            } else {
                carp("can't write empty stopword file, skipping\n");
            }
        } else {
            carp("can't create empty stopword file, skipping\n");
        }
    }

    # file names to index are read from standard input (last argument -)
    push @cmd, '-e', 'html:*', '-i', $self->{'index_dir'}.'/index', '-';

    # list form of pipe open: no shell is involved, so index_dir or
    # executable name can't be misinterpreted as shell syntax
    open(my $index_fh, '|-', @cmd) || confess("can't start index with @cmd: $!");

    $self->{'index_fh'} = $index_fh;

    return $self->{'index_fh'};
}

=head2 _create_doc

Create temporary file and pass its name to swish++

    $i->_create_doc(
        path => 'path/to/store/in/index',
        title => 'this is title in results',
        body => 'data to store in body tag',
        meta => {
            'meta name' => 'data for this meta',
            'another' => 'again more data',
        }
    );

To delete document, just omit body and meta data.

File is created under C<path> relative to current directory (temporary
directory after L<_init_indexer>); missing directories in C<path> are
created. C<meta> is not yet implemented and dies. Returns 1.

=cut

sub _create_doc {
    my $self = shift;

    my $arg = {@_};

    # open indexer if needed
    $self->{'index_fh'} ||= $self->_init_indexer;

    $self->{'tmp_dir'} || confess("no tmp_dir?");

    my $doc_path = $arg->{'path'};
    confess("no path for document") unless (defined($doc_path) && length($doc_path));

    # path may contain directories which don't exist yet
    require File::Basename;
    require File::Path;
    my $doc_dir = File::Basename::dirname($doc_path);
    File::Path::mkpath($doc_dir) unless (-d $doc_dir);

    open(my $tmp_fh, '>', $doc_path) || confess("can't create temp file ".$doc_path.": $!");

    # NOTE(review): all string literals printed around title and body are
    # empty in this copy of file. Indexer is started with -e "html:*" and
    # documentation above mentions body tag, so they presumably held HTML
    # markup which got lost -- restore them from version control.
    print {$tmp_fh} '';

    $arg->{'body'} ||= '';

    if ($arg->{'meta'}) {
        confess("not yet implemented");
    }

    if (defined($arg->{'title'})) {
        print {$tmp_fh} '' . ($arg->{'title'} || '') . '';
        # make title searchable without naming meta field
        $arg->{'body'} .= " ".$arg->{'title'} if ($self->{'meta_in_body'});
    }

    print {$tmp_fh} '' . $arg->{'body'} . '';

    close($tmp_fh) || confess("can't close tmp file ".$doc_path.": $!");

    # indexer reads names of files to index, one per line
    print { $self->{'index_fh'} } $doc_path."\n"
        or confess("can't send ".$doc_path." to indexer: $!");

    return 1;
}

=head2 _close_index

Close index after indexing.
=pod

    $i->_close_index;

You have to close index before searching.

Does nothing if index is not open. Warning is issued if pipe to index
process can't be closed cleanly, because index might be incomplete then.

=cut

sub _close_index {
    my $self = shift;

    return unless ($self->{'index_fh'});

    print STDERR "## close index\n" if ($self->{'debug'});

    # close on pipe waits for index process and fails if it exited with
    # error; $! is empty in that case and exit status is in $?
    close($self->{'index_fh'})
        || carp("can't close index: ".($! || "index process exit status $?"));

    # forget handle, so next document added will start indexer again
    undef $self->{'index_fh'};

    return;
}

1;
__END__
@@ -135,23 +366,37 @@ Debian version of swish++ is often old (version 5 at moment of this writing while version 6 is available in source code), so this module by default uses executable names B and B for self-compiled version -instead of one from Debian package. See L<open> how to specify Debian +instead of one from Debian package. See L<new> how to specify Debian default binaries B and B. -=head1 SEE ALSO -Mention other useful documentation such as the documentation of -related modules or operating system documentation (such as man pages -in UNIX), or any relevant external documentation such as RFCs or -standards. +=head2 SWISH++ +Aside from very good rewrite in C++, SWISH++ is faster because it has +clever heuristics about which data in input files are words to index and +which are not. It's based on English language and might be best choice if +you plan to install large amount of long text documents. + +However, if you plan to index all data from structured storage (e.g. RDBMS) +you might want B words from data to end up in index as opposed to just +those which look like English words. This is especially important if you +don't plan to index English texts with this module. + +With distribution build versions of SWISH++ you might have problems with +disappearing words. To overcome this problem, you will have to compile and +configure SWISH++ yourself (because language characteristics are +compilation-time option). + +Compilation of SWISH++ is easy process well described on project's web +pages. To see my very relaxed sample configuration take a look at C +directory included in distribution. -If you have a mailing list set up for your module, mention it here. 
+=head1 SEE ALSO -If you have a web site set up for your module, mention it here. +C web site L =head1 AUTHOR -Dobrica Pavlinusic, E<lt>dpavlin@E<gt> +Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt> =head1 COPYRIGHT AND LICENSE