--- trunk/PlusPlus.pm 2004/12/04 17:49:20 8 +++ trunk/PlusPlus.pm 2004/12/05 15:35:53 14 @@ -4,7 +4,7 @@ use strict; use warnings; -our $VERSION = '0.02'; +our $VERSION = '0.06'; use Carp; use File::Temp qw/ tempdir /; @@ -31,18 +31,20 @@ =head1 METHODS -=head2 open +=head2 new Create new indexing object. - my $i = SWISH::PlusPlus->open( + my $i = SWISH::PlusPlus->new( index_dir => '/path/to/index', index => 'index++', search => 'search++', debug => 1, + meta_in_body => 1, + use_stopwords => 1, ); -Options to open are following: +Options to new are following: =over 5 @@ -65,11 +67,21 @@ This option (off by default) will produce a lot of debugging output on C prefixed by C<##>. +=item C + +This option (off by default) enables to search content of meta fields +without specifing them (like they are in body of document). This will +somewhat increate index size. + +=item C + +Use built-in SWISH++ stop words. By default, they are disabled. + =back =cut -sub open { +sub new { my $class = shift; my $self = {@_}; bless($self, $class); @@ -78,15 +90,27 @@ croak "need $_" unless $self->{$_}; } - if (! -e $self->{'index_dir'}) { - mkdir $self->{'index_dir'} || confess "can't create index ",$self->{'index'},": $!"; + my $index_dir = $self->{'index_dir'}; + + my $cwd; + chomp($cwd = `pwd`); + $self->{'cwd'} = $cwd || carp "can't get cwd!"; + + if ($index_dir !~ m#^/#) { + $index_dir = "$cwd/$index_dir"; + print STDERR "## full path to index_dir: $index_dir\n" if ($self->{'debug'}); + $self->{'index_dir'} = $index_dir; + } + + if (! -e $index_dir) { + mkdir $index_dir || confess "can't create index ",$self->{'index'},": $!"; } # default executables $self->{'index'} ||= 'index'; $self->{'search'} ||= 'search'; - print STDERR "## open index_dir: ",$self->{'index_dir'}," index: ",$self->{'index'}, " search: ",$self->{'search'},"\n" if ($self->{'debug'}); + print STDERR "## new index_dir: ",$index_dir," index: ",$self->{'index'}, " search: ",$self->{'search'},"\n" if ($self->{'debug'}); $self ? return $self : return undef; } @@ -94,7 +118,7 @@ =head2 check_bin -Check if swish++ binaries specified in L are available and verify +Check if swish++ binaries specified in L are available and verify version signature. if ($i->check_bin) { @@ -122,6 +146,7 @@ confess $self->{'search'}," binary is not SWISH++" unless ($s =~ m/^SWISH\+\+/); if ($i eq $s) { + $i =~ s/^SWISH\+\+\s+// || confess "can't strip SWISH++ from version"; $self->{'version'} = $i; return 1; } else { @@ -155,6 +180,32 @@ return 1; } +=head2 add + +Add document with metadata to index. + + $i->add( + path => 'path/to/document', + title => 'this is result title', + meta => { + description => 'this is description meta tag', + date => '2004-11-04', + author => 'Dobrica Pavlinusic', + } + body => 'this is text without meta data', + ); + +This is thin wrapper round L<_create_doc>. + +=cut + +sub add { + my $self = shift; + + $self->_create_doc(@_); + + return 1; +} =head2 search Search your index. @@ -170,7 +221,7 @@ my $query = shift || return; - $self->_close_index; + $self->finish_update; my @results; @@ -180,7 +231,7 @@ my $open_cmd = $self->{'search'}." -i ".$self->{'index_dir'}.'/index "'.$query.'" |'; print STDERR "## search $open_cmd\n" if ($self->{'debug'}); - CORE::open(SEARCH, $open_cmd) || confess "can't start $open_cmd: $!"; + open(SEARCH, $open_cmd) || confess "can't start $open_cmd: $!"; while() { next if (/^#/); chomp; @@ -201,35 +252,76 @@ return @results; } +=head2 finish_update + +This method will close index. + + $i->finish_update; + +It will be called on DESTROY when $i goes out of scope. + +=cut + +sub finish_update { + my $self = shift; + + print STDERR "## finish_update\n" if ($self->{'debug'}); + + $self->_close_index; +} + +sub DESTROY { + my $self = shift; + $self->finish_update; +} + =head1 PRIVATE METHODS Private methods implement internals for creating temporary file needed for swish++. You should have no need to call them directly, and they are here just to have documentation. -=head2 _init_index +=head2 _init_indexer Create temporary directory in which files for indexing will be created and start index process. - my $i->_init_index || die "can't start indexer"; + my $i->_init_indexer || die "can't start indexer"; + +It will also create empty file C<_stopwords_> to disable stop words. =cut -sub _init_index { +sub _init_indexer { my $self = shift; - $self->{'tmp_dir'} = tempdir( CLEANUP => 1 ) || confess "can't create temporary directory: $!"; + return if ($self->{'_index_fh'}); + + my $tmp_dir = tempdir( CLEANUP => 1 ) || confess "can't create temporary directory: $!"; + $self->{'tmp_dir'} = $tmp_dir; + + chdir $tmp_dir || confess "can't chdir to ".$tmp_dir.": $!"; - my $opt = "-v 4"; + print STDERR "## tmp_dir: $tmp_dir" if ($self->{'debug'}); + + my $opt = "-v " . ($self->{'debug'} || '0'); + + unless ($self->{'use_stopwrods'}) { + open(STOP, '>', "_stopwords_") || carp "can't create empty stopword file, skipping\n"; + print STOP " "; + close(STOP); + $opt .= " -s _stopwords_"; + } my $open_cmd = '| '.$self->{'index'}.' '.$opt.' -e "html:*" -i '.$self->{'index_dir'}.'/index -'; - chdir $self->{'tmp_dir'} || confess "can't chdir to ".$self->{'tmp_dir'}.": $!"; + print STDERR "## init_indexer: $open_cmd\n" if ($self->{'debug'}); + + open($self->{'_index_fh'}, $open_cmd) || confess "can't start index with $open_cmd: $!"; - CORE::open($self->{'index_fh'}, $open_cmd) || confess "can't start index with $open_cmd: $!"; + chdir $self->{'cwd'} || confess "can't chdir to ".$self->{'cwd'}.": $!"; - return $self->{'index_fh'}; + return $self->{'_index_fh'}; } =head2 _create_doc @@ -238,6 +330,7 @@ $i->_create_doc( path => 'path/to/store/in/index', + title => 'this is title in results', body => 'data to story in body tag', meta => { 'meta name' => 'data for this meta', @@ -255,23 +348,37 @@ my $arg = {@_}; # open indexer if needed - $self->{'index_fh'} ||= $self->_init_index; + $self->_init_indexer; my $path = $self->{'tmp_dir'} || confess "no tmp_dir?"; + $path .= '/' . $arg->{'path'}; - CORE::open(TMP, '>', $arg->{'path'}) || die "can't create temp file ".$arg->{'path'}.": $!"; + print STDERR "## _create_doc: $path\n" if ($self->{'debug'}); - print TMP ''; + open(TMP, '>', $path) || die "can't create temp file $path: $!"; + + print TMP ''; + + $arg->{'body'} ||= ''; if ($arg->{'meta'}) { - confess "not yet implemented"; + foreach my $name (keys %{$arg->{'meta'}}) { + my $content = $arg->{'meta'}->{$name}; + print TMP qq{}; + $arg->{'body'} .= " $content" if ($self->{'meta_in_body'}); + } } - - print TMP '' . ($arg->{'body'} || '') . ''; + + if (defined($arg->{'title'})) { + print TMP '' . ($arg->{'title'} || '') . ''; + $arg->{'body'} .= " ".$arg->{'title'} if ($self->{'meta_in_body'}); + } + + print TMP '' . $arg->{'body'} . ''; close(TMP) || confess "can't close tmp file ".$arg->{'path'}.": $!"; - print { $self->{'index_fh'} } $arg->{'path'}."\n"; + print { $self->{'_index_fh'} } $arg->{'path'}."\n"; } =head2 _close_index @@ -287,12 +394,12 @@ sub _close_index { my $self = shift; - return unless ($self->{'index_fh'}); + return unless ($self->{'_index_fh'}); print STDERR "## close index\n" if ($self->{'debug'}); - close($self->{'index_fh'}); - undef $self->{'index_fh'}; + close($self->{'_index_fh'}); + undef $self->{'_index_fh'}; } 1; @@ -309,7 +416,7 @@ Debian version of swish++ is often old (version 5 at moment of this writing while version 6 is available in source code), so this module by default uses executable names B and B for self-compiled version -instead of one from Debian package. See L how to specify Debian +instead of one from Debian package. See L how to specify Debian default binaries B and B. =head2 SWISH++ @@ -333,6 +440,14 @@ pages. To see my very relaxed sample configuration take a look at C directory included in distribution. +=head2 SWISH++ config + +C located in C directory of this distribution is relaxed +SWISH++ configuration that will index all words passed to it. This +configuration is needed for B because default configuration +doesn't recognize 2004-12-05 as date. Have in mind that your index size +might explode. + =head1 SEE ALSO C web site L