/[SWISH-PlusPlus]/trunk/PlusPlus.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/PlusPlus.pm

Parent Directory | Revision Log | View Patch

revision 14 by dpavlin, Sun Dec 5 15:35:53 2004 UTC revision 21 by dpavlin, Sun Dec 5 22:24:09 2004 UTC
# Line 4  use 5.008004; Line 4  use 5.008004;
4  use strict;  use strict;
5  use warnings;  use warnings;
6    
7  our $VERSION = '0.06';  our $VERSION = '0.10';
8    
9  use Carp;  use Carp;
10  use File::Temp qw/ tempdir /;  use File::Temp qw/ tempdir /;
11    use BerkeleyDB;
12  #use YAML;  #use YAML;
13    
14  =head1 NAME  =head1 NAME
15    
16  SWISH::PlusPlus - Perl extension SWISH++  SWISH::PlusPlus - Perl extension for full-text indexer SWISH++ with properties support
17    
18  =head1 SYNOPSIS  =head1 SYNOPSIS
19    
# Line 22  SWISH::PlusPlus - Perl extension SWISH++ Line 23  SWISH::PlusPlus - Perl extension SWISH++
23  =head1 DESCRIPTION  =head1 DESCRIPTION
24    
25  This is perl module to use SWISH++ indexer by Paul J. Lucas. SWISH++ is  This is perl module to use SWISH++ indexer by Paul J. Lucas. SWISH++ is
26  rewrite of swish-e in C++ which is extremly fast (thank to mmap), but without  rewrite of swish-e in C++ which is extremely fast (due to mmap usage and
27  support for properties (which this module tries to fix).  clever language heuristics), but without support for properties (which this
28    module tries to fix).
29  Implementation of this module is crafted after L<Plucene::Simple> and it  
30  should be easy to replace Plucene with this module for increased  Implementation of API is something in-between C<SWISH::API> and
31  performance. However, this module is not plug-in replacement.  C<Plucene::Simple>. It should be easy to replace Plucene or swish-e with
32    this module for increased performance. However, this module is not plug-in
33    replacement.
34    
35  =head1 METHODS  =head1 METHODS
36    
37  =head2 new  =head2 new
38    
39  Create new indexing object.  Create new instance for index.
40    
41    my $i = SWISH::PlusPlus->new(    my $i = SWISH::PlusPlus->new(
42          index_dir => '/path/to/index',          index_dir => '/path/to/index',
# Line 44  Create new indexing object. Line 47  Create new indexing object.
47          use_stopwords => 1,          use_stopwords => 1,
48    );    );
49    
50  Options to new are following:  Options are described below:
51    
52  =over 5  =over 5
53    
54  =item C<index_dir>  =item C<index_dir>
55    
56  Path to directory in which index will be created.  Path to directory in which index and meta database will be created.
57    
58  =item C<index>  =item C<index>
59    
# Line 70  C<STDERR> prefixed by C<##>. Line 73  C<STDERR> prefixed by C<##>.
73  =item C<meta_in_body>  =item C<meta_in_body>
74    
75  This option (off by default) enables to search content of meta fields  This option (off by default) enables to search content of meta fields
76  without specifing them (like they are in body of document). This will  without specifying them (like they are in body of document). This will
77  somewhat increate index size.  somewhat increase index size.
78    
79  =item C<use_stopwords>  =item C<use_stopwords>
80    
# Line 118  sub new { Line 121  sub new {
121    
122  =head2 check_bin  =head2 check_bin
123    
124  Check if swish++ binaries specified in L<new> are available and verify  Check if SWISH++ binaries specified in L<new> are available and verify
125  version signature.  version signature.
126    
127    if ($i->check_bin) {    if ($i->check_bin) {
# Line 129  It will also setup property Line 132  It will also setup property
132    
133    $i->{'version'}    $i->{'version'}
134    
135  which you can examine to see version.  which you can examine to see numeric version (something like C<6.0.4>).
136    
137  =cut  =cut
138    
# Line 160  sub check_bin { Line 163  sub check_bin {
163    
164  Quick way to add simple data to index.  Quick way to add simple data to index.
165    
166    $i->index_document($key, $data);    $i->index_document($path, $data);
167    $i->index_document( 42 => 'meaning of life' );    $i->index_document( 42 => 'meaning of life' );
168    
169    C<$path> value is really path, so you don't want to use directory
170    separators (slashes, /) in it probably.
171    
172  =cut  =cut
173    
174  sub index_document {  sub index_document {
# Line 182  sub index_document { Line 188  sub index_document {
188    
189  =head2 add  =head2 add
190    
191  Add document with metadata to index.  Add document with meta-data to index.
192    
193    $i->add(    $i->add(
194          path => 'path/to/document',          path => 'path/to/document',
# Line 206  sub add { Line 212  sub add {
212    
213          return 1;          return 1;
214  }  }
215    
216  =head2 search  =head2 search
217    
218  Search your index.  Search your index using any valid SWISH++ query.
219    
220      my @results = $i->search("swish query");
221    
222    my @results = $i->search("swhish query");  Returns array with elements like this:
223    
224  Returns array with result IDs.    {
225       rank => 10,                  # rank of result
226       path => 'path to result',    # path to result
227       size => 999,                 # size in bytes
228       title => 'title of result'   # title meta property
229      }
230    
231  =cut  =cut
232    
# Line 222  sub search { Line 236  sub search {
236          my $query = shift || return;          my $query = shift || return;
237    
238          $self->finish_update;          $self->finish_update;
239            $self->_tie_meta_db(DB_RDONLY);
240    
241          my @results;          my @results;
242    
243          # escape double quotes in query for shell          # escape double quotes in query for shell
244          $query =~ s/"/\\"/g;          $query =~ s/"/\\"/g;
245    
246          my $open_cmd = $self->{'search'}." -i ".$self->{'index_dir'}.'/index "'.$query.'" |';          my $open_cmd = $self->{'search'} .
247          print STDERR "## search $open_cmd\n" if ($self->{'debug'});                  ' -i ' . $self->{'index_dir'}.'/index' .
248                    ' "' . $query . '"'.
249                    ' |';
250            print STDERR "## search: $open_cmd\n" if ($self->{'debug'});
251    
252          open(SEARCH, $open_cmd) || confess "can't start $open_cmd: $!";          open(SEARCH, $open_cmd) || confess "can't start $open_cmd: $!";
253          while(<SEARCH>) {          my $l;
254                  next if (/^#/);          while($l = <SEARCH>) {
255                  chomp;                  next if ($l =~ /^#/);
256                  print STDERR "## $_\n" if ($self->{'debug'});                  chomp($l);
257                  my ($rank,$path,$size,$title) = split(/ /,$_,4);                  print STDERR "## $l\n" if ($self->{'debug'});
258                    my ($rank,$path,$size,$title) = split(/ /,$l,4);
259                    $path =~ s#^\./##; # strip from path
260                  push @results, {                  push @results, {
261                          rank => $rank,                          rank => $rank,
262                          path => $path,                          path => $path,
# Line 252  sub search { Line 272  sub search {
272          return @results;          return @results;
273  }  }
274    
275    =head2 property
276    
277    Return stored meta property from result or result path.
278    
279      print $i->property('path', 'title');
280      print $i->property($res->{'path'}, 'title');
281    
282    =cut
283    
284    sub property {
285            my $self = shift;
286    
287            my ($path,$meta) = @_;
288    
289            if ($path =~ m/^HASH/) {
290                    $path = $path->{'path'} || confess "can't find path in input data";
291            }
292    
293            my $val = $self->{'meta_db'}->{"$path-$meta"};
294    
295            print STDERR "## property $path-$meta: ",($val || 'undef'),"\n" if ($self->{'debug'});
296            return $val;
297    }
298    
299  =head2 finish_update  =head2 finish_update
300    
301  This method will close index.  This method will close index binary and enable search. Searching is not
302    available while indexing is in process.
303    
304    $i->finish_update;    $i->finish_update;
305    
306  It will be called on DESTROY when $i goes out of scope.  Usually, you don't need to call this method directly. It will be called on
307    DESTROY when $i goes out of scope or when you first call search in session
308    if indexing was started.
309    
310  =cut  =cut
311    
# Line 267  sub finish_update { Line 314  sub finish_update {
314    
315          print STDERR "## finish_update\n" if ($self->{'debug'});          print STDERR "## finish_update\n" if ($self->{'debug'});
316    
317          $self->_close_index;          $self->_close_index && $self->_untie_meta_db;
318  }  }
319    
320  sub DESTROY {  sub DESTROY {
# Line 277  sub DESTROY { Line 324  sub DESTROY {
324    
325  =head1 PRIVATE METHODS  =head1 PRIVATE METHODS
326    
327  Private methods implement internals for creating temporary file needed for  Private methods implement internals for creating temporary files needed for
328  swish++. You should have no need to call them directly, and they are here  SWISH++. You should have no need to call them directly, and they are here
329  just to have documentation.  just to have documentation.
330    
331  =head2 _init_indexer  =head2 _init_indexer
# Line 313  sub _init_indexer { Line 360  sub _init_indexer {
360                  $opt .= " -s _stopwords_";                  $opt .= " -s _stopwords_";
361          }          }
362    
363          my $open_cmd = '| '.$self->{'index'}.' '.$opt.' -e "html:*" -i '.$self->{'index_dir'}.'/index -';          my $index_dir = $self->{'index_dir'} || confess "no index_dir?";
364    
365            my $open_cmd = '| '.$self->{'index'}.' '.$opt.' -e "html:*" -i '.$index_dir.'/index -';
366    
367          print STDERR "## init_indexer: $open_cmd\n" if ($self->{'debug'});          print STDERR "## init_indexer: $open_cmd\n" if ($self->{'debug'});
368    
# Line 321  sub _init_indexer { Line 370  sub _init_indexer {
370    
371          chdir $self->{'cwd'} || confess "can't chdir to ".$self->{'cwd'}.": $!";          chdir $self->{'cwd'} || confess "can't chdir to ".$self->{'cwd'}.": $!";
372    
373            $self->_tie_meta_db(DB_CREATE);
374    
375          return $self->{'_index_fh'};          return $self->{'_index_fh'};
376  }  }
377    
378  =head2 _create_doc  =head2 _create_doc
379    
380  Create temporary file and pass it's name to swish++  Create temporary file and pass its name to SWISH++
381    
382    $i->_create_doc(    $i->_create_doc(
383          path => 'path/to/store/in/index',          path => 'path/to/store/in/index',
# Line 351  sub _create_doc { Line 402  sub _create_doc {
402          $self->_init_indexer;          $self->_init_indexer;
403    
404          my $path = $self->{'tmp_dir'} || confess "no tmp_dir?";          my $path = $self->{'tmp_dir'} || confess "no tmp_dir?";
405          $path .= '/' . $arg->{'path'};          my $id = $arg->{'path'} || confess "no path?";
406            $path .= "/$id";
407    
408          print STDERR "## _create_doc: $path\n" if ($self->{'debug'});          print STDERR "## _create_doc: $path\n" if ($self->{'debug'});
409    
# Line 366  sub _create_doc { Line 418  sub _create_doc {
418                          my $content = $arg->{'meta'}->{$name};                          my $content = $arg->{'meta'}->{$name};
419                          print TMP qq{<meta name="$name" content="$content">};                          print TMP qq{<meta name="$name" content="$content">};
420                          $arg->{'body'} .= " $content" if ($self->{'meta_in_body'});                          $arg->{'body'} .= " $content" if ($self->{'meta_in_body'});
421                            $self->{'meta_db'}->{"$id-$name"} = $content;
422                  }                  }
423          }          }
424    
425          if (defined($arg->{'title'})) {          my $title = $arg->{'title'};
426                  print TMP '<title>' . ($arg->{'title'} || '') . '</title>';          if (defined($title)) {
427                  $arg->{'body'} .= " ".$arg->{'title'} if ($self->{'meta_in_body'});                  print TMP "<title>$title</title>";
428                    $arg->{'body'} .= " $title" if ($self->{'meta_in_body'});
429                    $self->{'meta_db'}->{"$id-title"} = $title;
430          }          }
431    
432          print TMP '</head><body>' . $arg->{'body'} . '</body></html>';          print TMP '</head><body>' . $arg->{'body'} . '</body></html>';
433                    
434          close(TMP) || confess "can't close tmp file ".$arg->{'path'}.": $!";          close(TMP) || confess "can't close tmp file ".$arg->{'path'}.": $!";
435    
436          print { $self->{'_index_fh'} } $arg->{'path'}."\n";          print { $self->{'_index_fh'} } "$id\n";
437  }  }
438    
439  =head2 _close_index  =head2 _close_index
# Line 398  sub _close_index { Line 453  sub _close_index {
453    
454          print STDERR "## close index\n" if ($self->{'debug'});          print STDERR "## close index\n" if ($self->{'debug'});
455    
456          close($self->{'_index_fh'});          close($self->{'_index_fh'}) || confess "can't close index: $!";
457          undef $self->{'_index_fh'};          undef $self->{'_index_fh'};
458    
459            return 1;
460    }
461    
462    =head2 _tie_meta_db
463    
464    Open BerkeleyDB database with meta properties.
465    
466      $i->_tie_meta_db(DB_CREATE);
467      $i->_tie_meta_db(DB_RDONLY);
468    
469    }
470    
471    =cut
472    
473    sub _tie_meta_db  {
474            my $self = shift;
475    
476            my $flags = shift || confess "need DB_CREATE or DB_RDONLY";
477    
478            return if ($self->{'_meta_db_flags'} && $self->{'_meta_db_flags'} == $flags);
479    
480            print STDERR "## _tie_meta_db($flags)\n" if ($self->{'debug'});
481    
482            $self->_untie_meta_db;
483            $self->{'_meta_db_flags'} = $flags;
484    
485            my $file = $self->{'index_dir'}.'/meta.db';
486    
487            tie %{$self->{'meta_db'}}, "BerkeleyDB::Hash",
488                    -Filename => $file,
489                    -Flags    => $flags
490            or confess "cannot open $file: $! $BerkeleyDB::Error\n" ;
491    
492            return 1;
493    }
494    
495    =head2 _untie_meta_db
496    
497    Close BerkeleyDB database with meta properties.
498    
499      $i->_untie_meta_db;
500    
501    =cut
502    
503    sub _untie_meta_db {
504            my $self = shift;
505    
506            return unless ($self->{'meta_db'});
507    
508            print STDERR "## _untie_meta_db\n" if ($self->{'debug'});
509            untie %{$self->{'meta_db'}} || confess "can't untie!";
510            undef $self->{'meta_db'};
511            undef $self->{'_meta_db_flags'};
512    
513            return 1;
514  }  }
515    
516  1;  1;
# Line 413  None by default. Line 524  None by default.
524    
525  =head2 Debian  =head2 Debian
526    
527  Debian version of swish++ is often old (version 5 at moment of this writing  Debian version of SWISH++ is often old (version 5 at moment of this writing
528  while version 6 is available in source code), so this module by default  while version 6 is available in source code), so this module by default
529  uses executable names B<index> and B<search> for self-compiled version  uses executable names B<index> and B<search> for self-compiled version
530  instead of one from Debian package. See L<new> how to specify Debian  instead of one from Debian package. See L<new> how to specify Debian
# Line 421  default binaries B<index++> and B<search Line 532  default binaries B<index++> and B<search
532    
533  =head2 SWISH++  =head2 SWISH++
534    
535  Aside from very good rewrite in C++, SWISH++ is fatster because it has  Aside from very good rewrite in C++, SWISH++ is faster because it uses
536  clever heuristics about which data in input files are words to index and  clever heuristics about which data in input files are words to index and
537  which are not. It's based on English language and might be best choice if  which are not. It's based on English language and might be best choice if
538  you plan to install large amount of long text documents.  you plan to index large amount of long text documents.
539    
540  However, if you plan to index all data from structured storage (e.g. RDBMS)  However, if you plan to index all data from structured storage (e.g. RDBMS)
541  you might want B<all> words from data to end up in index as opposed to just  you might want B<all> words from data to end up in index as opposed to just
# Line 432  those which look like English words. Thi Line 543  those which look like English words. Thi
543  don't plan to index English texts with this module.  don't plan to index English texts with this module.
544    
545  With distribution build versions of SWISH++ you might have problems with  With distribution build versions of SWISH++ you might have problems with
546  disepearing words. To overcome this problem, you will have to compile and  disappearing words. To overcome this problem, you will have to compile and
547  configure SWISH++ yourself (because language characteristics are  configure SWISH++ yourself (because language characteristics are
548  compilation-time option).  compilation-time option).
549    
# Line 448  configuration is needed for B<date test> Line 559  configuration is needed for B<date test>
559  doesn't recognize 2004-12-05 as date. Have in mind that your index size  doesn't recognize 2004-12-05 as date. Have in mind that your index size
560  might explode.  might explode.
561    
562    =head1 BUGS
563    
564    Currently there is no way to specify which meta data will be stored as
565    properties. B<This will be fixed very soon>.
566    
567    There is no garbage collection on temporary files created for SWISH++. This
568    means that one run of indexer will take additional disk space for temporary
569    files, which will be removed at end. There should be some way to remove
570    files after they are indexed by SWISH++. However, at this early stage of
571    development it's just not supported yet. Have plenty of disk space!
572    
573  =head1 SEE ALSO  =head1 SEE ALSO
574    
575  C<swish++> web site L<http://homepage.mac.com/pauljlucas/software/swish/>  SWISH++ web site L<http://homepage.mac.com/pauljlucas/software/swish/>
576    
577  =head1 AUTHOR  =head1 AUTHOR
578    

Legend:
Removed from v.14  
changed lines
  Added in v.21

  ViewVC Help
Powered by ViewVC 1.1.26