/[SWISH-PlusPlus]/trunk/PlusPlus.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/PlusPlus.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 5 by dpavlin, Fri Dec 3 21:48:15 2004 UTC revision 21 by dpavlin, Sun Dec 5 22:24:09 2004 UTC
# Line 4  use 5.008004; Line 4  use 5.008004;
4  use strict;  use strict;
5  use warnings;  use warnings;
6    
7  our $VERSION = '0.02';  our $VERSION = '0.10';
8    
9  use Carp;  use Carp;
10  use File::Temp qw/ tempdir /;  use File::Temp qw/ tempdir /;
11    use BerkeleyDB;
12    #use YAML;
13    
14  =head1 NAME  =head1 NAME
15    
16  SWISH::PlusPlus - Perl extension SWISH++  SWISH::PlusPlus - Perl extension for full-text indexer SWISH++ with properties support
17    
18  =head1 SYNOPSIS  =head1 SYNOPSIS
19    
# Line 21  SWISH::PlusPlus - Perl extension SWISH++ Line 23  SWISH::PlusPlus - Perl extension SWISH++
23  =head1 DESCRIPTION  =head1 DESCRIPTION
24    
25  This is perl module to use SWISH++ indexer by Paul J. Lucas. SWISH++ is  This is perl module to use SWISH++ indexer by Paul J. Lucas. SWISH++ is
26  rewrite of swish-e in C++ which is extremly fast (thank to mmap), but without  rewrite of swish-e in C++ which is extremely fast (due to mmap usage and
27  support for properties (which this module tries to fix).  clever language heuristics), but without support for properties (which this
28    module tries to fix).
29  Implementation of this module is crafted after L<Plucene::Simple> and it  
30  should be easy to replace Plucene with this module for increased  Implementation of API is something in-between C<SWISH::API> and
31  performance. However, this module is not plug-in replacement.  C<Plucene::Simple>. It should be easy to replace Plucene or swish-e with
32    this module for increased performance. However, this module is not plug-in
33    replacement.
34    
35  =head1 METHODS  =head1 METHODS
36    
37  =head2 open  =head2 new
38    
39  Create new indexing object.  Create new instance for index.
40    
41    my $i = SWISH::PlusPlus->open(    my $i = SWISH::PlusPlus->new(
42          index_dir => '/path/to/index',          index_dir => '/path/to/index',
43          index => 'index++',          index => 'index++',
44          search => 'search++',          search => 'search++',
45            debug => 1,
46            meta_in_body => 1,
47            use_stopwords => 1,
48    );    );
49    
50  Options to open are following:  Options are described below:
51    
52  =over 5  =over 5
53    
54  =item C<index_dir>  =item C<index_dir>
55    
56  Path to directory in which index will be created.  Path to directory in which index and meta database will be created.
57    
58  =item C<index>  =item C<index>
59    
# Line 58  B<index++>. See C<Debian>. Line 65  B<index++>. See C<Debian>.
65    
66  Full or partial path to SWISH++ search executable. By default, it's B<search>.  Full or partial path to SWISH++ search executable. By default, it's B<search>.
67    
68    =item C<debug>
69    
70    This option (off by default) will produce a lot of debugging output on
71    C<STDERR> prefixed by C<##>.
72    
73    =item C<meta_in_body>
74    
75    This option (off by default) enables to search content of meta fields
76    without specifying them (like they are in body of document). This will
77    somewhat increase index size.
78    
79    =item C<use_stopwords>
80    
81    Use built-in SWISH++ stop words. By default, they are disabled.
82    
83  =back  =back
84    
85  =cut  =cut
86    
87  sub open {  sub new {
88          my $class = shift;          my $class = shift;
89          my $self = {@_};          my $self = {@_};
90          bless($self, $class);          bless($self, $class);
# Line 71  sub open { Line 93  sub open {
93                  croak "need $_" unless $self->{$_};                  croak "need $_" unless $self->{$_};
94          }          }
95    
96          if (! -e $self->{'index_dir'}) {          my $index_dir = $self->{'index_dir'};
97                  mkdir $self->{'index_dir'} || confess "can't create index ",$self->{'index'},": $!";  
98            my $cwd;
99            chomp($cwd = `pwd`);
100            $self->{'cwd'} = $cwd || carp "can't get cwd!";
101            
102            if ($index_dir !~ m#^/#) {
103                    $index_dir = "$cwd/$index_dir";
104                    print STDERR "## full path to index_dir: $index_dir\n" if ($self->{'debug'});
105                    $self->{'index_dir'} = $index_dir;
106            }
107    
108            if (! -e $index_dir) {
109                    mkdir $index_dir || confess "can't create index ",$self->{'index'},": $!";
110          }          }
111    
112          # default executables          # default executables
113          $self->{'index'} ||= 'index';          $self->{'index'} ||= 'index';
114          $self->{'search'} ||= 'search';          $self->{'search'} ||= 'search';
115    
116            print STDERR "## new index_dir: ",$index_dir," index: ",$self->{'index'}, " search: ",$self->{'search'},"\n" if ($self->{'debug'});
117    
118          $self ? return $self : return undef;          $self ? return $self : return undef;
119  }  }
120    
121    
122  =head2 check_bin  =head2 check_bin
123    
124  Check if swish++ binaries specified in L<open> are available and verify  Check if SWISH++ binaries specified in L<new> are available and verify
125  version signature.  version signature.
126    
127    if ($i->check_bin) {    if ($i->check_bin) {
# Line 96  It will also setup property Line 132  It will also setup property
132    
133    $i->{'version'}    $i->{'version'}
134    
135  which you can examine to see version.  which you can examine to see numeric version (something like C<6.0.4>).
136    
137  =cut  =cut
138    
# Line 113  sub check_bin { Line 149  sub check_bin {
149          confess $self->{'search'}," binary is not SWISH++" unless ($s =~ m/^SWISH\+\+/);          confess $self->{'search'}," binary is not SWISH++" unless ($s =~ m/^SWISH\+\+/);
150    
151          if ($i eq $s) {          if ($i eq $s) {
152                    $i =~ s/^SWISH\+\+\s+// || confess "can't strip SWISH++ from version";
153                  $self->{'version'} = $i;                  $self->{'version'} = $i;
154                  return 1;                  return 1;
155          } else  {          } else  {
# Line 126  sub check_bin { Line 163  sub check_bin {
163    
164  Quick way to add simple data to index.  Quick way to add simple data to index.
165    
166    $i->index_document($key, $data);    $i->index_document($path, $data);
167    $i->index_document( 42 => 'meaning of life' );    $i->index_document( 42 => 'meaning of life' );
168    
169    C<$path> value is really path, so you don't want to use directory
170    separators (slashes, /) in it probably.
171    
172  =cut  =cut
173    
174  sub index_document {  sub index_document {
# Line 146  sub index_document { Line 186  sub index_document {
186          return 1;          return 1;
187  }  }
188    
189    =head2 add
190    
191    Add document with meta-data to index.
192    
193      $i->add(
194            path => 'path/to/document',
195            title => 'this is result title',
196            meta => {
197                    description => 'this is description meta tag',
198                    date => '2004-11-04',
199                    author => 'Dobrica Pavlinusic',
200            },
201            body => 'this is text without meta data',
202      );
203    
204    This is a thin wrapper around L<_create_doc>.
205    
206    =cut
207    
208    sub add {
209            my $self = shift;
210    
211            $self->_create_doc(@_);
212    
213            return 1;
214    }
215    
216    =head2 search
217    
218    Search your index using any valid SWISH++ query.
219    
220      my @results = $i->search("swish query");
221    
222    Returns array with elements like this:
223    
224      {
225       rank => 10,                  # rank of result
226       path => 'path to result',    # path to result
227       size => 999,                 # size in bytes
228       title => 'title of result'   # title meta property
229      }
230    
231    =cut
232    
233    sub search {
234            my $self = shift;
235    
236            my $query = shift || return;
237    
238            $self->finish_update;
239            $self->_tie_meta_db(DB_RDONLY);
240    
241            my @results;
242    
243            # escape double quotes in query for shell
244            $query =~ s/"/\\"/g;
245    
246            my $open_cmd = $self->{'search'} .
247                    ' -i ' . $self->{'index_dir'}.'/index' .
248                    ' "' . $query . '"'.
249                    ' |';
250            print STDERR "## search: $open_cmd\n" if ($self->{'debug'});
251    
252            open(SEARCH, $open_cmd) || confess "can't start $open_cmd: $!";
253            my $l;
254            while($l = <SEARCH>) {
255                    next if ($l =~ /^#/);
256                    chomp($l);
257                    print STDERR "## $l\n" if ($self->{'debug'});
258                    my ($rank,$path,$size,$title) = split(/ /,$l,4);
259                    $path =~ s#^\./##; # strip from path
260                    push @results, {
261                            rank => $rank,
262                            path => $path,
263                            size => $size,
264                            title => $title,
265                    }
266            }
267    
268            close(SEARCH) || confess "can't close search";
269    
270            #print STDERR "## results: ",Dump(@results),"\n" if ($self->{'debug'});
271    
272            return @results;
273    }
274    
275    =head2 property
276    
277    Return stored meta property from result or result path.
278    
279      print $i->property('path', 'title');
280      print $i->property($res->{'path'}, 'title');
281    
282    =cut
283    
284    sub property {
285            my $self = shift;
286    
287            my ($path,$meta) = @_;
288    
289            if ($path =~ m/^HASH/) {
290                    $path = $path->{'path'} || confess "can't find path in input data";
291            }
292    
293            my $val = $self->{'meta_db'}->{"$path-$meta"};
294    
295            print STDERR "## property $path-$meta: ",($val || 'undef'),"\n" if ($self->{'debug'});
296            return $val;
297    }
298    
299    =head2 finish_update
300    
301    This method will close index binary and enable search. Searching is not
302    available while indexing is in process.
303    
304      $i->finish_update;
305    
306    Usually, you don't need to call this method directly. It will be called on
307    DESTROY when $i goes out of scope or when you first call search in session
308    if indexing was started.
309    
310    =cut
311    
312    sub finish_update {
313            my $self = shift;
314    
315            print STDERR "## finish_update\n" if ($self->{'debug'});
316    
317            $self->_close_index && $self->_untie_meta_db;
318    }
319    
320    sub DESTROY {
321            my $self = shift;
322            $self->finish_update;
323    }
324    
325  =head1 PRIVATE METHODS  =head1 PRIVATE METHODS
326    
327  Private methods implement internals for creating temporary file needed for  Private methods implement internals for creating temporary files needed for
328  swish++. You should have no need to call them directly, and they are here  SWISH++. You should have no need to call them directly, and they are here
329  just to have documentation.  just to have documentation.
330    
331  =head2 _init_index  =head2 _init_indexer
332    
333  Create temporary directory in which files for indexing will be created and  Create temporary directory in which files for indexing will be created and
334  start index process.  start index process.
335    
336    my $i->_init_index || die "can't start indexer";    $i->_init_indexer || die "can't start indexer";
337    
338    It will also create empty file C<_stopwords_> to disable stop words.
339    
340  =cut  =cut
341    
342  sub _init_index {  sub _init_indexer {
343          my $self = shift;          my $self = shift;
344    
345          $self->{'tmp_dir'} = tempdir( CLEANUP => 1 ) || confess "can't create temporary directory: $!";          return if ($self->{'_index_fh'});
346    
347            my $tmp_dir = tempdir( CLEANUP => 1 ) || confess "can't create temporary directory: $!";
348            $self->{'tmp_dir'} = $tmp_dir;
349    
350          my $opt = "-v 4";          chdir $tmp_dir || confess "can't chdir to ".$tmp_dir.": $!";
351    
352          my $open_cmd = '| index '.$opt.' -e "html:*" -i '.$self->{'index_dir'}.'/index -';          print STDERR "## tmp_dir: $tmp_dir" if ($self->{'debug'});
353    
354          chdir $self->{'tmp_dir'} || confess "can't chdir to ".$self->{'tmp_dir'}.": $!";          my $opt = "-v " . ($self->{'debug'} || '0');
355    
356            unless ($self->{'use_stopwrods'}) {
357                    open(STOP, '>', "_stopwords_") || carp "can't create empty stopword file, skipping\n";
358                    print STOP "  ";
359                    close(STOP);
360                    $opt .= " -s _stopwords_";
361            }
362    
363          CORE::open($self->{'index_fh'}, $open_cmd) || confess "can't start index with $open_cmd: $!";          my $index_dir = $self->{'index_dir'} || confess "no index_dir?";
364    
365          return $self->{'index_fh'};          my $open_cmd = '| '.$self->{'index'}.' '.$opt.' -e "html:*" -i '.$index_dir.'/index -';
366    
367            print STDERR "## init_indexer: $open_cmd\n" if ($self->{'debug'});
368    
369            open($self->{'_index_fh'}, $open_cmd) || confess "can't start index with $open_cmd: $!";
370    
371            chdir $self->{'cwd'} || confess "can't chdir to ".$self->{'cwd'}.": $!";
372    
373            $self->_tie_meta_db(DB_CREATE);
374    
375            return $self->{'_index_fh'};
376  }  }
377    
378  =head2 _create_doc  =head2 _create_doc
379    
380  Create temporary file and pass it's name to swish++  Create temporary file and pass its name to SWISH++
381    
382    $i->_create_doc(    $i->_create_doc(
383          path => 'path/to/store/in/index',          path => 'path/to/store/in/index',
384            title => 'this is title in results',
385          body => 'data to store in body tag',          body => 'data to store in body tag',
386          meta => {          meta => {
387                  'meta name' => 'data for this meta',                  'meta name' => 'data for this meta',
# Line 190  Create temporary file and pass it's name Line 389  Create temporary file and pass it's name
389          }          }
390    );    );
391    
392    To delete document, just omit body and meta data.
393    
394  =cut  =cut
395    
396  sub _create_doc {  sub _create_doc {
# Line 198  sub _create_doc { Line 399  sub _create_doc {
399          my $arg = {@_};          my $arg = {@_};
400    
401          # open indexer if needed          # open indexer if needed
402          $self->{'index_fh'} ||= $self->_init_index;          $self->_init_indexer;
403    
404          my $path = $self->{'tmp_dir'} || confess "no tmp_dir?";          my $path = $self->{'tmp_dir'} || confess "no tmp_dir?";
405            my $id = $arg->{'path'} || confess "no path?";
406            $path .= "/$id";
407    
408            print STDERR "## _create_doc: $path\n" if ($self->{'debug'});
409    
410            open(TMP, '>', $path) || die "can't create temp file $path: $!";
411    
412          CORE::open(TMP, '>', $arg->{'path'}) || die "can't create temp file ".$arg->{'path'}.": $!";          print TMP '<html><head>';
413    
414          print TMP '<html>';          $arg->{'body'} ||= '';
415    
416          if ($arg->{'meta'}) {          if ($arg->{'meta'}) {
417                  confess "not yet implemented";                  foreach my $name (keys %{$arg->{'meta'}}) {
418                            my $content = $arg->{'meta'}->{$name};
419                            print TMP qq{<meta name="$name" content="$content">};
420                            $arg->{'body'} .= " $content" if ($self->{'meta_in_body'});
421                            $self->{'meta_db'}->{"$id-$name"} = $content;
422                    }
423          }          }
424            
425          print TMP '<body>' . ($arg->{'body'} || '') . '</body></html>';          my $title = $arg->{'title'};
426            if (defined($title)) {
427                    print TMP "<title>$title</title>";
428                    $arg->{'body'} .= " $title" if ($self->{'meta_in_body'});
429                    $self->{'meta_db'}->{"$id-title"} = $title;
430            }
431    
432            print TMP '</head><body>' . $arg->{'body'} . '</body></html>';
433                    
434          close(TMP) || confess "can't close tmp file ".$arg->{'path'}.": $!";          close(TMP) || confess "can't close tmp file ".$arg->{'path'}.": $!";
435    
436          print { $self->{'index_fh'} } $arg->{'path'}."\n";          print { $self->{'_index_fh'} } "$id\n";
437    }
438    
439    =head2 _close_index
440    
441    Close index after indexing.
442    
443      $i->_close_index;
444    
445    You have to close index before searching.
446    
447    =cut
448    
449    sub _close_index {
450            my $self = shift;
451    
452            return unless ($self->{'_index_fh'});
453    
454            print STDERR "## close index\n" if ($self->{'debug'});
455    
456            close($self->{'_index_fh'}) || confess "can't close index: $!";
457            undef $self->{'_index_fh'};
458    
459            return 1;
460    }
461    
462    =head2 _tie_meta_db
463    
464    Open BerkeleyDB database with meta properties.
465    
466      $i->_tie_meta_db(DB_CREATE);
467      $i->_tie_meta_db(DB_RDONLY);
468    
469    }
470    
471    =cut
472    
473    sub _tie_meta_db  {
474            my $self = shift;
475    
476            my $flags = shift || confess "need DB_CREATE or DB_RDONLY";
477    
478            return if ($self->{'_meta_db_flags'} && $self->{'_meta_db_flags'} == $flags);
479    
480            print STDERR "## _tie_meta_db($flags)\n" if ($self->{'debug'});
481    
482            $self->_untie_meta_db;
483            $self->{'_meta_db_flags'} = $flags;
484    
485            my $file = $self->{'index_dir'}.'/meta.db';
486    
487            tie %{$self->{'meta_db'}}, "BerkeleyDB::Hash",
488                    -Filename => $file,
489                    -Flags    => $flags
490            or confess "cannot open $file: $! $BerkeleyDB::Error\n" ;
491    
492            return 1;
493    }
494    
495    =head2 _untie_meta_db
496    
497    Close BerkeleyDB database with meta properties.
498    
499      $i->_untie_meta_db;
500    
501    =cut
502    
503    sub _untie_meta_db {
504            my $self = shift;
505    
506            return unless ($self->{'meta_db'});
507    
508            print STDERR "## _untie_meta_db\n" if ($self->{'debug'});
509            untie %{$self->{'meta_db'}} || confess "can't untie!";
510            undef $self->{'meta_db'};
511            undef $self->{'_meta_db_flags'};
512    
513            return 1;
514  }  }
515    
516  1;  1;
# Line 228  None by default. Line 524  None by default.
524    
525  =head2 Debian  =head2 Debian
526    
527  Debian version of swish++ is often old (version 5 at moment of this writing  Debian version of SWISH++ is often old (version 5 at moment of this writing
528  while version 6 is available in source code), so this module by default  while version 6 is available in source code), so this module by default
529  uses executable names B<index> and B<search> for self-compiled version  uses executable names B<index> and B<search> for self-compiled version
530  instead of one from Debian package. See L<open> how to specify Debian  instead of one from Debian package. See L<new> how to specify Debian
531  default binaries B<index++> and B<search++>.  default binaries B<index++> and B<search++>.
532    
533  =head2 SWISH++  =head2 SWISH++
534    
535  Aside from very good rewrite in C++, SWISH++ is fatster because it has  Aside from very good rewrite in C++, SWISH++ is faster because it uses
536  clever heuristics about which data in input files are words to index and  clever heuristics about which data in input files are words to index and
537  which are not. It's based on English language and might be best choice if  which are not. It's based on English language and might be best choice if
538  you plan to install large amount of long text documents.  you plan to index large amount of long text documents.
539    
540  However, if you plan to index all data from structured storage (e.g. RDBMS)  However, if you plan to index all data from structured storage (e.g. RDBMS)
541  you might want B<all> words from data to end up in index as opposed to just  you might want B<all> words from data to end up in index as opposed to just
# Line 247  those which look like English words. Thi Line 543  those which look like English words. Thi
543  don't plan to index English texts with this module.  don't plan to index English texts with this module.
544    
545  With distribution build versions of SWISH++ you might have problems with  With distribution build versions of SWISH++ you might have problems with
546  disepearing words. To overcome this problem, you will have to compile and  disappearing words. To overcome this problem, you will have to compile and
547  configure SWISH++ yourself (because language characteristics are  configure SWISH++ yourself (because language characteristics are
548  compilation-time option).  compilation-time option).
549    
# Line 255  Compilation of SWISH++ is easy process w Line 551  Compilation of SWISH++ is easy process w
551  pages. To see my very relaxed sample configuration take a look at C<swish++>  pages. To see my very relaxed sample configuration take a look at C<swish++>
552  directory included in distribution.  directory included in distribution.
553    
554    =head2 SWISH++ config
555    
556    C<config.h> located in C<swish++> directory of this distribution is relaxed
557    SWISH++ configuration that will index all words passed to it. This
558    configuration is needed for B<date test> because default configuration
559    doesn't recognize 2004-12-05 as date. Have in mind that your index size
560    might explode.
561    
562    =head1 BUGS
563    
564    Currently there is no way to specify which meta data will be stored as
565    properties. B<This will be fixed very soon>.
566    
567    There is no garbage collection on temporary files created for SWISH++. This
568    means that one run of indexer will take additional disk space for temporary
569    files, which will be removed at end. There should be some way to remove
570    files after they are indexed by SWISH++. However, at this early stage of
571    development it's just not supported yet. Have plenty of disk space!
572    
573  =head1 SEE ALSO  =head1 SEE ALSO
574    
575  C<swish++> web site L<http://homepage.mac.com/pauljlucas/software/swish/>  SWISH++ web site L<http://homepage.mac.com/pauljlucas/software/swish/>
576    
577  =head1 AUTHOR  =head1 AUTHOR
578    

Legend:
Removed from v.5  
changed lines
  Added in v.21

  ViewVC Help
Powered by ViewVC 1.1.26