/[SWISH-PlusPlus]/trunk/PlusPlus.pm
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/PlusPlus.pm

Parent Directory | Revision Log | View Patch

revision 5 by dpavlin, Fri Dec 3 21:48:15 2004 UTC revision 16 by dpavlin, Sun Dec 5 21:06:48 2004 UTC
# Line 4  use 5.008004; Line 4  use 5.008004;
4  use strict;  use strict;
5  use warnings;  use warnings;
6    
7  our $VERSION = '0.02';  our $VERSION = '0.10';
8    
9  use Carp;  use Carp;
10  use File::Temp qw/ tempdir /;  use File::Temp qw/ tempdir /;
11    use BerkeleyDB;
12    #use YAML;
13    
14  =head1 NAME  =head1 NAME
15    
# Line 30  performance. However, this module is not Line 32  performance. However, this module is not
32    
33  =head1 METHODS  =head1 METHODS
34    
35  =head2 open  =head2 new
36    
37  Create new indexing object.  Create new indexing object.
38    
39    my $i = SWISH::PlusPlus->open(    my $i = SWISH::PlusPlus->new(
40          index_dir => '/path/to/index',          index_dir => '/path/to/index',
41          index => 'index++',          index => 'index++',
42          search => 'search++',          search => 'search++',
43            debug => 1,
44            meta_in_body => 1,
45            use_stopwords => 1,
46    );    );
47    
48  Options to open are following:  Options to new are following:
49    
50  =over 5  =over 5
51    
# Line 58  B<index++>. See C<Debian>. Line 63  B<index++>. See C<Debian>.
63    
64  Full or partial path to SWISH++ search executable. By default, it's B<search>.  Full or partial path to SWISH++ search executable. By default, it's B<search>.
65    
66    =item C<debug>
67    
68    This option (off by default) will produce a lot of debugging output on
69    C<STDERR> prefixed by C<##>.
70    
71    =item C<meta_in_body>
72    
73    This option (off by default) enables searching the content of meta fields
74    without specifying them (as if they were in the body of the document). This will
75    somewhat increase index size.
76    
77    =item C<use_stopwords>
78    
79    Use built-in SWISH++ stop words. By default, they are disabled.
80    
81  =back  =back
82    
83  =cut  =cut
84    
85  sub open {  sub new {
86          my $class = shift;          my $class = shift;
87          my $self = {@_};          my $self = {@_};
88          bless($self, $class);          bless($self, $class);
# Line 71  sub open { Line 91  sub open {
91                  croak "need $_" unless $self->{$_};                  croak "need $_" unless $self->{$_};
92          }          }
93    
94          if (! -e $self->{'index_dir'}) {          my $index_dir = $self->{'index_dir'};
95                  mkdir $self->{'index_dir'} || confess "can't create index ",$self->{'index'},": $!";  
96            my $cwd;
97            chomp($cwd = `pwd`);
98            $self->{'cwd'} = $cwd || carp "can't get cwd!";
99            
100            if ($index_dir !~ m#^/#) {
101                    $index_dir = "$cwd/$index_dir";
102                    print STDERR "## full path to index_dir: $index_dir\n" if ($self->{'debug'});
103                    $self->{'index_dir'} = $index_dir;
104            }
105    
106            if (! -e $index_dir) {
107                    mkdir $index_dir || confess "can't create index ",$self->{'index'},": $!";
108          }          }
109    
110          # default executables          # default executables
111          $self->{'index'} ||= 'index';          $self->{'index'} ||= 'index';
112          $self->{'search'} ||= 'search';          $self->{'search'} ||= 'search';
113    
114            print STDERR "## new index_dir: ",$index_dir," index: ",$self->{'index'}, " search: ",$self->{'search'},"\n" if ($self->{'debug'});
115    
116          $self ? return $self : return undef;          $self ? return $self : return undef;
117  }  }
118    
119    
120  =head2 check_bin  =head2 check_bin
121    
122  Check if swish++ binaries specified in L<open> are available and verify  Check if swish++ binaries specified in L<new> are available and verify
123  version signature.  version signature.
124    
125    if ($i->check_bin) {    if ($i->check_bin) {
# Line 113  sub check_bin { Line 147  sub check_bin {
147          confess $self->{'search'}," binary is not SWISH++" unless ($s =~ m/^SWISH\+\+/);          confess $self->{'search'}," binary is not SWISH++" unless ($s =~ m/^SWISH\+\+/);
148    
149          if ($i eq $s) {          if ($i eq $s) {
150                    $i =~ s/^SWISH\+\+\s+// || confess "can't strip SWISH++ from version";
151                  $self->{'version'} = $i;                  $self->{'version'} = $i;
152                  return 1;                  return 1;
153          } else  {          } else  {
# Line 146  sub index_document { Line 181  sub index_document {
181          return 1;          return 1;
182  }  }
183    
184    =head2 add
185    
186    Add document with metadata to index.
187    
188      $i->add(
189            path => 'path/to/document',
190            title => 'this is result title',
191            meta => {
192                    description => 'this is description meta tag',
193                    date => '2004-11-04',
194                    author => 'Dobrica Pavlinusic',
195            },
196            body => 'this is text without meta data',
197      );
198    
199    This is a thin wrapper around L<_create_doc>.
200    
201    =cut
202    
203    sub add {
204            my $self = shift;
205    
206            $self->_create_doc(@_);
207    
208            return 1;
209    }
210    =head2 search
211    
212    Search your index.
213    
214      my @results = $i->search("swish query");
215    
216    Returns an array of hash references, one per hit, each with C<rank>, C<path>, C<size> and C<title> keys.
217    
218    =cut
219    
220    sub search {
221            my $self = shift;
222    
223            my $query = shift || return;
224    
225            $self->finish_update;
226            $self->_tie_meta_db(DB_RDONLY);
227    
228            my @results;
229    
230            # escape double quotes in query for shell
231            $query =~ s/"/\\"/g;
232    
233            my $open_cmd = $self->{'search'} .
234                    ' -i ' . $self->{'index_dir'}.'/index' .
235                    ' "' . $query . '"'.
236                    ' |';
237            print STDERR "## search: $open_cmd\n" if ($self->{'debug'});
238    
239            open(SEARCH, $open_cmd) || confess "can't start $open_cmd: $!";
240            my $l;
241            while($l = <SEARCH>) {
242                    next if ($l =~ /^#/);
243                    chomp($l);
244                    print STDERR "## $l\n" if ($self->{'debug'});
245                    my ($rank,$path,$size,$title) = split(/ /,$l,4);
246                    $path =~ s#^\./##; # strip from path
247                    push @results, {
248                            rank => $rank,
249                            path => $path,
250                            size => $size,
251                            title => $title,
252                    }
253            }
254    
255            close(SEARCH) || confess "can't close search";
256    
257            #print STDERR "## results: ",Dump(@results),"\n" if ($self->{'debug'});
258    
259            return @results;
260    }
261    
262    =head2 property
263    
264    Return stored meta property from result or result path.
265    
266      print $i->property('path', 'title');
267      print $i->property($res->{'path'}, 'title');
268    
269    =cut
270    
271    sub property {
272            my $self = shift;
273    
274            my ($path,$meta) = @_;
275    
276            if ($path =~ m/^HASH/) {
277                    $path = $path->{'path'} || confess "can't find path in input data";
278            }
279    
280            my $val = $self->{'meta_db'}->{"$path-$meta"};
281    
282            print STDERR "## property $path-$meta: ",($val || 'undef'),"\n" if ($self->{'debug'});
283            return $val;
284    }
285    
286    =head2 finish_update
287    
288    This method will close index.
289    
290      $i->finish_update;
291    
292    It will be called on DESTROY when $i goes out of scope.
293    
294    =cut
295    
296    sub finish_update {
297            my $self = shift;
298    
299            print STDERR "## finish_update\n" if ($self->{'debug'});
300    
301            $self->_close_index && $self->_untie_meta_db;
302    }
303    
304    sub DESTROY {
305            my $self = shift;
306            $self->finish_update;
307    }
308    
309  =head1 PRIVATE METHODS  =head1 PRIVATE METHODS
310    
311  Private methods implement internals for creating temporary file needed for  Private methods implement internals for creating temporary file needed for
312  swish++. You should have no need to call them directly, and they are here  swish++. You should have no need to call them directly, and they are here
313  just to have documentation.  just to have documentation.
314    
315  =head2 _init_index  =head2 _init_indexer
316    
317  Create temporary directory in which files for indexing will be created and  Create temporary directory in which files for indexing will be created and
318  start index process.  start index process.
319    
320    $i->_init_index || die "can't start indexer";    $i->_init_indexer || die "can't start indexer";
321    
322    It will also create empty file C<_stopwords_> to disable stop words.
323    
324  =cut  =cut
325    
326  sub _init_index {  sub _init_indexer {
327          my $self = shift;          my $self = shift;
328    
329          $self->{'tmp_dir'} = tempdir( CLEANUP => 1 ) || confess "can't create temporary directory: $!";          return if ($self->{'_index_fh'});
330    
331            my $tmp_dir = tempdir( CLEANUP => 1 ) || confess "can't create temporary directory: $!";
332            $self->{'tmp_dir'} = $tmp_dir;
333    
334          my $opt = "-v 4";          chdir $tmp_dir || confess "can't chdir to ".$tmp_dir.": $!";
335    
336            print STDERR "## tmp_dir: $tmp_dir" if ($self->{'debug'});
337    
338            my $opt = "-v " . ($self->{'debug'} || '0');
339    
340            unless ($self->{'use_stopwrods'}) {
341                    open(STOP, '>', "_stopwords_") || carp "can't create empty stopword file, skipping\n";
342                    print STOP "  ";
343                    close(STOP);
344                    $opt .= " -s _stopwords_";
345            }
346    
347            my $index_dir = $self->{'index_dir'} || confess "no index_dir?";
348    
349            my $open_cmd = '| '.$self->{'index'}.' '.$opt.' -e "html:*" -i '.$index_dir.'/index -';
350    
351            print STDERR "## init_indexer: $open_cmd\n" if ($self->{'debug'});
352    
353            open($self->{'_index_fh'}, $open_cmd) || confess "can't start index with $open_cmd: $!";
354    
355            chdir $self->{'cwd'} || confess "can't chdir to ".$self->{'cwd'}.": $!";
356    
357            $self->_tie_meta_db(DB_CREATE);
358    
359            return $self->{'_index_fh'};
360    }
361    
362          my $open_cmd = '| index '.$opt.' -e "html:*" -i '.$self->{'index_dir'}.'/index -';  =head2 _tie_meta_db
363    
364          chdir $self->{'tmp_dir'} || confess "can't chdir to ".$self->{'tmp_dir'}.": $!";  Open BerkeleyDB database with meta properties.
365    
366          CORE::open($self->{'index_fh'}, $open_cmd) || confess "can't start index with $open_cmd: $!";    $i->_tie_meta_db(DB_CREATE);
367      $i->_tie_meta_db(DB_RDONLY);
368    
369          return $self->{'index_fh'};  }
370    
371    =cut
372    
373    sub _tie_meta_db  {
374            my $self = shift;
375    
376            my $flags = shift || confess "need DB_CREATE or DB_RDONLY";
377    
378            return if ($self->{'_meta_db_flags'} && $self->{'_meta_db_flags'} == $flags);
379    
380            print STDERR "## _tie_meta_db($flags)\n" if ($self->{'debug'});
381    
382            $self->_untie_meta_db;
383            $self->{'_meta_db_flags'} = $flags;
384    
385            my $file = $self->{'index_dir'}.'/meta.db';
386    
387            tie %{$self->{'meta_db'}}, "BerkeleyDB::Hash",
388                    -Filename => $file,
389                    -Flags    => $flags
390            or confess "cannot open $file: $! $BerkeleyDB::Error\n" ;
391    
392            return 1;
393    }
394    
395    =head2 _untie_meta_db
396    
397    Close BerkeleyDB database with meta properties.
398    
399      $i->_untie_meta_db
400    
401    =cut
402    
403    sub _untie_meta_db {
404            my $self = shift;
405    
406            return unless ($self->{'meta_db'});
407    
408            print STDERR "## _untie_meta_db\n" if ($self->{'debug'});
409            untie %{$self->{'meta_db'}} || confess "can't untie!";
410            undef $self->{'meta_db'};
411            undef $self->{'_meta_db_flags'};
412    
413            return 1;
414  }  }
415    
416  =head2 _create_doc  =head2 _create_doc
# Line 183  Create temporary file and pass it's name Line 419  Create temporary file and pass it's name
419    
420    $i->_create_doc(    $i->_create_doc(
421          path => 'path/to/store/in/index',          path => 'path/to/store/in/index',
422            title => 'this is title in results',
423          body => 'data to store in body tag',          body => 'data to store in body tag',
424          meta => {          meta => {
425                  'meta name' => 'data for this meta',                  'meta name' => 'data for this meta',
# Line 190  Create temporary file and pass it's name Line 427  Create temporary file and pass it's name
427          }          }
428    );    );
429    
430    To delete a document, just omit body and meta data.
431    
432  =cut  =cut
433    
434  sub _create_doc {  sub _create_doc {
# Line 198  sub _create_doc { Line 437  sub _create_doc {
437          my $arg = {@_};          my $arg = {@_};
438    
439          # open indexer if needed          # open indexer if needed
440          $self->{'index_fh'} ||= $self->_init_index;          $self->_init_indexer;
441    
442          my $path = $self->{'tmp_dir'} || confess "no tmp_dir?";          my $path = $self->{'tmp_dir'} || confess "no tmp_dir?";
443            my $id = $arg->{'path'} || confess "no path?";
444            $path .= "/$id";
445    
446            print STDERR "## _create_doc: $path\n" if ($self->{'debug'});
447    
448            open(TMP, '>', $path) || die "can't create temp file $path: $!";
449    
450          CORE::open(TMP, '>', $arg->{'path'}) || die "can't create temp file ".$arg->{'path'}.": $!";          print TMP '<html><head>';
451    
452          print TMP '<html>';          $arg->{'body'} ||= '';
453    
454          if ($arg->{'meta'}) {          if ($arg->{'meta'}) {
455                  confess "not yet implemented";                  foreach my $name (keys %{$arg->{'meta'}}) {
456                            my $content = $arg->{'meta'}->{$name};
457                            print TMP qq{<meta name="$name" content="$content">};
458                            $arg->{'body'} .= " $content" if ($self->{'meta_in_body'});
459                            $self->{'meta_db'}->{"$id-$name"} = $content;
460                    }
461          }          }
462            
463          print TMP '<body>' . ($arg->{'body'} || '') . '</body></html>';          my $title = $arg->{'title'};
464            if (defined($title)) {
465                    print TMP "<title>$title</title>";
466                    $arg->{'body'} .= " $title" if ($self->{'meta_in_body'});
467                    $self->{'meta_db'}->{"$id-title"} = $title;
468            }
469    
470            print TMP '</head><body>' . $arg->{'body'} . '</body></html>';
471                    
472          close(TMP) || confess "can't close tmp file ".$arg->{'path'}.": $!";          close(TMP) || confess "can't close tmp file ".$arg->{'path'}.": $!";
473    
474          print { $self->{'index_fh'} } $arg->{'path'}."\n";          print { $self->{'_index_fh'} } "$id\n";
475    }
476    
477    =head2 _close_index
478    
479    Close index after indexing.
480    
481      $i->_close_index;
482    
483    You have to close index before searching.
484    
485    =cut
486    
487    sub _close_index {
488            my $self = shift;
489    
490            return unless ($self->{'_index_fh'});
491    
492            print STDERR "## close index\n" if ($self->{'debug'});
493    
494            close($self->{'_index_fh'}) || confess "can't close index: $!";
495            undef $self->{'_index_fh'};
496    
497            return 1;
498  }  }
499    
500  1;  1;
# Line 231  None by default. Line 511  None by default.
511  Debian version of swish++ is often old (version 5 at moment of this writing  Debian version of swish++ is often old (version 5 at moment of this writing
512  while version 6 is available in source code), so this module by default  while version 6 is available in source code), so this module by default
513  uses executable names B<index> and B<search> for self-compiled version  uses executable names B<index> and B<search> for self-compiled version
514  instead of one from Debian package. See L<open> how to specify Debian  instead of one from Debian package. See L<new> how to specify Debian
515  default binaries B<index++> and B<search++>.  default binaries B<index++> and B<search++>.
516    
517  =head2 SWISH++  =head2 SWISH++
# Line 255  Compilation of SWISH++ is easy process w Line 535  Compilation of SWISH++ is easy process w
535  pages. To see my very relaxed sample configuration take a look at C<swish++>  pages. To see my very relaxed sample configuration take a look at C<swish++>
536  directory included in distribution.  directory included in distribution.
537    
538    =head2 SWISH++ config
539    
540    C<config.h> located in the C<swish++> directory of this distribution is a relaxed
541    SWISH++ configuration that will index all words passed to it. This
542    configuration is needed for the B<date test> because the default configuration
543    doesn't recognize 2004-12-05 as a date. Keep in mind that your index size
544    might explode.
545    
546  =head1 SEE ALSO  =head1 SEE ALSO
547    
548  C<swish++> web site L<http://homepage.mac.com/pauljlucas/software/swish/>  C<swish++> web site L<http://homepage.mac.com/pauljlucas/software/swish/>

Legend:
Removed from v.5  
changed lines
  Added in v.16

  ViewVC Help
Powered by ViewVC 1.1.26