/[SWISH-PlusPlus]/trunk/PlusPlus.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/PlusPlus.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 4 by dpavlin, Fri Dec 3 19:35:02 2004 UTC revision 14 by dpavlin, Sun Dec 5 15:35:53 2004 UTC
# Line 4  use 5.008004; Line 4  use 5.008004;
4  use strict;  use strict;
5  use warnings;  use warnings;
6    
7  our $VERSION = '0.02';  our $VERSION = '0.06';
8    
9  use Carp;  use Carp;
10  use File::Temp qw/ tempdir /;  use File::Temp qw/ tempdir /;
11    #use YAML;
12    
13  =head1 NAME  =head1 NAME
14    
# Line 30  performance. However, this module is not Line 31  performance. However, this module is not
31    
32  =head1 METHODS  =head1 METHODS
33    
34  =head2 open  =head2 new
35    
36  Create new indexing object.  Create new indexing object.
37    
38    my $i = SWISH::PlusPlus->open(    my $i = SWISH::PlusPlus->new(
39          index_dir => '/path/to/index',          index_dir => '/path/to/index',
40          index => 'index++',          index => 'index++',
41          search => 'search++',          search => 'search++',
42            debug => 1,
43            meta_in_body => 1,
44            use_stopwords => 1,
45    );    );
46    
47  Options to open are following:  Options to new are following:
48    
49  =over 5  =over 5
50    
# Line 58  B<index++>. See C<Debian>. Line 62  B<index++>. See C<Debian>.
62    
63  Full or partial path to SWISH++ search executable. By default, it's B<search>.  Full or partial path to SWISH++ search executable. By default, it's B<search>.
64    
65    =item C<debug>
66    
67    This option (off by default) will produce a lot of debugging output on
68    C<STDERR> prefixed by C<##>.
69    
70    =item C<meta_in_body>
71    
72    This option (off by default) makes the content of meta fields searchable
73    without specifying them (as if they were in the body of the document). This
74    will somewhat increase index size.
75    
76    =item C<use_stopwords>
77    
78    Use built-in SWISH++ stop words. By default, they are disabled.
79    
80  =back  =back
81    
82  =cut  =cut
83    
84  sub open {  sub new {
85          my $class = shift;          my $class = shift;
86          my $self = {@_};          my $self = {@_};
87          bless($self, $class);          bless($self, $class);
# Line 71  sub open { Line 90  sub open {
90                  croak "need $_" unless $self->{$_};                  croak "need $_" unless $self->{$_};
91          }          }
92    
93          if (! -e $self->{'index_dir'}) {          my $index_dir = $self->{'index_dir'};
94                  mkdir $self->{'index_dir'} || confess "can't create index ",$self->{'index'},": $!";  
95            my $cwd;
96            chomp($cwd = `pwd`);
97            $self->{'cwd'} = $cwd || carp "can't get cwd!";
98            
99            if ($index_dir !~ m#^/#) {
100                    $index_dir = "$cwd/$index_dir";
101                    print STDERR "## full path to index_dir: $index_dir\n" if ($self->{'debug'});
102                    $self->{'index_dir'} = $index_dir;
103            }
104    
105            if (! -e $index_dir) {
106                    mkdir $index_dir || confess "can't create index ",$self->{'index'},": $!";
107          }          }
108    
109          # default executables          # default executables
110          $self->{'index'} ||= 'index';          $self->{'index'} ||= 'index';
111          $self->{'search'} ||= 'search';          $self->{'search'} ||= 'search';
112    
113            print STDERR "## new index_dir: ",$index_dir," index: ",$self->{'index'}, " search: ",$self->{'search'},"\n" if ($self->{'debug'});
114    
115          $self ? return $self : return undef;          $self ? return $self : return undef;
116  }  }
117    
118    
119  =head2 check_bin  =head2 check_bin
120    
121  Check if swish++ binaries specified in L<open> are available and verify  Check if swish++ binaries specified in L<new> are available and verify
122  version signature.  version signature.
123    
124    if ($i->check_bin) {    if ($i->check_bin) {
# Line 113  sub check_bin { Line 146  sub check_bin {
146          confess $self->{'search'}," binary is not SWISH++" unless ($s =~ m/^SWISH\+\+/);          confess $self->{'search'}," binary is not SWISH++" unless ($s =~ m/^SWISH\+\+/);
147    
148          if ($i eq $s) {          if ($i eq $s) {
149                    $i =~ s/^SWISH\+\+\s+// || confess "can't strip SWISH++ from version";
150                  $self->{'version'} = $i;                  $self->{'version'} = $i;
151                  return 1;                  return 1;
152          } else  {          } else  {
# Line 146  sub index_document { Line 180  sub index_document {
180          return 1;          return 1;
181  }  }
182    
183    =head2 add
184    
185    Add document with metadata to index.
186    
187      $i->add(
188            path => 'path/to/document',
189            title => 'this is result title',
190            meta => {
191                    description => 'this is description meta tag',
192                    date => '2004-11-04',
193                    author => 'Dobrica Pavlinusic',
194            },
195            body => 'this is text without meta data',
196      );
197    
198    This is a thin wrapper around L<_create_doc>.
199    
200    =cut
201    
202    sub add {
203            my $self = shift;
204    
205            $self->_create_doc(@_);
206    
207            return 1;
208    }
209    =head2 search
210    
211    Search your index.
212    
213      my @results = $i->search("swish query");
214    
215    Returns array with result IDs.
216    
217    =cut
218    
219    sub search {
220            my $self = shift;
221    
222            my $query = shift || return;
223    
224            $self->finish_update;
225    
226            my @results;
227    
228            # escape double quotes in query for shell
229            $query =~ s/"/\\"/g;
230    
231            my $open_cmd = $self->{'search'}." -i ".$self->{'index_dir'}.'/index "'.$query.'" |';
232            print STDERR "## search $open_cmd\n" if ($self->{'debug'});
233    
234            open(SEARCH, $open_cmd) || confess "can't start $open_cmd: $!";
235            while(<SEARCH>) {
236                    next if (/^#/);
237                    chomp;
238                    print STDERR "## $_\n" if ($self->{'debug'});
239                    my ($rank,$path,$size,$title) = split(/ /,$_,4);
240                    push @results, {
241                            rank => $rank,
242                            path => $path,
243                            size => $size,
244                            title => $title,
245                    }
246            }
247    
248            close(SEARCH) || confess "can't close search";
249    
250            #print STDERR "## results: ",Dump(@results),"\n" if ($self->{'debug'});
251    
252            return @results;
253    }
254    
255    =head2 finish_update
256    
257    This method will close index.
258    
259      $i->finish_update;
260    
261    It will be called on DESTROY when $i goes out of scope.
262    
263    =cut
264    
265    sub finish_update {
266            my $self = shift;
267    
268            print STDERR "## finish_update\n" if ($self->{'debug'});
269    
270            $self->_close_index;
271    }
272    
273    sub DESTROY {
274            my $self = shift;
275            $self->finish_update;
276    }
277    
278  =head1 PRIVATE METHODS  =head1 PRIVATE METHODS
279    
280  Private methods implement internals for creating temporary file needed for  Private methods implement internals for creating temporary file needed for
281  swish++. You should have no need to call them directly, and they are here  swish++. You should have no need to call them directly, and they are here
282  just to have documentation.  just to have documentation.
283    
284  =head2 _init_index  =head2 _init_indexer
285    
286  Create temporary directory in which files for indexing will be created and  Create temporary directory in which files for indexing will be created and
287  start index process.  start index process.
288    
289    my $i->_init_index || die "can't start indexer";    my $i->_init_indexer || die "can't start indexer";
290    
291    It will also create empty file C<_stopwords_> to disable stop words.
292    
293  =cut  =cut
294    
295  sub _init_index {  sub _init_indexer {
296          my $self = shift;          my $self = shift;
297    
298          $self->{'tmp_dir'} = tempdir( CLEANUP => 1 ) || confess "can't create temporary directory: $!";          return if ($self->{'_index_fh'});
299    
300            my $tmp_dir = tempdir( CLEANUP => 1 ) || confess "can't create temporary directory: $!";
301            $self->{'tmp_dir'} = $tmp_dir;
302    
303            chdir $tmp_dir || confess "can't chdir to ".$tmp_dir.": $!";
304    
305            print STDERR "## tmp_dir: $tmp_dir" if ($self->{'debug'});
306    
307            my $opt = "-v " . ($self->{'debug'} || '0');
308    
309            unless ($self->{'use_stopwrods'}) {
310                    open(STOP, '>', "_stopwords_") || carp "can't create empty stopword file, skipping\n";
311                    print STOP "  ";
312                    close(STOP);
313                    $opt .= " -s _stopwords_";
314            }
315    
316          my $opt = "-v 4";          my $open_cmd = '| '.$self->{'index'}.' '.$opt.' -e "html:*" -i '.$self->{'index_dir'}.'/index -';
317    
318          my $open_cmd = '| index '.$opt.' -e "html:*" -i '.$self->{'index_dir'}.'/index -';          print STDERR "## init_indexer: $open_cmd\n" if ($self->{'debug'});
319    
320          chdir $self->{'tmp_dir'} || confess "can't chdir to ".$self->{'tmp_dir'}.": $!";          open($self->{'_index_fh'}, $open_cmd) || confess "can't start index with $open_cmd: $!";
321    
322          CORE::open($self->{'index_fh'}, $open_cmd) || confess "can't start index with $open_cmd: $!";          chdir $self->{'cwd'} || confess "can't chdir to ".$self->{'cwd'}.": $!";
323    
324          return $self->{'index_fh'};          return $self->{'_index_fh'};
325  }  }
326    
327  =head2 _create_doc  =head2 _create_doc
# Line 183  Create temporary file and pass it's name Line 330  Create temporary file and pass it's name
330    
331    $i->_create_doc(    $i->_create_doc(
332          path => 'path/to/store/in/index',          path => 'path/to/store/in/index',
333            title => 'this is title in results',
334          body => 'data to story in body tag',          body => 'data to story in body tag',
335          meta => {          meta => {
336                  'meta name' => 'data for this meta',                  'meta name' => 'data for this meta',
# Line 190  Create temporary file and pass it's name Line 338  Create temporary file and pass it's name
338          }          }
339    );    );
340    
341    To delete document, just omit body and meta data.
342    
343  =cut  =cut
344    
345  sub _create_doc {  sub _create_doc {
# Line 198  sub _create_doc { Line 348  sub _create_doc {
348          my $arg = {@_};          my $arg = {@_};
349    
350          # open indexer if needed          # open indexer if needed
351          $self->{'index_fh'} ||= $self->_init_index;          $self->_init_indexer;
352    
353          my $path = $self->{'tmp_dir'} || confess "no tmp_dir?";          my $path = $self->{'tmp_dir'} || confess "no tmp_dir?";
354            $path .= '/' . $arg->{'path'};
355    
356            print STDERR "## _create_doc: $path\n" if ($self->{'debug'});
357    
358            open(TMP, '>', $path) || die "can't create temp file $path: $!";
359    
360          CORE::open(TMP, '>', $arg->{'path'}) || die "can't create temp file ".$arg->{'path'}.": $!";          print TMP '<html><head>';
361    
362          print TMP '<html>';          $arg->{'body'} ||= '';
363    
364          if ($arg->{'meta'}) {          if ($arg->{'meta'}) {
365                  confess "not yet implemented";                  foreach my $name (keys %{$arg->{'meta'}}) {
366                            my $content = $arg->{'meta'}->{$name};
367                            print TMP qq{<meta name="$name" content="$content">};
368                            $arg->{'body'} .= " $content" if ($self->{'meta_in_body'});
369                    }
370          }          }
371            
372          print TMP '<body>' . ($arg->{'body'} || '') . '</body></html>';          if (defined($arg->{'title'})) {
373                    print TMP '<title>' . ($arg->{'title'} || '') . '</title>';
374                    $arg->{'body'} .= " ".$arg->{'title'} if ($self->{'meta_in_body'});
375            }
376    
377            print TMP '</head><body>' . $arg->{'body'} . '</body></html>';
378                    
379          close(TMP) || confess "can't close tmp file ".$arg->{'path'}.": $!";          close(TMP) || confess "can't close tmp file ".$arg->{'path'}.": $!";
380    
381          print { $self->{'index_fh'} } $arg->{'path'}."\n";          print { $self->{'_index_fh'} } $arg->{'path'}."\n";
382    }
383    
384    =head2 _close_index
385    
386    Close index after indexing.
387    
388      $i->_close_index;
389    
390    You have to close index before searching.
391    
392    =cut
393    
394    sub _close_index {
395            my $self = shift;
396    
397            return unless ($self->{'_index_fh'});
398    
399            print STDERR "## close index\n" if ($self->{'debug'});
400    
401            close($self->{'_index_fh'});
402            undef $self->{'_index_fh'};
403  }  }
404    
405  1;  1;
# Line 231  None by default. Line 416  None by default.
416  Debian version of swish++ is often old (version 5 at moment of this writing  Debian version of swish++ is often old (version 5 at moment of this writing
417  while version 6 is available in source code), so this module by default  while version 6 is available in source code), so this module by default
418  uses executable names B<index> and B<search> for self-compiled version  uses executable names B<index> and B<search> for self-compiled version
419  instead of one from Debian package. See L<open> how to specify Debian  instead of one from Debian package. See L<new> how to specify Debian
420  default binaries B<index++> and B<search++>.  default binaries B<index++> and B<search++>.
421    
422  =head1 SEE ALSO  =head2 SWISH++
423    
424  Mention other useful documentation such as the documentation of  Aside from very good rewrite in C++, SWISH++ is faster because it has
425  related modules or operating system documentation (such as man pages  clever heuristics about which data in input files are words to index and
426  in UNIX), or any relevant external documentation such as RFCs or  which are not. It's based on English language and might be best choice if
427  standards.  you plan to install large amount of long text documents.
428    
429    However, if you plan to index all data from structured storage (e.g. RDBMS)
430    you might want B<all> words from data to end up in index as opposed to just
431    those which look like English words. This is especially important if you
432    don't plan to index English texts with this module.
433    
434    With distribution-built versions of SWISH++ you might have problems with
435    disappearing words. To overcome this problem, you will have to compile and
436    configure SWISH++ yourself (because language characteristics are
437    compilation-time option).
438    
439    Compilation of SWISH++ is easy process well described on project's web
440    pages. To see my very relaxed sample configuration take a look at C<swish++>
441    directory included in distribution.
442    
443    =head2 SWISH++ config
444    
445    C<config.h> located in C<swish++> directory of this distribution is relaxed
446    SWISH++ configuration that will index all words passed to it. This
447    configuration is needed for B<date test> because default configuration
448    doesn't recognize 2004-12-05 as a date. Keep in mind that your index size
449    might explode.
450    
451  If you have a mailing list set up for your module, mention it here.  =head1 SEE ALSO
452    
453  If you have a web site set up for your module, mention it here.  C<swish++> web site L<http://homepage.mac.com/pauljlucas/software/swish/>
454    
455  =head1 AUTHOR  =head1 AUTHOR
456    
457  Dobrica Pavlinusic, E<lt>dpavlin@E<gt>  Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
458    
459  =head1 COPYRIGHT AND LICENSE  =head1 COPYRIGHT AND LICENSE
460    

Legend:
Removed from v.4  
changed lines
  Added in v.14

  ViewVC Help
Powered by ViewVC 1.1.26