/[SWISH-PlusPlus]/trunk/PlusPlus.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/PlusPlus.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 8 by dpavlin, Sat Dec 4 17:49:20 2004 UTC revision 13 by dpavlin, Sun Dec 5 14:35:54 2004 UTC
# Line 4  use 5.008004; Line 4  use 5.008004;
4  use strict;  use strict;
5  use warnings;  use warnings;
6    
7  our $VERSION = '0.02';  our $VERSION = '0.06';
8    
9  use Carp;  use Carp;
10  use File::Temp qw/ tempdir /;  use File::Temp qw/ tempdir /;
# Line 31  performance. However, this module is not Line 31  performance. However, this module is not
31    
32  =head1 METHODS  =head1 METHODS
33    
34  =head2 open  =head2 new
35    
36  Create new indexing object.  Create new indexing object.
37    
38    my $i = SWISH::PlusPlus->open(    my $i = SWISH::PlusPlus->new(
39          index_dir => '/path/to/index',          index_dir => '/path/to/index',
40          index => 'index++',          index => 'index++',
41          search => 'search++',          search => 'search++',
42          debug => 1,          debug => 1,
43            meta_in_body => 1,
44            use_stopwords => 1,
45    );    );
46    
47  Options to open are following:  Options to new are following:
48    
49  =over 5  =over 5
50    
# Line 65  Full or partial path to SWISH++ search e Line 67  Full or partial path to SWISH++ search e
67  This option (off by default) will produce a lot of debugging output on  This option (off by default) will produce a lot of debugging output on
68  C<STDERR> prefixed by C<##>.  C<STDERR> prefixed by C<##>.
69    
70    =item C<meta_in_body>
71    
72    This option (off by default) enables searching the content of meta fields
73    without specifying them (as if they were in the body of the document). This will
74    somewhat increase index size.
75    
76    =item C<use_stopwords>
77    
78    Use built-in SWISH++ stop words. By default, they are disabled.
79    
80  =back  =back
81    
82  =cut  =cut
83    
84  sub open {  sub new {
85          my $class = shift;          my $class = shift;
86          my $self = {@_};          my $self = {@_};
87          bless($self, $class);          bless($self, $class);
# Line 78  sub open { Line 90  sub open {
90                  croak "need $_" unless $self->{$_};                  croak "need $_" unless $self->{$_};
91          }          }
92    
93          if (! -e $self->{'index_dir'}) {          my $index_dir = $self->{'index_dir'};
94                  mkdir $self->{'index_dir'} || confess "can't create index ",$self->{'index'},": $!";  
95            if ($index_dir !~ m#^/#) {
96                    chomp(my $cwd = `pwd`);
97                    $index_dir = "$cwd/$index_dir";
98                    print STDERR "## full path to index_dir: $index_dir\n" if ($self->{'debug'});
99                    $self->{'index_dir'} = $index_dir;
100            }
101    
102            if (! -e $index_dir) {
103                    mkdir $index_dir || confess "can't create index ",$self->{'index'},": $!";
104          }          }
105    
106          # default executables          # default executables
107          $self->{'index'} ||= 'index';          $self->{'index'} ||= 'index';
108          $self->{'search'} ||= 'search';          $self->{'search'} ||= 'search';
109    
110          print STDERR "## open index_dir: ",$self->{'index_dir'}," index: ",$self->{'index'}, " search: ",$self->{'search'},"\n" if ($self->{'debug'});          print STDERR "## new index_dir: ",$index_dir," index: ",$self->{'index'}, " search: ",$self->{'search'},"\n" if ($self->{'debug'});
111    
112          $self ? return $self : return undef;          $self ? return $self : return undef;
113  }  }
# Line 94  sub open { Line 115  sub open {
115    
116  =head2 check_bin  =head2 check_bin
117    
118  Check if swish++ binaries specified in L<open> are available and verify  Check if swish++ binaries specified in L<new> are available and verify
119  version signature.  version signature.
120    
121    if ($i->check_bin) {    if ($i->check_bin) {
# Line 155  sub index_document { Line 176  sub index_document {
176          return 1;          return 1;
177  }  }
178    
179    =head2 add
180    
181    Add document with metadata to index.
182    
183      $i->add(
184            path => 'path/to/document',
185            title => 'this is result title',
186            meta => {
187                    description => 'this is description meta tag',
188                    date => '2004-11-04',
189                    author => 'Dobrica Pavlinusic',
190            }
191            body => 'this is text without meta data',
192      );
193    
194    This is a thin wrapper around L<_create_doc>.
195    
196    =cut
197    
198    sub add {
199            my $self = shift;
200    
201            $self->_create_doc(@_);
202    
203            return 1;
204    }
205  =head2 search  =head2 search
206    
207  Search your index.  Search your index.
# Line 180  sub search { Line 227  sub search {
227          my $open_cmd = $self->{'search'}." -i ".$self->{'index_dir'}.'/index "'.$query.'" |';          my $open_cmd = $self->{'search'}." -i ".$self->{'index_dir'}.'/index "'.$query.'" |';
228          print STDERR "## search $open_cmd\n" if ($self->{'debug'});          print STDERR "## search $open_cmd\n" if ($self->{'debug'});
229    
230          CORE::open(SEARCH, $open_cmd) || confess "can't start $open_cmd: $!";          open(SEARCH, $open_cmd) || confess "can't start $open_cmd: $!";
231          while(<SEARCH>) {          while(<SEARCH>) {
232                  next if (/^#/);                  next if (/^#/);
233                  chomp;                  chomp;
# Line 201  sub search { Line 248  sub search {
248          return @results;          return @results;
249  }  }
250    
251    =head2 finish_update
252    
253    This method will close the index.
254    
255      $i->finish_update;
256    
257    It will be called on DESTROY when $i goes out of scope.
258    
259    =cut
260    
261    sub finish_update {
262            my $self = shift;
263    
264            $self->_close_index;
265    }
266    
267    sub DESTROY {
268            my $self = shift;
269            $self->finish_update;
270    }
271    
272  =head1 PRIVATE METHODS  =head1 PRIVATE METHODS
273    
274  Private methods implement internals for creating temporary file needed for  Private methods implement internals for creating temporary file needed for
275  swish++. You should have no need to call them directly, and they are here  swish++. You should have no need to call them directly, and they are here
276  just to have documentation.  just to have documentation.
277    
278  =head2 _init_index  =head2 _init_indexer
279    
280  Create temporary directory in which files for indexing will be created and  Create temporary directory in which files for indexing will be created and
281  start index process.  start index process.
282    
283    my $i->_init_index || die "can't start indexer";    my $i->_init_indexer || die "can't start indexer";
284    
285    It will also create an empty file C<_stopwords_> to disable stop words.
286    
287  =cut  =cut
288    
289  sub _init_index {  sub _init_indexer {
290          my $self = shift;          my $self = shift;
291    
292          $self->{'tmp_dir'} = tempdir( CLEANUP => 1 ) || confess "can't create temporary directory: $!";          $self->{'tmp_dir'} = tempdir( CLEANUP => 1 ) || confess "can't create temporary directory: $!";
293    
294          my $opt = "-v 4";          chdir $self->{'tmp_dir'} || confess "can't chdir to ".$self->{'tmp_dir'}.": $!";
295    
296            my $opt = "-v " . ($self->{'debug'} || '0');
297    
298            unless ($self->{'use_stopwrods'}) {
299                    open(STOP, '>', "_stopwords_") || carp "can't create empty stopword file, skipping\n";
300                    print STOP "  ";
301                    close(STOP);
302                    $opt .= " -s _stopwords_";
303            }
304    
305          my $open_cmd = '| '.$self->{'index'}.' '.$opt.' -e "html:*" -i '.$self->{'index_dir'}.'/index -';          my $open_cmd = '| '.$self->{'index'}.' '.$opt.' -e "html:*" -i '.$self->{'index_dir'}.'/index -';
306    
307          chdir $self->{'tmp_dir'} || confess "can't chdir to ".$self->{'tmp_dir'}.": $!";          print STDERR "## open index $open_cmd\n" if ($self->{'index'});
308    
309            open($self->{'index_fh'}, $open_cmd) || confess "can't start index with $open_cmd: $!";
310    
         CORE::open($self->{'index_fh'}, $open_cmd) || confess "can't start index with $open_cmd: $!";  
311    
312          return $self->{'index_fh'};          return $self->{'index_fh'};
313  }  }
# Line 238  Create temporary file and pass it's name Line 318  Create temporary file and pass it's name
318    
319    $i->_create_doc(    $i->_create_doc(
320          path => 'path/to/store/in/index',          path => 'path/to/store/in/index',
321            title => 'this is title in results',
322          body => 'data to story in body tag',          body => 'data to story in body tag',
323          meta => {          meta => {
324                  'meta name' => 'data for this meta',                  'meta name' => 'data for this meta',
# Line 255  sub _create_doc { Line 336  sub _create_doc {
336          my $arg = {@_};          my $arg = {@_};
337    
338          # open indexer if needed          # open indexer if needed
339          $self->{'index_fh'} ||= $self->_init_index;          $self->{'index_fh'} ||= $self->_init_indexer;
340    
341          my $path = $self->{'tmp_dir'} || confess "no tmp_dir?";          my $path = $self->{'tmp_dir'} || confess "no tmp_dir?";
342    
343          CORE::open(TMP, '>', $arg->{'path'}) || die "can't create temp file ".$arg->{'path'}.": $!";          open(TMP, '>', $arg->{'path'}) || die "can't create temp file ".$arg->{'path'}.": $!";
344    
345            print TMP '<html><head>';
346    
347          print TMP '<html>';          $arg->{'body'} ||= '';
348    
349          if ($arg->{'meta'}) {          if ($arg->{'meta'}) {
350                  confess "not yet implemented";                  foreach my $name (keys %{$arg->{'meta'}}) {
351                            my $content = $arg->{'meta'}->{$name};
352                            print TMP qq{<meta name="$name" content="$content">};
353                            $arg->{'body'} .= " $content" if ($self->{'meta_in_body'});
354                    }
355          }          }
356            
357          print TMP '<body>' . ($arg->{'body'} || '') . '</body></html>';          if (defined($arg->{'title'})) {
358                    print TMP '<title>' . ($arg->{'title'} || '') . '</title>';
359                    $arg->{'body'} .= " ".$arg->{'title'} if ($self->{'meta_in_body'});
360            }
361    
362            print TMP '</head><body>' . $arg->{'body'} . '</body></html>';
363                    
364          close(TMP) || confess "can't close tmp file ".$arg->{'path'}.": $!";          close(TMP) || confess "can't close tmp file ".$arg->{'path'}.": $!";
365    
# Line 309  None by default. Line 401  None by default.
401  Debian version of swish++ is often old (version 5 at moment of this writing  Debian version of swish++ is often old (version 5 at moment of this writing
402  while version 6 is available in source code), so this module by default  while version 6 is available in source code), so this module by default
403  uses executable names B<index> and B<search> for self-compiled version  uses executable names B<index> and B<search> for self-compiled version
404  instead of one from Debian package. See L<open> how to specify Debian  instead of one from Debian package. See L<new> how to specify Debian
405  default binaries B<index++> and B<search++>.  default binaries B<index++> and B<search++>.
406    
407  =head2 SWISH++  =head2 SWISH++
# Line 333  Compilation of SWISH++ is easy process w Line 425  Compilation of SWISH++ is easy process w
425  pages. To see my very relaxed sample configuration take a look at C<swish++>  pages. To see my very relaxed sample configuration take a look at C<swish++>
426  directory included in distribution.  directory included in distribution.
427    
428    =head2 SWISH++ config
429    
430    C<config.h> located in the C<swish++> directory of this distribution is a relaxed
431    SWISH++ configuration that will index all words passed to it. This
432    configuration is needed for the B<date test> because the default configuration
433    doesn't recognize 2004-12-05 as a date. Keep in mind that your index size
434    might explode.
435    
436  =head1 SEE ALSO  =head1 SEE ALSO
437    
438  C<swish++> web site L<http://homepage.mac.com/pauljlucas/software/swish/>  C<swish++> web site L<http://homepage.mac.com/pauljlucas/software/swish/>

Legend:
Removed from v.8  
changed lines
  Added in v.13

  ViewVC Help
Powered by ViewVC 1.1.26