/[SWISH-PlusPlus]/trunk/PlusPlus.pm
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/PlusPlus.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 5 by dpavlin, Fri Dec 3 21:48:15 2004 UTC revision 11 by dpavlin, Sun Dec 5 13:30:57 2004 UTC
# Line 4  use 5.008004; Line 4  use 5.008004;
4  use strict;  use strict;
5  use warnings;  use warnings;
6    
7  our $VERSION = '0.02';  our $VERSION = '0.05';
8    
9  use Carp;  use Carp;
10  use File::Temp qw/ tempdir /;  use File::Temp qw/ tempdir /;
11    #use YAML;
12    
13  =head1 NAME  =head1 NAME
14    
# Line 30  performance. However, this module is not Line 31  performance. However, this module is not
31    
32  =head1 METHODS  =head1 METHODS
33    
34  =head2 open  =head2 new
35    
36  Create new indexing object.  Create new indexing object.
37    
38    my $i = SWISH::PlusPlus->open(    my $i = SWISH::PlusPlus->new(
39          index_dir => '/path/to/index',          index_dir => '/path/to/index',
40          index => 'index++',          index => 'index++',
41          search => 'search++',          search => 'search++',
42            debug => 1,
43            meta_in_body => 1,
44            use_stopwords => 1,
45    );    );
46    
47  Options to open are following:  Options to new are following:
48    
49  =over 5  =over 5
50    
# Line 58  B<index++>. See C<Debian>. Line 62  B<index++>. See C<Debian>.
62    
63  Full or partial path to SWISH++ search executable. By default, it's B<search>.  Full or partial path to SWISH++ search executable. By default, it's B<search>.
64    
65    =item C<debug>
66    
67    This option (off by default) will produce a lot of debugging output on
68    C<STDERR> prefixed by C<##>.
69    
70    =item C<meta_in_body>
71    
72    This option (off by default) makes the content of meta fields searchable
73    without specifying them (as if they were in the body of the document). This will
74    somewhat increase index size.
75    
76    =item C<use_stopwords>
77    
78    Use built-in SWISH++ stop words. By default, they are disabled.
79    
80  =back  =back
81    
82  =cut  =cut
83    
84  sub open {  sub new {
85          my $class = shift;          my $class = shift;
86          my $self = {@_};          my $self = {@_};
87          bless($self, $class);          bless($self, $class);
# Line 79  sub open { Line 98  sub open {
98          $self->{'index'} ||= 'index';          $self->{'index'} ||= 'index';
99          $self->{'search'} ||= 'search';          $self->{'search'} ||= 'search';
100    
101            print STDERR "## new index_dir: ",$self->{'index_dir'}," index: ",$self->{'index'}, " search: ",$self->{'search'},"\n" if ($self->{'debug'});
102    
103          $self ? return $self : return undef;          $self ? return $self : return undef;
104  }  }
105    
106    
107  =head2 check_bin  =head2 check_bin
108    
109  Check if swish++ binaries specified in L<open> are available and verify  Check if swish++ binaries specified in L<new> are available and verify
110  version signature.  version signature.
111    
112    if ($i->check_bin) {    if ($i->check_bin) {
# Line 146  sub index_document { Line 167  sub index_document {
167          return 1;          return 1;
168  }  }
169    
170    =head2 add
171    
172    Add document with metadata to index.
173    
174      $i->add(
175            path => 'path/to/document',
176            title => 'this is result title',
177            meta => {
178                    description => 'this is description meta tag',
179                    date => '2004-11-04',
180                    author => 'Dobrica Pavlinusic',
181            },
182            body => 'this is text without meta data',
183      );
184    
185    This is a thin wrapper around L<_create_doc>.
186    
187    =cut
188    
189    sub add {
190            my $self = shift;
191    
192            $self->_create_doc(@_);
193    
194            return 1;
195    }
196    =head2 search
197    
198    Search your index.
199    
200      my @results = $i->search("swish query");
201    
202    Returns an array of hashes, one per result, with keys C<rank>, C<path>, C<size> and C<title>.
203    
204    =cut
205    
206    sub search {
207            my $self = shift;
208    
209            my $query = shift || return;
210    
211            $self->_close_index;
212    
213            my @results;
214    
215            # escape double quotes in query for shell
216            $query =~ s/"/\\"/g;
217    
218            my $open_cmd = $self->{'search'}." -i ".$self->{'index_dir'}.'/index "'.$query.'" |';
219            print STDERR "## search $open_cmd\n" if ($self->{'debug'});
220    
221            open(SEARCH, $open_cmd) || confess "can't start $open_cmd: $!";
222            while(<SEARCH>) {
223                    next if (/^#/);
224                    chomp;
225                    print STDERR "## $_\n" if ($self->{'debug'});
226                    my ($rank,$path,$size,$title) = split(/ /,$_,4);
227                    push @results, {
228                            rank => $rank,
229                            path => $path,
230                            size => $size,
231                            title => $title,
232                    }
233            }
234    
235            close(SEARCH) || confess "can't close search";
236    
237            #print STDERR "## results: ",Dump(@results),"\n" if ($self->{'debug'});
238    
239            return @results;
240    }
241    
242  =head1 PRIVATE METHODS  =head1 PRIVATE METHODS
243    
244  Private methods implement internals for creating temporary file needed for  Private methods implement internals for creating temporary file needed for
245  swish++. You should have no need to call them directly, and they are here  swish++. You should have no need to call them directly, and they are here
246  just to have documentation.  just to have documentation.
247    
248  =head2 _init_index  =head2 _init_indexer
249    
250  Create temporary directory in which files for indexing will be created and  Create temporary directory in which files for indexing will be created and
251  start index process.  start index process.
252    
253    $i->_init_index || die "can't start indexer";    $i->_init_indexer || die "can't start indexer";
254    
255    It will also create an empty file C<_stopwords_> to disable stop words.
256    
257  =cut  =cut
258    
259  sub _init_index {  sub _init_indexer {
260          my $self = shift;          my $self = shift;
261    
262          $self->{'tmp_dir'} = tempdir( CLEANUP => 1 ) || confess "can't create temporary directory: $!";          $self->{'tmp_dir'} = tempdir( CLEANUP => 1 ) || confess "can't create temporary directory: $!";
263    
264            chdir $self->{'tmp_dir'} || confess "can't chdir to ".$self->{'tmp_dir'}.": $!";
265    
266          my $opt = "-v 4";          my $opt = "-v 4";
267    
268          my $open_cmd = '| index '.$opt.' -e "html:*" -i '.$self->{'index_dir'}.'/index -';          unless ($self->{'use_stopwrods'}) {
269                    open(STOP, '>', "_stopwords_") || carp "can't create empty stopword file, skipping\n";
270                    print STOP "  ";
271                    close(STOP);
272                    $opt .= " -s _stopwords_";
273            }
274    
275          chdir $self->{'tmp_dir'} || confess "can't chdir to ".$self->{'tmp_dir'}.": $!";          my $open_cmd = '| '.$self->{'index'}.' '.$opt.' -e "html:*" -i '.$self->{'index_dir'}.'/index -';
276    
277    
278            open($self->{'index_fh'}, $open_cmd) || confess "can't start index with $open_cmd: $!";
279    
         CORE::open($self->{'index_fh'}, $open_cmd) || confess "can't start index with $open_cmd: $!";  
280    
281          return $self->{'index_fh'};          return $self->{'index_fh'};
282  }  }
# Line 183  Create temporary file and pass it's name Line 287  Create temporary file and pass it's name
287    
288    $i->_create_doc(    $i->_create_doc(
289          path => 'path/to/store/in/index',          path => 'path/to/store/in/index',
290            title => 'this is title in results',
291          body => 'data to store in body tag',          body => 'data to store in body tag',
292          meta => {          meta => {
293                  'meta name' => 'data for this meta',                  'meta name' => 'data for this meta',
# Line 190  Create temporary file and pass it's name Line 295  Create temporary file and pass it's name
295          }          }
296    );    );
297    
298    To delete a document, just omit body and meta data.
299    
300  =cut  =cut
301    
302  sub _create_doc {  sub _create_doc {
# Line 198  sub _create_doc { Line 305  sub _create_doc {
305          my $arg = {@_};          my $arg = {@_};
306    
307          # open indexer if needed          # open indexer if needed
308          $self->{'index_fh'} ||= $self->_init_index;          $self->{'index_fh'} ||= $self->_init_indexer;
309    
310          my $path = $self->{'tmp_dir'} || confess "no tmp_dir?";          my $path = $self->{'tmp_dir'} || confess "no tmp_dir?";
311    
312          CORE::open(TMP, '>', $arg->{'path'}) || die "can't create temp file ".$arg->{'path'}.": $!";          open(TMP, '>', $arg->{'path'}) || die "can't create temp file ".$arg->{'path'}.": $!";
313    
314            print TMP '<html><head>';
315    
316          print TMP '<html>';          $arg->{'body'} ||= '';
317    
318          if ($arg->{'meta'}) {          if ($arg->{'meta'}) {
319                  confess "not yet implemented";                  foreach my $name (keys %{$arg->{'meta'}}) {
320                            my $content = $arg->{'meta'}->{$name};
321                            print TMP qq{<meta name="$name" content="$content">};
322                            $arg->{'body'} .= " $content" if ($self->{'meta_in_body'});
323                    }
324          }          }
325            
326          print TMP '<body>' . ($arg->{'body'} || '') . '</body></html>';          if (defined($arg->{'title'})) {
327                    print TMP '<title>' . ($arg->{'title'} || '') . '</title>';
328                    $arg->{'body'} .= " ".$arg->{'title'} if ($self->{'meta_in_body'});
329            }
330    
331            print TMP '</head><body>' . $arg->{'body'} . '</body></html>';
332                    
333          close(TMP) || confess "can't close tmp file ".$arg->{'path'}.": $!";          close(TMP) || confess "can't close tmp file ".$arg->{'path'}.": $!";
334    
335          print { $self->{'index_fh'} } $arg->{'path'}."\n";          print { $self->{'index_fh'} } $arg->{'path'}."\n";
336  }  }
337    
338    =head2 _close_index
339    
340    Close index after indexing.
341    
342      $i->_close_index;
343    
344    You have to close the index before searching.
345    
346    =cut
347    
348    sub _close_index {
349            my $self = shift;
350    
351            return unless ($self->{'index_fh'});
352    
353            print STDERR "## close index\n" if ($self->{'debug'});
354    
355            close($self->{'index_fh'});
356            undef $self->{'index_fh'};
357    }
358    
359  1;  1;
360  __END__  __END__
361    
# Line 231  None by default. Line 370  None by default.
370  Debian version of swish++ is often old (version 5 at moment of this writing  Debian version of swish++ is often old (version 5 at moment of this writing
371  while version 6 is available in source code), so this module by default  while version 6 is available in source code), so this module by default
372  uses executable names B<index> and B<search> for self-compiled version  uses executable names B<index> and B<search> for self-compiled version
373  instead of one from Debian package. See L<open> how to specify Debian  instead of one from Debian package. See L<new> how to specify Debian
374  default binaries B<index++> and B<search++>.  default binaries B<index++> and B<search++>.
375    
376  =head2 SWISH++  =head2 SWISH++
# Line 255  Compilation of SWISH++ is easy process w Line 394  Compilation of SWISH++ is easy process w
394  pages. To see my very relaxed sample configuration take a look at C<swish++>  pages. To see my very relaxed sample configuration take a look at C<swish++>
395  directory included in distribution.  directory included in distribution.
396    
397    =head2 SWISH++ config
398    
399    The C<config.h> file located in the C<swish++> directory of this distribution is a relaxed
400    SWISH++ configuration that will index all words passed to it. This
401    configuration is needed for the B<date test> because the default configuration
402    doesn't recognize 2004-12-05 as a date. Keep in mind that your index size
403    might explode.
404    
405  =head1 SEE ALSO  =head1 SEE ALSO
406    
407  C<swish++> web site L<http://homepage.mac.com/pauljlucas/software/swish/>  C<swish++> web site L<http://homepage.mac.com/pauljlucas/software/swish/>

Legend:
Removed from v.5  
changed lines
  Added in v.11

  ViewVC Help
Powered by ViewVC 1.1.26