/[SWISH-PlusPlus]/trunk/PlusPlus.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/PlusPlus.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 13 - (hide annotations)
Sun Dec 5 14:35:54 2004 UTC (19 years, 4 months ago) by dpavlin
File size: 10184 byte(s)
relative index_dir paths will be now resolved to absolute,
added finish_update to close indexer, DESTROY will also close index,
verbose level now corresponds to debug level, mode debug output

1 dpavlin 1 package SWISH::PlusPlus;
2    
3     use 5.008004;
4     use strict;
5     use warnings;
6    
7 dpavlin 13 our $VERSION = '0.06';
8 dpavlin 1
9     use Carp;
10 dpavlin 4 use File::Temp qw/ tempdir /;
11 dpavlin 8 #use YAML;
12 dpavlin 1
13     =head1 NAME
14    
15     SWISH::PlusPlus - Perl extension for SWISH++
16    
17     =head1 SYNOPSIS
18    
19     use SWISH::PlusPlus;
20     blah blah blah
21    
22     =head1 DESCRIPTION
23    
24     This is a Perl module for using the SWISH++ indexer by Paul J. Lucas. SWISH++ is
25 dpavlin 3 a rewrite of swish-e in C++ which is extremely fast (thanks to mmap), but without
26     support for properties (which this module tries to fix).
27 dpavlin 1
28 dpavlin 3 The implementation of this module is modelled after L<Plucene::Simple>, and it
29     should be easy to replace Plucene with this module for increased
30     performance. However, this module is not a drop-in replacement.
31    
32 dpavlin 1 =head1 METHODS
33    
34 dpavlin 10 =head2 new
35 dpavlin 1
36     Create new indexing object.
37    
38 dpavlin 10 my $i = SWISH::PlusPlus->new(
39 dpavlin 3 index_dir => '/path/to/index',
40     index => 'index++',
41     search => 'search++',
42 dpavlin 8 debug => 1,
43 dpavlin 9 meta_in_body => 1,
44     use_stopwords => 1,
45 dpavlin 1 );
46    
47 dpavlin 10 Options to new are following:
48 dpavlin 1
49     =over 5
50    
51 dpavlin 3 =item C<index_dir>
52    
53     Path to directory in which index will be created.
54    
55 dpavlin 1 =item C<index>
56    
57 dpavlin 3 Full or partial path to SWISH++ index executable. By default, it's B<index>
58     for self-compiled version. If you use Debian GNU/Linux package specify
59     B<index++>. See C<Debian>.
60 dpavlin 1
61 dpavlin 3 =item C<search>
62    
63     Full or partial path to SWISH++ search executable. By default, it's B<search>.
64    
65 dpavlin 8 =item C<debug>
66    
67     This option (off by default) will produce a lot of debugging output on
68     C<STDERR> prefixed by C<##>.
69    
70 dpavlin 9 =item C<meta_in_body>
71    
72     This option (off by default) makes it possible to search the content of meta fields
73     without specifying them (as if they were in the body of the document). This will
74     somewhat increase index size.
75    
76     =item C<use_stopwords>
77    
78     Use built-in SWISH++ stop words. By default, they are disabled.
79    
80 dpavlin 1 =back
81    
82     =cut
83    
# Constructor: stores the given key/value options (index_dir, index, search,
# debug, meta_in_body, use_stopwords) in a hash blessed into $class.
#
# - croaks unless index_dir is given
# - turns a relative index_dir into an absolute path
# - creates index_dir if it does not exist (confesses if that fails)
# - defaults the executable names to 'index' and 'search'
#
# Returns the new object.
sub new {
    my $class = shift;
    my $self = {@_};
    bless($self, $class);

    foreach my $required (qw(index_dir)) {
        croak "need $required" unless $self->{$required};
    }

    my $index_dir = $self->{'index_dir'};

    # _init_indexer changes the working directory before it starts the
    # indexer, so a relative index_dir has to be made absolute right here.
    require File::Spec;
    unless (File::Spec->file_name_is_absolute($index_dir)) {
        $index_dir = File::Spec->rel2abs($index_dir);
        print STDERR "## full path to index_dir: $index_dir\n" if ($self->{'debug'});
        $self->{'index_dir'} = $index_dir;
    }

    if (! -e $index_dir) {
        # "or" instead of "||": with "||" the error branch belonged to
        # $index_dir, so a failing mkdir was never noticed
        mkdir($index_dir) or confess "can't create index_dir $index_dir: $!";
    }

    # default executables
    $self->{'index'} ||= 'index';
    $self->{'search'} ||= 'search';

    print STDERR "## new index_dir: ",$index_dir," index: ",$self->{'index'}, " search: ",$self->{'search'},"\n" if ($self->{'debug'});

    return $self;
}
114    
115    
116 dpavlin 3 =head2 check_bin
117    
118 dpavlin 10 Check if swish++ binaries specified in L<new> are available and verify
119 dpavlin 3 version signature.
120    
121     if ($i->check_bin) {
122     print "swish++ binaries found\n";
123     };
124    
125     It will also setup property
126    
127     $i->{'version'}
128    
129     which you can examine to see version.
130    
131     =cut
132    
# Run both configured executables with -V and compare what they print.
#
# Confesses if either command produces no output or if its output does not
# start with "SWISH++". When both outputs are identical the string is stored
# in $self->{'version'} and 1 is returned; otherwise a warning is issued and
# nothing is returned.
sub check_bin {
    my ($self) = @_;

    # both commands are run before either output is inspected
    my $index_banner = `$self->{'index'} -V 2>&1`;
    confess "can't find '",$self->{'index'},"' binary" unless ($index_banner);

    my $search_banner = `$self->{'search'} -V 2>&1`;
    confess "can't find '",$self->{'search'},"' binary" unless ($search_banner);

    chomp($index_banner);
    chomp($search_banner);

    if ($index_banner !~ m/^SWISH\+\+/) {
        confess $self->{'index'}," binary is not SWISH++";
    }
    if ($search_banner !~ m/^SWISH\+\+/) {
        confess $self->{'search'}," binary is not SWISH++";
    }

    # guard clause: mismatching versions only warn
    if ($index_banner ne $search_banner) {
        carp "version difference: index is $index_banner while search is $search_banner";
        return;
    }

    $self->{'version'} = $index_banner;
    return 1;
}
154    
155 dpavlin 4 =head2 index_document
156    
157     Quick way to add simple data to index.
158    
159     $i->index_document($key, $data);
160     $i->index_document( 42 => 'meaning of life' );
161    
162     =cut
163    
# Convenience front end to _create_doc: takes a list of id => text pairs and
# hands each pair over as path => id, body => text. Always returns 1.
sub index_document {
    my ($self, %body_for) = @_;

    my @doc_ids = keys %body_for;

    for my $doc_id (@doc_ids) {
        my @doc = (
            path => $doc_id,
            body => $body_for{$doc_id},
        );
        $self->_create_doc(@doc);
    }

    return 1;
}
178    
179 dpavlin 9 =head2 add
180    
181     Add document with metadata to index.
182    
183     $i->add(
184     path => 'path/to/document',
185     title => 'this is result title',
186     meta => {
187     description => 'this is description meta tag',
188     date => '2004-11-04',
189     author => 'Dobrica Pavlinusic',
190     },
191     body => 'this is text without meta data',
192     );
193    
194     This is a thin wrapper around L<_create_doc>.
195    
196     =cut
197    
# Public entry point for adding one document: every argument is forwarded
# unchanged to _create_doc. Always returns 1.
sub add {
    my ($self, @doc_spec) = @_;

    $self->_create_doc(@doc_spec);

    return 1;
}
205 dpavlin 8 =head2 search
206    
207     Search your index.
208    
209     my @results = $i->search("swish query");
210    
211     Returns a list of hash references, one per hit, with the keys C<rank>, C<path>, C<size> and C<title>.
212    
213     =cut
214    
# Run the search executable against <index_dir>/index and parse its output.
#
# Arguments: the query string. A false query (undef, '', '0') returns
# immediately without searching.
#
# Returns a list of hash references with the keys rank, path, size and title,
# taken from the first four space-separated fields of every output line that
# does not start with '#'.
#
# Confesses if the search process can't be started or can't be closed
# cleanly.
sub search {
    my $self = shift;

    my $query = shift || return;

    # an open indexer has to be finished before the index is searched
    $self->_close_index;

    # The command is an argument list handed directly to the executable, not
    # a shell string: quotes, $(...) or backticks in the query and spaces in
    # paths reach the search program untouched and are never interpreted.
    my @cmd = ($self->{'search'}, '-i', $self->{'index_dir'}.'/index', $query);
    print STDERR "## search @cmd\n" if ($self->{'debug'});

    open(my $search_fh, '-|', @cmd) or confess "can't start @cmd: $!";

    my @results;

    while (my $line = <$search_fh>) {
        next if ($line =~ /^#/);
        chomp($line);
        print STDERR "## $line\n" if ($self->{'debug'});
        # limit of 4: the title is the rest of the line and may contain spaces
        my ($rank,$path,$size,$title) = split(/ /,$line,4);
        push @results, {
            rank => $rank,
            path => $path,
            size => $size,
            title => $title,
        };
    }

    # for a pipe, close() fails on a system error ($! set) or when the
    # program exited with non-zero status ($! is 0, status is in $?)
    close($search_fh)
        or confess "can't close search: ", ($! ? "$!" : "exit status ".($? >> 8));

    return @results;
}
250    
251 dpavlin 13 =head2 finish_update
252    
253     This method will close index.
254    
255     $i->finish_update;
256    
257     It will be called on DESTROY when $i goes out of scope.
258    
259     =cut
260    
# Public name for closing the index: delegates to _close_index and passes
# its return value through.
sub finish_update {
    my ($self) = @_;

    return $self->_close_index;
}
266    
# Destructor: makes sure the index is closed when the object goes away.
sub DESTROY {
    my $self = shift;

    # Destructors run at arbitrary points (including global destruction).
    # Closing the indexer pipe waits for the child process and overwrites $?
    # (and possibly $!, $@), which would otherwise leak into whatever code
    # dropped the last reference - or even change the program's exit status.
    local ($., $@, $!, $^E, $?);

    $self->finish_update;
}
271    
272 dpavlin 4 =head1 PRIVATE METHODS
273    
274     Private methods implement internals for creating temporary file needed for
275     swish++. You should have no need to call them directly, and they are here
276     just to have documentation.
277    
278 dpavlin 9 =head2 _init_indexer
279 dpavlin 4
280     Create temporary directory in which files for indexing will be created and
281     start index process.
282    
283 dpavlin 9 $i->_init_indexer || die "can't start indexer";
284 dpavlin 4
285 dpavlin 9 It will also create empty file C<_stopwords_> to disable stop words.
286    
287 dpavlin 4 =cut
288    
# Prepare a scratch directory and start the index executable reading file
# names from its standard input.
#
# Side effects:
# - creates a temporary directory (removed at program exit) and stores it in
#   $self->{'tmp_dir'}
# - changes the working directory of the whole process to that directory;
#   the indexer inherits it
# - unless the use_stopwords option is set, writes a file _stopwords_
#   containing a single space there and passes it to the indexer with -s
# - stores the pipe to the indexer in $self->{'index_fh'}
#
# Returns the pipe file handle. Confesses if the directory can't be created
# or entered, or if the indexer can't be started.
sub _init_indexer {
    my $self = shift;

    $self->{'tmp_dir'} = tempdir( CLEANUP => 1 ) || confess "can't create temporary directory: $!";

    # "or" instead of "||": with "||" the check was attached to the path and
    # a failing chdir went unnoticed
    chdir($self->{'tmp_dir'}) or confess "can't chdir to ".$self->{'tmp_dir'}.": $!";

    # verbosity of the indexer follows the debug level
    my @opt = ('-v', ($self->{'debug'} || '0'));

    unless ($self->{'use_stopwords'}) {
        if (open(my $stop_fh, '>', '_stopwords_')) {
            print {$stop_fh} " ";
            close($stop_fh);
            push @opt, '-s', '_stopwords_';
        } else {
            # best effort: without the file the indexer simply isn't given -s
            carp "can't create empty stopword file, skipping\n";
        }
    }

    # Argument list instead of a shell string, so an index_dir containing
    # spaces or shell meta characters is passed on unchanged.
    my @cmd = ($self->{'index'}, @opt, '-e', 'html:*', '-i', $self->{'index_dir'}.'/index', '-');

    print STDERR "## open index | @cmd\n" if ($self->{'debug'});

    open(my $index_fh, '|-', @cmd) or confess "can't start index with @cmd: $!";

    $self->{'index_fh'} = $index_fh;

    return $self->{'index_fh'};
}
314    
315     =head2 _create_doc
316    
317     Create temporary file and pass its name to swish++
318    
319     $i->_create_doc(
320     path => 'path/to/store/in/index',
321 dpavlin 9 title => 'this is title in results',
322 dpavlin 4 body => 'data to store in body tag',
323     meta => {
324     'meta name' => 'data for this meta',
325     'another' => 'again more data',
326     }
327     );
328    
329 dpavlin 8 To delete document, just omit body and meta data.
330    
331 dpavlin 4 =cut
332    
# Write one document as a small HTML file and tell the indexer about it.
#
# Named arguments:
#   path  - name under which the document is stored; required
#   title - optional, written as <title>
#   meta  - optional hash ref, every pair is written as <meta name content>
#   body  - optional, written inside <body>
#
# With the meta_in_body option the meta values and the title are appended to
# the body text as well.
#
# A relative path is created below $self->{'tmp_dir'} (missing directories
# included); the indexer is given the path exactly as passed in.
# NOTE(review): this assumes the indexer's working directory is tmp_dir, as
# set up by _init_indexer - confirm if the indexer is ever started elsewhere.
# An absolute path is written to as-is, like before.
#
# Returns 1. Confesses on a missing path, on file errors and when the name
# can't be handed to the indexer.
sub _create_doc {
    my $self = shift;

    my $arg = {@_};

    confess "need path" unless (defined($arg->{'path'}) && $arg->{'path'} ne '');

    # open indexer if needed
    $self->{'index_fh'} ||= $self->_init_indexer;

    my $tmp_dir = $self->{'tmp_dir'} || confess "no tmp_dir?";

    require File::Spec;
    require File::Basename;
    require File::Path;

    # Don't depend on the current directory still being tmp_dir, and make
    # paths with directory parts ('path/to/document') work.
    my $file = $arg->{'path'};
    unless (File::Spec->file_name_is_absolute($file)) {
        $file = File::Spec->catfile($tmp_dir, $file);
        my $dir = File::Basename::dirname($file);
        File::Path::mkpath($dir) unless (-d $dir);
    }

    # minimal escaping for text placed inside a double-quoted attribute
    my $attr = sub {
        my $text = defined($_[0]) ? $_[0] : '';
        $text =~ s/&/&amp;/g;
        $text =~ s/"/&quot;/g;
        $text =~ s/</&lt;/g;
        $text =~ s/>/&gt;/g;
        return $text;
    };

    open(my $doc_fh, '>', $file) or confess "can't create temp file ".$arg->{'path'}.": $!";

    print {$doc_fh} '<html><head>';

    $arg->{'body'} ||= '';

    if ($arg->{'meta'}) {
        foreach my $name (keys %{$arg->{'meta'}}) {
            my $content = $arg->{'meta'}->{$name};
            print {$doc_fh} '<meta name="' . $attr->($name) . '" content="' . $attr->($content) . '">';
            $arg->{'body'} .= " $content" if ($self->{'meta_in_body'});
        }
    }

    if (defined($arg->{'title'})) {
        # title and body are written verbatim
        print {$doc_fh} '<title>' . $arg->{'title'} . '</title>';
        $arg->{'body'} .= " ".$arg->{'title'} if ($self->{'meta_in_body'});
    }

    print {$doc_fh} '</head><body>' . $arg->{'body'} . '</body></html>';

    # buffered write errors only show up here
    close($doc_fh) or confess "can't close tmp file ".$arg->{'path'}.": $!";

    print { $self->{'index_fh'} } $arg->{'path'}."\n"
        or confess "can't pass ".$arg->{'path'}." to indexer: $!";

    return 1;
}
368    
369 dpavlin 8 =head2 _close_index
370    
371     Close index after indexing.
372    
373     $i->_close_index;
374    
375     You have to close index before searching.
376    
377     =cut
378    
# Close the pipe to the indexer, if one is open, and forget the handle.
#
# Does nothing when $self->{'index_fh'} is not set. A close that fails (system
# error or non-zero exit status of the indexer) is reported with a warning
# rather than an exception, because this method is also reached from DESTROY.
#
# Returns nothing.
sub _close_index {
    my $self = shift;

    my $index_fh = $self->{'index_fh'};
    return unless ($index_fh);

    print STDERR "## close index\n" if ($self->{'debug'});

    # forget the handle first, so it is gone even if close() fails
    undef $self->{'index_fh'};

    # for a pipe, close() waits for the child; $! is 0 when the only problem
    # was a non-zero exit status, which is then found in $?
    unless (close($index_fh)) {
        carp "indexer did not finish cleanly: ", ($! ? "$!" : "exit status ".($? >> 8));
    }

    return;
}
389    
390 dpavlin 1 1;
391     __END__
392    
393     =head2 EXPORT
394    
395     None by default.
396    
397 dpavlin 3 =head1 RELATED
398    
399     =head2 Debian
400    
401     Debian version of swish++ is often old (version 5 at moment of this writing
402     while version 6 is available in source code), so this module by default
403     uses executable names B<index> and B<search> for self-compiled version
404 dpavlin 10 instead of one from Debian package. See L<new> how to specify Debian
405 dpavlin 3 default binaries B<index++> and B<search++>.
406    
407 dpavlin 5 =head2 SWISH++
408 dpavlin 1
409 dpavlin 5 Aside from being a very good rewrite in C++, SWISH++ is faster because it has
410     clever heuristics about which data in input files are words to index and
411     which are not. It's based on the English language and might be the best choice if
412     you plan to index a large number of long text documents.
413 dpavlin 1
414 dpavlin 5 However, if you plan to index all data from structured storage (e.g. RDBMS)
415     you might want B<all> words from data to end up in index as opposed to just
416     those which look like English words. This is especially important if you
417     don't plan to index English texts with this module.
418 dpavlin 1
419 dpavlin 5 With distribution-built versions of SWISH++ you might have problems with
420     disappearing words. To overcome this problem, you will have to compile and
421     configure SWISH++ yourself (because language characteristics are a
422     compile-time option).
423 dpavlin 1
424 dpavlin 5 Compilation of SWISH++ is easy process well described on project's web
425     pages. To see my very relaxed sample configuration take a look at C<swish++>
426     directory included in distribution.
427    
428 dpavlin 11 =head2 SWISH++ config
429    
430     C<config.h> located in C<swish++> directory of this distribution is relaxed
431     SWISH++ configuration that will index all words passed to it. This
432     configuration is needed for B<date test> because default configuration
433     doesn't recognize 2004-12-05 as date. Have in mind that your index size
434     might explode.
435    
436 dpavlin 5 =head1 SEE ALSO
437    
438     C<swish++> web site L<http://homepage.mac.com/pauljlucas/software/swish/>
439    
440 dpavlin 1 =head1 AUTHOR
441    
442 dpavlin 5 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
443 dpavlin 1
444     =head1 COPYRIGHT AND LICENSE
445    
446     Copyright (C) 2004 by Dobrica Pavlinusic
447    
448     This library is free software; you can redistribute it and/or modify
449     it under the same terms as Perl itself, either Perl version 5.8.4 or,
450     at your option, any later version of Perl 5 you may have available.
451    
452    
453     =cut

  ViewVC Help
Powered by ViewVC 1.1.26