/[SWISH-PlusPlus]/trunk/PlusPlus.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/PlusPlus.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 14 - (hide annotations)
Sun Dec 5 15:35:53 2004 UTC (19 years, 4 months ago) by dpavlin
File size: 10609 byte(s)
store cwd in $i->{'cwd'}, store just version numbers in $i->{'version'},
_init_indexer doesn't mess with cwd while indexing, _index_fh is now private.

1 dpavlin 1 package SWISH::PlusPlus;
2    
3     use 5.008004;
4     use strict;
5     use warnings;
6    
7 dpavlin 13 our $VERSION = '0.06';
8 dpavlin 1
9     use Carp;
10 dpavlin 4 use File::Temp qw/ tempdir /;
11 dpavlin 8 #use YAML;
12 dpavlin 1
13     =head1 NAME
14    
15     SWISH::PlusPlus - Perl extension SWISH++
16    
17     =head1 SYNOPSIS
18    
19     use SWISH::PlusPlus;
20     blah blah blah
21    
22     =head1 DESCRIPTION
23    
24     This is perl module to use SWISH++ indexer by Paul J. Lucas. SWISH++ is
25 dpavlin 3 rewrite of swish-e in C++ which is extremly fast (thank to mmap), but without
26     support for properties (which this module tries to fix).
27 dpavlin 1
28 dpavlin 3 Implementation of this module is crafted after L<Plucene::Simple> and it
29     should be easy to replace Plucene with this module for increased
30     performance. However, this module is not plug-in replacement.
31    
32 dpavlin 1 =head1 METHODS
33    
34 dpavlin 10 =head2 new
35 dpavlin 1
36     Create new indexing object.
37    
38 dpavlin 10 my $i = SWISH::PlusPlus->new(
39 dpavlin 3 index_dir => '/path/to/index',
40     index => 'index++',
41     search => 'search++',
42 dpavlin 8 debug => 1,
43 dpavlin 9 meta_in_body => 1,
44     use_stopwords => 1,
45 dpavlin 1 );
46    
47 dpavlin 10 Options to new are following:
48 dpavlin 1
49     =over 5
50    
51 dpavlin 3 =item C<index_dir>
52    
53     Path to directory in which index will be created.
54    
55 dpavlin 1 =item C<index>
56    
57 dpavlin 3 Full or partial path to SWISH++ index executable. By default, it's B<index>
58     for self-compiled version. If you use Debian GNU/Linux package specify
59     B<index++>. See C<Debian>.
60 dpavlin 1
61 dpavlin 3 =item C<search>
62    
63     Full or partial path to SWISH++ search executable. By default, it's B<search>.
64    
65 dpavlin 8 =item C<debug>
66    
67     This option (off by default) will produce a lot of debugging output on
68     C<STDERR> prefixed by C<##>.
69    
70 dpavlin 9 =item C<meta_in_body>
71    
72     This option (off by default) enables to search content of meta fields
73     without specifing them (like they are in body of document). This will
74     somewhat increate index size.
75    
76     =item C<use_stopwords>
77    
78     Use built-in SWISH++ stop words. By default, they are disabled.
79    
80 dpavlin 1 =back
81    
82     =cut
83    
84 dpavlin 10 sub new {
85 dpavlin 1 my $class = shift;
86     my $self = {@_};
87     bless($self, $class);
88    
89 dpavlin 3 foreach (qw(index_dir)) {
90 dpavlin 1 croak "need $_" unless $self->{$_};
91     }
92    
93 dpavlin 13 my $index_dir = $self->{'index_dir'};
94    
95 dpavlin 14 my $cwd;
96     chomp($cwd = `pwd`);
97     $self->{'cwd'} = $cwd || carp "can't get cwd!";
98    
99 dpavlin 13 if ($index_dir !~ m#^/#) {
100     $index_dir = "$cwd/$index_dir";
101     print STDERR "## full path to index_dir: $index_dir\n" if ($self->{'debug'});
102     $self->{'index_dir'} = $index_dir;
103 dpavlin 1 }
104    
105 dpavlin 13 if (! -e $index_dir) {
106     mkdir $index_dir || confess "can't create index ",$self->{'index'},": $!";
107     }
108    
109 dpavlin 3 # default executables
110     $self->{'index'} ||= 'index';
111     $self->{'search'} ||= 'search';
112    
113 dpavlin 13 print STDERR "## new index_dir: ",$index_dir," index: ",$self->{'index'}, " search: ",$self->{'search'},"\n" if ($self->{'debug'});
114 dpavlin 8
115 dpavlin 1 $self ? return $self : return undef;
116     }
117    
118    
119 dpavlin 3 =head2 check_bin
120    
121 dpavlin 10 Check if swish++ binaries specified in L<new> are available and verify
122 dpavlin 3 version signature.
123    
124     if ($i->check_bin) {
125     print "swish++ binaries found\n";
126     };
127    
128     It will also setup property
129    
130     $i->{'version'}
131    
132     which you can examine to see version.
133    
134     =cut
135    
136     sub check_bin {
137     my $self = shift;
138    
139     my $i = `$self->{'index'} -V 2>&1` || confess "can't find '",$self->{'index'},"' binary";
140     my $s = `$self->{'search'} -V 2>&1` || confess "can't find '",$self->{'search'},"' binary";
141    
142     chomp $i;
143     chomp $s;
144    
145     confess $self->{'index'}," binary is not SWISH++" unless ($i =~ m/^SWISH\+\+/);
146     confess $self->{'search'}," binary is not SWISH++" unless ($s =~ m/^SWISH\+\+/);
147    
148     if ($i eq $s) {
149 dpavlin 14 $i =~ s/^SWISH\+\+\s+// || confess "can't strip SWISH++ from version";
150 dpavlin 3 $self->{'version'} = $i;
151     return 1;
152     } else {
153     carp "version difference: index is $i while search is $s";
154     return;
155     }
156    
157     }
158    
159 dpavlin 4 =head2 index_document
160    
161     Quick way to add simple data to index.
162    
163     $i->index_document($key, $data);
164     $i->index_document( 42 => 'meaning of life' );
165    
166     =cut
167    
168     sub index_document {
169     my $self = shift;
170    
171     my %doc = @_;
172    
173     foreach my $id (keys %doc) {
174     $self->_create_doc(
175     path => $id,
176     body => $doc{$id},
177     );
178     }
179    
180     return 1;
181     }
182    
183 dpavlin 9 =head2 add
184    
185     Add document with metadata to index.
186    
187     $i->add(
188     path => 'path/to/document',
189     title => 'this is result title',
190     meta => {
191     description => 'this is description meta tag',
192     date => '2004-11-04',
193     author => 'Dobrica Pavlinusic',
194     }
195     body => 'this is text without meta data',
196     );
197    
198     This is thin wrapper round L<_create_doc>.
199    
200     =cut
201    
202     sub add {
203     my $self = shift;
204    
205     $self->_create_doc(@_);
206    
207     return 1;
208     }
209 dpavlin 8 =head2 search
210    
211     Search your index.
212    
213     my @results = $i->search("swhish query");
214    
215     Returns array with result IDs.
216    
217     =cut
218    
219     sub search {
220     my $self = shift;
221    
222     my $query = shift || return;
223    
224 dpavlin 14 $self->finish_update;
225 dpavlin 8
226     my @results;
227    
228     # escape double quotes in query for shell
229     $query =~ s/"/\\"/g;
230    
231     my $open_cmd = $self->{'search'}." -i ".$self->{'index_dir'}.'/index "'.$query.'" |';
232     print STDERR "## search $open_cmd\n" if ($self->{'debug'});
233    
234 dpavlin 10 open(SEARCH, $open_cmd) || confess "can't start $open_cmd: $!";
235 dpavlin 8 while(<SEARCH>) {
236     next if (/^#/);
237     chomp;
238     print STDERR "## $_\n" if ($self->{'debug'});
239     my ($rank,$path,$size,$title) = split(/ /,$_,4);
240     push @results, {
241     rank => $rank,
242     path => $path,
243     size => $size,
244     title => $title,
245     }
246     }
247    
248     close(SEARCH) || confess "can't close search";
249    
250     #print STDERR "## results: ",Dump(@results),"\n" if ($self->{'debug'});
251    
252     return @results;
253     }
254    
255 dpavlin 13 =head2 finish_update
256    
257     This method will close index.
258    
259     $i->finish_update;
260    
261     It will be called on DESTROY when $i goes out of scope.
262    
263     =cut
264    
265     sub finish_update {
266     my $self = shift;
267    
268 dpavlin 14 print STDERR "## finish_update\n" if ($self->{'debug'});
269    
270 dpavlin 13 $self->_close_index;
271     }
272    
273     sub DESTROY {
274     my $self = shift;
275     $self->finish_update;
276     }
277    
278 dpavlin 4 =head1 PRIVATE METHODS
279    
280     Private methods implement internals for creating temporary file needed for
281     swish++. You should have no need to call them directly, and they are here
282     just to have documentation.
283    
284 dpavlin 9 =head2 _init_indexer
285 dpavlin 4
286     Create temporary directory in which files for indexing will be created and
287     start index process.
288    
289 dpavlin 9 my $i->_init_indexer || die "can't start indexer";
290 dpavlin 4
291 dpavlin 9 It will also create empty file C<_stopwords_> to disable stop words.
292    
293 dpavlin 4 =cut
294    
295 dpavlin 9 sub _init_indexer {
296 dpavlin 4 my $self = shift;
297    
298 dpavlin 14 return if ($self->{'_index_fh'});
299 dpavlin 4
300 dpavlin 14 my $tmp_dir = tempdir( CLEANUP => 1 ) || confess "can't create temporary directory: $!";
301     $self->{'tmp_dir'} = $tmp_dir;
302 dpavlin 9
303 dpavlin 14 chdir $tmp_dir || confess "can't chdir to ".$tmp_dir.": $!";
304    
305     print STDERR "## tmp_dir: $tmp_dir" if ($self->{'debug'});
306    
307 dpavlin 13 my $opt = "-v " . ($self->{'debug'} || '0');
308 dpavlin 4
309 dpavlin 9 unless ($self->{'use_stopwrods'}) {
310 dpavlin 10 open(STOP, '>', "_stopwords_") || carp "can't create empty stopword file, skipping\n";
311 dpavlin 9 print STOP " ";
312     close(STOP);
313     $opt .= " -s _stopwords_";
314     }
315    
316 dpavlin 8 my $open_cmd = '| '.$self->{'index'}.' '.$opt.' -e "html:*" -i '.$self->{'index_dir'}.'/index -';
317 dpavlin 4
318 dpavlin 14 print STDERR "## init_indexer: $open_cmd\n" if ($self->{'debug'});
319 dpavlin 4
320 dpavlin 14 open($self->{'_index_fh'}, $open_cmd) || confess "can't start index with $open_cmd: $!";
321 dpavlin 4
322 dpavlin 14 chdir $self->{'cwd'} || confess "can't chdir to ".$self->{'cwd'}.": $!";
323 dpavlin 9
324 dpavlin 14 return $self->{'_index_fh'};
325 dpavlin 4 }
326    
327     =head2 _create_doc
328    
329     Create temporary file and pass it's name to swish++
330    
331     $i->_create_doc(
332     path => 'path/to/store/in/index',
333 dpavlin 9 title => 'this is title in results',
334 dpavlin 4 body => 'data to story in body tag',
335     meta => {
336     'meta name' => 'data for this meta',
337     'another' => 'again more data',
338     }
339     );
340    
341 dpavlin 8 To delete document, just omit body and meta data.
342    
343 dpavlin 4 =cut
344    
345     sub _create_doc {
346     my $self = shift;
347    
348     my $arg = {@_};
349    
350     # open indexer if needed
351 dpavlin 14 $self->_init_indexer;
352 dpavlin 4
353     my $path = $self->{'tmp_dir'} || confess "no tmp_dir?";
354 dpavlin 14 $path .= '/' . $arg->{'path'};
355 dpavlin 4
356 dpavlin 14 print STDERR "## _create_doc: $path\n" if ($self->{'debug'});
357 dpavlin 4
358 dpavlin 14 open(TMP, '>', $path) || die "can't create temp file $path: $!";
359    
360 dpavlin 9 print TMP '<html><head>';
361 dpavlin 4
362 dpavlin 9 $arg->{'body'} ||= '';
363    
364 dpavlin 4 if ($arg->{'meta'}) {
365 dpavlin 11 foreach my $name (keys %{$arg->{'meta'}}) {
366     my $content = $arg->{'meta'}->{$name};
367     print TMP qq{<meta name="$name" content="$content">};
368     $arg->{'body'} .= " $content" if ($self->{'meta_in_body'});
369     }
370 dpavlin 4 }
371 dpavlin 9
372     if (defined($arg->{'title'})) {
373     print TMP '<title>' . ($arg->{'title'} || '') . '</title>';
374     $arg->{'body'} .= " ".$arg->{'title'} if ($self->{'meta_in_body'});
375     }
376    
377     print TMP '</head><body>' . $arg->{'body'} . '</body></html>';
378 dpavlin 4
379     close(TMP) || confess "can't close tmp file ".$arg->{'path'}.": $!";
380    
381 dpavlin 14 print { $self->{'_index_fh'} } $arg->{'path'}."\n";
382 dpavlin 4 }
383    
384 dpavlin 8 =head2 _close_index
385    
386     Close index after indexing.
387    
388     $i->_close_index;
389    
390     You have to close index before searching.
391    
392     =cut
393    
394     sub _close_index {
395     my $self = shift;
396    
397 dpavlin 14 return unless ($self->{'_index_fh'});
398 dpavlin 8
399     print STDERR "## close index\n" if ($self->{'debug'});
400    
401 dpavlin 14 close($self->{'_index_fh'});
402     undef $self->{'_index_fh'};
403 dpavlin 8 }
404    
405 dpavlin 1 1;
406     __END__
407    
408     =head2 EXPORT
409    
410     None by default.
411    
412 dpavlin 3 =head1 RELATED
413    
414     =head2 Debian
415    
416     Debian version of swish++ is often old (version 5 at moment of this writing
417     while version 6 is available in source code), so this module by default
418     uses executable names B<index> and B<search> for self-compiled version
419 dpavlin 10 instead of one from Debian package. See L<new> how to specify Debian
420 dpavlin 3 default binaries B<index++> and B<search++>.
421    
422 dpavlin 5 =head2 SWISH++
423 dpavlin 1
424 dpavlin 5 Aside from very good rewrite in C++, SWISH++ is fatster because it has
425     claver heuristics about which data in input files are words to index and
426     which are not. It's based on English language and might be best choice if
427     you plan to install large amount of long text documents.
428 dpavlin 1
429 dpavlin 5 However, if you plan to index all data from structured storage (e.g. RDBMS)
430     you might want B<all> words from data to end up in index as opposed to just
431     those which look like English words. This is especially important if you
432     don't plan to index English texts with this module.
433 dpavlin 1
434 dpavlin 5 With distribution build versions of SWISH++ you might have problems with
435     disepearing words. To overcome this problem, you will have to compile and
436     configure SWISH++ yourself (because language characteristics are
437     compilation-time option).
438 dpavlin 1
439 dpavlin 5 Compilation of SWISH++ is easy process well described on project's web
440     pages. To see my very relaxed sample configuration take a look at C<swish++>
441     directory included in distribution.
442    
443 dpavlin 11 =head2 SWISH++ config
444    
445     C<config.h> located in C<swish++> directory of this distribution is relaxed
446     SWISH++ configuration that will index all words passed to it. This
447     configuration is needed for B<date test> because default configuration
448     doesn't recognize 2004-12-05 as date. Have in mind that your index size
449     might explode.
450    
451 dpavlin 5 =head1 SEE ALSO
452    
453     C<swish++> web site L<http://homepage.mac.com/pauljlucas/software/swish/>
454    
455 dpavlin 1 =head1 AUTHOR
456    
457 dpavlin 5 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
458 dpavlin 1
459     =head1 COPYRIGHT AND LICENSE
460    
461     Copyright (C) 2004 by Dobrica Pavlinusic
462    
463     This library is free software; you can redistribute it and/or modify
464     it under the same terms as Perl itself, either Perl version 5.8.4 or,
465     at your option, any later version of Perl 5 you may have available.
466    
467    
468     =cut

  ViewVC Help
Powered by ViewVC 1.1.26