/[SWISH-PlusPlus]/trunk/PlusPlus.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/PlusPlus.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 11 - (hide annotations)
Sun Dec 5 13:30:57 2004 UTC (19 years, 4 months ago) by dpavlin
File size: 9611 byte(s)
support for meta in data, relaxed swish++ config, distribution cleanup

1 dpavlin 1 package SWISH::PlusPlus;
2    
3     use 5.008004;
4     use strict;
5     use warnings;
6    
7 dpavlin 11 our $VERSION = '0.05';
8 dpavlin 1
9     use Carp;
10 dpavlin 4 use File::Temp qw/ tempdir /;
11 dpavlin 8 #use YAML;
12 dpavlin 1
13     =head1 NAME
14    
15     SWISH::PlusPlus - Perl extension for SWISH++
16    
17     =head1 SYNOPSIS
18    
19     use SWISH::PlusPlus;
20     blah blah blah
21    
22     =head1 DESCRIPTION
23    
24     This is a Perl module for using the SWISH++ indexer by Paul J. Lucas. SWISH++ is a
25 dpavlin 3 rewrite of swish-e in C++ which is extremely fast (thanks to mmap), but without
26     support for properties (which this module tries to fix).
27 dpavlin 1
28 dpavlin 3 Implementation of this module is crafted after L<Plucene::Simple> and it
29     should be easy to replace Plucene with this module for increased
30     performance. However, this module is not a drop-in replacement.
31    
32 dpavlin 1 =head1 METHODS
33    
34 dpavlin 10 =head2 new
35 dpavlin 1
36     Create new indexing object.
37    
38 dpavlin 10 my $i = SWISH::PlusPlus->new(
39 dpavlin 3 index_dir => '/path/to/index',
40     index => 'index++',
41     search => 'search++',
42 dpavlin 8 debug => 1,
43 dpavlin 9 meta_in_body => 1,
44     use_stopwords => 1,
45 dpavlin 1 );
46    
47 dpavlin 10 Options to new are following:
48 dpavlin 1
49     =over 5
50    
51 dpavlin 3 =item C<index_dir>
52    
53     Path to directory in which index will be created.
54    
55 dpavlin 1 =item C<index>
56    
57 dpavlin 3 Full or partial path to SWISH++ index executable. By default, it's B<index>
58     for self-compiled version. If you use Debian GNU/Linux package specify
59     B<index++>. See C<Debian>.
60 dpavlin 1
61 dpavlin 3 =item C<search>
62    
63     Full or partial path to SWISH++ search executable. By default, it's B<search>.
64    
65 dpavlin 8 =item C<debug>
66    
67     This option (off by default) will produce a lot of debugging output on
68     C<STDERR> prefixed by C<##>.
69    
70 dpavlin 9 =item C<meta_in_body>
71    
72     This option (off by default) makes the content of meta fields searchable
73     without specifying them (as if they were in the body of the document). This will
74     somewhat increase index size.
75    
76     =item C<use_stopwords>
77    
78     Use built-in SWISH++ stop words. By default, they are disabled.
79    
80 dpavlin 1 =back
81    
82     =cut
83    
# Constructor.
#
# Takes a list of key/value options (index_dir, index, search, debug,
# meta_in_body, use_stopwords) and stores them in the blessed hash.
# index_dir is mandatory and is created if it does not exist yet; the
# index and search executables default to 'index' and 'search'.
#
# Croaks if index_dir is missing, confesses if it can't be created.
# Returns the new object.
sub new {
    my $class = shift;
    my $self = {@_};
    bless($self, $class);

    foreach my $required (qw(index_dir)) {
        croak "need $required" unless $self->{$required};
    }

    if (! -e $self->{'index_dir'}) {
        # parenthesised call so that a failing mkdir really reaches confess
        # (with "mkdir $dir || confess" the || binds to $dir, not to mkdir)
        mkdir($self->{'index_dir'})
            || confess "can't create index ",$self->{'index_dir'},": $!";
    }

    # default executables
    $self->{'index'} ||= 'index';
    $self->{'search'} ||= 'search';

    print STDERR "## new index_dir: ",$self->{'index_dir'}," index: ",$self->{'index'}, " search: ",$self->{'search'},"\n" if ($self->{'debug'});

    return $self;
}
105    
106    
107 dpavlin 3 =head2 check_bin
108    
109 dpavlin 10 Check if swish++ binaries specified in L<new> are available and verify
110 dpavlin 3 version signature.
111    
112     if ($i->check_bin) {
113     print "swish++ binaries found\n";
114     };
115    
116     It will also setup property
117    
118     $i->{'version'}
119    
120     which you can examine to see version.
121    
122     =cut
123    
# Run both configured executables with -V and compare their version banners.
#
# Confesses when a binary produces no output or when its banner does not
# start with "SWISH++". When both banners are equal the banner is stored in
# $self->{'version'} and 1 is returned; otherwise a warning is issued and
# nothing is returned.
sub check_bin {
    my ($self) = @_;

    # collect the banners, index first and search second
    my %banner;
    foreach my $tool (qw(index search)) {
        my $probe = $self->{$tool} . ' -V 2>&1';
        $banner{$tool} = qx($probe) || confess "can't find '",$self->{$tool},"' binary";
    }

    chomp(@banner{qw(index search)});

    # both have to identify themselves as SWISH++
    foreach my $tool (qw(index search)) {
        confess $self->{$tool}," binary is not SWISH++" unless ($banner{$tool} =~ m/^SWISH\+\+/);
    }

    unless ($banner{'index'} eq $banner{'search'}) {
        carp "version difference: index is $banner{'index'} while search is $banner{'search'}";
        return;
    }

    $self->{'version'} = $banner{'index'};
    return 1;
}
145    
146 dpavlin 4 =head2 index_document
147    
148     Quick way to add simple data to index.
149    
150     $i->index_document($key, $data);
151     $i->index_document( 42 => 'meaning of life' );
152    
153     =cut
154    
# Index plain documents given as a list of key => text pairs.
#
# Every pair is handed to _create_doc with the key as path and the text as
# body. Always returns 1.
sub index_document {
    my ($self, %doc) = @_;

    while (my ($id, $text) = each %doc) {
        $self->_create_doc(
            path => $id,
            body => $text,
        );
    }

    return 1;
}
169    
170 dpavlin 9 =head2 add
171    
172     Add document with metadata to index.
173    
174     $i->add(
175     path => 'path/to/document',
176     title => 'this is result title',
177     meta => {
178     description => 'this is description meta tag',
179     date => '2004-11-04',
180     author => 'Dobrica Pavlinusic',
181     },
182     body => 'this is text without meta data',
183     );
184    
185     This is a thin wrapper around L<_create_doc>.
186    
187     =cut
188    
# Add one document (path, title, meta, body) to the index.
#
# All arguments are forwarded unchanged to _create_doc. Always returns 1.
sub add {
    my ($self, @doc) = @_;

    $self->_create_doc(@doc);

    return 1;
}
196 dpavlin 8 =head2 search
197    
198     Search your index.
199    
200     my @results = $i->search("swish query");
201    
202     Returns list of hash references with keys C<rank>, C<path>, C<size> and C<title>.
203    
204     =cut
205    
# Run the search executable against the index and collect the hits.
#
# Takes the query string; returns nothing when the query is false.
# The index is closed first (via _close_index) so that pending documents
# are written before searching.
#
# Returns a list of hash references with keys rank, path, size and title,
# one for every output line of the search executable which doesn't start
# with '#'. Confesses when the search process can't be started or when
# closing its pipe fails.
sub search {
    my $self = shift;

    my $query = shift || return;

    $self->_close_index;

    my @results;

    # List-form pipe open: the query reaches the executable as one argument,
    # verbatim and without passing through a shell, so quotes, backticks or
    # $(...) in the query can neither break the command nor run anything.
    my @cmd = ($self->{'search'}, '-i', $self->{'index_dir'}.'/index', $query);
    print STDERR "## search @cmd\n" if ($self->{'debug'});

    open(my $search_fh, '-|', @cmd) || confess "can't start @cmd: $!";
    while (my $line = <$search_fh>) {
        # lines starting with # are not results
        next if ($line =~ /^#/);
        chomp $line;
        print STDERR "## $line\n" if ($self->{'debug'});
        # title is the last field and may contain spaces itself
        my ($rank,$path,$size,$title) = split(/ /,$line,4);
        push @results, {
            rank => $rank,
            path => $path,
            size => $size,
            title => $title,
        };
    }

    close($search_fh) || confess "can't close search";

    return @results;
}
241    
242 dpavlin 4 =head1 PRIVATE METHODS
243    
244     Private methods implement internals for creating temporary file needed for
245     swish++. You should have no need to call them directly, and they are here
246     just to have documentation.
247    
248 dpavlin 9 =head2 _init_indexer
249 dpavlin 4
250     Create temporary directory in which files for indexing will be created and
251     start index process.
252    
253 dpavlin 9 $i->_init_indexer || die "can't start indexer";
254 dpavlin 4
255 dpavlin 9 It will also create empty file C<_stopwords_> to disable stop words.
256    
257 dpavlin 4 =cut
258    
# Create temporary working directory, change into it and start the index
# executable reading file names from a pipe.
#
# Side effects: sets $self->{'tmp_dir'} and $self->{'index_fh'} and changes
# the current working directory of the process to the temporary directory.
# Unless stop words are requested, an (almost) empty _stopwords_ file is
# created there and passed to the indexer with -s.
#
# Returns the file handle of the pipe to the indexer; confesses when the
# temporary directory can't be created or entered or when the indexer
# can't be started.
#
# NOTE(review): index_dir is used as given after the chdir, so a relative
# index_dir would presumably be resolved against the temporary directory --
# confirm that callers always pass absolute paths.
sub _init_indexer {
    my $self = shift;

    $self->{'tmp_dir'} = tempdir( CLEANUP => 1 ) || confess "can't create temporary directory: $!";

    # parenthesised call so that a failing chdir really reaches confess
    chdir($self->{'tmp_dir'}) || confess "can't chdir to ".$self->{'tmp_dir'}.": $!";

    my $opt = "-v 4";

    # use_stopwords is the documented option; the misspelled use_stopwrods
    # is what used to be checked here, so it is still honoured
    unless ($self->{'use_stopwords'} || $self->{'use_stopwrods'}) {
        if (open(my $stop_fh, '>', "_stopwords_")) {
            print {$stop_fh} " ";
            close($stop_fh);
            $opt .= " -s _stopwords_";
        } else {
            # without the file there is nothing to pass to -s
            carp "can't create empty stopword file, skipping\n";
        }
    }

    # command is a single string on purpose: it is run through the shell,
    # exactly as before, only the pipe mode is given explicitly
    my $index_cmd = $self->{'index'}.' '.$opt.' -e "html:*" -i '.$self->{'index_dir'}.'/index -';

    open(my $index_fh, '|-', $index_cmd) || confess "can't start index with $index_cmd: $!";
    $self->{'index_fh'} = $index_fh;

    return $self->{'index_fh'};
}
283    
284     =head2 _create_doc
285    
286     Create temporary file and pass its name to swish++
287    
288     $i->_create_doc(
289     path => 'path/to/store/in/index',
290 dpavlin 9 title => 'this is title in results',
291 dpavlin 4 body => 'data to store in body tag',
292     meta => {
293     'meta name' => 'data for this meta',
294     'another' => 'again more data',
295     }
296     );
297    
298 dpavlin 8 To delete document, just omit body and meta data.
299    
300 dpavlin 4 =cut
301    
# Write one document as a small HTML file and pass its name to the indexer.
#
# Arguments (key/value list):
#   path  - name of the file to write; also the line sent to the indexer
#   title - optional, written as <title>
#   body  - optional, written inside <body>
#   meta  - optional hash reference, each pair written as a <meta> tag
#
# The indexer is started through _init_indexer when no pipe is open yet.
# With the meta_in_body option the meta values and the title are appended
# to the body as well.
#
# The file is written relative to the current working directory, which
# _init_indexer has set to the temporary directory.
#
# Returns the result of printing the path to the indexer pipe. Dies when
# the file can't be created, confesses when path is missing or the file
# can't be closed.
sub _create_doc {
    my $self = shift;

    my $arg = {@_};

    # open indexer if needed
    $self->{'index_fh'} ||= $self->_init_indexer;

    confess "no tmp_dir?" unless ($self->{'tmp_dir'});

    my $file = $arg->{'path'};
    confess "need path" unless (defined($file) && length($file));

    # path may contain directories (e.g. 'path/to/document'), so make sure
    # they exist before the file is created
    require File::Basename;
    require File::Path;
    my $dir = File::Basename::dirname($file);
    File::Path::mkpath($dir) unless (-d $dir);

    open(my $tmp_fh, '>', $file) || die "can't create temp file ".$file.": $!";

    print {$tmp_fh} '<html><head>';

    # defined check (not ||) so that a body of '0' is kept
    my $body = defined($arg->{'body'}) ? $arg->{'body'} : '';

    if ($arg->{'meta'}) {
        # sorted so that the generated file is the same on every run
        foreach my $name (sort keys %{$arg->{'meta'}}) {
            my $content = $arg->{'meta'}->{$name};
            # a double quote would end the attribute value prematurely
            (my $attr_name = $name) =~ s/"/&quot;/g;
            (my $attr_content = $content) =~ s/"/&quot;/g;
            print {$tmp_fh} qq{<meta name="$attr_name" content="$attr_content">};
            $body .= " $content" if ($self->{'meta_in_body'});
        }
    }

    if (defined($arg->{'title'})) {
        print {$tmp_fh} '<title>' . $arg->{'title'} . '</title>';
        $body .= " ".$arg->{'title'} if ($self->{'meta_in_body'});
    }

    print {$tmp_fh} '</head><body>' . $body . '</body></html>';

    close($tmp_fh) || confess "can't close tmp file ".$file.": $!";

    # indexer reads the names of files to index from its standard input
    my $sent = print { $self->{'index_fh'} } $file."\n";
    carp "can't pass $file to indexer: $!" unless ($sent);

    return $sent;
}
337    
338 dpavlin 8 =head2 _close_index
339    
340     Close index after indexing.
341    
342     $i->_close_index;
343    
344     You have to close index before searching.
345    
346     =cut
347    
# Close the pipe to the indexer, if there is one.
#
# Does nothing (and returns nothing) when no index file handle is stored in
# the object. Otherwise the handle is closed and $self->{'index_fh'} is
# reset to undef, so that the next document starts a new indexer.
sub _close_index {
    my ($self) = @_;

    if ($self->{'index_fh'}) {
        print STDERR "## close index\n" if ($self->{'debug'});

        close($self->{'index_fh'});
        return undef $self->{'index_fh'};
    }

    return;
}
358    
359 dpavlin 1 1;
360     __END__
361    
362     =head2 EXPORT
363    
364     None by default.
365    
366 dpavlin 3 =head1 RELATED
367    
368     =head2 Debian
369    
370     Debian version of swish++ is often old (version 5 at moment of this writing
371     while version 6 is available in source code), so this module by default
372     uses executable names B<index> and B<search> for self-compiled version
373 dpavlin 10 instead of one from Debian package. See L<new> how to specify Debian
374 dpavlin 3 default binaries B<index++> and B<search++>.
375    
376 dpavlin 5 =head2 SWISH++
377 dpavlin 1
378 dpavlin 5 Aside from being a very good rewrite in C++, SWISH++ is faster because it has
379     clever heuristics about which data in input files are words to index and
380     which are not. It's based on English language and might be the best choice if
381     you plan to install large amount of long text documents.
382 dpavlin 1
383 dpavlin 5 However, if you plan to index all data from structured storage (e.g. RDBMS)
384     you might want B<all> words from data to end up in index as opposed to just
385     those which look like English words. This is especially important if you
386     don't plan to index English texts with this module.
387 dpavlin 1
388 dpavlin 5 With distribution-built versions of SWISH++ you might have problems with
389     disappearing words. To overcome this problem, you will have to compile and
390     configure SWISH++ yourself (because language characteristics are a
391     compilation-time option).
392 dpavlin 1
393 dpavlin 5 Compilation of SWISH++ is easy process well described on project's web
394     pages. To see my very relaxed sample configuration take a look at C<swish++>
395     directory included in distribution.
396    
397 dpavlin 11 =head2 SWISH++ config
398    
399     C<config.h> located in C<swish++> directory of this distribution is relaxed
400     SWISH++ configuration that will index all words passed to it. This
401     configuration is needed for B<date test> because default configuration
402     doesn't recognize 2004-12-05 as date. Keep in mind that your index size
403     might explode.
404    
405 dpavlin 5 =head1 SEE ALSO
406    
407     C<swish++> web site L<http://homepage.mac.com/pauljlucas/software/swish/>
408    
409 dpavlin 1 =head1 AUTHOR
410    
411 dpavlin 5 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
412 dpavlin 1
413     =head1 COPYRIGHT AND LICENSE
414    
415     Copyright (C) 2004 by Dobrica Pavlinusic
416    
417     This library is free software; you can redistribute it and/or modify
418     it under the same terms as Perl itself, either Perl version 5.8.4 or,
419     at your option, any later version of Perl 5 you may have available.
420    
421    
422     =cut

  ViewVC Help
Powered by ViewVC 1.1.26