/[SWISH-PlusPlus]/trunk/PlusPlus.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/PlusPlus.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 10 - (hide annotations)
Sun Dec 5 12:48:00 2004 UTC (19 years, 3 months ago) by dpavlin
File size: 9107 byte(s)
rename open costructor to new

1 dpavlin 1 package SWISH::PlusPlus;
2    
3     use 5.008004;
4     use strict;
5     use warnings;
6    
7 dpavlin 10 our $VERSION = '0.04';
8 dpavlin 1
9     use Carp;
10 dpavlin 4 use File::Temp qw/ tempdir /;
11 dpavlin 8 #use YAML;
12 dpavlin 1
13     =head1 NAME
14    
15     SWISH::PlusPlus - Perl extension for SWISH++
16    
17     =head1 SYNOPSIS
18    
19     use SWISH::PlusPlus;
20     blah blah blah
21    
22     =head1 DESCRIPTION
23    
24     This is a Perl module for using the SWISH++ indexer by Paul J. Lucas. SWISH++ is
25 dpavlin 3 a rewrite of swish-e in C++ which is extremely fast (thanks to mmap), but without
26     support for properties (which this module tries to fix).
27 dpavlin 1
28 dpavlin 3 Implementation of this module is crafted after L<Plucene::Simple> and it
29     should be easy to replace Plucene with this module for increased
30     performance. However, this module is not plug-in replacement.
31    
32 dpavlin 1 =head1 METHODS
33    
34 dpavlin 10 =head2 new
35 dpavlin 1
36     Create new indexing object.
37    
38 dpavlin 10 my $i = SWISH::PlusPlus->new(
39 dpavlin 3 index_dir => '/path/to/index',
40     index => 'index++',
41     search => 'search++',
42 dpavlin 8 debug => 1,
43 dpavlin 9 meta_in_body => 1,
44     use_stopwords => 1,
45 dpavlin 1 );
46    
47 dpavlin 10 Options to new are following:
48 dpavlin 1
49     =over 5
50    
51 dpavlin 3 =item C<index_dir>
52    
53     Path to directory in which index will be created.
54    
55 dpavlin 1 =item C<index>
56    
57 dpavlin 3 Full or partial path to SWISH++ index executable. By default, it's B<index>
58     for self-compiled version. If you use Debian GNU/Linux package specify
59     B<index++>. See C<Debian>.
60 dpavlin 1
61 dpavlin 3 =item C<search>
62    
63     Full or partial path to SWISH++ search executable. By default, it's B<search>.
64    
65 dpavlin 8 =item C<debug>
66    
67     This option (off by default) will produce a lot of debugging output on
68     C<STDERR> prefixed by C<##>.
69    
70 dpavlin 9 =item C<meta_in_body>
71    
72     This option (off by default) makes the content of meta fields searchable
73     without specifying them (as if they were in the body of the document). This will
74     somewhat increase index size.
75    
76     =item C<use_stopwords>
77    
78     Use built-in SWISH++ stop words. By default, they are disabled.
79    
80 dpavlin 1 =back
81    
82     =cut
83    
# Constructor: builds a SWISH::PlusPlus object from a list of key/value options.
#
# Options are stored in the object hash as given. `index_dir` is required
# (croaks with "need index_dir" otherwise) and is created when it does not
# exist yet. `index` and `search` default to 'index' and 'search'. With
# `debug` set, the effective settings are printed to STDERR.
#
# Returns the blessed object; dies (confess) if the index directory cannot
# be created.
sub new {
    my $class = shift;
    my $self = {@_};
    bless($self, $class);

    foreach (qw(index_dir)) {
        croak "need $_" unless $self->{$_};
    }

    if (! -e $self->{'index_dir'}) {
        # "or" instead of "||": "||" binds to mkdir's argument, so a failed
        # mkdir would never reach confess. Report the directory we tried to
        # create, not the name of the index executable.
        mkdir($self->{'index_dir'})
            or confess "can't create index ",$self->{'index_dir'},": $!";
    }

    # default executables
    $self->{'index'} ||= 'index';
    $self->{'search'} ||= 'search';

    print STDERR "## new index_dir: ",$self->{'index_dir'}," index: ",$self->{'index'}, " search: ",$self->{'search'},"\n" if ($self->{'debug'});

    return $self;
}
105    
106    
107 dpavlin 3 =head2 check_bin
108    
109 dpavlin 10 Check if swish++ binaries specified in L<new> are available and verify
110 dpavlin 3 version signature.
111    
112     if ($i->check_bin) {
113     print "swish++ binaries found\n";
114     };
115    
116     It will also setup property
117    
118     $i->{'version'}
119    
120     which you can examine to see version.
121    
122     =cut
123    
# Runs the configured index and search executables with -V (stderr merged
# into stdout) and inspects the banner each one prints.
#
# Dies (confess) when either command produces no output or when its output
# does not start with "SWISH++". When both banners are identical the banner
# is stored in $self->{'version'} and 1 is returned; otherwise a warning is
# issued and nothing is returned.
sub check_bin {
    my ($self) = @_;

    # collect both banners first, index before search
    my %banner;
    for my $tool (qw(index search)) {
        my $output = `$self->{$tool} -V 2>&1`;
        confess("can't find '",$self->{$tool},"' binary") unless $output;
        $banner{$tool} = $output;
    }
    chomp $banner{'index'};
    chomp $banner{'search'};

    # both executables have to identify themselves as SWISH++
    for my $tool (qw(index search)) {
        next if ($banner{$tool} =~ m/^SWISH\+\+/);
        confess($self->{$tool}," binary is not SWISH++");
    }

    unless ($banner{'index'} eq $banner{'search'}) {
        carp "version difference: index is $banner{'index'} while search is $banner{'search'}";
        return;
    }

    $self->{'version'} = $banner{'index'};
    return 1;
}
145    
146 dpavlin 4 =head2 index_document
147    
148     Quick way to add simple data to index.
149    
150     $i->index_document($key, $data);
151     $i->index_document( 42 => 'meaning of life' );
152    
153     =cut
154    
# Indexes a list of key => text pairs: every key is handed to _create_doc
# as the document path and its value as the document body.
# Always returns 1.
sub index_document {
    my ($self, %doc) = @_;

    $self->_create_doc( path => $_, body => $doc{$_} ) for (keys %doc);

    return 1;
}
169    
170 dpavlin 9 =head2 add
171    
172     Add document with metadata to index.
173    
174     $i->add(
175     path => 'path/to/document',
176     title => 'this is result title',
177     meta => {
178     description => 'this is description meta tag',
179     date => '2004-11-04',
180     author => 'Dobrica Pavlinusic',
181     },
182     body => 'this is text without meta data',
183     );
184    
185     This is a thin wrapper around L<_create_doc>.
186    
187     =cut
188    
# Passes all of its arguments through to _create_doc unchanged.
# Always returns 1.
sub add {
    my ($self, @document) = @_;

    $self->_create_doc(@document);

    return 1;
}
196 dpavlin 8 =head2 search
197    
198     Search your index.
199    
200     my @results = $i->search("swish query");
201    
202     Returns a list of hash references, one per hit, with keys C<rank>, C<path>, C<size> and C<title>.
203    
204     =cut
205    
# Runs the search executable against <index_dir>/index and parses its output.
#
# Takes the query string; returns nothing when the query is false (missing,
# empty or "0"). Any open indexer is closed first via _close_index.
#
# Output lines starting with "#" are skipped; every other line is split on
# single spaces into at most four fields. Returns a list of hash references
# with keys rank, path, size and title. Dies (confess) when the search
# program cannot be started or when closing the pipe fails.
sub search {
    my $self = shift;

    my $query = shift || return;

    $self->_close_index;

    my @results;

    # List-form pipe open: the query reaches the search program as a single
    # argument without passing through a shell, so quotes, "$", backticks
    # and backslashes in it are not interpreted and need no escaping.
    my @cmd = ($self->{'search'}, '-i', $self->{'index_dir'}.'/index', $query);
    print STDERR "## search @cmd\n" if ($self->{'debug'});

    open(my $search_fh, '-|', @cmd) or confess "can't start @cmd: $!";
    while (my $line = <$search_fh>) {
        next if ($line =~ /^#/);
        chomp $line;
        print STDERR "## $line\n" if ($self->{'debug'});
        my ($rank,$path,$size,$title) = split(/ /,$line,4);
        push @results, {
            rank => $rank,
            path => $path,
            size => $size,
            title => $title,
        }
    }

    close($search_fh) or confess "can't close search";

    return @results;
}
241    
242 dpavlin 4 =head1 PRIVATE METHODS
243    
244     Private methods implement internals for creating temporary file needed for
245     swish++. You should have no need to call them directly, and they are here
246     just to have documentation.
247    
248 dpavlin 9 =head2 _init_indexer
249 dpavlin 4
250     Create temporary directory in which files for indexing will be created and
251     start index process.
252    
253 dpavlin 9 $i->_init_indexer || die "can't start indexer";
254 dpavlin 4
255 dpavlin 9 It will also create empty file C<_stopwords_> to disable stop words.
256    
257 dpavlin 4 =cut
258    
# Prepares a temporary working directory and starts the index program.
#
# Creates a temporary directory (removed at program exit), stores it in
# $self->{'tmp_dir'} and makes it the current directory. Unless the
# `use_stopwords` option is set, writes a stop word file `_stopwords_`
# containing a single space and passes it to the indexer with -s.
#
# The indexer is started with -v 4, -e "html:*", -i <index_dir>/index and
# "-" as a pipe we write to. The handle is stored in $self->{'index_fh'}
# and returned. Dies (confess) if chdir or starting the indexer fails.
sub _init_indexer {
    my $self = shift;

    $self->{'tmp_dir'} = tempdir( CLEANUP => 1 ) || confess "can't create temporary directory: $!";

    # "or" instead of "||": "||" binds to chdir's argument, so a failed
    # chdir would never reach confess
    chdir($self->{'tmp_dir'}) or confess "can't chdir to ".$self->{'tmp_dir'}.": $!";

    my @opt = ('-v', '4');

    # the option is documented as use_stopwords
    unless ($self->{'use_stopwords'}) {
        if (open(my $stop_fh, '>', '_stopwords_')) {
            print $stop_fh " ";
            close($stop_fh);
            push @opt, '-s', '_stopwords_';
        } else {
            # without the file there is nothing to pass with -s
            carp "can't create empty stopword file, skipping\n";
        }
    }

    # list-form pipe open: arguments reach the indexer unchanged, without a
    # shell, so the html:* pattern and the index path need no quoting
    my @cmd = ($self->{'index'}, @opt, '-e', 'html:*', '-i', $self->{'index_dir'}.'/index', '-');

    open(my $index_fh, '|-', @cmd) or confess "can't start index with @cmd: $!";
    $self->{'index_fh'} = $index_fh;

    return $self->{'index_fh'};
}
283    
284     =head2 _create_doc
285    
286     Create temporary file and pass its name to swish++
287    
288     $i->_create_doc(
289     path => 'path/to/store/in/index',
290 dpavlin 9 title => 'this is title in results',
291 dpavlin 4 body => 'data to store in body tag',
292     meta => {
293     'meta name' => 'data for this meta',
294     'another' => 'again more data',
295     }
296     );
297    
298 dpavlin 8 To delete document, just omit body and meta data.
299    
300 dpavlin 4 =cut
301    
# Writes one document as a small HTML file and queues it for indexing.
#
# Arguments are key/value pairs:
#   path  - name of the file to write (relative names resolve against the
#           current directory); this same string is sent to the indexer
#   title - optional, written as <title>; appended to the body as well when
#           the meta_in_body option is set
#   body  - optional text for <body>, defaults to an empty string
#   meta  - not supported yet, dies with "not yet implemented"
#
# Starts the indexer through _init_indexer when none is running. Returns
# the result of printing the path to the indexer handle.
sub _create_doc {
    my $self = shift;

    my $arg = {@_};

    # open() with an undefined file name silently creates an anonymous
    # temporary file, so insist on a usable path
    croak "need path" unless (defined($arg->{'path'}) && length($arg->{'path'}));

    # refuse unsupported input before any file is created
    if ($arg->{'meta'}) {
        confess "not yet implemented";
    }

    # open indexer if needed
    $self->{'index_fh'} ||= $self->_init_indexer;

    confess "no tmp_dir?" unless ($self->{'tmp_dir'});

    open(my $tmp_fh, '>', $arg->{'path'}) or confess "can't create temp file ".$arg->{'path'}.": $!";

    print $tmp_fh '<html><head>';

    $arg->{'body'} ||= '';

    if (defined($arg->{'title'})) {
        print $tmp_fh '<title>' . ($arg->{'title'} || '') . '</title>';
        $arg->{'body'} .= " ".$arg->{'title'} if ($self->{'meta_in_body'});
    }

    print $tmp_fh '</head><body>' . $arg->{'body'} . '</body></html>';

    close($tmp_fh) or confess "can't close tmp file ".$arg->{'path'}.": $!";

    # the indexer reads the names of files to index, one per line
    return print { $self->{'index_fh'} } $arg->{'path'}."\n";
}
333    
334 dpavlin 8 =head2 _close_index
335    
336     Close index after indexing.
337    
338     $i->_close_index;
339    
340     You have to close index before searching.
341    
342     =cut
343    
# Closes the handle to the index program, if one is open, and clears
# $self->{'index_fh'}. Does nothing when no indexer handle is set.
#
# A failing close is reported with a warning rather than ignored: for a
# pipe, close also fails when the child exits with a non-zero status, which
# is the only place an indexer failure becomes visible.
sub _close_index {
    my $self = shift;

    return unless ($self->{'index_fh'});

    print STDERR "## close index\n" if ($self->{'debug'});

    unless (close($self->{'index_fh'})) {
        # $! is set for I/O errors; otherwise $? holds the child's status
        carp "problem closing index: ", ($! ? "$!" : "exit status $?");
    }
    undef $self->{'index_fh'};

    return;
}
354    
355 dpavlin 1 1;
356     __END__
357    
358     =head2 EXPORT
359    
360     None by default.
361    
362 dpavlin 3 =head1 RELATED
363    
364     =head2 Debian
365    
366     Debian version of swish++ is often old (version 5 at moment of this writing
367     while version 6 is available in source code), so this module by default
368     uses executable names B<index> and B<search> for self-compiled version
369 dpavlin 10 instead of one from Debian package. See L<new> how to specify Debian
370 dpavlin 3 default binaries B<index++> and B<search++>.
371    
372 dpavlin 5 =head2 SWISH++
373 dpavlin 1
374 dpavlin 5 Aside from being a very good rewrite in C++, SWISH++ is faster because it has
375     clever heuristics about which data in input files are words to index and
376     which are not. It's based on English language and might be best choice if
377     you plan to install large amount of long text documents.
378 dpavlin 1
379 dpavlin 5 However, if you plan to index all data from structured storage (e.g. RDBMS)
380     you might want B<all> words from data to end up in index as opposed to just
381     those which look like English words. This is especially important if you
382     don't plan to index English texts with this module.
383 dpavlin 1
384 dpavlin 5 With distribution-built versions of SWISH++ you might have problems with
385     disappearing words. To overcome this problem, you will have to compile and
386     configure SWISH++ yourself (because language characteristics are
387     compilation-time option).
388 dpavlin 1
389 dpavlin 5 Compilation of SWISH++ is easy process well described on project's web
390     pages. To see my very relaxed sample configuration take a look at C<swish++>
391     directory included in distribution.
392    
393     =head1 SEE ALSO
394    
395     C<swish++> web site L<http://homepage.mac.com/pauljlucas/software/swish/>
396    
397 dpavlin 1 =head1 AUTHOR
398    
399 dpavlin 5 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
400 dpavlin 1
401     =head1 COPYRIGHT AND LICENSE
402    
403     Copyright (C) 2004 by Dobrica Pavlinusic
404    
405     This library is free software; you can redistribute it and/or modify
406     it under the same terms as Perl itself, either Perl version 5.8.4 or,
407     at your option, any later version of Perl 5 you may have available.
408    
409    
410     =cut

  ViewVC Help
Powered by ViewVC 1.1.26