/[SWISH-PlusPlus]/trunk/PlusPlus.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/PlusPlus.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 8 - (hide annotations)
Sat Dec 4 17:49:20 2004 UTC (19 years, 4 months ago) by dpavlin
File size: 7882 byte(s)
search is working

1 dpavlin 1 package SWISH::PlusPlus;
2    
3     use 5.008004;
4     use strict;
5     use warnings;
6    
7 dpavlin 3 our $VERSION = '0.02';
8 dpavlin 1
9     use Carp;
10 dpavlin 4 use File::Temp qw/ tempdir /;
11 dpavlin 8 #use YAML;
12 dpavlin 1
13     =head1 NAME
14    
15     SWISH::PlusPlus - Perl extension for SWISH++
16    
17     =head1 SYNOPSIS
18    
19     use SWISH::PlusPlus;
20     blah blah blah
21    
22     =head1 DESCRIPTION
23    
24     This is a Perl module for using the SWISH++ indexer by Paul J. Lucas. SWISH++ is
25 dpavlin 3 a rewrite of swish-e in C++ which is extremely fast (thanks to mmap), but without
26     support for properties (which this module tries to fix).
27 dpavlin 1
28 dpavlin 3 The implementation of this module is modelled on L<Plucene::Simple>, and it
29     should be easy to replace Plucene with this module for increased
30     performance. However, this module is not a drop-in replacement.
31    
32 dpavlin 1 =head1 METHODS
33    
34     =head2 open
35    
36     Create new indexing object.
37    
38 dpavlin 3 my $i = SWISH::PlusPlus->open(
39     index_dir => '/path/to/index',
40     index => 'index++',
41     search => 'search++',
42 dpavlin 8 debug => 1,
43 dpavlin 1 );
44    
45     Options to open are following:
46    
47     =over 5
48    
49 dpavlin 3 =item C<index_dir>
50    
51     Path to directory in which index will be created.
52    
53 dpavlin 1 =item C<index>
54    
55 dpavlin 3 Full or partial path to SWISH++ index executable. By default, it's B<index>
56     for self-compiled version. If you use Debian GNU/Linux package specify
57     B<index++>. See C<Debian>.
58 dpavlin 1
59 dpavlin 3 =item C<search>
60    
61     Full or partial path to SWISH++ search executable. By default, it's B<search>.
62    
63 dpavlin 8 =item C<debug>
64    
65     This option (off by default) will produce a lot of debugging output on
66     C<STDERR> prefixed by C<##>.
67    
68 dpavlin 1 =back
69    
70     =cut
71    
# Constructor: build a SWISH::PlusPlus object from key/value options.
#
# Options (stored verbatim in the object hash):
#   index_dir - required; directory for the index, created when missing
#   index     - indexer executable, defaults to 'index'
#   search    - search executable, defaults to 'search'
#   debug     - when true, trace to STDERR with a '##' prefix
#
# Returns the blessed object. Croaks when index_dir is not given and
# confesses when the index directory cannot be created.
sub open {
    my $class = shift;
    my $self = {@_};
    bless($self, $class);

    foreach (qw(index_dir)) {
        croak "need $_" unless $self->{$_};
    }

    if (! -e $self->{'index_dir'}) {
        # Low-precedence "or" so the check applies to mkdir's return value
        # and not to its argument; report the directory that failed.
        mkdir($self->{'index_dir'})
            or confess "can't create index ",$self->{'index_dir'},": $!";
    }

    # default executables
    $self->{'index'} ||= 'index';
    $self->{'search'} ||= 'search';

    print STDERR "## open index_dir: ",$self->{'index_dir'}," index: ",$self->{'index'}, " search: ",$self->{'search'},"\n" if ($self->{'debug'});

    return $self;
}
93    
94    
95 dpavlin 3 =head2 check_bin
96    
97     Check if swish++ binaries specified in L<open> are available and verify
98     version signature.
99    
100     if ($i->check_bin) {
101     print "swish++ binaries found\n";
102     };
103    
104     It will also setup property
105    
106     $i->{'version'}
107    
108     which you can examine to see version.
109    
110     =cut
111    
# Run both configured executables with -V and compare their version
# banners.
#
# Confesses when either command produces no output or when its output
# does not start with "SWISH++". On matching banners stores the banner
# in $self->{'version'} and returns 1; on a mismatch warns via carp and
# returns nothing.
sub check_bin {
    my ($self) = @_;

    my ($index_bin, $search_bin) = @{$self}{qw(index search)};

    # Both commands are executed before either banner is inspected.
    my $index_sig = `$index_bin -V 2>&1`;
    confess "can't find '$index_bin' binary" unless $index_sig;

    my $search_sig = `$search_bin -V 2>&1`;
    confess "can't find '$search_bin' binary" unless $search_sig;

    chomp($index_sig);
    chomp($search_sig);

    if ($index_sig !~ m/^SWISH\+\+/) {
        confess "$index_bin binary is not SWISH++";
    }
    if ($search_sig !~ m/^SWISH\+\+/) {
        confess "$search_bin binary is not SWISH++";
    }

    unless ($index_sig eq $search_sig) {
        carp "version difference: index is $index_sig while search is $search_sig";
        return;
    }

    $self->{'version'} = $index_sig;
    return 1;
}
133    
134 dpavlin 4 =head2 index_document
135    
136     Quick way to add simple data to index.
137    
138     $i->index_document($key, $data);
139     $i->index_document( 42 => 'meaning of life' );
140    
141     =cut
142    
# Add one or more simple documents to the index.
#
# Arguments after the object are key => data pairs; each pair is handed
# to _create_doc as path => key, body => data. Always returns 1.
sub index_document {
    my ($self, %doc) = @_;

    while (my ($id, $body) = each %doc) {
        $self->_create_doc(path => $id, body => $body);
    }

    return 1;
}
157    
158 dpavlin 8 =head2 search
159    
160     Search your index.
161    
162     my @results = $i->search("swish query");
163    
164     Returns array of hash references with keys C<rank>, C<path>, C<size> and C<title>.
165    
166     =cut
167    
# Run the search executable against the index and parse its output.
#
# Takes the query string. Returns an empty list for an undefined or
# empty query; otherwise a list of hash references with the keys rank,
# path, size and title, one per result line. Lines starting with '#'
# and blank lines in the search output are skipped.
#
# The index is closed first (via _close_index) because the indexer has
# to finish before its output can be searched.
#
# Confesses when the search process cannot be started or does not close
# cleanly.
sub search {
    my $self = shift;

    my $query = shift;
    return unless (defined $query && length $query);

    $self->_close_index;

    # List-form pipe open: the query reaches the search binary as a single
    # argument without any shell in between, so quotes, '$' and backticks
    # in it are passed through literally instead of being interpreted.
    # $self->{'search'} therefore has to be just the executable.
    my @cmd = (
        $self->{'search'},
        '-i', $self->{'index_dir'}.'/index',
        $query,
    );
    print STDERR "## search @cmd\n" if ($self->{'debug'});

    CORE::open(my $search_fh, '-|', @cmd) || confess "can't start @cmd: $!";

    my @results;

    while (my $line = <$search_fh>) {
        next if ($line =~ /^#/);
        chomp $line;
        next unless length $line;
        print STDERR "## $line\n" if ($self->{'debug'});
        # the title is the last field and may itself contain spaces
        my ($rank,$path,$size,$title) = split(/ /,$line,4);
        push @results, {
            rank => $rank,
            path => $path,
            size => $size,
            title => $title,
        };
    }

    # For a pipe, close fails with $! unset when the child exits non-zero.
    close($search_fh)
        || confess "can't close search: ", ($! ? "$!" : "wait status $?");

    return @results;
}
203    
204 dpavlin 4 =head1 PRIVATE METHODS
205    
206     Private methods implement internals for creating temporary file needed for
207     swish++. You should have no need to call them directly, and they are here
208     just to have documentation.
209    
210     =head2 _init_index
211    
212     Create temporary directory in which files for indexing will be created and
213     start index process.
214    
215     $i->_init_index || die "can't start indexer";
216    
217     =cut
218    
# Create the temporary working directory, make it the current directory
# and start the indexer reading file names from standard input.
#
# Sets $self->{'tmp_dir'} and $self->{'index_fh'} and returns the write
# handle of the indexer pipe. Confesses when the temporary directory
# cannot be created or entered, or when the indexer cannot be started.
#
# NOTE(review): index_dir is passed to the indexer unchanged after the
# chdir below, so a relative index_dir is resolved against the temporary
# directory - confirm that callers always pass an absolute path.
sub _init_index {
    my $self = shift;

    $self->{'tmp_dir'} = tempdir( CLEANUP => 1 ) || confess "can't create temporary directory: $!";

    # List form: no shell is involved, so an index_dir containing spaces
    # or shell metacharacters reaches the indexer as one argument.
    # $self->{'index'} therefore has to be just the executable.
    my @cmd = (
        $self->{'index'},
        '-v', '4',
        '-e', 'html:*',
        '-i', $self->{'index_dir'}.'/index',
        '-',
    );

    # Low-precedence "or" so that a failed chdir is actually detected;
    # with "||" the test applied to the directory name instead.
    chdir($self->{'tmp_dir'})
        or confess "can't chdir to ".$self->{'tmp_dir'}.": $!";

    CORE::open(my $index_fh, '|-', @cmd) || confess "can't start index with @cmd: $!";
    $self->{'index_fh'} = $index_fh;

    return $self->{'index_fh'};
}
234    
235     =head2 _create_doc
236    
237     Create temporary file and pass its name to swish++
238    
239     $i->_create_doc(
240     path => 'path/to/store/in/index',
241     body => 'data to store in body tag',
242     meta => {
243     'meta name' => 'data for this meta',
244     'another' => 'again more data',
245     }
246     );
247    
248 dpavlin 8 To delete document, just omit body and meta data.
249    
250 dpavlin 4 =cut
251    
# Write one document as a small HTML file and hand its name to the
# indexer.
#
# Named arguments:
#   path - required; name under which the document is stored in the index
#   body - text placed inside the <body> element (empty when undefined)
#   meta - not yet implemented; any true value confesses
#
# The file is written below $self->{'tmp_dir'} (an absolute path is used
# as given) and missing parent directories are created, so paths such as
# 'path/to/doc' work and the result does not depend on the current
# directory at the time of the call. The indexer is started on demand via
# _init_index. Returns 1; confesses on any failure.
#
# NOTE(review): body is written without HTML escaping - confirm whether
# callers are expected to pass already-escaped text.
sub _create_doc {
    my $self = shift;

    my $arg = {@_};

    confess "need path" unless (defined $arg->{'path'} && length $arg->{'path'});

    # refuse before anything is written, so no half-made file is left behind
    if ($arg->{'meta'}) {
        confess "not yet implemented";
    }

    # open indexer if needed
    $self->{'index_fh'} ||= $self->_init_index;

    my $tmp_dir = $self->{'tmp_dir'} || confess "no tmp_dir?";

    # core modules, loaded here so the block does not depend on file-level imports
    require File::Basename;
    require File::Path;

    my $file = ($arg->{'path'} =~ m{^/})
        ? $arg->{'path'}
        : $tmp_dir.'/'.$arg->{'path'};

    my $dir = File::Basename::dirname($file);
    File::Path::mkpath($dir) unless (-d $dir);

    CORE::open(my $tmp_fh, '>', $file) || confess "can't create temp file ".$arg->{'path'}.": $!";

    # defined-test rather than "||" so that a body of 0 is kept
    my $body = defined $arg->{'body'} ? $arg->{'body'} : '';

    print {$tmp_fh} '<html><body>' . $body . '</body></html>';

    close($tmp_fh) || confess "can't close tmp file ".$arg->{'path'}.": $!";

    # the indexer reads one file name per line on its standard input
    print { $self->{'index_fh'} } $arg->{'path'}."\n"
        or confess "can't pass ".$arg->{'path'}." to indexer: $!";

    return 1;
}
276    
277 dpavlin 8 =head2 _close_index
278    
279     Close index after indexing.
280    
281     $i->_close_index;
282    
283     You have to close index before searching.
284    
285     =cut
286    
# Close the pipe to the indexer, if one is open.
#
# Does nothing (and returns nothing) when $self->{'index_fh'} is not
# set. Otherwise closes the handle, resets $self->{'index_fh'} to undef
# and returns 1 on success. When the close fails - for a pipe that
# includes the child exiting with a non-zero status - a warning is
# issued via carp and nothing is returned; this is deliberately not
# fatal, so that a following search can still be attempted.
sub _close_index {
    my $self = shift;

    return unless ($self->{'index_fh'});

    print STDERR "## close index\n" if ($self->{'debug'});

    my $closed = close($self->{'index_fh'});
    # capture both before anything else can overwrite them
    my ($errno, $status) = ("$!", $?);
    undef $self->{'index_fh'};

    unless ($closed) {
        carp "index process did not finish cleanly: ",
            ($status ? "wait status $status" : $errno);
        return;
    }

    return 1;
}
297    
298 dpavlin 1 1;
299     __END__
300    
301     =head2 EXPORT
302    
303     None by default.
304    
305 dpavlin 3 =head1 RELATED
306    
307     =head2 Debian
308    
309     Debian version of swish++ is often old (version 5 at moment of this writing
310     while version 6 is available in source code), so this module by default
311     uses executable names B<index> and B<search> for self-compiled version
312     instead of one from Debian package. See L<open> how to specify Debian
313     default binaries B<index++> and B<search++>.
314    
315 dpavlin 5 =head2 SWISH++
316 dpavlin 1
317 dpavlin 5 Aside from being a very good rewrite in C++, SWISH++ is faster because it has
318     clever heuristics about which data in input files are words to index and
319     which are not. It's based on the English language and might be the best choice if
320     you plan to index a large number of long text documents.
321 dpavlin 1
322 dpavlin 5 However, if you plan to index all data from structured storage (e.g. RDBMS)
323     you might want B<all> words from data to end up in index as opposed to just
324     those which look like English words. This is especially important if you
325     don't plan to index English texts with this module.
326 dpavlin 1
327 dpavlin 5 With distribution-built versions of SWISH++ you might have problems with
328     disappearing words. To overcome this problem, you will have to compile and
329     configure SWISH++ yourself (because language characteristics are
330     a compile-time option).
331 dpavlin 1
332 dpavlin 5 Compilation of SWISH++ is easy process well described on project's web
333     pages. To see my very relaxed sample configuration take a look at C<swish++>
334     directory included in distribution.
335    
336     =head1 SEE ALSO
337    
338     C<swish++> web site L<http://homepage.mac.com/pauljlucas/software/swish/>
339    
340 dpavlin 1 =head1 AUTHOR
341    
342 dpavlin 5 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
343 dpavlin 1
344     =head1 COPYRIGHT AND LICENSE
345    
346     Copyright (C) 2004 by Dobrica Pavlinusic
347    
348     This library is free software; you can redistribute it and/or modify
349     it under the same terms as Perl itself, either Perl version 5.8.4 or,
350     at your option, any later version of Perl 5 you may have available.
351    
352    
353     =cut

  ViewVC Help
Powered by ViewVC 1.1.26