/[SWISH-PlusPlus]/trunk/PlusPlus.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/PlusPlus.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 8 - (show annotations)
Sat Dec 4 17:49:20 2004 UTC (19 years, 3 months ago) by dpavlin
File size: 7882 byte(s)
search is working

1 package SWISH::PlusPlus;
2
3 use 5.008004;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.02';
8
9 use Carp;
10 use File::Temp qw/ tempdir /;
11 #use YAML;
12
13 =head1 NAME
14
15 SWISH::PlusPlus - Perl extension for SWISH++
16
17 =head1 SYNOPSIS
18
19 use SWISH::PlusPlus;
20 blah blah blah
21
22 =head1 DESCRIPTION
23
24 This is a Perl module for using the SWISH++ indexer by Paul J. Lucas. SWISH++ is
25 a rewrite of swish-e in C++ which is extremely fast (thanks to mmap), but without
26 support for properties (which this module tries to fix).
27
28 Implementation of this module is crafted after L<Plucene::Simple> and it
29 should be easy to replace Plucene with this module for increased
30 performance. However, this module is not plug-in replacement.
31
32 =head1 METHODS
33
34 =head2 open
35
36 Create new indexing object.
37
38 my $i = SWISH::PlusPlus->open(
39 index_dir => '/path/to/index',
40 index => 'index++',
41 search => 'search++',
42 debug => 1,
43 );
44
45 Options to open are following:
46
47 =over 5
48
49 =item C<index_dir>
50
51 Path to directory in which index will be created.
52
53 =item C<index>
54
55 Full or partial path to SWISH++ index executable. By default, it's B<index>
56 for self-compiled version. If you use Debian GNU/Linux package specify
57 B<index++>. See C<Debian>.
58
59 =item C<search>
60
61 Full or partial path to SWISH++ search executable. By default, it's B<search>.
62
63 =item C<debug>
64
65 This option (off by default) will produce a lot of debugging output on
66 C<STDERR> prefixed by C<##>.
67
68 =back
69
70 =cut
71
# Constructor: build a new indexing object from key => value options.
#
# Options:
#   index_dir - directory holding the index (required, created if missing)
#   index     - SWISH++ index executable (defaults to 'index')
#   search    - SWISH++ search executable (defaults to 'search')
#   debug     - if true, trace to STDERR with a '##' prefix
#
# Returns the blessed object. Croaks when index_dir is missing and
# confesses when the index directory can't be created.
sub open {
	my $class = shift;
	my $self = {@_};
	bless($self, $class);

	foreach (qw(index_dir)) {
		croak "need $_" unless $self->{$_};
	}

	if (! -e $self->{'index_dir'}) {
		# parentheses are required here: without them || binds to the
		# directory name (always true) and mkdir failure goes unnoticed
		mkdir($self->{'index_dir'}) || confess "can't create index ",$self->{'index_dir'},": $!";
	}

	# default executables
	$self->{'index'} ||= 'index';
	$self->{'search'} ||= 'search';

	print STDERR "## open index_dir: ",$self->{'index_dir'}," index: ",$self->{'index'}, " search: ",$self->{'search'},"\n" if ($self->{'debug'});

	return $self;
}
93
94
95 =head2 check_bin
96
97 Check if swish++ binaries specified in L<open> are available and verify
98 version signature.
99
100 if ($i->check_bin) {
101 print "swish++ binaries found\n";
102 };
103
104 It will also setup property
105
106 $i->{'version'}
107
108 which you can examine to see version.
109
110 =cut
111
# Run both configured executables with -V and compare their banners.
#
# Returns 1 and stores the banner in $self->{'version'} when both report
# the same SWISH++ version; warns and returns nothing when they differ.
# Confesses when an executable produces no output or a non-SWISH++ banner.
sub check_bin {
	my $self = shift;

	# collect the version banner of each tool before validating any of them
	my @banners;
	for my $tool (qw(index search)) {
		my $bin = $self->{$tool};
		my $out = `$bin -V 2>&1`;
		confess "can't find '",$bin,"' binary" unless ($out);
		push @banners, $out;
	}
	chomp(@banners);

	my ($i, $s) = @banners;

	# every banner has to start with the SWISH++ signature
	for my $check ([ 'index', $i ], [ 'search', $s ]) {
		my ($tool, $banner) = @$check;
		next if ($banner =~ m/^SWISH\+\+/);
		confess $self->{$tool}," binary is not SWISH++";
	}

	unless ($i eq $s) {
		carp "version difference: index is $i while search is $s";
		return;
	}

	$self->{'version'} = $i;
	return 1;
}
133
134 =head2 index_document
135
136 Quick way to add simple data to index.
137
138 $i->index_document($key, $data);
139 $i->index_document( 42 => 'meaning of life' );
140
141 =cut
142
# Quick way to index plain data: takes a list of key => data pairs and
# hands each pair to _create_doc, with the key as the path and the data
# as the body. Always returns 1.
sub index_document {
	my ($self, %content_of) = @_;

	while (my ($doc_id, $text) = each %content_of) {
		$self->_create_doc(
			path => $doc_id,
			body => $text,
		);
	}

	return 1;
}
157
158 =head2 search
159
160 Search your index.
161
162 my @results = $i->search("swish query");
163
164 Returns array of hash references, one per hit, with keys C<rank>, C<path>, C<size> and C<title>.
165
166 =cut
167
# Run a query against the index and collect the hits.
#
# Takes the query string as its only argument. Returns a list of hash
# references with keys rank, path, size and title (one per result line),
# or nothing when no query is given. Confesses when the search executable
# can't be started or when closing its pipe fails.
sub search {
	my $self = shift;

	my $query = shift;
	# test for defined/length so that a query of "0" is still searched
	return unless (defined $query && length $query);

	# pending index writes have to be finished before searching
	$self->_close_index;

	# List-form pipe open hands the query to the executable as one argument
	# without going through a shell, so quotes, $, backticks or backslashes
	# in the query can't be interpreted as shell syntax. The executable
	# setting is split on whitespace, just as the shell used to do.
	my @cmd = (
		split(' ', $self->{'search'}),
		'-i', $self->{'index_dir'}.'/index',
		$query,
	);
	print STDERR "## search ",join(' ', @cmd),"\n" if ($self->{'debug'});

	CORE::open(my $search_fh, '-|', @cmd) || confess "can't start @cmd: $!";

	my @results;

	while (my $line = <$search_fh>) {
		# lines starting with # are comments from search, not hits
		next if ($line =~ /^#/);
		chomp $line;
		print STDERR "## $line\n" if ($self->{'debug'});
		# title is the last field and may contain spaces itself
		my ($rank,$path,$size,$title) = split(/ /,$line,4);
		push @results, {
			rank => $rank,
			path => $path,
			size => $size,
			title => $title,
		}
	}

	close($search_fh) || confess "can't close search";

	return @results;
}
203
204 =head1 PRIVATE METHODS
205
206 Private methods implement internals for creating temporary file needed for
207 swish++. You should have no need to call them directly, and they are here
208 just to have documentation.
209
210 =head2 _init_index
211
212 Create temporary directory in which files for indexing will be created and
213 start index process.
214
215 $i->_init_index || die "can't start indexer";
216
217 =cut
218
# Create a temporary directory (removed at program exit), change into it
# and start the index executable reading file names from a pipe.
#
# Stores the directory in $self->{'tmp_dir'} and the pipe in
# $self->{'index_fh'}; returns the pipe handle. Confesses when the
# directory can't be created or entered, or the indexer can't be started.
#
# Note that this changes the current working directory of the whole
# process and does not change it back.
sub _init_index {
	my $self = shift;

	$self->{'tmp_dir'} = tempdir( CLEANUP => 1 ) || confess "can't create temporary directory: $!";

	my $opt = "-v 4";

	my $open_cmd = '| '.$self->{'index'}.' '.$opt.' -e "html:*" -i '.$self->{'index_dir'}.'/index -';

	# parentheses are required here: without them || binds to the
	# directory name (always true) and chdir failure goes unnoticed
	chdir($self->{'tmp_dir'}) || confess "can't chdir to ".$self->{'tmp_dir'}.": $!";

	CORE::open($self->{'index_fh'}, $open_cmd) || confess "can't start index with $open_cmd: $!";

	return $self->{'index_fh'};
}
234
235 =head2 _create_doc
236
237 Create temporary file and pass its name to swish++
238
239 $i->_create_doc(
240 path => 'path/to/store/in/index',
241 body => 'data to store in body tag',
242 meta => {
243 'meta name' => 'data for this meta',
244 'another' => 'again more data',
245 }
246 );
247
248 To delete document, just omit body and meta data.
249
250 =cut
251
# Write one document as a small HTML file and pass its name to the indexer.
#
# Arguments (key => value):
#   path - name under which the document is stored in the index (required)
#   body - text placed inside the <body> tag (empty when omitted)
#   meta - not yet implemented, confesses when given
#
# Returns 1. Confesses on any failure to create the file or to talk to
# the indexer.
sub _create_doc {
	my $self = shift;

	my $arg = {@_};

	confess "need path" unless (defined $arg->{'path'} && length $arg->{'path'});

	# refuse before touching anything, so no half-written file is left behind
	if ($arg->{'meta'}) {
		confess "not yet implemented";
	}

	# open indexer if needed
	$self->{'index_fh'} ||= $self->_init_index;

	my $tmp_dir = $self->{'tmp_dir'} || confess "no tmp_dir?";

	# Relative names are created inside the temporary directory, which is
	# where the indexer was started, instead of depending on the current
	# working directory still pointing there.
	my $file = $arg->{'path'};
	$file = $tmp_dir.'/'.$file unless ($file =~ m{^/});

	CORE::open(my $tmp_fh, '>', $file) || confess "can't create temp file ".$arg->{'path'}.": $!";

	print {$tmp_fh} '<html><body>' . ($arg->{'body'} || '') . '</body></html>';

	close($tmp_fh) || confess "can't close tmp file ".$arg->{'path'}.": $!";

	# the indexer gets the name as given (relative to its own directory)
	print { $self->{'index_fh'} } $arg->{'path'}."\n"
		or confess "can't pass ".$arg->{'path'}." to indexer: $!";

	return 1;
}
276
277 =head2 _close_index
278
279 Close index after indexing.
280
281 $i->_close_index;
282
283 You have to close index before searching.
284
285 =cut
286
# Close the pipe to the indexer, if one is open, and forget the handle so
# that the next document starts a fresh indexer.
#
# Returns nothing when no indexer is running, otherwise the result of
# close (false when the indexer failed or the pipe couldn't be flushed,
# in which case a warning is issued as well).
sub _close_index {
	my $self = shift;

	return unless ($self->{'index_fh'});

	print STDERR "## close index\n" if ($self->{'debug'});

	# close on a pipe waits for the indexer and reports its exit status
	# in $?, so this is the only place where a failed run becomes visible
	my $closed = close($self->{'index_fh'});
	carp "closing index failed (exit status $?): $!" unless ($closed);

	undef $self->{'index_fh'};

	return $closed;
}
297
298 1;
299 __END__
300
301 =head2 EXPORT
302
303 None by default.
304
305 =head1 RELATED
306
307 =head2 Debian
308
309 Debian version of swish++ is often old (version 5 at moment of this writing
310 while version 6 is available in source code), so this module by default
311 uses executable names B<index> and B<search> for self-compiled version
312 instead of one from Debian package. See L<open> how to specify Debian
313 default binaries B<index++> and B<search++>.
314
315 =head2 SWISH++
316
317 Aside from being a very good rewrite in C++, SWISH++ is faster because it has
318 clever heuristics about which data in input files are words to index and
319 which are not. It's based on the English language and might be the best choice if
320 you plan to index a large amount of long text documents.
321
322 However, if you plan to index all data from structured storage (e.g. RDBMS)
323 you might want B<all> words from data to end up in index as opposed to just
324 those which look like English words. This is especially important if you
325 don't plan to index English texts with this module.
326
327 With distribution-built versions of SWISH++ you might have problems with
328 disappearing words. To overcome this problem, you will have to compile and
329 configure SWISH++ yourself (because language characteristics are
330 a compilation-time option).
331
332 Compilation of SWISH++ is easy process well described on project's web
333 pages. To see my very relaxed sample configuration take a look at C<swish++>
334 directory included in distribution.
335
336 =head1 SEE ALSO
337
338 C<swish++> web site L<http://homepage.mac.com/pauljlucas/software/swish/>
339
340 =head1 AUTHOR
341
342 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
343
344 =head1 COPYRIGHT AND LICENSE
345
346 Copyright (C) 2004 by Dobrica Pavlinusic
347
348 This library is free software; you can redistribute it and/or modify
349 it under the same terms as Perl itself, either Perl version 5.8.4 or,
350 at your option, any later version of Perl 5 you may have available.
351
352
353 =cut

  ViewVC Help
Powered by ViewVC 1.1.26