/[SWISH-PlusPlus]/trunk/PlusPlus.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/PlusPlus.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 13 - (show annotations)
Sun Dec 5 14:35:54 2004 UTC (19 years, 3 months ago) by dpavlin
File size: 10184 byte(s)
relative index_dir paths will be now resolved to absolute,
added finish_update to close indexer, DESTROY will also close index,
verbose level now corresponds to debug level, mode debug output

1 package SWISH::PlusPlus;
2
3 use 5.008004;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.06';
8
9 use Carp;
10 use File::Temp qw/ tempdir /;
11 #use YAML;
12
13 =head1 NAME
14
15 SWISH::PlusPlus - Perl extension for SWISH++
16
17 =head1 SYNOPSIS
18
19 use SWISH::PlusPlus;
20 blah blah blah
21
22 =head1 DESCRIPTION
23
24 This is a Perl module for using the SWISH++ indexer by Paul J. Lucas. SWISH++ is
25 a rewrite of swish-e in C++ which is extremely fast (thanks to mmap), but without
26 support for properties (which this module tries to fix).
27
28 Implementation of this module is crafted after L<Plucene::Simple> and it
29 should be easy to replace Plucene with this module for increased
30 performance. However, this module is not plug-in replacement.
31
32 =head1 METHODS
33
34 =head2 new
35
36 Create new indexing object.
37
38 my $i = SWISH::PlusPlus->new(
39 index_dir => '/path/to/index',
40 index => 'index++',
41 search => 'search++',
42 debug => 1,
43 meta_in_body => 1,
44 use_stopwords => 1,
45 );
46
47 Options to new are following:
48
49 =over 5
50
51 =item C<index_dir>
52
53 Path to directory in which index will be created.
54
55 =item C<index>
56
57 Full or partial path to SWISH++ index executable. By default, it's B<index>
58 for self-compiled version. If you use Debian GNU/Linux package specify
59 B<index++>. See C<Debian>.
60
61 =item C<search>
62
63 Full or partial path to SWISH++ search executable. By default, it's B<search>.
64
65 =item C<debug>
66
67 This option (off by default) will produce a lot of debugging output on
68 C<STDERR> prefixed by C<##>.
69
70 =item C<meta_in_body>
71
72 This option (off by default) makes it possible to search the content of meta
73 fields without specifying them (as if they were in the body of the document).
74 This will somewhat increase index size.
75
76 =item C<use_stopwords>
77
78 Use built-in SWISH++ stop words. By default, they are disabled.
79
80 =back
81
82 =cut
83
# Constructor: build a SWISH::PlusPlus object from key/value options.
#
# All options are stored verbatim in the object hash. 'index_dir' is
# required; a relative 'index_dir' is resolved against the current working
# directory and the directory is created when it does not exist yet.
# 'index' and 'search' default to the executable names 'index' and 'search'.
#
# Croaks when 'index_dir' is missing and confesses when the current
# directory cannot be determined or the index directory cannot be created.
# Returns the blessed object.
sub new {
    my $class = shift;
    my $self  = {@_};
    bless($self, $class);

    foreach my $required (qw(index_dir)) {
        croak "need $required" unless $self->{$required};
    }

    my $index_dir = $self->{'index_dir'};

    if ($index_dir !~ m#^/#) {
        # Cwd is a core module; loading it here avoids spawning a shell
        # just to run `pwd`.
        require Cwd;
        my $cwd = Cwd::getcwd();
        confess "can't determine current directory: $!" unless defined $cwd;

        $index_dir = "$cwd/$index_dir";
        print STDERR "## full path to index_dir: $index_dir\n" if ($self->{'debug'});
        $self->{'index_dir'} = $index_dir;
    }

    if (! -e $index_dir) {
        # 'or' (not '||') so that the check applies to mkdir's result
        # instead of to its argument.
        mkdir($index_dir) or confess "can't create index directory $index_dir: $!";
    }

    # default executables
    $self->{'index'}  ||= 'index';
    $self->{'search'} ||= 'search';

    print STDERR "## new index_dir: ",$index_dir," index: ",$self->{'index'}, " search: ",$self->{'search'},"\n" if ($self->{'debug'});

    return $self;
}
114
115
116 =head2 check_bin
117
118 Check if swish++ binaries specified in L<new> are available and verify
119 version signature.
120
121 if ($i->check_bin) {
122 print "swish++ binaries found\n";
123 };
124
125 It will also setup property
126
127 $i->{'version'}
128
129 which you can examine to see version.
130
131 =cut
132
# Run both configured executables with -V and verify that each one
# identifies itself as SWISH++.
#
# Confesses when an executable produces no output or when its output does
# not start with "SWISH++". When both report the same version string it is
# stored in $self->{'version'} and 1 is returned; otherwise a warning is
# issued and nothing is returned.
sub check_bin {
    my ($self) = @_;

    my %version_of;

    # collect the raw -V output (stderr included) of both tools
    foreach my $tool (qw(index search)) {
        my $binary = $self->{$tool};
        my $output = `$binary -V 2>&1`;
        confess("can't find '", $binary, "' binary") unless $output;
        $version_of{$tool} = $output;
    }

    chomp(@version_of{qw(index search)});

    foreach my $tool (qw(index search)) {
        next if $version_of{$tool} =~ m/^SWISH\+\+/;
        confess($self->{$tool}, " binary is not SWISH++");
    }

    if ($version_of{'index'} ne $version_of{'search'}) {
        carp "version difference: index is $version_of{'index'} while search is $version_of{'search'}";
        return;
    }

    $self->{'version'} = $version_of{'index'};
    return 1;
}
154
155 =head2 index_document
156
157 Quick way to add simple data to index.
158
159 $i->index_document($key, $data);
160 $i->index_document( 42 => 'meaning of life' );
161
162 =cut
163
# Index plain data: every key/value pair passed in becomes one document,
# with the key used as the document path and the value as its body.
# Always returns 1.
sub index_document {
    my ($self, @pairs) = @_;

    my %body_for = @pairs;

    for my $doc_id (keys %body_for) {
        my %doc = (
            path => $doc_id,
            body => $body_for{$doc_id},
        );
        $self->_create_doc(%doc);
    }

    return 1;
}
178
179 =head2 add
180
181 Add document with metadata to index.
182
183 $i->add(
184 path => 'path/to/document',
185 title => 'this is result title',
186 meta => {
187 description => 'this is description meta tag',
188 date => '2004-11-04',
189 author => 'Dobrica Pavlinusic',
190 },
191 body => 'this is text without meta data',
192 );
193
194 This is thin wrapper round L<_create_doc>.
195
196 =cut
197
# Add one document (path, title, meta, body) to the index by handing all
# arguments unchanged to _create_doc. Always returns 1.
sub add {
    my ($self, @doc_args) = @_;

    $self->_create_doc(@doc_args);

    return 1;
}
205 =head2 search
206
207 Search your index.
208
209 my @results = $i->search("swish query");
210
211 Returns array of hash references, one per result, with keys C<rank>,
C<path>, C<size> and C<title>.
212
213 =cut
214
# Run the search executable against the index and parse its output.
#
# Takes the query string; returns nothing when the query is empty or false.
# The index is closed first, then the search executable is started with
# "-i <index_dir>/index <query>". Output lines starting with '#' are
# skipped; every other line is split on single spaces into rank, path,
# size and title (the title keeps any remaining spaces).
#
# Returns a list of hash references with keys rank, path, size and title.
# Confesses when the search process cannot be started or closed cleanly.
sub search {
    my $self = shift;

    my $query = shift || return;

    $self->_close_index;

    # List-form pipe open: the query reaches the search executable as a
    # single argument and is never interpreted by a shell, so quotes,
    # backticks and '$' in the query are harmless.
    my @cmd = ($self->{'search'}, '-i', $self->{'index_dir'}.'/index', $query);
    print STDERR "## search @cmd\n" if ($self->{'debug'});

    open(my $search_fh, '-|', @cmd) or confess "can't start @cmd: $!";

    my @results;

    while (my $line = <$search_fh>) {
        next if ($line =~ /^#/);
        chomp $line;
        print STDERR "## $line\n" if ($self->{'debug'});
        my ($rank, $path, $size, $title) = split(/ /, $line, 4);
        push @results, {
            rank  => $rank,
            path  => $path,
            size  => $size,
            title => $title,
        };
    }

    # close on a pipe also fails when the child exits with non-zero status
    close($search_fh) or confess "can't close search";

    return @results;
}
250
251 =head2 finish_update
252
253 This method will close index.
254
255 $i->finish_update;
256
257 It will be called on DESTROY when $i goes out of scope.
258
259 =cut
260
# Public way to finish indexing: closes the index via _close_index and
# hands back whatever that call returns.
sub finish_update {
    my ($self) = @_;

    return $self->_close_index;
}
266
# Destructor: make sure the index is closed when the object goes away.
sub DESTROY {
    my $self = shift;

    # Closing the indexer pipe overwrites $? and may change $!, $@ and
    # friends. A destructor can run at any point (including during global
    # destruction, where $? becomes the exit status of the program), so
    # keep those changes from leaking out.
    local ($., $@, $!, $^E, $?);

    $self->finish_update;

    return;
}
271
272 =head1 PRIVATE METHODS
273
274 Private methods implement internals for creating temporary file needed for
275 swish++. You should have no need to call them directly, and they are here
276 just to have documentation.
277
278 =head2 _init_indexer
279
280 Create temporary directory in which files for indexing will be created and
281 start index process.
282
283 $i->_init_indexer || die "can't start indexer";
284
285 It will also create empty file C<_stopwords_> to disable stop words.
286
287 =cut
288
# Create a temporary working directory, change into it and start the index
# executable with a pipe attached to its standard input.
#
# Unless the 'use_stopwords' option is set, a stop word file containing a
# single space is written as '_stopwords_' and passed to the indexer with
# -s. The verbosity passed with -v is the value of the 'debug' option (0
# when unset).
#
# Side effects: sets $self->{'tmp_dir'} and $self->{'index_fh'} and changes
# the current working directory of the process to the temporary directory.
#
# Returns the write handle of the indexer pipe. Confesses when the
# directory cannot be entered or the indexer cannot be started.
sub _init_indexer {
    my $self = shift;

    $self->{'tmp_dir'} = tempdir( CLEANUP => 1 ) || confess "can't create temporary directory: $!";

    # The indexer is started from this directory, and '_stopwords_' is
    # created here using a relative name.
    chdir($self->{'tmp_dir'}) or confess "can't chdir to ".$self->{'tmp_dir'}.": $!";

    my @opt = ('-v', $self->{'debug'} || '0');

    unless ($self->{'use_stopwords'}) {
        # Only pass -s when the stop word file really exists.
        if (open(my $stop_fh, '>', '_stopwords_')) {
            print {$stop_fh} " ";
            if (close($stop_fh)) {
                push @opt, '-s', '_stopwords_';
            }
            else {
                carp "can't write empty stopword file, skipping\n";
            }
        }
        else {
            carp "can't create empty stopword file, skipping\n";
        }
    }

    # List-form pipe open: no shell is involved, so an index_dir containing
    # spaces or shell metacharacters is passed through intact.
    my @cmd = (
        $self->{'index'}, @opt,
        '-e', 'html:*',
        '-i', $self->{'index_dir'}.'/index',
        '-',
    );

    print STDERR "## open index @cmd\n" if ($self->{'debug'});

    open(my $index_fh, '|-', @cmd) or confess "can't start index with @cmd: $!";
    $self->{'index_fh'} = $index_fh;

    return $self->{'index_fh'};
}
314
315 =head2 _create_doc
316
317 Create temporary file and pass its name to swish++
318
319 $i->_create_doc(
320 path => 'path/to/store/in/index',
321 title => 'this is title in results',
322 body => 'data to store in body tag',
323 meta => {
324 'meta name' => 'data for this meta',
325 'another' => 'again more data',
326 }
327 );
328
329 To delete document, just omit body and meta data.
330
331 =cut
332
# Write one document as a small HTML file and pass its name to the indexer.
#
# Named arguments:
#   path  - file name to create (relative to the current directory) and to
#           send to the indexer
#   title - optional, written as <title>
#   meta  - optional hash reference, each pair written as <meta> tag
#   body  - optional, written inside <body>
#
# The indexer is started on first use. With the 'meta_in_body' option set,
# meta contents and the title are also appended to the body.
#
# Returns 1. Confesses when the temporary directory is unknown, when the
# file cannot be written or when its name cannot be sent to the indexer.
sub _create_doc {
    my $self = shift;

    my $arg = {@_};

    # open indexer if needed
    $self->{'index_fh'} ||= $self->_init_indexer;

    confess "no tmp_dir?" unless $self->{'tmp_dir'};

    # A double quote would end the attribute value early, so it is written
    # as an entity inside <meta> attributes.
    my $quote_attr = sub {
        my $value = defined $_[0] ? $_[0] : '';
        $value =~ s/"/&quot;/g;
        return $value;
    };

    my $doc_path = $arg->{'path'};

    # defined-check instead of ||= so that a body of '0' is kept
    my $body = defined $arg->{'body'} ? $arg->{'body'} : '';

    open(my $doc_fh, '>', $doc_path) or confess "can't create temp file ".$doc_path.": $!";

    print {$doc_fh} '<html><head>';

    if ($arg->{'meta'}) {
        foreach my $name (keys %{ $arg->{'meta'} }) {
            my $content = $arg->{'meta'}->{$name};
            print {$doc_fh} '<meta name="' . $quote_attr->($name)
                          . '" content="' . $quote_attr->($content) . '">';
            $body .= " $content" if ($self->{'meta_in_body'});
        }
    }

    if (defined($arg->{'title'})) {
        print {$doc_fh} '<title>' . $arg->{'title'} . '</title>';
        $body .= " ".$arg->{'title'} if ($self->{'meta_in_body'});
    }

    print {$doc_fh} '</head><body>' . $body . '</body></html>';

    # buffered write errors surface here
    close($doc_fh) or confess "can't close tmp file ".$doc_path.": $!";

    print { $self->{'index_fh'} } $doc_path."\n"
        or confess "can't pass ".$doc_path." to indexer: $!";

    return 1;
}
368
369 =head2 _close_index
370
371 Close index after indexing.
372
373 $i->_close_index;
374
375 You have to close index before searching.
376
377 =cut
378
# Close the pipe to the indexer, if one is open.
#
# Does nothing (and returns nothing) when no indexer is running. Otherwise
# the handle is closed and $self->{'index_fh'} is reset to undef, so the
# next document starts a fresh indexer. A failed close - which for a pipe
# includes a non-zero exit status of the indexer - is reported with a
# warning instead of being ignored.
#
# Returns the result of close (true on success, false on failure).
sub _close_index {
    my $self = shift;

    return unless ($self->{'index_fh'});

    print STDERR "## close index\n" if ($self->{'debug'});

    my $closed = close($self->{'index_fh'});
    unless ($closed) {
        # $! is only set for real I/O errors; otherwise the indexer itself
        # failed and its status is in $?
        carp "can't close index: ", ($! ? "$!" : "indexer exited with status ".($? >> 8));
    }

    $self->{'index_fh'} = undef;

    return $closed;
}
389
390 1;
391 __END__
392
393 =head2 EXPORT
394
395 None by default.
396
397 =head1 RELATED
398
399 =head2 Debian
400
401 Debian version of swish++ is often old (version 5 at moment of this writing
402 while version 6 is available in source code), so this module by default
403 uses executable names B<index> and B<search> for self-compiled version
404 instead of one from Debian package. See L<new> how to specify Debian
405 default binaries B<index++> and B<search++>.
406
407 =head2 SWISH++
408
409 Aside from being a very good rewrite in C++, SWISH++ is faster because it has
410 clever heuristics about which data in input files are words to index and
411 which are not. It's based on the English language and might be the best choice
412 if you plan to index a large amount of long text documents.
413
414 However, if you plan to index all data from structured storage (e.g. RDBMS)
415 you might want B<all> words from data to end up in index as opposed to just
416 those which look like English words. This is especially important if you
417 don't plan to index English texts with this module.
418
419 With distribution-built versions of SWISH++ you might have problems with
420 disappearing words. To overcome this problem, you will have to compile and
421 configure SWISH++ yourself (because language characteristics are
422 compilation-time option).
423
424 Compilation of SWISH++ is easy process well described on project's web
425 pages. To see my very relaxed sample configuration take a look at C<swish++>
426 directory included in distribution.
427
428 =head2 SWISH++ config
429
430 C<config.h> located in C<swish++> directory of this distribution is relaxed
431 SWISH++ configuration that will index all words passed to it. This
432 configuration is needed for B<date test> because default configuration
433 doesn't recognize 2004-12-05 as date. Have in mind that your index size
434 might explode.
435
436 =head1 SEE ALSO
437
438 C<swish++> web site L<http://homepage.mac.com/pauljlucas/software/swish/>
439
440 =head1 AUTHOR
441
442 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
443
444 =head1 COPYRIGHT AND LICENSE
445
446 Copyright (C) 2004 by Dobrica Pavlinusic
447
448 This library is free software; you can redistribute it and/or modify
449 it under the same terms as Perl itself, either Perl version 5.8.4 or,
450 at your option, any later version of Perl 5 you may have available.
451
452
453 =cut

  ViewVC Help
Powered by ViewVC 1.1.26