/[SWISH-PlusPlus]/trunk/PlusPlus.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/PlusPlus.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 14 - (show annotations)
Sun Dec 5 15:35:53 2004 UTC (16 years, 3 months ago) by dpavlin
File size: 10609 byte(s)
store cwd in $i->{'cwd'}, store just version numbers in $i->{'version'},
_init_indexer doesn't mess with cwd while indexing, _index_fh is now private.

1 package SWISH::PlusPlus;
2
3 use 5.008004;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.06';
8
9 use Carp;
10 use File::Temp qw/ tempdir /;
11 #use YAML;
12
13 =head1 NAME
14
15 SWISH::PlusPlus - Perl extension SWISH++
16
17 =head1 SYNOPSIS
18
19 use SWISH::PlusPlus;
20 blah blah blah
21
22 =head1 DESCRIPTION
23
24 This is perl module to use SWISH++ indexer by Paul J. Lucas. SWISH++ is
25 rewrite of swish-e in C++ which is extremly fast (thank to mmap), but without
26 support for properties (which this module tries to fix).
27
28 Implementation of this module is crafted after L<Plucene::Simple> and it
29 should be easy to replace Plucene with this module for increased
30 performance. However, this module is not plug-in replacement.
31
32 =head1 METHODS
33
34 =head2 new
35
36 Create new indexing object.
37
38 my $i = SWISH::PlusPlus->new(
39 index_dir => '/path/to/index',
40 index => 'index++',
41 search => 'search++',
42 debug => 1,
43 meta_in_body => 1,
44 use_stopwords => 1,
45 );
46
47 Options to new are following:
48
49 =over 5
50
51 =item C<index_dir>
52
53 Path to directory in which index will be created.
54
55 =item C<index>
56
57 Full or partial path to SWISH++ index executable. By default, it's B<index>
58 for self-compiled version. If you use Debian GNU/Linux package specify
59 B<index++>. See C<Debian>.
60
61 =item C<search>
62
63 Full or partial path to SWISH++ search executable. By default, it's B<search>.
64
65 =item C<debug>
66
67 This option (off by default) will produce a lot of debugging output on
68 C<STDERR> prefixed by C<##>.
69
70 =item C<meta_in_body>
71
72 This option (off by default) enables to search content of meta fields
73 without specifing them (like they are in body of document). This will
74 somewhat increate index size.
75
76 =item C<use_stopwords>
77
78 Use built-in SWISH++ stop words. By default, they are disabled.
79
80 =back
81
82 =cut
83
84 sub new {
85 my $class = shift;
86 my $self = {@_};
87 bless($self, $class);
88
89 foreach (qw(index_dir)) {
90 croak "need $_" unless $self->{$_};
91 }
92
93 my $index_dir = $self->{'index_dir'};
94
95 my $cwd;
96 chomp($cwd = `pwd`);
97 $self->{'cwd'} = $cwd || carp "can't get cwd!";
98
99 if ($index_dir !~ m#^/#) {
100 $index_dir = "$cwd/$index_dir";
101 print STDERR "## full path to index_dir: $index_dir\n" if ($self->{'debug'});
102 $self->{'index_dir'} = $index_dir;
103 }
104
105 if (! -e $index_dir) {
106 mkdir $index_dir || confess "can't create index ",$self->{'index'},": $!";
107 }
108
109 # default executables
110 $self->{'index'} ||= 'index';
111 $self->{'search'} ||= 'search';
112
113 print STDERR "## new index_dir: ",$index_dir," index: ",$self->{'index'}, " search: ",$self->{'search'},"\n" if ($self->{'debug'});
114
115 $self ? return $self : return undef;
116 }
117
118
119 =head2 check_bin
120
121 Check if swish++ binaries specified in L<new> are available and verify
122 version signature.
123
124 if ($i->check_bin) {
125 print "swish++ binaries found\n";
126 };
127
128 It will also setup property
129
130 $i->{'version'}
131
132 which you can examine to see version.
133
134 =cut
135
136 sub check_bin {
137 my $self = shift;
138
139 my $i = `$self->{'index'} -V 2>&1` || confess "can't find '",$self->{'index'},"' binary";
140 my $s = `$self->{'search'} -V 2>&1` || confess "can't find '",$self->{'search'},"' binary";
141
142 chomp $i;
143 chomp $s;
144
145 confess $self->{'index'}," binary is not SWISH++" unless ($i =~ m/^SWISH\+\+/);
146 confess $self->{'search'}," binary is not SWISH++" unless ($s =~ m/^SWISH\+\+/);
147
148 if ($i eq $s) {
149 $i =~ s/^SWISH\+\+\s+// || confess "can't strip SWISH++ from version";
150 $self->{'version'} = $i;
151 return 1;
152 } else {
153 carp "version difference: index is $i while search is $s";
154 return;
155 }
156
157 }
158
159 =head2 index_document
160
161 Quick way to add simple data to index.
162
163 $i->index_document($key, $data);
164 $i->index_document( 42 => 'meaning of life' );
165
166 =cut
167
168 sub index_document {
169 my $self = shift;
170
171 my %doc = @_;
172
173 foreach my $id (keys %doc) {
174 $self->_create_doc(
175 path => $id,
176 body => $doc{$id},
177 );
178 }
179
180 return 1;
181 }
182
183 =head2 add
184
185 Add document with metadata to index.
186
187 $i->add(
188 path => 'path/to/document',
189 title => 'this is result title',
190 meta => {
191 description => 'this is description meta tag',
192 date => '2004-11-04',
193 author => 'Dobrica Pavlinusic',
194 }
195 body => 'this is text without meta data',
196 );
197
198 This is thin wrapper round L<_create_doc>.
199
200 =cut
201
202 sub add {
203 my $self = shift;
204
205 $self->_create_doc(@_);
206
207 return 1;
208 }
209 =head2 search
210
211 Search your index.
212
213 my @results = $i->search("swhish query");
214
215 Returns array with result IDs.
216
217 =cut
218
219 sub search {
220 my $self = shift;
221
222 my $query = shift || return;
223
224 $self->finish_update;
225
226 my @results;
227
228 # escape double quotes in query for shell
229 $query =~ s/"/\\"/g;
230
231 my $open_cmd = $self->{'search'}." -i ".$self->{'index_dir'}.'/index "'.$query.'" |';
232 print STDERR "## search $open_cmd\n" if ($self->{'debug'});
233
234 open(SEARCH, $open_cmd) || confess "can't start $open_cmd: $!";
235 while(<SEARCH>) {
236 next if (/^#/);
237 chomp;
238 print STDERR "## $_\n" if ($self->{'debug'});
239 my ($rank,$path,$size,$title) = split(/ /,$_,4);
240 push @results, {
241 rank => $rank,
242 path => $path,
243 size => $size,
244 title => $title,
245 }
246 }
247
248 close(SEARCH) || confess "can't close search";
249
250 #print STDERR "## results: ",Dump(@results),"\n" if ($self->{'debug'});
251
252 return @results;
253 }
254
255 =head2 finish_update
256
257 This method will close index.
258
259 $i->finish_update;
260
261 It will be called on DESTROY when $i goes out of scope.
262
263 =cut
264
265 sub finish_update {
266 my $self = shift;
267
268 print STDERR "## finish_update\n" if ($self->{'debug'});
269
270 $self->_close_index;
271 }
272
273 sub DESTROY {
274 my $self = shift;
275 $self->finish_update;
276 }
277
278 =head1 PRIVATE METHODS
279
280 Private methods implement internals for creating temporary file needed for
281 swish++. You should have no need to call them directly, and they are here
282 just to have documentation.
283
284 =head2 _init_indexer
285
286 Create temporary directory in which files for indexing will be created and
287 start index process.
288
289 my $i->_init_indexer || die "can't start indexer";
290
291 It will also create empty file C<_stopwords_> to disable stop words.
292
293 =cut
294
295 sub _init_indexer {
296 my $self = shift;
297
298 return if ($self->{'_index_fh'});
299
300 my $tmp_dir = tempdir( CLEANUP => 1 ) || confess "can't create temporary directory: $!";
301 $self->{'tmp_dir'} = $tmp_dir;
302
303 chdir $tmp_dir || confess "can't chdir to ".$tmp_dir.": $!";
304
305 print STDERR "## tmp_dir: $tmp_dir" if ($self->{'debug'});
306
307 my $opt = "-v " . ($self->{'debug'} || '0');
308
309 unless ($self->{'use_stopwrods'}) {
310 open(STOP, '>', "_stopwords_") || carp "can't create empty stopword file, skipping\n";
311 print STOP " ";
312 close(STOP);
313 $opt .= " -s _stopwords_";
314 }
315
316 my $open_cmd = '| '.$self->{'index'}.' '.$opt.' -e "html:*" -i '.$self->{'index_dir'}.'/index -';
317
318 print STDERR "## init_indexer: $open_cmd\n" if ($self->{'debug'});
319
320 open($self->{'_index_fh'}, $open_cmd) || confess "can't start index with $open_cmd: $!";
321
322 chdir $self->{'cwd'} || confess "can't chdir to ".$self->{'cwd'}.": $!";
323
324 return $self->{'_index_fh'};
325 }
326
327 =head2 _create_doc
328
329 Create temporary file and pass it's name to swish++
330
331 $i->_create_doc(
332 path => 'path/to/store/in/index',
333 title => 'this is title in results',
334 body => 'data to story in body tag',
335 meta => {
336 'meta name' => 'data for this meta',
337 'another' => 'again more data',
338 }
339 );
340
341 To delete document, just omit body and meta data.
342
343 =cut
344
345 sub _create_doc {
346 my $self = shift;
347
348 my $arg = {@_};
349
350 # open indexer if needed
351 $self->_init_indexer;
352
353 my $path = $self->{'tmp_dir'} || confess "no tmp_dir?";
354 $path .= '/' . $arg->{'path'};
355
356 print STDERR "## _create_doc: $path\n" if ($self->{'debug'});
357
358 open(TMP, '>', $path) || die "can't create temp file $path: $!";
359
360 print TMP '<html><head>';
361
362 $arg->{'body'} ||= '';
363
364 if ($arg->{'meta'}) {
365 foreach my $name (keys %{$arg->{'meta'}}) {
366 my $content = $arg->{'meta'}->{$name};
367 print TMP qq{<meta name="$name" content="$content">};
368 $arg->{'body'} .= " $content" if ($self->{'meta_in_body'});
369 }
370 }
371
372 if (defined($arg->{'title'})) {
373 print TMP '<title>' . ($arg->{'title'} || '') . '</title>';
374 $arg->{'body'} .= " ".$arg->{'title'} if ($self->{'meta_in_body'});
375 }
376
377 print TMP '</head><body>' . $arg->{'body'} . '</body></html>';
378
379 close(TMP) || confess "can't close tmp file ".$arg->{'path'}.": $!";
380
381 print { $self->{'_index_fh'} } $arg->{'path'}."\n";
382 }
383
384 =head2 _close_index
385
386 Close index after indexing.
387
388 $i->_close_index;
389
390 You have to close index before searching.
391
392 =cut
393
394 sub _close_index {
395 my $self = shift;
396
397 return unless ($self->{'_index_fh'});
398
399 print STDERR "## close index\n" if ($self->{'debug'});
400
401 close($self->{'_index_fh'});
402 undef $self->{'_index_fh'};
403 }
404
405 1;
406 __END__
407
408 =head2 EXPORT
409
410 None by default.
411
412 =head1 RELATED
413
414 =head2 Debian
415
416 Debian version of swish++ is often old (version 5 at moment of this writing
417 while version 6 is available in source code), so this module by default
418 uses executable names B<index> and B<search> for self-compiled version
419 instead of one from Debian package. See L<new> how to specify Debian
420 default binaries B<index++> and B<search++>.
421
422 =head2 SWISH++
423
424 Aside from very good rewrite in C++, SWISH++ is fatster because it has
425 claver heuristics about which data in input files are words to index and
426 which are not. It's based on English language and might be best choice if
427 you plan to install large amount of long text documents.
428
429 However, if you plan to index all data from structured storage (e.g. RDBMS)
430 you might want B<all> words from data to end up in index as opposed to just
431 those which look like English words. This is especially important if you
432 don't plan to index English texts with this module.
433
434 With distribution build versions of SWISH++ you might have problems with
435 disepearing words. To overcome this problem, you will have to compile and
436 configure SWISH++ yourself (because language characteristics are
437 compilation-time option).
438
439 Compilation of SWISH++ is easy process well described on project's web
440 pages. To see my very relaxed sample configuration take a look at C<swish++>
441 directory included in distribution.
442
443 =head2 SWISH++ config
444
445 C<config.h> located in C<swish++> directory of this distribution is relaxed
446 SWISH++ configuration that will index all words passed to it. This
447 configuration is needed for B<date test> because default configuration
448 doesn't recognize 2004-12-05 as date. Have in mind that your index size
449 might explode.
450
451 =head1 SEE ALSO
452
453 C<swish++> web site L<http://homepage.mac.com/pauljlucas/software/swish/>
454
455 =head1 AUTHOR
456
457 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
458
459 =head1 COPYRIGHT AND LICENSE
460
461 Copyright (C) 2004 by Dobrica Pavlinusic
462
463 This library is free software; you can redistribute it and/or modify
464 it under the same terms as Perl itself, either Perl version 5.8.4 or,
465 at your option, any later version of Perl 5 you may have available.
466
467
468 =cut

  ViewVC Help
Powered by ViewVC 1.1.26