/[SWISH-PlusPlus]/trunk/PlusPlus.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/PlusPlus.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 16 - (show annotations)
Sun Dec 5 21:06:48 2004 UTC (19 years, 3 months ago) by dpavlin
File size: 12524 byte(s)
added experimental support for storing meta data (properties) in BerkeleyDB
hash database (currently ALL meta data is also stored in BerkeleyDB).

1 package SWISH::PlusPlus;
2
3 use 5.008004;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.10';
8
9 use Carp;
10 use File::Temp qw/ tempdir /;
11 use BerkeleyDB;
12 #use YAML;
13
14 =head1 NAME
15
16 SWISH::PlusPlus - Perl extension SWISH++
17
18 =head1 SYNOPSIS
19
20 use SWISH::PlusPlus;
21 blah blah blah
22
23 =head1 DESCRIPTION
24
25 This is perl module to use SWISH++ indexer by Paul J. Lucas. SWISH++ is
26 rewrite of swish-e in C++ which is extremly fast (thank to mmap), but without
27 support for properties (which this module tries to fix).
28
29 Implementation of this module is crafted after L<Plucene::Simple> and it
30 should be easy to replace Plucene with this module for increased
31 performance. However, this module is not plug-in replacement.
32
33 =head1 METHODS
34
35 =head2 new
36
37 Create new indexing object.
38
39 my $i = SWISH::PlusPlus->new(
40 index_dir => '/path/to/index',
41 index => 'index++',
42 search => 'search++',
43 debug => 1,
44 meta_in_body => 1,
45 use_stopwords => 1,
46 );
47
48 Options to new are following:
49
50 =over 5
51
52 =item C<index_dir>
53
54 Path to directory in which index will be created.
55
56 =item C<index>
57
58 Full or partial path to SWISH++ index executable. By default, it's B<index>
59 for self-compiled version. If you use Debian GNU/Linux package specify
60 B<index++>. See C<Debian>.
61
62 =item C<search>
63
64 Full or partial path to SWISH++ search executable. By default, it's B<search>.
65
66 =item C<debug>
67
68 This option (off by default) will produce a lot of debugging output on
69 C<STDERR> prefixed by C<##>.
70
71 =item C<meta_in_body>
72
73 This option (off by default) enables to search content of meta fields
74 without specifing them (like they are in body of document). This will
75 somewhat increate index size.
76
77 =item C<use_stopwords>
78
79 Use built-in SWISH++ stop words. By default, they are disabled.
80
81 =back
82
83 =cut
84
85 sub new {
86 my $class = shift;
87 my $self = {@_};
88 bless($self, $class);
89
90 foreach (qw(index_dir)) {
91 croak "need $_" unless $self->{$_};
92 }
93
94 my $index_dir = $self->{'index_dir'};
95
96 my $cwd;
97 chomp($cwd = `pwd`);
98 $self->{'cwd'} = $cwd || carp "can't get cwd!";
99
100 if ($index_dir !~ m#^/#) {
101 $index_dir = "$cwd/$index_dir";
102 print STDERR "## full path to index_dir: $index_dir\n" if ($self->{'debug'});
103 $self->{'index_dir'} = $index_dir;
104 }
105
106 if (! -e $index_dir) {
107 mkdir $index_dir || confess "can't create index ",$self->{'index'},": $!";
108 }
109
110 # default executables
111 $self->{'index'} ||= 'index';
112 $self->{'search'} ||= 'search';
113
114 print STDERR "## new index_dir: ",$index_dir," index: ",$self->{'index'}, " search: ",$self->{'search'},"\n" if ($self->{'debug'});
115
116 $self ? return $self : return undef;
117 }
118
119
120 =head2 check_bin
121
122 Check if swish++ binaries specified in L<new> are available and verify
123 version signature.
124
125 if ($i->check_bin) {
126 print "swish++ binaries found\n";
127 };
128
129 It will also setup property
130
131 $i->{'version'}
132
133 which you can examine to see version.
134
135 =cut
136
137 sub check_bin {
138 my $self = shift;
139
140 my $i = `$self->{'index'} -V 2>&1` || confess "can't find '",$self->{'index'},"' binary";
141 my $s = `$self->{'search'} -V 2>&1` || confess "can't find '",$self->{'search'},"' binary";
142
143 chomp $i;
144 chomp $s;
145
146 confess $self->{'index'}," binary is not SWISH++" unless ($i =~ m/^SWISH\+\+/);
147 confess $self->{'search'}," binary is not SWISH++" unless ($s =~ m/^SWISH\+\+/);
148
149 if ($i eq $s) {
150 $i =~ s/^SWISH\+\+\s+// || confess "can't strip SWISH++ from version";
151 $self->{'version'} = $i;
152 return 1;
153 } else {
154 carp "version difference: index is $i while search is $s";
155 return;
156 }
157
158 }
159
160 =head2 index_document
161
162 Quick way to add simple data to index.
163
164 $i->index_document($key, $data);
165 $i->index_document( 42 => 'meaning of life' );
166
167 =cut
168
169 sub index_document {
170 my $self = shift;
171
172 my %doc = @_;
173
174 foreach my $id (keys %doc) {
175 $self->_create_doc(
176 path => $id,
177 body => $doc{$id},
178 );
179 }
180
181 return 1;
182 }
183
184 =head2 add
185
186 Add document with metadata to index.
187
188 $i->add(
189 path => 'path/to/document',
190 title => 'this is result title',
191 meta => {
192 description => 'this is description meta tag',
193 date => '2004-11-04',
194 author => 'Dobrica Pavlinusic',
195 }
196 body => 'this is text without meta data',
197 );
198
199 This is thin wrapper round L<_create_doc>.
200
201 =cut
202
203 sub add {
204 my $self = shift;
205
206 $self->_create_doc(@_);
207
208 return 1;
209 }
210 =head2 search
211
212 Search your index.
213
214 my @results = $i->search("swhish query");
215
216 Returns array with result IDs.
217
218 =cut
219
220 sub search {
221 my $self = shift;
222
223 my $query = shift || return;
224
225 $self->finish_update;
226 $self->_tie_meta_db(DB_RDONLY);
227
228 my @results;
229
230 # escape double quotes in query for shell
231 $query =~ s/"/\\"/g;
232
233 my $open_cmd = $self->{'search'} .
234 ' -i ' . $self->{'index_dir'}.'/index' .
235 ' "' . $query . '"'.
236 ' |';
237 print STDERR "## search: $open_cmd\n" if ($self->{'debug'});
238
239 open(SEARCH, $open_cmd) || confess "can't start $open_cmd: $!";
240 my $l;
241 while($l = <SEARCH>) {
242 next if ($l =~ /^#/);
243 chomp($l);
244 print STDERR "## $l\n" if ($self->{'debug'});
245 my ($rank,$path,$size,$title) = split(/ /,$l,4);
246 $path =~ s#^\./##; # strip from path
247 push @results, {
248 rank => $rank,
249 path => $path,
250 size => $size,
251 title => $title,
252 }
253 }
254
255 close(SEARCH) || confess "can't close search";
256
257 #print STDERR "## results: ",Dump(@results),"\n" if ($self->{'debug'});
258
259 return @results;
260 }
261
262 =head2 property
263
264 Return stored meta property from result or result path.
265
266 print $i->property('path', 'title');
267 print $i->property($res->{'path'}, 'title');
268
269 =cut
270
271 sub property {
272 my $self = shift;
273
274 my ($path,$meta) = @_;
275
276 if ($path =~ m/^HASH/) {
277 $path = $path->{'path'} || confess "can't find path in input data";
278 }
279
280 my $val = $self->{'meta_db'}->{"$path-$meta"};
281
282 print STDERR "## property $path-$meta: ",($val || 'undef'),"\n" if ($self->{'debug'});
283 return $val;
284 }
285
286 =head2 finish_update
287
288 This method will close index.
289
290 $i->finish_update;
291
292 It will be called on DESTROY when $i goes out of scope.
293
294 =cut
295
296 sub finish_update {
297 my $self = shift;
298
299 print STDERR "## finish_update\n" if ($self->{'debug'});
300
301 $self->_close_index && $self->_untie_meta_db;
302 }
303
304 sub DESTROY {
305 my $self = shift;
306 $self->finish_update;
307 }
308
309 =head1 PRIVATE METHODS
310
311 Private methods implement internals for creating temporary file needed for
312 swish++. You should have no need to call them directly, and they are here
313 just to have documentation.
314
315 =head2 _init_indexer
316
317 Create temporary directory in which files for indexing will be created and
318 start index process.
319
320 my $i->_init_indexer || die "can't start indexer";
321
322 It will also create empty file C<_stopwords_> to disable stop words.
323
324 =cut
325
326 sub _init_indexer {
327 my $self = shift;
328
329 return if ($self->{'_index_fh'});
330
331 my $tmp_dir = tempdir( CLEANUP => 1 ) || confess "can't create temporary directory: $!";
332 $self->{'tmp_dir'} = $tmp_dir;
333
334 chdir $tmp_dir || confess "can't chdir to ".$tmp_dir.": $!";
335
336 print STDERR "## tmp_dir: $tmp_dir" if ($self->{'debug'});
337
338 my $opt = "-v " . ($self->{'debug'} || '0');
339
340 unless ($self->{'use_stopwrods'}) {
341 open(STOP, '>', "_stopwords_") || carp "can't create empty stopword file, skipping\n";
342 print STOP " ";
343 close(STOP);
344 $opt .= " -s _stopwords_";
345 }
346
347 my $index_dir = $self->{'index_dir'} || confess "no index_dir?";
348
349 my $open_cmd = '| '.$self->{'index'}.' '.$opt.' -e "html:*" -i '.$index_dir.'/index -';
350
351 print STDERR "## init_indexer: $open_cmd\n" if ($self->{'debug'});
352
353 open($self->{'_index_fh'}, $open_cmd) || confess "can't start index with $open_cmd: $!";
354
355 chdir $self->{'cwd'} || confess "can't chdir to ".$self->{'cwd'}.": $!";
356
357 $self->_tie_meta_db(DB_CREATE);
358
359 return $self->{'_index_fh'};
360 }
361
362 =head2 _tie_meta_db
363
364 Open BerkeleyDB database with meta properties.
365
366 $i->_tie_meta_db(DB_CREATE);
367 $i->_tie_meta_db(DB_RDONLY);
368
369 }
370
371 =cut
372
373 sub _tie_meta_db {
374 my $self = shift;
375
376 my $flags = shift || confess "need DB_CREATE or DB_RDONLY";
377
378 return if ($self->{'_meta_db_flags'} && $self->{'_meta_db_flags'} == $flags);
379
380 print STDERR "## _tie_meta_db($flags)\n" if ($self->{'debug'});
381
382 $self->_untie_meta_db;
383 $self->{'_meta_db_flags'} = $flags;
384
385 my $file = $self->{'index_dir'}.'/meta.db';
386
387 tie %{$self->{'meta_db'}}, "BerkeleyDB::Hash",
388 -Filename => $file,
389 -Flags => $flags
390 or confess "cannot open $file: $! $BerkeleyDB::Error\n" ;
391
392 return 1;
393 }
394
395 =head2 _untie_meta_db
396
397 Close BerkeleyDB database with meta properties.
398
399 $i->_untie_meta_db
400
401 =cut
402
403 sub _untie_meta_db {
404 my $self = shift;
405
406 return unless ($self->{'meta_db'});
407
408 print STDERR "## _untie_meta_db\n" if ($self->{'debug'});
409 untie %{$self->{'meta_db'}} || confess "can't untie!";
410 undef $self->{'meta_db'};
411 undef $self->{'_meta_db_flags'};
412
413 return 1;
414 }
415
416 =head2 _create_doc
417
418 Create temporary file and pass it's name to swish++
419
420 $i->_create_doc(
421 path => 'path/to/store/in/index',
422 title => 'this is title in results',
423 body => 'data to story in body tag',
424 meta => {
425 'meta name' => 'data for this meta',
426 'another' => 'again more data',
427 }
428 );
429
430 To delete document, just omit body and meta data.
431
432 =cut
433
434 sub _create_doc {
435 my $self = shift;
436
437 my $arg = {@_};
438
439 # open indexer if needed
440 $self->_init_indexer;
441
442 my $path = $self->{'tmp_dir'} || confess "no tmp_dir?";
443 my $id = $arg->{'path'} || confess "no path?";
444 $path .= "/$id";
445
446 print STDERR "## _create_doc: $path\n" if ($self->{'debug'});
447
448 open(TMP, '>', $path) || die "can't create temp file $path: $!";
449
450 print TMP '<html><head>';
451
452 $arg->{'body'} ||= '';
453
454 if ($arg->{'meta'}) {
455 foreach my $name (keys %{$arg->{'meta'}}) {
456 my $content = $arg->{'meta'}->{$name};
457 print TMP qq{<meta name="$name" content="$content">};
458 $arg->{'body'} .= " $content" if ($self->{'meta_in_body'});
459 $self->{'meta_db'}->{"$id-$name"} = $content;
460 }
461 }
462
463 my $title = $arg->{'title'};
464 if (defined($title)) {
465 print TMP "<title>$title</title>";
466 $arg->{'body'} .= " $title" if ($self->{'meta_in_body'});
467 $self->{'meta_db'}->{"$id-title"} = $title;
468 }
469
470 print TMP '</head><body>' . $arg->{'body'} . '</body></html>';
471
472 close(TMP) || confess "can't close tmp file ".$arg->{'path'}.": $!";
473
474 print { $self->{'_index_fh'} } "$id\n";
475 }
476
477 =head2 _close_index
478
479 Close index after indexing.
480
481 $i->_close_index;
482
483 You have to close index before searching.
484
485 =cut
486
487 sub _close_index {
488 my $self = shift;
489
490 return unless ($self->{'_index_fh'});
491
492 print STDERR "## close index\n" if ($self->{'debug'});
493
494 close($self->{'_index_fh'}) || confess "can't close index: $!";
495 undef $self->{'_index_fh'};
496
497 return 1;
498 }
499
500 1;
501 __END__
502
503 =head2 EXPORT
504
505 None by default.
506
507 =head1 RELATED
508
509 =head2 Debian
510
511 Debian version of swish++ is often old (version 5 at moment of this writing
512 while version 6 is available in source code), so this module by default
513 uses executable names B<index> and B<search> for self-compiled version
514 instead of one from Debian package. See L<new> how to specify Debian
515 default binaries B<index++> and B<search++>.
516
517 =head2 SWISH++
518
519 Aside from very good rewrite in C++, SWISH++ is fatster because it has
520 claver heuristics about which data in input files are words to index and
521 which are not. It's based on English language and might be best choice if
522 you plan to install large amount of long text documents.
523
524 However, if you plan to index all data from structured storage (e.g. RDBMS)
525 you might want B<all> words from data to end up in index as opposed to just
526 those which look like English words. This is especially important if you
527 don't plan to index English texts with this module.
528
529 With distribution build versions of SWISH++ you might have problems with
530 disepearing words. To overcome this problem, you will have to compile and
531 configure SWISH++ yourself (because language characteristics are
532 compilation-time option).
533
534 Compilation of SWISH++ is easy process well described on project's web
535 pages. To see my very relaxed sample configuration take a look at C<swish++>
536 directory included in distribution.
537
538 =head2 SWISH++ config
539
540 C<config.h> located in C<swish++> directory of this distribution is relaxed
541 SWISH++ configuration that will index all words passed to it. This
542 configuration is needed for B<date test> because default configuration
543 doesn't recognize 2004-12-05 as date. Have in mind that your index size
544 might explode.
545
546 =head1 SEE ALSO
547
548 C<swish++> web site L<http://homepage.mac.com/pauljlucas/software/swish/>
549
550 =head1 AUTHOR
551
552 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
553
554 =head1 COPYRIGHT AND LICENSE
555
556 Copyright (C) 2004 by Dobrica Pavlinusic
557
558 This library is free software; you can redistribute it and/or modify
559 it under the same terms as Perl itself, either Perl version 5.8.4 or,
560 at your option, any later version of Perl 5 you may have available.
561
562
563 =cut

  ViewVC Help
Powered by ViewVC 1.1.26