/[SWISH-PlusPlus]/trunk/PlusPlus.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/PlusPlus.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 21 - (show annotations)
Sun Dec 5 22:24:09 2004 UTC (19 years, 4 months ago) by dpavlin
File size: 13703 byte(s)
documentation update

1 package SWISH::PlusPlus;
2
3 use 5.008004;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.10';
8
9 use Carp;
10 use File::Temp qw/ tempdir /;
11 use BerkeleyDB;
12 #use YAML;
13
14 =head1 NAME
15
16 SWISH::PlusPlus - Perl extension for full-text indexer SWISH++ with properties support
17
18 =head1 SYNOPSIS
19
20 use SWISH::PlusPlus;
21 blah blah blah
22
23 =head1 DESCRIPTION
24
25 This is perl module to use SWISH++ indexer by Paul J. Lucas. SWISH++ is
26 rewrite of swish-e in C++ which is extremely fast (due to mmap usage and
27 clever language heuristics), but without support for properties (which this
28 module tries to fix).
29
30 Implementation of API is something in-between C<SWISH::API> and
31 C<Plucene::Simple>. It should be easy to replace Plucene or swish-e with
32 this module for increased performance. However, this module is not plug-in
33 replacement.
34
35 =head1 METHODS
36
37 =head2 new
38
39 Create new instance for index.
40
41 my $i = SWISH::PlusPlus->new(
42 index_dir => '/path/to/index',
43 index => 'index++',
44 search => 'search++',
45 debug => 1,
46 meta_in_body => 1,
47 use_stopwords => 1,
48 );
49
50 Options are described below:
51
52 =over 5
53
54 =item C<index_dir>
55
56 Path to directory in which index and meta database will be created.
57
58 =item C<index>
59
60 Full or partial path to SWISH++ index executable. By default, it's B<index>
61 for self-compiled version. If you use Debian GNU/Linux package specify
62 B<index++>. See C<Debian>.
63
64 =item C<search>
65
66 Full or partial path to SWISH++ search executable. By default, it's B<search>.
67
68 =item C<debug>
69
70 This option (off by default) will produce a lot of debugging output on
71 C<STDERR> prefixed by C<##>.
72
73 =item C<meta_in_body>
74
75 This option (off by default) enables to search content of meta fields
76 without specifying them (like they are in body of document). This will
77 somewhat increase index size.
78
79 =item C<use_stopwords>
80
81 Use built-in SWISH++ stop words. By default, they are disabled.
82
83 =back
84
85 =cut
86
87 sub new {
88 my $class = shift;
89 my $self = {@_};
90 bless($self, $class);
91
92 foreach (qw(index_dir)) {
93 croak "need $_" unless $self->{$_};
94 }
95
96 my $index_dir = $self->{'index_dir'};
97
98 my $cwd;
99 chomp($cwd = `pwd`);
100 $self->{'cwd'} = $cwd || carp "can't get cwd!";
101
102 if ($index_dir !~ m#^/#) {
103 $index_dir = "$cwd/$index_dir";
104 print STDERR "## full path to index_dir: $index_dir\n" if ($self->{'debug'});
105 $self->{'index_dir'} = $index_dir;
106 }
107
108 if (! -e $index_dir) {
109 mkdir $index_dir || confess "can't create index ",$self->{'index'},": $!";
110 }
111
112 # default executables
113 $self->{'index'} ||= 'index';
114 $self->{'search'} ||= 'search';
115
116 print STDERR "## new index_dir: ",$index_dir," index: ",$self->{'index'}, " search: ",$self->{'search'},"\n" if ($self->{'debug'});
117
118 $self ? return $self : return undef;
119 }
120
121
122 =head2 check_bin
123
124 Check if SWISH++ binaries specified in L<new> are available and verify
125 version signature.
126
127 if ($i->check_bin) {
128 print "swish++ binaries found\n";
129 };
130
131 It will also setup property
132
133 $i->{'version'}
134
135 which you can examined to see numeric version (something like C<6.0.4>).
136
137 =cut
138
139 sub check_bin {
140 my $self = shift;
141
142 my $i = `$self->{'index'} -V 2>&1` || confess "can't find '",$self->{'index'},"' binary";
143 my $s = `$self->{'search'} -V 2>&1` || confess "can't find '",$self->{'search'},"' binary";
144
145 chomp $i;
146 chomp $s;
147
148 confess $self->{'index'}," binary is not SWISH++" unless ($i =~ m/^SWISH\+\+/);
149 confess $self->{'search'}," binary is not SWISH++" unless ($s =~ m/^SWISH\+\+/);
150
151 if ($i eq $s) {
152 $i =~ s/^SWISH\+\+\s+// || confess "can't strip SWISH++ from version";
153 $self->{'version'} = $i;
154 return 1;
155 } else {
156 carp "version difference: index is $i while search is $s";
157 return;
158 }
159
160 }
161
162 =head2 index_document
163
164 Quick way to add simple data to index.
165
166 $i->index_document($path, $data);
167 $i->index_document( 42 => 'meaning of life' );
168
169 C<$path> value is really path, so you don't want to use directory
170 separators (slashes, /) in it probably.
171
172 =cut
173
174 sub index_document {
175 my $self = shift;
176
177 my %doc = @_;
178
179 foreach my $id (keys %doc) {
180 $self->_create_doc(
181 path => $id,
182 body => $doc{$id},
183 );
184 }
185
186 return 1;
187 }
188
189 =head2 add
190
191 Add document with meta-data to index.
192
193 $i->add(
194 path => 'path/to/document',
195 title => 'this is result title',
196 meta => {
197 description => 'this is description meta tag',
198 date => '2004-11-04',
199 author => 'Dobrica Pavlinusic',
200 }
201 body => 'this is text without meta data',
202 );
203
204 This is thin wrapper round L<_create_doc>.
205
206 =cut
207
208 sub add {
209 my $self = shift;
210
211 $self->_create_doc(@_);
212
213 return 1;
214 }
215
216 =head2 search
217
218 Search your index using any valid SWISH++ query.
219
220 my @results = $i->search("swish query");
221
222 Returns array with elements like this:
223
224 {
225 rank => 10, # rank of result
226 path => 'path to result', # path to result
227 size => 999, # size in bytes
228 title => 'title of result' # title meta property
229 }
230
231 =cut
232
233 sub search {
234 my $self = shift;
235
236 my $query = shift || return;
237
238 $self->finish_update;
239 $self->_tie_meta_db(DB_RDONLY);
240
241 my @results;
242
243 # escape double quotes in query for shell
244 $query =~ s/"/\\"/g;
245
246 my $open_cmd = $self->{'search'} .
247 ' -i ' . $self->{'index_dir'}.'/index' .
248 ' "' . $query . '"'.
249 ' |';
250 print STDERR "## search: $open_cmd\n" if ($self->{'debug'});
251
252 open(SEARCH, $open_cmd) || confess "can't start $open_cmd: $!";
253 my $l;
254 while($l = <SEARCH>) {
255 next if ($l =~ /^#/);
256 chomp($l);
257 print STDERR "## $l\n" if ($self->{'debug'});
258 my ($rank,$path,$size,$title) = split(/ /,$l,4);
259 $path =~ s#^\./##; # strip from path
260 push @results, {
261 rank => $rank,
262 path => $path,
263 size => $size,
264 title => $title,
265 }
266 }
267
268 close(SEARCH) || confess "can't close search";
269
270 #print STDERR "## results: ",Dump(@results),"\n" if ($self->{'debug'});
271
272 return @results;
273 }
274
275 =head2 property
276
277 Return stored meta property from result or result path.
278
279 print $i->property('path', 'title');
280 print $i->property($res->{'path'}, 'title');
281
282 =cut
283
284 sub property {
285 my $self = shift;
286
287 my ($path,$meta) = @_;
288
289 if ($path =~ m/^HASH/) {
290 $path = $path->{'path'} || confess "can't find path in input data";
291 }
292
293 my $val = $self->{'meta_db'}->{"$path-$meta"};
294
295 print STDERR "## property $path-$meta: ",($val || 'undef'),"\n" if ($self->{'debug'});
296 return $val;
297 }
298
299 =head2 finish_update
300
301 This method will close index binary and enable search. Searching is not
302 available while indexing is in process.
303
304 $i->finish_update;
305
306 Usually, you don't need to call this method directly. It will be called on
307 DESTROY when $i goes out of scope or when you first call search in session
308 if indexing was started.
309
310 =cut
311
312 sub finish_update {
313 my $self = shift;
314
315 print STDERR "## finish_update\n" if ($self->{'debug'});
316
317 $self->_close_index && $self->_untie_meta_db;
318 }
319
320 sub DESTROY {
321 my $self = shift;
322 $self->finish_update;
323 }
324
325 =head1 PRIVATE METHODS
326
327 Private methods implement internals for creating temporary files needed for
328 SWISH++. You should have no need to call them directly, and they are here
329 just to have documentation.
330
331 =head2 _init_indexer
332
333 Create temporary directory in which files for indexing will be created and
334 start index process.
335
336 my $i->_init_indexer || die "can't start indexer";
337
338 It will also create empty file C<_stopwords_> to disable stop words.
339
340 =cut
341
342 sub _init_indexer {
343 my $self = shift;
344
345 return if ($self->{'_index_fh'});
346
347 my $tmp_dir = tempdir( CLEANUP => 1 ) || confess "can't create temporary directory: $!";
348 $self->{'tmp_dir'} = $tmp_dir;
349
350 chdir $tmp_dir || confess "can't chdir to ".$tmp_dir.": $!";
351
352 print STDERR "## tmp_dir: $tmp_dir" if ($self->{'debug'});
353
354 my $opt = "-v " . ($self->{'debug'} || '0');
355
356 unless ($self->{'use_stopwrods'}) {
357 open(STOP, '>', "_stopwords_") || carp "can't create empty stopword file, skipping\n";
358 print STOP " ";
359 close(STOP);
360 $opt .= " -s _stopwords_";
361 }
362
363 my $index_dir = $self->{'index_dir'} || confess "no index_dir?";
364
365 my $open_cmd = '| '.$self->{'index'}.' '.$opt.' -e "html:*" -i '.$index_dir.'/index -';
366
367 print STDERR "## init_indexer: $open_cmd\n" if ($self->{'debug'});
368
369 open($self->{'_index_fh'}, $open_cmd) || confess "can't start index with $open_cmd: $!";
370
371 chdir $self->{'cwd'} || confess "can't chdir to ".$self->{'cwd'}.": $!";
372
373 $self->_tie_meta_db(DB_CREATE);
374
375 return $self->{'_index_fh'};
376 }
377
378 =head2 _create_doc
379
380 Create temporary file and pass it's name to SWISH++
381
382 $i->_create_doc(
383 path => 'path/to/store/in/index',
384 title => 'this is title in results',
385 body => 'data to story in body tag',
386 meta => {
387 'meta name' => 'data for this meta',
388 'another' => 'again more data',
389 }
390 );
391
392 To delete document, just omit body and meta data.
393
394 =cut
395
396 sub _create_doc {
397 my $self = shift;
398
399 my $arg = {@_};
400
401 # open indexer if needed
402 $self->_init_indexer;
403
404 my $path = $self->{'tmp_dir'} || confess "no tmp_dir?";
405 my $id = $arg->{'path'} || confess "no path?";
406 $path .= "/$id";
407
408 print STDERR "## _create_doc: $path\n" if ($self->{'debug'});
409
410 open(TMP, '>', $path) || die "can't create temp file $path: $!";
411
412 print TMP '<html><head>';
413
414 $arg->{'body'} ||= '';
415
416 if ($arg->{'meta'}) {
417 foreach my $name (keys %{$arg->{'meta'}}) {
418 my $content = $arg->{'meta'}->{$name};
419 print TMP qq{<meta name="$name" content="$content">};
420 $arg->{'body'} .= " $content" if ($self->{'meta_in_body'});
421 $self->{'meta_db'}->{"$id-$name"} = $content;
422 }
423 }
424
425 my $title = $arg->{'title'};
426 if (defined($title)) {
427 print TMP "<title>$title</title>";
428 $arg->{'body'} .= " $title" if ($self->{'meta_in_body'});
429 $self->{'meta_db'}->{"$id-title"} = $title;
430 }
431
432 print TMP '</head><body>' . $arg->{'body'} . '</body></html>';
433
434 close(TMP) || confess "can't close tmp file ".$arg->{'path'}.": $!";
435
436 print { $self->{'_index_fh'} } "$id\n";
437 }
438
439 =head2 _close_index
440
441 Close index after indexing.
442
443 $i->_close_index;
444
445 You have to close index before searching.
446
447 =cut
448
449 sub _close_index {
450 my $self = shift;
451
452 return unless ($self->{'_index_fh'});
453
454 print STDERR "## close index\n" if ($self->{'debug'});
455
456 close($self->{'_index_fh'}) || confess "can't close index: $!";
457 undef $self->{'_index_fh'};
458
459 return 1;
460 }
461
462 =head2 _tie_meta_db
463
464 Open BerkeleyDB database with meta properties.
465
466 $i->_tie_meta_db(DB_CREATE);
467 $i->_tie_meta_db(DB_RDONLY);
468
469 }
470
471 =cut
472
473 sub _tie_meta_db {
474 my $self = shift;
475
476 my $flags = shift || confess "need DB_CREATE or DB_RDONLY";
477
478 return if ($self->{'_meta_db_flags'} && $self->{'_meta_db_flags'} == $flags);
479
480 print STDERR "## _tie_meta_db($flags)\n" if ($self->{'debug'});
481
482 $self->_untie_meta_db;
483 $self->{'_meta_db_flags'} = $flags;
484
485 my $file = $self->{'index_dir'}.'/meta.db';
486
487 tie %{$self->{'meta_db'}}, "BerkeleyDB::Hash",
488 -Filename => $file,
489 -Flags => $flags
490 or confess "cannot open $file: $! $BerkeleyDB::Error\n" ;
491
492 return 1;
493 }
494
495 =head2 _untie_meta_db
496
497 Close BerkeleyDB database with meta properties.
498
499 $i->_untie_meta_db;
500
501 =cut
502
503 sub _untie_meta_db {
504 my $self = shift;
505
506 return unless ($self->{'meta_db'});
507
508 print STDERR "## _untie_meta_db\n" if ($self->{'debug'});
509 untie %{$self->{'meta_db'}} || confess "can't untie!";
510 undef $self->{'meta_db'};
511 undef $self->{'_meta_db_flags'};
512
513 return 1;
514 }
515
516 1;
517 __END__
518
519 =head2 EXPORT
520
521 None by default.
522
523 =head1 RELATED
524
525 =head2 Debian
526
527 Debian version of SWISH++ is often old (version 5 at moment of this writing
528 while version 6 is available in source code), so this module by default
529 uses executable names B<index> and B<search> for self-compiled version
530 instead of one from Debian package. See L<new> how to specify Debian
531 default binaries B<index++> and B<search++>.
532
533 =head2 SWISH++
534
535 Aside from very good rewrite in C++, SWISH++ is faster because it uses
536 claver heuristics about which data in input files are words to index and
537 which are not. It's based on English language and might be best choice if
538 you plan to index large amount of long text documents.
539
540 However, if you plan to index all data from structured storage (e.g. RDBMS)
541 you might want B<all> words from data to end up in index as opposed to just
542 those which look like English words. This is especially important if you
543 don't plan to index English texts with this module.
544
545 With distribution build versions of SWISH++ you might have problems with
546 disapearing words. To overcome this problem, you will have to compile and
547 configure SWISH++ yourself (because language characteristics are
548 compilation-time option).
549
550 Compilation of SWISH++ is easy process well described on project's web
551 pages. To see my very relaxed sample configuration take a look at C<swish++>
552 directory included in distribution.
553
554 =head2 SWISH++ config
555
556 C<config.h> located in C<swish++> directory of this distribution is relaxed
557 SWISH++ configuration that will index all words passed to it. This
558 configuration is needed for B<date test> because default configuration
559 doesn't recognize 2004-12-05 as date. Have in mind that your index size
560 might explode.
561
562 =head1 BUGS
563
564 Currently there is no way to specify which meta data will be stored as
565 properties. B<This will be fixed very soon>.
566
567 There is no garbage collection on temporary files created for SWISH++. This
568 means that one run of indexer will take additional disk space for temporary
569 files, which will be removed at end. There should be some way to remove
570 files after they are indexed by SWISH++. However, at this early stage of
571 development it's just not supported yet. Have plenty of disk space!
572
573 =head1 SEE ALSO
574
575 SWISH++ web site L<http://homepage.mac.com/pauljlucas/software/swish/>
576
577 =head1 AUTHOR
578
579 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
580
581 =head1 COPYRIGHT AND LICENSE
582
583 Copyright (C) 2004 by Dobrica Pavlinusic
584
585 This library is free software; you can redistribute it and/or modify
586 it under the same terms as Perl itself, either Perl version 5.8.4 or,
587 at your option, any later version of Perl 5 you may have available.
588
589
590 =cut

  ViewVC Help
Powered by ViewVC 1.1.26