/[SWISH-PlusPlus]/trunk/PlusPlus.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/PlusPlus.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 16 - (hide annotations)
Sun Dec 5 21:06:48 2004 UTC (19 years, 3 months ago) by dpavlin
File size: 12524 byte(s)
added experimental support for storing meta data (properties) in BerkeleyDB
hash database (currently ALL meta data is also stored in BerkeleyDB).

1 dpavlin 1 package SWISH::PlusPlus;
2    
3     use 5.008004;
4     use strict;
5     use warnings;
6    
7 dpavlin 16 our $VERSION = '0.10';
8 dpavlin 1
9     use Carp;
10 dpavlin 4 use File::Temp qw/ tempdir /;
11 dpavlin 16 use BerkeleyDB;
12 dpavlin 8 #use YAML;
13 dpavlin 1
14     =head1 NAME
15    
16     SWISH::PlusPlus - Perl extension SWISH++
17    
18     =head1 SYNOPSIS
19    
20     use SWISH::PlusPlus;
21     blah blah blah
22    
23     =head1 DESCRIPTION
24    
25     This is perl module to use SWISH++ indexer by Paul J. Lucas. SWISH++ is
26 dpavlin 3 rewrite of swish-e in C++ which is extremly fast (thank to mmap), but without
27     support for properties (which this module tries to fix).
28 dpavlin 1
29 dpavlin 3 Implementation of this module is crafted after L<Plucene::Simple> and it
30     should be easy to replace Plucene with this module for increased
31     performance. However, this module is not plug-in replacement.
32    
33 dpavlin 1 =head1 METHODS
34    
35 dpavlin 10 =head2 new
36 dpavlin 1
37     Create new indexing object.
38    
39 dpavlin 10 my $i = SWISH::PlusPlus->new(
40 dpavlin 3 index_dir => '/path/to/index',
41     index => 'index++',
42     search => 'search++',
43 dpavlin 8 debug => 1,
44 dpavlin 9 meta_in_body => 1,
45     use_stopwords => 1,
46 dpavlin 1 );
47    
48 dpavlin 10 Options to new are following:
49 dpavlin 1
50     =over 5
51    
52 dpavlin 3 =item C<index_dir>
53    
54     Path to directory in which index will be created.
55    
56 dpavlin 1 =item C<index>
57    
58 dpavlin 3 Full or partial path to SWISH++ index executable. By default, it's B<index>
59     for self-compiled version. If you use Debian GNU/Linux package specify
60     B<index++>. See C<Debian>.
61 dpavlin 1
62 dpavlin 3 =item C<search>
63    
64     Full or partial path to SWISH++ search executable. By default, it's B<search>.
65    
66 dpavlin 8 =item C<debug>
67    
68     This option (off by default) will produce a lot of debugging output on
69     C<STDERR> prefixed by C<##>.
70    
71 dpavlin 9 =item C<meta_in_body>
72    
73     This option (off by default) enables to search content of meta fields
74     without specifing them (like they are in body of document). This will
75     somewhat increate index size.
76    
77     =item C<use_stopwords>
78    
79     Use built-in SWISH++ stop words. By default, they are disabled.
80    
81 dpavlin 1 =back
82    
83     =cut
84    
85 dpavlin 10 sub new {
86 dpavlin 1 my $class = shift;
87     my $self = {@_};
88     bless($self, $class);
89    
90 dpavlin 3 foreach (qw(index_dir)) {
91 dpavlin 1 croak "need $_" unless $self->{$_};
92     }
93    
94 dpavlin 13 my $index_dir = $self->{'index_dir'};
95    
96 dpavlin 14 my $cwd;
97     chomp($cwd = `pwd`);
98     $self->{'cwd'} = $cwd || carp "can't get cwd!";
99    
100 dpavlin 13 if ($index_dir !~ m#^/#) {
101     $index_dir = "$cwd/$index_dir";
102     print STDERR "## full path to index_dir: $index_dir\n" if ($self->{'debug'});
103     $self->{'index_dir'} = $index_dir;
104 dpavlin 1 }
105    
106 dpavlin 13 if (! -e $index_dir) {
107     mkdir $index_dir || confess "can't create index ",$self->{'index'},": $!";
108     }
109    
110 dpavlin 3 # default executables
111     $self->{'index'} ||= 'index';
112     $self->{'search'} ||= 'search';
113    
114 dpavlin 13 print STDERR "## new index_dir: ",$index_dir," index: ",$self->{'index'}, " search: ",$self->{'search'},"\n" if ($self->{'debug'});
115 dpavlin 8
116 dpavlin 1 $self ? return $self : return undef;
117     }
118    
119    
120 dpavlin 3 =head2 check_bin
121    
122 dpavlin 10 Check if swish++ binaries specified in L<new> are available and verify
123 dpavlin 3 version signature.
124    
125     if ($i->check_bin) {
126     print "swish++ binaries found\n";
127     };
128    
129     It will also setup property
130    
131     $i->{'version'}
132    
133     which you can examine to see version.
134    
135     =cut
136    
137     sub check_bin {
138     my $self = shift;
139    
140     my $i = `$self->{'index'} -V 2>&1` || confess "can't find '",$self->{'index'},"' binary";
141     my $s = `$self->{'search'} -V 2>&1` || confess "can't find '",$self->{'search'},"' binary";
142    
143     chomp $i;
144     chomp $s;
145    
146     confess $self->{'index'}," binary is not SWISH++" unless ($i =~ m/^SWISH\+\+/);
147     confess $self->{'search'}," binary is not SWISH++" unless ($s =~ m/^SWISH\+\+/);
148    
149     if ($i eq $s) {
150 dpavlin 14 $i =~ s/^SWISH\+\+\s+// || confess "can't strip SWISH++ from version";
151 dpavlin 3 $self->{'version'} = $i;
152     return 1;
153     } else {
154     carp "version difference: index is $i while search is $s";
155     return;
156     }
157    
158     }
159    
160 dpavlin 4 =head2 index_document
161    
162     Quick way to add simple data to index.
163    
164     $i->index_document($key, $data);
165     $i->index_document( 42 => 'meaning of life' );
166    
167     =cut
168    
169     sub index_document {
170     my $self = shift;
171    
172     my %doc = @_;
173    
174     foreach my $id (keys %doc) {
175     $self->_create_doc(
176     path => $id,
177     body => $doc{$id},
178     );
179     }
180    
181     return 1;
182     }
183    
184 dpavlin 9 =head2 add
185    
186     Add document with metadata to index.
187    
188     $i->add(
189     path => 'path/to/document',
190     title => 'this is result title',
191     meta => {
192     description => 'this is description meta tag',
193     date => '2004-11-04',
194     author => 'Dobrica Pavlinusic',
195     }
196     body => 'this is text without meta data',
197     );
198    
199     This is thin wrapper round L<_create_doc>.
200    
201     =cut
202    
203     sub add {
204     my $self = shift;
205    
206     $self->_create_doc(@_);
207    
208     return 1;
209     }
210 dpavlin 8 =head2 search
211    
212     Search your index.
213    
214     my @results = $i->search("swhish query");
215    
216     Returns array with result IDs.
217    
218     =cut
219    
220     sub search {
221     my $self = shift;
222    
223     my $query = shift || return;
224    
225 dpavlin 14 $self->finish_update;
226 dpavlin 16 $self->_tie_meta_db(DB_RDONLY);
227 dpavlin 8
228     my @results;
229    
230     # escape double quotes in query for shell
231     $query =~ s/"/\\"/g;
232    
233 dpavlin 16 my $open_cmd = $self->{'search'} .
234     ' -i ' . $self->{'index_dir'}.'/index' .
235     ' "' . $query . '"'.
236     ' |';
237     print STDERR "## search: $open_cmd\n" if ($self->{'debug'});
238 dpavlin 8
239 dpavlin 10 open(SEARCH, $open_cmd) || confess "can't start $open_cmd: $!";
240 dpavlin 16 my $l;
241     while($l = <SEARCH>) {
242     next if ($l =~ /^#/);
243     chomp($l);
244     print STDERR "## $l\n" if ($self->{'debug'});
245     my ($rank,$path,$size,$title) = split(/ /,$l,4);
246     $path =~ s#^\./##; # strip from path
247 dpavlin 8 push @results, {
248     rank => $rank,
249     path => $path,
250     size => $size,
251     title => $title,
252     }
253     }
254    
255     close(SEARCH) || confess "can't close search";
256    
257     #print STDERR "## results: ",Dump(@results),"\n" if ($self->{'debug'});
258    
259     return @results;
260     }
261    
262 dpavlin 16 =head2 property
263    
264     Return stored meta property from result or result path.
265    
266     print $i->property('path', 'title');
267     print $i->property($res->{'path'}, 'title');
268    
269     =cut
270    
271     sub property {
272     my $self = shift;
273    
274     my ($path,$meta) = @_;
275    
276     if ($path =~ m/^HASH/) {
277     $path = $path->{'path'} || confess "can't find path in input data";
278     }
279    
280     my $val = $self->{'meta_db'}->{"$path-$meta"};
281    
282     print STDERR "## property $path-$meta: ",($val || 'undef'),"\n" if ($self->{'debug'});
283     return $val;
284     }
285    
286 dpavlin 13 =head2 finish_update
287    
288     This method will close index.
289    
290     $i->finish_update;
291    
292     It will be called on DESTROY when $i goes out of scope.
293    
294     =cut
295    
296     sub finish_update {
297     my $self = shift;
298    
299 dpavlin 14 print STDERR "## finish_update\n" if ($self->{'debug'});
300    
301 dpavlin 16 $self->_close_index && $self->_untie_meta_db;
302 dpavlin 13 }
303    
304     sub DESTROY {
305     my $self = shift;
306     $self->finish_update;
307     }
308    
309 dpavlin 4 =head1 PRIVATE METHODS
310    
311     Private methods implement internals for creating temporary file needed for
312     swish++. You should have no need to call them directly, and they are here
313     just to have documentation.
314    
315 dpavlin 9 =head2 _init_indexer
316 dpavlin 4
317     Create temporary directory in which files for indexing will be created and
318     start index process.
319    
320 dpavlin 9 my $i->_init_indexer || die "can't start indexer";
321 dpavlin 4
322 dpavlin 9 It will also create empty file C<_stopwords_> to disable stop words.
323    
324 dpavlin 4 =cut
325    
326 dpavlin 9 sub _init_indexer {
327 dpavlin 4 my $self = shift;
328    
329 dpavlin 14 return if ($self->{'_index_fh'});
330 dpavlin 4
331 dpavlin 14 my $tmp_dir = tempdir( CLEANUP => 1 ) || confess "can't create temporary directory: $!";
332     $self->{'tmp_dir'} = $tmp_dir;
333 dpavlin 9
334 dpavlin 14 chdir $tmp_dir || confess "can't chdir to ".$tmp_dir.": $!";
335    
336     print STDERR "## tmp_dir: $tmp_dir" if ($self->{'debug'});
337    
338 dpavlin 13 my $opt = "-v " . ($self->{'debug'} || '0');
339 dpavlin 4
340 dpavlin 9 unless ($self->{'use_stopwrods'}) {
341 dpavlin 10 open(STOP, '>', "_stopwords_") || carp "can't create empty stopword file, skipping\n";
342 dpavlin 9 print STOP " ";
343     close(STOP);
344     $opt .= " -s _stopwords_";
345     }
346    
347 dpavlin 16 my $index_dir = $self->{'index_dir'} || confess "no index_dir?";
348 dpavlin 4
349 dpavlin 16 my $open_cmd = '| '.$self->{'index'}.' '.$opt.' -e "html:*" -i '.$index_dir.'/index -';
350    
351 dpavlin 14 print STDERR "## init_indexer: $open_cmd\n" if ($self->{'debug'});
352 dpavlin 4
353 dpavlin 14 open($self->{'_index_fh'}, $open_cmd) || confess "can't start index with $open_cmd: $!";
354 dpavlin 4
355 dpavlin 14 chdir $self->{'cwd'} || confess "can't chdir to ".$self->{'cwd'}.": $!";
356 dpavlin 9
357 dpavlin 16 $self->_tie_meta_db(DB_CREATE);
358    
359 dpavlin 14 return $self->{'_index_fh'};
360 dpavlin 4 }
361    
362 dpavlin 16 =head2 _tie_meta_db
363    
364     Open BerkeleyDB database with meta properties.
365    
366     $i->_tie_meta_db(DB_CREATE);
367     $i->_tie_meta_db(DB_RDONLY);
368    
369     }
370    
371     =cut
372    
373     sub _tie_meta_db {
374     my $self = shift;
375    
376     my $flags = shift || confess "need DB_CREATE or DB_RDONLY";
377    
378     return if ($self->{'_meta_db_flags'} && $self->{'_meta_db_flags'} == $flags);
379    
380     print STDERR "## _tie_meta_db($flags)\n" if ($self->{'debug'});
381    
382     $self->_untie_meta_db;
383     $self->{'_meta_db_flags'} = $flags;
384    
385     my $file = $self->{'index_dir'}.'/meta.db';
386    
387     tie %{$self->{'meta_db'}}, "BerkeleyDB::Hash",
388     -Filename => $file,
389     -Flags => $flags
390     or confess "cannot open $file: $! $BerkeleyDB::Error\n" ;
391    
392     return 1;
393     }
394    
395     =head2 _untie_meta_db
396    
397     Close BerkeleyDB database with meta properties.
398    
399     $i->_untie_meta_db
400    
401     =cut
402    
403     sub _untie_meta_db {
404     my $self = shift;
405    
406     return unless ($self->{'meta_db'});
407    
408     print STDERR "## _untie_meta_db\n" if ($self->{'debug'});
409     untie %{$self->{'meta_db'}} || confess "can't untie!";
410     undef $self->{'meta_db'};
411     undef $self->{'_meta_db_flags'};
412    
413     return 1;
414     }
415    
416 dpavlin 4 =head2 _create_doc
417    
418     Create temporary file and pass it's name to swish++
419    
420     $i->_create_doc(
421     path => 'path/to/store/in/index',
422 dpavlin 9 title => 'this is title in results',
423 dpavlin 4 body => 'data to story in body tag',
424     meta => {
425     'meta name' => 'data for this meta',
426     'another' => 'again more data',
427     }
428     );
429    
430 dpavlin 8 To delete document, just omit body and meta data.
431    
432 dpavlin 4 =cut
433    
434     sub _create_doc {
435     my $self = shift;
436    
437     my $arg = {@_};
438    
439     # open indexer if needed
440 dpavlin 14 $self->_init_indexer;
441 dpavlin 4
442     my $path = $self->{'tmp_dir'} || confess "no tmp_dir?";
443 dpavlin 16 my $id = $arg->{'path'} || confess "no path?";
444     $path .= "/$id";
445 dpavlin 4
446 dpavlin 14 print STDERR "## _create_doc: $path\n" if ($self->{'debug'});
447 dpavlin 4
448 dpavlin 14 open(TMP, '>', $path) || die "can't create temp file $path: $!";
449    
450 dpavlin 9 print TMP '<html><head>';
451 dpavlin 4
452 dpavlin 9 $arg->{'body'} ||= '';
453    
454 dpavlin 4 if ($arg->{'meta'}) {
455 dpavlin 11 foreach my $name (keys %{$arg->{'meta'}}) {
456     my $content = $arg->{'meta'}->{$name};
457     print TMP qq{<meta name="$name" content="$content">};
458     $arg->{'body'} .= " $content" if ($self->{'meta_in_body'});
459 dpavlin 16 $self->{'meta_db'}->{"$id-$name"} = $content;
460 dpavlin 11 }
461 dpavlin 4 }
462 dpavlin 9
463 dpavlin 16 my $title = $arg->{'title'};
464     if (defined($title)) {
465     print TMP "<title>$title</title>";
466     $arg->{'body'} .= " $title" if ($self->{'meta_in_body'});
467     $self->{'meta_db'}->{"$id-title"} = $title;
468 dpavlin 9 }
469    
470     print TMP '</head><body>' . $arg->{'body'} . '</body></html>';
471 dpavlin 4
472     close(TMP) || confess "can't close tmp file ".$arg->{'path'}.": $!";
473    
474 dpavlin 16 print { $self->{'_index_fh'} } "$id\n";
475 dpavlin 4 }
476    
477 dpavlin 8 =head2 _close_index
478    
479     Close index after indexing.
480    
481     $i->_close_index;
482    
483     You have to close index before searching.
484    
485     =cut
486    
487     sub _close_index {
488     my $self = shift;
489    
490 dpavlin 14 return unless ($self->{'_index_fh'});
491 dpavlin 8
492     print STDERR "## close index\n" if ($self->{'debug'});
493    
494 dpavlin 16 close($self->{'_index_fh'}) || confess "can't close index: $!";
495 dpavlin 14 undef $self->{'_index_fh'};
496 dpavlin 16
497     return 1;
498 dpavlin 8 }
499    
500 dpavlin 1 1;
501     __END__
502    
503     =head2 EXPORT
504    
505     None by default.
506    
507 dpavlin 3 =head1 RELATED
508    
509     =head2 Debian
510    
511     Debian version of swish++ is often old (version 5 at moment of this writing
512     while version 6 is available in source code), so this module by default
513     uses executable names B<index> and B<search> for self-compiled version
514 dpavlin 10 instead of one from Debian package. See L<new> how to specify Debian
515 dpavlin 3 default binaries B<index++> and B<search++>.
516    
517 dpavlin 5 =head2 SWISH++
518 dpavlin 1
519 dpavlin 5 Aside from very good rewrite in C++, SWISH++ is fatster because it has
520     claver heuristics about which data in input files are words to index and
521     which are not. It's based on English language and might be best choice if
522     you plan to install large amount of long text documents.
523 dpavlin 1
524 dpavlin 5 However, if you plan to index all data from structured storage (e.g. RDBMS)
525     you might want B<all> words from data to end up in index as opposed to just
526     those which look like English words. This is especially important if you
527     don't plan to index English texts with this module.
528 dpavlin 1
529 dpavlin 5 With distribution build versions of SWISH++ you might have problems with
530     disepearing words. To overcome this problem, you will have to compile and
531     configure SWISH++ yourself (because language characteristics are
532     compilation-time option).
533 dpavlin 1
534 dpavlin 5 Compilation of SWISH++ is easy process well described on project's web
535     pages. To see my very relaxed sample configuration take a look at C<swish++>
536     directory included in distribution.
537    
538 dpavlin 11 =head2 SWISH++ config
539    
540     C<config.h> located in C<swish++> directory of this distribution is relaxed
541     SWISH++ configuration that will index all words passed to it. This
542     configuration is needed for B<date test> because default configuration
543     doesn't recognize 2004-12-05 as date. Have in mind that your index size
544     might explode.
545    
546 dpavlin 5 =head1 SEE ALSO
547    
548     C<swish++> web site L<http://homepage.mac.com/pauljlucas/software/swish/>
549    
550 dpavlin 1 =head1 AUTHOR
551    
552 dpavlin 5 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
553 dpavlin 1
554     =head1 COPYRIGHT AND LICENSE
555    
556     Copyright (C) 2004 by Dobrica Pavlinusic
557    
558     This library is free software; you can redistribute it and/or modify
559     it under the same terms as Perl itself, either Perl version 5.8.4 or,
560     at your option, any later version of Perl 5 you may have available.
561    
562    
563     =cut

  ViewVC Help
Powered by ViewVC 1.1.26