/[SWISH-PlusPlus]/trunk/PlusPlus.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/PlusPlus.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 21 - (hide annotations)
Sun Dec 5 22:24:09 2004 UTC (19 years, 4 months ago) by dpavlin
File size: 13703 byte(s)
documentation update

1 dpavlin 1 package SWISH::PlusPlus;
2    
3     use 5.008004;
4     use strict;
5     use warnings;
6    
7 dpavlin 16 our $VERSION = '0.10';
8 dpavlin 1
9     use Carp;
10 dpavlin 4 use File::Temp qw/ tempdir /;
11 dpavlin 16 use BerkeleyDB;
12 dpavlin 8 #use YAML;
13 dpavlin 1
14     =head1 NAME
15    
16 dpavlin 21 SWISH::PlusPlus - Perl extension for full-text indexer SWISH++ with properties support
17 dpavlin 1
18     =head1 SYNOPSIS
19    
20     use SWISH::PlusPlus;
21     blah blah blah
22    
23     =head1 DESCRIPTION
24    
25     This is perl module to use SWISH++ indexer by Paul J. Lucas. SWISH++ is
26 dpavlin 21 rewrite of swish-e in C++ which is extremely fast (due to mmap usage and
27     clever language heuristics), but without support for properties (which this
28     module tries to fix).
29 dpavlin 1
30 dpavlin 21 Implementation of API is something in-between C<SWISH::API> and
31     C<Plucene::Simple>. It should be easy to replace Plucene or swish-e with
32     this module for increased performance. However, this module is not plug-in
33     replacement.
34 dpavlin 3
35 dpavlin 1 =head1 METHODS
36    
37 dpavlin 10 =head2 new
38 dpavlin 1
39 dpavlin 21 Create new instance for index.
40 dpavlin 1
41 dpavlin 10 my $i = SWISH::PlusPlus->new(
42 dpavlin 3 index_dir => '/path/to/index',
43     index => 'index++',
44     search => 'search++',
45 dpavlin 8 debug => 1,
46 dpavlin 9 meta_in_body => 1,
47     use_stopwords => 1,
48 dpavlin 1 );
49    
50 dpavlin 21 Options are described below:
51 dpavlin 1
52     =over 5
53    
54 dpavlin 3 =item C<index_dir>
55    
56 dpavlin 21 Path to directory in which index and meta database will be created.
57 dpavlin 3
58 dpavlin 1 =item C<index>
59    
60 dpavlin 3 Full or partial path to SWISH++ index executable. By default, it's B<index>
61     for self-compiled version. If you use Debian GNU/Linux package specify
62     B<index++>. See C<Debian>.
63 dpavlin 1
64 dpavlin 3 =item C<search>
65    
66     Full or partial path to SWISH++ search executable. By default, it's B<search>.
67    
68 dpavlin 8 =item C<debug>
69    
70     This option (off by default) will produce a lot of debugging output on
71     C<STDERR> prefixed by C<##>.
72    
73 dpavlin 9 =item C<meta_in_body>
74    
75     This option (off by default) enables to search content of meta fields
76 dpavlin 21 without specifying them (like they are in body of document). This will
77     somewhat increase index size.
78 dpavlin 9
79     =item C<use_stopwords>
80    
81     Use built-in SWISH++ stop words. By default, they are disabled.
82    
83 dpavlin 1 =back
84    
85     =cut
86    
87 dpavlin 10 sub new {
88 dpavlin 1 my $class = shift;
89     my $self = {@_};
90     bless($self, $class);
91    
92 dpavlin 3 foreach (qw(index_dir)) {
93 dpavlin 1 croak "need $_" unless $self->{$_};
94     }
95    
96 dpavlin 13 my $index_dir = $self->{'index_dir'};
97    
98 dpavlin 14 my $cwd;
99     chomp($cwd = `pwd`);
100     $self->{'cwd'} = $cwd || carp "can't get cwd!";
101    
102 dpavlin 13 if ($index_dir !~ m#^/#) {
103     $index_dir = "$cwd/$index_dir";
104     print STDERR "## full path to index_dir: $index_dir\n" if ($self->{'debug'});
105     $self->{'index_dir'} = $index_dir;
106 dpavlin 1 }
107    
108 dpavlin 13 if (! -e $index_dir) {
109     mkdir $index_dir || confess "can't create index ",$self->{'index'},": $!";
110     }
111    
112 dpavlin 3 # default executables
113     $self->{'index'} ||= 'index';
114     $self->{'search'} ||= 'search';
115    
116 dpavlin 13 print STDERR "## new index_dir: ",$index_dir," index: ",$self->{'index'}, " search: ",$self->{'search'},"\n" if ($self->{'debug'});
117 dpavlin 8
118 dpavlin 1 $self ? return $self : return undef;
119     }
120    
121    
122 dpavlin 3 =head2 check_bin
123    
124 dpavlin 21 Check if SWISH++ binaries specified in L<new> are available and verify
125 dpavlin 3 version signature.
126    
127     if ($i->check_bin) {
128     print "swish++ binaries found\n";
129     };
130    
131     It will also setup property
132    
133     $i->{'version'}
134    
135 dpavlin 21 which you can examined to see numeric version (something like C<6.0.4>).
136 dpavlin 3
137     =cut
138    
139     sub check_bin {
140     my $self = shift;
141    
142     my $i = `$self->{'index'} -V 2>&1` || confess "can't find '",$self->{'index'},"' binary";
143     my $s = `$self->{'search'} -V 2>&1` || confess "can't find '",$self->{'search'},"' binary";
144    
145     chomp $i;
146     chomp $s;
147    
148     confess $self->{'index'}," binary is not SWISH++" unless ($i =~ m/^SWISH\+\+/);
149     confess $self->{'search'}," binary is not SWISH++" unless ($s =~ m/^SWISH\+\+/);
150    
151     if ($i eq $s) {
152 dpavlin 14 $i =~ s/^SWISH\+\+\s+// || confess "can't strip SWISH++ from version";
153 dpavlin 3 $self->{'version'} = $i;
154     return 1;
155     } else {
156     carp "version difference: index is $i while search is $s";
157     return;
158     }
159    
160     }
161    
162 dpavlin 4 =head2 index_document
163    
164     Quick way to add simple data to index.
165    
166 dpavlin 21 $i->index_document($path, $data);
167 dpavlin 4 $i->index_document( 42 => 'meaning of life' );
168    
169 dpavlin 21 C<$path> value is really path, so you don't want to use directory
170     separators (slashes, /) in it probably.
171    
172 dpavlin 4 =cut
173    
174     sub index_document {
175     my $self = shift;
176    
177     my %doc = @_;
178    
179     foreach my $id (keys %doc) {
180     $self->_create_doc(
181     path => $id,
182     body => $doc{$id},
183     );
184     }
185    
186     return 1;
187     }
188    
189 dpavlin 9 =head2 add
190    
191 dpavlin 21 Add document with meta-data to index.
192 dpavlin 9
193     $i->add(
194     path => 'path/to/document',
195     title => 'this is result title',
196     meta => {
197     description => 'this is description meta tag',
198     date => '2004-11-04',
199     author => 'Dobrica Pavlinusic',
200     }
201     body => 'this is text without meta data',
202     );
203    
204     This is thin wrapper round L<_create_doc>.
205    
206     =cut
207    
208     sub add {
209     my $self = shift;
210    
211     $self->_create_doc(@_);
212    
213     return 1;
214     }
215 dpavlin 21
216 dpavlin 8 =head2 search
217    
218 dpavlin 21 Search your index using any valid SWISH++ query.
219 dpavlin 8
220 dpavlin 21 my @results = $i->search("swish query");
221 dpavlin 8
222 dpavlin 21 Returns array with elements like this:
223 dpavlin 8
224 dpavlin 21 {
225     rank => 10, # rank of result
226     path => 'path to result', # path to result
227     size => 999, # size in bytes
228     title => 'title of result' # title meta property
229     }
230    
231 dpavlin 8 =cut
232    
233     sub search {
234     my $self = shift;
235    
236     my $query = shift || return;
237    
238 dpavlin 14 $self->finish_update;
239 dpavlin 16 $self->_tie_meta_db(DB_RDONLY);
240 dpavlin 8
241     my @results;
242    
243     # escape double quotes in query for shell
244     $query =~ s/"/\\"/g;
245    
246 dpavlin 16 my $open_cmd = $self->{'search'} .
247     ' -i ' . $self->{'index_dir'}.'/index' .
248     ' "' . $query . '"'.
249     ' |';
250     print STDERR "## search: $open_cmd\n" if ($self->{'debug'});
251 dpavlin 8
252 dpavlin 10 open(SEARCH, $open_cmd) || confess "can't start $open_cmd: $!";
253 dpavlin 16 my $l;
254     while($l = <SEARCH>) {
255     next if ($l =~ /^#/);
256     chomp($l);
257     print STDERR "## $l\n" if ($self->{'debug'});
258     my ($rank,$path,$size,$title) = split(/ /,$l,4);
259     $path =~ s#^\./##; # strip from path
260 dpavlin 8 push @results, {
261     rank => $rank,
262     path => $path,
263     size => $size,
264     title => $title,
265     }
266     }
267    
268     close(SEARCH) || confess "can't close search";
269    
270     #print STDERR "## results: ",Dump(@results),"\n" if ($self->{'debug'});
271    
272     return @results;
273     }
274    
275 dpavlin 16 =head2 property
276    
277     Return stored meta property from result or result path.
278    
279     print $i->property('path', 'title');
280     print $i->property($res->{'path'}, 'title');
281    
282     =cut
283    
284     sub property {
285     my $self = shift;
286    
287     my ($path,$meta) = @_;
288    
289     if ($path =~ m/^HASH/) {
290     $path = $path->{'path'} || confess "can't find path in input data";
291     }
292    
293     my $val = $self->{'meta_db'}->{"$path-$meta"};
294    
295     print STDERR "## property $path-$meta: ",($val || 'undef'),"\n" if ($self->{'debug'});
296     return $val;
297     }
298    
299 dpavlin 13 =head2 finish_update
300    
301 dpavlin 21 This method will close index binary and enable search. Searching is not
302     available while indexing is in process.
303 dpavlin 13
304     $i->finish_update;
305    
306 dpavlin 21 Usually, you don't need to call this method directly. It will be called on
307     DESTROY when $i goes out of scope or when you first call search in session
308     if indexing was started.
309 dpavlin 13
310     =cut
311    
312     sub finish_update {
313     my $self = shift;
314    
315 dpavlin 14 print STDERR "## finish_update\n" if ($self->{'debug'});
316    
317 dpavlin 16 $self->_close_index && $self->_untie_meta_db;
318 dpavlin 13 }
319    
320     sub DESTROY {
321     my $self = shift;
322     $self->finish_update;
323     }
324    
325 dpavlin 4 =head1 PRIVATE METHODS
326    
327 dpavlin 21 Private methods implement internals for creating temporary files needed for
328     SWISH++. You should have no need to call them directly, and they are here
329 dpavlin 4 just to have documentation.
330    
331 dpavlin 9 =head2 _init_indexer
332 dpavlin 4
333     Create temporary directory in which files for indexing will be created and
334     start index process.
335    
336 dpavlin 9 my $i->_init_indexer || die "can't start indexer";
337 dpavlin 4
338 dpavlin 9 It will also create empty file C<_stopwords_> to disable stop words.
339    
340 dpavlin 4 =cut
341    
342 dpavlin 9 sub _init_indexer {
343 dpavlin 4 my $self = shift;
344    
345 dpavlin 14 return if ($self->{'_index_fh'});
346 dpavlin 4
347 dpavlin 14 my $tmp_dir = tempdir( CLEANUP => 1 ) || confess "can't create temporary directory: $!";
348     $self->{'tmp_dir'} = $tmp_dir;
349 dpavlin 9
350 dpavlin 14 chdir $tmp_dir || confess "can't chdir to ".$tmp_dir.": $!";
351    
352     print STDERR "## tmp_dir: $tmp_dir" if ($self->{'debug'});
353    
354 dpavlin 13 my $opt = "-v " . ($self->{'debug'} || '0');
355 dpavlin 4
356 dpavlin 9 unless ($self->{'use_stopwrods'}) {
357 dpavlin 10 open(STOP, '>', "_stopwords_") || carp "can't create empty stopword file, skipping\n";
358 dpavlin 9 print STOP " ";
359     close(STOP);
360     $opt .= " -s _stopwords_";
361     }
362    
363 dpavlin 16 my $index_dir = $self->{'index_dir'} || confess "no index_dir?";
364 dpavlin 4
365 dpavlin 16 my $open_cmd = '| '.$self->{'index'}.' '.$opt.' -e "html:*" -i '.$index_dir.'/index -';
366    
367 dpavlin 14 print STDERR "## init_indexer: $open_cmd\n" if ($self->{'debug'});
368 dpavlin 4
369 dpavlin 14 open($self->{'_index_fh'}, $open_cmd) || confess "can't start index with $open_cmd: $!";
370 dpavlin 4
371 dpavlin 14 chdir $self->{'cwd'} || confess "can't chdir to ".$self->{'cwd'}.": $!";
372 dpavlin 9
373 dpavlin 16 $self->_tie_meta_db(DB_CREATE);
374    
375 dpavlin 14 return $self->{'_index_fh'};
376 dpavlin 4 }
377    
378     =head2 _create_doc
379    
380 dpavlin 21 Create temporary file and pass it's name to SWISH++
381 dpavlin 4
382     $i->_create_doc(
383     path => 'path/to/store/in/index',
384 dpavlin 9 title => 'this is title in results',
385 dpavlin 4 body => 'data to story in body tag',
386     meta => {
387     'meta name' => 'data for this meta',
388     'another' => 'again more data',
389     }
390     );
391    
392 dpavlin 8 To delete document, just omit body and meta data.
393    
394 dpavlin 4 =cut
395    
396     sub _create_doc {
397     my $self = shift;
398    
399     my $arg = {@_};
400    
401     # open indexer if needed
402 dpavlin 14 $self->_init_indexer;
403 dpavlin 4
404     my $path = $self->{'tmp_dir'} || confess "no tmp_dir?";
405 dpavlin 16 my $id = $arg->{'path'} || confess "no path?";
406     $path .= "/$id";
407 dpavlin 4
408 dpavlin 14 print STDERR "## _create_doc: $path\n" if ($self->{'debug'});
409 dpavlin 4
410 dpavlin 14 open(TMP, '>', $path) || die "can't create temp file $path: $!";
411    
412 dpavlin 9 print TMP '<html><head>';
413 dpavlin 4
414 dpavlin 9 $arg->{'body'} ||= '';
415    
416 dpavlin 4 if ($arg->{'meta'}) {
417 dpavlin 11 foreach my $name (keys %{$arg->{'meta'}}) {
418     my $content = $arg->{'meta'}->{$name};
419     print TMP qq{<meta name="$name" content="$content">};
420     $arg->{'body'} .= " $content" if ($self->{'meta_in_body'});
421 dpavlin 16 $self->{'meta_db'}->{"$id-$name"} = $content;
422 dpavlin 11 }
423 dpavlin 4 }
424 dpavlin 9
425 dpavlin 16 my $title = $arg->{'title'};
426     if (defined($title)) {
427     print TMP "<title>$title</title>";
428     $arg->{'body'} .= " $title" if ($self->{'meta_in_body'});
429     $self->{'meta_db'}->{"$id-title"} = $title;
430 dpavlin 9 }
431    
432     print TMP '</head><body>' . $arg->{'body'} . '</body></html>';
433 dpavlin 4
434     close(TMP) || confess "can't close tmp file ".$arg->{'path'}.": $!";
435    
436 dpavlin 16 print { $self->{'_index_fh'} } "$id\n";
437 dpavlin 4 }
438    
439 dpavlin 8 =head2 _close_index
440    
441     Close index after indexing.
442    
443     $i->_close_index;
444    
445     You have to close index before searching.
446    
447     =cut
448    
449     sub _close_index {
450     my $self = shift;
451    
452 dpavlin 14 return unless ($self->{'_index_fh'});
453 dpavlin 8
454     print STDERR "## close index\n" if ($self->{'debug'});
455    
456 dpavlin 16 close($self->{'_index_fh'}) || confess "can't close index: $!";
457 dpavlin 14 undef $self->{'_index_fh'};
458 dpavlin 16
459     return 1;
460 dpavlin 8 }
461    
462 dpavlin 21 =head2 _tie_meta_db
463    
464     Open BerkeleyDB database with meta properties.
465    
466     $i->_tie_meta_db(DB_CREATE);
467     $i->_tie_meta_db(DB_RDONLY);
468    
469     }
470    
471     =cut
472    
473     sub _tie_meta_db {
474     my $self = shift;
475    
476     my $flags = shift || confess "need DB_CREATE or DB_RDONLY";
477    
478     return if ($self->{'_meta_db_flags'} && $self->{'_meta_db_flags'} == $flags);
479    
480     print STDERR "## _tie_meta_db($flags)\n" if ($self->{'debug'});
481    
482     $self->_untie_meta_db;
483     $self->{'_meta_db_flags'} = $flags;
484    
485     my $file = $self->{'index_dir'}.'/meta.db';
486    
487     tie %{$self->{'meta_db'}}, "BerkeleyDB::Hash",
488     -Filename => $file,
489     -Flags => $flags
490     or confess "cannot open $file: $! $BerkeleyDB::Error\n" ;
491    
492     return 1;
493     }
494    
495     =head2 _untie_meta_db
496    
497     Close BerkeleyDB database with meta properties.
498    
499     $i->_untie_meta_db;
500    
501     =cut
502    
503     sub _untie_meta_db {
504     my $self = shift;
505    
506     return unless ($self->{'meta_db'});
507    
508     print STDERR "## _untie_meta_db\n" if ($self->{'debug'});
509     untie %{$self->{'meta_db'}} || confess "can't untie!";
510     undef $self->{'meta_db'};
511     undef $self->{'_meta_db_flags'};
512    
513     return 1;
514     }
515    
516 dpavlin 1 1;
517     __END__
518    
519     =head2 EXPORT
520    
521     None by default.
522    
523 dpavlin 3 =head1 RELATED
524    
525     =head2 Debian
526    
527 dpavlin 21 Debian version of SWISH++ is often old (version 5 at moment of this writing
528 dpavlin 3 while version 6 is available in source code), so this module by default
529     uses executable names B<index> and B<search> for self-compiled version
530 dpavlin 10 instead of one from Debian package. See L<new> how to specify Debian
531 dpavlin 3 default binaries B<index++> and B<search++>.
532    
533 dpavlin 5 =head2 SWISH++
534 dpavlin 1
535 dpavlin 21 Aside from very good rewrite in C++, SWISH++ is faster because it uses
536 dpavlin 5 claver heuristics about which data in input files are words to index and
537     which are not. It's based on English language and might be best choice if
538 dpavlin 21 you plan to index large amount of long text documents.
539 dpavlin 1
540 dpavlin 5 However, if you plan to index all data from structured storage (e.g. RDBMS)
541     you might want B<all> words from data to end up in index as opposed to just
542     those which look like English words. This is especially important if you
543     don't plan to index English texts with this module.
544 dpavlin 1
545 dpavlin 5 With distribution build versions of SWISH++ you might have problems with
546 dpavlin 21 disapearing words. To overcome this problem, you will have to compile and
547 dpavlin 5 configure SWISH++ yourself (because language characteristics are
548     compilation-time option).
549 dpavlin 1
550 dpavlin 5 Compilation of SWISH++ is easy process well described on project's web
551     pages. To see my very relaxed sample configuration take a look at C<swish++>
552     directory included in distribution.
553    
554 dpavlin 11 =head2 SWISH++ config
555    
556     C<config.h> located in C<swish++> directory of this distribution is relaxed
557     SWISH++ configuration that will index all words passed to it. This
558     configuration is needed for B<date test> because default configuration
559     doesn't recognize 2004-12-05 as date. Have in mind that your index size
560     might explode.
561    
562 dpavlin 21 =head1 BUGS
563    
564     Currently there is no way to specify which meta data will be stored as
565     properties. B<This will be fixed very soon>.
566    
567     There is no garbage collection on temporary files created for SWISH++. This
568     means that one run of indexer will take additional disk space for temporary
569     files, which will be removed at end. There should be some way to remove
570     files after they are indexed by SWISH++. However, at this early stage of
571     development it's just not supported yet. Have plenty of disk space!
572    
573 dpavlin 5 =head1 SEE ALSO
574    
575 dpavlin 21 SWISH++ web site L<http://homepage.mac.com/pauljlucas/software/swish/>
576 dpavlin 5
577 dpavlin 1 =head1 AUTHOR
578    
579 dpavlin 5 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
580 dpavlin 1
581     =head1 COPYRIGHT AND LICENSE
582    
583     Copyright (C) 2004 by Dobrica Pavlinusic
584    
585     This library is free software; you can redistribute it and/or modify
586     it under the same terms as Perl itself, either Perl version 5.8.4 or,
587     at your option, any later version of Perl 5 you may have available.
588    
589    
590     =cut

  ViewVC Help
Powered by ViewVC 1.1.26