/[SWISH-PlusPlus]/trunk/PlusPlus.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/PlusPlus.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1 by dpavlin, Fri Dec 3 13:31:43 2004 UTC revision 16 by dpavlin, Sun Dec 5 21:06:48 2004 UTC
# Line 4  use 5.008004; Line 4  use 5.008004;
4  use strict;  use strict;
5  use warnings;  use warnings;
6    
7  our $VERSION = '0.01';  our $VERSION = '0.10';
8    
9  use Carp;  use Carp;
10    use File::Temp qw/ tempdir /;
11    use BerkeleyDB;
12    #use YAML;
13    
14  =head1 NAME  =head1 NAME
15    
# Line 20  SWISH::PlusPlus - Perl extension SWISH++ Line 23  SWISH::PlusPlus - Perl extension SWISH++
23  =head1 DESCRIPTION  =head1 DESCRIPTION
24    
25  This is perl module to use SWISH++ indexer by Paul J. Lucas. SWISH++ is  This is perl module to use SWISH++ indexer by Paul J. Lucas. SWISH++ is
26  rewrite of swish-e in C++ with blazingly fast performance, but without  rewrite of swish-e in C++ which is extremely fast (thanks to mmap), but without
27  support for properties (which this module tries to fix)  support for properties (which this module tries to fix).
28    
29    Implementation of this module is crafted after L<Plucene::Simple> and it
30    should be easy to replace Plucene with this module for increased
31    performance. However, this module is not plug-in replacement.
32    
33  =head1 METHODS  =head1 METHODS
34    
35  =head2 open  =head2 new
36    
37  Create new indexing object.  Create new indexing object.
38    
39    my $i = new SWISH::PlusPlus(    my $i = SWISH::PlusPlus->new(
40          index => '/path/to/index',          index_dir => '/path/to/index',
41            index => 'index++',
42            search => 'search++',
43            debug => 1,
44            meta_in_body => 1,
45            use_stopwords => 1,
46    );    );
47    
48  Options to open are following:  Options to new are following:
49    
50  =over 5  =over 5
51    
52    =item C<index_dir>
53    
54    Path to directory in which index will be created.
55    
56  =item C<index>  =item C<index>
57    
58  path to directory in which index will be created.  Full or partial path to SWISH++ index executable. By default, it's B<index>
59    for self-compiled version. If you use Debian GNU/Linux package specify
60    B<index++>. See C<Debian>.
61    
62    =item C<search>
63    
64    Full or partial path to SWISH++ search executable. By default, it's B<search>.
65    
66    =item C<debug>
67    
68    This option (off by default) will produce a lot of debugging output on
69    C<STDERR> prefixed by C<##>.
70    
71    =item C<meta_in_body>
72    
73    This option (off by default) enables searching the content of meta fields
74    without specifying them (as if they were in the body of the document). This will
75    somewhat increase index size.
76    
77    =item C<use_stopwords>
78    
79    Use built-in SWISH++ stop words. By default, they are disabled.
80    
81  =back  =back
82    
# Line 50  sub new { Line 87  sub new {
87          my $self = {@_};          my $self = {@_};
88          bless($self, $class);          bless($self, $class);
89    
90          foreach (qw(index)) {          foreach (qw(index_dir)) {
91                  croak "need $_" unless $self->{$_};                  croak "need $_" unless $self->{$_};
92          }          }
93    
94          if (! -e $self->{'index'}) {          my $index_dir = $self->{'index_dir'};
95                  mkdir $self->{'index'} || confess "can't create index ",$self->{'index'},": $!";  
96            my $cwd;
97            chomp($cwd = `pwd`);
98            $self->{'cwd'} = $cwd || carp "can't get cwd!";
99            
100            if ($index_dir !~ m#^/#) {
101                    $index_dir = "$cwd/$index_dir";
102                    print STDERR "## full path to index_dir: $index_dir\n" if ($self->{'debug'});
103                    $self->{'index_dir'} = $index_dir;
104          }          }
105    
106            if (! -e $index_dir) {
107                    mkdir $index_dir || confess "can't create index ",$self->{'index'},": $!";
108            }
109    
110            # default executables
111            $self->{'index'} ||= 'index';
112            $self->{'search'} ||= 'search';
113    
114            print STDERR "## new index_dir: ",$index_dir," index: ",$self->{'index'}, " search: ",$self->{'search'},"\n" if ($self->{'debug'});
115    
116          $self ? return $self : return undef;          $self ? return $self : return undef;
117  }  }
118    
119    
120    =head2 check_bin
121    
122    Check if swish++ binaries specified in L<new> are available and verify
123    version signature.
124    
125      if ($i->check_bin) {
126            print "swish++ binaries found\n";
127      };
128    
129    It will also setup property
130    
131      $i->{'version'}
132    
133    which you can examine to see version.
134    
135    =cut
136    
# Verify that the configured index and search executables can be run and
# that both report the same SWISH++ version banner.
#
# Each executable is invoked through the shell as "<binary> -V" with stderr
# merged into stdout. Confesses when either command produces no output or
# its output does not start with "SWISH++".
#
# Returns 1 and stores the banner (minus the leading "SWISH++ ") in
# $self->{'version'} when both banners are equal; otherwise carps about the
# mismatch and returns nothing.
sub check_bin {
	my ($self) = @_;

	my $index_bin  = $self->{'index'};
	my $search_bin = $self->{'search'};

	my $index_banner = `$index_bin -V 2>&1`;
	confess("can't find '" . $index_bin . "' binary") unless ($index_banner);

	my $search_banner = `$search_bin -V 2>&1`;
	confess("can't find '" . $search_bin . "' binary") unless ($search_banner);

	chomp($index_banner, $search_banner);

	# both tools must identify themselves as SWISH++
	if ($index_banner !~ m/^SWISH\+\+/) {
		confess($index_bin . " binary is not SWISH++");
	}
	if ($search_banner !~ m/^SWISH\+\+/) {
		confess($search_bin . " binary is not SWISH++");
	}

	if ($index_banner ne $search_banner) {
		carp "version difference: index is $index_banner while search is $search_banner";
		return;
	}

	# keep only the part after the "SWISH++ " prefix as the version
	(my $version = $index_banner) =~ s/^SWISH\+\+\s+//
		or confess "can't strip SWISH++ from version";
	$self->{'version'} = $version;

	return 1;
}
159    
160    =head2 index_document
161    
162    Quick way to add simple data to index.
163    
164      $i->index_document($key, $data);
165      $i->index_document( 42 => 'meaning of life' );
166    
167    =cut
168    
# Index plain key => text pairs: every key becomes a document path and its
# value the document body. Each pair is handed to _create_doc.
# Always returns 1.
sub index_document {
	my ($self, %body_of) = @_;

	$self->_create_doc( path => $_, body => $body_of{$_} ) for (keys %body_of);

	return 1;
}
183    
184    =head2 add
185    
186    Add document with metadata to index.
187    
188      $i->add(
189            path => 'path/to/document',
190            title => 'this is result title',
191            meta => {
192                    description => 'this is description meta tag',
193                    date => '2004-11-04',
194                    author => 'Dobrica Pavlinusic',
195            },
196            body => 'this is text without meta data',
197      );
198    
199    This is thin wrapper round L<_create_doc>.
200    
201    =cut
202    
# Add one document (path, title, meta, body) to the index by forwarding all
# arguments unchanged to _create_doc. Always returns 1.
sub add {
	my ($self, @doc) = @_;

	$self->_create_doc(@doc);

	return 1;
}
210    =head2 search
211    
212    Search your index.
213    
214      my @results = $i->search("swish query");
215    
216    Returns array with result IDs.
217    
218    =cut
219    
# Run the SWISH++ search executable against this index and parse its output.
#
# Takes a single query string; returns nothing when the query is empty.
# Before searching, pending index updates are finished and the meta
# database is re-tied read-only.
#
# Returns a list of hash references, one per result line, with the keys
# rank, path (leading "./" stripped), size and title. Confesses when the
# search command cannot be started or when closing its pipe fails.
sub search {
	my $self = shift;

	my $query = shift || return;

	$self->finish_update;
	$self->_tie_meta_db(DB_RDONLY);

	my @results;

	# The query is passed to the shell, so wrap it in single quotes and
	# escape embedded single quotes. Inside single quotes the shell
	# interprets nothing ($, backticks and backslashes stay literal).
	(my $quoted_query = $query) =~ s/'/'\\''/g;

	my $cmd = $self->{'search'} .
		' -i ' . $self->{'index_dir'}.'/index' .
		" '" . $quoted_query . "'";
	print STDERR "## search: $cmd |\n" if ($self->{'debug'});

	open(my $search_fh, '-|', $cmd) || confess "can't start $cmd: $!";
	while (my $line = <$search_fh>) {
		# lines starting with # are comments emitted by the search tool
		next if ($line =~ /^#/);
		chomp($line);
		print STDERR "## $line\n" if ($self->{'debug'});
		my ($rank,$path,$size,$title) = split(/ /,$line,4);
		# skip blank or malformed lines which carry no path
		next unless (defined($path));
		$path =~ s#^\./##; # strip from path
		push @results, {
			rank => $rank,
			path => $path,
			size => $size,
			title => $title,
		}
	}

	close($search_fh) || confess "can't close search";

	return @results;
}
261    
262    =head2 property
263    
264    Return stored meta property from result or result path.
265    
266      print $i->property('path', 'title');
267      print $i->property($res->{'path'}, 'title');
268    
269    =cut
270    
# Look up a stored meta property for a document.
#
# Arguments: a document path (or a result hash reference containing a
# 'path' key, as returned by search) and the property name.
# Returns the value stored in the meta database under "<path>-<name>",
# or undef when there is none. Confesses when a hash reference without a
# 'path' value is passed.
sub property {
	my $self = shift;

	my ($path,$meta) = @_;

	# Use ref() rather than matching the stringified value, so that a plain
	# path which merely starts with "HASH" is not mistaken for a reference.
	if (ref($path) eq 'HASH') {
		$path = $path->{'path'} || confess "can't find path in input data";
	}

	my $val = $self->{'meta_db'}->{"$path-$meta"};

	print STDERR "## property $path-$meta: ",(defined($val) ? $val : 'undef'),"\n" if ($self->{'debug'});
	return $val;
}
285    
286    =head2 finish_update
287    
288    This method will close index.
289    
290      $i->finish_update;
291    
292    It will be called on DESTROY when $i goes out of scope.
293    
294    =cut
295    
# Finish a round of indexing: close the indexer pipe and, only when that
# reported success, untie the meta database. Yields the value of that
# combined step.
sub finish_update {
	my ($self) = @_;

	if ($self->{'debug'}) {
		print STDERR "## finish_update\n";
	}

	return $self->_close_index && $self->_untie_meta_db;
}
303    
# Destructor: make sure the index is closed when the object goes away.
sub DESTROY {
	my $self = shift;

	# finish_update closes a pipe and may untie a database, which can
	# overwrite the status and error globals. Localize them so destroying
	# the object does not change $? (the program's exit status during
	# global destruction) or clobber $@ / $! seen by surrounding code.
	local($., $@, $!, $^E, $?);

	$self->finish_update;
}
308    
309    =head1 PRIVATE METHODS
310    
311    Private methods implement internals for creating temporary file needed for
312    swish++. You should have no need to call them directly, and they are here
313    just to have documentation.
314    
315    =head2 _init_indexer
316    
317    Create temporary directory in which files for indexing will be created and
318    start index process.
319    
320      $i->_init_indexer || die "can't start indexer";
321    
322    It will also create empty file C<_stopwords_> to disable stop words.
323    
324    =cut
325    
# Start the SWISH++ indexer process, unless it is already running.
#
# Creates a temporary directory (removed automatically by File::Temp),
# remembers it in $self->{'tmp_dir'}, and starts the index executable
# inside it, reading file names from standard input and writing to
# <index_dir>/index. Unless stop words are enabled, an empty stop word file
# is created and passed with -s. Afterwards the working directory is
# restored to $self->{'cwd'} and the meta database is tied with DB_CREATE.
#
# Returns the pipe handle to the indexer, or nothing when the indexer was
# already started. Confesses on any failure except creating the stop word
# file, which only carps.
sub _init_indexer {
	my $self = shift;

	return if ($self->{'_index_fh'});

	my $tmp_dir = tempdir( CLEANUP => 1 ) || confess "can't create temporary directory: $!";
	$self->{'tmp_dir'} = $tmp_dir;

	# indexer is started from within tmp_dir so that document names
	# written to it are relative to that directory
	chdir $tmp_dir || confess "can't chdir to ".$tmp_dir.": $!";

	print STDERR "## tmp_dir: $tmp_dir\n" if ($self->{'debug'});

	my $opt = "-v " . ($self->{'debug'} || '0');

	# 'use_stopwords' is the documented option; the misspelled key
	# 'use_stopwrods' that earlier code checked is still honoured.
	unless ($self->{'use_stopwords'} || $self->{'use_stopwrods'}) {
		if (open(my $stop_fh, '>', "_stopwords_")) {
			print $stop_fh "  ";
			close($stop_fh);
			$opt .= " -s _stopwords_";
		} else {
			# without the file, -s must not be passed
			carp "can't create empty stopword file, skipping\n";
		}
	}

	my $index_dir = $self->{'index_dir'} || confess "no index_dir?";

	my $cmd = $self->{'index'}.' '.$opt.' -e "html:*" -i '.$index_dir.'/index -';

	print STDERR "## init_indexer: | $cmd\n" if ($self->{'debug'});

	open($self->{'_index_fh'}, '|-', $cmd) || confess "can't start index with | $cmd: $!";

	chdir $self->{'cwd'} || confess "can't chdir to ".$self->{'cwd'}.": $!";

	$self->_tie_meta_db(DB_CREATE);

	return $self->{'_index_fh'};
}
361    
362    =head2 _tie_meta_db
363    
364    Open BerkeleyDB database with meta properties.
365    
366      $i->_tie_meta_db(DB_CREATE);
367      $i->_tie_meta_db(DB_RDONLY);
368    
369    
370    
371    =cut
372    
# Tie $self->{'meta_db'} to the BerkeleyDB hash file <index_dir>/meta.db
# using the given open flags.
#
# Does nothing (returns nothing) when the database is already tied with the
# same flags; otherwise any existing tie is released first. Returns 1 on
# success and confesses when no flags are given or the tie fails.
sub _tie_meta_db  {
	my ($self, $flags) = @_;

	confess "need DB_CREATE or DB_RDONLY" unless ($flags);

	my $current_flags = $self->{'_meta_db_flags'};
	if ($current_flags && $current_flags == $flags) {
		return;
	}

	if ($self->{'debug'}) {
		print STDERR "## _tie_meta_db($flags)\n";
	}

	# drop the previous tie before re-opening with different flags
	$self->_untie_meta_db;
	$self->{'_meta_db_flags'} = $flags;

	my $db_file = $self->{'index_dir'}.'/meta.db';

	my $tied = tie(
		%{ $self->{'meta_db'} },
		"BerkeleyDB::Hash",
		-Filename => $db_file,
		-Flags    => $flags,
	);
	confess "cannot open $db_file: $! $BerkeleyDB::Error\n" unless ($tied);

	return 1;
}
394    
395    =head2 _untie_meta_db
396    
397    Close BerkeleyDB database with meta properties.
398    
399      $i->_untie_meta_db
400    
401    =cut
402    
# Release the meta database hash and forget the flags it was opened with.
# Returns 1 when something was released, nothing when no database is set.
# Confesses when untie reports failure.
sub _untie_meta_db {
	my ($self) = @_;

	if ($self->{'meta_db'}) {
		print STDERR "## _untie_meta_db\n" if ($self->{'debug'});

		untie(%{ $self->{'meta_db'} }) or confess "can't untie!";

		# keep the keys but reset their values
		$self->{'meta_db'} = undef;
		$self->{'_meta_db_flags'} = undef;

		return 1;
	}

	return;
}
415    
416    =head2 _create_doc
417    
418    Create temporary file and pass its name to swish++
419    
420      $i->_create_doc(
421            path => 'path/to/store/in/index',
422            title => 'this is title in results',
423            body => 'data to store in body tag',
424            meta => {
425                    'meta name' => 'data for this meta',
426                    'another' => 'again more data',
427            }
428      );
429    
430    To delete document, just omit body and meta data.
431    
432    =cut
433    
# Write one document as a small HTML file into the temporary directory and
# hand its name to the running indexer.
#
# Named arguments:
#   path  - document id; used as file name below tmp_dir (required)
#   title - optional title, written as <title> and stored as "<path>-title"
#   meta  - optional hash ref of name => content, written as <meta> tags
#           and stored in the meta database as "<path>-<name>"
#   body  - optional body text
# With the meta_in_body option set, meta contents and the title are also
# appended to the body.
#
# Returns the result of printing the document id to the indexer pipe.
# Confesses when the file cannot be created or closed.
#
# NOTE(review): title, meta and body values are written into the HTML
# as-is (no entity escaping); confirm whether callers rely on that.
sub _create_doc {
	my $self = shift;

	my $arg = {@_};

	# open indexer if needed
	$self->_init_indexer;

	my $path = $self->{'tmp_dir'} || confess "no tmp_dir?";
	my $id = $arg->{'path'} || confess "no path?";
	$path .= "/$id";

	print STDERR "## _create_doc: $path\n" if ($self->{'debug'});

	# Document ids may contain directories (e.g. 'path/to/document'), so
	# make sure the parent directory exists below tmp_dir first.
	(my $dir = $path) =~ s{/[^/]*$}{};
	unless (-d $dir) {
		require File::Path;
		File::Path::mkpath($dir);
	}

	open(my $tmp_fh, '>', $path) || confess "can't create temp file $path: $!";

	print $tmp_fh '<html><head>';

	# only replace a missing body, so that a body of '0' is kept
	$arg->{'body'} = '' unless (defined($arg->{'body'}));

	if ($arg->{'meta'}) {
		foreach my $name (keys %{$arg->{'meta'}}) {
			my $content = $arg->{'meta'}->{$name};
			print $tmp_fh qq{<meta name="$name" content="$content">};
			$arg->{'body'} .= " $content" if ($self->{'meta_in_body'});
			$self->{'meta_db'}->{"$id-$name"} = $content;
		}
	}

	my $title = $arg->{'title'};
	if (defined($title)) {
		print $tmp_fh "<title>$title</title>";
		$arg->{'body'} .= " $title" if ($self->{'meta_in_body'});
		$self->{'meta_db'}->{"$id-title"} = $title;
	}

	print $tmp_fh '</head><body>' . $arg->{'body'} . '</body></html>';

	close($tmp_fh) || confess "can't close tmp file ".$arg->{'path'}.": $!";

	# the indexer reads names of files to index from its standard input
	my $printed = print { $self->{'_index_fh'} } "$id\n";
	return $printed;
}
476    
477    =head2 _close_index
478    
479    Close index after indexing.
480    
481      $i->_close_index;
482    
483    You have to close index before searching.
484    
485    =cut
486    
# Close the pipe to the indexer, if one is open.
# Returns 1 after a successful close, nothing when no indexer handle is
# set. Confesses when close fails.
sub _close_index {
	my ($self) = @_;

	if ($self->{'_index_fh'}) {
		print STDERR "## close index\n" if ($self->{'debug'});

		close($self->{'_index_fh'}) or confess "can't close index: $!";
		$self->{'_index_fh'} = undef;

		return 1;
	}

	return;
}
499    
500  1;  1;
501  __END__  __END__
502    
# Line 69  __END__ Line 504  __END__
504    
505  None by default.  None by default.
506    
507  =head1 SEE ALSO  =head1 RELATED
508    
509  Mention other useful documentation such as the documentation of  =head2 Debian
 related modules or operating system documentation (such as man pages  
 in UNIX), or any relevant external documentation such as RFCs or  
 standards.  
510    
511  If you have a mailing list set up for your module, mention it here.  Debian version of swish++ is often old (version 5 at moment of this writing
512    while version 6 is available in source code), so this module by default
513    uses executable names B<index> and B<search> for self-compiled version
514    instead of one from Debian package. See L<new> how to specify Debian
515    default binaries B<index++> and B<search++>.
516    
517    =head2 SWISH++
518    
519    Aside from very good rewrite in C++, SWISH++ is faster because it has
520    clever heuristics about which data in input files are words to index and
521    which are not. It's based on English language and might be best choice if
522    you plan to install large amount of long text documents.
523    
524    However, if you plan to index all data from structured storage (e.g. RDBMS)
525    you might want B<all> words from data to end up in index as opposed to just
526    those which look like English words. This is especially important if you
527    don't plan to index English texts with this module.
528    
529    With distribution build versions of SWISH++ you might have problems with
530    disepearing words. To overcome this problem, you will have to compile and
531    configure SWISH++ yourself (because language characteristics are
532    compilation-time option).
533    
534    Compilation of SWISH++ is easy process well described on project's web
535    pages. To see my very relaxed sample configuration take a look at C<swish++>
536    directory included in distribution.
537    
538    =head2 SWISH++ config
539    
540    C<config.h> located in C<swish++> directory of this distribution is relaxed
541    SWISH++ configuration that will index all words passed to it. This
542    configuration is needed for B<date test> because default configuration
543    doesn't recognize 2004-12-05 as date. Have in mind that your index size
544    might explode.
545    
546    =head1 SEE ALSO
547    
548  If you have a web site set up for your module, mention it here.  C<swish++> web site L<http://homepage.mac.com/pauljlucas/software/swish/>
549    
550  =head1 AUTHOR  =head1 AUTHOR
551    
552  Dobrica Pavlinusic, E<lt>dpavlin@E<gt>  Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
553    
554  =head1 COPYRIGHT AND LICENSE  =head1 COPYRIGHT AND LICENSE
555    

Legend:
Removed from v.1  
changed lines
  Added in v.16

  ViewVC Help
Powered by ViewVC 1.1.26