/[SWISH-PlusPlus]/trunk/PlusPlus.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/PlusPlus.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1 by dpavlin, Fri Dec 3 13:31:43 2004 UTC revision 21 by dpavlin, Sun Dec 5 22:24:09 2004 UTC
# Line 4  use 5.008004; Line 4  use 5.008004;
4  use strict;  use strict;
5  use warnings;  use warnings;
6    
7  our $VERSION = '0.01';  our $VERSION = '0.10';
8    
9  use Carp;  use Carp;
10    use File::Temp qw/ tempdir /;
11    use BerkeleyDB;
12    #use YAML;
13    
14  =head1 NAME  =head1 NAME
15    
16  SWISH::PlusPlus - Perl extension SWISH++  SWISH::PlusPlus - Perl extension for full-text indexer SWISH++ with properties support
17    
18  =head1 SYNOPSIS  =head1 SYNOPSIS
19    
# Line 20  SWISH::PlusPlus - Perl extension SWISH++ Line 23  SWISH::PlusPlus - Perl extension SWISH++
23  =head1 DESCRIPTION  =head1 DESCRIPTION
24    
25  This is perl module to use SWISH++ indexer by Paul J. Lucas. SWISH++ is  This is perl module to use SWISH++ indexer by Paul J. Lucas. SWISH++ is
26  rewrite of swish-e in C++ with blazingly fast performance, but without  rewrite of swish-e in C++ which is extremely fast (due to mmap usage and
27  support for properties (which this module tries to fix)  clever language heuristics), but without support for properties (which this
28    module tries to fix).
29    
30    Implementation of API is something in-between C<SWISH::API> and
31    C<Plucene::Simple>. It should be easy to replace Plucene or swish-e with
32    this module for increased performance. However, this module is not plug-in
33    replacement.
34    
35  =head1 METHODS  =head1 METHODS
36    
37  =head2 open  =head2 new
38    
39  Create new indexing object.  Create new instance for index.
40    
41    my $i = new SWISH::PlusPlus(    my $i = SWISH::PlusPlus->new(
42          index => '/path/to/index',          index_dir => '/path/to/index',
43            index => 'index++',
44            search => 'search++',
45            debug => 1,
46            meta_in_body => 1,
47            use_stopwords => 1,
48    );    );
49    
50  Options to open are following:  Options are described below:
51    
52  =over 5  =over 5
53    
54    =item C<index_dir>
55    
56    Path to directory in which index and meta database will be created.
57    
58  =item C<index>  =item C<index>
59    
60  path to directory in which index will be created.  Full or partial path to SWISH++ index executable. By default, it's B<index>
61    for self-compiled version. If you use Debian GNU/Linux package specify
62    B<index++>. See C<Debian>.
63    
64    =item C<search>
65    
66    Full or partial path to SWISH++ search executable. By default, it's B<search>.
67    
68    =item C<debug>
69    
70    This option (off by default) will produce a lot of debugging output on
71    C<STDERR> prefixed by C<##>.
72    
73    =item C<meta_in_body>
74    
75    This option (off by default) enables to search content of meta fields
76    without specifying them (like they are in body of document). This will
77    somewhat increase index size.
78    
79    =item C<use_stopwords>
80    
81    Use built-in SWISH++ stop words. By default, they are disabled.
82    
83  =back  =back
84    
# Line 50  sub new { Line 89  sub new {
89          my $self = {@_};          my $self = {@_};
90          bless($self, $class);          bless($self, $class);
91    
92          foreach (qw(index)) {          foreach (qw(index_dir)) {
93                  croak "need $_" unless $self->{$_};                  croak "need $_" unless $self->{$_};
94          }          }
95    
96          if (! -e $self->{'index'}) {          my $index_dir = $self->{'index_dir'};
97                  mkdir $self->{'index'} || confess "can't create index ",$self->{'index'},": $!";  
98            my $cwd;
99            chomp($cwd = `pwd`);
100            $self->{'cwd'} = $cwd || carp "can't get cwd!";
101            
102            if ($index_dir !~ m#^/#) {
103                    $index_dir = "$cwd/$index_dir";
104                    print STDERR "## full path to index_dir: $index_dir\n" if ($self->{'debug'});
105                    $self->{'index_dir'} = $index_dir;
106          }          }
107    
108            if (! -e $index_dir) {
109                    mkdir $index_dir || confess "can't create index ",$self->{'index'},": $!";
110            }
111    
112            # default executables
113            $self->{'index'} ||= 'index';
114            $self->{'search'} ||= 'search';
115    
116            print STDERR "## new index_dir: ",$index_dir," index: ",$self->{'index'}, " search: ",$self->{'search'},"\n" if ($self->{'debug'});
117    
118          $self ? return $self : return undef;          $self ? return $self : return undef;
119  }  }
120    
121    
122    =head2 check_bin
123    
124    Check if SWISH++ binaries specified in L<new> are available and verify
125    version signature.
126    
127      if ($i->check_bin) {
128            print "swish++ binaries found\n";
129      };
130    
131    It will also setup property
132    
133      $i->{'version'}
134    
135    which you can examined to see numeric version (something like C<6.0.4>).
136    
137    =cut
138    
139    sub check_bin {
140            my $self = shift;
141    
142            my $i = `$self->{'index'} -V 2>&1` || confess "can't find '",$self->{'index'},"' binary";
143            my $s = `$self->{'search'} -V 2>&1` || confess "can't find '",$self->{'search'},"' binary";
144    
145            chomp $i;
146            chomp $s;
147    
148            confess $self->{'index'}," binary is not SWISH++" unless ($i =~ m/^SWISH\+\+/);
149            confess $self->{'search'}," binary is not SWISH++" unless ($s =~ m/^SWISH\+\+/);
150    
151            if ($i eq $s) {
152                    $i =~ s/^SWISH\+\+\s+// || confess "can't strip SWISH++ from version";
153                    $self->{'version'} = $i;
154                    return 1;
155            } else  {
156                    carp "version difference: index is $i while search is $s";
157                    return;
158            }
159    
160    }
161    
162    =head2 index_document
163    
164    Quick way to add simple data to index.
165    
166      $i->index_document($path, $data);
167      $i->index_document( 42 => 'meaning of life' );
168    
169    C<$path> value is really path, so you don't want to use directory
170    separators (slashes, /) in it probably.
171    
172    =cut
173    
174    sub index_document {
175            my $self = shift;
176    
177            my %doc = @_;
178    
179            foreach my $id (keys %doc) {
180                    $self->_create_doc(
181                            path => $id,
182                            body => $doc{$id},
183                    );
184            }
185    
186            return 1;
187    }
188    
189    =head2 add
190    
191    Add document with meta-data to index.
192    
193      $i->add(
194            path => 'path/to/document',
195            title => 'this is result title',
196            meta => {
197                    description => 'this is description meta tag',
198                    date => '2004-11-04',
199                    author => 'Dobrica Pavlinusic',
200            }
201            body => 'this is text without meta data',
202      );
203    
204    This is thin wrapper round L<_create_doc>.
205    
206    =cut
207    
208    sub add {
209            my $self = shift;
210    
211            $self->_create_doc(@_);
212    
213            return 1;
214    }
215    
216    =head2 search
217    
218    Search your index using any valid SWISH++ query.
219    
220      my @results = $i->search("swish query");
221    
222    Returns array with elements like this:
223    
224      {
225       rank => 10,                  # rank of result
226       path => 'path to result',    # path to result
227       size => 999,                 # size in bytes
228       title => 'title of result'   # title meta property
229      }
230    
231    =cut
232    
233    sub search {
234            my $self = shift;
235    
236            my $query = shift || return;
237    
238            $self->finish_update;
239            $self->_tie_meta_db(DB_RDONLY);
240    
241            my @results;
242    
243            # escape double quotes in query for shell
244            $query =~ s/"/\\"/g;
245    
246            my $open_cmd = $self->{'search'} .
247                    ' -i ' . $self->{'index_dir'}.'/index' .
248                    ' "' . $query . '"'.
249                    ' |';
250            print STDERR "## search: $open_cmd\n" if ($self->{'debug'});
251    
252            open(SEARCH, $open_cmd) || confess "can't start $open_cmd: $!";
253            my $l;
254            while($l = <SEARCH>) {
255                    next if ($l =~ /^#/);
256                    chomp($l);
257                    print STDERR "## $l\n" if ($self->{'debug'});
258                    my ($rank,$path,$size,$title) = split(/ /,$l,4);
259                    $path =~ s#^\./##; # strip from path
260                    push @results, {
261                            rank => $rank,
262                            path => $path,
263                            size => $size,
264                            title => $title,
265                    }
266            }
267    
268            close(SEARCH) || confess "can't close search";
269    
270            #print STDERR "## results: ",Dump(@results),"\n" if ($self->{'debug'});
271    
272            return @results;
273    }
274    
275    =head2 property
276    
277    Return stored meta property from result or result path.
278    
279      print $i->property('path', 'title');
280      print $i->property($res->{'path'}, 'title');
281    
282    =cut
283    
284    sub property {
285            my $self = shift;
286    
287            my ($path,$meta) = @_;
288    
289            if ($path =~ m/^HASH/) {
290                    $path = $path->{'path'} || confess "can't find path in input data";
291            }
292    
293            my $val = $self->{'meta_db'}->{"$path-$meta"};
294    
295            print STDERR "## property $path-$meta: ",($val || 'undef'),"\n" if ($self->{'debug'});
296            return $val;
297    }
298    
299    =head2 finish_update
300    
301    This method will close index binary and enable search. Searching is not
302    available while indexing is in process.
303    
304      $i->finish_update;
305    
306    Usually, you don't need to call this method directly. It will be called on
307    DESTROY when $i goes out of scope or when you first call search in session
308    if indexing was started.
309    
310    =cut
311    
312    sub finish_update {
313            my $self = shift;
314    
315            print STDERR "## finish_update\n" if ($self->{'debug'});
316    
317            $self->_close_index && $self->_untie_meta_db;
318    }
319    
320    sub DESTROY {
321            my $self = shift;
322            $self->finish_update;
323    }
324    
325    =head1 PRIVATE METHODS
326    
327    Private methods implement internals for creating temporary files needed for
328    SWISH++. You should have no need to call them directly, and they are here
329    just to have documentation.
330    
331    =head2 _init_indexer
332    
333    Create temporary directory in which files for indexing will be created and
334    start index process.
335    
336      my $i->_init_indexer || die "can't start indexer";
337    
338    It will also create empty file C<_stopwords_> to disable stop words.
339    
340    =cut
341    
342    sub _init_indexer {
343            my $self = shift;
344    
345            return if ($self->{'_index_fh'});
346    
347            my $tmp_dir = tempdir( CLEANUP => 1 ) || confess "can't create temporary directory: $!";
348            $self->{'tmp_dir'} = $tmp_dir;
349    
350            chdir $tmp_dir || confess "can't chdir to ".$tmp_dir.": $!";
351    
352            print STDERR "## tmp_dir: $tmp_dir" if ($self->{'debug'});
353    
354            my $opt = "-v " . ($self->{'debug'} || '0');
355    
356            unless ($self->{'use_stopwrods'}) {
357                    open(STOP, '>', "_stopwords_") || carp "can't create empty stopword file, skipping\n";
358                    print STOP "  ";
359                    close(STOP);
360                    $opt .= " -s _stopwords_";
361            }
362    
363            my $index_dir = $self->{'index_dir'} || confess "no index_dir?";
364    
365            my $open_cmd = '| '.$self->{'index'}.' '.$opt.' -e "html:*" -i '.$index_dir.'/index -';
366    
367            print STDERR "## init_indexer: $open_cmd\n" if ($self->{'debug'});
368    
369            open($self->{'_index_fh'}, $open_cmd) || confess "can't start index with $open_cmd: $!";
370    
371            chdir $self->{'cwd'} || confess "can't chdir to ".$self->{'cwd'}.": $!";
372    
373            $self->_tie_meta_db(DB_CREATE);
374    
375            return $self->{'_index_fh'};
376    }
377    
378    =head2 _create_doc
379    
380    Create temporary file and pass it's name to SWISH++
381    
382      $i->_create_doc(
383            path => 'path/to/store/in/index',
384            title => 'this is title in results',
385            body => 'data to story in body tag',
386            meta => {
387                    'meta name' => 'data for this meta',
388                    'another' => 'again more data',
389            }
390      );
391    
392    To delete document, just omit body and meta data.
393    
394    =cut
395    
396    sub _create_doc {
397            my $self = shift;
398    
399            my $arg = {@_};
400    
401            # open indexer if needed
402            $self->_init_indexer;
403    
404            my $path = $self->{'tmp_dir'} || confess "no tmp_dir?";
405            my $id = $arg->{'path'} || confess "no path?";
406            $path .= "/$id";
407    
408            print STDERR "## _create_doc: $path\n" if ($self->{'debug'});
409    
410            open(TMP, '>', $path) || die "can't create temp file $path: $!";
411    
412            print TMP '<html><head>';
413    
414            $arg->{'body'} ||= '';
415    
416            if ($arg->{'meta'}) {
417                    foreach my $name (keys %{$arg->{'meta'}}) {
418                            my $content = $arg->{'meta'}->{$name};
419                            print TMP qq{<meta name="$name" content="$content">};
420                            $arg->{'body'} .= " $content" if ($self->{'meta_in_body'});
421                            $self->{'meta_db'}->{"$id-$name"} = $content;
422                    }
423            }
424    
425            my $title = $arg->{'title'};
426            if (defined($title)) {
427                    print TMP "<title>$title</title>";
428                    $arg->{'body'} .= " $title" if ($self->{'meta_in_body'});
429                    $self->{'meta_db'}->{"$id-title"} = $title;
430            }
431    
432            print TMP '</head><body>' . $arg->{'body'} . '</body></html>';
433            
434            close(TMP) || confess "can't close tmp file ".$arg->{'path'}.": $!";
435    
436            print { $self->{'_index_fh'} } "$id\n";
437    }
438    
439    =head2 _close_index
440    
441    Close index after indexing.
442    
443      $i->_close_index;
444    
445    You have to close index before searching.
446    
447    =cut
448    
449    sub _close_index {
450            my $self = shift;
451    
452            return unless ($self->{'_index_fh'});
453    
454            print STDERR "## close index\n" if ($self->{'debug'});
455    
456            close($self->{'_index_fh'}) || confess "can't close index: $!";
457            undef $self->{'_index_fh'};
458    
459            return 1;
460    }
461    
462    =head2 _tie_meta_db
463    
464    Open BerkeleyDB database with meta properties.
465    
466      $i->_tie_meta_db(DB_CREATE);
467      $i->_tie_meta_db(DB_RDONLY);
468    
469    }
470    
471    =cut
472    
473    sub _tie_meta_db  {
474            my $self = shift;
475    
476            my $flags = shift || confess "need DB_CREATE or DB_RDONLY";
477    
478            return if ($self->{'_meta_db_flags'} && $self->{'_meta_db_flags'} == $flags);
479    
480            print STDERR "## _tie_meta_db($flags)\n" if ($self->{'debug'});
481    
482            $self->_untie_meta_db;
483            $self->{'_meta_db_flags'} = $flags;
484    
485            my $file = $self->{'index_dir'}.'/meta.db';
486    
487            tie %{$self->{'meta_db'}}, "BerkeleyDB::Hash",
488                    -Filename => $file,
489                    -Flags    => $flags
490            or confess "cannot open $file: $! $BerkeleyDB::Error\n" ;
491    
492            return 1;
493    }
494    
495    =head2 _untie_meta_db
496    
497    Close BerkeleyDB database with meta properties.
498    
499      $i->_untie_meta_db;
500    
501    =cut
502    
503    sub _untie_meta_db {
504            my $self = shift;
505    
506            return unless ($self->{'meta_db'});
507    
508            print STDERR "## _untie_meta_db\n" if ($self->{'debug'});
509            untie %{$self->{'meta_db'}} || confess "can't untie!";
510            undef $self->{'meta_db'};
511            undef $self->{'_meta_db_flags'};
512    
513            return 1;
514    }
515    
516  1;  1;
517  __END__  __END__
518    
# Line 69  __END__ Line 520  __END__
520    
521  None by default.  None by default.
522    
523  =head1 SEE ALSO  =head1 RELATED
524    
525  Mention other useful documentation such as the documentation of  =head2 Debian
 related modules or operating system documentation (such as man pages  
 in UNIX), or any relevant external documentation such as RFCs or  
 standards.  
526    
527  If you have a mailing list set up for your module, mention it here.  Debian version of SWISH++ is often old (version 5 at moment of this writing
528    while version 6 is available in source code), so this module by default
529    uses executable names B<index> and B<search> for self-compiled version
530    instead of one from Debian package. See L<new> how to specify Debian
531    default binaries B<index++> and B<search++>.
532    
533    =head2 SWISH++
534    
535    Aside from very good rewrite in C++, SWISH++ is faster because it uses
536    claver heuristics about which data in input files are words to index and
537    which are not. It's based on English language and might be best choice if
538    you plan to index large amount of long text documents.
539    
540    However, if you plan to index all data from structured storage (e.g. RDBMS)
541    you might want B<all> words from data to end up in index as opposed to just
542    those which look like English words. This is especially important if you
543    don't plan to index English texts with this module.
544    
545    With distribution build versions of SWISH++ you might have problems with
546    disapearing words. To overcome this problem, you will have to compile and
547    configure SWISH++ yourself (because language characteristics are
548    compilation-time option).
549    
550    Compilation of SWISH++ is easy process well described on project's web
551    pages. To see my very relaxed sample configuration take a look at C<swish++>
552    directory included in distribution.
553    
554    =head2 SWISH++ config
555    
556    C<config.h> located in C<swish++> directory of this distribution is relaxed
557    SWISH++ configuration that will index all words passed to it. This
558    configuration is needed for B<date test> because default configuration
559    doesn't recognize 2004-12-05 as date. Have in mind that your index size
560    might explode.
561    
562    =head1 BUGS
563    
564    Currently there is no way to specify which meta data will be stored as
565    properties. B<This will be fixed very soon>.
566    
567    There is no garbage collection on temporary files created for SWISH++. This
568    means that one run of indexer will take additional disk space for temporary
569    files, which will be removed at end. There should be some way to remove
570    files after they are indexed by SWISH++. However, at this early stage of
571    development it's just not supported yet. Have plenty of disk space!
572    
573    =head1 SEE ALSO
574    
575  If you have a web site set up for your module, mention it here.  SWISH++ web site L<http://homepage.mac.com/pauljlucas/software/swish/>
576    
577  =head1 AUTHOR  =head1 AUTHOR
578    
579  Dobrica Pavlinusic, E<lt>dpavlin@E<gt>  Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
580    
581  =head1 COPYRIGHT AND LICENSE  =head1 COPYRIGHT AND LICENSE
582    

Legend:
Removed from v.1  
changed lines
  Added in v.21

  ViewVC Help
Powered by ViewVC 1.1.26