/[SWISH-PlusPlus]/trunk/PlusPlus.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/PlusPlus.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1 by dpavlin, Fri Dec 3 13:31:43 2004 UTC revision 13 by dpavlin, Sun Dec 5 14:35:54 2004 UTC
# Line 4  use 5.008004; Line 4  use 5.008004;
4  use strict;  use strict;
5  use warnings;  use warnings;
6    
7  our $VERSION = '0.01';  our $VERSION = '0.06';
8    
9  use Carp;  use Carp;
10    use File::Temp qw/ tempdir /;
11    #use YAML;
12    
13  =head1 NAME  =head1 NAME
14    
# Line 20  SWISH::PlusPlus - Perl extension SWISH++ Line 22  SWISH::PlusPlus - Perl extension SWISH++
22  =head1 DESCRIPTION  =head1 DESCRIPTION
23    
24  This is perl module to use SWISH++ indexer by Paul J. Lucas. SWISH++ is  This is perl module to use SWISH++ indexer by Paul J. Lucas. SWISH++ is
25  rewrite of swish-e in C++ with blazingly fast performance, but without  rewrite of swish-e in C++ which is extremely fast (thanks to mmap), but without
26  support for properties (which this module tries to fix)  support for properties (which this module tries to fix).
27    
28    Implementation of this module is crafted after L<Plucene::Simple> and it
29    should be easy to replace Plucene with this module for increased
30    performance. However, this module is not plug-in replacement.
31    
32  =head1 METHODS  =head1 METHODS
33    
34  =head2 open  =head2 new
35    
36  Create new indexing object.  Create new indexing object.
37    
38    my $i = new SWISH::PlusPlus(    my $i = SWISH::PlusPlus->new(
39          index => '/path/to/index',          index_dir => '/path/to/index',
40            index => 'index++',
41            search => 'search++',
42            debug => 1,
43            meta_in_body => 1,
44            use_stopwords => 1,
45    );    );
46    
47  Options to open are following:  Options to new are following:
48    
49  =over 5  =over 5
50    
51    =item C<index_dir>
52    
53    Path to directory in which index will be created.
54    
55  =item C<index>  =item C<index>
56    
57  path to directory in which index will be created.  Full or partial path to SWISH++ index executable. By default, it's B<index>
58    for self-compiled version. If you use Debian GNU/Linux package specify
59    B<index++>. See C<Debian>.
60    
61    =item C<search>
62    
63    Full or partial path to SWISH++ search executable. By default, it's B<search>.
64    
65    =item C<debug>
66    
67    This option (off by default) will produce a lot of debugging output on
68    C<STDERR> prefixed by C<##>.
69    
70    =item C<meta_in_body>
71    
72    This option (off by default) enables to search content of meta fields
73    without specifying them (like they are in body of document). This will
74    somewhat increase index size.
75    
76    =item C<use_stopwords>
77    
78    Use built-in SWISH++ stop words. By default, they are disabled.
79    
80  =back  =back
81    
# Line 50  sub new { Line 86  sub new {
86          my $self = {@_};          my $self = {@_};
87          bless($self, $class);          bless($self, $class);
88    
89          foreach (qw(index)) {          foreach (qw(index_dir)) {
90                  croak "need $_" unless $self->{$_};                  croak "need $_" unless $self->{$_};
91          }          }
92    
93          if (! -e $self->{'index'}) {          my $index_dir = $self->{'index_dir'};
94                  mkdir $self->{'index'} || confess "can't create index ",$self->{'index'},": $!";  
95            if ($index_dir !~ m#^/#) {
96                    chomp(my $cwd = `pwd`);
97                    $index_dir = "$cwd/$index_dir";
98                    print STDERR "## full path to index_dir: $index_dir\n" if ($self->{'debug'});
99                    $self->{'index_dir'} = $index_dir;
100            }
101    
102            if (! -e $index_dir) {
103                    mkdir $index_dir || confess "can't create index ",$self->{'index'},": $!";
104          }          }
105    
106            # default executables
107            $self->{'index'} ||= 'index';
108            $self->{'search'} ||= 'search';
109    
110            print STDERR "## new index_dir: ",$index_dir," index: ",$self->{'index'}, " search: ",$self->{'search'},"\n" if ($self->{'debug'});
111    
112          $self ? return $self : return undef;          $self ? return $self : return undef;
113  }  }
114    
115    
116    =head2 check_bin
117    
118    Check if swish++ binaries specified in L<new> are available and verify
119    version signature.
120    
121      if ($i->check_bin) {
122            print "swish++ binaries found\n";
123      };
124    
125    It will also setup property
126    
127      $i->{'version'}
128    
129    which you can examine to see version.
130    
131    =cut
132    
sub check_bin {
        my ($self) = @_;

        my @tools = qw(index search);
        my %banner;

        # Ask each executable for its version banner (stderr is folded into
        # stdout). An empty answer is treated as "binary not found".
        for my $tool (@tools) {
                my $bin = $self->{$tool};
                my $out = `$bin -V 2>&1`;
                confess "can't find '", $bin, "' binary" unless ($out);
                chomp $out;
                $banner{$tool} = $out;
        }

        # Both banners must start with the SWISH++ signature.
        for my $tool (@tools) {
                confess $self->{$tool}, " binary is not SWISH++"
                        unless ($banner{$tool} =~ m/^SWISH\+\+/);
        }

        # Mismatching banners: warn and return nothing, version is not stored.
        if ($banner{'index'} ne $banner{'search'}) {
                carp "version difference: index is $banner{'index'} while search is $banner{'search'}";
                return;
        }

        # Identical banners: remember the version for the caller.
        $self->{'version'} = $banner{'index'};
        return 1;
}
154    
155    =head2 index_document
156    
157    Quick way to add simple data to index.
158    
159      $i->index_document($key, $data);
160      $i->index_document( 42 => 'meaning of life' );
161    
162    =cut
163    
sub index_document {
        my ($self, %doc) = @_;

        # Every key/value pair becomes one document: the key is used as the
        # document path and the value as its body.
        $self->_create_doc( path => $_, body => $doc{$_} ) for keys %doc;

        return 1;
}
178    
179    =head2 add
180    
181    Add document with metadata to index.
182    
183      $i->add(
184            path => 'path/to/document',
185            title => 'this is result title',
186            meta => {
187                    description => 'this is description meta tag',
188                    date => '2004-11-04',
189                    author => 'Dobrica Pavlinusic',
190            },
191            body => 'this is text without meta data',
192      );
193    
194    This is thin wrapper round L<_create_doc>.
195    
196    =cut
197    
sub add {
        my ($self, @doc) = @_;

        # Hand the named arguments (path, title, meta, body) straight to
        # _create_doc; the caller always gets a true value back.
        $self->_create_doc(@doc);

        return 1;
}
205    =head2 search
206    
207    Search your index.
208    
209      my @results = $i->search("swish query");
210    
211    Returns array with result IDs.
212    
213    =cut
214    
sub search {
        my $self = shift;

        # nothing to do without a (true) query
        my $query = shift || return;

        # the index has to be closed before the search binary is run on it
        $self->_close_index;

        my @results;

        # Run the search binary without going through the shell: the query is
        # passed as a single argument, so quotes, backticks, "$" and
        # backslashes in it can't be interpreted as shell syntax.
        my @cmd = ($self->{'search'}, '-i', $self->{'index_dir'}.'/index', $query);
        print STDERR "## search @cmd\n" if ($self->{'debug'});

        open(my $search_fh, '-|', @cmd) || confess "can't start @cmd: $!";
        while (my $line = <$search_fh>) {
                # lines starting with # are not results
                next if ($line =~ /^#/);
                chomp $line;
                print STDERR "## $line\n" if ($self->{'debug'});
                # result line is: rank path size title (title may contain spaces)
                my ($rank,$path,$size,$title) = split(/ /,$line,4);
                push @results, {
                        rank => $rank,
                        path => $path,
                        size => $size,
                        title => $title,
                };
        }

        # close on a pipe also reports non-zero exit of the search binary
        close($search_fh) || confess "can't close search";

        return @results;
}
250    
251    =head2 finish_update
252    
253    This method will close index.
254    
255      $i->finish_update;
256    
257    It will be called on DESTROY when $i goes out of scope.
258    
259    =cut
260    
sub finish_update {
        my ($self) = @_;

        # closing the index is all that is needed to finish an update
        return $self->_close_index;
}
266    
sub DESTROY {
        my $self = shift;

        # Destructors can run at any time (including program exit), and
        # closing the indexer pipe overwrites $?. Keep the global status
        # variables intact so the exit code and pending errors of the
        # surrounding program are not changed behind its back.
        local ($., $@, $!, $^E, $?);

        $self->finish_update;
}
271    
272    =head1 PRIVATE METHODS
273    
274    Private methods implement internals for creating temporary file needed for
275    swish++. You should have no need to call them directly, and they are here
276    just to have documentation.
277    
278    =head2 _init_indexer
279    
280    Create temporary directory in which files for indexing will be created and
281    start index process.
282    
283      $i->_init_indexer || die "can't start indexer";
284    
285    It will also create empty file C<_stopwords_> to disable stop words.
286    
287    =cut
288    
sub _init_indexer {
        my $self = shift;

        # scratch directory for generated documents, removed at program exit
        $self->{'tmp_dir'} = tempdir( CLEANUP => 1 ) || confess "can't create temporary directory: $!";

        # the indexer is started from inside the scratch directory
        # (parenthesised so that a failed chdir is really reported)
        chdir($self->{'tmp_dir'}) || confess "can't chdir to ".$self->{'tmp_dir'}.": $!";

        # command is kept as a list so it is executed without a shell
        my @cmd = ($self->{'index'}, '-v', ($self->{'debug'} || '0'));

        unless ($self->{'use_stopwords'}) {
                # an (almost) empty stop word file disables built-in stop words
                if (open(my $stop, '>', "_stopwords_")) {
                        print $stop "  ";
                        close($stop);
                        push @cmd, '-s', '_stopwords_';
                } else {
                        # really skip it: don't point the indexer to a missing file
                        carp "can't create empty stopword file, skipping\n";
                }
        }

        # treat every file as HTML, write to index_dir/index and read the
        # list of file names from standard input
        push @cmd, '-e', 'html:*', '-i', $self->{'index_dir'}.'/index', '-';

        print STDERR "## open index @cmd\n" if ($self->{'debug'});

        open(my $index_fh, '|-', @cmd) || confess "can't start index with @cmd: $!";
        $self->{'index_fh'} = $index_fh;

        return $self->{'index_fh'};
}
314    
315    =head2 _create_doc
316    
317    Create temporary file and pass its name to swish++
318    
319      $i->_create_doc(
320            path => 'path/to/store/in/index',
321            title => 'this is title in results',
322            body => 'data to store in body tag',
323            meta => {
324                    'meta name' => 'data for this meta',
325                    'another' => 'again more data',
326            }
327      );
328    
329    To delete document, just omit body and meta data.
330    
331    =cut
332    
sub _create_doc {
        my $self = shift;

        my $arg = {@_};

        croak "need path" unless (defined($arg->{'path'}) && length($arg->{'path'}));

        # open indexer if needed
        $self->{'index_fh'} ||= $self->_init_indexer;

        my $tmp_dir = $self->{'tmp_dir'} || confess "no tmp_dir?";

        # Document is written below tmp_dir (and not into whatever the
        # current directory happens to be); the indexer still gets the
        # path as passed in by the caller.
        # NOTE(review): absolute paths are not handled specially - confirm
        # whether callers rely on them.
        my $file = $tmp_dir . '/' . $arg->{'path'};

        # path may contain directories (e.g. 'path/to/document'), create them
        require File::Basename;
        require File::Path;
        my $dir = File::Basename::dirname($file);
        File::Path::mkpath($dir) unless (-d $dir);

        # double quotes would terminate HTML attribute value prematurely
        my $attr = sub {
                my $text = shift;
                $text = '' unless defined($text);
                $text =~ s/"/&quot;/g;
                return $text;
        };

        # defined-check (and not ||) so that body or title '0' survives
        my $body = defined($arg->{'body'}) ? $arg->{'body'} : '';

        my $html = '<html><head>';

        if ($arg->{'meta'}) {
                # sorted, so that generated document is always the same
                foreach my $name (sort keys %{$arg->{'meta'}}) {
                        my $content = $arg->{'meta'}->{$name};
                        $content = '' unless defined($content);
                        $html .= '<meta name="' . $attr->($name) . '" content="' . $attr->($content) . '">';
                        $body .= " $content" if ($self->{'meta_in_body'});
                }
        }

        if (defined($arg->{'title'})) {
                $html .= '<title>' . $arg->{'title'} . '</title>';
                $body .= " ".$arg->{'title'} if ($self->{'meta_in_body'});
        }

        $html .= '</head><body>' . $body . '</body></html>';

        open(my $tmp, '>', $file) || die "can't create temp file ".$arg->{'path'}.": $!";
        print {$tmp} $html or confess "can't write tmp file ".$arg->{'path'}.": $!";
        close($tmp) || confess "can't close tmp file ".$arg->{'path'}.": $!";

        # tell indexer which file to pick up
        print { $self->{'index_fh'} } $arg->{'path'}."\n"
                or confess "can't send ".$arg->{'path'}." to indexer: $!";

        return 1;
}
368    
369    =head2 _close_index
370    
371    Close index after indexing.
372    
373      $i->_close_index;
374    
375    You have to close index before searching.
376    
377    =cut
378    
sub _close_index {
        my $self = shift;

        # nothing to do if indexer was never started (or is already closed)
        return unless ($self->{'index_fh'});

        print STDERR "## close index\n" if ($self->{'debug'});

        # Closing the pipe waits for the indexer to finish, so this is the
        # place where its failure shows up. Only warn (and don't die),
        # because this is also called from DESTROY.
        close($self->{'index_fh'})
                || carp "closing index failed: ", ($! ? $! : "indexer exit status ".($? >> 8));

        # forget the handle, so the next document starts a new indexer
        undef $self->{'index_fh'};
}
389    
390  1;  1;
391  __END__  __END__
392    
# Line 69  __END__ Line 394  __END__
394    
395  None by default.  None by default.
396    
397  =head1 SEE ALSO  =head1 RELATED
398    
399  Mention other useful documentation such as the documentation of  =head2 Debian
 related modules or operating system documentation (such as man pages  
 in UNIX), or any relevant external documentation such as RFCs or  
 standards.  
400    
401  If you have a mailing list set up for your module, mention it here.  Debian version of swish++ is often old (version 5 at moment of this writing
402    while version 6 is available in source code), so this module by default
403    uses executable names B<index> and B<search> for self-compiled version
404    instead of one from Debian package. See L<new> how to specify Debian
405    default binaries B<index++> and B<search++>.
406    
407    =head2 SWISH++
408    
409    Aside from very good rewrite in C++, SWISH++ is fatster because it has
410    claver heuristics about which data in input files are words to index and
411    which are not. It's based on English language and might be best choice if
412    you plan to install large amount of long text documents.
413    
414    However, if you plan to index all data from structured storage (e.g. RDBMS)
415    you might want B<all> words from data to end up in index as opposed to just
416    those which look like English words. This is especially important if you
417    don't plan to index English texts with this module.
418    
419    With distribution build versions of SWISH++ you might have problems with
420    disepearing words. To overcome this problem, you will have to compile and
421    configure SWISH++ yourself (because language characteristics are
422    compilation-time option).
423    
424    Compilation of SWISH++ is easy process well described on project's web
425    pages. To see my very relaxed sample configuration take a look at C<swish++>
426    directory included in distribution.
427    
428    =head2 SWISH++ config
429    
430    C<config.h> located in C<swish++> directory of this distribution is relaxed
431    SWISH++ configuration that will index all words passed to it. This
432    configuration is needed for B<date test> because default configuration
433    doesn't recognize 2004-12-05 as date. Have in mind that your index size
434    might explode.
435    
436    =head1 SEE ALSO
437    
438  If you have a web site set up for your module, mention it here.  C<swish++> web site L<http://homepage.mac.com/pauljlucas/software/swish/>
439    
440  =head1 AUTHOR  =head1 AUTHOR
441    
442  Dobrica Pavlinusic, E<lt>dpavlin@E<gt>  Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
443    
444  =head1 COPYRIGHT AND LICENSE  =head1 COPYRIGHT AND LICENSE
445    

Legend:
Removed from v.1  
changed lines
  Added in v.13

  ViewVC Help
Powered by ViewVC 1.1.26