/[SWISH-PlusPlus]/trunk/PlusPlus.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/PlusPlus.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1 by dpavlin, Fri Dec 3 13:31:43 2004 UTC revision 9 by dpavlin, Sun Dec 5 00:59:50 2004 UTC
# Line 4  use 5.008004; Line 4  use 5.008004;
4  use strict;  use strict;
5  use warnings;  use warnings;
6    
7  our $VERSION = '0.01';  our $VERSION = '0.03';
8    
9  use Carp;  use Carp;
10    use File::Temp qw/ tempdir /;
11    #use YAML;
12    
13  =head1 NAME  =head1 NAME
14    
# Line 20  SWISH::PlusPlus - Perl extension SWISH++ Line 22  SWISH::PlusPlus - Perl extension SWISH++
22  =head1 DESCRIPTION  =head1 DESCRIPTION
23    
24  This is perl module to use SWISH++ indexer by Paul J. Lucas. SWISH++ is  This is perl module to use SWISH++ indexer by Paul J. Lucas. SWISH++ is
25  rewrite of swish-e in C++ with blazingly fast performance, but without  rewrite of swish-e in C++ which is extremely fast (thanks to mmap), but without
26  support for properties (which this module tries to fix)  support for properties (which this module tries to fix).
27    
28    Implementation of this module is crafted after L<Plucene::Simple> and it
29    should be easy to replace Plucene with this module for increased
30    performance. However, this module is not a plug-in replacement.
31    
32  =head1 METHODS  =head1 METHODS
33    
# Line 29  support for properties (which this modul Line 35  support for properties (which this modul
35    
36  Create new indexing object.  Create new indexing object.
37    
38    my $i = new SWISH::PlusPlus(    my $i = SWISH::PlusPlus->open(
39          index => '/path/to/index',          index_dir => '/path/to/index',
40            index => 'index++',
41            search => 'search++',
42            debug => 1,
43            meta_in_body => 1,
44            use_stopwords => 1,
45    );    );
46    
47  Options to open are following:  Options to open are following:
48    
49  =over 5  =over 5
50    
51    =item C<index_dir>
52    
53    Path to directory in which index will be created.
54    
55  =item C<index>  =item C<index>
56    
57  path to directory in which index will be created.  Full or partial path to SWISH++ index executable. By default, it's B<index>
58    for a self-compiled version. If you use the Debian GNU/Linux package, specify
59    B<index++>. See C<Debian>.
60    
61    =item C<search>
62    
63    Full or partial path to SWISH++ search executable. By default, it's B<search>.
64    
65    =item C<debug>
66    
67    This option (off by default) will produce a lot of debugging output on
68    C<STDERR> prefixed by C<##>.
69    
70    =item C<meta_in_body>
71    
72    This option (off by default) enables to search content of meta fields
73    without specifing them (like they are in body of document). This will
74    somewhat increate index size.
75    
76    =item C<use_stopwords>
77    
78    Use built-in SWISH++ stop words. By default, they are disabled.
79    
80  =back  =back
81    
82  =cut  =cut
83    
84  sub new {  sub open {
85          my $class = shift;          my $class = shift;
86          my $self = {@_};          my $self = {@_};
87          bless($self, $class);          bless($self, $class);
88    
89          foreach (qw(index)) {          foreach (qw(index_dir)) {
90                  croak "need $_" unless $self->{$_};                  croak "need $_" unless $self->{$_};
91          }          }
92    
93          if (! -e $self->{'index'}) {          if (! -e $self->{'index_dir'}) {
94                  mkdir $self->{'index'} || confess "can't create index ",$self->{'index'},": $!";                  mkdir $self->{'index_dir'} || confess "can't create index ",$self->{'index'},": $!";
95          }          }
96    
97            # default executables
98            $self->{'index'} ||= 'index';
99            $self->{'search'} ||= 'search';
100    
101            print STDERR "## open index_dir: ",$self->{'index_dir'}," index: ",$self->{'index'}, " search: ",$self->{'search'},"\n" if ($self->{'debug'});
102    
103          $self ? return $self : return undef;          $self ? return $self : return undef;
104  }  }
105    
106    
107    =head2 check_bin
108    
109    Check if swish++ binaries specified in L<open> are available and verify
110    version signature.
111    
112      if ($i->check_bin) {
113            print "swish++ binaries found\n";
114      };
115    
116    It will also setup property
117    
118      $i->{'version'}
119    
120    which you can examine to see version.
121    
122    =cut
123    
124    sub check_bin {
125            my $self = shift;
126    
127            my $i = `$self->{'index'} -V 2>&1` || confess "can't find '",$self->{'index'},"' binary";
128            my $s = `$self->{'search'} -V 2>&1` || confess "can't find '",$self->{'search'},"' binary";
129    
130            chomp $i;
131            chomp $s;
132    
133            confess $self->{'index'}," binary is not SWISH++" unless ($i =~ m/^SWISH\+\+/);
134            confess $self->{'search'}," binary is not SWISH++" unless ($s =~ m/^SWISH\+\+/);
135    
136            if ($i eq $s) {
137                    $self->{'version'} = $i;
138                    return 1;
139            } else  {
140                    carp "version difference: index is $i while search is $s";
141                    return;
142            }
143    
144    }
145    
146    =head2 index_document
147    
148    Quick way to add simple data to index.
149    
150      $i->index_document($key, $data);
151      $i->index_document( 42 => 'meaning of life' );
152    
153    =cut
154    
155    sub index_document {
156            my $self = shift;
157    
158            my %doc = @_;
159    
160            foreach my $id (keys %doc) {
161                    $self->_create_doc(
162                            path => $id,
163                            body => $doc{$id},
164                    );
165            }
166    
167            return 1;
168    }
169    
170    =head2 add
171    
172    Add document with metadata to index.
173    
174      $i->add(
175            path => 'path/to/document',
176            title => 'this is result title',
177            meta => {
178                    description => 'this is description meta tag',
179                    date => '2004-11-04',
180                    author => 'Dobrica Pavlinusic',
181            },
182            body => 'this is text without meta data',
183      );
184    
185    This is thin wrapper round L<_create_doc>.
186    
187    =cut
188    
189    sub add {
190            my $self = shift;
191    
192            $self->_create_doc(@_);
193    
194            return 1;
195    }
196    =head2 search
197    
198    Search your index.
199    
200      my @results = $i->search("swish query");
201    
202    Returns array of hash references, one per result, with keys C<rank>, C<path>, C<size> and C<title>.
203    
204    =cut
205    
206    sub search {
207            my $self = shift;
208    
209            my $query = shift || return;
210    
211            $self->_close_index;
212    
213            my @results;
214    
215            # escape double quotes in query for shell
216            $query =~ s/"/\\"/g;
217    
218            my $open_cmd = $self->{'search'}." -i ".$self->{'index_dir'}.'/index "'.$query.'" |';
219            print STDERR "## search $open_cmd\n" if ($self->{'debug'});
220    
221            CORE::open(SEARCH, $open_cmd) || confess "can't start $open_cmd: $!";
222            while(<SEARCH>) {
223                    next if (/^#/);
224                    chomp;
225                    print STDERR "## $_\n" if ($self->{'debug'});
226                    my ($rank,$path,$size,$title) = split(/ /,$_,4);
227                    push @results, {
228                            rank => $rank,
229                            path => $path,
230                            size => $size,
231                            title => $title,
232                    }
233            }
234    
235            close(SEARCH) || confess "can't close search";
236    
237            #print STDERR "## results: ",Dump(@results),"\n" if ($self->{'debug'});
238    
239            return @results;
240    }
241    
242    =head1 PRIVATE METHODS
243    
244    Private methods implement internals for creating temporary files needed for
245    swish++. You should have no need to call them directly, and they are here
246    just to have documentation.
247    
248    =head2 _init_indexer
249    
250    Create temporary directory in which files for indexing will be created and
251    start index process.
252    
253      $i->_init_indexer || die "can't start indexer";
254    
255    It will also create empty file C<_stopwords_> to disable stop words.
256    
257    =cut
258    
259    sub _init_indexer {
260            my $self = shift;
261    
262            $self->{'tmp_dir'} = tempdir( CLEANUP => 1 ) || confess "can't create temporary directory: $!";
263    
264            chdir $self->{'tmp_dir'} || confess "can't chdir to ".$self->{'tmp_dir'}.": $!";
265    
266            my $opt = "-v 4";
267    
268            unless ($self->{'use_stopwrods'}) {
269                    CORE::open(STOP, '>', "_stopwords_") || carp "can't create empty stopword file, skipping\n";
270                    print STOP "  ";
271                    close(STOP);
272                    $opt .= " -s _stopwords_";
273            }
274    
275            my $open_cmd = '| '.$self->{'index'}.' '.$opt.' -e "html:*" -i '.$self->{'index_dir'}.'/index -';
276    
277    
278            CORE::open($self->{'index_fh'}, $open_cmd) || confess "can't start index with $open_cmd: $!";
279    
280    
281            return $self->{'index_fh'};
282    }
283    
284    =head2 _create_doc
285    
286    Create temporary file and pass its name to swish++
287    
288      $i->_create_doc(
289            path => 'path/to/store/in/index',
290            title => 'this is title in results',
291            body => 'data to story in body tag',
292            meta => {
293                    'meta name' => 'data for this meta',
294                    'another' => 'again more data',
295            }
296      );
297    
298    To delete document, just omit body and meta data.
299    
300    =cut
301    
302    sub _create_doc {
303            my $self = shift;
304    
305            my $arg = {@_};
306    
307            # open indexer if needed
308            $self->{'index_fh'} ||= $self->_init_indexer;
309    
310            my $path = $self->{'tmp_dir'} || confess "no tmp_dir?";
311    
312            CORE::open(TMP, '>', $arg->{'path'}) || die "can't create temp file ".$arg->{'path'}.": $!";
313    
314            print TMP '<html><head>';
315    
316            $arg->{'body'} ||= '';
317    
318            if ($arg->{'meta'}) {
319                    confess "not yet implemented";
320            }
321    
322            if (defined($arg->{'title'})) {
323                    print TMP '<title>' . ($arg->{'title'} || '') . '</title>';
324                    $arg->{'body'} .= " ".$arg->{'title'} if ($self->{'meta_in_body'});
325            }
326    
327            print TMP '</head><body>' . $arg->{'body'} . '</body></html>';
328            
329            close(TMP) || confess "can't close tmp file ".$arg->{'path'}.": $!";
330    
331            print { $self->{'index_fh'} } $arg->{'path'}."\n";
332    }
333    
334    =head2 _close_index
335    
336    Close index after indexing.
337    
338      $i->_close_index;
339    
340    You have to close index before searching.
341    
342    =cut
343    
344    sub _close_index {
345            my $self = shift;
346    
347            return unless ($self->{'index_fh'});
348    
349            print STDERR "## close index\n" if ($self->{'debug'});
350    
351            close($self->{'index_fh'});
352            undef $self->{'index_fh'};
353    }
354    
355  1;  1;
356  __END__  __END__
357    
# Line 69  __END__ Line 359  __END__
359    
360  None by default.  None by default.
361    
362  =head1 SEE ALSO  =head1 RELATED
363    
364    =head2 Debian
365    
366  Mention other useful documentation such as the documentation of  Debian version of swish++ is often old (version 5 at moment of this writing
367  related modules or operating system documentation (such as man pages  while version 6 is available in source code), so this module by default
368  in UNIX), or any relevant external documentation such as RFCs or  uses executable names B<index> and B<search> for self-compiled version
369  standards.  instead of one from Debian package. See L<open> how to specify Debian
370    default binaries B<index++> and B<search++>.
371    
372    =head2 SWISH++
373    
374    Aside from being a very good rewrite in C++, SWISH++ is faster because it has
375    clever heuristics about which data in input files are words to index and
376    which are not. It's based on the English language and might be the best choice if
377    you plan to index a large amount of long text documents.
378    
379    However, if you plan to index all data from structured storage (e.g. RDBMS)
380    you might want B<all> words from data to end up in index as opposed to just
381    those which look like English words. This is especially important if you
382    don't plan to index English texts with this module.
383    
384    With distribution build versions of SWISH++ you might have problems with
385    disappearing words. To overcome this problem, you will have to compile and
386    configure SWISH++ yourself (because language characteristics are
387    compilation-time option).
388    
389    Compilation of SWISH++ is easy process well described on project's web
390    pages. To see my very relaxed sample configuration take a look at C<swish++>
391    directory included in distribution.
392    
393  If you have a mailing list set up for your module, mention it here.  =head1 SEE ALSO
394    
395  If you have a web site set up for your module, mention it here.  C<swish++> web site L<http://homepage.mac.com/pauljlucas/software/swish/>
396    
397  =head1 AUTHOR  =head1 AUTHOR
398    
399  Dobrica Pavlinusic, E<lt>dpavlin@E<gt>  Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
400    
401  =head1 COPYRIGHT AND LICENSE  =head1 COPYRIGHT AND LICENSE
402    

Legend:
Removed from v.1  
changed lines
  Added in v.9

  ViewVC Help
Powered by ViewVC 1.1.26