/[SWISH-PlusPlus]/trunk/PlusPlus.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/PlusPlus.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1 by dpavlin, Fri Dec 3 13:31:43 2004 UTC revision 8 by dpavlin, Sat Dec 4 17:49:20 2004 UTC
# Line 4  use 5.008004; Line 4  use 5.008004;
4  use strict;  use strict;
5  use warnings;  use warnings;
6    
7  our $VERSION = '0.01';  our $VERSION = '0.02';
8    
9  use Carp;  use Carp;
10    use File::Temp qw/ tempdir /;
11    #use YAML;
12    
13  =head1 NAME  =head1 NAME
14    
# Line 20  SWISH::PlusPlus - Perl extension SWISH++ Line 22  SWISH::PlusPlus - Perl extension SWISH++
22  =head1 DESCRIPTION  =head1 DESCRIPTION
23    
24  This is perl module to use SWISH++ indexer by Paul J. Lucas. SWISH++ is  This is perl module to use SWISH++ indexer by Paul J. Lucas. SWISH++ is
25  rewrite of swish-e in C++ with blazingly fast performance, but without  rewrite of swish-e in C++ which is extremely fast (thanks to mmap), but without
26  support for properties (which this module tries to fix)  support for properties (which this module tries to fix).
27    
28    Implementation of this module is crafted after L<Plucene::Simple> and it
29    should be easy to replace Plucene with this module for increased
30    performance. However, this module is not plug-in replacement.
31    
32  =head1 METHODS  =head1 METHODS
33    
# Line 29  support for properties (which this modul Line 35  support for properties (which this modul
35    
36  Create new indexing object.  Create new indexing object.
37    
38    my $i = new SWISH::PlusPlus(    my $i = SWISH::PlusPlus->open(
39          index => '/path/to/index',          index_dir => '/path/to/index',
40            index => 'index++',
41            search => 'search++',
42            debug => 1,
43    );    );
44    
45  Options to open are following:  Options to open are following:
46    
47  =over 5  =over 5
48    
49    =item C<index_dir>
50    
51    Path to directory in which index will be created.
52    
53  =item C<index>  =item C<index>
54    
55  path to directory in which index will be created.  Full or partial path to SWISH++ index executable. By default, it's B<index>
56    for self-compiled version. If you use Debian GNU/Linux package specify
57    B<index++>. See C<Debian>.
58    
59    =item C<search>
60    
61    Full or partial path to SWISH++ search executable. By default, it's B<search>.
62    
63    =item C<debug>
64    
65    This option (off by default) will produce a lot of debugging output on
66    C<STDERR> prefixed by C<##>.
67    
68  =back  =back
69    
70  =cut  =cut
71    
72  sub new {  sub open {
73          my $class = shift;          my $class = shift;
74          my $self = {@_};          my $self = {@_};
75          bless($self, $class);          bless($self, $class);
76    
77          foreach (qw(index)) {          foreach (qw(index_dir)) {
78                  croak "need $_" unless $self->{$_};                  croak "need $_" unless $self->{$_};
79          }          }
80    
81          if (! -e $self->{'index'}) {          if (! -e $self->{'index_dir'}) {
82                  mkdir $self->{'index'} || confess "can't create index ",$self->{'index'},": $!";                  mkdir $self->{'index_dir'} || confess "can't create index ",$self->{'index'},": $!";
83          }          }
84    
85            # default executables
86            $self->{'index'} ||= 'index';
87            $self->{'search'} ||= 'search';
88    
89            print STDERR "## open index_dir: ",$self->{'index_dir'}," index: ",$self->{'index'}, " search: ",$self->{'search'},"\n" if ($self->{'debug'});
90    
91          $self ? return $self : return undef;          $self ? return $self : return undef;
92  }  }
93    
94    
95    =head2 check_bin
96    
97    Check if swish++ binaries specified in L<open> are available and verify
98    version signature.
99    
100      if ($i->check_bin) {
101            print "swish++ binaries found\n";
102      };
103    
104    It will also setup property
105    
106      $i->{'version'}
107    
108    which you can examine to see version.
109    
110    =cut
111    
112    sub check_bin {
113            my $self = shift;
114    
115            my $i = `$self->{'index'} -V 2>&1` || confess "can't find '",$self->{'index'},"' binary";
116            my $s = `$self->{'search'} -V 2>&1` || confess "can't find '",$self->{'search'},"' binary";
117    
118            chomp $i;
119            chomp $s;
120    
121            confess $self->{'index'}," binary is not SWISH++" unless ($i =~ m/^SWISH\+\+/);
122            confess $self->{'search'}," binary is not SWISH++" unless ($s =~ m/^SWISH\+\+/);
123    
124            if ($i eq $s) {
125                    $self->{'version'} = $i;
126                    return 1;
127            } else  {
128                    carp "version difference: index is $i while search is $s";
129                    return;
130            }
131    
132    }
133    
134    =head2 index_document
135    
136    Quick way to add simple data to index.
137    
138      $i->index_document($key, $data);
139      $i->index_document( 42 => 'meaning of life' );
140    
141    =cut
142    
143    sub index_document {
144            my $self = shift;
145    
146            my %doc = @_;
147    
148            foreach my $id (keys %doc) {
149                    $self->_create_doc(
150                            path => $id,
151                            body => $doc{$id},
152                    );
153            }
154    
155            return 1;
156    }
157    
158    =head2 search
159    
160    Search your index.
161    
162      my @results = $i->search("swish query");
163    
164    Returns array with result IDs.
165    
166    =cut
167    
168    sub search {
169            my $self = shift;
170    
171            my $query = shift || return;
172    
173            $self->_close_index;
174    
175            my @results;
176    
177            # escape double quotes in query for shell
178            $query =~ s/"/\\"/g;
179    
180            my $open_cmd = $self->{'search'}." -i ".$self->{'index_dir'}.'/index "'.$query.'" |';
181            print STDERR "## search $open_cmd\n" if ($self->{'debug'});
182    
183            CORE::open(SEARCH, $open_cmd) || confess "can't start $open_cmd: $!";
184            while(<SEARCH>) {
185                    next if (/^#/);
186                    chomp;
187                    print STDERR "## $_\n" if ($self->{'debug'});
188                    my ($rank,$path,$size,$title) = split(/ /,$_,4);
189                    push @results, {
190                            rank => $rank,
191                            path => $path,
192                            size => $size,
193                            title => $title,
194                    }
195            }
196    
197            close(SEARCH) || confess "can't close search";
198    
199            #print STDERR "## results: ",Dump(@results),"\n" if ($self->{'debug'});
200    
201            return @results;
202    }
203    
204    =head1 PRIVATE METHODS
205    
206    Private methods implement internals for creating temporary file needed for
207    swish++. You should have no need to call them directly, and they are here
208    just to have documentation.
209    
210    =head2 _init_index
211    
212    Create temporary directory in which files for indexing will be created and
213    start index process.
214    
215      $i->_init_index || die "can't start indexer";
216    
217    =cut
218    
219    sub _init_index {
220            my $self = shift;
221    
222            $self->{'tmp_dir'} = tempdir( CLEANUP => 1 ) || confess "can't create temporary directory: $!";
223    
224            my $opt = "-v 4";
225    
226            my $open_cmd = '| '.$self->{'index'}.' '.$opt.' -e "html:*" -i '.$self->{'index_dir'}.'/index -';
227    
228            chdir $self->{'tmp_dir'} || confess "can't chdir to ".$self->{'tmp_dir'}.": $!";
229    
230            CORE::open($self->{'index_fh'}, $open_cmd) || confess "can't start index with $open_cmd: $!";
231    
232            return $self->{'index_fh'};
233    }
234    
235    =head2 _create_doc
236    
237    Create temporary file and pass its name to swish++
238    
239      $i->_create_doc(
240            path => 'path/to/store/in/index',
241            body => 'data to store in body tag',
242            meta => {
243                    'meta name' => 'data for this meta',
244                    'another' => 'again more data',
245            }
246      );
247    
248    To delete document, just omit body and meta data.
249    
250    =cut
251    
252    sub _create_doc {
253            my $self = shift;
254    
255            my $arg = {@_};
256    
257            # open indexer if needed
258            $self->{'index_fh'} ||= $self->_init_index;
259    
260            my $path = $self->{'tmp_dir'} || confess "no tmp_dir?";
261    
262            CORE::open(TMP, '>', $arg->{'path'}) || die "can't create temp file ".$arg->{'path'}.": $!";
263    
264            print TMP '<html>';
265    
266            if ($arg->{'meta'}) {
267                    confess "not yet implemented";
268            }
269            
270            print TMP '<body>' . ($arg->{'body'} || '') . '</body></html>';
271            
272            close(TMP) || confess "can't close tmp file ".$arg->{'path'}.": $!";
273    
274            print { $self->{'index_fh'} } $arg->{'path'}."\n";
275    }
276    
277    =head2 _close_index
278    
279    Close index after indexing.
280    
281      $i->_close_index;
282    
283    You have to close index before searching.
284    
285    =cut
286    
287    sub _close_index {
288            my $self = shift;
289    
290            return unless ($self->{'index_fh'});
291    
292            print STDERR "## close index\n" if ($self->{'debug'});
293    
294            close($self->{'index_fh'});
295            undef $self->{'index_fh'};
296    }
297    
298  1;  1;
299  __END__  __END__
300    
# Line 69  __END__ Line 302  __END__
302    
303  None by default.  None by default.
304    
305  =head1 SEE ALSO  =head1 RELATED
306    
307    =head2 Debian
308    
309  Mention other useful documentation such as the documentation of  Debian version of swish++ is often old (version 5 at moment of this writing
310  related modules or operating system documentation (such as man pages  while version 6 is available in source code), so this module by default
311  in UNIX), or any relevant external documentation such as RFCs or  uses executable names B<index> and B<search> for self-compiled version
312  standards.  instead of one from Debian package. See L<open> how to specify Debian
313    default binaries B<index++> and B<search++>.
314    
315    =head2 SWISH++
316    
317    Aside from very good rewrite in C++, SWISH++ is faster because it has
318    clever heuristics about which data in input files are words to index and
319    which are not. It's based on English language and might be best choice if
320    you plan to install large amount of long text documents.
321    
322    However, if you plan to index all data from structured storage (e.g. RDBMS)
323    you might want B<all> words from data to end up in index as opposed to just
324    those which look like English words. This is especially important if you
325    don't plan to index English texts with this module.
326    
327    With distribution build versions of SWISH++ you might have problems with
328    disappearing words. To overcome this problem, you will have to compile and
329    configure SWISH++ yourself (because language characteristics are
330    compilation-time option).
331    
332    Compilation of SWISH++ is easy process well described on project's web
333    pages. To see my very relaxed sample configuration take a look at C<swish++>
334    directory included in distribution.
335    
336  If you have a mailing list set up for your module, mention it here.  =head1 SEE ALSO
337    
338  If you have a web site set up for your module, mention it here.  C<swish++> web site L<http://homepage.mac.com/pauljlucas/software/swish/>
339    
340  =head1 AUTHOR  =head1 AUTHOR
341    
342  Dobrica Pavlinusic, E<lt>dpavlin@E<gt>  Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
343    
344  =head1 COPYRIGHT AND LICENSE  =head1 COPYRIGHT AND LICENSE
345    

Legend:
Removed from v.1  
changed lines
  Added in v.8

  ViewVC Help
Powered by ViewVC 1.1.26