/[SWISH-PlusPlus]/trunk/PlusPlus.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/PlusPlus.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 5 - (hide annotations)
Fri Dec 3 21:48:15 2004 UTC (19 years, 4 months ago) by dpavlin
File size: 6319 byte(s)
more documentation

1 dpavlin 1 package SWISH::PlusPlus;
2    
3     use 5.008004;
4     use strict;
5     use warnings;
6    
7 dpavlin 3 our $VERSION = '0.02';
8 dpavlin 1
9     use Carp;
10 dpavlin 4 use File::Temp qw/ tempdir /;
11 dpavlin 1
12     =head1 NAME
13    
14     SWISH::PlusPlus - Perl extension for SWISH++
15    
16     =head1 SYNOPSIS
17    
18     use SWISH::PlusPlus;
19     blah blah blah
20    
21     =head1 DESCRIPTION
22    
23     This is perl module to use SWISH++ indexer by Paul J. Lucas. SWISH++ is
24 dpavlin 3 rewrite of swish-e in C++ which is extremely fast (thanks to mmap), but without
25     support for properties (which this module tries to fix).
26 dpavlin 1
27 dpavlin 3 Implementation of this module is crafted after L<Plucene::Simple> and it
28     should be easy to replace Plucene with this module for increased
29     performance. However, this module is not plug-in replacement.
30    
31 dpavlin 1 =head1 METHODS
32    
33     =head2 open
34    
35     Create new indexing object.
36    
37 dpavlin 3 my $i = SWISH::PlusPlus->open(
38     index_dir => '/path/to/index',
39     index => 'index++',
40     search => 'search++',
41 dpavlin 1 );
42    
43     Options to open are following:
44    
45     =over 5
46    
47 dpavlin 3 =item C<index_dir>
48    
49     Path to directory in which index will be created.
50    
51 dpavlin 1 =item C<index>
52    
53 dpavlin 3 Full or partial path to SWISH++ index executable. By default, it's B<index>
54     for self-compiled version. If you use Debian GNU/Linux package specify
55     B<index++>. See C<Debian>.
56 dpavlin 1
57 dpavlin 3 =item C<search>
58    
59     Full or partial path to SWISH++ search executable. By default, it's B<search>.
60    
61 dpavlin 1 =back
62    
63     =cut
64    
# Constructor: build an indexing object from key/value options.
#
# Options (stored verbatim in the object hash):
#   index_dir - directory that holds the index (required; created if missing)
#   index     - SWISH++ index executable  (default: 'index')
#   search    - SWISH++ search executable (default: 'search')
#
# Returns the blessed object.  Croaks when index_dir is not supplied and
# confesses when the index directory does not exist and cannot be created.
sub open {
    my $class = shift;
    my $self  = bless {@_}, $class;

    foreach my $required (qw(index_dir)) {
        croak "need $required" unless $self->{$required};
    }

    # default executables
    $self->{'index'}  ||= 'index';
    $self->{'search'} ||= 'search';

    if (! -e $self->{'index_dir'}) {
        # Low-precedence "or" so the check applies to mkdir's return value
        # (with "||" it bound to the argument and failures went unnoticed).
        # The message names the directory that could not be created.
        mkdir($self->{'index_dir'})
            or confess "can't create index $self->{'index_dir'}: $!";
    }

    return $self;
}
84    
85    
86 dpavlin 3 =head2 check_bin
87    
88     Check if swish++ binaries specified in L<open> are available and verify
89     version signature.
90    
91     if ($i->check_bin) {
92     print "swish++ binaries found\n";
93     };
94    
95     It will also setup property
96    
97     $i->{'version'}
98    
99     which you can examine to see version.
100    
101     =cut
102    
# Run both configured executables with -V and make sure each one
# identifies itself as SWISH++.
#
# Confesses when either command produces no output or its output does not
# start with "SWISH++".  When both report the same banner it is stored in
# $self->{'version'} and 1 is returned; otherwise a warning is issued and
# nothing is returned.
sub check_bin {
    my ($self) = @_;

    my $index_banner = `$self->{'index'} -V 2>&1`;
    confess "can't find '", $self->{'index'}, "' binary" unless $index_banner;

    my $search_banner = `$self->{'search'} -V 2>&1`;
    confess "can't find '", $self->{'search'}, "' binary" unless $search_banner;

    chomp($index_banner, $search_banner);

    $index_banner =~ m/^SWISH\+\+/
        or confess $self->{'index'}, " binary is not SWISH++";
    $search_banner =~ m/^SWISH\+\+/
        or confess $self->{'search'}, " binary is not SWISH++";

    # Mismatching banners are reported but are not fatal.
    unless ($index_banner eq $search_banner) {
        carp "version difference: index is $index_banner while search is $search_banner";
        return;
    }

    $self->{'version'} = $index_banner;
    return 1;
}
124    
125 dpavlin 4 =head2 index_document
126    
127     Quick way to add simple data to index.
128    
129     $i->index_document($key, $data);
130     $i->index_document( 42 => 'meaning of life' );
131    
132     =cut
133    
# Add one or more simple documents to the index.
#
# Takes a flat list of key => data pairs; each pair is handed to
# _create_doc with the key as 'path' and the data as 'body'.
# Always returns 1.
sub index_document {
    my ($self, %body_for) = @_;

    $self->_create_doc( path => $_, body => $body_for{$_} )
        for keys %body_for;

    return 1;
}
148    
149     =head1 PRIVATE METHODS
150    
151     Private methods implement internals for creating temporary file needed for
152     swish++. You should have no need to call them directly, and they are here
153     just to have documentation.
154    
155     =head2 _init_index
156    
157     Create temporary directory in which files for indexing will be created and
158     start index process.
159    
160     $i->_init_index || die "can't start indexer";
161    
162     =cut
163    
# Create the temporary working directory, change into it and start the
# SWISH++ indexer as a child process reading file names from a pipe.
#
# Sets $self->{'tmp_dir'} (removed automatically at program exit) and
# $self->{'index_fh'} (write end of the pipe to the indexer).
# Returns the pipe filehandle; confesses if the directory cannot be
# created or entered, or if the indexer cannot be started.
#
# NOTE(review): the command is run through the shell and neither the
# executable nor index_dir is quoted -- paths containing whitespace or
# shell metacharacters will not work; confirm whether callers need that.
sub _init_index {
    my $self = shift;

    $self->{'tmp_dir'} = tempdir( CLEANUP => 1 )
        || confess "can't create temporary directory: $!";

    my $opt = "-v 4";

    # Use the executable configured in open() instead of a hard-coded
    # name, so that e.g. Debian's "index++" is honoured.
    my $index_bin = $self->{'index'} || 'index';

    # Resolve index_dir before the chdir below; a relative path would
    # otherwise end up pointing inside the temporary directory.
    require File::Spec;
    my $index_dir = File::Spec->rel2abs( $self->{'index_dir'} );

    my $index_cmd = $index_bin . ' ' . $opt . ' -e "html:*" -i ' . $index_dir . '/index -';

    # Low-precedence "or" so that a failing chdir is actually detected
    # (with "||" the test applied to the directory name).
    chdir( $self->{'tmp_dir'} )
        or confess "can't chdir to " . $self->{'tmp_dir'} . ": $!";

    CORE::open( $self->{'index_fh'}, '|-', $index_cmd )
        || confess "can't start index with | $index_cmd: $!";

    return $self->{'index_fh'};
}
179    
180     =head2 _create_doc
181    
182     Create temporary file and pass its name to swish++
183    
184     $i->_create_doc(
185     path => 'path/to/store/in/index',
186     body => 'data to store in body tag',
187     meta => {
188     'meta name' => 'data for this meta',
189     'another' => 'again more data',
190     }
191     );
192    
193     =cut
194    
# Write one document as a minimal HTML file inside the temporary
# directory and pass its name to the running indexer.
#
# Named arguments:
#   path - name under which the document is stored (required)
#   body - text placed inside the <body> element (optional)
#   meta - hash of meta data; not supported yet, confesses if given
#
# Starts the indexer via _init_index on first use.  Returns 1 on success
# and confesses on any failure.
#
# NOTE(review): body is written verbatim, without HTML escaping, exactly
# as before -- confirm whether callers rely on passing markup through.
sub _create_doc {
    my $self = shift;

    my $arg = {@_};

    # Reject unsupported or unusable input before touching the file
    # system, so no half-written file or open handle is left behind.
    confess "not yet implemented" if $arg->{'meta'};
    confess "need path"
        unless defined $arg->{'path'} && length $arg->{'path'};

    # open indexer if needed
    $self->{'index_fh'} ||= $self->_init_index;

    my $tmp_dir = $self->{'tmp_dir'} || confess "no tmp_dir?";

    # The indexer resolves names relative to the temporary directory it
    # was started in; create the file there even if the caller has
    # changed the working directory in the meantime.
    require File::Spec;
    my $file = File::Spec->rel2abs( $arg->{'path'}, $tmp_dir );

    CORE::open( my $tmp_fh, '>', $file )
        || confess "can't create temp file " . $arg->{'path'} . ": $!";

    # defined-test rather than "||" so that a body of "0" is preserved
    my $body = defined $arg->{'body'} ? $arg->{'body'} : '';

    print {$tmp_fh} '<html>', '<body>' . $body . '</body></html>';

    close($tmp_fh)
        || confess "can't close tmp file " . $arg->{'path'} . ": $!";

    print { $self->{'index_fh'} } $arg->{'path'} . "\n"
        or confess "can't pass " . $arg->{'path'} . " to indexer: $!";

    return 1;
}
219    
220 dpavlin 1 1;
221     __END__
222    
223     =head2 EXPORT
224    
225     None by default.
226    
227 dpavlin 3 =head1 RELATED
228    
229     =head2 Debian
230    
231     Debian version of swish++ is often old (version 5 at moment of this writing
232     while version 6 is available in source code), so this module by default
233     uses executable names B<index> and B<search> for self-compiled version
234     instead of one from Debian package. See L<open> how to specify Debian
235     default binaries B<index++> and B<search++>.
236    
237 dpavlin 5 =head2 SWISH++
238 dpavlin 1
239 dpavlin 5 Aside from very good rewrite in C++, SWISH++ is faster because it has
240     clever heuristics about which data in input files are words to index and
241     which are not. It's based on English language and might be best choice if
242     you plan to install large amount of long text documents.
243 dpavlin 1
244 dpavlin 5 However, if you plan to index all data from structured storage (e.g. RDBMS)
245     you might want B<all> words from data to end up in index as opposed to just
246     those which look like English words. This is especially important if you
247     don't plan to index English texts with this module.
248 dpavlin 1
249 dpavlin 5 With distribution build versions of SWISH++ you might have problems with
250     disappearing words. To overcome this problem, you will have to compile and
251     configure SWISH++ yourself (because language characteristics are
252     compilation-time option).
253 dpavlin 1
254 dpavlin 5 Compilation of SWISH++ is easy process well described on project's web
255     pages. To see my very relaxed sample configuration take a look at C<swish++>
256     directory included in distribution.
257    
258     =head1 SEE ALSO
259    
260     C<swish++> web site L<http://homepage.mac.com/pauljlucas/software/swish/>
261    
262 dpavlin 1 =head1 AUTHOR
263    
264 dpavlin 5 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
265 dpavlin 1
266     =head1 COPYRIGHT AND LICENSE
267    
268     Copyright (C) 2004 by Dobrica Pavlinusic
269    
270     This library is free software; you can redistribute it and/or modify
271     it under the same terms as Perl itself, either Perl version 5.8.4 or,
272     at your option, any later version of Perl 5 you may have available.
273    
274    
275     =cut

  ViewVC Help
Powered by ViewVC 1.1.26