/[SWISH-PlusPlus]/trunk/PlusPlus.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/PlusPlus.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 5 - (show annotations)
Fri Dec 3 21:48:15 2004 UTC (19 years, 4 months ago) by dpavlin
File size: 6319 byte(s)
more documentation

1 package SWISH::PlusPlus;
2
3 use 5.008004;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.02';
8
9 use Carp;
10 use File::Temp qw/ tempdir /;
11
12 =head1 NAME
13
14 SWISH::PlusPlus - Perl extension for SWISH++
15
16 =head1 SYNOPSIS
17
18 use SWISH::PlusPlus;
19 blah blah blah
20
21 =head1 DESCRIPTION
22
23 This is a Perl module for using the SWISH++ indexer by Paul J. Lucas. SWISH++ is
24 a rewrite of swish-e in C++ which is extremely fast (thanks to mmap), but without
25 support for properties (which this module tries to fix).
26
27 Implementation of this module is crafted after L<Plucene::Simple> and it
28 should be easy to replace Plucene with this module for increased
29 performance. However, this module is not a plug-in replacement.
30
31 =head1 METHODS
32
33 =head2 open
34
35 Create new indexing object.
36
37 my $i = SWISH::PlusPlus->open(
38 index_dir => '/path/to/index',
39 index => 'index++',
40 search => 'search++',
41 );
42
43 Options to open are following:
44
45 =over 5
46
47 =item C<index_dir>
48
49 Path to directory in which index will be created.
50
51 =item C<index>
52
53 Full or partial path to SWISH++ index executable. By default, it's B<index>
54 for self-compiled version. If you use Debian GNU/Linux package specify
55 B<index++>. See C<Debian>.
56
57 =item C<search>
58
59 Full or partial path to SWISH++ search executable. By default, it's B<search>.
60
61 =back
62
63 =cut
64
# Constructor: build a SWISH::PlusPlus object from key => value options.
#
# Options:
#   index_dir - directory holding the index (required; created if missing)
#   index     - name or path of the SWISH++ index binary  (default 'index')
#   search    - name or path of the SWISH++ search binary (default 'search')
#
# Returns the blessed object. Croaks when index_dir is not given and
# confesses when the index directory cannot be created.
sub open {
    my $class = shift;
    my $self  = bless {@_}, $class;

    foreach my $required (qw(index_dir)) {
        croak "need $required" unless $self->{$required};
    }

    my $dir = $self->{'index_dir'};
    if (! -e $dir) {
        # Low-precedence "or" so the failure check applies to mkdir's
        # return value and not to its argument.
        mkdir($dir) or confess "can't create index directory $dir: $!";
    }

    # default executables
    $self->{'index'}  ||= 'index';
    $self->{'search'} ||= 'search';

    return $self;
}
84
85
86 =head2 check_bin
87
88 Check if swish++ binaries specified in L<open> are available and verify
89 version signature.
90
91 if ($i->check_bin) {
92 print "swish++ binaries found\n";
93 };
94
95 It will also setup property
96
97 $i->{'version'}
98
99 which you can examine to see version.
100
101 =cut
102
# Run both configured binaries with -V, make sure each one identifies
# itself as SWISH++, and compare the two version banners.
#
# Returns 1 and stores the banner in $self->{'version'} when both binaries
# report the same version; warns (carp) and returns nothing when they
# differ. Confesses when a binary produces no output or its banner does
# not start with "SWISH++".
sub check_bin {
    my ($self) = @_;

    my @tools = qw(index search);
    my %banner;

    # Collect the version banner of every binary first ...
    for my $tool (@tools) {
        my $out = `$self->{$tool} -V 2>&1` || confess "can't find '",$self->{$tool},"' binary";
        chomp $out;
        $banner{$tool} = $out;
    }

    # ... then validate them in the same order.
    for my $tool (@tools) {
        next if $banner{$tool} =~ m/^SWISH\+\+/;
        confess $self->{$tool}," binary is not SWISH++";
    }

    my ($i, $s) = @banner{@tools};

    if ($i ne $s) {
        carp "version difference: index is $i while search is $s";
        return;
    }

    $self->{'version'} = $i;
    return 1;
}
124
125 =head2 index_document
126
127 Quick way to add simple data to index.
128
129 $i->index_document($key, $data);
130 $i->index_document( 42 => 'meaning of life' );
131
132 =cut
133
# Add one or more simple documents to the index.
#
# Arguments after the object are key => data pairs; every pair is handed
# to _create_doc with the key as the document path and the data as its
# body. Always returns 1.
sub index_document {
    my ($self, %doc) = @_;

    while (my ($id, $text) = each %doc) {
        $self->_create_doc(path => $id, body => $text);
    }

    return 1;
}
148
149 =head1 PRIVATE METHODS
150
151 Private methods implement internals for creating temporary file needed for
152 swish++. You should have no need to call them directly, and they are here
153 just to have documentation.
154
155 =head2 _init_index
156
157 Create temporary directory in which files for indexing will be created and
158 start index process.
159
160 $i->_init_index || die "can't start indexer";
161
162 =cut
163
# Create the temporary working directory, change into it and start the
# SWISH++ indexer reading file names from its standard input.
#
# Side effects: sets $self->{'tmp_dir'} and $self->{'index_fh'} and
# changes the current working directory of the process to the temporary
# directory.
#
# Returns the write handle connected to the indexer. Confesses when the
# temporary directory, the chdir or the indexer start-up fails.
sub _init_index {
    my $self = shift;

    require File::Spec;

    confess "need index_dir" unless $self->{'index_dir'};

    $self->{'tmp_dir'} = tempdir( CLEANUP => 1 ) || confess "can't create temporary directory: $!";

    # Resolve the index location before changing directory; otherwise a
    # relative index_dir would be looked up inside the temporary directory.
    my $index_file = File::Spec->catfile(
        File::Spec->rel2abs($self->{'index_dir'}), 'index'
    );

    # Use the configured binary (falling back to 'index') and pass the
    # arguments as a list so that no shell quoting is involved.
    my @cmd = (
        $self->{'index'} || 'index',
        '-v', '4',
        '-e', 'html:*',
        '-i', $index_file,
        '-',
    );

    # Low-precedence "or" so the check applies to chdir's return value.
    chdir($self->{'tmp_dir'}) or confess "can't chdir to ".$self->{'tmp_dir'}.": $!";

    CORE::open(my $fh, '|-', @cmd) or confess "can't start index with @cmd: $!";
    $self->{'index_fh'} = $fh;

    return $self->{'index_fh'};
}
179
180 =head2 _create_doc
181
182 Create temporary file and pass its name to swish++
183
184 $i->_create_doc(
185 path => 'path/to/store/in/index',
186 body => 'data to store in body tag',
187 meta => {
188 'meta name' => 'data for this meta',
189 'another' => 'again more data',
190 }
191 );
192
193 =cut
194
# Write one document as a small HTML file inside the temporary directory
# and send its relative path to the running indexer.
#
# Named arguments:
#   path - path under which the document is stored in the index (required)
#   body - text placed inside the <body> element (optional)
#   meta - not yet implemented; passing it is a fatal error
#
# Starts the indexer through _init_index when it is not running yet.
# Returns 1 on success and confesses on any failure.
#
# NOTE: the body is written verbatim; markup characters in it are not
# escaped.
sub _create_doc {
    my $self = shift;

    my $arg = {@_};

    confess "need path" unless defined $arg->{'path'} && length $arg->{'path'};

    # Reject unsupported input before touching the file system.
    if ($arg->{'meta'}) {
        confess "not yet implemented";
    }

    # open indexer if needed
    $self->{'index_fh'} ||= $self->_init_index;

    my $tmp_dir = $self->{'tmp_dir'} || confess "no tmp_dir?";

    require File::Basename;
    require File::Path;
    require File::Spec;

    # Address the file through tmp_dir explicitly instead of relying on the
    # current working directory, and create missing parent directories so
    # that paths such as 'path/to/doc' work.
    my $file   = File::Spec->catfile($tmp_dir, $arg->{'path'});
    my $parent = File::Basename::dirname($file);
    File::Path::mkpath($parent) unless -d $parent;

    # A body of '0' is valid content, so test for definedness, not truth.
    my $body = defined $arg->{'body'} ? $arg->{'body'} : '';

    CORE::open(my $tmp_fh, '>', $file) || confess "can't create temp file $file: $!";

    print {$tmp_fh} '<html>', '<body>', $body, '</body></html>'
        or confess "can't write tmp file $file: $!";

    close($tmp_fh) || confess "can't close tmp file $file: $!";

    # The indexer receives the relative path.
    print { $self->{'index_fh'} } $arg->{'path'}."\n"
        or confess "can't pass ".$arg->{'path'}." to indexer: $!";

    return 1;
}
219
220 1;
221 __END__
222
223 =head2 EXPORT
224
225 None by default.
226
227 =head1 RELATED
228
229 =head2 Debian
230
231 Debian version of swish++ is often old (version 5 at moment of this writing
232 while version 6 is available in source code), so this module by default
233 uses executable names B<index> and B<search> for self-compiled version
234 instead of one from Debian package. See L<open> how to specify Debian
235 default binaries B<index++> and B<search++>.
236
237 =head2 SWISH++
238
239 Aside from being a very good rewrite in C++, SWISH++ is faster because it has
240 clever heuristics about which data in input files are words to index and
241 which are not. It's based on English language and might be best choice if
242 you plan to install large amount of long text documents.
243
244 However, if you plan to index all data from structured storage (e.g. RDBMS)
245 you might want B<all> words from data to end up in index as opposed to just
246 those which look like English words. This is especially important if you
247 don't plan to index English texts with this module.
248
249 With distribution-built versions of SWISH++ you might have problems with
250 disappearing words. To overcome this problem, you will have to compile and
251 configure SWISH++ yourself (because language characteristics are
252 compilation-time option).
253
254 Compilation of SWISH++ is easy process well described on project's web
255 pages. To see my very relaxed sample configuration take a look at C<swish++>
256 directory included in distribution.
257
258 =head1 SEE ALSO
259
260 C<swish++> web site L<http://homepage.mac.com/pauljlucas/software/swish/>
261
262 =head1 AUTHOR
263
264 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
265
266 =head1 COPYRIGHT AND LICENSE
267
268 Copyright (C) 2004 by Dobrica Pavlinusic
269
270 This library is free software; you can redistribute it and/or modify
271 it under the same terms as Perl itself, either Perl version 5.8.4 or,
272 at your option, any later version of Perl 5 you may have available.
273
274
275 =cut

  ViewVC Help
Powered by ViewVC 1.1.26