/[SWISH-PlusPlus]/trunk/PlusPlus.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/PlusPlus.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 9 - (show annotations)
Sun Dec 5 00:59:50 2004 UTC (16 years ago) by dpavlin
File size: 9138 byte(s)
- new options to open: meta_in_body and use_stopwords
- new add method (same as _create_doc)
- renamed _init_index to _init_indexer
- by default, stopwords from swish++ will be disabled (use_stopword option to
  open to enables them)
- support for (only) title meta data
- support to include meta data in full text search (meta_in_body open opt)

1 package SWISH::PlusPlus;
2
3 use 5.008004;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.03';
8
9 use Carp;
10 use File::Temp qw/ tempdir /;
11 #use YAML;
12
13 =head1 NAME
14
15 SWISH::PlusPlus - Perl extension for SWISH++
16
17 =head1 SYNOPSIS
18
19 use SWISH::PlusPlus;
20 blah blah blah
21
22 =head1 DESCRIPTION
23
24 This is a Perl module for using the SWISH++ indexer by Paul J. Lucas. SWISH++ is a
25 rewrite of swish-e in C++ which is extremely fast (thanks to mmap), but without
26 support for properties (which this module tries to fix).
27
28 Implementation of this module is crafted after L<Plucene::Simple> and it
29 should be easy to replace Plucene with this module for increased
30 performance. However, this module is not plug-in replacement.
31
32 =head1 METHODS
33
34 =head2 open
35
36 Create new indexing object.
37
38 my $i = SWISH::PlusPlus->open(
39 index_dir => '/path/to/index',
40 index => 'index++',
41 search => 'search++',
42 debug => 1,
43 meta_in_body => 1,
44 use_stopwords => 1,
45 );
46
47 Options to open are following:
48
49 =over 5
50
51 =item C<index_dir>
52
53 Path to directory in which index will be created.
54
55 =item C<index>
56
57 Full or partial path to SWISH++ index executable. By default, it's B<index>
58 for self-compiled version. If you use Debian GNU/Linux package specify
59 B<index++>. See C<Debian>.
60
61 =item C<search>
62
63 Full or partial path to SWISH++ search executable. By default, it's B<search>.
64
65 =item C<debug>
66
67 This option (off by default) will produce a lot of debugging output on
68 C<STDERR> prefixed by C<##>.
69
70 =item C<meta_in_body>
71
72 This option (off by default) makes the content of meta fields searchable
73 without specifying them (as if they were in the body of the document). This will
74 somewhat increase index size.
75
76 =item C<use_stopwords>
77
78 Use built-in SWISH++ stop words. By default, they are disabled.
79
80 =back
81
82 =cut
83
# Construct a new SWISH::PlusPlus object.
#
# Arguments are a key => value list which becomes the object's hash:
#   index_dir - directory holding the index (required, created if missing)
#   index     - SWISH++ index executable  (defaults to 'index')
#   search    - SWISH++ search executable (defaults to 'search')
#   debug     - when true, trace to STDERR with a '##' prefix
# Any other keys (e.g. meta_in_body, use_stopwords) are stored as given.
#
# Returns the blessed object. Croaks when index_dir is missing and confesses
# when the index directory cannot be created.
sub open {
    my $class = shift;
    my $self  = bless {@_}, $class;

    foreach my $required (qw(index_dir)) {
        croak "need $required" unless $self->{$required};
    }

    if (! -e $self->{'index_dir'}) {
        # Low-precedence "or" is essential here: with "||" the alternative
        # binds to mkdir's argument, so a failed mkdir went unnoticed.
        mkdir($self->{'index_dir'})
            or confess "can't create index directory ", $self->{'index_dir'}, ": $!";
    }

    # default executables
    $self->{'index'}  ||= 'index';
    $self->{'search'} ||= 'search';

    print STDERR "## open index_dir: ",$self->{'index_dir'}," index: ",$self->{'index'}, " search: ",$self->{'search'},"\n" if ($self->{'debug'});

    return $self;
}
105
106
107 =head2 check_bin
108
109 Check if swish++ binaries specified in L<open> are available and verify
110 version signature.
111
112 if ($i->check_bin) {
113 print "swish++ binaries found\n";
114 };
115
116 It will also setup property
117
118 $i->{'version'}
119
120 which you can examine to see version.
121
122 =cut
123
# Run both configured executables with -V and compare their version banners.
#
# Returns 1 and stores the banner in $self->{'version'} when both report the
# same string. Returns nothing (after a carp) when the banners differ.
# Confesses when a command produces no output or when its output does not
# start with "SWISH++".
sub check_bin {
    my ($self) = @_;

    my @tools = qw(index search);
    my %banner;

    # collect the output of "<tool> -V" (stderr included) for both tools first
    for my $tool (@tools) {
        my $cmd = $self->{$tool} . ' -V 2>&1';
        $banner{$tool} = qx{$cmd}
            || confess "can't find '",$self->{$tool},"' binary";
    }

    # then make sure each banner really comes from SWISH++
    for my $tool (@tools) {
        chomp $banner{$tool};
        confess $self->{$tool}," binary is not SWISH++"
            unless ($banner{$tool} =~ m/^SWISH\+\+/);
    }

    if ($banner{'index'} ne $banner{'search'}) {
        carp "version difference: index is $banner{'index'} while search is $banner{'search'}";
        return;
    }

    $self->{'version'} = $banner{'index'};
    return 1;
}
145
146 =head2 index_document
147
148 Quick way to add simple data to index.
149
150 $i->index_document($key, $data);
151 $i->index_document( 42 => 'meaning of life' );
152
153 =cut
154
# Add plain documents to the index.
#
# Takes a list of key => data pairs; each pair is handed to _create_doc with
# the key as path and the data as body. Always returns 1.
sub index_document {
    my ($self, %body_of) = @_;

    while (my ($key, $text) = each %body_of) {
        $self->_create_doc(
            path => $key,
            body => $text,
        );
    }

    return 1;
}
169
170 =head2 add
171
172 Add document with metadata to index.
173
174 $i->add(
175 path => 'path/to/document',
176 title => 'this is result title',
177 meta => {
178 description => 'this is description meta tag',
179 date => '2004-11-04',
180 author => 'Dobrica Pavlinusic',
181 },
182 body => 'this is text without meta data',
183 );
184
185 This is thin wrapper round L<_create_doc>.
186
187 =cut
188
# Add one document with metadata to the index.
#
# All arguments are passed through unchanged to _create_doc; whatever that
# call returns is discarded and 1 is returned instead.
sub add {
    my ($self, @doc_args) = @_;

    $self->_create_doc(@doc_args);

    return 1;
}
196 =head2 search
197
198 Search your index.
199
200 my @results = $i->search("swish query");
201
202 Returns array of hash references, one per result, with keys C<rank>, C<path>, C<size> and C<title>.
203
204 =cut
205
# Run a query against the index and return the parsed result lines.
#
# Takes the query string; a false query (undef, '' or '0') returns nothing.
# The index is closed first (via _close_index) so pending documents are
# flushed before searching.
#
# Returns a list of hash references with keys rank, path, size and title,
# one for every output line of the search executable that does not start
# with '#'. Confesses when the search process cannot be started or closed.
sub search {
    my $self = shift;

    my $query = shift || return;

    $self->_close_index;

    # The command is run without a shell (list-form pipe open), so the query
    # reaches the search executable as exactly one argument. Characters like
    # $, ` or \ are never interpreted, and no quoting of the query is needed.
    my @cmd = (
        $self->{'search'},
        '-i', $self->{'index_dir'}.'/index',
        $query,
    );
    print STDERR "## search @cmd\n" if ($self->{'debug'});

    CORE::open(my $search_fh, '-|', @cmd) or confess "can't start @cmd: $!";

    my @results;
    while (my $line = <$search_fh>) {
        # lines starting with '#' are not results
        next if ($line =~ /^#/);
        chomp $line;
        print STDERR "## $line\n" if ($self->{'debug'});

        # only the first three spaces separate fields; the title keeps its own
        my ($rank, $path, $size, $title) = split(/ /, $line, 4);
        push @results, {
            rank  => $rank,
            path  => $path,
            size  => $size,
            title => $title,
        };
    }

    close($search_fh) or confess "can't close search";

    return @results;
}
241
242 =head1 PRIVATE METHODS
243
244 Private methods implement internals for creating temporary file needed for
245 swish++. You should have no need to call them directly, and they are here
246 just to have documentation.
247
248 =head2 _init_indexer
249
250 Create temporary directory in which files for indexing will be created and
251 start index process.
252
253 $i->_init_indexer || die "can't start indexer";
254
255 It will also create empty file C<_stopwords_> to disable stop words.
256
257 =cut
258
# Create the temporary working directory and start the index process.
#
# Side effects: sets $self->{'tmp_dir'}, changes the current directory to it
# (and does not change back), and stores the write handle of the indexer pipe
# in $self->{'index_fh'}. Unless the use_stopwords option is set, a stopword
# file named _stopwords_ containing a single space is created in the temporary
# directory and passed to the indexer with -s.
#
# Returns the indexer handle. Confesses when the temporary directory cannot
# be created or entered, or when the indexer cannot be started.
#
# NOTE(review): because of the chdir, a relative index_dir is presumably
# resolved by the indexer against the temporary directory -- confirm callers
# pass an absolute path.
sub _init_indexer {
    my $self = shift;

    $self->{'tmp_dir'} = tempdir( CLEANUP => 1 ) || confess "can't create temporary directory: $!";

    # "or", not "||": the latter would bind to chdir's argument and a failed
    # chdir would pass silently.
    chdir($self->{'tmp_dir'}) or confess "can't chdir to ".$self->{'tmp_dir'}.": $!";

    my @opt = ('-v', '4');

    unless ($self->{'use_stopwords'}) {
        if (CORE::open(my $stop_fh, '>', '_stopwords_')) {
            print {$stop_fh} " ";
            close($stop_fh);
            push @opt, '-s', '_stopwords_';
        } else {
            # best effort: without the file the -s option is not passed
            carp "can't create empty stopword file, skipping\n";
        }
    }

    # List-form pipe open runs the indexer without a shell, so index_dir may
    # contain spaces or shell metacharacters. The final '-' is passed to the
    # indexer as its last argument.
    my @cmd = (
        $self->{'index'},
        @opt,
        '-e', 'html:*',
        '-i', $self->{'index_dir'}.'/index',
        '-',
    );

    CORE::open(my $index_fh, '|-', @cmd) or confess "can't start index with @cmd: $!";
    $self->{'index_fh'} = $index_fh;

    return $self->{'index_fh'};
}
283
284 =head2 _create_doc
285
286 Create temporary file and pass its name to swish++
287
288 $i->_create_doc(
289 path => 'path/to/store/in/index',
290 title => 'this is title in results',
291 body => 'data to store in body tag',
292 meta => {
293 'meta name' => 'data for this meta',
294 'another' => 'again more data',
295 }
296 );
297
298 To delete document, just omit body and meta data.
299
300 =cut
301
# Write one document as a small HTML file and queue it for indexing.
#
# Arguments (key => value list):
#   path  - file name to create, relative to the current directory; it is
#           also the name printed to the indexer (required)
#   title - optional; written into <title> and, when the object's
#           meta_in_body option is set, appended to the body as well
#   body  - optional text placed inside <body>; defaults to ''
#   meta  - not implemented yet; passing it confesses
#
# The indexer is started through _init_indexer on first use. title and body
# are written verbatim, without HTML escaping.
#
# Returns the result of printing the path to the indexer handle. Confesses
# when path is missing or the file cannot be written; File::Path::mkpath dies
# on its own when a parent directory cannot be created.
sub _create_doc {
    my $self = shift;

    my $arg = {@_};

    # reject unsupported input before anything is written to disk
    if ($arg->{'meta'}) {
        confess "not yet implemented";
    }

    my $doc_path = $arg->{'path'};
    confess "need path" unless (defined($doc_path) && length($doc_path));

    # open indexer if needed
    $self->{'index_fh'} ||= $self->_init_indexer;

    confess "no tmp_dir?" unless ($self->{'tmp_dir'});

    # Paths such as 'path/to/document' need their parent directories first.
    require File::Basename;
    require File::Path;
    my $parent_dir = File::Basename::dirname($doc_path);
    File::Path::mkpath($parent_dir) unless (-d $parent_dir);

    # defined-ness, not truth, decides: a body or title of '0' is kept
    my $body  = defined($arg->{'body'}) ? $arg->{'body'} : '';
    my $title = $arg->{'title'};

    CORE::open(my $tmp_fh, '>', $doc_path) or confess "can't create temp file ".$doc_path.": $!";

    print {$tmp_fh} '<html><head>';

    if (defined($title)) {
        print {$tmp_fh} '<title>' . $title . '</title>';
        $body .= " ".$title if ($self->{'meta_in_body'});
    }

    print {$tmp_fh} '</head><body>' . $body . '</body></html>';

    close($tmp_fh) or confess "can't close tmp file ".$doc_path.": $!";

    # hand the file name to the running indexer, one path per line
    my $queued = print { $self->{'index_fh'} } $doc_path."\n";
    return $queued;
}
333
334 =head2 _close_index
335
336 Close index after indexing.
337
338 $i->_close_index;
339
340 You have to close index before searching.
341
342 =cut
343
# Close the index handle stored in $self->{'index_fh'}, if there is one.
#
# Returns immediately when no handle is set. Otherwise the handle is closed
# (the result of close is not checked) and the slot is reset to undef. With
# the debug option set, a trace line is printed to STDERR first.
sub _close_index {
    my ($self) = @_;

    if (! $self->{'index_fh'}) {
        return;
    }

    if ($self->{'debug'}) {
        print STDERR "## close index\n";
    }

    close($self->{'index_fh'});
    undef $self->{'index_fh'};
}
354
355 1;
356 __END__
357
358 =head2 EXPORT
359
360 None by default.
361
362 =head1 RELATED
363
364 =head2 Debian
365
366 Debian version of swish++ is often old (version 5 at moment of this writing
367 while version 6 is available in source code), so this module by default
368 uses executable names B<index> and B<search> for self-compiled version
369 instead of one from Debian package. See L<open> how to specify Debian
370 default binaries B<index++> and B<search++>.
371
372 =head2 SWISH++
373
374 Aside from being a very good rewrite in C++, SWISH++ is faster because it has
375 clever heuristics about which data in input files are words to index and
376 which are not. It's based on the English language and might be the best choice if
377 you plan to index a large amount of long text documents.
378
379 However, if you plan to index all data from structured storage (e.g. RDBMS)
380 you might want B<all> words from data to end up in index as opposed to just
381 those which look like English words. This is especially important if you
382 don't plan to index English texts with this module.
383
384 With distribution-built versions of SWISH++ you might have problems with
385 disappearing words. To overcome this problem, you will have to compile and
386 configure SWISH++ yourself (because language characteristics are
387 a compile-time option).
388
389 Compilation of SWISH++ is easy process well described on project's web
390 pages. To see my very relaxed sample configuration take a look at C<swish++>
391 directory included in distribution.
392
393 =head1 SEE ALSO
394
395 C<swish++> web site L<http://homepage.mac.com/pauljlucas/software/swish/>
396
397 =head1 AUTHOR
398
399 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
400
401 =head1 COPYRIGHT AND LICENSE
402
403 Copyright (C) 2004 by Dobrica Pavlinusic
404
405 This library is free software; you can redistribute it and/or modify
406 it under the same terms as Perl itself, either Perl version 5.8.4 or,
407 at your option, any later version of Perl 5 you may have available.
408
409
410 =cut

  ViewVC Help
Powered by ViewVC 1.1.26