/[SWISH-PlusPlus]/trunk/PlusPlus.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/PlusPlus.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 11 - (show annotations)
Sun Dec 5 13:30:57 2004 UTC (16 years ago) by dpavlin
File size: 9611 byte(s)
support for meta in data, relaxed swish++ config, distribution cleanup

1 package SWISH::PlusPlus;
2
3 use 5.008004;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.05';
8
9 use Carp;
10 use File::Temp qw/ tempdir /;
11 #use YAML;
12
13 =head1 NAME
14
15 SWISH::PlusPlus - Perl extension for SWISH++
16
17 =head1 SYNOPSIS
18
19 use SWISH::PlusPlus;
20 blah blah blah
21
22 =head1 DESCRIPTION
23
24 This is a Perl module for using the SWISH++ indexer by Paul J. Lucas. SWISH++ is a
25 rewrite of swish-e in C++ which is extremely fast (thanks to mmap), but without
26 support for properties (which this module tries to fix).
27
28 Implementation of this module is crafted after L<Plucene::Simple> and it
29 should be easy to replace Plucene with this module for increased
30 performance. However, this module is not plug-in replacement.
31
32 =head1 METHODS
33
34 =head2 new
35
36 Create new indexing object.
37
38 my $i = SWISH::PlusPlus->new(
39 index_dir => '/path/to/index',
40 index => 'index++',
41 search => 'search++',
42 debug => 1,
43 meta_in_body => 1,
44 use_stopwords => 1,
45 );
46
47 Options to new are following:
48
49 =over 5
50
51 =item C<index_dir>
52
53 Path to directory in which index will be created.
54
55 =item C<index>
56
57 Full or partial path to SWISH++ index executable. By default, it's B<index>
58 for self-compiled version. If you use Debian GNU/Linux package specify
59 B<index++>. See C<Debian>.
60
61 =item C<search>
62
63 Full or partial path to SWISH++ search executable. By default, it's B<search>.
64
65 =item C<debug>
66
67 This option (off by default) will produce a lot of debugging output on
68 C<STDERR> prefixed by C<##>.
69
70 =item C<meta_in_body>
71
72 This option (off by default) makes it possible to search the content of meta
73 fields without specifying them (as if they were in the body of the document).
74 This will somewhat increase index size.
75
76 =item C<use_stopwords>
77
78 Use built-in SWISH++ stop words. By default, they are disabled.
79
80 =back
81
82 =cut
83
# Constructor. Takes a flat list of name => value options, stores them in a
# hash blessed into the class, and returns the object.
#
# Options handled here:
#   index_dir - required; the directory is created if it does not exist yet
#   index     - indexer executable, defaults to 'index'
#   search    - search executable, defaults to 'search'
#   debug     - when true, a summary line is printed to STDERR
#
# Croaks if index_dir is missing and confesses if it cannot be created.
sub new {
    my $class = shift;
    my $self  = {@_};
    bless($self, $class);

    foreach my $required (qw(index_dir)) {
        croak "need $required" unless $self->{$required};
    }

    if (! -e $self->{'index_dir'}) {
        # Parenthesised call plus low-precedence "or": with
        # "mkdir $dir || confess ..." the || applies to $dir, so a failed
        # mkdir was never noticed.
        mkdir($self->{'index_dir'})
            or confess "can't create index_dir ", $self->{'index_dir'}, ": $!";
    }

    # default executables
    $self->{'index'}  ||= 'index';
    $self->{'search'} ||= 'search';

    print STDERR "## new index_dir: ",$self->{'index_dir'}," index: ",$self->{'index'}, " search: ",$self->{'search'},"\n" if ($self->{'debug'});

    return $self;
}
105
106
107 =head2 check_bin
108
109 Check if swish++ binaries specified in L<new> are available and verify
110 version signature.
111
112 if ($i->check_bin) {
113 print "swish++ binaries found\n";
114 };
115
116 It will also setup property
117
118 $i->{'version'}
119
120 which you can examine to see version.
121
122 =cut
123
# Run both configured executables with -V and compare their version banners.
#
# Returns 1 and stores the banner in $self->{'version'} when the two banners
# are identical. Warns (carp) and returns nothing when they differ.
# Confesses when an executable produces no output or its output does not
# start with "SWISH++".
sub check_bin {
    my $self = shift;

    my %banner_of;

    # First pass: collect the raw -V output of the indexer, then the searcher.
    foreach my $kind (qw(index search)) {
        my $binary = $self->{$kind};
        my $output = `$binary -V 2>&1`;
        confess "can't find '",$binary,"' binary" unless $output;
        $banner_of{$kind} = $output;
    }

    chomp $banner_of{$_} foreach (qw(index search));

    # Second pass: both banners have to identify themselves as SWISH++.
    foreach my $kind (qw(index search)) {
        next if ($banner_of{$kind} =~ m/^SWISH\+\+/);
        confess $self->{$kind}," binary is not SWISH++";
    }

    my ($i, $s) = @banner_of{qw(index search)};

    unless ($i eq $s) {
        carp "version difference: index is $i while search is $s";
        return;
    }

    $self->{'version'} = $i;
    return 1;
}
145
146 =head2 index_document
147
148 Quick way to add simple data to index.
149
150 $i->index_document($key, $data);
151 $i->index_document( 42 => 'meaning of life' );
152
153 =cut
154
# Index one or more plain documents given as id => text pairs.
# Each pair is handed to _create_doc with the id as path and the text as
# body. Always returns 1.
sub index_document {
    my ($self, %text_of) = @_;

    while (my ($id, $text) = each %text_of) {
        $self->_create_doc(
            path => $id,
            body => $text,
        );
    }

    return 1;
}
169
170 =head2 add
171
172 Add document with metadata to index.
173
174 $i->add(
175 path => 'path/to/document',
176 title => 'this is result title',
177 meta => {
178 description => 'this is description meta tag',
179 date => '2004-11-04',
180 author => 'Dobrica Pavlinusic',
181 },
182 body => 'this is text without meta data',
183 );
184
185 This is thin wrapper round L<_create_doc>.
186
187 =cut
188
# Public entry point for adding a document with metadata: passes every
# argument through to _create_doc unchanged and always returns 1.
sub add {
    my ($self, @doc_spec) = @_;

    $self->_create_doc(@doc_spec);

    return 1;
}
196 =head2 search
197
198 Search your index.
199
200 my @results = $i->search("swish query");
201
202 Returns a list of hash references, one per hit, with the keys C<rank>, C<path>, C<size> and C<title>.
203
204 =cut
205
# Run the search executable against "<index_dir>/index" and parse its output.
#
# Takes the query string. Returns nothing when the query is false (empty or
# undefined); otherwise returns a list of hash references with the keys
# rank, path, size and title, taken from the first four space-separated
# fields of every output line that does not start with '#'.
#
# Any open index is closed first. Confesses when the search process cannot
# be started or when closing its pipe fails.
sub search {
    my $self = shift;

    my $query = shift || return;

    $self->_close_index;

    # The command is kept as a list and started without a shell, so the
    # query reaches the search executable as one literal argument no matter
    # which quotes, backticks or $ characters it contains.
    my @search_cmd = (
        $self->{'search'},
        '-i', $self->{'index_dir'}.'/index',
        $query,
    );
    print STDERR "## search @search_cmd\n" if ($self->{'debug'});

    open(my $search_fh, '-|', @search_cmd)
        or confess "can't start @search_cmd: $!";

    my @results;

    while (my $line = <$search_fh>) {
        next if ($line =~ /^#/);    # lines starting with '#' are skipped
        chomp $line;
        print STDERR "## $line\n" if ($self->{'debug'});
        # limit of 4 keeps spaces inside the title intact
        my ($rank,$path,$size,$title) = split(/ /,$line,4);
        push @results, {
            rank => $rank,
            path => $path,
            size => $size,
            title => $title,
        };
    }

    close($search_fh) or confess "can't close search";

    return @results;
}
241
242 =head1 PRIVATE METHODS
243
244 Private methods implement internals for creating temporary file needed for
245 swish++. You should have no need to call them directly, and they are here
246 just to have documentation.
247
248 =head2 _init_indexer
249
250 Create temporary directory in which files for indexing will be created and
251 start index process.
252
253 $i->_init_indexer || die "can't start indexer";
254
255 It will also create empty file C<_stopwords_> to disable stop words.
256
257 =cut
258
# Prepare a scratch directory and start the indexer process.
#
# Steps:
#   1. make index_dir absolute, because the chdir below would otherwise
#      change what a relative index_dir points to
#   2. create a temporary directory (removed at program exit), remember it
#      in $self->{'tmp_dir'} and chdir into it
#   3. unless use_stopwords is set, write a stop-word file containing a
#      single space and pass it to the indexer with -s
#   4. open a write pipe to the indexer, which reads file names from it
#
# Returns the pipe handle, which is also stored in $self->{'index_fh'}.
# Confesses if the temporary directory cannot be created or entered, or if
# the indexer cannot be started.
sub _init_indexer {
    my $self = shift;

    require File::Spec;
    $self->{'index_dir'} = File::Spec->rel2abs($self->{'index_dir'})
        if (defined $self->{'index_dir'});

    $self->{'tmp_dir'} = tempdir( CLEANUP => 1 ) || confess "can't create temporary directory: $!";

    # "chdir $dir || confess ..." applied || to $dir and never saw a failure.
    chdir($self->{'tmp_dir'}) or confess "can't chdir to ".$self->{'tmp_dir'}.": $!";

    my @opt = ('-v', '4');

    # The key used to be misspelled 'use_stopwrods', so the documented
    # use_stopwords option had no effect.
    unless ($self->{'use_stopwords'}) {
        if (open(my $stop_fh, '>', '_stopwords_')) {
            print $stop_fh " ";
            close($stop_fh);
            push @opt, '-s', '_stopwords_';
        } else {
            # Best effort: without the file, -s is not passed either.
            carp "can't create empty stopword file, skipping\n";
        }
    }

    # List form starts the indexer without a shell, so index_dir may contain
    # spaces or shell metacharacters and 'html:*' needs no quoting.
    my @index_cmd = (
        $self->{'index'},
        @opt,
        '-e', 'html:*',
        '-i', $self->{'index_dir'}.'/index',
        '-',
    );

    open($self->{'index_fh'}, '|-', @index_cmd)
        or confess "can't start index with @index_cmd: $!";

    return $self->{'index_fh'};
}
283
284 =head2 _create_doc
285
286 Create temporary file and pass its name to swish++
287
288 $i->_create_doc(
289 path => 'path/to/store/in/index',
290 title => 'this is title in results',
291 body => 'data to store in body tag',
292 meta => {
293 'meta name' => 'data for this meta',
294 'another' => 'again more data',
295 }
296 );
297
298 To delete document, just omit body and meta data.
299
300 =cut
301
# Write one document as a small HTML file and queue it for indexing.
#
# Named arguments:
#   path  - required; the name written to the indexer and the file name
#           used below tmp_dir (an absolute path is used as it is)
#   title - optional, written as <title>
#   meta  - optional hash reference, written as <meta name content> tags
#   body  - optional, written inside <body>; defaults to an empty string
#
# With the meta_in_body option set, every meta value and the title are also
# appended to the body. Starts the indexer through _init_indexer when no
# index handle is open yet. Returns the result of printing the path to the
# indexer handle.
#
# NOTE(review): title, body and meta values are written without HTML
# escaping, so a double quote inside a meta value ends the attribute early.
# Confirm how the indexer treats entities before adding escaping.
sub _create_doc {
    my $self = shift;

    my $arg = {@_};

    croak "need path" unless (defined $arg->{'path'} && length $arg->{'path'});

    # open indexer if needed
    $self->{'index_fh'} ||= $self->_init_indexer;

    my $tmp_dir = $self->{'tmp_dir'} || confess "no tmp_dir?";

    require File::Spec;
    require File::Basename;
    require File::Path;

    # Anchor the file in tmp_dir instead of relying on the current
    # directory, and create missing parent directories so that paths such
    # as 'path/to/document' can be written.
    my $file   = File::Spec->rel2abs($arg->{'path'}, $tmp_dir);
    my $parent = File::Basename::dirname($file);
    File::Path::mkpath($parent) unless (-d $parent);

    open(my $doc_fh, '>', $file) or die "can't create temp file ".$arg->{'path'}.": $!";

    print $doc_fh '<html><head>';

    # defined-test rather than ||, so that a body of '0' is kept
    $arg->{'body'} = '' unless (defined $arg->{'body'});

    if ($arg->{'meta'}) {
        # sorted so that the generated file is the same on every run
        foreach my $name (sort keys %{$arg->{'meta'}}) {
            my $content = $arg->{'meta'}->{$name};
            print $doc_fh qq{<meta name="$name" content="$content">};
            $arg->{'body'} .= " $content" if ($self->{'meta_in_body'});
        }
    }

    if (defined($arg->{'title'})) {
        print $doc_fh '<title>' . $arg->{'title'} . '</title>';
        $arg->{'body'} .= " ".$arg->{'title'} if ($self->{'meta_in_body'});
    }

    print $doc_fh '</head><body>' . $arg->{'body'} . '</body></html>';

    close($doc_fh) or confess "can't close tmp file ".$arg->{'path'}.": $!";

    return print { $self->{'index_fh'} } $arg->{'path'}."\n";
}
337
338 =head2 _close_index
339
340 Close index after indexing.
341
342 $i->_close_index;
343
344 You have to close index before searching.
345
346 =cut
347
# Close the pipe to the indexer, if one is open, and clear
# $self->{'index_fh'}.
#
# Does nothing when no index handle is set. A failing close is reported
# with a warning instead of being ignored; the handle is cleared either way.
sub _close_index {
    my $self = shift;

    return unless ($self->{'index_fh'});

    print STDERR "## close index\n" if ($self->{'debug'});

    unless (close($self->{'index_fh'})) {
        # For a pipe, $! is only set on a system error; otherwise the
        # child's exit status is in $?.
        carp "error closing index: ", ($! ? "$!" : "indexer exit status ".($? >> 8));
    }

    undef $self->{'index_fh'};

    return;
}
358
359 1;
360 __END__
361
362 =head2 EXPORT
363
364 None by default.
365
366 =head1 RELATED
367
368 =head2 Debian
369
370 Debian version of swish++ is often old (version 5 at moment of this writing
371 while version 6 is available in source code), so this module by default
372 uses executable names B<index> and B<search> for self-compiled version
373 instead of one from Debian package. See L<new> how to specify Debian
374 default binaries B<index++> and B<search++>.
375
376 =head2 SWISH++
377
378 Aside from being a very good rewrite in C++, SWISH++ is faster because it has
379 clever heuristics about which data in input files are words to index and
380 which are not. It's based on English language and might be best choice if
381 you plan to install large amount of long text documents.
382
383 However, if you plan to index all data from structured storage (e.g. RDBMS)
384 you might want B<all> words from data to end up in index as opposed to just
385 those which look like English words. This is especially important if you
386 don't plan to index English texts with this module.
387
388 With distribution-built versions of SWISH++ you might have problems with
389 disappearing words. To overcome this problem, you will have to compile and
390 configure SWISH++ yourself (because language characteristics are
391 compilation-time option).
392
393 Compilation of SWISH++ is easy process well described on project's web
394 pages. To see my very relaxed sample configuration take a look at C<swish++>
395 directory included in distribution.
396
397 =head2 SWISH++ config
398
399 C<config.h> located in C<swish++> directory of this distribution is relaxed
400 SWISH++ configuration that will index all words passed to it. This
401 configuration is needed for B<date test> because default configuration
402 doesn't recognize 2004-12-05 as date. Have in mind that your index size
403 might explode.
404
405 =head1 SEE ALSO
406
407 C<swish++> web site L<http://homepage.mac.com/pauljlucas/software/swish/>
408
409 =head1 AUTHOR
410
411 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
412
413 =head1 COPYRIGHT AND LICENSE
414
415 Copyright (C) 2004 by Dobrica Pavlinusic
416
417 This library is free software; you can redistribute it and/or modify
418 it under the same terms as Perl itself, either Perl version 5.8.4 or,
419 at your option, any later version of Perl 5 you may have available.
420
421
422 =cut

  ViewVC Help
Powered by ViewVC 1.1.26