/[SWISH-PlusPlus]/trunk/PlusPlus.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/PlusPlus.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 3 by dpavlin, Fri Dec 3 15:23:23 2004 UTC revision 8 by dpavlin, Sat Dec 4 17:49:20 2004 UTC
# Line 7  use warnings; Line 7  use warnings;
7  our $VERSION = '0.02';  our $VERSION = '0.02';
8    
9  use Carp;  use Carp;
10    use File::Temp qw/ tempdir /;
11    #use YAML;
12    
13  =head1 NAME  =head1 NAME
14    
# Line 37  Create new indexing object. Line 39  Create new indexing object.
39          index_dir => '/path/to/index',          index_dir => '/path/to/index',
40          index => 'index++',          index => 'index++',
41          search => 'search++',          search => 'search++',
42            debug => 1,
43    );    );
44    
45  Options to open are following:  Options to open are following:
# Line 57  B<index++>. See C<Debian>. Line 60  B<index++>. See C<Debian>.
60    
61  Full or partial path to SWISH++ search executable. By default, it's B<search>.  Full or partial path to SWISH++ search executable. By default, it's B<search>.
62    
63    =item C<debug>
64    
65    This option (off by default) will produce a lot of debugging output on
66    C<STDERR> prefixed by C<##>.
67    
68  =back  =back
69    
70  =cut  =cut
# Line 78  sub open { Line 86  sub open {
86          $self->{'index'} ||= 'index';          $self->{'index'} ||= 'index';
87          $self->{'search'} ||= 'search';          $self->{'search'} ||= 'search';
88    
89            print STDERR "## open index_dir: ",$self->{'index_dir'}," index: ",$self->{'index'}, " search: ",$self->{'search'},"\n" if ($self->{'debug'});
90    
91          $self ? return $self : return undef;          $self ? return $self : return undef;
92  }  }
93    
# Line 121  sub check_bin { Line 131  sub check_bin {
131    
132  }  }
133    
134    =head2 index_document
135    
136    Quick way to add simple data to index.
137    
138      $i->index_document($key, $data);
139      $i->index_document( 42 => 'meaning of life' );
140    
141    =cut
142    
# Quick way to add simple data to the index.
#
#   $i->index_document($key, $data);
#   $i->index_document( 42 => 'meaning of life' );
#
# Arguments after the object are taken as a flat list of key => data pairs.
# Each pair is handed to _create_doc with the key as 'path' and the data as
# 'body'. Always returns 1.
sub index_document {
        my ($self, %body_for) = @_;

        # one _create_doc call per key/data pair
        while (my ($id, $body) = each %body_for) {
                $self->_create_doc(
                        path => $id,
                        body => $body,
                );
        }

        return 1;
}
157    
158    =head2 search
159    
160    Search your index.
161    
162      my @results = $i->search("swish query");
163    
164    Returns array of hash references, one per hit, with keys C<rank>, C<path>, C<size> and C<title>.
165    
166    =cut
167    
# Search the index with the configured SWISH++ search executable.
#
#   my @results = $i->search("swish query");
#
# Any open indexer pipe is closed first (via _close_index), then the search
# binary is run as:  <search> -i <index_dir>/index <query>
#
# Returns a list of hash references, one per output line that does not start
# with '#'. Each line is split on single spaces into at most four fields,
# stored under the keys rank, path, size and title.
# Returns an empty list (undef in scalar context) when the query is false.
# Dies (confess) when the search process can't be started or when closing
# its pipe fails.
sub search {
        my $self = shift;

        my $query = shift || return;

        $self->_close_index;

        # The command is run with list-form pipe open, so the query reaches
        # the search binary as a single argument without passing through a
        # shell. No quoting or escaping is needed, and shell metacharacters
        # in the query ($, `, \, ") can't be interpreted as shell syntax.
        my @cmd = (
                $self->{'search'},
                '-i', $self->{'index_dir'}.'/index',
                $query,
        );
        print STDERR "## search ", join(' ', @cmd), "\n" if ($self->{'debug'});

        CORE::open(my $search_fh, '-|', @cmd) || confess "can't start ", join(' ', @cmd), ": $!";

        my @results;

        while (my $line = <$search_fh>) {
                # lines starting with '#' are not results
                next if ($line =~ /^#/);
                chomp $line;
                print STDERR "## $line\n" if ($self->{'debug'});
                # title is the last field and keeps any spaces it contains
                my ($rank,$path,$size,$title) = split(/ /,$line,4);
                push @results, {
                        rank => $rank,
                        path => $path,
                        size => $size,
                        title => $title,
                };
        }

        # close on a pipe also fails when the child exits non-zero; in that
        # case $! is empty and the status is in $?
        close($search_fh) || confess "can't close search: ", ($! ? $! : "exit status ".($? >> 8));

        #print STDERR "## results: ",Dump(@results),"\n" if ($self->{'debug'});

        return @results;
}
203    
204    =head1 PRIVATE METHODS
205    
206    Private methods implement internals for creating temporary file needed for
207    swish++. You should have no need to call them directly, and they are here
208    just to have documentation.
209    
210    =head2 _init_index
211    
212    Create temporary directory in which files for indexing will be created and
213    start index process.
214    
215      $i->_init_index || die "can't start indexer";
216    
217    =cut
218    
# Create the temporary directory in which files for indexing will be created,
# change into it and start the indexer process.
#
#   $i->_init_index || die "can't start indexer";
#
# Stores the directory in $self->{'tmp_dir'} (removed automatically, because
# tempdir is called with CLEANUP => 1) and the write end of the pipe to the
# indexer in $self->{'index_fh'}.
# Returns the indexer file handle. Dies (confess) when the directory can't be
# entered or the indexer can't be started.
#
# NOTE(review): the process working directory is changed and never restored.
# The indexer is started after the chdir, so a relative index_dir is resolved
# against the temporary directory - confirm callers pass an absolute path.
sub _init_index {
        my $self = shift;

        $self->{'tmp_dir'} = tempdir( CLEANUP => 1 ) || confess "can't create temporary directory: $!";

        # presumably verbosity level for the indexer - see index++ manual
        my $opt = "-v 4";

        # leading '|' makes this a pipe we write to; _create_doc prints one
        # file name per line to it
        my $open_cmd = '| '.$self->{'index'}.' '.$opt.' -e "html:*" -i '.$self->{'index_dir'}.'/index -';

        # Parentheses are required here: without them || binds to the
        # argument ("chdir(DIR || confess ...)") and a failed chdir goes
        # unnoticed.
        chdir($self->{'tmp_dir'}) || confess "can't chdir to ".$self->{'tmp_dir'}.": $!";

        CORE::open($self->{'index_fh'}, $open_cmd) || confess "can't start index with $open_cmd: $!";

        return $self->{'index_fh'};
}
234    
235    =head2 _create_doc
236    
237    Create temporary file and pass its name to swish++.
238    
239      $i->_create_doc(
240            path => 'path/to/store/in/index',
241            body => 'data to store in body tag',
242            meta => {
243                    'meta name' => 'data for this meta',
244                    'another' => 'again more data',
245            }
246      );
247    
248    To delete document, just omit body and meta data.
249    
250    =cut
251    
# Create a temporary HTML file for one document and pass its name to the
# indexer.
#
#   $i->_create_doc(
#         path => 'path/to/store/in/index',
#         body => 'data to store in body tag',
#   );
#
# Arguments (named):
#   path - name of the file to create; the same string is sent to the indexer
#   body - text placed inside <body>...</body>; empty when omitted
#   meta - not yet implemented, passing a true value dies
#
# Starts the indexer through _init_index when it is not running yet.
# Returns 1. Dies (confess) on a missing path, on meta, and on any failure
# to create or close the file or to write to the indexer pipe.
sub _create_doc {
        my $self = shift;

        my $arg = {@_};

        confess "need path" unless (defined $arg->{'path'} && length $arg->{'path'});

        # reject unsupported options before any file or handle is created,
        # so nothing is left half-written
        if ($arg->{'meta'}) {
                confess "not yet implemented";
        }

        # open indexer if needed
        $self->{'index_fh'} ||= $self->_init_index;

        my $tmp_dir = $self->{'tmp_dir'} || confess "no tmp_dir?";

        # Relative names are created inside the temporary directory, so the
        # result does not depend on the process still being chdir'ed there.
        # Absolute names are used as given.
        my $file = ($arg->{'path'} =~ m{^/})
                ? $arg->{'path'}
                : $tmp_dir.'/'.$arg->{'path'};

        # defined-check instead of || so that a body of '0' is kept
        my $body = defined $arg->{'body'} ? $arg->{'body'} : '';

        CORE::open(my $tmp_fh, '>', $file) || confess "can't create temp file $file: $!";

        print $tmp_fh '<html>', '<body>' . $body . '</body></html>';

        close($tmp_fh) || confess "can't close tmp file $file: $!";

        # the indexer gets the name exactly as the caller passed it
        print { $self->{'index_fh'} } $arg->{'path'}."\n"
                or confess "can't pass ".$arg->{'path'}." to indexer: $!";

        return 1;
}
276    
277    =head2 _close_index
278    
279    Close index after indexing.
280    
281      $i->_close_index;
282    
283    You have to close index before searching.
284    
285    =cut
286    
# Close the pipe to the indexer after indexing.
#
#   $i->_close_index;
#
# Does nothing when no indexer pipe is open. Otherwise closes
# $self->{'index_fh'} and resets it to undef, so the next _create_doc call
# starts a fresh indexer. search calls this before running a query.
# A failed close is reported as a warning (carp) rather than a fatal error.
sub _close_index {
        my $self = shift;

        return unless ($self->{'index_fh'});

        print STDERR "## close index\n" if ($self->{'debug'});

        # close on a pipe also fails when the child exits non-zero; in that
        # case $! is empty and the status is in $?
        close($self->{'index_fh'})
                or carp "index process did not finish cleanly: ", ($! ? $! : "exit status ".($? >> 8));
        undef $self->{'index_fh'};
}
297    
298  1;  1;
299  __END__  __END__
300    
# Line 138  uses executable names B<index> and B<sea Line 312  uses executable names B<index> and B<sea
312  instead of one from Debian package. See L<open> how to specify Debian  instead of one from Debian package. See L<open> how to specify Debian
313  default binaries B<index++> and B<search++>.  default binaries B<index++> and B<search++>.
314    
315  =head1 SEE ALSO  =head2 SWISH++
316    
317  Mention other useful documentation such as the documentation of  Aside from very good rewrite in C++, SWISH++ is faster because it has
318  related modules or operating system documentation (such as man pages  clever heuristics about which data in input files are words to index and
319  in UNIX), or any relevant external documentation such as RFCs or  which are not. It's based on English language and might be best choice if
320  standards.  you plan to install large amount of long text documents.
321    
322    However, if you plan to index all data from structured storage (e.g. RDBMS)
323    you might want B<all> words from data to end up in index as opposed to just
324    those which look like English words. This is especially important if you
325    don't plan to index English texts with this module.
326    
327    With distribution build versions of SWISH++ you might have problems with
328    disappearing words. To overcome this problem, you will have to compile and
329    configure SWISH++ yourself (because language characteristics are
330    compilation-time option).
331    
332    Compilation of SWISH++ is easy process well described on project's web
333    pages. To see my very relaxed sample configuration take a look at C<swish++>
334    directory included in distribution.
335    
336  If you have a mailing list set up for your module, mention it here.  =head1 SEE ALSO
337    
338  If you have a web site set up for your module, mention it here.  C<swish++> web site L<http://homepage.mac.com/pauljlucas/software/swish/>
339    
340  =head1 AUTHOR  =head1 AUTHOR
341    
342  Dobrica Pavlinusic, E<lt>dpavlin@E<gt>  Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
343    
344  =head1 COPYRIGHT AND LICENSE  =head1 COPYRIGHT AND LICENSE
345    

Legend:
Removed from v.3  
changed lines
  Added in v.8

  ViewVC Help
Powered by ViewVC 1.1.26