4 |
use strict; |
use strict; |
5 |
use warnings; |
use warnings; |
6 |
|
|
7 |
our $VERSION = '0.00'; |
our $VERSION = '0.01'; |
8 |
|
|
9 |
use SWISH::API; |
use SWISH::API; |
10 |
use Text::Iconv; |
use Text::Iconv; |
12 |
use Carp; |
use Carp; |
13 |
use Digest::MD5 qw(md5_hex); |
use Digest::MD5 qw(md5_hex); |
14 |
use Memoize; |
use Memoize; |
15 |
|
use IPC::Run qw(start timeout pump finish); |
16 |
|
use File::Which; |
17 |
|
|
18 |
use Data::Dumper; |
use Data::Dumper; |
19 |
|
|
20 |
|
use constant { |
21 |
|
ADDED => 1, |
22 |
|
DELETED => 2, |
23 |
|
}; |
24 |
|
|
25 |
=head1 NAME |
=head1 NAME |
26 |
|
|
27 |
SWISH::Split - Perl interface to split index variant of Swish-e |
SWISH::Split - Perl interface to split index variant of Swish-e |
34 |
=head1 DESCRIPTION |
=head1 DESCRIPTION |
35 |
|
|
36 |
This is alternative interface for indexing data with swish-e. It's designed |
This is alternative interface for indexing data with swish-e. It's designed |
37 |
to split indexes over multiple files to allow updates of records in index |
to split indexes over multiple files (slices) to allow updates of records in index |
38 |
by reindexing just changed parts. |
by reindexing just changed parts (slice). |
39 |
|
|
40 |
Data is stored in index using intrface which is somewhat similar to |
Data is stored in index using intrface which is somewhat similar to |
41 |
L<Plucene::Simple>. This could make your migration (or supporting two index |
L<Plucene::Simple>. This could make your migration (or supporting two index |
42 |
engines) easier. |
engines) easier. |
43 |
|
|
44 |
In the background, it will fork swish-e binaries (one for each index slice) |
In the background, it will fork swish-e binaries (one for each index slice) |
45 |
and produce UTF-8 encoded XML files. So, if your imput charset isn't |
and produce UTF-8 encoded XML files for it. So, if your input charset isn't |
46 |
C<ISO-8859-1> you will have to specify it. |
C<ISO-8859-1> you will have to specify it. |
47 |
|
|
48 |
=head1 Methods used for indexing |
=head1 Methods used for indexing |
55 |
index => '/path/to/index', |
index => '/path/to/index', |
56 |
slice_name => \&slice_on_path, |
slice_name => \&slice_on_path, |
57 |
slices => 30, |
slices => 30, |
58 |
merge => 1, |
merge => 0, |
59 |
codepage => 'ISO-8859-2' |
codepage => 'ISO-8859-2', |
60 |
|
swish_config => qq{ |
61 |
|
PropertyNames from date |
62 |
|
PropertyNamesDate date |
63 |
|
}, |
64 |
|
memoize_to_xml => 0, |
65 |
); |
); |
66 |
|
|
67 |
# split index on first component of path |
# split index on first component of path |
69 |
return shift split(/\//,$_[0]); |
return shift split(/\//,$_[0]); |
70 |
} |
} |
71 |
|
|
72 |
|
Options to open are following: |
73 |
|
|
74 |
|
=over 5 |
75 |
|
|
76 |
|
=item C<index> |
77 |
|
|
78 |
|
path to (existing) directory in which index slices will be created. |
79 |
|
|
80 |
|
=item C<slice_name> |
81 |
|
|
82 |
C<slices> is maximum number of index slices. See L<"in_slice"> for |
coderef to function which provide slicing from path. |
83 |
|
|
84 |
|
=item C<slices> |
85 |
|
|
86 |
|
maximum number of index slices. See L<"in_slice"> for |
87 |
more explanation. |
more explanation. |
88 |
|
|
89 |
|
=item C<merge> |
90 |
|
|
91 |
|
(planned) option to merge indexes into one at end. |
92 |
|
|
93 |
|
=item C<codepage> |
94 |
|
|
95 |
|
data codepage (needed for conversion to UTF-8). |
96 |
|
By default, it's C<ISO-8859-1>. |
97 |
|
|
98 |
|
=item C<swish_config> |
99 |
|
|
100 |
|
additional parametars which will be inserted into |
101 |
|
C<swish-e> configuration file. See L<swish-config>. |
102 |
|
|
103 |
|
=item C<memoize_to_xml> |
104 |
|
|
105 |
|
speed up repeatable data, see L<"to_xml">. |
106 |
|
|
107 |
|
=back |
108 |
|
|
109 |
=cut |
=cut |
110 |
|
|
111 |
my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8'); |
my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8'); |
115 |
my $self = {@_}; |
my $self = {@_}; |
116 |
bless($self, $class); |
bless($self, $class); |
117 |
|
|
|
print Dumper($self->{'slice_name'}); |
|
|
|
|
118 |
croak "need slice_name coderef" unless ref $self->{'slice_name'}; |
croak "need slice_name coderef" unless ref $self->{'slice_name'}; |
119 |
croak "need slices" unless $self->{'slices'}; |
croak "need slices" unless $self->{'slices'}; |
120 |
|
|
124 |
|
|
125 |
$iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'}); |
$iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'}); |
126 |
|
|
127 |
|
# speedup |
128 |
memoize('in_slice'); |
memoize('in_slice'); |
129 |
|
memoize('to_xml') if ($self->{'memoize_to_xml'}); |
130 |
|
|
131 |
$self ? return $self : return undef; |
$self ? return $self : return undef; |
132 |
|
|
149 |
my $swishpath = shift || return; |
my $swishpath = shift || return; |
150 |
my $data = shift || return; |
my $data = shift || return; |
151 |
|
|
152 |
return 1; |
my $slice = $self->put_slice($swishpath, $self->to_xml($data)); |
153 |
|
|
154 |
|
# if ($err) { |
155 |
|
# carp "$swishpath: $err"; |
156 |
|
# return undef; |
157 |
|
# } |
158 |
|
|
159 |
|
return $slice; |
160 |
} |
} |
161 |
|
|
162 |
=head2 delete |
=head2 delete |
172 |
|
|
173 |
my @paths = @_ || return; |
my @paths = @_ || return; |
174 |
|
|
175 |
|
foreach my $path (@paths) { |
176 |
|
$self->{'paths'}->{$path} = DELETED; |
177 |
|
} |
178 |
|
|
179 |
return 42; |
return 42; |
180 |
} |
} |
181 |
|
|
182 |
|
|
183 |
=head2 close |
=head2 done |
184 |
|
|
185 |
Close index file and finish indexing. |
Finish indexing and close index file(s). |
186 |
|
|
187 |
$i->close; |
$i->done; |
188 |
|
|
189 |
This is most time-consuming operation. When it's called, it will re-index |
This is most time-consuming operation. When it's called, it will re-index |
190 |
all entries which haven't changed in all slices. |
all entries which haven't changed in all slices. |
191 |
|
|
192 |
|
Returns number of slices updated. |
193 |
|
|
194 |
|
This method should really be called close or finish, but both of those are |
195 |
|
allready used. |
196 |
|
|
197 |
=cut |
=cut |
198 |
|
|
199 |
sub close { |
sub done { |
200 |
my $self = shift; |
my $self = shift; |
201 |
|
|
202 |
return 1; |
my $ret = 0; |
203 |
|
|
204 |
|
foreach my $s (keys %{$self->{'slice'}}) { |
205 |
|
print STDERR "closing slice $s\n"; |
206 |
|
$ret += $self->close_slice($s); |
207 |
|
} |
208 |
|
|
209 |
|
return $ret; |
210 |
} |
} |
211 |
|
|
212 |
|
|
225 |
|
|
226 |
sub swishpaths { |
sub swishpaths { |
227 |
my $self = shift; |
my $self = shift; |
228 |
|
|
229 |
|
my $s = shift || return; |
230 |
|
return if (! exists($self->{'slice'}->{'s'})); |
231 |
|
|
232 |
|
return keys %{$self->{'slice'}->{'s'}}; |
233 |
} |
} |
234 |
|
|
235 |
=head2 swishpaths_updated |
=head2 swishpaths_updated |
287 |
run, think about creating your own C<slice> function and distributing |
run, think about creating your own C<slice> function and distributing |
288 |
documents manually across slices. |
documents manually across slices. |
289 |
|
|
290 |
|
Slice number must always be true value or various sanity checks will fail. |
291 |
|
|
292 |
This function is C<Memoize>ed for performance reasons. |
This function is C<Memoize>ed for performance reasons. |
293 |
|
|
294 |
=cut |
=cut |
298 |
|
|
299 |
my $path = shift || confess "need path"; |
my $path = shift || confess "need path"; |
300 |
|
|
|
print Dumper($self->{'slice_name'}); |
|
301 |
confess "need slice_name function" unless ref ($self->{'slice_name'}); |
confess "need slice_name function" unless ref ($self->{'slice_name'}); |
302 |
|
|
303 |
if ($self->{'slices'}) { |
if ($self->{'slices'}) { |
309 |
# FIXME how random is this? |
# FIXME how random is this? |
310 |
$slice = hex(substr($slice,0,8)); |
$slice = hex(substr($slice,0,8)); |
311 |
|
|
312 |
print "slice_nr: $slice slices: ",$self->{'slices'},"\n"; |
$slice = ($slice % $self->{'slices'}) + 1; |
313 |
return ($slice % $self->{'slices'}); |
print "hash: $slice / ",$self->{'slices'}," => $slice\n"; |
314 |
|
return $slice; |
315 |
} else { |
} else { |
316 |
return &{$self->{'split'}}($path); |
return &{$self->{'split'}}($path); |
317 |
} |
} |
331 |
sub find_paths { |
sub find_paths { |
332 |
my $self = shift; |
my $self = shift; |
333 |
|
|
|
my $s = shift || return; |
|
334 |
} |
} |
335 |
|
|
336 |
|
|
337 |
|
=head2 make_config |
338 |
|
|
339 |
|
Create C<swish-e> configuration file for given slice. |
340 |
|
|
341 |
|
my $config_filename = $i->make_config('slice name'); |
342 |
|
|
343 |
|
It returns configuration filename. If no C<swish_config> was defined in |
344 |
|
L<"open">, default swish-e configuration will be used. It will index all data for |
345 |
|
searching, but none for properties. |
346 |
|
|
347 |
|
If you want to see what is allready defined for swish-e in configuration |
348 |
|
take a look at source code for C<DEFAULT_SWISH_CONF>. |
349 |
|
|
350 |
|
It uses C<stdin> as C<IndexDir> to comunicate with C<swish-e>. |
351 |
|
|
352 |
|
=cut |
353 |
|
|
354 |
|
sub make_config { |
355 |
|
my $self = shift; |
356 |
|
|
357 |
|
|
358 |
|
my $index_file = $self->{'index'}."/"; |
359 |
|
$index_file .= shift || confess "need slice name"; |
360 |
|
|
361 |
|
my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX"); |
362 |
|
|
363 |
|
# find cat on filesystem |
364 |
|
my $cat = which('cat'); |
365 |
|
|
366 |
|
print $tmp_fh <<"DEFAULT_SWISH_CONF"; |
367 |
|
# swish-e config file |
368 |
|
|
369 |
|
IndexDir stdin |
370 |
|
|
371 |
|
# input file definition |
372 |
|
DefaultContents XML* |
373 |
|
|
374 |
|
# indexed metatags |
375 |
|
MetaNames xml swishdocpath |
376 |
|
|
377 |
|
|
378 |
|
#XMLClassAttributes type |
379 |
|
UndefinedMetaTags auto |
380 |
|
UndefinedXMLAttributes auto |
381 |
|
|
382 |
|
IndexFile $index_file |
383 |
|
|
384 |
|
# Croatian ISO-8859-2 characters to unaccented equivalents |
385 |
|
TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz |
386 |
|
|
387 |
|
|
388 |
|
# disable output |
389 |
|
ParserWarnLevel 0 |
390 |
|
IndexReport 1 |
391 |
|
|
392 |
|
DEFAULT_SWISH_CONF |
393 |
|
|
394 |
|
# add user parametars (like stored properties) |
395 |
|
print $tmp_fh $self->{'swish_config'} if ($self->{'swish_config'}); |
396 |
|
|
397 |
|
close($tmp_fh); |
398 |
|
|
399 |
|
return $swish_config_filename; |
400 |
|
} |
401 |
|
|
402 |
|
=head2 create_slice |
403 |
|
|
404 |
|
On first run, starts C<swish-e> using L<IPC::Run>. On subsequent calls just return |
405 |
|
it's handles using L<Memoize>. |
406 |
|
|
407 |
|
my $s = create_slice('/path/to/document'); |
408 |
|
|
409 |
|
You shouldn't need to call C<create_slice> directly because it will be called |
410 |
|
from L<"put_slice"> when needed. |
411 |
|
|
412 |
|
=cut |
413 |
|
|
414 |
|
sub create_slice { |
415 |
|
my $self = shift; |
416 |
|
|
417 |
|
my $path = shift || confess "create_slice need path!"; |
418 |
|
|
419 |
|
my $s = $self->in_slice($path) || confess "in_slice returned null"; |
420 |
|
|
421 |
|
return $s if (exists($self->{'slice'}->{$s})); |
422 |
|
|
423 |
|
my $swish_config = $self->make_config($s); |
424 |
|
|
425 |
|
print STDERR "creating slice $s\n"; # FIXME |
426 |
|
|
427 |
|
my @swish = qw(swish-e -u -S prog -c); |
428 |
|
push @swish, $swish_config; |
429 |
|
|
430 |
|
## Build the harness, open all pipes, and launch the subprocesses |
431 |
|
$self->{'slice'}->{$s}->{'h'} = start \@swish, |
432 |
|
\$self->{'slice'}->{$s}->{'in'}, |
433 |
|
\$self->{'slice'}->{$s}->{'out'}, |
434 |
|
\$self->{'slice'}->{$s}->{'err'}, |
435 |
|
timeout( 90 ); # FIXME |
436 |
|
|
437 |
|
$self->{'slice'}->{$s}->{'out_len'} = 0; |
438 |
|
$self->{'slice'}->{$s}->{'err_len'} = 0; |
439 |
|
|
440 |
|
$self->slice_output($s); |
441 |
|
|
442 |
|
return $s; |
443 |
|
} |
444 |
|
|
445 |
|
=head2 put_slice |
446 |
|
|
447 |
|
Pass XML data to swish. |
448 |
|
|
449 |
|
my $slice = $i->put_slice('/swish/path', '<xml>data</xml>'); |
450 |
|
|
451 |
|
Returns slice in which XML ended up. |
452 |
|
|
453 |
|
=cut |
454 |
|
|
455 |
|
sub put_slice { |
456 |
|
my $self = shift; |
457 |
|
|
458 |
|
my $path = shift || confess "need path"; |
459 |
|
my $xml = shift || confess "need xml"; |
460 |
|
|
461 |
|
$xml = $iso2utf->convert($xml) || carp "XML conversion error in $xml"; |
462 |
|
|
463 |
|
my $s = $self->create_slice($path) || confess "create_slice returned null"; |
464 |
|
|
465 |
|
confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s})); |
466 |
|
confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'})); |
467 |
|
confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'})); |
468 |
|
|
469 |
|
$self->slice_output($s); |
470 |
|
|
471 |
|
use bytes; # as opposed to chars |
472 |
|
$self->{'slice'}->{$s}->{'in'} .= |
473 |
|
"Path-Name: $path\n". |
474 |
|
"Content-Length: ".(length($xml)+1)."\n". |
475 |
|
"Update-Mode: Index\n". |
476 |
|
"Document-Type: XML\n\n$xml\n"; |
477 |
|
|
478 |
|
# do I/O |
479 |
|
$self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'} ; # wait for all input to go |
480 |
|
|
481 |
|
$self->slice_output($s); |
482 |
|
|
483 |
|
$self->{'paths'}->{$path} = ADDED; |
484 |
|
|
485 |
|
return $s; |
486 |
|
} |
487 |
|
|
488 |
|
=head2 slice_output |
489 |
|
|
490 |
|
Prints to STDERR output and errors from C<swish-e>. |
491 |
|
|
492 |
|
my $slice = $i->slice_output($s); |
493 |
|
|
494 |
|
Normally, you don't need to call it. |
495 |
|
|
496 |
|
=cut |
497 |
|
|
498 |
|
sub slice_output { |
499 |
|
my $self = shift; |
500 |
|
|
501 |
|
my $s = shift || confess "slice_output needs slice"; |
502 |
|
|
503 |
|
confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s})); |
504 |
|
confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'})); |
505 |
|
confess "no 'out' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'out'})); |
506 |
|
|
507 |
|
if (length $self->{'slice'}->{$s}->{'out'} > $self->{'slice'}->{$s}->{'out_len'}) { |
508 |
|
#print STDERR "swish-e OUT: ",$self->{'slice'}->{$s}->{'out'},"\n" if ($self->{'slice'}->{$s}->{'out'}); |
509 |
|
$self->{'slice'}->{$s}->{'out_len'} = length $self->{'slice'}->{$s}->{'out'}; |
510 |
|
return $s; |
511 |
|
} elsif (length $self->{'slice'}->{$s}->{'err'} > $self->{'slice'}->{$s}->{'err_len'}) { |
512 |
|
print STDERR "swish-e ERR: ",$self->{'slice'}->{$s}->{'err'},"\n" if ($self->{'slice'}->{$s}->{'err'}); |
513 |
|
$self->{'slice'}->{$s}->{'err_len'} = length $self->{'slice'}->{$s}->{'err'}; |
514 |
|
# this is fatal |
515 |
|
return undef; |
516 |
|
} |
517 |
|
|
518 |
|
return $s; |
519 |
|
} |
520 |
|
|
521 |
|
=head2 close_slice |
522 |
|
|
523 |
|
Close slice (terminates swish-e process for that slice). |
524 |
|
|
525 |
|
my $i->close_slice($s); |
526 |
|
|
527 |
|
Returns true if slice is closed, false otherwise. |
528 |
|
|
529 |
|
=cut |
530 |
|
|
531 |
|
sub close_slice { |
532 |
|
my $self = shift; |
533 |
|
|
534 |
|
my $s = shift || confess "close_slice needs slice"; |
535 |
|
|
536 |
|
confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s})); |
537 |
|
confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'})); |
538 |
|
|
539 |
|
# pump rest of content (if any) |
540 |
|
$self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'}; |
541 |
|
|
542 |
|
$self->slice_output($s); |
543 |
|
|
544 |
|
# clean up |
545 |
|
$self->{'slice'}->{$s}->{'h'}->finish or confess "finish on slice $s returned $?: $! -- ",$self->{'slice'}->{$s}->{'err'}; |
546 |
|
|
547 |
|
delete($self->{'slice'}->{$s}) && return 1; |
548 |
|
return 0; |
549 |
|
} |
550 |
|
|
551 |
|
=head2 to_xml |
552 |
|
|
553 |
|
Convert (binary safe, I hope) your data into XML for C<swish-e>. |
554 |
|
Data will not yet be recoded to UTF-8. L<"put_slice"> will do that. |
555 |
|
|
556 |
|
my $xml = $i->to_xml({ foo => 'bar' }); |
557 |
|
|
558 |
|
This function is extracted from L<"add"> method so that you can L<Memoize> it. |
559 |
|
If your data set has a lot of repeatable data, and memory is not a problem, you |
560 |
|
can add C<memoize_to_xml> option to L<"open">. |
561 |
|
|
562 |
|
=cut |
563 |
|
|
564 |
|
my %escape = ('<'=>'<', '>'=>'>', '&'=>'&', '"'=>'"'); |
565 |
|
my $escape_re = join '|' => keys %escape; |
566 |
|
|
567 |
|
sub to_xml { |
568 |
|
my $self = shift; |
569 |
|
|
570 |
|
my $data = shift || return; |
571 |
|
|
572 |
|
my $xml = qq{<xml>}; |
573 |
|
foreach my $tag (keys %$data) { |
574 |
|
my $content = $data->{$tag}; |
575 |
|
next if (! $content || $content eq ''); |
576 |
|
# save [cr/]lf before conversion to XML |
577 |
|
# $content =~ s/\n\r/##lf##/gs; |
578 |
|
# $content =~ s/\n/##lf##/gs; |
579 |
|
$content =~ s/($escape_re)/$escape{$1}/gs; |
580 |
|
$xml .= "<$tag><![CDATA[".$content."]]></$tag>"; |
581 |
|
} |
582 |
|
$xml .= qq{</xml>}; |
583 |
|
} |
584 |
|
|
585 |
1; |
1; |
586 |
__END__ |
__END__ |
587 |
|
|
588 |
|
|
589 |
=head2 Searching |
=head1 Searching |
590 |
|
|
591 |
Searching is still conducted using L<SWISH::API>, but you have to glob |
Searching is still conducted using L<SWISH::API>, but you have to glob |
592 |
index names. |
index names. |
601 |
That would also benefit performance, but it increases indexing time |
That would also benefit performance, but it increases indexing time |
602 |
because merged indexes must be re-created on each indexing run. |
because merged indexes must be re-created on each indexing run. |
603 |
|
|
604 |
=head2 EXPORT |
=head1 EXPORT |
605 |
|
|
606 |
None by default. |
Nothing by default. |
607 |
|
|
608 |
|
=head1 EXAMPLES |
609 |
|
|
610 |
|
Test script for this module uses all parts of API. It's also nice example |
611 |
|
how to use C<SWISH::Split>. |
612 |
|
|
613 |
=head1 SEE ALSO |
=head1 SEE ALSO |
614 |
|
|