Revision 4

Date:
2004/08/08 19:22:56
Author:
dpavlin
Revision Log:
first version which passes 51 test. It still doesn't update documents, just
insert.
Files:

Legend:

 
Added
 
Removed
 
Modified
  • trunk/Split.pm

     
    12 12 use Carp;
    13 13 use Digest::MD5 qw(md5_hex);
    14 14 use Memoize;
    15 use IPC::Run qw(start timeout pump finish);
    16 use File::Which;
    15 17
    16 18 use Data::Dumper;
    17 19
     
    27 29 =head1 DESCRIPTION
    28 30
    29 31 This is alternative interface for indexing data with swish-e. It's designed
    30 to split indexes over multiple files to allow updates of records in index
    31 by reindexing just changed parts.
    32 to split indexes over multiple files (slices) to allow updates of records in index
    33 by reindexing just changed parts (slice).
    32 34
    33 35 Data is stored in index using intrface which is somewhat similar to
    34 36 L<Plucene::Simple>. This could make your migration (or supporting two index
    35 37 engines) easier.
    36 38
    37 39 In the background, it will fork swish-e binaries (one for each index slice)
    38 and produce UTF-8 encoded XML files. So, if your imput charset isn't
    40 and produce UTF-8 encoded XML files for it. So, if your imput charset isn't
    39 41 C<ISO-8859-1> you will have to specify it.
    40 42
    41 43 =head1 Methods used for indexing
     
    48 50 index => '/path/to/index',
    49 51 slice_name => \&slice_on_path,
    50 52 slices => 30,
    51 merge => 1,
    52 codepage => 'ISO-8859-2'
    53 merge => 0,
    54 codepage => 'ISO-8859-2',
    55 swish_config => qq{
    56 PropertyNames from date
    57 PropertyNamesDate date
    58 },
    59 memoize_to_xml => 0,
    53 60 );
    54 61
    55 62 # split index on first component of path
     
    57 64 return shift split(/\//,$_[0]);
    58 65 }
    59 66
    67 Options to open are following:
    60 68
    61 C<slices> is maximum number of index slices. See L<"in_slice"> for
    69 =over 5
    70
    71 =item C<index>
    72
    73 path to (existing) directory in which index slices will be created.
    74
    75 =item C<slice_name>
    76
    77 coderef to function which provide slicing from path.
    78
    79 =item C<slices>
    80
    81 maximum number of index slices. See L<"in_slice"> for
    62 82 more explanation.
    63 83
    84 =item C<merge>
    85
    86 (planned) option to merge indexes into one at end.
    87
    88 =item C<codepage>
    89
    90 data codepage (needed for conversion to UTF-8).
    91 By default, it's C<ISO-8859-1>.
    92
    93 =item C<swish_config>
    94
    95 additional parametars which will be inserted into
    96 C<swish-e> configuration file. See L<swish-config>.
    97
    98 =item C<memoize_to_xml>
    99
    100 speed up repeatable data, see L<"to_xml">.
    101
    102 =back
    103
    64 104 =cut
    65 105
    66 106 my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
     
    70 110 my $self = {@_};
    71 111 bless($self, $class);
    72 112
    73 print Dumper($self->{'slice_name'});
    74
    75 113 croak "need slice_name coderef" unless ref $self->{'slice_name'};
    76 114 croak "need slices" unless $self->{'slices'};
    77 115
     
    81 119
    82 120 $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});
    83 121
    122 # speedup
    84 123 memoize('in_slice');
    124 memoize('to_xml') if ($self->{'memoize_to_xml'});
    85 125
    86 126 $self ? return $self : return undef;
    87 127
     
    104 144 my $swishpath = shift || return;
    105 145 my $data = shift || return;
    106 146
    147 my ($out,$err) = $self->put_slice($swishpath, $self->to_xml($data));
    148
    149 if ($err) {
    150 carp "$swishpath: $err";
    151 return 0;
    152 }
    153
    107 154 return 1;
    108 155 }
    109 156
     
    124 171 }
    125 172
    126 173
    127 =head2 close
    174 =head2 finish
    128 175
    129 Close index file and finish indexing.
    176 Finish indexing and close index file(s).
    130 177
    131 $i->close;
    178 $i->finish;
    132 179
    133 180 This is most time-consuming operation. When it's called, it will re-index
    134 181 all entries which haven't changed in all slices.
    135 182
    183 Returns number of slices updated.
    184
    136 185 =cut
    137 186
    138 sub close {
    187 sub finish {
    139 188 my $self = shift;
    140 189
    141 return 1;
    190 my $ret = 0;
    191
    192 foreach my $s (keys %{$self->{'slice'}}) {
    193 $ret += $self->close_slice($s);
    194 }
    195
    196 return $ret;
    142 197 }
    143 198
    144 199
     
    214 269 run, think about creating your own C<slice> function and distributing
    215 270 documents manually across slices.
    216 271
    272 Slice number must always be true value or various sanity checks will fail.
    273
    217 274 This function is C<Memoize>ed for performance reasons.
    218 275
    219 276 =cut
     
    223 280
    224 281 my $path = shift || confess "need path";
    225 282
    226 print Dumper($self->{'slice_name'});
    227 283 confess "need slice_name function" unless ref ($self->{'slice_name'});
    228 284
    229 285 if ($self->{'slices'}) {
     
    235 291 # FIXME how random is this?
    236 292 $slice = hex(substr($slice,0,8));
    237 293
    238 print "slice_nr: $slice slices: ",$self->{'slices'},"\n";
    239 return ($slice % $self->{'slices'});
    294 $slice = ($slice % $self->{'slices'}) + 1;
    295 print "hash: $slice / ",$self->{'slices'}," => $slice\n";
    296 return $slice;
    240 297 } else {
    241 298 return &{$self->{'split'}}($path);
    242 299 }
     
    260 317 }
    261 318
    262 319
    320 =head2 make_config
    263 321
    322 Create C<swish-e> configuration file for given slice.
    323
    324 my $config_filename = $i->make_config('slice name');
    325
    326 It returns configuration filename. If no C<swish_config> was defined in
    327 L<"open">, default swish-e configuration will be used. It will index all data for
    328 searching, but none for properties.
    329
    330 If you want to see what is allready defined for swish-e in configuration
    331 take a look at source code for C<DEFAULT_SWISH_CONF>.
    332
    333 It uses C<cat> utility to comunicate with C<swish-e>. Path is provided
    334 by C<File::Which>. Do Windows users have to change that to C<COPY /B>
    335 or something similar?
    336
    337 =cut
    338
    339 sub make_config {
    340 my $self = shift;
    341
    342
    343 my $index_file = $self->{'index'}."/";
    344 $index_file .= shift || confess "need slice name";
    345
    346 my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX");
    347
    348 # find cat on filesystem
    349 my $cat = which('cat');
    350
    351 print $tmp_fh <<"DEFAULT_SWISH_CONF";
    352 # swish-e config file
    353
    354 IndexDir cat
    355 #SwishProgParameters -
    356
    357 # input file definition
    358 DefaultContents XML*
    359
    360 # indexed metatags
    361 MetaNames xml swishdocpath
    362
    363
    364 #XMLClassAttributes type
    365 UndefinedMetaTags auto
    366 UndefinedXMLAttributes auto
    367
    368 IndexFile $index_file
    369
    370 # Croatian ISO-8859-2 characters to unaccented equivalents
    371 TranslateCharacters ƾ ssddcccczz
    372
    373
    374 # disable output
    375 ParserWarnLevel 0
    376 IndexReport 1
    377
    378 DEFAULT_SWISH_CONF
    379
    380 # add user parametars (like stored properties)
    381 print $tmp_fh $self->{'swish_config'} if ($self->{'swish_config'});
    382
    383 close($tmp_fh);
    384
    385 return $swish_config_filename;
    386 }
    387
    388 =head2 create_slice
    389
    390 On first run, starts C<swish-e> using L<IPC::Run>. On subsequent calls just return
    391 it's handles using L<Memoize>.
    392
    393 my $s = create_slice('/path/to/document');
    394
    395 You shouldn't need to call C<create_slice> directly because it will be called
    396 from L<"put_slice"> when needed.
    397
    398 =cut
    399
    400 sub create_slice {
    401 my $self = shift;
    402
    403 my $path = shift || confess "create_slice need path!";
    404
    405 my $s = $self->in_slice($path) || confess "in_slice returned null";
    406
    407 return $s if (exists($self->{'slice'}->{$s}));
    408
    409 my $swish_config = $self->make_config($s);
    410
    411 print STDERR "creating slice $s\n"; # FIXME
    412
    413 my @swish = qw(swish-e -S prog -c);
    414 push @swish, $swish_config;
    415
    416 ## Build the harness, open all pipes, and launch the subprocesses
    417 $self->{'slice'}->{$s}->{'h'} = start \@swish,
    418 \$self->{'slice'}->{$s}->{'in'},
    419 \$self->{'slice'}->{$s}->{'out'},
    420 \$self->{'slice'}->{$s}->{'err'},
    421 timeout( 90 ); # FIXME
    422
    423 $self->{'slice'}->{$s}->{'out_len'} = 0;
    424 $self->{'slice'}->{$s}->{'err_len'} = 0;
    425
    426 $self->slice_output($s);
    427
    428 return $s;
    429 }
    430
    431 =head2 put_slice
    432
    433 Pass XML data to swish and receive output and errors.
    434
    435 my ($out,$err) = $i->put_slice('/swish/path', '<xml>data</xml>');
    436
    437 =cut
    438
    439 sub put_slice {
    440 my $self = shift;
    441
    442 my $path = shift || confess "need path";
    443 my $xml = shift || confess "need xml";
    444
    445 $xml = $iso2utf->convert($xml) || carp "XML conversion error in $xml";
    446
    447 my $s = $self->create_slice($path) || confess "create_slice returned null";
    448
    449 confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
    450 confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));
    451 confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
    452
    453 $self->slice_output($s);
    454
    455 use bytes; # as opposed to chars
    456 $self->{'slice'}->{$s}->{'in'} .=
    457 "Path-Name: $path\n".
    458 "Content-Length: ".(length($xml)+1)."\n".
    459 "Document-Type: XML\n\n$xml\n";
    460
    461 # do I/O
    462 $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'} ; # wait for all input to go
    463
    464 $self->slice_output($s);
    465
    466 return $s;
    467 }
    468
    469 =head2 slice_output
    470
    471 Prints to STDERR output and errors from C<swish-e>.
    472
    473 $i->slice_output($s);
    474
    475 Normally, you don't need to call it.
    476
    477 =cut
    478
    479 sub slice_output {
    480 my $self = shift;
    481
    482 my $s = shift || confess "slice_output needs slice";
    483
    484 confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
    485 confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));
    486 confess "no 'out' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'out'}));
    487
    488 if (length $self->{'slice'}->{$s}->{'out'} > $self->{'slice'}->{$s}->{'out_len'}) {
    489 #print STDERR "swish-e OUT: ",$self->{'slice'}->{$s}->{'out'},"\n" if ($self->{'slice'}->{$s}->{'out'});
    490 $self->{'slice'}->{$s}->{'out_len'} = length $self->{'slice'}->{$s}->{'out'};
    491 return 1;
    492 } elsif (length $self->{'slice'}->{$s}->{'err'} > $self->{'slice'}->{$s}->{'err_len'}) {
    493 print STDERR "swish-e ERR: ",$self->{'slice'}->{$s}->{'err'},"\n" if ($self->{'slice'}->{$s}->{'err'});
    494 $self->{'slice'}->{$s}->{'err_len'} = length $self->{'slice'}->{$s}->{'err'};
    495 # this is fatal
    496 return 0;
    497 }
    498
    499 return 1;
    500 }
    501
    502 =head2 close_slice {
    503
    504 Close slice (terminates swish-e process for that slice).
    505
    506 my $i->close_slice($s);
    507
    508 Returns true if slice is closed, false otherwise.
    509
    510 =cut
    511
    512 sub close_slice {
    513 my $self = shift;
    514
    515 my $s = shift || confess "close_slice needs slice";
    516
    517 confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
    518 confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
    519
    520 # pump rest of content (if any)
    521 $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'};
    522
    523 $self->slice_output($s);
    524
    525 # clean up
    526 $self->{'slice'}->{$s}->{'h'}->finish or confess "finish on slice $s returned: $?";
    527
    528 delete($self->{'slice'}->{$s}) && return 1;
    529 return 0;
    530 }
    531
    532 =head2 to_xml
    533
    534 Convert (binary safe, I hope) your data into XML for C<swish-e>.
    535 Data will not yet be recoded to UTF-8. L<"put_slice"> will do that.
    536
    537 my $xml = $i->to_xml({ foo => 'bar' });
    538
    539 This function is extracted from L<"add"> method so that you can L<Memoize> it.
    540 If your data set has a lot of repeatable data, and memory is not a problem, you
    541 can add C<memoize_to_xml> option to L<"open">.
    542
    543 =cut
    544
    545 my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
    546 my $escape_re = join '|' => keys %escape;
    547
    548 sub to_xml {
    549 my $self = shift;
    550
    551 my $data = shift || return;
    552
    553 my $xml = qq{<xml>};
    554 foreach my $tag (keys %$data) {
    555 my $content = $data->{$tag};
    556 next if (! $content || $content eq '');
    557 # save [cr/]lf before conversion to XML
    558 # $content =~ s/\n\r/##lf##/gs;
    559 # $content =~ s/\n/##lf##/gs;
    560 $content =~ s/($escape_re)/$escape{$1}/gs;
    561 $xml .= "<$tag><![CDATA[".$content."]]></$tag>";
    562 }
    563 $xml .= qq{</xml>};
    564 }
    565
    264 566 1;
    265 567 __END__
    266 568
     
    282 584
    283 585 =head2 EXPORT
    284 586
    285 None by default.
    587 Nothing by default.
    286 588
    589 =head2 EXAMPLES
    287 590
    591 Test script for this module uses all parts of API. It's also nice example
    592 how to use C<SWISH::Split>.
    288 593
    289 594 =head1 SEE ALSO
    290 595
  • trunk/t/SWISH-Split.t

     
    2 2
    3 3 use strict;
    4 4
    5 use Test::More tests => 11;
    5 use Test::More tests => 51;
    6 6 use Test::Exception;
    7 7 use File::Temp qw/ :mktemp /;
    8 8 use blib;
    9 9
    10 BEGIN { use_ok('SWISH::Split') };
    10 BEGIN {
    11 use_ok('SWISH::Split');
    12 use_ok('SWISH::API');
    13 };
    11 14
    12 15 # FIXME debug
    13 16 system "rm -Rf /tmp/swish?????";
    17 system "rm -Rf /tmp/swish?????.prop";
    18 system "rm -Rf /tmp/swish?????.temp";
    14 19
    15 20 my %param;
    16 21
    17 22 throws_ok { SWISH::Split->open(%param) } qr/slice_name/, "slice_name";
    18 23
    19 sub slice_1st_char {
    20 return substr($_[0],0,1);
    24 sub slice_hash {
    25 return $_[0];
    21 26 };
    22 27
    23 use Data::Dumper;
    24 print Dumper(\&slice_1st_char);
    25
    26 $param{'slice_name'} = \&slice_1st_char;
    28 $param{'slice_name'} = \&slice_hash;
    27 29 throws_ok { SWISH::Split->open(%param) } qr/slices/, "need slices";
    28 30
    29 31 $param{'slices'} = 3;
    30 32 throws_ok { SWISH::Split->open(%param) } qr/index/, "need index";
    31 33
    32 ok($param{'index'} = mktemp("/tmp/swishXXXXX"), "make temp name");
    34 ok($param{'index'} = mktemp("/tmp/swishXXXXX"), "index name");
    33 35
    34 36 diag "index path: $param{'index'}\n";
    35 37
     
    42 44
    43 45 ok(mkdir($param{'index'}), "mkdir");
    44 46
    47 $param{'swish_config'} = qq{
    48 PropertyNames foo
    49 };
    50
    45 51 ok(my $i=SWISH::Split->open(%param), "open");
    46 52
    47 # methods test
    53 cmp_ok(my $s = $i->in_slice("swishpath"), '==', 1, "open");
    48 54
    55 ok(my $config = $i->make_config($s), "make_config");
    56 diag "swish config: $config";
    49 57
    58 # make temporary index and data names
    50 59
    51 # internal functions test
    60 ok(my $test_index = mktemp("/tmp/swishXXXXX"), "test index name");
    61 diag "test index: $test_index";
    62 ok(my $test_data = mktemp("/tmp/swishXXXXX"), "test data name");
    63 diag "test data: $test_data";
    52 64
    53 cmp_ok($i->in_slice("swishpath"), '==', 2, "open");
    65 ok(my $xml = $i->to_xml({ foo => 'bar' }), "to_xml");
    54 66
    67 sub write_test_data($$) {
    68 my ($path,$xml) = @_;
    69
    70 use bytes;
    71 my $l = length($xml);
    72
    73 diag "xml: $xml [$l bytes]";
    74 ok(open(DATA, "> $test_data"), "write to test data");
    75 print DATA "Path-name: $path\nContent-length: $l\n\n$xml";
    76 ok(close(DATA), "close");
    77 }
    78
    79 write_test_data('testpath',$xml);
    80
    81 # test swish-e binary
    82 ok(my $out =`cat $test_data | swish-e -S prog -f $test_index -c $config 2>&1`, "test config");
    83
    84 like($out, qr/foo/, "found foo");
    85 like($out, qr/testpath/, "found testpath");
    86
    87 diag "swish-e binary o.k.";
    88
    89 # test compatiblity of produced index with SWISH::API
    90
    91 sub swish_search {
    92 my ($index, $query, $hits, $path, $size, $prop, $val) = @_;
    93
    94 my $swish = SWISH::API->new($index);
    95 ok(! $swish->Error, "SWISH::API->new $index");
    96
    97 ok(my $results = $swish->Query($query), "SWISH::API->Query $query");
    98 ok(! $swish->Error, "no error");
    99
    100 cmp_ok($results->Hits, '==', $hits, "got $hits hits");
    101
    102 ok(my $result = $results->NextResult, "get result");
    103
    104 cmp_ok($result->Property('swishdocpath'), '==', $path, "correct swishdocpath") if ($path);
    105 cmp_ok($result->Property('swishdocsize'), '==', $size, "correct swishdocsize") if (defined($size));
    106 cmp_ok($result->Property($prop), '==', $val, "correct data") if (defined($prop) && defined($val));
    107 }
    108
    109 swish_search($test_index, "foo=(bar)", 1, "testpath", length($xml), "foo", "bar");
    110
    111 diag "SWISH::API o.k.";
    112
    113 # now, test slice handling
    114
    115 ok(my $slice = $i->create_slice('testpath'), "create_slice");
    116
    117 ok($i->put_slice('testpath', $xml), "put_slice");
    118
    119 ok($i->close_slice($slice), "close_slice");
    120
    121 swish_search($param{'index'}."/$slice", "foo=(bar)", 1, "testpath", length($xml)+1, "foo", "bar");
    122
    123 diag "slice handling o.k.";
    124
    125 ok($i->add('testpath',{ foo => 'bar' }),"add foo");
    126
    127 foreach (1..$param{'slices'} * 3) {
    128 ok($i->add('testpath'.$_,{ 'foo'.$_ => 'bar'.$_ }), "add $_");
    129 }
    130
    131 cmp_ok($i->finish, '==', 3, "finish");
    132
    133
    134 #diag "$out";