| 1 |
1 |
dpavlin |
package SWISH::Split; |
| 2 |
|
|
|
| 3 |
|
|
use 5.008; |
| 4 |
|
|
use strict; |
| 5 |
|
|
use warnings; |
| 6 |
|
|
|
| 7 |
9 |
dpavlin |
our $VERSION = '0.03'; |
| 8 |
1 |
dpavlin |
|
| 9 |
|
|
use SWISH::API; |
| 10 |
|
|
use Text::Iconv; |
| 11 |
|
|
use File::Temp qw/ :mktemp /; |
| 12 |
|
|
use Carp; |
| 13 |
|
|
use Digest::MD5 qw(md5_hex); |
| 14 |
|
|
use Memoize; |
| 15 |
4 |
dpavlin |
use File::Which; |
| 16 |
1 |
dpavlin |
|
| 17 |
|
|
use Data::Dumper; |
| 18 |
|
|
|
| 19 |
5 |
dpavlin |
use constant { |
| 20 |
|
|
ADDED => 1, |
| 21 |
|
|
DELETED => 2, |
| 22 |
|
|
}; |
| 23 |
|
|
|
| 24 |
1 |
dpavlin |
=head1 NAME |
| 25 |
|
|
|
| 26 |
|
|
SWISH::Split - Perl interface to split index variant of Swish-e |
| 27 |
|
|
|
| 28 |
|
|
=head1 SYNOPSIS |
| 29 |
|
|
|
| 30 |
|
|
use SWISH::Split; |
| 31 |
|
|
|
| 32 |
|
|
|
| 33 |
|
|
=head1 DESCRIPTION |
| 34 |
|
|
|
| 35 |
|
|
This is alternative interface for indexing data with swish-e. It's designed |
| 36 |
4 |
dpavlin |
to split indexes over multiple files (slices) to allow updates of records in index |
| 37 |
|
|
by reindexing just changed parts (slice). |
| 38 |
1 |
dpavlin |
|
| 39 |
|
|
Data is stored in index using intrface which is somewhat similar to |
| 40 |
|
|
L<Plucene::Simple>. This could make your migration (or supporting two index |
| 41 |
|
|
engines) easier. |
| 42 |
|
|
|
| 43 |
|
|
In the background, it will fork swish-e binaries (one for each index slice) |
| 44 |
6 |
dpavlin |
and produce UTF-8 encoded XML files for it. So, if your input charset isn't |
| 45 |
1 |
dpavlin |
C<ISO-8859-1> you will have to specify it. |
| 46 |
|
|
|
| 47 |
|
|
=head1 Methods used for indexing |
| 48 |
|
|
|
| 49 |
8 |
dpavlin |
=head2 open_index |
| 50 |
1 |
dpavlin |
|
| 51 |
|
|
Create new object for index. |
| 52 |
|
|
|
| 53 |
8 |
dpavlin |
my $i = SWISH::Split->open_index({ |
| 54 |
1 |
dpavlin |
index => '/path/to/index', |
| 55 |
|
|
slice_name => \&slice_on_path, |
| 56 |
|
|
slices => 30, |
| 57 |
4 |
dpavlin |
merge => 0, |
| 58 |
|
|
codepage => 'ISO-8859-2', |
| 59 |
|
|
swish_config => qq{ |
| 60 |
|
|
PropertyNames from date |
| 61 |
|
|
PropertyNamesDate date |
| 62 |
|
|
}, |
| 63 |
|
|
memoize_to_xml => 0, |
| 64 |
1 |
dpavlin |
); |
| 65 |
|
|
|
| 66 |
|
|
# split index on first component of path |
| 67 |
|
|
sub slice_on_path { |
| 68 |
|
|
return shift split(/\//,$_[0]); |
| 69 |
|
|
} |
| 70 |
|
|
|
| 71 |
8 |
dpavlin |
Options to C<open_index> are following: |
| 72 |
1 |
dpavlin |
|
| 73 |
4 |
dpavlin |
=over 5 |
| 74 |
|
|
|
| 75 |
|
|
=item C<index> |
| 76 |
|
|
|
| 77 |
|
|
path to (existing) directory in which index slices will be created. |
| 78 |
|
|
|
| 79 |
|
|
=item C<slice_name> |
| 80 |
|
|
|
| 81 |
|
|
coderef to function which provide slicing from path. |
| 82 |
|
|
|
| 83 |
|
|
=item C<slices> |
| 84 |
|
|
|
| 85 |
|
|
maximum number of index slices. See L<"in_slice"> for |
| 86 |
1 |
dpavlin |
more explanation. |
| 87 |
|
|
|
| 88 |
4 |
dpavlin |
=item C<merge> |
| 89 |
|
|
|
| 90 |
|
|
(planned) option to merge indexes into one at end. |
| 91 |
|
|
|
| 92 |
|
|
=item C<codepage> |
| 93 |
|
|
|
| 94 |
|
|
data codepage (needed for conversion to UTF-8). |
| 95 |
|
|
By default, it's C<ISO-8859-1>. |
| 96 |
|
|
|
| 97 |
|
|
=item C<swish_config> |
| 98 |
|
|
|
| 99 |
|
|
additional parametars which will be inserted into |
| 100 |
11 |
dpavlin |
C<swish-e> configuration file. See C<swish-config>. |
| 101 |
4 |
dpavlin |
|
| 102 |
|
|
=item C<memoize_to_xml> |
| 103 |
|
|
|
| 104 |
|
|
speed up repeatable data, see L<"to_xml">. |
| 105 |
|
|
|
| 106 |
|
|
=back |
| 107 |
|
|
|
| 108 |
1 |
dpavlin |
=cut |
| 109 |
|
|
|
| 110 |
|
|
my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8'); |
| 111 |
|
|
|
| 112 |
8 |
dpavlin |
sub open_index { |
| 113 |
1 |
dpavlin |
my $class = shift; |
| 114 |
|
|
my $self = {@_}; |
| 115 |
|
|
bless($self, $class); |
| 116 |
|
|
|
| 117 |
|
|
croak "need slice_name coderef" unless ref $self->{'slice_name'}; |
| 118 |
|
|
croak "need slices" unless $self->{'slices'}; |
| 119 |
|
|
|
| 120 |
|
|
croak "need index" unless $self->{'index'}; |
| 121 |
|
|
croak "index '",$self->{'index'},"' doesn't exist" unless -e $self->{'index'}; |
| 122 |
|
|
croak "index '",$self->{'index'},"' is not directory" unless -d $self->{'index'}; |
| 123 |
|
|
|
| 124 |
|
|
$iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'}); |
| 125 |
|
|
|
| 126 |
4 |
dpavlin |
# speedup |
| 127 |
1 |
dpavlin |
memoize('in_slice'); |
| 128 |
4 |
dpavlin |
memoize('to_xml') if ($self->{'memoize_to_xml'}); |
| 129 |
1 |
dpavlin |
|
| 130 |
|
|
$self ? return $self : return undef; |
| 131 |
|
|
|
| 132 |
|
|
} |
| 133 |
|
|
|
| 134 |
|
|
=head2 add |
| 135 |
|
|
|
| 136 |
|
|
Add document to index. |
| 137 |
|
|
|
| 138 |
|
|
$i->add($swishpath, { |
| 139 |
|
|
headline => 'foobar result', |
| 140 |
|
|
property => 'data', |
| 141 |
|
|
}) |
| 142 |
|
|
|
| 143 |
|
|
=cut |
| 144 |
|
|
|
| 145 |
|
|
sub add { |
| 146 |
|
|
my $self = shift; |
| 147 |
3 |
dpavlin |
|
| 148 |
|
|
my $swishpath = shift || return; |
| 149 |
|
|
my $data = shift || return; |
| 150 |
|
|
|
| 151 |
7 |
dpavlin |
my $slice = $self->put_slice($swishpath, $self->to_xml($data)); |
| 152 |
4 |
dpavlin |
|
| 153 |
7 |
dpavlin |
return $slice; |
| 154 |
1 |
dpavlin |
} |
| 155 |
|
|
|
| 156 |
|
|
=head2 delete |
| 157 |
|
|
|
| 158 |
3 |
dpavlin |
Delete documents from index. |
| 159 |
1 |
dpavlin |
|
| 160 |
3 |
dpavlin |
$i->delete(@swishpath); |
| 161 |
1 |
dpavlin |
|
| 162 |
13 |
dpavlin |
B<This function is not implemented.> |
| 163 |
|
|
|
| 164 |
1 |
dpavlin |
=cut |
| 165 |
|
|
|
| 166 |
|
|
sub delete { |
| 167 |
|
|
my $self = shift; |
| 168 |
3 |
dpavlin |
|
| 169 |
|
|
my @paths = @_ || return; |
| 170 |
|
|
|
| 171 |
5 |
dpavlin |
foreach my $path (@paths) { |
| 172 |
|
|
$self->{'paths'}->{$path} = DELETED; |
| 173 |
|
|
} |
| 174 |
|
|
|
| 175 |
13 |
dpavlin |
die "delete is not yet implemented"; |
| 176 |
|
|
|
| 177 |
3 |
dpavlin |
return 42; |
| 178 |
1 |
dpavlin |
} |
| 179 |
|
|
|
| 180 |
|
|
|
| 181 |
5 |
dpavlin |
=head2 done |
| 182 |
1 |
dpavlin |
|
| 183 |
4 |
dpavlin |
Finish indexing and close index file(s). |
| 184 |
1 |
dpavlin |
|
| 185 |
5 |
dpavlin |
$i->done; |
| 186 |
1 |
dpavlin |
|
| 187 |
|
|
This is most time-consuming operation. When it's called, it will re-index |
| 188 |
|
|
all entries which haven't changed in all slices. |
| 189 |
|
|
|
| 190 |
4 |
dpavlin |
Returns number of slices updated. |
| 191 |
|
|
|
| 192 |
5 |
dpavlin |
This method should really be called close or finish, but both of those are |
| 193 |
|
|
allready used. |
| 194 |
|
|
|
| 195 |
1 |
dpavlin |
=cut |
| 196 |
|
|
|
| 197 |
5 |
dpavlin |
sub done { |
| 198 |
1 |
dpavlin |
my $self = shift; |
| 199 |
3 |
dpavlin |
|
| 200 |
4 |
dpavlin |
my $ret = 0; |
| 201 |
|
|
|
| 202 |
|
|
foreach my $s (keys %{$self->{'slice'}}) { |
| 203 |
8 |
dpavlin |
$self->_debug("closing slice $s"); |
| 204 |
4 |
dpavlin |
$ret += $self->close_slice($s); |
| 205 |
|
|
} |
| 206 |
|
|
|
| 207 |
|
|
return $ret; |
| 208 |
1 |
dpavlin |
} |
| 209 |
|
|
|
| 210 |
|
|
|
| 211 |
|
|
|
| 212 |
|
|
=head1 Reporting methods |
| 213 |
|
|
|
| 214 |
|
|
This methods return statistics about your index. |
| 215 |
|
|
|
| 216 |
|
|
=head2 swishpaths |
| 217 |
|
|
|
| 218 |
|
|
Return array of C<swishpath>s in index. |
| 219 |
|
|
|
| 220 |
|
|
my @p = $i->swishpaths; |
| 221 |
|
|
|
| 222 |
|
|
=cut |
| 223 |
|
|
|
| 224 |
|
|
sub swishpaths { |
| 225 |
|
|
my $self = shift; |
| 226 |
5 |
dpavlin |
|
| 227 |
|
|
my $s = shift || return; |
| 228 |
|
|
return if (! exists($self->{'slice'}->{'s'})); |
| 229 |
|
|
|
| 230 |
|
|
return keys %{$self->{'slice'}->{'s'}}; |
| 231 |
1 |
dpavlin |
} |
| 232 |
|
|
|
| 233 |
|
|
=head2 swishpaths_updated |
| 234 |
|
|
|
| 235 |
|
|
Return array with updated C<swishpath>s. |
| 236 |
|
|
|
| 237 |
|
|
my @d = $i->swishpaths_updated; |
| 238 |
|
|
|
| 239 |
|
|
=cut |
| 240 |
|
|
|
| 241 |
|
|
sub swishpaths_updated { |
| 242 |
|
|
my $self = shift; |
| 243 |
|
|
} |
| 244 |
|
|
|
| 245 |
|
|
|
| 246 |
|
|
=head2 swishpaths_deleted |
| 247 |
|
|
|
| 248 |
|
|
Return array with deleted C<swishpath>s. |
| 249 |
|
|
|
| 250 |
|
|
my $n = $i->swishpaths_deleted; |
| 251 |
|
|
|
| 252 |
|
|
=cut |
| 253 |
|
|
|
| 254 |
|
|
sub swishpaths_deleted { |
| 255 |
|
|
my $self = shift; |
| 256 |
|
|
} |
| 257 |
|
|
|
| 258 |
|
|
|
| 259 |
|
|
=head2 slices |
| 260 |
|
|
|
| 261 |
|
|
Return array with all slice names. |
| 262 |
|
|
|
| 263 |
|
|
my @s = $i->slices; |
| 264 |
|
|
|
| 265 |
|
|
=cut |
| 266 |
|
|
|
| 267 |
|
|
sub slices { |
| 268 |
|
|
my $self = shift; |
| 269 |
|
|
} |
| 270 |
|
|
|
| 271 |
|
|
=head1 Helper methods |
| 272 |
|
|
|
| 273 |
|
|
This methods are used internally, but they might be useful. |
| 274 |
|
|
|
| 275 |
|
|
=head2 in_slice |
| 276 |
|
|
|
| 277 |
|
|
Takes path and return slice in which this path belongs. |
| 278 |
|
|
|
| 279 |
|
|
my $s = $i->in_slice('path/to/document/in/index'); |
| 280 |
|
|
|
| 281 |
8 |
dpavlin |
If there are C<slices> parametar to L<"open_index"> it will use |
| 282 |
1 |
dpavlin |
MD5 hash to spread documents across slices. That will produce random |
| 283 |
|
|
distribution of your documents in slices, which might or might not be best |
| 284 |
|
|
for your data. If you have to re-index large number of slices on each |
| 285 |
|
|
run, think about creating your own C<slice> function and distributing |
| 286 |
|
|
documents manually across slices. |
| 287 |
|
|
|
| 288 |
4 |
dpavlin |
Slice number must always be true value or various sanity checks will fail. |
| 289 |
|
|
|
| 290 |
1 |
dpavlin |
This function is C<Memoize>ed for performance reasons. |
| 291 |
|
|
|
| 292 |
|
|
=cut |
| 293 |
|
|
|
| 294 |
|
|
sub in_slice { |
| 295 |
|
|
my $self = shift; |
| 296 |
|
|
|
| 297 |
|
|
my $path = shift || confess "need path"; |
| 298 |
|
|
|
| 299 |
|
|
confess "need slice_name function" unless ref ($self->{'slice_name'}); |
| 300 |
|
|
|
| 301 |
|
|
if ($self->{'slices'}) { |
| 302 |
|
|
# first, pass path through slice_name function |
| 303 |
|
|
my $slice = &{$self->{'slice_name'}}($path); |
| 304 |
|
|
# then calculate MD5 hash |
| 305 |
8 |
dpavlin |
my $hash = md5_hex($slice); |
| 306 |
1 |
dpavlin |
# take first 8 chars to produce number |
| 307 |
|
|
# FIXME how random is this? |
| 308 |
8 |
dpavlin |
$hash = hex(substr($hash,0,8)); |
| 309 |
1 |
dpavlin |
|
| 310 |
8 |
dpavlin |
$slice = ($hash % $self->{'slices'}) + 1; |
| 311 |
|
|
$self->_debug("hash: $hash / ",$self->{'slices'}," => $slice"); |
| 312 |
4 |
dpavlin |
return $slice; |
| 313 |
1 |
dpavlin |
} else { |
| 314 |
|
|
return &{$self->{'split'}}($path); |
| 315 |
|
|
} |
| 316 |
|
|
} |
| 317 |
|
|
|
| 318 |
3 |
dpavlin |
=head2 find_paths |
| 319 |
1 |
dpavlin |
|
| 320 |
3 |
dpavlin |
Return array of C<swishpath>s for given C<swish-e> query. |
| 321 |
1 |
dpavlin |
|
| 322 |
3 |
dpavlin |
my @p = $i->find_paths("headline=test*"); |
| 323 |
|
|
|
| 324 |
|
|
Useful for combining with L<"delete_documents"> to delete documents |
| 325 |
|
|
which hasn't changed a while (so, expired). |
| 326 |
|
|
|
| 327 |
|
|
=cut |
| 328 |
|
|
|
| 329 |
|
|
sub find_paths { |
| 330 |
|
|
my $self = shift; |
| 331 |
|
|
|
| 332 |
|
|
} |
| 333 |
|
|
|
| 334 |
|
|
|
| 335 |
4 |
dpavlin |
=head2 make_config |
| 336 |
3 |
dpavlin |
|
| 337 |
4 |
dpavlin |
Create C<swish-e> configuration file for given slice. |
| 338 |
|
|
|
| 339 |
|
|
my $config_filename = $i->make_config('slice name'); |
| 340 |
|
|
|
| 341 |
|
|
It returns configuration filename. If no C<swish_config> was defined in |
| 342 |
8 |
dpavlin |
L<"open_index">, default swish-e configuration will be used. It will index all data for |
| 343 |
4 |
dpavlin |
searching, but none for properties. |
| 344 |
|
|
|
| 345 |
|
|
If you want to see what is allready defined for swish-e in configuration |
| 346 |
|
|
take a look at source code for C<DEFAULT_SWISH_CONF>. |
| 347 |
|
|
|
| 348 |
7 |
dpavlin |
It uses C<stdin> as C<IndexDir> to comunicate with C<swish-e>. |
| 349 |
4 |
dpavlin |
|
| 350 |
|
|
=cut |
| 351 |
|
|
|
| 352 |
|
|
sub make_config { |
| 353 |
|
|
my $self = shift; |
| 354 |
|
|
|
| 355 |
|
|
|
| 356 |
|
|
my $index_file = $self->{'index'}."/"; |
| 357 |
|
|
$index_file .= shift || confess "need slice name"; |
| 358 |
|
|
|
| 359 |
|
|
my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX"); |
| 360 |
|
|
|
| 361 |
|
|
# find cat on filesystem |
| 362 |
|
|
my $cat = which('cat'); |
| 363 |
|
|
|
| 364 |
|
|
print $tmp_fh <<"DEFAULT_SWISH_CONF"; |
| 365 |
|
|
# swish-e config file |
| 366 |
|
|
|
| 367 |
7 |
dpavlin |
IndexDir stdin |
| 368 |
4 |
dpavlin |
|
| 369 |
|
|
# input file definition |
| 370 |
|
|
DefaultContents XML* |
| 371 |
|
|
|
| 372 |
|
|
# indexed metatags |
| 373 |
|
|
MetaNames xml swishdocpath |
| 374 |
|
|
|
| 375 |
|
|
|
| 376 |
|
|
#XMLClassAttributes type |
| 377 |
|
|
UndefinedMetaTags auto |
| 378 |
|
|
UndefinedXMLAttributes auto |
| 379 |
|
|
|
| 380 |
|
|
IndexFile $index_file |
| 381 |
|
|
|
| 382 |
|
|
# Croatian ISO-8859-2 characters to unaccented equivalents |
| 383 |
|
|
TranslateCharacters ¹©ðÐèÈæÆ¾® ssddcccczz |
| 384 |
|
|
|
| 385 |
|
|
|
| 386 |
|
|
# disable output |
| 387 |
|
|
ParserWarnLevel 0 |
| 388 |
|
|
IndexReport 1 |
| 389 |
|
|
|
| 390 |
|
|
DEFAULT_SWISH_CONF |
| 391 |
|
|
|
| 392 |
|
|
# add user parametars (like stored properties) |
| 393 |
|
|
print $tmp_fh $self->{'swish_config'} if ($self->{'swish_config'}); |
| 394 |
|
|
|
| 395 |
|
|
close($tmp_fh); |
| 396 |
|
|
|
| 397 |
|
|
return $swish_config_filename; |
| 398 |
|
|
} |
| 399 |
|
|
|
| 400 |
|
|
=head2 create_slice |
| 401 |
|
|
|
| 402 |
8 |
dpavlin |
On first run, starts C<swish-e>. On subsequent calls just return |
| 403 |
11 |
dpavlin |
it's handles using C<Memoize>. |
| 404 |
4 |
dpavlin |
|
| 405 |
|
|
my $s = create_slice('/path/to/document'); |
| 406 |
|
|
|
| 407 |
|
|
You shouldn't need to call C<create_slice> directly because it will be called |
| 408 |
|
|
from L<"put_slice"> when needed. |
| 409 |
|
|
|
| 410 |
|
|
=cut |
| 411 |
|
|
|
| 412 |
|
|
sub create_slice { |
| 413 |
|
|
my $self = shift; |
| 414 |
|
|
|
| 415 |
|
|
my $path = shift || confess "create_slice need path!"; |
| 416 |
|
|
|
| 417 |
|
|
my $s = $self->in_slice($path) || confess "in_slice returned null"; |
| 418 |
|
|
|
| 419 |
|
|
return $s if (exists($self->{'slice'}->{$s})); |
| 420 |
|
|
|
| 421 |
|
|
my $swish_config = $self->make_config($s); |
| 422 |
|
|
|
| 423 |
8 |
dpavlin |
my $swish = qq{| swish-e }; |
| 424 |
9 |
dpavlin |
if (-f $self->{'index'}.'/'.$s) { |
| 425 |
|
|
$swish .= qq{ -u }; |
| 426 |
|
|
$self->{'slice'}->{$s}->{'update_mode'}++; |
| 427 |
|
|
} |
| 428 |
8 |
dpavlin |
$swish .= qq{ -S prog -c } . $swish_config; |
| 429 |
4 |
dpavlin |
|
| 430 |
8 |
dpavlin |
$self->_debug("creating slice $s using $swish"); |
| 431 |
4 |
dpavlin |
|
| 432 |
|
|
## Build the harness, open all pipes, and launch the subprocesses |
| 433 |
8 |
dpavlin |
open(my $fh, $swish) || croak "can't open $swish: $!"; |
| 434 |
4 |
dpavlin |
|
| 435 |
8 |
dpavlin |
$self->{'slice'}->{$s}->{'h'} = $fh; |
| 436 |
4 |
dpavlin |
|
| 437 |
|
|
$self->slice_output($s); |
| 438 |
|
|
|
| 439 |
|
|
return $s; |
| 440 |
|
|
} |
| 441 |
|
|
|
| 442 |
|
|
=head2 put_slice |
| 443 |
|
|
|
| 444 |
7 |
dpavlin |
Pass XML data to swish. |
| 445 |
4 |
dpavlin |
|
| 446 |
7 |
dpavlin |
my $slice = $i->put_slice('/swish/path', '<xml>data</xml>'); |
| 447 |
4 |
dpavlin |
|
| 448 |
7 |
dpavlin |
Returns slice in which XML ended up. |
| 449 |
|
|
|
| 450 |
4 |
dpavlin |
=cut |
| 451 |
|
|
|
| 452 |
|
|
sub put_slice { |
| 453 |
|
|
my $self = shift; |
| 454 |
|
|
|
| 455 |
|
|
my $path = shift || confess "need path"; |
| 456 |
|
|
my $xml = shift || confess "need xml"; |
| 457 |
|
|
|
| 458 |
|
|
$xml = $iso2utf->convert($xml) || carp "XML conversion error in $xml"; |
| 459 |
|
|
|
| 460 |
|
|
my $s = $self->create_slice($path) || confess "create_slice returned null"; |
| 461 |
|
|
|
| 462 |
|
|
confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s})); |
| 463 |
|
|
confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'})); |
| 464 |
|
|
|
| 465 |
|
|
$self->slice_output($s); |
| 466 |
|
|
|
| 467 |
|
|
use bytes; # as opposed to chars |
| 468 |
8 |
dpavlin |
my $fh = $self->{'slice'}->{$s}->{'h'} || confess "handle for slice $s undefined"; |
| 469 |
9 |
dpavlin |
|
| 470 |
|
|
my $update_header = "Update-Mode: Index\n"; |
| 471 |
|
|
$update_header = '' unless ($self->{'slice'}->{$s}->{'update_mode'}); |
| 472 |
|
|
|
| 473 |
8 |
dpavlin |
print { $fh } "Path-Name: $path\n". |
| 474 |
9 |
dpavlin |
"Content-Length: ".(length($xml)+1)."\n" . $update_header . |
| 475 |
4 |
dpavlin |
"Document-Type: XML\n\n$xml\n"; |
| 476 |
|
|
|
| 477 |
|
|
$self->slice_output($s); |
| 478 |
|
|
|
| 479 |
8 |
dpavlin |
$self->_debug("dumping in slice $s: $path"); |
| 480 |
|
|
|
| 481 |
5 |
dpavlin |
$self->{'paths'}->{$path} = ADDED; |
| 482 |
|
|
|
| 483 |
4 |
dpavlin |
return $s; |
| 484 |
|
|
} |
| 485 |
|
|
|
| 486 |
|
|
=head2 slice_output |
| 487 |
|
|
|
| 488 |
|
|
Prints to STDERR output and errors from C<swish-e>. |
| 489 |
|
|
|
| 490 |
7 |
dpavlin |
my $slice = $i->slice_output($s); |
| 491 |
4 |
dpavlin |
|
| 492 |
|
|
Normally, you don't need to call it. |
| 493 |
|
|
|
| 494 |
8 |
dpavlin |
B<This is dummy placeholder function for very old code that assumes this |
| 495 |
|
|
module is using C<IPC::Run> which it isn't any more.> |
| 496 |
|
|
|
| 497 |
4 |
dpavlin |
=cut |
| 498 |
|
|
|
| 499 |
|
|
sub slice_output { |
| 500 |
|
|
my $self = shift; |
| 501 |
|
|
|
| 502 |
|
|
my $s = shift || confess "slice_output needs slice"; |
| 503 |
|
|
|
| 504 |
|
|
confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s})); |
| 505 |
|
|
|
| 506 |
8 |
dpavlin |
# FIXME |
| 507 |
4 |
dpavlin |
|
| 508 |
7 |
dpavlin |
return $s; |
| 509 |
4 |
dpavlin |
} |
| 510 |
|
|
|
| 511 |
5 |
dpavlin |
=head2 close_slice |
| 512 |
4 |
dpavlin |
|
| 513 |
|
|
Close slice (terminates swish-e process for that slice). |
| 514 |
|
|
|
| 515 |
|
|
my $i->close_slice($s); |
| 516 |
|
|
|
| 517 |
|
|
Returns true if slice is closed, false otherwise. |
| 518 |
|
|
|
| 519 |
|
|
=cut |
| 520 |
|
|
|
| 521 |
|
|
sub close_slice { |
| 522 |
|
|
my $self = shift; |
| 523 |
|
|
|
| 524 |
|
|
my $s = shift || confess "close_slice needs slice"; |
| 525 |
|
|
|
| 526 |
|
|
confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s})); |
| 527 |
|
|
confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'})); |
| 528 |
|
|
|
| 529 |
|
|
# pump rest of content (if any) |
| 530 |
8 |
dpavlin |
close $self->{'slice'}->{$s}->{'h'} || carp "can't close slice $s: $!"; |
| 531 |
4 |
dpavlin |
|
| 532 |
|
|
$self->slice_output($s); |
| 533 |
|
|
|
| 534 |
8 |
dpavlin |
undef $self->{'slice'}->{$s}->{'h'}; |
| 535 |
4 |
dpavlin |
|
| 536 |
|
|
delete($self->{'slice'}->{$s}) && return 1; |
| 537 |
|
|
return 0; |
| 538 |
|
|
} |
| 539 |
|
|
|
| 540 |
|
|
=head2 to_xml |
| 541 |
|
|
|
| 542 |
|
|
Convert (binary safe, I hope) your data into XML for C<swish-e>. |
| 543 |
|
|
Data will not yet be recoded to UTF-8. L<"put_slice"> will do that. |
| 544 |
|
|
|
| 545 |
|
|
my $xml = $i->to_xml({ foo => 'bar' }); |
| 546 |
|
|
|
| 547 |
11 |
dpavlin |
This function is extracted from L<"add"> method so that you can C<Memoize> it. |
| 548 |
4 |
dpavlin |
If your data set has a lot of repeatable data, and memory is not a problem, you |
| 549 |
8 |
dpavlin |
can add C<memoize_to_xml> option to L<"open_index">. |
| 550 |
4 |
dpavlin |
|
| 551 |
|
|
=cut |
| 552 |
|
|
|
| 553 |
|
|
my %escape = ('<'=>'<', '>'=>'>', '&'=>'&', '"'=>'"'); |
| 554 |
|
|
my $escape_re = join '|' => keys %escape; |
| 555 |
|
|
|
| 556 |
|
|
sub to_xml { |
| 557 |
|
|
my $self = shift; |
| 558 |
|
|
|
| 559 |
|
|
my $data = shift || return; |
| 560 |
|
|
|
| 561 |
|
|
my $xml = qq{<xml>}; |
| 562 |
|
|
foreach my $tag (keys %$data) { |
| 563 |
|
|
my $content = $data->{$tag}; |
| 564 |
|
|
next if (! $content || $content eq ''); |
| 565 |
|
|
# save [cr/]lf before conversion to XML |
| 566 |
|
|
# $content =~ s/\n\r/##lf##/gs; |
| 567 |
|
|
# $content =~ s/\n/##lf##/gs; |
| 568 |
|
|
$content =~ s/($escape_re)/$escape{$1}/gs; |
| 569 |
|
|
$xml .= "<$tag><![CDATA[".$content."]]></$tag>"; |
| 570 |
|
|
} |
| 571 |
|
|
$xml .= qq{</xml>}; |
| 572 |
|
|
} |
| 573 |
|
|
|
| 574 |
8 |
dpavlin |
sub _debug { |
| 575 |
|
|
my $self = shift; |
| 576 |
|
|
print STDERR "## ",@_,"\n" if ($self->{'debug'}); |
| 577 |
|
|
return; |
| 578 |
|
|
} |
| 579 |
|