# SWISH-Split/trunk/Split.pm

package SWISH::Split;

use 5.008;
use strict;
use warnings;

our $VERSION = '0.03';

use SWISH::API;
use Text::Iconv;
use File::Temp qw/ :mktemp /;
use Carp;
use Digest::MD5 qw(md5_hex);
use Memoize;
use File::Which;

use Data::Dumper;

use constant {
        ADDED => 1,
        DELETED => 2,
};

=head1 NAME

SWISH::Split - Perl interface to split index variant of Swish-e

=head1 SYNOPSIS

  use SWISH::Split;


=head1 DESCRIPTION

This is an alternative interface for indexing data with swish-e. It's designed
to split indexes over multiple files (slices) to allow updates of records in the
index by reindexing just the changed parts (slices).

Data is stored in the index using an interface which is somewhat similar to
L<Plucene::Simple>. This could make your migration (or supporting two index
engines) easier.

In the background, it will fork swish-e binaries (one for each index slice)
and produce UTF-8 encoded XML files for it. So, if your input charset isn't
C<ISO-8859-1> you will have to specify it.

=head1 Methods used for indexing

=head2 open_index

Create new object for index.

  my $i = SWISH::Split->open_index(
        index => '/path/to/index',
        slice_name => \&slice_on_path,
        slices => 30,
        merge => 0,
        codepage => 'ISO-8859-2',
        swish_config => qq{
                PropertyNames from date
                PropertyNamesDate date
        },
        memoize_to_xml => 0,
  );

  # split index on first component of path
  sub slice_on_path {
        return (split(/\//,$_[0]))[0];
  }

Options to C<open_index> are following:

=over 5

=item C<index>

path to (existing) directory in which index slices will be created.

=item C<slice_name>

coderef to function which provide slicing from path.

=item C<slices>

maximum number of index slices. See L<"in_slice"> for
more explanation.

=item C<merge>

(planned) option to merge indexes into one at end.

=item C<codepage>

data codepage (needed for conversion to UTF-8).
By default, it's C<ISO-8859-1>.

=item C<swish_config>

additional parameters which will be inserted into
C<swish-e> configuration file. See L<swish-config>.

=item C<memoize_to_xml>

speed up repeatable data, see L<"to_xml">.

=back

=cut

# Converter used by put_slice to recode XML into UTF-8. It is shared by the
# whole module and gets replaced when open_index is given a 'codepage'.
my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');

# Names of functions which are already wrapped by Memoize, so that opening
# more than one index in a process doesn't stack memoize wrappers.
my %memoized;

# Cache key for memoized to_xml. Memoize's default key is built from the
# stringified arguments, which for a hash reference is just its address;
# an address can be reused for a different document, so the key is built
# from the (length-prefixed) field names and values instead.
sub _to_xml_cache_key {
        my ($self, $data) = @_;

        return '' unless (ref($data) eq 'HASH');

        return join('', map {
                my $value = defined($data->{$_}) ? $data->{$_} : '';
                length($_) . ':' . $_ . length($value) . ':' . $value;
        } sort keys %$data);
}

# Constructor. Options may be passed either as a flat key => value list or
# as a single hash reference.
#
# Required options: slice_name (a reference), slices (true value) and
# index (path to an existing directory). Croaks if any of them is missing
# or invalid.
#
# Returns the new object.
sub open_index {
        my $class = shift;

        my %options = (@_ == 1 && ref($_[0]) eq 'HASH') ? %{ $_[0] } : @_;
        my $self = bless({ %options }, $class);

        croak "need slice_name coderef" unless ref $self->{'slice_name'};
        croak "need slices" unless $self->{'slices'};

        croak "need index" unless $self->{'index'};
        croak "index '",$self->{'index'},"' doesn't exist" unless -e $self->{'index'};
        croak "index '",$self->{'index'},"' is not directory" unless -d $self->{'index'};

        if ($self->{'codepage'}) {
                $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8')
                        || croak "unsupported codepage '",$self->{'codepage'},"'";
        }

        # speedup (each function is wrapped only once per process)
        memoize('in_slice') unless ($memoized{'in_slice'}++);
        if ($self->{'memoize_to_xml'} && ! $memoized{'to_xml'}++) {
                memoize('to_xml', NORMALIZER => \&_to_xml_cache_key);
        }

        return $self;
}

=head2 add

Add document to index.

  $i->add($swishpath, {
        headline => 'foobar result',
        property => 'data',
  })

=cut

# Add one document to the index.
#
# Takes the swishpath of the document and a hash reference with its
# fields. The fields are serialised by to_xml and handed to put_slice.
# Returns whatever put_slice returns (the slice name), or nothing when
# either argument is missing/false.
sub add {
        my ($self, $swishpath, $data) = @_;

        return unless $swishpath;
        return unless $data;

        return scalar $self->put_slice($swishpath, $self->to_xml($data));
}

=head2 delete

Delete documents from index.

  $i->delete(@swishpath);

=cut

# Mark the given swishpaths as deleted in this object's path bookkeeping
# ($self->{'paths'}).
#
# Returns nothing when called without paths, otherwise the (historical)
# true value 42.
sub delete {
        my ($self, @paths) = @_;

        # The list has to be tested on its own: "my @paths = @_ || return"
        # would evaluate @_ in scalar context and store just the count.
        return unless @paths;

        foreach my $path (@paths) {
                $self->{'paths'}->{$path} = DELETED;
        }

        return 42;
}


=head2 done

Finish indexing and close index file(s).

  $i->done;

This is the most time-consuming operation. When it's called, it will re-index
all entries which haven't changed in all slices.

Returns number of slices updated.

This method should really be called close or finish, but both of those are
already used.

=cut

# Close every slice which is currently open and return the sum of the
# values returned by close_slice.
sub done {
        my ($self) = @_;

        my $closed = 0;

        for my $name (keys %{ $self->{'slice'} }) {
                $self->_debug("closing slice $name");
                $closed += $self->close_slice($name);
        }

        return $closed;
}


=head1 Reporting methods

These methods return statistics about your index.

=head2 swishpaths

Return array of C<swishpath>s in index.

  my @p = $i->swishpaths;

=cut

# Return the keys stored for slice $s in $self->{'slice'}.
#
# Returns nothing when no slice is given or when the slice isn't known.
#
# NOTE(review): slice records created by create_slice hold the keys 'h'
# and 'update_mode', not swishpaths; paths are tracked in $self->{'paths'}.
# Confirm what this method is expected to report.
sub swishpaths {
        my $self = shift;

        my $s = shift || return;

        # look up the requested slice (this used to test the literal key 's')
        return if (! exists($self->{'slice'}->{$s}));

        return keys %{$self->{'slice'}->{$s}};
}

=head2 swishpaths_updated

Return array with updated C<swishpath>s.

  my @d = $i->swishpaths_updated;

=cut

# Return the swishpaths which are marked ADDED in $self->{'paths'} (that
# mark is set by put_slice). In scalar context returns their number.
sub swishpaths_updated {
        my ($self) = @_;

        my $state_of = $self->{'paths'} || {};

        return grep { $state_of->{$_} == ADDED() } keys %$state_of;
}


=head2 swishpaths_deleted

Return array with deleted C<swishpath>s.

  my $n = $i->swishpaths_deleted;

=cut

# Return the swishpaths which are marked DELETED in $self->{'paths'} (that
# mark is set by delete). In scalar context returns their number.
sub swishpaths_deleted {
        my ($self) = @_;

        my $state_of = $self->{'paths'} || {};

        return grep { $state_of->{$_} == DELETED() } keys %$state_of;
}


=head2 slices

Return array with all slice names.

  my @s = $i->slices;

=cut

# Return the (sorted) names of slices recorded in $self->{'slice'}; in
# scalar context returns their number.
#
# NOTE(review): only slices opened by create_slice and not yet closed by
# close_slice are recorded there, so slices which exist just on disk are
# not reported. Confirm whether those should be listed too.
sub slices {
        my ($self) = @_;

        my @names = sort keys %{ $self->{'slice'} || {} };

        return @names;
}

=head1 Helper methods

These methods are used internally, but they might be useful.

=head2 in_slice

Takes path and return slice in which this path belongs.

  my $s = $i->in_slice('path/to/document/in/index');

If the C<slices> parameter was given to L<"open_index">, it will use an
MD5 hash to spread documents across slices. That will produce a random
distribution of your documents in slices, which might or might not be best
for your data. If you have to re-index a large number of slices on each
run, think about creating your own C<slice_name> function and distributing
documents manually across slices.

Slice number must always be true value or various sanity checks will fail.

This function is C<Memoize>ed for performance reasons.

=cut

# Map a document path to the name of the slice it belongs to.
#
# The path is first passed through the slice_name callback. When the
# 'slices' option is set, the callback's result is hashed with MD5 and
# reduced to a number between 1 and 'slices'. Without 'slices' the
# callback's result is used as the slice name directly.
#
# Confesses when the path is missing/false or slice_name isn't a reference.
sub in_slice {
        my $self = shift;

        my $path = shift || confess "need path";

        my $slice_name = $self->{'slice_name'};
        confess "need slice_name function" unless ref ($slice_name);

        my $slice = $slice_name->($path);

        # no limit on number of slices: the callback decides
        # (this branch used to call the never-set $self->{'split'})
        return $slice unless ($self->{'slices'});

        # take first 8 hex chars of MD5 hash to produce number
        # FIXME how random is this?
        my $hash = hex(substr(md5_hex($slice),0,8));

        # +1 because slice number must always be a true value
        $slice = ($hash % $self->{'slices'}) + 1;
        $self->_debug("hash: $hash / ",$self->{'slices'}," => $slice");
        return $slice;
}

=head2 find_paths

Return array of C<swishpath>s for given C<swish-e> query.

  my @p = $i->find_paths("headline=test*");

Useful for combining with L<"delete"> to delete documents
which haven't changed in a while (so, expired).

=cut

# Not implemented yet: always returns an empty list (undef in scalar
# context). Without the explicit return the sub would hand back its own
# invocant as the "result".
sub find_paths {
        my ($self) = @_;

        return;
}


=head2 make_config

Create C<swish-e> configuration file for given slice.

  my $config_filename = $i->make_config('slice name');

It returns configuration filename. If no C<swish_config> was defined in
L<"open_index">, default swish-e configuration will be used. It will index all data for
searching, but none for properties.

If you want to see what is already defined for swish-e in configuration
take a look at source code for C<DEFAULT_SWISH_CONF>.

It uses C<stdin> as C<IndexDir> to communicate with C<swish-e>.

=cut

# Write a swish-e configuration file for one slice and return its filename.
#
# Takes the slice name; the slice's index file is "<index>/<slice name>".
# The file is created with mkstemp under /tmp and consists of the defaults
# below followed by the user supplied 'swish_config' option (if any). It is
# not removed here.
#
# Confesses when the slice name is missing/false or the file can't be
# written.
sub make_config {
        my $self = shift;

        my $slice = shift || confess "need slice name";
        my $index_file = $self->{'index'}."/".$slice;

        my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX");

        my $config = <<"DEFAULT_SWISH_CONF";
# swish-e config file

IndexDir stdin

# input file definition
DefaultContents XML*

# indexed metatags
MetaNames xml swishdocpath


#XMLClassAttributes type
UndefinedMetaTags auto
UndefinedXMLAttributes auto

IndexFile $index_file

# Croatian ISO-8859-2 characters to unaccented equivalents
TranslateCharacters ¹©ðÐèÈæÆ¾® ssddcccczz


# disable output
ParserWarnLevel 0
IndexReport 1

DEFAULT_SWISH_CONF

        # add user parameters (like stored properties)
        $config .= $self->{'swish_config'} if ($self->{'swish_config'});

        # buffered write errors may surface only at close, so check both
        print { $tmp_fh } $config
                or confess "can't write $swish_config_filename: $!";
        close($tmp_fh)
                or confess "can't close $swish_config_filename: $!";

        return $swish_config_filename;
}

=head2 create_slice

On first run, starts C<swish-e>. On subsequent calls for the same slice it
just returns the slice name, reusing the already opened handle.

  my $s = create_slice('/path/to/document');

You shouldn't need to call C<create_slice> directly because it will be called
from L<"put_slice"> when needed.

=cut

# Make sure a swish-e process is running for the slice to which $path
# belongs and return the slice name.
#
# If the slice is already recorded in $self->{'slice'} nothing is started.
# Otherwise a configuration file is written by make_config and swish-e is
# started with its standard input connected to the handle stored under
# 'h'. When the slice's index file already exists, swish-e is started with
# -u and 'update_mode' is set for the slice.
#
# Confesses on missing path or slice, croaks when swish-e can't be started.
sub create_slice {
        my $self = shift;

        my $path = shift || confess "create_slice need path!";

        my $s = $self->in_slice($path) || confess "in_slice returned null";

        return $s if (exists($self->{'slice'}->{$s}));

        my $swish_config = $self->make_config($s);

        my $update_mode = -f $self->{'index'}.'/'.$s;

        my @command = ('swish-e');
        push @command, '-u' if ($update_mode);
        push @command, '-S', 'prog', '-c', $swish_config;

        # printable form of the pipe, used only in messages
        my $swish = join(' ', '|', @command);

        $self->_debug("creating slice $s using $swish");

        # list form of pipe open: arguments are passed to swish-e directly,
        # without going through the shell
        open(my $fh, '|-', @command) || croak "can't open $swish: $!";

        # Record the slice only after swish-e was started, so that failed
        # start doesn't leave a slice without handle behind.
        $self->{'slice'}->{$s} = { 'h' => $fh };
        $self->{'slice'}->{$s}->{'update_mode'} = 1 if ($update_mode);

        $self->slice_output($s);

        return $s;
}

=head2 put_slice

Pass XML data to swish.

  my $slice = $i->put_slice('/swish/path', '<xml>data</xml>');

Returns slice in which XML ended up.

=cut

# Send one XML document to the swish-e process of the slice to which $path
# belongs (the slice is created when needed).
#
# The XML is recoded to UTF-8 with the module-wide converter and written
# with Path-Name, Content-Length, optional Update-Mode and Document-Type
# headers. The path is then marked as ADDED in $self->{'paths'}.
#
# Returns the slice name, or nothing (after a warning) when the XML can't
# be converted. Confesses on missing arguments, missing slice/handle or a
# failed write.
sub put_slice {
        my $self = shift;

        my $path = shift || confess "need path";
        my $xml = shift || confess "need xml";

        # Skip document which can't be converted: assigning the result of
        # "convert(...) || carp ..." would index carp's return value instead.
        my $utf8_xml = $iso2utf->convert($xml);
        unless (defined($utf8_xml)) {
                carp "XML conversion error in $xml";
                return;
        }

        my $s = $self->create_slice($path) || confess "create_slice returned null";

        confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
        confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));

        $self->slice_output($s);

        my $fh = $self->{'slice'}->{$s}->{'h'} || confess "handle for slice $s undefined";

        my $update_header = $self->{'slice'}->{$s}->{'update_mode'}
                ? "Update-Mode: Index\n"
                : '';

        # Content-Length is in bytes (as opposed to chars)
        my $byte_count = do { use bytes; length($utf8_xml) };

        # +1 is for the newline which follows the document
        print { $fh } "Path-Name: $path\n".
                "Content-Length: ".($byte_count+1)."\n" . $update_header .
                "Document-Type: XML\n\n$utf8_xml\n"
                or confess "can't write to slice $s: $!";

        $self->slice_output($s);

        $self->_debug("dumping in slice $s: $path");

        $self->{'paths'}->{$path} = ADDED;

        return $s;
}

=head2 slice_output

Prints to STDERR output and errors from C<swish-e>.

  my $slice = $i->slice_output($s);

Normally, you don't need to call it.

B<This is dummy placeholder function for very old code that assumes this
module is using C<IPC::Run> which it isn't any more.>

=cut

# Placeholder kept for old code which expects it: it only checks that the
# given slice is known and hands the slice name back.
#
# Confesses when the slice name is missing/false or the slice isn't
# present in $self->{'slice'}.
sub slice_output {
        my ($self, $s) = @_;

        confess "slice_output needs slice" unless $s;

        if (! exists($self->{'slice'}) || ! exists($self->{'slice'}->{$s})) {
                confess "no slice $s";
        }

        # FIXME

        return $s;
}

=head2 close_slice

Close slice (terminates swish-e process for that slice).

  $i->close_slice($s);

Returns true if slice is closed, false otherwise.

=cut

# Close the handle of slice $s and remove the slice from $self->{'slice'}.
#
# A failed close is reported with a warning; the slice is removed and 1 is
# returned in either case. Confesses when the slice name is missing/false,
# the slice is unknown or it has no 'h' entry.
sub close_slice {
        my $self = shift;

        my $s = shift || confess "close_slice needs slice";

        confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
        confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));

        # pump rest of content (if any)
        #
        # "close $fh || carp ..." would apply || to the handle, so that the
        # warning could never be issued; test close's own result instead.
        # For a piped handle $! is 0 when the only problem is a non-zero
        # exit status of the program, which is then found in $?.
        unless (close($self->{'slice'}->{$s}->{'h'})) {
                if ($!) {
                        carp "can't close slice $s: $!";
                } else {
                        carp "can't close slice $s: exit status ".($? >> 8);
                }
        }

        $self->slice_output($s);

        delete($self->{'slice'}->{$s});

        return 1;
}

=head2 to_xml

Convert (binary safe, I hope) your data into XML for C<swish-e>.
Data will not yet be recoded to UTF-8. L<"put_slice"> will do that.

  my $xml = $i->to_xml({ foo => 'bar' });

This function is extracted from L<"add"> method so that you can L<Memoize> it.
If your data set has a lot of repeatable data, and memory is not a problem, you
can add C<memoize_to_xml> option to L<"open_index">.

=cut

# Serialise a hash of fields into the XML document passed to swish-e:
#
#   <xml><field><![CDATA[value]]></field>...</xml>
#
# Fields are written sorted by name. Fields whose value is undefined or an
# empty string are left out; a value of 0 is kept. Values are wrapped in
# CDATA sections and therefore not entity-escaped (escaping inside CDATA
# would put literal "&lt;", "&amp;"... into the index).
#
# Returns the XML string, or nothing when no data is given.
sub to_xml {
        my $self = shift;

        my $data = shift || return;

        my $xml = qq{<xml>};
        foreach my $tag (sort keys %$data) {
                my $content = $data->{$tag};
                next unless (defined($content) && $content ne '');
                # "]]>" can't appear inside a CDATA section: split it over
                # two adjacent sections
                $content =~ s/\]\]>/]]]]><![CDATA[>/g;
                $xml .= "<$tag><![CDATA[".$content."]]></$tag>";
        }
        $xml .= qq{</xml>};

        return $xml;
}

# Print the given message parts to STDERR, prefixed with "## " and followed
# by a newline, but only when the object's 'debug' option is true.
# Always returns nothing.
sub _debug {
        my ($self, @message) = @_;

        if ($self->{'debug'}) {
                print STDERR "## ", @message, "\n";
        }

        return;
}

1;
__END__


=head1 Searching

Searching is still conducted using L<SWISH::API>, but you have to glob
index names.

    use SWISH::API;

    my $swish = SWISH::API->new( glob('index.swish-e/*') );

You can also alternatively create a merged index (using C<merge> option) and
not change your source code at all.

That would also benefit performance, but it increases indexing time
because merged indexes must be re-created on each indexing run.

=head1 EXPORT

Nothing by default.

=head1 EXAMPLES

Test script for this module uses all parts of API. It's also nice example
how to use C<SWISH::Split>.

=head1 SEE ALSO

L<SWISH::API>,
L<http://www.swish-e.org/>

=head1 AUTHOR

Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2004 by Dobrica Pavlinusic

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.4 or,
at your option, any later version of Perl 5 you may have available.


=cut
1	package SWISH::Split;
2
3	use 5.008;
4	use strict;
5	use warnings;
6
7	our $VERSION = '0.03';
8
9	use SWISH::API;
10	use Text::Iconv;
11	use File::Temp qw/ :mktemp /;
12	use Carp;
13	use Digest::MD5 qw(md5_hex);
14	use Memoize;
15	use File::Which;
16
17	use Data::Dumper;
18
19	use constant {
20	ADDED => 1,
21	DELETED => 2,
22	};
23
24	=head1 NAME
25
26	SWISH::Split - Perl interface to split index variant of Swish-e
27
28	=head1 SYNOPSIS
29
30	use SWISH::Split;
31
32
33	=head1 DESCRIPTION
34
35	This is alternative interface for indexing data with swish-e. It's designed
36	to split indexes over multiple files (slices) to allow updates of records in index
37	by reindexing just changed parts (slice).
38
39	Data is stored in index using intrface which is somewhat similar to
40	L<Plucene::Simple>. This could make your migration (or supporting two index
41	engines) easier.
42
43	In the background, it will fork swish-e binaries (one for each index slice)
44	and produce UTF-8 encoded XML files for it. So, if your input charset isn't
45	C<ISO-8859-1> you will have to specify it.
46
47	=head1 Methods used for indexing
48
49	=head2 open_index
50
51	Create new object for index.
52
53	my $i = SWISH::Split->open_index({
54	index => '/path/to/index',
55	slice_name => \&slice_on_path,
56	slices => 30,
57	merge => 0,
58	codepage => 'ISO-8859-2',
59	swish_config => qq{
60	PropertyNames from date
61	PropertyNamesDate date
62	},
63	memoize_to_xml => 0,
64	);
65
66	# split index on first component of path
67	sub slice_on_path {
68	return shift split(/\//,$_[0]);
69	}
70
71	Options to C<open_index> are following:
72
73	=over 5
74
75	=item C<index>
76
77	path to (existing) directory in which index slices will be created.
78
79	=item C<slice_name>
80
81	coderef to function which provide slicing from path.
82
83	=item C<slices>
84
85	maximum number of index slices. See L<"in_slice"> for
86	more explanation.
87
88	=item C<merge>
89
90	(planned) option to merge indexes into one at end.
91
92	=item C<codepage>
93
94	data codepage (needed for conversion to UTF-8).
95	By default, it's C<ISO-8859-1>.
96
97	=item C<swish_config>
98
99	additional parametars which will be inserted into
100	C<swish-e> configuration file. See L<swish-config>.
101
102	=item C<memoize_to_xml>
103
104	speed up repeatable data, see L<"to_xml">.
105
106	=back
107
108	=cut
109
110	my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
111
112	sub open_index {
113	my $class = shift;
114	my $self = {@_};
115	bless($self, $class);
116
117	croak "need slice_name coderef" unless ref $self->{'slice_name'};
118	croak "need slices" unless $self->{'slices'};
119
120	croak "need index" unless $self->{'index'};
121	croak "index '",$self->{'index'},"' doesn't exist" unless -e $self->{'index'};
122	croak "index '",$self->{'index'},"' is not directory" unless -d $self->{'index'};
123
124	$iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});
125
126	# speedup
127	memoize('in_slice');
128	memoize('to_xml') if ($self->{'memoize_to_xml'});
129
130	$self ? return $self : return undef;
131
132	}
133
134	=head2 add
135
136	Add document to index.
137
138	$i->add($swishpath, {
139	headline => 'foobar result',
140	property => 'data',
141	})
142
143	=cut
144
145	sub add {
146	my $self = shift;
147
148	my $swishpath = shift \|\| return;
149	my $data = shift \|\| return;
150
151	my $slice = $self->put_slice($swishpath, $self->to_xml($data));
152
153	return $slice;
154	}
155
156	=head2 delete
157
158	Delete documents from index.
159
160	$i->delete(@swishpath);
161
162	=cut
163
164	sub delete {
165	my $self = shift;
166
167	my @paths = @_ \|\| return;
168
169	foreach my $path (@paths) {
170	$self->{'paths'}->{$path} = DELETED;
171	}
172
173	return 42;
174	}
175
176
177	=head2 done
178
179	Finish indexing and close index file(s).
180
181	$i->done;
182
183	This is most time-consuming operation. When it's called, it will re-index
184	all entries which haven't changed in all slices.
185
186	Returns number of slices updated.
187
188	This method should really be called close or finish, but both of those are
189	allready used.
190
191	=cut
192
193	sub done {
194	my $self = shift;
195
196	my $ret = 0;
197
198	foreach my $s (keys %{$self->{'slice'}}) {
199	$self->_debug("closing slice $s");
200	$ret += $self->close_slice($s);
201	}
202
203	return $ret;
204	}
205
206
207
208	=head1 Reporting methods
209
210	This methods return statistics about your index.
211
212	=head2 swishpaths
213
214	Return array of C<swishpath>s in index.
215
216	my @p = $i->swishpaths;
217
218	=cut
219
220	sub swishpaths {
221	my $self = shift;
222
223	my $s = shift \|\| return;
224	return if (! exists($self->{'slice'}->{'s'}));
225
226	return keys %{$self->{'slice'}->{'s'}};
227	}
228
229	=head2 swishpaths_updated
230
231	Return array with updated C<swishpath>s.
232
233	my @d = $i->swishpaths_updated;
234
235	=cut
236
237	sub swishpaths_updated {
238	my $self = shift;
239	}
240
241
242	=head2 swishpaths_deleted
243
244	Return array with deleted C<swishpath>s.
245
246	my $n = $i->swishpaths_deleted;
247
248	=cut
249
250	sub swishpaths_deleted {
251	my $self = shift;
252	}
253
254
255	=head2 slices
256
257	Return array with all slice names.
258
259	my @s = $i->slices;
260
261	=cut
262
263	sub slices {
264	my $self = shift;
265	}
266
267	=head1 Helper methods
268
269	This methods are used internally, but they might be useful.
270
271	=head2 in_slice
272
273	Takes path and return slice in which this path belongs.
274
275	my $s = $i->in_slice('path/to/document/in/index');
276
277	If there are C<slices> parametar to L<"open_index"> it will use
278	MD5 hash to spread documents across slices. That will produce random
279	distribution of your documents in slices, which might or might not be best
280	for your data. If you have to re-index large number of slices on each
281	run, think about creating your own C<slice> function and distributing
282	documents manually across slices.
283
284	Slice number must always be true value or various sanity checks will fail.
285
286	This function is C<Memoize>ed for performance reasons.
287
288	=cut
289
290	sub in_slice {
291	my $self = shift;
292
293	my $path = shift \|\| confess "need path";
294
295	confess "need slice_name function" unless ref ($self->{'slice_name'});
296
297	if ($self->{'slices'}) {
298	# first, pass path through slice_name function
299	my $slice = &{$self->{'slice_name'}}($path);
300	# then calculate MD5 hash
301	my $hash = md5_hex($slice);
302	# take first 8 chars to produce number
303	# FIXME how random is this?
304	$hash = hex(substr($hash,0,8));
305
306	$slice = ($hash % $self->{'slices'}) + 1;
307	$self->_debug("hash: $hash / ",$self->{'slices'}," => $slice");
308	return $slice;
309	} else {
310	return &{$self->{'split'}}($path);
311	}
312	}
313
314	=head2 find_paths
315
316	Return array of C<swishpath>s for given C<swish-e> query.
317
318	my @p = $i->find_paths("headline=test*");
319
320	Useful for combining with L<"delete_documents"> to delete documents
321	which hasn't changed a while (so, expired).
322
323	=cut
324
325	sub find_paths {
326	my $self = shift;
327
328	}
329
330
331	=head2 make_config
332
333	Create C<swish-e> configuration file for given slice.
334
335	my $config_filename = $i->make_config('slice name');
336
337	It returns configuration filename. If no C<swish_config> was defined in
338	L<"open_index">, default swish-e configuration will be used. It will index all data for
339	searching, but none for properties.
340
341	If you want to see what is allready defined for swish-e in configuration
342	take a look at source code for C<DEFAULT_SWISH_CONF>.
343
344	It uses C<stdin> as C<IndexDir> to comunicate with C<swish-e>.
345
346	=cut
347
348	sub make_config {
349	my $self = shift;
350
351
352	my $index_file = $self->{'index'}."/";
353	$index_file .= shift \|\| confess "need slice name";
354
355	my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX");
356
357	# find cat on filesystem
358	my $cat = which('cat');
359
360	print $tmp_fh <<"DEFAULT_SWISH_CONF";
361	# swish-e config file
362
363	IndexDir stdin
364
365	# input file definition
366	DefaultContents XML*
367
368	# indexed metatags
369	MetaNames xml swishdocpath
370
371
372	#XMLClassAttributes type
373	UndefinedMetaTags auto
374	UndefinedXMLAttributes auto
375
376	IndexFile $index_file
377
378	# Croatian ISO-8859-2 characters to unaccented equivalents
379	TranslateCharacters ¹©ðÐèÈæÆ¾® ssddcccczz
380
381
382	# disable output
383	ParserWarnLevel 0
384	IndexReport 1
385
386	DEFAULT_SWISH_CONF
387
388	# add user parametars (like stored properties)
389	print $tmp_fh $self->{'swish_config'} if ($self->{'swish_config'});
390
391	close($tmp_fh);
392
393	return $swish_config_filename;
394	}
395
396	=head2 create_slice
397
398	On first run, starts C<swish-e>. On subsequent calls just return
399	it's handles using L<Memoize>.
400
401	my $s = create_slice('/path/to/document');
402
403	You shouldn't need to call C<create_slice> directly because it will be called
404	from L<"put_slice"> when needed.
405
406	=cut
407
408	sub create_slice {
409	my $self = shift;
410
411	my $path = shift \|\| confess "create_slice need path!";
412
413	my $s = $self->in_slice($path) \|\| confess "in_slice returned null";
414
415	return $s if (exists($self->{'slice'}->{$s}));
416
417	my $swish_config = $self->make_config($s);
418
419	my $swish = qq{\| swish-e };
420	if (-f $self->{'index'}.'/'.$s) {
421	$swish .= qq{ -u };
422	$self->{'slice'}->{$s}->{'update_mode'}++;
423	}
424	$swish .= qq{ -S prog -c } . $swish_config;
425
426	$self->_debug("creating slice $s using $swish");
427
428	## Build the harness, open all pipes, and launch the subprocesses
429	open(my $fh, $swish) \|\| croak "can't open $swish: $!";
430
431	$self->{'slice'}->{$s}->{'h'} = $fh;
432
433	$self->slice_output($s);
434
435	return $s;
436	}
437
438	=head2 put_slice
439
440	Pass XML data to swish.
441
442	my $slice = $i->put_slice('/swish/path', '<xml>data</xml>');
443
444	Returns slice in which XML ended up.
445
446	=cut
447
448	sub put_slice {
449	my $self = shift;
450
451	my $path = shift \|\| confess "need path";
452	my $xml = shift \|\| confess "need xml";
453
454	$xml = $iso2utf->convert($xml) \|\| carp "XML conversion error in $xml";
455
456	my $s = $self->create_slice($path) \|\| confess "create_slice returned null";
457
458	confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
459	confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
460
461	$self->slice_output($s);
462
463	use bytes; # as opposed to chars
464	my $fh = $self->{'slice'}->{$s}->{'h'} \|\| confess "handle for slice $s undefined";
465
466	my $update_header = "Update-Mode: Index\n";
467	$update_header = '' unless ($self->{'slice'}->{$s}->{'update_mode'});
468
469	print { $fh } "Path-Name: $path\n".
470	"Content-Length: ".(length($xml)+1)."\n" . $update_header .
471	"Document-Type: XML\n\n$xml\n";
472
473	$self->slice_output($s);
474
475	$self->_debug("dumping in slice $s: $path");
476
477	$self->{'paths'}->{$path} = ADDED;
478
479	return $s;
480	}
481
482	=head2 slice_output
483
484	Prints to STDERR output and errors from C<swish-e>.
485
486	my $slice = $i->slice_output($s);
487
488	Normally, you don't need to call it.
489
490	B<This is dummy placeholder function for very old code that assumes this
491	module is using C<IPC::Run> which it isn't any more.>
492
493	=cut
494
495	sub slice_output {
496	my $self = shift;
497
498	my $s = shift \|\| confess "slice_output needs slice";
499
500	confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
501
502	# FIXME
503
504	return $s;
505	}
506
507	=head2 close_slice
508
509	Close slice (terminates swish-e process for that slice).
510
511	my $i->close_slice($s);
512
513	Returns true if slice is closed, false otherwise.
514
515	=cut
516
517	sub close_slice {
518	my $self = shift;
519
520	my $s = shift \|\| confess "close_slice needs slice";
521
522	confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
523	confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
524
525	# pump rest of content (if any)
526	close $self->{'slice'}->{$s}->{'h'} \|\| carp "can't close slice $s: $!";
527
528	$self->slice_output($s);
529
530	undef $self->{'slice'}->{$s}->{'h'};
531
532	delete($self->{'slice'}->{$s}) && return 1;
533	return 0;
534	}
535
536	=head2 to_xml
537
538	Convert (binary safe, I hope) your data into XML for C<swish-e>.
539	Data will not yet be recoded to UTF-8. L<"put_slice"> will do that.
540
541	my $xml = $i->to_xml({ foo => 'bar' });
542
543	This function is extracted from L<"add"> method so that you can L<Memoize> it.
544	If your data set has a lot of repeatable data, and memory is not a problem, you
545	can add C<memoize_to_xml> option to L<"open_index">.
546
547	=cut
548
549	my %escape = ('<'=>'<', '>'=>'>', '&'=>'&', '"'=>'"');
550	my $escape_re = join '\|' => keys %escape;
551
552	sub to_xml {
553	my $self = shift;
554
555	my $data = shift \|\| return;
556
557	my $xml = qq{<xml>};
558	foreach my $tag (keys %$data) {
559	my $content = $data->{$tag};
560	next if (! $content \|\| $content eq '');
561	# save [cr/]lf before conversion to XML
562	# $content =~ s/\n\r/##lf##/gs;
563	# $content =~ s/\n/##lf##/gs;
564	$content =~ s/($escape_re)/$escape{$1}/gs;
565	$xml .= "<$tag><![CDATA[".$content."]]></$tag>";
566	}
567	$xml .= qq{</xml>};
568	}
569
570	sub _debug {
571	my $self = shift;
572	print STDERR "## ",@_,"\n" if ($self->{'debug'});
573	return;
574	}
575
576	1;
577	__END__
578
579
580	=head1 Searching
581
582	Searching is still conducted using L<SWISH::API>, but you have to glob
583	index names.
584
585	use SWISH::API;
586
587	my $swish = SWISH::API->new( glob('index.swish-e/*') );
588
589	You can also alternativly create merged index (using C<merge> option) and
590	not change your source code at all.
591
592	That would also benefit performance, but it increases indexing time
593	because merged indexes must be re-created on each indexing run.
594
595	=head1 EXPORT
596
597	Nothing by default.
598
599	=head1 EXAMPLES
600
601	Test script for this module uses all parts of API. It's also nice example
602	how to use C<SWISH::Split>.
603
604	=head1 SEE ALSO
605
606	L<SWISH::API>,
607	L<http://www.swish-e.org/>
608
609	=head1 AUTHOR
610
611	Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
612
613	=head1 COPYRIGHT AND LICENSE
614
615	Copyright (C) 2004 by Dobrica Pavlinusic
616
617	This library is free software; you can redistribute it and/or modify
618	it under the same terms as Perl itself, either Perl version 5.8.4 or,
619	at your option, any later version of Perl 5 you may have available.
620
621
622	=cut