/[SWISH-Split]/trunk/Split.pm

This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!

Diff of /trunk/Split.pm

Parent Directory | Revision Log | View Patch Patch

-revision 3 by dpavlin,
Sun Aug  8 10:53:04 2004 UTC
+revision 7 by dpavlin,
Fri Dec 17 18:32:34 2004 UTC
 Line 4 
 use 5.008;
  use strict;
  use warnings;
- our $VERSION = '0.00';
+ our $VERSION = '0.01';
  use SWISH::API;
  use Text::Iconv;
 Line 12 
 use File::Temp qw/ :mktemp /;
  use Carp;
  use Digest::MD5 qw(md5_hex);
  use Memoize;
+ use IPC::Run qw(start timeout pump finish);
+ use File::Which;
  use Data::Dumper;
+ use constant {
+         ADDED => 1,
+         DELETED => 2,
+ };
  =head1 NAME
  SWISH::Split - Perl interface to split index variant of Swish-e
-Line 27 
 SWISH::Split - Perl interface to split i
+Line 34 
 SWISH::Split - Perl interface to split i
  =head1 DESCRIPTION
  This is alternative interface for indexing data with swish-e. It's designed
- to split indexes over multiple files to allow updates of records in index
+ to split indexes over multiple files (slices) to allow updates of records in index
- by reindexing just changed parts.
+ by reindexing just changed parts (slice).
  Data is stored in index using an interface which is somewhat similar to
  L<Plucene::Simple>. This could make your migration (or supporting two index
  engines) easier.
  In the background, it will fork swish-e binaries (one for each index slice)
- and produce UTF-8 encoded XML files. So, if your imput charset isn't
+ and produce UTF-8 encoded XML files for it. So, if your input charset isn't
  C<ISO-8859-1> you will have to specify it.
  =head1 Methods used for indexing
-Line 48 
 Create new object for index.
+Line 55 
 Create new object for index.
          index => '/path/to/index',
          slice_name => \&slice_on_path,
          slices => 30,
-         merge => 1,
+         merge => 0,
-         codepage => 'ISO-8859-2'
+         codepage => 'ISO-8859-2',
+         swish_config => qq{
+                 PropertyNames from date
+                 PropertyNamesDate date
+         },
+         memoize_to_xml => 0,
    );
    # split index on first component of path
-Line 57 
 Create new object for index.
+Line 69 
 Create new object for index.
          return (split(/\//,$_[0]))[0];
    }
+ Options to open are following:
+ =over 5
+ =item C<index>
+ path to (existing) directory in which index slices will be created.
+ =item C<slice_name>
- C<slices> is maximum number of index slices. See L<"in_slice"> for
+ coderef to function which provides slicing from path.
+ =item C<slices>
+ maximum number of index slices. See L<"in_slice"> for
  more explanation.
+ =item C<merge>
+ (planned) option to merge indexes into one at end.
+ =item C<codepage>
+ data codepage (needed for conversion to UTF-8).
+ By default, it's C<ISO-8859-1>.
+ =item C<swish_config>
+ additional parameters which will be inserted into
+ C<swish-e> configuration file. See L<swish-config>.
+ =item C<memoize_to_xml>
+ speed up repeatable data, see L<"to_xml">.
+ =back
  =cut
  my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
-Line 70 
 sub open {
+Line 115 
 sub open {
          my $self = {@_};
          bless($self, $class);
-         print Dumper($self->{'slice_name'});
          croak "need slice_name coderef" unless ref $self->{'slice_name'};
          croak "need slices" unless $self->{'slices'};
-Line 81 
 sub open {
+Line 124 
 sub open {
          $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});
+         # speedup
          memoize('in_slice');
+         memoize('to_xml') if ($self->{'memoize_to_xml'});
          $self ? return $self : return undef;
-Line 104 
 sub add {
+Line 149 
 sub add {
          my $swishpath = shift || return;
          my $data = shift || return;
-         return 1;
+         my $slice = $self->put_slice($swishpath, $self->to_xml($data));
+ #       if ($err) {
+ #               carp "$swishpath: $err";
+ #               return undef;
+ #       }
+         return $slice;
  }
  =head2 delete
-Line 120 
 sub delete {
+Line 172 
 sub delete {
          my @paths = @_ || return;
+         foreach my $path (@paths) {
+                 $self->{'paths'}->{$path} = DELETED;
+         }
          return 42;
  }
- =head2 close
+ =head2 done
- Close index file and finish indexing.
+ Finish indexing and close index file(s).
-   $i->close;
+   $i->done;
  This is most time-consuming operation. When it's called, it will re-index
  all entries which haven't changed in all slices.
+ Returns number of slices updated.
+ This method should really be called close or finish, but both of those are
+ already used.
  =cut
- sub close {
+ sub done {
          my $self = shift;
-         return 1;
+         my $ret = 0;    # running total of close_slice() results
+         foreach my $s (keys %{$self->{'slice'}}) {      # every slice still registered in this object
+                 print STDERR "closing slice $s\n";
+                 $ret += $self->close_slice($s); # close_slice returns 1 when the slice was closed
+         }
+         return $ret;    # number of slices closed in this call
  }
-Line 157 
 Return array of C<swishpath>s in index.
+Line 225 
 Return array of C<swishpath>s in index.
  sub swishpaths {
          my $self = shift;
+         my $s = shift || return;
+         return if (! exists($self->{'slice'}->{'s'}));
+         return keys %{$self->{'slice'}->{'s'}};
  }
  =head2 swishpaths_updated
-Line 214 
 for your data. If you have to re-index l
+Line 287 
 for your data. If you have to re-index l
  run, think about creating your own C<slice> function and distributing
  documents manually across slices.
+ Slice number must always be true value or various sanity checks will fail.
  This function is C<Memoize>ed for performance reasons.
  =cut
-Line 223 
 sub in_slice {
+Line 298 
 sub in_slice {
          my $path = shift || confess "need path";
-         print Dumper($self->{'slice_name'});
          confess "need slice_name function" unless ref ($self->{'slice_name'});
          if ($self->{'slices'}) {
-Line 235 
 sub in_slice {
+Line 309 
 sub in_slice {
                  # FIXME how random is this?
                  $slice = hex(substr($slice,0,8));
-                 print "slice_nr: $slice slices: ",$self->{'slices'},"\n";
+                 $slice = ($slice % $self->{'slices'}) + 1;
-                 return ($slice % $self->{'slices'});
+                 print "hash: $slice / ",$self->{'slices'}," => $slice\n";
+                 return $slice;
          } else {
                  return &{$self->{'split'}}($path);
          }
-Line 256 
 which hasn't changed a while (so, expire
+Line 331 
 which hasn't changed a while (so, expire
  sub find_paths {
          my $self = shift;       # stub: only unpacks the invocant, no lookup is implemented here
-         my $s = shift || return;
  }
+ =head2 make_config
+ Create C<swish-e> configuration file for given slice.
+   my $config_filename = $i->make_config('slice name');
+ It returns configuration filename. If no C<swish_config> was defined in
+ L<"open">, default swish-e configuration will be used. It will index all data for
+ searching, but none for properties.
+ If you want to see what is already defined for swish-e in configuration
+ take a look at source code for C<DEFAULT_SWISH_CONF>.
+ It uses C<stdin> as C<IndexDir> to communicate with C<swish-e>.
+ =cut
+ # Write a temporary swish-e configuration file for one slice.
+ #
+ # Takes the slice name (dies via confess when it is missing) and returns
+ # the name of the generated configuration file. The index file for the
+ # slice is placed inside the directory given as 'index' to open().
+ # The temporary file is not removed here.
+ sub make_config {
+         my $self = shift;
+         my $index_file = $self->{'index'}."/";
+         $index_file .= shift || confess "need slice name";
+         my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX");
+         # The source characters for TranslateCharacters are written as
+         # explicit ISO-8859-2 byte escapes (s/S caron, d/D stroke, c/C caron,
+         # c/C acute, z/Z caron) so that they survive any re-encoding of this
+         # file. NOTE(review): byte values were reconstructed from the target
+         # list "ssddcccczz" -- verify against the repository copy.
+         print $tmp_fh <<"DEFAULT_SWISH_CONF";
+ # swish-e config file
+ IndexDir stdin
+ # input file definition
+ DefaultContents XML*
+ # indexed metatags
+ MetaNames xml swishdocpath
+ #XMLClassAttributes type
+ UndefinedMetaTags auto
+ UndefinedXMLAttributes auto
+ IndexFile $index_file
+ # Croatian ISO-8859-2 characters to unaccented equivalents
+ TranslateCharacters \xB9\xA9\xF0\xD0\xE8\xC8\xE6\xC6\xBE\xAE ssddcccczz
+ # disable output
+ ParserWarnLevel 0
+ IndexReport 1
+ DEFAULT_SWISH_CONF
+         # add user parameters (like stored properties)
+         print $tmp_fh $self->{'swish_config'} if ($self->{'swish_config'});
+         # buffered write errors only show up at close time
+         close($tmp_fh) or confess "can't write $swish_config_filename: $!";
+         return $swish_config_filename;
+ }
+ =head2 create_slice
+ On first run, starts C<swish-e> using L<IPC::Run>. On subsequent calls just return
+ its handles using L<Memoize>.
+   my $s = $i->create_slice('/path/to/document');
+ You shouldn't need to call C<create_slice> directly because it will be called
+ from L<"put_slice"> when needed.
+ =cut
+ # Make sure a swish-e process is running for the slice that $path maps to
+ # and return that slice. An already registered slice is returned as is.
+ sub create_slice {
+         my $self = shift;
+         my $path = shift || confess "create_slice need path!";
+         my $s = $self->in_slice($path) || confess "in_slice returned null";
+         # slice already has a running harness, nothing to start
+         if (exists($self->{'slice'}->{$s})) {
+                 return $s;
+         }
+         my $swish_config = $self->make_config($s);
+         print STDERR "creating slice $s\n";     # FIXME
+         my @swish = ('swish-e', '-u', '-S', 'prog', '-c', $swish_config);
+         # per-slice record: harness plus its in/out/err buffers
+         my $slot = ($self->{'slice'}->{$s} = {});
+         ## Build the harness, open all pipes, and launch the subprocesses
+         $slot->{'h'} = start \@swish,
+                 \$slot->{'in'},
+                 \$slot->{'out'},
+                 \$slot->{'err'},
+                 timeout( 90 );  # FIXME
+         # nothing from swish-e has been reported yet
+         @{$slot}{'out_len', 'err_len'} = (0, 0);
+         $self->slice_output($s);
+         return $s;
+ }
+ =head2 put_slice
+ Pass XML data to swish.
+   my $slice = $i->put_slice('/swish/path', '<xml>data</xml>');
+ Returns slice in which XML ended up.
+ =cut
+ # Feed one XML document to the swish-e process of the slice $path maps to.
+ #
+ # Arguments: swishpath and XML (still in the source codepage).
+ # Returns the slice the document went to, or nothing (after a warning)
+ # when the XML can't be converted to UTF-8. Dies via confess on missing
+ # arguments or when the slice record is incomplete.
+ sub put_slice {
+         my $self = shift;
+         my $path = shift || confess "need path";
+         my $xml = shift || confess "need xml";
+         # convert() yields undef on failure; don't send a bogus document
+         # to swish-e in that case
+         my $utf8 = $iso2utf->convert($xml);
+         if (! defined($utf8)) {
+                 carp "XML conversion error in $xml";
+                 return;
+         }
+         $xml = $utf8;
+         my $s = $self->create_slice($path) || confess "create_slice returned null";
+         confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
+         confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));
+         confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
+         $self->slice_output($s);
+         use bytes;      # as opposed to chars
+         # header block for swish-e "prog" input; Content-Length counts the
+         # trailing newline appended after the document
+         $self->{'slice'}->{$s}->{'in'} .=
+                 "Path-Name: $path\n".
+                 "Content-Length: ".(length($xml)+1)."\n".
+                 "Update-Mode: Index\n".
+                 "Document-Type: XML\n\n$xml\n";
+         # do I/O
+         $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'} ;  # wait for all input to go
+         $self->slice_output($s);
+         $self->{'paths'}->{$path} = ADDED;
+         return $s;
+ }
+ =head2 slice_output
+ Prints to STDERR output and errors from C<swish-e>.
+   my $slice = $i->slice_output($s);
+ Normally, you don't need to call it.
+ =cut
+ # Report anything new that swish-e wrote for slice $s.
+ #
+ # Remembers how much of the 'out' and 'err' buffers has been seen so far.
+ # New stderr content is printed to STDERR and makes the call return undef;
+ # otherwise the slice is returned. Dies via confess when the slice record
+ # is missing or incomplete.
+ sub slice_output {
+         my $self = shift;
+         my $s = shift || confess "slice_output needs slice";
+         confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
+         confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));
+         confess "no 'out' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'out'}));
+         my $slice = $self->{'slice'}->{$s};
+         # buffers may still be undefined before swish-e wrote anything
+         my $out_len = defined($slice->{'out'}) ? length($slice->{'out'}) : 0;
+         my $err_len = defined($slice->{'err'}) ? length($slice->{'err'}) : 0;
+         if ($out_len > $slice->{'out_len'}) {
+                 #print STDERR "swish-e OUT: ",$slice->{'out'},"\n";
+                 $slice->{'out_len'} = $out_len;
+         }
+         # checked independently of stdout, so that an error is not missed
+         # when swish-e wrote to both streams since the last call
+         if ($err_len > $slice->{'err_len'}) {
+                 print STDERR "swish-e ERR: ",$slice->{'err'},"\n";
+                 $slice->{'err_len'} = $err_len;
+                 # this is fatal
+                 return undef;
+         }
+         return $s;
+ }
+ =head2 close_slice
+ Close slice (terminates swish-e process for that slice).
+   $i->close_slice($s);
+ Returns true if slice is closed, false otherwise.
+ =cut
+ # Flush pending input to the swish-e process of slice $s, wait for the
+ # process to finish and forget the slice. Returns 1 when the slice record
+ # was removed, 0 otherwise; dies via confess when the slice is unknown,
+ # has no harness, or finish() reports failure.
+ sub close_slice {
+         my $self = shift;
+         my $s = shift || confess "close_slice needs slice";
+         confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
+         confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
+         my $slot = $self->{'slice'}->{$s};
+         my $harness = $slot->{'h'};
+         # pump rest of content (if any)
+         while (length $slot->{'in'}) {
+                 $harness->pump;
+         }
+         $self->slice_output($s);
+         # clean up
+         unless ($harness->finish) {
+                 confess "finish on slice $s returned $?: $! -- ",$slot->{'err'};
+         }
+         return delete($self->{'slice'}->{$s}) ? 1 : 0;
+ }
+ =head2 to_xml
+ Convert (binary safe, I hope) your data into XML for C<swish-e>.
+ Data will not yet be recoded to UTF-8. L<"put_slice"> will do that.
+   my $xml = $i->to_xml({ foo => 'bar' });
+ This function is extracted from L<"add"> method so that you can L<Memoize> it.
+ If your data set has a lot of repeatable data, and memory is not a problem, you
+ can add C<memoize_to_xml> option to L<"open">.
+ =cut
+ # characters that get replaced by entities, and the pattern matching them
+ my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
+ my $escape_re  = join '|' => keys %escape;
+ # Turn a hash reference of tag => content into one <xml>...</xml> string.
+ #
+ # Returns nothing when no data is given. Tags whose content is undefined
+ # or an empty string are left out; any other value (including 0) is kept.
+ sub to_xml {
+         my $self = shift;
+         my $data = shift || return;
+         my $xml = qq{<xml>};
+         foreach my $tag (keys %$data) {
+                 my $content = $data->{$tag};
+                 # a plain truth test would also drop the legitimate value "0"
+                 next if (! defined($content) || $content eq '');
+                 # save [cr/]lf before conversion to XML
+ #               $content =~ s/\n\r/##lf##/gs;
+ #               $content =~ s/\n/##lf##/gs;
+                 # escaping '>' also keeps content from closing the CDATA section
+                 $content =~ s/($escape_re)/$escape{$1}/gs;
+                 $xml .= "<$tag><![CDATA[".$content."]]></$tag>";
+         }
+         $xml .= qq{</xml>};
+         return $xml;
+ }
 ;
  __END__
- =head2 Searching
+ =head1 Searching
  Searching is still conducted using L<SWISH::API>, but you have to glob
  index names.
-Line 280 
 not change your source code at all.
+Line 601 
 not change your source code at all.
  That would also benefit performance, but it increases indexing time
  because merged indexes must be re-created on each indexing run.
- =head2 EXPORT
+ =head1 EXPORT
- None by default.
+ Nothing by default.
+ =head1 EXAMPLES
+ Test script for this module uses all parts of API. It's also nice example
+ how to use C<SWISH::Split>.
  =head1 SEE ALSO

 Legend:



Removed from v.3
 


changed lines


 
Added in v.7
 Legend:



Removed from v.3
 


changed lines


 
Added in v.7
-Removed from v.3
+Added in v.7

	ViewVC Help
Powered by ViewVC 1.1.26