Revision 1

Date:
2004/08/08 10:09:55
Author:
dpavlin
Revision Log:
initial import of SWISH::Split. Lot of documentation, less code.
Files:

Legend:

 
Added
 
Removed
 
Modified
  • trunk/Changes

     
    1 Revision history for Perl extension SWISH::Split.
    2
    3 0.01 Sun Aug 8 08:57:37 2004
    4 - original version; created by h2xs 1.23 with options
    5 -AX -n SWISH::Split
    6
  • trunk/Makefile.PL

     
    use 5.008004;
    use ExtUtils::MakeMaker;

    # See lib/ExtUtils/MakeMaker.pm for details of how to influence
    # the contents of the Makefile that is written.
    WriteMakefile(
        NAME         => 'SWISH::Split',
        VERSION_FROM => 'Split.pm',    # finds $VERSION

        # Modules loaded by Split.pm and by t/SWISH-Split.t.
        # A version of 0 means "any version is acceptable".
        PREREQ_PM => {
            'SWISH::API'      => 0,
            'Text::Iconv'     => 0,
            'File::Temp'      => 0,
            'Digest::MD5'     => 0,
            'Memoize'         => 0,
            'Test::More'      => 0,
            'Test::Exception' => 0,
        },
        ($] >= 5.005 ?    ## Add these new keywords supported since 5.005
            (ABSTRACT_FROM => 'Split.pm',    # retrieve abstract from module
             AUTHOR        => 'Dobrica Pavlinusic <dpavlin@rot13.org>') : ()),
    );
  • trunk/MANIFEST

     
    1 Changes
    2 Makefile.PL
    3 MANIFEST
    4 README
    5 t/SWISH-Split.t
    6 Split.pm
  • trunk/README

     
    1 SWISH-Split version 0.01
    2 ========================
    3
    4 The README is used to introduce the module and provide instructions on
    5 how to install the module, any machine dependencies it may have (for
    6 example C compilers and installed libraries) and any other information
    7 that should be provided before the module is installed.
    8
    9 A README file is required for CPAN modules since CPAN extracts the
    10 README file from a module distribution so that people browsing the
    11 archive can use it to get an idea of the module's uses. It is usually a
    12 good idea to provide version information here so that people can
    13 decide whether fixes for the module are worth downloading.
    14
    15 INSTALLATION
    16
    17 To install this module type the following:
    18
    19 perl Makefile.PL
    20 make
    21 make test
    22 make install
    23
    24 DEPENDENCIES
    25
    26 This module requires these other modules and libraries:
    27
    28 SWISH::API, Text::Iconv, File::Temp, Carp, Digest::MD5, Memoize, Data::Dumper
    29
    30 COPYRIGHT AND LICENCE
    31
    32 Put the correct copyright and licence information here.
    33
    34 Copyright (C) 2004 by Dobrica Pavlinusic
    35
    36 This library is free software; you can redistribute it and/or modify
    37 it under the same terms as Perl itself, either Perl version 5.8.4 or,
    38 at your option, any later version of Perl 5 you may have available.
    39
    40
  • trunk/Split.pm

     
    1 package SWISH::Split;
    2
    3 use 5.008;
    4 use strict;
    5 use warnings;
    6
    7 our $VERSION = '0.00';
    8
    9 use SWISH::API;
    10 use Text::Iconv;
    11 use File::Temp qw/ :mktemp /;
    12 use Carp;
    13 use Digest::MD5 qw(md5_hex);
    14 use Memoize;
    15
    16 use Data::Dumper;
    17
    18 =head1 NAME
    19
    20 SWISH::Split - Perl interface to split index variant of Swish-e
    21
    22 =head1 SYNOPSIS
    23
    24 use SWISH::Split;
    25
    26
    27 =head1 DESCRIPTION
    28
    29 This is an alternative interface for indexing data with swish-e. It's designed
    30 to split indexes over multiple files to allow updates of records in index
    31 by reindexing just changed parts.
    32
    33 Data is stored in index using an interface which is somewhat similar to
    34 L<Plucene::Simple>. This could make your migration (or supporting two index
    35 engines) easier.
    36
    37 In the background, it will fork swish-e binaries (one for each index slice)
    38 and produce UTF-8 encoded XML files. So, if your input charset isn't
    39 C<ISO-8859-1> you will have to specify it.
    40
    41 =head1 Methods used for indexing
    42
    43 =head2 open
    44
    45 Create new object for index.
    46
    47 my $i = SWISH::Split->open(
    48 index => '/path/to/index',
    49 slice_name => \&slice_on_path,
    50 slices => 30,
    51 merge => 1,
    52 codepage => 'ISO-8859-2'
    53 );
    54
    55 # split index on first component of path
    56 sub slice_on_path {
    57 return (split(/\//,$_[0]))[0];
    58 }
    59
    60
    61 C<slices> is maximum number of index slices. See L<"in_slice"> for
    62 more explanation.
    63
    64 =cut
    65
    # Shared converter from the input charset to UTF-8. It defaults to
    # ISO-8859-1 and is replaced by open() when a codepage option is given.
    my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');

    # Set once in_slice() has been wrapped by Memoize, so that repeated calls
    # to open() do not stack another cache layer on top of it.
    my $in_slice_memoized = 0;

    # Constructor: validate the options and return a new object.
    #
    # Options may be passed as a flat key/value list or as a single hash
    # reference:
    #   slice_name - code reference mapping a path to a slice name (required)
    #   slices     - maximum number of index slices (required)
    #   index      - path to an existing index directory (required)
    #   codepage   - input charset to use instead of ISO-8859-1 (optional)
    #
    # Croaks when a required option is missing, when slice_name is not a code
    # reference, or when index is not an existing directory.
    sub open {
        my $class = shift;

        # accept both open(%opts) and open(\%opts); the hash is copied so the
        # caller's own structure is never blessed or modified
        my %opts = (@_ == 1 && ref($_[0]) eq 'HASH') ? %{ $_[0] } : @_;
        my $self = bless { %opts }, $class;

        croak "need slice_name coderef" unless ref($self->{'slice_name'}) eq 'CODE';
        croak "need slices" unless $self->{'slices'};

        my $index = $self->{'index'};
        croak "need index" unless $index;
        croak "index '$index' doesn't exist" unless -e $index;
        croak "index '$index' is not directory" unless -d $index;

        $iso2utf = Text::Iconv->new($self->{'codepage'}, 'UTF-8') if $self->{'codepage'};

        unless ($in_slice_memoized) {
            memoize('in_slice');
            $in_slice_memoized = 1;
        }

        return $self;
    }
    89
    90 =head2 add
    91
    92 Add document to index.
    93
    94 $i->add($swishpath, {
    95 headline => 'foobar result',
    96 property => 'data',
    97 })
    98
    99 =cut
    100
    # Placeholder for adding a document to the index: nothing is indexed
    # yet, the invocant is simply handed back.
    sub add {
        my ($self) = @_;
        return $self;
    }
    104
    105 =head2 delete
    106
    107 Delete document from index.
    108
    109 $i->delete($swishpath);
    110
    111 =cut
    112
    # Placeholder for removing a document from the index: nothing is
    # deleted yet, the invocant is simply handed back.
    sub delete {
        my ($self) = @_;
        return $self;
    }
    116
    117
    118 =head2 close
    119
    120 Close index file and finish indexing.
    121
    122 $i->close;
    123
    124 This is the most time-consuming operation. When it's called, it will re-index
    125 all entries which haven't changed in all slices.
    126
    127 =cut
    128
    # Placeholder for finishing the indexing run: no work is done yet,
    # the invocant is simply handed back.
    sub close {
        my ($self) = @_;
        return $self;
    }
    132
    133
    134
    135 =head1 Reporting methods
    136
    137 These methods return statistics about your index.
    138
    139 =head2 swishpaths
    140
    141 Return array of C<swishpath>s in index.
    142
    143 my @p = $i->swishpaths;
    144
    145 =cut
    146
    # Placeholder for listing the swishpaths in the index: no list is
    # built yet, the invocant is simply handed back.
    sub swishpaths {
        my ($self) = @_;
        return $self;
    }
    150
    151 =head2 swishpaths_updated
    152
    153 Return array with updated C<swishpath>s.
    154
    155 my @d = $i->swishpaths_updated;
    156
    157 =cut
    158
    # Placeholder for listing updated swishpaths: no list is built yet,
    # the invocant is simply handed back.
    sub swishpaths_updated {
        my ($self) = @_;
        return $self;
    }
    162
    163
    164 =head2 swishpaths_deleted
    165
    166 Return array with deleted C<swishpath>s.
    167
    168 my $n = $i->swishpaths_deleted;
    169
    170 =cut
    171
    # Placeholder for listing deleted swishpaths: no list is built yet,
    # the invocant is simply handed back.
    sub swishpaths_deleted {
        my ($self) = @_;
        return $self;
    }
    175
    176
    177 =head2 slices
    178
    179 Return array with all slice names.
    180
    181 my @s = $i->slices;
    182
    183 =cut
    184
    # Placeholder for listing all slice names: no list is built yet,
    # the invocant is simply handed back.
    sub slices {
        my ($self) = @_;
        return $self;
    }
    188
    189 =head1 Helper methods
    190
    191 These methods are used internally, but they might be useful.
    192
    193 =head2 in_slice
    194
    195 Takes path and return slice in which this path belongs.
    196
    197 my $s = $i->in_slice('path/to/document/in/index');
    198
    199 If the C<slices> parameter is passed to L<"open">, it will use
    200 MD5 hash to spread documents across slices. That will produce random
    201 distribution of your documents in slices, which might or might not be best
    202 for your data. If you have to re-index large number of slices on each
    203 run, think about creating your own C<slice> function and distributing
    204 documents manually across slices.
    205
    206 This function is C<Memoize>ed for performance reasons.
    207
    208 =cut
    209
    # Return the slice to which the given path belongs.
    #
    # The path is first mapped through the object's slice_name callback.
    # When the object has a true "slices" value, the callback's result is
    # hashed with MD5 and reduced modulo "slices", which yields a slice
    # number in the range 0 .. slices-1. Otherwise the callback's result is
    # returned unchanged as the slice name.
    #
    # Confesses when no path is given or when slice_name is not a code
    # reference.
    sub in_slice {
        my $self = shift;
        my $path = shift;

        # test definedness and length rather than truth, so the path "0" is valid
        confess "need path" unless defined $path && length $path;
        confess "need slice_name function" unless ref($self->{'slice_name'}) eq 'CODE';

        # every path is passed through the slice_name function first
        my $slice = $self->{'slice_name'}->($path);

        # without a slice limit the name itself identifies the slice
        return $slice unless $self->{'slices'};

        # take the first 8 hex digits (32 bits) of the MD5 hash as a number
        # FIXME how random is this?
        my $slice_nr = hex(substr(md5_hex($slice), 0, 8));

        return $slice_nr % $self->{'slices'};
    }
    233
    234
    235
    236 1;
    237 __END__
    238
    239
    240 =head2 Searching
    241
    242 Searching is still conducted using L<SWISH::API>, but you have to glob
    243 index names.
    244
    245 use SWISH::API;
    246
    247 my $swish = SWISH::API->new( glob('index.swish-e/*') );
    248
    249 You can also alternatively create a merged index (using C<merge> option) and
    250 not change your source code at all.
    251
    252 That would also benefit performance, but it increases indexing time
    253 because merged indexes must be re-created on each indexing run.
    254
    255 =head2 EXPORT
    256
    257 None by default.
    258
    259
    260
    261 =head1 SEE ALSO
    262
    263 L<SWISH::API>,
    264 L<http://www.swish-e.org/>
    265
    266 =head1 AUTHOR
    267
    268 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
    269
    270 =head1 COPYRIGHT AND LICENSE
    271
    272 Copyright (C) 2004 by Dobrica Pavlinusic
    273
    274 This library is free software; you can redistribute it and/or modify
    275 it under the same terms as Perl itself, either Perl version 5.8.4 or,
    276 at your option, any later version of Perl 5 you may have available.
    277
    278
    279 =cut
  • trunk/t/SWISH-Split.t

     
    #!/usr/bin/perl -w

    # Tests for SWISH::Split: constructor argument validation and in_slice().

    use strict;

    use Test::More tests => 9;
    use Test::Exception;
    use File::Temp qw/ :mktemp /;
    use File::Spec;
    use blib;

    BEGIN { use_ok('SWISH::Split') };

    # temporary paths created below; removed again when the script exits
    my ($tmp_file, $tmp_dir);
    END {
        unlink $tmp_file if defined $tmp_file;
        rmdir $tmp_dir   if defined $tmp_dir;
    }

    # build the templates under the system temp directory instead of
    # assuming that /tmp exists
    my $template = File::Spec->catfile(File::Spec->tmpdir, 'swishXXXXX');

    my %param;

    throws_ok { SWISH::Split->open(%param) } qr/slice_name/, "slice_name";

    sub slice_1st_char {
        return substr($_[0],0,1);
    };

    # debugging aid; diag() keeps the dump out of the TAP stream on STDOUT
    use Data::Dumper;
    diag(Dumper(\&slice_1st_char));

    $param{'slice_name'} = \&slice_1st_char;
    throws_ok { SWISH::Split->open(%param) } qr/slices/, "slices";

    $param{'slices'} = 3;
    throws_ok { SWISH::Split->open(%param) } qr/index/, "index";

    # a plain file must be rejected as index; mkstemp returns
    # (filehandle, filename) and open() needs the file name
    (my $fh, $tmp_file) = mkstemp($template);
    close($fh);
    ok($param{'index'} = $tmp_file, "make temp file");
    throws_ok { SWISH::Split->open(%param) } qr/is not directory/, "dir";

    ok($param{'index'} = $tmp_dir = mkdtemp($template), "make temp index");
    ok(my $i=SWISH::Split->open(%param), "open");

    # methods test



    # internal functions test

    cmp_ok($i->in_slice("swishpath"), '==', 2, "in_slice");