Revision 1
- Date:
- 2004/08/08 10:09:55
- Files:
Legend:
- Added
- Removed
- Modified
-
trunk/Changes
1 Revision history for Perl extension SWISH::Split. 2 3 0.01 Sun Aug 8 08:57:37 2004 4 - original version; created by h2xs 1.23 with options 5 -AX -n SWISH::Split 6 -
trunk/Makefile.PL
use 5.008004;
use ExtUtils::MakeMaker;
# See lib/ExtUtils/MakeMaker.pm for details of how to influence
# the contents of the Makefile that is written.
WriteMakefile(
    NAME              => 'SWISH::Split',
    VERSION_FROM      => 'Split.pm', # finds $VERSION
    # Modules loaded by Split.pm and t/SWISH-Split.t.  Declaring them lets
    # "perl Makefile.PL" warn about missing prerequisites instead of failing
    # later at "make test".  No minimum versions are known, hence 0.
    PREREQ_PM         => {
        'SWISH::API'      => 0,
        'Text::Iconv'     => 0,
        'File::Temp'      => 0,
        'Digest::MD5'     => 0,
        'Memoize'         => 0,
        'Test::More'      => 0,
        'Test::Exception' => 0,
    },
    ($] >= 5.005 ?     ## Add these new keywords supported since 5.005
      (ABSTRACT_FROM  => 'Split.pm', # retrieve abstract from module
       AUTHOR         => 'Dobrica Pavlinusic <dpavlin@rot13.org>') : ()),
);
trunk/MANIFEST
1 Changes 2 Makefile.PL 3 MANIFEST 4 README 5 t/SWISH-Split.t 6 Split.pm -
trunk/README
1 SWISH-Split version 0.01 2 ======================== 3 4 The README is used to introduce the module and provide instructions on 5 how to install the module, any machine dependencies it may have (for 6 example C compilers and installed libraries) and any other information 7 that should be provided before the module is installed. 8 9 A README file is required for CPAN modules since CPAN extracts the 10 README file from a module distribution so that people browsing the 11 archive can use it to get an idea of the module's uses. It is usually a 12 good idea to provide version information here so that people can 13 decide whether fixes for the module are worth downloading. 14 15 INSTALLATION 16 17 To install this module type the following: 18 19 perl Makefile.PL 20 make 21 make test 22 make install 23 24 DEPENDENCIES 25 26 This module requires these other modules and libraries: 27 28 blah blah blah 29 30 COPYRIGHT AND LICENCE 31 32 Put the correct copyright and licence information here. 33 34 Copyright (C) 2004 by Dobrica Pavlinusic 35 36 This library is free software; you can redistribute it and/or modify 37 it under the same terms as Perl itself, either Perl version 5.8.4 or, 38 at your option, any later version of Perl 5 you may have available. 39 40 -
trunk/Split.pm
package SWISH::Split;

use 5.008;
use strict;
use warnings;

our $VERSION = '0.01';

use SWISH::API;
use Text::Iconv;
use File::Temp qw/ :mktemp /;
use Carp;
use Digest::MD5 qw(md5_hex);
use Memoize;
use Scalar::Util qw(reftype);

use Data::Dumper;

=head1 NAME

SWISH::Split - Perl interface to split index variant of Swish-e

=head1 SYNOPSIS

  use SWISH::Split;


=head1 DESCRIPTION

This is an alternative interface for indexing data with swish-e. It's designed
to split indexes over multiple files to allow updates of records in index
by reindexing just changed parts.

Data is stored in index using an interface which is somewhat similar to
L<Plucene::Simple>. This could make your migration (or supporting two index
engines) easier.

In the background, it will fork swish-e binaries (one for each index slice)
and produce UTF-8 encoded XML files. So, if your input charset isn't
C<ISO-8859-1> you will have to specify it.

=head1 Methods used for indexing

=head2 open

Create new object for index.

  my $i = SWISH::Split->open(
      index      => '/path/to/index',
      slice_name => \&slice_on_path,
      slices     => 30,
      merge      => 1,
      codepage   => 'ISO-8859-2',
  );

  # split index on first component of path
  sub slice_on_path {
      return (split(/\//, $_[0]))[0];
  }

Options may be given either as a list of key/value pairs or as a single
hash reference.

C<slice_name> must be a code reference, C<slices> must be a true value and
C<index> must be an existing directory; C<open> croaks otherwise.

C<slices> is maximum number of index slices. See L<"in_slice"> for
more explanation.

=cut

# NOTE(review): this converter is shared by every object created in this
# process; calling open() with a codepage replaces it for all of them.
my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');

sub open {
    my $class = shift;

    # Accept both open(key => value, ...) and open({ key => value, ... }).
    my %args = (@_ == 1 && (reftype($_[0]) || '') eq 'HASH')
        ? %{ $_[0] }
        : @_;
    my $self = bless { %args }, $class;

    croak "need slice_name coderef"
        unless (reftype($self->{'slice_name'}) || '') eq 'CODE';
    croak "need slices" unless $self->{'slices'};

    croak "need index" unless $self->{'index'};
    croak "index '",$self->{'index'},"' doesn't exist" unless -e $self->{'index'};
    croak "index '",$self->{'index'},"' is not directory" unless -d $self->{'index'};

    $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});

    return $self;
}

=head2 add

Add document to index.

  $i->add($swishpath, {
      headline => 'foobar result',
      property => 'data',
  })

=cut

# TODO: not implemented yet
sub add {
    my $self = shift;
}

=head2 delete

Delete document from index.

  $i->delete($swishpath);

=cut

# TODO: not implemented yet
sub delete {
    my $self = shift;
}


=head2 close

Close index file and finish indexing.

  $i->close;

This is most time-consuming operation. When it's called, it will re-index
all entries which haven't changed in all slices.

=cut

# TODO: not implemented yet
sub close {
    my $self = shift;
}



=head1 Reporting methods

These methods return statistics about your index.

=head2 swishpaths

Return array of C<swishpath>s in index.

  my @p = $i->swishpaths;

=cut

# TODO: not implemented yet
sub swishpaths {
    my $self = shift;
}

=head2 swishpaths_updated

Return array with updated C<swishpath>s.

  my @d = $i->swishpaths_updated;

=cut

# TODO: not implemented yet
sub swishpaths_updated {
    my $self = shift;
}


=head2 swishpaths_deleted

Return array with deleted C<swishpath>s.

  my @d = $i->swishpaths_deleted;

=cut

# TODO: not implemented yet
sub swishpaths_deleted {
    my $self = shift;
}


=head2 slices

Return array with all slice names.

  my @s = $i->slices;

=cut

# TODO: not implemented yet
sub slices {
    my $self = shift;
}

=head1 Helper methods

These methods are used internally, but they might be useful.

=head2 in_slice

Takes path and returns slice in which this path belongs.

  my $s = $i->in_slice('path/to/document/in/index');

If the C<slices> parameter was given to L<"open">, the path is first passed
through the C<slice_name> function and the MD5 hash of the result is used
to spread documents across slices. That will produce random
distribution of your documents in slices, which might or might not be best
for your data. If you have to re-index large number of slices on each
run, think about creating your own C<slice_name> function and distributing
documents manually across slices.

If the object has no C<slices> value, the result of the C<split> callback
(when one was supplied) or else of C<slice_name> is returned unchanged.

Dies with a stack trace if the path is missing or empty, or if
C<slice_name> is not a code reference.

This function is C<Memoize>ed for performance reasons.

=cut

sub in_slice {
    my $self = shift;
    my $path = shift;

    # A defined/length check (rather than plain truth) lets the path "0" through.
    confess "need path" unless defined $path && length $path;
    confess "need slice_name function"
        unless (reftype($self->{'slice_name'}) || '') eq 'CODE';

    if ($self->{'slices'}) {
        # first, pass path through slice_name function
        my $slice_name = $self->{'slice_name'}->($path);
        # then calculate MD5 hash and take first 8 hex chars to produce number
        # FIXME how random is this?
        my $slice_nr = hex(substr(md5_hex($slice_name), 0, 8));

        return ($slice_nr % $self->{'slices'});
    }

    # No slice count: honour a caller-supplied 'split' callback if present,
    # otherwise use the slice name itself instead of dying on an undefined
    # code reference.
    my $splitter = (reftype($self->{'split'}) || '') eq 'CODE'
        ? $self->{'split'}
        : $self->{'slice_name'};

    return $splitter->($path);
}

# Wrap in_slice exactly once, when the module is loaded.  Doing this inside
# open() would wrap the already-wrapped method again for every new object.
memoize('in_slice');


1;
__END__


=head2 Searching

Searching is still conducted using L<SWISH::API>, but you have to glob
index names.

  use SWISH::API;

  my $swish = SWISH::API->new( glob('index.swish-e/*') );

You can also alternatively create merged index (using C<merge> option) and
not change your source code at all.

That would also benefit performance, but it increases indexing time
because merged indexes must be re-created on each indexing run.

=head2 EXPORT

None by default.



=head1 SEE ALSO

L<SWISH::API>,
L<http://www.swish-e.org/>

=head1 AUTHOR

Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2004 by Dobrica Pavlinusic

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.4 or,
at your option, any later version of Perl 5 you may have available.


=cut
trunk/t/SWISH-Split.t
#!/usr/bin/perl -w

# Tests for SWISH::Split: argument validation in open() and slice
# calculation in in_slice().

use strict;

use Test::More tests => 9;
use Test::Exception;
use File::Temp qw/ :mktemp /;
use File::Spec;
use File::Path qw(rmtree);
use blib;

BEGIN { use_ok('SWISH::Split') };

my %param;

# Temporary paths created below; removed again when the script exits.
my @temp_files;
my @temp_dirs;

END {
    unlink @temp_files if @temp_files;
    rmtree($_) for @temp_dirs;
}

my $template = File::Spec->catfile(File::Spec->tmpdir, 'swishXXXXX');

throws_ok { SWISH::Split->open(%param) } qr/slice_name/, "slice_name";

# slice name is the first character of the path
sub slice_1st_char {
    return substr($_[0],0,1);
};

use Data::Dumper;

$param{'slice_name'} = \&slice_1st_char;
throws_ok { SWISH::Split->open(%param) } qr/slices/, "slices";

$param{'slices'} = 3;
throws_ok { SWISH::Split->open(%param) } qr/index/, "index";

# mkstemp returns (filehandle, filename) in list context; in scalar context
# only the filehandle comes back, so take the file name explicitly.
my ($fh, $file) = mkstemp($template);
push @temp_files, $file;
close($fh);
ok($param{'index'} = $file, "make temp file");
throws_ok { SWISH::Split->open(%param) } qr/dir/, "dir";

my $dir = mkdtemp($template);
push @temp_dirs, $dir;
ok($param{'index'} = $dir, "make temp index");
ok(my $i=SWISH::Split->open(%param), "open");

# methods test



# internal functions test

cmp_ok($i->in_slice("swishpath"), '==', 2, "in_slice");