/[jsFind]/trunk/jsFind.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/jsFind.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1 - (show annotations)
Sun Jul 11 20:18:25 2004 UTC (19 years, 9 months ago) by dpavlin
File size: 15474 byte(s)
initial import into subversion of version 0.1

1 package jsFind;
2
3 use 5.008004;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.01';
8
9 =head1 NAME
10
11 jsFind - generate index for jsFind using B-Tree
12
13 =head1 SYNOPSIS
14
15 use jsFind;
16
17
18
19 =head1 DESCRIPTION
20
21 This module can be used to create index files for jsFind, powerful tool for
22 adding a search engine to a CDROM archive or catalog without requiring the
23 user to install anything.
24
25 Main difference between this module and scripts delivered with jsFind are:
26
27 =over 5
28
29 =item *
30
31 You don't need to use swish-e to create index
32
33 =item *
34
35 You can programatically (and incrementaly) create index for jsFind
36
37 =back
38
39 =head1 METHODS
40
41 This module contains two packages C<jsFind> and C<jsFind::Node>.
42
43 =head2 jsFind methods
44
45 =cut
46
47 use Exporter 'import';
48 use Carp;
49
50 our @ISA = qw(Exporter);
51
52 BEGIN {
53 import 'jsFind::Node';
54 }
55
56 =head3 new
57
58 Create new tree. Arguments are C<B> which is maximum numbers of keys in
59 each node and optional C<Root> node. Each root node may have child nodes.
60
61 All nodes are objects from C<jsFind::Node>.
62
63 my $t = new jsFind(B => 4);
64
65 =cut
66
67 my $DEBUG = 1;
68
69 sub new {
70 my $package = shift;
71 my %ARGV = @_;
72 croak "Usage: {$package}::new(B => number [, Root => root node ])"
73 unless exists $ARGV{B};
74 if ($ARGV{B} % 2) {
75 my $B = $ARGV{B} + 1;
76 carp "B must be an even number. Using $B instead.";
77 $ARGV{B} = $B;
78 }
79
80 my $B = $ARGV{B};
81 my $Root = exists($ARGV{Root}) ? $ARGV{Root} : jsFind::Node->emptynode;
82 bless { B => $B, Root => $Root } => $package;
83 }
84
85 =head3 B_search
86
87 Search, insert, append or replace data in B-Tree
88
89
90
91 Semantics:
92
93 If key not found, insert it iff C<Insert> argument is present.
94
95 If key B<is> found, replace existing data iff C<Replace> argument
96 is present or add new datum to existing iff C<Append> argument is present.
97
98 =cut
99
100 sub B_search {
101 my $self = shift;
102 my %args = @_;
103 my $cur_node = $self->root;
104 my $k = $args{Key};
105 my $d = $args{Data};
106 my @path;
107
108 if ($cur_node->is_empty) { # Special case for empty root
109 if ($args{Insert}) {
110 $cur_node->kdp_insert($k => $d);
111 return $d;
112 } else {
113 return undef;
114 }
115 }
116
117 # Descend tree to leaf
118 for (;;) {
119
120 # Didn't hit bottom yet.
121
122 my($there, $where) = $cur_node->locate_key($k);
123 if ($there) { # Found it!
124 if ($args{Replace}) {
125 $cur_node->kdp_replace($where, $k => $d);
126 } elsif ($args{Append}) {
127 $cur_node->kdp_append($where, $k => $d);
128 }
129 return $cur_node->data($where);
130 }
131
132 # Not here---must be in a subtree.
133
134 if ($cur_node->is_leaf) { # But there are no subtrees
135 return undef unless $args{Insert}; # Search failed
136 # Stuff it in
137 $cur_node->kdp_insert($k => $d);
138 if ($self->node_overfull($cur_node)) { # Oops--there was no room.
139 $self->split_and_promote($cur_node, @path);
140 }
141 return $d;
142 }
143
144 # There are subtrees, and the key is in one of them.
145
146 push @path, [$cur_node, $where]; # Record path from root.
147
148 # Move down to search the subtree
149 $cur_node = $cur_node->subnode($where);
150
151 # and start over.
152 } # for (;;) ...
153
154 croak ("How did I get here?");
155 }
156
157
158
159 sub split_and_promote_old {
160 my $self = shift;
161 my ($cur_node, @path) = @_;
162
163 for (;;) {
164 my ($newleft, $newright, $kdp) = $cur_node->halves($self->B / 2);
165 my ($up, $where) = @{pop @path};
166 if ($up) {
167 $up->kdp_insert(@$kdp);
168 my ($tthere, $twhere) = $up->locate_key($kdp->[0]);
169 croak "Couldn't find key `$kdp->[0]' in node after just inserting it!"
170 unless $tthere;
171 croak "`$kdp->[0]' went into node at `$twhere' instead of expected `$where'!"
172 unless $twhere == $where;
173 $up->subnode($where, $newleft);
174 $up->subnode($where+1, $newright);
175 return unless $self->node_overfull($up);
176 $cur_node = $up;
177 } else { # We're at the top; make a new root.
178 my $newroot = new jsFind::Node ([$kdp->[0]],
179 [$kdp->[1]],
180 [$newleft, $newright]);
181 $self->root($newroot);
182 return;
183 }
184 }
185
186 }
187
188 sub split_and_promote {
189 my $self = shift;
190 my ($cur_node, @path) = @_;
191
192 for (;;) {
193 my ($newleft, $newright, $kdp) = $cur_node->halves($self->B / 2);
194 my ($up, $where) = @{pop @path} if (@path);
195 if ($up) {
196 $up->kdp_insert(@$kdp);
197 if ($DEBUG) {
198 my ($tthere, $twhere) = $up->locate_key($kdp->[0]);
199 croak "Couldn't find key `$kdp->[0]' in node after just inserting it!"
200 unless $tthere;
201 croak "`$kdp->[0]' went into node at `$twhere' instead of expected `$where'!"
202 unless $twhere == $where;
203 }
204 $up->subnode($where, $newleft);
205 $up->subnode($where+1, $newright);
206 return unless $self->node_overfull($up);
207 $cur_node = $up;
208 } else { # We're at the top; make a new root.
209 my $newroot = new jsFind::Node([$kdp->[0]],
210 [$kdp->[1]],
211 [$newleft, $newright]);
212 $self->root($newroot);
213 return;
214 }
215 }
216 }
217
218 =head3 B
219
220 Return B (maximum number of keys)
221
222 my $max_size = $t->B;
223
224 =cut
225
226 sub B {
227 $_[0]{B};
228 }
229
230 =head3 root
231
232 Returns root node
233
234 my $root = $t->root;
235
236 =cut
237
238 sub root {
239 my ($self, $newroot) = @_;
240 $self->{Root} = $newroot if defined $newroot;
241 $self->{Root};
242 }
243
244 =head3 node_overfull
245
246 Returns if node is overfull
247
248 if ($node->node_overfull) { something }
249
250 =cut
251
252 sub node_overfull {
253 my $self = shift;
254 my $node = shift;
255 $node->size > $self->B;
256 }
257
258 =head3 to_string
259
260 Returns your tree as formatted string.
261
262 my $text = $root->to_string;
263
264 Mostly usefull for debugging as output leaves much to be desired.
265
266 =cut
267
268 sub to_string {
269 $_[0]->root->to_string;
270 }
271
272 =head3 to_dot
273
274 Create Graphviz graph of your tree
275
276 my $dot_graph = $root->to_dot;
277
278 =cut
279
280 sub to_dot {
281 my $self = shift;
282
283 my $dot = qq/digraph dns {\nrankdir=LR;\n/;
284 $dot .= $self->root->to_dot;
285 $dot .= qq/\n}\n/;
286
287 return $dot;
288 }
289
290 =head3 to_jsfind
291
292 Create xml index files for jsFind. This should be called after
293 your B-Tree has been filled with data.
294
295 $root->to_jsfind('/full/path/to/index/dir/');
296
297 Returns number of nodes in created tree.
298
299 =cut
300
301 sub to_jsfind {
302 my $self = shift;
303
304 my $path = shift || confess "to_jsfind need path to your index!";
305
306 $path .= "/" if ($path =~ /\/$/);
307 carp "can't create index in '$path': $!" if (! -w $path);
308
309 return $self->root->to_jsfind($path,"0");
310 }
311
312
313 # private, default cmd function
314 sub default_cmp {
315 $_[0] cmp $_[1];
316 }
317
318 #####################################################################
319
320 =head2 jsFind::Node methods
321
322 Each node has C<k> key-data pairs, with C<B> <= C<k> <= C<2B>, and
323 each has C<k+1> subnodes, which might be null.
324
325 The node is a blessed reference to a list with three elements:
326
327 ($keylist, $datalist, $subnodelist)
328
329 each is a reference to a list list.
330
331 The null node is represented by a blessed reference to an empty list.
332
333 =cut
334
335 package jsFind::Node;
336
337 use warnings;
338 use strict;
339
340 use Carp;
341 use File::Path;
342
343 my $KEYS = 0;
344 my $DATA = 1;
345 my $SUBNODES = 2;
346
347 =head3 new
348
349 Create New node
350
351 my $node = new jsFind::Node ($keylist, $datalist, $subnodelist);
352
353 You can also mit argument list to create empty node.
354
355 my $empty_node = new jsFind::Node;
356
357 =cut
358
359 sub new {
360 my $self = shift;
361 my $package = ref $self || $self;
362 croak "Internal error: jsFind::Node::new called with wrong number of arguments."
363 unless @_ == 3 || @_ == 0;
364 bless [@_] => $package;
365 }
366
367 =head3 locate_key
368
369 Locate key in node using linear search. This should probably be replaced
370 by binary search for better performance.
371
372 my ($found, $index) = $node->locate_key($key, $cmp_coderef);
373
374 Argument C<$cmp_coderef> is optional reference to custom comparison
375 operator.
376
377 Returns (1, $index) if $key[$index] eq $key.
378
379 Returns (0, $index) if key could be found in $subnode[$index].
380
381 In scalar context, just returns 1 or 0.
382
383 =cut
384
385 sub locate_key {
386 # Use linear search for testing, replace with binary search.
387 my $self = shift;
388 my $key = shift;
389 my $cmp = shift || \&jsFind::default_cmp;
390 my $i;
391 my $cmp_result;
392 my $N = $self->size;
393 for ($i = 0; $i < $N; $i++) {
394 $cmp_result = &$cmp($key, $self->key($i));
395 last if $cmp_result <= 0;
396 }
397
398 # $i is now the index of the first node-key greater than $key
399 # or $N if there is no such. $cmp_result is 0 iff the key was found.
400 (!$cmp_result, $i);
401 }
402
403
404 =head3 emptynode
405
406 Creates new empty node
407
408 $node = $root->emptynode;
409 $new_node = $node->emptynode;
410
411 =cut
412
413 sub emptynode {
414 new($_[0]); # Pass package name, but not anything else.
415 }
416
417 =head3 is_empty
418
419 Test if node is empty
420
421 if ($node->is_empty) { something }
422
423 =cut
424
425 # undef is empty; so is a blessed empty list.
426 sub is_empty {
427 my $self = shift;
428 !defined($self) || $#$self < 0;
429 }
430
431 =head3 key
432
433 Return C<$i>th key from node
434
435 my $key = $node->key($i);
436
437 =cut
438
439 sub key {
440 # my ($self, $n) = @_;
441 # $self->[$KEYS][$n];
442
443 # speedup
444 $_[0]->[$KEYS][$_[1]];
445 }
446
447 =head3 data
448
449 Return C<$i>th data from node
450
451 my $data = $node->data($i);
452
453 =cut
454
455 sub data {
456 my ($self, $n) = @_;
457 $self->[$DATA][$n];
458 }
459
460 =head3 kdp_replace
461
462 Set key data pair for C<$i>th element in node
463
464 $node->kdp_replace($i, "key value" => {
465 "data key 1" => "data value 1",
466 "data key 2" => "data value 2",
467 };
468
469 =cut
470
471 sub kdp_replace {
472 my ($self, $n, $k => $d) = @_;
473 if (defined $k) {
474 $self->[$KEYS][$n] = $k;
475 $self->[$DATA][$n] = $d;
476 }
477 [$self->[$KEYS][$n],
478 $self->[$DATA][$n]];
479 }
480
481 =head3 kdp_insert
482
483 # No return value.
484
485 =cut
486
487 sub kdp_insert {
488 my $self = shift;
489 my ($k => $d) = @_;
490 my ($there, $where) = $self->locate_key($k) unless $self->is_empty;
491
492 if ($there) { croak("Tried to insert `$k => $d' into node where `$k' was already present."); }
493
494 # undef fix
495 $where ||= 0;
496
497 splice(@{$self->[$KEYS]}, $where, 0, $k);
498 splice(@{$self->[$DATA]}, $where, 0, $d);
499 splice(@{$self->[$SUBNODES]}, $where, 0, undef);
500 }
501
502 =head3 kdp_append
503
504 Adds new data keys and values to C<$i>th element in node
505
506 $node->kdp_append($i, "key value" => {
507 "added data key" => "added data value",
508 };
509
510 =cut
511
512 sub kdp_append {
513 my ($self, $n, $k => $d) = @_;
514 if (defined $k) {
515 $self->[$KEYS][$n] = $k;
516 my ($kv,$dv) = %{$d};
517 $self->[$DATA][$n]->{$kv} = $dv;
518 }
519 [$self->[$KEYS][$n],
520 $self->[$DATA][$n]];
521 }
522
523 =head3 subnode
524
525 Set new or return existing subnode
526
527 # return 4th subnode
528 my $my_node = $node->subnode(4);
529
530 # create new subnode 5 from $my_node
531 $node->subnode(5, $my_node);
532
533 =cut
534
535 sub subnode {
536 my ($self, $n, $newnode) = @_;
537 $self->[$SUBNODES][$n] = $newnode if defined $newnode;
538 $self->[$SUBNODES][$n];
539 }
540
541 =head3 is_leaf
542
543 Test if node is leaf
544
545 if ($node->is_leaf) { something }
546
547 =cut
548
549 sub is_leaf {
550 my $self = shift;
551 ! defined $self->[$SUBNODES][0]; # undefined subnode means leaf node.
552 }
553
554 =head3 size
555
556 Return number of keys in the node
557
558 my $nr = $node->size;
559
560 =cut
561
562 sub size {
563 my $self = shift;
564 return scalar(@{$self->[$KEYS]});
565 }
566
567 =head3 halves
568
569 # Accept an index $n
570 # Divide into two nodes so that keys 0 .. $n-1 are in one node
571 # and keys $n+1 ... $size are in the other.
572
573 =cut
574
575 sub halves {
576 my $self = shift;
577 my $n = shift;
578 my $s = $self->size;
579 my @right;
580 my @left;
581
582 $left[$KEYS] = [@{$self->[$KEYS]}[0 .. $n-1]];
583 $left[$DATA] = [@{$self->[$DATA]}[0 .. $n-1]];
584 $left[$SUBNODES] = [@{$self->[$SUBNODES]}[0 .. $n]];
585
586 $right[$KEYS] = [@{$self->[$KEYS]}[$n+1 .. $s-1]];
587 $right[$DATA] = [@{$self->[$DATA]}[$n+1 .. $s-1]];
588 $right[$SUBNODES] = [@{$self->[$SUBNODES]}[$n+1 .. $s]];
589
590 my @middle = ($self->[$KEYS][$n], $self->[$DATA][$n]);
591
592 ($self->new(@left), $self->new(@right), \@middle);
593 }
594
595 =head3 to_string
596
597 Dumps tree as string
598
599 my $str = $root->to_string;
600
601 =cut
602
603 sub to_string {
604 my $self = shift;
605 my $indent = shift || 0;
606 my $I = ' ' x $indent;
607 return '' if $self->is_empty;
608 my ($k, $d, $s) = @$self;
609 my $result = '';
610 $result .= defined($s->[0]) ? $s->[0]->to_string($indent+2) : '';
611 my $N = $self->size;
612 my $i;
613 for ($i = 0; $i < $N; $i++) {
614 # $result .= $I . "$k->[$i] => $d->[$i]\n";
615 $result .= $I . "$k->[$i]\n";
616 $result .= defined($s->[$i+1]) ? $s->[$i+1]->to_string($indent+2) : '';
617 }
618 $result;
619 }
620
621 =begin comment
622
623 use Data::Dumper;
624
625 sub to_string {
626 my $self = shift;
627 my $indent = shift || 0;
628 my $path = shift || '0';
629 return '' if $self->is_empty;
630 my ($k, $d, $s) = @$self;
631 my $result = '';
632 $result .= defined($s->[0]) ? $s->[0]->to_string($indent+1,"$path/0") : '';
633 my $N = $self->size;
634 for (my $i = 0; $i < $N; $i++) {
635 my $dump = Dumper($d->[$i]);
636 $dump =~ s/[\n\r\s]+/ /gs;
637 $dump =~ s/\$VAR1\s*=\s*//;
638 $result .= sprintf("%-5s [%2d] %2s: %s => %s\n", $path, $i, $indent, $k->[$i], $dump);
639 $result .= defined($s->[$i+1]) ? $s->[$i+1]->to_string($indent+1,"$path/$i") : '';
640 }
641 $result;
642 }
643
644 =end comment
645
646 =head3 to_dot
647
648 Recursivly walk nodes of tree
649
650 =cut
651
652 sub to_dot {
653 my $self = shift;
654 my $parent = shift;
655
656 return '' if $self->is_empty;
657
658 my $dot = '';
659
660 my ($k, $d, $s) = @$self;
661 my $N = $self->size;
662
663 my @dot_keys;
664
665 my $node_name = $parent || '_';
666 $node_name =~ s/\W+//g;
667 $node_name .= " [$N]";
668
669 for (my $i = 0; $i <= $N; $i++) {
670 if (my $key = $k->[$i]) {
671 push @dot_keys, qq{<$i>$key};
672 }
673 $dot .= $s->[$i]->to_dot(qq{"$node_name":$i}) if ($s->[$i]);
674 }
675 push @dot_keys, qq{<$N>...} if (! $self->is_leaf);
676
677 my $label = join("|",@dot_keys);
678 $dot .= qq{"$node_name" [ shape=record, label="$label" ];\n};
679
680 $dot .= qq{$parent -> "$node_name";\n} if ($parent);
681
682 $dot;
683 }
684
685 =head3 to_jsfind
686
687 Create jsFind xml files
688
689 my $nr=$tree->to_dot('/path/to/index','0');
690
691 Returns number of elements created
692
693 =cut
694
695 sub to_jsfind {
696 my $self = shift;
697 my ($path,$file) = @_;
698
699 return 0 if $self->is_empty;
700
701 my $nr_keys = 0;
702
703 my ($k, $d, $s) = @$self;
704 my $N = $self->size;
705
706 my ($key_xml, $data_xml) = ("<n>","<d>");
707
708 for (my $i = 0; $i <= $N; $i++) {
709 my $key = lc($k->[$i]);
710
711 if ($key) {
712 $key_xml .= qq{<k>$key</k>};
713 $data_xml .= qq{<e>};
714 #use Data::Dumper;
715 #print Dumper($d->[$i]);
716 foreach my $path (keys %{$d->[$i]}) {
717 $data_xml .= '<l f="'.($d->[$i]->{$path}->{'f'} || 1).'" t="'.($d->[$i]->{$path}->{'t'} || 'no title').'">'.$path.'</l>';
718 $nr_keys++;
719 }
720 $data_xml .= qq{</e>};
721 }
722
723 $nr_keys += $s->[$i]->to_jsfind("$path/$file","$i") if ($s->[$i]);
724 }
725
726 $key_xml .= "</n>";
727 $data_xml .= "</d>";
728
729 if (! -e $path) {
730 mkpath($path) || croak "can't create dir '$path': $!";
731 }
732
733 open(K, "> ${path}/${file}.xml") || croak "can't open '$path/$file.xml': $!";
734 open(D, "> ${path}/_${file}.xml") || croak "can't open '$path/_$file.xml': $!";
735
736 print K $key_xml;
737 print D $data_xml;
738
739 close(K);
740 close(D);
741
742 return $nr_keys;
743 }
744
745 1;
746 __END__
747
748 =head1 SEE ALSO
749
750 jsFind web site L<http://www.elucidsoft.net/projects/jsfind/>
751
752 B-Trees in perl web site L<http://perl.plover.com/BTree/>
753
754 =head1 AUTHORS
755
756 Mark-Jonson Dominus E<lt>mjd@pobox.comE<gt> wrote C<BTree.pm> which was
757 base for this module
758
759 Shawn P. Garbett E<lt>shawn@elucidsoft.netE<gt> wrote jsFind
760
761 Dobrica Pavlinusic E<lt>dpavlin@rot13.orgE<gt> wrote this module
762
763 =head1 COPYRIGHT AND LICENSE
764
765 Copyright (C) 2004 by Dobrica Pavlinusic
766
767 This program is free software; you can redistribute it and/or modify it
768 under the terms of the GNU General Public License as published by the Free
769 Software Foundation; either version 2 of the License, or (at your option)
770 any later version. This program is distributed in the hope that it will be
771 useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
772 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
773 Public License for more details.
774
775 =cut

  ViewVC Help
Powered by ViewVC 1.1.26