/[wait]/branches/CPAN/lib/WAIT/Table.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /branches/CPAN/lib/WAIT/Table.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 13 - (show annotations)
Fri Apr 28 15:42:44 2000 UTC (24 years ago) by ulpfr
File size: 19432 byte(s)
Import of WAIT-1.710

1 # -*- Mode: Cperl -*-
2 # Table.pm --
3 # ITIID : $ITI$ $Header $__Header$
4 # Author : Ulrich Pfeifer
5 # Created On : Thu Aug 8 13:05:10 1996
6 # Last Modified By: Ulrich Pfeifer
7 # Last Modified On: Sun May 30 20:42:30 1999
8 # Language : CPerl
9 # Update Count : 56
10 # Status : Unknown, Use with caution!
11 #
12 # Copyright (c) 1996-1997, Ulrich Pfeifer
13 #
14
15 =head1 NAME
16
17 WAIT::Table -- Module for maintaining Tables / Relations
18
19 =head1 SYNOPSIS
20
21 require WAIT::Table;
22
23 =head1 DESCRIPTION
24
25 =cut
26
27 package WAIT::Table;
28
29 use WAIT::Table::Handle ();
30 require WAIT::Parse::Base;
31
32 use strict;
33 use Carp;
34 # use autouse Carp => qw( croak($) );
35 use DB_File;
36 use Fcntl;
37
38 my $USE_RECNO = 0;
39
40 =head2 Creating a Table.
41
42 The constructor WAIT::Table-E<gt>new is normally called via the
43 create_table method of a database handle. This is not enforced, but
44 creating a table does not make any sense unless the table is
45 registered by the database because the latter implements persistence
46 of the meta data. Registering is done automatically by letting the
47 database handle the creation of a table.
48
49 my $db = WAIT::Database->create(name => 'sample');
50 my $tb = $db->create_table(name => 'test',
51 access => $access,
52 layout => $layout,
53 attr => ['docid', 'headline'],
54 );
55
56 The constructor returns a handle for the table. This handle is hidden by the
57 table module, to prevent direct access if called via Table.
58
59 =over 10
60
61 =item C<access> => I<accessobj>
62
63 A reference to an access object for the external parts (attributes) of
64 tuples. As you may remember, the WAIT System does not enforce that
65 objects are completely stored inside the system to avoid duplication.
66 There is no (strong) point in storing all your HTML documents inside
67 the system when indexing your WWW-Server.
68
69 The access object is designed to work like as a tied hash. You pass
70 the refernce to the object, not the tied hash though. An example
71 implementation of an access class that works for manpages is
72 WAIT::Document::Nroff.
73
74 The implementation needs to take into account that WAIT will keep this
75 object in a Data::Dumper or Storable database and re-use it when sman
76 is run. So it is not good enough if we can produce the index with it
77 now, when we create or actively access the table, WAIT also must be
78 able to retrieve documents on its own, when we are in a different
79 context. This happens specifically in a retrieval. To get this working
80 seemlessly, the access-defining class must implement a close method.
81 This method will be called before the Data::Dumper dump takes place.
82 In that moment the access-defining class must get rid of all data
83 structures that cannot be reconstructed via the Data::Dumper dump,
84 such as database handles or C pointers.
85
86 =item C<file> => I<fname>
87
88 The filename of the records file. Files for indexes will have I<fname>
89 as prefix. I<Mandatory>, but usually taken care of by the
90 WAIT::Database handle when the constructor is called via
91 WAIT::Database::create_table().
92
93 =item C<name> => I<name>
94
95 The name of this table. I<Mandatory>
96
97 =item C<attr> => [ I<attr> ... ]
98
99 A reference to an array of attribute names. WAIT will keep the
100 contents of these attributes in its table. I<Mandatory>
101
102 =item C<djk> => [ I<attr> ... ]
103
104 A reference to an array of attribute names which make up the
105 I<disjointness key>. Don't think about it - it's of no use yet;
106
107 =item C<layout> => I<layoutobj>
108
109 A reference to an external parser object. Defaults to a new instance
110 of C<WAIT::Parse::Base>. For an example implementation see
111 WAIT::Parse::Nroff. A layout class can be implemented as a singleton
112 class if you so like.
113
114 =item C<keyset> => I<keyset>
115
116 The set of attributes needed to identify a record. Defaults to all
117 attributes.
118
119 =item C<invindex> => I<inverted index>
120
121 A reference to an anon array defining attributes of each record that
122 need to be indexed. See the source of smakewhatis for how to set this
123 up.
124
125 =back
126
127 =cut
128
129 sub new {
130 my $type = shift;
131 my %parm = @_;
132 my $self = {};
133
134 # Check for mandatory attrs early
135 $self->{name} = $parm{name} or croak "No name specified";
136 $self->{attr} = $parm{attr} or croak "No attributes specified";
137
138 # Do that before we eventually add '_weight' to attributes.
139 $self->{keyset} = $parm{keyset} || [[@{$parm{attr}}]];
140
141 $self->{mode} = O_CREAT | O_RDWR;
142
143 # Determine and set up subclass
144 $type = ref($type) || $type;
145 if (defined $parm{djk}) {
146 if (@{$parm{djk}} == @{$parm{attr}}) {
147 # All attributes in DK (sloppy test here!)
148 $type .= '::Independent';
149 require WAIT::Table::Independent;
150 } else {
151 $type .= '::Disjoint';
152 require WAIT::Table::Disjoint;
153 }
154 # Add '_weight' to attributes
155 my %attr;
156 @attr{@{$parm{attr}}} = (1) x @{$parm{attr}};
157 unshift @{$parm{attr}}, '_weight' unless $attr{'_weight'};
158 }
159
160 $self->{file} = $parm{file} or croak "No file specified";
161 if (-d $self->{file}){
162 warn "Warning: Directory '$self->{file}' already exists\n";
163 } elsif (!mkdir($self->{file}, 0775)) {
164 croak "Could not 'mkdir $self->{file}': $!\n";
165 }
166 $self->{djk} = $parm{djk} if defined $parm{djk};
167 $self->{layout} = $parm{layout} || new WAIT::Parse::Base;
168 $self->{access} = $parm{access} if defined $parm{access};
169 $self->{nextk} = 1; # next record to insert; first record unused
170 $self->{deleted} = {}; # no deleted records yet
171 $self->{indexes} = {};
172
173 bless $self, $type;
174 # Call create_index() and create_index() for compatibility
175 for (@{$self->{keyset}||[]}) {
176 #carp "Specification of indexes at table create time is deprecated";
177 $self->create_index(@$_);
178 }
179 while (@{$parm{invindex}||[]}) {
180 # carp "Specification of inverted indexes at table create time is deprecated";
181 my $att = shift @{$parm{invindex}};
182 my @spec = @{shift @{$parm{invindex}}};
183 my @opt;
184
185 if (ref($spec[0])) {
186 carp "Secondary pipelines are deprecated\n";
187 @opt = %{shift @spec};
188 }
189 $self->create_inverted_index(attribute => $att, pipeline => \@spec, @opt);
190 }
191 $self;
192 # end of backwarn compatibility stuff
193 }
194
195 =head2 Creating an index
196
197 $tb->create_index('docid');
198
199 =item C<create_index>
200
201 must be called with a list of attributes. This must be a subset of the
202 attributes specified when the table was created. Currently this
203 method must be called before the first tuple is inserted in the
204 table!
205
206 =cut
207
208 sub create_index {
209 my $self= shift;
210
211 croak "Cannot create index for table aready populated"
212 if $self->{nextk} > 1;
213
214 require WAIT::Index;
215
216 my $name = join '-', @_;
217 $self->{indexes}->{$name} =
218 new WAIT::Index file => $self->{file}.'/'.$name, attr => $_;
219 }
220
221 =head2 Creating an inverted index
222
223 $tb->create_inverted_index
224 (attribute => 'au',
225 pipeline => ['detex', 'isotr', 'isolc', 'split2', 'stop'],
226 predicate => 'plain',
227 );
228
229 =over 5
230
231 =item C<attribute>
232
233 The attribute to build the index on. This attribute may not be in the
234 set attributes specified when the table was created.
235
236 =item C<pipeline>
237
238 A piplines specification is a reference to an array of method names
239 (from package C<WAIT::Filter>) which are to be applied in sequence to
240 the contents of the named attribute. The attribute name may not be in
241 the attribute list.
242
243 =item C<predicate>
244
245 An indication which predicate the index implements. This may be
246 e.g. 'plain', 'stemming' or 'soundex'. The indicator will be used for
247 query processing. Currently there is no standard set of predicate
248 names. The predicate defaults to the last member of the pipeline if
249 omitted.
250
251 =back
252
253 Currently this method must be called before the first tuple is
254 inserted in the table!
255
256 =cut
257
258 sub create_inverted_index {
259 my $self = shift;
260 my %parm = @_;
261
262 croak "No attribute specified" unless $parm{attribute};
263 croak "No pipeline specified" unless $parm{pipeline};
264
265 $parm{predicate} ||= $parm{pipeline}->[-1];
266
267 croak "Cannot create index for table aready populated"
268 if $self->{nextk} > 1;
269
270 require WAIT::InvertedIndex;
271
272 # backward compatibility stuff
273 my %opt = %parm;
274 for (qw(attribute pipeline predicate)) {
275 delete $opt{$_};
276 }
277
278 my $name = join '_', ($parm{attribute}, @{$parm{pipeline}});
279 my $idx = new WAIT::InvertedIndex(file => $self->{file}.'/'.$name,
280 filter => [@{$parm{pipeline}}], # clone
281 name => $name,
282 attr => $parm{attribute},
283 %opt, # backward compatibility stuff
284 );
285 # We will have to use $parm{predicate} here
286 push @{$self->{inverted}->{$parm{attribute}}}, $idx;
287 }
288
289 sub dir {
290 $_[0]->{file};
291 }
292
293 =head2 C<$tb-E<gt>layout>
294
295 Returns the reference to the associated parser object.
296
297 =cut
298
299 sub layout { $_[0]->{layout} }
300
301 =head2 C<$tb-E<gt>fields>
302
303 Returns the array of attribute names.
304
305 =cut
306
307
308 sub fields { keys %{$_[0]->{inverted}}}
309
310 =head2 C<$tb-E<gt>drop>
311
312 Must be called via C<WAIT::Database::drop_table>
313
314 =cut
315
316 sub drop {
317 my $self = shift;
318 if ((caller)[0] eq 'WAIT::Database') { # database knows about this
319 $self->close; # just make sure
320 my $file = $self->{file};
321
322 for (values %{$self->{indexes}}) {
323 $_->drop;
324 }
325 unlink "$file/records";
326 ! (!-e $file or rmdir $file);
327 } else {
328 croak ref($self)."::drop called directly";
329 }
330 }
331
332 sub mrequire ($) {
333 my $module = shift;
334
335 $module =~ s{::}{/}g;
336 $module .= '.pm';
337 require $module;
338 }
339
340 sub open {
341 my $self = shift;
342 my $file = $self->{file} . '/records';
343
344 mrequire ref($self); # that's tricky eh?
345 if (defined $self->{'layout'}) {
346 mrequire ref($self->{'layout'});
347 }
348 if (defined $self->{'access'}) {
349 mrequire ref($self->{'access'});
350 }
351 if (exists $self->{indexes}) {
352 require WAIT::Index;
353 for (values %{$self->{indexes}}) {
354 $_->{mode} = $self->{mode};
355 }
356 }
357 if (exists $self->{inverted}) {
358 my ($att, $idx);
359 for $att (keys %{$self->{inverted}}) {
360 for $idx (@{$self->{inverted}->{$att}}) {
361 $idx->{mode} = $self->{mode};
362 }
363 }
364 require WAIT::InvertedIndex;
365 }
366 unless (defined $self->{dbh}) {
367 if ($USE_RECNO) {
368 $self->{dbh} = tie(@{$self->{db}}, 'DB_File', $file,
369 $self->{mode}, 0664, $DB_RECNO);
370 } else {
371 $self->{dbh} =
372 tie(%{$self->{db}}, 'DB_File', $file,
373 $self->{mode}, 0664, $DB_BTREE);
374 }
375 }
376 $self;
377 }
378
379 sub fetch_extern {
380 my $self = shift;
381
382 # print "#@_", $self->{'access'}->{Mode}, "\n"; # DEBUGGING?
383 if (exists $self->{'access'}) {
384 mrequire ref($self->{'access'});
385 $self->{'access'}->FETCH(@_);
386 }
387 }
388
389 sub fetch_extern_by_id {
390 my $self = shift;
391
392 $self->fetch_extern($self->fetch(@_));
393 }
394
395 sub _find_index {
396 my $self = shift;
397 my (@att) = @_;
398 my %att;
399 my $name;
400
401 @att{@att} = @att;
402
403 KEY: for $name (keys %{$self->{indexes}}) {
404 my @iat = split /-/, $name;
405 for (@iat) {
406 next KEY unless exists $att{$_};
407 }
408 return $self->{indexes}->{$name};
409 }
410 return undef;
411 }
412
413 sub have {
414 my $self = shift;
415 my %parm = @_;
416
417 my $index = $self->_find_index(keys %parm) or return; # no index-no have
418
419 defined $self->{db} or $self->open;
420 return $index->have(@_);
421 }
422
423 sub insert {
424 my $self = shift;
425 my %parm = @_;
426
427 defined $self->{db} or $self->open;
428
429 # We should move all writing methods to a subclass to check only once
430 $self->{mode} & O_RDWR or croak "Cannot insert into table opened in RD_ONLY mode";
431
432 my $tuple = join($;, map($parm{$_} || '', @{$self->{attr}}));
433 my $key;
434 my @deleted = keys %{$self->{deleted}};
435
436 if (@deleted) {
437 $key = pop @deleted;
438 delete $self->{deleted}->{$key};
439 } else {
440 $key = $self->{nextk}++;
441 }
442 if ($USE_RECNO) {
443 $self->{db}->[$key] = $tuple;
444 } else {
445 $self->{db}->{$key} = $tuple;
446 }
447 for (values %{$self->{indexes}}) {
448 unless ($_->insert($key, %parm)) {
449 # duplicate key, undo changes
450 if ($key == $self->{nextk}-1) {
451 $self->{nextk}--;
452 } else {
453 $self->{deleted}->{$key}=1;
454 }
455 my $idx;
456 for $idx (values %{$self->{indexes}}) {
457 last if $idx eq $_;
458 $idx->remove($key, %parm);
459 }
460 return undef;
461 }
462 }
463 if (defined $self->{inverted}) {
464 my $att;
465 for $att (keys %{$self->{inverted}}) {
466 if (defined $parm{$att}) {
467 map $_->insert($key, $parm{$att}), @{$self->{inverted}->{$att}};
468 #map $_->sync, @{$self->{inverted}->{$att}}
469 }
470 }
471 }
472 $key
473 }
474
475 sub sync {
476 my $self = shift;
477
478 for (values %{$self->{indexes}}) {
479 map $_->sync, $_;
480 }
481 if (defined $self->{inverted}) {
482 my $att;
483 for $att (keys %{$self->{inverted}}) {
484 map $_->sync, @{$self->{inverted}->{$att}}
485 }
486 }
487 }
488
489 sub fetch {
490 my $self = shift;
491 my $key = shift;
492
493 return () if exists $self->{deleted}->{$key};
494
495 defined $self->{db} or $self->open;
496 if ($USE_RECNO) {
497 $self->unpack($self->{db}->[$key]);
498 } else {
499 $self->unpack($self->{db}->{$key});
500 }
501 }
502
503 sub delete_by_key {
504 my $self = shift;
505 my $key = shift;
506
507 return $self->{deleted}->{$key} if defined $self->{deleted}->{$key};
508 my %tuple = $self->fetch($key);
509 for (values %{$self->{indexes}}) {
510 $_->delete($key, %tuple);
511 }
512 if (defined $self->{inverted}) {
513 # User *must* provide the full record for this or the entries
514 # in the inverted index will not be removed
515 %tuple = (%tuple, @_);
516 my $att;
517 for $att (keys %{$self->{inverted}}) {
518 if (defined $tuple{$att}) {
519 map $_->delete($key, $tuple{$att}), @{$self->{inverted}->{$att}}
520 }
521 }
522 }
523 ++$self->{deleted}->{$key};
524 }
525
526 sub delete {
527 my $self = shift;
528 my $tkey = $self->have(@_);
529
530 defined $tkey && $self->delete_by_key($tkey, @_);
531 }
532
533 sub unpack {
534 my $self = shift;
535 my $tuple = shift;
536
537 my $att;
538 my @result;
539 my @tuple = split /$;/, $tuple;
540
541 for $att (@{$self->{attr}}) {
542 push @result, $att, shift @tuple;
543 }
544 @result;
545 }
546
547 sub close {
548 my $self = shift;
549
550 if (exists $self->{'access'}) {
551 eval {$self->{'access'}->close}; # dont bother if not opened
552 }
553 for (values %{$self->{indexes}}) {
554 $_->close();
555 }
556 if (defined $self->{inverted}) {
557 my $att;
558 for $att (keys %{$self->{inverted}}) {
559 if ($] > 5.003) { # avoid bug in perl up to 5.003_05
560 my $idx;
561 for $idx (@{$self->{inverted}->{$att}}) {
562 $idx->close;
563 }
564 } else {
565 map $_->close(), @{$self->{inverted}->{$att}};
566 }
567 }
568 }
569 if ($self->{dbh}) {
570 delete $self->{dbh};
571
572 if ($USE_RECNO) {
573 untie @{$self->{db}};
574 } else {
575 untie %{$self->{db}};
576 }
577 delete $self->{db};
578 }
579
580 1;
581 }
582
583 sub DESTROY {
584 my $self = shift;
585
586 warn "Table handle destroyed without closing it first"
587 if $self->{db} and $self->{mode}&O_RDWR;
588 }
589
590 sub open_scan {
591 my $self = shift;
592 my $code = shift;
593
594 $self->{dbh} or $self->open;
595 require WAIT::Scan;
596 new WAIT::Scan $self, $self->{nextk}-1, $code;
597 }
598
599 sub open_index_scan {
600 my $self = shift;
601 my $attr = shift;
602 my $code = shift;
603 my $name = join '-', @$attr;
604
605 if (defined $self->{indexes}->{$name}) {
606 $self->{indexes}->{$name}->open_scan($code);
607 } else {
608 croak "No such index '$name'";
609 }
610 }
611
612 eval {sub WAIT::Query::Raw::new} unless defined \&WAIT::Query::Raw::new;
613
614 sub prefix {
615 my ($self , $attr, $prefix) = @_;
616 my %result;
617
618 defined $self->{db} or $self->open; # require layout
619
620 for (@{$self->{inverted}->{$attr}}) {
621 my $result = $_->prefix($prefix);
622 if (defined $result) {
623 $result{$_->name} = $result;
624 }
625 }
626 bless \%result, 'WAIT::Query::Raw';
627 }
628
629 sub intervall {
630 my ($self, $attr, $lb, $ub) = @_;
631 my %result;
632
633 defined $self->{db} or $self->open; # require layout
634
635 for (@{$self->{inverted}->{$attr}}) {
636 my $result = $_->intervall($lb, $ub);
637 if (defined $result) {
638 $result{$_->name} = $result;
639 }
640 }
641 bless \%result, 'WAIT::Query::Raw';
642 }
643
644 sub search {
645 my $self = shift;
646 my $attr = shift;
647 my $cont = shift;
648 my $raw = shift;
649 my %result;
650
651 defined $self->{db} or $self->open; # require layout
652
653 if ($raw) {
654 for (@{$self->{inverted}->{$attr}}) {
655 my $name = $_->name;
656 if (exists $raw->{$name} and @{$raw->{$name}}) {
657 my $scale = 1/scalar(@{$raw->{$name}});
658 my %r = $_->search_raw(@{$raw->{$name}});
659 my ($key, $val);
660 while (($key, $val) = each %r) {
661 if (exists $result{$key}) {
662 $result{$key} += $val*$scale;
663 } else {
664 $result{$key} = $val*$scale;
665 }
666 }
667 }
668 }
669 }
670 if (defined $cont and $cont ne '') {
671 for (@{$self->{inverted}->{$attr}}) {
672 my %r = $_->search($cont);
673 my ($key, $val);
674 while (($key, $val) = each %r) {
675 if (exists $result{$key}) {
676 $result{$key} += $val;
677 } else {
678 $result{$key} = $val;
679 }
680 }
681 }
682 }
683 # sanity check for deleted documents.
684 # this should not be necessary !@#$
685 for (keys %result) {
686 delete $result{$_} if $self->{deleted}->{$_}
687 }
688 %result;
689 }
690
691 sub hilight_positions {
692 my ($self, $attr, $text, $query, $raw) = @_;
693 my %pos;
694
695 if (defined $raw) {
696 for (@{$self->{inverted}->{$attr}}) { # objects of type
697 # WAIT::InvertedIndex for
698 # this index field $attr
699 my $name = $_->name;
700 if (exists $raw->{$name}) {
701 my %qt;
702 grep $qt{$_}++, @{$raw->{$name}};
703 for ($_->parse_pos($text)) {
704 if (exists $qt{$_->[0]}) {
705 $pos{$_->[1]} = max($pos{$_->[1]}, length($_->[0]));
706 }
707 }
708 }
709 }
710 }
711 if (defined $query) {
712 for (@{$self->{inverted}->{$attr}}) {
713 my %qt;
714
715 grep $qt{$_}++, $_->parse($query);
716 for ($_->parse_pos($text)) {
717 if (exists $qt{$_->[0]}) {
718 if (exists $pos{$_->[1]}) { # perl -w ;-)
719 $pos{$_->[1]} = max($pos{$_->[1]}, length($_->[0]));
720 } else {
721 $pos{$_->[1]} = length($_->[0]);
722 }
723 }
724 }
725 }
726 }
727
728 \%pos;
729 }
730
731 sub hilight {
732 my ($tb, $buf, $qplain, $qraw) = @_;
733 my $layout = $tb->layout();
734
735 my @result;
736
737 $qplain ||= {};
738 $qraw ||= {};
739 my @ttxt = $layout->tag($buf);
740 while (@ttxt) {
741 no strict 'refs';
742 my %tag = %{shift @ttxt};
743 my $txt = shift @ttxt;
744 my $fld;
745
746 my %hl;
747 for $fld (grep defined $tag{$_}, keys %$qplain, keys %$qraw) {
748 my $hp = $tb->hilight_positions($fld, $txt,
749 $qplain->{$fld}, $qraw->{$fld});
750 for (keys %$hp) {
751 if (exists $hl{$_}) { # -w ;-(
752 $hl{$_} = max($hl{$_}, $hp->{$_});
753 } else {
754 $hl{$_} = $hp->{$_};
755 }
756 }
757 }
758 my $pos;
759 my $qt = {_qt => 1, %tag};
760 my $pl = \%tag;
761 my $last = length($txt);
762 my @tmp;
763 for $pos (sort {$b <=> $a} keys %hl) {
764 unshift @tmp, $pl, substr($txt,$pos+$hl{$pos},$last-$pos-$hl{$pos});
765 unshift @tmp, $qt, substr($txt,$pos,$hl{$pos});
766 $last = $pos;
767 }
768 push @result, $pl, substr($txt,0,$last);
769 push @result, @tmp;
770 }
771 @result; # no speed necessary
772 }
773
774 1;

Properties

Name Value
cvs2svn:cvs-rev 1.1.1.2

  ViewVC Help
Powered by ViewVC 1.1.26