--- cvs-head/script/index_ora 2002/01/23 12:22:54 65 +++ cvs-head/script/index_ora 2002/01/27 15:27:38 71 @@ -1,7 +1,7 @@ #!/usr/bin/perl -w # -*- Mode: Perl -*- # $Basename$ -# $Revision: 1.4 $ +# $Revision: 1.8 $ # Author : Ulrich Pfeifer # Created On : Mon Dec 31 13:57:11 2001 # Last Modified By: Ulrich Pfeifer @@ -11,66 +11,58 @@ # (C) Copyright 2001, UUNET Deutschland GmbH, Germany # +use 5.007; + use strict; +use Devel::Peek qw(Dump); + use File::Path; use DB_File; use Getopt::Long; use Cwd; -require WAIT::Config; -require WAIT::Database; -require WAIT::Parse::Ora; -require WAIT::Document::Ora; -require WAIT::InvertedIndex; +BEGIN {require WAIT::Config;} +use WAIT::Database; +use WAIT::Parse::Ora; +use WAIT::Document::Ora; +use WAIT::InvertedIndex; $DB_BTREE->{'cachesize'} = 200_000 ; -my %OPT = (clean => 0, +my %OPT = ( database => 'DB', dir => $WAIT::Config->{WAIT_home} || '/tmp', table => 'ora', ); GetOptions(\%OPT, - 'clean!', 'database=s', 'dir=s', 'table=s', ) || die "Usage: ...\n"; -if ($OPT{clean} and -d "$OPT{dir}/$OPT{database}") { - my $tmp = WAIT::Database->open(name => $OPT{database}, - 'directory' => $OPT{dir}) - or die "Could not open table $OPT{table}: $@\n"; - my $tbl = $tmp->table(name => $OPT{table}); - $tbl->drop if $tbl; - rmtree("$OPT{dir}/$OPT{database}/$OPT{table}", 1, 1) - if -d "$OPT{dir}/$OPT{database}/$OPT{table}"; - $tmp->close; -} - -my $db; -unless (-d "$OPT{dir}/$OPT{database}") { - $db = WAIT::Database->create(name => $OPT{database}, - 'directory' => $OPT{dir}) - or die "Could not open database $OPT{database}: $@\n"; -} -else { - $db = WAIT::Database->open(name => $OPT{database}, - 'directory' => $OPT{dir}) - or die "Could not open table $OPT{table}: $@\n"; -} +my @localtime = localtime; +$localtime[5] += 1900; +$localtime[4]++; +my $jobid = sprintf "%04s-%02s-%02s_%02s:%02s_%d", @localtime[5,4,3,2,1], $$; +my $db = WAIT::Database->create(name => "$OPT{database}-$jobid", + directory => $OPT{dir}) + or die "Could not create database $OPT{database}: $@\n"; my $layout = new WAIT::Parse::Ora; -my $stem = ['isotr', 'isolc', 'split2', 'stop', 'Stem']; +use lib "/usr/local/apache/lib"; +use oreilly_de_catalog::wait_handler; + +my $stem = ['OR_tr_20020124', 'OR_lc_20020124', 'split2', 'stop', 'Stem']; my $text = [{ - 'prefix' => ['isotr', 'isolc'], - 'intervall' => ['isotr', 'isolc'], + 'prefix' => ['OR_tr_20020124', 'OR_lc_20020124'], + 'intervall' => ['OR_tr_20020124', 'OR_lc_20020124'], }, - 'isotr', 'isolc', 'split2', 'stop']; -my $sound = ['isotr', 'isolc', 'split2', 'Soundex'],; + 'OR_tr_20020124', 'OR_lc_20020124', 'split2', 'stop']; +my $sound = ['OR_tr_20020124', 'OR_lc_20020124', 'split2', 'Soundex']; +my $trigr = ['OR_lc_20020124', 'OR_trigrams_20020125']; my $cwd = cwd; @@ -97,6 +89,7 @@ die "Couldn't create table $OPT{table}: $@\n" unless $tb; my ($did, $value); +binmode STDOUT, ":utf8"; while (($did, $value) = each %D) { my $record = $layout->split($value); my $headline = $record->{title}; @@ -107,9 +100,45 @@ %{$record}); } $tb->set(top=>1); + +my $tritb = $db->create_table( + name => "$OPT{table}_fallback", + attr => [qw(docid headline)], + invindex => [ headline => $trigr ], + ); +my %dict; +for my $f ($tb->fields) { + my(@idx) = @{$tb->table->{inverted}{$f} || []}; + for my $idx (@idx) { + my @keys = $idx->keys; + @dict{@keys} = (); + } +} +my @dictkeys = grep s/^p//, keys %dict; +my $maxdebug = 5; +for my $headline (@dictkeys) { + if ($maxdebug && $headline =~ /[^\040-\177]/) { + Dump $headline; + $maxdebug--; + } + $tritb->insert(docid => $headline, headline => $headline); +} +$tritb->set(top=>1); +$tritb->close; $tb->close(); $db->close(); +# Now we have a new database with a very long name and we want that +# database to be accessible with the $OPT{database} name + +use File::Spec; +my $long_dir = "$OPT{database}-$jobid"; +my $want_dir = File::Spec->catdir($OPT{dir}, $OPT{database}); +my $prel_slink = File::Spec->catdir($OPT{dir}, "$OPT{database}-$$"); +unlink $prel_slink; # may fail +symlink $long_dir, $prel_slink or die "Could not symlink $long_dir, $prel_slink: $!"; +rename $prel_slink, $want_dir or die "Could not rename $prel_slink, $want_dir: $!"; + $WAIT::Config = $WAIT::Config; # make perl -w happy @@ -125,7 +154,6 @@ =head1 SYNOPSIS B -[B<-clean>] [B<-noclean>] [B<-database> I] [B<-dir> I] [B<-table> I] @@ -137,10 +165,6 @@ =over 5 -=item B<-clean> / B<-noclean> - -Clean the table before indexing. Default is B. - =item B<-database> I Specify database name. Default is F.