--- cvs-head/script/index_ora 2002/01/23 12:22:54 65 +++ cvs-head/script/index_ora 2002/04/06 19:00:54 76 @@ -1,82 +1,83 @@ #!/usr/bin/perl -w # -*- Mode: Perl -*- # $Basename$ -# $Revision: 1.4 $ +# $Revision: 1.13 $ # Author : Ulrich Pfeifer # Created On : Mon Dec 31 13:57:11 2001 # Last Modified By: Ulrich Pfeifer # Last Modified On: Fri Jan 4 15:59:20 2002 # Language : CPerl # -# (C) Copyright 2001, UUNET Deutschland GmbH, Germany +# (C) Copyright 2001, Ulrich Pfeifer # +use 5.007; + use strict; +use Devel::Peek qw(Dump); + use File::Path; use DB_File; use Getopt::Long; use Cwd; -require WAIT::Config; -require WAIT::Database; -require WAIT::Parse::Ora; -require WAIT::Document::Ora; -require WAIT::InvertedIndex; +BEGIN {require WAIT::Config;} +use WAIT::Database; +use WAIT::Parse::Ora; +use WAIT::Document::Ora; +use WAIT::InvertedIndex; +use Data::Dumper; $DB_BTREE->{'cachesize'} = 200_000 ; -my %OPT = (clean => 0, - database => 'DB', - dir => $WAIT::Config->{WAIT_home} || '/tmp', +use lib "/usr/local/apache/lib"; +use lib "/online/www/sites/ora/catalogsearch/run/lib"; +use oreilly_de_catalog::config; +use oreilly_de_catalog::wait_filter; + +my %OPT = ( + database => 'oreilly_de_catalog', + dir => oreilly_de_catalog::config::WAITDIR, table => 'ora', ); +my $droot = oreilly_de_catalog::config::CATALOG; + GetOptions(\%OPT, - 'clean!', 'database=s', 'dir=s', 'table=s', + 'verbose!', + 'debug!', ) || die "Usage: ...\n"; -if ($OPT{clean} and -d "$OPT{dir}/$OPT{database}") { - my $tmp = WAIT::Database->open(name => $OPT{database}, - 'directory' => $OPT{dir}) - or die "Could not open table $OPT{table}: $@\n"; - my $tbl = $tmp->table(name => $OPT{table}); - $tbl->drop if $tbl; - rmtree("$OPT{dir}/$OPT{database}/$OPT{table}", 1, 1) - if -d "$OPT{dir}/$OPT{database}/$OPT{table}"; - $tmp->close; -} - -my $db; -unless (-d "$OPT{dir}/$OPT{database}") { - $db = WAIT::Database->create(name => $OPT{database}, - 'directory' => $OPT{dir}) - or die "Could not open database $OPT{database}: $@\n"; -} -else { - $db = WAIT::Database->open(name => $OPT{database}, - 'directory' => $OPT{dir}) - or die "Could not open table $OPT{table}: $@\n"; -} +my @localtime = localtime; +$localtime[5] += 1900; +$localtime[4]++; +my $jobid = sprintf "%04s-%02s-%02s_%02s:%02s_%d", @localtime[5,4,3,2,1], $$; +my $db = WAIT::Database->create(name => "$OPT{database}-$jobid", + directory => $OPT{dir}) + or die "Could not create database $OPT{database}: $@\n"; my $layout = new WAIT::Parse::Ora; -my $stem = ['isotr', 'isolc', 'split2', 'stop', 'Stem']; -my $text = [{ - 'prefix' => ['isotr', 'isolc'], - 'intervall' => ['isotr', 'isolc'], - }, - 'isotr', 'isolc', 'split2', 'stop']; -my $sound = ['isotr', 'isolc', 'split2', 'Soundex'],; +# my $stem = ['OR_tr_20020124', 'OR_lc_20020125', 'split2', 'stop', 'Stem']; +# my $text = ['OR_tr_20020124', 'split2', 'OR_minus_20020311', 'OR_lc_20020125', 'split2', 'stop']; +my $text = ['OR_tr_20020124', 'split', 'OR_minus_20020311', 'OR_lc_20020125']; +my $wplus = ['OR_split_20020401', 'OR_lc_20020125', 'OR_mixedonly_20020221']; +# my $sound = ['OR_tr_20020124', 'OR_lc_20020125', 'split2', 'Soundex']; +my $trigr = ['OR_lc_20020125', 'OR_trigrams_20020125']; + +# split6 is better than split13 or split10: it allows them to enter +# shorter sequences when searching. +my $isbn = ['split6', 'OR_isbn_20020127']; my $cwd = cwd; my %D; -my $access = tie %D, 'WAIT::Document::Ora', @ARGV, - or die "Couldn't tie to file: $!\n"; +my $access = tie %D, 'WAIT::Document::Ora', $droot, + or die "Couldn't tie to dir $droot: $!\n"; my $tb = $db->create_table(name => $OPT{table}, attr => ['author', 'isbn', 'title', @@ -85,30 +86,146 @@ access => $access, invindex => [ - 'title' => $stem, - 'about' => $stem, - 'text' => $text, + 'aboutauthor' => $text, + 'aboutauthor' => $wplus, + 'abouttranslator' => $text, + 'abouttranslator' => $wplus, + 'abstract' => $text, + 'abstract' => $wplus, 'author' => $text, + 'chapter' => $text, + 'chapter' => $wplus, 'colophon' => $text, - 'author' => $sound, - 'isbn' => $text, + 'colophon' => $wplus, + 'desc' => $text, + 'desc' => $wplus, + 'inx' => $text, + 'inx' => $wplus, + 'isbn' => $isbn, + 'subtitle' => $text, + 'subtitle' => $wplus, + 'title' => $text, + 'title' => $wplus, + 'title_orig' => $text, + 'title_orig' => $wplus, + 'toc' => $text, + 'toc' => $wplus, + 'translator' => $text, + 'translator' => $wplus, ] ); die "Couldn't create table $OPT{table}: $@\n" unless $tb; my ($did, $value); -while (($did, $value) = each %D) { +binmode STDOUT, ":utf8"; +my $ALL; +my $traceALL = 0; +$traceALL = 1 if $OPT{debug}; # expensive +my $done = 0; +my $todo = keys %D; +my $lasttimeround = my $starttime = time; + +DOC: while (($did, $value) = each %D) { + my $superdebug = 0; + if ($superdebug && $OPT{debug}) { + # next unless $did =~ /perl/; + } + printf "%15s...(%d)\n", $did, $done if $OPT{verbose}; my $record = $layout->split($value); my $headline = $record->{title}; $headline =~ s/\s+/ /sg; - printf "%15s %s\n", $record->{isbn}, substr($headline,0,60); + # printf "%15s %s\n", $record->{isbn}, substr($headline,0,60); + printf "%15s %s...\n", $did, substr($headline,0,60) if $OPT{verbose}; + if ($superdebug && $OPT{debug}) { + # $record = { chapter => $record->{chapter}}; + } + $done++; + next if oreilly_de_catalog::config::DONT_INDEX_JUST_PARSE() ; $tb->insert('docid' => $did, headline => $headline, %{$record}); + my $spenttime = time - $starttime; + my $averagetime = $spenttime/$done; + my $left = $todo-$done; + printf("%2d secs, %3d done, %3d left, %4d s done, %5.1f s avg, %4d s left\n", + time - $lasttimeround, + $done, + $left, + $spenttime, + $averagetime, + $left*$averagetime, + ) if $OPT{verbose}; + $lasttimeround = time; + if ($traceALL) { # costs a lot when reaching the 100th file or so + $ALL->{$did} = $record; + open F, ">:utf8", "$OPT{dir}/$OPT{database}-$jobid/debug.dump" or die; + print F Data::Dumper::Dumper($ALL); + close F or die "Couldn't close debug.dump: $!";; + } } +undef $ALL; $tb->set(top=>1); -$tb->close(); -$db->close(); + +my $tritb = $db->create_table( + name => "$OPT{table}_fallback", + attr => [qw(docid headline)], # name + # "headline" + # only for + # sman + invindex => [ headline => $trigr ], + ); +my %dict; +for my $f ($tb->fields) { + my(@idx) = @{$tb->table->{inverted}{$f} || []}; + for my $idx (@idx) { + my $name = $idx->name; + next if $name =~ /(_|\b)(mixedonly|Stem|Soundex)(\b|_)/; + # irrelevant for alternatives + my @keys = $idx->keys; + @dict{@keys} = (); + } +} +my @dictkeys = grep s/^p//, keys %dict; +my $maxdebug = 5; +for my $headline (@dictkeys) { + if ($maxdebug && $headline =~ /[^\040-\177]/) { + Dump $headline; + $maxdebug--; + } + # printf "%s\n", substr($headline,0,60); + $tritb->insert(docid => $headline, headline => $headline); +} +$tritb->set(top=>1); +$tritb->close or die "Couldn't close table: $!"; +$tb->close() or die "Couldn't close table: $!"; +$db->close() or die "Couldn't close database: $!"; + +# Atomically relinking symlink: now we have a new database with a very +# long name "$OPT{database}-$jobid" (e.g. +# oreilly_de_catalog-2002-01-28_16:12_16467) and we want that database +# to be accessible with the oreilly_de_catalog name. + +use File::Spec; +chdir $OPT{dir} or die; +my $dir = "$OPT{database}-$jobid"; +my $slwant = File::Spec->catdir($OPT{dir}, $OPT{database}); +my $sltmp = File::Spec->catdir($OPT{dir}, "$OPT{database}-$$"); +unlink $sltmp; # may fail +symlink $dir, $sltmp or die "Couldn't symlink $dir, $sltmp: $!"; +rename $sltmp, $slwant or die "Couldn't rename $sltmp, $slwant: $!"; +print "$slwant now points to $dir\n" if $OPT{verbose}; +system("chmod 777 $slwant/*/read")==0 or die; + +opendir DIR, "." or die "Could not opendir .: $!"; +for my $dirent (readdir DIR) { + next if $dirent =~ /^\./; + next unless $dirent =~ /^$OPT{database}(.*)/; + my $ext = $1 or next; + next unless -M $dirent > 4; + warn "removing old index $dirent"; + File::Path::rmtree($dirent); +} +closedir DIR; $WAIT::Config = $WAIT::Config; # make perl -w happy @@ -125,7 +242,6 @@ =head1 SYNOPSIS B -[B<-clean>] [B<-noclean>] [B<-database> I] [B<-dir> I] [B<-table> I] @@ -137,10 +253,6 @@ =over 5 -=item B<-clean> / B<-noclean> - -Clean the table before indexing. Default is B. - =item B<-database> I Specify database name. Default is F.