--- cvs-head/script/index_ora	2002/01/23 12:22:54	65
+++ cvs-head/script/index_ora	2002/04/06 19:00:54	76
@@ -1,82 +1,83 @@
 #!/usr/bin/perl -w
 #                              -*- Mode: Perl -*- 
 # $Basename$
-# $Revision: 1.4 $
+# $Revision: 1.13 $
 # Author          : Ulrich Pfeifer
 # Created On      : Mon Dec 31 13:57:11 2001
 # Last Modified By: Ulrich Pfeifer
 # Last Modified On: Fri Jan  4 15:59:20 2002
 # Language        : CPerl
 # 
-# (C) Copyright 2001, UUNET Deutschland GmbH, Germany
+# (C) Copyright 2001, Ulrich Pfeifer
 # 
 
+use 5.007;
+
 use strict;
+use Devel::Peek qw(Dump);
+
 use File::Path;
 use DB_File;
 use Getopt::Long;
 use Cwd;
 
-require WAIT::Config;
-require WAIT::Database;
-require WAIT::Parse::Ora;
-require WAIT::Document::Ora;
-require WAIT::InvertedIndex;
+BEGIN {require WAIT::Config;}
+use WAIT::Database;
+use WAIT::Parse::Ora;
+use WAIT::Document::Ora;
+use WAIT::InvertedIndex;
+use Data::Dumper;
 
 
 $DB_BTREE->{'cachesize'} = 200_000 ;
 
-my %OPT = (clean    => 0,
-           database => 'DB',
-           dir      => $WAIT::Config->{WAIT_home} || '/tmp',
+use lib "/usr/local/apache/lib";
+use lib "/online/www/sites/ora/catalogsearch/run/lib";
+use oreilly_de_catalog::config;
+use oreilly_de_catalog::wait_filter;
+
+my %OPT = (
+           database => 'oreilly_de_catalog',
+           dir      => oreilly_de_catalog::config::WAITDIR,
            table    => 'ora',
           );
 
+my $droot = oreilly_de_catalog::config::CATALOG;
+
 GetOptions(\%OPT,
-           'clean!',
            'database=s',
            'dir=s',
            'table=s',
+           'verbose!',
+           'debug!',
           ) || die "Usage: ...\n";
 
-if ($OPT{clean} and -d "$OPT{dir}/$OPT{database}") {
-  my $tmp = WAIT::Database->open(name        => $OPT{database},
-                                 'directory' => $OPT{dir})
-    or die "Could not open table $OPT{table}: $@\n";
-  my $tbl = $tmp->table(name => $OPT{table});
-  $tbl->drop if $tbl;
-  rmtree("$OPT{dir}/$OPT{database}/$OPT{table}", 1, 1)
-    if -d "$OPT{dir}/$OPT{database}/$OPT{table}";
-  $tmp->close;
-}
-
-my $db;
-unless (-d "$OPT{dir}/$OPT{database}") {
-  $db = WAIT::Database->create(name       => $OPT{database},
-                              'directory' => $OPT{dir})
-    or die "Could not open database $OPT{database}: $@\n";
-}
-else {
-  $db = WAIT::Database->open(name        => $OPT{database},
-                             'directory' => $OPT{dir})
-    or die "Could not open table $OPT{table}: $@\n";
-}
+my @localtime = localtime;
+$localtime[5] += 1900;
+$localtime[4]++;
+my $jobid = sprintf "%04s-%02s-%02s_%02s:%02s_%d", @localtime[5,4,3,2,1], $$;
+my $db = WAIT::Database->create(name      => "$OPT{database}-$jobid",
+                                directory => $OPT{dir})
+    or die "Could not create database $OPT{database}: $@\n";
 
 my $layout = new WAIT::Parse::Ora;
 
-my $stem  = ['isotr', 'isolc', 'split2', 'stop', 'Stem'];
-my $text  = [{
-              'prefix'    => ['isotr', 'isolc'],
-              'intervall' => ['isotr', 'isolc'],
-             },
-             'isotr', 'isolc', 'split2', 'stop'];
-my $sound = ['isotr', 'isolc', 'split2', 'Soundex'],;
+# my $stem  = ['OR_tr_20020124', 'OR_lc_20020125', 'split2', 'stop', 'Stem'];
+# my $text  = ['OR_tr_20020124', 'split2', 'OR_minus_20020311', 'OR_lc_20020125', 'split2', 'stop'];
+my $text  = ['OR_tr_20020124', 'split', 'OR_minus_20020311', 'OR_lc_20020125'];
+my $wplus = ['OR_split_20020401', 'OR_lc_20020125', 'OR_mixedonly_20020221'];
+# my $sound = ['OR_tr_20020124', 'OR_lc_20020125', 'split2', 'Soundex'];
+my $trigr = ['OR_lc_20020125', 'OR_trigrams_20020125'];
+
+# split6 is better than split13 or split10: it lets users enter
+# shorter sequences when searching for an ISBN.
+my $isbn  = ['split6', 'OR_isbn_20020127'];
 
 my $cwd = cwd;
 
 my %D;
-my $access = tie %D, 'WAIT::Document::Ora', @ARGV,
-  or die "Couldn't tie to file: $!\n";
+my $access = tie %D, 'WAIT::Document::Ora', $droot,
+  or die "Couldn't tie to dir $droot: $!\n";
 
 my $tb = $db->create_table(name     => $OPT{table},
                            attr     => ['author', 'isbn', 'title',
@@ -85,30 +86,146 @@
                            access   => $access,
                            invindex =>
                            [
-                            'title'  => $stem,
-                            'about'  => $stem,
-                            'text'   => $text,
+                            'aboutauthor'  => $text,
+                            'aboutauthor'  => $wplus,
+                            'abouttranslator'  => $text,
+                            'abouttranslator'  => $wplus,
+                            'abstract' => $text,
+                            'abstract' => $wplus,
                             'author' => $text,
+                            'chapter' => $text,
+                            'chapter' => $wplus,
                             'colophon' => $text,
-                            'author' => $sound,
-                            'isbn'   => $text,
+                            'colophon' => $wplus,
+                            'desc'   => $text,
+                            'desc'   => $wplus,
+                            'inx'   => $text,
+                            'inx'   => $wplus,
+                            'isbn'   => $isbn,
+                            'subtitle'  => $text,
+                            'subtitle'  => $wplus,
+                            'title'  => $text,
+                            'title'  => $wplus,
+                            'title_orig'  => $text,
+                            'title_orig'  => $wplus,
+                            'toc'   => $text,
+                            'toc'   => $wplus,
+                            'translator'  => $text,
+                            'translator'  => $wplus,
                            ]
                           );
 die "Couldn't create table $OPT{table}: $@\n" unless $tb;
 
 my ($did, $value);
-while (($did, $value) = each %D) {
+binmode STDOUT, ":utf8";
+my $ALL;
+my $traceALL = 0;
+$traceALL = 1 if $OPT{debug}; # expensive
+my $done = 0;
+my $todo = keys %D;
+my $lasttimeround = my $starttime = time;
+
+DOC: while (($did, $value) = each %D) {
+  my $superdebug = 0;
+  if ($superdebug && $OPT{debug}) {
+    # next unless $did =~ /perl/;
+  }
+  printf "%15s...(%d)\n", $did, $done if $OPT{verbose};
   my $record   = $layout->split($value);
   my $headline = $record->{title};
   $headline =~ s/\s+/ /sg;
-  printf "%15s %s\n", $record->{isbn}, substr($headline,0,60);
+  # printf "%15s %s\n", $record->{isbn}, substr($headline,0,60);
+  printf "%15s %s...\n", $did, substr($headline,0,60) if $OPT{verbose};
+  if ($superdebug && $OPT{debug}) {
+    # $record = { chapter => $record->{chapter}};
+  }
+  $done++;
+  next if oreilly_de_catalog::config::DONT_INDEX_JUST_PARSE() ;
   $tb->insert('docid'  => $did,
               headline => $headline,
               %{$record});
+  my $spenttime = time - $starttime;
+  my $averagetime = $spenttime/$done;
+  my $left = $todo-$done;
+  printf("%2d secs, %3d done, %3d left, %4d s done, %5.1f s avg, %4d s left\n",
+               time - $lasttimeround,
+               $done,
+               $left,
+               $spenttime,
+               $averagetime,
+               $left*$averagetime,
+              ) if $OPT{verbose};
+  $lasttimeround = time;
+  if ($traceALL) { # costs a lot when reaching the 100th file or so
+    $ALL->{$did} = $record;
+    open F, ">:utf8", "$OPT{dir}/$OPT{database}-$jobid/debug.dump" or die;
+    print F Data::Dumper::Dumper($ALL);
+    close F  or die "Couldn't close debug.dump: $!";;
+  }
 }
+undef $ALL;
 $tb->set(top=>1);
-$tb->close();
-$db->close();
+
+my $tritb = $db->create_table(
+                              name => "$OPT{table}_fallback",
+                              attr => [qw(docid headline)], # name
+                                                            # "headline"
+                                                            # only for
+                                                            # sman
+                              invindex => [ headline => $trigr ],
+                             );
+my %dict;
+for my $f ($tb->fields) {
+  my(@idx) = @{$tb->table->{inverted}{$f} || []};
+  for my $idx (@idx) {
+    my $name = $idx->name;
+    next if $name =~ /(_|\b)(mixedonly|Stem|Soundex)(\b|_)/;
+              # irrelevant for alternatives
+    my @keys = $idx->keys;
+    @dict{@keys} = ();
+  }
+}
+my @dictkeys = grep s/^p//, keys %dict;
+my $maxdebug = 5;
+for my $headline (@dictkeys) {
+  if ($maxdebug && $headline =~ /[^\040-\177]/) {
+    Dump $headline;
+    $maxdebug--;
+  }
+  # printf "%s\n", substr($headline,0,60);
+  $tritb->insert(docid => $headline, headline => $headline);
+}
+$tritb->set(top=>1);
+$tritb->close or die "Couldn't close table: $!";
+$tb->close() or die "Couldn't close table: $!";
+$db->close() or die "Couldn't close database: $!";
+
+# Atomically relink the symlink: we now have a new database with a very
+# long name "$OPT{database}-$jobid" (e.g.
+# oreilly_de_catalog-2002-01-28_16:12_16467), and we want that database
+# to be accessible under the plain $OPT{database} name.
+
+use File::Spec;
+chdir $OPT{dir} or die;
+my $dir    = "$OPT{database}-$jobid";
+my $slwant = File::Spec->catdir($OPT{dir}, $OPT{database});
+my $sltmp  = File::Spec->catdir($OPT{dir}, "$OPT{database}-$$");
+unlink $sltmp; # may fail
+symlink $dir, $sltmp or die "Couldn't symlink $dir, $sltmp: $!";
+rename $sltmp, $slwant or die "Couldn't rename $sltmp, $slwant: $!";
+print "$slwant now points to $dir\n" if $OPT{verbose};
+system("chmod 777 $slwant/*/read")==0 or die;
+
+opendir DIR, "." or die "Could not opendir .: $!";
+for my $dirent (readdir DIR) {
+  next if $dirent =~ /^\./;
+  next unless $dirent =~ /^$OPT{database}(.*)/;
+  my $ext = $1 or next;
+  next unless -M $dirent > 4;
+  warn "removing old index $dirent";
+  File::Path::rmtree($dirent);
+}
+closedir DIR;
 
 $WAIT::Config = $WAIT::Config; # make perl -w happy
 
@@ -125,7 +242,6 @@
 =head1 SYNOPSIS
 
 B<index_ora>
-[B<-clean>] [B<-noclean>]
 [B<-database> I<dbname>]
 [B<-dir> I<directory>]
 [B<-table> I<table name>]
@@ -137,10 +253,6 @@
 
 =over 5
 
-=item B<-clean> / B<-noclean>
-
-Clean the table before indexing. Default is B<off>.
-
 =item B<-database> I<dbname>
 
 Specify database name. Default is F<DB>.