/[wait]/trunk/script/index_ora
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/script/index_ora

Parent Directory | Revision Log | View Patch

cvs-head/script/index_ora revision 72 by laperla, Mon Jan 28 21:35:39 2002 UTC trunk/script/index_ora revision 109 by dpavlin, Tue Jul 13 17:50:27 2004 UTC
# Line 1  Line 1 
1  #!/usr/bin/perl -w  #!/usr/bin/perl -w
2  #                              -*- Mode: Perl -*-  #                              -*- Mode: Perl -*-
3  # $Basename$  # $Basename$
4  # $Revision: 1.9 $  # $Revision: 1.14 $
5  # Author          : Ulrich Pfeifer  # Author          : Ulrich Pfeifer
6  # Created On      : Mon Dec 31 13:57:11 2001  # Created On      : Mon Dec 31 13:57:11 2001
7  # Last Modified By: Ulrich Pfeifer  # Last Modified By: Ulrich Pfeifer
8  # Last Modified On: Fri Jan  4 15:59:20 2002  # Last Modified On: Fri Jan  4 15:59:20 2002
9  # Language        : CPerl  # Language        : CPerl
10  #  #
11  # (C) Copyright 2001, UUNET Deutschland GmbH, Germany  # (C) Copyright 2001, Ulrich Pfeifer
12  #  #
13    
14  use 5.007;  use 5.007;
# Line 26  use WAIT::Database; Line 26  use WAIT::Database;
26  use WAIT::Parse::Ora;  use WAIT::Parse::Ora;
27  use WAIT::Document::Ora;  use WAIT::Document::Ora;
28  use WAIT::InvertedIndex;  use WAIT::InvertedIndex;
29    use Data::Dumper;
30    
31    
32  $DB_BTREE->{'cachesize'} = 200_000 ;  $DB_BTREE->{'cachesize'} = 200_000 ;
33    
34    use lib "/usr/local/apache/lib";
35    use lib "/online/www/sites/ora/catalogsearch/run/lib";
36    use oreilly_de_catalog::config;
37    use oreilly_de_catalog::wait_filter;
38    
39  my %OPT = (  my %OPT = (
40             database => 'DB',             database => 'oreilly_de_catalog',
41             dir      => $WAIT::Config->{WAIT_home} || '/tmp',             dir      => oreilly_de_catalog::config::WAITDIR,
42             table    => 'ora',             table    => 'ora',
43            );            );
44    
45    my $droot = oreilly_de_catalog::config::CATALOG;
46    
47  GetOptions(\%OPT,  GetOptions(\%OPT,
48             'database=s',             'database=s',
49             'dir=s',             'dir=s',
50             'table=s',             'table=s',
51               'verbose!',
52               'debug!',
53            ) || die "Usage: ...\n";            ) || die "Usage: ...\n";
54    
55  my @localtime = localtime;  my @localtime = localtime;
# Line 52  my $db = WAIT::Database->create(name Line 62  my $db = WAIT::Database->create(name
62    
63  my $layout = new WAIT::Parse::Ora;  my $layout = new WAIT::Parse::Ora;
64    
65  use lib "/usr/local/apache/lib";  # my $stem  = ['OR_tr_20020124', 'OR_lc_20020125', 'split2', 'stop', 'Stem'];
66  use oreilly_de_catalog::wait_handler;  # my $text  = ['OR_tr_20020124', 'split2', 'OR_minus_20020311', 'OR_lc_20020125', 'split2', 'stop'];
67    my $text  = ['OR_tr_20020124', 'split', 'OR_minus_20020311', 'OR_lc_20020125'];
68    my $wplus = ['OR_split_20020401', 'OR_lc_20020125', 'OR_mixedonly_20020221'];
69    # my $sound = ['OR_tr_20020124', 'OR_lc_20020125', 'split2', 'Soundex'];
70    my $trigr = ['OR_lc_20020125', 'OR_trigrams_20020125'];
71    
72  my $stem  = ['OR_tr_20020124', 'OR_lc_20020124', 'split2', 'stop', 'Stem'];  # split6 is better than split13 or split10: it allows them to enter
73  my $text  = [{  # shorter sequences when searching.
               'prefix'    => ['OR_tr_20020124', 'OR_lc_20020124'],  
               'intervall' => ['OR_tr_20020124', 'OR_lc_20020124'],  
              },  
              'OR_tr_20020124', 'OR_lc_20020124', 'split2', 'stop'];  
 my $sound = ['OR_tr_20020124', 'OR_lc_20020124', 'split2', 'Soundex'];  
 my $trigr = ['OR_lc_20020124', 'OR_trigrams_20020125'];  
74  my $isbn  = ['split6', 'OR_isbn_20020127'];  my $isbn  = ['split6', 'OR_isbn_20020127'];
75    
76  my $cwd = cwd;  my $cwd = cwd;
77    
78  my %D;  my %D;
79  my $access = tie %D, 'WAIT::Document::Ora', @ARGV,  my $access = tie %D, 'WAIT::Document::Ora', $droot,
80    or die "Couldn't tie to file: $!\n";    or die "Couldn't tie to dir $droot: $!\n";
81    
82  my $tb = $db->create_table(name     => $OPT{table},  my $tb = $db->create_table(name     => $OPT{table},
83                             attr     => ['author', 'isbn', 'title',                             attr     => ['author', 'isbn', 'title',
# Line 78  my $tb = $db->create_table(name     => $ Line 86  my $tb = $db->create_table(name     => $
86                             access   => $access,                             access   => $access,
87                             invindex =>                             invindex =>
88                             [                             [
                             'title'  => $text,  
                             # 'title'  => $stem,  
89                              'aboutauthor'  => $text,                              'aboutauthor'  => $text,
90                              # 'aboutauthor'  => $stem,                              'aboutauthor'  => $wplus,
91                              'desc'   => $text,                              'abouttranslator'  => $text,
92                                'abouttranslator'  => $wplus,
93                              'abstract' => $text,                              'abstract' => $text,
94                                'abstract' => $wplus,
95                              'author' => $text,                              'author' => $text,
96                              # 'author' => $sound,                              'chapter' => $text,
97                                'chapter' => $wplus,
98                              'colophon' => $text,                              'colophon' => $text,
99                                'colophon' => $wplus,
100                                'desc'   => $text,
101                                'desc'   => $wplus,
102                                'inx'   => $text,
103                                'inx'   => $wplus,
104                              'isbn'   => $isbn,                              'isbn'   => $isbn,
105                                'subtitle'  => $text,
106                                'subtitle'  => $wplus,
107                                'title'  => $text,
108                                'title'  => $wplus,
109                                'title_orig'  => $text,
110                                'title_orig'  => $wplus,
111                                'toc'   => $text,
112                                'toc'   => $wplus,
113                                'translator'  => $text,
114                                'translator'  => $wplus,
115                             ]                             ]
116                            );                            );
117  die "Couldn't create table $OPT{table}: $@\n" unless $tb;  die "Couldn't create table $OPT{table}: $@\n" unless $tb;
118    
119  my ($did, $value);  my ($did, $value);
120  binmode STDOUT, ":utf8";  binmode STDOUT, ":utf8";
121  while (($did, $value) = each %D) {  my $ALL;
122    my $traceALL = 0;
123    $traceALL = 1 if $OPT{debug}; # expensive
124    my $done = 0;
125    my $todo = keys %D;
126    my $lasttimeround = my $starttime = time;
127    
128    DOC: while (($did, $value) = each %D) {
129      my $superdebug = 0;
130      if ($superdebug && $OPT{debug}) {
131        next unless $did =~ /perltb/;
132      }
133      printf "%15s...(%d/%d)\n", $did, $done, $todo if $OPT{verbose};
134    my $record   = $layout->split($value);    my $record   = $layout->split($value);
135    my $headline = $record->{title};    my $headline = $record->{title};
136    $headline =~ s/\s+/ /sg;    $headline =~ s/\s+/ /sg;
137    printf "%15s %s\n", $record->{isbn}, substr($headline,0,60);    # printf "%15s %s\n", $record->{isbn}, substr($headline,0,60);
138      printf "%15s %s...\n", $did, substr($headline,0,60) if $OPT{verbose};
139      if ($superdebug && $OPT{debug}) {
140        # $record = { chapter => $record->{chapter}};
141      }
142      $done++;
143      next if oreilly_de_catalog::config::DONT_INDEX_JUST_PARSE() ;
144    $tb->insert('docid'  => $did,    $tb->insert('docid'  => $did,
145                headline => $headline,                headline => $headline,
146                %{$record});                %{$record});
147      my $spenttime = time - $starttime;
148      my $averagetime = $spenttime/$done;
149      my $left = $todo-$done;
150      printf("%2d secs, %3d done, %3d left, %4d s done, %5.1f s avg, %4d s left\n",
151                   time - $lasttimeround,
152                   $done,
153                   $left,
154                   $spenttime,
155                   $averagetime,
156                   $left*$averagetime,
157                  ) if $OPT{verbose};
158      $lasttimeround = time;
159      if ($traceALL) { # costs a lot when reaching the 100th file or so
160        $ALL->{$did} = $record;
161        open F, ">:utf8", "$OPT{dir}/$OPT{database}-$jobid/debug.dump" or die;
162        print F Data::Dumper::Dumper($ALL);
163        close F  or die "Couldn't close debug.dump: $!";;
164      }
165  }  }
166    undef $ALL;
167  $tb->set(top=>1);  $tb->set(top=>1);
168    
169  my $tritb = $db->create_table(  my $tritb = $db->create_table(
# Line 118  for my $f ($tb->fields) { Line 179  for my $f ($tb->fields) {
179    my(@idx) = @{$tb->table->{inverted}{$f} || []};    my(@idx) = @{$tb->table->{inverted}{$f} || []};
180    for my $idx (@idx) {    for my $idx (@idx) {
181      my $name = $idx->name;      my $name = $idx->name;
182      next if $name =~ /(_|\b)(Stem|Soundex)(\b|_)/; # irrelevant for alternatives      next if $name =~ /(_|\b)(mixedonly|Stem|Soundex)(\b|_)/;
183                  # irrelevant for alternatives
184      my @keys = $idx->keys;      my @keys = $idx->keys;
185      @dict{@keys} = ();      @dict{@keys} = ();
186    }    }
# Line 134  for my $headline (@dictkeys) { Line 196  for my $headline (@dictkeys) {
196    $tritb->insert(docid => $headline, headline => $headline);    $tritb->insert(docid => $headline, headline => $headline);
197  }  }
198  $tritb->set(top=>1);  $tritb->set(top=>1);
199  $tritb->close;  $tritb->close or die "Couldn't close table: $!";
200  $tb->close();  $tb->close() or die "Couldn't close table: $!";
201  $db->close();  $db->close() or die "Couldn't close database: $!";
202    
203  # Now we have a new database with a very long name and we want that  # Atomically relinking symlink: now we have a new database with a very
204  # database to be accessible with the $OPT{database} name  # long name "$OPT{database}-$jobid" (e.g.
205    # oreilly_de_catalog-2002-01-28_16:12_16467) and we want that database
206    # to be accessible with the oreilly_de_catalog name.
207    
208  use File::Spec;  use File::Spec;
209  my $long_dir   = "$OPT{database}-$jobid";  chdir $OPT{dir} or die;
210  my $want_dir   = File::Spec->catdir($OPT{dir}, $OPT{database});  my $dir    = "$OPT{database}-$jobid";
211  my $prel_slink = File::Spec->catdir($OPT{dir}, "$OPT{database}-$$");  my $slwant = File::Spec->catdir($OPT{dir}, $OPT{database});
212  unlink $prel_slink; # may fail  my $sltmp  = File::Spec->catdir($OPT{dir}, "$OPT{database}-$$");
213  symlink $long_dir, $prel_slink or die "Could not symlink $long_dir, $prel_slink: $!";  unlink $sltmp; # may fail
214  rename $prel_slink, $want_dir or die "Could not rename $prel_slink, $want_dir: $!";  symlink $dir, $sltmp or die "Couldn't symlink $dir, $sltmp: $!";
215    rename $sltmp, $slwant or die "Couldn't rename $sltmp, $slwant: $!";
216  system("chmod 777 $want_dir/*/read")==0 or die;  print "$slwant now points to $dir\n" if $OPT{verbose};
217    system("chmod 777 $slwant/*/read")==0 or die;
218    
219    opendir DIR, "." or die "Could not opendir .: $!";
220    for my $dirent (readdir DIR) {
221      next if $dirent =~ /^\./;
222      next unless $dirent =~ /^$OPT{database}(.*)/;
223      my $ext = $1 or next;
224      next unless -M $dirent > 4;
225      warn "removing old index $dirent";
226      File::Path::rmtree($dirent);
227    }
228    closedir DIR;
229    
230  $WAIT::Config = $WAIT::Config; # make perl -w happy  $WAIT::Config = $WAIT::Config; # make perl -w happy
231    
# Line 190  directory specified during configuration Line 266  directory specified during configuration
266    
267  Specify an alternate table name. Default is C<ora>.  Specify an alternate table name. Default is C<ora>.
268    
269    =back
270    
271  =head1 AUTHOR  =head1 AUTHOR
272    
273  Ulrich Pfeifer E<lt>F<pfeifer@wait.de>E<gt>  Ulrich Pfeifer E<lt>F<pfeifer@wait.de>E<gt>

Legend:
Removed from v.72  
changed lines
  Added in v.109

  ViewVC Help
Powered by ViewVC 1.1.26