/[wait]/cvs-head/script/index_ora
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /cvs-head/script/index_ora

Parent Directory Parent Directory | Revision Log Revision Log


Revision 72 - (show annotations)
Mon Jan 28 21:35:39 2002 UTC (22 years, 3 months ago) by laperla
File size: 5704 byte(s)
- erste Demoinstallation fuer oreilly fertig

1 #!/usr/bin/perl -w
2 # -*- Mode: Perl -*-
3 # $Basename$
4 # $Revision: 1.9 $
5 # Author : Ulrich Pfeifer
6 # Created On : Mon Dec 31 13:57:11 2001
7 # Last Modified By: Ulrich Pfeifer
8 # Last Modified On: Fri Jan 4 15:59:20 2002
9 # Language : CPerl
10 #
11 # (C) Copyright 2001, UUNET Deutschland GmbH, Germany
12 #
13
14 use 5.007;
15
16 use strict;
17 use Devel::Peek qw(Dump);
18
19 use File::Path;
20 use DB_File;
21 use Getopt::Long;
22 use Cwd;
23
24 BEGIN {require WAIT::Config;}
25 use WAIT::Database;
26 use WAIT::Parse::Ora;
27 use WAIT::Document::Ora;
28 use WAIT::InvertedIndex;
29
30
# Cache-size hint handed to DB_File for every B-tree opened from here on.
$DB_BTREE->{cachesize} = 200_000;

# Defaults for the command-line options; GetOptions overwrites the entries
# the caller supplies.  The database directory falls back to /tmp when the
# WAIT configuration does not provide WAIT_home.
my %OPT = (
    database => 'DB',
    dir      => ($WAIT::Config->{WAIT_home} || '/tmp'),
    table    => 'ora',
);

# All three options take a mandatory string argument.
GetOptions(\%OPT, qw(database=s dir=s table=s))
    or die "Usage: ...\n";
44
# Build a per-run job id of the form YYYY-MM-DD_HH:MM_<pid> so that every
# indexing run creates its database under a fresh name.
my ($minute, $hour, $mday, $month, $year) = (localtime)[1 .. 5];
my $jobid = sprintf "%04s-%02s-%02s_%02s:%02s_%d",
    $year + 1900, $month + 1, $mday, $hour, $minute, $$;

# The new database is named "<database>-<jobid>" inside the chosen directory.
my $db = WAIT::Database->create(
    name      => "$OPT{database}-$jobid",
    directory => $OPT{dir},
);
$db or die "Could not create database $OPT{database}: $@\n";
52
# Parser object used both as the table layout and to split each document
# into its fields.  Direct method-call syntax is used instead of the
# ambiguous indirect-object form "new WAIT::Parse::Ora".
my $layout = WAIT::Parse::Ora->new;

# NOTE(review): presumably this handler module registers the OR_* filters
# named below -- confirm against oreilly_de_catalog::wait_handler.
use lib "/usr/local/apache/lib";
use oreilly_de_catalog::wait_handler;

# Filter pipelines, each given as a list of filter names.  $stem and $sound
# are currently referenced only by commented-out index definitions.
my $stem = ['OR_tr_20020124', 'OR_lc_20020124', 'split2', 'stop', 'Stem'];

# The leading hash reference supplies separate filter lists under the keys
# 'prefix' and 'intervall'.
# NOTE(review): 'intervall' is presumably the spelling WAIT expects -- do not
# "correct" it without checking WAIT::InvertedIndex.
my $text = [
    {
        'prefix'    => ['OR_tr_20020124', 'OR_lc_20020124'],
        'intervall' => ['OR_tr_20020124', 'OR_lc_20020124'],
    },
    'OR_tr_20020124', 'OR_lc_20020124', 'split2', 'stop',
];
my $sound = ['OR_tr_20020124', 'OR_lc_20020124', 'split2', 'Soundex'];
my $trigr = ['OR_lc_20020124', 'OR_trigrams_20020125'];
my $isbn  = ['split6', 'OR_isbn_20020127'];
67
my $cwd = cwd;

# Tie the document collection named on the command line to %D; the tie
# object is also handed to the table as its access object.
my %D;
my $access = tie(%D, 'WAIT::Document::Ora', @ARGV)
    or die "Couldn't tie to file: $!\n";

# Field => filter-pipeline pairs for the inverted indexes of the main table.
# The $stem / $sound variants are kept as disabled alternatives.
my @inverted_indexes = (
    'title'       => $text,
    # 'title'       => $stem,
    'aboutauthor' => $text,
    # 'aboutauthor' => $stem,
    'desc'        => $text,
    'abstract'    => $text,
    'author'      => $text,
    # 'author'      => $sound,
    'colophon'    => $text,
    'isbn'        => $isbn,
);

my $tb = $db->create_table(
    name     => $OPT{table},
    attr     => [qw(author isbn title headline docid)],
    layout   => $layout,
    access   => $access,
    invindex => \@inverted_indexes,
);
$tb or die "Couldn't create table $OPT{table}: $@\n";
94
# Progress lines may contain non-ASCII titles, so emit them as UTF-8.
binmode STDOUT, ":utf8";

# Walk every document in the tied collection, split it into fields and
# insert it into the main table.
while (my ($did, $value) = each %D) {
    my $record = $layout->split($value);

    # The headline is the title with every whitespace run collapsed to one
    # space.
    (my $headline = $record->{title}) =~ s/\s+/ /sg;

    # One progress line per document: ISBN plus the first 60 headline chars.
    printf "%15s %s\n", $record->{isbn}, substr($headline, 0, 60);

    # The record's own fields come last, so they win over the docid and
    # headline given here if the record carries keys of the same name.
    $tb->insert(
        docid    => $did,
        headline => $headline,
        %$record,
    );
}
$tb->set(top => 1);
107
# Second table "<table>_fallback": a trigram index over the words already
# indexed in the main table.
my $tritb = $db->create_table(
    name     => "$OPT{table}_fallback",
    attr     => [qw(docid headline)],    # name "headline" only for sman
    invindex => [ headline => $trigr ],
);
# Fail here with a clear message, as is done for the main table, instead of
# later with a method call on an undefined value.
die "Couldn't create table $OPT{table}_fallback: $@\n" unless $tritb;

# Collect the distinct keys of every inverted index of the main table,
# skipping the Stem and Soundex indexes.
my %dict;
for my $f ($tb->fields) {
    my @idx = @{ $tb->table->{inverted}{$f} || [] };
    for my $idx (@idx) {
        my $name = $idx->name;
        next if $name =~ /(_|\b)(Stem|Soundex)(\b|_)/; # irrelevant for alternatives
        my @keys = $idx->keys;
        @dict{@keys} = ();
    }
}

# Keep only keys that start with "p", with that prefix removed.
# NOTE(review): the meaning of the "p" prefix is defined by
# WAIT::InvertedIndex -- confirm there.
my @dictkeys;
for my $key (keys %dict) {
    push @dictkeys, $key if $key =~ s/^p//;
}

# Debugging aid: dump the internals of at most five words that contain
# characters outside the range \040-\177.
my $maxdebug = 5;
for my $headline (@dictkeys) {
    if ($maxdebug && $headline =~ /[^\040-\177]/) {
        Dump $headline;
        $maxdebug--;
    }
    # printf "%s\n", substr($headline,0,60);

    # Each word serves as both document id and indexed text.
    $tritb->insert(docid => $headline, headline => $headline);
}
$tritb->set(top => 1);
$tritb->close;
$tb->close();
$db->close();
140
# Now we have a new database with a very long name and we want that
# database to be accessible with the $OPT{database} name

use File::Spec;
my $long_dir   = "$OPT{database}-$jobid";
my $want_dir   = File::Spec->catdir($OPT{dir}, $OPT{database});
my $prel_slink = File::Spec->catdir($OPT{dir}, "$OPT{database}-$$");

# Create the symlink under a temporary, pid-specific name and then rename
# it onto the wanted name.
# NOTE(review): the link target is relative, so this assumes the database
# was created as a direct child of $OPT{dir} -- confirm against
# WAIT::Database->create.
unlink $prel_slink;    # may fail
symlink $long_dir, $prel_slink
    or die "Could not symlink $long_dir, $prel_slink: $!";
rename $prel_slink, $want_dir
    or die "Could not rename $prel_slink, $want_dir: $!";

# Make <want_dir>/*/read world-accessible.  This is done without a shell so
# that spaces or metacharacters in --dir / --database can neither break the
# command nor be interpreted.  Like the shell's "*", entries starting with a
# dot are skipped.
opendir my $dh, $want_dir
    or die "Could not open directory $want_dir: $!";
my @read_paths =
    grep { -e $_ || -l $_ }
    map  { File::Spec->catfile($want_dir, $_, 'read') }
    grep { !/^\./ } readdir $dh;
closedir $dh;

@read_paths
    or die "Could not chmod 777 $want_dir/*/read: no such entries\n";
chmod(0777, @read_paths) == @read_paths
    or die "Could not chmod 777 $want_dir/*/read: $!";

$WAIT::Config = $WAIT::Config; # make perl -w happy
155
156
157 __END__
158 ## ###################################################################
159 ## pod
160 ## ###################################################################
161
162 =head1 NAME
163
164 index_ora - generate a WAIT index for the O'Reilly catalog
165
166 =head1 SYNOPSIS
167
168 B<index_ora>
169 [B<-database> I<dbname>]
170 [B<-dir> I<directory>]
171 [B<-table> I<table name>]
172 I<directory>
173
174 =head1 DESCRIPTION
175
176 =head1 OPTIONS
177
178 =over 5
179
180 =item B<-database> I<dbname>
181
182 Specify database name. Default is F<DB>.
183
184 =item B<-dir> I<directory>
185
186 Alternate directory where databases are located. Default is the
187 directory specified during configuration of WAIT.
188
189 =item B<-table> I<table name>
190
191 Specify an alternate table name. Default is C<ora>.

=back
192
193 =head1 AUTHOR
194
195 Ulrich Pfeifer E<lt>F<pfeifer@wait.de>E<gt>
196

Properties

Name Value
cvs2svn:cvs-rev 1.9

  ViewVC Help
Powered by ViewVC 1.1.26