/[wait]/trunk/script/index_ora
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/script/index_ora

Parent Directory Parent Directory | Revision Log Revision Log


Revision 73 - (hide annotations)
Tue Mar 5 13:40:38 2002 UTC (22 years, 2 months ago) by laperla
Original Path: cvs-head/script/index_ora
File size: 6281 byte(s)
- Just a snapshot before we rewrite the indexer since the text is
  marked up better today.

1 ulpfr 55 #!/usr/bin/perl -w
2     # -*- Mode: Perl -*-
3     # $Basename$
4 laperla 73 # $Revision: 1.10 $
5 ulpfr 55 # Author : Ulrich Pfeifer
6     # Created On : Mon Dec 31 13:57:11 2001
7     # Last Modified By: Ulrich Pfeifer
8 ulpfr 62 # Last Modified On: Fri Jan 4 15:59:20 2002
9 ulpfr 55 # Language : CPerl
10     #
11 laperla 73 # (C) Copyright 2001, Ulrich Pfeifer
12 ulpfr 55 #
13    
14 laperla 69 use 5.007;
15    
16 ulpfr 55 use strict;
17 laperla 71 use Devel::Peek qw(Dump);
18 laperla 69
19 ulpfr 55 use File::Path;
20     use DB_File;
21     use Getopt::Long;
22     use Cwd;
23    
24 laperla 69 BEGIN {require WAIT::Config;}
25     use WAIT::Database;
26     use WAIT::Parse::Ora;
27     use WAIT::Document::Ora;
28     use WAIT::InvertedIndex;
29 ulpfr 55
30    
# Set the cache size on DB_File's shared $DB_BTREE info object (DB_File
# documents this value as a size in bytes).
31     $DB_BTREE->{'cachesize'} = 200_000 ;
32    
# Default settings; each key can be overridden by the command-line option
# of the same name parsed below.
33 laperla 67 my %OPT = (
34 laperla 73 database => 'oreilly_de_catalog',
35     dir => '/usr/local/apache/data',
36 ulpfr 55 table => 'ora',
37     );
38    
# Parse the string-valued options database, dir and table into %OPT.
# GetOptions returns false on a malformed command line, in which case the
# script dies with the usage message.
39     GetOptions(\%OPT,
40     'database=s',
41     'dir=s',
42     'table=s',
43     ) || die "Usage: ...\n";
44    
# Build a job id of the form YYYY-MM-DD_HH:MM_<pid> from the local time and
# the process id, then create a fresh database named "<database>-<jobid>"
# inside $OPT{dir}.
45 laperla 68 my @localtime = localtime;
# localtime returns the year as an offset from 1900 and the month as 0..11.
46     $localtime[5] += 1900;
47     $localtime[4]++;
# Slice order 5,4,3,2,1 is year, month, day of month, hour, minute.
48     my $jobid = sprintf "%04s-%02s-%02s_%02s:%02s_%d", @localtime[5,4,3,2,1], $$;
49     my $db = WAIT::Database->create(name => "$OPT{database}-$jobid",
50     directory => $OPT{dir})
51     or die "Could not create database $OPT{database}: $@\n";
52 ulpfr 55
# Parser object; it is passed to create_table as the table layout and is
# also used to split each document into a record in the insert loop.
53     my $layout = new WAIT::Parse::Ora;
54    
# NOTE(review): presumably this site-local module defines the OR_* filters
# named in the pipelines below -- confirm against the module itself.
55 laperla 69 use lib "/usr/local/apache/lib";
56     use oreilly_de_catalog::wait_handler;
57    
# Filter pipelines, each an array ref of filter names, used as the values
# in the invindex lists handed to create_table. $text additionally starts
# with a hash that gives separate filter lists for 'prefix' and 'intervall'.
# $stem and $sound are only referenced from commented-out invindex entries
# in this script.
58     my $stem = ['OR_tr_20020124', 'OR_lc_20020124', 'split2', 'stop', 'Stem'];
59 ulpfr 55 my $text = [{
60 laperla 69 'prefix' => ['OR_tr_20020124', 'OR_lc_20020124'],
61     'intervall' => ['OR_tr_20020124', 'OR_lc_20020124'],
62 ulpfr 55 },
63 laperla 69 'OR_tr_20020124', 'OR_lc_20020124', 'split2', 'stop'];
64 laperla 73 my $wplus = ['split2', 'OR_lc_20020124', 'OR_mixedonly_20020221'];
65 laperla 71 my $sound = ['OR_tr_20020124', 'OR_lc_20020124', 'split2', 'Soundex'];
66     my $trigr = ['OR_lc_20020124', 'OR_trigrams_20020125'];
67 laperla 73 # split6 is better than split13 or split10: it allows them to enter
68     # shorter sequences when searching.
69 laperla 72 my $isbn = ['split6', 'OR_isbn_20020127'];
70 ulpfr 55
# NOTE(review): $cwd is not read anywhere else in the visible script.
71     my $cwd = cwd;
72    
# The first argument left after option parsing is the document root.
73 laperla 73 my $droot = shift or die "Usage: $0 <options> Document-Rootdirectories";
74    
# Tie %D to the document root. tie returns the underlying object, which is
# kept in $access and later passed to create_table as the access object.
75 ulpfr 55 my %D;
76 laperla 73 my $access = tie %D, 'WAIT::Document::Ora', $droot,
77     or die "Couldn't tie to dir $droot: $!\n";
78 ulpfr 55
# Create the main table named $OPT{table}. invindex is a flat list of
# field => filter-pipeline pairs; the same field may be listed more than
# once with different pipelines (presumably one inverted index per pair --
# confirm in WAIT::Table). Entries for $stem and $sound are commented out.
79     my $tb = $db->create_table(name => $OPT{table},
80 ulpfr 62 attr => ['author', 'isbn', 'title',
81 ulpfr 55 'headline', 'docid'],
82     layout => $layout,
83     access => $access,
84     invindex =>
85     [
86 laperla 72 'title' => $text,
87 laperla 73 'title' => $wplus,
88 laperla 72 # 'title' => $stem,
89     'aboutauthor' => $text,
90 laperla 73 'aboutauthor' => $wplus,
91 laperla 72 # 'aboutauthor' => $stem,
92     'desc' => $text,
93 laperla 73 'desc' => $wplus,
94 laperla 72 'abstract' => $text,
95 laperla 73 'abstract' => $wplus,
96 ulpfr 55 'author' => $text,
97 laperla 72 # 'author' => $sound,
98 laperla 65 'colophon' => $text,
99 laperla 73 'colophon' => $wplus,
100 laperla 72 'isbn' => $isbn,
101 ulpfr 55 ]
102     );
# Abort if create_table returned a false value.
103     die "Couldn't create table $OPT{table}: $@\n" unless $tb;
104    
# Walk every (document id, content) pair of the tied hash, split the
# content into a record with the layout parser, and insert it into the
# main table. Progress is printed to STDOUT with the :utf8 layer enabled.
105     my ($did, $value);
106 laperla 69 binmode STDOUT, ":utf8";
107 ulpfr 55 while (($did, $value) = each %D) {
108     my $record = $layout->split($value);
# The headline is the title with every run of whitespace collapsed to a
# single space.
109     my $headline = $record->{title};
110     $headline =~ s/\s+/ /sg;
# Progress line: the ISBN right-aligned in 15 columns, then the first 60
# characters of the headline.
111 ulpfr 62 printf "%15s %s\n", $record->{isbn}, substr($headline,0,60);
112 ulpfr 55 $tb->insert('docid' => $did,
113     headline => $headline,
114     %{$record});
115     }
# NOTE(review): the meaning of set(top=>1) is defined by the table class
# and is not visible in this script.
116     $tb->set(top=>1);
117 laperla 71
# Build a second table "<table>_fallback" whose rows are index keys taken
# from the main table, with the headline attribute indexed through the
# $trigr pipeline.
# NOTE(review): unlike the main table, the result of this create_table
# call is not checked before use.
118     my $tritb = $db->create_table(
119     name => "$OPT{table}_fallback",
120 laperla 72 attr => [qw(docid headline)], # name
121     # "headline"
122     # only for
123     # sman
124 laperla 71 invindex => [ headline => $trigr ],
125     );
# Collect the keys of every inverted index of every field of the main
# table into %dict; using hash keys removes duplicates.
126     my %dict;
127     for my $f ($tb->fields) {
128     my(@idx) = @{$tb->table->{inverted}{$f} || []};
129     for my $idx (@idx) {
130 laperla 72 my $name = $idx->name;
131 laperla 73 next if $name =~ /(_|\b)(mixedonly|Stem|Soundex)(\b|_)/;
132     # irrelevant for alternatives
133 laperla 71 my @keys = $idx->keys;
134     @dict{@keys} = ();
135     }
136     }
# s/^p// is true only for keys that start with "p", so grep keeps exactly
# those keys, with that leading "p" removed.
# NOTE(review): presumably "p" is a key-type prefix used by the inverted
# index -- confirm in WAIT::InvertedIndex.
137     my @dictkeys = grep s/^p//, keys %dict;
# Debug aid: Dump (Devel::Peek) at most 5 keys that contain a character
# outside the octal range 040-177.
138     my $maxdebug = 5;
139     for my $headline (@dictkeys) {
140     if ($maxdebug && $headline =~ /[^\040-\177]/) {
141     Dump $headline;
142     $maxdebug--;
143     }
144 laperla 72 # printf "%s\n", substr($headline,0,60);
# Each key is stored as both the docid and the headline of its row.
145 laperla 71 $tritb->insert(docid => $headline, headline => $headline);
146     }
147     $tritb->set(top=>1);
# Close both tables and then the database.
148     $tritb->close;
149 ulpfr 55 $tb->close();
150     $db->close();
151    
152 laperla 73 # Atomically relinking symlink: now we have a new database with a very
153     # long name like oreilly_de_catalog-2002-01-28_16:12_16467 and we want
154     # that database to be accessible with the oreilly_de_catalog name.
155 laperla 68
# Point the stable name <dir>/<database> at the database just built.
# The link is first created under a temporary, pid-specific name and then
# renamed onto the wanted name, so the wanted name is switched by a single
# rename call.
156     use File::Spec;
# $dir is the bare database directory name, so the link target is relative
# to the directory that contains the link.
157 laperla 73 my $dir = "$OPT{database}-$jobid";
158     my $slwant = File::Spec->catdir($OPT{dir}, $OPT{database});
159     my $sltmp = File::Spec->catdir($OPT{dir}, "$OPT{database}-$$");
160     unlink $sltmp; # may fail
161     symlink $dir, $sltmp or die "Couldn't symlink $dir, $sltmp: $!";
162     rename $sltmp, $slwant or die "Couldn't rename $sltmp, $slwant: $!";
163     warn "$slwant now points to $dir";
# chmod is run through the shell so that the * wildcard gets expanded; the
# script dies without a message if the command exits non-zero.
# NOTE(review): mode 777 is world-writable and $slwant is interpolated
# unquoted into a shell command -- confirm both are intended.
164     system("chmod 777 $slwant/*/read")==0 or die;
165 laperla 68
166 ulpfr 55 $WAIT::Config = $WAIT::Config; # make perl -w happy
167    
168    
169     __END__
170     ## ###################################################################
171     ## pod
172     ## ###################################################################
173    
174     =head1 NAME
175    
176     index_ora - generate a WAIT index for the O'Reilly catalog
177    
178     =head1 SYNOPSIS
179    
180     B<index_ora>
181     [B<-database> I<dbname>]
182     [B<-dir> I<directory>]
183     [B<-table> I<table name>]
184     I<directory>
185    
186     =head1 DESCRIPTION
187    
188     =head1 OPTIONS
189    
190     =over 5
191    
192     =item B<-database> I<dbname>
193    
194     Specify the database name. Default is C<oreilly_de_catalog>.
195    
196     =item B<-dir> I<directory>
197    
198     Alternate directory where databases are located. Default is
199     F</usr/local/apache/data>.
200    
201     =item B<-table> I<table name>
202    
203     Specify an alternate table name. Default is C<ora>.
204    
205     =back
206    
207     =head1 AUTHOR
208    
209     Ulrich Pfeifer E<lt>F<pfeifer@wait.de>E<gt>
210    

Properties

Name Value
cvs2svn:cvs-rev 1.10

  ViewVC Help
Powered by ViewVC 1.1.26