/[wait]/trunk/script/smakewhatis
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/script/smakewhatis

Parent Directory Parent Directory | Revision Log Revision Log


Revision 109 - (show annotations)
Tue Jul 13 17:50:27 2004 UTC (19 years, 8 months ago) by dpavlin
File size: 9153 byte(s)
pod fixes

1 #!/usr/bin/perl -w
2 ######################### -*- Mode: Cperl -*- #########################
3 ##
4 ## $Basename: smakewhatis $
5 ## $Revision: 1.11 $
6 ##
7 ## Author : Ulrich Pfeifer
8 ## Created On : Mon Sep 2 12:57:12 1996
9 ##
10 ## Last Modified By : Ulrich Pfeifer
11 ## Last Modified On : Tue May 9 08:52:03 2000
12 ##
13 ## Copyright (c) 1996-1997, Ulrich Pfeifer
14 ##
15 ##
16 ######################################################################
17
18 use strict;
19
20 use FileHandle;
21 use File::Path;
22 use DB_File;
23 use Getopt::Long;
24
25 use lib '/data/wait/lib';
26
27 require WAIT::Database;
28 require WAIT::Config;
29 require WAIT::Parse::Nroff;
30 require WAIT::Document::Nroff;
31
32
33 my %OPT = (database => 'DB',
34 dir => $WAIT::Config->{WAIT_home} || '/tmp',
35 table => 'man',
36 clean => 0,
37 remove => 0,
38 );
39
40 GetOptions(\%OPT,
41 'database=s',
42 'dir=s',
43 'table=s',
44 'clean!',
45 'remove',
46 ) || die "Usage: ...\n";
47
48 if ($OPT{clean}) {
49 if (-d "$OPT{dir}/$OPT{database}") {
50 eval {
51 my $tmp = WAIT::Database->open(name => $OPT{database},
52 'directory' => $OPT{dir})
53 or die "Could not open table $OPT{table}: $@";
54 my $tbl = $tmp->table(name => $OPT{table});
55 $tbl->drop if $tbl;
56 $tmp->close;
57 rmtree("$OPT{dir}/$OPT{database}/$OPT{table}",1,1)
58 if -d "$OPT{dir}/$OPT{database}/$OPT{table}";
59 };
60 die $@ if $@;
61 } else {
62 die "Database $OPT{dir}/$OPT{database} doesn't exist,
63 nothing to clean, nothing done.\n";
64 }
65 exit;
66 }
67
68 my $db = WAIT::Database->open(
69 name => $OPT{database},
70 'directory' => $OPT{dir},
71 )
72 ||
73 WAIT::Database->create(
74 name => $OPT{database},
75 'directory' => $OPT{dir},
76 );
77 unless ($db) {
78 require Carp;
79 Carp::croak("Could not open/create database '$OPT{dir}/$OPT{database}': $@");
80 }
81
82
83 # We need a class that allows the index to access each document.
84 # Remember, all documents in this collection are values of a single
85 # tied hash. An especially cool feature is that the tie may return the
86 # whole document as a single string or as an object or anything that
87 # fits into a scalar. WAIT::Document::Nroff illustrates how the tieing
88 # class can work. See WAIT::Table for a manpage (W:D:Nroff has none).
89
90 my %D;
91 my $access = tie %D, 'WAIT::Document::Nroff', 'nroff -man';
92 die $@ unless defined $access;
93
94 # While WAIT::Document::Nroff ignored the contents of the scalar it
95 # accessed, WAIT::Parse::Nroff knows how to understand it. So bear in
96 # mind:
97
98 # access => Document
99 # layout => Parse
100
101 # The access to a document is provided by a Document class just as
102 # the layout of a document is provided by a Parser class. Makes sense?
103
104 my $layout= WAIT::Parse::Nroff->new;
105
106 # The definition of filters is something that will be tought in the
107 # advanced techniques course. For now, just copy and paste the
108 # something from here and try out alternatives.
109 my $stem = [{
110 'prefix' => ['unroff', 'isotr', 'isolc'],
111 'intervall' => ['unroff', 'isotr', 'isolc'],
112 },'unroff', 'isolc', 'stop', 'isotr', 'split2', 'Stem'];
113 # unroff it as the first because nroff markup isn't very helpful for
114 # indexing, turn into lowercase, eliminate the stopwords before isotr
115 # because our stopwords contain ticks (isn't, i'm, wouldn't, etc.),
116 # replace line noise ith space, eliminate anything left with less than
117 # 2 letters, find the word's stem.
118 my $text = [{
119 'prefix' => ['unroff', 'isotr', 'isolc'],
120 'intervall' => ['unroff', 'isotr', 'isolc'],
121 },
122 'unroff', 'isolc', 'stop', 'isotr', 'split2'];
123 my $sound = ['unroff', 'isotr', 'isolc', 'split2', 'Soundex'],;
124
125 my $tb;
126 eval { $tb = $db->table(name => $OPT{table}) };
127 $tb ||=
128 $db->create_table
129 (
130
131 name => $OPT{table},
132 # mandatory argument like a tablename in a relational database
133
134 access => $access,
135 # see above
136
137 layout => $layout,
138 # see above
139
140 attr => ['docid', 'headline', 'size'],
141 # the attr argument determines which attributes WAIT will store for
142 # us for later retrieval. A docid is a must, of course, so that we
143 # can retrieve the document later. The more attributes you name
144 # here, the bigger gets the database. For your first experiences it
145 # is highly recommended to have the two items C<docid> and
146 # C<headline> here, so that you can use sman for debugging as soon
147 # as you are through smakewhatis. In the sman program these two
148 # column names are hardcoded. You have the opportunity to create
149 # the two attributes for every record in the Layout/Parser class
150
151 keyset => [['docid']],
152 # which keys are necessary to unambiguously identify a record and
153 # access it through $access?
154
155 invindex =>
156 [
157 'name' => $stem,
158 'synopsis' => $stem,
159 'bugs' => $stem,
160 'description' => $stem,
161 'text' => $stem,
162 'environment' => $text,
163 'example' => $text, 'example' => $stem,
164 'author' => $sound, 'author' => $stem,
165 ]
166 # without this argument, WAIT will be able to run a pass through
167 # the indexer but it won't do anything useful. This argument is the
168 # heart of your indexing task and the place where you will start
169 # tuning once your indexes are working. For the impatent user, it's
170 # recommended to just have them all be text.
171
172 );
173
174 die unless $tb;
175
176 my @DIRS;
177 if (@ARGV) {
178 @DIRS = @ARGV;
179 } else {
180 @DIRS = @{$WAIT::Config->{manpath}};
181 }
182
183 $tb->set(top=>1);
184 my $mandir;
185 for $mandir (grep -d $_, @DIRS) {
186 opendir(DIR, $mandir) or warn "Could not open dir '$mandir': $!";
187 my @mdir = grep -d "$mandir/$_", grep /^man/, readdir(DIR);
188 closedir DIR;
189 my $section;
190 for $section (@mdir) {
191 my $file;
192 print STDERR "Scanning '$mandir/$section' ...\n";
193 opendir(DIR, "$mandir/$section")
194 or warn "Could not open dir '$mandir/section': $!";
195 my @files = grep -f "$mandir/$section/$_", grep $_ !~ /^\./, readdir(DIR);
196 closedir DIR;
197 for $file ( @files ) {
198 print STDERR "Indexing '$mandir/$section/$file' ... ";
199 &index("$mandir/$section/$file");
200 }
201 }
202 }
203 my $now = time;
204 warn "Starting reorg\n";
205 $tb->set(top=>1);
206 warn sprintf "Finished reorg %d seconds\n", time - $now;
207
208 # Do not forget to close the database after the extreme job you just finished.
209
210 $db->close();
211 exit;
212
213 # Now that you have created a database, lean back. To verify that it
214 # sort of worked and to understand what you actually did, I'd
215 # recommend to run sman through the debugger. Sman has options to
216 # choose databases and tables unrelated to its original task. You can
217 # run e.g.
218
219 # perl -Sd sman -dir /usr/local/yourwaitdir -database yourdatabase -table yourtable
220
221 # Step through the debugger to the place where a query object is
222 # created. Expect huge, self-referential datastrucures if you dump any
223 # of these object with the x command. It's quite instructive to watch
224 # the debugger print them for several minutes or hours.
225
226 # Once you have established a working querying with sman, you will
227 # want to write your own sman.
228
229 my $NO;
230 sub index {
231 my $did = shift;
232
233 if ($tb->have('docid' => $did)) {
234 #die "$@" if $2 ne '';
235 if (!$OPT{remove}) {
236 print "duplicate\n";
237 return;
238 }
239 } elsif ($OPT{remove}) {
240 print "missing\n";
241 return;
242 }
243
244 if (-s $did < 100) {
245 print "too small\n";
246 return;
247 }
248
249 my $value = $D{$did};
250 unless (defined $value) {
251 print "unavailable\n";
252 }
253 printf STDERR "ok [%d]\n", ++$NO;
254
255 my $record = $layout->split($value);
256 $record->{size} = length($value);
257 my $headline = $record->{name} || $did;
258 $headline =~ s/\s+/ /g; $headline =~ s/^\s+//;
259 printf "%s\n", substr($headline,0,80);
260 if ($OPT{remove}) {
261 $tb->delete('docid' => $did, headline => $headline, %{$record});
262 } else {
263 $tb->insert('docid' => $did, headline => $headline, %{$record});
264 }
265 }
266
267
268 __END__
269 ## ###################################################################
270 ## pod
271 ## ###################################################################
272
273 =head1 NAME
274
275 smakewhatis - generate a manual database for sman
276
277 =head1 SYNOPSIS
278
279 B<smakewhatis>
280 [B<-database> I<database name>]
281 [B<-dir> I<database directory>]
282 [B<-table> I<name>]
283 [B<-remove>]
284 [I<mandir> ...]
285
286 =head1 DESCRIPTION
287
288 B<Smakewhatis> generates/updates databases for B<sman>(1). If
289 I<mandir>s are specified, these are used. Otherwise the confiigured
290 default directories are indexed.
291
292 =head2 OPTIONS
293
294 =over 10
295
296 =item B<-database> I<database name>
297
298 Change the default database name to I<database name>.
299
300 =item B<-dir> I<database directory>
301
302 Change the default database directory to I<database directory>.
303
304 =item B<-table> I<name>
305
306 Use I<name> instead of C<man> as table name.
307
308 =item B<-clean>
309
310 Clean B<database> before indexing.
311
312 =item B<-remove>
313
314 Remove the selected directories from the database instead of
315 adding/updating. This works only for the manuals which are unchanged
316 since the indexing.
317
318 =back
319
320 =head1 SEE ALSO
321
322 L<sman>.
323
324 =head1 AUTHOR
325
326 Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>

Properties

Name Value
cvs2svn:cvs-rev 1.1.1.2
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26