/[wait]/branches/unido/script/smakewhatis
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /branches/unido/script/smakewhatis

Parent Directory Parent Directory | Revision Log Revision Log


Revision 106 - (show annotations)
Tue Jul 13 12:22:09 2004 UTC (19 years, 9 months ago) by dpavlin
File size: 9119 byte(s)
Changes made by Andreas J. Koenig <andreas.koenig(at)anima.de> for Unido project

1 #!/usr/bin/perl -w
2 ######################### -*- Mode: Cperl -*- #########################
3 ##
4 ## $Basename: smakewhatis $
5 ## $Revision: 1.11 $
6 ##
7 ## Author : Ulrich Pfeifer
8 ## Created On : Mon Sep 2 12:57:12 1996
9 ##
10 ## Last Modified By : Ulrich Pfeifer
11 ## Last Modified On : Tue May 9 08:52:03 2000
12 ##
13 ## Copyright (c) 1996-1997, Ulrich Pfeifer
14 ##
15 ##
16 ######################################################################
17
18 use strict;
19
20 use FileHandle;
21 use File::Path;
22 use DB_File;
23 use Getopt::Long;
24
25 require WAIT::Database;
26 require WAIT::Config;
27 require WAIT::Parse::Nroff;
28 require WAIT::Document::Nroff;
29
30
31 my %OPT = (database => 'DB',
32 dir => $WAIT::Config->{WAIT_home} || '/tmp',
33 table => 'man',
34 clean => 0,
35 remove => 0,
36 );
37
38 GetOptions(\%OPT,
39 'database=s',
40 'dir=s',
41 'table=s',
42 'clean!',
43 'remove',
44 ) || die "Usage: ...\n";
45
46 if ($OPT{clean}) {
47 if (-d "$OPT{dir}/$OPT{database}") {
48 eval {
49 my $tmp = WAIT::Database->open(name => $OPT{database},
50 'directory' => $OPT{dir})
51 or die "Could not open table $OPT{table}: $@";
52 my $tbl = $tmp->table(name => $OPT{table});
53 $tbl->drop if $tbl;
54 $tmp->close;
55 rmtree("$OPT{dir}/$OPT{database}/$OPT{table}",1,1)
56 if -d "$OPT{dir}/$OPT{database}/$OPT{table}";
57 };
58 die $@ if $@;
59 } else {
60 die "Database $OPT{dir}/$OPT{database} doesn't exist,
61 nothing to clean, nothing done.\n";
62 }
63 exit;
64 }
65
66 my $db = WAIT::Database->open(
67 name => $OPT{database},
68 'directory' => $OPT{dir},
69 )
70 ||
71 WAIT::Database->create(
72 name => $OPT{database},
73 'directory' => $OPT{dir},
74 );
75 unless ($db) {
76 require Carp;
77 Carp::croak("Could not open/create database '$OPT{dir}/$OPT{database}': $@");
78 }
79
80
81 # We need a class that allows the index to access each document.
82 # Remember, all documents in this collection are values of a single
83 # tied hash. An especially cool feature is that the tie may return the
84 # whole document as a single string or as an object or anything that
85 # fits into a scalar. WAIT::Document::Nroff illustrates how the tieing
86 # class can work. See WAIT::Table for a manpage (W:D:Nroff has none).
87
88 my %D;
89 my $access = tie %D, 'WAIT::Document::Nroff', 'nroff -man';
90 die $@ unless defined $access;
91
92 # While WAIT::Document::Nroff ignored the contents of the scalar it
93 # accessed, WAIT::Parse::Nroff knows how to understand it. So bear in
94 # mind:
95
96 # access => Document
97 # layout => Parse
98
99 # The access to a document is provided by a Document class just as
100 # the layout of a document is provided by a Parser class. Makes sense?
101
102 my $layout= WAIT::Parse::Nroff->new;
103
104 # The definition of filters is something that will be tought in the
105 # advanced techniques course. For now, just copy and paste the
106 # something from here and try out alternatives.
107 my $stem = [{
108 'prefix' => ['unroff', 'isotr', 'isolc'],
109 'intervall' => ['unroff', 'isotr', 'isolc'],
110 },'unroff', 'isolc', 'stop', 'isotr', 'split2', 'Stem'];
111 # unroff it as the first because nroff markup isn't very helpful for
112 # indexing, turn into lowercase, eliminate the stopwords before isotr
113 # because our stopwords contain ticks (isn't, i'm, wouldn't, etc.),
114 # replace line noise ith space, eliminate anything left with less than
115 # 2 letters, find the word's stem.
116 my $text = [{
117 'prefix' => ['unroff', 'isotr', 'isolc'],
118 'intervall' => ['unroff', 'isotr', 'isolc'],
119 },
120 'unroff', 'isolc', 'stop', 'isotr', 'split2'];
121 my $sound = ['unroff', 'isotr', 'isolc', 'split2', 'Soundex'],;
122
123 my $tb;
124 eval { $tb = $db->table(name => $OPT{table}) };
125 $tb ||=
126 $db->create_table
127 (
128
129 name => $OPT{table},
130 # mandatory argument like a tablename in a relational database
131
132 access => $access,
133 # see above
134
135 layout => $layout,
136 # see above
137
138 attr => ['docid', 'headline', 'size'],
139 # the attr argument determines which attributes WAIT will store for
140 # us for later retrieval. A docid is a must, of course, so that we
141 # can retrieve the document later. The more attributes you name
142 # here, the bigger gets the database. For your first experiences it
143 # is highly recommended to have the two items C<docid> and
144 # C<headline> here, so that you can use sman for debugging as soon
145 # as you are through smakewhatis. In the sman program these two
146 # column names are hardcoded. You have the opportunity to create
147 # the two attributes for every record in the Layout/Parser class
148
149 keyset => [['docid']],
150 # which keys are necessary to unambiguously identify a record and
151 # access it through $access?
152
153 invindex =>
154 [
155 'name' => $stem,
156 'synopsis' => $stem,
157 'bugs' => $stem,
158 'description' => $stem,
159 'text' => $stem,
160 'environment' => $text,
161 'example' => $text, 'example' => $stem,
162 'author' => $sound, 'author' => $stem,
163 ]
164 # without this argument, WAIT will be able to run a pass through
165 # the indexer but it won't do anything useful. This argument is the
166 # heart of your indexing task and the place where you will start
167 # tuning once your indexes are working. For the impatent user, it's
168 # recommended to just have them all be text.
169
170 );
171
172 die unless $tb;
173
174 my @DIRS;
175 if (@ARGV) {
176 @DIRS = @ARGV;
177 } else {
178 @DIRS = @{$WAIT::Config->{manpath}};
179 }
180
181 $tb->set(top=>1);
182 my $mandir;
183 for $mandir (grep -d $_, @DIRS) {
184 opendir(DIR, $mandir) or warn "Could not open dir '$mandir': $!";
185 my @mdir = grep -d "$mandir/$_", grep /^man/, readdir(DIR);
186 closedir DIR;
187 my $section;
188 for $section (@mdir) {
189 my $file;
190 print STDERR "Scanning '$mandir/$section' ...\n";
191 opendir(DIR, "$mandir/$section")
192 or warn "Could not open dir '$mandir/section': $!";
193 my @files = grep -f "$mandir/$section/$_", grep $_ !~ /^\./, readdir(DIR);
194 closedir DIR;
195 for $file ( @files ) {
196 print STDERR "Indexing '$mandir/$section/$file' ... ";
197 &index("$mandir/$section/$file");
198 }
199 }
200 }
201 my $now = time;
202 warn "Starting reorg\n";
203 $tb->set(top=>1);
204 warn sprintf "Finished reorg %d seconds\n", time - $now;
205
206 # Do not forget to close the database after the extreme job you just finished.
207
208 $db->close();
209 exit;
210
211 # Now that you have created a database, lean back. To verify that it
212 # sort of worked and to understand what you actually did, I'd
213 # recommend to run sman through the debugger. Sman has options to
214 # choose databases and tables unrelated to its original task. You can
215 # run e.g.
216
217 # perl -Sd sman -dir /usr/local/yourwaitdir -database yourdatabase -table yourtable
218
219 # Step through the debugger to the place where a query object is
220 # created. Expect huge, self-referential datastrucures if you dump any
221 # of these object with the x command. It's quite instructive to watch
222 # the debugger print them for several minutes or hours.
223
224 # Once you have established a working querying with sman, you will
225 # want to write your own sman.
226
227 my $NO;
228 sub index {
229 my $did = shift;
230
231 if ($tb->have('docid' => $did)) {
232 #die "$@" if $2 ne '';
233 if (!$OPT{remove}) {
234 print "duplicate\n";
235 return;
236 }
237 } elsif ($OPT{remove}) {
238 print "missing\n";
239 return;
240 }
241
242 if (-s $did < 100) {
243 print "too small\n";
244 return;
245 }
246
247 my $value = $D{$did};
248 unless (defined $value) {
249 print "unavailable\n";
250 }
251 printf STDERR "ok [%d]\n", ++$NO;
252
253 my $record = $layout->split($value);
254 $record->{size} = length($value);
255 my $headline = $record->{name} || $did;
256 $headline =~ s/\s+/ /g; $headline =~ s/^\s+//;
257 printf "%s\n", substr($headline,0,80);
258 if ($OPT{remove}) {
259 $tb->delete('docid' => $did, headline => $headline, %{$record});
260 } else {
261 $tb->insert('docid' => $did, headline => $headline, %{$record});
262 }
263 }
264
265
266 __END__
267 ## ###################################################################
268 ## pod
269 ## ###################################################################
270
271 =head1 NAME
272
273 smakewhatis - generate a manual database for sman
274
275 =head1 SYNOPSIS
276
277 B<smakewhatis>
278 [B<-database> I<database name>]
279 [B<-dir> I<database directory>]
280 [B<-table> I<name>]
281 [B<-remove>]
282 [I<mandir> ...]
283
284 =head1 DESCRIPTION
285
286 B<Smakewhatis> generates/updates databases for B<sman>(1). If
287 I<mandir>s are specified, these are used. Otherwise the confiigured
288 default directories are indexed.
289
290 =head2 OPTIONS
291
292 =over 10
293
294 =item B<-database> I<database name>
295
296 Change the default database name to I<database name>.
297
298 =item B<-dir> I<database directory>
299
300 Change the default database directory to I<database directory>.
301
302 =item B<-table> I<name>
303
304 Use I<name> instead of C<man> as table name.
305
306 =item B<-clean>
307
308 Clean B<database> before indexing.
309
310 =item B<-remove>
311
312 Remove the selected directories from the database instead of
313 adding/updating. This works only for the manuals which are unchanged
314 since the indexing.
315
316 =head1 SEE ALSO
317
318 L<sman>.
319
320 =head1 AUTHOR
321
322 Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>

Properties

Name Value
svn:executable

  ViewVC Help
Powered by ViewVC 1.1.26