1 |
dpavlin |
118 |
#!/usr/bin/perl -w |
2 |
ulpfr |
17 |
|
3 |
|
|
use strict; |
4 |
|
|
|
5 |
|
|
use Getopt::Long; |
6 |
dpavlin |
118 |
use Digest::MD5 qw(md5_hex); |
7 |
|
|
use Data::Dumper; |
8 |
ulpfr |
17 |
|
9 |
|
|
require WAIT::Database; |
10 |
|
|
require WAIT::Config; |
11 |
|
|
require WAIT::Parse::HTML; |
12 |
|
|
require WAIT::Document::Find; |
13 |
|
|
|
14 |
dpavlin |
118 |
use utf8; |
15 |
ulpfr |
17 |
|
16 |
dpavlin |
118 |
use lib '/data/wait/lib'; |
17 |
|
|
|
18 |
ulpfr |
17 |
my %OPT = (database => 'DB', |
19 |
|
|
dir => $WAIT::Config->{WAIT_home} || '/tmp', |
20 |
dpavlin |
118 |
table => 'html', |
21 |
ulpfr |
17 |
clean => 0, |
22 |
|
|
remove => 0, |
23 |
dpavlin |
118 |
force => 0, |
24 |
ulpfr |
17 |
); |
25 |
|
|
|
26 |
|
|
GetOptions(\%OPT, |
27 |
|
|
'database=s', |
28 |
|
|
'dir=s', |
29 |
|
|
'table=s', |
30 |
|
|
'clean!', |
31 |
dpavlin |
118 |
'remove!', |
32 |
|
|
'force!', |
33 |
|
|
); |
34 |
|
|
|
35 |
|
|
my $path = shift @ARGV || |
36 |
|
|
die "Usage: $0 [-database=$OPT{database}] [-dir=$OPT{dir}] [-table=$OPT{table}]\n\t[-clean] [-remove] [-force] directory_with_htmls\n"; |
37 |
ulpfr |
17 |
|
38 |
dpavlin |
118 |
if ($OPT{clean}) { |
39 |
|
|
my $db = WAIT::Database->open( |
40 |
|
|
name => $OPT{database}, |
41 |
|
|
'directory' => $OPT{dir}, |
42 |
|
|
) |
43 |
|
|
or die "Could not open database '$OPT{dir}/$OPT{database}': $@"; |
44 |
|
|
$db->drop_table(name => $OPT{table}) or |
45 |
|
|
die "Could not drop table '$OPT{tabel}': $@"; |
46 |
|
|
|
47 |
|
|
$db->close; |
48 |
ulpfr |
17 |
} |
49 |
|
|
|
50 |
dpavlin |
118 |
my $db = WAIT::Database->open( |
51 |
|
|
name => $OPT{database}, |
52 |
|
|
'directory' => $OPT{dir}, |
53 |
|
|
) |
54 |
|
|
|| WAIT::Database->create( |
55 |
|
|
name => $OPT{database}, |
56 |
|
|
'directory' => $OPT{dir}, |
57 |
|
|
) |
58 |
|
|
or die "Could not open/create database '$OPT{dir}/$OPT{database}': $@"; |
59 |
|
|
|
60 |
ulpfr |
17 |
my $layout= new WAIT::Parse::HTML; |
61 |
|
|
my $stem = [{ |
62 |
|
|
'prefix' => ['isotr', 'isolc'], |
63 |
|
|
'intervall' => ['isotr', 'isolc'], |
64 |
|
|
},'decode_entities', 'isotr', 'isolc', 'split2', 'stop', 'Stem']; |
65 |
|
|
my $text = [{ |
66 |
|
|
'prefix' => ['isotr', 'isolc'], |
67 |
|
|
'intervall' => ['isotr', 'isolc'], |
68 |
|
|
}, |
69 |
|
|
'decode_entities', 'isotr', 'isolc', 'split2', 'stop']; |
70 |
|
|
my $sound = ['decode_entities', 'isotr', 'isolc', 'split2', 'Soundex']; |
71 |
|
|
|
72 |
|
|
my %D; |
73 |
|
|
|
74 |
|
|
my $access = tie (%D, 'WAIT::Document::Find', sub { $_[0] =~ /\.htm/; }, |
75 |
dpavlin |
118 |
$path); |
76 |
ulpfr |
17 |
die $@ unless defined $access; |
77 |
|
|
|
78 |
|
|
|
79 |
|
|
my $tb = $db->table(name => $OPT{table}) || |
80 |
|
|
$db->create_table |
81 |
|
|
(name => $OPT{table}, |
82 |
dpavlin |
118 |
attr => ['docid', 'headline', 'size', 'md5'], |
83 |
|
|
keyset => [['docid', 'md5']], |
84 |
ulpfr |
17 |
layout => $layout, |
85 |
|
|
access => $access, |
86 |
|
|
invindex => |
87 |
|
|
[ |
88 |
|
|
'text' => $stem, |
89 |
|
|
'title' => $stem, |
90 |
|
|
'title' => $text, |
91 |
|
|
] |
92 |
|
|
); |
93 |
|
|
die unless $tb; |
94 |
|
|
|
95 |
|
|
while (my ($path, $content) = each %D) { |
96 |
|
|
&index($path, $content); |
97 |
|
|
} |
98 |
|
|
$db->close(); |
99 |
|
|
exit; |
100 |
|
|
|
101 |
|
|
my $NO; |
102 |
|
|
sub index { |
103 |
|
|
my ($did, $value) = @_; |
104 |
|
|
if ($tb->have('docid' => $did)) { |
105 |
dpavlin |
118 |
if (!$OPT{remove} && !$OPT{force}) { |
106 |
ulpfr |
17 |
print "duplicate\n"; |
107 |
|
|
return; |
108 |
|
|
} |
109 |
|
|
} elsif ($OPT{remove}) { |
110 |
|
|
print "missing\n"; |
111 |
|
|
return; |
112 |
|
|
} |
113 |
|
|
|
114 |
|
|
if (-s $did < 100) { |
115 |
|
|
print "too small\n"; |
116 |
|
|
return; |
117 |
|
|
} |
118 |
|
|
|
119 |
|
|
unless (defined $value) { |
120 |
|
|
print "unavailable\n"; |
121 |
|
|
return; |
122 |
|
|
} |
123 |
|
|
printf STDERR "ok [%d]\n", ++$NO; |
124 |
|
|
|
125 |
|
|
my $record = $layout->split($value); |
126 |
|
|
$record->{size} = length($value); |
127 |
|
|
my $headline = $record->{title} || $did; |
128 |
|
|
$headline =~ s/\s+/ /g; $headline =~ s/^\s+//; |
129 |
|
|
printf "%s\n", substr($headline,0,80); |
130 |
dpavlin |
118 |
my $t = \$record->{text}; |
131 |
|
|
if ($$t) { |
132 |
|
|
my $md5; |
133 |
|
|
if (utf8::is_utf8($$t)) { |
134 |
|
|
$md5 = Dumper($$t) |
135 |
|
|
} else { |
136 |
|
|
$md5 = md5_hex($$t); |
137 |
|
|
} |
138 |
|
|
if ($tb->have('md5' => $md5)) { |
139 |
|
|
print "duplicate md5\n"; |
140 |
|
|
return; |
141 |
|
|
} |
142 |
|
|
$record->{md5} = $md5; |
143 |
|
|
print "$md5\n"; |
144 |
|
|
} else { |
145 |
|
|
print "no page content! skipping...\n"; |
146 |
|
|
return; |
147 |
|
|
} |
148 |
|
|
|
149 |
ulpfr |
17 |
if ($OPT{remove}) { |
150 |
|
|
$tb->delete('docid' => $did, headline => $headline, %{$record}); |
151 |
|
|
} else { |
152 |
|
|
$tb->insert('docid' => $did, headline => $headline, %{$record}); |
153 |
|
|
} |
154 |
|
|
} |
155 |
|
|
|
156 |
dpavlin |
118 |
$WAIT::Config = $WAIT::Config; |
157 |
ulpfr |
17 |
|
158 |
|
|
__END__ |
159 |
|
|
## ################################################################### |
160 |
|
|
## pod |
161 |
|
|
## ################################################################### |
162 |
|
|
|
163 |
|
|
=head1 NAME |
164 |
|
|
|
165 |
dpavlin |
118 |
index_html - generate a html database for sman |
166 |
ulpfr |
17 |
|
167 |
|
|
=head1 SYNOPSIS |
168 |
|
|
|
169 |
|
|
B<index_html> |
170 |
|
|
[B<-database> I<database name>] |
171 |
|
|
[B<-dir> I<database directory>] |
172 |
|
|
[B<-table> I<name>] |
173 |
|
|
[B<-remove>] |
174 |
dpavlin |
118 |
[I<htmldir> ...] |
175 |
ulpfr |
17 |
|
176 |
|
|
=head1 DESCRIPTION |
177 |
|
|
|
178 |
|
|
B<Index_html> generates/updates databases for B<sman>(1). If |
179 |
dpavlin |
118 |
I<htmldir>s are specified, these are used. Otherwise the script dies. |
180 |
ulpfr |
17 |
|
181 |
|
|
=head2 OPTIONS |
182 |
|
|
|
183 |
|
|
=over 10 |
184 |
|
|
|
185 |
|
|
=item B<-database> I<database name> |
186 |
|
|
|
187 |
|
|
Change the default database name to I<database name>. |
188 |
|
|
|
189 |
|
|
=item B<-dir> I<database directory> |
190 |
|
|
|
191 |
|
|
Change the default database directory to I<database directory>. |
192 |
|
|
|
193 |
|
|
=item B<-table> I<name> |
194 |
|
|
|
195 |
|
|
Use I<name> instead of C<man> as table name. |
196 |
|
|
|
197 |
|
|
=item B<-clean> |
198 |
|
|
|
199 |
|
|
Clean B<database> before indexing. |
200 |
|
|
|
201 |
|
|
=item B<-remove> |
202 |
|
|
|
203 |
|
|
Remove the selected directories from the database instead of |
204 |
|
|
adding/updating. This works only for the manuals which are unchanged |
205 |
|
|
since the indexing. |
206 |
|
|
|
207 |
dpavlin |
109 |
=back |
208 |
|
|
|
209 |
ulpfr |
17 |
=head1 SEE ALSO |
210 |
|
|
|
211 |
|
|
L<sman>. |
212 |
|
|
|
213 |
|
|
=head1 AUTHOR |
214 |
|
|
|
215 |
|
|
Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt> |