1 |
#!/usr/local/perl5.005_56.Mar06/bin/perl -w |
#!/usr/bin/perl -w |
|
eval 'exec perl -w -S $0 "$@"' |
|
|
if 0; |
|
2 |
|
|
3 |
use strict; |
use strict; |
4 |
|
|
|
|
|
|
use FileHandle; |
|
5 |
use Getopt::Long; |
use Getopt::Long; |
6 |
|
use Digest::MD5 qw(md5_hex); |
7 |
|
use Data::Dumper; |
8 |
|
|
9 |
require WAIT::Database; |
require WAIT::Database; |
10 |
require WAIT::Config; |
require WAIT::Config; |
11 |
require WAIT::Parse::HTML; |
require WAIT::Parse::HTML; |
12 |
require WAIT::Document::Find; |
require WAIT::Document::Find; |
13 |
|
|
14 |
|
use utf8; |
15 |
|
|
16 |
|
use lib '/data/wait/lib'; |
17 |
|
|
18 |
my %OPT = (database => 'DB', |
my %OPT = (database => 'DB', |
19 |
dir => $WAIT::Config->{WAIT_home} || '/tmp', |
dir => $WAIT::Config->{WAIT_home} || '/tmp', |
20 |
table => 'kbox', |
table => 'html', |
21 |
clean => 0, |
clean => 0, |
22 |
remove => 0, |
remove => 0, |
23 |
|
force => 0, |
24 |
); |
); |
25 |
|
|
26 |
GetOptions(\%OPT, |
GetOptions(\%OPT, |
28 |
'dir=s', |
'dir=s', |
29 |
'table=s', |
'table=s', |
30 |
'clean!', |
'clean!', |
31 |
'remove', |
'remove!', |
32 |
) || die "Usage: ...\n"; |
'force!', |
33 |
|
); |
34 |
|
|
35 |
|
my $path = shift @ARGV || |
36 |
|
die "Usage: $0 [-database=$OPT{database}] [-dir=$OPT{dir}] [-table=$OPT{table}]\n\t[-clean] [-remove] [-force] directory_with_htmls\n"; |
37 |
|
|
38 |
|
if ($OPT{clean}) { |
39 |
|
my $db = WAIT::Database->open( |
40 |
|
name => $OPT{database}, |
41 |
|
'directory' => $OPT{dir}, |
42 |
|
) |
43 |
|
or die "Could not open database '$OPT{dir}/$OPT{database}': $@"; |
44 |
|
# Drop the table selected with -table. On failure, report the table name
# that was actually requested: the message previously interpolated the
# misspelled key 'tabel', which is never set and so printed an empty name.
$db->drop_table(name => $OPT{table})
    or die "Could not drop table '$OPT{table}': $@";
46 |
|
|
47 |
my $db; |
$db->close; |
|
if ($OPT{clean} and -d "$OPT{dir}/$OPT{database}") { |
|
|
eval { |
|
|
my $tmp = WAIT::Database->open(name => $OPT{database}, |
|
|
'directory' => $OPT{dir}) |
|
|
or die "Could not open table $OPT{table}: $@"; |
|
|
my $tbl = $tmp->table(name => $OPT{table}); |
|
|
$tbl->drop if $tbl; |
|
|
$tmp->close; |
|
|
rmtree("$OPT{dir}/$OPT{database}/$OPT{table}",1,1) |
|
|
if -d "$OPT{dir}/$OPT{database}/$OPT{table}"; |
|
|
}; |
|
|
exit; |
|
|
} |
|
|
unless (-d "$OPT{dir}/$OPT{database}") { |
|
|
$db = WAIT::Database->create(name => $OPT{database}, |
|
|
'directory' => $OPT{dir}) |
|
|
or die "Could not open database $OPT{database}: $@"; |
|
|
} else { |
|
|
$db = WAIT::Database->open(name => $OPT{database}, |
|
|
'directory' => $OPT{dir}) |
|
|
or die "Could not open table $OPT{table}: $@"; |
|
48 |
} |
} |
49 |
|
|
50 |
|
my $db = WAIT::Database->open( |
51 |
|
name => $OPT{database}, |
52 |
|
'directory' => $OPT{dir}, |
53 |
|
) |
54 |
|
|| WAIT::Database->create( |
55 |
|
name => $OPT{database}, |
56 |
|
'directory' => $OPT{dir}, |
57 |
|
) |
58 |
|
or die "Could not open/create database '$OPT{dir}/$OPT{database}': $@"; |
59 |
|
|
60 |
my $layout= new WAIT::Parse::HTML; |
my $layout= new WAIT::Parse::HTML; |
61 |
my $stem = [{ |
my $stem = [{ |
62 |
'prefix' => ['isotr', 'isolc'], |
'prefix' => ['isotr', 'isolc'], |
72 |
my %D; |
my %D; |
73 |
|
|
74 |
my $access = tie (%D, 'WAIT::Document::Find', sub { $_[0] =~ /\.htm/; }, |
my $access = tie (%D, 'WAIT::Document::Find', sub { $_[0] =~ /\.htm/; }, |
75 |
"/usr/local/etc/httpd/htdocs/berlin"); |
$path); |
76 |
die $@ unless defined $access; |
die $@ unless defined $access; |
77 |
|
|
78 |
|
|
79 |
my $tb = $db->table(name => $OPT{table}) || |
my $tb = $db->table(name => $OPT{table}) || |
80 |
$db->create_table |
$db->create_table |
81 |
(name => $OPT{table}, |
(name => $OPT{table}, |
82 |
attr => ['docid', 'headline', 'size'], |
attr => ['docid', 'headline', 'size', 'md5'], |
83 |
keyset => [['docid']], |
keyset => [['docid', 'md5']], |
84 |
layout => $layout, |
layout => $layout, |
85 |
access => $access, |
access => $access, |
86 |
invindex => |
invindex => |
92 |
); |
); |
93 |
die unless $tb; |
die unless $tb; |
94 |
|
|
|
my @DIRS; |
|
|
if (@ARGV) { |
|
|
@DIRS = @ARGV; |
|
|
} else { |
|
|
@DIRS = @{$WAIT::Config->{manpath}}; |
|
|
} |
|
|
|
|
95 |
while (my ($path, $content) = each %D) { |
while (my ($path, $content) = each %D) { |
96 |
&index($path, $content); |
&index($path, $content); |
97 |
} |
} |
102 |
sub index { |
sub index { |
103 |
my ($did, $value) = @_; |
my ($did, $value) = @_; |
104 |
if ($tb->have('docid' => $did)) { |
if ($tb->have('docid' => $did)) { |
105 |
if (!$OPT{remove}) { |
if (!$OPT{remove} && !$OPT{force}) { |
106 |
print "duplicate\n"; |
print "duplicate\n"; |
107 |
return; |
return; |
108 |
} |
} |
127 |
my $headline = $record->{title} || $did; |
my $headline = $record->{title} || $did; |
128 |
$headline =~ s/\s+/ /g; $headline =~ s/^\s+//; |
$headline =~ s/\s+/ /g; $headline =~ s/^\s+//; |
129 |
printf "%s\n", substr($headline,0,80); |
printf "%s\n", substr($headline,0,80); |
130 |
|
my $t = \$record->{text}; |
131 |
|
if ($$t) { |
132 |
|
my $md5; |
133 |
|
if (utf8::is_utf8($$t)) { |
134 |
|
$md5 = Dumper($$t) |
135 |
|
} else { |
136 |
|
$md5 = md5_hex($$t); |
137 |
|
} |
138 |
|
if ($tb->have('md5' => $md5)) { |
139 |
|
print "duplicate md5\n"; |
140 |
|
return; |
141 |
|
} |
142 |
|
$record->{md5} = $md5; |
143 |
|
print "$md5\n"; |
144 |
|
} else { |
145 |
|
print "no page content! skipping...\n"; |
146 |
|
return; |
147 |
|
} |
148 |
|
|
149 |
if ($OPT{remove}) { |
if ($OPT{remove}) { |
150 |
$tb->delete('docid' => $did, headline => $headline, %{$record}); |
$tb->delete('docid' => $did, headline => $headline, %{$record}); |
151 |
} else { |
} else { |
153 |
} |
} |
154 |
} |
} |
155 |
|
|
156 |
|
$WAIT::Config = $WAIT::Config; |
157 |
|
|
158 |
__END__ |
__END__ |
159 |
## ################################################################### |
## ################################################################### |
162 |
|
|
163 |
=head1 NAME |
=head1 NAME |
164 |
|
|
165 |
index_html - generate a manual database for sman |
index_html - generate an HTML database for sman |
166 |
|
|
167 |
=head1 SYNOPSIS |
=head1 SYNOPSIS |
168 |
|
|
171 |
[B<-dir> I<database directory>] |
[B<-dir> I<database directory>] |
172 |
[B<-table> I<name>] |
[B<-table> I<name>] |
173 |
[B<-remove>] |
[B<-remove>] |
174 |
[I<mandir> ...] |
[I<htmldir> ...] |
175 |
|
|
176 |
=head1 DESCRIPTION |
=head1 DESCRIPTION |
177 |
|
|
178 |
B<Index_html> generates/updates databases for B<sman>(1). If |
B<Index_html> generates/updates databases for B<sman>(1). If |
179 |
I<mandir>s are specified, these are used. Otherwise the configured |
I<htmldir>s are specified, these are used. Otherwise the script dies. |
|
default directories are indexed. |
|
180 |
|
|
181 |
=head2 OPTIONS |
=head2 OPTIONS |
182 |
|
|