1 |
#!/usr/local/bin/perl -w |
2 |
|
3 |
use DBI; |
4 |
# Script setup: validate the command line, connect to the database and
# initialise the state shared by display() and the parse loop.

# The first argument names the SmPC being loaded (e.g. "1234.html" or "1234").
# Without it $smpc would be undefined, which produces warnings below and
# invalid SQL once paragraphs are inserted, so stop early instead.
die "usage: $0 <smpc-number>.html\n"
    unless defined $ARGV[0] && length $ARGV[0];

# Connect to the "corp" PostgreSQL database with empty user and password.
my $dbh = DBI->connect("DBI:Pg:dbname=corp", "", "") or die $DBI::errstr;
my $sth;                # statement handle shared with display()

my $body = "";          # text collected for the section currently being read

my $lastnr    = 0;      # number of the section being collected (0 = none yet)
my $lasttitle = "";     # title of the section being collected

# Reduce "<digits>.html" (anywhere in the argument) to just the digits.
my $smpc = $ARGV[0];
if ($smpc =~ m,(\d+)\.html,) {
    $smpc = $1;
}

my $section_id;         # id of the sections row last resolved by display()

print "\nSMPC: $smpc\n";
20 |
|
21 |
# nukehtml($text)
#
# Returns a copy of $text with markup tags removed, leading and trailing
# whitespace trimmed, and every run of spaces collapsed to a single space.
# The argument itself is not modified.
sub nukehtml {
    my ($text) = @_;

    # Alias $_ to the private copy so each substitution can use the default
    # target; the steps run in the same order as before.
    for ($text) {
        s/<\/*[^>]+>//g;    # drop opening and closing tags
        s/^\s+//g;          # trim the front
        s/\s+$//g;          # trim the back
        s/ +/ /g;           # squeeze runs of spaces (tabs are left alone)
    }

    return $text;
}
29 |
|
30 |
# display($nr, $title, $body)
#
# Reports one parsed section on STDOUT and stores it in the database.
#
#   $nr    - section number; a single trailing dot is removed. Nothing is
#            printed or stored when it is numerically 0.
#   $title - section title, used to find or create the row in "sections".
#   $body  - raw (HTML) text collected for the section.
#
# Prints "<nr>\t<plain title>\t<plain length> <raw length>\n". It then looks up
# the section by number and case-insensitive title, inserting it if missing,
# and leaves its id in the file-level $section_id. When the tag-stripped body
# is non-empty, two rows go into "paragraphs": the plain text (html = false)
# and the raw text with runs of spaces collapsed (html = true).
#
# Uses the file-level $dbh, $sth, $smpc and $section_id. Dies with
# $DBI::errstr when a database call fails.
#
# All values are passed as bind parameters rather than interpolated into the
# SQL, so apostrophes in titles or bodies can neither break nor alter the
# statements and no manual quote escaping is needed.
# NOTE(review): bound values are typed by the server from the target column;
# this assumes sections.section accepts values such as "4.1" - confirm schema.
sub display {
    my ($nr, $title, $body) = @_;

    $nr =~ s/\.$//;    # nuke last dot

    return if ($nr == 0);

    # An undefined title used to interpolate as the empty string; binding
    # undef would instead send NULL, which never matches in the lookup.
    $title = '' unless defined $title;

    print "$nr\t" . nukehtml($title);
    my $nbody = nukehtml($body);
    print "\t", length($nbody), " ", length($body), "\n";

    # Find the section row, creating it first if it does not exist yet.
    $sth = $dbh->prepare(
        "select id from sections where section = ? and upper(title) = upper(?)"
    ) or die $DBI::errstr;
    $sth->execute($nr, $title) or die $DBI::errstr;
    my @row = $sth->fetchrow_array;

    if (!@row) {
        $dbh->do("insert into sections (section,title) values (?,?)",
                 undef, $nr, $title)
            or die $DBI::errstr;
        # Re-run the same prepared lookup to read back the new id.
        $sth->execute($nr, $title) or die $DBI::errstr;
        @row = $sth->fetchrow_array;
    }
    $sth->finish;

    $section_id = $row[0];

    if (length($nbody) > 0) {
        $body =~ s/ +/ /g;

        # Plain-text version of the paragraph.
        $dbh->do("insert into paragraphs (smpc,section_id,html,body) values (?,?,false,?)",
                 undef, $smpc, $section_id, $nbody)
            or die $DBI::errstr;

        # HTML version of the paragraph.
        $dbh->do("insert into paragraphs (smpc,section_id,html,body) values (?,?,true,?)",
                 undef, $smpc, $section_id, $body)
            or die $DBI::errstr;
    }

    return;
}
64 |
|
65 |
|
66 |
# Main parse loop: read the input line by line and split it into sections.
# A line recognised as a heading flushes the section collected so far through
# display() and starts a new one; every other line is appended to $body
# followed by a single space.
while (<>) {
    chomp;
    my ($nr, $title);

    # NOTE(review): "|" has the lowest precedence inside a regex, so this
    # pattern reads as "optional tags then digits and a dot at line start"
    # OR "digits.digit followed by whitespace anywhere in the line"; the
    # second alternative is not anchored. Left unchanged because anchoring it
    # would alter which lines are treated as headings - confirm the intent.
    if (m,^(<[^>]+>)*(\d+\.)|(\d+\.\d)\s+,) {
        ($nr, $title) = split(/\s+/, nukehtml($_), 2);
        $title = '' unless defined $title;    # heading number with no text
        $nr =~ s/\.$//;                       # nuke last dot
        # Accept only plausible top-level numbers with a short title.
        if ($nr >= 1 && $nr < 15 && length($title) < 100) {
            display($lastnr, $lasttitle, $body);
            $lastnr    = $nr;
            $lasttitle = $title;
            $body      = "";
        } else {
            $body .= "$_ ";
        }
    } elsif (m,^(<[^>]+>)*(\d+)\t,) {
        # "<number><TAB><title>": a heading only if the number increases.
        ($nr, $title) = split(/\s+/, nukehtml($_), 2);
        $title = '' unless defined $title;
        if ($nr > $lastnr && length($title) < 100) {
            display($lastnr, $lasttitle, $body);
            $lastnr    = $nr;
            $lasttitle = $title;
            $body      = "";
        } else {
            $body .= "$_ ";
        }
    } elsif (m,^<b><i>([^<]+)</i></b>,) {
        # Unnumbered bold-italic heading: give it the next section number.
        $title = $1;
        display($lastnr, $lasttitle, $body);
        $lastnr++;
        $lasttitle = $title;
        $body      = "";
    } else {
        $body .= "$_ ";
    }
}

# Flush the section that was still being collected at end of input.
display($lastnr, $lasttitle, $body);

# Recompute products.have_smpc from the paragraphs now stored. Failures are
# fatal, like every other database call in this script.
$dbh->do("update products set have_smpc=false")
    or die $DBI::errstr;
$dbh->do("update products set have_smpc=true where smpc in (select distinct smpc from paragraphs)")
    or die $DBI::errstr;

# Release the statement handle before closing the connection.
undef $sth;
$dbh->disconnect;