1 |
dpavlin |
1.1 |
#!/usr/local/bin/perl -w
#
# Split one SMPC HTML document into numbered sections and store them in
# the "corp" PostgreSQL database.
#
# Usage: <this script> <number>.html
#
# The first command-line argument does double duty: the digits in front
# of ".html" become the SMPC id, and the argument itself stays in @ARGV
# so that the <> read loop further down opens it as the input file.

use DBI;

# Work out the SMPC id before touching the database, so that a bad
# invocation fails fast and never leaves a connection open.
my $smpc = $ARGV[0];
die "usage: $0 <smpc-number>.html\n" unless defined $smpc;
if ($smpc=~m,(\d+)\.html,) {
    $smpc=$1;
}
# The id ends up in SQL as a number, so refuse anything else right here
# instead of failing later with an obscure database error.
die "cannot derive a numeric SMPC id from '$smpc'\n"
    unless $smpc =~ /\A\d+\z/;

# Connect to database "corp" with an empty user name and password.
my $dbh = DBI->connect("DBI:Pg:dbname=corp","","") || die $DBI::errstr;
my $sth;            # statement handle; undef'd before disconnect at end of script

my $body="";        # raw input lines collected for the section being read
my $lastnr=0;       # number of the section being read; 0 means none seen yet
my $lasttitle="";   # title of the section being read

my $section_id;     # id of the "sections" row used by the latest display() call

print "\nSMPC: $smpc\n";
20 |
|
|
|
21 |
|
|
# Reduce an HTML fragment to plain text.
#
# Takes one string and returns a copy with every <...> tag removed,
# leading and trailing whitespace trimmed, and each run of space
# characters collapsed to a single space. Tabs and other interior
# whitespace are left as they are.
sub nukehtml {
    my ($text) = @_;

    for ($text) {
        s{</*[^>]+>}{}g;    # drop opening and closing tags
        s/^\s+//;           # trim the front
        s/\s+$//;           # trim the back
        s/ {2,}/ /g;        # squeeze repeated spaces
    }

    return $text;
}
29 |
|
|
|
30 |
|
|
# Return the id of the "sections" row whose section number is $nr and
# whose title equals $title ignoring case, or undef when there is no
# such row. Dies with the DBI error text if the query cannot be run.
sub _find_section_id {
    my ($nr, $title) = @_;

    my $lookup = $dbh->prepare(
        "select id from sections where section=? and upper(title) = upper(?)"
    ) || die $DBI::errstr;
    $lookup->execute($nr, $title) || die $DBI::errstr;

    my ($id) = $lookup->fetchrow_array;
    $lookup->finish;

    return $id;
}

# Report one finished section on STDOUT and store it in the database.
#
# Arguments:
#   $nr    - section number; one trailing dot is removed. A value that
#            is numerically zero means "no section yet" and makes the
#            call a no-op.
#   $title - section title as found in the document (undef is treated
#            as an empty title).
#   $body  - the section's raw HTML.
#
# Prints "<nr>\t<plain title>\t<plain length> <raw length>\n". Looks up
# the matching "sections" row, creating it when missing, and leaves its
# id in the file-level $section_id. When the tag-stripped body is not
# empty, two "paragraphs" rows are inserted for the current $smpc: the
# plain text (html=false) and the original markup with runs of spaces
# collapsed (html=true).
#
# Every value is passed to the database as a bind parameter, so
# apostrophes and backslashes in titles and bodies are stored verbatim
# and cannot break or alter the SQL.
#
# Dies with the DBI error text on any database failure, or with an
# explicit message if the section id cannot be determined.
sub display {
    my ($nr,$title,$body) = @_;

    $nr=~s/\.$//;   # nuke last dot

    return if ($nr==0);

    # A heading line may carry a number but no title text.
    $title = '' unless defined $title;

    my $nbody = nukehtml($body);

    print "$nr\t".nukehtml($title);
    print "\t",length($nbody)," ",length($body),"\n";

    $section_id = _find_section_id($nr, $title);
    unless (defined $section_id) {
        $dbh->do("insert into sections (section,title) values (?,?)",
                 undef, $nr, $title) || die $DBI::errstr;
        # Read the row back to learn the id the database assigned to it.
        $section_id = _find_section_id($nr, $title);
        die "cannot determine id of section $nr '$title'\n"
            unless defined $section_id;
    }

    if (length($nbody) > 0) {
        $body=~s/ +/ /g;

        $dbh->do("insert into paragraphs (smpc,section_id,html,body) values (?,?,false,?)",
                 undef, $smpc, $section_id, $nbody) || die $DBI::errstr;

        $dbh->do("insert into paragraphs (smpc,section_id,html,body) values (?,?,true,?)",
                 undef, $smpc, $section_id, $body) || die $DBI::errstr;
    }

    return;
}
64 |
|
|
|
65 |
|
|
|
66 |
|
|
# Main pass: read the input named on the command line one line at a
# time, recognise section headings, and hand each completed section to
# display(). Lines that are not headings are appended to $body, each
# followed by a single space.

# A usable section number: "4" or "4.1", nothing else. Tokens such as
# "1.Intro" would otherwise slip through the numeric range tests below
# (Perl reads them as 1) and later reach the database as a section
# number.
my $section_nr_re = qr/\A\d+(?:\.\d+)?\z/;

# Store the section collected so far and start an empty body.
my $flush_section = sub {
    display($lastnr,$lasttitle,$body);
    $body="";
};

while (<>) {
    chomp;

    # NOTE(review): the "|" in this pattern splits the WHOLE expression,
    # so it reads "optional tags then digits-dot at line start" OR
    # "digit.digit followed by whitespace anywhere in the line". That
    # looks unintended, but it is kept as is because tightening it
    # would change which lines are recognised (e.g. "4.1. Title").
    if (m,^(<[^>]+>)*(\d+\.)|(\d+\.\d)\s+,) {
        # Heading of the form "<nr>. <title>" or "<nr.n> <title>".
        my ($nr,$title) = split(/\s+/,nukehtml($_),2);
        $nr    = '' unless defined $nr;
        $title = '' unless defined $title;
        $nr=~s/\.$//;   # nuke last dot

        if ($nr =~ $section_nr_re && $nr >= 1 && $nr < 15 && length($title) < 100) {
            $flush_section->();
            $lastnr=$nr; $lasttitle=$title;
        } else {
            $body.="$_ ";
        }
    } elsif (m,^(<[^>]+>)*(\d+)\t,) {
        # Heading of the form "<nr><TAB><title>"; accepted only when the
        # number is greater than that of the current section.
        my ($nr,$title) = split(/\s+/,nukehtml($_),2);
        $nr    = '' unless defined $nr;
        $title = '' unless defined $title;

        if ($nr =~ $section_nr_re && $nr > $lastnr && length($title) < 100) {
            $flush_section->();
            $lastnr=$nr; $lasttitle=$title;
        } else {
            $body.="$_ ";
        }
    } elsif (m,^<b><i>([^<]+)</i></b>,) {
        # Unnumbered bold-italic heading: it gets the next section number.
        my $title=$1;
        $flush_section->();
        $lastnr++; $lasttitle=$title;
    } else {
        $body.="$_ ";
    }
}

# Store whatever was collected after the last heading.
display($lastnr,$lasttitle,$body);

# Recompute the have_smpc flag for all products: clear it everywhere,
# then set it for every product whose smpc occurs in paragraphs.
$dbh->do("update products set have_smpc=false") || die $DBI::errstr;
$dbh->do("update products set have_smpc=true where smpc in (select distinct smpc from paragraphs)") || die $DBI::errstr;

undef $sth;
$dbh->disconnect;