/[corp]/convert/html2db.pl
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /convert/html2db.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (show annotations)
Thu Mar 15 07:51:21 2001 UTC (20 years, 8 months ago) by dpavlin
Branch point for: DbP, MAIN
File MIME type: text/plain
Initial revision

1 #!/usr/local/bin/perl -w
2
3 use DBI;
4 my $dbh = DBI->connect("DBI:Pg:dbname=corp","","") || die $DBI::errstr;
5 my $sth;
6
7 my $body="";
8
9 my $lastnr=0;
10 my $lasttitle="";
11
12 my $smpc = $ARGV[0];
13 if ($smpc=~m,(\d+)\.html,) {
14 $smpc=$1;
15 }
16
17 my $section_id;
18
19 print "\nSMPC: $smpc\n";
20
21 sub nukehtml {
22 my $foo = $_[0];
23 $foo=~s/<\/*[^>]+>//g;
24 $foo=~s/^\s+//g;
25 $foo=~s/\s+$//g;
26 $foo=~s/ +/ /g;
27 return $foo;
28 }
29
30 sub display {
31 my ($nr,$title,$body) = @_;
32
33 $nr=~s/\.$//; # nuke last dot
34
35 return if ($nr==0);
36
37 print "$nr\t".nukehtml($title);
38 $nbody=nukehtml($body);
39 print "\t",length($nbody)," ",length($body),"\n";
40 # print "$nbody\n-----------------------------\n";
41
42 $sth = $dbh->prepare("select id from sections where section=$nr and upper(title) = upper('$title')") || die $DBI::errstr;
43 $sth->execute || die $DBI::errstr;
44
45 if (@row = $sth->fetchrow_array) {
46 $section_id = $row[0];
47 } else {
48 $dbh->do("insert into sections (section,title) values ($nr,'$title')") || die $DBI::errstr;
49 $sth = $dbh->prepare("select id from sections where section=$nr and upper(title) = upper('$title')") || die $DBI::errstr;
50 $sth->execute || die $DBI::errstr;
51 ($section_id) = $sth->fetchrow_array;
52 }
53
54 if (length($nbody) > 0) {
55 $body=~s/'/\\'/g; $body=~s/ +/ /g;
56 $nbody=~s/'/\\'/g;
57
58 $dbh->do("insert into paragraphs (smpc,section_id,html,body) values ($smpc,$section_id,false,'$nbody')") || die $DBI::errstr;
59
60 $dbh->do("insert into paragraphs (smpc,section_id,html,body) values ($smpc,$section_id,true,'$body')") || die $DBI::errstr;
61 }
62
63 }
64
65
66 while(<>) {
67 chomp;
68 if (m,^(<[^>]+>)*(\d+\.)|(\d+\.\d)\s+,) {
69 ($nr,$title)=split(/\s+/,nukehtml($_),2);
70 $nr=~s/\.$//; # nuke last dot
71 if ($nr >= 1 && $nr < 15 && length($title) < 100) {
72 display($lastnr,$lasttitle,$body);
73 $lastnr=$nr; $lasttitle=$title;
74 $body="";
75 $lastnr=$nr;
76 } else {
77 $body.="$_ ";
78 }
79 } elsif (m,^(<[^>]+>)*(\d+)\t,) {
80 ($nr,$title)=split(/\s+/,nukehtml($_),2);
81 if ($nr > $lastnr && length($title) < 100) {
82 display($lastnr,$lasttitle,$body);
83 $lastnr=$nr; $lasttitle=$title;
84 $body="";
85 $lastnr=$nr;
86 } else {
87 $body.="$_ ";
88 }
89 } elsif (m,^<b><i>([^<]+)</i></b>,) {
90 $title=$1;
91 display($lastnr,$lasttitle,$body);
92 $lastnr++; $lasttitle=$title;
93 $body="";
94 } else {
95 $body.="$_ ";
96 }
97 }
98 display($lastnr,$lasttitle,$body);
99
100 $dbh->do("update products set have_smpc=false");
101 $dbh->do("update products set have_smpc=true where smpc in (select distinct smpc from paragraphs)");
102
103 undef $sth;
104 $dbh->disconnect;

  ViewVC Help
Powered by ViewVC 1.1.26