/[corp]/convert/html2db.pl
This is a repository of my old source code, which is no longer updated. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /convert/html2db.pl

Parent Directory | Revision Log


Revision 1.1 - (hide annotations)
Thu Mar 15 07:51:21 2001 UTC (23 years, 1 month ago) by dpavlin
Branch point for: DbP, MAIN
File MIME type: text/plain
Initial revision

#!/usr/local/bin/perl -w

# html2db.pl - read an SMPC HTML document, split it into numbered
# sections and store the sections and their paragraphs in the "corp"
# PostgreSQL database.
#
# Usage: html2db.pl <number>.html
#
# The SMPC number is taken from the digits in front of ".html" in the
# first command-line argument; the same argument is later read as input
# by the <> operator.

use DBI;

# The SMPC number ends up in SQL statements, so refuse to run without an
# argument or with one that does not yield a plain number.
my $smpc = $ARGV[0];
die "usage: $0 <smpc-number>.html\n" unless defined $smpc;
if ($smpc=~m,(\d+)\.html,) {
    $smpc=$1;
}
die "can't get SMPC number from '$smpc'\nusage: $0 <smpc-number>.html\n"
    unless $smpc=~m,^\d+$,;

my $dbh = DBI->connect("DBI:Pg:dbname=corp","","") || die $DBI::errstr;
my $sth;                # statement handle shared by the database code below

my $body="";            # HTML collected for the section currently being read

my $lastnr=0;           # number of the section currently being read (0 = none yet)
my $lasttitle="";       # title of the section currently being read

my $section_id;         # id of the sections row used by the last stored section

print "\nSMPC: $smpc\n";
20    
# Strip HTML markup from a string and tidy up its whitespace.
#
# Takes a single string and returns a cleaned-up copy; the caller's
# value is not modified.
sub nukehtml {
    my ($text) = @_;

    # Topicalize the copy so each substitution applies to it in turn.
    for ($text) {
        s/<\/*[^>]+>//g;    # remove anything shaped like an opening/closing tag
        s/^\s+//g;          # trim leading whitespace
        s/\s+$//g;          # trim trailing whitespace
        s/ +/ /g;           # squeeze runs of spaces into a single space
    }

    return $text;
}
29    
# Report one parsed section on STDOUT and store it in the database.
#
# Arguments:
#   $nr    - section number (a trailing dot is removed)
#   $title - section title
#   $body  - HTML collected for the section
#
# Does nothing when $nr is numerically 0 (no section has been seen yet).
# Otherwise it prints "$nr<TAB>title<TAB>text-length html-length", finds
# or creates the matching row in "sections" (leaving its id in the
# file-level $section_id) and, when the body contains any text once the
# markup is stripped, inserts two rows into "paragraphs": the plain text
# (html=false) and the HTML (html=true).
#
# All values are passed to the database as bind parameters, so titles
# and bodies containing quotes or backslashes are stored verbatim.
# Dies with the DBI error message when a database call fails.
sub display {
    my ($nr,$title,$body) = @_;

    $nr=~s/\.$//;   # nuke last dot

    return if ($nr==0);

    my $nbody = nukehtml($body);

    print "$nr\t".nukehtml($title);
    print "\t",length($nbody)," ",length($body),"\n";

    # Fetch the id of the section with this number and (case-insensitive)
    # title; yields undef when there is no such row.
    my $find_section_id = sub {
        $sth = $dbh->prepare("select id from sections where section=? and upper(title) = upper(?)") || die $DBI::errstr;
        $sth->execute($nr,$title) || die $DBI::errstr;
        my ($id) = $sth->fetchrow_array;
        $sth->finish;
        return $id;
    };

    $section_id = $find_section_id->();
    if (! defined $section_id) {
        # Unknown section: create it, then read back the id it was given.
        $dbh->do("insert into sections (section,title) values (?,?)", undef, $nr, $title) || die $DBI::errstr;
        $section_id = $find_section_id->();
    }
    die "can't find id of section $nr\n" unless defined $section_id;

    if (length($nbody) > 0) {
        $body=~s/ +/ /g;    # squeeze runs of spaces in the stored HTML

        $dbh->do("insert into paragraphs (smpc,section_id,html,body) values (?,?,false,?)", undef, $smpc, $section_id, $nbody) || die $DBI::errstr;

        $dbh->do("insert into paragraphs (smpc,section_id,html,body) values (?,?,true,?)", undef, $smpc, $section_id, $body) || die $DBI::errstr;
    }

}
64    
65    
# Main loop: read the input line by line and cut it into sections.
#
# A line that is accepted as a heading flushes the section collected so
# far through display() and starts a new one; every other line is
# appended to the body of the current section.

my ($nr,$title);    # number and title parsed from a candidate heading line

while(<>) {
    chomp;
    # NOTE(review): the "|" below splits the whole pattern, so the second
    # alternative (\d+\.\d)\s+ is not anchored to the start of the line.
    # Left unchanged; confirm whether ^(<[^>]+>)*((\d+\.)|(\d+\.\d))\s+
    # was intended.
    if (m,^(<[^>]+>)*(\d+\.)|(\d+\.\d)\s+,) {
        # Candidate "N." / "N.N" heading: first word is the number,
        # the rest of the line is the title.
        ($nr,$title)=split(/\s+/,nukehtml($_),2);
        $nr=~s/\.$//;   # nuke last dot
        if ($nr >= 1 && $nr < 15 && length($title) < 100) {
            display($lastnr,$lasttitle,$body);
            $lastnr=$nr; $lasttitle=$title;
            $body="";
        } else {
            $body.="$_ ";
        }
    } elsif (m,^(<[^>]+>)*(\d+)\t,) {
        # Candidate "N<TAB>title" heading: accepted only when its number
        # is larger than that of the current section.
        ($nr,$title)=split(/\s+/,nukehtml($_),2);
        if ($nr > $lastnr && length($title) < 100) {
            display($lastnr,$lasttitle,$body);
            $lastnr=$nr; $lasttitle=$title;
            $body="";
        } else {
            $body.="$_ ";
        }
    } elsif (m,^<b><i>([^<]+)</i></b>,) {
        # Bold-italic line: treated as an unnumbered heading which gets
        # the number following the current section's.
        $title=$1;      # copy before display() runs its own regexes
        display($lastnr,$lasttitle,$body);
        $lastnr++; $lasttitle=$title;
        $body="";
    } else {
        $body.="$_ ";
    }
}

# Flush the section that was still being collected at end of input.
display($lastnr,$lasttitle,$body);

# Recompute have_smpc for every product from the paragraphs now stored.
$dbh->do("update products set have_smpc=false") || die $DBI::errstr;
$dbh->do("update products set have_smpc=true where smpc in (select distinct smpc from paragraphs)") || die $DBI::errstr;

undef $sth;
$dbh->disconnect;

  ViewVC Help
Powered by ViewVC 1.1.26