/[nn]/find.pl
This is a repository of my old source code, which is no longer updated. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /find.pl

Parent Directory | Revision Log


Revision 1.1.1.1 - (show annotations) (vendor branch)
Tue Jan 22 11:52:14 2002 UTC (22 years, 2 months ago) by dpavlin
Branch: dbp, MAIN
CVS Tags: r0, HEAD
Changes since 1.1: +0 -0 lines
File MIME type: text/plain
initial import

#!/usr/bin/perl -w

# find.pl -- rebuild the "nn" table from downloaded HTML files.
#
# Scans $nn_dir for regular files whose names start with "CijeliBrojS" and
# contain "god=<year>&br=<issue>", extracts every <div class=sadrzaj> ...
# </div> entry from them and inserts one row per entry into table "nn" of the
# PostgreSQL database "nn".  The table is emptied first, so each run is a
# full re-import.
#
# Columns written: br (issue number), god (year), nr (entry number taken
# from the anchor link, 0 if none), aname (anchor name on the original
# page, '' if none), title (entry text with markup removed) and title_czs
# (title with Croatian letters folded to plain ASCII).

use strict;
use warnings;

use DBI;

my $nn_dir = ".";    # directory holding the wget-fetched files

# Collect the input files before touching the database, so that an
# unreadable directory aborts the run instead of leaving an emptied table.
opendir(my $dir, $nn_dir) or die "opendir $nn_dir: $!";
my @files = sort grep { /^CijeliBrojS/ && -f "$nn_dir/$_" } readdir($dir);
closedir($dir);

my $dbh = DBI->connect("DBI:Pg:dbname=nn", "", "") or die $DBI::errstr;
$dbh->do("delete from nn") or die $dbh->errstr();

# Placeholders let the driver quote the values; titles containing
# apostrophes used to break the hand-built SQL string.
my $insert = $dbh->prepare(
    "insert into nn (br,god,nr,aname,title,title_czs) values (?,?,?,?,?,?)"
) or die $dbh->errstr();

foreach my $file (@files) {
    # Year and issue number are encoded in the file name.  Without them the
    # rows cannot be attributed to an issue, so skip the file rather than
    # reuse the values left over from the previous one.
    my ($god, $br);
    if ($file =~ m/god=(\d+)\&br=(\d+)/) {
        ($god, $br) = ($1, $2);
        print "$file -- $br -- $god\n";
    }
    else {
        warn "skipping $file: no god=/br= in file name\n";
        next;
    }

    my $path = "$nn_dir/$file";
    open(my $in, '<', $path) or die "can't open $path: $!";

    # Parser state, reset per file so that an unterminated <div> in one file
    # cannot leak into the next.
    my $sadrzaj = 0;     # depth counter: >0 while inside a sadrzaj div
    my $nr      = 0;     # entry number from the anchor link
    my $aname   = "";    # anchor name on the original page
    my $naslov  = "";    # accumulated raw text of the current entry

    while (my $line = <$in>) {
        chomp $line;
        $line =~ s/\015//g;    # kill CR

        # cp1250 -> iso8859-2.  Only s/z with caron (both cases) differ
        # between the two code pages; d-stroke, c-caron and c-acute share
        # the same codes.  Written as hex escapes so the mapping does not
        # depend on the encoding this source file is saved in.
        $line =~ tr/\x9A\x9E\x8A\x8E/\xB9\xBE\xA9\xAE/;

        $sadrzaj++ if $line =~ m,<div class=sadrzaj>,;

        if ($sadrzaj) {
            # Strip the leading '<a href="#NAME">NUMBER.</...>' link and
            # remember its parts.  The quantifier is inside the capture
            # group so the whole anchor name is captured, not just its last
            # character.
            if ($line =~ s/<a href="#([^"]+)">\s*(\d+)\.\s*<[^>]+>//i) {
                ($aname, $nr) = ($1, $2);
            }
            # A line break in HTML is whitespace; keep one so that words on
            # adjacent lines are not glued together.
            $naslov .= "$line ";
        }

        if ($sadrzaj && $line =~ m,</div>,) {
            $sadrzaj--;

            # Remove markup first, then normalise whitespace, so removed
            # tags do not leave double spaces behind.
            $naslov =~ s/<[^>]+>//g;
            $naslov =~ s/\s+/ /g;
            $naslov =~ s/^\s+//;
            $naslov =~ s/\s+$//;
            print "$god $br $nr: $naslov\n";

            # ASCII-folded copy of the title (iso8859-2 letters -> base
            # letters).
            my $naslov_czs = $naslov;
            $naslov_czs =~ tr/\xB9\xA9\xF0\xD0\xE8\xC8\xE6\xC6\xBE\xAE/sSdDcCcCzZ/;

            $insert->execute($br, $god, $nr, $aname, $naslov, $naslov_czs)
                or die $insert->errstr();

            # Start the next entry from a clean slate.
            $naslov = "";
            $nr     = 0;
            $aname  = "";
        }
    }

    close($in);
}

$dbh->disconnect;

  ViewVC Help
Powered by ViewVC 1.1.26