/[nn]/find4.pl
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /find4.pl

Parent Directory | Revision Log


Revision 1.3 - (hide annotations)
Wed Jun 19 10:58:59 2002 UTC (21 years, 9 months ago) by dpavlin
Branch: MAIN
Changes since 1.2: +2 -1 lines
File MIME type: text/plain
fix date in header

#!/usr/bin/perl -w

# indexer, Dobrica Pavlinusic <dpavlin@rot13.org> 2002-01-28
#
# Rebuilds the "nn" table from the HTML files named CijeliBrojS* found in
# $nn_dir.  For every <div class=sadrzaj> ... </div> section one row is
# inserted: issue number, year, entry number, anchor name, the title, and a
# search form of the title (diacritics folded to ASCII, then expanded with
# Lingua::Spelling::Alternative).
#
# options: -q quiet
#          -d debug
#          -v verbose

use strict;
use DBI;
use Getopt::Std;
use Lingua::Spelling::Alternative;

my $sadrzaj = 0;    # depth counter: ++ on <div class=sadrzaj>, -- on </div>
my $nr      = 0;    # entry number taken from the current anchor
my $naslov  = "";   # title text accumulated for the current section

my $br;             ## NN issue number
my $god;            ## NN year
my $aname;          ## anchor name on the original pages

my $nn_dir = ".";   # directory holding the wget-fetched files

my %opts;
getopts("vqd", \%opts);

my $brojeva = 0;    # number of issue files processed
my $zakona  = 0;    # number of rows inserted

my $hr = Lingua::Spelling::Alternative->new( DEBUG => $opts{d} );
#$hr->load_affix("$nn_dir/search/croatian.aff");
$hr->load_findaffix("$nn_dir/prvih_50.txt");

#--------------------------------------------------------------------

my $dbh = DBI->connect("DBI:Pg:dbname=nn","","") || die $DBI::errstr;

# Collect the input files BEFORE emptying the table, and treat an unreadable
# directory as fatal: otherwise a failed opendir would leave "nn" wiped with
# nothing to refill it from.
opendir(my $dir, $nn_dir) || die "opendir $nn_dir: $!";
my @files = grep { /^CijeliBrojS/ && -f "$nn_dir/$_" } readdir($dir);
closedir($dir);

$dbh->do("delete from nn") || die $dbh->errstr();

my $sth = $dbh->prepare("insert into nn (br,god,nr,aname,title,title_czs) values (?,?,?,?,?,?)") || die $dbh->errstr();

foreach my $file (@files) {

	# Year and issue number come from the file name.  Without them the rows
	# would be stored under the previous file's numbers, so skip the file.
	unless ($file =~ m/god=(\d+)\&br=(\d+)/) {
		warn "skipping $file: no god=/br= in file name\n";
		next;
	}
	($br,$god) = ($2,$1);
	print "$file -- $br -- $god\n" if (! $opts{q});
	$brojeva++;

	# Files were listed from $nn_dir, so open them there.  Three-argument
	# open keeps characters in the name (e.g. "&", "=") from being read as
	# an open mode.
	my $path = "$nn_dir/$file";
	open(my $in, '<', $path) || die "can't open $path: $!";

	# Do not let an unterminated section in one file leak into the next.
	$sadrzaj = 0;
	$naslov  = "";
	$nr      = 0;

	while (<$in>) {
		chomp;
		s/\015//g;	# kill cr
		# 1250 -> iso8859-2, written as byte escapes so the mapping does
		# not depend on the encoding this source file is saved in.  Only
		# s-caron and z-caron (both cases) differ between the two sets.
		tr/\x9a\x9e\x8a\x8e/\xb9\xbe\xa9\xae/;

		if (m,<div class=sadrzaj>,) {
			$sadrzaj++;
			next;
		}

		if ($sadrzaj && m,</div>,) {
			$sadrzaj--;

			# Normalise the collected title: collapse whitespace, strip
			# tags, trim both ends.
			$naslov =~ s/\s+/ /g;
			$naslov =~ s/<[^>]+>//g;
			$naslov =~ s/^\s+//;
			$naslov =~ s/\s+$//;
			print "$god $br $nr: $naslov\n" if ($opts{v});

			# Search form: fold iso8859-2 diacritics to plain letters,
			# turn every run of non-letters into one space, then expand
			# each word through the affix rules.
			my $naslov_czs = lc($naslov);
			$naslov_czs =~ tr/\xb9\xa9\xf0\xd0\xe8\xc8\xe6\xc6\xbe\xae/sSdDcCcCzZ/;
			$naslov_czs =~ tr/a-zA-Z/ /cs;	# non a-z -> space
			$naslov_czs = join(" ", $hr->alternatives(split(/ /, $naslov_czs)));
#			$naslov_czs = $hr->minimal(split(/ /,$naslov_czs));

			$sth->execute($br,$god,$nr,$aname,$naslov,$naslov_czs) || die $sth->errstr();
			$naslov = "";
			$nr = 0;
			$zakona++;
		}

		if ($sadrzaj) {
			# Every line inside a section must carry one of the two known
			# anchor forms; the anchor is removed from the line and what
			# remains is appended to the title.
			if (s/<a href="#([^"]+)">\s*(\d+)\.\s*<[^>]+>//i) {
				($aname,$nr) = ($1,$2);
			} elsif (s/<a href="Javascript:Mojdok\((\d+),(\d+),(\w+),(\d+)\)[^>]*>//i) {
				($nr,$aname) = ($3,$4);
				# The link repeats year and issue; they must agree with
				# the values taken from the file name.
				die "conflict in godina: $1 != $god" if ($god != $1);
				die "conflict in broj: $2 != $br" if ($br != $2);
			} else {
				die "can't find nr in line: $_";
			}
			$naslov .= $_;
			# Drop a leading "<nr>." from the title; \Q..\E keeps $nr from
			# being interpreted as a pattern.
			$naslov =~ s/^\s*\Q$nr\E\.*\s*//g;
		}

	}

	close($in);
}

$dbh->do("vacuum") || die $dbh->errstr();
print "Ukupno $brojeva brojeva NN, sa $zakona zakona...\n" if (! $opts{q});

  ViewVC Help
Powered by ViewVC 1.1.26