/[nn]/find4.pl
This is a repository of my old source code, which is no longer updated. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /find4.pl

Parent Directory | Revision Log


Revision 1.1 - (show annotations)
Mon Feb 11 15:21:30 2002 UTC (22 years, 2 months ago) by dpavlin
Branch: MAIN
File MIME type: text/plain
indexer koji koristi Lingua::Spelling::Alternative modul...

#!/usr/bin/perl -w

# indexer, Dobrica Pavlinusic <dpavlin@rot13.org> 2001-01-28
#
# Empties the "nn" table and refills it from the downloaded contents pages
# (files named CijeliBrojS*) found in $nn_dir.  For every
# <div class=sadrzaj> ... </div> section one row is inserted holding the
# issue number, year, entry number, anchor name, the cleaned-up title and
# an ASCII-folded, affix-expanded copy of the title (title_czs).
#
# options: -q quiet   (suppress per-file progress and the final summary)
#          -d debug   (passed as DEBUG to Lingua::Spelling::Alternative)
#          -v verbose (print every title as it is stored)

use strict;
use DBI;
use Getopt::Std;
use Lingua::Spelling::Alternative;

my $sadrzaj = 0;     # >0 while inside a <div class=sadrzaj> section
my $nr      = 0;     # entry number taken from the current anchor
my $naslov  = "";    # title text accumulated for the current section

my $br;              # NN issue number
my $god;             # NN year
my $aname;           # anchor name on the original pages

my $nn_dir = ".";    # directory holding the wget-fetched files

my %opts;
getopts("vqd", \%opts);

my $brojeva = 0;     # number of issue files processed
my $zakona  = 0;     # number of rows inserted

my $hr = Lingua::Spelling::Alternative->new( DEBUG => $opts{d} );
$hr->load_affix("$nn_dir/search/croatian.aff");


#--------------------------------------------------------------------

my $dbh = DBI->connect("DBI:Pg:dbname=nn", "", "") || die $DBI::errstr;

$dbh->do("delete from nn") || die $dbh->errstr();

# Collect the input files.  As before, a failed opendir only warns and the
# run continues with an empty file list; we just no longer call readdir on
# a handle that was never opened.
my @files;
if (opendir(my $dir, $nn_dir)) {
    @files = grep { /^CijeliBrojS/ && -f "$nn_dir/$_" } readdir($dir);
    closedir($dir);
} else {
    warn "opendir: $!";
}

my $sth = $dbh->prepare("insert into nn (br,god,nr,aname,title,title_czs) values (?,?,?,?,?,?)") || die $dbh->errstr();


foreach my $file (@files) {
    # readdir returns bare names, so open relative to $nn_dir (the -f test
    # above already did); three-arg open keeps odd characters in the file
    # name from being interpreted as an open mode.
    my $path = "$nn_dir/$file";
    open(my $in, '<', $path) or die "can't open $path: $!";

    # Year and issue number come from the file name (...god=YYYY&br=N...).
    if ($file =~ m/god=(\d+)\&br=(\d+)/) {
        ($br, $god) = ($2, $1);
        print "$file -- $2 -- $1\n" if (! $opts{q});
        $brojeva++;
    } else {
        # Without them the rows would be stored under the previous file's
        # issue (or undef), so skip the file instead.
        warn "skipping $file: no god=...&br=... in file name\n";
        close($in);
        next;
    }

    while (<$in>) {
        chomp;
        s/\015//g;    # kill cr

        # 1250 -> iso8859-2.  Only s/z with caron (both cases) differ
        # between the two code pages.  Written as byte escapes so the
        # mapping does not depend on the encoding this source is saved in.
        tr/\x9A\x9E\x8A\x8E/\xB9\xBE\xA9\xAE/;

        if (m,<div class=sadrzaj>,) {
            $sadrzaj++;
            next;
        }

        if ($sadrzaj && m,</div>,) {
            $sadrzaj--;

            # Collapse whitespace, strip remaining tags, trim both ends.
            $naslov =~ s/\s+/ /g;
            $naslov =~ s/<[^>]+>//g;
            $naslov =~ s/^\s+//g;
            $naslov =~ s/\s+$//g;
            print "$god $br $nr: $naslov\n" if ($opts{v});

            # Search copy: ASCII lower-case, iso8859-2 Croatian letters
            # folded to plain s/d/c/z (byte escapes, same reason as above),
            # every other character squeezed to a single space.
            my $naslov_czs = lc($naslov);
            $naslov_czs =~ tr/\xB9\xA9\xF0\xD0\xE8\xC8\xE6\xC6\xBE\xAE/sSdDcCcCzZ/;
            $naslov_czs =~ tr/a-zA-Z/ /cs;    # non a-z -> space

            # Replace the words with whatever alternatives() returns for
            # them (presumably affix-derived word forms -- see the
            # Lingua::Spelling::Alternative documentation).
            $naslov_czs = join(" ", $hr->alternatives(split(/ /, $naslov_czs)));

            # A failed execute is recorded on the statement handle.
            $sth->execute($br, $god, $nr, $aname, $naslov, $naslov_czs) || die $sth->errstr();
            $naslov = "";
            $nr     = 0;
            $zakona++;
        }

        if ($sadrzaj) {
            # Every line inside a section must carry one of the two known
            # anchor forms; the anchor is removed from the line and gives
            # the entry number and anchor name.
            if (s/<a href="#([^"]+)">\s*(\d+)\.\s*<[^>]+>//i) {
                ($aname, $nr) = ($1, $2);
            } elsif (s/<a href="Javascript:Mojdok\((\d+),(\d+),(\w+),(\d+)\)[^>]*>//i) {
                ($nr, $aname) = ($3, $4);
                # The link repeats year and issue; they must agree with
                # the values taken from the file name.
                die "conflict in godina: $1 != $god" if ($god != $1);
                die "conflict in broj: $2 != $br" if ($br != $2);
            } else {
                die "can't find nr in line: $_";
            }
            $naslov .= $_;
            $naslov =~ s/^\s*$nr\.*\s*//g;    # drop the leading "N." from the title
        }

    }

    close($in);
}

$dbh->do("vacuum") || die $dbh->errstr();
print "Ukupno $brojeva brojeva NN, sa $zakona zakona...\n" if (! $opts{q});


  ViewVC Help
Powered by ViewVC 1.1.26