/[nn]/find3.pl
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /find3.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (hide annotations)
Tue Feb 5 14:31:49 2002 UTC (22 years, 2 months ago) by dpavlin
Branch: MAIN
Changes since 1.3: +8 -1 lines
File MIME type: text/plain
mice brojeve sa pocetka naslova, malo statistike

1 dpavlin 1.1 #!/usr/bin/perl -w
2    
3 dpavlin 1.3 # indexer, Dobrica Pavlinusic <dpavlin@rot13.org> 2001-01-28
4     # options: -q quiet
5     # -d debug
6     # -v verbose
7    
8 dpavlin 1.1 use strict;
9     use DBI;
10 dpavlin 1.3 use Getopt::Std;
11 dpavlin 1.1
12     my $sadrzaj=0;
13     my $nr=0;
14     my $naslov="";
15    
16     my $br; ## broj NN
17     my $god; ## godina NN
18     my $aname; ## ancor name na originalnim stranicama
19    
20     my $nn_dir="."; # dir u kojem su wget-ani fileovi
21    
22 dpavlin 1.3 my %opts;
23 dpavlin 1.4 getopts("vqd", \%opts);
24    
25     my $brojeva=0;
26     my $zakona=0;
27 dpavlin 1.1
28     #--------------------------------------------------------------------
29    
30     my @affix_regexp;
31     my @affix_add;
32     my @affix_sub;
33    
34     sub load_affix {
35     my ($filename) = @_;
36    
37     my $suffixes=0;
38    
39     my ($regexp,$add,$sub);
40    
41     open (A,$filename) || die "$filename: $!";
42     while(<A>) {
43     chomp;
44     next if (/^#|^[\s\t\n\r]*$/);
45    
46     if (/^suffixes/i) {
47     $suffixes++;
48     next;
49     }
50    
51     next if (! $suffixes);
52    
53     if (/^flag[\s\t]+\*{0,1}(.):/i) {
54     undef $regexp;
55     undef $add;
56     undef $sub;
57     next;
58     }
59    
60     if (/^[\s\t]*([^>#]+)>[\s\t]+-([^\,\s\t]+),([^\s\t]+)/) {
61     $regexp = $1;
62     $add = $2;
63     $sub = $3 if ($3 ne "-");
64     } elsif (/^[\s\t]*([^>#]+)>[\s\t]+([^\s\t\#]+)/) {
65     $regexp = $1;
66     $sub = $2;
67     }
68    
69     sub nuke_s {
70     my $tmp = $_[0];
71     return if (!$tmp);
72     $tmp=~s/^ *//g;
73     $tmp=~s/ *$//g;
74     $tmp=~s/ *//g;
75     return $tmp;
76     }
77    
78     push @affix_regexp,nuke_s($regexp);
79     push @affix_add,nuke_s($add);
80     push @affix_sub,nuke_s($sub);
81     }
82     }
83    
84     sub normalize_word {
85     my @out;
86     foreach my $word (@_) {
87     push @out,$word; # save original word
88     next if (length($word) < 3); # cludge: preskoci kratke
89     for(my $i=0; $i<=$#affix_regexp; $i++) {
90     my $regexp = $affix_regexp[$i];
91     my $add = $affix_add[$i];
92     my $sub = $affix_sub[$i];
93     next if length($word) < length($sub);
94     my $tmp_word = $word;
95     if ($sub) {
96     next if ($word !~ m/$sub$/i);
97     if ($add) {
98     $tmp_word =~ s/$sub$/$add/i;
99     } else {
100     $tmp_word =~ s/$sub$//i;
101     }
102     } else {
103     $tmp_word = $word.$add;
104     }
105     if ($tmp_word =~ m/$regexp/ix) {
106 dpavlin 1.3 print "$word -> $tmp_word\t-$sub, +$add, regexp: $regexp\n" if ($opts{d});
107 dpavlin 1.1 push @out,lc($tmp_word);
108     }
109     }
110     }
111     return @out;
112     }
113    
114    
115     load_affix("$nn_dir/search/croatian.aff");
116    
117    
118     #--------------------------------------------------------------------
119    
120     my $dbh = DBI->connect("DBI:Pg:dbname=nn","","") || die $DBI::errstr;
121    
122     $dbh->do("delete from nn") || die $dbh->errstr();
123    
124     opendir(DIR,$nn_dir) || warn "opendir: $!";
125     my @files = grep { /^CijeliBrojS/ && -f "$nn_dir/$_" } readdir(DIR);
126     closedir(DIR);
127    
128 dpavlin 1.3 my $sth=$dbh->prepare("insert into nn (br,god,nr,aname,title,title_czs) values (?,?,?,?,?,?)") || die $dbh->errstr();
129    
130    
131 dpavlin 1.1 foreach my $file (@files) {
132     open(IN,$file) || die "can't open $file: $!";
133    
134     if ($file=~m/god=(\d+)\&br=(\d+)/) {
135     ($br,$god) = ($2,$1);
136 dpavlin 1.3 print "$file -- $2 -- $1\n" if (! $opts{q});
137 dpavlin 1.4 $brojeva++;
138 dpavlin 1.1 }
139    
140     while(<IN>) {
141     chomp;
142     s/\015//g; # kill cr
143     tr/šðžèæŠÐŽÈÆ/¹ð¾èæ©Ð®ÈÆ/; # 1250 -> iso8859-2
144    
145 dpavlin 1.2 if (m,<div class=sadrzaj>,) {
146     $sadrzaj++;
147     next;
148 dpavlin 1.1 }
149 dpavlin 1.2
150 dpavlin 1.1 if ($sadrzaj && m,</div>,) {
151     $sadrzaj--;
152     $naslov=~s/\s+/ /g;
153     $naslov=~s/<[^>]+>//g;
154     $naslov=~s/^\s+//g;
155     $naslov=~s/\s+$//g;
156 dpavlin 1.3 print "$god $br $nr: $naslov\n" if ($opts{v});
157 dpavlin 1.1 my $naslov_czs = lc($naslov);
158     $naslov_czs =~ tr/¹©ðÐèÈæƾ®/sSdDcCcCzZ/;
159     $naslov_czs =~ tr/a-zA-Z/ /cs; # non a-z -> space
160     $naslov_czs = join(" ",normalize_word(split(/ /,$naslov_czs)));
161 dpavlin 1.3 $sth->execute($br,$god,$nr,$aname,$naslov,$naslov_czs) || die $dbh->errstr();
162 dpavlin 1.1 $naslov="";
163     $nr=0;
164 dpavlin 1.4 $zakona++;
165 dpavlin 1.1 }
166 dpavlin 1.2
167     if ($sadrzaj) {
168     if (s/<a href="#([^"]+)">\s*(\d+)\.\s*<[^>]+>//i) {
169     ($aname,$nr) = ($1,$2);
170 dpavlin 1.3 } elsif (s/<a href="Javascript:Mojdok\((\d+),(\d+),(\w+),(\d+)\)[^>]*>//i) {
171 dpavlin 1.2 ($nr,$aname) = ($3,$4);
172     die "conflict in godina: $1 != $god" if ($god != $1);
173     die "conflict in broj: $2 != $br" if ($br != $2);
174     } else {
175     die "can't find nr in line: $_";
176     }
177     $naslov.=$_;
178 dpavlin 1.4 $naslov=~s/^\s*$nr\.*\s*//g;
179 dpavlin 1.2 }
180    
181 dpavlin 1.1 }
182    
183     close(IN);
184     }
185    
186     $dbh->do("vacuum") || die $dbh->errstr();
187 dpavlin 1.4 print "Ukupno $brojeva brojeva NN, sa $zakona zakona...\n" if (! $opts{q});

  ViewVC Help
Powered by ViewVC 1.1.26