/[nn]/find3.pl
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /find3.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (show annotations)
Wed Jun 19 10:58:59 2002 UTC (17 years, 2 months ago) by dpavlin
Branch: MAIN
CVS Tags: HEAD
Changes since 1.4: +1 -1 lines
File MIME type: text/plain
fix date in header

1 #!/usr/bin/perl -w
2
3 # indexer, Dobrica Pavlinusic <dpavlin@rot13.org> 2002-01-28
4 # options: -q quiet
5 # -d debug
6 # -v verbose
7
8 use strict;
9 use DBI;
10 use Getopt::Std;
11
12 my $sadrzaj=0;
13 my $nr=0;
14 my $naslov="";
15
16 my $br; ## broj NN
17 my $god; ## godina NN
18 my $aname; ## ancor name na originalnim stranicama
19
20 my $nn_dir="."; # dir u kojem su wget-ani fileovi
21
22 my %opts;
23 getopts("vqd", \%opts);
24
25 my $brojeva=0;
26 my $zakona=0;
27
28 #--------------------------------------------------------------------
29
30 my @affix_regexp;
31 my @affix_add;
32 my @affix_sub;
33
34 sub load_affix {
35 my ($filename) = @_;
36
37 my $suffixes=0;
38
39 my ($regexp,$add,$sub);
40
41 open (A,$filename) || die "$filename: $!";
42 while(<A>) {
43 chomp;
44 next if (/^#|^[\s\t\n\r]*$/);
45
46 if (/^suffixes/i) {
47 $suffixes++;
48 next;
49 }
50
51 next if (! $suffixes);
52
53 if (/^flag[\s\t]+\*{0,1}(.):/i) {
54 undef $regexp;
55 undef $add;
56 undef $sub;
57 next;
58 }
59
60 if (/^[\s\t]*([^>#]+)>[\s\t]+-([^\,\s\t]+),([^\s\t]+)/) {
61 $regexp = $1;
62 $add = $2;
63 $sub = $3 if ($3 ne "-");
64 } elsif (/^[\s\t]*([^>#]+)>[\s\t]+([^\s\t\#]+)/) {
65 $regexp = $1;
66 $sub = $2;
67 }
68
69 sub nuke_s {
70 my $tmp = $_[0];
71 return if (!$tmp);
72 $tmp=~s/^ *//g;
73 $tmp=~s/ *$//g;
74 $tmp=~s/ *//g;
75 return $tmp;
76 }
77
78 push @affix_regexp,nuke_s($regexp);
79 push @affix_add,nuke_s($add);
80 push @affix_sub,nuke_s($sub);
81 }
82 }
83
84 sub normalize_word {
85 my @out;
86 foreach my $word (@_) {
87 push @out,$word; # save original word
88 next if (length($word) < 3); # cludge: preskoci kratke
89 for(my $i=0; $i<=$#affix_regexp; $i++) {
90 my $regexp = $affix_regexp[$i];
91 my $add = $affix_add[$i];
92 my $sub = $affix_sub[$i];
93 next if length($word) < length($sub);
94 my $tmp_word = $word;
95 if ($sub) {
96 next if ($word !~ m/$sub$/i);
97 if ($add) {
98 $tmp_word =~ s/$sub$/$add/i;
99 } else {
100 $tmp_word =~ s/$sub$//i;
101 }
102 } else {
103 $tmp_word = $word.$add;
104 }
105 if ($tmp_word =~ m/$regexp/ix) {
106 print "$word -> $tmp_word\t-$sub, +$add, regexp: $regexp\n" if ($opts{d});
107 push @out,lc($tmp_word);
108 }
109 }
110 }
111 return @out;
112 }
113
114
115 load_affix("$nn_dir/search/croatian.aff");
116
117
118 #--------------------------------------------------------------------
119
120 my $dbh = DBI->connect("DBI:Pg:dbname=nn","","") || die $DBI::errstr;
121
122 $dbh->do("delete from nn") || die $dbh->errstr();
123
124 opendir(DIR,$nn_dir) || warn "opendir: $!";
125 my @files = grep { /^CijeliBrojS/ && -f "$nn_dir/$_" } readdir(DIR);
126 closedir(DIR);
127
128 my $sth=$dbh->prepare("insert into nn (br,god,nr,aname,title,title_czs) values (?,?,?,?,?,?)") || die $dbh->errstr();
129
130
131 foreach my $file (@files) {
132 open(IN,$file) || die "can't open $file: $!";
133
134 if ($file=~m/god=(\d+)\&br=(\d+)/) {
135 ($br,$god) = ($2,$1);
136 print "$file -- $2 -- $1\n" if (! $opts{q});
137 $brojeva++;
138 }
139
140 while(<IN>) {
141 chomp;
142 s/\015//g; # kill cr
143 tr/šðžèæŠÐŽÈÆ/¹ð¾èæ©Ð®ÈÆ/; # 1250 -> iso8859-2
144
145 if (m,<div class=sadrzaj>,) {
146 $sadrzaj++;
147 next;
148 }
149
150 if ($sadrzaj && m,</div>,) {
151 $sadrzaj--;
152 $naslov=~s/\s+/ /g;
153 $naslov=~s/<[^>]+>//g;
154 $naslov=~s/^\s+//g;
155 $naslov=~s/\s+$//g;
156 print "$god $br $nr: $naslov\n" if ($opts{v});
157 my $naslov_czs = lc($naslov);
158 $naslov_czs =~ tr/¹©ðÐèÈæƾ®/sSdDcCcCzZ/;
159 $naslov_czs =~ tr/a-zA-Z/ /cs; # non a-z -> space
160 $naslov_czs = join(" ",normalize_word(split(/ /,$naslov_czs)));
161 $sth->execute($br,$god,$nr,$aname,$naslov,$naslov_czs) || die $dbh->errstr();
162 $naslov="";
163 $nr=0;
164 $zakona++;
165 }
166
167 if ($sadrzaj) {
168 if (s/<a href="#([^"]+)">\s*(\d+)\.\s*<[^>]+>//i) {
169 ($aname,$nr) = ($1,$2);
170 } elsif (s/<a href="Javascript:Mojdok\((\d+),(\d+),(\w+),(\d+)\)[^>]*>//i) {
171 ($nr,$aname) = ($3,$4);
172 die "conflict in godina: $1 != $god" if ($god != $1);
173 die "conflict in broj: $2 != $br" if ($br != $2);
174 } else {
175 die "can't find nr in line: $_";
176 }
177 $naslov.=$_;
178 $naslov=~s/^\s*$nr\.*\s*//g;
179 }
180
181 }
182
183 close(IN);
184 }
185
186 $dbh->do("vacuum") || die $dbh->errstr();
187 print "Ukupno $brojeva brojeva NN, sa $zakona zakona...\n" if (! $opts{q});

  ViewVC Help
Powered by ViewVC 1.1.26