/[nn]/find3.pl
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /find3.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1.1.1 - (show annotations) (vendor branch)
Thu Jan 31 16:54:32 2002 UTC (22 years, 2 months ago) by dpavlin
Branch: dbp
CVS Tags: r0
Changes since 1.1: +0 -0 lines
File MIME type: text/plain
initial import

1 #!/usr/bin/perl -w
2
3 use strict;
4 use DBI;
5
6 my $sadrzaj=0;
7 my $nr=0;
8 my $naslov="";
9
10 my $br; ## broj NN
11 my $god; ## godina NN
12 my $aname; ## ancor name na originalnim stranicama
13
14 my $nn_dir="."; # dir u kojem su wget-ani fileovi
15
16
17 #--------------------------------------------------------------------
18
19 my @affix_regexp;
20 my @affix_add;
21 my @affix_sub;
22
23 sub load_affix {
24 my ($filename) = @_;
25
26 my $suffixes=0;
27
28 my ($regexp,$add,$sub);
29
30 open (A,$filename) || die "$filename: $!";
31 while(<A>) {
32 chomp;
33 next if (/^#|^[\s\t\n\r]*$/);
34
35 if (/^suffixes/i) {
36 $suffixes++;
37 next;
38 }
39
40 next if (! $suffixes);
41
42 if (/^flag[\s\t]+\*{0,1}(.):/i) {
43 undef $regexp;
44 undef $add;
45 undef $sub;
46 next;
47 }
48
49 if (/^[\s\t]*([^>#]+)>[\s\t]+-([^\,\s\t]+),([^\s\t]+)/) {
50 $regexp = $1;
51 $add = $2;
52 $sub = $3 if ($3 ne "-");
53 } elsif (/^[\s\t]*([^>#]+)>[\s\t]+([^\s\t\#]+)/) {
54 $regexp = $1;
55 $sub = $2;
56 }
57
58 sub nuke_s {
59 my $tmp = $_[0];
60 return if (!$tmp);
61 $tmp=~s/^ *//g;
62 $tmp=~s/ *$//g;
63 $tmp=~s/ *//g;
64 return $tmp;
65 }
66
67 push @affix_regexp,nuke_s($regexp);
68 push @affix_add,nuke_s($add);
69 push @affix_sub,nuke_s($sub);
70 }
71 }
72
73 sub normalize_word {
74 my @out;
75 foreach my $word (@_) {
76 push @out,$word; # save original word
77 next if (length($word) < 3); # cludge: preskoci kratke
78 for(my $i=0; $i<=$#affix_regexp; $i++) {
79 my $regexp = $affix_regexp[$i];
80 my $add = $affix_add[$i];
81 my $sub = $affix_sub[$i];
82 next if length($word) < length($sub);
83 my $tmp_word = $word;
84 if ($sub) {
85 next if ($word !~ m/$sub$/i);
86 if ($add) {
87 $tmp_word =~ s/$sub$/$add/i;
88 } else {
89 $tmp_word =~ s/$sub$//i;
90 }
91 } else {
92 $tmp_word = $word.$add;
93 }
94 if ($tmp_word =~ m/$regexp/ix) {
95 # print "$word -> $tmp_word\t-$sub, +$add, regexp: $regexp\n";
96 push @out,lc($tmp_word);
97 }
98 }
99 }
100 return @out;
101 }
102
103
104 load_affix("$nn_dir/search/croatian.aff");
105
106
107 #--------------------------------------------------------------------
108
109 my $dbh = DBI->connect("DBI:Pg:dbname=nn","","") || die $DBI::errstr;
110
111 $dbh->do("delete from nn") || die $dbh->errstr();
112
113 opendir(DIR,$nn_dir) || warn "opendir: $!";
114 my @files = grep { /^CijeliBrojS/ && -f "$nn_dir/$_" } readdir(DIR);
115 closedir(DIR);
116
117 foreach my $file (@files) {
118 open(IN,$file) || die "can't open $file: $!";
119
120 if ($file=~m/god=(\d+)\&br=(\d+)/) {
121 ($br,$god) = ($2,$1);
122 print "$file -- $2 -- $1\n";
123 }
124
125 while(<IN>) {
126 chomp;
127 s/\015//g; # kill cr
128 tr/šðžèæŠÐŽÈÆ/¹ð¾èæ©Ð®ÈÆ/; # 1250 -> iso8859-2
129
130 $sadrzaj++ if (m,<div class=sadrzaj>,);
131
132 if ($sadrzaj) {
133 if (s/<a href="#([^"]+)">\s*(\d+)\.\s*<[^>]+>//i) {
134 ($aname,$nr) = ($1,$2);
135 $aname = $1 if ($aname =~ m/Javascript:Mojdok\(\d+,\d+,\d+,(\d+)\);/);
136 }
137 $naslov.=$_;
138 }
139
140 if ($sadrzaj && m,</div>,) {
141 $sadrzaj--;
142 $naslov=~s/\s+/ /g;
143 $naslov=~s/<[^>]+>//g;
144 $naslov=~s/^\s+//g;
145 $naslov=~s/\s+$//g;
146 print "$god $br $nr: $naslov\n";
147 my $naslov_czs = lc($naslov);
148 $naslov_czs =~ tr/¹©ðÐèÈæƾ®/sSdDcCcCzZ/;
149 $naslov_czs =~ tr/a-zA-Z/ /cs; # non a-z -> space
150 $naslov_czs = join(" ",normalize_word(split(/ /,$naslov_czs)));
151 $dbh->do("insert into nn (br,god,nr,aname,title,title_czs) values ($br,$god,$nr,'$aname','$naslov','$naslov_czs')") || die $dbh->errstr();
152 $naslov="";
153 $nr=0;
154 }
155 }
156
157 close(IN);
158 }
159
160 $dbh->do("vacuum") || die $dbh->errstr();

  ViewVC Help
Powered by ViewVC 1.1.26