1 |
dpavlin |
1.1 |
#!/usr/bin/perl -w |
2 |
|
|
|
3 |
|
|
use strict; |
4 |
|
|
use DBI; |
5 |
|
|
|
6 |
|
|
my $sadrzaj=0; |
7 |
|
|
my $nr=0; |
8 |
|
|
my $naslov=""; |
9 |
|
|
|
10 |
|
|
my $br; ## broj NN |
11 |
|
|
my $god; ## godina NN |
12 |
|
|
my $aname; ## ancor name na originalnim stranicama |
13 |
|
|
|
14 |
|
|
my $nn_dir="."; # dir u kojem su wget-ani fileovi |
15 |
|
|
|
16 |
|
|
|
17 |
|
|
#-------------------------------------------------------------------- |
18 |
|
|
|
19 |
|
|
my @affix_regexp; |
20 |
|
|
my @affix_add; |
21 |
|
|
my @affix_sub; |
22 |
|
|
|
23 |
|
|
sub load_affix { |
24 |
|
|
my ($filename) = @_; |
25 |
|
|
|
26 |
|
|
my $suffixes=0; |
27 |
|
|
|
28 |
|
|
my ($regexp,$add,$sub); |
29 |
|
|
|
30 |
|
|
open (A,$filename) || die "$filename: $!"; |
31 |
|
|
while(<A>) { |
32 |
|
|
chomp; |
33 |
|
|
next if (/^#|^[\s\t\n\r]*$/); |
34 |
|
|
|
35 |
|
|
if (/^suffixes/i) { |
36 |
|
|
$suffixes++; |
37 |
|
|
next; |
38 |
|
|
} |
39 |
|
|
|
40 |
|
|
next if (! $suffixes); |
41 |
|
|
|
42 |
|
|
if (/^flag[\s\t]+\*{0,1}(.):/i) { |
43 |
|
|
undef $regexp; |
44 |
|
|
undef $add; |
45 |
|
|
undef $sub; |
46 |
|
|
next; |
47 |
|
|
} |
48 |
|
|
|
49 |
|
|
if (/^[\s\t]*([^>#]+)>[\s\t]+-([^\,\s\t]+),([^\s\t]+)/) { |
50 |
|
|
$regexp = $1; |
51 |
|
|
$add = $2; |
52 |
|
|
$sub = $3 if ($3 ne "-"); |
53 |
|
|
} elsif (/^[\s\t]*([^>#]+)>[\s\t]+([^\s\t\#]+)/) { |
54 |
|
|
$regexp = $1; |
55 |
|
|
$sub = $2; |
56 |
|
|
} |
57 |
|
|
|
58 |
|
|
sub nuke_s { |
59 |
|
|
my $tmp = $_[0]; |
60 |
|
|
return if (!$tmp); |
61 |
|
|
$tmp=~s/^ *//g; |
62 |
|
|
$tmp=~s/ *$//g; |
63 |
|
|
$tmp=~s/ *//g; |
64 |
|
|
return $tmp; |
65 |
|
|
} |
66 |
|
|
|
67 |
|
|
push @affix_regexp,nuke_s($regexp); |
68 |
|
|
push @affix_add,nuke_s($add); |
69 |
|
|
push @affix_sub,nuke_s($sub); |
70 |
|
|
} |
71 |
|
|
} |
72 |
|
|
|
73 |
|
|
sub normalize_word { |
74 |
|
|
my @out; |
75 |
|
|
foreach my $word (@_) { |
76 |
|
|
push @out,$word; # save original word |
77 |
|
|
next if (length($word) < 3); # cludge: preskoci kratke |
78 |
|
|
for(my $i=0; $i<=$#affix_regexp; $i++) { |
79 |
|
|
my $regexp = $affix_regexp[$i]; |
80 |
|
|
my $add = $affix_add[$i]; |
81 |
|
|
my $sub = $affix_sub[$i]; |
82 |
|
|
next if length($word) < length($sub); |
83 |
|
|
my $tmp_word = $word; |
84 |
|
|
if ($sub) { |
85 |
|
|
next if ($word !~ m/$sub$/i); |
86 |
|
|
if ($add) { |
87 |
|
|
$tmp_word =~ s/$sub$/$add/i; |
88 |
|
|
} else { |
89 |
|
|
$tmp_word =~ s/$sub$//i; |
90 |
|
|
} |
91 |
|
|
} else { |
92 |
|
|
$tmp_word = $word.$add; |
93 |
|
|
} |
94 |
|
|
if ($tmp_word =~ m/$regexp/ix) { |
95 |
|
|
# print "$word -> $tmp_word\t-$sub, +$add, regexp: $regexp\n"; |
96 |
|
|
push @out,lc($tmp_word); |
97 |
|
|
} |
98 |
|
|
} |
99 |
|
|
} |
100 |
|
|
return @out; |
101 |
|
|
} |
102 |
|
|
|
103 |
|
|
|
104 |
|
|
load_affix("$nn_dir/search/croatian.aff"); |
105 |
|
|
|
106 |
|
|
|
107 |
|
|
#-------------------------------------------------------------------- |
108 |
|
|
|
109 |
|
|
my $dbh = DBI->connect("DBI:Pg:dbname=nn","","") || die $DBI::errstr; |
110 |
|
|
|
111 |
|
|
$dbh->do("delete from nn") || die $dbh->errstr(); |
112 |
|
|
|
113 |
|
|
opendir(DIR,$nn_dir) || warn "opendir: $!"; |
114 |
|
|
my @files = grep { /^CijeliBrojS/ && -f "$nn_dir/$_" } readdir(DIR); |
115 |
|
|
closedir(DIR); |
116 |
|
|
|
117 |
|
|
foreach my $file (@files) { |
118 |
|
|
open(IN,$file) || die "can't open $file: $!"; |
119 |
|
|
|
120 |
|
|
if ($file=~m/god=(\d+)\&br=(\d+)/) { |
121 |
|
|
($br,$god) = ($2,$1); |
122 |
|
|
print "$file -- $2 -- $1\n"; |
123 |
|
|
} |
124 |
|
|
|
125 |
|
|
while(<IN>) { |
126 |
|
|
chomp; |
127 |
|
|
s/\015//g; # kill cr |
128 |
|
|
tr/ðèæÐÈÆ/¹ð¾èæ©Ð®ÈÆ/; # 1250 -> iso8859-2 |
129 |
|
|
|
130 |
|
|
$sadrzaj++ if (m,<div class=sadrzaj>,); |
131 |
|
|
|
132 |
|
|
if ($sadrzaj) { |
133 |
|
|
if (s/<a href="#([^"]+)">\s*(\d+)\.\s*<[^>]+>//i) { |
134 |
|
|
($aname,$nr) = ($1,$2); |
135 |
|
|
$aname = $1 if ($aname =~ m/Javascript:Mojdok\(\d+,\d+,\d+,(\d+)\);/); |
136 |
|
|
} |
137 |
|
|
$naslov.=$_; |
138 |
|
|
} |
139 |
|
|
|
140 |
|
|
if ($sadrzaj && m,</div>,) { |
141 |
|
|
$sadrzaj--; |
142 |
|
|
$naslov=~s/\s+/ /g; |
143 |
|
|
$naslov=~s/<[^>]+>//g; |
144 |
|
|
$naslov=~s/^\s+//g; |
145 |
|
|
$naslov=~s/\s+$//g; |
146 |
|
|
print "$god $br $nr: $naslov\n"; |
147 |
|
|
my $naslov_czs = lc($naslov); |
148 |
|
|
$naslov_czs =~ tr/¹©ðÐèÈæƾ®/sSdDcCcCzZ/; |
149 |
|
|
$naslov_czs =~ tr/a-zA-Z/ /cs; # non a-z -> space |
150 |
|
|
$naslov_czs = join(" ",normalize_word(split(/ /,$naslov_czs))); |
151 |
|
|
$dbh->do("insert into nn (br,god,nr,aname,title,title_czs) values ($br,$god,$nr,'$aname','$naslov','$naslov_czs')") || die $dbh->errstr(); |
152 |
|
|
$naslov=""; |
153 |
|
|
$nr=0; |
154 |
|
|
} |
155 |
|
|
} |
156 |
|
|
|
157 |
|
|
close(IN); |
158 |
|
|
} |
159 |
|
|
|
160 |
|
|
$dbh->do("vacuum") || die $dbh->errstr(); |