/[nn.old]/trunk/find3.pl
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/find3.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 40 - (hide annotations)
Thu Jan 31 16:54:31 2002 UTC (22 years, 2 months ago) by dpavlin
File MIME type: application/octet-stream
File size: 3366 byte(s)
Initial revision

1 dpavlin 40 #!/usr/bin/perl -w
2    
3     use strict;
4     use DBI;
5    
6     my $sadrzaj=0;
7     my $nr=0;
8     my $naslov="";
9    
10     my $br; ## broj NN
11     my $god; ## godina NN
12     my $aname; ## ancor name na originalnim stranicama
13    
14     my $nn_dir="."; # dir u kojem su wget-ani fileovi
15    
16    
17     #--------------------------------------------------------------------
18    
19     my @affix_regexp;
20     my @affix_add;
21     my @affix_sub;
22    
23     sub load_affix {
24     my ($filename) = @_;
25    
26     my $suffixes=0;
27    
28     my ($regexp,$add,$sub);
29    
30     open (A,$filename) || die "$filename: $!";
31     while(<A>) {
32     chomp;
33     next if (/^#|^[\s\t\n\r]*$/);
34    
35     if (/^suffixes/i) {
36     $suffixes++;
37     next;
38     }
39    
40     next if (! $suffixes);
41    
42     if (/^flag[\s\t]+\*{0,1}(.):/i) {
43     undef $regexp;
44     undef $add;
45     undef $sub;
46     next;
47     }
48    
49     if (/^[\s\t]*([^>#]+)>[\s\t]+-([^\,\s\t]+),([^\s\t]+)/) {
50     $regexp = $1;
51     $add = $2;
52     $sub = $3 if ($3 ne "-");
53     } elsif (/^[\s\t]*([^>#]+)>[\s\t]+([^\s\t\#]+)/) {
54     $regexp = $1;
55     $sub = $2;
56     }
57    
58     sub nuke_s {
59     my $tmp = $_[0];
60     return if (!$tmp);
61     $tmp=~s/^ *//g;
62     $tmp=~s/ *$//g;
63     $tmp=~s/ *//g;
64     return $tmp;
65     }
66    
67     push @affix_regexp,nuke_s($regexp);
68     push @affix_add,nuke_s($add);
69     push @affix_sub,nuke_s($sub);
70     }
71     }
72    
73     sub normalize_word {
74     my @out;
75     foreach my $word (@_) {
76     push @out,$word; # save original word
77     next if (length($word) < 3); # cludge: preskoci kratke
78     for(my $i=0; $i<=$#affix_regexp; $i++) {
79     my $regexp = $affix_regexp[$i];
80     my $add = $affix_add[$i];
81     my $sub = $affix_sub[$i];
82     next if length($word) < length($sub);
83     my $tmp_word = $word;
84     if ($sub) {
85     next if ($word !~ m/$sub$/i);
86     if ($add) {
87     $tmp_word =~ s/$sub$/$add/i;
88     } else {
89     $tmp_word =~ s/$sub$//i;
90     }
91     } else {
92     $tmp_word = $word.$add;
93     }
94     if ($tmp_word =~ m/$regexp/ix) {
95     # print "$word -> $tmp_word\t-$sub, +$add, regexp: $regexp\n";
96     push @out,lc($tmp_word);
97     }
98     }
99     }
100     return @out;
101     }
102    
103    
104     load_affix("$nn_dir/search/croatian.aff");
105    
106    
107     #--------------------------------------------------------------------
108    
109     my $dbh = DBI->connect("DBI:Pg:dbname=nn","","") || die $DBI::errstr;
110    
111     $dbh->do("delete from nn") || die $dbh->errstr();
112    
113     opendir(DIR,$nn_dir) || warn "opendir: $!";
114     my @files = grep { /^CijeliBrojS/ && -f "$nn_dir/$_" } readdir(DIR);
115     closedir(DIR);
116    
117     foreach my $file (@files) {
118     open(IN,$file) || die "can't open $file: $!";
119    
120     if ($file=~m/god=(\d+)\&br=(\d+)/) {
121     ($br,$god) = ($2,$1);
122     print "$file -- $2 -- $1\n";
123     }
124    
125     while(<IN>) {
126     chomp;
127     s/\015//g; # kill cr
128     tr/šðžèæŠÐŽÈÆ/¹ð¾èæ©Ð®ÈÆ/; # 1250 -> iso8859-2
129    
130     $sadrzaj++ if (m,<div class=sadrzaj>,);
131    
132     if ($sadrzaj) {
133     if (s/<a href="#([^"]+)">\s*(\d+)\.\s*<[^>]+>//i) {
134     ($aname,$nr) = ($1,$2);
135     $aname = $1 if ($aname =~ m/Javascript:Mojdok\(\d+,\d+,\d+,(\d+)\);/);
136     }
137     $naslov.=$_;
138     }
139    
140     if ($sadrzaj && m,</div>,) {
141     $sadrzaj--;
142     $naslov=~s/\s+/ /g;
143     $naslov=~s/<[^>]+>//g;
144     $naslov=~s/^\s+//g;
145     $naslov=~s/\s+$//g;
146     print "$god $br $nr: $naslov\n";
147     my $naslov_czs = lc($naslov);
148     $naslov_czs =~ tr/¹©ðÐèÈæƾ®/sSdDcCcCzZ/;
149     $naslov_czs =~ tr/a-zA-Z/ /cs; # non a-z -> space
150     $naslov_czs = join(" ",normalize_word(split(/ /,$naslov_czs)));
151     $dbh->do("insert into nn (br,god,nr,aname,title,title_czs) values ($br,$god,$nr,'$aname','$naslov','$naslov_czs')") || die $dbh->errstr();
152     $naslov="";
153     $nr=0;
154     }
155     }
156    
157     close(IN);
158     }
159    
160     $dbh->do("vacuum") || die $dbh->errstr();

Properties

Name Value
cvs2svn:cvs-rev 1.1
svn:executable *
svn:mime-type application/octet-stream

  ViewVC Help
Powered by ViewVC 1.1.26