viewvc.cgi/nn/find3.pl

#!/usr/bin/perl -w

use strict;
use DBI;

my $sadrzaj=0;
my $nr=0;
my $naslov="";

my $br;         ## broj NN
my $god;        ## godina NN
my $aname;      ## ancor name na originalnim stranicama

my $nn_dir=".";         # dir u kojem su wget-ani fileovi


#--------------------------------------------------------------------

my @affix_regexp;
my @affix_add;
my @affix_sub;

sub load_affix {
        my ($filename) = @_;

        my $suffixes=0;

        my ($regexp,$add,$sub);

        open (A,$filename) || die "$filename: $!";
        while(<A>) {
                chomp;
                next if (/^#|^[\s\t\n\r]*$/);

                if (/^suffixes/i) {
                        $suffixes++;
                        next;
                }

                next if (! $suffixes);

                if (/^flag[\s\t]+\*{0,1}(.):/i) {
                        undef $regexp;
                        undef $add;
                        undef $sub;
                        next;
                }

                if (/^[\s\t]*([^>#]+)>[\s\t]+-([^\,\s\t]+),([^\s\t]+)/) {
                        $regexp = $1;
                        $add = $2;
                        $sub = $3 if ($3 ne "-");
                } elsif (/^[\s\t]*([^>#]+)>[\s\t]+([^\s\t\#]+)/) {
                        $regexp = $1;
                        $sub = $2;
                }

                sub nuke_s {
                        my $tmp = $_[0];
                        return if (!$tmp);
                        $tmp=~s/^ *//g;
                        $tmp=~s/ *$//g;
                        $tmp=~s/ *//g;
                        return $tmp;
                }

                push @affix_regexp,nuke_s($regexp);
                push @affix_add,nuke_s($add);
                push @affix_sub,nuke_s($sub);
        }
}

sub normalize_word {
        my @out;
        foreach my $word (@_) {
                push @out,$word;                # save original word
                next if (length($word) < 3);    # cludge: preskoci kratke
                for(my $i=0; $i<=$#affix_regexp; $i++) {
                        my $regexp = $affix_regexp[$i];
                        my $add = $affix_add[$i];
                        my $sub = $affix_sub[$i];
                        next if length($word) < length($sub);
                        my $tmp_word = $word;
                        if ($sub) {
                                next if ($word !~ m/$sub$/i);
                                if ($add) {
                                        $tmp_word =~ s/$sub$/$add/i;
                                } else {
                                        $tmp_word =~ s/$sub$//i;
                                }
                        } else {
                                $tmp_word = $word.$add;
                        }
                        if ($tmp_word =~ m/$regexp/ix) {
#                               print "$word -> $tmp_word\t-$sub, +$add, regexp: $regexp\n";
                                push @out,lc($tmp_word);
                        }
                }
        }
        return @out;
}
                

load_affix("$nn_dir/search/croatian.aff");


#--------------------------------------------------------------------

my $dbh = DBI->connect("DBI:Pg:dbname=nn","","") || die $DBI::errstr;

$dbh->do("delete from nn") || die $dbh->errstr();

opendir(DIR,$nn_dir) || warn "opendir: $!";
my @files = grep { /^CijeliBrojS/ && -f "$nn_dir/$_" } readdir(DIR);
closedir(DIR);

foreach my $file (@files) {
        open(IN,$file) || die "can't open $file: $!";

        if ($file=~m/god=(\d+)\&br=(\d+)/) {
                ($br,$god) = ($2,$1);
                print "$file -- $2 -- $1\n";
        }

        while(<IN>) {
                chomp;
                s/\015//g;      # kill cr
                tr/ðèæÐÈÆ/¹ð¾èæ©Ð®ÈÆ/;      # 1250 -> iso8859-2

                $sadrzaj++ if (m,<div class=sadrzaj>,);
        
                if ($sadrzaj) {
                        if (s/<a href="#([^"]+)">\s*(\d+)\.\s*<[^>]+>//i) {
                                ($aname,$nr) = ($1,$2);
                                $aname = $1 if ($aname =~ m/Javascript:Mojdok\(\d+,\d+,\d+,(\d+)\);/);
                        }
                        $naslov.=$_;
                }
                
                if ($sadrzaj && m,</div>,) {
                        $sadrzaj--;
                        $naslov=~s/\s+/ /g;
                        $naslov=~s/<[^>]+>//g;
                        $naslov=~s/^\s+//g;
                        $naslov=~s/\s+$//g;
                        print "$god $br $nr: $naslov\n";
                        my $naslov_czs = lc($naslov);
                        $naslov_czs =~ tr/¹©ðÐèÈæÆ¾®/sSdDcCcCzZ/;
                        $naslov_czs =~ tr/a-zA-Z/ /cs;  # non a-z  -> space
                        $naslov_czs = join(" ",normalize_word(split(/ /,$naslov_czs)));
                        $dbh->do("insert into nn (br,god,nr,aname,title,title_czs) values ($br,$god,$nr,'$aname','$naslov','$naslov_czs')") || die $dbh->errstr();
                        $naslov="";
                        $nr=0;
                }
        }

        close(IN);
}

$dbh->do("vacuum") || die $dbh->errstr();
1	dpavlin	1.1	#!/usr/bin/perl -w
2
3			use strict;
4			use DBI;
5
6			my $sadrzaj=0;
7			my $nr=0;
8			my $naslov="";
9
10			my $br; ## broj NN
11			my $god; ## godina NN
12			my $aname; ## ancor name na originalnim stranicama
13
14			my $nn_dir="."; # dir u kojem su wget-ani fileovi
15
16
17			#--------------------------------------------------------------------
18
19			my @affix_regexp;
20			my @affix_add;
21			my @affix_sub;
22
23			sub load_affix {
24			my ($filename) = @_;
25
26			my $suffixes=0;
27
28			my ($regexp,$add,$sub);
29
30			open (A,$filename) \|\| die "$filename: $!";
31			while(<A>) {
32			chomp;
33			next if (/^#\|^[\s\t\n\r]*$/);
34
35			if (/^suffixes/i) {
36			$suffixes++;
37			next;
38			}
39
40			next if (! $suffixes);
41
42			if (/^flag[\s\t]+\*{0,1}(.):/i) {
43			undef $regexp;
44			undef $add;
45			undef $sub;
46			next;
47			}
48
49			if (/^[\s\t]*([^>#]+)>[\s\t]+-([^\,\s\t]+),([^\s\t]+)/) {
50			$regexp = $1;
51			$add = $2;
52			$sub = $3 if ($3 ne "-");
53			} elsif (/^[\s\t]*([^>#]+)>[\s\t]+([^\s\t\#]+)/) {
54			$regexp = $1;
55			$sub = $2;
56			}
57
58			sub nuke_s {
59			my $tmp = $_[0];
60			return if (!$tmp);
61			$tmp=~s/^ *//g;
62			$tmp=~s/ *$//g;
63			$tmp=~s/ *//g;
64			return $tmp;
65			}
66
67			push @affix_regexp,nuke_s($regexp);
68			push @affix_add,nuke_s($add);
69			push @affix_sub,nuke_s($sub);
70			}
71			}
72
73			sub normalize_word {
74			my @out;
75			foreach my $word (@_) {
76			push @out,$word; # save original word
77			next if (length($word) < 3); # cludge: preskoci kratke
78			for(my $i=0; $i<=$#affix_regexp; $i++) {
79			my $regexp = $affix_regexp[$i];
80			my $add = $affix_add[$i];
81			my $sub = $affix_sub[$i];
82			next if length($word) < length($sub);
83			my $tmp_word = $word;
84			if ($sub) {
85			next if ($word !~ m/$sub$/i);
86			if ($add) {
87			$tmp_word =~ s/$sub$/$add/i;
88			} else {
89			$tmp_word =~ s/$sub$//i;
90			}
91			} else {
92			$tmp_word = $word.$add;
93			}
94			if ($tmp_word =~ m/$regexp/ix) {
95			# print "$word -> $tmp_word\t-$sub, +$add, regexp: $regexp\n";
96			push @out,lc($tmp_word);
97			}
98			}
99			}
100			return @out;
101			}
102
103
104			load_affix("$nn_dir/search/croatian.aff");
105
106
107			#--------------------------------------------------------------------
108
109			my $dbh = DBI->connect("DBI:Pg:dbname=nn","","") \|\| die $DBI::errstr;
110
111			$dbh->do("delete from nn") \|\| die $dbh->errstr();
112
113			opendir(DIR,$nn_dir) \|\| warn "opendir: $!";
114			my @files = grep { /^CijeliBrojS/ && -f "$nn_dir/$_" } readdir(DIR);
115			closedir(DIR);
116
117			foreach my $file (@files) {
118			open(IN,$file) \|\| die "can't open $file: $!";
119
120			if ($file=~m/god=(\d+)\&br=(\d+)/) {
121			($br,$god) = ($2,$1);
122			print "$file -- $2 -- $1\n";
123			}
124
125			while(<IN>) {
126			chomp;
127			s/\015//g; # kill cr
128			tr/ðèæÐÈÆ/¹ð¾èæ©Ð®ÈÆ/; # 1250 -> iso8859-2
129
130			$sadrzaj++ if (m,<div class=sadrzaj>,);
131
132			if ($sadrzaj) {
133			if (s/<a href="#([^"]+)">\s(\d+)\.\s<[^>]+>//i) {
134			($aname,$nr) = ($1,$2);
135			$aname = $1 if ($aname =~ m/Javascript:Mojdok\(\d+,\d+,\d+,(\d+)\);/);
136			}
137			$naslov.=$_;
138			}
139
140			if ($sadrzaj && m,</div>,) {
141			$sadrzaj--;
142			$naslov=~s/\s+/ /g;
143			$naslov=~s/<[^>]+>//g;
144			$naslov=~s/^\s+//g;
145			$naslov=~s/\s+$//g;
146			print "$god $br $nr: $naslov\n";
147			my $naslov_czs = lc($naslov);
148			$naslov_czs =~ tr/¹©ðÐèÈæÆ¾®/sSdDcCcCzZ/;
149			$naslov_czs =~ tr/a-zA-Z/ /cs; # non a-z -> space
150			$naslov_czs = join(" ",normalize_word(split(/ /,$naslov_czs)));
151			$dbh->do("insert into nn (br,god,nr,aname,title,title_czs) values ($br,$god,$nr,'$aname','$naslov','$naslov_czs')") \|\| die $dbh->errstr();
152			$naslov="";
153			$nr=0;
154			}
155			}
156
157			close(IN);
158			}
159
160			$dbh->do("vacuum") \|\| die $dbh->errstr();