Review notes (free text ahead of the first "---" header line):

* Second hunk: "getopt" was changed to "getopts".  Getopt::Std's getopt()
  treats every listed letter as a switch that takes an argument, so a lone
  "-q" would leave $opts{q} undefined and "-q -v" would store "-v" as the
  value of q.  getopts("vqd", ...) declares -v, -q and -d as plain boolean
  flags, which is what the "if ($opts{...})" tests in the later hunks and
  the option list in the new header comment expect.
* The source of this diff had all line breaks collapsed.  Line breaks and
  blank context lines were re-created so that every hunk matches the line
  counts in its "@@" header; indentation is a best guess.
  NOTE(review): confirm whitespace against the repository, or apply the
  patch with whitespace differences ignored.
* NOTE(review): the context line "while() {" and the "-" line of the last
  hunk look truncated (text between angle brackets appears to be missing),
  and the last hunk carries no "+" line although its header announces 7 new
  lines.  Both are reproduced exactly as found; restore them from the
  repository before applying the last hunk.

--- trunk/find3.pl	2002/02/05 12:58:30	50
+++ trunk/find3.pl	2002/02/05 13:42:10	53
@@ -1,7 +1,13 @@
 #!/usr/bin/perl -w
 
+# indexer, Dobrica Pavlinusic 2001-01-28
+# options: -q quiet
+#          -d debug
+#          -v verbose
+
 use strict;
 use DBI;
+use Getopt::Std;
 
 my $sadrzaj=0;
 my $nr=0;
@@ -13,6 +19,8 @@
 
 my $nn_dir=".";	# dir u kojem su wget-ani fileovi
 
+my %opts;
+getopts("vqd", \%opts);
 
 #--------------------------------------------------------------------
 
@@ -92,7 +100,7 @@
 				$tmp_word = $word.$add;
 			}
 			if ($tmp_word =~ m/$regexp/ix) {
-#				print "$word -> $tmp_word\t-$sub, +$add, regexp: $regexp\n";
+				print "$word -> $tmp_word\t-$sub, +$add, regexp: $regexp\n" if ($opts{d});
 				push @out,lc($tmp_word);
 			}
 		}
@@ -114,12 +122,15 @@
 my @files = grep { /^CijeliBrojS/ && -f "$nn_dir/$_" } readdir(DIR);
 closedir(DIR);
 
+my $sth=$dbh->prepare("insert into nn (br,god,nr,aname,title,title_czs) values (?,?,?,?,?,?)") || die $dbh->errstr();
+
+
 foreach my $file (@files) {
 	open(IN,$file) || die "can't open $file: $!";
 
 	if ($file=~m/god=(\d+)\&br=(\d+)/) {
 		($br,$god) = ($2,$1);
-		print "$file -- $2 -- $1\n";
+		print "$file -- $2 -- $1\n" if (! $opts{q});
 	}
 
 	while() {
@@ -138,12 +149,12 @@
 			$naslov=~s/<[^>]+>//g;
 			$naslov=~s/^\s+//g;
 			$naslov=~s/\s+$//g;
-			print "$god $br $nr: $naslov\n";
+			print "$god $br $nr: $naslov\n" if ($opts{v});
 			my $naslov_czs = lc($naslov);
 			$naslov_czs =~ tr/¹©ðÐèÈæƾ®/sSdDcCcCzZ/;
 			$naslov_czs =~ tr/a-zA-Z/ /cs;	# non a-z -> space
 			$naslov_czs = join(" ",normalize_word(split(/ /,$naslov_czs)));
-			$dbh->do("insert into nn (br,god,nr,aname,title,title_czs) values ($br,$god,$nr,'$aname','$naslov','$naslov_czs')") || die $dbh->errstr();
+			$sth->execute($br,$god,$nr,$aname,$naslov,$naslov_czs) || die $dbh->errstr();
 			$naslov="";
 			$nr=0;
 		}
@@ -151,7 +162,7 @@
 		if ($sadrzaj) {
 			if (s/\s*(\d+)\.\s*<[^>]+>//i) {
 				($aname,$nr) = ($1,$2);
-			} elsif (s/]*>//i) {
 				($nr,$aname) = ($3,$4);
 				die "conflict in godina: $1 != $god" if ($god != $1);
 				die "conflict in broj: $2 != $br" if ($br != $2);