/[nn.old]/trunk/swish/html2xml.pl
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/swish/html2xml.pl

Parent Directory | Revision Log | View Patch

revision 66 by dpavlin, Wed Jun 19 11:20:41 2002 UTC | revision 67 by dpavlin, Wed Jun 19 12:33:23 2002 UTC
# Line 16  Line 16 
16  use strict;  use strict;
17  use Getopt::Std;  use Getopt::Std;
18  use Lingua::Spelling::Alternative;  use Lingua::Spelling::Alternative;
19    require Unicode::Map8;
20    
21  my $sadrzaj=0;  my $sadrzaj=0;
22  my $nr=0;  my $nr=0;
# Line 26  my $god;       ## godina NN Line 27  my $god;       ## godina NN
27  my $aname;      ## ancor name na originalnim stranicama  my $aname;      ## ancor name na originalnim stranicama
28    
29  my $nn_dir="../";               # dir u kojem su wget-ani fileovi  my $nn_dir="../";               # dir u kojem su wget-ani fileovi
30    my $url="http://www.nn.hr/CijeliBrojS.asp?god=%d&br=%d&mid=%d#%d";
31    
32  my %opts;  my %opts;
33  getopts("vqdl:", \%opts);  getopts("vqdl:", \%opts);
# Line 37  my $hr = new Lingua::Spelling::Alternati Line 39  my $hr = new Lingua::Spelling::Alternati
39  #$hr->load_affix("$nn_dir/search/croatian.aff");  #$hr->load_affix("$nn_dir/search/croatian.aff");
40  $hr->load_findaffix("$nn_dir/prvih_50.txt");  $hr->load_findaffix("$nn_dir/prvih_50.txt");
41    
42    my $l2_map = Unicode::Map8->new("ISO-8859-2") || die;
43    
44    #--------------------------------------------------------------------
45    sub dump_to_swish {
46            my $xml = shift @_;
47            my ($god,$br,$nr,$aname) = @_;
48    
49            use utf8;
50    
51            print   "Path-Name: ".sprintf($url,$god,$br,$nr,$aname)."\n".
52                    "Content-Length: ".length($xml)."\n".
53                    "Document-Type: XML\n".
54                    "\n$xml";
55    }
56    
57  #--------------------------------------------------------------------  #--------------------------------------------------------------------
58    
59    
60  opendir(DIR,$nn_dir) || warn "opendir: $!";  opendir(DIR,$nn_dir) || warn "opendir: $!";
61  my @files;  my @files;
62  if ($opts{l}) {  if ($opts{l}) {
# Line 51  if ($opts{l}) { Line 68  if ($opts{l}) {
68  }  }
69  closedir(DIR);  closedir(DIR);
70    
 print "<xml>\n";  
   
71  foreach my $file (@files) {  foreach my $file (@files) {
72          open(IN,"$nn_dir/$file") || die "can't open '$nn_dir/$file': $!";          open(IN,"$nn_dir/$file") || die "can't open '$nn_dir/$file': $!";
73    
# Line 84  foreach my $file (@files) { Line 99  foreach my $file (@files) {
99                          $naslov_czs =~ tr/a-zA-Z/ /cs;  # non a-z  -> space                          $naslov_czs =~ tr/a-zA-Z/ /cs;  # non a-z  -> space
100                          $naslov_czs = join(" ",$hr->alternatives(split(/ /,$naslov_czs)));                          $naslov_czs = join(" ",$hr->alternatives(split(/ /,$naslov_czs)));
101  #                       $naslov_czs = $hr->minimal(split(/ /,$naslov_czs));  #                       $naslov_czs = $hr->minimal(split(/ /,$naslov_czs));
102                          print "<br>$br</br><god>$god</god><nr>$nr</nr><aname>$aname</aname>\n<naslov>$naslov</naslov>\n<naslov_czs>$naslov_czs</naslov_czs>\n";                          my $xml="<nn>\n<br>$br</br>\n<god>$god</god>\n<nr>$nr</nr>\n<aname>$aname</aname>\n";
103                            $xml.="<naslov>". $l2_map->tou($naslov)->utf8 ."</naslov>\n";
104                            $xml.="<naslov_czs>$naslov_czs</naslov_czs>\n</nn>\n\n";
105                            dump_to_swish($xml,$god,$br,$nr,$aname);
106                            
107                          $naslov="";                          $naslov="";
108                          $nr=0;                          $nr=0;
109                          $zakona++;                          $zakona++;
# Line 109  foreach my $file (@files) { Line 128  foreach my $file (@files) {
128          close(IN);          close(IN);
129  }  }
130    
 print "</xml>\n";  
   
131  print STDERR "Ukupno $brojeva brojeva NN, sa $zakona zakona...\n" if (! $opts{q});  print STDERR "Ukupno $brojeva brojeva NN, sa $zakona zakona...\n" if (! $opts{q});
132    

Legend:
Removed from v.66  
changed lines
  Added in v.67

  ViewVC Help
Powered by ViewVC 1.1.26