/[Z3950-HTML-Scraper]/COBISS.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /COBISS.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 2 - (hide annotations)
Fri Jun 19 21:51:28 2009 UTC (14 years, 9 months ago) by dpavlin
File size: 1974 byte(s)
create MARC record

1 dpavlin 1 package COBISS;
2    
3     use warnings;
4     use strict;
5    
6     use WWW::Mechanize;
7 dpavlin 2 use MARC::Record;
8 dpavlin 1
9 dpavlin 2 binmode STDOUT, ':utf8';
10    
11     my $cobiss_marc21 = {
12     '010' => { a => [ '020', 'a' ] },
13     200 => {
14     a => [ 245 , 'a' ],
15     f => [ 245 , 'f' ],
16     },
17     205 => { a => [ 250 , 'a' ] },
18     210 => {
19     a => [ 250 , 'a' ],
20     c => [ 260 , 'b' ],
21     d => [ 260 , 'c' ],
22     },
23     215 => {
24     a => [ 300 , 'a' ],
25     c => [ 300 , 'b' ],
26     d => [ 300 , 'c' ],
27     },
28     700 => {
29     a => [ 100 , 'a' ],
30     },
31     };
32    
33 dpavlin 1 sub search {
34    
35     my $url = 'http://cobiss.izum.si/scripts/cobiss?ukaz=GETID&lani=en';
36    
37     warn "# get $url\n";
38    
39     my $mech = WWW::Mechanize->new();
40     $mech->get( $url );
41    
42     warn "# got session\n";
43    
44     $mech->follow_link( text_regex => qr/union/ );
45    
46     warn "# submit search\n";
47    
48     $mech->submit_form(
49     fields => {
50     'SS1' => 'Krleza',
51     },
52     );
53    
54     my $hits = 1;
55     if ( $mech->content =~ m{hits:\s*<b>\s*(\d+)\s*</b>}s ) {
56     $hits = $1;
57     } else {
58     warn "get't find results in ", $mech->content;
59     }
60    
61     warn "# got $hits results, get first one\n";
62    
63     $mech->follow_link( url_regex => qr/ukaz=DISP/ );
64    
65     warn "# in COMARC format\n";
66    
67     $mech->follow_link( url_regex => qr/fmt=13/ );
68    
69     my $comarc;
70    
71     if ( $mech->content =~ m{<pre>\s*(.+1\..+?)\s*</pre>}s ) {
72     my $comarc = $1;
73     $comarc =~ s{</?b>}{}gs;
74 dpavlin 2 $comarc =~ s{<font[^>]*>}{<s>}gs;
75     $comarc =~ s{</font>}{<e>}gs;
76 dpavlin 1
77     print $comarc;
78    
79 dpavlin 2 my $marc = MARC::Record->new;
80    
81     foreach my $line ( split(/[\r\n]+/, $comarc) ) {
82     our @f;
83    
84     if ( $line !~ s{(\d\d\d)([01 ])([01 ])}{} ) {
85     warn "SKIP: $line\n";
86     } else {
87     $line .= "<eol>";
88    
89     @f = ( $1, $2, $3 );
90     sub sf { warn "sf",@_,"|",@f; push @f, @_; }
91     $line =~ s{<s>(\w)<e>([^<]+)\s*}{sf($1, $2)}ges;
92     warn "# f:", join(' ', @f), " left:|$line|\n";
93     $marc->add_fields( @f );
94     }
95     }
96    
97     open(my $out, '>:utf8', 'out.marc');
98     print $out $marc->as_usmarc;
99     close($out);
100    
101     warn $marc->as_formatted;
102    
103 dpavlin 1 return $comarc;
104     } else {
105     die "can't fetch COMARC format from ", $mech->content;
106     }
107    
108     }
109    
110     1;

  ViewVC Help
Powered by ViewVC 1.1.26