/[Z3950-HTML-Scraper]/Aleph.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /Aleph.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 12 - (show annotations)
Fri Oct 22 20:49:16 2010 UTC (8 years, 7 months ago) by dpavlin
File size: 3416 byte(s)
generate marc record

1 package Aleph;
2
3 use warnings;
4 use strict;
5
6 use WWW::Mechanize;
7 use MARC::Record;
8 use Data::Dump qw/dump/;
9
10 binmode STDOUT, ':utf8';
11
12 our $mech = WWW::Mechanize->new();
13 our $hits;
14
15 sub diag {
16 print "# ", @_, $/;
17 }
18
19 # Koha Z39.50 query:
20 #
21 # Bib-1 @and @and @and @and @and @and @and @or
22 # @attr 1=8 isbn-issn
23 # @attr 1=7 isbn-issn
24 # @attr 1=4 title
25 # @attr 1=1003 author
26 # @attr 1=16 dewey
27 # @attr 1=21 subject-holding
28 # @attr 1=12 control-no
29 # @attr 1=1007 standard-id
30 # @attr 1=1016 any
31
32 # LCC - Klasifikacija Kongresne knjižnice
33 # LCN - Signatura Kongresne knjižnice
34 # DDC - Deweyjeva klasifikacija
35 # TIT - Naslovi
36 # AUT - Autori
37 # IMP - Impresum
38 # SUB - Predmetnice
39 # SRS - Nakladnička cjelina
40 # LOC - Lokacija
41 # WRD - Riječi
42 # WTI - Riječi u polju naslova
43 # WAU - Riječi u polju autora
44 # WPE - Riječi u polju individualnog autora
45 # WCO - Riječi u polju korporativnog autora
46 # WME - Riječi u polju sastanka
47 # WUT - Riječi u polju jedinstvenog naslova
48 # WPL - Riječi u polju mjesta izdavanja
49 # WPU - Riječi u polju nakladnika
50 # WSU - Riječi u polju predmetnica
51 # WSM - Riječi u predmetnicama MeSH-a
52 # WST - Riječi u polju status
53 # WGA - Riječi u geografskim odrednicama
54 # WYR - Godina izdavanja
55
56 our $usemap = {
57 # 8 => '',
58 # 7 => '',
59 4 => 'WTI',
60 1003 => 'WTI',
61 16 => 'CU',
62 21 => 'SU',
63 # 12 => '',
64 # 1007 => '',
65 # 1016 => '',
66
67 };
68
69 sub usemap {
70 my $f = shift || die;
71 $usemap->{$f};
72 }
73
74 sub search {
75 my ( $self, $query ) = @_;
76
77 die "need query" unless defined $query;
78
79 my $url = 'http://161.53.240.197:8991/F?RN=' . rand(1000000000);
80 # fake JavaScript code on page which creates random session
81
82 diag "get $url";
83
84 $mech->get( $url );
85
86 diag "advanced search";
87
88 $mech->follow_link( url_regex => qr/find-c/ );
89
90 diag "submit search $query";
91
92 $mech->submit_form(
93 fields => {
94 'ccl_term' => $query,
95 },
96 );
97
98 $hits = 0;
99 if ( $mech->content =~ m{ukupno\s+(\d+).*(do\s+(\d+))}s ) {
100 $hits = $1;
101 $hits = $2 if $2 && $2 < $1; # correct for max. results
102 } else {
103 diag "get't find results in ", $mech->content;
104 return;
105 }
106
107 diag "got $hits results, get first one";
108
109 $mech->follow_link( url_regex => qr/set_entry=000001/ );
110
111 diag "in MARC format";
112
113 $mech->follow_link( url_regex => qr/format=001/ );
114 }
115
116
117 sub next_marc {
118 my ($self,$format) = @_;
119
120 print $mech->content;
121
122 if ( $mech->content =~ m{Zapis\s+(\d+)}s ) {
123
124 my $nr = $1;
125
126 diag "parse $nr";
127
128 my $marc = MARC::Record->new;
129
130 my $html = $mech->content;
131 my $hash;
132
133 sub field {
134 my ( $f, $v ) = @_;
135 $v =~ s/\Q&nbsp;\E/ /gs;
136 warn "# $f\t$v\n";
137 $hash->{$f} = $v;
138 my ($i1,$i2) = (' ',' ');
139 ($i1,$i2) = ($2,$3) if $f =~ s/^(...)(.)?(.)?/$1/;
140 my @sf = split(/\|/, $v);
141 shift @sf;
142 @sf = map { s/^(\w)\s+//; { $1 => $_ } } @sf;
143 diag "sf = ", dump(@sf);
144 $marc->add_fields( $f, $i1, $i2, @sf ) if $f =~ m/^\d+$/;
145 }
146
147 $html =~ s|<tr>\s*<td class=td1 id=bold[^>]*>(.+?)</td>\s*<td class=td1>(.+?)</td>|field($1,$2)|ges;
148 diag dump($hash);
149
150 my $id = $hash->{SYS} || die "no SYS";
151
152
153
154 my $path = "marc/$id.$format";
155
156 open(my $out, '>:utf8', $path);
157 print $out $marc->as_usmarc;
158 close($out);
159
160 diag "created $path ", -s $path, " bytes";
161
162 diag $marc->as_formatted;
163
164 $nr++;
165
166 die if $nr == 3; # FIXME
167
168 $mech->follow_link( url_regex => qr/set_entry=0*$nr/ );
169
170 return $marc->as_usmarc;
171 } else {
172 die "can't fetch COMARC format from ", $mech->content;
173 }
174
175 }
176
177 1;

  ViewVC Help
Powered by ViewVC 1.1.26