/[Z3950-HTML-Scraper]/Aleph.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /Aleph.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 11 - (show annotations)
Fri Oct 22 20:25:51 2010 UTC (13 years, 6 months ago) by dpavlin
File size: 3111 byte(s)
basic parser for Aleph html at NSK

1 package Aleph;
2
3 use warnings;
4 use strict;
5
6 use WWW::Mechanize;
7 use MARC::Record;
8 use Data::Dump qw/dump/;
9
10 binmode STDOUT, ':utf8';
11
12 our $mech = WWW::Mechanize->new();
13 our $hits;
14
15 sub diag {
16 print "# ", @_, $/;
17 }
18
19 # Koha Z39.50 query:
20 #
21 # Bib-1 @and @and @and @and @and @and @and @or
22 # @attr 1=8 isbn-issn
23 # @attr 1=7 isbn-issn
24 # @attr 1=4 title
25 # @attr 1=1003 author
26 # @attr 1=16 dewey
27 # @attr 1=21 subject-holding
28 # @attr 1=12 control-no
29 # @attr 1=1007 standard-id
30 # @attr 1=1016 any
31
32 # LCC - Klasifikacija Kongresne knjižnice
33 # LCN - Signatura Kongresne knjižnice
34 # DDC - Deweyjeva klasifikacija
35 # TIT - Naslovi
36 # AUT - Autori
37 # IMP - Impresum
38 # SUB - Predmetnice
39 # SRS - Nakladnička cjelina
40 # LOC - Lokacija
41 # WRD - Riječi
42 # WTI - Riječi u polju naslova
43 # WAU - Riječi u polju autora
44 # WPE - Riječi u polju individualnog autora
45 # WCO - Riječi u polju korporativnog autora
46 # WME - Riječi u polju sastanka
47 # WUT - Riječi u polju jedinstvenog naslova
48 # WPL - Riječi u polju mjesta izdavanja
49 # WPU - Riječi u polju nakladnika
50 # WSU - Riječi u polju predmetnica
51 # WSM - Riječi u predmetnicama MeSH-a
52 # WST - Riječi u polju status
53 # WGA - Riječi u geografskim odrednicama
54 # WYR - Godina izdavanja
55
56 our $usemap = {
57 # 8 => '',
58 # 7 => '',
59 4 => 'WTI',
60 1003 => 'WTI',
61 16 => 'CU',
62 21 => 'SU',
63 # 12 => '',
64 # 1007 => '',
65 # 1016 => '',
66
67 };
68
69 sub usemap {
70 my $f = shift || die;
71 $usemap->{$f};
72 }
73
74 sub search {
75 my ( $self, $query ) = @_;
76
77 die "need query" unless defined $query;
78
79 my $url = 'http://161.53.240.197:8991/F?RN=' . rand(1000000000);
80 # fake JavaScript code on page which creates random session
81
82 diag "get $url";
83
84 $mech->get( $url );
85
86 diag "advanced search";
87
88 $mech->follow_link( url_regex => qr/find-c/ );
89
90 diag "submit search $query";
91
92 $mech->submit_form(
93 fields => {
94 'ccl_term' => $query,
95 },
96 );
97
98 $hits = 0;
99 if ( $mech->content =~ m{ukupno\s+(\d+).*(do\s+(\d+))}s ) {
100 $hits = $1;
101 $hits = $2 if $2 && $2 < $1; # correct for max. results
102 } else {
103 diag "get't find results in ", $mech->content;
104 return;
105 }
106
107 diag "got $hits results, get first one";
108
109 $mech->follow_link( url_regex => qr/set_entry=000001/ );
110
111 diag "in MARC format";
112
113 $mech->follow_link( url_regex => qr/format=001/ );
114 }
115
116
117 sub next_marc {
118 my ($self,$format) = @_;
119
120 print $mech->content;
121
122 if ( $mech->content =~ m{Zapis\s+(\d+)}s ) {
123
124 my $nr = $1;
125
126 diag "parse $nr";
127
128 my $html = $mech->content;
129 my $hash;
130 $html =~ s|<tr>\s*<td class=td1 id=bold[^>]*>(.+?)</td>\s*<td class=td1>(.+?)</td>|$hash->{$1} = "$2";|ges;
131 diag dump($hash);
132
133 my $id = $hash->{SYS} || die "no SYS";
134
135 die;
136
137 my $marc = MARC::Record->new;
138
139 # $marc->add_fields( $f, $i1, $i2, @{ $out->{$f} } );
140
141 my $path = "marc/$id.$format";
142
143 open(my $out, '>:utf8', $path);
144 print $out $marc->as_usmarc;
145 close($out);
146
147 diag "created $path ", -s $path, " bytes";
148
149 diag $marc->as_formatted;
150
151 $nr++;
152
153 die if $nr == 3; # FIXME
154
155 $mech->follow_link( url_regex => qr/set_entry=0*$nr/ );
156
157 return $marc->as_usmarc;
158 } else {
159 die "can't fetch COMARC format from ", $mech->content;
160 }
161
162 }
163
164 1;

  ViewVC Help
Powered by ViewVC 1.1.26