/[Z3950-HTML-Scraper]/Aleph.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /Aleph.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 12 - (hide annotations)
Fri Oct 22 20:49:16 2010 UTC (13 years, 5 months ago) by dpavlin
File size: 3416 byte(s)
generate marc record

1 dpavlin 11 package Aleph;
2    
3     use warnings;
4     use strict;
5    
6     use WWW::Mechanize;
7     use MARC::Record;
8     use Data::Dump qw/dump/;
9    
10     binmode STDOUT, ':utf8';
11    
12     our $mech = WWW::Mechanize->new();
13     our $hits;
14    
15     sub diag {
16     print "# ", @_, $/;
17     }
18    
19     # Koha Z39.50 query:
20     #
21     # Bib-1 @and @and @and @and @and @and @and @or
22     # @attr 1=8 isbn-issn
23     # @attr 1=7 isbn-issn
24     # @attr 1=4 title
25     # @attr 1=1003 author
26     # @attr 1=16 dewey
27     # @attr 1=21 subject-holding
28     # @attr 1=12 control-no
29     # @attr 1=1007 standard-id
30     # @attr 1=1016 any
31    
32     # LCC - Klasifikacija Kongresne knjižnice
33     # LCN - Signatura Kongresne knjižnice
34     # DDC - Deweyjeva klasifikacija
35     # TIT - Naslovi
36     # AUT - Autori
37     # IMP - Impresum
38     # SUB - Predmetnice
39     # SRS - Nakladnička cjelina
40     # LOC - Lokacija
41     # WRD - Riječi
42     # WTI - Riječi u polju naslova
43     # WAU - Riječi u polju autora
44     # WPE - Riječi u polju individualnog autora
45     # WCO - Riječi u polju korporativnog autora
46     # WME - Riječi u polju sastanka
47     # WUT - Riječi u polju jedinstvenog naslova
48     # WPL - Riječi u polju mjesta izdavanja
49     # WPU - Riječi u polju nakladnika
50     # WSU - Riječi u polju predmetnica
51     # WSM - Riječi u predmetnicama MeSH-a
52     # WST - Riječi u polju status
53     # WGA - Riječi u geografskim odrednicama
54     # WYR - Godina izdavanja
55    
56     our $usemap = {
57     # 8 => '',
58     # 7 => '',
59     4 => 'WTI',
60     1003 => 'WTI',
61     16 => 'CU',
62     21 => 'SU',
63     # 12 => '',
64     # 1007 => '',
65     # 1016 => '',
66    
67     };
68    
69     sub usemap {
70     my $f = shift || die;
71     $usemap->{$f};
72     }
73    
74     sub search {
75     my ( $self, $query ) = @_;
76    
77     die "need query" unless defined $query;
78    
79     my $url = 'http://161.53.240.197:8991/F?RN=' . rand(1000000000);
80     # fake JavaScript code on page which creates random session
81    
82     diag "get $url";
83    
84     $mech->get( $url );
85    
86     diag "advanced search";
87    
88     $mech->follow_link( url_regex => qr/find-c/ );
89    
90     diag "submit search $query";
91    
92     $mech->submit_form(
93     fields => {
94     'ccl_term' => $query,
95     },
96     );
97    
98     $hits = 0;
99     if ( $mech->content =~ m{ukupno\s+(\d+).*(do\s+(\d+))}s ) {
100     $hits = $1;
101     $hits = $2 if $2 && $2 < $1; # correct for max. results
102     } else {
103     diag "get't find results in ", $mech->content;
104     return;
105     }
106    
107     diag "got $hits results, get first one";
108    
109     $mech->follow_link( url_regex => qr/set_entry=000001/ );
110    
111     diag "in MARC format";
112    
113     $mech->follow_link( url_regex => qr/format=001/ );
114     }
115    
116    
117     sub next_marc {
118     my ($self,$format) = @_;
119    
120     print $mech->content;
121    
122     if ( $mech->content =~ m{Zapis\s+(\d+)}s ) {
123    
124     my $nr = $1;
125    
126     diag "parse $nr";
127    
128 dpavlin 12 my $marc = MARC::Record->new;
129    
130 dpavlin 11 my $html = $mech->content;
131     my $hash;
132 dpavlin 12
133     sub field {
134     my ( $f, $v ) = @_;
135     $v =~ s/\Q&nbsp;\E/ /gs;
136     warn "# $f\t$v\n";
137     $hash->{$f} = $v;
138     my ($i1,$i2) = (' ',' ');
139     ($i1,$i2) = ($2,$3) if $f =~ s/^(...)(.)?(.)?/$1/;
140     my @sf = split(/\|/, $v);
141     shift @sf;
142     @sf = map { s/^(\w)\s+//; { $1 => $_ } } @sf;
143     diag "sf = ", dump(@sf);
144     $marc->add_fields( $f, $i1, $i2, @sf ) if $f =~ m/^\d+$/;
145     }
146    
147     $html =~ s|<tr>\s*<td class=td1 id=bold[^>]*>(.+?)</td>\s*<td class=td1>(.+?)</td>|field($1,$2)|ges;
148 dpavlin 11 diag dump($hash);
149    
150     my $id = $hash->{SYS} || die "no SYS";
151    
152    
153    
154     my $path = "marc/$id.$format";
155    
156     open(my $out, '>:utf8', $path);
157     print $out $marc->as_usmarc;
158     close($out);
159    
160     diag "created $path ", -s $path, " bytes";
161    
162     diag $marc->as_formatted;
163    
164     $nr++;
165    
166     die if $nr == 3; # FIXME
167    
168     $mech->follow_link( url_regex => qr/set_entry=0*$nr/ );
169    
170     return $marc->as_usmarc;
171     } else {
172     die "can't fetch COMARC format from ", $mech->content;
173     }
174    
175     }
176    
177     1;

  ViewVC Help
Powered by ViewVC 1.1.26