/[Grep]/lib/Grep/Source.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /lib/Grep/Source.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 86 - (show annotations)
Fri Feb 23 21:16:44 2007 UTC (17 years, 2 months ago) by dpavlin
File size: 6619 byte(s)
added hooks to Grep::Source->save to keep useful snippets of html in /tmp/grep (if writable)
1 # Dobrica Pavlinusic, <dpavlin@rot13.org> 02/22/07 20:30:00 CET
2
3 use strict;
4 use warnings;
5
6 package Grep::Source;
7
8 use Carp qw/verbose/;
9 use Module::Pluggable search_path => 'Grep::Source', sub_name => 'sources', require => 1;
10 use base qw(Class::Accessor);
11 Grep::Source->mk_accessors( qw(feed uri q new_items collection) );
12
13 use HTML::TreeBuilder;
14 use WWW::Mechanize;
15 use XML::Feed;
16 use URI;
17
18 use Data::Dump qw/dump/;
19
20 =head1 NAME
21
22 Grep::Source - base class for implementation of different sources for Grep
23
24 =head1 METHODS
25
26 This is mostly documentation because most of methods are implemented by plugins.
27
28 =head2 sources
29
30 my @sources = Grep::Source->sources();
31
32 Returns all available sources.
33
34 =cut
35
36 Jifty->log->debug("Found source plugins: ", join(", ", __PACKAGE__->sources() ) );
37
38 =head2 new
39
40 my $source = Grep::Source->new({ feed => $feed_record });
41
42 This will also setup:
43
44 =head2 feed
45
46 isa L<Grep::Model::Feed>
47
48 =head2 search
49
50 my $collection = $source->search( 'query string' );
51
52 It will also setup following accessors:
53
54 =head2 q
55
56 Search query
57
58 =head2 uri
59
60 URI of feed with embedded search query
61
62 =head2 new_items
63
64 Number of new items in result collection
65
66 =head2 collection
67
68 Actuall results which is L<Grep::Model::ItemCollection>, so following will
69 work:
70
71 print "and ", $self->collection->count, " total items";
72
73
74 Also setups number of new items
75
76 print $source->new_items, " items new";
77
78 =cut
79
80 sub search {
81 my $self = shift;
82
83 my $q = shift;
84
85 $q ? $self->q( $q ) : $q = $self->q;
86
87 die "no q?" unless ( $self->q );
88 die "no feed?" unless ( $self->feed );
89 die "feed not Grep::Model::Feed" unless ( $self->feed->isa('Grep::Model::Feed') );
90
91 my $message;
92 my $uri = $self->feed->uri;
93 if ($uri =~ m/%s/) {
94 $uri = $self->feed->search_uri( $q );
95 $message = 'Searching';
96 } else {
97 $message = 'Fetching';
98 }
99 $message .= ' ' . $self->feed->title . " at $uri";
100
101 $self->uri( $uri );
102
103 Jifty->log->info( $message );
104
105 $self->collection( Grep::Model::ItemCollection->new() );
106
107 my $class = $self->feed->source || 'Grep::Source::Feed';
108 Jifty->log->debug("using $class");
109
110 $class->fetch( $self );
111
112 Grep::Search->finish if $self->new_items;
113
114 return $self->collection;
115 }
116
117 =head2 add_record
118
119 Plugins will be called with parametar C<$parent> so they can call this method to add
120 record into result collection (and store in cache and index).
121
122 $parent->add_record( id => 42, foo => 'bar', ... );
123
124 This will also update L</new_items>
125
126 =cut
127
128 sub add_record {
129 my $self = shift;
130
131 my $i = Grep::Model::Item->new();
132
133 my ($ok,$msg) = $i->load_or_create( @_ );
134
135 $msg ||= '';
136
137 if ( $ok ) {
138 Jifty->log->debug("item ", $i->id, ": $msg");
139 $self->collection->add_record( $i );
140
141 # is new record?
142 if ( $msg !~ m/^Found/ ) {
143 Grep::Search->add( $i );
144 $self->new_items( ( $self->new_items || 0 ) + 1 );
145 }
146 } else {
147 warn "can't add entry ", dump( @_ ), "\n";
148 }
149 }
150
151 =head2 content_class
152
153 Return class registred for particular content.
154
155 my $class = $source->content_class( $content );
156
157 =cut
158
159 sub content_class {
160 my $self = shift;
161
162 my $content = shift or die "no content?";
163
164 foreach my $s ( $self->sources ) {
165 Jifty->log->debug("testing source class $s");
166 if ( $s->can('content_have') ) {
167 my $regex = $s->content_have( $content ) or
168 die "${s}->content_have didn't return anything";
169 die "${s}->content_have didn't return regex but ", dump( $regex ), " ref ", ref( $regex )
170 unless ( ref($regex) eq 'Regexp' );
171 if ( $content =~ $regex ) {
172 Jifty->log->debug("${s}->content_have succesful");
173 return $s;
174 }
175 }
176 }
177 }
178
179 =head2 scrape
180
181 Create semi-complex L<WWW::Mechanize> rules to scrape page
182
183
184 =cut
185
186 sub scrape {
187 my $self = shift;
188
189 my $args = {@_};
190
191 warn "scrape got args ",dump($args);
192
193 my ($feed,$uri,$q) = ($self->feed, $self->uri,$self->q);
194 die "no uri" unless ($uri);
195 die "feed is not a Grep::Model::Feed but ", ref $feed unless $feed->isa('Grep::Model::Feed');
196
197 my $mech = WWW::Mechanize->new();
198
199 $mech->get( $uri );
200
201 $self->save( 'get.html', $mech->content );
202
203 if ( $args->{submit_form} ) {
204 warn "submit form on $uri\n";
205 $mech->submit_form( %{ $args->{submit_form} } ) or die "can't submit form";
206 $self->save( 'submit.html', $mech->content );
207 }
208
209 warn "parse result page\n";
210
211 my $tree = HTML::TreeBuilder->new or die "can't create html tree";
212 $tree->parse( $mech->content ) or die "can't parse fetched content";
213
214 die "wrapper doesn't have 3 elements but ", $#{ $args->{wrapper} } unless ( $#{ $args->{wrapper} } == 2 );
215 my ( $el,$attr,$value ) = @{ $args->{wrapper} };
216
217 warn "looking for <$el $attr=\"$value\">";
218
219 my $div = $tree->look_down( '_tag', $el, sub {
220 warn dump( $_[0]->attr( $attr ) ),$/;
221 ( $_[0]->attr( $attr ) || '' ) eq $value;
222 });
223
224 die "can't find results wrapper <$el $attr=\"$value\">" unless ( $div );
225
226 my $max = 5;
227 my $nr = 1;
228
229 my $base_uri = $uri;
230 $base_uri =~ s!\?.*$!!;
231
232 foreach my $dt ( $div->look_down( '_tag', $args->{results} ) ) {
233 my $a = $dt->look_down( '_tag', 'a', sub { $_[0]->attr('href') } );
234 if ( $a ) {
235
236 my $href = $a->attr('href') or die "can't find href inside <", $args->{results}, ">";
237 my $page_uri = URI->new_abs( $a->attr('href'), $base_uri );
238 $page_uri->query( undef );
239 $page_uri = $page_uri->canonical;
240
241 warn "fetching page: ",$a->as_text," from $page_uri\n";
242 if ( $mech->follow_link( url => $a->attr('href') ) ) {
243
244 $self->save( "page-${nr}.html", $mech->content );
245
246 my $page_tree = HTML::TreeBuilder->new or die "can't create page tree";
247 $page_tree->parse( $mech->content ) or die "can't parse page at $page_uri";
248
249 my ( $el,$attr,$value ) = @{ $args->{scrape} };
250 my $div = $page_tree->look_down( '_tag', $el, sub { ( $_[0]->attr( $attr ) || '' ) eq $value } );
251
252 die "can't find <$el $attr=\"$value\">" unless ($div);
253
254 $self->add_record(
255 in_feed => $feed,
256 title => $mech->title,
257 link => $page_uri,
258 content => $div->as_HTML,
259 # summary =>
260 # category =>
261 # author =>
262 # issued =>
263 # modified =>
264 );
265
266 $mech->back;
267 $page_tree->delete;
268
269 } else {
270 warn "can't follow uri $page_uri: $!\n";
271 }
272 }
273
274 last if ($nr == $max);
275 $nr++;
276 }
277
278 $tree->delete; # clear memory!
279
280 }
281
282 =head2 save
283
284 save( 'name', $content );
285
286 Save dumps into C</tmp/grep> if writable
287
288 =cut
289
290 sub save {
291 my $self = shift;
292 my ( $file, $content ) = @_;
293 if ( -w '/tmp/grep' ) {
294 open(my $f, '>', "/tmp/grep/$file") or die "can't open $file: $!";
295 print $f $content or die "can't write to $file: $!";
296 close $f or die "can't close $file: $!";
297 Jifty->log->debug("saved $file ",length($content)," bytes");
298 }
299 }
300
301 1;

  ViewVC Help
Powered by ViewVC 1.1.26