/[Grep]/lib/Grep/Source.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /lib/Grep/Source.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 133 - (hide annotations)
Tue May 1 20:50:14 2007 UTC (17 years, 1 month ago) by dpavlin
File size: 10058 byte(s)
Very experimental support for selecting multiple wrapper divs in which
we will then try to find search results -- this change is mostly needed
for sites which have so little semantic markup that we need to pass
several divs of which just one have results.
To Source modules everything should "just work"(tm).
PunBB forum is to blame for this feature, so it's new source. 
1 dpavlin 72 # Dobrica Pavlinusic, <dpavlin@rot13.org> 02/22/07 20:30:00 CET
2    
3     use strict;
4     use warnings;
5    
6     package Grep::Source;
7    
8     use Carp qw/verbose/;
9     use Module::Pluggable search_path => 'Grep::Source', sub_name => 'sources', require => 1;
10 dpavlin 100 use base qw(Class::Accessor Jifty::Object);
11 dpavlin 121 Grep::Source->mk_accessors( qw(feed uri q new_items collection search_obj tree) );
12 dpavlin 72
13 dpavlin 85 use HTML::TreeBuilder;
14     use WWW::Mechanize;
15     use XML::Feed;
16     use URI;
17 dpavlin 96 use HTML::ResolveLink;
18 dpavlin 85
19 dpavlin 72 use Data::Dump qw/dump/;
20    
21     =head1 NAME
22    
23     Grep::Source - base class for implementation of different sources for Grep
24    
25     =head1 METHODS
26    
27     This is mostly documentation because most of methods are implemented by plugins.
28    
29     =head2 sources
30    
31     my @sources = Grep::Source->sources();
32    
33     Returns all available sources.
34    
35     =cut
36    
37 dpavlin 73 Jifty->log->debug("Found source plugins: ", join(", ", __PACKAGE__->sources() ) );
38 dpavlin 72
39     =head2 new
40    
41     my $source = Grep::Source->new({ feed => $feed_record });
42    
43 dpavlin 73 This will also setup:
44    
45     =head2 feed
46    
47     isa L<Grep::Model::Feed>
48    
49 dpavlin 72 =head2 search
50    
51     my $collection = $source->search( 'query string' );
52    
53     It will also setup following accessors:
54    
55 dpavlin 73 =head2 q
56 dpavlin 72
57     Search query
58    
59 dpavlin 73 =head2 uri
60 dpavlin 72
61     URI of feed with embedded search query
62    
63 dpavlin 73 =head2 new_items
64 dpavlin 72
65     Number of new items in result collection
66    
67     =head2 collection
68    
69     Actuall results which is L<Grep::Model::ItemCollection>, so following will
70     work:
71    
72     print "and ", $self->collection->count, " total items";
73    
74    
75     Also setups number of new items
76    
77     print $source->new_items, " items new";
78    
79     =cut
80    
81     sub search {
82     my $self = shift;
83    
84     my $q = shift;
85    
86     $q ? $self->q( $q ) : $q = $self->q;
87    
88     die "no q?" unless ( $self->q );
89     die "no feed?" unless ( $self->feed );
90     die "feed not Grep::Model::Feed" unless ( $self->feed->isa('Grep::Model::Feed') );
91    
92     my $message;
93     my $uri = $self->feed->uri;
94     if ($uri =~ m/%s/) {
95     $uri = $self->feed->search_uri( $q );
96     $message = 'Searching';
97     } else {
98     $message = 'Fetching';
99     }
100     $message .= ' ' . $self->feed->title . " at $uri";
101    
102     $self->uri( $uri );
103    
104 dpavlin 100 $self->log->info( $message );
105 dpavlin 72
106     $self->collection( Grep::Model::ItemCollection->new() );
107    
108 dpavlin 73 my $class = $self->feed->source || 'Grep::Source::Feed';
109 dpavlin 100 $self->log->debug("using $class");
110 dpavlin 72
111 dpavlin 110 $self->search_obj( Grep::Search->new() );
112     $self->log->debug("created " . $self->search_obj);
113 dpavlin 72
114 dpavlin 110 $class->fetch( $self );
115    
116     $self->search_obj->finish;
117    
118 dpavlin 72 return $self->collection;
119     }
120    
121     =head2 add_record
122    
123 dpavlin 73 Plugins will be called with parametar C<$parent> so they can call this method to add
124     record into result collection (and store in cache and index).
125    
126 dpavlin 72 $parent->add_record( id => 42, foo => 'bar', ... );
127    
128 dpavlin 73 This will also update L</new_items>
129    
130 dpavlin 72 =cut
131    
132     sub add_record {
133     my $self = shift;
134    
135 dpavlin 110 $self->log->confess("no search_obj") unless ($self->search_obj);
136 dpavlin 109
137 dpavlin 72 my $i = Grep::Model::Item->new();
138    
139 dpavlin 96 my $rec = {@_};
140 dpavlin 72
141 dpavlin 102 $self->log->debug("resolving links using base ", $rec->{link});
142 dpavlin 96 my $resolver = HTML::ResolveLink->new( base => $rec->{link} );
143     $rec->{content} = $resolver->resolve( $rec->{content} );
144    
145     my ($ok,$msg) = $i->load_or_create( %$rec );
146    
147 dpavlin 72 $msg ||= '';
148    
149     if ( $ok ) {
150 dpavlin 100 $self->log->debug("item ", $i->id, ": $msg");
151 dpavlin 72 $self->collection->add_record( $i );
152    
153     # is new record?
154     if ( $msg !~ m/^Found/ ) {
155 dpavlin 110 $self->search_obj->add( $i );
156 dpavlin 74 $self->new_items( ( $self->new_items || 0 ) + 1 );
157 dpavlin 72 }
158     } else {
159     warn "can't add entry ", dump( @_ ), "\n";
160     }
161     }
162    
163 dpavlin 73 =head2 content_class
164    
165     Return class registred for particular content.
166    
167     my $class = $source->content_class( $content );
168    
169     =cut
170    
171     sub content_class {
172     my $self = shift;
173    
174     my $content = shift or die "no content?";
175    
176     foreach my $s ( $self->sources ) {
177 dpavlin 100 $self->log->debug("testing source class $s");
178 dpavlin 82 if ( $s->can('content_have') ) {
179     my $regex = $s->content_have( $content ) or
180     die "${s}->content_have didn't return anything";
181     die "${s}->content_have didn't return regex but ", dump( $regex ), " ref ", ref( $regex )
182     unless ( ref($regex) eq 'Regexp' );
183     if ( $content =~ $regex ) {
184 dpavlin 100 $self->log->debug("${s}->content_have succesful");
185 dpavlin 82 return $s;
186     }
187 dpavlin 73 }
188     }
189     }
190    
191 dpavlin 85
192 dpavlin 132 =head2 element_by_triplet
193 dpavlin 85
194 dpavlin 132 Helper method to select element(s) using C<element/attribute/value> triplet using
195     L<HTML::TreeBuilder> trees.
196 dpavlin 85
197 dpavlin 132 my $el = $self->element_by_triplet(
198     tree => $tree_or_element,
199     triplets => [ qw/
200     div id target
201     div class another
202     / ],
203     message => 'find search result element',
204     fatal => 1, # die instead of warn
205     );
206    
207 dpavlin 85 =cut
208    
209 dpavlin 121 sub element_by_triplet {
210     my $self = shift;
211    
212     my $args = {@_};
213    
214     my $tree = $args->{tree} || die "no tree";
215 dpavlin 123 my $message = $args->{message} || '';
216 dpavlin 121 my $fatal = $args->{fatal};
217 dpavlin 126 die "no triplets" unless defined( $args->{triplets} );
218     my @triplets;
219     if ( ref( $args->{triplets} ) eq 'ARRAY' ) {
220     @triplets = @{ $args->{triplets} };
221 dpavlin 123 } else {
222 dpavlin 126 @triplets = ( $args->{triplets} );
223 dpavlin 123 }
224 dpavlin 121
225 dpavlin 126 push @triplets, ( undef, undef ) if ( $#triplets == 0 );
226 dpavlin 121
227 dpavlin 132 die "triplet doesn't have 3 elements but ", $#triplets unless (
228 dpavlin 126 ( $#triplets + 1 ) % 3 == 0
229 dpavlin 121 );
230    
231 dpavlin 123 my ( $el, $attr, $value );
232 dpavlin 121
233 dpavlin 123 my @results;
234 dpavlin 121 my @tags;
235    
236 dpavlin 133 warn "triplets = ",dump( @triplets );
237    
238 dpavlin 126 while ( @triplets ) {
239     ( $el,$attr,$value ) = splice( @triplets, 0, 3 );
240 dpavlin 121 my $tag = $attr ? "<$el $attr=\"$value\">" : "<$el>";
241     push @tags, $tag;
242     $self->log->debug("looking for $message $tag");
243 dpavlin 123 @results = $tree->look_down( '_tag', $el, sub {
244 dpavlin 121 return 1 unless ( $attr && $value );
245     ( $_[0]->attr( $attr ) || '' ) eq $value;
246     });
247 dpavlin 123 last if @results;
248 dpavlin 121 }
249    
250 dpavlin 123 if ( ! @results ) {
251 dpavlin 121 my $msg = "can't find $message ", join(" ", @tags);
252     die $msg if ( $fatal );
253     warn $msg;
254     return;
255     }
256    
257 dpavlin 123 $self->log->debug("found ", $#results + 1, " results");
258    
259 dpavlin 133 #warn dump( map { $_->as_HTML } @results );
260    
261 dpavlin 123 return @results if wantarray;
262     return shift @results;
263 dpavlin 121 }
264    
265 dpavlin 132 =head2 scrape
266    
267     Create semi-complex L<WWW::Mechanize> rules to scrape page easily
268    
269     $parent->scrape(
270     # if search string isn't part or URI
271     submit_form => {
272     fields => {
273     value => $parent->q,
274     },
275     button => 'fullsearch',
276     },
277     # element with search results
278     wrapper => [ qw/div class searchresults/ ],
279     # element (or tripple) for each result with link
280     # <a href=".."> inside it to full-text result
281     results => 'dt',
282     # collect which element on page linked from results
283     scrape => [ qw/div id page/ ],
284     # when search returns just single hit, it will redirect to result page
285     redirect_single_result => 1,
286     );
287    
288     =cut
289    
290 dpavlin 85 sub scrape {
291     my $self = shift;
292    
293     my $args = {@_};
294    
295 dpavlin 102 $self->log->debug("scrape with args ",dump($args));
296 dpavlin 85
297     my ($feed,$uri,$q) = ($self->feed, $self->uri,$self->q);
298     die "no uri" unless ($uri);
299     die "feed is not a Grep::Model::Feed but ", ref $feed unless $feed->isa('Grep::Model::Feed');
300    
301 dpavlin 92 sub mech_warn {
302     my $m = shift || return;
303     warn $m;
304     }
305 dpavlin 85
306 dpavlin 92 my $mech = WWW::Mechanize->new(
307     cookie_jar => {},
308     onwarn => \&mech_warn,
309     onerror => \&mech_warn,
310     );
311    
312 dpavlin 85 $mech->get( $uri );
313    
314 dpavlin 86 $self->save( 'get.html', $mech->content );
315 dpavlin 85
316 dpavlin 92 if ( my $form = $args->{submit_form} ) {
317 dpavlin 102 $self->log->debug("submit form on $uri with ", dump( $form ));
318 dpavlin 92 $mech->submit_form( %$form ) or die "can't submit form ", dump( $form );
319 dpavlin 86 $self->save( 'submit.html', $mech->content );
320 dpavlin 85 }
321    
322 dpavlin 102 $self->log->debug("parse result page");
323 dpavlin 85
324     my $tree = HTML::TreeBuilder->new or die "can't create html tree";
325     $tree->parse( $mech->content ) or die "can't parse fetched content";
326    
327 dpavlin 133 my @wrapper_divs = $self->element_by_triplet(
328 dpavlin 121 tree => $tree,
329 dpavlin 126 triplets => $args->{wrapper},
330 dpavlin 121 message => 'wrapper for all results',
331 dpavlin 132 fatal => $args->{redirect_single_result} ? 0 : 1,
332 dpavlin 121 );
333 dpavlin 85
334 dpavlin 103 my $max = 15;
335 dpavlin 85 my $nr = 1;
336    
337     my $base_uri = $uri;
338     $base_uri =~ s!\?.*$!!;
339    
340 dpavlin 132 # directly got first result
341 dpavlin 133 if ( $args->{redirect_single_result} && ! @wrapper_divs ) {
342 dpavlin 132
343     my $uri = $mech->uri; $uri->query( undef ); $uri = $uri->canonical;
344    
345 dpavlin 133 my $div = $self->element_by_triplet(
346 dpavlin 132 tree => $tree,
347     message => "single result - redirect to $uri",
348     triplets => $args->{scrape},
349     fatal => 1,
350     );
351    
352     $self->add_record(
353     in_feed => $feed,
354     title => $mech->title,
355     link => $uri,
356     content => $div->as_HTML,
357     );
358    
359     $tree->delete; # clear memory!
360     return;
361     }
362    
363 dpavlin 133 my @r;
364 dpavlin 113
365 dpavlin 133 foreach my $div ( @wrapper_divs ) {
366    
367     my @r_here = $self->element_by_triplet(
368     tree => $div,
369     triplets => $args->{results},
370     message => 'result element',
371     );
372    
373     push @r, @r_here if (@r_here);
374     }
375    
376     $self->log->debug("in total, found ", $#r + 1, " results in ", $#wrapper_divs + 1, " result wrapper elements");
377    
378 dpavlin 113 foreach my $dt ( @r ) {
379 dpavlin 85 my $a = $dt->look_down( '_tag', 'a', sub { $_[0]->attr('href') } );
380     if ( $a ) {
381    
382     my $href = $a->attr('href') or die "can't find href inside <", $args->{results}, ">";
383 dpavlin 132
384     my $page_uri = URI->new_abs( $href, $base_uri );
385 dpavlin 85 $page_uri->query( undef );
386     $page_uri = $page_uri->canonical;
387    
388 dpavlin 102 $self->log->debug("fetching page: ",$a->as_text," from $page_uri");
389 dpavlin 132 if ( $mech->follow_link( url => $href ) ) {
390 dpavlin 85
391 dpavlin 86 $self->save( "page-${nr}.html", $mech->content );
392 dpavlin 85
393     my $page_tree = HTML::TreeBuilder->new or die "can't create page tree";
394     $page_tree->parse( $mech->content ) or die "can't parse page at $page_uri";
395 dpavlin 133 my $div = $self->element_by_triplet(
396 dpavlin 121 tree => $page_tree,
397 dpavlin 133 message => "result page $nr",
398 dpavlin 126 triplets => $args->{scrape}
399 dpavlin 121 );
400 dpavlin 85
401     $self->add_record(
402     in_feed => $feed,
403     title => $mech->title,
404     link => $page_uri,
405     content => $div->as_HTML,
406     # summary =>
407     # category =>
408     # author =>
409     # issued =>
410     # modified =>
411 dpavlin 121 ) if ( $div );
412 dpavlin 85
413     $mech->back;
414     $page_tree->delete;
415    
416     } else {
417     warn "can't follow uri $page_uri: $!\n";
418     }
419 dpavlin 121 } else {
420     $self->log->debug("result $nr doesn't have link inside, ignoring...");
421 dpavlin 85 }
422    
423     last if ($nr == $max);
424     $nr++;
425     }
426    
427     $tree->delete; # clear memory!
428    
429     }
430    
431 dpavlin 86 =head2 save
432    
433     save( 'name', $content );
434    
435     Save dumps into C</tmp/grep> if writable
436    
437     =cut
438    
439     sub save {
440     my $self = shift;
441     my ( $file, $content ) = @_;
442 dpavlin 100 return unless ( defined($file) && defined($content) );
443 dpavlin 86 if ( -w '/tmp/grep' ) {
444     open(my $f, '>', "/tmp/grep/$file") or die "can't open $file: $!";
445     print $f $content or die "can't write to $file: $!";
446     close $f or die "can't close $file: $!";
447 dpavlin 100 $self->log->debug("saved $file ",length($content)," bytes");
448 dpavlin 86 }
449     }
450    
451 dpavlin 72 1;

  ViewVC Help
Powered by ViewVC 1.1.26