/[swish]/trunk/spider/swishspider
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/spider/swishspider

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 30 by dpavlin, Mon Mar 24 09:57:44 2003 UTC revision 46 by dpavlin, Sat Jan 17 23:57:55 2004 UTC
# Line 5  use LWP::UserAgent; Line 5  use LWP::UserAgent;
5  use HTTP::Status;  use HTTP::Status;
6  use HTML::LinkExtor;  use HTML::LinkExtor;
7    
8    my $basedir = $0;
9    $basedir =~ s,/[^/]+$,/,;
10    require "$basedir/filter.pm";
11    
12  if (scalar(@ARGV) != 2) {  if (scalar(@ARGV) != 2) {
13      print STDERR "Usage: SwishSpider localpath url\n";      print STDERR "Usage: SwishSpider localpath url\n";
14      exit(1);      exit(1);
# Line 18  my $url = shift; Line 22  my $url = shift;
22    
23  my $no_parent_url;  my $no_parent_url;
24  if ($url =~ m/\s/) {  if ($url =~ m/\s/) {
25          ($url,$no_parent_url) = split(/\s/,$url,2);          ($no_parent_url,$url) = split(/\s/,$url,2);
26            # old scheme had URL, no parent and new is reverse
27            ($url,$no_parent_url) = ($no_parent_url,$url) if ($no_parent_url =~ m/\Q$url\E/);
28  }  }
29    
30  my $request = new HTTP::Request( "GET", $url );  my $request = new HTTP::Request( "GET", $url );
31  my $response = $ua->simple_request( $request );  my $response = $ua->simple_request( $request );
32    my $urlbase = $response->base;
33    $urlbase =~ s,/[^/]*$,/,;       # remove filename
34    
35  #  #
36  # Write out important meta-data.  This includes the HTTP code.  Depending on the  # Write out important meta-data.  This includes the HTTP code.  Depending on the
# Line 39  if( $response->code() == RC_OK ) { Line 47  if( $response->code() == RC_OK ) {
47          if ($no_parent_url) {          if ($no_parent_url) {
48                  if ($link =~ m/$no_parent_url/) {                  if ($link =~ m/$no_parent_url/) {
49                          # if this URL is below parent URL o.k....                          # if this URL is below parent URL o.k....
50                          print RESP "$link $no_parent_url\n";                          print RESP "$no_parent_url $link\n";
51                  } else {                  } else {
52                          # if not, crawl just this page!                          # if not, crawl just this page!
53                          print RESP "$link $link\n";                          print RESP "$link $link\n";
# Line 59  if( $response->code() == RC_OK ) { Line 67  if( $response->code() == RC_OK ) {
67      my $contents = $response->content();      my $contents = $response->content();
68    
69      open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents\n" );      open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents\n" );
70      # if you don't want content to be indexed, include it in      # fixup just HTML files
71      # <noindex> foobar </noindex> tags or surround it with comments      if ($response->header("content-type") =~ "text/html") {
72      # <!-- noindex --> foobar <!-- /noindex -->          $contents = filter($contents);
73      $contents =~ s,<noindex>.+?</noindex>,,isg;      }
     $contents =~ s,<!--\s*noindex\s*-->.+?<!--\s*/noindex\s*-->,,isg;  
     # this will remove all script from indexing content  
     $contents =~ s,<script>.+?</script>,,isg;  
     # remap Windows charset to ISO-8859-2  
     $contents =~ tr/šðžèæŠÐŽÈÆ/¹ð¾èæ©Ð®ÈÆ/;     # 1250 -> iso8859-2  
     # this will fix badly formatted html in form:  
     # <head><title>some text</title  
     # ></head>  
     # which will confuse indexer (or libxml2?)  
     $contents =~ s/[\n\r]^(>)/$1\n/msg;  
74      print CONTENTS $contents;      print CONTENTS $contents;
75      close( CONTENTS );      close( CONTENTS );
76    
# Line 81  if( $response->code() == RC_OK ) { Line 79  if( $response->code() == RC_OK ) {
79          my $p = HTML::LinkExtor->new( \&linkcb, $url );          my $p = HTML::LinkExtor->new( \&linkcb, $url );
80          $p->parse( $contents );          $p->parse( $contents );
81    
82    
83    
84          close( LINKS );          close( LINKS );
85      }      }
86  }  }
# Line 88  if( $response->code() == RC_OK ) { Line 88  if( $response->code() == RC_OK ) {
88    
89  sub linkcb {  sub linkcb {
90      my($tag, %links) = @_;      my($tag, %links) = @_;
91      if (($tag eq "a" || $tag eq "area") && ($links{"href"})) {      if (($tag eq "a" || $tag eq "area") && ($links{"href"}) || ($tag eq "frame" && $links{"src"})) {
92          my $link = $links{"href"};          my $link = $links{"href"} || $links{"src"};
93    
94          #          #
95          # Remove fragments          # Remove fragments
# Line 102  sub linkcb { Line 102  sub linkcb {
102          #          #
103          $link =~ s/\.\.\///g;          $link =~ s/\.\.\///g;
104    
105            if ($link =~ m,javascript:displayWindow\((.+)\),i) {
106                    my $arg = $1;
107                    $arg =~ s/%([a-f0-9][a-f][0-9])/chr(hex($1))/eg;
108                    ($link,undef) = split(',',$arg,2);
109                    $link =~ s/^['"]//;
110                    $link =~ s/['"]$//;
111                    $link = $urlbase.$link;
112            }
113    
114          # hack for apostrophe -- changes URL, but should work for most clients.          # hack for apostrophe -- changes URL, but should work for most clients.
115          $link =~ s/'/%27/g;          $link =~ s/'/%27/g;
116    
# Line 113  sub linkcb { Line 122  sub linkcb {
122    
123          if ($no_parent_url) {          if ($no_parent_url) {
124                          if ($link =~ m/$no_parent_url/) {                          if ($link =~ m/$no_parent_url/) {
125                                  print LINKS "$link $no_parent_url\n";                                  print LINKS "$no_parent_url $link\n";
126  #                               print STDERR "using $link\n";  #                               print STDERR "using $link\n";
127  #                       } else {  #                       } else {
128  #                               print STDERR "skipping $link\n";  #                               print STDERR "skipping $link\n";

Legend:
Removed from v.30  
changed lines
  Added in v.46

  ViewVC Help
Powered by ViewVC 1.1.26