/[swish]/trunk/spider/swishspider

This is a repository of my old source code, which is no longer updated. Go to git.rot13.org for current projects!

Diff of /trunk/spider/swishspider

Parent Directory | Revision Log | View Patch

-revision 32 by dpavlin,
Wed Apr 30 12:40:09 2003 UTC
+revision 46 by dpavlin,
Sat Jan 17 23:57:55 2004 UTC
 Line 5 
 use LWP::UserAgent;
  use HTTP::Status;
  use HTML::LinkExtor;
+ my $basedir = $0;
+ $basedir =~ s,/[^/]+$,/,;
+ require "$basedir/filter.pm";
  if (scalar(@ARGV) != 2) {
      print STDERR "Usage: SwishSpider localpath url\n";
      exit(1);
-Line 18 
 my $url = shift;
+Line 22 
 my $url = shift;
  my $no_parent_url;
  if ($url =~ m/\s/) {
-         ($url,$no_parent_url) = split(/\s/,$url,2);
+         ($no_parent_url,$url) = split(/\s/,$url,2);
+         # old scheme had URL, no parent and new is reverse
+         ($url,$no_parent_url) = ($no_parent_url,$url) if ($no_parent_url =~ m/\Q$url\E/);
  }
  my $request = new HTTP::Request( "GET", $url );
  my $response = $ua->simple_request( $request );
+ my $urlbase = $response->base;
+ $urlbase =~ s,/[^/]*$,/,;       # remove filename
  #
  # Write out important meta-data.  This includes the HTTP code.  Depending on the
-Line 39 
 if( $response->code() == RC_OK ) {
+Line 47 
 if( $response->code() == RC_OK ) {
          if ($no_parent_url) {
                  if ($link =~ m/$no_parent_url/) {
                          # if this URL is below parent URL o.k....
-                         print RESP "$link $no_parent_url\n";
+                         print RESP "$no_parent_url $link\n";
                  } else {
                          # if not, crawl just this page!
                          print RESP "$link $link\n";
-Line 59 
 if( $response->code() == RC_OK ) {
+Line 67 
 if( $response->code() == RC_OK ) {
      my $contents = $response->content();
      open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents\n" );
-     # if you don't want content to be indexed, include it in
+     # fixup just HTML files
-     # <noindex> foobar </noindex> tags or surround it with comments
+     if ($response->header("content-type") =~ "text/html") {
-     # <!-- noindex --> foobar <!-- /noindex -->
+         $contents = filter($contents);
-     $contents =~ s,<noindex>.+?</noindex>,,isg;
+     }
-     $contents =~ s,<!--\s*noindex\s*-->.+?<!--\s*/noindex\s*-->,,isg;
-     # this will remove all script from indexing content
-     $contents =~ s,<script>.+?</script>,,isg;
-     # remap Windows charset to ISO-8859-2
-     $contents =~ tr/����Ў��/����Ю��/;     # 1250 -> iso8859-2
-     # this will fix badly formatted html in form:
-     # <head><title>some text</title
-     # ></head>
-     # which will confuse indexer (or libxml2?)
-     $contents =~ s/[\n\r]^(>)/$1\n/msg;
-     # remove comments between <html> and <head> texi2html inserts them
-     # there and swish can't find document title then (libxml or swish bug?)
-     while ($contents =~ s/(<html>.*)<!--.*?-->(.*<head>)/$1$2/msi) { };
      print CONTENTS $contents;
      close( CONTENTS );
-Line 84 
 if( $response->code() == RC_OK ) {
+Line 79 
 if( $response->code() == RC_OK ) {
          my $p = HTML::LinkExtor->new( \&linkcb, $url );
          $p->parse( $contents );
          close( LINKS );
      }
  }
-Line 91 
 if( $response->code() == RC_OK ) {
+Line 88 
 if( $response->code() == RC_OK ) {
  sub linkcb {
      my($tag, %links) = @_;
-     if (($tag eq "a" || $tag eq "area") && ($links{"href"})) {
+     if (($tag eq "a" || $tag eq "area") && ($links{"href"}) || ($tag eq "frame" && $links{"src"})) {
-         my $link = $links{"href"};
+         my $link = $links{"href"} || $links{"src"};
          #
          # Remove fragments
-Line 105 
 sub linkcb {
+Line 102 
 sub linkcb {
          #
          $link =~ s/\.\.\///g;
+         if ($link =~ m,javascript:displayWindow\((.+)\),i) {
+                 my $arg = $1;
+                 $arg =~ s/%([a-f0-9][a-f][0-9])/chr(hex($1))/eg;
+                 ($link,undef) = split(',',$arg,2);
+                 $link =~ s/^['"]//;
+                 $link =~ s/['"]$//;
+                 $link = $urlbase.$link;
+         }
          # hack for apostrophe -- changes URL, but should work for most clients.
          $link =~ s/'/%27/g;
-Line 116 
 sub linkcb {
+Line 122 
 sub linkcb {
          if ($no_parent_url) {
                          if ($link =~ m/$no_parent_url/) {
-                                 print LINKS "$link $no_parent_url\n";
+                                 print LINKS "$no_parent_url $link\n";
  #                               print STDERR "using $link\n";
  #                       } else {
  #                               print STDERR "skipping $link\n";

 Legend:
-Removed from v.32
 changed lines
+Added in v.46

	ViewVC Help
Powered by ViewVC 1.1.26