/[swish]/trunk/spider/swishspider
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/spider/swishspider

Parent Directory | Revision Log | View Patch

revision 45 by dpavlin, Wed Nov 19 12:07:07 2003 UTC revision 46 by dpavlin, Sat Jan 17 23:57:55 2004 UTC
# Line 5  use LWP::UserAgent; Line 5  use LWP::UserAgent;
5  use HTTP::Status;  use HTTP::Status;
6  use HTML::LinkExtor;  use HTML::LinkExtor;
7    
8    my $basedir = $0;
9    $basedir =~ s,/[^/]+$,/,;
10    require "$basedir/filter.pm";
11    
12  if (scalar(@ARGV) != 2) {  if (scalar(@ARGV) != 2) {
13      print STDERR "Usage: SwishSpider localpath url\n";      print STDERR "Usage: SwishSpider localpath url\n";
14      exit(1);      exit(1);
# Line 65  if( $response->code() == RC_OK ) { Line 69  if( $response->code() == RC_OK ) {
69      open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents\n" );      open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents\n" );
70      # fixup just HTML files      # fixup just HTML files
71      if ($response->header("content-type") =~ "text/html") {      if ($response->header("content-type") =~ "text/html") {
72          # if you don't want content to be indexed, include it in          $contents = filter($contents);
         # <noindex> foobar </noindex> tags or surround it with comments  
         # <!-- noindex --> foobar <!-- /noindex -->  
         # <!-- noindex --> foobar <!-- index --> (also supported by swish)  
         $contents =~ s,<noindex>.+?</noindex>,,isg;  
         $contents =~ s,<!--\s*noindex\s*-->.+?<!--\s*/noindex\s*-->,,isg;  
         $contents =~ s,<!--\s*noindex\s*-->.+?<!--\s*index\s*-->,,isg;  
         # this will remove all script from indexing content  
         $contents =~ s,<script>.+?</script>,,isg;  
         # remap Windows charset to ISO-8859-2  
         $contents =~ tr/Ў/Ю/; # 1250 -> iso8859-2  
         # this will fix badly formatted html in form:  
         # <head><title>some text</title  
         # ></head>  
         # which will confuse indexer (or libxml2?)  
         $contents =~ s/[\n\r]^(>)/$1\n/msg;  
         # remove comments between <html> and <head> texi2html inserts them  
         # there and swish can't find document title then (libxml or swish bug?)  
         while ($contents =~ s/(<html>.*)<!--.*?-->(.*<head>)/$1$2/msi) { };  
   
         # remote TPJ left column  
         if ($contents =~ s,<!-- BEGIN LEFT SIDE BAR CELL -->.+?<!-- END LEFT SIDE BAR CELL -->,,isg) {  
                 my $title;  
                 # extract title and add to title  
                 if ($contents =~ m,<!-- the article goes here -->\s*<h2[^>]*>(.+?)</h2>,si) {  
                         $title = $1;  
                 } elsif ($contents =~ m,<h1[^>]*>(.+?)</h1>,is) {  
                         $title = $1;  
                 } elsif ($contents =~ m,<h2[^>]*>(.+?)</h2>,is) {  
                         $title = $1;  
                 } else {  
                         $title = "no detail title";  
                 }  
                 $contents =~ s,(<title>)([^<]+)(</title>),$1$2: $title$3,gsi if ($title);  
   
         }  
73      }      }
74      print CONTENTS $contents;      print CONTENTS $contents;
75      close( CONTENTS );      close( CONTENTS );

Legend:
Removed from v.45  
changed lines
  Added in v.46

  ViewVC Help
Powered by ViewVC 1.1.26