/[swish]/trunk/spider/swishspider
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/spider/swishspider

Parent Directory | Revision Log | View Patch

revision 41 by dpavlin, Sun Jun 1 11:45:19 2003 UTC | revision 42 by dpavlin, Tue Jul 29 10:40:58 2003 UTC
# Line 20  my $no_parent_url; Line 20  my $no_parent_url;
20  if ($url =~ m/\s/) {  if ($url =~ m/\s/) {
21          ($no_parent_url,$url) = split(/\s/,$url,2);          ($no_parent_url,$url) = split(/\s/,$url,2);
22          # old scheme had URL, no parent and new is reverse          # old scheme had URL, no parent and new is reverse
23          ($url,$no_parent_url) = ($no_parent_url,$url) if ($no_parent_url =~ m/$url/);          ($url,$no_parent_url) = ($no_parent_url,$url) if ($no_parent_url =~ m/\Q$url\E/);
24  }  }
25    
26  my $request = new HTTP::Request( "GET", $url );  my $request = new HTTP::Request( "GET", $url );
# Line 66  if( $response->code() == RC_OK ) { Line 66  if( $response->code() == RC_OK ) {
66          # if you don't want content to be indexed, include it in          # if you don't want content to be indexed, include it in
67          # <noindex> foobar </noindex> tags or surround it with comments          # <noindex> foobar </noindex> tags or surround it with comments
68          # <!-- noindex --> foobar <!-- /noindex -->          # <!-- noindex --> foobar <!-- /noindex -->
69            # <!-- noindex --> foobar <!-- index --> (also supported by swish)
70          $contents =~ s,<noindex>.+?</noindex>,,isg;          $contents =~ s,<noindex>.+?</noindex>,,isg;
71          $contents =~ s,<!--\s*noindex\s*-->.+?<!--\s*/noindex\s*-->,,isg;          $contents =~ s,<!--\s*noindex\s*-->.+?<!--\s*/noindex\s*-->,,isg;
72            $contents =~ s,<!--\s*noindex\s*-->.+?<!--\s*index\s*-->,,isg;
73          # this will remove all script from indexing content          # this will remove all script from indexing content
74          $contents =~ s,<script>.+?</script>,,isg;          $contents =~ s,<script>.+?</script>,,isg;
75          # remap Windows charset to ISO-8859-2          # remap Windows charset to ISO-8859-2

Legend:
Removed from v.41  
changed lines
  Added in v.42

  ViewVC Help
Powered by ViewVC 1.1.26