/[swish]/trunk/spider/swishspider
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/spider/swishspider

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 39 by dpavlin, Wed Apr 30 12:40:09 2003 UTC revision 40 by dpavlin, Sun Jun 1 11:45:19 2003 UTC
# Line 18  my $url = shift; Line 18  my $url = shift;
18    
19  my $no_parent_url;  my $no_parent_url;
20  if ($url =~ m/\s/) {  if ($url =~ m/\s/) {
21          ($url,$no_parent_url) = split(/\s/,$url,2);          ($no_parent_url,$url) = split(/\s/,$url,2);
22            # old scheme had URL, no parent and new is reverse
23            ($url,$no_parent_url) = ($no_parent_url,$url) if ($no_parent_url =~ m/$url/);
24  }  }
25    
26  my $request = new HTTP::Request( "GET", $url );  my $request = new HTTP::Request( "GET", $url );
# Line 39  if( $response->code() == RC_OK ) { Line 41  if( $response->code() == RC_OK ) {
41          if ($no_parent_url) {          if ($no_parent_url) {
42                  if ($link =~ m/$no_parent_url/) {                  if ($link =~ m/$no_parent_url/) {
43                          # if this URL is below parent URL o.k....                          # if this URL is below parent URL o.k....
44                          print RESP "$link $no_parent_url\n";                          print RESP "$no_parent_url $link\n";
45                  } else {                  } else {
46                          # if not, crawl just this page!                          # if not, crawl just this page!
47                          print RESP "$link $link\n";                          print RESP "$link $link\n";
# Line 59  if( $response->code() == RC_OK ) { Line 61  if( $response->code() == RC_OK ) {
61      my $contents = $response->content();      my $contents = $response->content();
62    
63      open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents\n" );      open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents\n" );
64      # if you don't want content to be indexed, include it in      # fixup just HTML files
65      # <noindex> foobar </noindex> tags or surround it with comments      if ($response->header("content-type") =~ "text/html") {
66      # <!-- noindex --> foobar <!-- /noindex -->          # if you don't want content to be indexed, include it in
67      $contents =~ s,<noindex>.+?</noindex>,,isg;          # <noindex> foobar </noindex> tags or surround it with comments
68      $contents =~ s,<!--\s*noindex\s*-->.+?<!--\s*/noindex\s*-->,,isg;          # <!-- noindex --> foobar <!-- /noindex -->
69      # this will remove all script from indexing content          $contents =~ s,<noindex>.+?</noindex>,,isg;
70      $contents =~ s,<script>.+?</script>,,isg;          $contents =~ s,<!--\s*noindex\s*-->.+?<!--\s*/noindex\s*-->,,isg;
71      # remap Windows charset to ISO-8859-2          # this will remove all script from indexing content
72      $contents =~ tr/šðžèæŠÐŽÈÆ/¹ð¾èæ©Ð®ÈÆ/;     # 1250 -> iso8859-2          $contents =~ s,<script>.+?</script>,,isg;
73      # this will fix badly formatted html in form:          # remap Windows charset to ISO-8859-2
74      # <head><title>some text</title          $contents =~ tr/šðžèæŠÐŽÈÆ/¹ð¾èæ©Ð®ÈÆ/; # 1250 -> iso8859-2
75      # ></head>          # this will fix badly formatted html in form:
76      # which will confuse indexer (or libxml2?)          # <head><title>some text</title
77      $contents =~ s/[\n\r]^(>)/$1\n/msg;          # ></head>
78      # remove comments between <html> and <head> texi2html inserts them          # which will confuse indexer (or libxml2?)
79      # there and swish can't find document title then (libxml or swish bug?)          $contents =~ s/[\n\r]^(>)/$1\n/msg;
80      while ($contents =~ s/(<html>.*)<!--.*?-->(.*<head>)/$1$2/msi) { };          # remove comments between <html> and <head> texi2html inserts them
81            # there and swish can't find document title then (libxml or swish bug?)
82            while ($contents =~ s/(<html>.*)<!--.*?-->(.*<head>)/$1$2/msi) { };
83        }
84      print CONTENTS $contents;      print CONTENTS $contents;
85      close( CONTENTS );      close( CONTENTS );
86    
# Line 91  if( $response->code() == RC_OK ) { Line 96  if( $response->code() == RC_OK ) {
96    
97  sub linkcb {  sub linkcb {
98      my($tag, %links) = @_;      my($tag, %links) = @_;
99      if (($tag eq "a" || $tag eq "area") && ($links{"href"})) {      if (($tag eq "a" || $tag eq "area") && ($links{"href"}) || ($tag eq "frame" && $links{"src"})) {
100          my $link = $links{"href"};          my $link = $links{"href"} || $links{"src"};
101    
102          #          #
103          # Remove fragments          # Remove fragments
# Line 116  sub linkcb { Line 121  sub linkcb {
121    
122          if ($no_parent_url) {          if ($no_parent_url) {
123                          if ($link =~ m/$no_parent_url/) {                          if ($link =~ m/$no_parent_url/) {
124                                  print LINKS "$link $no_parent_url\n";                                  print LINKS "$no_parent_url $link\n";
125  #                               print STDERR "using $link\n";  #                               print STDERR "using $link\n";
126  #                       } else {  #                       } else {
127  #                               print STDERR "skipping $link\n";  #                               print STDERR "skipping $link\n";

Legend:
Removed from v.39  
changed lines
  Added in v.40

  ViewVC Help
Powered by ViewVC 1.1.26