/[swish]/trunk/spider/swishspider
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/spider/swishspider

Parent Directory | Revision Log | View Patch

revision 15 by dpavlin, Sun Mar 16 21:31:55 2003 UTC revision 30 by dpavlin, Mon Mar 24 09:57:44 2003 UTC
# Line 59  if( $response->code() == RC_OK ) { Line 59  if( $response->code() == RC_OK ) {
59      my $contents = $response->content();      my $contents = $response->content();
60    
61      open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents\n" );      open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents\n" );
62        # if you don't want content to be indexed, enclose it in
63        # <noindex> foobar </noindex> tags or surround it with comments
64        # <!-- noindex --> foobar <!-- /noindex -->
65      $contents =~ s,<noindex>.+?</noindex>,,isg;      $contents =~ s,<noindex>.+?</noindex>,,isg;
66      $contents =~ s,<!--\s*noindex\s*-->.+?<!--\s*/noindex\s*-->,,isg;      $contents =~ s,<!--\s*noindex\s*-->.+?<!--\s*/noindex\s*-->,,isg;
67        # this will remove all scripts from the indexed content
68      $contents =~ s,<script>.+?</script>,,isg;      $contents =~ s,<script>.+?</script>,,isg;
69        # remap Windows charset to ISO-8859-2
70      $contents =~ tr/šðžèæŠÐŽÈÆ/¹ð¾èæ©Ð®ÈÆ/;     # 1250 -> iso8859-2      $contents =~ tr/šðžèæŠÐŽÈÆ/¹ð¾èæ©Ð®ÈÆ/;     # 1250 -> iso8859-2
71        # this will fix badly formatted HTML of the form:
72        # <head><title>some text</title
73        # ></head>
74        # which will confuse the indexer (or libxml2?)
75        $contents =~ s/[\n\r]^(>)/$1\n/msg;
76      print CONTENTS $contents;      print CONTENTS $contents;
77      close( CONTENTS );      close( CONTENTS );
78    

Legend:
Removed from v.15  
changed lines
  Added in v.30

  ViewVC Help
Powered by ViewVC 1.1.26