/[swish]/trunk/spider/swishspider
This is a repository of my old source code, which is no longer updated. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/spider/swishspider

Parent Directory | Revision Log | View Patch

revision 1 by dpavlin, Tue Jun 4 06:39:53 2002 UTC revision 32 by dpavlin, Wed Apr 30 12:40:09 2003 UTC
# Line 59  if( $response->code() == RC_OK ) { Line 59  if( $response->code() == RC_OK ) {
59      my $contents = $response->content();      my $contents = $response->content();
60    
61      open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents\n" );      open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents\n" );
62        # if you don't want content to be indexed, include it in
63        # <noindex> foobar </noindex> tags or surround it with comments
64        # <!-- noindex --> foobar <!-- /noindex -->
65      $contents =~ s,<noindex>.+?</noindex>,,isg;      $contents =~ s,<noindex>.+?</noindex>,,isg;
66      $contents =~ s,<!--\s*noindex\s*-->.+?<!--\s*/noindex\s*-->,,isg;      $contents =~ s,<!--\s*noindex\s*-->.+?<!--\s*/noindex\s*-->,,isg;
67        # this will remove all script from indexing content
68      $contents =~ s,<script>.+?</script>,,isg;      $contents =~ s,<script>.+?</script>,,isg;
69        # remap Windows charset to ISO-8859-2
70      $contents =~ tr/šðžèæŠÐŽÈÆ/¹ð¾èæ©Ð®ÈÆ/;     # 1250 -> iso8859-2      $contents =~ tr/šðžèæŠÐŽÈÆ/¹ð¾èæ©Ð®ÈÆ/;     # 1250 -> iso8859-2
71        # this will fix badly formatted html in form:
72        # <head><title>some text</title
73        # ></head>
74        # which will confuse indexer (or libxml2?)
75        $contents =~ s/[\n\r]^(>)/$1\n/msg;
76        # remove comments between <html> and <head> texi2html inserts them
77        # there and swish can't find document title then (libxml or swish bug?)
78        while ($contents =~ s/(<html>.*)<!--.*?-->(.*<head>)/$1$2/msi) { };
79      print CONTENTS $contents;      print CONTENTS $contents;
80      close( CONTENTS );      close( CONTENTS );
81    
# Line 78  if( $response->code() == RC_OK ) { Line 91  if( $response->code() == RC_OK ) {
91    
92  sub linkcb {  sub linkcb {
93      my($tag, %links) = @_;      my($tag, %links) = @_;
94      if (($tag eq "a") && ($links{"href"})) {      if (($tag eq "a" || $tag eq "area") && ($links{"href"})) {
95          my $link = $links{"href"};          my $link = $links{"href"};
96    
97          #          #
# Line 98  sub linkcb { Line 111  sub linkcb {
111          # hack for Apache directory listings          # hack for Apache directory listings
112          $link =~ s,/\?[NMSD]=[AD]$,/,g;          $link =~ s,/\?[NMSD]=[AD]$,/,g;
113    
114            # speedup, skip pictures
115            return if ($link =~ m/\.(gif|jpg|png)/);
116    
117          if ($no_parent_url) {          if ($no_parent_url) {
118                          if ($link =~ m/$no_parent_url/) {                          if ($link =~ m/$no_parent_url/) {
119                                  print LINKS "$link $no_parent_url\n";                                  print LINKS "$link $no_parent_url\n";

Legend:
Removed from v.1  
changed lines
  Added in v.32

  ViewVC Help
Powered by ViewVC 1.1.26