/[swish]/trunk/spider/swishspider
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/spider/swishspider

Parent Directory | Revision Log | View Patch

revision 1 by dpavlin, Tue Jun 4 06:39:53 2002 UTC | revision 30 by dpavlin, Mon Mar 24 09:57:44 2003 UTC
# Line 59  if( $response->code() == RC_OK ) { Line 59  if( $response->code() == RC_OK ) {
59      my $contents = $response->content();      my $contents = $response->content();
60    
61      open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents\n" );      open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents\n" );
62        # if you don't want content to be indexed, include it in
63        # <noindex> foobar </noindex> tags or surround it with comments
64        # <!-- noindex --> foobar <!-- /noindex -->
65      $contents =~ s,<noindex>.+?</noindex>,,isg;      $contents =~ s,<noindex>.+?</noindex>,,isg;
66      $contents =~ s,<!--\s*noindex\s*-->.+?<!--\s*/noindex\s*-->,,isg;      $contents =~ s,<!--\s*noindex\s*-->.+?<!--\s*/noindex\s*-->,,isg;
67        # this will remove all script from indexing content
68      $contents =~ s,<script>.+?</script>,,isg;      $contents =~ s,<script>.+?</script>,,isg;
69        # remap Windows charset to ISO-8859-2
70      $contents =~ tr/Ў/Ю/;     # 1250 -> iso8859-2      $contents =~ tr/Ў/Ю/;     # 1250 -> iso8859-2
71        # this will fix badly formatted html in form:
72        # <head><title>some text</title
73        # ></head>
74        # which will confuse indexer (or libxml2?)
75        $contents =~ s/[\n\r]^(>)/$1\n/msg;
76      print CONTENTS $contents;      print CONTENTS $contents;
77      close( CONTENTS );      close( CONTENTS );
78    
# Line 78  if( $response->code() == RC_OK ) { Line 88  if( $response->code() == RC_OK ) {
88    
89  sub linkcb {  sub linkcb {
90      my($tag, %links) = @_;      my($tag, %links) = @_;
91      if (($tag eq "a") && ($links{"href"})) {      if (($tag eq "a" || $tag eq "area") && ($links{"href"})) {
92          my $link = $links{"href"};          my $link = $links{"href"};
93    
94          #          #
# Line 98  sub linkcb { Line 108  sub linkcb {
108          # hack for Apache directory listings          # hack for Apache directory listings
109          $link =~ s,/\?[NMSD]=[AD]$,/,g;          $link =~ s,/\?[NMSD]=[AD]$,/,g;
110    
111            # speedup, skip pictures
112            return if ($link =~ m/\.(gif|jpg|png)/);
113    
114          if ($no_parent_url) {          if ($no_parent_url) {
115                          if ($link =~ m/$no_parent_url/) {                          if ($link =~ m/$no_parent_url/) {
116                                  print LINKS "$link $no_parent_url\n";                                  print LINKS "$link $no_parent_url\n";

Legend:
Removed from v.1  
changed lines
  Added in v.30

  ViewVC Help
Powered by ViewVC 1.1.26