/[swish]/trunk/spider/swishspider
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/spider/swishspider

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 30 by dpavlin, Mon Mar 24 09:57:44 2003 UTC revision 45 by dpavlin, Wed Nov 19 12:07:07 2003 UTC
# Line 18  my $url = shift; Line 18  my $url = shift;
18    
19  my $no_parent_url;  my $no_parent_url;
20  if ($url =~ m/\s/) {  if ($url =~ m/\s/) {
21          ($url,$no_parent_url) = split(/\s/,$url,2);          ($no_parent_url,$url) = split(/\s/,$url,2);
22            # old scheme had URL, no parent and new is reverse
23            ($url,$no_parent_url) = ($no_parent_url,$url) if ($no_parent_url =~ m/\Q$url\E/);
24  }  }
25    
26  my $request = new HTTP::Request( "GET", $url );  my $request = new HTTP::Request( "GET", $url );
27  my $response = $ua->simple_request( $request );  my $response = $ua->simple_request( $request );
28    my $urlbase = $response->base;
29    $urlbase =~ s,/[^/]*$,/,;       # remove filename
30    
31  #  #
32  # Write out important meta-data.  This includes the HTTP code.  Depending on the  # Write out important meta-data.  This includes the HTTP code.  Depending on the
# Line 39  if( $response->code() == RC_OK ) { Line 43  if( $response->code() == RC_OK ) {
43          if ($no_parent_url) {          if ($no_parent_url) {
44                  if ($link =~ m/$no_parent_url/) {                  if ($link =~ m/$no_parent_url/) {
45                          # if this URL is below parent URL o.k....                          # if this URL is below parent URL o.k....
46                          print RESP "$link $no_parent_url\n";                          print RESP "$no_parent_url $link\n";
47                  } else {                  } else {
48                          # if not, crawl just this page!                          # if not, crawl just this page!
49                          print RESP "$link $link\n";                          print RESP "$link $link\n";
# Line 59  if( $response->code() == RC_OK ) { Line 63  if( $response->code() == RC_OK ) {
63      my $contents = $response->content();      my $contents = $response->content();
64    
65      open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents\n" );      open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents\n" );
66      # if you don't want content to be indexed, include it in      # fixup just HTML files
67      # <noindex> foobar </noindex> tags or surround it with comments      if ($response->header("content-type") =~ "text/html") {
68      # <!-- noindex --> foobar <!-- /noindex -->          # if you don't want content to be indexed, include it in
69      $contents =~ s,<noindex>.+?</noindex>,,isg;          # <noindex> foobar </noindex> tags or surround it with comments
70      $contents =~ s,<!--\s*noindex\s*-->.+?<!--\s*/noindex\s*-->,,isg;          # <!-- noindex --> foobar <!-- /noindex -->
71      # this will remove all script from indexing content          # <!-- noindex --> foobar <!-- index --> (also supported by swish)
72      $contents =~ s,<script>.+?</script>,,isg;          $contents =~ s,<noindex>.+?</noindex>,,isg;
73      # remap Windows charset to ISO-8859-2          $contents =~ s,<!--\s*noindex\s*-->.+?<!--\s*/noindex\s*-->,,isg;
74      $contents =~ tr/Ў/Ю/;     # 1250 -> iso8859-2          $contents =~ s,<!--\s*noindex\s*-->.+?<!--\s*index\s*-->,,isg;
75      # this will fix badly formatted html in form:          # this will remove all script from indexing content
76      # <head><title>some text</title          $contents =~ s,<script>.+?</script>,,isg;
77      # ></head>          # remap Windows charset to ISO-8859-2
78      # which will confuse indexer (or libxml2?)          $contents =~ tr/Ў/Ю/; # 1250 -> iso8859-2
79      $contents =~ s/[\n\r]^(>)/$1\n/msg;          # this will fix badly formatted html in form:
80            # <head><title>some text</title
81            # ></head>
82            # which will confuse indexer (or libxml2?)
83            $contents =~ s/[\n\r]^(>)/$1\n/msg;
84            # remove comments between <html> and <head> texi2html inserts them
85            # there and swish can't find document title then (libxml or swish bug?)
86            while ($contents =~ s/(<html>.*)<!--.*?-->(.*<head>)/$1$2/msi) { };
87    
88            # remote TPJ left column
89            if ($contents =~ s,<!-- BEGIN LEFT SIDE BAR CELL -->.+?<!-- END LEFT SIDE BAR CELL -->,,isg) {
90                    my $title;
91                    # extract title and add to title
92                    if ($contents =~ m,<!-- the article goes here -->\s*<h2[^>]*>(.+?)</h2>,si) {
93                            $title = $1;
94                    } elsif ($contents =~ m,<h1[^>]*>(.+?)</h1>,is) {
95                            $title = $1;
96                    } elsif ($contents =~ m,<h2[^>]*>(.+?)</h2>,is) {
97                            $title = $1;
98                    } else {
99                            $title = "no detail title";
100                    }
101                    $contents =~ s,(<title>)([^<]+)(</title>),$1$2: $title$3,gsi if ($title);
102    
103            }
104        }
105      print CONTENTS $contents;      print CONTENTS $contents;
106      close( CONTENTS );      close( CONTENTS );
107    
# Line 81  if( $response->code() == RC_OK ) { Line 110  if( $response->code() == RC_OK ) {
110          my $p = HTML::LinkExtor->new( \&linkcb, $url );          my $p = HTML::LinkExtor->new( \&linkcb, $url );
111          $p->parse( $contents );          $p->parse( $contents );
112    
113    
114    
115          close( LINKS );          close( LINKS );
116      }      }
117  }  }
# Line 88  if( $response->code() == RC_OK ) { Line 119  if( $response->code() == RC_OK ) {
119    
120  sub linkcb {  sub linkcb {
121      my($tag, %links) = @_;      my($tag, %links) = @_;
122      if (($tag eq "a" || $tag eq "area") && ($links{"href"})) {      if (($tag eq "a" || $tag eq "area") && ($links{"href"}) || ($tag eq "frame" && $links{"src"})) {
123          my $link = $links{"href"};          my $link = $links{"href"} || $links{"src"};
124    
125          #          #
126          # Remove fragments          # Remove fragments
# Line 102  sub linkcb { Line 133  sub linkcb {
133          #          #
134          $link =~ s/\.\.\///g;          $link =~ s/\.\.\///g;
135    
136            if ($link =~ m,javascript:displayWindow\((.+)\),i) {
137                    my $arg = $1;
138                    $arg =~ s/%([a-f0-9][a-f][0-9])/chr(hex($1))/eg;
139                    ($link,undef) = split(',',$arg,2);
140                    $link =~ s/^['"]//;
141                    $link =~ s/['"]$//;
142                    $link = $urlbase.$link;
143            }
144    
145          # hack for apostrophe -- changes URL, but should work for most clients.          # hack for apostrophe -- changes URL, but should work for most clients.
146          $link =~ s/'/%27/g;          $link =~ s/'/%27/g;
147    
# Line 113  sub linkcb { Line 153  sub linkcb {
153    
154          if ($no_parent_url) {          if ($no_parent_url) {
155                          if ($link =~ m/$no_parent_url/) {                          if ($link =~ m/$no_parent_url/) {
156                                  print LINKS "$link $no_parent_url\n";                                  print LINKS "$no_parent_url $link\n";
157  #                               print STDERR "using $link\n";  #                               print STDERR "using $link\n";
158  #                       } else {  #                       } else {
159  #                               print STDERR "skipping $link\n";  #                               print STDERR "skipping $link\n";

Legend:
Removed from v.30  
changed lines
  Added in v.45

  ViewVC Help
Powered by ViewVC 1.1.26