--- trunk/spider/swishspider 2003/08/04 16:41:14 44 +++ trunk/spider/swishspider 2003/11/19 12:07:07 45 @@ -25,6 +25,8 @@ my $request = new HTTP::Request( "GET", $url ); my $response = $ua->simple_request( $request ); +my $urlbase = $response->base; +$urlbase =~ s,/[^/]*$,/,; # remove filename # # Write out important meta-data. This includes the HTTP code. Depending on the @@ -82,6 +84,23 @@ # remove comments between and texi2html inserts them # there and swish can't find document title then (libxml or swish bug?) while ($contents =~ s/(.*)(.*)/$1$2/msi) { }; + + # remove TPJ left column + if ($contents =~ s,.+?,,isg) { + my $title; + # extract title and add to title + if ($contents =~ m,\s*]*>(.+?),si) { + $title = $1; + } elsif ($contents =~ m,]*>(.+?),is) { + $title = $1; + } elsif ($contents =~ m,]*>(.+?),is) { + $title = $1; + } else { + $title = "no detail title"; + } + $contents =~ s,()([^<]+)(),$1$2: $title$3,gsi if ($title); + + } } print CONTENTS $contents; close( CONTENTS ); @@ -91,6 +110,8 @@ my $p = HTML::LinkExtor->new( \&linkcb, $url ); $p->parse( $contents ); + + close( LINKS ); } } @@ -112,6 +133,15 @@ # $link =~ s/\.\.\///g; + if ($link =~ m,javascript:displayWindow\((.+)\),i) { + my $arg = $1; + $arg =~ s/%([a-f0-9][a-f][0-9])/chr(hex($1))/eg; + ($link,undef) = split(',',$arg,2); + $link =~ s/^['"]//; + $link =~ s/['"]$//; + $link = $urlbase.$link; + } + # hack for apostrophe -- changes URL, but should work for most clients. $link =~ s/'/%27/g;