--- trunk/spider/swishspider 2003/08/04 16:41:14 44
+++ trunk/spider/swishspider 2003/11/19 12:07:07 45
@@ -25,6 +25,8 @@
my $request = new HTTP::Request( "GET", $url );
my $response = $ua->simple_request( $request );
+my $urlbase = $response->base;
+$urlbase =~ s,/[^/]*$,/,; # remove filename
#
# Write out important meta-data. This includes the HTTP code. Depending on the
@@ -82,6 +84,23 @@
# remove comments between and
texi2html inserts them
# there and swish can't find document title then (libxml or swish bug?)
while ($contents =~ s/(.*)(.*)/$1$2/msi) { };
+
+ # remove TPJ left column
+ if ($contents =~ s,.+?,,isg) {
+ my $title;
+ # extract title and add to title
+ if ($contents =~ m,\s*]*>(.+?)
,si) {
+ $title = $1;
+ } elsif ($contents =~ m,]*>(.+?)
,is) {
+ $title = $1;
+ } elsif ($contents =~ m,]*>(.+?)
,is) {
+ $title = $1;
+ } else {
+ $title = "no detail title";
+ }
+ $contents =~ s,()([^<]+)(),$1$2: $title$3,gsi if ($title);
+
+ }
}
print CONTENTS $contents;
close( CONTENTS );
@@ -91,6 +110,8 @@
my $p = HTML::LinkExtor->new( \&linkcb, $url );
$p->parse( $contents );
+
+
close( LINKS );
}
}
@@ -112,6 +133,15 @@
#
$link =~ s/\.\.\///g;
+ if ($link =~ m,javascript:displayWindow\((.+)\),i) { # pseudo-link of the form javascript:displayWindow(target, ...)
+ my $arg = $1;
+ $arg =~ s/%([0-9a-fA-F]{2})/chr(hex($1))/eg; # decode %XX escapes: exactly two hex digits, either case
+ ($link,undef) = split(',',$arg,2); # keep only the first argument; the remaining arguments are discarded
+ $link =~ s/^['"]//;
+ $link =~ s/['"]$//;
+ $link = $urlbase.$link; # prefix with the response base URL minus its filename part
+ }
+
# hack for apostrophe -- changes URL, but should work for most clients.
$link =~ s/'/%27/g;