--- trunk/spider/swishspider 2003/03/24 09:57:44 30 +++ trunk/spider/swishspider 2003/11/19 12:07:07 45 @@ -18,11 +18,15 @@ my $no_parent_url; if ($url =~ m/\s/) { - ($url,$no_parent_url) = split(/\s/,$url,2); + ($no_parent_url,$url) = split(/\s/,$url,2); + # old scheme had URL, no parent and new is reverse + ($url,$no_parent_url) = ($no_parent_url,$url) if ($no_parent_url =~ m/\Q$url\E/); } my $request = new HTTP::Request( "GET", $url ); my $response = $ua->simple_request( $request ); +my $urlbase = $response->base; +$urlbase =~ s,/[^/]*$,/,; # remove filename # # Write out important meta-data. This includes the HTTP code. Depending on the @@ -39,7 +43,7 @@ if ($no_parent_url) { if ($link =~ m/$no_parent_url/) { # if this URL is below parent URL o.k.... - print RESP "$link $no_parent_url\n"; + print RESP "$no_parent_url $link\n"; } else { # if not, crawl just this page! print RESP "$link $link\n"; @@ -59,20 +63,45 @@ my $contents = $response->content(); open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents\n" ); - # if you don't want content to be indexed, include it in - # foobar tags or surround it with comments - # foobar - $contents =~ s,.+?,,isg; - $contents =~ s,.+?,,isg; - # this will remove all script from indexing content - $contents =~ s,,,isg; - # remap Windows charset to ISO-8859-2 - $contents =~ tr/šðžèæŠÐŽÈÆ/¹ð¾èæ©Ð®ÈÆ/; # 1250 -> iso8859-2 - # this will fix badly formatted html in form: - # some text - # which will confuse indexer (or libxml2?) - $contents =~ s/[\n\r]^(>)/$1\n/msg; + # fixup just HTML files + if ($response->header("content-type") =~ "text/html") { + # if you don't want content to be indexed, include it in + # foobar tags or surround it with comments + # foobar + # foobar (also supported by swish) + $contents =~ s,.+?,,isg; + $contents =~ s,.+?,,isg; + $contents =~ s,.+?,,isg; + # this will remove all script from indexing content + $contents =~ s,,,isg; + # remap Windows charset to ISO-8859-2 + $contents =~ tr/šðžèæŠÐŽÈÆ/¹ð¾èæ©Ð®ÈÆ/; # 1250 -> iso8859-2 + # this will fix badly formatted html in form: + # some text + # which will confuse indexer (or libxml2?) + $contents =~ s/[\n\r]^(>)/$1\n/msg; + # remove comments between and texi2html inserts them + # there and swish can't find document title then (libxml or swish bug?) + while ($contents =~ s/(.*)(.*)/$1$2/msi) { }; + + # remote TPJ left column + if ($contents =~ s,.+?,,isg) { + my $title; + # extract title and add to title + if ($contents =~ m,\s*]*>(.+?),si) { + $title = $1; + } elsif ($contents =~ m,]*>(.+?),is) { + $title = $1; + } elsif ($contents =~ m,]*>(.+?),is) { + $title = $1; + } else { + $title = "no detail title"; + } + $contents =~ s,()([^<]+)(),$1$2: $title$3,gsi if ($title); + + } + } print CONTENTS $contents; close( CONTENTS ); @@ -81,6 +110,8 @@ my $p = HTML::LinkExtor->new( \&linkcb, $url ); $p->parse( $contents ); + + close( LINKS ); } } @@ -88,8 +119,8 @@ sub linkcb { my($tag, %links) = @_; - if (($tag eq "a" || $tag eq "area") && ($links{"href"})) { - my $link = $links{"href"}; + if (($tag eq "a" || $tag eq "area") && ($links{"href"}) || ($tag eq "frame" && $links{"src"})) { + my $link = $links{"href"} || $links{"src"}; # # Remove fragments @@ -102,6 +133,15 @@ # $link =~ s/\.\.\///g; + if ($link =~ m,javascript:displayWindow\((.+)\),i) { + my $arg = $1; + $arg =~ s/%([a-f0-9][a-f][0-9])/chr(hex($1))/eg; + ($link,undef) = split(',',$arg,2); + $link =~ s/^['"]//; + $link =~ s/['"]$//; + $link = $urlbase.$link; + } + # hack for apostrophe -- changes URL, but should work for most clients. $link =~ s/'/%27/g; @@ -113,7 +153,7 @@ if ($no_parent_url) { if ($link =~ m/$no_parent_url/) { - print LINKS "$link $no_parent_url\n"; + print LINKS "$no_parent_url $link\n"; # print STDERR "using $link\n"; # } else { # print STDERR "skipping $link\n";