25 |
|
|
26 |
my $request = new HTTP::Request( "GET", $url ); |
my $request = new HTTP::Request( "GET", $url ); |
27 |
my $response = $ua->simple_request( $request ); |
my $response = $ua->simple_request( $request ); |
28 |
|
my $urlbase = $response->base; |
29 |
|
$urlbase =~ s,/[^/]*$,/,; # remove filename |
30 |
|
|
31 |
# |
# |
32 |
# Write out important meta-data. This includes the HTTP code. Depending on the |
# Write out important meta-data. This includes the HTTP code. Depending on the |
84 |
# remove comments between <html> and <head> texi2html inserts them |
# remove comments between <html> and <head> texi2html inserts them |
85 |
# there and swish can't find document title then (libxml or swish bug?) |
# there and swish can't find document title then (libxml or swish bug?) |
86 |
while ($contents =~ s/(<html>.*)<!--.*?-->(.*<head>)/$1$2/msi) { }; |
while ($contents =~ s/(<html>.*)<!--.*?-->(.*<head>)/$1$2/msi) { }; |
87 |
|
|
88 |
|
# remove TPJ left column |
89 |
|
if ($contents =~ s,<!-- BEGIN LEFT SIDE BAR CELL -->.+?<!-- END LEFT SIDE BAR CELL -->,,isg) { |
90 |
|
my $title; |
91 |
|
# extract title and add to title |
92 |
|
if ($contents =~ m,<!-- the article goes here -->\s*<h2[^>]*>(.+?)</h2>,si) { |
93 |
|
$title = $1; |
94 |
|
} elsif ($contents =~ m,<h1[^>]*>(.+?)</h1>,is) { |
95 |
|
$title = $1; |
96 |
|
} elsif ($contents =~ m,<h2[^>]*>(.+?)</h2>,is) { |
97 |
|
$title = $1; |
98 |
|
} else { |
99 |
|
$title = "no detail title"; |
100 |
|
} |
101 |
|
$contents =~ s,(<title>)([^<]+)(</title>),$1$2: $title$3,gsi if ($title); |
102 |
|
|
103 |
|
} |
104 |
} |
} |
105 |
print CONTENTS $contents; |
print CONTENTS $contents; |
106 |
close( CONTENTS ); |
close( CONTENTS ); |
110 |
my $p = HTML::LinkExtor->new( \&linkcb, $url ); |
my $p = HTML::LinkExtor->new( \&linkcb, $url ); |
111 |
$p->parse( $contents ); |
$p->parse( $contents ); |
112 |
|
|
113 |
|
|
114 |
|
|
115 |
close( LINKS ); |
close( LINKS ); |
116 |
} |
} |
117 |
} |
} |
133 |
# |
# |
134 |
$link =~ s/\.\.\///g; |
$link =~ s/\.\.\///g; |
135 |
|
|
136 |
|
# Rewrite "javascript:displayWindow(...)" pseudo-links into a plain URL:
# take the first argument of the call, strip one surrounding quote on each
# side, and prefix the result with $urlbase.
# NOTE(review): $urlbase is not declared in this block -- confirm that the
# value assigned from $response->base is actually visible in this scope.
if ($link =~ m,javascript:displayWindow\((.+)\),i) {
    my $arg = $1;

    # Decode %XX percent-escapes. An escape is "%" followed by exactly two
    # hex digits, in either case; matching anything else either skips
    # ordinary escapes such as %20 / %27 or decodes the wrong code point.
    $arg =~ s/%([0-9a-fA-F]{2})/chr(hex($1))/eg;

    # Only the first comma-separated argument is kept; the rest is discarded.
    ($link, undef) = split(',', $arg, 2);

    # Drop a single leading and a single trailing quote character.
    $link =~ s/^['"]//;
    $link =~ s/['"]$//;

    $link = $urlbase.$link;
}
144 |
|
|
145 |
# hack for apostrophe -- changes URL, but should work for most clients. |
# hack for apostrophe -- changes URL, but should work for most clients. |
146 |
$link =~ s/'/%27/g; |
$link =~ s/'/%27/g; |
147 |
|
|