| 1 |
1 |
dpavlin |
#! /usr/bin/ruby -w |
| 2 |
|
|
|
| 3 |
|
|
#------------------------------------------------------------------------------------------------- |
| 4 |
|
|
# wgettsv |
| 5 |
|
|
# Collect WWW resources and generate TSV data |
| 6 |
|
|
# Copyright (C) 2006-2009 Mikio Hirabayashi |
| 7 |
|
|
# This file is part of Tokyo Cabinet. |
| 8 |
|
|
# Tokyo Cabinet is free software; you can redistribute it and/or modify it under the terms of |
| 9 |
|
|
# the GNU Lesser General Public License as published by the Free Software Foundation; either |
| 10 |
|
|
# version 2.1 of the License or any later version. Tokyo Cabinet is distributed in the hope |
| 11 |
|
|
# that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 12 |
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public |
| 13 |
|
|
# License for more details. |
| 14 |
|
|
# You should have received a copy of the GNU Lesser General Public License along with Tokyo |
| 15 |
|
|
# Cabinet; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, |
| 16 |
|
|
# Boston, MA 02111-1307 USA. |
| 17 |
|
|
#------------------------------------------------------------------------------------------------- |
| 18 |
|
|
|
| 19 |
|
|
|
| 20 |
|
|
require 'open-uri' |
| 21 |
|
|
require 'iconv' |
| 22 |
|
|
require 'kconv' |
| 23 |
|
|
require 'date' |
| 24 |
|
|
require 'time' |
| 25 |
|
|
require 'cgi' |
| 26 |
|
|
|
| 27 |
|
|
|
| 28 |
|
|
# main routine |
| 29 |
|
|
# Parse the command line (ARGV), build the URL filter list and start the crawl.
#
# Options are recognized only before the first seed URL; once a seed has been
# seen, every remaining argument must be an http:// URL.  Unless -ndf is given,
# a built-in list of deny filters is appended after the user-supplied ones.
#
# Any malformed command line (unknown option, missing option value, invalid
# regular expression, non-http argument, no seed URL) is routed to usage.
#
# Returns the exit status for the process: 0 if the value returned by proc is
# truthy, 1 otherwise.
def main
  seeds = []
  hist = {}
  filters = []
  max = 1 << 30
  lim = 1 << 20
  wait = 0
  ndf = false
  i = 0
  # An index loop is kept on purpose: options with a value consume the next argument.
  while i < ARGV.length
    arg = ARGV[i]
    if seeds.empty? && arg =~ /\A-/
      case arg
      when '-allow', '-deny'
        usage if (i += 1) >= ARGV.length
        begin
          regex = Regexp.new(ARGV[i])
        rescue RegexpError => e
          # Report a bad pattern as a usage error instead of dying with a backtrace.
          STDERR.printf("%s: invalid regular expression: %s\n", $progname, e.message)
          usage
        end
        filters.push([arg == '-allow', regex])
      when '-max'
        usage if (i += 1) >= ARGV.length
        max = ARGV[i].to_i
      when '-lim'
        usage if (i += 1) >= ARGV.length
        lim = ARGV[i].to_i
      when '-wait'
        usage if (i += 1) >= ARGV.length
        wait = ARGV[i].to_f
      when '-ndf'
        ndf = true
      else
        usage
      end
    elsif arg =~ /\Ahttp:\/\//i
      seeds.push(arg)
      hist[arg] = true
    else
      usage
    end
    i += 1
  end
  usage if seeds.empty?
  unless ndf
    # Default deny filters.  They are appended after the user's filters, and
    # checkurl lets the last matching filter win.
    [
      /\.(txt|text|asc|c|cc|cxx|cpp|h|hxx|hpp|in)$/i,
      /\.(css|js|csv|tsv|log|md5|crc|conf|ini|inf|lnk|sys|tmp|bak)$/i,
      /\.(xml|xsl|xslt|rdf|rss|dtd|sgml|sgm)$/i,
      /\.(pgp|sig|cer|csr|pem|key|b64|uu|uue|[0-9])$/i,
      /\.(rtf|pdf|ps|eps|ai|doc|xls|ppt|sxw|sxc|sxi|xdw|jtd|oas|swf)$/i,
      /\.(zip|tar|tgz|gz|bz2|tbz2|z|lha|lzh)?$/i,
      /\.(7z|lzo|lzma|cpio|shar|cab|rar|sit|ace|hqx)?$/i,
      /\.(bin|o|a|so|exe|dll|lib|obj|ocx|class|jar|war)?$/i,
      /\.(rpm|deb|qdb|dbx|dbf|dat|msi|bat|com|iso)?$/i,
      /\.(png|gif|jpg|jpeg|tif|tiff|bmp|ico|pbm|pgm|ppm|xbm|xpm|dvi)$/i,
      /\.(au|snd|mid|midi|kar|smf|mp2|mp3|m3u|wav|wma|wmp|asx|at3|aif)$/i,
      /\.(mpg|mpeg|qt|mov|avi|wmv|wvx|asf|ram|rm)$/i,
      /\.(tch|tdb|tdf|tct)$/i,
      /\.idx\./i,
      # "LISENCE" is kept for backward compatibility; "LICENSE" is the spelling that was meant.
      /(core|casket|Makefile|README|NEWS|COPYING|LICENSE|LISENCE)($|\/)/i
    ].each { |rx| filters.push([false, rx]) }
  end
  proc(seeds, hist, filters, max, lim, wait) ? 0 : 1
end
| 92 |
|
|
|
| 93 |
|
|
|
| 94 |
|
|
# print the command usage and exit |
| 95 |
|
|
# Print the command synopsis to standard error and terminate the process.
#
# The program name shown is taken from the global $progname.
# Never returns: always calls exit with status 1.
def usage
  STDERR.printf("%s: collect WWW resources and generate TSV data\n", $progname)
  STDERR.printf("\n")
  STDERR.printf("usage:\n")
  # -ndf (skip the built-in deny filters) is accepted by main, so it is listed here too.
  STDERR.printf("  %s [-allow regex] [-deny regex] [-max num] [-lim num] [-wait num] [-ndf]" +
                " url ...\n", $progname)
  STDERR.printf("\n")
  exit(1)
end
| 104 |
|
|
|
| 105 |
|
|
|
| 106 |
|
|
# fetch each content |
| 107 |
|
|
# Fetch each queued URL, print one TSV record per accepted HTML page on
# standard output, and queue the links found in it.
#
# seeds   - Array of URLs still to fetch; consumed from the front, newly
#           discovered links are appended to it.
# hist    - Hash used as a set of URLs already queued or visited.
# filters - Array of [allow, regex] pairs, passed to checkurl for every link.
# max     - maximum number of pages to record.
# lim     - maximum accepted content size (compared against sio.size).
# wait    - seconds to sleep after each fetch attempt; ignored unless positive.
#
# A page that cannot be fetched or is rejected (size, binary data, not HTML) is
# reported on standard error and skipped.  Always returns 0.
def proc(seeds, hist, filters, max, lim, wait)
  cnt = 0
  while (url = seeds.shift) && cnt < max
    STDERR.printf("%d: getting: %s\n", cnt + 1, url)
    begin
      opts = {}
      OpenURI.open_uri(url, 0, 0, opts) do |sio|
        # After a redirect, continue with the final location and remember it.
        baseuri = sio.base_uri
        if baseuri && baseuri.to_s != url
          url = baseuri.to_s
          hist[url] = true
        end
        size = sio.size
        raise "invalid size" if size > lim || size < 3
        type = sio.content_type || "text/plain"
        str = sio.read
        head = str[0, 2048]
        charset = nil
        # Compare byte values via unpack: String#[] with an index returns an
        # Integer only on Ruby 1.8, so the numeric comparison is done on bytes.
        bom = head.unpack("C2")
        if bom == [0xfe, 0xff] || bom == [0xff, 0xfe]
          str = Kconv::kconv(str, Kconv::UTF8, Kconv::UTF16)
          charset = "UTF-8"
        elsif str.include?("\0")
          raise "binary data"
        end
        raise "not HTML" if type != "text/html" && head !~ /<html/i && head !~ /<body/i
        # Charset priority: BOM, XML declaration, <meta> content-type, HTTP header.
        charset ||= head[/<\?xml.*encoding=["']?([-_a-zA-Z0-9]+)["']/im, 1]
        charset ||= head[/<meta.*content-type.*charset=([-_a-zA-Z0-9]+)/im, 1]
        charset = sio.charset if !charset || charset.length < 1
        if charset && charset.length > 0
          if charset !~ /^UTF-?8$/i
            begin
              nstr = Iconv.conv("UTF-8", charset, str)
              str = nstr if nstr && nstr.length > 0
            rescue
              # Unknown or wrong charset label: fall back to Kconv's guess.
              str = str.toutf8
            end
          end
        else
          str = str.toutf8
        end
        body = str.gsub(/.*<body[^>]*>/im, "")
        body = body.gsub(/<\/body>.*/im, "")
        body = htmltotext(body)
        title = str[/<title[^>]*>([^<]*)<\/title>/im, 1]
        title = title ? htmltotext(title) : ""
        title = title[0, 128] if title.length > 128
        mtime = sio.last_modified
        mtime = mtime ? mtime.to_i : 0
        printf("%d", cnt + 1)
        printf("\turl\t%s", url)
        printf("\tsize\t%s", size) if size > 0
        printf("\tmtime\t%s", mtime) if mtime > 0
        printf("\ttitle\t%s", title) if title.length > 0
        printf("\tbody\t%s", body) if body.length > 0
        printf("\n")
        str.scan(/<a[^>]*>/im) do |tag|
          href = tag[/href=["']?([^"'>]+)/i, 1]
          next unless href
          begin
            href = URI::join(url, href).to_s
          rescue StandardError
            # A single malformed link must not abort the page: its record has
            # already been printed and the remaining links are still wanted.
            next
          end
          href = href.gsub(/#.*/, "")
          if !hist[href] && checkurl(href, filters)
            seeds.push(href)
            hist[href] = true
          end
        end
      end
      cnt += 1
    rescue
      STDERR.printf("%d: failed: %s: %s\n", cnt + 1, url, $!)
    end
    sleep(wait) if wait > 0
  end
  return 0
end
| 193 |
|
|
|
| 194 |
|
|
|
| 195 |
|
|
# convert HTML to plain-text |
| 196 |
|
|
def htmltotext(str) |
| 197 |
|
|
str = str.gsub(/<style[^>]*>.*?<\/style>/im, " ") |
| 198 |
|
|
str = str.gsub(/<script[^>]*>.*?<\/script>/im, " ") |
| 199 |
|
|
str = str.gsub(/<\/?(p|br|div|h1|h2|h3|h4|h5|h6|ul|ol|dl|li|dd|dt|td|th|pre)[^>]*>/im, " ") |
| 200 |
|
|
str = str.gsub(/<[^>]*>/, "") |
| 201 |
|
|
str = str.gsub(/&(nbsp|#160|#0160|#xa0|#x00a0);/i, " ") |
| 202 |
|
|
hexrx = Regexp::new("^&#x[0-9a-zA-Z]+;") |
| 203 |
|
|
decrx = Regexp::new("^&#[0-9]+;") |
| 204 |
|
|
str = str.gsub(/&#?[A-Za-z0-9]+;/) do |pat| |
| 205 |
|
|
case pat |
| 206 |
|
|
when "<" |
| 207 |
|
|
pat = '<' |
| 208 |
|
|
when ">" |
| 209 |
|
|
pat = '>' |
| 210 |
|
|
when """ |
| 211 |
|
|
pat = '"' |
| 212 |
|
|
when "'" |
| 213 |
|
|
pat = "'" |
| 214 |
|
|
when " " |
| 215 |
|
|
pat = " " |
| 216 |
|
|
else |
| 217 |
|
|
begin |
| 218 |
|
|
if pat =~ hexrx |
| 219 |
|
|
pat = [ pat.gsub(/&#x([A-Za-z0-9]+);/i, '\1').hex ].pack("n") |
| 220 |
|
|
pat = Iconv.conv("UTF-8", "UTF-16BE", pat) |
| 221 |
|
|
elsif pat =~ decrx |
| 222 |
|
|
pat = [ pat.gsub(/&#([A-Za-z0-9]+);/i, '\1').to_i ].pack("n") |
| 223 |
|
|
pat = Iconv.conv("UTF-8", "UTF-16BE", pat) |
| 224 |
|
|
else |
| 225 |
|
|
pat = " " |
| 226 |
|
|
end |
| 227 |
|
|
rescue |
| 228 |
|
|
pat = "" |
| 229 |
|
|
end |
| 230 |
|
|
end |
| 231 |
|
|
pat |
| 232 |
|
|
end |
| 233 |
|
|
str = str.gsub(/[\x00-\x20]/, " ") |
| 234 |
|
|
str = str.gsub(/\xe3\x80\x80/, " ") |
| 235 |
|
|
str = str.gsub(/ +/, " ") |
| 236 |
|
|
str = str.gsub(/^ */, "") |
| 237 |
|
|
str = str.gsub(/ *$/, "") |
| 238 |
|
|
return str |
| 239 |
|
|
end |
| 240 |
|
|
|
| 241 |
|
|
|
| 242 |
|
|
# check whether the url passes the filters or not |
| 243 |
|
|
# Decide whether a URL may be queued for fetching.
#
# url     - String to test.
# filters - Array of [allow, regex] pairs; allow is true for an allow filter
#           and false for a deny filter.
#
# Only strings that start with "http://" (case-insensitive) are eligible.
# With no filters every such URL passes.  Otherwise the verdict of the last
# filter whose regex matches wins; when none matches, the default is the
# opposite of the first filter's kind (deny if the list starts with an allow
# filter, allow if it starts with a deny filter).
#
# Returns true if the URL passes, false otherwise.
def checkurl(url, filters)
  # \A anchors at the start of the string; ^ would also accept a later line
  # of a multi-line string.
  return false if url !~ /\Ahttp:\/\//i
  return true if filters.empty?
  ok = !filters[0][0]
  filters.each do |allow, regex|
    ok = allow if url =~ regex
  end
  return ok
end
| 252 |
|
|
|
| 253 |
|
|
|
| 254 |
|
|
# execute the actual operations |
| 255 |
|
|
# Script entry point: unbuffered standard output, program name without its
# directory part, then run main and exit with its status.
STDOUT.sync = true
$progname = $0.gsub(/.*\//, "")
srand
exit(main)