#! /usr/bin/ruby -w

#-------------------------------------------------------------------------------------------------
# wgettsv
# Collect WWW resources and generate TSV data
# Copyright (C) 2006-2009 Mikio Hirabayashi
# This file is part of Tokyo Cabinet.
# Tokyo Cabinet is free software; you can redistribute it and/or modify it under the terms of
# the GNU Lesser General Public License as published by the Free Software Foundation; either
# version 2.1 of the License or any later version.  Tokyo Cabinet is distributed in the hope
# that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
# License for more details.
# You should have received a copy of the GNU Lesser General Public License along with Tokyo
# Cabinet; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
# Boston, MA 02111-1307 USA.
#-------------------------------------------------------------------------------------------------
18 |
|
|
|
19 |
|
|
|
20 |
|
|
require 'open-uri' |
21 |
|
|
require 'iconv' |
22 |
|
|
require 'kconv' |
23 |
|
|
require 'date' |
24 |
|
|
require 'time' |
25 |
|
|
require 'cgi' |
26 |
|
|
|
27 |
|
|
|
28 |
|
|
# main routine
#
# Parses ARGV, builds the URL filter list and hands everything to `proc`.
#
# Options are only recognized before the first seed URL:
#   -allow regex  append an "allow" filter (pair [true, Regexp])
#   -deny regex   append a "deny" filter (pair [false, Regexp])
#   -max num      value passed to `proc` as `max` (default 1 << 30)
#   -lim num      value passed to `proc` as `lim` (default 1 << 20)
#   -wait num     value passed to `proc` as `wait`, parsed as a float (default 0)
#   -ndf          do not append the built-in deny filters
# Every remaining argument must be an http:// URL and becomes a seed.
#
# Calls `usage` (which exits) on any malformed command line, including a
# pattern that is not a valid regular expression.
#
# Returns the process exit status: 0 when `proc` returns a truthy value, 1 otherwise.
def main
  seeds = []
  hist = {}
  filters = []
  max = 1 << 30
  lim = 1 << 20
  wait = 0
  ndf = false
  i = 0
  while i < ARGV.length
    arg = ARGV[i]
    if seeds.empty? && arg =~ /\A-/
      case arg
      when '-allow', '-deny'
        usage if (i += 1) >= ARGV.length
        begin
          filters.push([arg == '-allow', Regexp.new(ARGV[i])])
        rescue RegexpError
          # a broken pattern is a command line error, not a crash
          usage
        end
      when '-max'
        usage if (i += 1) >= ARGV.length
        max = ARGV[i].to_i
      when '-lim'
        usage if (i += 1) >= ARGV.length
        lim = ARGV[i].to_i
      when '-wait'
        usage if (i += 1) >= ARGV.length
        wait = ARGV[i].to_f
      when '-ndf'
        ndf = true
      else
        usage
      end
    elsif arg =~ /\Ahttp:\/\//i
      seeds.push(arg)
      # seeds are marked as seen so that links back to them are not queued again
      hist[arg] = true
    else
      usage
    end
    i += 1
  end
  usage if seeds.empty?
  unless ndf
    # Built-in deny list; it is appended after the user filters.
    # Every extension group is mandatory: an optional group would also reject
    # any URL that merely ends with a dot.
    [
      /\.(txt|text|asc|c|cc|cxx|cpp|h|hxx|hpp|in)$/i,
      /\.(css|js|csv|tsv|log|md5|crc|conf|ini|inf|lnk|sys|tmp|bak)$/i,
      /\.(xml|xsl|xslt|rdf|rss|dtd|sgml|sgm)$/i,
      /\.(pgp|sig|cer|csr|pem|key|b64|uu|uue|[0-9])$/i,
      /\.(rtf|pdf|ps|eps|ai|doc|xls|ppt|sxw|sxc|sxi|xdw|jtd|oas|swf)$/i,
      /\.(zip|tar|tgz|gz|bz2|tbz2|z|lha|lzh)$/i,
      /\.(7z|lzo|lzma|cpio|shar|cab|rar|sit|ace|hqx)$/i,
      /\.(bin|o|a|so|exe|dll|lib|obj|ocx|class|jar|war)$/i,
      /\.(rpm|deb|qdb|dbx|dbf|dat|msi|bat|com|iso)$/i,
      /\.(png|gif|jpg|jpeg|tif|tiff|bmp|ico|pbm|pgm|ppm|xbm|xpm|dvi)$/i,
      /\.(au|snd|mid|midi|kar|smf|mp2|mp3|m3u|wav|wma|wmp|asx|at3|aif)$/i,
      /\.(mpg|mpeg|qt|mov|avi|wmv|wvx|asf|ram|rm)$/i,
      /\.(tch|tdb|tdf|tct)$/i,
      /\.idx\./i,
      # LISENCE is kept for compatibility with the historical misspelling
      /(core|casket|Makefile|README|NEWS|COPYING|LICENSE|LISENCE)($|\/)/i
    ].each do |regex|
      filters.push([false, regex])
    end
  end
  return proc(seeds, hist, filters, max, lim, wait) ? 0 : 1
end
92 |
|
|
|
93 |
|
|
|
94 |
|
|
# print the command usage and exit
#
# Writes the help text to the standard error stream, using the global
# $progname as the command name, and terminates the process with status 1.
# The option list includes -ndf, which the argument parser in this file accepts.
def usage
  $stderr.printf("%s: collect WWW resources and generate TSV data\n", $progname)
  $stderr.printf("\n")
  $stderr.printf("usage:\n")
  $stderr.printf(" %s [-allow regex] [-deny regex] [-max num] [-lim num] [-wait num] [-ndf]" +
                 " url ...\n", $progname)
  $stderr.printf("\n")
  exit(1)
end
104 |
|
|
|
105 |
|
|
|
106 |
|
|
# fetch each content
#
# Crawls the queue of URLs breadth-first and prints one TSV row per page to
# the standard output: "<id>\turl\t...\tsize\t...\tmtime\t...\ttitle\t...\tbody\t...".
#
# seeds   - Array of URL strings still to fetch; consumed from the front, and
#           newly discovered links are appended to it.
# hist    - Hash of URL string => true for every URL already queued or visited.
# filters - filter list handed to `checkurl` for every discovered link.
# max     - the loop stops once this many pages have been output.
# lim     - a page whose size exceeds this value is rejected.
# wait    - seconds to sleep after each attempt (no sleep when not positive).
#
# A failure on one page is reported on STDERR and the crawl continues.
# Always returns 0.
def proc(seeds, hist, filters, max, lim, wait)
  cnt = 0
  while (url = seeds.shift) && cnt < max
    STDERR.printf("%d: getting: %s\n", cnt + 1, url)
    begin
      opts = {}
      OpenURI.open_uri(url, 0, 0, opts) do |sio|
        # after a redirect, continue with the address that was really served
        baseuri = sio.base_uri
        if baseuri && baseuri.to_s != url
          url = baseuri.to_s
          hist[url] = true
        end
        size = sio.size
        raise "invalid size" if size > lim || size < 3
        type = sio.content_type || "text/plain"
        str = sio.read
        head = str[0, 2048]
        charset = nil
        # Look at the first two bytes as integers; String#[] returns an Integer
        # only on Ruby 1.8, whereas unpack behaves the same on every version.
        bom = str.unpack("C2")
        if bom == [0xfe, 0xff] || bom == [0xff, 0xfe]
          str = Kconv::kconv(str, Kconv::UTF8, Kconv::UTF16)
          charset = "UTF-8"
          # sniff the markup in the converted text, not in the raw UTF-16 bytes
          head = str[0, 2048]
        elsif str.include?("\0")
          raise "binary data"
        end
        raise "not HTML" if type != "text/html" && head !~ /<html/i && head !~ /<body/i
        # charset priority: BOM, XML declaration, meta tag, then the HTTP header
        if !charset && head =~ /<\?xml.*encoding=("|')?[-_a-zA-Z0-9]+("|')/im
          charset = head[/<\?xml.*encoding=["']?([-_a-zA-Z0-9]+)/im, 1]
        end
        if !charset && head =~ /<meta.*content-type.*charset=[-_a-zA-Z0-9]+/im
          charset = head[/<meta.*content-type.*charset=([-_a-zA-Z0-9]+)/im, 1]
        end
        charset = sio.charset if !charset || charset.length < 1
        if charset && charset.length > 0
          if charset !~ /^UTF-?8$/i
            begin
              nstr = Iconv.conv("UTF-8", charset, str)
              str = nstr if nstr && nstr.length > 0
            rescue
              # unknown charset name or undecodable bytes: let Kconv guess
              str = str.toutf8
            end
          end
        else
          str = str.toutf8
        end
        body = str.sub(/.*<body[^>]*>/im, "")
        body = body.sub(/<\/body>.*/im, "")
        body = htmltotext(body)
        title = nil
        if str =~ /<title[^>]*>[^<]*<\/title>/im
          # the leading greedy .* selects the last <title> element of the document
          title = htmltotext(str[/.*<title[^>]*>([^<]*)<\/title>/im, 1])
        end
        title = "" if !title
        title = title[0, 128] if title.length > 128
        mtime = sio.last_modified
        mtime = mtime ? mtime.to_i : 0
        printf("%d", cnt + 1)
        printf("\turl\t%s", url)
        printf("\tsize\t%s", size) if size > 0
        printf("\tmtime\t%s", mtime) if mtime > 0
        printf("\ttitle\t%s", title) if title.length > 0
        printf("\tbody\t%s", body) if body.length > 0
        printf("\n")
        str.scan(/<a[^>]*>/im) do |tag|
          # a capture group works even when the tag spans several lines
          href = tag[/href=["']?([^"'>]+)/i, 1]
          next if !href
          begin
            href = URI::join(url, href.strip).to_s
          rescue URI::Error
            # One unparsable link must not discard the page: its row has already
            # been printed, so failing here would reuse the same id for the next page.
            next
          end
          href = href.sub(/#.*/, "")
          if !hist[href] && checkurl(href, filters)
            seeds.push(href)
            hist[href] = true
          end
        end
      end
      cnt += 1
    rescue
      STDERR.printf("%d: failed: %s: %s\n", cnt + 1, url, $!)
    end
    sleep(wait) if wait > 0
  end
  return 0
end
193 |
|
|
|
194 |
|
|
|
195 |
|
|
# encode one numeric character reference as UTF-8
#
# code - Integer code point.
# Returns the UTF-8 string for the code point, or "" when the value is not a
# valid Unicode scalar value (a surrogate, or above U+10FFFF).
def entity_codepoint_to_utf8(code)
  return "" if code > 0x10ffff || (code >= 0xd800 && code <= 0xdfff)
  [code].pack("U")
rescue
  ""
end


# convert HTML to plain-text
#
# str - String of HTML markup; the gsub-based steps below leave it unmodified.
#
# Steps: drop <style> and <script> elements, turn block-level tags into
# spaces, remove every other tag, decode character references, then collapse
# all runs of whitespace (including the ideographic space U+3000, matched as
# its UTF-8 byte sequence) into single spaces and trim both ends.
#
# Named references &lt; &gt; &amp; &quot; &apos; &nbsp; are decoded; any other
# named reference becomes a space.  Numeric references (decimal and
# hexadecimal) are decoded for the whole Unicode range, not only for
# 16-bit code points.
#
# Returns the resulting single-line String.
def htmltotext(str)
  str = str.gsub(/<style[^>]*>.*?<\/style>/im, " ")
  str = str.gsub(/<script[^>]*>.*?<\/script>/im, " ")
  str = str.gsub(/<\/?(p|br|div|h1|h2|h3|h4|h5|h6|ul|ol|dl|li|dd|dt|td|th|pre)[^>]*>/im, " ")
  str = str.gsub(/<[^>]*>/, "")
  str = str.gsub(/&(nbsp|#160|#0160|#xa0|#x00a0);/i, " ")
  # gsub makes a single pass, so decoded text is never decoded a second time
  str = str.gsub(/&#?[A-Za-z0-9]+;/) do |pat|
    case pat
    when "&lt;" then "<"
    when "&gt;" then ">"
    when "&amp;" then "&"
    when "&quot;" then '"'
    when "&apos;" then "'"
    when "&nbsp;" then " "
    when /\A&#x([0-9a-f]+);\z/i
      entity_codepoint_to_utf8(Regexp.last_match(1).hex)
    when /\A&#([0-9]+);\z/
      entity_codepoint_to_utf8(Regexp.last_match(1).to_i)
    else
      " "
    end
  end
  str = str.gsub(/[\x00-\x20]/, " ")
  str = str.gsub("\xe3\x80\x80", " ")
  str = str.gsub(/ +/, " ")
  # only plain spaces can remain at this point, so trimming spaces is enough
  str = str.sub(/\A +/, "")
  str = str.sub(/ +\z/, "")
  return str
end
240 |
|
|
|
241 |
|
|
|
242 |
|
|
# check whether the url passes the filters or not
#
# url     - String to test.
# filters - Array of [allow, regex] pairs, where `allow` is true for an
#           "allow" filter and false for a "deny" filter.
#
# Only strings that begin with "http://" (case-insensitive) can pass; the
# check is anchored to the start of the whole string, not to the start of any
# line inside it.  With no filters every such URL passes.  Otherwise the last
# filter whose regex matches decides; when none matches, the result is the
# opposite of the first filter's kind (so a list starting with an "allow"
# filter rejects by default and one starting with a "deny" filter accepts).
#
# Returns true when the URL may be fetched, false otherwise.
def checkurl(url, filters)
  return false if url !~ /\Ahttp:\/\//i
  return true if filters.empty?
  ok = !filters[0][0]
  filters.each do |allow, regex|
    ok = allow if url =~ regex
  end
  return ok
end
252 |
|
|
|
253 |
|
|
|
254 |
|
|
# execute the actual operations
# flush every write to the standard output immediately
STDOUT.sync = true
# command name without its directory part, shown in the usage text
$progname = $0.gsub(/.*\//, "")
srand
# the return value of main becomes the exit status of the process
exit(main)