Line # Revision Author
1 1 dpavlin #! /usr/bin/ruby -w
2
3 #-------------------------------------------------------------------------------------------------
4 # wgettsv
5 # Collect WWW resources and generate TSV data
6 # Copyright (C) 2006-2009 Mikio Hirabayashi
7 # This file is part of Tokyo Cabinet.
8 # Tokyo Cabinet is free software; you can redistribute it and/or modify it under the terms of
9 # the GNU Lesser General Public License as published by the Free Software Foundation; either
10 # version 2.1 of the License or any later version. Tokyo Cabinet is distributed in the hope
11 # that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
13 # License for more details.
14 # You should have received a copy of the GNU Lesser General Public License along with Tokyo
15 # Cabinet; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
16 # Boston, MA 02111-1307 USA.
17 #-------------------------------------------------------------------------------------------------
18
19
20 require 'open-uri'
21 require 'iconv'
22 require 'kconv'
23 require 'date'
24 require 'time'
25 require 'cgi'
26
27
28 # main routine
# Parse the command line, build the URL filter list and start the crawl.
#
# Recognized options (only before the first URL):
#   -allow regex  append an "allow" filter
#   -deny regex   append a "deny" filter
#   -max num      maximum number of pages to output
#   -lim num      maximum accepted size of one page
#   -wait num     seconds to sleep between fetches
#   -ndf          do not append the built-in deny filters
# Every remaining argument must be an "http://" URL and becomes a crawl seed.
#
# Any malformed argument prints the usage text and exits through `usage`.
# Returns the process exit status (0 when `proc` returns a truthy value).
def main
  seeds = []
  hist = {}
  filters = []
  max = 1 << 30
  lim = 1 << 20
  wait = 0
  ndf = false
  args = ARGV.dup
  until args.empty?
    arg = args.shift
    if seeds.empty? && arg =~ /\A-/
      case arg
      when '-allow', '-deny'
        usage if args.empty?
        begin
          filters.push([arg == '-allow', Regexp.new(args.shift)])
        rescue RegexpError
          # A bad pattern is a user error, not a crash: report it and show usage.
          STDERR.printf("%s: invalid regular expression: %s\n", $progname, $!)
          usage
        end
      when '-max'
        usage if args.empty?
        max = args.shift.to_i
      when '-lim'
        usage if args.empty?
        lim = args.shift.to_i
      when '-wait'
        usage if args.empty?
        wait = args.shift.to_f
      when '-ndf'
        ndf = true
      else
        usage
      end
    elsif arg =~ /\Ahttp:\/\//i
      seeds.push(arg)
      hist[arg] = true
    else
      usage
    end
  end
  usage if seeds.empty?
  unless ndf
    # Built-in deny list: file types and well-known file names that are not
    # worth fetching as HTML.  User filters come first, so these are checked
    # last and therefore take precedence in checkurl (last match wins).
    [
      /\.(txt|text|asc|c|cc|cxx|cpp|h|hxx|hpp|in)$/i,
      /\.(css|js|csv|tsv|log|md5|crc|conf|ini|inf|lnk|sys|tmp|bak)$/i,
      /\.(xml|xsl|xslt|rdf|rss|dtd|sgml|sgm)$/i,
      /\.(pgp|sig|cer|csr|pem|key|b64|uu|uue|[0-9])$/i,
      /\.(rtf|pdf|ps|eps|ai|doc|xls|ppt|sxw|sxc|sxi|xdw|jtd|oas|swf)$/i,
      /\.(zip|tar|tgz|gz|bz2|tbz2|z|lha|lzh)$/i,
      /\.(7z|lzo|lzma|cpio|shar|cab|rar|sit|ace|hqx)$/i,
      /\.(bin|o|a|so|exe|dll|lib|obj|ocx|class|jar|war)$/i,
      /\.(rpm|deb|qdb|dbx|dbf|dat|msi|bat|com|iso)$/i,
      /\.(png|gif|jpg|jpeg|tif|tiff|bmp|ico|pbm|pgm|ppm|xbm|xpm|dvi)$/i,
      /\.(au|snd|mid|midi|kar|smf|mp2|mp3|m3u|wav|wma|wmp|asx|at3|aif)$/i,
      /\.(mpg|mpeg|qt|mov|avi|wmv|wvx|asf|ram|rm)$/i,
      /\.(tch|tdb|tdf|tct)$/i,
      /\.idx\./i,
      /(core|casket|Makefile|README|NEWS|COPYING|LICENSE|LICENCE)($|\/)/i
    ].each do |regex|
      filters.push([false, regex])
    end
  end
  return proc(seeds, hist, filters, max, lim, wait) ? 0 : 1
end
92
93
94 # print the command usage and exit
# Print the command-line synopsis to standard error and terminate the
# process with exit status 1.  Never returns.
def usage
  STDERR.printf("%s: collect WWW resources and generate TSV data\n", $progname)
  STDERR.printf("\n")
  STDERR.printf("usage:\n")
  # -ndf is accepted by main (it disables the built-in deny filters), so list it.
  STDERR.printf(" %s [-allow regex] [-deny regex] [-max num] [-lim num] [-wait num] [-ndf]" +
                " url ...\n", $progname)
  STDERR.printf("\n")
  exit(1)
end
104
105
106 # fetch each content
# Fetch pages in FIFO order starting from the seed URLs and print one TSV
# record per successfully processed page to standard output.
#
# seeds   - Array of URL strings still to visit; URLs are taken from the
#           front and newly discovered links are appended.
# hist    - Hash (URL string => true) of every URL already queued or
#           visited; updated in place so no URL is queued twice.
# filters - Array of [allow, regex] pairs handed to checkurl for each link.
# max     - stop after this many pages have been output.
# lim     - pages whose reported size exceeds this are skipped.
# wait    - seconds to sleep after each attempt (no sleep unless positive).
#
# A page that cannot be fetched or is rejected is reported on standard
# error and does not consume a record number.  Returns 0.
def proc(seeds, hist, filters, max, lim, wait)
  cnt = 0
  while (url = seeds.shift) && cnt < max
    STDERR.printf("%d: getting: %s\n", cnt + 1, url)
    begin
      opts = {}
      OpenURI.open_uri(url, 0, 0, opts) do |sio|
        # The stream may report a different base URI than the one requested
        # (e.g. after a redirect); adopt it and remember it as visited.
        baseuri = sio.base_uri
        if baseuri && baseuri.to_s != url
          url = baseuri.to_s
          hist[url] = true
        end
        size = sio.size
        raise "invalid size" if size > lim || size < 3
        type = sio.content_type || "text/plain"
        str = sio.read
        head = str[0, 2048]
        charset = nil
        # Compare the leading bytes numerically so the check behaves the same
        # regardless of how String#[] indexes on the running Ruby version.
        bom = head.unpack("C2")
        if bom == [0xfe, 0xff] || bom == [0xff, 0xfe]
          str = Kconv::kconv(str, Kconv::UTF8, Kconv::UTF16)
          charset = "UTF-8"
        elsif str.include?("\0")
          raise "binary data"
        end
        raise "not HTML" if type != "text/html" && head !~ /<html/i && head !~ /<body/i
        # Charset precedence: BOM, XML declaration, <meta> tag, then whatever
        # the stream itself reports.
        if !charset && head =~ /<\?xml.*encoding=["']?([-_a-zA-Z0-9]+)["']/im
          charset = $1
        end
        if !charset && head =~ /<meta.*content-type.*charset=([-_a-zA-Z0-9]+)/im
          charset = $1
        end
        charset = sio.charset if !charset || charset.length < 1
        if charset && charset.length > 0
          if charset !~ /^UTF-?8$/i
            begin
              nstr = Iconv.conv("UTF-8", charset, str)
              str = nstr if nstr && nstr.length > 0
            rescue
              # Declared charset was unusable; fall back to Kconv's guess.
              str = str.toutf8
            end
          end
        else
          str = str.toutf8
        end
        body = str.gsub(/.*<body[^>]*>/im, "")
        body = body.gsub(/<\/body>.*/im, "")
        body = htmltotext(body)
        # Use the first <title> element: later ones may belong to embedded
        # content rather than to the document itself.
        title = str =~ /<title[^>]*>([^<]*)<\/title>/im ? htmltotext($1) : ""
        title = title[0, 128] if title.length > 128
        mtime = sio.last_modified
        mtime = mtime ? mtime.to_i : 0
        printf("%d", cnt + 1)
        printf("\turl\t%s", url)
        printf("\tsize\t%s", size) if size > 0
        printf("\tmtime\t%s", mtime) if mtime > 0
        printf("\ttitle\t%s", title) if title.length > 0
        printf("\tbody\t%s", body) if body.length > 0
        printf("\n")
        str.scan(/<a[^>]*>/im) do |tag|
          next unless tag =~ /href=["']?([^"'>]+)["']?/i
          begin
            href = URI::join(url, $1).to_s
          rescue
            # The record for this page is already written, so a single
            # unparsable link must not abort the page: that would skip the
            # remaining links and reuse this record number for the next page.
            next
          end
          href = href.gsub(/#.*/, "")
          if !hist[href] && checkurl(href, filters)
            seeds.push(href)
            hist[href] = true
          end
        end
      end
      cnt += 1
    rescue
      STDERR.printf("%d: failed: %s: %s\n", cnt + 1, url, $!)
    end
    sleep(wait) if wait > 0
  end
  return 0
end
193
194
195 # convert HTML to plain-text
# Encode a numeric character reference's code point as UTF-8.
# Returns "" for values that are not Unicode scalar values (surrogates or
# anything above U+10FFFF).
def entity_codepoint_to_utf8(code)
  return "" if code > 0x10FFFF || (code >= 0xD800 && code <= 0xDFFF)
  [code].pack("U")
end


# Convert an HTML fragment to single-line plain text.
#
# Removes <style>/<script> elements with their content, turns block-level
# tags into spaces, strips every other tag, decodes character references
# and collapses all whitespace (including U+3000 ideographic space) into
# single spaces with no leading or trailing space.
#
# str - HTML text, expected to be UTF-8.
#
# Returns a new String; the argument is not modified.
def htmltotext(str)
  str = str.gsub(/<style[^>]*>.*?<\/style>/im, " ")
  str = str.gsub(/<script[^>]*>.*?<\/script>/im, " ")
  str = str.gsub(/<\/?(p|br|div|h1|h2|h3|h4|h5|h6|ul|ol|dl|li|dd|dt|td|th|pre)[^>]*>/im, " ")
  str = str.gsub(/<[^>]*>/, "")
  str = str.gsub(/&(nbsp|#160|#0160|#xa0|#x00a0);/i, " ")
  # Single pass, so "&amp;lt;" correctly becomes the literal text "&lt;".
  str = str.gsub(/&#?[A-Za-z0-9]+;/) do |pat|
    case pat
    when "&amp;" then "&"
    when "&lt;" then "<"
    when "&gt;" then ">"
    when "&quot;" then '"'
    when "&apos;" then "'"
    when "&nbsp;" then " "
    when /\A&#x([0-9a-f]+);\z/i
      # pack("U") covers the full Unicode range, unlike a 16-bit pack("n").
      entity_codepoint_to_utf8($1.hex)
    when /\A&#([0-9]+);\z/
      entity_codepoint_to_utf8($1.to_i)
    else
      # Unknown named or malformed reference: keep the word boundary only.
      " "
    end
  end
  str = str.gsub(/[\x00-\x20]/, " ")
  str = str.gsub(/\xe3\x80\x80/, " ")
  str = str.gsub(/ +/, " ")
  str = str.gsub(/^ */, "")
  str = str.gsub(/ *$/, "")
  return str
end
240
241
242 # check whether the url passes the filters or not
# Decide whether a URL may be crawled.
#
# url     - candidate URL string.
# filters - Array of [allow, regex] pairs; `allow` is true for an allow
#           rule and false for a deny rule.
#
# Only "http://" URLs are ever accepted.  With no filters every such URL
# passes.  Otherwise the default verdict is the opposite of the first
# filter's kind (a leading allow rule means "deny unless matched" and vice
# versa), and each matching filter overrides the verdict, so the last
# matching filter wins.
#
# Returns true if the URL passes, false otherwise.
def checkurl(url, filters)
  # \A rather than ^: the scheme must start the string itself, not merely
  # some later line of a multi-line value.
  return false if url !~ /\Ahttp:\/\//i
  return true if filters.empty?
  ok = !filters[0][0]
  filters.each do |allow, regex|
    ok = allow if url =~ regex
  end
  return ok
end
252
253
254 # execute the actual operations
# Script entry point: flush records as soon as they are printed, derive the
# program name (the part of $0 after the last slash) for messages, seed the
# random number generator, then run main and exit with its status.
STDOUT.sync = true
$progname = $0.gsub(/.*\//, "")
srand
exit(main)