/[tokyocabinet-toys]/wgettsv
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /wgettsv

Parent Directory | Revision Log


Revision 1 - (hide annotations)
Tue Jul 21 12:32:05 2009 UTC (14 years, 8 months ago) by dpavlin
File size: 8526 byte(s)
import upstream source from http://alpha.mixi.co.jp/blog/?p=1205

1 dpavlin 1 #! /usr/bin/ruby -w
2    
3     #-------------------------------------------------------------------------------------------------
4     # wgettsv
5     # Collect WWW resources and generate TSV data
6     # Copyright (C) 2006-2009 Mikio Hirabayashi
7     # This file is part of Tokyo Cabinet.
8     # Tokyo Cabinet is free software; you can redistribute it and/or modify it under the terms of
9     # the GNU Lesser General Public License as published by the Free Software Foundation; either
10     # version 2.1 of the License or any later version. Tokyo Cabinet is distributed in the hope
11     # that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
12     # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
13     # License for more details.
14     # You should have received a copy of the GNU Lesser General Public License along with Tokyo
15     # Cabinet; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
16     # Boston, MA 02111-1307 USA.
17     #-------------------------------------------------------------------------------------------------
18    
19    
20     require 'open-uri'
21     require 'iconv'
22     require 'kconv'
23     require 'date'
24     require 'time'
25     require 'cgi'
26    
27    
# Parse the command line, assemble the URL filters and start the crawl.
#
# Options are recognized only before the first seed URL:
#   -allow regex : append an allow filter
#   -deny regex  : append a deny filter
#   -max num     : page limit handed to proc (default 1 << 30)
#   -lim num     : size limit handed to proc (default 1 << 20)
#   -wait num    : delay handed to proc (default 0)
#   -ndf         : do not append the built-in deny filters
# Every other argument must start with "http://" and becomes a seed URL.
# Anything else prints the usage text via usage.
#
# Returns 0 when proc returns a true value, otherwise 1.
def main
  seeds = []
  hist = {}
  filters = []
  max = 1 << 30
  lim = 1 << 20
  wait = 0
  ndf = false
  i = 0
  while i < ARGV.length
    arg = ARGV[i]
    if seeds.empty? && arg =~ /^-/
      case arg
      when '-allow', '-deny'
        usage if (i += 1) >= ARGV.length
        begin
          filters.push([arg == '-allow', Regexp::new(ARGV[i])])
        rescue RegexpError => e
          # Regexp::new raises instead of returning nil, so report the bad pattern
          STDERR.printf("%s: invalid regular expression: %s\n", $progname, e.message)
          usage
        end
      when '-max'
        usage if (i += 1) >= ARGV.length
        max = ARGV[i].to_i
      when '-lim'
        usage if (i += 1) >= ARGV.length
        lim = ARGV[i].to_i
      when '-wait'
        usage if (i += 1) >= ARGV.length
        wait = ARGV[i].to_f
      when '-ndf'
        ndf = true
      else
        usage
      end
    elsif arg =~ /^http:\/\//i
      seeds.push(arg)
      hist[arg] = true
    else
      usage
    end
    i += 1
  end
  usage if seeds.empty?
  unless ndf
    # built-in deny filters, appended after the user supplied ones
    [
      /\.(txt|text|asc|c|cc|cxx|cpp|h|hxx|hpp|in)$/i,
      /\.(css|js|csv|tsv|log|md5|crc|conf|ini|inf|lnk|sys|tmp|bak)$/i,
      /\.(xml|xsl|xslt|rdf|rss|dtd|sgml|sgm)$/i,
      /\.(pgp|sig|cer|csr|pem|key|b64|uu|uue|[0-9])$/i,
      /\.(rtf|pdf|ps|eps|ai|doc|xls|ppt|sxw|sxc|sxi|xdw|jtd|oas|swf)$/i,
      /\.(zip|tar|tgz|gz|bz2|tbz2|z|lha|lzh)$/i,
      /\.(7z|lzo|lzma|cpio|shar|cab|rar|sit|ace|hqx)$/i,
      /\.(bin|o|a|so|exe|dll|lib|obj|ocx|class|jar|war)$/i,
      /\.(rpm|deb|qdb|dbx|dbf|dat|msi|bat|com|iso)$/i,
      /\.(png|gif|jpg|jpeg|tif|tiff|bmp|ico|pbm|pgm|ppm|xbm|xpm|dvi)$/i,
      /\.(au|snd|mid|midi|kar|smf|mp2|mp3|m3u|wav|wma|wmp|asx|at3|aif)$/i,
      /\.(mpg|mpeg|qt|mov|avi|wmv|wvx|asf|ram|rm)$/i,
      /\.(tch|tdb|tdf|tct)$/i,
      /\.idx\./i,
      # LISENCE is the historical misspelling; it is kept so existing matches still apply
      /(core|casket|Makefile|README|NEWS|COPYING|LICENSE|LISENCE)($|\/)/i
    ].each do |pattern|
      filters.push([false, pattern])
    end
  end
  return proc(seeds, hist, filters, max, lim, wait) ? 0 : 1
end
92    
93    
# Print the command usage on STDERR and terminate the process with status 1.
# The program name is taken from the global $progname.
def usage
  STDERR.printf("%s: collect WWW resources and generate TSV data\n", $progname)
  STDERR.printf("\n")
  STDERR.printf("usage:\n")
  # -ndf is accepted by main, so it is listed with the other options
  STDERR.printf(" %s [-allow regex] [-deny regex] [-max num] [-lim num] [-wait num] [-ndf]" +
                " url ...\n", $progname)
  STDERR.printf("\n")
  exit(1)
end
104    
105    
# Fetch each queued URL, print one TSV record per accepted page on standard output
# and queue the links found in it.
#
# seeds   - Array of URL strings still to fetch; consumed from the front, new links are appended.
# hist    - Hash whose keys are the URLs already queued or visited.
# filters - filter list handed unchanged to checkurl for every discovered link.
# max     - the loop stops once this many pages have been fetched without error.
# lim     - pages whose reported size exceeds this value are rejected.
# wait    - seconds to sleep after every attempt; no sleep when not positive.
#
# A record has the form "<n>\turl\t<url>[\tsize\t..][\tmtime\t..][\ttitle\t..][\tbody\t..]\n".
# Any error while handling a page is reported on STDERR and the page is skipped.
# Always returns 0.
def proc(seeds, hist, filters, max, lim, wait)
  cnt = 0
  while (url = seeds.shift) && cnt < max
    STDERR.printf("%d: getting: %s\n", cnt + 1, url)
    begin
      opts = {}
      OpenURI.open_uri(url, 0, 0, opts) do |sio|
        baseuri = sio.base_uri
        if baseuri && baseuri.to_s != url
          # the resource reports a different base URI: continue under that URL
          url = baseuri.to_s
          hist[url] = true
        end
        size = sio.size
        raise "invalid size" if size > lim || size < 3
        type = sio.content_type
        type = "text/plain" if !type
        str = sio.read
        head = str[0, 2048]
        charset = nil
        # compare byte values; indexing a String yields an Integer only on Ruby 1.8
        bom = head.unpack("C2")
        if bom == [0xfe, 0xff] || bom == [0xff, 0xfe]
          str = Kconv::kconv(str, Kconv::UTF8, Kconv::UTF16)
          charset = "UTF-8"
          # sniff the converted text, not the raw UTF-16 bytes
          head = str[0, 2048]
        elsif str.include?("\0")
          raise "binary data"
        end
        raise "not HTML" if type != "text/html" && head !~ /<html/i && head !~ /<body/i
        if !charset && head =~ /<\?xml.*encoding=["']?([-_a-zA-Z0-9]+)["']/im
          charset = $1
        end
        if !charset && head =~ /<meta.*content-type.*charset=([-_a-zA-Z0-9]+)/im
          charset = $1
        end
        charset = sio.charset if !charset || charset.length < 1
        if charset && charset.length > 0
          if charset !~ /^UTF-?8$/i
            begin
              nstr = Iconv.conv("UTF-8", charset, str)
              str = nstr if nstr && nstr.length > 0
            rescue
              # conversion failed: fall back to Kconv's guess
              str = str.toutf8
            end
          end
        else
          str = str.toutf8
        end
        body = str.gsub(/.*<body[^>]*>/im, "")
        body = body.gsub(/<\/body>.*/im, "")
        body = htmltotext(body)
        title = ""
        # the leading greedy .* selects the last <title> element, as before
        if str =~ /.*<title[^>]*>([^<]*)<\/title>/im
          title = htmltotext($1)
        end
        title = title[0, 128] if title.length > 128
        mtime = sio.last_modified
        mtime = mtime ? mtime.to_i : 0
        printf("%d", cnt + 1)
        printf("\turl\t%s", url)
        printf("\tsize\t%s", size) if size > 0
        printf("\tmtime\t%s", mtime) if mtime > 0
        printf("\ttitle\t%s", title) if title.length > 0
        printf("\tbody\t%s", body) if body.length > 0
        printf("\n")
        str.scan(/<a[^>]*>/im) do |tag|
          next unless tag =~ /href=["']?([^"'>]+)/i
          target = $1
          begin
            href = URI::join(url, target).to_s
          rescue StandardError
            # a malformed link must not discard the page that was already printed
            next
          end
          href = href.gsub(/#.*/, "")
          if !hist[href] && checkurl(href, filters)
            seeds.push(href)
            hist[href] = true
          end
        end
      end
      cnt += 1
    rescue
      STDERR.printf("%d: failed: %s: %s\n", cnt + 1, url, $!)
    end
    sleep(wait) if wait > 0
  end
  return 0
end
193    
194    
# Encode one Unicode code point as a UTF-8 string.
# Returns "" for surrogates and for values above U+10FFFF, which have no valid encoding.
def codepoint_to_utf8(code)
  return "" if code > 0x10ffff || (code >= 0xd800 && code <= 0xdfff)
  [code].pack("U")
end


# Convert an HTML fragment to plain text.
#
# Steps: drop <style> and <script> elements, turn block-level tags into spaces,
# drop all remaining tags, decode character references, then replace control
# characters and ideographic spaces with a space, collapse runs of spaces and
# trim both ends.
#
# str - String holding the HTML (the ideographic-space pattern is UTF-8).
# Returns a new String; the argument is not modified.
def htmltotext(str)
  str = str.gsub(/<style[^>]*>.*?<\/style>/im, " ")
  str = str.gsub(/<script[^>]*>.*?<\/script>/im, " ")
  str = str.gsub(/<\/?(p|br|div|h1|h2|h3|h4|h5|h6|ul|ol|dl|li|dd|dt|td|th|pre)[^>]*>/im, " ")
  str = str.gsub(/<[^>]*>/, "")
  str = str.gsub(/&(nbsp|#160|#0160|#xa0|#x00a0);/i, " ")
  named = {
    "&lt;" => "<",
    "&gt;" => ">",
    "&amp;" => "&",
    "&quot;" => '"',
    "&apos;" => "'",
    "&nbsp;" => " "
  }
  # a single pass, so text produced by one reference is never decoded a second time
  str = str.gsub(/&#?[A-Za-z0-9]+;/) do |pat|
    if named.key?(pat)
      named[pat]
    elsif pat =~ /\A&#x([0-9a-f]+);\z/i
      codepoint_to_utf8($1.hex)
    elsif pat =~ /\A&#([0-9]+);\z/
      codepoint_to_utf8($1.to_i)
    else
      # unknown named reference
      " "
    end
  end
  str = str.gsub(/[\x00-\x20]/, " ")
  # U+3000 IDEOGRAPHIC SPACE in UTF-8
  str = str.gsub(/\xe3\x80\x80/, " ")
  str = str.gsub(/ +/, " ")
  str = str.gsub(/^ */, "")
  str = str.gsub(/ *$/, "")
  return str
end
240    
241    
# Check whether a URL passes the filters.
#
# url     - String to test.
# filters - Array of [flag, Regexp] pairs; flag is true for allow, false for deny.
#
# Returns false unless url begins with "http://" (letter case ignored).
# Returns true when filters is empty. Otherwise the flag of the last filter whose
# pattern matches decides; when none matches the result is the negation of the
# first filter's flag.
def checkurl(url, filters)
  # \A anchors at the start of the string; ^ would also accept "http://" on a later line
  return false if url !~ /\Ahttp:\/\//i
  return true if filters.empty?
  verdict = !filters[0][0]
  filters.each do |flag, pattern|
    verdict = flag if url =~ pattern
  end
  verdict
end
252    
253    
# Script entry point: write records unbuffered, derive the program name
# (the part of $0 after the last "/") for messages, then run main and use
# its return value as the process exit status.
STDOUT.sync = true
$progname = $0.sub(/.*\//, "")
srand
exit(main)

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26