viewvc.cgi/tokyocabinet-toys/wgettsv

#! /usr/bin/ruby -w

#-------------------------------------------------------------------------------------------------
# wgettsv
# Collect WWW resources and generate TSV data
#                                                       Copyright (C) 2006-2009 Mikio Hirabayashi
# This file is part of Tokyo Cabinet.
# Tokyo Cabinet is free software; you can redistribute it and/or modify it under the terms of
# the GNU Lesser General Public License as published by the Free Software Foundation; either
# version 2.1 of the License or any later version.  Tokyo Cabinet is distributed in the hope
# that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
# License for more details.
# You should have received a copy of the GNU Lesser General Public License along with Tokyo
# Cabinet; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
# Boston, MA 02111-1307 USA.
#-------------------------------------------------------------------------------------------------


require 'open-uri'
require 'iconv'
require 'kconv'
require 'date'
require 'time'
require 'cgi'


# main routine
#
# Parses the command line, builds the URL filter list and starts the crawl.
#
# Options are only recognized before the first seed URL:
#   -allow regex : append a filter that accepts URLs matching the regex
#   -deny regex  : append a filter that rejects URLs matching the regex
#   -max num     : upper bound on the number of pages handed to proc
#   -lim num     : upper bound on the size of one page, handed to proc
#   -wait num    : pause between requests (fractional values are accepted)
#   -ndf         : do not append the built-in deny filters
# Every remaining argument must be an "http://" URL and becomes a seed.
# Any malformed command line ends in usage, which terminates the process.
#
# Returns the exit status of the command.
def main
  seeds = []
  hist = {}
  filters = []
  max = 1 << 30
  lim = 1 << 20
  wait = 0
  ndf = false
  args = ARGV.dup
  until args.empty?
    arg = args.shift
    if seeds.empty? && arg =~ /^-/
      case arg
      when '-allow', '-deny'
        usage if args.empty?
        pattern = args.shift
        begin
          regex = Regexp::new(pattern)
        rescue RegexpError
          # report a broken pattern as a usage error instead of dying with a backtrace
          STDERR.printf("%s: invalid regular expression: %s\n", $progname, pattern)
          usage
        end
        filters.push([arg == '-allow', regex])
      when '-max'
        usage if args.empty?
        max = args.shift.to_i
      when '-lim'
        usage if args.empty?
        lim = args.shift.to_i
      when '-wait'
        usage if args.empty?
        wait = args.shift.to_f
      when '-ndf'
        ndf = true
      else
        usage
      end
    elsif arg =~ /^http:\/\//i
      seeds.push(arg)
      # seeds are marked as seen so that links back to them are not queued again
      hist[arg] = true
    else
      usage
    end
  end
  usage if seeds.empty?
  if !ndf
    # Built-in deny filters, appended after the user filters.  Each one is anchored
    # on a real file extension; a URL that merely ends with "." is not rejected.
    [
      /\.(txt|text|asc|c|cc|cxx|cpp|h|hxx|hpp|in)$/i,
      /\.(css|js|csv|tsv|log|md5|crc|conf|ini|inf|lnk|sys|tmp|bak)$/i,
      /\.(xml|xsl|xslt|rdf|rss|dtd|sgml|sgm)$/i,
      /\.(pgp|sig|cer|csr|pem|key|b64|uu|uue|[0-9])$/i,
      /\.(rtf|pdf|ps|eps|ai|doc|xls|ppt|sxw|sxc|sxi|xdw|jtd|oas|swf)$/i,
      /\.(zip|tar|tgz|gz|bz2|tbz2|z|lha|lzh)$/i,
      /\.(7z|lzo|lzma|cpio|shar|cab|rar|sit|ace|hqx)$/i,
      /\.(bin|o|a|so|exe|dll|lib|obj|ocx|class|jar|war)$/i,
      /\.(rpm|deb|qdb|dbx|dbf|dat|msi|bat|com|iso)$/i,
      /\.(png|gif|jpg|jpeg|tif|tiff|bmp|ico|pbm|pgm|ppm|xbm|xpm|dvi)$/i,
      /\.(au|snd|mid|midi|kar|smf|mp2|mp3|m3u|wav|wma|wmp|asx|at3|aif)$/i,
      /\.(mpg|mpeg|qt|mov|avi|wmv|wvx|asf|ram|rm)$/i,
      /\.(tch|tdb|tdf|tct)$/i,
      /\.idx\./i,
      /(core|casket|Makefile|README|NEWS|COPYING|LICENSE|LICENCE)($|\/)/i
    ].each do |regex|
      filters.push([false, regex])
    end
  end
  # proc returns 0, which is a true value in Ruby, so this yields 0
  return proc(seeds, hist, filters, max, lim, wait) ? 0 : 1
end


# print the command usage and exit
#
# Writes a short description and the option synopsis to the standard error
# output, using the global $progname as the command name, and then terminates
# the process with exit status 1.  This method never returns.
def usage
  STDERR.printf("%s: collect WWW resources and generate TSV data\n", $progname)
  STDERR.printf("\n")
  STDERR.printf("usage:\n")
  # the synopsis lists every option that main accepts, including -ndf
  STDERR.printf("  %s [-allow regex] [-deny regex] [-max num] [-lim num] [-wait num] [-ndf]" +
                " url ...\n", $progname)
  STDERR.printf("\n")
  exit(1)
end


# fetch each content
#
# Takes URLs from the front of `seeds` one by one, downloads each page and
# prints one TSV record per accepted page to the standard output: the record
# number followed by labelled fields ("url", "size", "mtime", "title", "body").
# Links found in a page are resolved against the page URL and appended to
# `seeds` when they are new and accepted by checkurl.
#
#   seeds   : array of URLs still to be fetched; consumed and extended in place
#   hist    : hash whose keys are the URLs already queued or visited
#   filters : array of [allow, regex] pairs handed to checkurl
#   max     : the loop ends once this many pages have been recorded
#   lim     : pages larger than this size are rejected
#   wait    : seconds to sleep after each request when greater than zero
#
# A page that cannot be fetched or is rejected is reported on the standard
# error output and does not consume a record number.  Always returns 0.
def proc(seeds, hist, filters, max, lim, wait)
  cnt = 0
  while (url = seeds.shift) && cnt < max
    STDERR.printf("%d: getting: %s\n", cnt + 1, url)
    begin
      opts = {}
      OpenURI.open_uri(url, 0, 0, opts) do |sio|
        # after a redirect, continue with the final URL and remember it as visited
        baseuri = sio.base_uri
        if baseuri && baseuri.to_s != url
          url = baseuri.to_s
          hist[url] = true
        end
        size = sio.size
        raise "invalid size" if size > lim || size < 3
        type = sio.content_type
        type = "text/plain" if !type
        str = sio.read
        head = str[0,2048]
        charset = nil
        # Compare the first two bytes numerically; indexing a string yields an
        # integer only on old interpreters, unpack works the same everywhere.
        bom = head.unpack("C2")
        if bom == [0xfe, 0xff] || bom == [0xff, 0xfe]
          str = Kconv::kconv(str, Kconv::UTF8, Kconv::UTF16)
          charset = "UTF-8"
        elsif str.include?("\0")
          raise "binary data"
        end
        raise "not HTML" if type != "text/html" && head !~ /<html/i && head !~ /<body/i
        # charset priority: XML declaration, then META tag, then the HTTP header
        if !charset && head =~ /<\?xml.*encoding=("|')?[-_a-zA-Z0-9]+("|')/im
          charset = head.gsub(/.*<\?xml.*encoding=["']?([-_a-zA-Z0-9]+)["']?.*/im, '\1')
        end
        if !charset && head =~ /<meta.*content-type.*charset=[-_a-zA-Z0-9]+/im
          charset = head.gsub(/.*<meta.*content-type.*charset=([-_a-zA-Z0-9]+).*/im, '\1')
        end
        charset = sio.charset if !charset || charset.length < 1
        if charset && charset.length > 0
          if charset !~ /^UTF-?8$/i
            begin
              nstr = Iconv.conv("UTF-8", charset, str)
              str = nstr if nstr && nstr.length > 0
            rescue
              # unknown or wrong charset name: fall back to automatic detection
              str = str.toutf8
            end
          end
        else
          str = str.toutf8
        end
        body = str.gsub(/.*<body[^>]*>/im, "")
        body = body.gsub(/<\/body>.*/im, "")
        body = htmltotext(body)
        title = nil
        if str =~ /<title[^>]*>[^<]*<\/title>/im
          title = str.gsub(/.*<title[^>]*>([^<]*)<\/title>.*/im, '\1')
          title = htmltotext(title)
        end
        title = "" if !title
        title = title[0,128] if title.length > 128
        mtime = sio.last_modified
        if mtime
          mtime = Time::parse(mtime.to_s).to_i
        else
          mtime = 0
        end
        printf("%d", cnt + 1)
        printf("\turl\t%s", url)
        printf("\tsize\t%s", size) if size > 0
        printf("\tmtime\t%s", mtime) if mtime > 0
        printf("\ttitle\t%s", title) if title.length > 0
        printf("\tbody\t%s", body) if body.length > 0
        printf("\n")
        str.scan(/<a[^>]*>/im) do |tag|
          # scanning the attribute works for tags that span several lines;
          # when a tag carries several candidates, the last one is used
          hrefs = tag.scan(/href=["']?([^"'>]+)["']?/)
          next if hrefs.empty?
          begin
            href = URI::join(url, hrefs.last[0]).to_s
          rescue
            # The record of this page is already printed; a single malformed
            # link must not turn the page into a failure or drop its other links.
            next
          end
          href = href.gsub(/#.*/, "")
          if !hist[href] && checkurl(href, filters)
            seeds.push(href)
            hist[href] = true
          end
        end
      end
      cnt += 1
    rescue
      STDERR.printf("%d: failed: %s: %s\n", cnt + 1, url, $!)
    end
    sleep(wait) if wait > 0
  end
  return 0
end


# convert HTML to plain-text
#
# Removes STYLE and SCRIPT elements, replaces block-level tags by a space,
# drops every other tag, decodes character references and finally collapses
# all white space (including the ideographic space) into single spaces.
#
# Character references are handled as follows:
#   - &amp; &lt; &gt; &quot; &apos; &nbsp; are decoded to their character
#   - numeric references (&#NNN; and &#xHHH;) are decoded to UTF-8 for every
#     valid Unicode code point, including those above U+FFFF
#   - numeric references to surrogates or values above U+10FFFF are removed
#   - any other reference is replaced by a space
#
# `str` is the HTML fragment; the return value is a new string and the
# argument is not modified.
def htmltotext(str)
  str = str.gsub(/<style[^>]*>.*?<\/style>/im, " ")
  str = str.gsub(/<script[^>]*>.*?<\/script>/im, " ")
  str = str.gsub(/<\/?(p|br|div|h1|h2|h3|h4|h5|h6|ul|ol|dl|li|dd|dt|td|th|pre)[^>]*>/im, " ")
  str = str.gsub(/<[^>]*>/, "")
  str = str.gsub(/&(nbsp|#160|#0160|#xa0|#x00a0);/i, " ")
  named = {
    "&amp;" => "&",
    "&lt;" => "<",
    "&gt;" => ">",
    "&quot;" => '"',
    "&apos;" => "'",
    "&nbsp;" => " "
  }
  # All references are decoded in one pass, so "&amp;lt;" yields "&lt;" and
  # is not decoded a second time.
  str = str.gsub(/&#?[A-Za-z0-9]+;/) do |ent|
    if named.has_key?(ent)
      named[ent]
    else
      code = nil
      if ent =~ /\A&#x([0-9a-f]+);\z/i
        code = $1.hex
      elsif ent =~ /\A&#([0-9]+);\z/
        code = $1.to_i
      end
      if !code
        " "
      elsif code > 0x10ffff || (code >= 0xd800 && code <= 0xdfff)
        # not a character: encoding it would produce an invalid UTF-8 sequence
        ""
      else
        [ code ].pack("U")
      end
    end
  end
  str = str.gsub(/[\x00-\x20]/, " ")
  str = str.gsub(/\xe3\x80\x80/, " ")
  str = str.gsub(/ +/, " ")
  str = str.gsub(/^ */, "")
  str = str.gsub(/ *$/, "")
  return str
end


# check whether the url passes the filters or not
#
# `filters` is an array of [allow, regex] pairs.  Only "http://" URLs can pass.
# With no filters every such URL passes.  Otherwise the last filter whose regex
# matches the URL decides; when none matches, the verdict is the opposite of
# the first filter's flag.
def checkurl(url, filters)
  return false unless url =~ /^http:\/\//i
  return true if filters.empty?
  decisive = filters.reverse.find { |filter| url =~ filter[1] }
  decisive ? decisive[0] : !filters.first[0]
end


# execute the actual operations
#
# Make the standard output unbuffered, derive the command name shown in
# messages from the script path (everything up to the last slash is dropped),
# seed the random number generator and exit with the status returned by main.
STDOUT.sync = true
$progname = $0.gsub(/.*\//, "")
srand
exit(main)
1	dpavlin	1	#! /usr/bin/ruby -w
2
3			#-------------------------------------------------------------------------------------------------
4			# wgettsv
5			# Collect WWW resources and generate TSV data
6			# Copyright (C) 2006-2009 Mikio Hirabayashi
7			# This file is part of Tokyo Cabinet.
8			# Tokyo Cabinet is free software; you can redistribute it and/or modify it under the terms of
9			# the GNU Lesser General Public License as published by the Free Software Foundation; either
10			# version 2.1 of the License or any later version. Tokyo Cabinet is distributed in the hope
11			# that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
12			# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
13			# License for more details.
14			# You should have received a copy of the GNU Lesser General Public License along with Tokyo
15			# Cabinet; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
16			# Boston, MA 02111-1307 USA.
17			#-------------------------------------------------------------------------------------------------
18
19
20			require 'open-uri'
21			require 'iconv'
22			require 'kconv'
23			require 'date'
24			require 'time'
25			require 'cgi'
26
27
28			# main routine
29			def main
30			seeds = []
31			hist = {}
32			filters = []
33			max = 1 << 30
34			lim = 1 << 20
35			wait = 0
36			ndf = false
37			i = 0
38			while i < ARGV.length
39			if seeds.length < 1 && ARGV[i] =~ /^-/
40			if ARGV[i] == '-allow'
41			usage if (i += 1) >= ARGV.length
42			regex = Regexp::new(ARGV[i])
43			filters.push([true, regex]) if regex
44			elsif ARGV[i] == '-deny'
45			usage if (i += 1) >= ARGV.length
46			regex = Regexp::new(ARGV[i])
47			filters.push([false, regex]) if regex
48			elsif ARGV[i] == '-max'
49			usage if (i += 1) >= ARGV.length
50			max = ARGV[i].to_i
51			elsif ARGV[i] == '-lim'
52			usage if (i += 1) >= ARGV.length
53			lim = ARGV[i].to_i
54			elsif ARGV[i] == '-wait'
55			usage if (i += 1) >= ARGV.length
56			wait = ARGV[i].to_f
57			elsif ARGV[i] == '-ndf'
58			ndf = true
59			else
60			usage
61			end
62			else
63			if ARGV[i] =~ /^http:\/\//i
64			seeds.push(ARGV[i])
65			hist[ARGV[i]] = true
66			else
67			usage
68			end
69			end
70			i += 1
71			end
72			usage if seeds.length < 1
73			if !ndf
74			filters.push([false, /\.(txt\|text\|asc\|c\|cc\|cxx\|cpp\|h\|hxx\|hpp\|in)$/i])
75			filters.push([false, /\.(css\|js\|csv\|tsv\|log\|md5\|crc\|conf\|ini\|inf\|lnk\|sys\|tmp\|bak)$/i])
76			filters.push([false, /\.(xml\|xsl\|xslt\|rdf\|rss\|dtd\|sgml\|sgm)$/i])
77			filters.push([false, /\.(pgp\|sig\|cer\|csr\|pem\|key\|b64\|uu\|uue\|[0-9])$/i])
78			filters.push([false, /\.(rtf\|pdf\|ps\|eps\|ai\|doc\|xls\|ppt\|sxw\|sxc\|sxi\|xdw\|jtd\|oas\|swf)$/i])
79			filters.push([false, /\.(zip\|tar\|tgz\|gz\|bz2\|tbz2\|z\|lha\|lzh)?$/i])
80			filters.push([false, /\.(7z\|lzo\|lzma\|cpio\|shar\|cab\|rar\|sit\|ace\|hqx)?$/i])
81			filters.push([false, /\.(bin\|o\|a\|so\|exe\|dll\|lib\|obj\|ocx\|class\|jar\|war)?$/i])
82			filters.push([false, /\.(rpm\|deb\|qdb\|qdb\|dbx\|dbf\|dat\|msi\|bat\|com\|iso)?$/i])
83			filters.push([false, /\.(png\|gif\|jpg\|jpeg\|tif\|tiff\|bmp\|ico\|pbm\|pgm\|ppm\|xbm\|xpm\|dvi)$/i])
84			filters.push([false, /\.(au\|snd\|mid\|midi\|kar\|smf\|mp2\|mp3\|m3u\|wav\|wma\|wmp\|asx\|at3\|aif)$/i])
85			filters.push([false, /\.(mpg\|mpeg\|qt\|mov\|avi\|wmv\|wvx\|asf\|ram\|rm)$/i])
86			filters.push([false, /\.(tch\|tdb\|tdf\|tct)$/i])
87			filters.push([false, /\.idx\./i])
88			filters.push([false, /(core\|casket\|Makefile\|README\|NEWS\|COPYING\|LISENCE)($\|\/)/i])
89			end
90			return proc(seeds, hist, filters, max, lim, wait) ? 0 : 1
91			end
92
93
94			# print the command usage and exit
95			def usage
96			STDERR.printf("%s: collect WWW resources and generate TSV data\n", $progname)
97			STDERR.printf("\n")
98			STDERR.printf("usage:\n")
99			STDERR.printf(" %s [-allow regex] [-deny regex] [-max num] [-lim num] [-wait num]" +
100			" url ...\n", $progname)
101			STDERR.printf("\n")
102			exit(1)
103			end
104
105
106			# fetch each content
107			def proc(seeds, hist, filters, max, lim, wait)
108			cnt = 0
109			while (url = seeds.shift) && cnt < max
110			STDERR.printf("%d: getting: %s\n", cnt + 1, url)
111			begin
112			opts = {}
113			OpenURI.open_uri(url, 0, 0, opts) do \|sio\|
114			baseuri = sio.base_uri
115			if baseuri && baseuri.to_s != url
116			url = baseuri.to_s
117			hist[url] = true
118			end
119			size = sio.size
120			raise "invalid size" if size > lim \|\| size < 3
121			type = sio.content_type
122			type = "text/plain" if !type
123			str = sio.read
124			head = str[0,2048]
125			if (head[0] == 0xfe && head[1] == 0xff) \|\| (head[0] == 0xff && head[1] == 0xfe)
126			str = Kconv::kconv(str, Kconv::UTF8, Kconv::UTF16)
127			charset = "UTF-8"
128			elsif str.include?(0)
129			raise "binary data"
130			end
131			raise "not HTML" if type != "text/html" && head !~ /<html/i && head !~ /<body/i
132			if !charset && head =~ /<\?xml.*encoding=("\|')?[-_a-zA-Z0-9]+("\|')/im
133			charset = head.gsub(/.<\?xml.encoding=["']?([-_a-zA-Z0-9]+)["']?.*/im, '\1')
134			end
135			if !charset && head =~ /<meta.content-type.charset=[-_a-zA-Z0-9]+/im
136			charset = head.gsub(/.<meta.content-type.charset=([-_a-zA-Z0-9]+)./im, '\1')
137			end
138			charset = sio.charset if !charset \|\| charset.length < 1
139			if charset && charset.length > 0
140			if charset !~ /^UTF-?8$/i
141			begin
142			nstr = Iconv.conv("UTF-8", charset, str)
143			str = nstr if nstr && nstr.length > 0
144			rescue
145			str = str.toutf8
146			end
147			end
148			else
149			str = str.toutf8
150			end
151			body = str.gsub(/.<body[^>]>/im, "")
152			body = body.gsub(/<\/body>.*/im, "")
153			body = htmltotext(body)
154			if str =~ /<title[^>]>[^<]<\/title>/im
155			title = str.gsub(/.<title[^>]>([^<])<\/title>./im, '\1')
156			title = htmltotext(title)
157			end
158			title = "" if !title
159			title = title[0,128] if title.length > 128
160			mtime = sio.last_modified
161			if mtime
162			mtime = Time::parse(mtime.to_s).to_i
163			else
164			mtime = 0
165			end
166			printf("%d", cnt + 1)
167			printf("\turl\t%s", url)
168			printf("\tsize\t%s", size) if size > 0
169			printf("\tmtime\t%s", mtime) if mtime > 0
170			printf("\ttitle\t%s", title) if title.length > 0
171			printf("\tbody\t%s", body) if body.length > 0
172			printf("\n")
173			str.gsub(/<a[^>]*>/im) do \|tag\|
174			if tag =~ /href=["']?[^"'>]+["']?/
175			href = tag.gsub(/.href=["']?([^"'>]+)["']?./, '\1')
176			href = URI::join(url, href).to_s
177			href = href.gsub(/#.*/, "")
178			if !hist[href] && checkurl(href, filters)
179			seeds.push(href)
180			hist[href] = true
181			end
182			end
183			end
184			end
185			cnt += 1
186			rescue
187			STDERR.printf("%d: failed: %s: %s\n", cnt + 1, url, $!)
188			end
189			sleep(wait) if wait > 0
190			end
191			return 0
192			end
193
194
195			# convert HTML to plain-text
196			def htmltotext(str)
197			str = str.gsub(/<style[^>]>.?<\/style>/im, " ")
198			str = str.gsub(/<script[^>]>.?<\/script>/im, " ")
199			str = str.gsub(/<\/?(p\|br\|div\|h1\|h2\|h3\|h4\|h5\|h6\|ul\|ol\|dl\|li\|dd\|dt\|td\|th\|pre)[^>]*>/im, " ")
200			str = str.gsub(/<[^>]*>/, "")
201			str = str.gsub(/&(nbsp\|#160\|#0160\|#xa0\|#x00a0);/i, " ")
202			hexrx = Regexp::new("^&#x[0-9a-zA-Z]+;")
203			decrx = Regexp::new("^&#[0-9]+;")
204			str = str.gsub(/&#?[A-Za-z0-9]+;/) do \|pat\|
205			case pat
206			when "<"
207			pat = '<'
208			when ">"
209			pat = '>'
210			when """
211			pat = '"'
212			when "'"
213			pat = "'"
214			when " "
215			pat = " "
216			else
217			begin
218			if pat =~ hexrx
219			pat = [ pat.gsub(/&#x([A-Za-z0-9]+);/i, '\1').hex ].pack("n")
220			pat = Iconv.conv("UTF-8", "UTF-16BE", pat)
221			elsif pat =~ decrx
222			pat = [ pat.gsub(/&#([A-Za-z0-9]+);/i, '\1').to_i ].pack("n")
223			pat = Iconv.conv("UTF-8", "UTF-16BE", pat)
224			else
225			pat = " "
226			end
227			rescue
228			pat = ""
229			end
230			end
231			pat
232			end
233			str = str.gsub(/[\x00-\x20]/, " ")
234			str = str.gsub(/\xe3\x80\x80/, " ")
235			str = str.gsub(/ +/, " ")
236			str = str.gsub(/^ */, "")
237			str = str.gsub(/ *$/, "")
238			return str
239			end
240
241
242			# check whether the url passes the filters or not
243			def checkurl(url, filters)
244			return false if url !~ /^http:\/\//i;
245			return true if filters.length < 1
246			ok = !filters[0][0]
247			filters.each do \|filter\|
248			ok = filter[0] if url =~ filter[1]
249			end
250			return ok
251			end
252
253
254			# execute the actual operations
255			STDOUT.sync = true
256			$progname = $0.dup
257			$progname.gsub!(/.*\//, "")
258			srand
259			exit(main)