Revision 2

Date:
2009/07/21 12:34:14
Author:
dpavlin
Revision Log:
shell script helper to crawl site
Files:

Legend:

 
Added
 
Removed
 
Modified
  • crawl.sh

     
    #!/bin/sh
    #
    # crawl.sh - crawl a site into a TSV dump and load it into a table database.
    #
    # Usage: crawl.sh http://www.example.com
    #
    # For an argument http://HOST/PATH this produces, relative to the current
    # directory:
    #   HOST/PATH.tsv - crawl output written by `ruby wgettsv` (reused if present)
    #   HOST/PATH.tct - database populated by `tctmgr importtsv`, with q-gram
    #                   indexes requested on the "title" and "body" columns
    #
    # Exit status: 1 on missing argument; the failing command's status if the
    # directory cannot be created or the crawl fails; otherwise the status of
    # the final `tctmgr inform`.

    # Trace every command. Kept in `set` rather than on the shebang line so the
    # tracing also applies when the script is started as `sh crawl.sh`.
    set -x

    if test -z "$1"; then
      echo "Usage: $0 http://www.example.com" >&2
      exit 1
    fi

    # Strip only a leading scheme; the remainder doubles as the output path stem.
    base=${1#http://}
    dir=$(dirname -- "$base")
    if test -n "$dir" && test ! -d "$dir"; then
      mkdir -v -p -- "$dir" || exit
    fi

    # Crawl only when no finished dump exists. The crawler writes to a scratch
    # file that is renamed on success, so an aborted crawl can never be mistaken
    # for a complete dump on the next run.
    if test ! -f "$base.tsv"; then
      ruby wgettsv -allow "$base" -deny cgi -max 10000 "http://$base" > "$base.tsv.part"
      status=$?
      if test "$status" -ne 0; then
        rm -f -- "$base.tsv.part"
        exit "$status"
      fi
      mv -- "$base.tsv.part" "$base.tsv" || exit
    fi

    # NOTE(review): these steps are intentionally not chained with `|| exit`,
    # matching the original behavior; whether importtsv/setindex fail on a
    # re-run against an existing .tct is not visible here - confirm before
    # tightening.
    tctmgr importtsv "$base.tct" "$base.tsv"
    tctmgr setindex -it qgram "$base.tct" title
    tctmgr setindex -it qgram "$base.tct" body
    tctmgr inform "$base.tct"