# Unified diff of get_book.sh, revision 1.3 (2003/12/15) -> 1.5 (2004/01/15).
#
# NOTE(review): the line breaks of this patch appear to have been lost -- the
# file header, hunk headers (@@ ... @@) and every +/-/context line share one
# physical line, and one quoted string of a removed line is split across the
# two remaining lines. In this form patch(1) presumably cannot apply it; the
# original indentation and blank context lines cannot be recovered from here,
# so the payload below is left untouched. Regenerate from the repository if
# an applicable patch is needed.
#
# Changes recorded by the hunks below:
#   * Settings: the http_proxy export is un-commented, the user-agent string
#     moves into the variable `ua`, and `wait=10` is replaced by an exported
#     `wait=120` (exported because the perl one-liner reads it via $ENV{wait}).
#   * Resume support: if `orig` exists the user is asked whether to resume;
#     answering "n" exits with status 1, otherwise the contents of orig/ are
#     moved back into the current directory, orig is removed, and every file
#     containing 'promo.asp' is deleted.
#   * mirror(): now takes a single URL in $1. It derives a local file name
#     from the URL with sed and returns early if that file already exists.
#     URLs containing '/index' or 'mode=toc' are fetched without cookies; all
#     others are fetched with --load-cookies. wget's --random-wait/--wait and
#     `-t 0` are replaced by `-t 1`, followed by a perl one-liner that sleeps
#     for a random time below $wait seconds (30 if wait is unset).
#   * geturl(): adds a `grep -v` stage dropping lines that match "$2/"
#     followed by seven digits.
#   * checklogin() is removed. The new mirror_in() reads URLs from the file
#     `in`, calls mirror for each, and after each download greps the three
#     most recently modified index.html* files for 'promo.asp',
#     '>Account locked<' and 'session disabled' (the latter two
#     case-insensitively), printing a warning and calling `exit 1` on a match.
#   * Call sites: each `mirror "-i in"` + `checklogin` pair becomes
#     `mirror_in`, and `echo > in` becomes `echo -n > in`.
#
# NOTE(review): the `exit 1` calls in mirror_in() sit inside
# `cat in | while read ...`; in shells that run pipeline stages in subshells
# this would end only the loop, not the script -- confirm the intended abort.
# NOTE(review): the script keeps `#!/bin/sh` but uses the `function` keyword
# and `echo -n`; presumably /bin/sh is bash on the target host -- confirm.
--- get_book.sh 2003/12/15 00:26:57 1.3 +++ get_book.sh 2004/01/15 12:39:37 1.5 @@ -1,27 +1,65 @@ #!/bin/sh -#export http_proxy=http://proxy:8080 +# proxy settings (same as in firebird) +export http_proxy=http://proxy:8080 +# user agent (same as in firebird) +ua="Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.5) Gecko/20031225 Firebird/0.7" + +# wait between pages +export wait=120 + if [ -z "$1" ] ; then echo "Usage: $0 ISBN" exit 1 fi -isbn=$1; +if [ -e orig ] ; then + echo "orig directory found. Resume download? [Y/n]" + read ans + if [ "$ans" = "n" ] ; then + exit 1; + fi + mv orig/* . + rm -Rf orig -wait=10 + grep -l 'promo.asp' * | xargs -i rm -v {} +fi + +isbn=$1; isbn2=`echo $isbn | sed 's/-//g'` function mirror() { + + url="$1" + + file=`echo $url | sed -e s,http://[^?]*?,index.html?, -e s,#.*$,, -e s,/,%2F,g` + if [ -e "$file" ] ; then +# echo "skip $url" + return + fi + + if echo $url | grep '/index' >/dev/null ; then + cookies="" + echo -n "no login (index) " + elif echo $url | grep 'mode=toc' >/dev/null ; then + cookies="" + echo -n "no login (toc) " + else + cookies="--load-cookies=/home/dpavlin/.phoenix/default/g6b45nt6.slt/cookies.txt" + echo -n "with login " + fi + echo $url + wget -p -nH -nc -k \ - --random-wait --wait=$wait -t 0 \ - --load-cookies=/home/dpavlin/.phoenix/default/g6b45nt6.slt/cookies.txt \ - -U "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.5) Gecko/20031206 Firebird/0.7" \ - $1 + -t 1 -U "$ua" \ + $cookies $url # -D safari.oreilly.com \ # -A 0-201-41975-0 \ + + perl -e '$t=rand($ENV{wait} || 30);print "sleep for $t sec.\n"; sleep($t);' } function geturl() { @@ -34,6 +72,7 @@ -e 's/open=false/open=true/' | \ grep '&s=1&b=1&f=1&t=1&c=1&u=1&r=&o=1' | \ grep $2 | \ + grep -v "$2/[0-9][0-9][0-9][0-9][0-9][0-9][0-9]" | \ sort -u >> in } @@ -43,22 +82,36 @@ grep 'view=[A-Z].*/index' in.tmp | sort -u >> in } -function checklogin() { - if grep 'promo.asp' index.html* >/dev/null ; then - echo "WARNING: safari seems to logunt you as user. 
Aborting." - exit 1 - fi +function mirror_in() { + cat in | while read url ; do + mirror "$url" + #sleep $wait + + if grep 'promo.asp' `ls -t index.html* | head -3` >/dev/null ; then + echo "WARNING: safari seems to logout you as user. Aborting." + exit 1 + fi + + if grep -i '>Account locked<' `ls -t index.html* | head -3` >/dev/null ; then + echo "WARNING: your safari account is locked. Aborting." + exit 1 + fi + + if grep -i 'session disabled' `ls -t index.html* | head -3` >/dev/null ; then + echo "WARNING: your safari session is disabled. Aborting." + exit 1 + fi + done } -echo > in +echo -n > in mirror "http://safari.oreilly.com/?XmlId=$isbn" echo "extract URLs from first page..." geturl "index.html?XmlId=$isbn" $isbn uniqurl -mirror "-i in" -checklogin +mirror_in echo -n "extracting URLs [1]" ls index.html* | while read file ; do @@ -69,10 +122,9 @@ uniqurl -mirror "-i in" -checklogin +mirror_in -echo > in +echo -n > in echo -n "extracting URLs [2]" ls index.html* | while read file ; do echo -n "." @@ -81,8 +133,7 @@ uniqurl -mirror "-i in" -checklogin +mirror_in # convert links in html bn=`basename $0`