/[swish]/trunk/crawl-parallel.sh
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/crawl-parallel.sh

Parent Directory | Revision Log


Revision 96 - (hide annotations)
Sun Apr 24 16:34:21 2005 UTC (18 years, 11 months ago) by dpavlin
File MIME type: application/x-sh
File size: 1532 byte(s)
added merge splitting in slices

#!/bin/sh

# Parallel swish-e crawl script
#
# 2003-01-19 Dobrica Pavlinusic <dpavlin@rot13.org>
#
# somewhat documented in my blog at:
# http://blog.rot13.org/index.cgi/id_14
#
# Run from a directory that contains:
#   config/  swish-e configuration files, one per index
#   index/   directory in which the created index files are stored
#
# Phase 1 runs one "swish-e -S prog" crawler per configuration file, keeping
# at most $max of them active at once.  A crawler is counted as active while
# a file matching "*temp" exists under index/ (the script relies on swish-e
# keeping such a file around while it is indexing).
# Phase 2 merges every index/*.prop based index into index/all, going
# through intermediate index/slice-N indexes of at most $slice_size inputs.

# number of indexes merged in one "swish-e -M" invocation
slice_size=128

# max. number of crawlers == nr. of processors
max=$(grep -c '^processor' /proc/cpuinfo 2>/dev/null)
case "$max" in
  # Missing /proc/cpuinfo or no "processor" lines: fall back to one crawler.
  # A limit of 0 would make the throttle loop below spin forever.
  '' | *[!0-9]* | 0) max=1 ;;
esac

if [ ! -d "index/" ]; then
  echo "This script needs index/ directory in current dir to store" >&2
  echo "created index files." >&2
  exit 1
fi

if [ ! -d "config/" ]; then
  echo "This script needs config/ directory in current dir in which" >&2
  echo "are stored swish-e configuration files for each index." >&2
  exit 1
fi

# Print the number of "*temp" files currently present under index/.
count_temp() {
  find index -name '*temp' | wc -l | tr -d ' '
}

# Merge every index named in list file $1 (one path per line) into index $2.
# Returns the exit status of swish-e.
merge_listed() {
  merge_list=$1
  merge_out=$2
  # Build the argument vector in the function's own positional parameters
  # so that index names containing whitespace survive intact.
  set --
  while IFS= read -r merge_name; do
    set -- "$@" "$merge_name"
  done < "$merge_list"
  swish-e -M "$@" "$merge_out"
}

# Remove the temporary work directory and any intermediate slice indexes.
cleanup() {
  [ -n "$tmpdir" ] && rm -rf -- "$tmpdir"
  rm -f -- index/slice-*
}

# leftovers of a previous (aborted) run would be counted as running crawlers
rm -f -- index/*temp

# "! -type d" keeps config/ itself (and sub-directories) out of the list.
find config/ ! -type d | while IFS= read -r config; do
  # throttle: wait until fewer than $max crawlers are active
  while [ "$(count_temp)" -ge "$max" ]; do
    #echo "sleep"
    sleep 1
  done

  index=${config#config/}
  if [ -e "index/$index" ] || [ -e "index/$index.temp" ]; then
    echo "skip $index"
  else
    # The temp-file removal is attached to swish-e itself: the exit status
    # of a pipeline is that of its LAST command, so testing the whole
    # pipeline would never notice a failed crawl.
    (
      {
        swish-e -S prog -c "$config" || rm -f -- "index/$index"*temp
      } | grep "files indexed" | while IFS= read -r line; do
        printf "'%s': %s\n" "$index" "$line"
      done
    ) &
  fi
  # ( echo $index && touch index/$index.temp && sleep 3 && rm index/$index.temp ) &
done

# NOTE(review): this barrier only sees crawlers that have already created
# their temp file; one started an instant ago may be missed - confirm
# whether that matters in practice.
while left=$(count_temp); [ "$left" -ne 0 ]; do
  echo "wait - $left left"
  sleep 1
done

# start from a clean state: old merged index and stale slices
rm -f -- index/all index/all.prop index/slice-*

tmpdir=
trap cleanup EXIT
trap 'exit 1' HUP INT TERM

# private work directory instead of a predictable name in /tmp
tmpdir=$(mktemp -d "${TMPDIR:-/tmp}/swish-list.XXXXXX") || {
  echo "cannot create temporary directory" >&2
  exit 1
}

# list of indexes to merge: every index that has a .prop file
# (index/all and index/slice-* were removed just above)
for prop in index/*.prop; do
  [ -e "$prop" ] || continue
  printf '%s\n' "${prop%.prop}"
done > "$tmpdir/list"

if [ ! -s "$tmpdir/list" ]; then
  echo "no indexes found in index/ - nothing to merge" >&2
  exit 1
fi

split -l "$slice_size" "$tmpdir/list" "$tmpdir/slice-"

set -- "$tmpdir"/slice-*
if [ "$#" -eq 1 ]; then
  # everything fits in one invocation: no intermediate slice needed
  echo "merge all slices"
  merge_listed "$1" index/all || exit 1
else
  num=0
  : > "$tmpdir/slices"
  for slice in "$@"; do
    num=$((num + 1))
    printf 'merge slice %d\n' "$num"

    merge_listed "$slice" "index/slice-$num" || exit 1
    # Remember the slice index by name; globbing index/slice-* later would
    # also match any slice-N.prop files written beside the indexes.
    printf '%s\n' "index/slice-$num" >> "$tmpdir/slices"
  done

  echo "merge all slices"
  merge_listed "$tmpdir/slices" index/all || exit 1
fi
# intermediate slices and the work directory are removed by the EXIT trap

Properties

Name Value
cvs2svn:cvs-rev 1.2
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26