#!/bin/sh

# Parallel swish-e crawl script
#
# 2003-01-19 Dobrica Pavlinusic <dpavlin@rot13.org>
#
# somewhat documented in my blog at:
# http://blog.rot13.org/index.cgi/id_14

# detect_max_crawlers [CPUINFO]
#
# Print how many crawlers may run in parallel: one per processor.
#
# Arguments: CPUINFO - file to count "processor" lines in
#                      (default: /proc/cpuinfo)
# Outputs:   a positive integer on stdout
#
# The result is never 0 or empty: the crawl loop waits while the number of
# temp files is >= this value, so a limit of 0 would make it wait forever.
detect_max_crawlers() {
  cpuinfo=${1:-/proc/cpuinfo}

  # grep -c prints 0 when nothing matches and nothing useful when the file
  # cannot be read; both cases fall through to the fallbacks below.
  count=$(grep -c '^processor' "$cpuinfo" 2>/dev/null)

  case "$count" in
    '' | 0 | *[!0-9]*)
      # No usable cpuinfo: ask the system for the online processor count.
      count=$(getconf _NPROCESSORS_ONLN 2>/dev/null)
      ;;
  esac

  case "$count" in
    # Last resort: run the crawlers one at a time.
    '' | 0 | *[!0-9]*) count=1 ;;
  esac

  printf '%s\n' "$count"
}

# max. number of crawlers == nr. of processors
max=$(detect_max_crawlers)

# Pre-flight checks: both working directories must already exist in the
# current directory. If either is missing, explain what it is for and
# exit with status 1 before anything is touched.
[ -e "index/" ] || {
  printf '%s\n' \
    "This script needs index/ directory in current dir to store" \
    "created index files."
  exit 1
}

[ -e "config/" ] || {
  printf '%s\n' \
    "This script needs config/ directory in current dir in which" \
    "are stored swish-e configuration files for each index."
  exit 1
}

# Remove temp files left over from an interrupted run. crawl_all counts every
# index/*temp file as a crawl in progress and skips an index whose .temp file
# exists, so stale ones would stall or skip work.
# -f: stay quiet when there is nothing to remove.
rm -f index/*temp

# crawl_one CONFIG INDEX
#
# Run a single swish-e crawl for configuration file CONFIG and print its
# "files indexed" summary line(s) prefixed with 'INDEX': .
#
# The cleanup is attached to swish-e itself, inside the first pipeline stage.
# A "|| rm" placed after the whole pipeline would only see the exit status of
# the last stage and never fire when swish-e fails.
crawl_one() {
  {
    swish-e -S prog -c "$1" || rm -f "index/$2"*temp
  } | grep "files indexed" | while IFS= read -r summary; do
    # printf instead of sed: the index name is data, not part of a sed program.
    printf "'%s': %s\n" "$2" "$summary"
  done
}

# crawl_all
#
# Start one background crawl per file found under config/, keeping the number
# of index/*temp files below $max (falls back to 1 when max is unset).
# An index is skipped when index/NAME or index/NAME.temp already exists.
# Returns only after every crawl started here has finished.
#
# NOTE(review): the throttle relies on swish-e keeping *temp files under
# index/ while it runs -- presumably configured via IndexFile in each config;
# confirm against the config files.
crawl_all() {
  # "! -type d" keeps the config/ directory itself (and subdirectories) from
  # being treated as a configuration file.
  find config/ ! -type d | {
    while IFS= read -r config; do
      # tr strips the padding some wc implementations add around the count.
      while [ "$(find index -name '*temp' | wc -l | tr -d '[:space:]')" \
        -ge "${max:-1}" ]; do
        sleep 1
      done

      index=${config#config/}
      if [ -e "index/$index" ] || [ -e "index/$index.temp" ]; then
        echo "skip $index"
      else
        crawl_one "$config" "$index" &
      fi
    done

    # The loop runs in the pipeline's subshell, so the jobs must be reaped
    # here; otherwise the script could move on while crawlers are still
    # starting up and have not created their temp file yet.
    wait
  }
}

crawl_all

# Barrier: poll once per second until no file matching *temp is left
# anywhere under index/, printing how many remain on each pass.
while true; do
  # shellcheck disable=SC2046 -- unquoted on purpose so that any padding
  # around the wc count is dropped before the numeric test
  [ $(find index -name "*temp" | wc -l) -ne 0 ] || break
  echo "wait - $(find index -name "*temp" | wc -l) left"
  sleep 1
done

# merge_slice NUM INDEX...
#
# Announce slice NUM and merge the given indexes into index/slice-NUM.
# Returns the exit status of swish-e.
merge_slice() {
  printf 'merge slice %d\n' "$1"
  slice_out="index/slice-$1"
  shift
  swish-e -M "$@" "$slice_out"
}

# merge_indexes [SLICE_SIZE]
#
# Merge every index that has an index/NAME.prop companion into index/all.
# Indexes are first merged in groups of at most SLICE_SIZE (default 128)
# into index/slice-1, index/slice-2, ...; the slices are then merged into
# index/all.
#
# Returns: 0 on success, 1 when there is nothing to merge or a merge fails.
#
# The list is built from the glob directly (no temp file under /tmp, no
# parsing of ls output), so index names containing spaces or the substring
# "all" are handled, and leftovers from an earlier run cannot leak in.
merge_indexes() {
  slice_size=${1:-128}

  # Drop the previous result and any slices from an earlier run; stale slices
  # would otherwise be picked up as ordinary input indexes below.
  rm -f index/all index/all.prop index/slice-*

  num=0
  set --
  for prop in ./index/*.prop; do
    [ -e "$prop" ] || continue # glob matched nothing
    case ${prop##*/} in
      all.prop | slice-*.prop) continue ;; # never feed our own output back in
    esac

    set -- "$@" "${prop%.prop}"
    if [ "$#" -ge "$slice_size" ]; then
      num=$((num + 1))
      merge_slice "$num" "$@" || return 1
      set --
    fi
  done

  # Remainder that did not fill a whole slice.
  if [ "$#" -gt 0 ]; then
    num=$((num + 1))
    merge_slice "$num" "$@" || return 1
  fi

  if [ "$num" -eq 0 ]; then
    echo "no indexes found in index/, nothing to merge" >&2
    return 1
  fi

  echo "merge all slices"
  # Name the slices explicitly: an index/slice-* glob would also match the
  # slices' .prop companion files.
  set --
  i=1
  while [ "$i" -le "$num" ]; do
    set -- "$@" "index/slice-$i"
    i=$((i + 1))
  done
  swish-e -M "$@" index/all
  #rm -f index/slice-*
}

merge_indexes
