#!/bin/sh

# Parallel swish-e crawl script
#
# 2003-01-19 Dobrica Pavlinusic <dpavlin@rot13.org>
#
# somewhat documented in my blog at:
# http://blog.rot13.org/index.cgi/id_14
#
# Must be run from a directory that contains:
#   config/  swish-e configuration files, one per index
#   index/   directory in which the index files are created
#
# One swish-e process is started in the background for every file in
# config/ whose index does not exist yet.  The number of files matching
# "*temp" below index/ is used as the count of running crawlers, so the
# script relies on swish-e keeping such a file around while it is indexing.
# When no such files are left, all indexes are merged into index/all.
#
# NOTE(review): a freshly started swish-e may not have created its temp
# file yet when the next config is examined, so the limit below can be
# exceeded for a short time -- confirm whether that matters.

# max. number of crawlers == nr. of processors
# (falls back to 1 when /proc/cpuinfo is missing or lists no processors;
# a limit of 0 would make the throttle loop below spin forever)
max=$(grep -c '^processor' /proc/cpuinfo 2>/dev/null)
case "$max" in
  '' | 0 | *[!0-9]*) max=1 ;;
esac

# Print the number of files below index/ whose name ends in "temp".
temp_count() {
  find index -name "*temp" | wc -l | tr -d ' '
}

if [ ! -d "index/" ]; then
  echo "This script needs index/ directory in current dir to store" >&2
  echo "created index files." >&2
  exit 1
fi

if [ ! -d "config/" ]; then
  echo "This script needs config/ directory in current dir in which" >&2
  echo "are stored swish-e configuration files for each index." >&2
  exit 1
fi

# Remove temp files left over from an earlier run; they would otherwise be
# counted as running crawlers.
rm -f index/*temp

# Only regular files are configurations; the config/ directory itself,
# sub-directories and CVS metadata are not.
find config -type f ! -path '*/CVS/*' | while IFS= read -r config; do
  # Throttle: wait until fewer than $max crawlers are running.
  while [ "$(temp_count)" -ge "$max" ]; do
    #echo "sleep"
    sleep 1
  done

  index=${config#config/}
  if [ -e "index/$index" ] || [ -e "index/$index.temp" ]; then
    echo "skip $index"
  else
    (
      # The cleanup is attached to swish-e itself: the exit status of a
      # pipeline is that of its last command, so a trailing "|| rm" after
      # the whole pipeline would not see a swish-e failure, and a leftover
      # temp file would keep the wait loop below running forever.
      { swish-e -S prog -c "$config" </dev/null ||
        rm -f -- "index/$index"*temp; } |
        grep "files indexed" |
        while IFS= read -r line; do
          printf "'%s': %s\n" "$index" "$line"
        done
    ) &
  fi
  # ( echo $index && touch index/$index.temp && sleep 3 && rm index/$index.temp ) &
done

# The crawlers were started from the subshell of the pipeline above, so
# they cannot be collected with "wait" here; poll for temp files instead.
while left=$(temp_count); [ "$left" -ne 0 ]; do
  echo "wait - $left left"
  sleep 1
done


rm -f index/all index/all.prop

# Collect the indexes to merge: every regular file in index/ except the
# property files (*.prop) and the merged index itself.  Names are compared
# exactly, so an index such as "gallery" is not dropped for containing
# "all".
set --
for file in index/*; do
  case "${file#index/}" in
    *.prop | all | CVS) continue ;;
  esac
  [ -f "$file" ] || continue
  set -- "$@" "./$file"
done

if [ "$#" -eq 0 ]; then
  echo "No index files found in index/ to merge." >&2
  exit 1
fi

swish-e -M "$@" index/all