#!/usr/bin/perl -w
#
# create configuration for my version of swish crawler and web interface
#
# 2003-04-26 Dobrica Pavlinusic <dpavlin@rot13.org>
#
# Usage: <this script> name URL [optional title]
#
# For a given configuration name and start URL this script writes:
#   $dir/$name.config     - swish-e indexer configuration
#   $dir/html/$name.xml   - configuration for the web search interface
#   $dir/html/$name.cgi   - symlink to $dir/html/swish.cgi
# where $dir is the directory part of $0 when that is absolute, and the
# current working directory otherwise.

use strict;
use warnings;

# Escape the characters that are not allowed verbatim inside a
# double-quoted XML attribute value.
sub xml_escape {
    my ($text) = @_;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    return $text;
}

# Write $content to $path, replacing any existing file.  Dies if the file
# can't be opened or if the data can't be written out completely.
sub write_file {
    my ($path, $content) = @_;
    open(my $fh, '>', $path) or die "can't open $path: $!";
    print {$fh} $content or die "can't write $path: $!";
    # buffered write errors only surface on close
    close($fh) or die "can't close $path: $!";
    return;
}

my $name = shift @ARGV;
my $url = shift @ARGV;
# The "strip from url" argument is currently disabled; $stripurl stays
# undefined here and ends up as an empty string after URL parsing below.
#my $stripurl = shift @ARGV;
my $stripurl;
my $title = join(" ", @ARGV);

if (!$name || !$url) {
    print "Usage: $0 name URL [optional title]\n";
    exit 1;
}

# directory part of the path this script was invoked with
my $dir = $0;
$dir =~ s/\/[^\/]+$//;

# try to deduce working directory if script is not called with absolute path
if ($dir !~ m#^/#) {
    chomp($dir = `pwd`);
}

my ($host, $urlpath);

if ($url =~ m#(http://[^/]+)(/.*)$#) {
    ($host, $urlpath) = ($1, $2);
    if ($stripurl && $urlpath =~ m/^(.*$stripurl)/) {
        # keep everything in the path up to and including the strip pattern
        $stripurl = $1;
    } else {
        $stripurl = "";
    }
} else {
    print "Can't parse URL $url into host and path!\n";
    exit 1;
}

# URL with the last path component removed (trailing slash kept)
my $baseurl = $url;
$baseurl =~ s#/[^/]+$#/#g;

print "Config name: $name\nCrawling url: $url [strip $stripurl]\n";
print "Title: $title\n" if (length $title);

my $config = <<"EOF";

###################################################
IncludeConfigFile $dir/common.config

# this is a cludge to implement no parent URL feature in swish indexer
IndexDir "$url $baseurl"
ReplaceRules replace " $baseurl" ""
ReplaceRules replace "${host}${stripurl}" ""

IndexFile $dir/index/$name

StoreDescription HTML <body> 500

# not very usefull, but...
MetaNames keywords description

# store <title>
PropertyNameAlias swishtitle title

EOF

# Values that come from the command line or the file system are escaped
# before being placed into XML attributes, so that quotes, ampersands or
# angle brackets can't produce a malformed document.
my $index_attr = xml_escape("$dir/index/$name");
my $title_attr = xml_escape(length $title ? $title : "$name search");

my $xml = <<"EOF";
<config
max_hits="1000"
prog="/usr/bin/swish-e"
index="$index_attr"
charset="iso-8859-2"
affix="/usr/lib/ispell/american.aff"
EOF
$xml .= "\ttitle=\"$title_attr\"\n";
$xml .= "\turl=\"" . xml_escape($stripurl) . "\"\n" if ($stripurl);
$xml .= <<"EOF";
>
<labels>
<label value="10"> 10</label>
<label value="100"> 100</label>
<label value="0">unlimited</label>
</labels>
<text>
<search>Search for </search>
<documents> documents containing words: </documents>
<submit>Search</submit>
<no_spell>don't use alternative spellings </no_spell>
<no_properties>don't display results details</no_properties>
<hits>Showing %s documents (of maximum %d)... (%s)</hits>
<no_hits>Can't find any documents (%s, %s)</no_hits>
<footer>Searcher will try to create different variations of words using spelling dictionary. If you enter a word with minus (-) before it, it will exclude documents with this word and plus (+) will do the opposite (e.g. -work +play)</footer>
</text>
<!-- additional data to show summary -->
<properties>swishdescription swishdocsize</properties>
<hit><![CDATA[ <a href="%s">%s</a> [%s]<br>
<font color=gray size=-2>%s ... <i>%s bytes</i></font><br>
]]></hit>

</config>
EOF

print "Creating $dir/$name.config\n";
write_file("$dir/$name.config", $config);

print "Creating $dir/html/$name.xml\n";
write_file("$dir/html/$name.xml", $xml);

my $cgi_target = "$dir/html/swish.cgi";
my $cgi_link = "$dir/html/$name.cgi";

print "Creating symlink to $cgi_link\n";
# A link left over from a previous run that already points at the right
# target is fine; anything else must be created and failure is fatal.
# Low-precedence "or" is required here: with "||" the die would attach to
# the (always true) link name instead of to the result of symlink().
my $existing_target = -l $cgi_link ? readlink($cgi_link) : undef;
unless (defined $existing_target && $existing_target eq $cgi_target) {
    symlink($cgi_target, $cgi_link)
        or die "can't create symlink $cgi_link: $!";
}

#print "Index this with:\nswish-e -S http -c $dir/$name.config\n";
#print "Search using $name.cgi\n";