1 |
dpavlin |
32 |
#!/usr/bin/perl -w |
2 |
|
|
# |
3 |
|
|
# create configuration for my version of swish crawler and web interface |
4 |
|
|
# |
5 |
|
|
# 2003-04-26 Dobrica Pavlinusic <dpavlin@rot13.org> |
6 |
|
|
# |
7 |
|
|
|
8 |
|
|
use strict; |
9 |
dpavlin |
45 |
use Getopt::Long; |
10 |
dpavlin |
32 |
|
# Command line: [--stripurl=path] name URL|path title...
#   $stripurl - value of the optional --stripurl option (undef when absent)
#   $name     - first positional argument, name of the configuration
#   $url      - second positional argument, URL or path to crawl
#   $title    - all remaining arguments joined with single spaces
#               (empty string when there are none)
my $stripurl;

# GetOptions returns false when it finds an unknown option or a bad value;
# the status is kept so that such invocations are rejected below instead
# of being silently accepted.
my $result = GetOptions ("stripurl=s" => \$stripurl);

my $name = shift @ARGV;
my $url = shift @ARGV;
my $title = join(" ",@ARGV);

if (!$result || !$name || !$url) {
	print "Usage: $0 [--stripurl=path] name URL|path title\n";
	# an unusable command line is a failure, so report it to the caller
	# with a non-zero exit status
	exit 1;
}
23 |
|
|
|
# $dir starts out as the path this script was invoked with, minus its last
# "/component" (left untouched when $0 contains no such component).
(my $dir = $0) =~ s{/[^/]+$}{};

# try to deduce working directory if script is not called with absolute path
unless ($dir =~ m{^/}) {
	$dir = `pwd`;
	chomp($dir);
}
30 |
|
|
|
# Decide which prefix is stripped from document locations.
#
# Two kinds of argument are accepted:
#  * an http://host/path URL: $host becomes "http://host" and $stripurl
#    becomes the URL up to and including the --stripurl path, or $host when
#    --stripurl was not given or does not occur in the URL
#  * an existing directory whose path contains "/references/": $stripurl
#    becomes "/references/" and $host is the empty string
# Anything else is reported and the script exits with status 1.
my ($host,$urlpath);

if ($url =~ m#(http://[^/]+)(/.*)$#) {
	($host,$urlpath) = ($1,$2);
	# --stripurl is a literal path, not a pattern: \Q...\E keeps characters
	# such as "." or "+" from acting as regex metacharacters
	if ($stripurl && $url =~ m/^(.*\Q$stripurl\E)/) {
		$stripurl = $1;
	} else {
		$stripurl = $host;
	}
} elsif (-d $url && $url =~ m#.+?(/references/)#) {
	$stripurl = $1;
	$host = "";
} else {
	print "Can't parse URL $url into host and path!\n";
	exit 1;
}

# $url with its last "/component" replaced by "/"; the pattern is anchored
# at the end of the string, so it can match at most once
my $baseurl = $url;
$baseurl =~ s#/[^/]+$#/#;

print "Config name: $name\nCrawling: $url [strip $stripurl]\n";
print "Title: $title\n" if ($title);
53 |
dpavlin |
32 |
|
# Source-specific part of the swish-e configuration: a local directory is
# indexed through the progspider program, anything else is crawled using
# $baseurl/$url with the strip rules computed earlier.
my $indexer_config;
if (-d $url) {
	$indexer_config = <<"EOF";
IncludeConfigFile $dir/common-progspider.config
IndexDir $dir/spider/progspider
SwishProgParameters $url
ReplaceRules replace "/rest/references/" "/"
EOF
} else {
	$indexer_config = <<"EOF";
IncludeConfigFile $dir/common.config

# this is a cludge to implement no parent URL feature in swish indexer
IndexDir "$baseurl $url"
ReplaceRules replace "$baseurl " ""
ReplaceRules replace "${stripurl}" ""
EOF
}

# Complete configuration text: separator header, the source-specific part
# chosen above, then the settings shared by both variants.
my $config = join("", <<"HEADER", $indexer_config, <<"SETTINGS");

###################################################
HEADER

IndexFile $dir/index/$name

StoreDescription HTML <body> 500

# not very usefull, but...
MetaNames keywords description

# store <title>
PropertyNameAlias swishtitle title

SETTINGS
85 |
|
|
|
# Build the XML text that is later written to $dir/html/$name.xml
# (presumably read by the search CGI - confirm against swish.cgi).
#
# The opening <config ...> tag is assembled in pieces because the title
# and url attributes depend on the command line; the rest is static text.
#
# NOTE(review): $title, $name, $url and $dir are inserted verbatim, so a
# value containing '"', '&' or '<' yields malformed XML.
my $xml = << "EOF";
<config
max_hits="10"
prog="/usr/bin/swish-e"
index="$dir/index/$name"
charset="iso-8859-2"
affix="/usr/lib/ispell/american.aff"
EOF

# fall back to "<name> search" when no title words were given
my $config_title = $title || "$name search";
$xml .= "\ttitle=\"$config_title\"\n";

# text of the "Content indexed" link in the footer
my $xml_title = $title || $url;

# Both the link target and the url prefix are stored without the leading
# "http://host" part. $host is literal text, so it is quoted with \Q...\E;
# otherwise the dots of a host name would match any character.
my $xml_url = $url;
$xml_url =~ s/^\Q$host\E//;
my $xml_urlprefix = $stripurl;
$xml_urlprefix =~ s/^\Q$host\E//;

$xml .= "\turl=\"$xml_urlprefix\"\n" if ($stripurl);
$xml .= << "EOF";
>
<labels>
<label value="10"> 10</label>
<label value="50"> 50</label>
<label value="100"> 100</label>
</labels>
<text>
<search>Search for </search>
<documents> documents containing words: </documents>
<submit>Search</submit>
<no_spell>don't use alternative spellings </no_spell>
<no_properties>don't display results details</no_properties>
<hits>Showing %s documents (of maximum %d)... (%s)</hits>
<no_hits>Can't find any documents (%s, %s)</no_hits>
<footer><![CDATA[
Searcher will try to create different variations of words
using spelling dictionary. If you enter a word with minus
(<tt>-</tt>) before it, it will exclude documents with this
word and plus (<tt>+</tt>) will do the opposite
(e.g. <tt>-work +play</tt>)
<p><small>Content indexed:
<a href="$xml_url">$xml_title</a></small></p>
]]></footer>
</text>
<!-- additional data to show summary -->
<properties>swishdescription swishdocsize</properties>
<hit><![CDATA[ <a href="%s">%s</a> [%s]<br>
<font color=gray size=-2>%s ... <i>%s bytes</i></font><br>
]]></hit>

</config>
EOF
141 |
|
|
|
# Write the generated files below $dir:
#   config/<name>    - indexer configuration ($config)
#   html/<name>.xml  - XML configuration ($xml)
#   html/<name>.cgi  - symlink to html/swish.cgi
# Any failure to open, write, close or link stops the script with a
# message that includes the system error.
my $config_path = "$dir/config/$name";
print "Creating $config_path\n";
# three-argument open: the file name is never parsed for a mode
open(my $config_fh, '>', $config_path) or die "can't open $config_path: $!";
print {$config_fh} $config or die "can't write $config_path: $!";
# buffered write errors only show up when the handle is closed
close($config_fh) or die "can't close $config_path: $!";

my $xml_path = "$dir/html/$name.xml";
print "Creating $xml_path\n";
open(my $xml_fh, '>', $xml_path) or die "can't open $xml_path: $!";
print {$xml_fh} $xml or die "can't write $xml_path: $!";
close($xml_fh) or die "can't close $xml_path: $!";

my $cgi_link = "$dir/html/$name.cgi";
print "Creating symlink to $cgi_link\n";
# A link left by an earlier run is kept, so regenerating an existing
# configuration keeps working. "or" is needed here: with "||" the die
# would attach to the link name string and could never run.
unless (-l $cgi_link) {
	symlink("$dir/html/swish.cgi", $cgi_link)
		or die "can't create symlink $cgi_link: $!";
}

#print "Index this with:\nswish-e -S http -c $dir/config/$name\n";
#print "Search using $name.cgi\n";