/[swish]/trunk/make_config.pl
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/make_config.pl

Parent Directory | Revision Log


Revision 40 - (show annotations)
Sun Jun 1 11:45:19 2003 UTC (16 years, 10 months ago) by dpavlin
File MIME type: text/plain
File size: 3257 byte(s)
- support for listing of files in .tar.gz; decompressing of .gz and .bz2
  content
- changed order of arguments for swishspider: now baseurl,url (but it's
  backwards compatible, so your old configurations will work)
- do html fixup just on html files (to prevent binary archive corruption)
- crawl sites that have frames

#!/usr/bin/perl -w
#
# create configuration for my version of swish crawler and web interface
#
# 2003-04-26 Dobrica Pavlinusic <dpavlin@rot13.org>
#
# Usage: make_config.pl name URL [optional title words...]
#
# Creates, relative to the directory deduced for this script ($dir):
#   $dir/$name.config    - swish-e indexer configuration
#   $dir/html/$name.xml  - XML configuration for the web interface
#   $dir/html/$name.cgi  - symlink to $dir/html/swish.cgi
#
# Exits with status 1 on missing arguments or an unparsable URL, and dies
# if any of the files or the symlink cannot be created.
#

use strict;
use warnings;

# Escape a string for use inside a double-quoted XML attribute value.
# Only characters that would make the attribute ill-formed are touched
# ('<', '"' and any '&' that does not already start an entity or character
# reference), so values that were valid before are emitted unchanged.
sub xml_attr {
    my ($value) = @_;
    $value =~ s/&(?!(?:[A-Za-z_][\w.-]*|#[0-9]+|#x[0-9A-Fa-f]+);)/&amp;/g;
    $value =~ s/</&lt;/g;
    $value =~ s/"/&quot;/g;
    return $value;
}

# Write $content to $path, replacing any existing file.
# Dies if the file cannot be opened, written or closed (buffered write
# errors only surface at close time, so close is checked too).
sub write_file {
    my ($path, $content) = @_;
    open(my $fh, '>', $path) or die "can't open $path: $!";
    print {$fh} $content or die "can't write $path: $!";
    close($fh) or die "can't close $path: $!";
    return;
}

my $name = shift @ARGV;
my $url  = shift @ARGV;
# NOTE: the usage text still advertises a "[strip from url]" argument, but
# it is not read from the command line (the shift below is disabled), so
# $stripurl always ends up empty and everything after URL becomes the title.
#my $stripurl = shift @ARGV;
my $stripurl;
my $title = join(" ", @ARGV);

if (!$name || !$url) {
    print "Usage: $0 name URL [strip from url] [optional title]\n";
    # missing arguments are an error, so don't report success to the caller
    exit 1;
}

# directory part of the path this script was invoked with
my $dir = $0;
$dir =~ s/\/[^\/]+$//;

# try to deduce working directory if script is not called with absolute path
if ($dir !~ m#^/#) {
    chomp($dir = `pwd`);
}

my ($host, $urlpath);

# split URL into "http://host" and "/path"; a path (at least "/") is required
if ($url =~ m#(http://[^/]+)(/.*)$#) {
    ($host, $urlpath) = ($1, $2);
    if ($stripurl && $urlpath =~ m/^(.*$stripurl)/) {
        $stripurl = $1;
    } else {
        $stripurl = "";
    }
} else {
    print "Can't parse URL $url into host and path!\n";
    exit 1;
}

# base URL is the URL with its last path component removed
# (unchanged when the URL already ends with a slash)
my $baseurl = $url;
$baseurl =~ s#/[^/]+$#/#;

print "Config name: $name\nCrawling url: $url [strip $stripurl]\n";
print "Title: $title\n" if ($title);


# swish-e indexer configuration
my $config = <<"EOF";

###################################################
IncludeConfigFile $dir/common.config

# this is a cludge to implement no parent URL feature in swish indexer
IndexDir "$baseurl $url"
ReplaceRules replace "$baseurl " ""
ReplaceRules replace "${host}${stripurl}" ""

IndexFile $dir/index/$name

StoreDescription HTML <body> 500

# not very usefull, but...
MetaNames keywords description

# store <title>
PropertyNameAlias swishtitle title

EOF

# XML configuration for the web interface; every interpolated value goes
# through xml_attr() so quotes or ampersands can't break the document
my $index_attr = xml_attr("$dir/index/$name");
my $title_attr = xml_attr($title ? $title : "$name search");

my $xml = <<"EOF";
<config
	max_hits="1000"
	prog="/usr/bin/swish-e"
	index="$index_attr"
	charset="iso-8859-2"
	affix="/usr/lib/ispell/american.aff"
EOF
$xml .= "\ttitle=\"$title_attr\"\n";
$xml .= "\turl=\"" . xml_attr($stripurl) . "\"\n" if ($stripurl);
$xml .= <<"EOF";
	>
	<labels>
		<label value="10"> 10</label>
		<label value="100"> 100</label>
		<label value="0">unlimited</label>
	</labels>
	<text>
		<search>Search for </search>
		<documents> documents containing words: </documents>
		<submit>Search</submit>
		<no_spell>don't use alternative spellings </no_spell>
		<no_properties>don't display results details</no_properties>
		<hits>Showing %s documents (of maximum %d)... (%s)</hits>
		<no_hits>Can't find any documents (%s, %s)</no_hits>
		<footer>Searcher will try to create different variations of words using spelling dictionary. If you enter a word with minus (-) before it, it will exclude documents with this word and plus (+) will do the opposite (e.g. -work +play)</footer>
	</text>
	<!-- additional data to show summary -->
	<properties>swishdescription swishdocsize</properties>
	<hit><![CDATA[ <a href="%s">%s</a> [%s]<br>
	<font color=gray size=-2>%s ... <i>%s bytes</i></font><br>
	]]></hit>

</config>
EOF

print "Creating $dir/$name.config\n";
write_file("$dir/$name.config", $config);

print "Creating $dir/html/$name.xml\n";
write_file("$dir/html/$name.xml", $xml);

print "Creating symlink to $dir/html/$name.cgi\n";
my $cgi_target = "$dir/html/swish.cgi";
my $cgi_link   = "$dir/html/$name.cgi";
# A correct link left over from an earlier run is fine; anything else must
# be created, and a failure is reported.  ("or" is required here: with "||"
# the die would attach to the link name string and never run.)
unless (-l $cgi_link && readlink($cgi_link) eq $cgi_target) {
    symlink($cgi_target, $cgi_link)
        or die "can't create symlink $dir/html/$name.cgi: $!";
}

#print "Index this with:\nswish-e -S http -c $dir/$name.config\n";
#print "Search using $name.cgi\n";

Properties

Name Value
cvs2svn:cvs-rev 1.4
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26