1 |
########################################################################### |
2 |
# This is sample indexer config file. |
3 |
# To start using it please edit and rename to indexer.conf |
4 |
# You may want to keep the original indexer.conf-dist for future references. |
5 |
# Use '#' to comment out lines. |
6 |
# All command names are case insensitive (DBAddr=DBADDR=dbaddr). |
7 |
# You may use '\' character to prolong current command to next line |
8 |
# when it is required. |
9 |
# |
10 |
# You may include another configuration file at any place in the indexer.conf |
11 |
# using "Include <filename>" command. |
12 |
# Absolute path if <filename> starts with "/": |
13 |
#Include /usr/local/mnogosearch/etc/inc1.conf |
14 |
# Relative path else: |
15 |
#Include inc1.conf |
16 |
########################################################################### |
17 |
|
18 |
|
19 |
|
20 |
########################################################################### |
21 |
# Section 1. |
22 |
# Global parameters. |
23 |
|
24 |
|
25 |
########################################################################### |
26 |
# DBAddr <URL-style database description> |
27 |
# Options (type, host, database name, port, user and password) |
28 |
# to connect to SQL database. |
29 |
# Does not matter for built-in text files support. |
30 |
# Should be used only once and before any other commands. |
31 |
# Command has global effect for the whole config file. |
32 |
# Format: |
33 |
#DBAddr <DBType>:[//[DBUser[:DBPass]@]DBHost[:DBPort]]/DBName/ |
34 |
# |
35 |
# ODBC notes: |
36 |
# Use DBName to specify ODBC data source name (DSN) |
37 |
# DBHost does not matter, use "localhost". |
38 |
# Solid notes: |
39 |
# Use DBHost to specify Solid server |
40 |
# DBName does not matter for Solid |
41 |
# |
42 |
# Currently supported DBType values are |
43 |
# mysql, pgsql, msql, solid, mssql, oracle, ibase. |
44 |
# Actually, it does not matter for native libraries support. |
45 |
# But ODBC users should specify one of supported values. |
46 |
# If your database type is not supported, you may use "unknown" instead. |
47 |
|
48 |
DBAddr pgsql://dpavlin@portal.pliva.hr/corpsearch/ |
49 |
|
50 |
|
51 |
####################################################################### |
52 |
# DBMode single/multi/crc/crc-multi |
53 |
# Does not matter for built-in text files support |
54 |
# You may select SQL database mode of words storage. |
55 |
# When "single" is specified, all words are stored in the same |
56 |
# table. If "multi" is selected, words will be located in different |
57 |
# tables depending on their lengths. "multi" mode is usually faster |
58 |
# but requires more tables in database. |
59 |
# |
60 |
# If "crc" mode is selected, mnoGoSearch will store 32 bit integer |
61 |
# word IDs calculated by CRC32 algorithm instead of words. This |
62 |
# mode requires less disk space and it is faster compared with "single" |
63 |
# and "multi" modes. "crc-multi" uses the same storage structure with |
64 |
# the "crc" mode, but also stores words in different tables depending on |
65 |
# words lengths like "multi" mode. |
66 |
# |
67 |
#Default DBMode value is "single": |
68 |
#DBMode single |
69 |
DBMode crc |
70 |
|
71 |
|
72 |
####################################################################### |
73 |
#SyslogFacility <facility> |
74 |
# This is used if indexer was compiled with syslog support and if you |
75 |
# don't like the default value. Argument is the same as used in syslog.conf |
76 |
# file. For list of possible facilities see syslog.conf(5) |
77 |
#SyslogFacility local7 |
78 |
|
79 |
|
80 |
####################################################################### |
81 |
#LogdAddr host[:port] |
82 |
# Use cachelogd at given host and port if specified. |
83 |
# It is required for "cache mode" only. Default values are localhost |
84 |
# and port 7000 |
85 |
#LogdAddr localhost:7000 |
86 |
|
87 |
|
88 |
####################################################################### |
89 |
# LocalCharset <charset> |
90 |
# Defines charset of local file system. It is required if you are using |
91 |
# 8 bit charsets and does not matter for 7 bit charsets. |
92 |
# This command should be used once and takes global effect for the config file. |
93 |
# Choose currently supported one: |
94 |
# |
95 |
# Western Europe: Germany |
96 |
#LocalCharset iso-8859-1 |
97 |
# |
98 |
# Central Europe: Czech |
99 |
#LocalCharset iso-8859-2 |
100 |
# |
101 |
# ISO Cyrillic |
102 |
#LocalCharset iso-8859-5 |
103 |
# |
104 |
# Unix Cyrillic |
105 |
#LocalCharset koi8-r |
106 |
# |
107 |
# MS Central Europe: Czech |
108 |
#LocalCharset windows-1250 |
109 |
# |
110 |
# MS DOS Cyrillic |
111 |
#LocalCharset cp866 |
112 |
# |
113 |
# MS Cyrillic |
114 |
#LocalCharset windows-1251 |
115 |
# |
116 |
# MS Arabic |
117 |
#LocalCharset windows-1256 |
118 |
# |
119 |
# Mac Cyrillic |
120 |
#LocalCharset x-mac-cyrillic |
121 |
# |
122 |
# ISO Greek |
123 |
#LocalCharset iso-8859-7 |
124 |
# |
125 |
# MS Greek |
126 |
#LocalCharset windows-1253 |
127 |
# |
128 |
# ISO Hebrew |
129 |
#LocalCharset iso-8859-8 |
130 |
# |
131 |
# MS Hebrew |
132 |
#LocalCharset windows-1255 |
133 |
# |
134 |
# ISO Baltic |
135 |
#LocalCharset iso-8859-4 |
136 |
#LocalCharset iso-8859-13 |
137 |
# |
138 |
# MS Baltic |
139 |
#LocalCharset windows-1257 |
140 |
# |
141 |
# ISO Turkish |
142 |
#LocalCharset iso-8859-9 |
143 |
# |
144 |
# MS Turkish |
145 |
#LocalCharset windows-1254 |
146 |
|
147 |
|
148 |
####################################################################### |
149 |
#ForceIISCharset1251 yes/no |
150 |
#This option is useful for users who deal with Cyrillic content and broken |
151 |
#(or misconfigured?) Microsoft IIS web servers, which tend not to report |
152 |
#charset correctly. This is a really dirty hack, but if this option is turned on |
153 |
#it is assumed that all servers which report as 'Microsoft' or 'IIS' have |
154 |
#content in Windows-1251 charset. |
155 |
#This command should be used only once in configuration file and takes global |
156 |
#effect. |
157 |
#Default: no |
158 |
#ForceIISCharset1251 no |
159 |
|
160 |
|
161 |
########################################################################### |
162 |
# Ispell support commands. Detailed description is given in /doc/ispell.txt |
163 |
# Ispell commands MUST be given after LocalCharset definition. |
164 |
# Set ispell mode. Can be text (default) or db. If set to db then |
165 |
# Affix and Spell command should not be used. |
166 |
#IspellUsePrefixes yes/no |
167 |
# If enabled, indexer will use ispell prefixes, not only suffixes |
168 |
# Default: no |
169 |
#Ispellmode text |
170 |
# Load ispell affix file: |
171 |
#Affix <lang> <ispell affixes file name> |
172 |
# Load ispell dictionary file |
173 |
#Spell <lang> <ispell dictionary file name> |
174 |
# File names are relative to mnoGoSearch /etc directory |
175 |
# Absolute paths can be also specified. |
176 |
# |
177 |
#Affix en en.aff |
178 |
#Spell en en.dict |
179 |
|
180 |
########################################################################### |
181 |
#Phrase yes/no |
182 |
# Whether to index with phrase support. Default value is no. |
183 |
#Phrase no |
184 |
|
185 |
|
186 |
########################################################################### |
187 |
#CrossWords yes/no |
188 |
# Whether to build CrossWords index |
189 |
# Default value is no |
190 |
#CrossWords no |
191 |
|
192 |
|
193 |
########################################################################### |
194 |
# StopwordFile <filename> |
195 |
# Load stop words from the given text file. You may specify either absolute |
196 |
# file name or a name relative to mnoGoSearch /etc directory. You may use |
197 |
# several StopwordFile commands. |
198 |
# |
199 |
#StopwordFile stopwords.txt |
200 |
|
201 |
########################################################################### |
202 |
# StopwordTable <tablename> [<tablename>...] |
203 |
# Load stop words from the given SQL table. You may use several |
204 |
# StopwordTable commands. This command has no effect when compiled |
205 |
# without SQL database support. |
206 |
# |
207 |
StopwordTable stopword |
208 |
|
209 |
####################################################################### |
210 |
# Word lengths. You may change default length range of words |
211 |
# stored in database. By default, words with the length in the |
212 |
# range from 1 to 32 are stored. Note that setting MaxWordLength more |
213 |
# than 32 will not work as expected. |
214 |
# |
215 |
#MinWordLength 1 |
216 |
#MaxWordLength 32 |
217 |
|
218 |
####################################################################### |
219 |
# MaxDocSize bytes |
220 |
# Default value 1048576 (1 Mb) |
221 |
# Takes global effect for whole config file |
222 |
#MaxDocSize 1048576 |
223 |
|
224 |
|
225 |
####################################################################### |
226 |
# HTTPHeader <header> |
227 |
# You may add your desired headers in indexer HTTP request |
228 |
# You should not use "If-Modified-Since","Accept-Charset" headers, |
229 |
# these headers are composed by indexer itself. |
230 |
# "User-Agent: mnoGoSearch/version" is sent too, but you may override it. |
231 |
# Command has global effect for all configuration file. |
232 |
# |
233 |
#HTTPHeader User-Agent: My_Own_Agent |
234 |
#HTTPHeader Accept-Language: ru, en |
235 |
#HTTPHeader From: webmaster@mysite.com |
236 |
|
237 |
|
238 |
####################################################################### |
239 |
# ServerTable <table_name> (SQL only, not supported with built-in database) |
240 |
# Load servers with all their parameters from the table "table_name". |
241 |
# Check an example of these tables structure in create/mysql/server.txt |
242 |
# You may use several arguments for this command: |
243 |
#ServerTable my_servers1 my_servers2 my_servers3 |
244 |
# or the only one argument: |
245 |
# |
246 |
#ServerTable server |
247 |
|
248 |
|
249 |
####################################################################### |
250 |
#DeleteNoServer yes/no |
251 |
# Use it to choose whether delete or not those URLs which have no |
252 |
# correspondent "Server" commands. |
253 |
# Default value is "yes". |
254 |
#DeleteNoServer yes |
255 |
|
256 |
|
257 |
|
258 |
########################################################################## |
259 |
# Section 2. |
260 |
# URL control configuration. |
261 |
|
262 |
|
263 |
########################################################################## |
264 |
#Allow [Match|NoMatch] [NoCase|Case] [String|Regex] <arg> [<arg> ... ] |
265 |
# Use this to allow URLs that match (doesn't match) given argument. |
266 |
# First three optional parameters describe the type of comparison. |
267 |
# Default values are Match, NoCase, String. |
268 |
# Use "NoCase" or "Case" values to choose case insensitive or case sensitive |
269 |
# comparison. |
270 |
# Use "Regex" to choose regular expression comparison. |
271 |
# Use "String" to choose string with wildcards comparison. |
272 |
# Wildcards are '*' for any number of characters and '?' for one character. |
273 |
# Note that '?' and '*' have special meaning in "String" match type. Please use |
274 |
# "Regex" to describe documents with '?' and '*' signs in URL. |
275 |
# "String" match is much faster than "Regex". Use "String" where it |
276 |
# is possible. |
277 |
# You may use several arguments for one 'Allow' command. |
278 |
# You may use this command any number of times. |
279 |
# Takes global effect for config file. |
280 |
# Note that mnoGoSearch automatically adds one "Allow regex .*" |
281 |
# command after reading config file. It means that allowed everything |
282 |
# that is not disallowed. |
283 |
# Examples |
284 |
# Allow everything: |
285 |
#Allow * |
286 |
# Allow everything but .php .cgi .pl extensions case insensitively using regex: |
287 |
#Allow NoMatch Regex \.php$|\.cgi$|\.pl$ |
288 |
# Allow .HTM extension case sensitively: |
289 |
#Allow Case *.HTM |
290 |
|
291 |
|
292 |
########################################################################## |
293 |
#Disallow [Match|NoMatch] [NoCase|Case] [String|Regex] <arg> [<arg> ... ] |
294 |
# Use this to disallow URLs that match (doesn't match) given argument. |
295 |
# The meaning of first three optional parameters is exactly the same |
296 |
# with "Allow" command. |
297 |
# You can use several arguments for one 'Disallow' command. |
298 |
# Takes global effect for config file. |
299 |
# |
300 |
# Examples: |
301 |
# Disallow URLs that are not in udm.net domains using "string" match: |
302 |
#Disallow NoMatch *.udm.net/* |
303 |
# Disallow any except known extensions and directory index using "regex" match: |
304 |
#Disallow NoMatch Regex \/$|\.htm$|\.html$|\.shtml$|\.phtml$|\.php$|\.txt$ |
305 |
# Exclude cgi-bin and non-parsed-headers using "string" match: |
306 |
#Disallow */cgi-bin/* *.cgi */nph-* |
307 |
# Exclude anything with '?' sign in URL. Note that '?' sign has a |
308 |
# special meaning in "string" match, so we have to use "regex" match here: |
309 |
#Disallow Regex \? |
310 |
|
311 |
|
312 |
# Exclude some known extensions using fast "String" match: |
313 |
Disallow *.b *.sh *.md5 *.rpm |
314 |
Disallow *.arj *.tar *.zip *.tgz *.gz *.z *.bz2 |
315 |
Disallow *.lha *.lzh *.rar *.zoo *.ha *.tar.Z |
316 |
Disallow *.gif *.jpg *.jpeg *.bmp *.tiff *.tif *.xpm *.xbm *.pcx |
317 |
Disallow *.vdo *.mpeg *.mpe *.mpg *.avi *.movie *.mov *.dat |
318 |
Disallow *.mid *.mp3 *.rm *.ram *.wav *.aiff *.ra |
319 |
Disallow *.vrml *.wrl *.png |
320 |
Disallow *.exe *.com *.cab *.dll *.bin *.class *.ex_ |
321 |
Disallow *.tex *.texi *.xls *.doc *.texinfo |
322 |
Disallow *.rtf *.pdf *.cdf *.ps |
323 |
Disallow *.ai *.eps *.ppt *.hqx |
324 |
Disallow *.cpt *.bms *.oda *.tcl |
325 |
Disallow *.o *.a *.la *.so |
326 |
Disallow *.pat *.pm *.m4 *.am *.css |
327 |
Disallow *.map *.aif *.sit *.sea |
328 |
Disallow *.m3u *.qt *.mov |
329 |
|
330 |
# Exclude Apache directory list in different sort order using "string" match: |
331 |
Disallow *D=A *D=D *M=A *M=D *N=A *N=D *S=A *S=D |
332 |
|
333 |
# More complicated case. RAR .r00-.r99, ARJ a00-a99 files |
334 |
# and unix shared libraries. We use "Regex" match type here: |
335 |
Disallow Regex \.r[0-9][0-9]$ \.a[0-9][0-9]$ \.so\.[0-9]$ |
336 |
|
337 |
|
338 |
|
339 |
########################################################################## |
340 |
#CheckOnly [Match|NoMatch] [NoCase|Case] [String|Regex] <arg> [<arg> ... ] |
341 |
# The meaning of first three optional parameters is exactly the same |
342 |
# with "Allow" command. |
343 |
# Indexer will use HEAD instead of GET HTTP method for URLs that |
344 |
# match/do not match given regular expressions. It means that the file |
345 |
# will be checked only for being existing and will not be downloaded. |
346 |
# Useful for zip,exe,arj and other binary files. |
347 |
# Note that you can disallow those files with the Disallow commands given above. |
348 |
# You may use several arguments for one "CheckOnly" command. |
349 |
# Useful for example for searching through the URL names rather than |
350 |
# the contents (a la FTP-search). |
351 |
# Takes global effect for config file. |
352 |
# |
353 |
# Check some known non-text extensions using "string" match: |
354 |
#CheckOnly *.b *.sh *.md5 |
355 |
#CheckOnly *.arj *.tar *.zip *.tgz *.gz |
356 |
#CheckOnly *.lha *.lzh *.rar *.zoo *.tar*.Z |
357 |
#CheckOnly *.gif *.jpg *.jpeg *.bmp *.tiff |
358 |
#CheckOnly *.vdo *.mpeg *.mpe *.mpg *.avi *.movie |
359 |
#CheckOnly *.mid *.mp3 *.rm *.ram *.wav *.aiff |
360 |
#CheckOnly *.vrml *.wrl *.png |
361 |
#CheckOnly *.exe *.cab *.dll *.bin *.class |
362 |
#CheckOnly *.tex *.texi *.xls *.doc *.texinfo |
363 |
#CheckOnly *.rtf *.pdf *.cdf *.ps |
364 |
#CheckOnly *.ai *.eps *.ppt *.hqx |
365 |
#CheckOnly *.cpt *.bms *.oda *.tcl |
366 |
#CheckOnly *.rpm *.m3u *.qt *.mov |
367 |
#CheckOnly *.map *.aif *.sit *.sea |
368 |
# |
369 |
# or check ANY except known text extensions using "regex" match: |
370 |
#CheckOnly NoMatch Regex \/$|\.html$|\.shtml$|\.phtml$|\.php$|\.txt$ |
371 |
|
372 |
|
373 |
########################################################################## |
374 |
#HrefOnly [Match|NoMatch] [NoCase|Case] [String|Regex] <arg> [<arg> ... ] |
375 |
# The meaning of first three optional parameters is exactly the same |
376 |
# with "Allow" command. |
377 |
# |
378 |
# Use this to scan a HTML page for "href" tags but not to index the contents |
379 |
# of the page with an URLs that match (doesn't match) given argument. |
380 |
# Commands have global effect for all configuration file. |
381 |
# |
382 |
# When indexing large mail list archives for example, the index and thread |
383 |
# index pages (like mail.10.html, thread.21.html, etc.) should be scanned |
384 |
# for links but shouldn't be indexed: |
385 |
# |
386 |
#HrefOnly */mail*.html */thread*.html |
387 |
|
388 |
|
389 |
|
390 |
# How to combine Allow, Disallow, CheckOnly, HrefOnly commands. |
391 |
# |
392 |
# indexer compares URLs against all these command arguments in the |
393 |
# order of their appearance in indexer.conf file. |
394 |
# If indexer finds that a URL matches some rule it will make a decision of what |
395 |
# to do with this URL, allow it, disallow it or use HEAD instead |
396 |
# of the GET method. So, you may use different Allow, Disallow, |
397 |
# CheckOnly, HrefOnly commands order. |
398 |
# If none of these commands is given, mnoGoSearch will allow everything |
399 |
# by default. |
400 |
# |
401 |
# There are many possible combinations. Samples of two of them are here: |
402 |
# |
403 |
# Sample of first useful combination. |
404 |
# Disallow known non-text extensions (zip,wav etc), |
405 |
# then allow everything else. This sample is uncommented above (note that |
406 |
# there is actually no "Allow *" command, it is added automatically after |
407 |
# indexer.conf loading). |
408 |
# |
409 |
# Sample of second combination. |
410 |
# Allow some known text extensions (html, txt) and directory index ( / ), |
411 |
# then disallow everything else: |
412 |
# |
413 |
#Allow *.html *.txt */ |
414 |
#Disallow * |
415 |
|
416 |
|
417 |
|
418 |
################################################################ |
419 |
# Section 3. |
420 |
# Mime types and external parsers. |
421 |
|
422 |
|
423 |
################################################################ |
424 |
#UseRemoteContentType yes/no |
425 |
# This command specifies if the indexer should get content type |
426 |
# from http server headers (yes) or from its AddType settings (no). |
427 |
# If set to 'no' and the indexer could not determine content-type |
428 |
# by using its AddType settings, then it will use http header. |
429 |
# Default: yes |
430 |
#UseRemoteContentType yes |
431 |
|
432 |
|
433 |
################################################################ |
434 |
#AddType [String|Regex] [Case|NoCase] <mime type> <arg> [<arg>...] |
435 |
# This command associates filename extensions (for services |
436 |
# that don't automatically include them) with their mime types. |
437 |
# Currently "file:" protocol uses these commands. |
438 |
# Use optional first two parameter to choose comparison type. |
439 |
# Default type is "String" "NoCase" (case insensitive string match with |
440 |
# '?' and '*' wildcards for one and several characters correspondently). |
441 |
# |
442 |
AddType text/plain *.txt *.pl *.js *.h *.c *.pm *.e |
443 |
AddType text/html *.html *.htm |
444 |
AddType image/x-xpixmap *.xpm |
445 |
AddType image/x-xbitmap *.xbm |
446 |
AddType image/gif *.gif |
447 |
# |
448 |
# You may also use quotes in mime type definition |
449 |
# for example to specify charset. e.g. Russian webmasters |
450 |
# often use *.htm extension for windows-1251 documents and |
451 |
# *.html for unix koi8-r documents: |
452 |
# |
453 |
#AddType "text/html; charset=koi8-r" *.html |
454 |
#AddType "text/html; charset=windows-1251" *.htm |
455 |
# |
456 |
# More complicated example for rar .r00-r.99 using "Regex" match: |
457 |
#AddType Regex application/rar \.r[0-9][0-9]$ |
458 |
# |
459 |
# Default unknown type for other extensions: |
460 |
AddType application/unknown *.* |
461 |
|
462 |
|
463 |
# Mime <from_mime> <to_mime> <command line> |
464 |
# |
465 |
# This is used to add support for parsing documents with mime types other |
466 |
# than text/plain and text/html. It can be done via external parser (which |
467 |
# must provide output in plain or html text) or just by substituting mime |
468 |
# type so indexer will understand it. |
469 |
# |
470 |
# <from_mime> and <to_mime> are standard mime types |
471 |
# <to_mime> is either text/plain or text/html |
472 |
# |
473 |
# Optional charset parameter used to change charset if needed |
474 |
# |
475 |
# Command line may have $1 parameter which stands for temporary file name. |
476 |
# Some parsers can not operate on stdin, so indexer creates temporary file |
477 |
# for parser and its name is passed instead of $1. Take a look into documentation |
478 |
# for other parser types and parsers usage explanation. |
479 |
# Examples: |
480 |
# |
481 |
# from_mime to_mime[charset] [command line [$1]] |
482 |
# |
483 |
#Mime application/msword "text/plain; charset=cp1251" "catdoc $1" |
484 |
#Mime application/x-troff-man text/plain "deroff" |
485 |
#Mime text/x-postscript text/plain "ps2ascii" |
486 |
|
487 |
|
488 |
|
489 |
######################################################################### |
490 |
# Section 4. |
491 |
# Aliases configuration. |
492 |
|
493 |
|
494 |
######################################################################### |
495 |
#Alias <master> <mirror> |
496 |
# You can use this command for example to organize search through |
497 |
# master site by indexing a mirror site. It is also useful to |
498 |
# index your site from local file system. |
499 |
# mnoGoSearch will display URLs from <master> while searching |
500 |
# but go to the <mirror> while indexing. |
501 |
# This command has global indexer.conf file effect. |
502 |
# You may use several aliases in one indexer.conf. |
503 |
#Alias http://www.mysql.com/ http://mysql.udm.net/ |
504 |
#Alias http://www.site.com/ file:/usr/local/apache/htdocs/ |
505 |
|
506 |
|
507 |
######################################################################### |
508 |
#AliasProg <command line> |
509 |
# AliasProg is an external program that can be called, that takes a URL, |
510 |
# and returns the appropriate alias to stdout. Use $1 to pass a URL. This |
511 |
# command has global effect for whole indexer.conf. |
512 |
# Example: |
513 |
#AliasProg "echo $1 | /usr/local/mysql/bin/replace http://localhost/ file:/home/httpd/" |
514 |
|
515 |
|
516 |
####################################################################### |
517 |
# Section 5. |
518 |
# Servers configuration. |
519 |
|
520 |
|
521 |
####################################################################### |
522 |
#Period <time> |
523 |
# Does not matter for built-in text files support |
524 |
# Set reindex period. |
525 |
# <time> is in the form 'xxxA[yyyB[zzzC]]' |
526 |
# (Spaces are allowed between xxx and A and yyy and so on) |
527 |
# where xxx, yyy, zzz are numbers (can be negative!) |
528 |
# A, B, C can be one of the following: |
529 |
# s - second |
530 |
# M - minute |
531 |
# h - hour |
532 |
# d - day |
533 |
# m - month |
534 |
# y - year |
535 |
# (these letters are the same as in strptime/strftime functions) |
536 |
# |
537 |
# Examples: |
538 |
# 15s - 15 seconds |
539 |
# 4h30M - 4 hours and 30 minutes |
540 |
# 1y6m-15d - 1 year and six month minus 15 days |
541 |
# 1h-10M+1s - 1 hour minus 10 minutes plus 1 second |
542 |
# |
543 |
# If you specify only number without any character, it is assumed |
544 |
# that time is given in seconds (this behaviour is for |
545 |
# compatibility with versions prior to 3.1.7). |
546 |
# |
547 |
# Can be set many times before "Server" command and |
548 |
# takes effect till the end of config file or till next Period command. |
549 |
#Period 7d |
550 |
|
551 |
|
552 |
####################################################################### |
553 |
#Tag <string> |
554 |
# Use this field for your own purposes. For example for grouping |
555 |
# some servers into one group, etc... |
556 |
# Can be set multiple times before "Server" command and |
557 |
# takes effect till the end of config file or till next Tag command. |
558 |
# Default value is an empty string |
559 |
|
560 |
|
561 |
####################################################################### |
562 |
#Category <string> |
563 |
#You may distribute documents between nested categories. Category |
564 |
#is a string in hex number notation. You may have up to 5 levels with |
565 |
#256 members per level. Empty category means the root of category tree. |
566 |
#Take a look into doc/categories.txt for more information. |
567 |
#This command means a category on first level: |
568 |
#Category AA |
569 |
#This command means a category on 5th level: |
570 |
#Category FFAABBCCDD |
571 |
|
572 |
|
573 |
####################################################################### |
574 |
#DefaultLang <string> |
575 |
#Default language for server. Can be used if you need language |
576 |
#restriction while doing search. |
577 |
#DefaultLang en |
578 |
|
579 |
|
580 |
####################################################################### |
581 |
#MaxHops <number> |
582 |
# Maximum way in "mouse clicks" from start url. |
583 |
# Default value is 256. |
584 |
# Can be set multiple times before "Server" command and |
585 |
# takes effect till the end of config file or till next MaxHops command. |
586 |
#MaxHops 256 |
587 |
|
588 |
|
589 |
####################################################################### |
590 |
#MaxNetErrors <number> |
591 |
# Maximum network errors for each server. |
592 |
# Default value is 16. Use 0 for unlimited errors number. |
593 |
# If there are too many network errors on some server |
594 |
# (server is down, host unreachable, etc) indexer will try to do |
595 |
# not more than 'number' attempts to connect to this server. |
596 |
# Takes effect till the end of config file or till next MaxNetErrors command. |
597 |
#MaxNetErrors 16 |
598 |
|
599 |
|
600 |
####################################################################### |
601 |
#ReadTimeOut <time> |
602 |
# Connect timeout and stalled connections timeout. |
603 |
# For <time> format see description of Period above. |
604 |
# Default value is 30 seconds. |
605 |
# Can be set any number of times before "Server" command and |
606 |
# takes effect till the end of config file or till next ReadTimeOut command. |
607 |
#ReadTimeOut 30s |
608 |
|
609 |
|
610 |
####################################################################### |
611 |
#DocTimeOut <time> |
612 |
# Maximum amount of time indexer spends for one document downloading. |
613 |
# For <time> format see description of Period above. |
614 |
# Default value is 90 seconds. |
615 |
# Can be set any number of times before "Server" command and |
616 |
# takes effect till the end of config file or till next DocTimeOut command. |
617 |
#DocTimeOut 1M30s |
618 |
|
619 |
|
620 |
######################################################################## |
621 |
#NetErrorDelayTime <time> |
622 |
# Specify document processing delay time if network error has occurred. |
623 |
# For <time> format see description of Period above. |
624 |
# Default value is one day |
625 |
#NetErrorDelayTime 1d |
626 |
|
627 |
|
628 |
####################################################################### |
629 |
#Robots yes/no |
630 |
# Allows/disallows using robots.txt and <META NAME="robots"> |
631 |
# exclusions. Use "no", for example for link validation of your server(s). |
632 |
# Command may be used several times before "Server" command and |
633 |
# takes effect till the end of config file or till next Robots command. |
634 |
# Default value is "yes". |
635 |
#Robots yes |
636 |
|
637 |
|
638 |
####################################################################### |
639 |
#Clones yes/no |
640 |
# Allow/disallow clone eliminating. If allowed, indexer will |
641 |
# detect the same documents under different location, such as |
642 |
# mirrors, and will index only one document from the group of |
643 |
# such equal documents. "Clones yes" also allows to reduce space usage. |
644 |
# Default value is "yes". |
645 |
#Clones yes |
646 |
|
647 |
|
648 |
####################################################################### |
649 |
#BodyWeight <number> |
650 |
# It is better to use a power of 2 as *Weight commands argument. |
651 |
# Refer to "Changing different document part weights at search time" |
652 |
# in doc/search.txt. |
653 |
# |
654 |
# Weight of the words in the <body>...</body> of the html documents |
655 |
# and in the content of the text/plain documents. |
656 |
# Can be set multiple times before "Server" command and |
657 |
# takes effect till the end of config file or till next BodyWeight command. |
658 |
# Default value is 2 |
659 |
#BodyWeight 2 |
660 |
|
661 |
|
662 |
####################################################################### |
663 |
#CrossWeight <number> |
664 |
# Weight of the words in a link to html document (CrossWords). |
665 |
# CrossWords indexing is turned on or off with "CrossWords" command |
666 |
# Default value is 32 |
667 |
#CrossWeight 32 |
668 |
|
669 |
|
670 |
####################################################################### |
671 |
#TitleWeight <number> |
672 |
# Weight of the words in the <title>...</title> |
673 |
# Can be set multiple times before "Server" command and |
674 |
# takes effect till the end of config file or till next TitleWeight command. |
675 |
# Default value is 4 |
676 |
#TitleWeight 4 |
677 |
|
678 |
|
679 |
####################################################################### |
680 |
#KeywordWeight <number> |
681 |
# Weight of the words in the <META NAME="Keywords" Content="..."> |
682 |
# Can be set multiple times before "Server" command and |
683 |
# takes effect till the end of config file or till next KeywordWeight command. |
684 |
# Default value is 8 |
685 |
#KeywordWeight 8 |
686 |
|
687 |
|
688 |
####################################################################### |
689 |
#DescWeight <number> |
690 |
# Weight of the words in the <META NAME="Description" Content="..."> |
691 |
# Can be set multiple times before "Server" command and |
692 |
# takes effect till the end of config file or till next DescWeight command. |
693 |
# Default value is 16 |
694 |
#DescWeight 16 |
695 |
|
696 |
|
697 |
####################################################################### |
698 |
#UrlWeight <number> |
699 |
# Weight of the words in the URL of the documents. |
700 |
# Can be set multiple times before "Server" command and |
701 |
# takes effect till the end of config file or till next UrlWeight command. |
702 |
# Default value is 0 |
703 |
#UrlWeight 0 |
704 |
|
705 |
|
706 |
####################################################################### |
707 |
#UrlHostWeight <number> |
708 |
# Weight of the words in the hostname part of URL of the documents. |
709 |
# Can be set multiple times before "Server" command and |
710 |
# takes effect till the end of config file or till next UrlHostWeight command. |
711 |
# Default value is 0 |
712 |
#UrlHostWeight 0 |
713 |
|
714 |
|
715 |
####################################################################### |
716 |
#UrlPathWeight <number> |
717 |
# Weight of the words in the path part of URL of the documents. |
718 |
# Can be set multiple times before "Server" command and |
719 |
# takes effect till the end of config file or till next UrlPathWeight command. |
720 |
# Default value is 0 |
721 |
#UrlPathWeight 0 |
722 |
|
723 |
|
724 |
####################################################################### |
725 |
#UrlFileWeight <number> |
726 |
# Weight of the words in the filename part of URL of the documents. |
727 |
# Can be set multiple times before "Server" command and |
728 |
# takes effect till the end of config file or till next UrlFileWeight command. |
729 |
# Default value is 0 |
730 |
#UrlFileWeight 0 |
731 |
|
732 |
|
733 |
###################################################################### |
734 |
# Spell checking. You can change the factors of word weight depending on |
735 |
# whether word is found in Ispell dictionaries or not. Setting the |
736 |
# "IspellCorrectFactor" to 0 will prevent indexer from storing words with |
737 |
# right spelling in database. Only incorrect words will be stored |
738 |
# in database in this case. Then you may easily find incorrect words |
739 |
# and the corresponding URLs where those words are found. If no |
740 |
# ispell files are used, all words are considered "incorrect". |
741 |
# |
742 |
#IspellCorrectFactor 1 |
743 |
#IspellIncorrectFactor 1 |
744 |
|
745 |
|
746 |
####################################################################### |
747 |
# Numbers indexing. By default numbers and words which contain both |
748 |
# digits and letters (like "3a","U2") are stored in database. You may change |
749 |
# this behaviour by setting the weight factors to "0". Useful for spell checking |
750 |
# in combination with previous commands. |
751 |
# |
752 |
#NumberFactor 1 |
753 |
#AlnumFactor 1 |
754 |
|
755 |
|
756 |
####################################################################### |
757 |
#DeleteBad yes/no |
758 |
# Use it to choose whether or not to delete bad (not found, forbidden, etc.) URLs |
759 |
# from database. |
760 |
# May be used multiple times before "Server" command and |
761 |
# takes effect till the end of config file or till next DeleteBad command. |
762 |
# Default value is "no", that means do not delete bad URLs. |
763 |
#DeleteBad no |
764 |
|
765 |
|
766 |
####################################################################### |
767 |
#Index yes/no |
768 |
# Prevent indexer from storing words into database. |
769 |
# Useful for example for link validation. |
770 |
# Can be set multiple times before "Server" command and |
771 |
# takes effect till the end of config file or till next Index command. |
772 |
# Default value is "yes". |
773 |
#Index yes |
774 |
|
775 |
|
776 |
####################################################################### |
777 |
#Follow page/path/site/world/no |
778 |
# Set indexer behaviour when deciding whether a URL corresponds to a Server |
779 |
# command. It describes which part of argument given in the following |
780 |
# Server command is to be compared with an URL to decide whether URL |
781 |
# corresponds to the Server command. |
781 |
# "page" means that URL must be the same. It actually describes a web |
783 |
# space which consists of one page. |
784 |
# "path" means URL which is under the same path with Server argument |
785 |
# corresponds to the Server command. |
786 |
# "site" means links from the same host. |
787 |
# "world" means to follow any link. |
788 |
# "no" is the same with "page". |
789 |
# Follow command can be used multiple times before "Server" command and |
790 |
# takes effect till the end of config file or till next Follow command. |
791 |
# Default value is "path". |
792 |
#Follow path |
793 |
|
794 |
|
795 |
####################################################################### |
796 |
#CheckMp3Tag yes/no |
797 |
#Works only with servers that support the HTTP/1.1 protocol. |
798 |
#The "Range: bytes" header is used to download the mp3 tag. |
799 |
#CheckMp3Tag no |
800 |
|
801 |
|
802 |
####################################################################### |
803 |
#IndexMP3TagOnly yes/no |
804 |
#Enabling this option makes indexer check the file for an id3 tag and, |
805 |
#if no id3 tag exists, do nothing. |
806 |
#Also set CheckMp3Tag to yes. |
807 |
#IndexMP3TagOnly no |
808 |
|
809 |
|
810 |
######################################################################## |
811 |
#CharSet <charset> |
812 |
# Useful for 8 bit character sets. |
813 |
# WWW-servers send data in different charsets. |
814 |
#<Charset> is default character set of server in next "Server" command(s). |
815 |
#This is required only for "bad" servers that do not send information |
816 |
#about charset in header: "Content-type: text/html; charset=some_charset" |
817 |
# and do not have <META HTTP-EQUIV="Content-Type" Content="text/html; charset=some_charset"> |
818 |
#Can be set before every "Server" command and |
819 |
# takes effect till the end of config file or till next CharSet command. |
820 |
#CharSet windows-1251 |
821 |
|
822 |
|
823 |
######################################################################### |
824 |
#ProxyAuthBasic login:passwd |
825 |
# Use http proxy basic authorization |
826 |
# Can be used before every "Server" command and |
827 |
# takes effect only for the next "Server" command! |
828 |
# It should be also before "Proxy" command. |
829 |
# Examples: |
830 |
#ProxyAuthBasic somebody:something |
831 |
|
832 |
|
833 |
######################################################################### |
834 |
#Proxy your.proxy.host[:port] |
835 |
# Use proxy rather than connecting directly |
836 |
#One can index ftp servers when using proxy |
837 |
#Default port value if not specified is 3128 (Squid) |
838 |
#If proxy host is not specified direct connect will be used. |
839 |
#Can be set before every "Server" command and |
840 |
# takes effect till the end of config file or till next Proxy command. |
841 |
#If no "Proxy" command is specified, indexer will use a direct connection. |
842 |
# |
843 |
# Examples: |
844 |
# Proxy on atoll.anywhere.com, port 3128: |
845 |
#Proxy atoll.anywhere.com |
846 |
# |
847 |
# Proxy on lota.anywhere.com, port 8090: |
848 |
#Proxy lota.anywhere.com:8090 |
849 |
# |
850 |
# Disable proxy (direct connect): |
851 |
#Proxy |
852 |
|
853 |
|
854 |
######################################################################### |
855 |
#AuthBasic login:passwd |
856 |
# Use basic http authorization |
857 |
# Can be set before every "Server" command and |
858 |
# takes effect only for next one Server command! |
859 |
# Examples: |
860 |
#AuthBasic somebody:something |
861 |
# |
862 |
# If you have password protected directory(ies), but the whole server is open, use: |
863 |
#AuthBasic login1:passwd1 |
864 |
#Server http://my.server.com/my/secure/directory1/ |
865 |
#AuthBasic login2:passwd2 |
866 |
#Server http://my.server.com/my/secure/directory2/ |
867 |
#Server http://my.server.com/ |
868 |
|
869 |
|
870 |
############################################################## |
871 |
# Mirroring parameters commands. |
872 |
# |
873 |
# You may specify a path to root dir to enable sites mirroring |
874 |
#MirrorRoot /path/to/mirror |
875 |
# |
876 |
# You may also specify a root dir for mirrored documents' headers; |
877 |
# indexer will store HTTP headers to local disk too. |
878 |
#MirrorHeadersRoot /path/to/headers |
879 |
# |
880 |
# MirrorPeriod <time> |
881 |
# You may specify a period during which earlier mirrored files |
882 |
# will be used while indexing instead of real downloading. |
883 |
# It is very useful when you do some experiments with mnoGoSearch |
884 |
# indexing the same hosts and do not want much traffic from/to Internet. |
885 |
# If MirrorHeadersRoot is not specified and headers are not stored |
886 |
# to local disk then default Content-Type's given in AddType commands |
887 |
# will be used. |
888 |
# Default value of the MirrorPeriod is -1, which means |
889 |
# "do not use mirrored files". |
890 |
# |
891 |
# For <time> format see Period command description above. |
892 |
# |
893 |
# The command below will force using local copies for one day: |
894 |
#MirrorPeriod 1d |
895 |
|
896 |
|
897 |
######################################################################### |
898 |
#Server [subsection] <URL> [alias] |
899 |
# This is the main command of the indexer.conf file. It's used |
900 |
# to add servers or their parts to be indexed. It also inserts |
901 |
# given URL into database. |
902 |
# For example: |
903 |
#Server http://localhost/ |
904 |
# |
905 |
# You can also specify some path to index server section: |
906 |
#Server http://localhost/subsection/ |
907 |
# or one specific page: |
908 |
#Server http://localhost/path/main.html |
909 |
# |
910 |
# Use optional subsection parameter to specify server's subsection. |
911 |
# It specifies which part of Server command argument is to be compared |
912 |
# with a URL. Check follow.txt for details. |
913 |
# Values of subsection are the same with "Follow" command arguments. |
914 |
# If subsection is not specified current "Follow" value will be used. |
915 |
# If subsection is specified it does not change current "Follow" value |
916 |
# for next "Server" commands without subsection argument. |
917 |
# This example will add /path/ section on localhost: |
918 |
#Server path http://localhost/path/main.html |
919 |
# This example will add whole server: |
920 |
#Server site http://localhost/path/main.html |
921 |
# |
922 |
# You can also specify optional parameter "alias". This example will |
923 |
# index server "http://search.mnogo.ru/" directly from disk instead of |
924 |
# fetching from HTTP server: |
925 |
#Server http://search.mnogo.ru/ file:/home/httpd/search.mnogo.ru/ |
926 |
# |
927 |
# You may use "Server" command as many times as a number of different |
928 |
# servers you want to index. |
929 |
# |
930 |
#Server http://localhost/ |
931 |
Alias http://www.pliva.hr/ http://portal.pliva.hr/ |
932 |
Server http://www.pliva.hr/ |
933 |
Server http://www.pliva.hr/human_health.php?search=all&type=any&show_description=on |
934 |
Server http://www.pliva.hr/animal_health.php?search=all&type=any&show_description=on&show_animals=on |
935 |
|
936 |
######################################################################### |
937 |
#Realm [String|Regex] [Match|NoMatch] <arg> [alias] |
938 |
# It works almost like "Server" command but takes a regular expression or |
939 |
# string wildcards as its argument. String wildcards is the default match type. |
940 |
# For example, if you want to index all HTTP sites in ".ru" domain, use: |
941 |
#Realm http://*.ru/* |
942 |
# The same using "Regex" match: |
943 |
#Realm Regex ^http://.*\.ru/ |
944 |
# Another example. Use this command to index everything outside the .com domain: |
945 |
#Realm NoMatch http://*.com/* |
946 |
# |
947 |
# Optional "alias" argument allows to provide very complicated URL rewrite |
948 |
# more powerful than other aliasing mechanism. Take a look into alias.txt |
949 |
# for "alias" argument usage explanation. |
950 |
|
951 |
|
952 |
######################################################################### |
953 |
#URL http://localhost/path/to/page.html |
954 |
# This command inserts given URL into database. This is useful to add |
955 |
# several entry points to one server. Has no effect if an URL is already |
956 |
# in the database. When inserting, indexer does not do any checking, and this |
957 |
# URL may be deleted at first indexing attempt if URL has no corresponding |
958 |
# Server command or disallowed by rules given in Allow/Disallow |
959 |
# commands. |
960 |
# |
961 |
#This command will add /main/index.html page: |
962 |
#URL http://localhost/main/index.html |
963 |
|