/[SWISH-PlusPlus]/trunk/swish++/config.h
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/swish++/config.h

Parent Directory | Revision Log


Revision 11 - (show annotations)
Sun Dec 5 13:30:57 2004 UTC (16 years, 1 month ago) by dpavlin
File MIME type: text/plain
File size: 10308 byte(s)
support for meta in data, relaxed swish++ config, distribution cleanup

1 /*
2 ** SWISH++
3 ** config.h
4 **
5 ** Copyright (C) 1998 Paul J. Lucas
6 **
7 ** This program is free software; you can redistribute it and/or modify
8 ** it under the terms of the GNU General Public License as published by
9 ** the Free Software Foundation; either version 2 of the License, or
10 ** (at your option) any later version.
11 **
12 ** This program is distributed in the hope that it will be useful,
13 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 ** GNU General Public License for more details.
16 **
17 ** You should have received a copy of the GNU General Public License
18 ** along with this program; if not, write to the Free Software
19 ** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 */
21
22 #ifndef config_H
23 #define config_H
24
25 ////////// Word determination /////////////////////////////////////////////////
26
27 int const Word_Hard_Min_Size = 1;
28 int const Word_Hard_Max_Size = 9999;
29 // The minimum and maximum lengths a word must be in order even to
30 // bother doing more aggressive checks on it to determine if it
31 // should be indexed.
32
33 int const Word_Min_Size = 1;
34 // The minimum length a non-acronym word must be in order to be
35 // considered for indexing.
36
37 int const Word_Min_Vowels = 0;
38 // The minimum number of vowels a word must have in order to be
39 // indexed.
40
41 int const Word_Hex_Max_Size = 9999;
42 // The maximum length a string composed entirely of hexadecimal
43 // digits i.e., ASCII hex data, can be before it is discarded.
44 // Note that the word "cafe" is a legitimate English word composed
45 // entirely of hexadecimal digits. This parameter is used only by
46 // extract(1) in extract.c.
47
48 // I don't think there is a word in English that has more than...
49
50 int const Word_Max_Consec_Consonants = 9999;
51 // ...this many consecutive consonants (like "symphysis")
52
53 int const Word_Max_Consec_Vowels = 9999;
54 // ...this many consecutive vowels (like "queueing")
55
56 int const Word_Max_Consec_Same = 9999;
57 // ...this many of the same alphabetic character consecutively
58
59 int const Word_Max_Consec_Puncts = 9999;
60 // ...this many punctuation characters in a row
61
62 // Characters that are permissible in words: letters must be lower case and
63 // upper case letters would be redundant.
64 //
65 char const Word_Chars[] = "&'-0123456789abcdefghijklmnopqrstuvwxyz_";
66 // Characters that may be in a word. Note that '&' is here so
67 // acronyms like "AT&T" are treated as one word. Unlike SWISH-E,
68 // ';' does not need to be here to recognize and convert character
69 // entity references.
70
71 #define OPTIMIZE_WORD_CHARS 1
72 // If you are using the default set of characters, that is the
73 // alphanumerics and "&'-_" characters, then having this macro set
74 // to 1 will optimize the is_word_char() function yielding about a
75 // 10% performance improvement; alternatively, you can also edit
76 // that function to keep the optimization if you are not using the
77 // default set of characters. See word_util.h for details.
78
79 char const Word_Begin_Chars[] = "0123456789abcdefghijklmnopqrstuvwxyz";
80 // Characters that may begin a word; should be a subset of the
81 // above.
82
83 #define OPTIMIZE_WORD_BEGIN_CHARS 1
84 // Same deal as with OPTIMIZE_WORD_CHARS.
85
86 char const Word_End_Chars[] = "0123456789abcdefghijklmnopqrstuvwxyz";
87 // Characters that may end a word; usually the same as the above.
88
89 #define OPTIMIZE_WORD_END_CHARS 1
90 // Same deal as with OPTIMIZE_WORD_CHARS.
91
92 #ifdef SEARCH_DAEMON
93 ////////// Search server daemon parameters ////////////////////////////////////
94
95 char const SocketFile_Default[] = "/tmp/search.socket";
96 // Default name of the Unix domain socket file; this can be
97 // overridden either in a config. file or on the command line.
98
99 int const SocketPort_Default = 1967;
100 // Default port number of the TCP socket; this can be overridden
101 // either in a config. file or on the command line.
102
103 int const SocketQueueSize_Default = 511;
104 // Maximum number of queued connections for a socket. From
105 // [Stevens 1998], p. 96:
106 //
107 // Historically, sample code always shows a backlog of 5,
108 // as that was the maximum value supported by 4.2BSD.
109 // This was adequate in the 1980s when busy servers would
110 // handle only a few hundred connections per day. But
111 // with the growth of the World Wide Web (WWW), where busy
112 // servers handle millions of connections per day, this
113 // small number is completely inadequate. Busy HTTP
114 // servers must specify a much larger backlog, and newer
115 // kernels must support larger values.
116 //
117 // Unfortunately, Stevens doesn't say what a good value is. The
118 // default 511 value is taken from httpd.h in Apache:
119 //
120 // It defaults to 511 instead of 512 because some systems
121 // store it as an 8-bit datatype; 512 truncated to 8-bits
122 // is 0, while 511 is 255 when truncated.
123 //
124 // If it's good enough for Apache, it's good enough for us. This
125 // can be overridden either in a config. file or on the command
126 // line.
127
128 int const SocketTimeout_Default = 10; // seconds
129 // The number of seconds a client has to complete a search request
130 // before being disconnected. This is to prevent a client from
131 // connecting, not completing a request, and causing the thread
132 // servicing the request to wait forever. This can be overridden
133 // either in a config. file or on the command line.
134
135 int const ThreadsMin_Default = 5;
136 // The minimum number of simultaneous threads; this can be
137 // overridden either in a config. file or on the command line.
138
139 int const ThreadsMax_Default = 100;
140 // The maximum number of simultaneous threads; this can be
141 // overridden either in a config. file or on the command line.
142
143 int const ThreadTimeout_Default = 30; // seconds
144 // The number of seconds until an idle spare thread times out and
145 // destroys itself. This can be overridden either in a config.
146 // file or on the command line.
147
148 char const User_Default[] = "nobody";
149 char const Group_Default[] = "nobody";
150 // The user and group to switch to after initialization (if root
151 // to begin with). This can be overridden either in a config.
152 // file or on the command line.
153 #endif
154
155 ////////// Miscellaneous parameters ///////////////////////////////////////////
156
157 char const ConfigFile_Default[] = "swish++.conf";
158 // Default name of the configuration file; this can be overridden
159 // on the command line.
160
161 char const ExtractExtension_Default[] = "txt";
162 // Default extension to append to filenames during extraction.
163 // This can be overridden either in a config. file or on the
164 // command line.
165
166 int const FilesGrow_Default = 100;
167 // Default number of files to grow reserved space for when
168 // incrementally indexing. This can be overridden either in a
169 // config. file or on the command line.
170
171 int const FilesReserve_Default = 1000;
172 // Default maximum number of files to reserve space for; see
173 // file_info.c for details. This can be overridden either in a
174 // config. file or on the command line.
175
176 int const Fork_Attempts = 5;
177 // Number of times to try to fork before giving up. This
178 // parameter is used only in filter.c.
179
180 int const Fork_Sleep = 5; // seconds
181 // Number of seconds to sleep before retrying to fork. This
182 // parameter is used only in filter.c.
183
184 char const IndexFile_Default[] = "swish++.index";
185 // Default name of the index file generated/searched; can be
186 // overridden either in a config. file or on the command line.
187
188 int const ResultsMax_Default = 999999;
189 // Default maximum number of search results; this can be
190 // overridden either in a config. file or on the command line.
191
192 char const ShellFilenameDelimChars[] = " \t&;<>|";
193 // Characters in a Unix shell command that delimit file names.
194 // Note that this says "file" (not "path") names.
195
196 char const ShellFilenameEscapeChars[] = " !\"#$&'()*/;<>?[\\]^`{|}~";
197 // Characters in a file name that must be escaped when passed to a
198 // Unix shell. This is a superset of the usual "meta-characters"
199 // because whitespace is included (the list above has the space but
200 // not the tab). Note again that this says "file" (not "path") name.
201
202 #ifdef __CYGWIN__
203 char const TempDirectory_Default[] = "/temp";
204 #else
205
206 char const TempDirectory_Default[] = "/tmp";
207 #endif
208 // Default directory to use for temporary files during indexing.
209 // If your OS mounts swap space via /tmp (e.g., Solaris), as
210 // indexing progresses and more files get created in /tmp, you
211 // will have less swap space, indexing will get slower, and you
212 // may run out of memory. If this is the case, you can either
213 // change this default here for all users (preferred) or override
214 // it either in a config. file or on the command line to use a
215 // directory on a real filesystem, i.e., one on a physical disk,
216 // e.g., /var/tmp on some OSs. The directory must exist.
217
218 int const TitleLines_Default = 12;
219 // Specifies the maximum number of lines into a file for its
220 // "title" (whatever that means for a given file format); this can
221 // be overridden either in a config. file or on the command line.
222
223 int const Title_Max_Size = 200;
224 // Maximum length of a file "title" (whatever that means for a
225 // given file format).
226
227 #ifdef FEATURE_word_pos
228 int const WordsNear_Default = 10;
229 // The maximum number of words apart two words can be to be
230 // considered "near" each other; this can be overridden either in
231 // a config. file or on the command line.
232 #endif
233
234 int const WordPercentMax_Default = 100;
235 // Default maximum percentage of files a word may occur in before
236 // it is discarded as being too frequent; this can be overridden
237 // either in a config. file or on the command line.
238
239 int const WordThreshold_Default = 250000;
240 // The word count past which partial indices are generated and
241 // merged since all the words are too big to fit into memory at
242 // the same time. If you index and your machine begins to swap
243 // like mad, lower this value. The above works OK in a 64MB
244 // machine. A rule of thumb is to add 250000 words for each
245 // additional 64MB of RAM you have. These numbers are for a SPARC
246 // machine running Solaris. Other machines running other
247 // operating systems use memory differently. You simply have to
248 // experiment. Only the super-user can increase this either in a
249 // config. file or on the command line.
250
251 #endif /* config_H */
252 /* vim:set noet sw=8 ts=8: */

  ViewVC Help
Powered by ViewVC 1.1.26