1 |
/************************************************************************************************* |
2 |
* The core API of Hyper Estraier |
3 |
* Copyright (C) 2004-2005 Mikio Hirabayashi |
4 |
* This file is part of Hyper Estraier. |
5 |
* Hyper Estraier is free software; you can redistribute it and/or modify it under the terms of |
6 |
* the GNU Lesser General Public License as published by the Free Software Foundation; either |
7 |
* version 2.1 of the License or any later version. Hyper Estraier is distributed in the hope |
8 |
* that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of |
9 |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public |
10 |
* License for more details. |
11 |
* You should have received a copy of the GNU Lesser General Public License along with Hyper |
12 |
* Estraier; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, |
13 |
* Boston, MA 02111-1307 USA. |
14 |
*************************************************************************************************/ |
15 |
|
16 |
|
17 |
#ifndef _ESTRAIER_H /* duplication check */ |
18 |
#define _ESTRAIER_H |
19 |
|
20 |
#if defined(__cplusplus) /* export for C++ */ |
21 |
extern "C" { |
22 |
#endif |
23 |
|
24 |
|
25 |
|
26 |
/************************************************************************************************* |
27 |
* common settings |
28 |
*************************************************************************************************/ |
29 |
|
30 |
|
31 |
/* version string of Hyper Estraier */
extern const char *est_version;
33 |
|
34 |
|
35 |
|
36 |
/************************************************************************************************* |
37 |
* underlying headers |
38 |
*************************************************************************************************/ |
39 |
|
40 |
|
41 |
#include <depot.h> |
42 |
#include <curia.h> |
43 |
#include <cabin.h> |
44 |
#include <villa.h> |
45 |
#include <stdlib.h> |
46 |
|
47 |
|
48 |
|
49 |
/************************************************************************************************* |
50 |
* API for document |
51 |
*************************************************************************************************/ |
52 |
|
53 |
|
54 |
/* Names of document attributes. */
#define ESTDATTRID     "@id"      /* attribute of ID */
#define ESTDATTRURI    "@uri"     /* attribute of URI */
#define ESTDATTRCDATE  "@cdate"   /* attribute of creation date */
#define ESTDATTRMDATE  "@mdate"   /* attribute of modification date */
#define ESTDATTRTITLE  "@title"   /* attribute of title */
#define ESTDATTRAUTHOR "@author"  /* attribute of author */
#define ESTDATTRTYPE   "@type"    /* attribute of content type */
#define ESTDATTRLANG   "@lang"    /* attribute of language */
#define ESTDATTRSIZE   "@size"    /* attribute of entity size */
63 |
|
64 |
typedef struct { /* type of structure for a document */ |
65 |
int id; /* identification number */ |
66 |
CBMAP *attrs; /* map of attributes */ |
67 |
CBLIST *dtexts; /* list of shown text */ |
68 |
} ESTDOC; |
69 |
|
70 |
|
71 |
/* Create a document object. |
72 |
The return value is an object of a document. */ |
73 |
ESTDOC *est_doc_new(void); |
74 |
|
75 |
|
76 |
/* Create a document object made from draft data. |
77 |
`draft' specifies a string of draft data. |
78 |
The return value is an object of a document. */ |
79 |
ESTDOC *est_doc_new_from_draft(const char *draft); |
80 |
|
81 |
|
82 |
/* Destroy a document object. |
83 |
`doc' specifies a document object. */ |
84 |
void est_doc_delete(ESTDOC *doc); |
85 |
|
86 |
|
87 |
/* Add an attribute to a document object. |
88 |
`doc' specifies a document object. |
89 |
`name' specifies the name of an attribute. |
90 |
`value' specifies the value of the attribute. If it is `NULL', the attribute is removed. */ |
91 |
void est_doc_add_attr(ESTDOC *doc, const char *name, const char *value); |
92 |
|
93 |
|
94 |
/* Add a sentence of text to a document object. |
95 |
`doc' specifies a document object. |
96 |
`text' specifies a sentence of text. */ |
97 |
void est_doc_add_text(ESTDOC *doc, const char *text); |
98 |
|
99 |
|
100 |
/* Add a hidden sentence to a document object. |
101 |
`doc' specifies a document object. |
102 |
`text' specifies a hidden sentence. */ |
103 |
void est_doc_add_hidden_text(ESTDOC *doc, const char *text); |
104 |
|
105 |
|
106 |
/* Get the ID number of a document object. |
107 |
`doc' specifies a document object. |
108 |
The return value is the ID number of the document object. If the object has never been |
109 |
registered, -1 is returned. */ |
110 |
int est_doc_id(ESTDOC *doc); |
111 |
|
112 |
|
113 |
/* Get a list of attribute names of a document object. |
114 |
`doc' specifies a document object. |
115 |
The return value is a new list object of attribute names of the document object. Because |
116 |
the object of the return value is opened with the function `cblistopen', it should be closed |
117 |
with the function `cblistclose' if it is no longer in use. */ |
118 |
CBLIST *est_doc_attr_names(ESTDOC *doc); |
119 |
|
120 |
|
121 |
/* Get the value of an attribute of a document object. |
122 |
`doc' specifies a document object. |
123 |
`name' specifies the name of an attribute. |
124 |
The return value is the value of the attribute or `NULL' if it does not exist. The life |
125 |
duration of the returned string is synchronous with the one of the document object. */ |
126 |
const char *est_doc_attr(ESTDOC *doc, const char *name); |
127 |
|
128 |
|
129 |
/* Get a list of sentences of the text of a document object. |
130 |
`doc' specifies a document object. |
131 |
The return value is a list object of sentences of the text of the document object. The life |
132 |
duration of the returned object is synchronous with the one of the document object. */ |
133 |
const CBLIST *est_doc_texts(ESTDOC *doc); |
134 |
|
135 |
|
136 |
/* Concatenate sentences of the text of a document object. |
137 |
`doc' specifies a document object. |
138 |
The return value is concatenated sentences of a document object. Because the region of the |
139 |
return value is allocated with the `malloc' call, it should be released with the `free' call |
140 |
if it is no longer in use. */ |
141 |
char *est_doc_cat_texts(ESTDOC *doc); |
142 |
|
143 |
|
144 |
/* Dump draft data of a document object. |
145 |
`doc' specifies a document object. |
146 |
The return value is draft data of a document object. Because the region of the return value |
147 |
is allocated with the `malloc' call, it should be released with the `free' call if it is no |
148 |
longer in use. */ |
149 |
char *est_doc_dump_draft(ESTDOC *doc); |
150 |
|
151 |
|
152 |
/* Make a snippet of the body text of a document object. |
153 |
`doc' specifies a document object. |
154 |
`word' specifies a list object of words to be highlight. |
155 |
`wwitdh' specifies whole width of the result. |
156 |
`hwitdh' specifies width of strings picked up from the beginning of the text. |
157 |
`awitdh' specifies width of strings picked up around each highlighted word. |
158 |
The return value is a snippet string of the body text of a document object. There are tab |
159 |
separated values. Each line is a string to be shown. Though most lines have only one field, |
160 |
some lines have two fields. If the second field exists, the first field is to be shown with |
161 |
highlighted, and the second field means its normalized form. Because the region of the |
162 |
return value is allocated with the `malloc' call, it should be released with the `free' call |
163 |
if it is no longer in use. */ |
164 |
char *est_doc_make_snippet(ESTDOC *doc, const CBLIST *words, int wwidth, int hwidth, int awidth); |
165 |
|
166 |
|
167 |
/* Check whether the text of a document object includes every specified words. |
168 |
`doc' specifies a document object. |
169 |
`word' specifies a list object of words to be checked. |
170 |
The return value is true if every specified words is found, else it is false. */ |
171 |
int est_doc_scan_words(ESTDOC *doc, const CBLIST *words); |
172 |
|
173 |
|
174 |
|
175 |
/************************************************************************************************* |
176 |
* API for search conditions |
177 |
*************************************************************************************************/ |
178 |
|
179 |
|
180 |
/* Special operators of the search phrase. */
#define ESTOPUVSET   "[UVSET]"    /* universal set */
#define ESTOPSIMILAR "[SIMILAR]"  /* similarity search */
182 |
|
183 |
/* Set operators of the search phrase. */
#define ESTOPUNION "OR"      /* union (disjunction) */
#define ESTOPISECT "AND"     /* intersection (conjunction) */
#define ESTOPDIFF  "ANDNOT"  /* difference (intersection with negation) */
#define ESTOPWITH  "WITH"    /* delimiter for elements */
187 |
|
188 |
/* Operators for string values. */
#define ESTOPSTREQ  "STREQ"   /* string is equal */
#define ESTOPSTRNE  "STRNE"   /* string is not equal */
#define ESTOPSTRINC "STRINC"  /* string is included in */
#define ESTOPSTRBW  "STRBW"   /* string begins with */
#define ESTOPSTREW  "STREW"   /* string ends with */
/* Operators for number or date values. */
#define ESTOPNUMEQ  "NUMEQ"   /* number or date is equal */
#define ESTOPNUMNE  "NUMNE"   /* number or date is not equal */
#define ESTOPNUMGT  "NUMGT"   /* number or date is greater than */
#define ESTOPNUMGE  "NUMGE"   /* number or date is greater than or equal to */
#define ESTOPNUMLT  "NUMLT"   /* number or date is less than */
#define ESTOPNUMLE  "NUMLE"   /* number or date is less than or equal to */
/* Operator for regular expressions. */
#define ESTOPREGEX  "REGEX"   /* string matches regular expressions */
200 |
|
201 |
/* Keywords of the sorting order. */
#define ESTORDSTRA "STRA"  /* strings, ascending */
#define ESTORDSTRD "STRD"  /* strings, descending */
#define ESTORDNUMA "NUMA"  /* numbers, ascending */
#define ESTORDNUMD "NUMD"  /* numbers, descending */
205 |
|
206 |
typedef struct { /* type of structure for search conditions */ |
207 |
char *phrase; /* search phrase */ |
208 |
int gstep; /* step of N-gram */ |
209 |
int tfidf; /* whether with TF-IDF tuning */ |
210 |
int simple; /* whether with the simplefied phrase */ |
211 |
CBLIST *attrs; /* conditions with attributes */ |
212 |
char *order; /* sorting order */ |
213 |
int max; /* maximum number of retrieval */ |
214 |
int scfb; /* whether to feed back scores */ |
215 |
int *scores; /* array of scores */ |
216 |
int snum; /* number of elemnts of the score array */ |
217 |
int opts; /* options for preservation */ |
218 |
} ESTCOND; |
219 |
|
220 |
/* Options of a condition object. */
enum {
  ESTCONDSURE   = 1 << 0,   /* check every N-gram key */
  ESTCONDUSU    = 1 << 1,   /* check N-gram keys skipping by one */
  ESTCONDFAST   = 1 << 2,   /* check N-gram keys skipping by two */
  ESTCONDAGIT   = 1 << 3,   /* check N-gram keys skipping by three */
  ESTCONDNOIDF  = 1 << 4,   /* without TF-IDF tuning */
  ESTCONDSIMPLE = 1 << 10,  /* with the simplified phrase */
  ESTCONDSCFB   = 1 << 30   /* feed back scores (for debug) */
};
229 |
|
230 |
|
231 |
/* Create a condition object. |
232 |
The return value is an object of search conditions. */ |
233 |
ESTCOND *est_cond_new(void); |
234 |
|
235 |
|
236 |
/* Destroy a condition object. |
237 |
`cond' specifies a condition object. */ |
238 |
void est_cond_delete(ESTCOND *cond); |
239 |
|
240 |
|
241 |
/* Set a search phrase to a condition object. |
242 |
`cond' specifies a condition object. |
243 |
`phrase' specifies a search phrase. */ |
244 |
void est_cond_set_phrase(ESTCOND *cond, const char *phrase); |
245 |
|
246 |
|
247 |
/* Add an expression for an attribute to a condition object. |
248 |
`cond' specifies a condition object. |
249 |
`expr' specifies an expression for an attribute. */ |
250 |
void est_cond_add_attr(ESTCOND *cond, const char *expr); |
251 |
|
252 |
|
253 |
/* Set the order of a condition object. |
254 |
`cond' specifies a condition object. |
255 |
`expr' specifies an expression for the order. |
256 |
By default, the order is by score descending. */ |
257 |
void est_cond_set_order(ESTCOND *cond, const char *expr); |
258 |
|
259 |
|
260 |
/* Set the maximum number of retrieval of a condition object. |
261 |
`cond' specifies a condition object. |
262 |
`max' specifies the maximum number of retrieval. |
263 |
By default, the number of retrieval is not limited. */ |
264 |
void est_cond_set_max(ESTCOND *cond, int max); |
265 |
|
266 |
|
267 |
/* Set options of retrieval of a condition object. |
268 |
`cond' specifies a condition object. |
269 |
`options' specifies options: `ESTCONDSURE' specifies that it checks every N-gram key, |
270 |
`ESTCONDUSU', which is the default, specifies that it checks N-gram keys with skipping one |
271 |
key, `ESTCONDFAST' skips two keys, `ESTCONDAGIT' skips three keys, `ESTCONDNOIDF' specifies |
272 |
not to perform TF-IDF tuning, `ESTCONDSIMPLE' specifies to use simplefied phrase. Each option |
273 |
can be specified at the same time by bitwise or. If keys are skipped, though search speed is |
274 |
improved, the relevance ratio grows less. */ |
275 |
void est_cond_set_options(ESTCOND *cond, int options); |
276 |
|
277 |
|
278 |
|
279 |
/************************************************************************************************* |
280 |
* API for database |
281 |
*************************************************************************************************/ |
282 |
|
283 |
|
284 |
/* max number of the inverted index (capacity of the handle array of ESTIDX) */
#define ESTIDXDMAX 16
285 |
|
286 |
typedef struct { /* type of structure for the inverted index */ |
287 |
char *name; /* name of the database */ |
288 |
int omode; /* open mode */ |
289 |
VILLA *dbs[ESTIDXDMAX]; /* database handles */ |
290 |
int dnum; /* number of division */ |
291 |
VILLA *cdb; /* current database handle */ |
292 |
} ESTIDX; |
293 |
|
294 |
typedef struct { /* type of structure for a database object */ |
295 |
char *name; /* name of the database */ |
296 |
DEPOT *metadb; /* handle of the meta database */ |
297 |
ESTIDX *idxdb; /* handles of the inverted indexs */ |
298 |
VILLA *fwmdb; /* handle of the database for forward matching */ |
299 |
CURIA *attrdb; /* handle of the database for attrutes */ |
300 |
CURIA *textdb; /* handle of the database for texts */ |
301 |
VILLA *listdb; /* handle of the database for document list */ |
302 |
int ecode; /* last happened error code */ |
303 |
int fatal; /* whether to have a fatal error */ |
304 |
int dseq; /* sequence for document IDs */ |
305 |
int dnum; /* number of the documents */ |
306 |
int amode; /* mode of text analyzer */ |
307 |
CBMAP *idxcc; /* cache for the inverted index */ |
308 |
size_t icsiz; /* power of the cache */ |
309 |
size_t icmax; /* max size of the cache */ |
310 |
CBMAP *outcc; /* cache for deleted documents */ |
311 |
CBMAP *keycc; /* cache for keys for TF-IDF */ |
312 |
int kcmnum; /* max number of the key cache */ |
313 |
CBMAP *attrcc; /* cache for attributes */ |
314 |
int acmnum; /* max number of the attribute cache */ |
315 |
CBMAP *textcc; /* cache for texts */ |
316 |
int tcmnum; /* max number of the text cache */ |
317 |
CBMAP *spacc; /* special cache for attributes */ |
318 |
int scmnum; /* max number of the special cache */ |
319 |
char *scname; /* name of the attribute for the special cache */ |
320 |
void (*cbinfo)(const char *); /* callback function to inform of events */ |
321 |
CBMAP *(*cbvec)(void *, int, void *); /* callback function to create a vector */ |
322 |
void *vecdata; /* arbitrary object for the vectorizer */ |
323 |
CBMAP *metacc; /* cache for meta data */ |
324 |
} ESTDB; |
325 |
|
326 |
/* Error codes. */
enum {
  ESTENOERR  = 0,    /* no error */
  ESTEINVAL  = 1,    /* invalid argument */
  ESTEACCES  = 2,    /* access forbidden */
  ESTELOCK   = 3,    /* lock failure */
  ESTEDB     = 4,    /* database problem */
  ESTEIO     = 5,    /* I/O problem */
  ESTENOITEM = 6,    /* no item */
  ESTEMISC   = 9999  /* miscellaneous */
};
336 |
|
337 |
/* Open modes of a database. */
enum {
  ESTDBREADER = 1 << 0,  /* open as a reader */
  ESTDBWRITER = 1 << 1,  /* open as a writer */
  ESTDBCREAT  = 1 << 2,  /* a writer creating */
  ESTDBTRUNC  = 1 << 3,  /* a writer truncating */
  ESTDBNOLCK  = 1 << 4,  /* open without locking */
  ESTDBLCKNB  = 1 << 5,  /* lock without blocking */
  ESTDBPERFNG = 1 << 6   /* use perfect N-gram analyzer */
};
346 |
|
347 |
/* Options of document registration. */
enum {
  /* clean up dispensable regions */
  ESTPDCLEAN = 1 << 0
};
350 |
|
351 |
/* Options of document deletion. */
enum {
  /* clean up dispensable regions */
  ESTODCLEAN = 1 << 0
};
354 |
|
355 |
/* Options of optimization. */
enum {
  /* omit purging dispensable region of deleted */
  ESTOPTNOPURGE = 1 << 0,
  /* omit optimization of the database files */
  ESTOPTNODBOPT = 1 << 1
};
359 |
|
360 |
/* Options of document retrieval. */
enum {
  /* no attributes */
  ESTGDNOATTR = 1 << 0,
  /* no text */
  ESTGDNOTEXT = 1 << 1
};
364 |
|
365 |
|
366 |
/*
 * Get the string of an error code.
 * Parameters:
 *   ecode - an error code.
 * Returns: the string of the error code.
 */
const char *est_err_msg(int ecode);
370 |
|
371 |
|
372 |
/* Open a database. |
373 |
`name' specifies the name of a database directory. |
374 |
`mode' specifies open modes: `ESTDBWRITER' as a writer, `ESTDBREADER' as a reader. If the |
375 |
mode is `ESTDBWRITER', the following may be added by bitwise or: `ESTDBCREAT', which means it |
376 |
creates a new database if not exist, `ESTDBTRUNC', which means it creates a new database |
377 |
regardless if one exists. Both of `ESTDBREADER' and `ESTDBWRITER' can be added to by |
378 |
bitwise or: `ESTDBNOLCK', which means it opens a database file without file locking, or |
379 |
`ESTDBLCKNB', which means locking is performed without blocking. If `ESTDBNOLCK' is used, |
380 |
the application is responsible for exclusion control. `ESTDBCREAT' can be added to by bitwise |
381 |
or: `ESTDBPERFNG', which means N-gram analysis is performed against Europian text also. |
382 |
`ecp' specifies the pointer to a variable to which the error code is assigned. |
383 |
The return value is a database object of the database or `NULL' if failure. */ |
384 |
ESTDB *est_db_open(const char *name, int omode, int *ecp); |
385 |
|
386 |
|
387 |
/* Close a database. |
388 |
`db' specifies a database object. |
389 |
`ecp' specifies the pointer to a variable to which the error code is assigned. |
390 |
The return value is true if success, else it is false. */ |
391 |
int est_db_close(ESTDB *db, int *ecp); |
392 |
|
393 |
|
394 |
/* Get the last happended error code of a database. |
395 |
`db' specifies a database object. |
396 |
The return value is the last happended error code of the database. */ |
397 |
int est_db_error(ESTDB *db); |
398 |
|
399 |
|
400 |
/* Check whether a database has a fatal error. |
401 |
`db' specifies a database object. |
402 |
The return value is true if the database has fatal erroor, else it is false. */ |
403 |
int est_db_fatal(ESTDB *db); |
404 |
|
405 |
|
406 |
/* Flush index words in the cache of a database. |
407 |
`db' specifies a database object connected as a writer. |
408 |
`max' specifies the maximum number of words to be flushed. If it not more than zero, all |
409 |
words are flushed. |
410 |
The return value is true if success, else it is false. */ |
411 |
int est_db_flush(ESTDB *db, int max); |
412 |
|
413 |
|
414 |
/* Synchronize updating contents of a database. |
415 |
`db' specifies a database object connected as a writer. |
416 |
The return value is true if success, else it is false. */ |
417 |
int est_db_sync(ESTDB *db); |
418 |
|
419 |
|
420 |
/* Optimize a database. |
421 |
`db' specifies a database object connected as a writer. |
422 |
`options' specifies options: `ESTOPTNOPURGE' to omit purging dispensable region of deleted |
423 |
documents, `ESTOPTNODBOPT' to omit optimizization of the database files. The two can be |
424 |
specified at the same time by bitwise or. |
425 |
The return value is true if success, else it is false. */ |
426 |
int est_db_optimize(ESTDB *db, int options); |
427 |
|
428 |
|
429 |
/* Add a document to a database. |
430 |
`db' specifies a database object connected as a writer. |
431 |
`doc' specifies a document object. The document object should have the URI attribute. |
432 |
`options' specifies options: `ESTPDCLEAN' to clean up dispensable regions of the overwritten |
433 |
document. |
434 |
The return value is true if success, else it is false. |
435 |
If the URI attribute is same with an existing document in the database, the existing one is |
436 |
deleted. */ |
437 |
int est_db_put_doc(ESTDB *db, ESTDOC *doc, int options); |
438 |
|
439 |
|
440 |
/* Remove a document from a database. |
441 |
`db' specifies a database object connected as a writer. |
442 |
`id' specifies the ID number of a registered document. |
443 |
`options' specifies options: `ESTODCLEAN' to clean up dispensable regions of the deleted |
444 |
document. |
445 |
The return value is true if success, else it is false. */ |
446 |
int est_db_out_doc(ESTDB *db, int id, int options); |
447 |
|
448 |
|
449 |
/* Retrieve a document in a database. |
450 |
`db' specifies a database object. |
451 |
`id' specifies the ID number of a registered document. |
452 |
`options' specifies options: `ESTGDNOATTR' to ignore attributes, `ESTGDNOTEXT' to ignore |
453 |
the body text. The two can be specified at the same time by bitwise or. |
454 |
The return value is a document object. On error, `NULL' is returned. */ |
455 |
ESTDOC *est_db_get_doc(ESTDB *db, int id, int options); |
456 |
|
457 |
|
458 |
/* Retrieve the value of an attribute of a document in a database. |
459 |
`db' specifies a database object. |
460 |
`id' specifies the ID number of a registered document. |
461 |
`name' specifies the name of an attribute. |
462 |
The return value is the value of the attribute or `NULL' if it does not exist. Because the |
463 |
region of the return value is allocated with the `malloc' call, it should be released with |
464 |
the `free' call if it is no longer in use. */ |
465 |
char *est_db_get_doc_attr(ESTDB *db, int id, const char *name); |
466 |
|
467 |
|
468 |
/* Get the ID of a document spacified by URI. |
469 |
`db' specifies a database object. |
470 |
`uri' specifies the URI of a registered document. |
471 |
The return value is the ID of the document. On error, -1 is returned. */ |
472 |
int est_db_uri_to_id(ESTDB *db, const char *uri); |
473 |
|
474 |
|
475 |
/* Extract keywords of a document object. |
476 |
`db' specifies a database object for TF-IDF tuning. If it is `NULL', it is not used. |
477 |
`doc' specifies a document object. |
478 |
`max' specifies the maximum number of keywords to be extracted. |
479 |
The return value is a new map object of keywords and their scores in decimal string. Because |
480 |
the object of the return value is opened with the function `cbmapopen', it should be closed |
481 |
with the function `cbmapclose' if it is no longer in use. */ |
482 |
CBMAP *est_db_etch_doc(ESTDB *db, ESTDOC *doc, int max); |
483 |
|
484 |
|
485 |
/* Initialize the iterator of a database. |
486 |
`db' specifies a database object. |
487 |
The return value is true if success, else it is false. */ |
488 |
int est_db_iter_init(ESTDB *db); |
489 |
|
490 |
|
491 |
/* Get the next ID of the iterator of a database. |
492 |
`db' specifies a database object. |
493 |
The return value is the next ID. If there is no more document, 0 is returned. On error, |
494 |
-1 is returned. */ |
495 |
int est_db_iter_next(ESTDB *db); |
496 |
|
497 |
|
498 |
/* Get the name of a database. |
499 |
`db' specifies a database object. |
500 |
The return value is the name of the database. The life duration of the returned string is |
501 |
synchronous with the one of the database object. */ |
502 |
const char *est_db_name(ESTDB *db); |
503 |
|
504 |
|
505 |
/* Get the number of documents in a database. |
506 |
`db' specifies a database object. |
507 |
The return value is the number of documents in the database. */ |
508 |
int est_db_doc_num(ESTDB *db); |
509 |
|
510 |
|
511 |
/* Get the number of unique words in a database. |
512 |
`db' specifies a database object. |
513 |
The return value is the number of unique words in the database. */ |
514 |
int est_db_word_num(ESTDB *db); |
515 |
|
516 |
|
517 |
/* Get the size of a database. |
518 |
`db' specifies a database object. |
519 |
The return value is the size of the database. */ |
520 |
double est_db_size(ESTDB *db); |
521 |
|
522 |
|
523 |
/* Search documents corresponding a condition for a database. |
524 |
`db' specifies a database object. |
525 |
`cond' specifies a condition object. |
526 |
`nump' specifies the pointer to a variable to which the number of elements in the result is |
527 |
assigned. |
528 |
`hints' specifies a map object into which the number of documents corresponding to each word |
529 |
is stored. If a word is in a negative condition, the number is negative. The element whose |
530 |
key is an empty string specifies the number of whole result. If it is `NULL', it is not used. |
531 |
The return value is an array whose elements are ID numbers of corresponding documents. |
532 |
This function does never fail. Even if no document corresponds or an error occurs, an empty |
533 |
array is returned. Because the region of the return value is allocated with the `malloc' |
534 |
call, it should be released with the `free' call if it is no longer in use. */ |
535 |
int *est_db_search(ESTDB *db, ESTCOND *cond, int *nump, CBMAP *hints); |
536 |
|
537 |
|
538 |
/* Set the maximum size of the cache memory of a database. |
539 |
`db' specifies a database object. |
540 |
`size' specifies the maximum size of the index cache. By default, it is 64MB. If it is not |
541 |
more than 0, the current size is not changed. |
542 |
`anum' specifies the maximum number of cached records for document attributes. By default, it |
543 |
is 8192. If it is not more than 0, the current size is not changed. |
544 |
`tnum' specifies the maximum number of cached records for document texts. By default, it is |
545 |
1024. If it is not more than 0, the current size is not changed. */ |
546 |
void est_db_set_cache_size(ESTDB *db, size_t size, int anum, int tnum); |
547 |
|
548 |
|
549 |
/* Set the special cache for narrowing and sorting with document attributes. |
550 |
`db' specifies a database object. |
551 |
`name' specifies the name of a document. |
552 |
`num' specifies the maximum number of cached records. */ |
553 |
void est_db_set_special_cache(ESTDB *db, const char *name, int num); |
554 |
|
555 |
|
556 |
|
557 |
/************************************************************************************************* |
558 |
* features for experts |
559 |
*************************************************************************************************/ |
560 |
|
561 |
|
562 |
/* Version constants. */
#define _EST_VERSION  "0.5.1"
#define _EST_LIBVER   200
#define _EST_PROTVER  "0.9"
565 |
|
566 |
/* Languages. */
enum {
  ESTLANGEN   = 0,  /* English */
  ESTLANGJA   = 1,  /* Japanese */
  ESTLANGZH   = 2,  /* Chinese */
  ESTLANGKO   = 3,  /* Korean */
  ESTLANGMISC = 4   /* miscellaneous */
};
573 |
|
574 |
|
575 |
/* Break a sentence of text and extract words. |
576 |
`text' specifies a sentence of text. |
577 |
`list' specifies a list object to which extract words are added. |
578 |
`norm' specifies whether to normalize the text. |
579 |
`tail' specifies whether to pick up oddness N-gram at the end. */ |
580 |
void est_break_text(const char *text, CBLIST *list, int norm, int tail); |
581 |
|
582 |
|
583 |
/* Break a sentence of text and extract words using perfect N-gram analyzer. |
584 |
`text' specifies a sentence of text. |
585 |
`list' specifies a list object to which extract words are added. |
586 |
`norm' specifies whether to normalize the text. |
587 |
`tail' specifies whether to pick up oddness N-gram at the end. */ |
588 |
void est_break_text_perfng(const char *text, CBLIST *list, int norm, int tail); |
589 |
|
590 |
|
591 |
/*
 * Convert the character encoding of a string.
 * Parameters:
 *   ptr   - the pointer to a region.
 *   size  - the size of the region.  If it is negative, the size is
 *           assigned with `strlen(ptr)'.
 *   icode - the name of encoding of the input string.
 *   ocode - the name of encoding of the output string.
 *   sp    - the pointer to a variable to which the size of the region of
 *           the return value is assigned.  If it is NULL, it is not used.
 *   mp    - the pointer to a variable to which the number of missing
 *           characters by failure of conversion is assigned.  If it is
 *           NULL, it is not used.
 * Returns: the pointer to the result object if successful, else NULL.  An
 *   additional zero code is appended at the end of the region of the return
 *   value, so the return value can be treated as a character string.  The
 *   region of the return value is allocated with the `malloc' call, so it
 *   should be released with the `free' call when it is no longer in use.
 */
char *est_iconv(const char *ptr, int size, const char *icode, const char *ocode,
                int *sp, int *mp);
608 |
|
609 |
|
610 |
/*
 * Detect the encoding of a string automatically.
 * Parameters:
 *   ptr   - the pointer to a region.
 *   size  - the size of the region.  If it is negative, the size is
 *           assigned with `strlen(ptr)'.
 *   plang - a preferred language.  As for now, `ESTLANGEN', `ESTLANGJA',
 *           `ESTLANGZH', and `ESTLANGKO' are supported.
 * Returns: the string of the encoding name of the string.
 */
const char *est_enc_name(const char *ptr, int size, int plang);
618 |
|
619 |
|
620 |
/*
 * Convert a UTF-8 string into UTF-16BE.
 * Parameters:
 *   ptr  - the pointer to a region.
 *   size - the size of the region.
 *   sp   - the pointer to a variable to which the size of the region of
 *          the return value is assigned.
 * Returns: the pointer to the result object.  An additional zero code is
 *   appended at the end of the region of the return value, so the return
 *   value can be treated as a character string.  The region of the return
 *   value is allocated with the `malloc' call, so it should be released
 *   with the `free' call when it is no longer in use.
 */
char *est_uconv_in(const char *ptr, int size, int *sp);
630 |
|
631 |
|
632 |
/*
 * Convert a UTF-16BE string into UTF-8.
 * Parameters:
 *   ptr  - the pointer to a region.
 *   size - the size of the region.
 *   sp   - the pointer to a variable to which the size of the region of
 *          the return value is assigned.  If it is NULL, it is not used.
 * Returns: the pointer to the result object.  An additional zero code is
 *   appended at the end of the region of the return value, so the return
 *   value can be treated as a character string.  The region of the return
 *   value is allocated with the `malloc' call, so it should be released
 *   with the `free' call when it is no longer in use.
 */
char *est_uconv_out(const char *ptr, int size, int *sp);
642 |
|
643 |
|
644 |
/*
 * Compress a serial object with ZLIB.
 * Parameters:
 *   ptr  - the pointer to a region.
 *   size - the size of the region.  If it is negative, the size is
 *          assigned with `strlen(ptr)'.
 *   sp   - the pointer to a variable to which the size of the region of
 *          the return value is assigned.
 * Returns: the pointer to the result object if successful, else NULL.  The
 *   region of the return value is allocated with the `malloc' call, so it
 *   should be released with the `free' call when it is no longer in use.
 */
char *est_deflate(const char *ptr, int size, int *sp);
654 |
|
655 |
|
656 |
/* Decompress a serial object compressed with ZLIB. |
657 |
`ptr' specifies the pointer to a region. |
658 |
`size' specifies the size of the region. |
659 |
`sp' specifies the pointer to a variable to which the size of the region of the return |
660 |
value is assigned. If it is `NULL', it is not used. |
661 |
If successful, the return value is the pointer to the result object, else, it is `NULL'. |
662 |
Because an additional zero code is appended at the end of the region of the return value, |
663 |
the return value can be treated as a character string. Because the region of the return |
664 |
value is allocated with the `malloc' call, it should be released with the `free' call if it |
665 |
is no longer in use. */ |
666 |
char *est_inflate(const char *ptr, int size, int *sp); |
667 |
|
668 |
|
669 |
/* Get the border string for draft data of documents. |
670 |
The return value is the border string for draft data of documents. */ |
671 |
const char *est_border_str(void); |
672 |
|
673 |
|
674 |
/* Get the real random number. |
675 |
The return value is the real random number between 0.0 and 1.0. */ |
676 |
double est_random(void); |
677 |
|
678 |
|
679 |
/* Get the random number in normal distribution. |
680 |
The return value is the random number in normal distribution between 0.0 and 1.0. */ |
681 |
double est_random_nd(void); |
682 |
|
683 |
|
684 |
/* Get an MD5 hash string of a key string. |
685 |
`key' specifies a string to be encrypted. |
686 |
The return value is an MD5 hash string of the key string. Because the region of the return |
687 |
value is allocated with the `malloc' call, it should be released with the `free' call if it |
688 |
is no longer in use. */ |
689 |
char *est_make_crypt(const char *key); |
690 |
|
691 |
|
692 |
/* Check whether a key matches an MD5 hash string. |
693 |
`key' specifies a string to be checked. |
694 |
`hash' specifies an MD5 hash string. |
695 |
The return value is true if the key matches the hash string, else it is false. */ |
696 |
int est_match_crypt(const char *key, const char *hash); |
697 |
|
698 |
|
699 |
/* Get the hidden texts of a document object. |
700 |
`doc' specifies a document object. |
701 |
The return value is concatenated sentences of the hidden text of the document object. The |
702 |
life duration of the returned string is synchronous with the one of the document object. */ |
703 |
const char *est_doc_hidden_texts(ESTDOC *doc); |
704 |
|
705 |
|
706 |
/* Get the phrase of a condition object. |
707 |
`cond' specifies a condition object. |
708 |
The return value is the phrase of a condition object or `NULL' if it is not specified. The |
709 |
life duration of the returned string is synchronous with the one of the condition object. */ |
710 |
const char *est_cond_phrase(ESTCOND *cond); |
711 |
|
712 |
|
713 |
/* Get a list object of attribute expressions of a condition object. |
714 |
`cond' specifies a condition object. |
715 |
The return value is a list object of attribute expressions of a condition object or `NULL' if |
716 |
it is not specified. The life duration of the returned object is synchronous with the one of |
717 |
the condition object. */ |
718 |
const CBLIST *est_cond_attrs(ESTCOND *cond); |
719 |
|
720 |
|
721 |
/* Get the order expression of a condition object. |
722 |
`cond' specifies a condition object. |
723 |
The return value is the order expression of a condition object or `NULL' if it is not |
724 |
specified. The life duration of the returned string is synchronous with the one of the |
725 |
condition object. */ |
726 |
const char *est_cond_order(ESTCOND *cond); |
727 |
|
728 |
|
729 |
/* Get the maximum number of retrieval of a condition object. |
730 |
`cond' specifies a condition object. |
731 |
The return value is the maximum number of retrieval of a condition object or -1 if it is not |
732 |
specified. */ |
733 |
int est_cond_max(ESTCOND *cond); |
734 |
|
735 |
|
736 |
/* Get the options of a condition object. |
737 |
`cond' specifies a condition object. |
738 |
The return value is the options of a condition object. */ |
739 |
int est_cond_options(ESTCOND *cond); |
740 |
|
741 |
|
742 |
/* Get the score of a document corresponding to a condition object. |
743 |
`cond' specifies a condition object. |
744 |
`index' specifies the index of an element of the result array of `est_db_search'. |
745 |
The return value is the score of the element or -1 if the index is out of bounds. */ |
746 |
int est_cond_score(ESTCOND *cond, int index); |
747 |
|
748 |
|
749 |
/* Set the error code of a database. |
750 |
`db' specifies a database object. |
751 |
`ecode' specifies an error code to set. */ |
752 |
void est_db_set_ecode(ESTDB *db, int ecode); |
753 |
|
754 |
|
755 |
/* Edit attributes of a document object in a database. |
756 |
`db' specifies a database object connected as a writer. |
757 |
`doc' specifies a document object. |
758 |
The return value is true if success, else it is false. */ |
759 |
int est_db_edit_doc(ESTDB *db, ESTDOC *doc); |
760 |
|
761 |
|
762 |
/* Add a piece of meta data to a database. |
763 |
`db' specifies a database object connected as a writer. |
764 |
`name' specifies the name of a piece of meta data. |
765 |
`value' specifies the value of the meta data. If it is `NULL', the meta data is removed. */ |
766 |
void est_db_add_meta(ESTDB *db, const char *name, const char *value); |
767 |
|
768 |
|
769 |
/* Get a list of names of meta data of a database. |
770 |
`db' specifies a database object. |
771 |
The return value is a new list object of meta data names of the database. Because the |
772 |
object of the return value is opened with the function `cblistopen', it should be closed with |
773 |
the function `cblistclose' if it is no longer in use. */ |
774 |
CBLIST *est_db_meta_names(ESTDB *db); |
775 |
|
776 |
|
777 |
/* Get the value of a piece of meta data of a database. |
778 |
`db' specifies a database object. |
779 |
`name' specifies the name of a piece of meta data. |
780 |
The return value is the value of the meta data or `NULL' if it does not exist. Because the |
781 |
region of the return value is allocated with the `malloc' call, it should be released with |
782 |
the `free' call if it is no longer in use. */ |
783 |
char *est_db_meta(ESTDB *db, const char *name); |
784 |
|
785 |
|
786 |
/* Get the number of records in the cache memory of a database. |
787 |
`db' specifies a database object. |
788 |
The return value is the number of records in the cache memory of the database. */ |
789 |
int est_db_cache_num(ESTDB *db); |
790 |
|
791 |
|
792 |
/* Set the callback function to inform of database events. |
793 |
`db' specifies a database object. |
794 |
`func' specifies the pointer to a function. The argument of the callback specifies a message |
795 |
of each event. */ |
796 |
void est_db_set_informer(ESTDB *db, void (*func)(const char *)); |
797 |
|
798 |
|
799 |
/* Set the callback function to create a vector of keywords of a document. |
800 |
`db' specifies a database object. |
801 |
`func' specifies the pointer to a function. The arguments of the callback specify the |
802 |
database object, the ID of a document, and an arbitrary pointer. The return value of the |
803 |
callback is a new map object conforming to the return value of `est_db_etch_doc'. |
804 |
`data' specifies the pointer to an object given as the third argument of the callback. */ |
805 |
void est_db_set_vectorizer(ESTDB *db, CBMAP *(*func)(void *, int, void *), void *data); |
806 |
|
807 |
|
808 |
/* Fill the cache for keys for TF-IDF. |
809 |
`db' specifies a database object. */ |
810 |
void est_db_fill_key_cache(ESTDB *db); |
811 |
|
812 |
|
813 |
/* Make a directory. |
814 |
`path' specifies the path of a new directory. |
815 |
The return value is true if success, else it is false. */ |
816 |
int est_mkdir(const char *path); |
817 |
|
818 |
|
819 |
/* Remove a directory and its contents recursively. |
820 |
`path' specifies the path of a directory. |
821 |
The return value is true if success, else it is false. */ |
822 |
int est_rmdir_rec(const char *path); |
823 |
|
824 |
|
825 |
/* Get the canonicalized absolute pathname of a file. |
826 |
`path' specifies the path of a file. |
827 |
The return value is the canonicalized absolute pathname of a file. Because the region of the |
828 |
return value is allocated with the `malloc' call, it should be released with the `free' call |
829 |
if it is no longer in use. */ |
830 |
char *est_realpath(const char *path); |
831 |
|
832 |
|
833 |
/* Get the time of day in milliseconds. |
834 |
The return value is the time of day in milliseconds. */ |
835 |
double est_gettimeofday(void); |
836 |
|
837 |
|
838 |
/* Suspend execution for microsecond intervals. |
839 |
`usec' specifies microseconds to sleep for. */ |
840 |
void est_usleep(unsigned long usec); |
841 |
|
842 |
|
843 |
/* Send a signal to a process. |
844 |
`pid' specifies the PID of a target process. |
845 |
`sig' specifies a signal code. |
846 |
The return value is true if success, else it is false. */ |
847 |
int est_kill(int pid, int sig); |
848 |
|
849 |
|
850 |
/* Get the media type of an extension. |
851 |
`ext' specifies the extension of a file path. |
852 |
The return value is the media type of the extension. */ |
853 |
const char *est_ext_type(const char *ext); |
854 |
|
855 |
|
856 |
|
857 |
#if defined(__cplusplus) /* export for C++ */ |
858 |
} |
859 |
#endif |
860 |
|
861 |
#endif /* duplication check */ |
862 |
|
863 |
|
864 |
/* END OF FILE */ |