/[hyperestraier]/trunk/estraier.c
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/estraier.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3 - (show annotations)
Fri Jul 29 21:57:20 2005 UTC (18 years, 9 months ago) by dpavlin
File MIME type: text/plain
File size: 132031 byte(s)
make working copy from version 0.5.1

1 /*************************************************************************************************
2 * Implementation of the core API
3 * Copyright (C) 2004-2005 Mikio Hirabayashi
4 * This file is part of Hyper Estraier.
5 * Hyper Estraier is free software; you can redistribute it and/or modify it under the terms of
6 * the GNU Lesser General Public License as published by the Free Software Foundation; either
7 * version 2.1 of the License or any later version. Hyper Estraier is distributed in the hope
8 * that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
10 * License for more details.
11 * You should have received a copy of the GNU Lesser General Public License along with Hyper
12 * Estraier; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
13 * Boston, MA 02111-1307 USA.
14 *************************************************************************************************/
15
16
17 #include "estraier.h"
18 #include "myconf.h"
19
20 #define ESTNUMBUFSIZ 32 /* size of a buffer for a number */
21 #define ESTPATHBUFSIZ 4096 /* size of a buffer for a path */
22 #define ESTIOBUFSIZ 8192 /* size of a buffer for I/O */
23 #define ESTALLOCUNIT 1024 /* unit number of memory allocation */
24 #define ESTMINIBNUM 31 /* bucket number of map for attributes */
25
26 #define ESTMETADBNAME "_meta" /* name of the meta database */
27 #define ESTKEYIDXNUM "_idxnum" /* key for the number of inverted indexes */
28 #define ESTKEYDSEQ "_dseq" /* key for the sequence for document IDs */
29 #define ESTKEYDNUM "_dnum" /* key for the number of documents */
30 #define ESTKEYAMODE "_amode" /* key for the mode of text analyzer */
31 #define ESTKEYMETA "_meta" /* key for meta data */
32
33 #define ESTIDXDBNAME "_idx" /* name of the inverted index */
34 #define ESTIDXDBLRM 77 /* records in a leaf node of the inverted index */
35 #define ESTIDXDBNIM 160 /* records in a non-leaf node of the inverted index */
36 #define ESTIDXDBLCN 16 /* number of leaf cache of the inverted index */
37 #define ESTIDXDBNCN 16 /* number of non-leaf cache of the inverted index */
38 #define ESTIDXDBRLCN 128 /* number of leaf cache of the index reader */
39 #define ESTIDXDBRNCN 64 /* number of non-leaf cache of the index reader */
40
41 #define ESTFWMDBNAME "_fwm" /* name of the database for forward matching */
42 #define ESTFWMDBLRM 111 /* records in a leaf node of forward matching DB */
43 #define ESTFWMDBNIM 110 /* records in a non-leaf node of forward matching DB */
44 #define ESTFWMDBLCN 32 /* number of leaf cache of forward matching DB */
45 #define ESTFWMDBNCN 16 /* number of non-leaf cache of forward matching DB */
46
47 #define ESTATTRDBNAME "_attr" /* name of the database for attributes */
48 #define ESTATTRDBBNUM 122869 /* bucket number of the database for attributes */
49 #define ESTATTRDBDNUM 3 /* division number of the database for attributes */
50 #define ESTATTRDBALN -5 /* alignment of the database for attributes */
51
52 #define ESTTEXTDBNAME "_text" /* name of the database of texts */
53 #define ESTTEXTDBBNUM 30713 /* bucket number of the database for texts */
54 #define ESTTEXTDBDNUM 7 /* division number of the database for texts */
55 #define ESTTEXTDBALN -5 /* alignment of the database for texts */
56
57 #define ESTLISTDBNAME "_list" /* name of the database of document list */
58 #define ESTLISTDBLRM 99 /* records in a leaf node of document list DB */
59 #define ESTLISTDBNIM 200 /* records in a non-leaf node of document list DB */
60 #define ESTLISTDBLCN 32 /* number of leaf cache of document list DB */
61 #define ESTLISTDBNCN 16 /* number of non-leaf cache of document list DB */
62
63 #define ESTIDXCCBNUM 524288 /* bucket number of cache for the inverted index */
64 #define ESTIDXCCMAX (1048576*64) /* max size of the cache */
65 #define ESTOUTCCBNUM 131072 /* bucket number of cache for deleted documents */
66 #define ESTKEYCCMNUM 65536 /* bucket number of cache for keys for TF-IDF */
67 #define ESTATTRCCMNUM 8192 /* number of cache for attributes */
68 #define ESTTEXTCCMNUM 1024 /* number of cache for texts */
69 #define ESTCCCBFREQ 10000 /* frequency of callback for flushing words */
70
71 #define ESTDIRMODE 00755 /* permission of a creating directory */
72 #define ESTICCHECKSIZ 32768 /* size of checking character code */
73 #define ESTICMISSMAX 256 /* allowance number of missing characters */
74 #define ESTICALLWRAT 0.001 /* allowance ratio of missing characters */
75 #define ESTZCOMPLEVEL 5 /* level of compression of zlib */
76 #define ESTOCPOINT 10 /* point per occurrence */
77 #define ESTJHASHNUM 251 /* hash number for a junction */
78 #define ESTWORDMAXLEN 48 /* maximum length of a word */
79 #define ESTWORDAVGLEN 8 /* average length of a word */
80 #define ESTKEYSCALW 4 /* allowance ratio of TF-IDF for keywords */
81 #define ESTMEMIRATIO 1.1 /* incremental ratio of memory allocation */
82
83 #define ESTSMLRKNUM 16 /* number of keywords to get candidates */
84 #define ESTSMLRUNUM 1024 /* number of adopted documents for a keyword */
85 #define ESTSMLRNMIN 0.5 /* the minimum value for narrowing */
86
87 enum { /* enumeration for character categories */
88 ESTSPACECHR, /* space characters */
89 ESTDELIMCHR, /* delimiter characters */
90 ESTWESTALPH, /* west alphabets */
91 ESTEASTALPH /* east alphabets */
92 };
93
94 enum { /* enumeration for text analyzer modes */
95 ESTAMNORMAL, /* normal */
96 ESTAMPERFNG /* perfect N-gram */
97 };
98
99 typedef struct { /* type of structure for a hitting object */
100 int id; /* ID of a document */
101 int score; /* score tuned by TF-IDF */
102 char *value; /* value of an attribute for sorting */
103 } ESTSCORE;
104
105 typedef struct { /* type of structure for a conditional attribute */
106 char *name; /* name */
107 int nsiz; /* size of the name */
108 char *oper; /* operator */
109 char *val; /* value */
110 int vsiz; /* size of the value */
111 const char *cop; /* canonical operator */
112 int sign; /* positive or negative */
113 char *sval; /* value of small cases */
114 int ssiz; /* size of the small value */
115 time_t num; /* numeric value */
116 } ESTCATTR;
117
118 typedef struct { /* type of structure for a keyword and its score */
119 const char *word; /* face of keyword */
120 int wsiz; /* size of the keyword */
121 int pt; /* score tuned by TF-IDF */
122 } ESTKEYSC;
123
124
125 /* private function prototypes */
126 static int est_enc_miss(const char *ptr, int size, const char *icode, const char *ocode);
127 static void est_normalize_text(unsigned char *utext, int size, int *sp);
128 static void est_canonicalize_text(unsigned char *utext, int size, int funcspc);
129 static int est_char_category(int c);
130 static int est_char_category_perfng(int c);
131 static char *est_phrase_from_thumb(const char *sphrase);
132 static void est_snippet_add_text(const unsigned char *rtext, const unsigned char *ctext,
133 int size, int awsiz, CBDATUM *res, const CBLIST *rwords);
134 static int est_str_fwmatch_wide(const unsigned char *haystack, int hsiz,
135 const unsigned char *needle, int nsiz);
136 static ESTIDX *est_idx_open(const char *name, int omode, int dnum);
137 static int est_idx_close(ESTIDX *idx);
138 static void est_idx_set_tuning(ESTIDX *idx, int lrecmax, int nidxmax, int lcnum, int ncnum);
139 static void est_idx_increment(ESTIDX *idx);
140 static int est_idx_add(ESTIDX *idx, const char *word, int wsiz, const char *vbuf, int vsiz);
141 static int est_idx_out(ESTIDX *idx, const char *word, int wsiz);
142 static char *est_idx_get(ESTIDX *idx, const char *word, int wsiz, int *sp);
143 static int est_idx_vsiz(ESTIDX *idx, const char *word, int wsiz);
144 static int est_idx_num(ESTIDX *idx);
145 static int est_idx_size(ESTIDX *idx);
146 static int est_idx_sync(ESTIDX *idx);
147 static int est_idx_optimize(ESTIDX *idx);
148 static void est_idx_set_current(ESTIDX *idx);
149 static int est_db_write_meta(ESTDB *db);
150 static void est_db_inform(ESTDB *db, const char *info);
151 static int est_db_used_cache_size(ESTDB *db);
152 static void est_db_prepare_meta(ESTDB *db);
153 static CBLIST *est_phrase_terms(const char *phrase);
154 static int est_score_compare_by_id(const void *ap, const void *bp);
155 static int est_score_compare_by_score(const void *ap, const void *bp);
156 static int est_score_compare_by_str_asc(const void *ap, const void *bp);
157 static int est_score_compare_by_str_desc(const void *ap, const void *bp);
158 static int est_score_compare_by_num_asc(const void *ap, const void *bp);
159 static int est_score_compare_by_num_desc(const void *ap, const void *bp);
160 static ESTSCORE *est_search_uvset(ESTDB *db, int *nump, CBMAP *hints, int add);
161 static void est_expand_word(ESTDB *db, const char *word, CBLIST *list);
162 static ESTSCORE *est_search_union(ESTDB *db, const char *term, int gstep,
163 int *nump, CBMAP *hints, int add);
164 static int est_narrow_scores(ESTDB *db, const CBLIST *attrs, const char *order,
165 ESTSCORE *scores, int snum);
166 static int est_match_attr(const char *tval, int tsiz, const char *cop, int sign,
167 const char *oval, int osiz, const char *sval, int ssiz, int onum);
168 static int est_keysc_compare(const void *ap, const void *bp);
169 static ESTSCORE *est_search_similar(ESTDB *db, CBMAP *svmap, int *nump,
170 int knum, int unum, int tfidf, double nmin);
171 static CBMAP *est_phrase_vector(const char *phrase);
172 static CBMAP *est_get_tvmap(ESTDB *db, int id, int vnum, int tfidf);
173 static void est_set_svec(CBMAP *svmap, int *svec, int vnum);
174 static void est_set_tvec(CBMAP *svmap, CBMAP *tvmap, int *tvec, int vnum);
175 static double est_vec_abs(const int *vec, int vnum);
176 static double est_vec_iprod(const int *avec, const int *bvec, int vnum);
177 static double est_vec_cos(const int *avec, const int *bvec, int vnum);
178 static void est_random_fclose(void);
179
180
181
182 /*************************************************************************************************
183 * common settings
184 *************************************************************************************************/
185
186
187 /* version string of Hyper Estraier, taken from `_EST_VERSION' */
188 const char *est_version = _EST_VERSION;
189
190
191
192 /*************************************************************************************************
193 * API for document
194 *************************************************************************************************/
195
196
197 /* Create a document object. */
198 ESTDOC *est_doc_new(void){
199 ESTDOC *doc;
200 CB_MALLOC(doc, sizeof(ESTDOC));
201 doc->id = -1;
202 doc->attrs = NULL;
203 doc->dtexts = NULL;
204 return doc;
205 }
206
207
208 /* Create a document object made from draft data. */
209 ESTDOC *est_doc_new_from_draft(const char *draft){
210 ESTDOC *doc;
211 CBLIST *lines;
212 const char *line;
213 char *pv;
214 int i;
215 assert(draft);
216 doc = est_doc_new();
217 lines = cbsplit(draft, -1, "\n");
218 for(i = 0; i < CB_LISTNUM(lines); i++){
219 line = CB_LISTVAL(lines, i, NULL);
220 while(*line > '\0' && *line <= ' '){
221 line++;
222 }
223 if(line[0] == '\0'){
224 i++;
225 break;
226 }
227 if((pv = strchr(line, '=')) != NULL){
228 *(pv++) = '\0';
229 est_doc_add_attr(doc, line, pv);
230 }
231 }
232 for(; i < CB_LISTNUM(lines); i++){
233 line = CB_LISTVAL(lines, i, NULL);
234 if(line[0] == '\t'){
235 est_doc_add_hidden_text(doc, line + 1);
236 } else {
237 est_doc_add_text(doc, line);
238 }
239 }
240 cblistclose(lines);
241 return doc;
242 }
243
244
245 /* Destroy a document object. */
246 void est_doc_delete(ESTDOC *doc){
247 assert(doc);
248 if(doc->dtexts) cblistclose(doc->dtexts);
249 if(doc->attrs) cbmapclose(doc->attrs);
250 free(doc);
251 }
252
253
254 /* Add an attribute to a document object. */
255 void est_doc_add_attr(ESTDOC *doc, const char *name, const char *value){
256 char *rbuf, *wp;
257 assert(doc && name);
258 if(name[0] == '\0') return;
259 if(!doc->attrs) doc->attrs = cbmapopenex(ESTMINIBNUM);
260 if(value){
261 rbuf = cbmemdup(value, -1);
262 for(wp = rbuf; *wp != '\0'; wp++){
263 if(*wp > 0 && *wp < ' ') *wp = ' ';
264 }
265 cbstrsqzspc(rbuf);
266 cbmapputvbuf(doc->attrs, name, strlen(name), rbuf, strlen(rbuf));
267 } else {
268 cbmapout(doc->attrs, name, -1);
269 }
270 }
271
272
273 /* Add a sentence of text to a document object. */
274 void est_doc_add_text(ESTDOC *doc, const char *text){
275 unsigned char *utext;
276 char *rtext, *wp;
277 int size;
278 assert(doc && text);
279 while(*text > '\0' && *text <= ' '){
280 text++;
281 }
282 if(text[0] == '\0') return;
283 if(!doc->dtexts) doc->dtexts = cblistopen();
284 utext = (unsigned char *)est_uconv_in(text, strlen(text), &size);
285 est_normalize_text(utext, size, &size);
286 rtext = est_uconv_out((char *)utext, size, NULL);
287 for(wp = rtext; *wp != '\0'; wp++){
288 if(*wp > 0 && *wp < ' ') *wp = ' ';
289 }
290 cbstrsqzspc(rtext);
291 if(rtext[0] != '\0'){
292 cblistpushbuf(doc->dtexts, rtext, strlen(rtext));
293 } else {
294 free(rtext);
295 }
296 free(utext);
297 }
298
299
300 /* Add a hidden sentence to a document object. */
301 void est_doc_add_hidden_text(ESTDOC *doc, const char *text){
302 unsigned char *utext;
303 char *rtext, *wp;
304 int size;
305 assert(doc && text);
306 while(*text > '\0' && *text <= ' '){
307 text++;
308 }
309 if(text[0] == '\0') return;
310 utext = (unsigned char *)est_uconv_in(text, strlen(text), &size);
311 est_normalize_text(utext, size, &size);
312 rtext = est_uconv_out((char *)utext, size, NULL);
313 for(wp = rtext; *wp != '\0'; wp++){
314 if(*wp > 0 && *wp < ' ') *wp = ' ';
315 }
316 cbstrsqzspc(rtext);
317 if(rtext[0] != '\0'){
318 if(!doc->attrs) doc->attrs = cbmapopenex(ESTMINIBNUM);
319 if(cbmapget(doc->attrs, "", 0, NULL)) cbmapputcat(doc->attrs, "", 0, " ", 1);
320 cbmapputcat(doc->attrs, "", 0, rtext, -1);
321 }
322 free(rtext);
323 free(utext);
324 }
325
326
327 /* Get the ID number of a document object. */
328 int est_doc_id(ESTDOC *doc){
329 assert(doc);
330 return doc->id;
331 }
332
333
334 /* Get a list of attribute names of a document object. */
335 CBLIST *est_doc_attr_names(ESTDOC *doc){
336 CBLIST *names;
337 const char *kbuf;
338 int ksiz;
339 assert(doc);
340 if(!doc->attrs) return cblistopen();
341 names = cblistopen();
342 cbmapiterinit(doc->attrs);
343 while((kbuf = cbmapiternext(doc->attrs, &ksiz)) != NULL){
344 if(ksiz > 0) cblistpush(names, kbuf, ksiz);
345 }
346 cblistsort(names);
347 return names;
348 }
349
350
351 /* Get the value of an attribute of a document object. */
352 const char *est_doc_attr(ESTDOC *doc, const char *name){
353 assert(doc && name);
354 if(!doc->attrs || name[0] == '\0') return NULL;
355 return cbmapget(doc->attrs, name, -1, NULL);
356 }
357
358
359 /* Get a list of sentences of the text of a document object. */
360 const CBLIST *est_doc_texts(ESTDOC *doc){
361 assert(doc);
362 if(!doc->dtexts) doc->dtexts = cblistopen();
363 return doc->dtexts;
364 }
365
366
367 /* Concatenate sentences of the text of a document object. */
368 char *est_doc_cat_texts(ESTDOC *doc){
369 CBDATUM *datum;
370 const char *elem;
371 int i, size;
372 if(!doc->dtexts) return cbmemdup("", 0);
373 datum = cbdatumopen("", 0);
374 for(i = 0; i < CB_LISTNUM(doc->dtexts); i++){
375 elem = CB_LISTVAL2(doc->dtexts, i, &size);
376 if(i > 0) cbdatumcat(datum, " ", 1);
377 cbdatumcat(datum, elem, size);
378 }
379 return cbdatumtomalloc(datum, NULL);
380 }
381
382
383 /* Dump draft data of a document object. */
384 char *est_doc_dump_draft(ESTDOC *doc){
385 CBLIST *list;
386 CBDATUM *datum;
387 const char *kbuf, *vbuf;
388 int i, ksiz, vsiz;
389 assert(doc);
390 datum = cbdatumopen("", 0);
391 if(doc->attrs){
392 list = est_doc_attr_names(doc);
393 for(i = 0; i < CB_LISTNUM(list); i++){
394 kbuf = CB_LISTVAL2(list, i, &ksiz);
395 vbuf = cbmapget(doc->attrs, kbuf, ksiz, &vsiz);
396 cbdatumcat(datum, kbuf, ksiz);
397 cbdatumcat(datum, "=", 1);
398 cbdatumcat(datum, vbuf, vsiz);
399 cbdatumcat(datum, "\n", 1);
400 }
401 cblistclose(list);
402 }
403 cbdatumcat(datum, "\n", 1);
404 if(doc->dtexts){
405 for(i = 0; i < CB_LISTNUM(doc->dtexts); i++){
406 kbuf = CB_LISTVAL2(doc->dtexts, i, &ksiz);
407 cbdatumcat(datum, kbuf, ksiz);
408 cbdatumcat(datum, "\n", 1);
409 }
410 }
411 if(doc->attrs && (vbuf = cbmapget(doc->attrs, "", 0, &vsiz)) != NULL){
412 cbdatumcat(datum, "\t", 1);
413 cbdatumcat(datum, vbuf, vsiz);
414 cbdatumcat(datum, "\n", 1);
415 }
416 return cbdatumtomalloc(datum, NULL);
417 }
418
419
420 /* Make a snippet of the body text of a document object. */
/* `words' is the list of words to look for; empty entries and the ESTOPUVSET token are skipped.
   `wwidth' is the overall width budget, `hwidth' the width of the head part, and `awidth'
   the width of the context taken around each adopted word.
   The return value is a newly allocated string built by `est_snippet_add_text', with a
   newline appended after the head part and after each context part. */
421 char *est_doc_make_snippet(ESTDOC *doc, const CBLIST *words, int wwidth, int hwidth, int awidth){
422 CBDATUM *res, *sbuf;
423 CBMAP *counts;
424 CBLIST *rwords;
425 const char *text, *word, *cval;
426 const unsigned char *rword;
427 unsigned char *rtext, *ctext;
428 int i, j, k, bi, size, wsiz, rwsiz, mywidth, awsiz, csiz;
429 assert(doc && words && wwidth >= 0 && hwidth >= 0 && awidth >= 0);
430 if(!doc->dtexts) doc->dtexts = cblistopen();
431 res = cbdatumopen("", 0);
/* convert and canonicalize each usable search word; the buffers go into `rwords' */
432 rwords = cblistopen();
433 for(i = 0; i < CB_LISTNUM(words); i++){
434 word = CB_LISTVAL2(words, i, &wsiz);
435 if(wsiz < 1 || !strcmp(word, ESTOPUVSET)) continue;
436 rtext = (unsigned char *)est_uconv_in(word, wsiz, &size);
437 est_canonicalize_text(rtext, size, TRUE);
438 cblistpushbuf(rwords, (char *)rtext, size);
439 }
/* join all sentences with single spaces into one buffer */
440 sbuf = cbdatumopen("", 0);
441 for(i = 0; i < CB_LISTNUM(doc->dtexts); i++){
442 text = CB_LISTVAL2(doc->dtexts, i, &size);
443 if(i > 0) cbdatumcat(sbuf, " ", 1);
444 cbdatumcat(sbuf, text, size);
445 }
/* `rtext' is the converted text; `ctext' is a canonicalized copy used for matching.
   Both are walked in 2-byte steps, high byte first. */
446 rtext = (unsigned char *)est_uconv_in(CB_DATUMPTR(sbuf), CB_DATUMSIZE(sbuf), &size);
447 ctext = (unsigned char *)cbmemdup((char *)rtext, size);
448 est_canonicalize_text(ctext, size, FALSE);
/* head part: `hwidth', tripled when there is no word to look for, capped at `wwidth';
   a character categorized as ESTEASTALPH costs 2, any other costs 1 */
449 mywidth = hwidth;
450 if(CB_LISTNUM(rwords) < 1) mywidth *= 3;
451 if(mywidth > wwidth) mywidth = wwidth;
452 for(i = 0; i < size && mywidth > 0; i += 2){
453 mywidth -= est_char_category(rtext[i] * 0x100 + rtext[i+1]) == ESTEASTALPH ? 2 : 1;
454 }
/* `awsiz' is the number of bytes following the head, limited to ESTWORDMAXLEN */
455 awsiz = size - i;
456 if(awsiz > ESTWORDMAXLEN) awsiz = ESTWORDMAXLEN;
457 est_snippet_add_text(rtext, ctext, i, awsiz, res, rwords);
458 wwidth -= hwidth;
459 bi = i + 2;
460 cbdatumcat(res, "\n", 1);
/* NOTE(review): `hwidth' is not read again after this assignment */
461 hwidth = 1000;
/* `counts' records, per word, one '*' for each time the word has been adopted */
462 counts = cbmapopenex(ESTMINIBNUM);
/* scan the rest of the text while the width budget is not exhausted */
463 for(i = bi; i < size && wwidth >= 0; i += 2){
464 for(j = 0; j < CB_LISTNUM(rwords); j++){
465 rword = (unsigned char *)CB_LISTVAL2(rwords, j, &rwsiz);
/* adopt a word matching at `i' if it has been adopted fewer than 2 times, or fewer
   than 1 time once `wwidth' is no longer above `awidth' * 1.2 */
466 if(est_str_fwmatch_wide(ctext + i, size - i, rword, rwsiz) > 0 &&
467 (!(cval = cbmapget(counts, (char *)rword, rwsiz, &csiz)) ||
468 csiz < (wwidth > awidth * 1.2 ? 2 : 1))){
469 cbmapputcat(counts, (char *)rword, rwsiz, "*", 1);
/* once every word has a counter, start counting afresh */
470 if(cbmaprnum(counts) >= CB_LISTNUM(rwords)){
471 cbmapclose(counts);
472 counts = cbmapopenex(ESTMINIBNUM);
473 }
/* walk backwards from the hit to find where the context starts, stopping at `bi' */
474 mywidth = awidth / 2 + 1;
475 for(k = i - 2; k >= bi && mywidth >= 0; k -= 2){
476 mywidth -= est_char_category(rtext[k] * 0x100 + rtext[k+1]) == ESTEASTALPH ? 2 : 1;
477 }
478 bi = k;
/* walk forwards past the word to find where the context ends, clamped to `size' */
479 mywidth = awidth / 2 + 1;
480 for(k = i + rwsiz + 2; k < size && mywidth >= 0; k += 2){
481 mywidth -= est_char_category(rtext[k] * 0x100 + rtext[k+1]) == ESTEASTALPH ? 2 : 1;
482 }
483 if(k > size) k = size;
/* emit the context, charge its width, and resume scanning after it */
484 est_snippet_add_text(rtext + bi, ctext + bi, k - bi, 0, res, rwords);
485 wwidth -= awidth + rwsiz / 2;
486 bi = k + 2;
487 i = bi - 2;
488 cbdatumcat(res, "\n", 1);
489 break;
490 }
491 }
492 }
493 cbmapclose(counts);
494 free(ctext);
495 free(rtext);
496 cbdatumclose(sbuf);
497 cblistclose(rwords);
498 return cbdatumtomalloc(res, NULL);
499 }
500
501
502 /* Check whether the text of a document object includes every specified words. */
503 int est_doc_scan_words(ESTDOC *doc, const CBLIST *words){
504 CBLIST *rwords;
505 const unsigned char *rp, *rword;
506 const char *vbuf;
507 unsigned char *rbuf;
508 int i, j, vsiz, rsiz, rwsiz, hit;
509 assert(doc && words);
510 rwords = cblistopen();
511 for(i = 0; i < CB_LISTNUM(words); i++){
512 vbuf = CB_LISTVAL2(words, i, &vsiz);
513 if(vsiz < 1 || !strcmp(vbuf, ESTOPUVSET)) continue;
514 rbuf = (unsigned char *)est_uconv_in(vbuf, vsiz, &rsiz);
515 est_canonicalize_text(rbuf, rsiz, TRUE);
516 cblistpushbuf(rwords, (char *)rbuf, rsiz);
517 }
518 if(doc->dtexts){
519 for(i = 0; i < CB_LISTNUM(doc->dtexts) && CB_LISTNUM(rwords) > 0; i++){
520 vbuf = CB_LISTVAL2(doc->dtexts, i, &vsiz);
521 rbuf = (unsigned char *)est_uconv_in(vbuf, vsiz, &rsiz);
522 est_canonicalize_text(rbuf, rsiz, TRUE);
523 for(rp = rbuf; rsiz >= 0; rp += 2, rsiz -= 2){
524 for(j = 0; j < CB_LISTNUM(rwords); j++){
525 rword = (unsigned char *)CB_LISTVAL2(rwords, j, &rwsiz);
526 if(est_str_fwmatch_wide(rp, rsiz, rword, rwsiz)){
527 free(cblistremove(rwords, j, NULL));
528 j--;
529 }
530 }
531 }
532 free(rbuf);
533 }
534 }
535 if(doc->attrs && (vbuf = cbmapget(doc->attrs, "", 0, &vsiz)) != NULL){
536 rbuf = (unsigned char *)est_uconv_in(vbuf, vsiz, &rsiz);
537 est_canonicalize_text(rbuf, rsiz, TRUE);
538 for(rp = rbuf; rsiz >= 0; rp += 2, rsiz -= 2){
539 for(i = 0; i < CB_LISTNUM(rwords); i++){
540 rword = (unsigned char *)CB_LISTVAL2(rwords, i, &rwsiz);
541 if(est_str_fwmatch_wide(rp, rsiz, rword, rwsiz)){
542 free(cblistremove(rwords, i, NULL));
543 i--;
544 }
545 }
546 }
547 free(rbuf);
548 }
549 hit = CB_LISTNUM(rwords) < 1;
550 cblistclose(rwords);
551 return hit;
552 }
553
554
555
556 /*************************************************************************************************
557 * API for search conditions
558 *************************************************************************************************/
559
560
561 /* Create a condition object. */
562 ESTCOND *est_cond_new(void){
563 ESTCOND *cond;
564 CB_MALLOC(cond, sizeof(ESTCOND));
565 cond->phrase = NULL;
566 cond->gstep = 2;
567 cond->tfidf = TRUE;
568 cond->simple = FALSE;
569 cond->attrs = NULL;
570 cond->order = NULL;
571 cond->max = -1;
572 cond->scfb = FALSE;
573 cond->scores = NULL;
574 cond->snum = 0;
575 cond->opts = 0;
576 return cond;
577 }
578
579
580 /* Destroy a condition object. */
581 void est_cond_delete(ESTCOND *cond){
582 assert(cond);
583 if(cond->scores) free(cond->scores);
584 if(cond->order) free(cond->order);
585 if(cond->attrs) cblistclose(cond->attrs);
586 if(cond->phrase) free(cond->phrase);
587 free(cond);
588 }
589
590
591 /* Set a search phrase to a condition object. */
592 void est_cond_set_phrase(ESTCOND *cond, const char *phrase){
593 assert(cond && phrase);
594 if(cond->phrase) free(cond->phrase);
595 while(*phrase > '\0' && *phrase <= ' '){
596 phrase++;
597 }
598 cond->phrase = cbmemdup(phrase, -1);
599 }
600
601
602 /* Add a condition of an attribute fo a condition object. */
603 void est_cond_add_attr(ESTCOND *cond, const char *expr){
604 assert(cond && expr);
605 if(!cond->attrs) cond->attrs = cblistopen();
606 while(*expr > '\0' && *expr <= ' '){
607 expr++;
608 }
609 cblistpush(cond->attrs, expr, -1);
610 }
611
612
613 /* Set the order of a condition object. */
614 void est_cond_set_order(ESTCOND *cond, const char *expr){
615 assert(cond && expr);
616 if(!cond->order) free(cond->order);
617 while(*expr > '\0' && *expr <= ' '){
618 expr++;
619 }
620 cond->order = cbmemdup(expr, -1);
621 }
622
623
624 /* Set the maximum number of retrieval of a condition object. */
625 void est_cond_set_max(ESTCOND *cond, int max){
626 assert(cond && max >= 0);
627 cond->max = max;
628 }
629
630
631 /* Set options of retrieval of a condition object. */
632 void est_cond_set_options(ESTCOND *cond, int options){
633 assert(cond);
634 if(options & ESTCONDSURE) cond->gstep = 1;
635 if(options & ESTCONDUSU) cond->gstep = 2;
636 if(options & ESTCONDFAST) cond->gstep = 3;
637 if(options & ESTCONDAGIT) cond->gstep = 4;
638 if(options & ESTCONDNOIDF) cond->tfidf = FALSE;
639 if(options & ESTCONDSIMPLE) cond->simple = TRUE;
640 if(options & ESTCONDSCFB) cond->scfb = TRUE;
641 cond->opts |= options;
642 }
643
644
645
646 /*************************************************************************************************
647 * API for database
648 *************************************************************************************************/
649
650
651 /* Get the string of an error code. */
652 const char *est_err_msg(int ecode){
653 switch(ecode){
654 case ESTENOERR: return "no error";
655 case ESTEINVAL: return "invalid argument";
656 case ESTEACCES: return "access forbidden";
657 case ESTELOCK: return "lock failure";
658 case ESTEDB: return "database problem";
659 case ESTEIO: return "I/O problem";
660 case ESTENOITEM: return "no such item";
661 default: break;
662 }
663 return "miscellaneous";
664 }
665
666
667 /* Open a database. */
668 ESTDB *est_db_open(const char *name, int omode, int *ecp){
669 ESTDB *db;
670 DEPOT *metadb;
671 ESTIDX *idxdb;
672 CURIA *attrdb, *textdb;
673 VILLA *fwmdb, *listdb;
674 char path[ESTPATHBUFSIZ], vbuf[ESTNUMBUFSIZ];
675 int domode, comode, vomode, idxnum, dseq, dnum, amode, vsiz;
676 assert(name && ecp);
677 *ecp = ESTENOERR;
678 if((omode & ESTDBWRITER) && (omode & ESTDBCREAT) && !est_mkdir(name)){
679 switch(errno){
680 case EACCES:
681 *ecp = ESTEACCES;
682 return NULL;
683 case EEXIST:
684 break;
685 default:
686 *ecp = ESTEIO;
687 return NULL;
688 }
689 }
690 domode = DP_OREADER;
691 comode = CR_OREADER;
692 vomode = VL_OREADER;
693 if(omode & ESTDBWRITER){
694 domode = DP_OWRITER;
695 comode = CR_OWRITER;
696 vomode = VL_OWRITER | VL_OZCOMP;
697 if(omode & ESTDBCREAT){
698 domode |= DP_OCREAT;
699 comode |= CR_OCREAT;
700 vomode |= VL_OCREAT;
701 }
702 if(omode & ESTDBTRUNC){
703 domode |= DP_OTRUNC;
704 comode |= CR_OTRUNC;
705 vomode |= VL_OTRUNC;
706 }
707 }
708 if(omode & ESTDBNOLCK){
709 domode |= DP_ONOLCK;
710 comode |= CR_ONOLCK;
711 vomode |= VL_ONOLCK;
712 }
713 if(omode & ESTDBLCKNB){
714 domode |= DP_OLCKNB;
715 comode |= CR_OLCKNB;
716 vomode |= VL_OLCKNB;
717 }
718 idxnum = 0;
719 dseq = 0;
720 dnum = 0;
721 amode = ESTAMNORMAL;
722 sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTMETADBNAME);
723 if((metadb = dpopen(path, domode, ESTMINIBNUM)) != NULL){
724 if((vsiz = dpgetwb(metadb, ESTKEYIDXNUM, -1, 0, ESTNUMBUFSIZ - 1, vbuf)) > 0){
725 vbuf[vsiz] = '\0';
726 idxnum = atoi(vbuf);
727 }
728 if((vsiz = dpgetwb(metadb, ESTKEYDSEQ, -1, 0, ESTNUMBUFSIZ - 1, vbuf)) > 0){
729 vbuf[vsiz] = '\0';
730 dseq = atoi(vbuf);
731 }
732 if((vsiz = dpgetwb(metadb, ESTKEYDNUM, -1, 0, ESTNUMBUFSIZ - 1, vbuf)) > 0){
733 vbuf[vsiz] = '\0';
734 dnum = atoi(vbuf);
735 }
736 if((vsiz = dpgetwb(metadb, ESTKEYAMODE, -1, 0, ESTNUMBUFSIZ - 1, vbuf)) > 0){
737 vbuf[vsiz] = '\0';
738 amode = atoi(vbuf);
739 } else if(omode & ESTDBPERFNG){
740 amode = ESTAMPERFNG;
741 }
742 }
743 if(!metadb){
744 *ecp = (dpecode == DP_ELOCK) ? ESTELOCK : ESTEDB;
745 return NULL;
746 }
747 if(idxnum < 1) idxnum = 1;
748 if(dseq < 0) dseq = 0;
749 if(dnum < 0) dnum = 0;
750 sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTIDXDBNAME);
751 idxdb = est_idx_open(path, vomode, idxnum);
752 sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTFWMDBNAME);
753 fwmdb = vlopen(path, vomode, VL_CMPLEX);
754 sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTATTRDBNAME);
755 attrdb = cropen(path, comode, ESTATTRDBBNUM, ESTATTRDBDNUM);
756 sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTTEXTDBNAME);
757 textdb = cropen(path, comode, ESTTEXTDBBNUM, ESTTEXTDBDNUM);
758 sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTLISTDBNAME);
759 listdb = vlopen(path, vomode, VL_CMPLEX);
760 if(!metadb || !idxdb || !fwmdb || !attrdb ||!textdb || !listdb){
761 if(listdb) vlclose(listdb);
762 if(textdb) crclose(textdb);
763 if(attrdb) crclose(attrdb);
764 if(fwmdb) vlclose(fwmdb);
765 if(idxdb) est_idx_close(idxdb);
766 dpclose(metadb);
767 *ecp = ESTEDB;
768 return NULL;
769 }
770 if(omode & ESTDBWRITER){
771 crsetalign(attrdb, ESTATTRDBALN);
772 crsetalign(textdb, ESTTEXTDBALN);
773 est_idx_set_tuning(idxdb, ESTIDXDBLRM, ESTIDXDBNIM, ESTIDXDBLCN, ESTIDXDBNCN);
774 est_idx_set_current(idxdb);
775 vlsettuning(fwmdb, ESTFWMDBLRM, ESTFWMDBNIM, ESTFWMDBLCN, ESTFWMDBNCN);
776 vlsettuning(listdb, ESTLISTDBLRM, ESTLISTDBNIM, ESTLISTDBLCN, ESTLISTDBNCN);
777 } else {
778 est_idx_set_tuning(idxdb, -1, -1, ESTIDXDBRLCN, ESTIDXDBRNCN);
779 vlsettuning(fwmdb, -1, -1, ESTFWMDBLCN, ESTFWMDBNCN);
780 vlsettuning(listdb, -1, -1, ESTLISTDBLCN, ESTLISTDBNCN);
781 }
782 CB_MALLOC(db, sizeof(ESTDB));
783 db->name = cbmemdup(name, -1);
784 db->metadb = metadb;
785 db->idxdb = idxdb;
786 db->fwmdb = fwmdb;
787 db->attrdb = attrdb;
788 db->textdb = textdb;
789 db->listdb = listdb;
790 db->ecode = ESTENOERR;
791 db->fatal = FALSE;
792 db->dseq = dseq;
793 db->dnum = dnum;
794 db->amode = amode;
795 if(omode & ESTDBWRITER){
796 db->idxcc = cbmapopenex(ESTIDXCCBNUM);
797 db->icsiz = 0;
798 db->icmax = ESTIDXCCMAX;
799 db->outcc = cbmapopenex(ESTOUTCCBNUM);
800 } else {
801 db->idxcc = cbmapopenex(1);
802 db->icsiz = 0;
803 db->icmax = 0;
804 db->outcc = cbmapopenex(1);
805 }
806 db->keycc = cbmapopenex(ESTKEYCCMNUM + 1);
807 db->kcmnum = ESTKEYCCMNUM;
808 db->attrcc = cbmapopenex(ESTATTRCCMNUM + 1);
809 db->acmnum = ESTATTRCCMNUM;
810 db->textcc = cbmapopenex(ESTTEXTCCMNUM + 1);
811 db->tcmnum = ESTTEXTCCMNUM;
812 db->spacc = NULL;
813 db->scmnum = 0;
814 db->scname = NULL;
815 db->cbinfo = NULL;
816 db->cbvec = NULL;
817 db->vecdata = NULL;
818 db->metacc = NULL;
819 return db;
820 }
821
822
823 /* Close a database. */
824 int est_db_close(ESTDB *db, int *ecp){
825 int err;
826 assert(db && ecp);
827 *ecp = ESTENOERR;
828 err = FALSE;
829 if(dpwritable(db->metadb)){
830 if(!est_db_flush(db, -1) || !est_db_write_meta(db)) err = TRUE;
831 }
832 est_db_inform(db, "closing");
833 if(db->metacc) cbmapclose(db->metacc);
834 if(db->spacc){
835 free(db->scname);
836 cbmapclose(db->spacc);
837 }
838 cbmapclose(db->textcc);
839 cbmapclose(db->attrcc);
840 cbmapclose(db->keycc);
841 cbmapclose(db->outcc);
842 cbmapclose(db->idxcc);
843 if(!vlclose(db->listdb)) err = TRUE;
844 if(!crclose(db->textdb)) err = TRUE;
845 if(!crclose(db->attrdb)) err = TRUE;
846 if(!vlclose(db->fwmdb)) err = TRUE;
847 if(!est_idx_close(db->idxdb)) err = TRUE;
848 if(!dpclose(db->metadb)) err = TRUE;
849 free(db->name);
850 if(db->fatal){
851 *ecp = db->ecode;
852 err = TRUE;
853 } else if(err){
854 *ecp = ESTEDB;
855 }
856 free(db);
857 return err ? FALSE : TRUE;
858 }
859
860
861 /* Get the last happended error code of a database. */
862 int est_db_error(ESTDB *db){
863 assert(db);
864 return db->ecode;
865 }
866
867
868 /* Check whether a database has a fatal error. */
869 int est_db_fatal(ESTDB *db){
870 assert(db);
871 return db->fatal;
872 }
873
874
875 /* Flush index words in the cache of a database.
   `db' specifies a database object.  If its meta database is not writable, the error code is
   set to ESTEACCES and FALSE is returned.
   `max' limits the flush.  If it is positive, only the first `max' keys in sorted order are
   written and the remaining keys stay in the cache.  If it is not positive, every cached key
   is written.  Only a negative value additionally cleans the words recorded in the deletion
   cache (`outcc') out of the inverted index.
   The return value is TRUE on success, or when both the index cache and the deletion cache are
   empty.  On a database error the error code is set to ESTEDB, the fatal flag is raised and
   FALSE is returned. */
876 int est_db_flush(ESTDB *db, int max){
877 CBMAP *ids;
878 CBLIST *keys;
879 CBDATUM *nval;
880 const char *kbuf, *vbuf, *rp, *pv;
881 char *tbuf;
882 int i, err, ksiz, vsiz, rnum, id, tsiz;
883 assert(db);
884 if(!dpwritable(db->metadb)){
885 db->ecode = ESTEACCES;
886 return FALSE;
887 }
888 if(cbmaprnum(db->idxcc) < 1 && cbmaprnum(db->outcc) < 1) return TRUE;
889 err = FALSE;
/* take a snapshot of the cached keys so that the cache can be modified while writing */
890 keys = cblistopen();
891 cbmapiterinit(db->idxcc);
892 while((kbuf = cbmapiternext(db->idxcc, &ksiz)) != NULL){
893 cblistpush(keys, kbuf, ksiz);
894 }
/* the key count before truncation; used below to size the recreated cache */
895 rnum = CB_LISTNUM(keys);
896 cblistsort(keys);
897 if(max > 0){
/* drop keys from the tail of the sorted list until at most `max' remain */
898 while(CB_LISTNUM(keys) > max){
899 free(cblistpop(keys, NULL));
900 }
901 }
/* move each selected posting list from the cache into the inverted index */
902 for(i = 0; i < CB_LISTNUM(keys); i++){
903 kbuf = CB_LISTVAL2(keys, i, &ksiz);
904 vbuf = cbmapget(db->idxcc, kbuf, ksiz, &vsiz);
905 if(!est_idx_add(db->idxdb, kbuf, ksiz, vbuf, vsiz)){
906 err = TRUE;
907 break;
908 }
909 cbmapout(db->idxcc, kbuf, ksiz);
910 db->icsiz -= vsiz;
911 if(i % ESTCCCBFREQ == 0) est_db_inform(db, "flushing index words");
912 }
/* register the same keys in the forward matching database; a failure whose code is DP_EKEEP
   (the key is already stored) is not treated as an error */
913 for(i = 0; i < CB_LISTNUM(keys); i++){
914 kbuf = CB_LISTVAL2(keys, i, &ksiz);
915 if(!vlput(db->fwmdb, kbuf, ksiz, "", 0, VL_DKEEP) && dpecode != DP_EKEEP){
916 err = TRUE;
917 break;
918 }
919 if(i % ESTCCCBFREQ == 0) est_db_inform(db, "flushing fwm keys");
920 }
921 cblistclose(keys);
/* once the cache is completely drained, recreate it with a bucket number derived from the
   number of keys seen in this call */
922 if(cbmaprnum(db->idxcc) < 1){
923 cbmapclose(db->idxcc);
924 db->idxcc = cbmapopenex(rnum > ESTIDXCCBNUM ? rnum * 1.5 : ESTIDXCCBNUM);
925 }
926 if(max < 0 && cbmaprnum(db->outcc) > 0){
/* keys of `outcc' that start with a tab carry the decimal ID of a deleted document and the
   other keys are words of deleted documents (both are stored by est_db_out_doc) */
927 ids = cbmapopen();
928 keys = cblistopen();
929 cbmapiterinit(db->outcc);
930 while((kbuf = cbmapiternext(db->outcc, &ksiz)) != NULL){
931 if(*kbuf == '\t'){
932 id = atoi(kbuf + 1);
933 cbmapput(ids, (char *)&id, sizeof(int), "", 0, FALSE);
934 } else {
935 cblistpush(keys, kbuf, ksiz);
936 }
937 }
938 cblistsort(keys);
939 for(i = 0; i < CB_LISTNUM(keys); i++){
940 if(i % (ESTIDXDBLRM * 4) == 0) est_idx_set_current(db->idxdb);
941 kbuf = CB_LISTVAL2(keys, i, &ksiz);
942 if((tbuf = est_idx_get(db->idxdb, kbuf, ksiz, &tsiz)) != NULL){
943 nval = cbdatumopen("", 0);
944 rp = tbuf;
/* walk the records of this word: 5 leading bytes, then 2-byte units up to a zero byte, then
   the zero byte itself.  est_db_put_doc writes the document ID, one score byte, 2-byte pairs
   and a 0x00 terminator in that order, so the 5 presumably stands for sizeof(int) + 1
   -- TODO confirm for platforms where int is not 4 bytes */
945 while(rp < tbuf + tsiz){
946 pv = rp;
947 rp += 5;
948 while(*rp != 0x0){
949 rp += 2;
950 }
951 rp++;
/* keep the record only if its leading document ID is not in the set of deleted IDs */
952 if(!cbmapget(ids, pv, sizeof(int), NULL)) cbdatumcat(nval, pv, rp - pv);
953 }
/* replace the stored value by the filtered one; with nothing left, the word is also removed
   from the forward matching database */
954 if(!est_idx_out(db->idxdb, kbuf, ksiz) && dpecode != DP_ENOITEM) err = TRUE;
955 if(CB_DATUMSIZE(nval) > 0){
956 if(!est_idx_add(db->idxdb, kbuf, ksiz, CB_DATUMPTR(nval), CB_DATUMSIZE(nval)))
957 err = TRUE;
958 } else {
959 if(!vlout(db->fwmdb, kbuf, ksiz) && dpecode != DP_ENOITEM) err = TRUE;
960 }
961 cbdatumclose(nval);
962 free(tbuf);
963 }
964 cbmapout(db->outcc, kbuf, ksiz);
965 if(i % ESTCCCBFREQ == 0) est_db_inform(db, "cleaning dispensable keys");
966 }
/* NOTE(review): the value assigned to `rnum' here is not read again in this function */
967 rnum = cbmaprnum(ids);
968 cblistclose(keys);
969 cbmapclose(ids);
970 cbmapclose(db->outcc);
971 db->outcc = cbmapopenex(ESTOUTCCBNUM);
972 }
/* the keyword cache is discarded and recreated whenever execution reaches this point,
   whether or not an error occurred above */
973 cbmapclose(db->keycc);
974 db->keycc = cbmapopenex(ESTKEYCCMNUM + 1);
975 db->kcmnum = ESTKEYCCMNUM;
976 if(err){
977 db->ecode = ESTEDB;
978 db->fatal = TRUE;
979 return FALSE;
980 }
981 return TRUE;
982 }
983
984
985 /* Synchronize updating contents of a database. */
986 int est_db_sync(ESTDB *db){
987 int err;
988 assert(db);
989 if(!dpwritable(db->metadb)){
990 db->ecode = ESTEACCES;
991 return FALSE;
992 }
993 err = FALSE;
994 if(!est_db_flush(db, -1) || !est_db_write_meta(db)) err = TRUE;
995 est_db_inform(db, "synchronizing the database for meta information");
996 if(!dpsync(db->metadb)) err = TRUE;
997 est_db_inform(db, "synchronizing the inverted index");
998 if(!est_idx_sync(db->idxdb)) err = TRUE;
999 est_db_inform(db, "synchronizing the database for forward matching");
1000 if(!vlsync(db->fwmdb)) err = TRUE;
1001 est_db_inform(db, "synchronizing the database for attrutes");
1002 if(!crsync(db->attrdb)) err = TRUE;
1003 est_db_inform(db, "synchronizing the database for texts");
1004 if(!crsync(db->textdb)) err = TRUE;
1005 est_db_inform(db, "synchronizing the database for document list");
1006 if(!vlsync(db->listdb)) err = TRUE;
1007 if(err){
1008 db->ecode = ESTEDB;
1009 db->fatal = TRUE;
1010 }
1011 return err ? FALSE : TRUE;
1012 }
1013
1014
1015 /* Optimize a database. */
1016 int est_db_optimize(ESTDB *db, int options){
1017 CBMAP *dmap;
1018 CBLIST *words;
1019 CBDATUM *nval;
1020 const char *word, *rp, *pv;
1021 char *kbuf, *vbuf;
1022 int i, err, id, ksiz, vsiz, wsiz;
1023 assert(db);
1024 if(!dpwritable(db->metadb)){
1025 db->ecode = ESTEACCES;
1026 return FALSE;
1027 }
1028 if(!est_db_flush(db, -1)) return FALSE;
1029 err = FALSE;
1030 if(!(options & ESTOPTNOPURGE)){
1031 dmap = cbmapopenex(vlrnum(db->listdb) + 1);
1032 vlcurfirst(db->listdb);
1033 while((vbuf = vlcurval(db->listdb, NULL)) != NULL){
1034 id = atoi(vbuf);
1035 cbmapput(dmap, (char *)&id, sizeof(int), "", 0, FALSE);
1036 free(vbuf);
1037 vlcurnext(db->listdb);
1038 }
1039 words = cblistopen();
1040 vlcurfirst(db->fwmdb);
1041 while((kbuf = vlcurkey(db->fwmdb, &ksiz)) != NULL){
1042 cblistpushbuf(words, kbuf, ksiz);
1043 vlcurnext(db->fwmdb);
1044 }
1045 for(i = 0; i < CB_LISTNUM(words); i++){
1046 if(i % (ESTIDXDBLRM * 4) == 0) est_idx_set_current(db->idxdb);
1047 word = CB_LISTVAL2(words, i, &wsiz);
1048 if((vbuf = est_idx_get(db->idxdb, word, wsiz, &vsiz)) != NULL){
1049 nval = cbdatumopen("", 0);
1050 rp = vbuf;
1051 while(rp < vbuf + vsiz){
1052 pv = rp;
1053 rp += 5;
1054 while(*rp != 0x0){
1055 rp += 2;
1056 }
1057 rp++;
1058 if(cbmapget(dmap, pv, sizeof(int), NULL)) cbdatumcat(nval, pv, rp - pv);
1059 }
1060 if(!est_idx_out(db->idxdb, word, wsiz)) err = TRUE;
1061 if(CB_DATUMSIZE(nval) > 0){
1062 if(!est_idx_add(db->idxdb, word, wsiz, CB_DATUMPTR(nval), CB_DATUMSIZE(nval)))
1063 err = TRUE;
1064 } else {
1065 if(!vlout(db->fwmdb, word, wsiz)) err = TRUE;
1066 }
1067 cbdatumclose(nval);
1068 free(vbuf);
1069 } else {
1070 err = TRUE;
1071 }
1072 free(kbuf);
1073 if(i % ESTCCCBFREQ == 0) est_db_inform(db, "cleaning dispensable keys");
1074 }
1075 cblistclose(words);
1076 cbmapclose(dmap);
1077 }
1078 if(!(options & ESTOPTNODBOPT)){
1079 est_db_inform(db, "optimizing the inverted index");
1080 if(!est_idx_optimize(db->idxdb)) err = TRUE;
1081 est_db_inform(db, "optimizing the database for forward matching");
1082 if(!vloptimize(db->fwmdb)) err = TRUE;
1083 est_db_inform(db, "optimizing the database for attrutes");
1084 if(!croptimize(db->attrdb, -1)) err = TRUE;
1085 est_db_inform(db, "optimizing the database for texts");
1086 if(!croptimize(db->textdb, -1)) err = TRUE;
1087 est_db_inform(db, "optimizing the database for document list");
1088 if(!vloptimize(db->listdb)) err = TRUE;
1089 }
1090 if(err){
1091 db->ecode = ESTEDB;
1092 db->fatal = TRUE;
1093 }
1094 return err ? FALSE : TRUE;
1095 }
1096
1097
1098 /* Add a document to a database.
   `db' specifies a database object.  If its meta database is not writable, the error code is
   set to ESTEACCES and FALSE is returned.
   `doc' specifies a document object.  It must have an attribute map containing ESTDATTRURI,
   otherwise the error code is set to ESTEINVAL and FALSE is returned.  The object is modified:
   a new ID is assigned to it and stored as the attribute ESTDATTRID, and an empty text list is
   created if it has none.
   `options' specifies bit flags; ESTPDCLEAN makes the removal of an already registered
   document with the same URI use ESTODCLEAN.
   The return value is TRUE on success, else FALSE. */
1099 int est_db_put_doc(ESTDB *db, ESTDOC *doc, int options){
1100 CBMAP *ocmap, *fmap, *qmap;
1101 CBLIST *words;
1102 CBDATUM *ocbuf;
1103 const char *uri, *text, *word, *fnext, *snext, *kbuf, *vbuf;
1104 unsigned char junc[2], c;
1105 char wbuf[ESTWORDMAXLEN+3], *sbuf, *zbuf, nbuf[ESTNUMBUFSIZ];
1106 int i, j, id, err, wnum, wsiz, fnsiz, snsiz, *np, num, ksiz, vsiz, ssiz, zsiz;
1107 double tune;
1108 assert(db && doc);
1109 if(!dpwritable(db->metadb)){
1110 db->ecode = ESTEACCES;
1111 return FALSE;
1112 }
1113 if(!doc->attrs || !(uri = cbmapget(doc->attrs, ESTDATTRURI, -1, NULL))){
1114 db->ecode = ESTEINVAL;
1115 return FALSE;
1116 }
/* a document already registered under the same URI is removed before the new one is added */
1117 if((id = est_db_uri_to_id(db, uri)) > 0 &&
1118 !est_db_out_doc(db, id, (options & ESTPDCLEAN) ? ESTODCLEAN : 0)) return FALSE;
1119 if(!doc->dtexts) doc->dtexts = cblistopen();
1120 doc->id = ++(db->dseq);
1121 sprintf(nbuf, "%d", doc->id);
1122 cbmapput(doc->attrs, ESTDATTRID, -1, nbuf, -1, TRUE);
/* ocmap: word -> concatenated 2-byte junction codes; fmap: word -> accumulated points;
   qmap: set of "word, tab, junction code" combinations that were already seen */
1123 ocmap = cbmapopen();
1124 fmap = cbmapopen();
1125 qmap = cbmapopen();
1126 wnum = 0;
/* index -1 stands for the attribute whose name is the empty string; it is analyzed like a
   text when present.  The remaining indexes walk the text list */
1127 for(i = -1; i < CB_LISTNUM(doc->dtexts); i++){
1128 if(i < 0){
1129 if(!(text = cbmapget(doc->attrs, "", 0, NULL))) continue;
1130 } else {
1131 text = CB_LISTVAL(doc->dtexts, i, NULL);
1132 }
1133 words = cblistopen();
1134 switch(db->amode){
1135 case ESTAMPERFNG:
1136 est_break_text_perfng(text, words, FALSE, TRUE);
1137 break;
1138 default:
1139 est_break_text(text, words, FALSE, TRUE);
1140 break;
1141 }
1142 wnum += CB_LISTNUM(words);
1143 for(j = 0; j < CB_LISTNUM(words); j++){
1144 word = CB_LISTVAL2(words, j, &wsiz);
/* longer words would not fit into `wbuf' together with the three trailing bytes */
1145 if(wsiz > ESTWORDMAXLEN) continue;
/* the junction code is built from hashes of the next word and the one after it;
   0xff marks a missing neighbor */
1146 fnext = cblistval(words, j + 1, &fnsiz);
1147 snext = cblistval(words, j + 2, &snsiz);
1148 junc[0] = fnext ? dpinnerhash(fnext, fnsiz) % ESTJHASHNUM + 1: 0xff;
1149 junc[1] = snext ? dpouterhash(snext, snsiz) % ESTJHASHNUM + 1: 0xff;
1150 memcpy(wbuf, word, wsiz);
1151 memcpy(wbuf + wsiz, "\t", 1);
1152 memcpy(wbuf + wsiz + 1, junc, 2);
/* every occurrence adds ESTOCPOINT to the points of the word */
1153 np = (int *)cbmapget(fmap, word, wsiz, NULL);
1154 num = np ? *(int *)np : 0;
1155 num += ESTOCPOINT;
1156 cbmapput(fmap, word, wsiz, (char *)&num, sizeof(int), TRUE);
/* the junction code is appended only when the put into `qmap' (overwriting disabled) reports
   success, presumably meaning that this combination was not stored before -- TODO confirm
   against the cbmapput documentation */
1157 if(cbmapput(qmap, wbuf, wsiz + 3, "", 0, FALSE))
1158 cbmapputcat(ocmap, word, wsiz, (char *)junc, 2);
1159 }
1160 cblistclose(words);
1161 }
/* build one index record per word and append it to the index cache.  The record consists of
   the document ID, one score byte, the junction codes and a terminating zero byte */
1162 cbmapiterinit(ocmap);
1163 while((kbuf = cbmapiternext(ocmap, &ksiz)) != NULL){
1164 vbuf = cbmapget(ocmap, kbuf, ksiz, &vsiz);
1165 ocbuf = cbdatumopen("", 0);
1166 cbdatumcat(ocbuf, (char *)&(doc->id), sizeof(int));
1167 num = *(int *)cbmapget(fmap, kbuf, ksiz, NULL);
/* the points are divided by a factor that grows with the total number of words (never less
   than 4.0), compressed above 0x80 and 0xc0, and finally capped at 0xff to fit in one byte */
1168 tune = log(wnum + 3);
1169 tune = (tune * tune) / 10.0;
1170 num /= tune > 4.0 ? tune : 4.0;
1171 if(num >= 0x80) num += (0x80 - num) * 0.75;
1172 if(num >= 0xc0) num += (0xc0 - num) * 0.75;
1173 c = num < 0xff ? num : 0xff;
1174 cbdatumcat(ocbuf, (char *)&c, 1);
1175 cbdatumcat(ocbuf, vbuf, vsiz);
1176 c = 0x00;
1177 cbdatumcat(ocbuf, (char *)&c, 1);
1178 cbmapputcat(db->idxcc, kbuf, ksiz, CB_DATUMPTR(ocbuf), CB_DATUMSIZE(ocbuf));
1179 db->icsiz += CB_DATUMSIZE(ocbuf);
1180 cbdatumclose(ocbuf);
1181 }
1182 cbmapclose(qmap);
1183 cbmapclose(fmap);
1184 cbmapclose(ocmap);
1185 err = FALSE;
/* store the serialized attributes under the document ID */
1186 sbuf = cbmapdump(doc->attrs, &ssiz);
1187 if(!crput(db->attrdb, (char *)&(doc->id), sizeof(int), sbuf, ssiz, CR_DKEEP)){
1188 db->ecode = ESTEDB;
1189 db->fatal = TRUE;
1190 err = TRUE;
1191 }
1192 free(sbuf);
/* store the deflated text list under the document ID; if deflation fails, an empty value is
   stored instead and the failure is recorded as ESTEMISC with the fatal flag raised */
1193 sbuf = cblistdump(doc->dtexts, &ssiz);
1194 if(!(zbuf = est_deflate(sbuf, ssiz, &zsiz))){
1195 CB_MALLOC(zbuf, 1);
1196 zsiz = 0;
1197 db->ecode = ESTEMISC;
1198 db->fatal = TRUE;
1199 err = TRUE;
1200 }
1201 if(!crput(db->textdb, (char *)&(doc->id), sizeof(int), zbuf, zsiz, CR_DKEEP)){
1202 db->ecode = ESTEDB;
1203 db->fatal = TRUE;
1204 err = TRUE;
1205 }
1206 free(sbuf);
1207 free(zbuf);
/* register the URI with the decimal ID as its value */
1208 sprintf(nbuf, "%d", doc->id);
1209 if(!vlput(db->listdb, uri, -1, nbuf, -1, VL_DKEEP)){
1210 db->ecode = ESTEDB;
1211 db->fatal = TRUE;
1212 err = TRUE;
1213 }
/* NOTE(review): the document counter is incremented even when one of the stores above
   failed */
1214 db->dnum++;
/* flush everything once the used cache size exceeds the configured maximum */
1215 if(est_db_used_cache_size(db) > db->icmax){
1216 if(!est_db_flush(db, -1)) err = TRUE;
1217 est_idx_increment(db->idxdb);
1218 }
1219 return err ? FALSE : TRUE;
1220 }
1221
1222
1223 /* Remove a document from a database. */
1224 int est_db_out_doc(ESTDB *db, int id, int options){
1225 ESTDOC *doc;
1226 CBLIST *words;
1227 const char *uri, *text, *word;
1228 char numbuf[ESTNUMBUFSIZ];
1229 int i, j, len, wsiz;
1230 assert(db && id > 0);
1231 if(!dpwritable(db->metadb)){
1232 db->ecode = ESTEACCES;
1233 return FALSE;
1234 }
1235 if(!(doc = est_db_get_doc(db, id, 0))) return FALSE;
1236 if(!doc->attrs || !(uri = cbmapget(doc->attrs, ESTDATTRURI, -1, NULL))){
1237 est_doc_delete(doc);
1238 db->ecode = ESTEDB;
1239 db->fatal = TRUE;
1240 return FALSE;
1241 }
1242 if(!crout(db->attrdb, (char *)&id, sizeof(int)) ||
1243 !crout(db->textdb, (char *)&id, sizeof(int)) || !vlout(db->listdb, uri, -1)){
1244 est_doc_delete(doc);
1245 db->ecode = ESTEDB;
1246 db->fatal = TRUE;
1247 return FALSE;
1248 }
1249 cbmapout(db->attrcc, (char *)&id, sizeof(int));
1250 cbmapout(db->textcc, (char *)&id, sizeof(int));
1251 if(db->spacc) cbmapout(db->spacc, (char *)&id, sizeof(int));
1252 if((options & ESTODCLEAN) && doc->dtexts){
1253 len = sprintf(numbuf, "\t%d", doc->id);
1254 cbmapput(db->outcc, numbuf, len, "", 0, FALSE);
1255 for(i = -1; i < CB_LISTNUM(doc->dtexts); i++){
1256 if(i < 0){
1257 if(!(text = cbmapget(doc->attrs, "", 0, NULL))) continue;
1258 } else {
1259 text = CB_LISTVAL(doc->dtexts, i, NULL);
1260 }
1261 words = cblistopen();
1262 switch(db->amode){
1263 case ESTAMPERFNG:
1264 est_break_text_perfng(text, words, FALSE, TRUE);
1265 break;
1266 default:
1267 est_break_text(text, words, FALSE, TRUE);
1268 break;
1269 }
1270 for(j = 0; j < CB_LISTNUM(words); j++){
1271 word = CB_LISTVAL2(words, j, &wsiz);
1272 cbmapput(db->outcc, word, wsiz, "", 0, FALSE);
1273 }
1274 cblistclose(words);
1275 }
1276 }
1277 est_doc_delete(doc);
1278 db->dnum--;
1279 return TRUE;
1280 }
1281
1282
1283 /* Retrieve a document in a database. */
1284 ESTDOC *est_db_get_doc(ESTDB *db, int id, int options){
1285 ESTDOC *doc;
1286 const char *cbuf;
1287 char *vbuf, *zbuf;
1288 int i, csiz, vsiz, zsiz, num;
1289 assert(db && id > 0);
1290 cbuf = NULL;
1291 if(options & ESTGDNOATTR){
1292 if(!crvsiz(db->attrdb, (char *)&id, sizeof(int))){
1293 if(dpecode == DP_ENOITEM){
1294 db->ecode = ESTENOITEM;
1295 return NULL;
1296 } else {
1297 db->ecode = ESTEDB;
1298 db->fatal = TRUE;
1299 return NULL;
1300 }
1301 }
1302 vbuf = NULL;
1303 } else if((cbuf = cbmapget(db->attrcc, (char *)&id, sizeof(int), &csiz)) != NULL){
1304 cbmapmove(db->attrcc, (char *)&id, sizeof(int), FALSE);
1305 vbuf = NULL;
1306 } else if(!(vbuf = crget(db->attrdb, (char *)&id, sizeof(int), 0, -1, &vsiz))){
1307 if(dpecode == DP_ENOITEM){
1308 db->ecode = ESTENOITEM;
1309 return NULL;
1310 } else {
1311 db->ecode = ESTEDB;
1312 db->fatal = TRUE;
1313 return NULL;
1314 }
1315 }
1316 doc = est_doc_new();
1317 doc->id = id;
1318 if(cbuf){
1319 doc->attrs = cbmapload(cbuf, csiz);
1320 } else if(vbuf){
1321 doc->attrs = cbmapload(vbuf, vsiz);
1322 cbmapputvbuf(db->attrcc, (char *)&id, sizeof(int), vbuf, vsiz);
1323 if(cbmaprnum(db->attrcc) > db->acmnum){
1324 num = cbmaprnum(db->attrcc) * 0.1 + 1;
1325 cbmapiterinit(db->attrcc);
1326 for(i = 0; i < num && (cbuf = cbmapiternext(db->attrcc, NULL)) != NULL; i++){
1327 cbmapout(db->attrcc, cbuf, sizeof(int));
1328 }
1329 }
1330 } else {
1331 doc->attrs = NULL;
1332 }
1333 if(!(options & ESTGDNOTEXT)){
1334 if((cbuf = cbmapget(db->textcc, (char *)&id, sizeof(int), &csiz)) != NULL){
1335 cbmapmove(db->textcc, (char *)&id, sizeof(int), FALSE);
1336 doc->dtexts = cblistload(cbuf, csiz);
1337 } else {
1338 if(!(zbuf = crget(db->textdb, (char *)&id, sizeof(int), 0, -1, &zsiz))){
1339 db->ecode = ESTEDB;
1340 db->fatal = TRUE;
1341 est_doc_delete(doc);
1342 return NULL;
1343 }
1344 if(!(vbuf = est_inflate(zbuf, zsiz, &vsiz))){
1345 db->ecode = ESTEDB;
1346 db->fatal = TRUE;
1347 free(zbuf);
1348 est_doc_delete(doc);
1349 return NULL;
1350 }
1351 doc->dtexts = cblistload(vbuf, vsiz);
1352 cbmapputvbuf(db->textcc, (char *)&id, sizeof(int), vbuf, vsiz);
1353 if(cbmaprnum(db->textcc) > db->tcmnum){
1354 num = cbmaprnum(db->textcc) * 0.1 + 1;
1355 cbmapiterinit(db->textcc);
1356 for(i = 0; i < num &&(cbuf = cbmapiternext(db->textcc, NULL)) != NULL; i++){
1357 cbmapout(db->textcc, cbuf, sizeof(int));
1358 }
1359 }
1360 free(zbuf);
1361 }
1362 }
1363 return doc;
1364 }
1365
1366
1367 /* Retrieve the value of an attribute of a document in a database. */
1368 char *est_db_get_doc_attr(ESTDB *db, int id, const char *name){
1369 const char *cbuf;
1370 char *mbuf, *vbuf;
1371 int cb, csiz, msiz, vsiz;
1372 assert(db && id > 0 && name);
1373 cb = db->spacc && !strcmp(name, db->scname);
1374 if(cb && (cbuf = cbmapget(db->spacc, (char *)&id, sizeof(int), &csiz)) != NULL){
1375 cbmapmove(db->spacc, (char *)&id, sizeof(int), FALSE);
1376 return cbmemdup(cbuf, csiz);
1377 }
1378 if(!(mbuf = crget(db->attrdb, (char *)&id, sizeof(int), 0, -1, &msiz))){
1379 db->ecode = dpecode == DP_ENOITEM ? ESTENOITEM : ESTEDB;
1380 return NULL;
1381 }
1382 if(!(vbuf = cbmaploadone(mbuf, msiz, name, -1, &vsiz))){
1383 db->ecode = ESTENOITEM;
1384 free(mbuf);
1385 return NULL;
1386 }
1387 if(cb) cbmapput(db->spacc, (char *)&id, sizeof(int), vbuf, vsiz, FALSE);
1388 free(mbuf);
1389 return vbuf;
1390 }
1391
1392
1393 /* Get the ID of a document spacified by URI. */
1394 int est_db_uri_to_id(ESTDB *db, const char *uri){
1395 char *vbuf;
1396 int id;
1397 assert(db && uri);
1398 if(!(vbuf = vlget(db->listdb, uri, -1, NULL))){
1399 db->ecode = ESTENOITEM;
1400 return -1;
1401 }
1402 id = atoi(vbuf);
1403 free(vbuf);
1404 return id;
1405 }
1406
1407
1408 /* Extract keywords of a document object. */
1409 CBMAP *est_db_etch_doc(ESTDB *db, ESTDOC *doc, int max){
1410 ESTKEYSC *scores;
1411 CBMAP *keys, *umap;
1412 CBLIST *words;
1413 const char *text, *word, *vbuf;
1414 char numbuf[ESTNUMBUFSIZ];
1415 int i, wsiz, num, smax, snum, vsiz;
1416 assert(doc && max >= 0);
1417 if(!doc->dtexts) return cbmapopenex(1);
1418 keys = cbmapopenex(max * 1.5);
1419 words = cblistopen();
1420 for(i = -1; i < CB_LISTNUM(doc->dtexts); i++){
1421 if(i < 0){
1422 if(!doc->attrs || !(text = cbmapget(doc->attrs, "", 0, NULL))) continue;
1423 } else {
1424 text = CB_LISTVAL(doc->dtexts, i, NULL);
1425 }
1426 if(db){
1427 switch(db->amode){
1428 case ESTAMPERFNG:
1429 est_break_text_perfng(text, words, FALSE, TRUE);
1430 break;
1431 default:
1432 est_break_text(text, words, FALSE, TRUE);
1433 break;
1434 }
1435 } else {
1436 est_break_text(text, words, FALSE, TRUE);
1437 }
1438 }
1439 umap = cbmapopenex(CB_LISTNUM(words) + 1);
1440 for(i = 0; i < CB_LISTNUM(words); i++){
1441 word = CB_LISTVAL2(words, i, &wsiz);
1442 if(wsiz > ESTWORDMAXLEN) continue;
1443 num = (vbuf = cbmapget(umap, word, wsiz, NULL)) ? *(int *)vbuf + 1 : 1;
1444 cbmapput(umap, word, wsiz, (char *)&num, sizeof(int), TRUE);
1445 }
1446 smax = max * (db ? ESTKEYSCALW : 1) + 1;
1447 CB_MALLOC(scores, cbmaprnum(umap) * sizeof(ESTKEYSC) + 1);
1448 snum = 0;
1449 cbmapiterinit(umap);
1450 for(i = 0; i < smax && (word = cbmapiternext(umap, &wsiz)) != NULL; i++){
1451 scores[snum].word = word;
1452 scores[snum].wsiz = wsiz;
1453 scores[snum].pt = (vbuf = cbmapget(umap, word, wsiz, NULL)) ? *(int *)vbuf : 0;
1454 snum++;
1455 }
1456 qsort(scores, snum, sizeof(ESTKEYSC), est_keysc_compare);
1457 if(db){
1458 for(i = 0; i < snum; i++){
1459 if((vbuf = cbmapget(db->keycc, scores[i].word, scores[i].wsiz, NULL)) != NULL){
1460 cbmapmove(db->keycc, scores[i].word, scores[i].wsiz, FALSE);
1461 vsiz = *(int*)vbuf;
1462 } else {
1463 vsiz = est_idx_vsiz(db->idxdb, scores[i].word, scores[i].wsiz);
1464 cbmapput(db->keycc, scores[i].word, scores[i].wsiz, (char *)&vsiz, sizeof(int), FALSE);
1465 }
1466 scores[i].pt *= 400000.0 / (vsiz + 64);
1467 }
1468 if(db->kcmnum >= 0 && cbmaprnum(db->keycc) > db->kcmnum){
1469 num = db->kcmnum * 0.1 + 1;
1470 cbmapiterinit(db->keycc);
1471 for(i = 0; i < num && (word = cbmapiternext(db->keycc, &wsiz)) != NULL; i++){
1472 cbmapout(db->keycc, word, wsiz);
1473 }
1474 }
1475 qsort(scores, snum, sizeof(ESTKEYSC), est_keysc_compare);
1476 }
1477 for(i = 0; i < snum && i < max; i++){
1478 vsiz = sprintf(numbuf, "%d", scores[i].pt);
1479 cbmapput(keys, scores[i].word, scores[i].wsiz, numbuf, vsiz, FALSE);
1480 }
1481 free(scores);
1482 cbmapclose(umap);
1483 cblistclose(words);
1484 return keys;
1485 }
1486
1487
1488 /* Initialize the iterator of a database. */
1489 int est_db_iter_init(ESTDB *db){
1490 assert(db);
1491 return vlcurfirst(db->listdb);
1492 }
1493
1494
1495 /* Get the next ID of the iterator of a database. */
1496 int est_db_iter_next(ESTDB *db){
1497 char *vbuf;
1498 int id;
1499 assert(db);
1500 if(!(vbuf = vlcurval(db->listdb, NULL))){
1501 if(dpecode == DP_ENOITEM){
1502 db->ecode = ESTENOITEM;
1503 return 0;
1504 } else {
1505 db->ecode = ESTEDB;
1506 db->fatal = TRUE;
1507 return -1;
1508 }
1509 }
1510 id = atoi(vbuf);
1511 free(vbuf);
1512 vlcurnext(db->listdb);
1513 return id;
1514 }
1515
1516
1517 /* Get the name of a database. */
1518 const char *est_db_name(ESTDB *db){
1519 assert(db);
1520 return db->name;
1521 }
1522
1523
1524 /* Get the number of documents in a database. */
1525 int est_db_doc_num(ESTDB *db){
1526 assert(db);
1527 return db->dnum;
1528 }
1529
1530
1531 /* Get the number of words in a database. */
1532 int est_db_word_num(ESTDB *db){
1533 assert(db);
1534 return vlrnum(db->fwmdb);
1535 }
1536
1537
1538 /* Get the size of a database. */
1539 double est_db_size(ESTDB *db){
1540 assert(db);
1541 return dpfsiz(db->metadb) + est_idx_size(db->idxdb) + vlfsiz(db->fwmdb) +
1542 crfsizd(db->attrdb) + crfsizd(db->textdb) + vlfsiz(db->listdb);
1543 }
1544
1545
1546 /* Search documents corresponding a condition for a database.
   `db' specifies a database object.
   `cond' specifies a condition object.  If its `scfb' member is set, its `scores' array is
   reallocated and filled with the scores of the result, and `snum' is set.
   `nump' specifies the pointer to a variable to which the number of elements in the result
   is assigned.
   `hints' specifies a map object handed to the per-term search functions; if it is not NULL,
   the number of hits before the `max' limit is applied is put under the empty key.
   The return value is an array, allocated with CB_MALLOC, of the IDs of the corresponding
   documents.  Nothing in this function keeps a reference to it, so the caller presumably has
   to free it -- TODO confirm against the header documentation.  If the result is empty, the
   error code is set to ESTENOITEM. */
1547 int *est_db_search(ESTDB *db, ESTCOND *cond, int *nump, CBMAP *hints){
1548 ESTSCORE *scores, *tscores;
1549 CBMAP *svmap;
1550 CBLIST *terms;
1551 const char *term, *rp;
1552 char *tmp, numbuf[ESTNUMBUFSIZ];
1553 int i, j, snum, pcnum, ncnum, tsnum, add, nnum, id, score, hnum, len, *rval;
1554 double tune;
1555 assert(db && cond && nump);
1556 scores = NULL;
1557 snum = 0;
/* a phrase starting with ESTOPSIMILAR requests a similarity search; the rest of the phrase,
   after leading bytes in the range 0x01 to 0x20 are skipped, is parsed into a vector */
1558 if(cond->phrase && cond->phrase[0] == ESTOPSIMILAR[0] &&
1559 cbstrfwmatch(cond->phrase, ESTOPSIMILAR)){
1560 rp = cond->phrase;
1561 rp += strlen(ESTOPSIMILAR);
1562 while(*rp > '\0' && *rp <= ' '){
1563 rp++;
1564 }
1565 svmap = est_phrase_vector(rp);
1566 scores = est_search_similar(db, svmap, &snum, ESTSMLRKNUM, ESTSMLRUNUM, cond->tfidf,
1567 cond->order ? ESTSMLRNMIN : 0.0);
1568 cbmapclose(svmap);
1569 } else if(cond->phrase){
/* an ordinary phrase is split into terms; with the `simple' flag it is first converted by
   est_phrase_from_thumb */
1570 if(cond->simple){
1571 tmp = est_phrase_from_thumb(cond->phrase);
1572 terms = est_phrase_terms(tmp);
1573 free(tmp);
1574 } else {
1575 terms = est_phrase_terms(cond->phrase);
1576 }
/* pcnum counts the terms searched in adding mode and ncnum those searched in excluding
   mode; `add' holds the mode selected by the most recent operator term */
1577 pcnum = 0;
1578 ncnum = 0;
1579 add = TRUE;
1580 for(i = 0; i < CB_LISTNUM(terms); i++){
1581 term = CB_LISTVAL(terms, i, NULL);
1582 if(!strcmp(term, ESTOPISECT)){
1583 add = TRUE;
1584 } else if(!strcmp(term, ESTOPDIFF)){
1585 add = FALSE;
1586 } else {
1587 if(!strcmp(term, ESTOPUVSET)){
1588 tscores = est_search_uvset(db, &tsnum, hints, add);
1589 } else {
1590 tscores = est_search_union(db, term, cond->gstep, &tsnum, hints, add);
1591 }
1592 if(add){
/* with TF-IDF tuning, scores are scaled by 10000 over the cube of log(hits + 3), the divisor
   being at least 8.0 */
1593 if(cond->tfidf){
1594 tune = log(tsnum + 3);
1595 tune = tune * tune * tune;
1596 if(tune < 8.0) tune = 8.0;
1597 for(j = 0; j < tsnum; j++){
1598 tscores[j].score *= 10000 / tune;
1599 }
1600 }
1601 pcnum++;
1602 } else {
1603 ncnum++;
1604 }
/* results of later terms are appended to the accumulated array; results of an excluding
   term are marked with the score -1.  The result of the first term is taken over as is */
1605 if(scores){
1606 CB_REALLOC(scores, (snum + tsnum) * sizeof(ESTSCORE) + 1);
1607 for(j = 0; j < tsnum; j++){
1608 scores[snum+j].id = tscores[j].id;
1609 scores[snum+j].score = add ? tscores[j].score : -1;
1610 }
1611 snum += tsnum;
1612 free(tscores);
1613 } else {
1614 scores = tscores;
1615 snum = tsnum;
1616 }
1617 }
1618 }
1619 if(scores){
/* with more than one adding term or any excluding term, entries are grouped by ID.  A group
   survives only if none of its entries is negative and it has at least one entry per adding
   term; its scores are summed */
1620 if(pcnum > 1 || ncnum > 0){
1621 qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_id);
1622 nnum = 0;
1623 for(i = 0; i < snum; i++){
1624 id = scores[i].id;
1625 score = scores[i].score;
1626 hnum = score >= 0 ? 1 : 0;
1627 for(j = i + 1; j < snum && scores[j].id == id; j++){
1628 if(score >= 0 && scores[j].score >= 0){
1629 score += scores[j].score;
1630 hnum++;
1631 } else {
1632 score = -1;
1633 }
1634 }
1635 if(score >= 0 && hnum >= pcnum){
1636 scores[nnum].id = id;
1637 scores[nnum].score = score;
1638 nnum++;
1639 }
/* continue with the first entry of the next group */
1640 i = j - 1;
1641 }
1642 snum = nnum;
1643 }
1644 } else {
1645 CB_MALLOC(scores, 1);
1646 snum = 0;
1647 }
1648 cblistclose(terms);
1649 } else if(cond->attrs){
/* without a phrase but with attribute conditions, the search starts from the result of
   est_search_uvset */
1650 scores = est_search_uvset(db, &snum, hints, TRUE);
1651 } else {
1652 CB_MALLOC(scores, 1);
1653 snum = 0;
1654 }
/* drop documents whose ID is recorded in the deletion cache as a tab followed by the decimal
   ID (the form stored by est_db_out_doc) */
1655 if(cbmaprnum(db->outcc) > 0){
1656 tsnum = 0;
1657 for(i = 0; i < snum; i++){
1658 len = sprintf(numbuf, "\t%d", scores[i].id);
1659 if(cbmapget(db->outcc, numbuf, len, NULL)) continue;
1660 scores[tsnum++] = scores[i];
1661 }
1662 snum = tsnum;
1663 }
/* narrowing by attributes and ordering are delegated to est_narrow_scores; without an
   ordering expression the result is sorted with est_score_compare_by_score */
1664 if(cond->attrs || cond->order)
1665 snum = est_narrow_scores(db, cond->attrs, cond->order, scores, snum);
1666 if(!cond->order) qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_score);
1667 if(hints){
1668 sprintf(numbuf, "%d", snum);
1669 cbmapput(hints, "", 0, numbuf, -1, FALSE);
1670 }
/* a negative `max' means no limit */
1671 if(cond->max >= 0 && cond->max < snum) snum = cond->max;
1672 CB_MALLOC(rval, snum * sizeof(int) + 1);
1673 for(i = 0; i < snum; i++){
1674 rval[i] = scores[i].id;
1675 }
1676 if(cond->scfb){
1677 CB_REALLOC(cond->scores, snum * sizeof(int) + 1);
1678 for(i = 0; i < snum; i++){
1679 cond->scores[i] = scores[i].score;
1680 }
1681 cond->snum = snum;
1682 }
1683 *nump = snum;
1684 if(*nump < 1) db->ecode = ESTENOITEM;
1685 free(scores);
1686 return rval;
1687 }
1688
1689
1690 /* Set the maximum size of the cache memory of a database. */
1691 void est_db_set_cache_size(ESTDB *db, size_t size, int anum, int tnum){
1692 assert(db);
1693 if(dpwritable(db->metadb) && size > 0) db->icmax = size;
1694 if(anum > 0) db->acmnum = anum;
1695 if(tnum > 0) db->tcmnum = tnum;
1696 }
1697
1698
1699 /* Set the special cache for narrowing and sorting with document attributes. */
1700 void est_db_set_special_cache(ESTDB *db, const char *name, int num){
1701 assert(db && name && num >= 0);
1702 if(db->spacc){
1703 free(db->scname);
1704 cbmapclose(db->spacc);
1705 }
1706 db->spacc = cbmapopenex(num + 1);
1707 db->scmnum = num;
1708 db->scname = cbmemdup(name, -1);
1709 }
1710
1711
1712
1713 /*************************************************************************************************
1714 * features for experts
1715 *************************************************************************************************/
1716
1717
1718 /* Handle to the file of random number generator.
   The variable has external linkage and is NULL at program start. */
1719 FILE *est_random_ifp = NULL;
1720
1721
1722 /* Break a sentence of text and extract words. */
1723 void est_break_text(const char *text, CBLIST *list, int norm, int tail){
1724 CBLIST *words;
1725 const unsigned char *word, *next;
1726 unsigned char *utext;
1727 char *tmp;
1728 int i, j, k, size, cc, wsiz, nsiz, tsiz;
1729 assert(text);
1730 utext = (unsigned char *)est_uconv_in(text, strlen(text), &size);
1731 if(norm) est_normalize_text(utext, size, &size);
1732 est_canonicalize_text(utext, size, FALSE);
1733 words = cblistopen();
1734 for(i = 0; i < size; i += 2){
1735 cc = est_char_category(utext[i] * 0x100 + utext[i+1]);
1736 for(j = i + 2; j < size; j += 2){
1737 if(est_char_category(utext[j] * 0x100 + utext[j+1]) != cc) break;
1738 }
1739 switch(cc){
1740 case ESTDELIMCHR:
1741 case ESTWESTALPH:
1742 cblistpush(words, (char *)(utext + i), j - i);
1743 break;
1744 case ESTEASTALPH:
1745 for(k = i; k < j; k += 2){
1746 if(j - k >= 4){
1747 cblistpush(words, (char *)(utext + k), 4);
1748 } else {
1749 cblistpush(words, (char *)(utext + k), 2);
1750 }
1751 }
1752 break;
1753 default:
1754 break;
1755 }
1756 i = j - 2;
1757 }
1758 for(i = 0; i < CB_LISTNUM(words); i++){
1759 word = (unsigned char *)CB_LISTVAL2(words, i, &wsiz);
1760 if(est_char_category(word[0] * 0x100 + word[1]) == ESTEASTALPH && wsiz == 2 &&
1761 i < CB_LISTNUM(words) - 1){
1762 next = (unsigned char *)cblistval(words, i + 1, &nsiz);
1763 if(nsiz > 4) nsiz = 4;
1764 if(est_char_category(next[0] * 0x100 + next[1]) == ESTEASTALPH && nsiz > 2) nsiz = 2;
1765 CB_MALLOC(tmp, wsiz + nsiz + 1);
1766 memcpy(tmp, word, wsiz);
1767 memcpy(tmp + wsiz, next, nsiz);
1768 cblistover(words, i, tmp, wsiz + nsiz);
1769 free(tmp);
1770 }
1771 }
1772 for(i = 0; i < CB_LISTNUM(words); i++){
1773 word = (unsigned char *)CB_LISTVAL2(words, i, &wsiz);
1774 if(!tail && wsiz == 2 && i == CB_LISTNUM(words) - 1){
1775 if(est_char_category(word[0] * 0x100 + word[1]) == ESTEASTALPH) continue;
1776 }
1777 tmp = est_uconv_out((char *)word, wsiz, &tsiz);
1778 cblistpushbuf(list, tmp, tsiz);
1779 }
1780 cblistclose(words);
1781 free(utext);
1782 }
1783
1784
1785 /* Break a sentence of text and extract words using perfect N-gram analyzer. */
1786 void est_break_text_perfng(const char *text, CBLIST *list, int norm, int tail){
1787 CBLIST *words;
1788 const unsigned char *word, *next;
1789 unsigned char *utext;
1790 char *tmp;
1791 int i, j, k, size, cc, wsiz, nsiz, tsiz;
1792 assert(text);
1793 utext = (unsigned char *)est_uconv_in(text, strlen(text), &size);
1794 if(norm) est_normalize_text(utext, size, &size);
1795 est_canonicalize_text(utext, size, FALSE);
1796 words = cblistopen();
1797 for(i = 0; i < size; i += 2){
1798 cc = est_char_category_perfng(utext[i] * 0x100 + utext[i+1]);
1799 for(j = i + 2; j < size; j += 2){
1800 if(est_char_category_perfng(utext[j] * 0x100 + utext[j+1]) != cc) break;
1801 }
1802 switch(cc){
1803 case ESTEASTALPH:
1804 for(k = i; k < j; k += 2){
1805 if(j - k >= 4){
1806 cblistpush(words, (char *)(utext + k), 4);
1807 } else {
1808 cblistpush(words, (char *)(utext + k), 2);
1809 }
1810 }
1811 break;
1812 default:
1813 break;
1814 }
1815 i = j - 2;
1816 }
1817 for(i = 0; i < CB_LISTNUM(words); i++){
1818 word = (unsigned char *)CB_LISTVAL2(words, i, &wsiz);
1819 if(est_char_category_perfng(word[0] * 0x100 + word[1]) == ESTEASTALPH && wsiz == 2 &&
1820 i < CB_LISTNUM(words) - 1){
1821 next = (unsigned char *)cblistval(words, i + 1, &nsiz);
1822 if(nsiz > 4) nsiz = 4;
1823 if(est_char_category_perfng(next[0] * 0x100 + next[1]) == ESTEASTALPH && nsiz > 2) nsiz = 2;
1824 CB_MALLOC(tmp, wsiz + nsiz + 1);
1825 memcpy(tmp, word, wsiz);
1826 memcpy(tmp + wsiz, next, nsiz);
1827 cblistover(words, i, tmp, wsiz + nsiz);
1828 free(tmp);
1829 }
1830 }
1831 for(i = 0; i < CB_LISTNUM(words); i++){
1832 word = (unsigned char *)CB_LISTVAL2(words, i, &wsiz);
1833 if(!tail && wsiz == 2 && i == CB_LISTNUM(words) - 1){
1834 if(est_char_category_perfng(word[0] * 0x100 + word[1]) == ESTEASTALPH) continue;
1835 }
1836 tmp = est_uconv_out((char *)word, wsiz, &tsiz);
1837 cblistpushbuf(list, tmp, tsiz);
1838 }
1839 cblistclose(words);
1840 free(utext);
1841 }
1842
1843
/* Replace an encoding alias by the name handed to iconv_open.
   `code' specifies the name of an encoding.
   The return value is the substituted name for "x-sjis", "x-ujis", "x-euc-jp" and
   "windows-31j" (compared case-insensitively after a check of the first characters), or
   `code' itself for any other name. */
static const char *est_iconv_alias(const char *code){
  assert(code);
  if(code[0] == 'x' && code[1] == '-'){
    if(!cbstricmp(code, "x-sjis")) return "Shift_JIS";
    if(!cbstricmp(code, "x-ujis")) return "EUC-JP";
    if(!cbstricmp(code, "x-euc-jp")) return "EUC-JP";
  } else if(code[0] == 'w' || code[0] == 'W'){
    if(!cbstricmp(code, "windows-31j")) return "CP932";
  }
  return code;
}


/* Convert the character encoding of a string.
   `ptr' specifies the pointer to a region to be converted.
   `size' specifies the size of the region.  If it is negative, strlen(ptr) is used.
   `icode' specifies the name of the encoding of the input.
   `ocode' specifies the name of the encoding of the output.
   `sp' specifies the pointer to a variable to which the size of the result is assigned.  If
   it is NULL, it is not used.
   `mp' specifies the pointer to a variable to which the number of skipped input bytes is
   assigned.  If it is NULL, it is not used.
   The return value is an allocated buffer holding the result, terminated by a zero byte, or
   NULL if the converter cannot be opened or closed.
   Input bytes 0x5c and 0x7e that the converter rejects are copied to the output unchanged.
   Any other byte rejected as an illegal or incomplete sequence is skipped and counted.  The
   conversion stops early on any other error, such as a full output buffer. */
char *est_iconv(const char *ptr, int size,
                const char *icode, const char *ocode, int *sp, int *mp){
  iconv_t ic;
  char *obuf, *wp, *rp;
  size_t isiz, osiz;
  int miss;
  assert(ptr && icode && ocode);
  if(size < 0) size = strlen(ptr);
  icode = est_iconv_alias(icode);
  ocode = est_iconv_alias(ocode);
  if((ic = iconv_open(ocode, icode)) == (iconv_t)-1) return NULL;
  isiz = size;
  osiz = isiz * 5;
  /* one extra byte is reserved for the terminating zero */
  CB_MALLOC(obuf, osiz + 1);
  wp = obuf;
  rp = (char *)ptr;
  miss = 0;
  while(isiz > 0){
    if(iconv(ic, (void *)&rp, &isiz, &wp, &osiz) == (size_t)-1){
      if(errno == EILSEQ && (*rp == 0x5c || *rp == 0x7e)){
        /* the byte is written by hand, so the remaining room must be accounted for by hand
           as well; otherwise iconv would be told that more room is left than really is */
        if(osiz < 1) break;
        *wp = *rp;
        wp++;
        rp++;
        isiz--;
        osiz--;
      } else if(errno == EILSEQ || errno == EINVAL){
        rp++;
        isiz--;
        miss++;
      } else {
        break;
      }
    }
  }
  *wp = '\0';
  if(sp) *sp = wp - obuf;
  if(mp) *mp = miss;
  if(iconv_close(ic) == -1){
    free(obuf);
    return NULL;
  }
  return obuf;
}
1911
1912
1913 /* Detect the encoding of a string automatically. */
1914 const char *est_enc_name(const char *ptr, int size, int plang){
1915 const char *hypo;
1916 int i, miss, cr;
1917 assert(ptr);
1918 if(size < 0) size = strlen(ptr);
1919 if(size > ESTICCHECKSIZ) size = ESTICCHECKSIZ;
1920 if(size >= 2 && (!memcmp(ptr, "\xfe\xff", 2) || !memcmp(ptr, "\xff\xfe", 2))) return "UTF-16";
1921 for(i = 0; i < size - 1; i += 2){
1922 if(ptr[i] == 0 && ptr[i+1] != 0) return "UTF-16BE";
1923 if(ptr[i+1] == 0 && ptr[i] != 0) return "UTF-16LE";
1924 }
1925 switch(plang){
1926 case ESTLANGEN:
1927 if(est_enc_miss(ptr, size, "US-ASCII", "UTF-16BE") < 1) return "US-ASCII";
1928 if(est_enc_miss(ptr, size, "UTF-8", "UTF-16BE") < 1) return "UTF-8";
1929 return "ISO-8859-1";
1930 case ESTLANGJA:
1931 for(i = 0; i < size - 3; i++){
1932 if(ptr[i] == 0x1b){
1933 i++;
1934 if(ptr[i] == '(' && strchr("BJHI", ptr[i+1])) return "ISO-2022-JP";
1935 if(ptr[i] == '$' && strchr("@B(", ptr[i+1])) return "ISO-2022-JP";
1936 }
1937 }
1938 if(est_enc_miss(ptr, size, "US-ASCII", "UTF-16BE") < 1) return "US-ASCII";
1939 if(est_enc_miss(ptr, size, "UTF-8", "UTF-16BE") < 1) return "UTF-8";
1940 hypo = NULL;
1941 cr = FALSE;
1942 for(i = 0; i < size; i++){
1943 if(ptr[i] == 0xd){
1944 cr = TRUE;
1945 break;
1946 }
1947 }
1948 if(cr){
1949 if((miss = est_enc_miss(ptr, size, "Shift_JIS", "EUC-JP")) < 1) return "Shift_JIS";
1950 if(!hypo && miss / (double)size <= ESTICALLWRAT) hypo = "Shift_JIS";
1951 if((miss = est_enc_miss(ptr, size, "EUC-JP", "UTF-16BE")) < 1) return "EUC-JP";
1952 if(!hypo && miss / (double)size <= ESTICALLWRAT) hypo = "EUC-JP";
1953 } else {
1954 if((miss = est_enc_miss(ptr, size, "EUC-JP", "UTF-16BE")) < 1) return "EUC-JP";
1955 if(!hypo && miss / (double)size <= ESTICALLWRAT) hypo = "EUC-JP";
1956 if((miss = est_enc_miss(ptr, size, "Shift_JIS", "EUC-JP")) < 1) return "Shift_JIS";
1957 if(!hypo && miss / (double)size <= ESTICALLWRAT) hypo = "Shift_JIS";
1958 }
1959 if((miss = est_enc_miss(ptr, size, "UTF-8", "UTF-16BE")) < 1) return "UTF-8";
1960 if(!hypo && miss / (double)size <= ESTICALLWRAT) hypo = "UTF-8";
1961 if((miss = est_enc_miss(ptr, size, "CP932", "UTF-16BE")) < 1) return "CP932";
1962 if(!hypo && miss / (double)size <= ESTICALLWRAT) hypo = "CP932";
1963 return hypo ? hypo : "ISO-8859-1";
1964 case ESTLANGZH:
1965 if(est_enc_miss(ptr, size, "US-ASCII", "UTF-16BE") < 1) return "US-ASCII";
1966 if(est_enc_miss(ptr, size, "UTF-8", "UTF-16BE") < 1) return "UTF-8";
1967 if(est_enc_miss(ptr, size, "EUC-CN", "UTF-16BE") < 1) return "EUC-CN";
1968 if(est_enc_miss(ptr, size, "BIG5", "UTF-16BE") < 1) return "BIG5";
1969 return "ISO-8859-1";
1970 case ESTLANGKO:
1971 if(est_enc_miss(ptr, size, "US-ASCII", "UTF-16BE") < 1) return "US-ASCII";
1972 if(est_enc_miss(ptr, size, "UTF-8", "UTF-16BE") < 1) return "UTF-8";
1973 if(est_enc_miss(ptr, size, "EUC-KR", "UTF-16BE") < 1) return "EUC-KR";
1974 return "ISO-8859-1";
1975 default:
1976 break;
1977 }
1978 return "ISO-8859-1";
1979 }
1980
1981
/* Convert a UTF-8 string into UTF-16BE.
   `ptr' specifies the pointer to a region of UTF-8 text.
   `size' specifies the size of the region.
   `sp' specifies the pointer to a variable to which the size of the result is assigned.
   The return value is the pointer to a region allocated with CB_MALLOC holding the result,
   followed by one zero byte.  Sequences of four to six bytes cannot be expressed in one 16-bit
   unit and are replaced by `?'.  Conversion stops at a sequence truncated by the end of the
   region or at a byte of 0xfe or 0xff. */
char *est_uconv_in(const char *ptr, int size, int *sp){
  const unsigned char *rp, *end;
  char *rbuf, *wp;
  assert(ptr && size >= 0 && sp);
  rp = (unsigned char *)ptr;
  end = rp + size;
  /* every input byte yields at most two output bytes */
  CB_MALLOC(rbuf, size * 2 + 1);
  wp = rbuf;
  while(rp < end){
    if(*rp < 0x80){
      /* one byte: 0x00-0x7f, including DEL (0x7f) */
      *(wp++) = 0x00;
      *(wp++) = *rp;
      rp += 1;
    } else if(*rp < 0xe0){
      /* two bytes: lead bytes up to 0xdf; stray continuation bytes are decoded leniently */
      if(rp >= end - 1) break;
      *(wp++) = (rp[0] & 0x1f) >> 2;
      *(wp++) = (rp[0] << 6) | (rp[1] & 0x3f);
      rp += 2;
    } else if(*rp < 0xf0){
      /* three bytes: lead bytes 0xe0-0xef */
      if(rp >= end - 2) break;
      *(wp++) = (rp[0] << 4) | ((rp[1] & 0x3f) >> 2);
      *(wp++) = (rp[1] << 6) | (rp[2] & 0x3f);
      rp += 3;
    } else if(*rp < 0xf8){
      /* four bytes: lead bytes 0xf0-0xf7 */
      if(rp >= end - 3) break;
      *(wp++) = 0x00;
      *(wp++) = '?';
      rp += 4;
    } else if(*rp < 0xfc){
      /* five bytes: lead bytes 0xf8-0xfb */
      if(rp >= end - 4) break;
      *(wp++) = 0x00;
      *(wp++) = '?';
      rp += 5;
    } else if(*rp < 0xfe){
      /* six bytes: lead bytes 0xfc-0xfd */
      if(rp >= end - 5) break;
      *(wp++) = 0x00;
      *(wp++) = '?';
      rp += 6;
    } else {
      break;
    }
  }
  *wp = '\0';
  *sp = wp - rbuf;
  return rbuf;
}
2028
2029
/* Convert a UTF-16BE string into UTF-8.
   `ptr' specifies the pointer to a region of UTF-16BE text.
   `size' specifies the size of the region.  A trailing odd byte is ignored.
   `sp' specifies the pointer to a variable to which the size of the result is assigned.  If it
   is `NULL', it is not used.
   The return value is the pointer to a region allocated with CB_MALLOC holding the result,
   followed by one zero byte.  Each 16-bit unit is converted on its own. */
char *est_uconv_out(const char *ptr, int size, int *sp){
  const unsigned char *rp, *end;
  char *rbuf, *wp;
  int c;
  assert(ptr && size >= 0);
  if(size % 2 != 0) size--;
  rp = (unsigned char *)ptr;
  end = rp + size;
  /* two input bytes yield at most three output bytes, so twice the size is enough */
  CB_MALLOC(rbuf, size * 2 + 1);
  wp = rbuf;
  while(rp < end){
    c = rp[0] * 0x100 + rp[1];
    if(c < 0x0080){
      *(wp++) = rp[1];
    } else if(c < 0x0800){
      /* the two byte form carries eleven bits only, that is up to U+07FF */
      *(wp++) = 0xc0 | (rp[0] << 2) | ((rp[1] >> 6) & 0x03);
      *(wp++) = 0x80 | (rp[1] & 0x3f);
    } else {
      *(wp++) = 0xe0 | ((rp[0] >> 4) & 0x0f);
      *(wp++) = 0x80 | ((rp[0] & 0x0f) << 2) | ((rp[1] >> 6) & 0x03);
      *(wp++) = 0x80 | (rp[1] & 0x3f);
    }
    rp += 2;
  }
  *wp = '\0';
  if(sp) *sp = wp - rbuf;
  return rbuf;
}
2058
2059
2060 /* Compress a serial object with ZLIB. */
2061 char *est_deflate(const char *ptr, int size, int *sp){
2062 z_stream zs;
2063 char *buf;
2064 unsigned char obuf[ESTIOBUFSIZ];
2065 int rv, asiz, bsiz, osiz;
2066 assert(ptr && sp);
2067 if(size < 0) size = strlen(ptr);
2068 zs.zalloc = Z_NULL;
2069 zs.zfree = Z_NULL;
2070 zs.opaque = Z_NULL;
2071 if(deflateInit(&zs, ESTZCOMPLEVEL) != Z_OK) return NULL;
2072 asiz = ESTIOBUFSIZ;
2073 CB_MALLOC(buf, asiz);
2074 bsiz = 0;
2075 zs.next_in = (unsigned char *)ptr;
2076 zs.avail_in = size;
2077 zs.next_out = obuf;
2078 zs.avail_out = ESTIOBUFSIZ;
2079 while((rv = deflate(&zs, Z_FINISH)) == Z_OK){
2080 osiz = ESTIOBUFSIZ - zs.avail_out;
2081 if(bsiz + osiz > asiz){
2082 asiz = asiz * 2 + osiz;
2083 CB_REALLOC(buf, asiz);
2084 }
2085 memcpy(buf + bsiz, obuf, osiz);
2086 bsiz += osiz;
2087 zs.next_out = obuf;
2088 zs.avail_out = ESTIOBUFSIZ;
2089 }
2090 if(rv != Z_STREAM_END){
2091 free(buf);
2092 deflateEnd(&zs);
2093 return NULL;
2094 }
2095 osiz = ESTIOBUFSIZ - zs.avail_out;
2096 if(bsiz + osiz > asiz){
2097 asiz = asiz * 2 + osiz;
2098 CB_REALLOC(buf, asiz);
2099 }
2100 memcpy(buf + bsiz, obuf, osiz);
2101 bsiz += osiz;
2102 *sp = bsiz;
2103 deflateEnd(&zs);
2104 return buf;
2105 }
2106
2107
2108 /* Decompress a serial object compressed with ZLIB. */
2109 char *est_inflate(const char *ptr, int size, int *sp){
2110 z_stream zs;
2111 char *buf;
2112 unsigned char obuf[ESTIOBUFSIZ];
2113 int rv, asiz, bsiz, osiz;
2114 assert(ptr && size >= 0 && sp);
2115 zs.zalloc = Z_NULL;
2116 zs.zfree = Z_NULL;
2117 zs.opaque = Z_NULL;
2118 if(inflateInit(&zs) != Z_OK) return NULL;
2119 asiz = ESTIOBUFSIZ;
2120 CB_MALLOC(buf, asiz);
2121 bsiz = 0;
2122 zs.next_in = (unsigned char *)ptr;
2123 zs.avail_in = size;
2124 zs.next_out = obuf;
2125 zs.avail_out = ESTIOBUFSIZ;
2126 while((rv = inflate(&zs, Z_NO_FLUSH)) == Z_OK){
2127 osiz = ESTIOBUFSIZ - zs.avail_out;
2128 if(bsiz + osiz >= asiz){
2129 asiz = asiz * 2 + osiz;
2130 CB_REALLOC(buf, asiz);
2131 }
2132 memcpy(buf + bsiz, obuf, osiz);
2133 bsiz += osiz;
2134 zs.next_out = obuf;
2135 zs.avail_out = ESTIOBUFSIZ;
2136 }
2137 if(rv != Z_STREAM_END){
2138 free(buf);
2139 inflateEnd(&zs);
2140 return NULL;
2141 }
2142 osiz = ESTIOBUFSIZ - zs.avail_out;
2143 if(bsiz + osiz >= asiz){
2144 asiz = asiz * 2 + osiz;
2145 CB_REALLOC(buf, asiz);
2146 }
2147 memcpy(buf + bsiz, obuf, osiz);
2148 bsiz += osiz;
2149 buf[bsiz] = '\0';
2150 if(sp) *sp = bsiz;
2151 inflateEnd(&zs);
2152 return buf;
2153 }
2154
2155
2156 /* Get the border string for draft data of documents. */
2157 const char *est_border_str(void){
2158 static int first = TRUE;
2159 static char border[ESTPATHBUFSIZ];
2160 int t, p;
2161 if(first){
2162 t = (int)(time(NULL) + est_random() * INT_MAX);
2163 p = (int)(getpid() + est_random() * INT_MAX);
2164 sprintf(border, "--------[%08X%08X]--------",
2165 dpouterhash((char *)&t, sizeof(int)), dpouterhash((char *)&p, sizeof(int)));
2166 first = FALSE;
2167 }
2168 return border;
2169 }
2170
2171
2172 /* Get the real random number. */
2173 double est_random(void){
2174 static int first = TRUE;
2175 int num;
2176 if(first && !est_random_ifp){
2177 if((est_random_ifp = fopen("/dev/urandom", "rb")) != NULL){
2178 atexit(est_random_fclose);
2179 } else {
2180 srand(getpid());
2181 }
2182 first = FALSE;
2183 }
2184 if(est_random_ifp){
2185 fread(&num, sizeof(int), 1, est_random_ifp);
2186 return (num & 0x7fffffff) / (double)0x7fffffff;
2187 }
2188 return rand() / (double)RAND_MAX;
2189 }
2190
2191
/* Get the random number in normal distribution.
   The return value is a number clamped into the range from 0.0 to 1.0.  It is computed from two
   numbers of est_random() with a square root, a logarithm and a cosine, then shifted by 6.0 and
   divided by 12.0. */
double est_random_nd(void){
  double radius, angle, num;
  radius = sqrt(-2 * log(1.0 - est_random()));
  angle = 3.1415926535 * 2 * est_random();
  num = (radius * cos(angle) + 6.0) / 12.0;
  if(num > 1.0) return 1.0;
  if(num < 0.0) return 0.0;
  return num;
}
2200
2201
2202 /* Get an MD5 hash string of a key string. */
2203 char *est_make_crypt(const char *key){
2204 md5_state_t ms;
2205 char digest[32], str[64], *wp;
2206 int i;
2207 assert(key);
2208 md5_init(&ms);
2209 md5_append(&ms, (md5_byte_t *)key, strlen(key));
2210 md5_finish(&ms, (md5_byte_t *)digest);
2211 wp = str;
2212 for(i = 0; i < 16; i++){
2213 wp += sprintf(wp, "%02x", ((unsigned char *)digest)[i]);
2214 }
2215 return cbmemdup(str, -1);
2216 }
2217
2218
/* Check whether a key matches an MD5 hash string.
   `key' specifies a plain string.
   `hash' specifies a hash string to compare with, as made by est_make_crypt().
   The return value is 1 if the hash of the key is identical to `hash', else it is 0. */
int est_match_crypt(const char *key, const char *hash){
  char *made;
  int same;
  assert(key && hash);
  made = est_make_crypt(key);
  same = (strcmp(made, hash) == 0);
  free(made);
  return same;
}
2229
2230
2231 /* Get the hidden texts of a document object. */
2232 const char *est_doc_hidden_texts(ESTDOC *doc){
2233 const char *rv;
2234 assert(doc);
2235 rv = doc->attrs ? cbmapget(doc->attrs, "", 0, NULL) : NULL;
2236 return rv ? rv : "";
2237 }
2238
2239
2240 /* Get the phrase of a condition object. */
2241 const char *est_cond_phrase(ESTCOND *cond){
2242 assert(cond);
2243 return cond->phrase;
2244 }
2245
2246
2247 /* Get a list object of attribute expressions of a condition object. */
2248 const CBLIST *est_cond_attrs(ESTCOND *cond){
2249 assert(cond);
2250 return cond->attrs;
2251 }
2252
2253
2254 /* Get the order expression of a condition object. */
2255 const char *est_cond_order(ESTCOND *cond){
2256 assert(cond);
2257 return cond->order;
2258 }
2259
2260
2261 /* Get the maximum number of retrieval of a condition object. */
2262 int est_cond_max(ESTCOND *cond){
2263 assert(cond);
2264 return cond->max;
2265 }
2266
2267
2268 /* Get the options of a condition object. */
2269 int est_cond_options(ESTCOND *cond){
2270 assert(cond);
2271 return cond->opts;
2272 }
2273
2274
2275 /* Get the score of a document corresponding to a condition object. */
2276 int est_cond_score(ESTCOND *cond, int index){
2277 assert(cond);
2278 if(!cond->scores || index < 0 || index >= cond->snum) return -1;
2279 return cond->scores[index];
2280 }
2281
2282
2283 /* Set the error code of a database. */
2284 void est_db_set_ecode(ESTDB *db, int ecode){
2285 assert(db);
2286 db->ecode = ecode;
2287 }
2288
2289
2290 /* Edit attributes of a document object in a database. */
2291 int est_db_edit_doc(ESTDB *db, ESTDOC *doc){
2292 const char *uri;
2293 char *sbuf;
2294 int err, id, ssiz;
2295 assert(db && doc);
2296 if(!dpwritable(db->metadb)){
2297 db->ecode = ESTEACCES;
2298 return FALSE;
2299 }
2300 if(!doc->attrs || !(uri = cbmapget(doc->attrs, ESTDATTRURI, -1, NULL)) || doc->id < 1){
2301 db->ecode = ESTEINVAL;
2302 return FALSE;
2303 }
2304 if((id = est_db_uri_to_id(db, uri)) > 0 && id != doc->id){
2305 db->ecode = ESTEINVAL;
2306 return FALSE;
2307 }
2308 err = FALSE;
2309 sbuf = cbmapdump(doc->attrs, &ssiz);
2310 if(!crput(db->attrdb, (char *)&(doc->id), sizeof(int), sbuf, ssiz, CR_DOVER)){
2311 db->ecode = ESTEDB;
2312 db->fatal = TRUE;
2313 err = TRUE;
2314 }
2315 free(sbuf);
2316 if(db->spacc) cbmapout(db->spacc, (char *)&(doc->id), sizeof(int));
2317 return err ? FALSE : TRUE;
2318 }
2319
2320
2321 /* Add a piece of meta data to a database. */
2322 void est_db_add_meta(ESTDB *db, const char *name, const char *value){
2323 assert(db && name);
2324 if(!dpwritable(db->metadb)){
2325 db->ecode = ESTEACCES;
2326 return;
2327 }
2328 if(!db->metacc) est_db_prepare_meta(db);
2329 if(value){
2330 cbmapput(db->metacc, name, -1, value, -1, TRUE);
2331 } else {
2332 cbmapout(db->metacc, name, -1);
2333 }
2334 }
2335
2336
2337 /* Get a list of names of meta data of a database. */
2338 CBLIST *est_db_meta_names(ESTDB *db){
2339 assert(db);
2340 if(!db->metacc) est_db_prepare_meta(db);
2341 return cbmapkeys(db->metacc);
2342 }
2343
2344
2345 /* Get the value of a piece of meta data of a database. */
2346 char *est_db_meta(ESTDB *db, const char *name){
2347 const char *vbuf;
2348 int vsiz;
2349 assert(db && name);
2350 if(!db->metacc) est_db_prepare_meta(db);
2351 if(!(vbuf = cbmapget(db->metacc, name, -1, &vsiz))) return NULL;
2352 return cbmemdup(vbuf, vsiz);
2353 }
2354
2355
2356 /* Get the number of records in the cache memory of a database. */
2357 int est_db_cache_num(ESTDB *db){
2358 assert(db);
2359 return cbmaprnum(db->idxcc);
2360 }
2361
2362
2363 /* Set the callback function for database events. */
2364 void est_db_set_informer(ESTDB *db, void (*func)(const char *)){
2365 assert(db && func);
2366 db->cbinfo = func;
2367 est_db_inform(db, "status");
2368 }
2369
2370
2371 /* Set the callback function to create a vector of keywords of a document. */
2372 void est_db_set_vectorizer(ESTDB *db, CBMAP *(*func)(void *, int, void *), void *data){
2373 assert(db && func);
2374 db->cbvec = func;
2375 db->vecdata = data;
2376 }
2377
2378
2379 /* Fill the cache for keys for TF-IDF. */
2380 void est_db_fill_key_cache(ESTDB *db){
2381 char *kbuf, *msg;
2382 int i, ksiz, vsiz;
2383 assert(db);
2384 vlcurfirst(db->fwmdb);
2385 for(i = 0; (kbuf = vlcurkey(db->fwmdb, &ksiz)) != NULL; i++){
2386 vsiz = est_idx_vsiz(db->idxdb, kbuf, ksiz);
2387 cbmapput(db->keycc, kbuf, ksiz, (char *)&vsiz, sizeof(int), TRUE);
2388 free(kbuf);
2389 vlcurnext(db->fwmdb);
2390 if(i % ESTCCCBFREQ == 0){
2391 msg = cbsprintf("filling the key cache for TF-IDF (%d)", i + 1);
2392 est_db_inform(db, msg);
2393 free(msg);
2394 }
2395 }
2396 db->kcmnum = -1;
2397 }
2398
2399
2400 /* Make a directory. */
2401 int est_mkdir(const char *path){
2402 #if defined(_SYS_MSVC_) || defined(_SYS_MINGW_)
2403 return mkdir(path) == 0 ? TRUE : FALSE;
2404 #else
2405 assert(path);
2406 return mkdir(path, ESTDIRMODE) == 0 ? TRUE : FALSE;
2407 #endif
2408 }
2409
2410
2411 /* Remove a directory and its contents recursively. */
2412 int est_rmdir_rec(const char *path){
2413 CBLIST *files;
2414 const char *file;
2415 char pbuf[ESTPATHBUFSIZ];
2416 int i;
2417 assert(path);
2418 if((files = cbdirlist(path)) != NULL){
2419 for(i = 0; i < cblistnum(files); i++){
2420 file = cblistval(files, i, NULL);
2421 if(!strcmp(file, ESTCDIRSTR) || !strcmp(file, ESTPDIRSTR)) continue;
2422 sprintf(pbuf, "%s%c%s", path, ESTPATHCHR, file);
2423 if(unlink(pbuf) == -1) est_rmdir_rec(pbuf);
2424 }
2425 cblistclose(files);
2426 }
2427 return rmdir(path) == 0;
2428 }
2429
2430
2431 /* Get the canonicalized absolute pathname of a file. */
2432 char *est_realpath(const char *path){
2433 #if defined(_SYS_MSVC_) || defined(_SYS_MINGW_)
2434 char pbuf[ESTPATHBUFSIZ], *p;
2435 if(GetFullPathName(path, ESTPATHBUFSIZ, pbuf, &p) == 0) sprintf(pbuf, "%s", path);
2436 return cbmemdup(pbuf, -1);
2437 #else
2438 char pbuf[ESTPATHBUFSIZ*2];
2439 assert(path);
2440 if(!realpath(path, pbuf)) sprintf(pbuf, "%s", path);
2441 return cbmemdup(pbuf, -1);
2442 #endif
2443 }
2444
2445
2446 /* Get the time of day in milliseconds. */
2447 double est_gettimeofday(void){
2448 #if defined(_SYS_MSVC_) || defined(_SYS_MINGW_)
2449 SYSTEMTIME st;
2450 struct tm ts;
2451 GetLocalTime(&st);
2452 memset(&ts, 0, sizeof(struct tm));
2453 ts.tm_year = st.wYear - 1900;
2454 ts.tm_mon = st.wMonth - 1;
2455 ts.tm_mday = st.wDay;
2456 ts.tm_hour = st.wHour;
2457 ts.tm_min = st.wMinute;
2458 ts.tm_sec = st.wSecond;
2459 return (double)mktime(&ts) * 1000 + (double)st.wMilliseconds;
2460 #else
2461 struct timeval tv;
2462 struct timezone tz;
2463 if(gettimeofday(&tv, &tz) == -1) return 0.0;
2464 return (double)tv.tv_sec * 1000 + (double)tv.tv_usec / 1000;
2465 #endif
2466 }
2467
2468
/* Suspend execution for microsecond intervals.
   `usec' specifies the interval in microseconds.  In the branch which calls Sleep(), the
   interval is divided by 1000 first. */
void est_usleep(unsigned long usec){
#if defined(_SYS_MSVC_) || defined(_SYS_MINGW_)
  unsigned long msec;
  msec = usec / 1000;
  Sleep(msec);
#else
  usleep(usec);
#endif
}
2477
2478
/* Send a signal to a process.
   `pid' specifies the ID of a process.
   `sig' specifies the number of a signal.
   The return value is 1 if kill() succeeds, else it is 0.  In the MSVC and MinGW branch
   nothing is sent and false is returned. */
int est_kill(int pid, int sig){
#if defined(_SYS_MSVC_) || defined(_SYS_MINGW_)
  return FALSE;
#else
  if(kill(pid, sig) != 0) return 0;
  return 1;
#endif
}
2487
2488
/* Get the media type of an extension.
   `ext' specifies the extension of a file path, with the leading period.
   The return value is the media type of the first entry whose extension matches without regard
   to case, or "application/octet-stream" if there is none.  As ".xhtml" and ".xht" are listed
   twice, their earlier "text/html" entries are the ones in effect. */
const char *est_ext_type(const char *ext){
  /* pairs of an extension and its media type, closed by `NULL' */
  static const char *list[] = {
    ".txt", "text/plain", ".txt.en", "text/plain",
    ".txt.ja", "text/plain", ".asc", "text/plain",
    ".in", "text/plain", ".c", "text/plain",
    ".h", "text/plain", ".cc", "text/plain",
    ".java", "text/plain", ".sh", "text/plain",
    ".pl", "text/plain", ".py", "text/plain",
    ".rb", "text/plain", ".idl", "text/plain",
    ".csv", "text/plain", ".log", "text/plain",
    ".conf", "text/plain", ".rc", "text/plain",
    ".ini", "text/plain", ".html", "text/html",
    ".htm", "text/html", ".xhtml", "text/html",
    ".xht", "text/html", ".css", "text/css",
    ".js", "text/javascript", ".tsv", "text/tab-separated-values",
    ".eml", "message/rfc822", ".mime", "message/rfc822",
    ".mht", "message/rfc822", ".mhtml", "message/rfc822",
    ".sgml", "application/sgml", ".sgm", "application/sgml",
    ".xml", "application/xml", ".xsl", "application/xml",
    ".xslt", "application/xslt+xml", ".xhtml", "application/xhtml+xml",
    ".xht", "application/xhtml+xml", ".rdf", "application/rdf+xml",
    ".rss", "application/rss+xml", ".dtd", "application/xml-dtd",
    ".rtf", "application/rtf", ".pdf", "application/pdf",
    ".ps", "application/postscript", ".eps", "application/postscript",
    ".doc", "application/msword", ".xls", "application/vnd.ms-excel",
    ".ppt", "application/vnd.ms-powerpoint", ".xdw", "application/vnd.fujixerox.docuworks",
    ".swf", "application/x-shockwave-flash", ".zip", "application/zip",
    ".tar", "application/x-tar", ".gz", "application/x-gzip",
    ".bz2", "application/octet-stream", ".z", "application/octet-stream",
    ".lha", "application/octet-stream", ".lzh", "application/octet-stream",
    ".cab", "application/octet-stream", ".rar", "application/octet-stream",
    ".sit", "application/octet-stream", ".bin", "application/octet-stream",
    ".o", "application/octet-stream", ".so", "application/octet-stream",
    ".exe", "application/octet-stream", ".dll", "application/octet-stream",
    ".class", "application/octet-stream", ".png", "image/png",
    ".gif", "image/gif", ".jpg", "image/jpeg",
    ".jpeg", "image/jpeg", ".tif", "image/tiff",
    ".tiff", "image/tiff", ".bmp", "image/bmp",
    ".au", "audio/basic", ".snd", "audio/basic",
    ".mid", "audio/midi", ".midi", "audio/midi",
    ".mp2", "audio/mpeg", ".mp3", "audio/mpeg",
    ".wav", "audio/x-wav", ".mpg", "video/mpeg",
    ".mpeg", "video/mpeg", ".qt", "video/quicktime",
    ".mov", "video/quicktime", ".avi", "video/x-msvideo",
    NULL
  };
  int i;
  assert(ext);
  /* Step over whole pairs so that only extensions are compared.  Stepping by one would also
     test the media types as keys and could return the `NULL' which closes the list. */
  for(i = 0; list[i]; i += 2){
    if(!cbstricmp(ext, list[i])) return list[i+1];
  }
  return "application/octet-stream";
}
2543
2544
2545
2546 /*************************************************************************************************
2547 * private objects
2548 *************************************************************************************************/
2549
2550
2551 /* Count the number of missing characters when converting.
2552 `ptr' specifies the pointer to a region.
2553 `size' specifies the size of the region.
2554 `icode' specifies the name of encoding of the input string.
2555 `ocode' specifies the name of encoding of the output string.
2556 The return value is the number of missing characters. */
2557 static int est_enc_miss(const char *ptr, int size, const char *icode, const char *ocode){
2558 iconv_t ic;
2559 char obuf[ESTICCHECKSIZ], *wp, *rp;
2560 size_t isiz, osiz;
2561 int miss;
2562 assert(ptr && size >= 0 && icode && ocode);
2563 isiz = size;
2564 if((ic = iconv_open(ocode, icode)) == (iconv_t)-1) return ESTICMISSMAX;
2565 miss = 0;
2566 rp = (char *)ptr;
2567 while(isiz > 0){
2568 osiz = ESTICCHECKSIZ;
2569 wp = obuf;
2570 if(iconv(ic, (void *)&rp, &isiz, &wp, &osiz) == -1){
2571 if(errno == EILSEQ || errno == EINVAL){
2572 rp++;
2573 isiz--;
2574 miss++;
2575 if(miss >= ESTICMISSMAX) break;
2576 } else {
2577 break;
2578 }
2579 }
2580 }
2581 if(iconv_close(ic) == -1) return ESTICMISSMAX;
2582 return miss;
2583 }
2584
2585
/* Normalize a text.
   `utext' specifies a text whose encoding is UTF-16BE.  It is rewritten in place.
   `size' specifies the size of the text.  A trailing odd byte is ignored.
   `sp' specifies the pointer to a variable to which the size of the result is assigned.
   Control characters and several kinds of spaces become the ASCII space, fullwidth alphabets
   and numbers become ASCII ones, and halfwidth kana become fullwidth katakana, where a kana
   followed by a halfwidth voiced or semi-voiced mark is merged into one character. */
static void est_normalize_text(unsigned char *utext, int size, int *sp){
  int i, wi, hi, lo, ohi, olo, mark;
  assert(utext && size >= 0 && sp);
  wi = 0;
  for(i = 0; i < size - 1; i += 2){
    /* Read the whole unit before anything is stored.  Until the first merge `wi' equals `i',
       so the output slot is the very unit being examined. */
    hi = utext[i];
    lo = utext[i+1];
    ohi = hi;
    olo = lo;
    if(hi == 0x00){
      if(lo <= 0x08 || (lo >= 0x0e && lo <= 0x1f) || lo == 0xa0){
        /* control characters and no-break space */
        olo = 0x20;
      }
    } else if(hi == 0x20){
      if(lo == 0x02 || lo == 0x03 || lo == 0x09){
        /* en space, em space and thin space */
        ohi = 0x00;
        olo = 0x20;
      }
    } else if(hi == 0x30){
      if(lo == 0x00){
        /* fullwidth space */
        ohi = 0x00;
        olo = 0x20;
      }
    } else if(hi == 0xff){
      /* look ahead for a halfwidth voiced (1) or semi-voiced (2) mark in the next unit */
      mark = 0;
      if(i + 2 < size - 1 && utext[i+2] == 0xff){
        if(utext[i+3] == 0x9e){
          mark = 1;
        } else if(utext[i+3] == 0x9f){
          mark = 2;
        }
      }
      if(lo >= 0x21 && lo <= 0x3a){
        /* fullwidth alphabets */
        ohi = 0x00;
        olo = lo - 0x21 + 0x41;
      } else if(lo >= 0x41 && lo <= 0x5a){
        /* fullwidth small alphabets */
        ohi = 0x00;
        olo = lo - 0x41 + 0x61;
      } else if(lo >= 0x10 && lo <= 0x19){
        /* fullwidth numbers */
        ohi = 0x00;
        olo = lo - 0x10 + 0x30;
      } else if(lo >= 0x61 && lo <= 0x9d){
        /* halfwidth forms: all of them go to the row 0x30 */
        ohi = 0x30;
        if(lo == 0x61){
          /* halfwidth full stop */
          olo = 0x02;
        } else if(lo == 0x62){
          /* halfwidth left corner */
          olo = 0x0c;
        } else if(lo == 0x63){
          /* halfwidth right corner */
          olo = 0x0d;
        } else if(lo == 0x64){
          /* halfwidth comma */
          olo = 0x01;
        } else if(lo == 0x65){
          /* halfwidth middle dot */
          olo = 0xfb;
        } else if(lo == 0x66){
          /* halfwidth wo */
          olo = 0xf2;
        } else if(lo <= 0x6b){
          /* halfwidth small a-o */
          olo = (lo - 0x67) * 2 + 0xa1;
        } else if(lo <= 0x6e){
          /* halfwidth small ya-yo */
          olo = (lo - 0x6c) * 2 + 0xe3;
        } else if(lo == 0x6f){
          /* halfwidth small tu */
          olo = 0xc3;
        } else if(lo == 0x70){
          /* halfwidth prolonged mark */
          olo = 0xfc;
        } else if(lo <= 0x75){
          /* halfwidth a-o; only u takes the voiced mark and becomes vu */
          olo = (lo - 0x71) * 2 + 0xa2;
          if(lo == 0x73 && mark == 1){
            olo = 0xf4;
            i += 2;
          }
        } else if(lo <= 0x7a){
          /* halfwidth ka-ko */
          olo = (lo - 0x76) * 2 + 0xab;
          if(mark == 1){
            olo += 1;
            i += 2;
          }
        } else if(lo <= 0x7f){
          /* halfwidth sa-so */
          olo = (lo - 0x7b) * 2 + 0xb5;
          if(mark == 1){
            olo += 1;
            i += 2;
          }
        } else if(lo <= 0x84){
          /* halfwidth ta-to; the small tu sits before tu in the fullwidth row */
          olo = (lo - 0x80) * 2 + 0xbf + (lo >= 0x82 ? 1 : 0);
          if(mark == 1){
            olo += 1;
            i += 2;
          }
        } else if(lo <= 0x89){
          /* halfwidth na-no */
          olo = lo - 0x85 + 0xca;
        } else if(lo <= 0x8e){
          /* halfwidth ha-ho; the voiced form is next and the semi-voiced form follows it */
          olo = (lo - 0x8a) * 3 + 0xcf;
          if(mark != 0){
            olo += mark;
            i += 2;
          }
        } else if(lo <= 0x93){
          /* halfwidth ma-mo */
          olo = lo - 0x8f + 0xde;
        } else if(lo <= 0x96){
          /* halfwidth ya-yo */
          olo = (lo - 0x94) * 2 + 0xe4;
        } else if(lo <= 0x9b){
          /* halfwidth ra-ro */
          olo = lo - 0x97 + 0xe9;
        } else if(lo == 0x9c){
          /* halfwidth wa */
          olo = 0xef;
        } else {
          /* halfwidth n */
          olo = 0xf3;
        }
      }
    }
    utext[wi] = ohi;
    utext[wi+1] = olo;
    wi += 2;
  }
  *sp = wi;
}
2753
2754
/* Canonicalize a text for search keys.
   `utext' specifies a text whose encoding is UTF-16BE.  It is rewritten in place.
   `size' specifies the size of the text.  A trailing odd byte is left untouched.
   `funcspc' specifies whether to allow functional space characters.  If it is false,
   characters below the ASCII space become the ASCII space.
   Capital letters of ASCII, Latin-1 supplement, Latin extended-A, Greek and Cyrillic are
   converted into small ones, and the characters from U+FFF0 to U+FFFF become the ASCII space. */
static void est_canonicalize_text(unsigned char *utext, int size, int funcspc){
  int i, hi, lo;
  /* stop before a lone trailing byte so that `utext[i+1]' is always inside the region */
  for(i = 0; i < size - 1; i += 2){
    hi = utext[i];
    lo = utext[i+1];
    if(hi == 0x00){
      if(lo >= 'A' && lo <= 'Z'){
        /* ascii */
        utext[i+1] = lo + ('a' - 'A');
      } else if((lo >= 0xc0 && lo <= 0xd6) || (lo >= 0xd8 && lo <= 0xde)){
        /* latin-1 supplement */
        utext[i+1] = lo + 0x20;
      } else if(!funcspc && lo < ' '){
        /* functional spaces */
        utext[i+1] = ' ';
      }
    } else if(hi == 0x01){
      if((lo <= 0x36 && lo % 2 == 0) ||
         (lo >= 0x39 && lo <= 0x47 && lo % 2 == 1) ||
         (lo >= 0x4a && lo <= 0x76 && lo % 2 == 0) ||
         (lo >= 0x79 && lo <= 0x7d && lo % 2 == 1)){
        /* latin extended-a: the small letter follows its capital */
        utext[i+1] = lo + 0x01;
      } else if(lo == 0x78){
        /* y with umlaut: its small form is in the latin-1 supplement */
        utext[i] = 0x00;
        utext[i+1] = 0xff;
      }
    } else if(hi == 0x03){
      if(lo >= 0x91 && lo <= 0xa9){
        /* greek */
        utext[i+1] = lo + 0x20;
      }
    } else if(hi == 0x04){
      if(lo >= 0x10 && lo <= 0x2f){
        /* cyrillic */
        utext[i+1] = lo + 0x20;
      } else if(lo <= 0x0f){
        /* cyrillic with mark */
        utext[i+1] = lo + 0x50;
      }
    } else if(hi == 0xff){
      /* Test the low byte.  Testing the high byte, which is 0xff here, would blank the whole
         row including the fullwidth and halfwidth forms. */
      if(lo >= 0xf0){
        /* specials */
        utext[i] = 0x00;
        utext[i+1] = ' ';
      }
    }
  }
}
2808
2809
2810 /* Categorize a character.
2811 `c' specifies the UCS number of a character.
2812 The return value is the category of the character. */
2813 static int est_char_category(int c){
2814 /* ascii space */
2815 if(c <= 0x0020) return ESTSPACECHR;
2816 /* ascii alnum */
2817 if((c >= 0x0030 && c <= 0x0039) || (c >= 0x0041 && c <= 0x005a) ||
2818 (c >= 0x0061 && c <= 0x007a)) return ESTWESTALPH;
2819 /* latin */
2820 if((c >= 0x00c0 && c <= 0x00ff && c != 0x00d7 && c != 0x00f7) || (c >= 0x0100 && c <= 0x017f))
2821 return ESTWESTALPH;
2822 /* arabic and syrian */
2823 if(c >= 0x0600 && c <= 0x08ff) return ESTEASTALPH;
2824 /* south and south east asia */
2825 if((c >= 0x0900 && c <= 0x109f) || (c >= 0x1700 && c <= 0x1cff)) return ESTEASTALPH;
2826 /* cjk */
2827 if((c >= 0x1100 && c <= 0x11ff) || (c >= 0x2e80 && c <= 0xd7af) ||
2828 (c >= 0xf900 && c <= 0xfaff) || (c >= 0xff00 && c <= 0xffef)) return ESTEASTALPH;
2829 /* asian presentation forms */
2830 if((c >= 0xfb50 && c <= 0xfdff) || (c >= 0xfe30 && c <= 0xfe4f) ||
2831 (c >= 0xfe70 && c <= 0xfeff)) return ESTEASTALPH;
2832 /* others */
2833 return ESTDELIMCHR;
2834 }
2835
2836
2837 /* Categorize a character for perfect N-gram analyzer.
2838 `c' specifies the UCS number of a character.
2839 The return value is the category of the character. */
2840 static int est_char_category_perfng(int c){
2841 if(c <= 0x0020) return ESTSPACECHR;
2842 return ESTEASTALPH;
2843 }
2844
2845
2846 /* Convert a simplified phrase into complete form.
2847 `sphrase' specifies a simplified phrase.
2848 The return value is the complete form of the phrase. */
2849 static char *est_phrase_from_thumb(const char *sphrase){
2850 CBDATUM *datum;
2851 const char *oper, *rp;
2852 unsigned char *utext;
2853 char *rtext;
2854 int size, quote;
2855 assert(sphrase);
2856 datum = cbdatumopen("", 0);
2857 utext = (unsigned char *)est_uconv_in(sphrase, strlen(sphrase), &size);
2858 est_normalize_text(utext, size, &size);
2859 est_canonicalize_text(utext, size, FALSE);
2860 rtext = est_uconv_out((char *)utext, size, NULL);
2861 cbstrsqzspc(rtext);
2862 quote = FALSE;
2863 oper = NULL;
2864 for(rp = rtext; *rp != '\0'; rp++){
2865 if(*rp == '"'){
2866 if(oper){
2867 cbdatumcat(datum, oper, -1);
2868 oper = NULL;
2869 }
2870 quote = !quote;
2871 continue;
2872 }
2873 if(quote){
2874 cbdatumcat(datum, rp, 1);
2875 continue;
2876 }
2877 switch(*rp){
2878 case ' ':
2879 if(!oper) oper = " AND ";
2880 break;
2881 case '&':
2882 oper = " AND ";
2883 break;
2884 case '|':
2885 oper = " OR ";
2886 break;
2887 case '!':
2888 oper = " ANDNOT ";
2889 break;
2890 default:
2891 if(oper){
2892 cbdatumcat(datum, oper, -1);
2893 oper = NULL;
2894 }
2895 cbdatumcat(datum, rp, 1);
2896 }
2897 }
2898 free(rtext);
2899 free(utext);
2900 return cbdatumtomalloc(datum, NULL);
2901 }
2902
2903
2904 /* Add a string to a snippet.
2905 `rtext' specifies a raw text.
2906 `ctext' specifies a canonicalized text.
2907 `size' specifies the size of the raw text and the canonicalized text.
2908 `awsiz' specifies the size of allowance for matching words.
2909 `res' specifies a datum object for the result.
2910 `rwords' specifies a list object of raw words. */
2911 static void est_snippet_add_text(const unsigned char *rtext, const unsigned char *ctext,
2912 int size, int awsiz, CBDATUM *res, const CBLIST *rwords){
2913 const unsigned char *rword;
2914 char *orig;
2915 int i, j, bi, rwsiz, step, osiz;
2916 bi = 0;
2917 for(i = 0; i < size; i += 2){
2918 for(j = 0; j < CB_LISTNUM(rwords); j++){
2919 rword = (unsigned char *)CB_LISTVAL2(rwords, j, &rwsiz);
2920 if((step = est_str_fwmatch_wide(ctext + i, size + awsiz - i, rword, rwsiz)) > 0){
2921 if(i - bi > 0){
2922 orig = est_uconv_out((char *)rtext + bi, i - bi, &osiz);
2923 cbdatumcat(res, orig, osiz);
2924 cbdatumcat(res, "\n", 1);
2925 free(orig);
2926 }
2927 orig = est_uconv_out((char *)rtext + i, step, &osiz);
2928 cbdatumcat(res, orig, osiz);
2929 free(orig);
2930 cbdatumcat(res, "\t", 1);
2931 orig = est_uconv_out((char *)rword, rwsiz, &osiz);
2932 cbdatumcat(res, orig, osiz);
2933 free(orig);
2934 cbdatumcat(res, "\n", 1);
2935 bi = i + step;
2936 i = bi - 2;
2937 break;
2938 }
2939 }
2940 }
2941 if(i - bi > 0){
2942 orig = est_uconv_out((char *)rtext + bi, i - bi, &osiz);
2943 cbdatumcat(res, orig, osiz);
2944 cbdatumcat(res, "\n", 1);
2945 free(orig);
2946 }
2947 }
2948
2949
/* Check whether a string begins with a key.
   `str' specifies a target string whose encoding is UTF-16BE.
   `size' specifies the size of the target string.
   `key' specifies a key string whose encoding is UTF-16BE.
   `ksiz' specifies the size of the key string.
   The return value is the size in bytes of the corresponding part of the target string, or 0
   if the target string does not begin with the key.  Characters not above the ASCII space are
   skipped in both strings, except that a target string beginning with one never matches.  A
   trailing odd byte of either string is ignored. */
static int est_str_fwmatch_wide(const unsigned char *str, int size,
                                const unsigned char *key, int ksiz){
  int si, ki;
  assert(str && size >= 0 && key && ksiz >= 0);
  if(size < 2 || ksiz < 2 || (str[0] == 0x0 && str[1] <= 0x20)) return 0;
  si = 0;
  ki = 0;
  /* both bounds require a whole two byte unit, so an odd size cannot cause a read beyond the
     end of a region */
  while(ki + 1 < ksiz){
    if(si + 1 >= size) return 0;
    if(str[si] == 0x0 && str[si+1] <= 0x20){
      si += 2;
      continue;
    }
    if(key[ki] == 0x0 && key[ki+1] <= 0x20){
      ki += 2;
      continue;
    }
    if(str[si] != key[ki] || str[si+1] != key[ki+1]) return 0;
    si += 2;
    ki += 2;
  }
  return si;
}
2981
2982
2983 /* Open the inverted index.
2984 `name' specifies the name of a directory.
2985 `omode' specifies an open mode of Villa.
2986 `dnum' specifies the number of database files.
2987 The return value is a database object of the database. */
2988 static ESTIDX *est_idx_open(const char *name, int omode, int dnum){
2989 ESTIDX *idx;
2990 CBLIST *files;
2991 char path[ESTPATHBUFSIZ];
2992 int i;
2993 assert(name && dnum > 0);
2994 if(dnum > ESTIDXDMAX) dnum = ESTIDXDMAX;
2995 CB_MALLOC(idx, sizeof(ESTIDX));
2996 if((omode & VL_OCREAT) && !est_mkdir(name) && errno != EEXIST) return NULL;
2997 if((omode & VL_OTRUNC) && (files = cbdirlist(name)) != NULL){
2998 for(i = 0; i < CB_LISTNUM(files); i++){
2999 sprintf(path, "%s%c%s", name, ESTPATHCHR, CB_LISTVAL(files, i, NULL));
3000 unlink(path);
3001 }
3002 cblistclose(files);
3003 }
3004 for(i = 0; i < dnum; i++){
3005 sprintf(path, "%s%c%04d", name, ESTPATHCHR, i + 1);
3006 if(!(idx->dbs[i] = vlopen(path, omode, VL_CMPLEX))){
3007 while(--i >= 0){
3008 vlclose(idx->dbs[i]);
3009 }
3010 return NULL;
3011 }
3012 }
3013 idx->name = cbmemdup(name, -1);
3014 idx->omode = omode;
3015 idx->dnum = dnum;
3016 idx->cdb = idx->dbs[dnum-1];
3017 return idx;
3018 }
3019
3020
3021 /* Close the inverted index.
3022 `idx' specifies an object of the inverted index.
3023 The return value is true if success, else it is false. */
3024 static int est_idx_close(ESTIDX *idx){
3025 int i, err;
3026 assert(idx);
3027 err = FALSE;
3028 for(i = 0; i < idx->dnum; i++){
3029 if(!vlclose(idx->dbs[i])) err = TRUE;
3030 }
3031 free(idx->name);
3032 free(idx);
3033 return err ? FALSE : TRUE;
3034 }
3035
3036
3037 /* Set the tuning parameters of the inverted index.
3038 `idx' specifies an object of the inverted index.
3039 Other parameters are same with `vlsettuning' of Villa. */
3040 static void est_idx_set_tuning(ESTIDX *idx, int lrecmax, int nidxmax, int lcnum, int ncnum){
3041 int i;
3042 assert(idx);
3043 for(i = 0; i < idx->dnum; i++){
3044 vlsettuning(idx->dbs[i], lrecmax, nidxmax, lcnum, ncnum);
3045 }
3046 }
3047
3048
3049 /* Increment the inverted index.
3050 `idx' specifies an object of the inverted index. */
3051 static void est_idx_increment(ESTIDX *idx){
3052 char path[ESTPATHBUFSIZ];
3053 if(idx->dnum >= ESTIDXDMAX){
3054 est_idx_set_current(idx);
3055 return;
3056 }
3057 sprintf(path, "%s%c%04d", idx->name, ESTPATHCHR, idx->dnum + 1);
3058 if((idx->dbs[idx->dnum] = vlopen(path, idx->omode | VL_OCREAT | VL_OTRUNC, VL_CMPLEX)) != NULL){
3059 idx->cdb = idx->dbs[idx->dnum];
3060 idx->dnum++;
3061 }
3062 }
3063
3064
3065 /* Add a record to the inverted index.
3066 `idx' specifies an object of the inverted index.
3067 `word' specifies a word.
3068 `vbuf' specifies the pointer to the value of a record.
3069 `vsiz' specifies the size of the value.
3070 The return value is true if success, else it is false. */
3071 static int est_idx_add(ESTIDX *idx, const char *word, int wsiz, const char *vbuf, int vsiz){
3072 assert(idx && word && wsiz >= 0 && vbuf && vsiz >= 0);
3073 return vlput(idx->cdb, word, wsiz, vbuf, vsiz, VL_DDUP);
3074 }
3075
3076
3077 /* Remove a record from the inverted index.
3078 `idx' specifies an object of the inverted index.
3079 `word' specifies a word.
3080 `wsiz' specifies the size of the word.
3081 The return value is true if success, else it is false. Even if no item correspongs, it is
3082 success. */
3083 static int est_idx_out(ESTIDX *idx, const char *word, int wsiz){
3084 int i, err;
3085 assert(idx && word && wsiz >= 0);
3086 err = FALSE;
3087 for(i = 0; i < idx->dnum; i++){
3088 if(!vloutlist(idx->dbs[i], word, wsiz) && dpecode != DP_ENOITEM) err = TRUE;
3089 }
3090 return err ? FALSE : TRUE;
3091 }
3092
3093
3094 /* Get a record from the inverted index.
3095 `idx' specifies an object of the inverted index.
3096 `word' specifies a word.
3097 `wsiz' specifies the size of the word.
3098 `sp' specifies the pointer to a variable to which the size of the region of the return value
3099 is assigned.
3100 The return value is the pointer to the region of the value of the corresponding record.
3101 if no item correspongs, empty region is returned. */
3102 static char *est_idx_get(ESTIDX *idx, const char *word, int wsiz, int *sp){
3103 CBDATUM *datum;
3104 char *vbuf;
3105 int i, vsiz;
3106 assert(idx && word && wsiz >= 0 && sp);
3107 datum = cbdatumopen("", 0);
3108 for(i = 0; i < idx->dnum; i++){
3109 if(!(vbuf = vlgetcat(idx->dbs[i], word, wsiz, &vsiz))) continue;
3110 cbdatumcat(datum, vbuf, vsiz);
3111 free(vbuf);
3112 }
3113 return cbdatumtomalloc(datum, sp);
3114 }
3115
3116
3117 /* Get the size of the value of a record in the inverted index.
3118 `idx' specifies an object of the inverted index.
3119 `word' specifies a word.
3120 `wsiz' specifies the size of the word.
3121 The return value is the size of the value of the corresponding record.
3122 if no item correspongs, 0 is returned. */
3123 static int est_idx_vsiz(ESTIDX *idx, const char *word, int wsiz){
3124 char *vbuf;
3125 int i, sum, vsiz;
3126 assert(idx && word && wsiz >= 0);
3127 sum = 0;
3128 for(i = 0; i < idx->dnum; i++){
3129 if(!(vbuf = vlgetcat(idx->dbs[i], word, wsiz, &vsiz))) continue;
3130 sum += vsiz;
3131 free(vbuf);
3132 }
3133 return sum;
3134 }
3135
3136
3137 /* Get the number of division of the inverted index.
3138 `idx' specifies an object of the inverted index.
3139 The return value is the number of division of the inverted index. */
3140 static int est_idx_num(ESTIDX *idx){
3141 assert(idx);
3142 return idx->dnum;
3143 }
3144
3145
3146 /* Get the size of the inverted index.
3147 `idx' specifies an object of the inverted index.
3148 The return value is the size of the inverted index. */
3149 static int est_idx_size(ESTIDX *idx){
3150 int i, size;
3151 assert(idx);
3152 size = 0;
3153 for(i = 0; i < idx->dnum; i++){
3154 size += vlfsiz(idx->dbs[i]);
3155 }
3156 return size;
3157 }
3158
3159
3160 /* Syncronize the inverted index.
3161 `idx' specifies an object of the inverted index.
3162 The return value is the size of the inverted index. */
3163 static int est_idx_sync(ESTIDX *idx){
3164 int i;
3165 assert(idx);
3166 for(i = 0; i < idx->dnum; i++){
3167 if(!vlsync(idx->dbs[i])) return FALSE;
3168 }
3169 return TRUE;
3170 }
3171
3172
3173 /* Optimize the inverted index.
3174 `idx' specifies an object of the inverted index.
3175 The return value is the size of the inverted index. */
3176 static int est_idx_optimize(ESTIDX *idx){
3177 int i;
3178 assert(idx);
3179 for(i = 0; i < idx->dnum; i++){
3180 if(!vloptimize(idx->dbs[i])) return FALSE;
3181 }
3182 return TRUE;
3183 }
3184
3185
3186 /* Set the current database to the smallest one in the inverted index.
3187 `idx' specifies an object of the inverted index. */
3188 static void est_idx_set_current(ESTIDX *idx){
3189 int i, size, min;
3190 assert(idx);
3191 min = vlfsiz(idx->cdb);
3192 for(i = 0; i < idx->dnum; i++){
3193 if((size = vlfsiz(idx->dbs[i])) < min){
3194 idx->cdb = idx->dbs[i];
3195 min = size;
3196 }
3197 }
3198 }
3199
3200
3201 /* Write meta data to the database.
3202 `db' specifies a database object.
3203 The return value is true if success, else it is false. */
3204 static int est_db_write_meta(ESTDB *db){
3205 char vbuf[ESTNUMBUFSIZ], *sbuf;
3206 int err, ssiz;
3207 assert(db);
3208 err = FALSE;
3209 sprintf(vbuf, "%d", est_idx_num(db->idxdb));
3210 if(!dpput(db->metadb, ESTKEYIDXNUM, -1, vbuf, -1, DP_DOVER)) err = TRUE;
3211 sprintf(vbuf, "%d", db->dseq);
3212 if(!dpput(db->metadb, ESTKEYDSEQ, -1, vbuf, -1, DP_DOVER)) err = TRUE;
3213 sprintf(vbuf, "%d", db->dnum);
3214 if(!dpput(db->metadb, ESTKEYDNUM, -1, vbuf, -1, DP_DOVER)) err = TRUE;
3215 sprintf(vbuf, "%d", db->amode);
3216 if(!dpput(db->metadb, ESTKEYAMODE, -1, vbuf, -1, DP_DOVER)) err = TRUE;
3217 if(db->metacc){
3218 sbuf = cbmapdump(db->metacc, &ssiz);
3219 if(!dpput(db->metadb, ESTKEYMETA, -1, sbuf, ssiz, DP_DOVER)) err = TRUE;
3220 free(sbuf);
3221 }
3222 if(err){
3223 db->ecode = ESTEDB;
3224 db->fatal = TRUE;
3225 }
3226 return err ? FALSE : TRUE;
3227 }
3228
3229
3230 /* Call the callback function of a database.
3231 `db' specifies a database object.
3232 `info' specifies an extra message. */
3233 static void est_db_inform(ESTDB *db, const char *info){
3234 char *msg;
3235 assert(db);
3236 if(!db->cbinfo) return;
3237 msg = cbsprintf("%s: name=%s dnum=%d wnum=%d fsiz=%.0f crnum=%d csiz=%.0f",
3238 info, db->name, db->dnum, vlrnum(db->fwmdb), (double)est_db_size(db),
3239 cbmaprnum(db->idxcc), (double)est_db_used_cache_size(db));
3240 db->cbinfo(msg);
3241 free(msg);
3242 }
3243
3244
3245 /* Get the size of used cache region.
3246 `db' specifies a database object.
3247 The return value is the size of used cache region. */
3248 static int est_db_used_cache_size(ESTDB *db){
3249 assert(db);
3250 return (db->icsiz + cbmaprnum(db->idxcc) * (sizeof(CBMAPDATUM) + ESTWORDAVGLEN)) * ESTMEMIRATIO;
3251 }
3252
3253
3254 /* Prepare cache for meta data.
3255 `db' specifies a database object. */
3256 static void est_db_prepare_meta(ESTDB *db){
3257 char *sbuf;
3258 int ssiz;
3259 assert(db);
3260 if((sbuf = dpget(db->metadb, ESTKEYMETA, -1, 0, -1, &ssiz)) != NULL){
3261 db->metacc = cbmapload(sbuf, ssiz);
3262 free(sbuf);
3263 } else {
3264 db->metacc = cbmapopenex(ESTMINIBNUM);
3265 }
3266 }
3267
3268
3269 /* Create a list of terms for search.
3270 `phrase' specifies a search phrase.
3271 The return value is a list object of the terms of the phrase. */
3272 static CBLIST *est_phrase_terms(const char *phrase){
3273 CBLIST *terms, *elems;
3274 CBDATUM *datum;
3275 const char *elem;
3276 char *tbuf, *pbuf;
3277 int i, tsiz, psiz, lw;
3278 assert(phrase);
3279 terms = cblistopen();
3280 tbuf = est_uconv_in(phrase, strlen(phrase), &tsiz);
3281 est_normalize_text((unsigned char *)tbuf, tsiz, &tsiz);
3282 pbuf = est_uconv_out(tbuf, tsiz, &psiz);
3283 elems = cbsplit(pbuf, psiz, "\a\b\t\n\v\f\r ");
3284 datum = cbdatumopen("", 0);
3285 lw = FALSE;
3286 for(i = 0; i < CB_LISTNUM(elems); i++){
3287 elem = CB_LISTVAL(elems, i, NULL);
3288 if(elem[0] == '\0') continue;
3289 if(!strcmp(elem, ESTOPUNION)){
3290 if(CB_DATUMSIZE(datum) < 1) continue;
3291 if(lw) cbdatumcat(datum, "\t", -1);
3292 lw = FALSE;
3293 } else if(!strcmp(elem, ESTOPISECT) || !strcmp(elem, ESTOPDIFF)){
3294 if(CB_DATUMSIZE(datum) < 1) continue;
3295 cblistpush(terms, CB_DATUMPTR(datum), CB_DATUMSIZE(datum));
3296 cbdatumsetsize(datum, 0);
3297 cblistpush(terms, elem, -1);
3298 lw = FALSE;
3299 } else {
3300 if(CB_DATUMSIZE(datum) > 0 && lw) cbdatumcat(datum, " ", 1);
3301 cbdatumcat(datum, elem, -1);
3302 lw = TRUE;
3303 }
3304 }
3305 if(CB_DATUMSIZE(datum) > 0) cblistpush(terms, CB_DATUMPTR(datum), CB_DATUMSIZE(datum));
3306 cbdatumclose(datum);
3307 cblistclose(elems);
3308 free(pbuf);
3309 free(tbuf);
3310 for(i = 0; i < CB_LISTNUM(terms); i++){
3311 elem = CB_LISTVAL(terms, i, NULL);
3312 if(!strcmp(elem, ESTOPUVSET) || !strcmp(elem, ESTOPISECT) ||
3313 !strcmp(elem, ESTOPDIFF)) continue;
3314 tbuf = est_uconv_in(elem, strlen(elem), &tsiz);
3315 est_canonicalize_text((unsigned char *)tbuf, tsiz, TRUE);
3316 pbuf = est_uconv_out(tbuf, tsiz, &psiz);
3317 cbstrtrim(pbuf);
3318 cblistover(terms, i, pbuf, -1);
3319 free(pbuf);
3320 free(tbuf);
3321 }
3322 for(i = CB_LISTNUM(terms) - 1; i >= 0; i--){
3323 elem = CB_LISTVAL(terms, i, NULL);
3324 if(strcmp(elem, ESTOPISECT) && strcmp(elem, ESTOPDIFF)) break;
3325 free(cblistpop(terms, NULL));
3326 }
3327 return terms;
3328 }
3329
3330
3331 /* Compare two scores by each ID.
3332 `ap' specifies the pointer to one score.
3333 `bp' specifies the pointer to the other score.
3334 The return value is negative if one is small, positive if one is big, 0 if both are equal. */
3335 static int est_score_compare_by_id(const void *ap, const void *bp){
3336 assert(ap && bp);
3337 return ((ESTSCORE *)ap)->id - ((ESTSCORE *)bp)->id;
3338 }
3339
3340
3341 /* Compare two scores by each score point.
3342 `ap' specifies the pointer to one score.
3343 `bp' specifies the pointer to the other score.
3344 The return value is negative if one is small, positive if one is big, 0 if both are equal. */
3345 static int est_score_compare_by_score(const void *ap, const void *bp){
3346 assert(ap && bp);
3347 return ((ESTSCORE *)bp)->score - ((ESTSCORE *)ap)->score;
3348 }
3349
3350
3351 /* Compare two scores by attributes of strings for ascending order.
3352 `ap' specifies the pointer to one score.
3353 `bp' specifies the pointer to the other score.
3354 The return value is negative if one is small, positive if one is big, 0 if both are equal. */
3355 static int est_score_compare_by_str_asc(const void *ap, const void *bp){
3356 assert(ap && bp);
3357 return strcmp(((ESTSCORE *)ap)->value, ((ESTSCORE *)bp)->value);
3358 }
3359
3360
3361 /* Compare two scores by attributes of strings for descending order.
3362 `ap' specifies the pointer to one score.
3363 `bp' specifies the pointer to the other score.
3364 The return value is negative if one is small, positive if one is big, 0 if both are equal. */
3365 static int est_score_compare_by_str_desc(const void *ap, const void *bp){
3366 assert(ap && bp);
3367 return strcmp(((ESTSCORE *)bp)->value, ((ESTSCORE *)ap)->value);
3368 }
3369
3370
3371 /* Compare two scores by attributes of numbers for ascending order.
3372 `ap' specifies the pointer to one score.
3373 `bp' specifies the pointer to the other score.
3374 The return value is negative if one is small, positive if one is big, 0 if both are equal. */
3375 static int est_score_compare_by_num_asc(const void *ap, const void *bp){
3376 assert(ap && bp);
3377 return (time_t)((ESTSCORE *)ap)->value - (time_t)((ESTSCORE *)bp)->value;
3378 }
3379
3380
3381 /* Compare two scores by attributes of numbers for descending order.
3382 `ap' specifies the pointer to one score.
3383 `bp' specifies the pointer to the other score.
3384 The return value is negative if one is small, positive if one is big, 0 if both are equal. */
3385 static int est_score_compare_by_num_desc(const void *ap, const void *bp){
3386 assert(ap && bp);
3387 return (time_t)((ESTSCORE *)bp)->value - (time_t)((ESTSCORE *)ap)->value;
3388 }
3389
3390
3391 /* Get the universal set of documents in a database.
3392 `db' specifies a database object.
3393 `nump' specifies the pointer to which the number of elements in the result is assigned.
3394 `hints' specifies a list object. If it is `NULL', it is not used.
3395 `add' specifies whether the result to be treated in union or difference.
3396 The return value is an array whose elements are ID numbers of corresponding documents. */
3397 static ESTSCORE *est_search_uvset(ESTDB *db, int *nump, CBMAP *hints, int add){
3398 ESTSCORE *scores;
3399 char *vbuf, numbuf[ESTNUMBUFSIZ];
3400 int snum, smax;
3401 assert(db && nump);
3402 smax = ESTALLOCUNIT;
3403 CB_MALLOC(scores, smax * sizeof(ESTSCORE));
3404 snum = 0;
3405 vlcurfirst(db->listdb);
3406 while((vbuf = vlcurval(db->listdb, NULL)) != NULL){
3407 if(snum >= smax){
3408 smax *= 2;
3409 CB_REALLOC(scores, smax * sizeof(ESTSCORE));
3410 }
3411 scores[snum].id = atoi(vbuf);
3412 scores[snum].score = 0;
3413 snum++;
3414 free(vbuf);
3415 vlcurnext(db->listdb);
3416 }
3417 *nump = snum;
3418 if(hints){
3419 sprintf(numbuf, "%d", snum * (add ? 1 : -1));
3420 cbmapput(hints, ESTOPUVSET, -1, numbuf, -1, FALSE);
3421 }
3422 return scores;
3423 }
3424
3425
3426 /* Expand a word to words which begins with it.
3427 `db' specifies a database object.
3428 `word' specifies a word.
3429 `list' specifies a list object to contain the results. */
3430 static void est_expand_word(ESTDB *db, const char *word, CBLIST *list){
3431 char *kbuf;
3432 int ksiz;
3433 assert(db && word && list);
3434 vlcurjump(db->fwmdb, word, -1, VL_JFORWARD);
3435 while((kbuf = vlcurkey(db->fwmdb, &ksiz)) != NULL){
3436 if(!cbstrfwmatch(kbuf, word)){
3437 free(kbuf);
3438 break;
3439 }
3440 cblistpushbuf(list, kbuf, ksiz);
3441 vlcurnext(db->fwmdb);
3442 }
3443 }
3444
3445
3446 /* Get a corresponding set of documents in a database.
3447 `db' specifies a database object.
3448 `term' specifies a union term.  It is split at tabs into words and the results of the words are merged.
3449 `gstep' specifies number of steps of N-gram.
3450 `nump' specifies the pointer to which the number of elements in the result is assigned.
3451 `hints' specifies a map object to record the number of hits of each word.  If it is `NULL', it is not used.
3452 `add' specifies whether the result to be treated in union or difference.  If it is false, the numbers in the hints are negative.
3453 The return value is an array whose elements are ID numbers of corresponding documents, sorted by ID without duplication. */
3454 static ESTSCORE *est_search_union(ESTDB *db, const char *term, int gstep,
3455 int *nump, CBMAP *hints, int add){
3456 ESTSCORE *scores, *tscores;
3457 CBLIST *words, *grams;
3458 const char *word, *gram, *rp, *fnext, *snext, *cbuf;
3459 char *vbuf, numbuf[ESTNUMBUFSIZ];
3460 int i, j, k, snum, smax, single, tsmax, tsnum, vsiz, gcnum, gsiz, csiz, wgstep, nnum;
3461 int mfsiz, mssiz, mfhash, mshash, tfhash, tshash, id, score, hit, hnum;
3462 assert(db && term && gstep > 0 && nump);
3463 smax = ESTALLOCUNIT;
3464 CB_MALLOC(scores, smax * sizeof(ESTSCORE));
3465 snum = 0;
3466 words = cbsplit(term, -1, "\t"); /* alternative words of the union term */
3467 for(i = 0; i < CB_LISTNUM(words); i++){
3468 word = CB_LISTVAL(words, i, NULL);
3469 grams = cblistopen();
3470 switch(db->amode){ /* break the word into grams according to the analysis mode */
3471 case ESTAMPERFNG:
3472 est_break_text_perfng(word, grams, TRUE, FALSE);
3473 break;
3474 default:
3475 est_break_text(word, grams, TRUE, FALSE);
3476 break;
3477 }
3478 single = FALSE;
3479 if(CB_LISTNUM(grams) < 1){ /* no gram was made: use the keys which begin with the word instead */
3480 est_expand_word(db, word, grams);
3481 single = TRUE;
3482 }
3483 tsmax = ESTALLOCUNIT;
3484 CB_MALLOC(tscores, tsmax * sizeof(ESTSCORE));
3485 tsnum = 0;
3486 gcnum = 0; /* number of grams actually looked up */
3487 wgstep = CB_LISTNUM(grams) > 2 || gstep > 2 ? gstep : 1;
3488 if(((unsigned char *)word)[0] <= 0xdf && gstep <= 2) wgstep = 1;
3489 for(j = 0; j < CB_LISTNUM(grams); j += wgstep){
3490 gcnum++;
3491 gram = CB_LISTVAL2(grams, j, &gsiz);
3492 fnext = j < CB_LISTNUM(grams) - 1 ? CB_LISTVAL2(grams, j + 1, &mfsiz) : NULL;
3493 snext = j < CB_LISTNUM(grams) - 2 ? CB_LISTVAL2(grams, j + 2, &mssiz) : NULL;
3494 mfhash = fnext ? dpinnerhash(fnext, mfsiz) % ESTJHASHNUM + 1: 0xff; /* hash of the next gram, or 0xff if there is none */
3495 mshash = snext ? dpouterhash(snext, mssiz) % ESTJHASHNUM + 1: 0xff; /* hash of the gram after the next, or 0xff if there is none */
3496 vbuf = est_idx_get(db->idxdb, gram, gsiz, &vsiz);
3497 if((cbuf = cbmapget(db->idxcc, gram, gsiz, &csiz)) != NULL){ /* append the value held in the index cache */
3498 if(vbuf){
3499 CB_REALLOC(vbuf, vsiz + csiz + 100);
3500 memcpy(vbuf + vsiz, cbuf, csiz);
3501 vsiz += csiz;
3502 } else {
3503 vbuf = cbmemdup(cbuf, csiz);
3504 vsiz = csiz;
3505 }
3506 }
3507 if(!vbuf) continue;
3508 rp = vbuf;
3509 while(rp < vbuf + vsiz){ /* each record: an ID of `int', a score of one byte, and pairs of hash bytes ended by 0x00 */
3510 memcpy(&id, rp, sizeof(int));
3511 rp += sizeof(int);
3512 score = *(unsigned char *)rp;
3513 rp++;
3514 hit = mfhash == 0xff && mshash == 0xff; /* with no following gram, every record hits */
3515 while(rp < vbuf + vsiz){
3516 tfhash = *(unsigned char *)rp;
3517 rp++;
3518 tshash = *(unsigned char *)rp;
3519 rp++;
3520 if((mfhash == 0xff || mfhash == tfhash) && (mshash == 0xff || mshash == tshash))
3521 hit = TRUE;
3522 if(*(unsigned char *)rp == 0x00){ /* terminator of the pairs of the record */
3523 rp++;
3524 break;
3525 }
3526 }
3527 if(hit || single){
3528 if(tsnum >= tsmax){
3529 tsmax *= 2;
3530 CB_REALLOC(tscores, tsmax * sizeof(ESTSCORE));
3531 }
3532 tscores[tsnum].id = id;
3533 tscores[tsnum].score = score * 100;
3534 tsnum++;
3535 }
3536 }
3537 free(vbuf);
3538 }
3539 if(gcnum > 1){ /* several grams: keep the IDs hit by at least `gcnum' records (or all if `single'), with averaged score */
3540 qsort(tscores, tsnum, sizeof(ESTSCORE), est_score_compare_by_id);
3541 nnum = 0;
3542 for(j = 0; j < tsnum; j++){
3543 id = tscores[j].id;
3544 score = tscores[j].score;
3545 hnum = 1;
3546 for(k = j + 1; k < tsnum && tscores[k].id == id; k++){
3547 score += tscores[k].score;
3548 hnum++;
3549 }
3550 if(hnum >= gcnum || single){
3551 tscores[nnum].id = id;
3552 tscores[nnum].score = score / hnum;
3553 nnum++;
3554 }
3555 j = k - 1;
3556 }
3557 tsnum = nnum;
3558 }
3559 if(hints){ /* record the number of hits of the word */
3560 sprintf(numbuf, "%d", tsnum * (add ? 1 : -1));
3561 cbmapput(hints, word, -1, numbuf, -1, FALSE);
3562 }
3563 for(j = 0; j < tsnum; j++){
3564 if(snum >= smax){
3565 smax *= 2;
3566 CB_REALLOC(scores, smax * sizeof(ESTSCORE));
3567 }
3568 scores[snum].id = tscores[j].id;
3569 scores[snum].score = tscores[j].score;
3570 snum++;
3571 }
3572 free(tscores);
3573 cblistclose(grams);
3574 }
3575 cblistclose(words);
3576 qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_id); /* merge the results of all words: one element for each ID, with averaged score */
3577 nnum = 0;
3578 for(i = 0; i < snum; i++){
3579 id = scores[i].id;
3580 score = scores[i].score;
3581 hnum = 1;
3582 for(j = i + 1; j < snum && scores[j].id == id; j++){
3583 score += scores[j].score;
3584 hnum++;
3585 }
3586 scores[nnum].id = id;
3587 scores[nnum].score = score / hnum;
3588 nnum++;
3589 i = j - 1;
3590 }
3591 *nump = nnum;
3592 return scores;
3593 }
3594
3595
3596 /* Narrow and sort scores of search candidates.
3597 `db' specifies a database object.
3598 `attrs' specifies a list object of narrowing attributes.  Each element is made of a name, an operator, and a value separated by spaces.  If it is `NULL', no narrowing is done.
3599 `order' specifies an expression for sorting, made of an attribute name and an optional type.  If it is `NULL', the order is not changed.
3600 `scores' specifies an array of scores of search candidates.  The surviving elements are packed to the head of the array.
3601 `snum' specifies the number of the array.
3602 The return value is the new number of the array.  The `value' member of every score is released or cleared before the return. */
3603 static int est_narrow_scores(ESTDB *db, const CBLIST *attrs, const char *order,
3604 ESTSCORE *scores, int snum){
3605 ESTCATTR *list;
3606 const char *otype, *cbuf, *rp, *pv, *ibuf;
3607 unsigned char *utmp;
3608 char *oname, *wp, *mbuf, *vbuf;
3609 int i, j, k, ci, oi, anum, tsiz, nnum, csiz, msiz, miss, vsiz, num, isiz, onlen;
3610 time_t tval;
3611 assert(db && scores && snum >= 0);
3612 ci = -1; /* index of the condition on the attribute held in the special cache, or -1 */
3613 oi = -1; /* index of the condition on the attribute used for ordering, or -1 */
3614 oname = NULL;
3615 otype = NULL;
3616 if(order){ /* split the order expression into the attribute name and the type */
3617 oname = cbmemdup(order, -1);
3618 cbstrtrim(oname);
3619 otype = ESTORDSTRA; /* default type when the expression has no type */
3620 if((wp = strchr(oname, ' ')) != NULL){
3621 *wp = '\0';
3622 rp = wp + 1;
3623 while(*rp == ' '){
3624 rp++;
3625 }
3626 otype = rp; /* points into `oname', which lives until the end of the function */
3627 }
3628 }
3629 if(attrs){
3630 anum = CB_LISTNUM(attrs);
3631 CB_MALLOC(list, sizeof(ESTCATTR) * anum + 1);
3632 for(i = 0; i < anum; i++){ /* parse each expression into name, operator, and value */
3633 list[i].name = NULL;
3634 list[i].oper = NULL;
3635 list[i].val = NULL;
3636 rp = CB_LISTVAL(attrs, i, NULL);
3637 while(*rp > 0 && *rp <= ' '){
3638 rp++;
3639 }
3640 if((pv = strchr(rp, ' ')) != NULL){
3641 list[i].nsiz = pv - rp;
3642 list[i].name = cbmemdup(rp, list[i].nsiz);
3643 rp = pv;
3644 while(*rp > 0 && *rp <= ' '){
3645 rp++;
3646 }
3647 if((pv = strchr(rp, ' ')) != NULL){
3648 list[i].oper = cbmemdup(rp, pv - rp);
3649 rp = pv;
3650 while(*rp > 0 && *rp <= ' '){
3651 rp++;
3652 }
3653 list[i].vsiz = strlen(rp);
3654 list[i].val = cbmemdup(rp, list[i].vsiz);
3655 } else {
3656 list[i].oper = cbmemdup(rp, -1);
3657 }
3658 } else {
3659 list[i].nsiz = strlen(rp);
3660 list[i].name = cbmemdup(rp, list[i].nsiz);
3661 }
3662 if(!list[i].oper){
3663 list[i].oper = cbmemdup("", 0);
3664 }
3665 if(!list[i].val){
3666 list[i].vsiz = 0;
3667 list[i].val = cbmemdup("", 0);
3668 }
3669 }
3670 for(i = 0; i < anum; i++){ /* decode each operator */
3671 rp = list[i].oper;
3672 if(*rp == '!'){ /* a leading `!' negates the condition */
3673 list[i].sign = FALSE;
3674 rp++;
3675 } else {
3676 list[i].sign = TRUE;
3677 }
3678 if(*rp == 'I' || *rp == 'i'){ /* a leading `I' prepares a normalized and canonicalized copy of the value */
3679 utmp = (unsigned char *)est_uconv_in(list[i].val, list[i].vsiz, &tsiz);
3680 est_normalize_text(utmp, tsiz, &tsiz);
3681 est_canonicalize_text(utmp, tsiz, FALSE);
3682 list[i].sval = (char *)est_uconv_out((char *)utmp, tsiz, &(list[i].ssiz));
3683 free(utmp);
3684 rp++;
3685 } else {
3686 list[i].sval = NULL;
3687 list[i].ssiz = 0;
3688 }
3689 list[i].num = cbstrmktime(list[i].val);
3690 if(!cbstricmp(rp, ESTOPSTREQ)){ /* the operator is kept as the pointer of the matching constant, or `NULL' if unknown */
3691 list[i].cop = ESTOPSTREQ;
3692 } else if(!cbstricmp(rp, ESTOPSTRNE)){
3693 list[i].cop = ESTOPSTRNE;
3694 } else if(!cbstricmp(rp, ESTOPSTRINC)){
3695 list[i].cop = ESTOPSTRINC;
3696 } else if(!cbstricmp(rp, ESTOPSTRBW)){
3697 list[i].cop = ESTOPSTRBW;
3698 } else if(!cbstricmp(rp, ESTOPSTREW)){
3699 list[i].cop = ESTOPSTREW;
3700 } else if(!cbstricmp(rp, ESTOPNUMEQ)){
3701 list[i].cop = ESTOPNUMEQ;
3702 } else if(!cbstricmp(rp, ESTOPNUMNE)){
3703 list[i].cop = ESTOPNUMNE;
3704 } else if(!cbstricmp(rp, ESTOPNUMGT)){
3705 list[i].cop = ESTOPNUMGT;
3706 } else if(!cbstricmp(rp, ESTOPNUMGE)){
3707 list[i].cop = ESTOPNUMGE;
3708 } else if(!cbstricmp(rp, ESTOPNUMLT)){
3709 list[i].cop = ESTOPNUMLT;
3710 } else if(!cbstricmp(rp, ESTOPNUMLE)){
3711 list[i].cop = ESTOPNUMLE;
3712 } else {
3713 list[i].cop = NULL;
3714 }
3715 }
3716 if(db->spacc){ /* find the first condition on the attribute held in the special cache */
3717 for(i = 0; i < anum; i++){
3718 if(!strcmp(list[i].name, db->scname)){
3719 ci = i;
3720 break;
3721 }
3722 }
3723 }
3724 if(oname){ /* find the first condition on the attribute used for ordering */
3725 for(i = 0; i < anum; i++){
3726 if(!strcmp(list[i].name, oname)){
3727 oi = i;
3728 break;
3729 }
3730 }
3731 }
3732 nnum = 0;
3733 for(i = 0; i < snum; i++){ /* check each candidate against all conditions */
3734 scores[i].value = NULL;
3735 if(ci >= 0){
3736 if((cbuf = cbmapget(db->spacc, (char *)&(scores[i].id), sizeof(int), &csiz)) != NULL)
3737 cbmapmove(db->spacc, (char *)&(scores[i].id), sizeof(int), FALSE);
3738 } else {
3739 cbuf = NULL;
3740 csiz = 0;
3741 }
3742 mbuf = NULL;
3743 if((cbuf && anum == 1) || /* the attribute database is not read if the cache alone can answer the only condition */
3744 (mbuf = crget(db->attrdb, (char *)&(scores[i].id), sizeof(int), 0, -1, &msiz)) != NULL){
3745 miss = FALSE;
3746 for(j = 0; !miss && j < anum; j++){
3747 if(list[j].nsiz < 1) continue;
3748 if(mbuf){
3749 vbuf = cbmaploadone(mbuf, msiz, list[j].name, list[j].nsiz, &vsiz);
3750 } else if(csiz != 1 || cbuf[0] != '\0'){
3751 vbuf = cbmemdup(cbuf, csiz);
3752 vsiz = csiz;
3753 } else { /* a cached value of one null byte stands for a missing attribute */
3754 vbuf = NULL;
3755 }
3756 if(list[j].oper[0] == '\0'){ /* no operator: the attribute has only to exist */
3757 if(!vbuf) miss = TRUE;
3758 } else {
3759 if(!vbuf){
3760 vbuf = cbmemdup("", 0);
3761 vsiz = 0;
3762 }
3763 if(!est_match_attr(vbuf, vsiz, list[j].cop, list[j].sign, list[j].val, list[j].vsiz,
3764 list[j].sval, list[j].ssiz, list[j].num))
3765 miss = TRUE;
3766 }
3767 if(j == ci && !cbuf){ /* store the value just read into the special cache */
3768 if(vbuf){
3769 cbmapput(db->spacc, (char *)&(scores[i].id), sizeof(int), vbuf, vsiz, FALSE);
3770 } else {
3771 cbmapput(db->spacc, (char *)&(scores[i].id), sizeof(int), "", 1, FALSE);
3772 }
3773 if(cbmaprnum(db->spacc) > db->scmnum){ /* the cache is over its limit: remove about a tenth of the limit, in the order of the iterator */
3774 num = db->scmnum * 0.1 + 1;
3775 cbmapiterinit(db->spacc);
3776 for(k = 0; k < num && (ibuf = cbmapiternext(db->spacc, &isiz)) != NULL; k++){
3777 cbmapout(db->spacc, ibuf, isiz);
3778 }
3779 }
3780 }
3781 if(j == oi){ /* keep the value for ordering; the score now owns the region */
3782 scores[i].value = vbuf;
3783 } else {
3784 free(vbuf);
3785 }
3786 }
3787 if(miss){
3788 free(scores[i].value);
3789 } else {
3790 scores[nnum++] = scores[i];
3791 }
3792 }
3793 free(mbuf);
3794 }
3795 snum = nnum;
3796 for(i = 0; i < anum; i++){
3797 free(list[i].sval);
3798 free(list[i].val);
3799 free(list[i].oper);
3800 free(list[i].name);
3801 }
3802 free(list);
3803 } else {
3804 for(i = 0; i < snum; i++){
3805 scores[i].value = NULL;
3806 }
3807 }
3808 if(oname){
3809 ci = db->spacc && !strcmp(oname, db->scname); /* from here `ci' is a flag: whether the ordering attribute is the cached one */
3810 onlen = strlen(oname);
3811 for(i = 0; i < snum; i++){ /* fill the ordering value of every score which does not have it yet */
3812 if(scores[i].value) continue;
3813 if(ci && (cbuf = cbmapget(db->spacc, (char *)&(scores[i].id), sizeof(int), &csiz)) != NULL){
3814 cbmapmove(db->spacc, (char *)&(scores[i].id), sizeof(int), FALSE);
3815 if(csiz == 1 && cbuf[0] == '\0'){
3816 scores[i].value = cbmemdup("", 0);
3817 } else {
3818 scores[i].value = cbmemdup(cbuf, csiz);
3819 }
3820 continue;
3821 }
3822 if((mbuf = crget(db->attrdb, (char *)&(scores[i].id), sizeof(int), 0, -1, &msiz)) != NULL){
3823 if((vbuf = cbmaploadone(mbuf, msiz, oname, onlen, &vsiz)) != NULL){
3824 if(ci) cbmapput(db->spacc, (char *)&(scores[i].id), sizeof(int), vbuf, vsiz, FALSE);
3825 scores[i].value = vbuf;
3826 } else {
3827 if(ci) cbmapput(db->spacc, (char *)&(scores[i].id), sizeof(int), "", 1, FALSE);
3828 scores[i].value = cbmemdup("", 0);
3829 }
3830 if(ci && cbmaprnum(db->spacc) > db->scmnum){
3831 num = db->scmnum * 0.1 + 1;
3832 cbmapiterinit(db->spacc);
3833 for(j = 0; j < num && (ibuf = cbmapiternext(db->spacc, &isiz)) != NULL; j++){
3834 cbmapout(db->spacc, ibuf, isiz);
3835 }
3836 }
3837 free(mbuf);
3838 } else {
3839 scores[i].value = cbmemdup("", 0);
3840 }
3841 }
3842 if(!cbstricmp(otype, ESTORDSTRA)){ /* an unknown type leaves the order as it is */
3843 qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_str_asc);
3844 } else if(!cbstricmp(otype, ESTORDSTRD)){
3845 qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_str_desc);
3846 } else if(!cbstricmp(otype, ESTORDNUMA)){
3847 for(i = 0; i < snum; i++){ /* replace each string by its number, stored in the pointer member itself */
3848 tval = cbstrmktime(scores[i].value);
3849 free(scores[i].value);
3850 scores[i].value = (void *)tval;
3851 }
3852 qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_num_asc);
3853 for(i = 0; i < snum; i++){ /* the members hold numbers, not regions: clear them so that they are not released below */
3854 scores[i].value = NULL;
3855 }
3856 } else if(!cbstricmp(otype, ESTORDNUMD)){
3857 for(i = 0; i < snum; i++){
3858 tval = cbstrmktime(scores[i].value);
3859 free(scores[i].value);
3860 scores[i].value = (void *)tval;
3861 }
3862 qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_num_desc);
3863 for(i = 0; i < snum; i++){
3864 scores[i].value = NULL;
3865 }
3866 }
3867 for(i = 0; i < snum; i++){
3868 free(scores[i].value);
3869 }
3870 free(oname);
3871 }
3872 return snum;
3873 }
3874
3875
3876 /* Check whether a score matches an attribute condition.
3877 `tval' specifies the target value;
3878 `tsiz' specifies the size of the target value
3879 `oval' specifies the operation value;
3880 `osiz' specifies the size of the operation value
3881 `sval' specifies the operation value of small cases;
3882 `ssiz' specifies the size of the operation value of small cases.
3883 `onum' specifies the numeric value.
3884 The return value is true if it does match, else it is false. */
3885 static int est_match_attr(const char *tval, int tsiz, const char *cop, int sign,
3886 const char *oval, int osiz, const char *sval, int ssiz, int onum){
3887 unsigned char *eval;
3888 char *cval;
3889 int csiz, esiz, hit;
3890 assert(tval && tsiz >= 0 && oval && osiz >= 0);
3891 cval = NULL;
3892 if(sval){
3893 eval = (unsigned char *)est_uconv_in(tval, tsiz, &esiz);
3894 est_normalize_text(eval, esiz, &esiz);
3895 est_canonicalize_text(eval, esiz, FALSE);
3896 cval = (char *)est_uconv_out((char *)eval, esiz, &csiz);
3897 free(eval);
3898 tval = cval;
3899 tsiz = csiz;
3900 oval = sval;
3901 osiz = ssiz;
3902 }
3903 if(cop == ESTOPSTREQ){
3904 hit = !strcmp(tval, oval);
3905 } else if(cop == ESTOPSTRNE){
3906 hit = strcmp(tval, oval) != 0;
3907 } else if(cop == ESTOPSTRINC){
3908 hit = strstr(tval, oval) != NULL;
3909 } else if(cop == ESTOPSTRBW){
3910 hit = cbstrfwmatch(tval, oval);
3911 } else if(cop == ESTOPSTREW){
3912 hit = cbstrbwmatch(tval, oval);
3913 } else if(cop == ESTOPNUMEQ){
3914 hit = cbstrmktime(tval) == onum;
3915 } else if(cop == ESTOPNUMNE){
3916 hit = cbstrmktime(tval) != onum;
3917 } else if(cop == ESTOPNUMGT){
3918 hit = cbstrmktime(tval) > onum;
3919 } else if(cop == ESTOPNUMGE){
3920 hit = cbstrmktime(tval) >= onum;
3921 } else if(cop == ESTOPNUMLT){
3922 hit = cbstrmktime(tval) < onum;
3923 } else if(cop == ESTOPNUMLE){
3924 hit = cbstrmktime(tval) <= onum;
3925 } else {
3926 hit = FALSE;
3927 }
3928 free(cval);
3929 return sign ? hit : !hit;
3930 }
3931
3932
3933 /* Compare two keywords by scores in descending order.
3934 `ap' specifies the pointer to one keyword.
3935 `bp' specifies the pointer to the other keyword.
3936 The return value is negative if one is small, positive if one is big, 0 if both are equal. */
3937 static int est_keysc_compare(const void *ap, const void *bp){
3938 assert(ap && bp);
3939 return ((ESTKEYSC *)bp)->pt - ((ESTKEYSC *)ap)->pt;
3940 }
3941
3942
3943 /* Get a similar set of documents in a database.
3944 `db' specifies a database object.
3945 `svmap' specifies a map object of a seed vector.
3946 `nump' specifies the pointer to which the number of elements in the result is assigned.
3947 `knum' specifies the number of keywords to get candidates.
3948 `unum' specifies the number of adopted documents for a keyword.
3949 `tfidf' specifies whether to perform TF-IDF tuning.
3950 `nmin' specifies the minimum value for narrowing.
3951 The return value is an array whose elements are ID numbers of similar documents. */
3952 static ESTSCORE *est_search_similar(ESTDB *db, CBMAP *svmap, int *nump,
3953 int knum, int unum, int tfidf, double nmin){
3954 ESTSCORE *scores, *tscores;
3955 CBMAP *tvmap;
3956 const char *word;
3957 int i, j, vnum, snum, tmax, tsnum, nnum, lid, *svec, *tvec;
3958 double dval;
3959 assert(db && svmap && nump && knum >= 0 && unum >= 0 && nmin >= 0.0);
3960 CB_MALLOC(scores, sizeof(ESTSCORE) * unum * knum);
3961 snum = 0;
3962 if((vnum = cbmaprnum(svmap)) < 1) vnum = 1;
3963 cbmapiterinit(svmap);
3964 tmax = unum;
3965 for(i = 0; i < knum && (word = cbmapiternext(svmap, NULL)) != NULL; i++){
3966 tscores = est_search_union(db, word, 1, &tsnum, NULL, TRUE);
3967 qsort(tscores, tsnum, sizeof(ESTSCORE), est_score_compare_by_score);
3968 for(j = 0; j < tmax && j < tsnum; j++){
3969 scores[snum].id = tscores[j].id;
3970 scores[snum].score = tscores[j].score;
3971 snum++;
3972 }
3973 free(tscores);
3974 tmax -= unum / knum / 1.25;
3975 }
3976 qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_id);
3977 nnum = 0;
3978 lid = -1;
3979 CB_MALLOC(svec, vnum * sizeof(int));
3980 CB_MALLOC(tvec, vnum * sizeof(int));
3981 est_set_svec(svmap, svec, vnum);
3982 for(i = 0; i < snum; i++){
3983 if(scores[i].id != lid){
3984 tvmap = NULL;
3985 if(db->cbvec) tvmap = db->cbvec(db, scores[i].id, db->vecdata);
3986 if(!tvmap) tvmap = est_get_tvmap(db, scores[i].id, vnum, tfidf);
3987 if(tvmap){
3988 est_set_tvec(svmap, tvmap, tvec, vnum);
3989 if((dval = est_vec_cos(svec, tvec, vnum)) >= nmin){
3990 scores[nnum].id = scores[i].id;
3991 scores[nnum].score = (int)(dval * 10000);
3992 if(scores[nnum].score == 9999) scores[nnum].score = 10000;
3993 nnum++;
3994 }
3995 cbmapclose(tvmap);
3996 }
3997 }
3998 lid = scores[i].id;
3999 }
4000 free(tvec);
4001 free(svec);
4002 snum = nnum;
4003 *nump = snum;
4004 return scores;
4005 }
4006
4007
4008 /* Create a map object of a vector for similar search from a phrase.
4009 `phrase' specifies a search phrase for similar search.
4010 The return value is a map object of the seed vector. */
4011 static CBMAP *est_phrase_vector(const char *phrase){
4012 CBMAP *svmap;
4013 CBLIST *list;
4014 const char *pv, *rp;
4015 char *utext, *rtext;
4016 int i, num, len, size;
4017 svmap = cbmapopenex(ESTMINIBNUM);
4018 list = cblistopen();
4019 while(*phrase != '\0'){
4020 if(*phrase == ESTOPWITH[0] && cbstrfwmatch(phrase, ESTOPWITH)){
4021 phrase += strlen(ESTOPWITH);
4022 pv = phrase;
4023 while(*phrase != '\0'){
4024 if(*phrase <= ' ' && cbstrfwmatch(phrase + 1, ESTOPWITH)){
4025 phrase++;
4026 break;
4027 }
4028 phrase++;
4029 }
4030 cblistpush(list, pv, phrase - pv);
4031 } else {
4032 phrase++;
4033 }
4034 }
4035 for(i = 0; i < CB_LISTNUM(list); i++){
4036 pv = CB_LISTVAL(list, i, NULL);
4037 while(*pv > '\0' && *pv <= ' '){
4038 pv++;
4039 }
4040 num = strtol(pv, (char **)&rp, 10);
4041 if(rp && (len = rp - pv) > 0 && num >= 0){
4042 utext = est_uconv_in(rp, strlen(rp), &size);
4043 est_normalize_text((unsigned char *)utext, size, &size);
4044 est_canonicalize_text((unsigned char *)utext, size, FALSE);
4045 rtext = est_uconv_out(utext, size, NULL);
4046 cbstrsqzspc(rtext);
4047 if(rtext[0] != '\0') cbmapput(svmap, rtext, -1, pv, len, FALSE);
4048 free(rtext);
4049 free(utext);
4050 }
4051 }
4052 cblistclose(list);
4053 return svmap;
4054 }
4055
4056
4057 /* Get the target vector of a document dynamically.
4058 `db' specifies a database object.
4059 `id' specifies the ID of a document.
4060 `vnum' specifies the number of dimensions of the vector.
4061 `tfidf' specifies whether to perform TF-IDF tuning.
4062 The return value is a map object of the target vector. */
4063 static CBMAP *est_get_tvmap(ESTDB *db, int id, int vnum, int tfidf){
4064 ESTDOC *doc;
4065 CBMAP *tvmap;
4066 assert(db && id > 0);
4067 if(!(doc = est_db_get_doc(db, id, 0))) return NULL;
4068 tvmap = est_db_etch_doc(tfidf ? db : NULL, doc, vnum);
4069 est_doc_delete(doc);
4070 return tvmap;
4071 }
4072
4073
4074 /* Set a seed vector from a map object.
4075 `svmap' specifies a map object of a seed vector.
4076 `svec' specifies a vector object.
4077 `vnum' specifies the number of dimensions of the vector. */
4078 static void est_set_svec(CBMAP *svmap, int *svec, int vnum){
4079 const char *kbuf;
4080 int i, ksiz;
4081 assert(svmap && svec && vnum > 0);
4082 cbmapiterinit(svmap);
4083 for(i = 0; i < vnum; i++){
4084 if((kbuf = cbmapiternext(svmap, &ksiz)) != NULL){
4085 svec[i] = atoi(cbmapget(svmap, kbuf, ksiz, NULL));
4086 } else {
4087 svec[i] = 0;
4088 }
4089 }
4090 }
4091
4092
4093 /* Set a target vector from a map object.
4094 `svmap' specifies a map object of a seed vector.
4095 `tvmap' specifies a map object of a target vector.
4096 `tvec' specifies a vector object.
4097 `vnum' specifies the number of dimensions of the vector. */
4098 static void est_set_tvec(CBMAP *svmap, CBMAP *tvmap, int *tvec, int vnum){
4099 const char *kbuf, *vbuf;
4100 int i, ksiz;
4101 assert(svmap && tvmap && tvec && vnum > 0);
4102 cbmapiterinit(svmap);
4103 for(i = 0; i < vnum; i++){
4104 if((kbuf = cbmapiternext(svmap, &ksiz)) != NULL){
4105 vbuf = cbmapget(tvmap, kbuf, ksiz, NULL);
4106 tvec[i] = vbuf ? atoi(vbuf) : 0;
4107 } else {
4108 tvec[i] = 0;
4109 }
4110 }
4111 }
4112
4113
4114 /* Get the absolute of a vector.
4115 `vec' specifies a vector object.
4116 `vnum' specifies the number of dimensions of the vector.
4117 The return value is the absolute of the vector. */
4118 static double est_vec_abs(const int *vec, int vnum){
4119 double rv;
4120 int i;
4121 assert(vec && vnum >= 0);
4122 rv = 0;
4123 for(i = 0; i < vnum; i++){
4124 rv += (double)vec[i] * (double)vec[i];
4125 }
4126 return sqrt(rv);
4127 }
4128
4129
/* Get the inner product of two vectors.
   `avec' specifies a vector object.
   `bvec' specifies the other vector object.
   `vnum' specifies the number of dimensions of the vector.
   The return value is the inner product of two vectors. */
static double est_vec_iprod(const int *avec, const int *bvec, int vnum){
  double sum;
  int idx;
  assert(avec && bvec && vnum >= 0);
  sum = 0;
  idx = 0;
  while(idx < vnum){
    /* multiply as doubles so that large elements do not overflow the integer range */
    sum += (double)avec[idx] * (double)bvec[idx];
    idx++;
  }
  return sum;
}
4145
4146
/* Get the cosine of the angle of two vectors.
   `avec' specifies a vector object.
   `bvec' specifies the other vector object.
   `vnum' specifies the number of dimensions of the vector.
   The return value is the cosine of the angle of two vectors.  Negative cosines are rounded
   up to 0.0, and 0.0 is also returned if either vector has no length. */
static double est_vec_cos(const int *avec, const int *bvec, int vnum){
  double denom, rv;
  assert(avec && bvec && vnum >= 0);
  denom = est_vec_abs(avec, vnum) * est_vec_abs(bvec, vnum);
  /* a vector of zero length has no direction; do not divide by zero */
  if(denom <= 0.0) return 0.0;
  rv = est_vec_iprod(avec, bvec, vnum) / denom;
  return rv > 0.0 ? rv : 0.0;
}
4159
4160
4161 /* Close the handle to the file of random number generator. */
4162 static void est_random_fclose(void){
4163 if(est_random_ifp) fclose(est_random_ifp);
4164 }
4165
4166
4167
4168 /* END OF FILE */

  ViewVC Help
Powered by ViewVC 1.1.26