77 |
int g_inputlang = ESTLANGEN; /* preferred language */ |
int g_inputlang = ESTLANGEN; /* preferred language */ |
78 |
const char *g_pathcode = NULL; /* path encoding */ |
const char *g_pathcode = NULL; /* path encoding */ |
79 |
int g_pathfull = FALSE; /* whether to record full paths */ |
int g_pathfull = FALSE; /* whether to record full paths */ |
80 |
|
CBLIST *g_pathattrs = NULL; /* names of elements in path extension */ |
81 |
int g_oextmodes = 0; /* extra open modes */ |
int g_oextmodes = 0; /* extra open modes */ |
82 |
int g_viewmode = VM_ID; /* viewing mode */ |
int g_viewmode = VM_ID; /* viewing mode */ |
83 |
int g_filefmt = FF_AUTO; /* file format */ |
int g_filefmt = FF_AUTO; /* file format */ |
88 |
double g_cachesize = -1; /* size of the cache */ |
double g_cachesize = -1; /* size of the cache */ |
89 |
int g_doforce = FALSE; /* whether to force purging or extracting */ |
int g_doforce = FALSE; /* whether to force purging or extracting */ |
90 |
int g_kwordnum = KWORDNUM; /* number of keywords */ |
int g_kwordnum = KWORDNUM; /* number of keywords */ |
|
int g_condopts = 0; /* options of the search condition */ |
|
91 |
int g_rdmode = RD_RAND; /* mode of random documents */ |
int g_rdmode = RD_RAND; /* mode of random documents */ |
92 |
|
|
93 |
|
|
125 |
static int procinform(const char *dbname); |
static int procinform(const char *dbname); |
126 |
static int procoptimize(const char *dbname); |
static int procoptimize(const char *dbname); |
127 |
static int procsearch(const char *dbname, const char *phrase, |
static int procsearch(const char *dbname, const char *phrase, |
128 |
const CBLIST *attrs, const char *ord, int max, int sim); |
const CBLIST *attrs, const char *ord, int max, int opts, int sim); |
129 |
static int procgather(const char *dbname, const char *filename); |
static int procgather(const char *dbname, const char *filename); |
130 |
static int procpurge(const char *dbname, const char *prefix); |
static int procpurge(const char *dbname, const char *prefix); |
131 |
static int procextkeys(const char *dbname, const char *prefix, int ni); |
static int procextkeys(const char *dbname, const char *prefix, int ni); |
137 |
static void xmlprintf(const char *format, ...); |
static void xmlprintf(const char *format, ...); |
138 |
static int strtolang(const char *str); |
static int strtolang(const char *str); |
139 |
static char *fgetl(FILE *ifp); |
static char *fgetl(FILE *ifp); |
140 |
static int doputdoc(ESTDB *db, const char *path); |
static int doputdoc(ESTDB *db, const char *path, const CBLIST *attrs); |
141 |
static const char *pathtourl(const char *path); |
static const char *pathtourl(const char *path); |
142 |
static const char *urltofile(const char *uri); |
static const char *urltofile(const char *uri); |
143 |
static char *urltopath(const char *uri); |
static char *urltopath(const char *uri); |
291 |
" [-ni] [-sf] [-hs] [-attr expr] [-ord expr] [-max num] [-sim id] db [phrase]\n", |
" [-ni] [-sf] [-hs] [-attr expr] [-ord expr] [-max num] [-sim id] db [phrase]\n", |
292 |
g_progname); |
g_progname); |
293 |
fprintf(stderr, " %s gather [-cl] [-fe|-ft|-fh|-fm] [-fx sufs cmd] [-fz] [-fo]" |
fprintf(stderr, " %s gather [-cl] [-fe|-ft|-fh|-fm] [-fx sufs cmd] [-fz] [-fo]" |
294 |
" [-ic enc] [-il lang] [-pc enc] [-pf] [-apn] [-sd] [-cm] [-cs num] db [file|dir]\n", |
" [-ic enc] [-il lang] [-pc enc] [-pf] [-px name] [-apn] [-sd] [-cm] [-cs num]" |
295 |
g_progname); |
" db [file|dir]\n", g_progname); |
296 |
fprintf(stderr, " %s purge [-cl] [-fc] db [prefix]\n", g_progname); |
fprintf(stderr, " %s purge [-cl] [-fc] db [prefix]\n", g_progname); |
297 |
fprintf(stderr, " %s extkeys [-fc] [-ni] [-kn num] db [prefix]\n", g_progname); |
fprintf(stderr, " %s extkeys [-fc] [-ni] [-kn num] db [prefix]\n", g_progname); |
298 |
fprintf(stderr, " %s draft [-ft|-fh|-fm] [-ic enc] [-il lang] [file]\n", g_progname); |
fprintf(stderr, " %s draft [-ft|-fh|-fm] [-ic enc] [-il lang] [file]\n", g_progname); |
510 |
CBDATUM *pbuf; |
CBDATUM *pbuf; |
511 |
CBLIST *attrs; |
CBLIST *attrs; |
512 |
char *dbname, *ord, *phrase, *tmp; |
char *dbname, *ord, *phrase, *tmp; |
513 |
int i, max, sim, rv; |
int i, max, opts, sim, rv; |
514 |
dbname = NULL; |
dbname = NULL; |
515 |
ord = NULL; |
ord = NULL; |
516 |
max = SEARCHMAX; |
max = SEARCHMAX; |
517 |
|
opts = 0; |
518 |
sim = -1; |
sim = -1; |
519 |
pbuf = cbdatumopen("", 0); |
pbuf = cbdatumopen("", 0); |
520 |
cbglobalgc(pbuf, (void (*)(void *))cbdatumclose); |
cbglobalgc(pbuf, (void (*)(void *))cbdatumclose); |
526 |
if(++i >= argc) usage(); |
if(++i >= argc) usage(); |
527 |
g_inputcode = argv[i]; |
g_inputcode = argv[i]; |
528 |
} else if(!strcmp(argv[i], "-gs")){ |
} else if(!strcmp(argv[i], "-gs")){ |
529 |
g_condopts |= ESTCONDSURE; |
opts |= ESTCONDSURE; |
530 |
} else if(!strcmp(argv[i], "-gf")){ |
} else if(!strcmp(argv[i], "-gf")){ |
531 |
g_condopts |= ESTCONDFAST; |
opts |= ESTCONDFAST; |
532 |
} else if(!strcmp(argv[i], "-ga")){ |
} else if(!strcmp(argv[i], "-ga")){ |
533 |
g_condopts |= ESTCONDAGIT; |
opts |= ESTCONDAGIT; |
534 |
} else if(!strcmp(argv[i], "-ni")){ |
} else if(!strcmp(argv[i], "-ni")){ |
535 |
g_condopts |= ESTCONDNOIDF; |
opts |= ESTCONDNOIDF; |
536 |
} else if(!strcmp(argv[i], "-sf")){ |
} else if(!strcmp(argv[i], "-sf")){ |
537 |
g_condopts |= ESTCONDSIMPLE; |
opts |= ESTCONDSIMPLE; |
538 |
} else if(!strcmp(argv[i], "-hs")){ |
} else if(!strcmp(argv[i], "-hs")){ |
539 |
g_condopts |= ESTCONDSCFB; |
opts |= ESTCONDSCFB; |
540 |
} else if(!strcmp(argv[i], "-vu")){ |
} else if(!strcmp(argv[i], "-vu")){ |
541 |
g_viewmode = VM_URI; |
g_viewmode = VM_URI; |
542 |
} else if(!strcmp(argv[i], "-va")){ |
} else if(!strcmp(argv[i], "-va")){ |
585 |
free(tmp); |
free(tmp); |
586 |
} |
} |
587 |
} |
} |
588 |
rv = procsearch(dbname, phrase, attrs, ord, max, sim); |
rv = procsearch(dbname, phrase, attrs, ord, max, opts, sim); |
589 |
free(phrase); |
free(phrase); |
590 |
return rv; |
return rv; |
591 |
} |
} |
597 |
const char *elem; |
const char *elem; |
598 |
char *dbname, *filename; |
char *dbname, *filename; |
599 |
int i, j, rv; |
int i, j, rv; |
600 |
|
g_pathattrs = cblistopen(); |
601 |
|
cbglobalgc(g_pathattrs, (void (*)(void *))cblistclose); |
602 |
g_xcmdmap = cbmapopenex(MINIBNUM); |
g_xcmdmap = cbmapopenex(MINIBNUM); |
603 |
cbglobalgc(g_xcmdmap, (void (*)(void *))cbmapclose); |
cbglobalgc(g_xcmdmap, (void (*)(void *))cbmapclose); |
604 |
dbname = NULL; |
dbname = NULL; |
639 |
g_pathcode = argv[i]; |
g_pathcode = argv[i]; |
640 |
} else if(!strcmp(argv[i], "-pf")){ |
} else if(!strcmp(argv[i], "-pf")){ |
641 |
g_pathfull = TRUE; |
g_pathfull = TRUE; |
642 |
|
} else if(!strcmp(argv[i], "-px")){ |
643 |
|
if(++i >= argc) usage(); |
644 |
|
cblistpush(g_pathattrs, argv[i], -1); |
645 |
} else if(!strcmp(argv[i], "-apn")){ |
} else if(!strcmp(argv[i], "-apn")){ |
646 |
g_oextmodes |= ESTDBPERFNG; |
g_oextmodes |= ESTDBPERFNG; |
647 |
} else if(!strcmp(argv[i], "-sd")){ |
} else if(!strcmp(argv[i], "-sd")){ |
1128 |
|
|
1129 |
/* perform the search command */ |
/* perform the search command */ |
1130 |
static int procsearch(const char *dbname, const char *phrase, |
static int procsearch(const char *dbname, const char *phrase, |
1131 |
const CBLIST *attrs, const char *ord, int max, int sim){ |
const CBLIST *attrs, const char *ord, int max, int opts, int sim){ |
1132 |
ESTDB *db; |
ESTDB *db; |
1133 |
ESTCOND *cond; |
ESTCOND *cond; |
1134 |
ESTDOC *doc; |
ESTDOC *doc; |
1151 |
if(sim > 0){ |
if(sim > 0){ |
1152 |
svmap = kwdb ? vectorizer(db, sim, kwdb) : NULL; |
svmap = kwdb ? vectorizer(db, sim, kwdb) : NULL; |
1153 |
if(!svmap && (doc = est_db_get_doc(db, sim, 0)) != NULL){ |
if(!svmap && (doc = est_db_get_doc(db, sim, 0)) != NULL){ |
1154 |
svmap = est_db_etch_doc((g_condopts & ESTCONDNOIDF) ? NULL : db, doc, KWORDNUM); |
svmap = est_db_etch_doc((opts & ESTCONDNOIDF) ? NULL : db, doc, KWORDNUM); |
1155 |
est_doc_delete(doc); |
est_doc_delete(doc); |
1156 |
} |
} |
1157 |
if(svmap){ |
if(svmap){ |
1179 |
} |
} |
1180 |
if(ord) est_cond_set_order(cond, ord); |
if(ord) est_cond_set_order(cond, ord); |
1181 |
if(max >= 0) est_cond_set_max(cond, max); |
if(max >= 0) est_cond_set_max(cond, max); |
1182 |
est_cond_set_options(cond, g_condopts); |
est_cond_set_options(cond, opts); |
1183 |
hints = cbmapopenex(MINIBNUM); |
hints = cbmapopenex(MINIBNUM); |
1184 |
curtime = est_gettimeofday(); |
curtime = est_gettimeofday(); |
1185 |
res = est_db_search(db, cond, &rnum, hints); |
res = est_db_search(db, cond, &rnum, hints); |
1449 |
/* perform the gather command */ |
/* perform the gather command */ |
1450 |
static int procgather(const char *dbname, const char *filename){ |
static int procgather(const char *dbname, const char *filename){ |
1451 |
ESTDB *db; |
ESTDB *db; |
1452 |
CBLIST *list, *clist; |
CBLIST *list, *clist, *attrs; |
1453 |
FILE *ifp; |
FILE *ifp; |
1454 |
const char *tmp; |
const char *tmp; |
1455 |
char *line, *path; |
char *line, *path; |
1480 |
} |
} |
1481 |
cblistclose(clist); |
cblistclose(clist); |
1482 |
} else { |
} else { |
1483 |
if(!doputdoc(db, line)){ |
if(!doputdoc(db, line, NULL)){ |
1484 |
printferror("%s: %s", line, est_err_msg(est_db_error(db))); |
printferror("%s: %s", line, est_err_msg(est_db_error(db))); |
1485 |
err = TRUE; |
err = TRUE; |
1486 |
} |
} |
1514 |
est_db_set_cache_size(db, g_cachesize, -1, -1); |
est_db_set_cache_size(db, g_cachesize, -1, -1); |
1515 |
} |
} |
1516 |
while((line = fgetl(ifp)) != NULL){ |
while((line = fgetl(ifp)) != NULL){ |
1517 |
if(!doputdoc(db, line)){ |
if(line[0] == '\0'){ |
1518 |
printferror("%s: %s", line, est_err_msg(est_db_error(db))); |
free(line); |
1519 |
err = TRUE; |
continue; |
1520 |
|
} |
1521 |
|
if(cblistnum(g_pathattrs) > 0){ |
1522 |
|
attrs = cbsplit(line, -1, "\t"); |
1523 |
|
path = cblistshift(attrs, NULL); |
1524 |
|
if(!doputdoc(db, path, attrs)){ |
1525 |
|
printferror("%s: %s", path, est_err_msg(est_db_error(db))); |
1526 |
|
err = TRUE; |
1527 |
|
} |
1528 |
|
free(path); |
1529 |
|
cblistclose(attrs); |
1530 |
|
} else { |
1531 |
|
if(!doputdoc(db, line, NULL)){ |
1532 |
|
printferror("%s: %s", line, est_err_msg(est_db_error(db))); |
1533 |
|
err = TRUE; |
1534 |
|
} |
1535 |
} |
} |
1536 |
free(line); |
free(line); |
1537 |
if(err || g_sigterm) break; |
if(err || g_sigterm) break; |
2259 |
|
|
2260 |
|
|
2261 |
/* register a document */ |
/* register a document */ |
2262 |
static int doputdoc(ESTDB *db, const char *path){ |
static int doputdoc(ESTDB *db, const char *path, const CBLIST *attrs){ |
2263 |
ESTDOC *doc, *edoc; |
ESTDOC *doc, *edoc; |
2264 |
const char *uri, *vbuf, *xcmd; |
const char *uri, *vbuf, *xcmd; |
2265 |
char *dbuf, *tbuf; |
char *dbuf, *tbuf; |
2266 |
int err, fmt, id, dsiz; |
int i, err, fmt, id, dsiz; |
2267 |
time_t emdate, fmdate; |
time_t emdate, fmdate; |
2268 |
struct stat sbuf; |
struct stat sbuf; |
2269 |
xcmd = NULL; |
xcmd = NULL; |
2336 |
break; |
break; |
2337 |
} |
} |
2338 |
} |
} |
2339 |
|
if(attrs){ |
2340 |
|
for(i = 0; i < cblistnum(g_pathattrs) && i < cblistnum(attrs); i++){ |
2341 |
|
est_doc_add_attr(doc, cblistval(g_pathattrs, i, NULL), cblistval(attrs, i, NULL)); |
2342 |
|
} |
2343 |
|
} |
2344 |
if(!est_doc_attr(doc, ESTDATTRURI)) est_doc_add_attr(doc, ESTDATTRURI, uri); |
if(!est_doc_attr(doc, ESTDATTRURI)) est_doc_add_attr(doc, ESTDATTRURI, uri); |
2345 |
est_doc_add_attr(doc, DATTRLPATH, uri); |
est_doc_add_attr(doc, DATTRLPATH, uri); |
2346 |
est_doc_add_attr(doc, DATTRLFILE, urltofile(uri)); |
est_doc_add_attr(doc, DATTRLFILE, urltofile(uri)); |