/[hyperestraier]/trunk/estcmd.c
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/estcmd.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3 - (show annotations)
Fri Jul 29 21:57:20 2005 UTC (18 years, 9 months ago) by dpavlin
File MIME type: text/plain
File size: 105537 byte(s)
make working copy from version 0.5.1

1 /*************************************************************************************************
2 * The command line interface for the core API
3 * Copyright (C) 2004-2005 Mikio Hirabayashi
4 * This file is part of Hyper Estraier.
5 * Hyper Estraier is free software; you can redistribute it and/or modify it under the terms of
6 * the GNU Lesser General Public License as published by the Free Software Foundation; either
7 * version 2.1 of the License or any later version. Hyper Estraier is distributed in the hope
8 * that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
10 * License for more details.
11 * You should have received a copy of the GNU Lesser General Public License along with Hyper
12 * Estraier; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
13 * Boston, MA 02111-1307 USA.
14 *************************************************************************************************/
15
16
17 #include "estraier.h"
18 #include "myconf.h"
19
20 #define NUMBUFSIZ 32 /* size of a buffer for a number */
21 #define URIBUFSIZ 8192 /* size of a buffer for a URI */
22 #define MINIBNUM 31 /* bucket number of a small map */
23 #define SEARCHMAX 10 /* maximum number of shown documents */
24 #define SNIPWWIDTH 480 /* whole width of the snippet */
25 #define SNIPHWIDTH 96 /* width of beginning of the text */
26 #define SNIPAWIDTH 96 /* width around each highlighted word */
27 #define CACHEMAX (512*1024*1024) /* max cache size (512 mega bytes) */
28 #define DATTRLPATH "_lpath" /* name of the attribute of the local path */
29 #define DATTRLFILE "_lfile" /* name of the attribute of the local file name */
30 #define DATTRSCORE "#score" /* name of the pseudo-attribute of score */
31 #define DATTRKWORDS "#kwords" /* name of the pseudo-attribute of keywords */
32 #define KWDBNAME "kwords" /* name of the database for keywords */
33 #define KWDBBNUM 122869 /* bucket number of the keyword database */
34 #define KWDBDNUM 3 /* division number of the keyword database */
35 #define KWORDNUM 32 /* default number of shown keywords (initial value of g_kwordnum) */
36 #define RDOCSNUM 6 /* number of sections of a random document */
37 #define RDOCCNUM 256 /* number of characters in a section */
38
39 enum { /* enumeration for viewing modes (stored in g_viewmode) */
40 VM_ID, /* ID only (initial value of g_viewmode) */
41 VM_URI, /* ID and URI (search option -vu) */
42 VM_ATTR, /* all attributes (search option -va) */
43 VM_FULL, /* all attributes and body text (search option -vf) */
44 VM_SNIP, /* all attributes and snippet (search option -vs) */
45 VM_HMRD, /* human readable (search option -vh) */
46 VM_XML, /* XML (search option -vx) */
47 VM_DUMP /* dump draft files (search option -dd) */
48 };
49
50 enum { /* enumeration for file formats (stored in g_filefmt) */
51 FF_AUTO, /* automatic detection (initial value of g_filefmt) */
52 FF_DRAFT, /* draft (gather option -fe; the draft command starts with this format) */
53 FF_TEXT, /* plain text (option -ft) */
54 FF_HTML, /* HTML (option -fh) */
55 FF_MIME, /* MIME (option -fm) */
56 FF_NONE /* ignored (gather option -fz) */
57 };
58
59 enum { /* enumeration for test documents (stored in g_rdmode) */
60 RD_ENG, /* English (randput option -ren) */
61 RD_LAT, /* Latin (randput option -rla) */
62 RD_EURO, /* European mix (randput option -reu) */
63 RD_ORI, /* Oriental (randput option -ror) */
64 RD_JPN, /* Japanese (randput option -rjp) */
65 RD_CHAO, /* chaos (randput option -rch) */
66 RD_RAND /* selected at random (initial value of g_rdmode) */
67 };
68
69
70 /* global variables: mostly set by the argument parsers (run* functions) */
71 const char *g_progname; /* program name (argv[0]) */
72 int g_sigterm = FALSE; /* flag for termination signal (set by sigtermhandler) */
73 int g_putopts = 0; /* options of registration */
74 int g_outopts = 0; /* options of deletion */
75 int g_optopts = 0; /* options of optimization */
76 const char *g_inputcode = "UTF-8"; /* input encoding */
77 int g_inputlang = ESTLANGEN; /* preferred language */
78 const char *g_pathcode = NULL; /* path encoding */
79 int g_pathfull = FALSE; /* whether to record full paths */
80 int g_oextmodes = 0; /* extra open modes */
81 int g_viewmode = VM_ID; /* viewing mode */
82 int g_filefmt = FF_AUTO; /* file format */
83 CBMAP *g_xcmdmap = NULL; /* map of suffixes and filter commands */
84 int g_filtorig = FALSE; /* whether to use filter for original files */
85 int g_stdate = FALSE; /* whether to adopt date by stat */
86 int g_chkmdate = FALSE; /* whether to check modification date */
87 double g_cachesize = -1; /* size of the cache */
88 int g_doforce = FALSE; /* whether to force purging or extracting */
89 int g_kwordnum = KWORDNUM; /* number of keywords */
90 int g_condopts = 0; /* options of the search condition */
91 int g_rdmode = RD_RAND; /* mode of random documents */
92
93
94 /* function prototypes */
95 int main(int argc, char **argv);
96 static void printferror(const char *format, ...);
97 static void printfinfo(const char *format, ...);
98 static void dbinform(const char *msg);
99 static void setsignals(void);
100 static void sigtermhandler(int num);
101 static void usage(void);
102 static int runput(int argc, char **argv);
103 static int runout(int argc, char **argv);
104 static int runget(int argc, char **argv);
105 static int runlist(int argc, char **argv);
106 static int runuriid(int argc, char **argv);
107 static int runmeta(int argc, char **argv);
108 static int runinform(int argc, char **argv);
109 static int runoptimize(int argc, char **argv);
110 static int runsearch(int argc, char **argv);
111 static int rungather(int argc, char **argv);
112 static int runpurge(int argc, char **argv);
113 static int runextkeys(int argc, char **argv);
114 static int rundraft(int argc, char **argv);
115 static int runbreak(int argc, char **argv);
116 static int runrandput(int argc, char **argv);
117 static int runwicked(int argc, char **argv);
118 static int runregression(int argc, char **argv);
119 static int procput(const char *dbname, const char *filename);
120 static int procout(const char *dbname, int id, const char *expr);
121 static int procget(const char *dbname, int id, const char *expr, const char *attr);
122 static int proclist(const char *dbname);
123 static int procuriid(const char *dbname, const char *uri);
124 static int procmeta(const char *dbname, const char *mname, const char *mvalue);
125 static int procinform(const char *dbname);
126 static int procoptimize(const char *dbname);
127 static int procsearch(const char *dbname, const char *phrase,
128 const CBLIST *attrs, const char *ord, int max, int sim);
129 static int procgather(const char *dbname, const char *filename);
130 static int procpurge(const char *dbname, const char *prefix);
131 static int procextkeys(const char *dbname, const char *prefix, int ni);
132 static int procdraft(const char *filename);
133 static int procbreak(const char *filename, int wt);
134 static int procrandput(const char *dbname, int dnum);
135 static int procwicked(const char *dbname, int dnum);
136 static int procregression(const char *dbname);
137 static void xmlprintf(const char *format, ...);
138 static int strtolang(const char *str);
139 static char *fgetl(FILE *ifp);
140 static int doputdoc(ESTDB *db, const char *path);
141 static const char *pathtourl(const char *path);
142 static const char *urltofile(const char *uri);
143 static char *urltopath(const char *uri);
144 static CBMAP *vectorizer(void *db, int id, void *kwdb);
145 static ESTDOC *est_doc_new_with_xcmd(const char *buf, int size, const char *path,
146 const char *xcmd, const char *tmpdir,
147 const char *penc, int plang);
148 static ESTDOC *est_doc_new_from_draft_enc(const char *buf, int size, const char *enc);
149 static ESTDOC *est_doc_new_from_text(const char *buf, int size, const char *penc, int plang);
150 static ESTDOC *est_doc_new_from_html(const char *buf, int size, const char *penc, int plang);
151 static char *est_html_enc(const char *str);
152 static char *est_html_raw_text(const char *html);
153 static ESTDOC *est_doc_new_from_mime(const char *buf, int size, const char *penc, int plang);
154 static void est_doc_add_attr_mime(ESTDOC *doc, const char *name, const char *value);
155 static ESTDOC *est_doc_new_from_chaos(int cnum, int snum, int mode);
156 static char *est_random_str(int cnum, int mode);
157
158
159 /* main routine */
160 int main(int argc, char **argv){
161 const char *tmp;
162 int rv;
163 if((tmp = getenv("ESTDBGFD")) != NULL) dpdbgfd = atoi(tmp);
164 cbstdiobin();
165 g_progname = argv[0];
166 g_sigterm = FALSE;
167 if(argc < 2) usage();
168 rv = 0;
169 if(!strcmp(argv[1], "put")){
170 setsignals();
171 rv = runput(argc, argv);
172 } else if(!strcmp(argv[1], "out")){
173 setsignals();
174 rv = runout(argc, argv);
175 } else if(!strcmp(argv[1], "get")){
176 rv = runget(argc, argv);
177 } else if(!strcmp(argv[1], "list")){
178 rv = runlist(argc, argv);
179 } else if(!strcmp(argv[1], "uriid")){
180 rv = runuriid(argc, argv);
181 } else if(!strcmp(argv[1], "meta")){
182 setsignals();
183 rv = runmeta(argc, argv);
184 } else if(!strcmp(argv[1], "inform")){
185 rv = runinform(argc, argv);
186 } else if(!strcmp(argv[1], "optimize")){
187 setsignals();
188 rv = runoptimize(argc, argv);
189 } else if(!strcmp(argv[1], "search")){
190 rv = runsearch(argc, argv);
191 } else if(!strcmp(argv[1], "gather")){
192 setsignals();
193 rv = rungather(argc, argv);
194 } else if(!strcmp(argv[1], "purge")){
195 setsignals();
196 rv = runpurge(argc, argv);
197 } else if(!strcmp(argv[1], "extkeys")){
198 setsignals();
199 rv = runextkeys(argc, argv);
200 } else if(!strcmp(argv[1], "draft")){
201 rv = rundraft(argc, argv);
202 } else if(!strcmp(argv[1], "break")){
203 rv = runbreak(argc, argv);
204 } else if(!strcmp(argv[1], "randput")){
205 setsignals();
206 rv = runrandput(argc, argv);
207 } else if(!strcmp(argv[1], "wicked")){
208 setsignals();
209 rv = runwicked(argc, argv);
210 } else if(!strcmp(argv[1], "regression")){
211 setsignals();
212 rv = runregression(argc, argv);
213 } else if(!strcmp(argv[1], "version") || !strcmp(argv[1], "--version")){
214 printf("Hyper Estraier %s on %s\n", est_version, ESTSYSNAME);
215 printf("Copyright (C) 2004-2005 Mikio Hirabayashi.\n");
216 rv = 0;
217 } else {
218 usage();
219 }
220 return rv;
221 }
222
223
224 /* print formatted error string and flush the buffer */
225 static void printferror(const char *format, ...){
226 va_list ap;
227 va_start(ap, format);
228 fprintf(stderr, "%s: ERROR: ", g_progname);
229 vfprintf(stderr, format, ap);
230 fputc('\n', stderr);
231 fflush(stderr);
232 va_end(ap);
233 }
234
235
236 /* print formatted information string and flush the buffer */
237 static void printfinfo(const char *format, ...){
238 va_list ap;
239 va_start(ap, format);
240 printf("%s: INFO: ", g_progname);
241 vprintf(format, ap);
242 putchar('\n');
243 fflush(stdout);
244 va_end(ap);
245 }
246
247
/* callback function for database events
   Registered through est_db_set_informer(); relays the message text unchanged to
   printfinfo().  The "%s" format keeps `msg' from being interpreted as a format string. */
static void dbinform(const char *msg){
  printfinfo("%s", msg);
}
252
253
254 /* set signal handlers */
255 static void setsignals(void){
256 signal(1, sigtermhandler);
257 signal(2, sigtermhandler);
258 signal(3, sigtermhandler);
259 signal(13, sigtermhandler);
260 signal(15, sigtermhandler);
261 }
262
263
264 /* handler of termination signal */
265 static void sigtermhandler(int num){
266 static int tries = 0;
267 if(tries++ <= 4){
268 signal(num, sigtermhandler);
269 } else {
270 signal(num, SIG_DFL);
271 }
272 g_sigterm = TRUE;
273 printfinfo("the termination signal %d catched", num);
274 }
275
276
277 /* print the usage and exit */
278 static void usage(void){
279 fprintf(stderr, "%s: command line utility for the core API of Hyper Estraier\n", g_progname);
280 fprintf(stderr, "\n");
281 fprintf(stderr, "usage:\n");
282 fprintf(stderr, " %s put [-cl] db [file]\n", g_progname);
283 fprintf(stderr, " %s out [-cl] db expr\n", g_progname);
284 fprintf(stderr, " %s get db expr\n", g_progname);
285 fprintf(stderr, " %s list db\n", g_progname);
286 fprintf(stderr, " %s uriid db uri\n", g_progname);
287 fprintf(stderr, " %s meta db [name [value]]\n", g_progname);
288 fprintf(stderr, " %s inform db\n", g_progname);
289 fprintf(stderr, " %s optimize [-onp] [-ond] db\n", g_progname);
290 fprintf(stderr, " %s search [-ic enc] [-vu|-va|-vf|-vs|-vh|-vx|-dd] [-gs|-gf|-ga]"
291 " [-ni] [-sf] [-hs] [-attr expr] [-ord expr] [-max num] [-sim id] db [phrase]\n",
292 g_progname);
293 fprintf(stderr, " %s gather [-cl] [-fe|-ft|-fh|-fm] [-fx sufs cmd] [-fz] [-fo]"
294 " [-ic enc] [-il lang] [-pc enc] [-pf] [-apn] [-sd] [-cm] [-cs num] db [file|dir]\n",
295 g_progname);
296 fprintf(stderr, " %s purge [-cl] [-fc] db [prefix]\n", g_progname);
297 fprintf(stderr, " %s extkeys [-fc] [-ni] [-kn num] db [prefix]\n", g_progname);
298 fprintf(stderr, " %s draft [-ft|-fh|-fm] [-ic enc] [-il lang] [file]\n", g_progname);
299 fprintf(stderr, " %s break [-ic enc] [-il lang] [-apn] [-wt] [file]\n", g_progname);
300 fprintf(stderr, " %s randput [-ren|-rla|-reu|-ror|-rjp|-rch] [-cs num] db dnum\n",
301 g_progname);
302 fprintf(stderr, " %s wicked db dnum\n", g_progname);
303 fprintf(stderr, " %s regression db\n", g_progname);
304 fprintf(stderr, " %s version\n", g_progname);
305 fprintf(stderr, "\n");
306 exit(1);
307 }
308
309
310 /* parse arguments of the put command */
311 static int runput(int argc, char **argv){
312 char *dbname, *filename;
313 int i, rv;
314 dbname = NULL;
315 filename = NULL;
316 for(i = 2; i < argc; i++){
317 if(!dbname && argv[i][0] == '-'){
318 if(!strcmp(argv[i], "-cl")){
319 g_putopts |= ESTPDCLEAN;
320 } else {
321 usage();
322 }
323 } else if(!dbname){
324 dbname = argv[i];
325 } else if(!filename){
326 filename = argv[i];
327 } else {
328 usage();
329 }
330 }
331 if(!dbname) usage();
332 rv = procput(dbname, filename);
333 return rv;
334 }
335
336
337 /* parse arguments of the out command */
338 static int runout(int argc, char **argv){
339 char *dbname, *expr;
340 int i, id, rv;
341 dbname = NULL;
342 expr = NULL;
343 for(i = 2; i < argc; i++){
344 if(!dbname && argv[i][0] == '-'){
345 if(!strcmp(argv[i], "-cl")){
346 g_outopts |= ESTODCLEAN;
347 } else {
348 usage();
349 }
350 } else if(!dbname){
351 dbname = argv[i];
352 } else if(!expr){
353 expr = argv[i];
354 } else {
355 usage();
356 }
357 }
358 if(!dbname || !expr) usage();
359 if((id = atoi(expr)) > 0) expr = NULL;
360 rv = procout(dbname, id, expr);
361 return rv;
362 }
363
364
/* parse arguments of the get command
   Synopsis: get db expr [attr].  No options are accepted.
   `expr' is either a document ID or a URI.  It is treated as an ID only when the whole
   string is a positive decimal number that fits in an int; anything else is passed on as
   a URI.  (The former atoi() based check accepted trailing garbage, so an expression
   such as "12abc" silently selected document 12.)
   Invalid arguments lead to usage(), which exits.  Returns the result of procget(). */
static int runget(int argc, char **argv){
  char *dbname, *expr, *attr, *end;
  long num;
  int i, id, rv;
  dbname = NULL;
  expr = NULL;
  attr = NULL;
  for(i = 2; i < argc; i++){
    if(!dbname && argv[i][0] == '-'){
      usage();
    } else if(!dbname){
      dbname = argv[i];
    } else if(!expr){
      expr = argv[i];
    } else if(!attr){
      attr = argv[i];
    } else {
      usage();
    }
  }
  if(!dbname || !expr) usage();
  /* id stays 0 for a URI; procget() resolves the ID from the URI in that case */
  id = 0;
  num = strtol(expr, &end, 10);
  if(end != expr && *end == '\0' && num > 0 && num == (long)(int)num){
    id = (int)num;
    expr = NULL;
  }
  rv = procget(dbname, id, expr, attr);
  return rv;
}
390
391
/* parse arguments of the list command
   Synopsis: list db.  Exactly one argument that does not start with '-' is expected;
   everything else leads to usage(), which exits.  Returns the result of proclist(). */
static int runlist(int argc, char **argv){
  char *dbname = NULL;
  int i;
  for(i = 2; i < argc; i++){
    /* an option-like first argument or any second argument is invalid */
    if(dbname != NULL || argv[i][0] == '-') usage();
    dbname = argv[i];
  }
  if(dbname == NULL) usage();
  return proclist(dbname);
}
410
411
/* parse arguments of the uriid command
   Synopsis: uriid db uri.  Both arguments are required and no options are accepted;
   otherwise usage() is called, which exits.  Returns the result of procuriid(). */
static int runuriid(int argc, char **argv){
  char *dbname = NULL;
  char *uri = NULL;
  int i;
  for(i = 2; i < argc; i++){
    char *arg = argv[i];
    if(dbname == NULL){
      /* the database name must not look like an option */
      if(arg[0] == '-') usage();
      dbname = arg;
    } else if(uri == NULL){
      uri = arg;
    } else {
      usage();
    }
  }
  if(dbname == NULL || uri == NULL) usage();
  return procuriid(dbname, uri);
}
433
434
/* parse arguments of the meta command
   Synopsis: meta db [name [value]].  No options are accepted.  A missing database name,
   an option-like first argument or a fourth positional argument leads to usage(), which
   exits.  `mname' and `mvalue' stay NULL when not given.
   Returns the result of procmeta().
   (The unused local variable `del' of the former version has been removed.) */
static int runmeta(int argc, char **argv){
  char *dbname, *mname, *mvalue;
  int i, rv;
  dbname = NULL;
  mname = NULL;
  mvalue = NULL;
  for(i = 2; i < argc; i++){
    if(!dbname && argv[i][0] == '-'){
      usage();
    } else if(!dbname){
      dbname = argv[i];
    } else if(!mname){
      mname = argv[i];
    } else if(!mvalue){
      mvalue = argv[i];
    } else {
      usage();
    }
  }
  if(!dbname) usage();
  rv = procmeta(dbname, mname, mvalue);
  return rv;
}
460
461
/* parse arguments of the inform command
   Synopsis: inform db.  Exactly one argument that does not start with '-' is expected;
   everything else leads to usage(), which exits.  Returns the result of procinform(). */
static int runinform(int argc, char **argv){
  char *dbname = NULL;
  int i;
  for(i = 2; i < argc; i++){
    /* an option-like first argument or any second argument is invalid */
    if(dbname != NULL || argv[i][0] == '-') usage();
    dbname = argv[i];
  }
  if(dbname == NULL) usage();
  return procinform(dbname);
}
480
481
482 /* parse arguments of the optimize command */
483 static int runoptimize(int argc, char **argv){
484 char *dbname;
485 int i, rv;
486 dbname = NULL;
487 for(i = 2; i < argc; i++){
488 if(!dbname && argv[i][0] == '-'){
489 if(!strcmp(argv[i], "-onp")){
490 g_optopts |= ESTOPTNOPURGE;
491 } else if(!strcmp(argv[i], "-ond")){
492 g_optopts |= ESTOPTNODBOPT;
493 } else {
494 usage();
495 }
496 } else if(!dbname){
497 dbname = argv[i];
498 } else {
499 usage();
500 }
501 }
502 if(!dbname) usage();
503 rv = procoptimize(dbname);
504 return rv;
505 }
506
507
508 /* parse arguments of the search command */
509 static int runsearch(int argc, char **argv){
510 CBDATUM *pbuf;
511 CBLIST *attrs;
512 char *dbname, *ord, *phrase, *tmp;
513 int i, max, sim, rv;
514 dbname = NULL;
515 ord = NULL;
516 max = SEARCHMAX;
517 sim = -1;
518 pbuf = cbdatumopen("", 0);
519 cbglobalgc(pbuf, (void (*)(void *))cbdatumclose);
520 attrs = cblistopen();
521 cbglobalgc(attrs, (void (*)(void *))cblistclose);
522 for(i = 2; i < argc; i++){
523 if(!dbname && argv[i][0] == '-'){
524 if(!strcmp(argv[i], "-ic")){
525 if(++i >= argc) usage();
526 g_inputcode = argv[i];
527 } else if(!strcmp(argv[i], "-gs")){
528 g_condopts |= ESTCONDSURE;
529 } else if(!strcmp(argv[i], "-gf")){
530 g_condopts |= ESTCONDFAST;
531 } else if(!strcmp(argv[i], "-ga")){
532 g_condopts |= ESTCONDAGIT;
533 } else if(!strcmp(argv[i], "-ni")){
534 g_condopts |= ESTCONDNOIDF;
535 } else if(!strcmp(argv[i], "-sf")){
536 g_condopts |= ESTCONDSIMPLE;
537 } else if(!strcmp(argv[i], "-hs")){
538 g_condopts |= ESTCONDSCFB;
539 } else if(!strcmp(argv[i], "-vu")){
540 g_viewmode = VM_URI;
541 } else if(!strcmp(argv[i], "-va")){
542 g_viewmode = VM_ATTR;
543 } else if(!strcmp(argv[i], "-vf")){
544 g_viewmode = VM_FULL;
545 } else if(!strcmp(argv[i], "-vs")){
546 g_viewmode = VM_SNIP;
547 } else if(!strcmp(argv[i], "-vh")){
548 g_viewmode = VM_HMRD;
549 } else if(!strcmp(argv[i], "-vx")){
550 g_viewmode = VM_XML;
551 } else if(!strcmp(argv[i], "-dd")){
552 g_viewmode = VM_DUMP;
553 } else if(!strcmp(argv[i], "-attr")){
554 if(++i >= argc) usage();
555 cblistpush(attrs, argv[i], -1);
556 } else if(!strcmp(argv[i], "-ord")){
557 if(++i >= argc) usage();
558 ord = argv[i];
559 } else if(!strcmp(argv[i], "-max")){
560 if(++i >= argc) usage();
561 max = atoi(argv[i]);
562 } else if(!strcmp(argv[i], "-sim")){
563 if(++i >= argc) usage();
564 sim = atoi(argv[i]);
565 } else {
566 usage();
567 }
568 } else if(!dbname){
569 dbname = argv[i];
570 } else {
571 if(cbdatumsize(pbuf) > 0) cbdatumcat(pbuf, " ", 1);
572 cbdatumcat(pbuf, argv[i], -1);
573 }
574 }
575 if(!dbname) usage();
576 if(!(phrase = est_iconv(cbdatumptr(pbuf), -1, g_inputcode, "UTF-8", NULL, NULL))){
577 printferror("%s: unsupported encoding\n", g_inputcode);
578 return 1;
579 }
580 cbstrtrim(phrase);
581 for(i = 0; i < cblistnum(attrs); i++){
582 if((tmp = est_iconv(cblistval(attrs, i, NULL), -1, g_inputcode, "UTF-8", NULL, NULL)) != NULL){
583 cblistover(attrs, i, tmp, -1);
584 free(tmp);
585 }
586 }
587 rv = procsearch(dbname, phrase, attrs, ord, max, sim);
588 free(phrase);
589 return rv;
590 }
591
592
593 /* parse arguments of the gather command */
594 static int rungather(int argc, char **argv){
595 CBLIST *list;
596 const char *elem;
597 char *dbname, *filename;
598 int i, j, rv;
599 g_xcmdmap = cbmapopenex(MINIBNUM);
600 cbglobalgc(g_xcmdmap, (void (*)(void *))cbmapclose);
601 dbname = NULL;
602 filename = NULL;
603 g_inputcode = NULL;
604 for(i = 2; i < argc; i++){
605 if(!dbname && argv[i][0] == '-'){
606 if(!strcmp(argv[i], "-cl")){
607 g_putopts |= ESTPDCLEAN;
608 } else if(!strcmp(argv[i], "-fe")){
609 g_filefmt = FF_DRAFT;
610 } else if(!strcmp(argv[i], "-ft")){
611 g_filefmt = FF_TEXT;
612 } else if(!strcmp(argv[i], "-fh")){
613 g_filefmt = FF_HTML;
614 } else if(!strcmp(argv[i], "-fm")){
615 g_filefmt = FF_MIME;
616 } else if(!strcmp(argv[i], "-fx")){
617 if((i += 2) >= argc) usage();
618 list = cbsplit(argv[i-1], -1, ",");
619 for(j = 0; j < cblistnum(list); j++){
620 elem = cblistval(list, j, NULL);
621 if(elem[0] != '\0') cbmapput(g_xcmdmap, elem, -1, argv[i], -1, FALSE);
622 }
623 cblistclose(list);
624 } else if(!strcmp(argv[i], "-fz")){
625 g_filefmt = FF_NONE;
626 } else if(!strcmp(argv[i], "-fo")){
627 g_filtorig = TRUE;
628 } else if(!strcmp(argv[i], "-ic")){
629 if(++i >= argc) usage();
630 g_inputcode = argv[i];
631 } else if(!strcmp(argv[i], "-il")){
632 if(++i >= argc) usage();
633 g_inputlang = strtolang(argv[i]);
634 } else if(!strcmp(argv[i], "-pc")){
635 if(++i >= argc) usage();
636 g_pathcode = argv[i];
637 } else if(!strcmp(argv[i], "-pf")){
638 g_pathfull = TRUE;
639 } else if(!strcmp(argv[i], "-apn")){
640 g_oextmodes |= ESTDBPERFNG;
641 } else if(!strcmp(argv[i], "-sd")){
642 g_stdate = TRUE;
643 } else if(!strcmp(argv[i], "-cm")){
644 g_chkmdate = TRUE;
645 } else if(!strcmp(argv[i], "-cs")){
646 if(++i >= argc) usage();
647 g_cachesize = strtod(argv[i], NULL) * 1024 * 1024;
648 } else {
649 usage();
650 }
651 } else if(!dbname){
652 dbname = argv[i];
653 } else if(!filename){
654 filename = argv[i];
655 } else {
656 usage();
657 }
658 }
659 if(!dbname || !filename) usage();
660 rv = procgather(dbname, filename);
661 return rv;
662 }
663
664
665 /* parse arguments of the purge command */
666 static int runpurge(int argc, char **argv){
667 char *dbname, *prefix;
668 int i, rv;
669 dbname = NULL;
670 prefix = NULL;
671 for(i = 2; i < argc; i++){
672 if(!dbname && argv[i][0] == '-'){
673 if(!strcmp(argv[i], "-cl")){
674 g_outopts |= ESTODCLEAN;
675 } else if(!strcmp(argv[i], "-fc")){
676 g_doforce = TRUE;
677 } else {
678 usage();
679 }
680 } else if(!dbname){
681 dbname = argv[i];
682 } else if(!prefix){
683 prefix = argv[i];
684 } else {
685 usage();
686 }
687 }
688 if(!dbname) usage();
689 rv = procpurge(dbname, prefix);
690 return rv;
691 }
692
693
694 /* parse arguments of the extkeys command */
695 static int runextkeys(int argc, char **argv){
696 char *dbname, *prefix;
697 int i, ni, rv;
698 dbname = NULL;
699 prefix = NULL;
700 ni = FALSE;
701 for(i = 2; i < argc; i++){
702 if(!dbname && argv[i][0] == '-'){
703 if(!strcmp(argv[i], "-fc")){
704 g_doforce = TRUE;
705 } else if(!strcmp(argv[i], "-ni")){
706 ni = TRUE;
707 } else if(!strcmp(argv[i], "-kn")){
708 if(++i >= argc) usage();
709 g_kwordnum = atoi(argv[i]);
710 } else {
711 usage();
712 }
713 } else if(!dbname){
714 dbname = argv[i];
715 } else if(!prefix){
716 prefix = argv[i];
717 } else {
718 usage();
719 }
720 }
721 if(!dbname || g_kwordnum < 1) usage();
722 rv = procextkeys(dbname, prefix, ni);
723 return rv;
724 }
725
726
727 /* parse arguments of the draft command */
728 static int rundraft(int argc, char **argv){
729 char *filename;
730 int i, rv;
731 filename = NULL;
732 g_filefmt = FF_DRAFT;
733 g_inputcode = NULL;
734 for(i = 2; i < argc; i++){
735 if(!filename && argv[i][0] == '-'){
736 if(!strcmp(argv[i], "-ft")){
737 g_filefmt = FF_TEXT;
738 } else if(!strcmp(argv[i], "-fh")){
739 g_filefmt = FF_HTML;
740 } else if(!strcmp(argv[i], "-fm")){
741 g_filefmt = FF_MIME;
742 } else if(!strcmp(argv[i], "-ic")){
743 if(++i >= argc) usage();
744 g_inputcode = argv[i];
745 } else if(!strcmp(argv[i], "-il")){
746 if(++i >= argc) usage();
747 g_inputlang = strtolang(argv[i]);
748 } else {
749 usage();
750 }
751 } else if(!filename){
752 filename = argv[i];
753 } else {
754 usage();
755 }
756 }
757 rv = procdraft(filename);
758 return rv;
759 }
760
761
762 /* parse arguments of the break command */
763 static int runbreak(int argc, char **argv){
764 char *filename;
765 int i, wt, rv;
766 filename = NULL;
767 wt = FALSE;
768 for(i = 2; i < argc; i++){
769 if(!filename && argv[i][0] == '-'){
770 if(!strcmp(argv[i], "-ic")){
771 if(++i >= argc) usage();
772 g_inputcode = argv[i];
773 } else if(!strcmp(argv[i], "-il")){
774 if(++i >= argc) usage();
775 g_inputlang = strtolang(argv[i]);
776 } else if(!strcmp(argv[i], "-apn")){
777 g_oextmodes |= ESTDBPERFNG;
778 } else if(!strcmp(argv[i], "-wt")){
779 wt = TRUE;
780 } else {
781 usage();
782 }
783 } else if(!filename){
784 filename = argv[i];
785 } else {
786 usage();
787 }
788 }
789 rv = procbreak(filename, wt);
790 return rv;
791 }
792
793
794 /* parse arguments of the randput command */
795 static int runrandput(int argc, char **argv){
796 char *dbname, *dnstr;
797 int i, dnum, rv;
798 dbname = NULL;
799 dnstr = NULL;
800 for(i = 2; i < argc; i++){
801 if(!dbname && argv[i][0] == '-'){
802 if(!strcmp(argv[i], "-ren")){
803 g_rdmode = RD_ENG;
804 } else if(!strcmp(argv[i], "-rla")){
805 g_rdmode = RD_LAT;
806 } else if(!strcmp(argv[i], "-reu")){
807 g_rdmode = RD_EURO;
808 } else if(!strcmp(argv[i], "-ror")){
809 g_rdmode = RD_ORI;
810 } else if(!strcmp(argv[i], "-rjp")){
811 g_rdmode = RD_JPN;
812 } else if(!strcmp(argv[i], "-rch")){
813 g_rdmode = RD_CHAO;
814 } else if(!strcmp(argv[i], "-cs")){
815 if(++i >= argc) usage();
816 g_cachesize = strtod(argv[i], NULL) * 1024 * 1024;
817 } else {
818 usage();
819 }
820 } else if(!dbname){
821 dbname = argv[i];
822 } else if(!dnstr){
823 dnstr = argv[i];
824 } else {
825 usage();
826 }
827 }
828 if(!dbname || !dnstr) usage();
829 if((dnum = atoi(dnstr)) < 1) usage();
830 rv = procrandput(dbname, dnum);
831 return rv;
832 }
833
834
/* parse arguments of the wicked command
   Synopsis: wicked db dnum.  Both arguments are required and no options are accepted.
   A missing argument, a document number below 1 or any other invalid argument leads to
   usage(), which exits.  Returns the result of procwicked(). */
static int runwicked(int argc, char **argv){
  char *dbname = NULL;
  char *dnstr = NULL;
  int i, dnum;
  for(i = 2; i < argc; i++){
    char *arg = argv[i];
    if(dbname == NULL){
      /* the database name must not look like an option */
      if(arg[0] == '-') usage();
      dbname = arg;
    } else if(dnstr == NULL){
      dnstr = arg;
    } else {
      usage();
    }
  }
  if(dbname == NULL || dnstr == NULL) usage();
  dnum = atoi(dnstr);
  if(dnum < 1) usage();
  return procwicked(dbname, dnum);
}
857
858
/* parse arguments of the regression command
   Synopsis: regression db.  Exactly one argument that does not start with '-' is
   expected; everything else leads to usage(), which exits.
   Returns the result of procregression(). */
static int runregression(int argc, char **argv){
  char *dbname = NULL;
  int i;
  for(i = 2; i < argc; i++){
    /* an option-like first argument or any second argument is invalid */
    if(dbname != NULL || argv[i][0] == '-') usage();
    dbname = argv[i];
  }
  if(dbname == NULL) usage();
  return procregression(dbname);
}
877
878
879 /* perform the put command */
880 static int procput(const char *dbname, const char *filename){
881 ESTDB *db;
882 ESTDOC *doc;
883 const char *uri;
884 char *draft;
885 int ecode;
886 if(!(draft = cbreadfile(filename, NULL))){
887 printferror("%s: could not open", filename ? filename : "(stdin)");
888 return 1;
889 }
890 if(!(db = est_db_open(dbname, ESTDBWRITER | ESTDBCREAT, &ecode))){
891 printferror("%s: %s", dbname, est_err_msg(ecode));
892 free(draft);
893 return 1;
894 }
895 est_db_set_informer(db, dbinform);
896 doc = est_doc_new_from_draft(draft);
897 if(!est_db_put_doc(db, doc, g_putopts)){
898 printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
899 est_doc_delete(doc);
900 est_db_close(db, &ecode);
901 free(draft);
902 return 1;
903 }
904 if(!(uri = est_doc_attr(doc, ESTDATTRURI))) uri = "";
905 printfinfo("%d (%s): registered", est_doc_id(doc), uri);
906 est_doc_delete(doc);
907 if(!est_db_close(db, &ecode)){
908 printferror("%s: %s", dbname, est_err_msg(ecode));
909 free(draft);
910 return 1;
911 }
912 free(draft);
913 return 0;
914 }
915
916
917 /* perform the out command */
918 static int procout(const char *dbname, int id, const char *expr){
919 ESTDB *db;
920 int ecode;
921 if(!(db = est_db_open(dbname, ESTDBWRITER, &ecode))){
922 printferror("%s: %s", dbname, est_err_msg(ecode));
923 return 1;
924 }
925 est_db_set_informer(db, dbinform);
926 if(expr && (id = est_db_uri_to_id(db, expr)) < 1){
927 printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
928 est_db_close(db, &ecode);
929 return 1;
930 }
931 if(!est_db_out_doc(db, id, g_outopts)){
932 printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
933 est_db_close(db, &ecode);
934 return 1;
935 }
936 printfinfo("%d: deleted", id);
937 if(!est_db_close(db, &ecode)){
938 printferror("%s: %s", dbname, est_err_msg(ecode));
939 return 1;
940 }
941 return 0;
942 }
943
944
945 /* perform the get command */
946 static int procget(const char *dbname, int id, const char *expr, const char *attr){
947 ESTDB *db;
948 ESTDOC *doc;
949 char *draft;
950 int ecode;
951 if(!(db = est_db_open(dbname, ESTDBREADER | ESTDBLCKNB, &ecode))){
952 printferror("%s: %s", dbname, est_err_msg(ecode));
953 return 1;
954 }
955 if(expr && (id = est_db_uri_to_id(db, expr)) < 1){
956 printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
957 est_db_close(db, &ecode);
958 return 1;
959 }
960 if(attr){
961 if(!(draft = est_db_get_doc_attr(db, id, attr))){
962 printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
963 est_db_close(db, &ecode);
964 return 1;
965 }
966 printf("%s\n", draft);
967 free(draft);
968 } else {
969 if(!(doc = est_db_get_doc(db, id, 0))){
970 printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
971 est_db_close(db, &ecode);
972 return 1;
973 }
974 draft = est_doc_dump_draft(doc);
975 printf("%s", draft);
976 free(draft);
977 est_doc_delete(doc);
978 }
979 if(!est_db_close(db, &ecode)){
980 printferror("%s: %s", dbname, est_err_msg(ecode));
981 return 1;
982 }
983 return 0;
984 }
985
986
987 /* perform the list command */
988 static int proclist(const char *dbname){
989 ESTDB *db;
990 ESTDOC *doc;
991 const char *vbuf;
992 int ecode, id;
993 if(!(db = est_db_open(dbname, ESTDBREADER | ESTDBLCKNB, &ecode))){
994 printferror("%s: %s", dbname, est_err_msg(ecode));
995 return 1;
996 }
997 if(!est_db_iter_init(db)){
998 printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
999 est_db_close(db, &ecode);
1000 return 1;
1001 }
1002 while((id = est_db_iter_next(db)) > 0){
1003 if((doc = est_db_get_doc(db, id, ESTGDNOTEXT)) != NULL){
1004 if(!(vbuf = est_doc_attr(doc, ESTDATTRURI))) vbuf = "";
1005 printf("%d\t%s\n", id, vbuf);
1006 est_doc_delete(doc);
1007 }
1008 }
1009 if(!est_db_close(db, &ecode)){
1010 printferror("%s: %s", dbname, est_err_msg(ecode));
1011 return 1;
1012 }
1013 return 0;
1014 }
1015
1016
1017 /* perform the uriid command */
1018 static int procuriid(const char *dbname, const char *uri){
1019 ESTDB *db;
1020 int ecode, id;
1021 if(!(db = est_db_open(dbname, ESTDBREADER | ESTDBLCKNB, &ecode))){
1022 printferror("%s: %s", dbname, est_err_msg(ecode));
1023 return 1;
1024 }
1025 if((id = est_db_uri_to_id(db, uri)) == -1){
1026 printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
1027 est_db_close(db, &ecode);
1028 return 1;
1029 }
1030 printf("%d\n", id);
1031 if(!est_db_close(db, &ecode)){
1032 printferror("%s: %s", dbname, est_err_msg(ecode));
1033 return 1;
1034 }
1035 return 0;
1036 }
1037
1038
1039 /* perform the meta command */
1040 static int procmeta(const char *dbname, const char *mname, const char *mvalue){
1041 ESTDB *db;
1042 CBLIST *names;
1043 char *vbuf;
1044 int i, ecode;
1045 if(!(db = est_db_open(dbname, mvalue ? (ESTDBWRITER | ESTDBCREAT) : (ESTDBREADER | ESTDBLCKNB),
1046 &ecode))){
1047 printferror("%s: %s", dbname, est_err_msg(ecode));
1048 return 1;
1049 }
1050 if(mname){
1051 if(mvalue){
1052 est_db_add_meta(db, mname, mvalue[0] != '\0' ? mvalue : NULL);
1053 } else {
1054 if((vbuf = est_db_meta(db, mname)) != NULL){
1055 printf("%s\n", vbuf);
1056 free(vbuf);
1057 }
1058 }
1059 } else {
1060 names = est_db_meta_names(db);
1061 for(i = 0; i < cblistnum(names); i++){
1062 printf("%s\n", cblistval(names, i, NULL));
1063 }
1064 cblistclose(names);
1065 }
1066 if(!est_db_close(db, &ecode)){
1067 printferror("%s: %s", dbname, est_err_msg(ecode));
1068 return 1;
1069 }
1070 return 0;
1071 }
1072
1073
1074 /* perform the inform command */
1075 static int procinform(const char *dbname){
1076 ESTDB *db;
1077 int ecode;
1078 if(!(db = est_db_open(dbname, ESTDBREADER | ESTDBLCKNB, &ecode))){
1079 printferror("%s: %s", dbname, est_err_msg(ecode));
1080 return 1;
1081 }
1082 printf("number of documents: %d\n", est_db_doc_num(db));
1083 printf("number of words: %d\n", est_db_word_num(db));
1084 printf("file size: %.0f\n", est_db_size(db));
1085 if(!est_db_close(db, &ecode)){
1086 printferror("%s: %s", dbname, est_err_msg(ecode));
1087 return 1;
1088 }
1089 return 0;
1090 }
1091
1092
1093 /* perform the optimize command */
1094 static int procoptimize(const char *dbname){
1095 ESTDB *db;
1096 char path[URIBUFSIZ];
1097 int ecode;
1098 time_t curtime;
1099 curtime = time(NULL);
1100 if(!(db = est_db_open(dbname, ESTDBWRITER, &ecode))){
1101 printferror("%s: %s", dbname, est_err_msg(ecode));
1102 return 1;
1103 }
1104 est_db_set_informer(db, dbinform);
1105 sprintf(path, "%s%c%s", dbname, ESTPATHCHR, KWDBNAME);
1106 unlink(path);
1107 if(!est_db_optimize(db, g_optopts)){
1108 printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
1109 est_db_close(db, &ecode);
1110 return 1;
1111 }
1112 if(!est_db_close(db, &ecode)){
1113 printferror("%s: %s", dbname, est_err_msg(ecode));
1114 return 1;
1115 }
1116 curtime = time(NULL) - curtime;
1117 printfinfo("finished successfully: elapsed time: %dh %dm %ds",
1118 (int)(curtime / 3600), (int)((curtime / 60) % 60), (int)(curtime % 60));
1119 return 0;
1120 }
1121
1122
1123 /* perform the search command */
1124 static int procsearch(const char *dbname, const char *phrase,
1125 const CBLIST *attrs, const char *ord, int max, int sim){
1126 ESTDB *db;
1127 ESTCOND *cond;
1128 ESTDOC *doc;
1129 CURIA *kwdb;
1130 CBDATUM *pbuf;
1131 CBMAP *svmap, *hints, *kwords;
1132 CBLIST *names, *words, *lines;
1133 const char *kbuf, *vbuf, *line;
1134 char *draft, path[URIBUFSIZ], numbuf[NUMBUFSIZ], *word, *pv;
1135 int i, j, ecode, ksiz, vsiz, *res, rnum, id, sc, fin, cnt;
1136 double curtime;
1137 if(!(db = est_db_open(dbname, ESTDBREADER | ESTDBLCKNB, &ecode))){
1138 printferror("%s: %s", dbname, est_err_msg(ecode));
1139 return 1;
1140 }
1141 sprintf(path, "%s%c%s", dbname, ESTPATHCHR, KWDBNAME);
1142 if((kwdb = cropen(path, CR_OREADER, -1, -1)) != NULL)
1143 est_db_set_vectorizer(db, vectorizer, kwdb);
1144 cond = est_cond_new();
1145 if(sim > 0){
1146 svmap = kwdb ? vectorizer(db, sim, kwdb) : NULL;
1147 if(!svmap && (doc = est_db_get_doc(db, sim, 0)) != NULL){
1148 svmap = est_db_etch_doc((g_condopts & ESTCONDNOIDF) ? NULL : db, doc, KWORDNUM);
1149 est_doc_delete(doc);
1150 }
1151 if(svmap){
1152 pbuf = cbdatumopen(ESTOPSIMILAR, -1);
1153 cbmapiterinit(svmap);
1154 while((kbuf = cbmapiternext(svmap, &ksiz)) != NULL){
1155 vbuf = cbmapget(svmap, kbuf, ksiz, &vsiz);
1156 cbdatumcat(pbuf, " WITH ", -1);
1157 cbdatumcat(pbuf, vbuf, vsiz);
1158 cbdatumcat(pbuf, " ", 1);
1159 cbdatumcat(pbuf, kbuf, ksiz);
1160 }
1161 est_cond_set_phrase(cond, cbdatumptr(pbuf));
1162 cbdatumclose(pbuf);
1163 cbmapclose(svmap);
1164 }
1165 } else {
1166 while(*phrase > '\0' && *phrase <= ' '){
1167 phrase++;
1168 }
1169 if(phrase[0] != '\0' || cblistnum(attrs) < 1) est_cond_set_phrase(cond, phrase);
1170 }
1171 for(i = 0; i < cblistnum(attrs); i++){
1172 est_cond_add_attr(cond, cblistval(attrs, i, NULL));
1173 }
1174 if(ord) est_cond_set_order(cond, ord);
1175 if(max >= 0) est_cond_set_max(cond, max);
1176 est_cond_set_options(cond, g_condopts);
1177 hints = cbmapopenex(MINIBNUM);
1178 curtime = est_gettimeofday();
1179 res = est_db_search(db, cond, &rnum, hints);
1180 curtime = est_gettimeofday() - curtime;
1181 if(g_viewmode == VM_XML){
1182 xmlprintf("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
1183 xmlprintf("<estresult version=\"%@\">\n", est_version);
1184 xmlprintf("<meta>\n");
1185 xmlprintf("<hit number=\"%@\"/>\n", cbmapget(hints, "", 0, NULL));
1186 cbmapiterinit(hints);
1187 while((kbuf = cbmapiternext(hints, NULL)) != NULL){
1188 if(kbuf[0] == '\0') continue;
1189 vbuf = cbmapget(hints, kbuf, -1, NULL);
1190 xmlprintf("<hit key=\"%@\" number=\"%@\"/>\n", kbuf, vbuf);
1191 }
1192 xmlprintf("<time time=\"%.3f\"/>\n", curtime / 1000.0);
1193 xmlprintf("<total documents=\"%d\" words=\"%d\"/>\n",
1194 est_db_doc_num(db), est_db_word_num(db));
1195 xmlprintf("</meta>\n");
1196 } else {
1197 printf("%s\n", est_border_str());
1198 printf("VERSION\t%s\n", _EST_PROTVER);
1199 printf("NODE\tlocal\n");
1200 printf("HIT\t%s\n", cbmapget(hints, "", 0, NULL));
1201 cbmapiterinit(hints);
1202 cnt = 1;
1203 while((kbuf = cbmapiternext(hints, NULL)) != NULL){
1204 if(kbuf[0] == '\0') continue;
1205 vbuf = cbmapget(hints, kbuf, -1, NULL);
1206 printf("HINT#%d\t%s\t%s\n", cnt, kbuf, vbuf);
1207 cnt++;
1208 }
1209 printf("TIME\t%.3f\n", curtime / 1000.0);
1210 printf("DOCNUM\t%d\n", est_db_doc_num(db));
1211 printf("WORDNUM\t%d\n", est_db_word_num(db));
1212 switch(g_viewmode){
1213 case VM_ID:
1214 printf("VIEW\tID\n");
1215 break;
1216 case VM_URI:
1217 printf("VIEW\tURI\n");
1218 break;
1219 case VM_ATTR:
1220 printf("VIEW\tATTRIBUTE\n");
1221 break;
1222 case VM_FULL:
1223 printf("VIEW\tFULL\n");
1224 break;
1225 case VM_SNIP:
1226 printf("VIEW\tSNIPPET\n");
1227 break;
1228 case VM_HMRD:
1229 printf("VIEW\tHUMAN\n");
1230 break;
1231 }
1232 printf("\n");
1233 if(g_viewmode == VM_ID || g_viewmode == VM_URI ||
1234 g_viewmode == VM_HMRD || g_viewmode == VM_DUMP) printf("%s\n", est_border_str());
1235 }
1236 for(i = 0; i < rnum ; i++){
1237 id = res[i];
1238 sc = est_cond_score(cond, i);
1239 switch(g_viewmode){
1240 case VM_URI:
1241 if((doc = est_db_get_doc(db, id, ESTGDNOTEXT)) != NULL){
1242 if(!(vbuf = est_doc_attr(doc, ESTDATTRURI))) vbuf = "";
1243 printf("%d\t%s\n", id, vbuf);
1244 est_doc_delete(doc);
1245 }
1246 break;
1247 case VM_ATTR:
1248 if((doc = est_db_get_doc(db, id, ESTGDNOTEXT)) != NULL){
1249 if(sc >= 0){
1250 sprintf(numbuf, "%d", sc);
1251 est_doc_add_attr(doc, DATTRSCORE, numbuf);
1252 }
1253 printf("%s\n", est_border_str());
1254 names = est_doc_attr_names(doc);
1255 for(j = 0; j < cblistnum(names); j++){
1256 kbuf = cblistval(names, j, NULL);
1257 vbuf = est_doc_attr(doc, kbuf);
1258 printf("%s=%s\n", kbuf, vbuf);
1259 }
1260 cblistclose(names);
1261 est_doc_delete(doc);
1262 }
1263 printf("\n");
1264 break;
1265 case VM_FULL:
1266 if((doc = est_db_get_doc(db, id, 0)) != NULL){
1267 if(sc >= 0){
1268 sprintf(numbuf, "%d", sc);
1269 est_doc_add_attr(doc, DATTRSCORE, numbuf);
1270 }
1271 printf("%s\n", est_border_str());
1272 draft = est_doc_dump_draft(doc);
1273 printf("%s", draft);
1274 free(draft);
1275 est_doc_delete(doc);
1276 }
1277 break;
1278 case VM_SNIP:
1279 if((doc = est_db_get_doc(db, id, 0)) != NULL){
1280 if(sc >= 0){
1281 sprintf(numbuf, "%d", sc);
1282 est_doc_add_attr(doc, DATTRSCORE, numbuf);
1283 }
1284 printf("%s\n", est_border_str());
1285 names = est_doc_attr_names(doc);
1286 for(j = 0; j < cblistnum(names); j++){
1287 kbuf = cblistval(names, j, NULL);
1288 vbuf = est_doc_attr(doc, kbuf);
1289 printf("%s=%s\n", kbuf, vbuf);
1290 }
1291 cblistclose(names);
1292 kwords = kwdb ? vectorizer(db, id, kwdb) : NULL;
1293 if(!kwords) kwords = est_db_etch_doc(db, doc, KWORDNUM);
1294 if(cbmaprnum(kwords) > 0){
1295 printf("%s=", DATTRKWORDS);
1296 cbmapiterinit(kwords);
1297 for(j = 0; (kbuf = cbmapiternext(kwords, NULL)) != NULL; j++){
1298 if(j > 0) printf(" ");
1299 printf("%s %s", kbuf, cbmapget(kwords, kbuf, -1, NULL));
1300 }
1301 printf("\n");
1302 }
1303 cbmapclose(kwords);
1304 printf("\n");
1305 words = cbmapkeys(hints);
1306 draft = est_doc_make_snippet(doc, words, SNIPWWIDTH, SNIPHWIDTH, SNIPAWIDTH);
1307 printf("%s", draft);
1308 free(draft);
1309 cblistclose(words);
1310 est_doc_delete(doc);
1311 }
1312 break;
1313 case VM_HMRD:
1314 if((doc = est_db_get_doc(db, id, 0)) != NULL){
1315 if(sc >= 0){
1316 sprintf(numbuf, "%d", sc);
1317 est_doc_add_attr(doc, DATTRSCORE, numbuf);
1318 }
1319 printf("\n");
1320 if((vbuf = est_doc_attr(doc, ESTDATTRURI)) != NULL) printf("URI: %s\n", vbuf);
1321 if((vbuf = est_doc_attr(doc, ESTDATTRTITLE)) != NULL) printf("Title: %s\n", vbuf);
1322 printf(" ");
1323 words = cbmapkeys(hints);
1324 draft = est_doc_make_snippet(doc, words, SNIPWWIDTH, SNIPHWIDTH, SNIPAWIDTH);
1325 lines = cbsplit(draft, -1, "\n");
1326 fin = TRUE;
1327 for(j = 0; j < cblistnum(lines); j++){
1328 line = cblistval(lines, j, NULL);
1329 if(line[0] != '\0'){
1330 word = cbmemdup(line, -1);
1331 if((pv = strchr(word, '\t')) != NULL) *pv = '\0';
1332 printf("%s", word);
1333 free(word);
1334 fin = TRUE;
1335 } else if(fin){
1336 printf(" ... ");
1337 fin = FALSE;
1338 }
1339 }
1340 cblistclose(lines);
1341 free(draft);
1342 cblistclose(words);
1343 printf("\n\n");
1344 est_doc_delete(doc);
1345 }
1346 break;
1347 case VM_XML:
1348 if((doc = est_db_get_doc(db, id, 0)) != NULL){
1349 if(sc >= 0){
1350 sprintf(numbuf, "%d", sc);
1351 est_doc_add_attr(doc, DATTRSCORE, numbuf);
1352 }
1353 if(!(vbuf = est_doc_attr(doc, ESTDATTRURI))) vbuf = "";
1354 xmlprintf("<document id=\"%d\" uri=\"%@\">\n", id, vbuf);
1355 names = est_doc_attr_names(doc);
1356 for(j = 0; j < cblistnum(names); j++){
1357 kbuf = cblistval(names, j, NULL);
1358 if(!strcmp(kbuf, ESTDATTRID) || !strcmp(kbuf, ESTDATTRURI)) continue;
1359 vbuf = est_doc_attr(doc, kbuf);
1360 xmlprintf("<attribute name=\"%@\" value=\"%@\"/>\n", kbuf, vbuf);
1361 }
1362 cblistclose(names);
1363 kwords = kwdb ? vectorizer(db, id, kwdb) : NULL;
1364 if(!kwords) kwords = est_db_etch_doc(db, doc, KWORDNUM);
1365 if(cbmaprnum(kwords) > 0){
1366 xmlprintf("<vector>");
1367 cbmapiterinit(kwords);
1368 for(j = 0; (kbuf = cbmapiternext(kwords, NULL)) != NULL; j++){
1369 xmlprintf("<element key=\"%@\" number=\"%@\"/>",
1370 kbuf, cbmapget(kwords, kbuf, -1, NULL));
1371 }
1372 xmlprintf("</vector>\n");
1373 }
1374 cbmapclose(kwords);
1375 words = cbmapkeys(hints);
1376 draft = est_doc_make_snippet(doc, words, SNIPWWIDTH, SNIPHWIDTH, SNIPAWIDTH);
1377 lines = cbsplit(draft, -1, "\n");
1378 fin = TRUE;
1379 xmlprintf("<snippet>");
1380 for(j = 0; j < cblistnum(lines); j++){
1381 line = cblistval(lines, j, NULL);
1382 if(line[0] != '\0'){
1383 word = cbmemdup(line, -1);
1384 if((pv = strchr(word, '\t')) != NULL){
1385 *pv = '\0';
1386 pv++;
1387 xmlprintf("<key normal=\"%@\">%@</key>", pv, word);
1388 } else {
1389 xmlprintf("%@", word);
1390 }
1391 free(word);
1392 fin = TRUE;
1393 } else if(fin){
1394 xmlprintf("<delimiter/>");
1395 fin = FALSE;
1396 }
1397 }
1398 xmlprintf("</snippet>\n");
1399 cblistclose(lines);
1400 free(draft);
1401 cblistclose(words);
1402 xmlprintf("</document>\n");
1403 est_doc_delete(doc);
1404 }
1405 break;
1406 case VM_DUMP:
1407 if((doc = est_db_get_doc(db, id, 0)) != NULL){
1408 if(sc >= 0){
1409 sprintf(numbuf, "%d", sc);
1410 est_doc_add_attr(doc, DATTRSCORE, numbuf);
1411 }
1412 if(!(vbuf = est_doc_attr(doc, ESTDATTRURI))) vbuf = "";
1413 sprintf(path, "%08d%cest", id, ESTEXTCHR);
1414 printf("%s\t%s\n", path, vbuf);
1415 draft = est_doc_dump_draft(doc);
1416 if(!(cbwritefile(path, draft, -1))) printferror("%s: could not open", path);
1417 free(draft);
1418 est_doc_delete(doc);
1419 }
1420 break;
1421 default:
1422 printf("%d\n", id);
1423 break;
1424 }
1425 }
1426 if(g_viewmode == VM_XML){
1427 xmlprintf("</estresult>\n");
1428 } else {
1429 printf("%s:END\n", est_border_str());
1430 }
1431 free(res);
1432 cbmapclose(hints);
1433 est_cond_delete(cond);
1434 if(kwdb) crclose(kwdb);
1435 if(!est_db_close(db, &ecode)){
1436 printferror("%s: %s", dbname, est_err_msg(ecode));
1437 return 1;
1438 }
1439 return 0;
1440 }
1441
1442
1443 /* perform the gather command */
1444 static int procgather(const char *dbname, const char *filename){
1445 ESTDB *db;
1446 CBLIST *list, *clist;
1447 FILE *ifp;
1448 const char *tmp;
1449 char *line, *path;
1450 int i, err, ecode;
1451 time_t curtime;
1452 struct stat sbuf;
1453 curtime = time(NULL);
1454 err = FALSE;
1455 if(stat(filename, &sbuf) != -1 && S_ISDIR(sbuf.st_mode)){
1456 printfinfo("reading list from the directory: %s", filename);
1457 if((db = est_db_open(dbname, ESTDBWRITER | ESTDBCREAT | g_oextmodes, &ecode)) != NULL){
1458 est_db_set_informer(db, dbinform);
1459 if(g_cachesize > 0){
1460 if(g_cachesize > CACHEMAX) g_cachesize = CACHEMAX;
1461 est_db_set_cache_size(db, g_cachesize, -1, -1);
1462 }
1463 list = cblistopen();
1464 cblistunshift(list, filename, -1);
1465 while((line = cblistshift(list, NULL)) != NULL){
1466 if(stat(line, &sbuf) != -1 && S_ISDIR(sbuf.st_mode) && (clist = cbdirlist(line)) != NULL){
1467 cblistsort(clist);
1468 for(i = cblistnum(clist) - 1; i >= 0; i--){
1469 tmp = cblistval(clist, i, NULL);
1470 if(!strcmp(tmp, ESTCDIRSTR) || !strcmp(tmp, ESTPDIRSTR)) continue;
1471 path = cbsprintf("%s%c%s", line, ESTPATHCHR, tmp);
1472 cblistunshift(list, path, -1);
1473 free(path);
1474 }
1475 cblistclose(clist);
1476 } else {
1477 if(!doputdoc(db, line)){
1478 printferror("%s: %s", line, est_err_msg(est_db_error(db)));
1479 err = TRUE;
1480 }
1481 }
1482 free(line);
1483 if(err || g_sigterm) break;
1484 }
1485 cblistclose(list);
1486 if(!est_db_close(db, &ecode)){
1487 printferror("%s: %s", dbname, est_err_msg(ecode));
1488 err = TRUE;
1489 }
1490 } else {
1491 printferror("%s: %s", dbname, est_err_msg(ecode));
1492 err = TRUE;
1493 }
1494 } else {
1495 if(!strcmp(filename, "-")){
1496 ifp = stdin;
1497 printfinfo("reading list from the standard input", filename);
1498 } else if((ifp = fopen(filename, "rb")) != NULL){
1499 printfinfo("reading list from the file: %s", filename);
1500 } else {
1501 printferror("%s: could not open", filename);
1502 return 1;
1503 }
1504 if((db = est_db_open(dbname, ESTDBWRITER | ESTDBCREAT | g_oextmodes, &ecode)) != NULL){
1505 est_db_set_informer(db, dbinform);
1506 if(g_cachesize > 0){
1507 if(g_cachesize > CACHEMAX) g_cachesize = CACHEMAX;
1508 est_db_set_cache_size(db, g_cachesize, -1, -1);
1509 }
1510 while((line = fgetl(ifp)) != NULL){
1511 if(!doputdoc(db, line)){
1512 printferror("%s: %s", line, est_err_msg(est_db_error(db)));
1513 err = TRUE;
1514 }
1515 free(line);
1516 if(err || g_sigterm) break;
1517 }
1518 if(!est_db_close(db, &ecode)){
1519 printferror("%s: %s", dbname, est_err_msg(ecode));
1520 err = TRUE;
1521 }
1522 } else {
1523 printferror("%s: %s", dbname, est_err_msg(ecode));
1524 err = TRUE;
1525 }
1526 if(ifp != stdin) fclose(ifp);
1527 }
1528 curtime = time(NULL) - curtime;
1529 if(!err) printfinfo("finished successfully: elapsed time: %dh %dm %ds",
1530 (int)(curtime / 3600), (int)((curtime / 60) % 60), (int)(curtime % 60));
1531 return err ? 1 : 0;
1532 }
1533
1534
1535 /* perform the purge command */
1536 static int procpurge(const char *dbname, const char *prefix){
1537 ESTDB *db;
1538 ESTCOND *cond;
1539 ESTDOC *doc;
1540 const char *luri;
1541 char *attr, *path;
1542 int i, ecode, err, *res, rnum;
1543 time_t curtime;
1544 struct stat sbuf;
1545 curtime = time(NULL);
1546 if(!(db = est_db_open(dbname, ESTDBWRITER, &ecode))){
1547 printferror("%s: %s", dbname, est_err_msg(ecode));
1548 return 1;
1549 }
1550 est_db_set_informer(db, dbinform);
1551 cond = est_cond_new();
1552 attr = cbsprintf("%s STRBW %s", DATTRLPATH, prefix ? prefix : "");
1553 est_cond_add_attr(cond, attr);
1554 res = est_db_search(db, cond, &rnum, NULL);
1555 err = FALSE;
1556 for(i = 0; i < rnum; i++){
1557 if(!(doc = est_db_get_doc(db, res[i], ESTGDNOTEXT))) continue;
1558 if((luri = est_doc_attr(doc, DATTRLPATH)) != NULL){
1559 if(g_doforce){
1560 if(est_db_out_doc(db, res[i], g_outopts)){
1561 printfinfo("%d (%s): deleted", res[i], luri);
1562 } else {
1563 printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
1564 err = TRUE;
1565 }
1566 } else if((path = urltopath(luri)) != NULL){
1567 if(stat(path, &sbuf) != -1){
1568 printfinfo("%s: passed", luri);
1569 } else {
1570 if(est_db_out_doc(db, res[i], g_outopts)){
1571 printfinfo("%d (%s): deleted", res[i], luri);
1572 } else {
1573 printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
1574 err = TRUE;
1575 }
1576 }
1577 } else {
1578 printfinfo("%s: ignored", luri);
1579 }
1580 } else {
1581 printfinfo("(%d): ignored", res[i]);
1582 }
1583 est_doc_delete(doc);
1584 if(err || g_sigterm) break;
1585 }
1586 free(res);
1587 est_cond_delete(cond);
1588 free(attr);
1589 if(!est_db_close(db, &ecode)){
1590 printferror("%s: %s", dbname, est_err_msg(ecode));
1591 return 1;
1592 }
1593 curtime = time(NULL) - curtime;
1594 if(!err) printfinfo("finished successfully: elapsed time: %dh %dm %ds",
1595 (int)(curtime / 3600), (int)((curtime / 60) % 60), (int)(curtime % 60));
1596 return err ? 1 : 0;
1597 }
1598
1599
1600 /* perform the extkeys command */
1601 static int procextkeys(const char *dbname, const char *prefix, int ni){
1602 ESTDB *db;
1603 ESTCOND *cond;
1604 ESTDOC *doc;
1605 CURIA *kwdb;
1606 CBMAP *kwords;
1607 const char *uri;
1608 char path[URIBUFSIZ], *attr, *mbuf;
1609 int i, ecode, err, *res, rnum, msiz;
1610 time_t curtime;
1611 curtime = time(NULL);
1612 if(!(db = est_db_open(dbname, ESTDBWRITER, &ecode))){
1613 printferror("%s: %s", dbname, est_err_msg(ecode));
1614 return 1;
1615 }
1616 est_db_set_informer(db, dbinform);
1617 if(!ni && (!prefix || prefix[0] == '\0')) est_db_fill_key_cache(db);
1618 sprintf(path, "%s%c%s", dbname, ESTPATHCHR, KWDBNAME);
1619 if(!(kwdb = cropen(path, CR_OWRITER | CR_OCREAT, KWDBBNUM, KWDBDNUM))){
1620 printferror("%s: the keyword database has some errors", dbname);
1621 est_db_close(db, &ecode);
1622 return 1;
1623 }
1624 crsetalign(kwdb, -4);
1625 cond = est_cond_new();
1626 attr = cbsprintf("%s STRBW %s", DATTRLPATH, prefix ? prefix : "");
1627 est_cond_add_attr(cond, attr);
1628 res = est_db_search(db, cond, &rnum, NULL);
1629 err = FALSE;
1630 for(i = 0; i < rnum; i++){
1631 if(!g_doforce && crvsiz(kwdb, (char *)&(res[i]), sizeof(int)) > 0){
1632 printfinfo("%d: passed", res[i]);
1633 continue;
1634 }
1635 if(!(doc = est_db_get_doc(db, res[i], 0))) continue;
1636 if(!(uri = est_doc_attr(doc, ESTDATTRURI))) uri = "";
1637 kwords = est_db_etch_doc(ni ? NULL : db, doc, g_kwordnum);
1638 mbuf = cbmapdump(kwords, &msiz);
1639 fflush(stdout);
1640 if(crput(kwdb, (char *)&(res[i]), sizeof(int), mbuf, msiz, CR_DOVER)){
1641 printfinfo("%d (%s): extracted", res[i], uri);
1642 } else {
1643 printferror("%s: the keyword database has some errors", dbname);
1644 err = TRUE;
1645 }
1646 free(mbuf);
1647 cbmapclose(kwords);
1648 est_doc_delete(doc);
1649 if(err || g_sigterm) break;
1650 }
1651 free(res);
1652 est_cond_delete(cond);
1653 free(attr);
1654 if(!crclose(kwdb)){
1655 printferror("%s: the keyword database has some errors", dbname);
1656 err = TRUE;
1657 }
1658 if(!est_db_close(db, &ecode)){
1659 printferror("%s: %s", dbname, est_err_msg(ecode));
1660 return 1;
1661 }
1662 curtime = time(NULL) - curtime;
1663 if(!err) printfinfo("finished successfully: elapsed time: %dh %dm %ds",
1664 (int)(curtime / 3600), (int)((curtime / 60) % 60), (int)(curtime % 60));
1665 return err ? 1 : 0;
1666 }
1667
1668
1669 /* perform the draft command */
1670 static int procdraft(const char *filename){
1671 ESTDOC *doc;
1672 char *buf, *draft;
1673 int size;
1674 if(!(buf = cbreadfile(filename, &size))){
1675 printferror("%s: could not open", filename ? filename : "(stdin)");
1676 return 1;
1677 }
1678 switch(g_filefmt){
1679 case FF_TEXT:
1680 doc = est_doc_new_from_text(buf, size, g_inputcode, g_inputlang);
1681 break;
1682 case FF_HTML:
1683 doc = est_doc_new_from_html(buf, size, g_inputcode, g_inputlang);
1684 break;
1685 case FF_MIME:
1686 doc = est_doc_new_from_mime(buf, size, g_inputcode, g_inputlang);
1687 break;
1688 default:
1689 doc = est_doc_new_from_draft_enc(buf, size, g_inputcode);
1690 break;
1691 }
1692 draft = est_doc_dump_draft(doc);
1693 printf("%s", draft);
1694 free(draft);
1695 est_doc_delete(doc);
1696 free(buf);
1697 return 0;
1698 }
1699
1700
1701 /* perform the break command */
1702 static int procbreak(const char *filename, int wt){
1703 CBLIST *words;
1704 char *str, *phrase;
1705 int i;
1706 if(filename && filename[0] == '@'){
1707 str = cbmemdup(filename + 1, -1);
1708 } else if(!(str = cbreadfile(filename, NULL))){
1709 printferror("%s: could not open", filename ? filename : "(stdin)");
1710 return 1;
1711 }
1712 if(!(phrase = est_iconv(str, -1, g_inputcode, "UTF-8", NULL, NULL))){
1713 printferror("%s: unsupported encoding\n", g_inputcode);
1714 free(str);
1715 return 1;
1716 }
1717 g_inputcode = NULL;
1718 words = cblistopen();
1719 if(g_oextmodes & ESTDBPERFNG){
1720 est_break_text_perfng(phrase, words, TRUE, wt);
1721 } else {
1722 est_break_text(phrase, words, TRUE, wt);
1723 }
1724 for(i = 0; i < cblistnum(words); i++){
1725 printf("%s\n", cblistval(words, i, NULL));
1726 }
1727 cblistclose(words);
1728 free(phrase);
1729 free(str);
1730 return 0;
1731 }
1732
1733
1734 /* perform the randput command */
1735 static int procrandput(const char *dbname, int dnum){
1736 ESTDB *db;
1737 ESTDOC *doc;
1738 const char *mode;
1739 char uri[URIBUFSIZ];
1740 int i, ecode, err;
1741 time_t curtime;
1742 curtime = time(NULL);
1743 if(!(db = est_db_open(dbname, ESTDBWRITER | ESTDBCREAT | ESTDBTRUNC, &ecode))){
1744 printferror("%s: %s", dbname, est_err_msg(ecode));
1745 return 1;
1746 }
1747 est_db_set_informer(db, dbinform);
1748 if(g_cachesize > 0){
1749 if(g_cachesize > CACHEMAX) g_cachesize = CACHEMAX;
1750 est_db_set_cache_size(db, g_cachesize, -1, -1);
1751 }
1752 err = FALSE;
1753 for(i = 0; i < dnum; i++){
1754 doc = est_doc_new_from_chaos(RDOCCNUM, RDOCSNUM, g_rdmode);
1755 sprintf(uri, "file:///tmp/randput-%08d-%05d.est", i + 1, getpid());
1756 est_doc_add_attr(doc, ESTDATTRURI, uri);
1757 if(est_db_put_doc(db, doc, 0)){
1758 if(!(mode = est_doc_attr(doc, "mode"))) mode = "unknown";
1759 printfinfo("%d (%s) (%s): registered", est_doc_id(doc), uri, mode);
1760 } else {
1761 printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
1762 err = TRUE;
1763 }
1764 est_doc_delete(doc);
1765 if(err || g_sigterm) break;
1766 }
1767 if(!est_db_close(db, &ecode)){
1768 printferror("%s: %s", dbname, est_err_msg(ecode));
1769 return 1;
1770 }
1771 curtime = time(NULL) - curtime;
1772 if(!err) printfinfo("finished successfully: elapsed time: %dh %dm %ds",
1773 (int)(curtime / 3600), (int)((curtime / 60) % 60), (int)(curtime % 60));
1774 return err ? 1 : 0;
1775 }
1776
1777
1778 /* perform the wicked command */
1779 static int procwicked(const char *dbname, int dnum){
1780 ESTDB *db;
1781 ESTDOC *doc;
1782 ESTCOND *cond;
1783 CBLIST *words;
1784 char uri[URIBUFSIZ], *oper, *value, *first, *second, *phrase;
1785 int i, j, ecode, err, *res, rnum;
1786 double rnd;
1787 time_t curtime;
1788 curtime = time(NULL);
1789 if(!(db = est_db_open(dbname, ESTDBWRITER | ESTDBCREAT | ESTDBTRUNC, &ecode))){
1790 printferror("%s: %s", dbname, est_err_msg(ecode));
1791 return 1;
1792 }
1793 est_db_set_informer(db, dbinform);
1794 est_db_set_cache_size(db, 1024 * 1024 * 128, 1024, 256);
1795 est_db_set_special_cache(db, ESTDATTRURI, 128);
1796 err = FALSE;
1797 for(i = 0; i < dnum; i++){
1798 rnd = est_random();
1799 if((int)(rnd * INT_MAX) % dnum < 5){
1800 rnd = est_random();
1801 if(rnd < 0.3){
1802 if(!est_db_close(db, &ecode)){
1803 printferror("%s: %s", dbname, est_err_msg(ecode));
1804 return 1;
1805 }
1806 if(!(db = est_db_open(dbname, ESTDBWRITER, &ecode))){
1807 printferror("%s: %s", dbname, est_err_msg(ecode));
1808 return 1;
1809 }
1810 est_db_set_informer(db, dbinform);
1811 est_db_set_cache_size(db, 1024 * 1024 * 128, 1024, 256);
1812 est_db_set_special_cache(db, ESTDATTRURI, i / 10 + 1);
1813 } else if(rnd < 0.5){
1814 if(!est_db_optimize(db, (int)(est_random() * INT_MAX) % 2 == 0) ? ESTOPTNOPURGE : 0)
1815 err = TRUE;
1816 } else if(rnd < 0.8){
1817 if(!est_db_flush(db, 1024)) err = TRUE;
1818 } else {
1819 if(!est_db_sync(db)) err = TRUE;
1820 }
1821 } else if(rnd < 0.05){
1822 if(est_db_out_doc(db, (int)(est_random() * INT_MAX) % (i + 1) + 1,
1823 ((int)(est_random() * INT_MAX) % 2 == 0) ? ESTODCLEAN : 0)){
1824 printfinfo("[%d:%d]: out", i + 1, est_db_doc_num(db));
1825 } else if(est_db_error(db) != ESTENOITEM){
1826 err = TRUE;
1827 }
1828 } else if(rnd < 0.1){
1829 if((value = est_db_get_doc_attr(db, (int)(est_random() * INT_MAX) % (i + 1) + 1,
1830 ESTDATTRURI)) != NULL){
1831 printfinfo("[%d:%d]: attr: %s", i + 1, est_db_doc_num(db), value);
1832 free(value);
1833 }
1834 } else if(rnd < 0.25){
1835 rnd = est_random();
1836 if(rnd < 0.5){
1837 oper = " OR ";
1838 } else if(rnd < 0.7){
1839 oper = " AND ";
1840 } else if(rnd < 0.8){
1841 oper = " NOTAND ";
1842 } else if(rnd < 0.9){
1843 oper = " ";
1844 } else {
1845 oper = "";
1846 }
1847 first = est_random_str(5, (int)(est_random() * INT_MAX) % RD_RAND);
1848 second = est_random_str(2, (int)(est_random() * INT_MAX) % RD_RAND);
1849 phrase = cbsprintf("%s%s%s", first, oper, second);
1850 cond = est_cond_new();
1851 est_cond_set_phrase(cond, phrase);
1852 if(est_random() < 0.25) est_cond_add_attr(cond, "@uri STREW 0.est");
1853 if(est_random() < 0.25) est_cond_set_order(cond, "@uri STRD");
1854 if(est_random() < 0.05) est_cond_set_options(cond, ESTCONDSURE | ESTCONDSCFB);
1855 if(est_random() < 0.05) est_cond_set_options(cond, ESTCONDAGIT | ESTCONDNOIDF);
1856 res = est_db_search(db, cond, &rnum, NULL);
1857 printfinfo("[%d:%d]: search: %d hits", i + 1, est_db_doc_num(db), rnum);
1858 if(est_random() < 0.05){
1859 for(j = 0; j < rnum && j < 100; j++){
1860 if((doc = est_db_get_doc(db, res[j], 0)) != NULL){
1861 if(i % 10 == 0){
1862 free(est_doc_cat_texts(doc));
1863 free(est_doc_dump_draft(doc));
1864 words = cblistopen();
1865 cblistpush(words, "vw", -1);
1866 cblistpush(words, "xy", -1);
1867 cblistpush(words, "z", -1);
1868 free(est_doc_make_snippet(doc, words, 100, 10, 10));
1869 cblistclose(words);
1870 }
1871 est_doc_delete(doc);
1872 } else if(est_db_error(db) != ESTENOITEM){
1873 err = TRUE;
1874 }
1875 }
1876 }
1877 free(res);
1878 est_cond_delete(cond);
1879 free(phrase);
1880 free(first);
1881 free(second);
1882 } else {
1883 doc = est_doc_new_from_chaos(100, 3, est_random() < 0.5 ? RD_EURO : RD_RAND);
1884 if(est_random() < 0.2){
1885 sprintf(uri, "file:///tmp/wicked-%08d-%05d.est",
1886 (int)(est_random() * INT_MAX) % (i + 1) + 1, getpid());
1887 } else {
1888 sprintf(uri, "file:///tmp/wicked-%08d-%05d.est", i + 1, getpid());
1889 }
1890 est_doc_add_attr(doc, ESTDATTRURI, uri);
1891 if(!est_db_put_doc(db, doc, est_random() < 0.5 ? ESTPDCLEAN : 0)) err = TRUE;
1892 est_doc_delete(doc);
1893 }
1894 if(err || g_sigterm) break;
1895 }
1896 if(err) printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
1897 if(!est_db_close(db, &ecode)){
1898 printferror("%s: %s", dbname, est_err_msg(ecode));
1899 return 1;
1900 }
1901 curtime = time(NULL) - curtime;
1902 if(!err) printfinfo("finished successfully: elapsed time: %dh %dm %ds",
1903 (int)(curtime / 3600), (int)((curtime / 60) % 60), (int)(curtime % 60));
1904 return err ? 1 : 0;
1905 }
1906
1907
1908 /* perform the regression command */
1909 static int procregression(const char *dbname){
1910 ESTDB *db;
1911 ESTDOC *doc;
1912 ESTCOND *cond;
1913 int i, ecode, err, *res, rnum;
1914 time_t curtime;
1915 curtime = time(NULL);
1916 printfinfo("# opening the database");
1917 if(!(db = est_db_open(dbname, ESTDBWRITER | ESTDBCREAT | ESTDBTRUNC, &ecode))){
1918 printferror("%s: %s", dbname, est_err_msg(ecode));
1919 return 1;
1920 }
1921 est_db_set_informer(db, dbinform);
1922 err = FALSE;
1923 if(!err){
1924 printfinfo("# checking registration of small documents");
1925 doc = est_doc_new();
1926 est_doc_add_attr(doc, ESTDATTRURI, "file:///small/one");
1927 est_doc_add_text(doc, "One!");
1928 est_doc_add_hidden_text(doc, "(Check it out, come on!)");
1929 if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE;
1930 est_doc_delete(doc);
1931 doc = est_doc_new();
1932 est_doc_add_attr(doc, ESTDATTRURI, "file:///small/two");
1933 est_doc_add_text(doc, "Two!!");
1934 est_doc_add_hidden_text(doc, "(Check it out, come on!)");
1935 if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE;
1936 est_doc_delete(doc);
1937 doc = est_doc_new();
1938 est_doc_add_attr(doc, ESTDATTRURI, "file:///small/three");
1939 est_doc_add_text(doc, "Three!!!");
1940 est_doc_add_hidden_text(doc, "(Check it out, come on!)");
1941 if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE;
1942 est_doc_delete(doc);
1943 doc = est_doc_new();
1944 est_doc_add_attr(doc, ESTDATTRURI, "file:///empty");
1945 if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE;
1946 est_doc_delete(doc);
1947 }
1948 if(!err){
1949 printfinfo("# checking registration of an english document");
1950 doc = est_doc_new();
1951 est_doc_add_attr(doc, ESTDATTRURI, "file:///english");
1952 est_doc_add_attr(doc, ESTDATTRTITLE, "Hyper Estraier");
1953 est_doc_add_text(doc, "% This is a displayed sentence. ;-)");
1954 est_doc_add_text(doc, "Hyper Estraier is a full-text search system for communities.");
1955 est_doc_add_text(doc, "A little suffering is good for the soul.");
1956 est_doc_add_text(doc, "They have been at a great feast of languages, and stolen the scraps.");
1957 est_doc_add_hidden_text(doc, "(Give it up, Yo! Give it up, Yo!)");
1958 est_doc_add_hidden_text(doc, "% This is a hidden sentence. :-<");
1959 est_doc_add_hidden_text(doc, "(Check it out, come on!)");
1960 est_doc_add_hidden_text(doc, "");
1961 if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE;
1962 est_doc_delete(doc);
1963 }
1964 if(!err){
1965 printfinfo("# checking registration of a japanese document");
1966 doc = est_doc_new();
1967 est_doc_add_attr(doc, ESTDATTRURI, "file:///japanese");
1968 est_doc_add_attr(doc, ESTDATTRTITLE, "\xe5\xb9\xb3\xe6\x9e\x97\xe5\xb9\xb9\xe9\x9b\x84");
1969 est_doc_add_text(doc, "\xe6\x9c\xac\xe6\x97\xa5\xe3\x81\xaf\xe6\x99\xb4\xe5\xa4\xa9\xe3"
1970 "\x81\xaa\xe3\x82\x8a\xe3\x80\x82");
1971 est_doc_add_text(doc, "\xe6\x9c\x95\xe3\x81\xaf\xe5\x9b\xbd\xe5\xae\xb6\xe7\xac\xac\xe4"
1972 "\xb8\x80\xe3\x81\xae\xe4\xb8\x8b\xe5\x83\x95\xe3\x81\xa7\xe3\x81"
1973 "\x82\xe3\x82\x8b\xe3\x80\x82");
1974 est_doc_add_hidden_text(doc, "(Check it out, come on!)");
1975 if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE;
1976 est_doc_delete(doc);
1977 }
1978 if(!err){
1979 printfinfo("# checking duplication of documents");
1980 doc = est_doc_new();
1981 est_doc_add_attr(doc, ESTDATTRURI, "file:///duplication");
1982 est_doc_add_text(doc, "Gamble, you gatta chance to make a Rumble!");
1983 est_doc_add_hidden_text(doc, "(Check it out, come on!)");
1984 if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE;
1985 est_doc_delete(doc);
1986 doc = est_doc_new();
1987 est_doc_add_attr(doc, ESTDATTRURI, "file:///duplication");
1988 est_doc_add_text(doc, "bring back hey, one more time!");
1989 est_doc_add_hidden_text(doc, "(Check it out, come on!)");
1990 if(!est_db_put_doc(db, doc, ESTPDCLEAN)) err = TRUE;
1991 est_doc_delete(doc);
1992 if(est_db_doc_num(db) != 7){
1993 printferror("%s: the number of documents is invalid", dbname);
1994 err = TRUE;
1995 }
1996 }
1997 if(!err){
1998 printfinfo("# checking search for unfixed documents");
1999 cond = est_cond_new();
2000 est_cond_set_phrase(cond, "check");
2001 res = est_db_search(db, cond, &rnum, NULL);
2002 if(rnum != 6){
2003 printferror("%s: the number of result is invalid", dbname);
2004 err = TRUE;
2005 }
2006 free(res);
2007 est_cond_delete(cond);
2008 }
2009 if(!err){
2010 printfinfo("# checking partial flushing of the index");
2011 if(!est_db_flush(db, 32)) err = TRUE;
2012 }
2013 if(!err){
2014 printfinfo("# checking deletion with cleaning of a document");
2015 if(!est_db_out_doc(db, 1, ESTODCLEAN)) err = TRUE;
2016 }
2017 if(!err){
2018 printfinfo("# checking synchronization");
2019 if(!est_db_sync(db)) err = TRUE;
2020 }
2021 if(!err){
2022 printfinfo("# checking deletion without cleaning of a document");
2023 if(!est_db_out_doc(db, 2, 0)) err = TRUE;
2024 }
2025 if(!err){
2026 printfinfo("# checking word search");
2027 cond = est_cond_new();
2028 est_cond_set_phrase(cond, "check it AND on");
2029 res = est_db_search(db, cond, &rnum, NULL);
2030 if(rnum != 5){
2031 printferror("%s: the number of result is invalid", dbname);
2032 err = TRUE;
2033 }
2034 free(res);
2035 est_cond_set_phrase(cond, "RUMBLE OR \xe3\x80\x82");
2036 res = est_db_search(db, cond, &rnum, NULL);
2037 if(rnum != 1){
2038 printferror("%s: the number of result is invalid", dbname);
2039 err = TRUE;
2040 }
2041 free(res);
2042 est_cond_delete(cond);
2043 }
2044 if(!err){
2045 printfinfo("# checking attribute search");
2046 cond = est_cond_new();
2047 est_cond_add_attr(cond, "@uri !ISTRINC SMaLl");
2048 res = est_db_search(db, cond, &rnum, NULL);
2049 if(rnum != est_db_doc_num(db) - 1){
2050 printferror("%s: the number of result is invalid", dbname);
2051 err = TRUE;
2052 }
2053 free(res);
2054 est_cond_delete(cond);
2055 cond = est_cond_new();
2056 est_cond_add_attr(cond, "@uri STRBW file://");
2057 est_cond_add_attr(cond, "@title STRINC \xe5\xb9\xb3");
2058 res = est_db_search(db, cond, &rnum, NULL);
2059 if(rnum != 1){
2060 printferror("%s: the number of result is invalid", dbname);
2061 err = TRUE;
2062 }
2063 free(res);
2064 est_cond_delete(cond);
2065 }
2066 if(!err){
2067 printfinfo("# checking combined search");
2068 cond = est_cond_new();
2069 est_cond_set_phrase(cond, "\xe5\x9b\xbd\xe5\xae\xb6\xe7\xac\xac\xe4\xb8\x80");
2070 est_cond_add_attr(cond, "@uri");
2071 est_cond_set_order(cond, "@title");
2072 res = est_db_search(db, cond, &rnum, NULL);
2073 if(rnum != 1){
2074 printferror("%s: the number of result is invalid", dbname);
2075 err = TRUE;
2076 }
2077 free(res);
2078 est_cond_delete(cond);
2079 cond = est_cond_new();
2080 est_cond_set_phrase(cond, "one | \xe3\x80\x82 | check & check it ! hogehoge");
2081 est_cond_add_attr(cond, "@uri STRBW file://");
2082 est_cond_set_order(cond, "@title STRD");
2083 est_cond_set_options(cond, ESTCONDSURE | ESTCONDNOIDF | ESTCONDSIMPLE);
2084 res = est_db_search(db, cond, &rnum, NULL);
2085 if(rnum != 4){
2086 printferror("%s: the number of result is invalid", dbname);
2087 err = TRUE;
2088 }
2089 free(res);
2090 est_cond_delete(cond);
2091 }
2092 if(!err){
2093 printfinfo("# checking optimization");
2094 if(!est_db_optimize(db, 0)) err = TRUE;
2095 cond = est_cond_new();
2096 est_cond_set_phrase(cond, "check");
2097 res = est_db_search(db, cond, &rnum, NULL);
2098 if(rnum != 4){
2099 printferror("%s: the number of result is invalid", dbname);
2100 err = TRUE;
2101 }
2102 free(res);
2103 est_cond_delete(cond);
2104 }
2105 if(!err){
2106 printfinfo("# checking traversal access");
2107 cond = est_cond_new();
2108 est_cond_set_phrase(cond, "[UVSET]");
2109 res = est_db_search(db, cond, &rnum, NULL);
2110 for(i = 0; i < rnum; i++){
2111 if(!(doc = est_db_get_doc(db, res[i], 0))){
2112 printferror("%s: a document cannot be retrieved", dbname);
2113 err = TRUE;
2114 break;
2115 }
2116 est_doc_delete(doc);
2117 }
2118 free(res);
2119 est_cond_delete(cond);
2120 }
2121 if(err) printferror("%s: %s", dbname, est_err_msg(est_db_error(db)));
2122 printfinfo("# closing the database");
2123 if(!est_db_close(db, &ecode)){
2124 printferror("%s: %s", dbname, est_err_msg(ecode));
2125 return 1;
2126 }
2127 curtime = time(NULL) - curtime;
2128 if(!err) printfinfo("# finished successfully: elapsed time: %dh %dm %ds",
2129 (int)(curtime / 3600), (int)((curtime / 60) % 60), (int)(curtime % 60));
2130 return err ? 1 : 0;
2131 }
2132
2133
/* Output a formatted string to the standard output, with escaping directives.
   `format' is a printf-like format string.  The conversions `s', `d', `o', `u',
   `x', `X', `c', `e', `E', `f', `g', `G' and `%' are handed to printf together
   with any flag/width/precision characters out of "0123456789 .+-".  Two extra
   conversions take a `char *' argument and ignore such characters:
     `@' writes the string with `&', `<', `>' and `"' replaced by entity
         references and with the bytes 0x01-0x08 and 0x0e-0x1f dropped.
     `?' writes the string with every byte except ASCII alphanumerics, `_', `-'
         and `.' replaced by `%XX'.
   A NULL string argument is written as "(null)".  Any other conversion
   character is skipped without consuming an argument. */
static void xmlprintf(const char *format, ...){
  va_list ap;
  const char *str;
  char spec[32];
  unsigned char c;
  int slen;
  va_start(ap, format);
  while(*format != '\0'){
    if(*format != '%'){
      putchar(*format);
      format++;
      continue;
    }
    /* gather the directive; two bytes of `spec' stay reserved for the conversion
       character and the terminator, so the buffer can never be overrun */
    spec[0] = '%';
    slen = 1;
    format++;
    while(*format != '\0' && strchr("0123456789 .+-", *format) &&
          slen < (int)sizeof(spec) - 2){
      spec[slen++] = *format;
      format++;
    }
    /* a format that ends inside a directive has nothing left to convert; stopping
       here keeps the scan from stepping over the terminator */
    if(*format == '\0') break;
    spec[slen++] = *format;
    spec[slen] = '\0';
    switch(*format){
    case 's':
      str = va_arg(ap, char *);
      if(!str) str = "(null)";
      printf(spec, str);
      break;
    case 'd':
      printf(spec, va_arg(ap, int));
      break;
    case 'o': case 'u': case 'x': case 'X': case 'c':
      printf(spec, va_arg(ap, unsigned int));
      break;
    case 'e': case 'E': case 'f': case 'g': case 'G':
      printf(spec, va_arg(ap, double));
      break;
    case '@':
      str = va_arg(ap, char *);
      if(!str) str = "(null)";
      for(; *str != '\0'; str++){
        c = *(const unsigned char *)str;
        switch(c){
        case '&': printf("&amp;"); break;
        case '<': printf("&lt;"); break;
        case '>': printf("&gt;"); break;
        case '"': printf("&quot;"); break;
        default:
          /* control characters other than tab, line feed, vertical tab, form feed
             and carriage return are dropped */
          if(!(c <= 0x8 || (c >= 0x0e && c <= 0x1f))) putchar(c);
          break;
        }
      }
      break;
    case '?':
      str = va_arg(ap, char *);
      if(!str) str = "(null)";
      for(; *str != '\0'; str++){
        c = *(const unsigned char *)str;
        if((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') ||
           (c >= '0' && c <= '9') || strchr("_-.", c)){
          putchar(c);
        } else {
          printf("%%%02X", c);
        }
      }
      break;
    case '%':
      putchar('%');
      break;
    default:
      break;
    }
    format++;
  }
  va_end(ap);
}
2208
2209
2210 /* get the language value */
2211 static int strtolang(const char *str){
2212 if(!cbstricmp(str, "en")) return ESTLANGEN;
2213 if(!cbstricmp(str, "ja")) return ESTLANGJA;
2214 if(!cbstricmp(str, "zh")) return ESTLANGZH;
2215 if(!cbstricmp(str, "ko")) return ESTLANGKO;
2216 return ESTLANGMISC;
2217 }
2218
2219
/* Read a line from a stream.
   `ifp' is the input stream.
   The return value is a buffer obtained from cbrealloc holding the line without
   its line feed and with every carriage return removed, or NULL if the end of the
   stream is met before any byte is read.  A NUL byte in the input ends the line
   just like a line feed does.
   NOTE(review): callers presumably release the buffer with free -- confirm. */
static char *fgetl(FILE *ifp){
  char *buf;
  int c, len, blen;
  buf = NULL;
  len = 0;
  blen = 0;
  while((c = fgetc(ifp)) != EOF){
    /* grow only when the buffer is full (1024 bytes first, then doubling) instead
       of calling the allocator for every single byte */
    if(len >= blen){
      blen = blen > 0 ? blen * 2 : 1024;
      buf = cbrealloc(buf, blen + 1);
    }
    if(c == '\n') c = '\0';
    if(c != '\r') buf[len++] = c;
    if(c == '\0') break;
  }
  if(!buf) return NULL;
  /* the extra byte requested above always leaves room for this terminator */
  buf[len] = '\0';
  return buf;
}
2238
2239
2240 /* register a document */
2241 static int doputdoc(ESTDB *db, const char *path){
2242 ESTDOC *doc, *edoc;
2243 const char *uri, *vbuf, *xcmd;
2244 char *dbuf, *tbuf;
2245 int err, fmt, id, dsiz;
2246 time_t emdate, fmdate;
2247 struct stat sbuf;
2248 xcmd = NULL;
2249 if(cbmaprnum(g_xcmdmap) > 0){
2250 cbmapiterinit(g_xcmdmap);
2251 while((vbuf = cbmapiternext(g_xcmdmap, NULL)) != NULL){
2252 if(cbstrbwimatch(path, vbuf)){
2253 xcmd = cbmapget(g_xcmdmap, vbuf, -1, NULL);
2254 break;
2255 }
2256 }
2257 }
2258 fmt = g_filefmt;
2259 if(g_filefmt == FF_NONE && !xcmd) return TRUE;
2260 if(g_filefmt == FF_AUTO){
2261 if(cbstrbwimatch(path, ESTEXTSTR "est")){
2262 fmt = FF_DRAFT;
2263 } else if(cbstrbwimatch(path, ESTEXTSTR "txt") || cbstrbwimatch(path, ESTEXTSTR "text") ||
2264 cbstrbwimatch(path, ESTEXTSTR "asc")){
2265 fmt = FF_TEXT;
2266 } else if(cbstrbwimatch(path, ESTEXTSTR "html") || cbstrbwimatch(path, ESTEXTSTR "htm") ||
2267 cbstrbwimatch(path, ESTEXTSTR "xhtml") || cbstrbwimatch(path, ESTEXTSTR "xht")){
2268 fmt = FF_HTML;
2269 } else if(cbstrbwimatch(path, ESTEXTSTR "eml") || cbstrbwimatch(path, ESTEXTSTR "mime") ||
2270 cbstrbwimatch(path, ESTEXTSTR "mht") || cbstrbwimatch(path, ESTEXTSTR "mhtml")){
2271 fmt = FF_MIME;
2272 } else if(!xcmd){
2273 return TRUE;
2274 }
2275 }
2276 if(stat(path, &sbuf) == -1 || !S_ISREG(sbuf.st_mode) || !(uri = pathtourl(path))){
2277 printferror("%s: could not open", path);
2278 return TRUE;
2279 }
2280 emdate = -1;
2281 if(g_chkmdate && (id = est_db_uri_to_id(db, uri)) > 0 &&
2282 (edoc = est_db_get_doc(db, id, ESTGDNOTEXT)) != NULL){
2283 if((vbuf = est_doc_attr(edoc, ESTDATTRMDATE)) != NULL) emdate = cbstrmktime(vbuf);
2284 est_doc_delete(edoc);
2285 }
2286 if(g_stdate && emdate >= 0 && emdate >= sbuf.st_mtime){
2287 printfinfo("%s: passed", path);
2288 return TRUE;
2289 }
2290 if(g_filtorig){
2291 dbuf = cbmemdup("", 0);
2292 dsiz = 0;
2293 } else {
2294 if(!(dbuf = cbreadfile(path, &dsiz))){
2295 printferror("%s: could not open", path);
2296 return TRUE;
2297 }
2298 }
2299 if(xcmd){
2300 doc = est_doc_new_with_xcmd(dbuf, dsiz, path, xcmd, est_db_name(db),
2301 g_inputcode, g_inputlang);
2302 } else {
2303 switch(fmt){
2304 case FF_TEXT:
2305 doc = est_doc_new_from_text(dbuf, dsiz, g_inputcode, g_inputlang);
2306 break;
2307 case FF_HTML:
2308 doc = est_doc_new_from_html(dbuf, dsiz, g_inputcode, g_inputlang);
2309 break;
2310 case FF_MIME:
2311 doc = est_doc_new_from_mime(dbuf, dsiz, g_inputcode, g_inputlang);
2312 break;
2313 default:
2314 doc = est_doc_new_from_draft_enc(dbuf, dsiz, g_inputcode);
2315 break;
2316 }
2317 }
2318 if(!est_doc_attr(doc, ESTDATTRURI)) est_doc_add_attr(doc, ESTDATTRURI, uri);
2319 est_doc_add_attr(doc, DATTRLPATH, uri);
2320 est_doc_add_attr(doc, DATTRLFILE, urltofile(uri));
2321 uri = est_doc_attr(doc, ESTDATTRURI);
2322 if(g_stdate){
2323 tbuf = cbdatestrwww(sbuf.st_ctime, 0);
2324 est_doc_add_attr(doc, ESTDATTRCDATE, tbuf);
2325 free(tbuf);
2326 tbuf = cbdatestrwww(sbuf.st_mtime, 0);
2327 est_doc_add_attr(doc, ESTDATTRMDATE, tbuf);
2328 free(tbuf);
2329 }
2330 if(g_chkmdate && emdate == -1 && (id = est_db_uri_to_id(db, uri)) > 0 &&
2331 (edoc = est_db_get_doc(db, id, ESTGDNOTEXT)) != NULL){
2332 if((vbuf = est_doc_attr(edoc, ESTDATTRMDATE)) != NULL) emdate = cbstrmktime(vbuf);
2333 est_doc_delete(edoc);
2334 }
2335 fmdate = -1;
2336 if(g_chkmdate && (vbuf = est_doc_attr(doc, ESTDATTRMDATE)) != NULL) fmdate = cbstrmktime(vbuf);
2337 err = FALSE;
2338 if(emdate >= 0 && emdate >= fmdate){
2339 printfinfo("%s: passed", path);
2340 } else if(est_db_put_doc(db, doc, g_putopts)){
2341 printfinfo("%d (%s): registered", est_doc_id(doc), uri);
2342 } else {
2343 printferror("%s: %s", est_db_name(db), est_err_msg(est_db_error(db)));
2344 err = TRUE;
2345 }
2346 est_doc_delete(doc);
2347 free(dbuf);
2348 return err ? FALSE : TRUE;
2349 }
2350
2351
2352 /* get the URL of a path */
2353 static const char *pathtourl(const char *path){
2354 static char pbuf[URIBUFSIZ];
2355 const char *elem;
2356 char *wp, *ebuf;
2357 CBLIST *list;
2358 int i, esiz;
2359 if(strlen(path) >= URIBUFSIZ / 4) return NULL;
2360 if(g_pathcode){
2361 wp = est_realpath(path);
2362 if(!(ebuf = est_iconv(wp, -1, g_pathcode, "UTF-8", &esiz, NULL))){
2363 esiz = strlen(wp);
2364 ebuf = cbmemdup(wp, esiz);
2365 }
2366 list = cbsplit(ebuf, esiz, ESTPATHSTR);
2367 free(ebuf);
2368 free(wp);
2369 for(i = 0; i < cblistnum(list); i++){
2370 elem = cblistval(list, i, &esiz);
2371 if((ebuf = est_iconv(elem, esiz, "UTF-8", g_pathcode, &esiz, NULL)) != NULL){
2372 cblistover(list, i, ebuf, esiz);
2373 free(ebuf);
2374 }
2375 }
2376 } else {
2377 wp = est_realpath(path);
2378 list = cbsplit(wp, -1, ESTPATHSTR);
2379 free(wp);
2380 }
2381 wp = pbuf;
2382 wp += sprintf(wp, "file://");
2383 for(i = 0; i < cblistnum(list); i++){
2384 elem = cblistval(list, i, NULL);
2385 if(elem[0] == '\0') continue;
2386 if(i < 1 && ((elem[0] >= 'A' && elem[0] <= 'Z') || (elem[0] >= 'a' && elem[0] <= 'z')) &&
2387 elem[1] == ':'){
2388 wp += sprintf(wp, "%c|", elem[0]);
2389 continue;
2390 }
2391 ebuf = cburlencode(elem, -1);
2392 wp += sprintf(wp, "/%s", ebuf);
2393 free(ebuf);
2394 }
2395 *wp = '\0';
2396 cblistclose(list);
2397 return pbuf;
2398 }
2399
2400
2401 /* get the file name of a URL */
2402 static const char *urltofile(const char *uri){
2403 static char pbuf[URIBUFSIZ];
2404 const char *rp;
2405 char *dbuf, *ebuf;
2406 int dsiz;
2407 if(g_pathfull){
2408 if((rp = strstr(uri, "//")) != NULL){
2409 rp += 2;
2410 if(((rp[0] >= 'A' && rp[0] <= 'Z') || (rp[0] >= 'a' && rp[0] <= 'z')) &&
2411 rp[1] == '|' && rp[2] == '/') rp += 2;
2412 } else {
2413 rp = uri;
2414 }
2415 } else if((rp = strrchr(uri, '/')) != NULL){
2416 rp++;
2417 } else {
2418 rp = uri;
2419 }
2420 dbuf = cburldecode(rp, &dsiz);
2421 if((ebuf = est_iconv(dbuf, dsiz, g_pathcode ? g_pathcode : "ISO-8859-1", "UTF-8", NULL, NULL))
2422 != NULL){
2423 sprintf(pbuf, "%s", ebuf);
2424 free(ebuf);
2425 } else {
2426 sprintf(pbuf, "%s", rp);
2427 }
2428 free(dbuf);
2429 return pbuf;
2430 }
2431
2432
2433 /* geth the local path of a URL */
2434 static char *urltopath(const char *uri){
2435 static char pbuf[URIBUFSIZ];
2436 const char *elem;
2437 char *wp, *dbuf;
2438 CBLIST *list;
2439 int i;
2440 if(!cbstrfwimatch(uri, "file://")) return NULL;
2441 if(!(uri = strchr(uri + 7, '/'))) return NULL;
2442 list = cbsplit(uri, -1, "/");
2443 wp = pbuf;
2444 for(i = 0; i < cblistnum(list); i++){
2445 elem = cblistval(list, i, NULL);
2446 if(elem[0] == '\0') continue;
2447 if(i < 1 && ((elem[0] >= 'A' && elem[0] <= 'Z') || (elem[0] >= 'a' && elem[0] <= 'z')) &&
2448 elem[1] == '|'){
2449 wp += sprintf(wp, "%c:", elem[0]);
2450 continue;
2451 }
2452 dbuf = cburldecode(elem, NULL);
2453 wp += sprintf(wp, "%c%s", ESTPATHCHR, dbuf);
2454 free(dbuf);
2455 }
2456 *wp = '\0';
2457 cblistclose(list);
2458 return pbuf;
2459 }
2460
2461
2462 /* create a vector of keywords */
2463 static CBMAP *vectorizer(void *db, int id, void *kwdb){
2464 CBMAP *kwords;
2465 char *mbuf;
2466 int msiz;
2467 if(!(mbuf = crget((CURIA *)kwdb, (char *)&id, sizeof(int), 0, -1, &msiz))) return NULL;
2468 kwords = cbmapload(mbuf, msiz);
2469 free(mbuf);
2470 return kwords;
2471 }
2472
2473
2474 /* create a document object with an outer command */
2475 static ESTDOC *est_doc_new_with_xcmd(const char *buf, int size, const char *path,
2476 const char *xcmd, const char *tmpdir,
2477 const char *penc, int plang){
2478 ESTDOC *doc;
2479 const char *pv, *ext;
2480 char iname[URIBUFSIZ], oname[URIBUFSIZ], ebuf[URIBUFSIZ], cmd[URIBUFSIZ];
2481 char *rbuf, numbuf[NUMBUFSIZ];
2482 int fmt, rsiz;
2483 assert(buf && size >= 0 && path && xcmd && tmpdir);
2484 sprintf(ebuf, "ESTORIGFILE=%s", path);
2485 ext = NULL;
2486 if((pv = strrchr(path, ESTPATHCHR)) != NULL) path = pv;
2487 if((pv = strrchr(path, ESTEXTCHR)) != NULL) ext = pv;
2488 if(!ext) ext = "";
2489 sprintf(iname, "%s%cxcmd-in-%08d%s", tmpdir, ESTPATHCHR, getpid(), ext);
2490 sprintf(oname, "%s%cxcmd-out-%08d%cest", tmpdir, ESTPATHCHR, getpid(), ESTEXTCHR);
2491 fmt = FF_DRAFT;
2492 if(cbstrfwmatch(xcmd, "T@")){
2493 fmt = FF_TEXT;
2494 xcmd += 2;
2495 } else if(cbstrfwmatch(xcmd, "H@")){
2496 fmt = FF_HTML;
2497 xcmd += 2;
2498 } else if(cbstrfwmatch(xcmd, "M@")){
2499 fmt = FF_MIME;
2500 xcmd += 2;
2501 }
2502 sprintf(cmd, "%s %s %s", xcmd, iname, oname);
2503 if(!g_filtorig) cbwritefile(iname, buf, size);
2504 putenv(ebuf);
2505 system(cmd);
2506 if((rbuf = cbreadfile(oname, &rsiz)) != NULL){
2507 switch(fmt){
2508 case FF_TEXT:
2509 doc = est_doc_new_from_text(rbuf, rsiz, penc, plang);
2510 break;
2511 case FF_HTML:
2512 doc = est_doc_new_from_html(rbuf, rsiz, penc, plang);
2513 break;
2514 case FF_MIME:
2515 doc = est_doc_new_from_mime(rbuf, rsiz, penc, plang);
2516 break;
2517 default:
2518 doc = est_doc_new_from_draft_enc(rbuf, rsiz, penc);
2519 break;
2520 }
2521 free(rbuf);
2522 } else {
2523 doc = est_doc_new();
2524 }
2525 if(fmt != FF_DRAFT){
2526 sprintf(numbuf, "%d", size);
2527 est_doc_add_attr(doc, ESTDATTRSIZE, numbuf);
2528 est_doc_add_attr(doc, ESTDATTRTYPE, est_ext_type(ext));
2529 }
2530 unlink(oname);
2531 unlink(iname);
2532 return doc;
2533 }
2534
2535
2536 /* create a document object from draft data in another encoding */
2537 static ESTDOC *est_doc_new_from_draft_enc(const char *buf, int size, const char *enc){
2538 ESTDOC *doc;
2539 char *rbuf;
2540 assert(buf);
2541 if(enc && (rbuf = est_iconv(buf, size, enc, "UTF-8", NULL, NULL)) != NULL){
2542 doc = est_doc_new_from_draft(rbuf);
2543 free(rbuf);
2544 } else {
2545 doc = est_doc_new_from_draft(buf);
2546 }
2547 return doc;
2548 }
2549
2550
2551 /* create a document object from plain text */
2552 static ESTDOC *est_doc_new_from_text(const char *buf, int size, const char *penc, int plang){
2553 ESTDOC *doc;
2554 CBLIST *lines;
2555 CBDATUM *datum;
2556 const char *enc, *text, *line;
2557 char *nbuf, numbuf[NUMBUFSIZ];
2558 int i;
2559 assert(buf);
2560 doc = est_doc_new();
2561 enc = penc ? penc : est_enc_name(buf, size, plang);
2562 if(!strcmp(enc, "UTF-8")){
2563 nbuf = NULL;
2564 text = buf;
2565 } else {
2566 text = buf;
2567 nbuf = est_iconv(buf, size, enc, "UTF-8", NULL, NULL);
2568 if(nbuf) text = nbuf;
2569 }
2570 lines = cbsplit(text, -1, "\n");
2571 datum = cbdatumopen("", 0);
2572 for(i = 0; i < CB_LISTNUM(lines); i++){
2573 line = CB_LISTVAL(lines, i, NULL);
2574 while(*line == ' ' || *line == '\t' || *line == '\r'){
2575 line++;
2576 }
2577 if(line[0] == '\0'){
2578 est_doc_add_text(doc, CB_DATUMPTR(datum));
2579 cbdatumsetsize(datum, 0);
2580 } else {
2581 cbdatumcat(datum, " ", 1);
2582 cbdatumcat(datum, line, -1);
2583 }
2584 }
2585 est_doc_add_text(doc, CB_DATUMPTR(datum));
2586 cbdatumclose(datum);
2587 cblistclose(lines);
2588 est_doc_add_attr(doc, ESTDATTRTYPE, "text/plain");
2589 sprintf(numbuf, "%d", size);
2590 est_doc_add_attr(doc, ESTDATTRSIZE, numbuf);
2591 if(nbuf) free(nbuf);
2592 return doc;
2593 }
2594
2595
2596 /* create a document object from HTML */
2597 static ESTDOC *est_doc_new_from_html(const char *buf, int size, const char *penc, int plang){
2598 ESTDOC *doc;
2599 CBLIST *elems;
2600 CBMAP *attrs;
2601 CBDATUM *datum;
2602 const char *enc, *html, *elem, *next, *name, *content;
2603 char *nbuf, *nenc, *rbuf, *lbuf, numbuf[NUMBUFSIZ];
2604 int i, esiz;
2605 assert(buf);
2606 doc = est_doc_new();
2607 enc = est_enc_name(buf, size, plang);
2608 html = NULL;
2609 nbuf = NULL;
2610 if(!strcmp(enc, "UTF-16") || !strcmp(enc, "UTF-16BE") || !strcmp(enc, "UTF-16LE")){
2611 nbuf = est_iconv(buf, size, enc, "UTF-8", NULL, NULL);
2612 } else if(!strcmp(enc, "US-ASCII")){
2613 nbuf = NULL;
2614 } else {
2615 if((nenc = penc ? cbmemdup(penc, -1) : est_html_enc(buf)) != NULL){
2616 if(cbstricmp(nenc, "UTF-8")){
2617 nbuf = est_iconv(buf, size, nenc, "UTF-8", NULL, NULL);
2618 if(!nbuf) nbuf = est_iconv(buf, size, enc, "UTF-8", NULL, NULL);
2619 }
2620 free(nenc);
2621 } else {
2622 nbuf = est_iconv(buf, size, enc, "UTF-8", NULL, NULL);
2623 }
2624 }
2625 if(nbuf) html = nbuf;
2626 if(!html) html = buf;
2627 datum = cbdatumopen("", 0);
2628 elems = cbxmlbreak(html, TRUE);
2629 for(i = 0; i < CB_LISTNUM(elems); i++){
2630 elem = CB_LISTVAL2(elems, i, &esiz);
2631 if(!(next = cblistval(elems, i + 1, NULL))) next = "";
2632 if(elem[0] == '<'){
2633 if(cbstrfwimatch(elem, "<meta")){
2634 attrs = cbxmlattrs(elem);
2635 name = cbmapget(attrs, "name", -1, NULL);
2636 if(!name) name = cbmapget(attrs, "Name", -1, NULL);
2637 if(!name) name = cbmapget(attrs, "NAME", -1, NULL);
2638 if(!name) name = cbmapget(attrs, "http-equiv", -1, NULL);
2639 if(!name) name = cbmapget(attrs, "Http-equiv", -1, NULL);
2640 if(!name) name = cbmapget(attrs, "Http-Equiv", -1, NULL);
2641 if(!name) name = cbmapget(attrs, "HTTP-EQUIV", -1, NULL);
2642 content = cbmapget(attrs, "content", -1, NULL);
2643 if(!content) content = cbmapget(attrs, "Content", -1, NULL);
2644 if(!content) content = cbmapget(attrs, "CONTENT", -1, NULL);
2645 if(name && content){
2646 lbuf = cbmemdup(name, -1);
2647 cbstrtolower(lbuf);
2648 cbstrsqzspc(lbuf);
2649 if(!strcmp(lbuf, "author")){
2650 if(strchr(content, '&')){
2651 rbuf = est_html_raw_text(content);
2652 est_doc_add_attr(doc, ESTDATTRAUTHOR, rbuf);
2653 free(rbuf);
2654 } else {
2655 est_doc_add_attr(doc, ESTDATTRAUTHOR, content);
2656 }
2657 }
2658 if(name[0] != '@'){
2659 if(strchr(content, '&')){
2660 rbuf = est_html_raw_text(content);
2661 est_doc_add_attr(doc, lbuf, rbuf);
2662 free(rbuf);
2663 } else {
2664 est_doc_add_attr(doc, lbuf, content);
2665 }
2666 }
2667 free(lbuf);
2668 }
2669 cbmapclose(attrs);
2670 } else if(cbstrfwimatch(elem, "<title") && next[0] != '\0' && next[0] != '<'){
2671 if(strchr(next, '&')){
2672 rbuf = est_html_raw_text(next);
2673 est_doc_add_attr(doc, ESTDATTRTITLE, rbuf);
2674 est_doc_add_hidden_text(doc, rbuf);
2675 free(rbuf);
2676 } else {
2677 est_doc_add_attr(doc, ESTDATTRTITLE, next);
2678 est_doc_add_hidden_text(doc, next);
2679 }
2680 i++;
2681 } else if(cbstrfwimatch(elem, "<style") || cbstrfwimatch(elem, "<script")){
2682 i++;
2683 } else if(cbstrfwimatch(elem, "<h1") || cbstrfwimatch(elem, "<h2") ||
2684 cbstrfwimatch(elem, "<h3") || cbstrfwimatch(elem, "<h4") ||
2685 cbstrfwimatch(elem, "<h5") || cbstrfwimatch(elem, "<h6") ||
2686 cbstrfwimatch(elem, "<p>") || cbstrfwimatch(elem, "<p ") ||
2687 cbstrfwimatch(elem, "<div") || cbstrfwimatch(elem, "<hr") ||
2688 cbstrfwimatch(elem, "<ul") || cbstrfwimatch(elem, "<ol") ||
2689 cbstrfwimatch(elem, "<dl") || cbstrfwimatch(elem, "<li") ||
2690 cbstrfwimatch(elem, "<dt") || cbstrfwimatch(elem, "<dd") ||
2691 cbstrfwimatch(elem, "<th") || cbstrfwimatch(elem, "<td") ||
2692 cbstrfwimatch(elem, "<pre")){
2693 if(strchr(CB_DATUMPTR(datum), '&')){
2694 rbuf = est_html_raw_text(CB_DATUMPTR(datum));
2695 est_doc_add_text(doc, rbuf);
2696 free(rbuf);
2697 } else {
2698 est_doc_add_text(doc, CB_DATUMPTR(datum));
2699 }
2700 cbdatumsetsize(datum, 0);
2701 }
2702 } else {
2703 cbdatumcat(datum, " ", -1);
2704 cbdatumcat(datum, elem, esiz);
2705 }
2706 }
2707 cblistclose(elems);
2708 if(strchr(CB_DATUMPTR(datum), '&')){
2709 rbuf = est_html_raw_text(CB_DATUMPTR(datum));
2710 est_doc_add_text(doc, rbuf);
2711 free(rbuf);
2712 } else {
2713 est_doc_add_text(doc, CB_DATUMPTR(datum));
2714 }
2715 cbdatumclose(datum);
2716 if(nbuf) free(nbuf);
2717 est_doc_add_attr(doc, ESTDATTRTYPE, "text/html");
2718 sprintf(numbuf, "%d", size);
2719 est_doc_add_attr(doc, ESTDATTRSIZE, numbuf);
2720 return doc;
2721 }
2722
2723
2724 /* get the encoding of an HTML string */
2725 static char *est_html_enc(const char *str){
2726 CBLIST *elems;
2727 CBMAP *attrs;
2728 const char *elem, *equiv, *content;
2729 char *enc, *pv;
2730 int i;
2731 assert(str);
2732 elems = cbxmlbreak(str, TRUE);
2733 for(i = 0; i < CB_LISTNUM(elems); i++){
2734 elem = CB_LISTVAL(elems, i, NULL);
2735 if(elem[0] != '<' || !cbstrfwimatch(elem, "<meta")) continue;
2736 enc = NULL;
2737 attrs = cbxmlattrs(elem);
2738 equiv = cbmapget(attrs, "http-equiv", -1, NULL);
2739 if(!equiv) equiv = cbmapget(attrs, "HTTP-EQUIV", -1, NULL);
2740 if(!equiv) equiv = cbmapget(attrs, "Http-Equiv", -1, NULL);
2741 if(!equiv) equiv = cbmapget(attrs, "Http-equiv", -1, NULL);
2742 if(equiv && !cbstricmp(equiv, "Content-Type")){
2743 content = cbmapget(attrs, "content", -1, NULL);
2744 if(!content) content = cbmapget(attrs, "Content", -1, NULL);
2745 if(!content) content = cbmapget(attrs, "CONTENT", -1, NULL);
2746 if(content && ((pv = strstr(content, "charset")) != NULL ||
2747 (pv = strstr(content, "Charset")) != NULL ||
2748 (pv = strstr(content, "CHARSET")) != NULL)){
2749 enc = cbmemdup(pv + 8, -1);
2750 if((pv = strchr(enc, ';')) != NULL || (pv = strchr(enc, '\r')) != NULL ||
2751 (pv = strchr(enc, '\n')) != NULL || (pv = strchr(enc, ' ')) != NULL) *pv = '\0';
2752 }
2753 }
2754 cbmapclose(attrs);
2755 if(enc){
2756 cblistclose(elems);
2757 return enc;
2758 }
2759 }
2760 cblistclose(elems);
2761 return NULL;
2762 }
2763
2764
2765 /* unescape entity references of HTML */
2766 static char *est_html_raw_text(const char *html){
2767 static const char *pairs[] = {
2768 /* basic symbols */
2769 "&amp;", "&", "&lt;", "<", "&gt;", ">", "&quot;", "\"", "&apos;", "'",
2770 /* ISO-8859-1 */
2771 "&nbsp;", "\xc2\xa0", "&iexcl;", "\xc2\xa1", "&cent;", "\xc2\xa2",
2772 "&pound;", "\xc2\xa3", "&curren;", "\xc2\xa4", "&yen;", "\xc2\xa5",
2773 "&brvbar;", "\xc2\xa6", "&sect;", "\xc2\xa7", "&uml;", "\xc2\xa8",
2774 "&copy;", "\xc2\xa9", "&ordf;", "\xc2\xaa", "&laquo;", "\xc2\xab",
2775 "&not;", "\xc2\xac", "&shy;", "\xc2\xad", "&reg;", "\xc2\xae",
2776 "&macr;", "\xc2\xaf", "&deg;", "\xc2\xb0", "&plusmn;", "\xc2\xb1",
2777 "&sup2;", "\xc2\xb2", "&sup3;", "\xc2\xb3", "&acute;", "\xc2\xb4",
2778 "&micro;", "\xc2\xb5", "&para;", "\xc2\xb6", "&middot;", "\xc2\xb7",
2779 "&cedil;", "\xc2\xb8", "&sup1;", "\xc2\xb9", "&ordm;", "\xc2\xba",
2780 "&raquo;", "\xc2\xbb", "&frac14;", "\xc2\xbc", "&frac12;", "\xc2\xbd",
2781 "&frac34;", "\xc2\xbe", "&iquest;", "\xc2\xbf", "&Agrave;", "\xc3\x80",
2782 "&Aacute;", "\xc3\x81", "&Acirc;", "\xc3\x82", "&Atilde;", "\xc3\x83",
2783 "&Auml;", "\xc3\x84", "&Aring;", "\xc3\x85", "&AElig;", "\xc3\x86",
2784 "&Ccedil;", "\xc3\x87", "&Egrave;", "\xc3\x88", "&Eacute;", "\xc3\x89",
2785 "&Ecirc;", "\xc3\x8a", "&Euml;", "\xc3\x8b", "&Igrave;", "\xc3\x8c",
2786 "&Iacute;", "\xc3\x8d", "&Icirc;", "\xc3\x8e", "&Iuml;", "\xc3\x8f",
2787 "&ETH;", "\xc3\x90", "&Ntilde;", "\xc3\x91", "&Ograve;", "\xc3\x92",
2788 "&Oacute;", "\xc3\x93", "&Ocirc;", "\xc3\x94", "&Otilde;", "\xc3\x95",
2789 "&Ouml;", "\xc3\x96", "&times;", "\xc3\x97", "&Oslash;", "\xc3\x98",
2790 "&Ugrave;", "\xc3\x99", "&Uacute;", "\xc3\x9a", "&Ucirc;", "\xc3\x9b",
2791 "&Uuml;", "\xc3\x9c", "&Yacute;", "\xc3\x9d", "&THORN;", "\xc3\x9e",
2792 "&szlig;", "\xc3\x9f", "&agrave;", "\xc3\xa0", "&aacute;", "\xc3\xa1",
2793 "&acirc;", "\xc3\xa2", "&atilde;", "\xc3\xa3", "&auml;", "\xc3\xa4",
2794 "&aring;", "\xc3\xa5", "&aelig;", "\xc3\xa6", "&ccedil;", "\xc3\xa7",
2795 "&egrave;", "\xc3\xa8", "&eacute;", "\xc3\xa9", "&ecirc;", "\xc3\xaa",
2796 "&euml;", "\xc3\xab", "&igrave;", "\xc3\xac", "&iacute;", "\xc3\xad",
2797 "&icirc;", "\xc3\xae", "&iuml;", "\xc3\xaf", "&eth;", "\xc3\xb0",
2798 "&ntilde;", "\xc3\xb1", "&ograve;", "\xc3\xb2", "&oacute;", "\xc3\xb3",
2799 "&ocirc;", "\xc3\xb4", "&otilde;", "\xc3\xb5", "&ouml;", "\xc3\xb6",
2800 "&divide;", "\xc3\xb7", "&oslash;", "\xc3\xb8", "&ugrave;", "\xc3\xb9",
2801 "&uacute;", "\xc3\xba", "&ucirc;", "\xc3\xbb", "&uuml;", "\xc3\xbc",
2802 "&yacute;", "\xc3\xbd", "&thorn;", "\xc3\xbe", "&yuml;", "\xc3\xbf",
2803 /* ISO-10646 */
2804 "&fnof;", "\xc6\x92", "&Alpha;", "\xce\x91", "&Beta;", "\xce\x92",
2805 "&Gamma;", "\xce\x93", "&Delta;", "\xce\x94", "&Epsilon;", "\xce\x95",
2806 "&Zeta;", "\xce\x96", "&Eta;", "\xce\x97", "&Theta;", "\xce\x98",
2807 "&Iota;", "\xce\x99", "&Kappa;", "\xce\x9a", "&Lambda;", "\xce\x9b",
2808 "&Mu;", "\xce\x9c", "&Nu;", "\xce\x9d", "&Xi;", "\xce\x9e",
2809 "&Omicron;", "\xce\x9f", "&Pi;", "\xce\xa0", "&Rho;", "\xce\xa1",
2810 "&Sigma;", "\xce\xa3", "&Tau;", "\xce\xa4", "&Upsilon;", "\xce\xa5",
2811 "&Phi;", "\xce\xa6", "&Chi;", "\xce\xa7", "&Psi;", "\xce\xa8",
2812 "&Omega;", "\xce\xa9", "&alpha;", "\xce\xb1", "&beta;", "\xce\xb2",
2813 "&gamma;", "\xce\xb3", "&delta;", "\xce\xb4", "&epsilon;", "\xce\xb5",
2814 "&zeta;", "\xce\xb6", "&eta;", "\xce\xb7", "&theta;", "\xce\xb8",
2815 "&iota;", "\xce\xb9", "&kappa;", "\xce\xba", "&lambda;", "\xce\xbb",
2816 "&mu;", "\xce\xbc", "&nu;", "\xce\xbd", "&xi;", "\xce\xbe",
2817 "&omicron;", "\xce\xbf", "&pi;", "\xcf\x80", "&rho;", "\xcf\x81",
2818 "&sigmaf;", "\xcf\x82", "&sigma;", "\xcf\x83", "&tau;", "\xcf\x84",
2819 "&upsilon;", "\xcf\x85", "&phi;", "\xcf\x86", "&chi;", "\xcf\x87",
2820 "&psi;", "\xcf\x88", "&omega;", "\xcf\x89", "&thetasym;", "\xcf\x91",
2821 "&upsih;", "\xcf\x92", "&piv;", "\xcf\x96", "&bull;", "\xe2\x80\xa2",
2822 "&hellip;", "\xe2\x80\xa6", "&prime;", "\xe2\x80\xb2", "&Prime;", "\xe2\x80\xb3",
2823 "&oline;", "\xe2\x80\xbe", "&frasl;", "\xe2\x81\x84", "&weierp;", "\xe2\x84\x98",
2824 "&image;", "\xe2\x84\x91", "&real;", "\xe2\x84\x9c", "&trade;", "\xe2\x84\xa2",
2825 "&alefsym;", "\xe2\x84\xb5", "&larr;", "\xe2\x86\x90", "&uarr;", "\xe2\x86\x91",
2826 "&rarr;", "\xe2\x86\x92", "&darr;", "\xe2\x86\x93", "&harr;", "\xe2\x86\x94",
2827 "&crarr;", "\xe2\x86\xb5", "&lArr;", "\xe2\x87\x90", "&uArr;", "\xe2\x87\x91",
2828 "&rArr;", "\xe2\x87\x92", "&dArr;", "\xe2\x87\x93", "&hArr;", "\xe2\x87\x94",
2829 "&forall;", "\xe2\x88\x80", "&part;", "\xe2\x88\x82", "&exist;", "\xe2\x88\x83",
2830 "&empty;", "\xe2\x88\x85", "&nabla;", "\xe2\x88\x87", "&isin;", "\xe2\x88\x88",
2831 "&notin;", "\xe2\x88\x89", "&ni;", "\xe2\x88\x8b", "&prod;", "\xe2\x88\x8f",
2832 "&sum;", "\xe2\x88\x91", "&minus;", "\xe2\x88\x92", "&lowast;", "\xe2\x88\x97",
2833 "&radic;", "\xe2\x88\x9a", "&prop;", "\xe2\x88\x9d", "&infin;", "\xe2\x88\x9e",
2834 "&ang;", "\xe2\x88\xa0", "&and;", "\xe2\x88\xa7", "&or;", "\xe2\x88\xa8",
2835 "&cap;", "\xe2\x88\xa9", "&cup;", "\xe2\x88\xaa", "&int;", "\xe2\x88\xab",
2836 "&there4;", "\xe2\x88\xb4", "&sim;", "\xe2\x88\xbc", "&cong;", "\xe2\x89\x85",
2837 "&asymp;", "\xe2\x89\x88", "&ne;", "\xe2\x89\xa0", "&equiv;", "\xe2\x89\xa1",
2838 "&le;", "\xe2\x89\xa4", "&ge;", "\xe2\x89\xa5", "&sub;", "\xe2\x8a\x82",
2839 "&sup;", "\xe2\x8a\x83", "&nsub;", "\xe2\x8a\x84", "&sube;", "\xe2\x8a\x86",
2840 "&supe;", "\xe2\x8a\x87", "&oplus;", "\xe2\x8a\x95", "&otimes;", "\xe2\x8a\x97",
2841 "&perp;", "\xe2\x8a\xa5", "&sdot;", "\xe2\x8b\x85", "&lceil;", "\xe2\x8c\x88",
2842 "&rceil;", "\xe2\x8c\x89", "&lfloor;", "\xe2\x8c\x8a", "&rfloor;", "\xe2\x8c\x8b",
2843 "&lang;", "\xe2\x8c\xa9", "&rang;", "\xe2\x8c\xaa", "&loz;", "\xe2\x97\x8a",
2844 "&spades;", "\xe2\x99\xa0", "&clubs;", "\xe2\x99\xa3", "&hearts;", "\xe2\x99\xa5",
2845 "&diams;", "\xe2\x99\xa6", "&OElig;", "\xc5\x92", "&oelig;", "\xc5\x93",
2846 "&Scaron;", "\xc5\xa0", "&scaron;", "\xc5\xa1", "&Yuml;", "\xc5\xb8",
2847 "&circ;", "\xcb\x86", "&tilde;", "\xcb\x9c", "&ensp;", "\xe2\x80\x82",
2848 "&emsp;", "\xe2\x80\x83", "&thinsp;", "\xe2\x80\x89", "&zwnj;", "\xe2\x80\x8c",
2849 "&zwj;", "\xe2\x80\x8d", "&lrm;", "\xe2\x80\x8e", "&rlm;", "\xe2\x80\x8f",
2850 "&ndash;", "\xe2\x80\x93", "&mdash;", "\xe2\x80\x94", "&lsquo;", "\xe2\x80\x98",
2851 "&rsquo;", "\xe2\x80\x99", "&sbquo;", "\xe2\x80\x9a", "&ldquo;", "\xe2\x80\x9c",
2852 "&rdquo;", "\xe2\x80\x9d", "&bdquo;", "\xe2\x80\x9e", "&dagger;", "\xe2\x80\xa0",
2853 "&Dagger;", "\xe2\x80\xa1", "&permil;", "\xe2\x80\xb0", "&lsaquo;", "\xe2\x80\xb9",
2854 "&rsaquo;", "\xe2\x80\xba", "&euro;", "\xe2\x82\xac",
2855 NULL
2856 };
2857 char *raw, *wp, buf[2], *tmp;
2858 int i, j, hit, num, tsiz;
2859 assert(html);
2860 CB_MALLOC(raw, strlen(html) * 3 + 1);
2861 wp = raw;
2862 while(*html != '\0'){
2863 if(*html == '&'){
2864 if(*(html + 1) == '#'){
2865 if(*(html + 2) == 'x' || *(html + 2) == 'X'){
2866 num = strtol(html + 3, NULL, 16);
2867 } else {
2868 num = atoi(html + 2);
2869 }
2870 buf[0] = num / 256;
2871 buf[1] = num % 256;
2872 if((tmp = est_uconv_out(buf, 2, &tsiz)) != NULL){
2873 for(j = 0; j < tsiz; j++){
2874 *wp = ((unsigned char *)tmp)[j];
2875 wp++;
2876 }
2877 free(tmp);
2878 }
2879 while(*html != ';' && *html != ' ' && *html != '\n' && *html != '\0'){
2880 html++;
2881 }
2882 if(*html == ';') html++;
2883 } else {
2884 hit = FALSE;
2885 for(i = 0; pairs[i] != NULL; i += 2){
2886 if(cbstrfwmatch(html, pairs[i])){
2887 wp += sprintf(wp, "%s", pairs[i+1]);
2888 html += strlen(pairs[i]);
2889 hit = TRUE;
2890 break;
2891 }
2892 }
2893 if(!hit){
2894 *wp = *html;
2895 wp++;
2896 html++;
2897 }
2898 }
2899 } else {
2900 *wp = *html;
2901 wp++;
2902 html++;
2903 }
2904 }
2905 *wp = '\0';
2906 return raw;
2907 }
2908
2909
2910 /* create a document object from MIME */
2911 static ESTDOC *est_doc_new_from_mime(const char *buf, int size, const char *penc, int plang){
2912 ESTDOC *doc, *tdoc;
2913 CBMAP *attrs;
2914 const CBLIST *texts;
2915 CBLIST *parts, *lines;
2916 CBDATUM *datum;
2917 const char *key, *val, *bound, *part, *text, *line;
2918 char *body, *swap, numbuf[NUMBUFSIZ];
2919 int i, j, bsiz, psiz, ssiz, mht;
2920 assert(buf);
2921 doc = est_doc_new();
2922 attrs = cbmapopenex(MINIBNUM);
2923 body = cbmimebreak(buf, size, attrs, &bsiz);
2924 if((val = cbmapget(attrs, "subject", -1, NULL)) != NULL){
2925 est_doc_add_attr_mime(doc, ESTDATTRTITLE, val);
2926 if((val = est_doc_attr(doc, ESTDATTRTITLE)) != NULL) est_doc_add_hidden_text(doc, val);
2927 }
2928 if((val = cbmapget(attrs, "from", -1, NULL)) != NULL)
2929 est_doc_add_attr_mime(doc, ESTDATTRAUTHOR, val);
2930 if((val = cbmapget(attrs, "date", -1, NULL)) != NULL){
2931 est_doc_add_attr_mime(doc, ESTDATTRCDATE, val);
2932 est_doc_add_attr_mime(doc, ESTDATTRMDATE, val);
2933 }
2934 est_doc_add_attr(doc, ESTDATTRTYPE, "message/rfc822");
2935 sprintf(numbuf, "%d", size);
2936 est_doc_add_attr(doc, ESTDATTRSIZE, numbuf);
2937 cbmapiterinit(attrs);
2938 while((key = cbmapiternext(attrs, NULL)) != NULL){
2939 if((key[0] >= 'A' && key[0] <= 'Z') || key[0] == '@') continue;
2940 val = cbmapget(attrs, key, -1, NULL);
2941 est_doc_add_attr_mime(doc, key, val);
2942 }
2943 if((key = cbmapget(attrs, "TYPE", -1, NULL)) != NULL && cbstrfwimatch(key, "multipart/")){
2944 mht = cbstrfwimatch(key, "multipart/related");
2945 if((bound = cbmapget(attrs, "BOUNDARY", -1, NULL)) != NULL){
2946 parts = cbmimeparts(body, bsiz, bound);
2947 for(i = 0; i < CB_LISTNUM(parts) && i < 8; i++){
2948 part = CB_LISTVAL2(parts, i, &psiz);
2949 tdoc = est_doc_new_from_mime(part, psiz, penc, plang);
2950 if(mht){
2951 if((text = est_doc_attr(tdoc, ESTDATTRTITLE)) != NULL)
2952 est_doc_add_attr(doc, ESTDATTRTITLE, text);
2953 if((text = est_doc_attr(tdoc, ESTDATTRAUTHOR)) != NULL)
2954 est_doc_add_attr(doc, ESTDATTRAUTHOR, text);
2955 }
2956 texts = est_doc_texts(tdoc);
2957 for(j = 0; j < CB_LISTNUM(texts); j++){
2958 text = CB_LISTVAL(texts, j, NULL);
2959 est_doc_add_text(doc, text);
2960 }
2961 est_doc_delete(tdoc);
2962 }
2963 cblistclose(parts);
2964 }
2965 } else {
2966 if((key = cbmapget(attrs, "content-transfer-encoding", -1, NULL)) != NULL &&
2967 cbstrfwimatch(key, "base64")){
2968 swap = cbbasedecode(body, &ssiz);
2969 free(body);
2970 body = swap;
2971 bsiz = ssiz;
2972 } else if((key = cbmapget(attrs, "content-transfer-encoding", -1, NULL)) != NULL &&
2973 cbstrfwimatch(key, "quoted-printable")){
2974 swap = cbquotedecode(body, &ssiz);
2975 free(body);
2976 body = swap;
2977 bsiz = ssiz;
2978 }
2979 if(!(key = cbmapget(attrs, "TYPE", -1, NULL)) || cbstrfwimatch(key, "text/plain")){
2980 if(penc && (swap = est_iconv(body, bsiz, penc, "UTF-8", &ssiz, NULL)) != NULL){
2981 free(body);
2982 body = swap;
2983 bsiz = ssiz;
2984 } else if((key = cbmapget(attrs, "CHARSET", -1, NULL)) != NULL &&
2985 (swap = est_iconv(body, bsiz, key, "UTF-8", &ssiz, NULL)) != NULL){
2986 free(body);
2987 body = swap;
2988 bsiz = ssiz;
2989 }
2990 lines = cbsplit(body, bsiz, "\n");
2991 datum = cbdatumopen("", 0);
2992 for(i = 0; i < CB_LISTNUM(lines); i++){
2993 line = CB_LISTVAL(lines, i, NULL);
2994 while(*line == ' ' || *line == '>' || *line == '|' || *line == '\t' || *line == '\r'){
2995 line++;
2996 }
2997 if(line[0] == '\0'){
2998 est_doc_add_text(doc, CB_DATUMPTR(datum));
2999 cbdatumsetsize(datum, 0);
3000 } else {
3001 cbdatumcat(datum, " ", 1);
3002 cbdatumcat(datum, line, -1);
3003 }
3004 }
3005 est_doc_add_text(doc, CB_DATUMPTR(datum));
3006 cbdatumclose(datum);
3007 cblistclose(lines);
3008 } else if(cbstrfwimatch(key, "text/html")){
3009 tdoc = est_doc_new_from_html(body, bsiz, penc, plang);
3010 if((text = est_doc_attr(tdoc, ESTDATTRTITLE)) != NULL){
3011 if(!est_doc_attr(doc, ESTDATTRTITLE)) est_doc_add_attr(doc, ESTDATTRTITLE, text);
3012 est_doc_add_text(doc, text);
3013 }
3014 if((text = est_doc_attr(tdoc, ESTDATTRAUTHOR)) != NULL){
3015 if(!est_doc_attr(doc, ESTDATTRAUTHOR)) est_doc_add_attr(doc, ESTDATTRAUTHOR, text);
3016 est_doc_add_text(doc, text);
3017 }
3018 texts = est_doc_texts(tdoc);
3019 for(i = 0; i < CB_LISTNUM(texts); i++){
3020 text = CB_LISTVAL(texts, i, NULL);
3021 est_doc_add_text(doc, text);
3022 }
3023 est_doc_delete(tdoc);
3024 } else if(cbstrfwimatch(key, "message/rfc822")){
3025 tdoc = est_doc_new_from_mime(body, bsiz, penc, plang);
3026 if((text = est_doc_attr(tdoc, ESTDATTRTITLE)) != NULL){
3027 if(!est_doc_attr(doc, ESTDATTRTITLE)) est_doc_add_attr(doc, ESTDATTRTITLE, text);
3028 est_doc_add_text(doc, text);
3029 }
3030 if((text = est_doc_attr(tdoc, ESTDATTRAUTHOR)) != NULL){
3031 if(!est_doc_attr(doc, ESTDATTRAUTHOR)) est_doc_add_attr(doc, ESTDATTRAUTHOR, text);
3032 est_doc_add_text(doc, text);
3033 }
3034 texts = est_doc_texts(tdoc);
3035 for(i = 0; i < CB_LISTNUM(texts); i++){
3036 text = CB_LISTVAL(texts, i, NULL);
3037 est_doc_add_text(doc, text);
3038 }
3039 est_doc_delete(tdoc);
3040 } else if(cbstrfwimatch(key, "text/")){
3041 tdoc = est_doc_new_from_text(body, bsiz, penc, plang);
3042 texts = est_doc_texts(tdoc);
3043 for(i = 0; i < CB_LISTNUM(texts); i++){
3044 text = CB_LISTVAL(texts, i, NULL);
3045 est_doc_add_text(doc, text);
3046 }
3047 est_doc_delete(tdoc);
3048 }
3049 }
3050 free(body);
3051 cbmapclose(attrs);
3052 return doc;
3053 }
3054
3055
3056 /* set mime value as an attribute of a document */
3057 static void est_doc_add_attr_mime(ESTDOC *doc, const char *name, const char *value){
3058 char enc[64], *ebuf, *rbuf;
3059 assert(doc && name && value);
3060 ebuf = cbmimedecode(value, enc);
3061 if((rbuf = est_iconv(ebuf, -1, enc, "UTF-8", NULL, NULL)) != NULL){
3062 est_doc_add_attr(doc, name, rbuf);
3063 free(rbuf);
3064 }
3065 free(ebuf);
3066 }
3067
3068
3069 /* generate a document with random text */
3070 static ESTDOC *est_doc_new_from_chaos(int cnum, int snum, int mode){
3071 ESTDOC *doc;
3072 char *str;
3073 int i;
3074 doc = est_doc_new();
3075 snum *= pow(est_random_nd() + 0.5, 3.0);
3076 if(mode == RD_RAND){
3077 mode = est_random() * 100;
3078 if(mode < 20){
3079 mode = RD_ENG;
3080 est_doc_add_attr(doc, "mode", "english");
3081 } else if(mode < 40){
3082 mode = RD_LAT;
3083 est_doc_add_attr(doc, "mode", "latin");
3084 } else if(mode < 60){
3085 mode = RD_EURO;
3086 est_doc_add_attr(doc, "mode", "euromix");
3087 } else if(mode < 65){
3088 mode = RD_ORI;
3089 est_doc_add_attr(doc, "mode", "oriental");
3090 } else if(mode < 95){
3091 mode = RD_JPN;
3092 est_doc_add_attr(doc, "mode", "japanese");
3093 } else {
3094 mode = RD_CHAO;
3095 est_doc_add_attr(doc, "mode", "chaos");
3096 }
3097 }
3098 switch(mode){
3099 case RD_ENG: est_doc_add_attr(doc, "mode", "english"); break;
3100 case RD_LAT: est_doc_add_attr(doc, "mode", "latin"); break;
3101 case RD_ORI: est_doc_add_attr(doc, "mode", "oriental"); break;
3102 case RD_JPN: est_doc_add_attr(doc, "mode", "japanese"); break;
3103 case RD_EURO: est_doc_add_attr(doc, "mode", "euromix"); break;
3104 case RD_CHAO: est_doc_add_attr(doc, "mode", "chaos"); break;
3105 }
3106 for(i = 0; i <= snum; i++){
3107 str = est_random_str(cnum, mode);
3108 if(est_random() < 0.05){
3109 est_doc_add_hidden_text(doc, str);
3110 } else {
3111 est_doc_add_text(doc, str);
3112 }
3113 free(str);
3114 }
3115 return doc;
3116 }
3117
3118
3119 /* generate random string */
3120 static char *est_random_str(int cnum, int mode){
3121 const char echrs[] = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
3122 CBDATUM *buf;
3123 char wc[2], *str;
3124 int i, c, wlen, dec, mm, big, n;
3125 buf = cbdatumopen("", 0);
3126 cnum *= pow(est_random_nd() + 0.5, 3.0);
3127 wlen = est_random_nd() * 8 + 4;
3128 dec = (int)(est_random() * INT_MAX) % 10;
3129 big = (((int)(est_random() * INT_MAX) % 0x29)) * 0x100;
3130 for(i = 0; i < cnum; i++){
3131 switch(mode){
3132 case RD_ENG: case RD_LAT: case RD_EURO:
3133 mm = (int)(est_random() * INT_MAX) % 100;
3134 if((mode == RD_LAT || mode == RD_EURO) && mm < 5){
3135 c = 0x00a1 + (int)(pow(est_random_nd(), 2.0) * (0x00ff - 0x00a0));
3136 } else if(mode == RD_EURO && (mm < 30 || dec > 8)){
3137 if(dec % 2 == 0){
3138 c = 0x0391 + (int)(pow(est_random_nd(), 2.0) * (0x03d6 - 0x0391));
3139 } else {
3140 c = 0x0400 + (int)(pow(est_random_nd(), 2.0) * (0x045f - 0x0400));
3141 }
3142 } else if(mm < 95){
3143 if((n = est_random_nd() * (sizeof(echrs) - 1)) == (sizeof(echrs) - 1)) n = 0;
3144 c = echrs[n];
3145 } else {
3146 c = (int)(est_random() * ('@' - ' ')) + ' ';
3147 }
3148 if(--wlen < 1){
3149 c = ' ';
3150 wlen = pow(est_random_nd(), 3.0) * 8 + 4;
3151 dec = (int)(est_random() * INT_MAX) % 10;
3152 }
3153 break;
3154 case RD_ORI:
3155 c = big + est_random_nd() * 0x100;
3156 if(--wlen < 1){
3157 wlen = pow(est_random_nd(), 3.0) * 12 + 6;
3158 big = (((int)(est_random() * INT_MAX) % 0x29)) * 0x100;
3159 }
3160 break;
3161 case RD_JPN:
3162 if(dec < 4){
3163 c = 0x3041 + pow(est_random_nd(), 3.0) * (0x3094 - 0x3041);
3164 } else if(dec < 7){
3165 c = 0x30a1 + pow(est_random_nd(), 3.0) * (0x30fe - 0x30a1);
3166 } else if(dec < 9){
3167 c = 0x4e00 + pow(est_random_nd(), 3.0) * (0x9faf - 0x4e00);
3168 } else {
3169 if(est_random() < 0.7){
3170 c = 0x00a1 + (int)(pow(est_random_nd(), 2.0) * (0x00ff - 0x00a0));
3171 } else {
3172 c = 0x3041 + est_random() * (0xffef - 0x3041);
3173 }
3174 }
3175 if(--wlen < 1){
3176 wlen = pow(est_random_nd(), 3.0) * 12 + 6;
3177 dec = (int)(est_random() * INT_MAX) % 10;
3178 }
3179 break;
3180 default:
3181 if(est_random() < 0.2){
3182 c = 0x00a1 + (int)est_random() * (0x00ff - 0x00a0);
3183 } else {
3184 c = (int)(est_random() * 0x10000);
3185 }
3186 break;
3187 }
3188 if(c <= 0 || c >= 0x10000) c = 0x0020;
3189 wc[0] = c / 0x100;
3190 wc[1] = c % 0x100;
3191 cbdatumcat(buf, wc, 2);
3192 }
3193 str = est_iconv(CB_DATUMPTR(buf), CB_DATUMSIZE(buf), "UTF-16BE", "UTF-8", NULL, NULL);
3194 cbdatumclose(buf);
3195 return str;
3196 }
3197
3198
3199
3200 /* END OF FILE */

  ViewVC Help
Powered by ViewVC 1.1.26