1 |
/** |
2 |
* HyperEstraierWrapper.cpp - C++ wrapper for Hyper Estraier |
3 |
*/ |
4 |
#include <estraier.h> |
5 |
#include <estmtdb.h> |
6 |
#include <cabin.h> |
7 |
#include <cstdlib> |
8 |
#include <string> |
9 |
#include <vector> |
10 |
#include <map> |
11 |
#include <cassert> |
12 |
|
13 |
namespace estraier { |
14 |
class Condition { |
15 |
public: |
16 |
enum { // enumeration for options |
17 |
SURE = ESTCONDSURE, // check every N-gram key |
18 |
USUAL = ESTCONDUSUAL, // check N-gram keys skipping by one |
19 |
FAST = ESTCONDFAST, // check N-gram keys skipping by two |
20 |
AGITO = ESTCONDAGITO, // check N-gram keys skipping by three |
21 |
NOIDF = ESTCONDNOIDF, // without TF-IDF tuning |
22 |
SIMPLE = ESTCONDSIMPLE, // with the simplefied phrase |
23 |
}; |
24 |
ESTCOND * cond; |
25 |
Condition() { |
26 |
/** |
27 |
* constructor |
28 |
*/ |
29 |
cond = est_cond_new(); |
30 |
} |
31 |
~Condition() { |
32 |
/** |
33 |
* destructor |
34 |
*/ |
35 |
est_cond_delete(cond); |
36 |
} |
37 |
void set_phrase(const char *phrase) { |
38 |
/** |
39 |
* set the search phrase |
40 |
*/ |
41 |
est_cond_set_phrase(cond, phrase); |
42 |
} |
43 |
void add_attr(const char *expr) { |
44 |
/** |
45 |
* set the attribute expression |
46 |
*/ |
47 |
est_cond_add_attr(cond, expr); |
48 |
} |
49 |
void set_order(const char *expr) { |
50 |
/** |
51 |
* set the order of a condition object |
52 |
*/ |
53 |
est_cond_set_order(cond, expr); |
54 |
} |
55 |
void set_max(int _max) { |
56 |
/** |
57 |
* set the maximum number of retrieval of a condition object |
58 |
*/ |
59 |
est_cond_set_max(cond, _max); |
60 |
} |
61 |
void set_options(int options) { |
62 |
/** |
63 |
* set options of retrieval of a condition object |
64 |
*/ |
65 |
est_cond_set_options(cond, options); |
66 |
} |
67 |
}; |
68 |
|
69 |
class Document { |
70 |
private: |
71 |
std::string text_buf; |
72 |
public: |
73 |
ESTDOC *doc; |
74 |
|
75 |
Document() { |
76 |
/** |
77 |
* constructor |
78 |
*/ |
79 |
doc = est_doc_new(); |
80 |
} |
81 |
Document(const char* draft) { |
82 |
/** |
83 |
* constructor |
84 |
*/ |
85 |
doc = est_doc_new_from_draft(draft); |
86 |
} |
87 |
Document(ESTDOC *_doc) { |
88 |
/** |
89 |
* constructor |
90 |
*/ |
91 |
doc = _doc; |
92 |
} |
93 |
~Document() { |
94 |
/** |
95 |
* destructor |
96 |
*/ |
97 |
est_doc_delete(doc); |
98 |
} |
99 |
void add_attr(const char * name, const char*value) { |
100 |
/** |
101 |
* add an attribute to a document object |
102 |
*/ |
103 |
est_doc_add_attr(doc, name, value); |
104 |
} |
105 |
void add_text(const char *text) { |
106 |
/** |
107 |
* add a sentence of text to a document object |
108 |
*/ |
109 |
est_doc_add_text(doc, text); |
110 |
} |
111 |
void add_hidden_text(const char * text) { |
112 |
/** |
113 |
* add a hidden sentence to a document object |
114 |
*/ |
115 |
est_doc_add_hidden_text(doc, text); |
116 |
} |
117 |
int id() { |
118 |
/** |
119 |
* get the ID number of a document object |
120 |
*/ |
121 |
return est_doc_id(doc); |
122 |
} |
123 |
std::vector<std::string> * attr_names() { |
124 |
/** |
125 |
* get a list of attribute names of a document object |
126 |
*/ |
127 |
std::vector<std::string> * vs = new std::vector<std::string>; |
128 |
CBLIST * attr_names = est_doc_attr_names(doc); |
129 |
for (int i=0; i < cblistnum(attr_names); i++) { |
130 |
vs->push_back(cblistval(attr_names, i, NULL)); |
131 |
} |
132 |
cblistclose(attr_names); |
133 |
return vs; |
134 |
} |
135 |
const char * attr(const char *name) { |
136 |
/** |
137 |
* get the value of an attribute of a document object |
138 |
*/ |
139 |
return est_doc_attr(doc, name); |
140 |
} |
141 |
const char * cat_texts() { |
142 |
/** |
143 |
* get a list of sentences of the text of a document object |
144 |
*/ |
145 |
// return est_doc_cat_texts(doc); |
146 |
return "This is mockup!"; |
147 |
} |
148 |
std::vector<std::string>* texts() { |
149 |
/** |
150 |
* get a list of sentences of the text of a document object |
151 |
*/ |
152 |
std::vector<std::string> * vs = new std::vector<std::string>; |
153 |
const CBLIST *texts; |
154 |
texts = est_doc_texts(doc); |
155 |
for(int i = 0; i < cblistnum(texts); i++) { |
156 |
vs->push_back(cblistval(texts, i, NULL)); |
157 |
} |
158 |
return vs; |
159 |
} |
160 |
const char * dump_draft() { |
161 |
/** |
162 |
* dump draft data of a document object |
163 |
*/ |
164 |
return est_doc_dump_draft(doc); |
165 |
} |
166 |
const char * make_snippet(std::vector<std::string> _words, int wwidth, int hwidth, int awidth) { |
167 |
/** |
168 |
* make a snippet of the body text of a document object |
169 |
*/ |
170 |
CBLIST * words; |
171 |
std::vector<std::string>::iterator iter; |
172 |
|
173 |
words = cblistopen(); |
174 |
|
175 |
for (iter = _words.begin(); _words.end() != iter; iter++) { |
176 |
cblistpush(words, iter->c_str(), -1); |
177 |
} |
178 |
|
179 |
const char *result = est_doc_make_snippet(doc, words, wwidth, hwidth, awidth); |
180 |
|
181 |
cblistclose(words); |
182 |
|
183 |
return result; |
184 |
} |
185 |
}; |
186 |
|
187 |
class Database { |
188 |
private: |
189 |
ESTMTDB *db; |
190 |
public: |
191 |
enum { // enumeration for error codes |
192 |
ERRNOERR = ESTENOERR, // no error |
193 |
ERRINVAL = ESTEINVAL, // invalid argument |
194 |
ERRACCES = ESTEACCES, // access forbidden |
195 |
ERRLOCK = ESTELOCK, // lock failure |
196 |
ERRDB = ESTEDB, // database problem |
197 |
ERRIO = ESTEIO, // I/O problem |
198 |
ERRNOITEM = ESTENOITEM, // no item |
199 |
ERRMISC = ESTEMISC // miscellaneous |
200 |
}; |
201 |
enum { // enumeration for open modes |
202 |
DBREADER = ESTDBREADER, // open as a reader |
203 |
DBWRITER = ESTDBWRITER, // open as a writer |
204 |
DBCREAT = ESTDBCREAT, // a writer creating |
205 |
DBTRUNC = ESTDBTRUNC, // a writer truncating |
206 |
DBNOLCK = ESTDBNOLCK, // open without locking |
207 |
DBLCKNB = ESTDBLCKNB, // lock without blocking |
208 |
DBPERFNG = ESTDBPERFNG // use perfect N-gram analyzer |
209 |
}; |
210 |
enum { // enumeration for options of document registration |
211 |
PDCLEAN = ESTPDCLEAN // clean up dispensable regions |
212 |
}; |
213 |
enum { // enumeration for options of document deletion |
214 |
ODCLEAN = ESTODCLEAN // clean up dispensable regions |
215 |
}; |
216 |
enum { // enumeration for options of optimization |
217 |
OPTNOPURGE = ESTOPTNOPURGE, // omit purging dispensable region of deleted |
218 |
OPTNODBOPT = ESTOPTNODBOPT // omit optimizization of the database files |
219 |
}; |
220 |
enum { // enumeration for options of document retrieval |
221 |
GDNOATTR = ESTGDNOATTR, // no attributes |
222 |
GDNOTEXT = ESTGDNOTEXT // no text |
223 |
}; |
224 |
Database() { |
225 |
/** |
226 |
* constructor(dummy) |
227 |
*/ |
228 |
} |
229 |
~Database() { |
230 |
close(); |
231 |
} |
232 |
bool open(const char * dbname, int mode) { |
233 |
/** |
234 |
* open the database |
235 |
*/ |
236 |
int ecode; |
237 |
db = est_mtdb_open(dbname, mode, &ecode); |
238 |
return db; |
239 |
} |
240 |
bool close() { |
241 |
/** |
242 |
* close the database |
243 |
*/ |
244 |
if (db) { |
245 |
int ecode; |
246 |
bool result = est_mtdb_close(db, &ecode); |
247 |
db = NULL; |
248 |
return result; |
249 |
} else { |
250 |
return false; |
251 |
} |
252 |
} |
253 |
bool put_doc(Document *doc, int options) { |
254 |
/** |
255 |
* add a document to a database |
256 |
*/ |
257 |
return est_mtdb_put_doc(db, doc->doc, options); |
258 |
} |
259 |
std::vector<int> * search(Condition * cond, int options) { |
260 |
/** |
261 |
* search documents corresponding a condition for a database |
262 |
*/ |
263 |
int resnum; |
264 |
int * result = est_mtdb_search(db, cond->cond, &resnum, NULL); |
265 |
std::vector<int> *numbers = new std::vector<int>; |
266 |
for (int i=0; i<resnum; i++) { |
267 |
numbers->push_back(result[i]); |
268 |
} |
269 |
return numbers; |
270 |
} |
271 |
static const char * err_msg(int ecode) { |
272 |
/** |
273 |
* get the string of an error |
274 |
*/ |
275 |
return est_err_msg(ecode); |
276 |
} |
277 |
int error() { |
278 |
/** |
279 |
* get the last happended error code of a database |
280 |
*/ |
281 |
return est_mtdb_error(db); |
282 |
} |
283 |
bool fatal() { |
284 |
/** |
285 |
* check whether a database has a fatal error |
286 |
*/ |
287 |
return est_mtdb_fatal(db); |
288 |
} |
289 |
bool flush(int _max) { |
290 |
/** |
291 |
* flush index words in the cache of a database |
292 |
*/ |
293 |
return est_mtdb_flush(db, _max); |
294 |
} |
295 |
bool sync() { |
296 |
/** |
297 |
* synchronize updating contents of a database |
298 |
*/ |
299 |
return est_mtdb_sync(db); |
300 |
} |
301 |
bool optimize(int options) { |
302 |
/** |
303 |
* optimize a database |
304 |
*/ |
305 |
return est_mtdb_optimize(db, options); |
306 |
} |
307 |
bool out_doc(int id, int options) { |
308 |
/** |
309 |
* remove a document from a database |
310 |
*/ |
311 |
return est_mtdb_out_doc(db, id, options); |
312 |
} |
313 |
Document * get_doc(int id, int options) { |
314 |
/** |
315 |
* retrieve a document in a database |
316 |
*/ |
317 |
ESTDOC *doc = est_mtdb_get_doc(db, id, options); |
318 |
if (!doc) { |
319 |
throw est_err_msg(est_mtdb_error(db)); |
320 |
} else { |
321 |
return new Document(doc); |
322 |
} |
323 |
} |
324 |
int uri_to_id(const char *uri) { |
325 |
/** |
326 |
* get the ID of a document spacified by URI |
327 |
*/ |
328 |
return est_mtdb_uri_to_id(db, uri); |
329 |
} |
330 |
std::map<std::string, std::string> * etch_doc(Document * doc, int max) { |
331 |
/** |
332 |
* extract keywords of a document object |
333 |
*/ |
334 |
std::map<std::string, std::string> * mss = new std::map<std::string, std::string>; |
335 |
|
336 |
CBMAP * keys = est_mtdb_etch_doc(db, doc->doc, max); |
337 |
|
338 |
cbmapiterinit(keys); |
339 |
int ksiz; |
340 |
while (const char *key = cbmapiternext(keys, &ksiz)) { |
341 |
mss->insert(std::make_pair(key, cbmapget(keys, key, ksiz, NULL))); |
342 |
} |
343 |
return mss; |
344 |
} |
345 |
bool iter_init() { |
346 |
/** |
347 |
* initialize the iterator of a database |
348 |
*/ |
349 |
return est_mtdb_iter_init(db); |
350 |
} |
351 |
int iter_next() { |
352 |
/** |
353 |
* get the next ID of the iterator of a database |
354 |
*/ |
355 |
return est_mtdb_iter_next(db); |
356 |
} |
357 |
const char * name() { |
358 |
/** |
359 |
* get the name of a database |
360 |
*/ |
361 |
return est_mtdb_name(db); |
362 |
} |
363 |
int doc_num() { |
364 |
/** |
365 |
* get the number of documents in a database |
366 |
*/ |
367 |
return est_mtdb_doc_num(db); |
368 |
} |
369 |
int word_num() { |
370 |
/** |
371 |
* get the number of unique words in a database |
372 |
*/ |
373 |
return est_mtdb_word_num(db); |
374 |
} |
375 |
double size() { |
376 |
/** |
377 |
* get the size of a database |
378 |
*/ |
379 |
return est_mtdb_size(db); |
380 |
} |
381 |
void set_cache_size(size_t size, int anum, int tnum) { |
382 |
/** |
383 |
* set the maximum size of the cache memory of a database |
384 |
*/ |
385 |
est_mtdb_set_cache_size(db, size, anum, tnum); |
386 |
} |
387 |
void set_special_cache(const char *name, int num) { |
388 |
/** |
389 |
* Set the special cache for narrowing and sorting |
390 |
* with document attributes |
391 |
*/ |
392 |
est_mtdb_set_special_cache(db, name, num); |
393 |
} |
394 |
}; |
395 |
}; |