1 |
/* |
2 |
The Malete project - the Z39.2/Z39.50 database framework of OpenIsis. |
3 |
Version 0.9.x (patchlevel see file Version) |
4 |
Copyright (C) 2001-2003 by Erik Grziwotz, erik@openisis.org |
5 |
|
6 |
This library is free software; you can redistribute it and/or |
7 |
modify it under the terms of the GNU Lesser General Public |
8 |
License as published by the Free Software Foundation; either |
9 |
version 2.1 of the License, or (at your option) any later version. |
10 |
|
11 |
This library is distributed in the hope that it will be useful, |
12 |
but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
14 |
See the GNU Lesser General Public License for more details. |
15 |
|
16 |
You should have received a copy of the GNU Lesser General Public |
17 |
License along with this library; if not, write to the Free Software |
18 |
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
19 |
|
20 |
see README for more information |
21 |
EOH */ |
22 |
|
23 |
/* |
24 |
$Id: cdx.c,v 1.9 2004/11/11 18:20:23 kripke Exp $ |
25 |
charset collation |
26 |
*/ |
27 |
|
28 |
#include "../core/core.h" |
29 |
|
30 |
/*
	We use
	- p codes 0,1,..p-1 for primaries (word and nonword)
	- v codes p..p+v-1 for secondary and tertiary level variants
	- m codes p+v..p+v+m-1 for maps (targets)

	We index code numbers to an array[p+v+m] of unsigned ints as code values.
	The highest bit is the nonword indicator.
	The lower nibble of the highest byte is the length of the code's byte
	sequence (for maps in #code values, else in #cleartext bytes).

	If the sequence does not fit into the lower 3 bytes,
	the value&0xffffff is the offset of the bytes.

	For primaries and variants, the bytes are cleartext.
	For maps, the bytes are code text (length given as #codes),
	where every code uses one byte, if p+v<=256, else two.

	With variants, we also have an array[p+v] of variant info.
	For a primary code 0..p-1,
	this is the number of secondary and tertiary variants,
	and the code of the first of the (sec+1)*(ter+1) variants - 1.

	For a variant code p..p+v-1, this is its secondary and tertiary weights
	and the code of the associated primary.
*/

/*
	One code value.
	u is the whole 32 bit value; b overlays it so that b.hi is always the
	most significant byte (nonword bit 0x80 and length nibble 0x0f) and
	b.c holds up to three immediate sequence bytes.
	The member order is switched on CPU_BIG_ENDIAN to keep hi on top.
*/
typedef union {
	unsigned u;
	struct {
#ifdef CPU_BIG_ENDIAN
		unsigned char hi;
		unsigned char c[3];
#else
		unsigned char c[3];
		unsigned char hi;
#endif
	} b;
} Cv;
69 |
|
70 |
/*
	variant info, one entry per primary and variant code.
	None of the functions in this file reads or fills the fields yet
	(multilevel collation is rejected by cMake); the array is merely
	copied into the compiled cdx when variants exist.
*/
typedef struct Var {
	unsigned char  sec; /* secondary variant */
	unsigned char  ter; /* tertiary variant */
	unsigned short rel; /* code of related variant */
} Var;
75 |
|
76 |
/*
	For the encoding, we map every byte to:
	- its code number
	- the "word" bit
	- the index of a table of possible following bytes

	To cover the BMP with UTF-8, we need 1072 tables:
	- 32 bytes 110* initiating a 2 byte sequence,
		using one table each
	- 16 bytes 1110* initiating a 3 byte sequence,
		each having a table with 64 second bytes 10*,
		using 16+16*64 = 1040 tables
	(- the 32 tables used for the UTF-16 surrogates D800-DFFF)
	CJK 3400-9FFF alone uses 432 tables (some 27.000 ideographs)
*/
typedef struct Bin { /* byte info */
	/* collation code of the sequence ending with this byte, 0 if none */
	unsigned short cod;
	/* number (1 based) of the table of following bytes, 0 if none */
	unsigned short tab;
} Bin;

typedef Bin Bins [256]; /* full table: one Bin per possible byte value */
97 |
|
98 |
/*
	table info: describes one compacted (non-root) byte table.
	Only the Bins for byte values min..max are stored;
	cEnc treats any following byte outside that range as "no match".
*/
typedef struct Tab {
	unsigned       off; /* byte offset (from the cdx start) of the Bin for min */
	unsigned char  min; /* lowest byte value having an entry */
	unsigned char  max; /* highest byte value having an entry */
	unsigned short unu; /* never accessed in this file; presumably unused padding */
} Tab;
104 |
|
105 |
|
106 |
/*
	Cdx.typ is one of the TYP_ base types or'ed with the number of bits
	per primary code minus one (7 or 15), see cMake and cEnc.
*/
enum {
	BIT_VARIANTS = 0x20, /* cdx carries a variant info array */
	BIT_FRENCH   = 0x10,
	TYP_PLAIN    = 0,
	TYP_VARIANTS = BIT_VARIANTS, /* TODO */
	TYP_FRENCH   = BIT_VARIANTS|BIT_FRENCH, /* TODO */
	CVLAT1       = 104 /* # primary codes for builtin lat1cdx */
};
114 |
|
115 |
/* header of a dumped/mapped cx */ |
116 |
struct Cdx { |
117 |
unsigned char mag[3]; /* magic MCX or mcx for mapped cdx */ |
118 |
unsigned char typ; /* base type | bits per (primary) code-1 */ |
119 |
/* currently only 8(7) and 16(15) bits supported */ |
120 |
unsigned short pri; /* p # primary codes incl. 0,1 */ |
121 |
unsigned short var; /* p + v # variants */ |
122 |
unsigned short map; /* p+v + m # maps */ |
123 |
unsigned short tab; /* # non-root tables 1..tab */ |
124 |
/* redundant offsets */ |
125 |
unsigned ovi; /* offset variant_infos */ |
126 |
unsigned otp; /* offset table_pointers - 4 */ |
127 |
unsigned siz; /* total size */ |
128 |
Bins bt0; /* root table, always full */ |
129 |
unsigned cv[CVLAT1]; /* actually Cv code_values[map=p+v+m] */ |
130 |
/* Var variant_infos[var=p+v], if BIT_VARIANTS */ |
131 |
/* Tab table_pointers[tab] */ |
132 |
/* Bin byte_tables ... */ |
133 |
/* unsigned char *more_bytes */ |
134 |
}; |
135 |
|
136 |
|
137 |
/*
	MAGIC is the endian dependent magic of a dumped cdx file,
	so that a file is only mapped on a host of matching byte order.
	W(b) and N(b) build the code value of a word resp. nonword code
	whose sequence is the single immediate byte b (length 1 in the
	high byte, nonword bit 0x80 for N); b goes where Cv.b.c[0] lives.
*/
#ifdef CPU_BIG_ENDIAN
static const char MAGIC[3] = {'M','C','X'};
# define W(b) (0x01000000|((b)<<16))
# define N(b) (0x81000000|((b)<<16))
#else
static const char MAGIC[3] = {'m','c','x'};
# define W(b) (0x01000000|(b))
# define N(b) (0x81000000|(b))
#endif
147 |
|
148 |
#if 0
/*
	builtin Latin-1 collation (disabled).
	Upper and lower case letters and their accented forms share the code
	of the base letter; digits and letters are words (W), all else
	nonwords (N). Codes 0 and 1 are "unassigned" and tab.
*/
const Cdx lat1cdx = {
	{'s','t','a'}, /* mark as static */
	TYP_PLAIN|7, CVLAT1, CVLAT1, CVLAT1, 0,0,0,0,
	{ /* byte infos */
#define B(b) {b,0}
		/* 32 C0 controls */
		B(0),B(0),B(0),B(0),B(0),B(0),B(0),B(0),
		B(0),B(0),B(0),B(0),B(0),B(0),B(0),B(0),
		B(0),B(0),B(0),B(0),B(0),B(0),B(0),B(0),
		B(0),B(0),B(0),B(0),B(0),B(0),B(0),B(0),
		/* SP    !     "     #     $     %     &     ' */
		B( 2),B( 3),B( 4),B( 5),B( 6),B( 7),B( 8),B( 9),
		/* (     )     *     +     ,     -     .     / */
		B(10),B(11),B(12),B(13),B(14),B(15),B(16),B(17),
		/* 0     1     2     3     4     5     6     7 */
		B(18),B(19),B(20),B(21),B(22),B(23),B(24),B(25),
		/* 8     9     :     ;     <     =     >     ? */
		B(26),B(27),B(28),B(29),B(30),B(31),B(32),B(33),
		/* @     A     B     C     D     E     F     G */
		B(34),B(35),B(36),B(37),B(38),B(39),B(40),B(41),
		B(42),B(43),B(44),B(45),B(46),B(47),B(48),B(49), /* H-O */
		B(50),B(51),B(52),B(53),B(54),B(55),B(56),B(57), /* P-W */
		/* X     Y     Z     [     \     ]     ^     _ */
		B(58),B(59),B(60),B(61),B(62),B(63),B(64),B(65),
		/* `     a     b     c     d     e     f     g */
		B(66),B(35),B(36),B(37),B(38),B(39),B(40),B(41),
		B(42),B(43),B(44),B(45),B(46),B(47),B(48),B(49), /* h-o */
		B(50),B(51),B(52),B(53),B(54),B(55),B(56),B(57), /* p-w */
		/* x     y     z     {     |     }     ~     DEL */
		B(58),B(59),B(60),B(67),B(68),B(69),B(70),B(0),
		/* 32 C1 controls */
		B(0),B(0),B(0),B(0),B(0),B(0),B(0),B(0),
		B(0),B(0),B(0),B(0),B(0),B(0),B(0),B(0),
		B(0),B(0),B(0),B(0),B(0),B(0),B(0),B(0),
		B(0),B(0),B(0),B(0),B(0),B(0),B(0),B(0),
		/* 32 mostly symbols */
		B( 2),B(71),B(72),B(73),B(74),B(75),B(76),B(77),
		B(78),B(79),B(80),B(81),B(82),B(83),B(84),B(85),
		B(86),B(87),B(88),B(89),B(90),B(91),B(92),B(93),
		B(94),B(95),B(96),B(97),B(98),B(99),B(100),B(101),
		/* 64 Latin alphas including 2 symbols */
		B(35),B(35),B(35),B(35),B(35),B(35),B(35),B(37), /* 7A,1C */
		B(39),B(39),B(39),B(39),B(43),B(43),B(43),B(43), /* 4E,4I */
		/* N tilde gets the code of N (48) as the row comment says, not O */
		B(54),B(48),B(49),B(49),B(49),B(49),B(49),B(102), /* TN5O* */
		B(49),B(55),B(55),B(55),B(55),B(59),B(54),B(53), /* O4UYTS */
		B(35),B(35),B(35),B(35),B(35),B(35),B(35),B(37), /* 7a,1c */
		B(39),B(39),B(39),B(39),B(43),B(43),B(43),B(43), /* 4e,4i */
		B(54),B(48),B(49),B(49),B(49),B(49),B(49),B(103), /* tn5o% */
		B(49),B(55),B(55),B(55),B(55),B(59),B(54),B(59)  /* o4uyty */
#undef B
	},
	{ /* code values, all using immediate bytes */
		0, N(9),
		N(32),N(33),N(34),N(35),N(36),N(37),N(38),N(39), /*  !"#$%&' */
		N(40),N(41),N(42),N(43),N(44),N(45),N(46),N(47), /* ()*+,-./ */
		W(48),W(49),W(50),W(51),W(52),W(53),W(54),W(55), /* 01234567 */
		W(56),W(57),N(58),N(59),N(60),N(61),N(62),N(63), /* 89:;<=>? */
		N(64),W(65),W(66),W(67),W(68),W(69),W(70),W(71), /* @ABCDEFG */
		W(72),W(73),W(74),W(75),W(76),W(77),W(78),W(79), /* HIJKLMNO */
		W(80),W(81),W(82),W(83),W(84),W(85),W(86),W(87), /* PQRSTUVW */
		W(88),W(89),W(90),N(91),N(92),N(93),N(94),N(95), /* XYZ[\]^_ */
		N(96), /*a-z*/ N(123),N(124),N(125),N(126), /* ` {|}~ */
		N(161),N(162),N(163),N(164),N(165),N(166),N(167),
		N(168),N(169),N(170),N(171),N(172),N(173),N(174),N(175),
		N(176),N(177),N(178),N(179),N(180),N(181),N(182),N(183),
		N(184),N(185),N(186),N(187),N(188),N(189),N(190),N(191),
		N(215), N(247)
	}
}; /* lat1cdx */
#endif
219 |
|
220 |
|
221 |
/* |
222 |
encode l byte in b to key. |
223 |
key->len is the max len on input, resulting len on return. |
224 |
stop if max key len is hit or on word boundary, if words. |
225 |
return #used bytes in b. |
226 |
*/ |
227 |
int cEnc ( const Cdx *cdx, Key *key, unsigned char *b, int l, int words ) |
228 |
{ |
229 |
const unsigned char * const base = (unsigned char *)cdx; |
230 |
const Bin * const bt0 = cdx->bt0; |
231 |
const Tab * const tp = (Tab*)(base + cdx->otp); |
232 |
const int bits = 1+(0xf&cdx->typ), mapcodeshift = 256<cdx->var ? 1 : 0; |
233 |
const unsigned char *e = b+l; |
234 |
int bitsleft = 8*key->len, pfxlen = 0; |
235 |
int unassigned = 0; |
236 |
Bin seq[CDX_MAXSEQ], *top; /* current sequence */ |
237 |
unsigned u, len; |
238 |
unsigned short code; |
239 |
Cv cv; |
240 |
|
241 |
key->len = 0; |
242 |
for ( ; b < e && bits <= bitsleft; b++ ) { |
243 |
eRr(LOG_TRACE, "byte %c of %d bits %d/%d", *b, e-b, bitsleft, bits); |
244 |
/* traverse byte info tables to find longest matching sequence */ |
245 |
for (*(top = seq) = bt0[*b]; top->tab && b < e; ) { |
246 |
const Tab * const t = tp + top->tab; |
247 |
unsigned char n = b[1]; |
248 |
LOG_DBG(LOG_DEBUG, "table %d check %d %d-%d off %d %d", |
249 |
top->tab, n, t->min, t->max, t->off, *(int*)(base+t->off)); |
250 |
if ( n < t->min || n > t->max /* out of bounds */ |
251 |
|| !(u = ((unsigned*)(base+t->off))[n - t->min]) /* unassigned */ |
252 |
) |
253 |
break; |
254 |
*++top = *(Bin*)&u; |
255 |
b++; |
256 |
} |
257 |
for ( ; top > seq && !top->cod; top-- ) /* no complete sequence */ |
258 |
b--; |
259 |
code = top->cod; |
260 |
eRr(LOG_TRACE, "code %d", code); |
261 |
if (words) { |
262 |
if ( 1<words /* leave 1st words-1 bytes alone */ |
263 |
&& b+l-e >= words /* ok, passed it for the 1st time */ |
264 |
) { |
265 |
pfxlen = key->len; |
266 |
words = 1; |
267 |
} else if (0x80000000&cdx->cv[code]) { /* hit nonword */ |
268 |
if (pfxlen == key->len) /* skip to word */ |
269 |
continue; |
270 |
break; /* had something after prefix */ |
271 |
} |
272 |
} |
273 |
if (cdx->pri > code) { |
274 |
if ( code ) |
275 |
unassigned = 0; |
276 |
else { /* unassigned */ |
277 |
if ( unassigned ) |
278 |
continue; |
279 |
unassigned = 1; |
280 |
code = 1; |
281 |
} |
282 |
switch (bits) { |
283 |
case 16: key->byt[key->len++] = code >> 8; /* always bigend */ |
284 |
case 8: key->byt[key->len++] = (char)code; |
285 |
} |
286 |
bitsleft -= bits; |
287 |
continue; |
288 |
} |
289 |
/* if (cdx->var > top->cod) variant */ |
290 |
/* else map: */ |
291 |
cv.u = cdx->cv[code]; |
292 |
if ((len = (0xf & cv.b.hi)<<mapcodeshift)) { /* len = #codes */ |
293 |
/* mapped variants TODO */ |
294 |
if ( 0 > (bitsleft -= (len << 3)) ) |
295 |
break; |
296 |
memcpy(key->byt+key->len, ~3&len ? base+(0xffffff&cv.u) : cv.b.c, len); |
297 |
key->len += len; |
298 |
} |
299 |
} |
300 |
if (/*words &&*/ key->len == pfxlen) /* if pfxlen: found no words */ |
301 |
key->len = 0; |
302 |
|
303 |
return b+l-e; |
304 |
} /* cEnc */ |
305 |
|
306 |
|
307 |
/*
	decode key to cleartext bytes in b, which has room for l bytes.
	Every key code is replaced by the byte sequence of its code value.
	Decoding stops when the key is exhausted, when fewer than CDX_MAXSEQ
	bytes of room are left, or at a code the cdx does not know.
	return #bytes written to b.
*/
int cDec ( const Cdx *cdx, unsigned char *b, int l, Key *key )
{
	const unsigned char * const base = (unsigned char *)cdx;
	const unsigned char *k = key->byt;
	unsigned char * const start = b;
	/*
		keep CDX_MAXSEQ bytes of headroom; counted as int so that no
		pointer in front of the buffer is formed when l < CDX_MAXSEQ.
	*/
	const int limit = l - CDX_MAXSEQ;
	const int bits = 1+(0xf&cdx->typ);
	int bitsleft = 8*key->len;
	Cv cv;

	for ( ; b - start < limit && bits <= bitsleft; bitsleft -= bits ) {
		unsigned short v = *k++;
		if ( 16 == bits ) /* two byte codes are stored bigendian */
			v = v<<8 | *k++;
		if ( v >= cdx->map ) /* not a code of this cdx: corrupt key */
			break;
		cv.u = cdx->cv[v];
		switch ( cv.b.hi &= 0xf ) { /* discard word bit */
		case 1: *b++ = cv.b.c[0]; continue;
		case 2: *b++ = cv.b.c[0]; *b++ = cv.b.c[1]; continue;
		case 3: *b++ = cv.b.c[0]; *b++ = cv.b.c[1]; *b++ = cv.b.c[2]; continue;
		}
		/* length 0 or more than 3: bytes are stored at offset u&0xffffff */
		memcpy(b, base+(cv.u&0xffffff), cv.b.hi);
		b += cv.b.hi;
	}
	return (int)(b - start);
} /* cDec */
330 |
|
331 |
|
332 |
|
333 |
typedef struct { |
334 |
Cdx cdx; |
335 |
unsigned cvb[0x10000-CVLAT1]; |
336 |
Var *vi, vib[0x10000]; |
337 |
Tab tp[1100]; |
338 |
Bins *bt, btb[1100]; |
339 |
unsigned char *p, b[200000]; |
340 |
} CdxMake; |
341 |
|
342 |
|
343 |
/* make byte table entries */ |
344 |
static int mapSeq (CdxMake *mk, unsigned char *p, int len, |
345 |
unsigned short code, int save) |
346 |
{ |
347 |
Bins *bt = &mk->cdx.bt0; |
348 |
LOG_DBG(LOG_DEBUG, "map '%.*s' -> %d", len, p, code); |
349 |
if (save) { |
350 |
Cv *cv = (Cv*)mk->cdx.cv+code; |
351 |
if (15 < len) { |
352 |
eRr(ERR_INVAL, "sequence '%.*s' too long %d", len, p, len); |
353 |
return 1; |
354 |
} |
355 |
cv->b.hi = (0x80&save) | len; |
356 |
if (!(~3 & len)) |
357 |
memcpy(cv->b.c, p, len); |
358 |
else { |
359 |
cv->u |= mk->p - mk->b; |
360 |
memcpy(mk->p, p, len); |
361 |
mk->p += len; |
362 |
} |
363 |
} |
364 |
for (;; p++) { |
365 |
Bin *b = *bt+*p; |
366 |
if (!--len) { /* last byte -- assign code */ |
367 |
if (b->cod) { |
368 |
eRr(ERR_INVAL, "attempt to reassign code %d for %d", code, b->cod); |
369 |
return 1; |
370 |
} |
371 |
b->cod = code; |
372 |
return 0; |
373 |
} |
374 |
if (b->tab) |
375 |
bt = mk->btb + (b->tab - 1); |
376 |
else { |
377 |
bt = mk->bt++; |
378 |
b->tab = mk->bt - mk->btb; |
379 |
LOG_DBG(LOG_DEBUG, "new table %d", b->tab); |
380 |
} |
381 |
} |
382 |
return 0; |
383 |
} |
384 |
|
385 |
/** compile collation src */ |
386 |
static Cdx *cMake ( const Fld *src ) |
387 |
{ |
388 |
const Fld *eof = REND(src), *f; |
389 |
CdxMake mk; |
390 |
int skipalias = 1, mapcodeshift; |
391 |
unsigned short firstcode = 1; /* of last row */ |
392 |
unsigned bins=0, off, u; |
393 |
Cdx *cdx; |
394 |
|
395 |
LOG_DBG(LOG_DEBUG, "cMake %d", sizeof mk); |
396 |
memset(&mk, 0, sizeof mk); |
397 |
mk.vi = mk.vib; |
398 |
mk.bt = mk.btb; |
399 |
mk.p = mk.b; |
400 |
mk.cdx.cv[0] = 0x80000000; /* 0: unassigned */ |
401 |
mk.cdx.cv[1] = 0x81000009; /* 1: tab */ |
402 |
mk.cdx.pri = 2; |
403 |
|
404 |
/* first pass: gather primaries and aliases */ |
405 |
for (f = src; ++f<eof; ) if (MET_COL == f->tag) { |
406 |
unsigned char *p = (unsigned char*)f->val, *e, *t, save; |
407 |
unsigned short code; |
408 |
|
409 |
if (3 > f->len || TAB != p[1]) continue; |
410 |
e = p + f->len; |
411 |
switch (*p) { |
412 |
case 'A': if (skipalias) continue; save = 0; goto alias; |
413 |
case 'W': save = 1; break; |
414 |
case 'N': save = 0x80; break; |
415 |
case 'S': |
416 |
case 'T': |
417 |
skipalias = 1; |
418 |
default: continue; |
419 |
} |
420 |
firstcode = mk.cdx.pri; |
421 |
skipalias = 0; |
422 |
alias: |
423 |
code = firstcode; |
424 |
for (p+=2; e > p; p = t+1) { |
425 |
if (!(t = memchr(p, TAB, e-p))) t = e; |
426 |
if (t > p && mapSeq(&mk, p, t-p, code, save)) return 0; |
427 |
if (save || code < mk.cdx.pri-1) code++; |
428 |
} |
429 |
if (save) mk.cdx.pri = code; |
430 |
} |
431 |
mk.cdx.typ = TYP_PLAIN|(256 < mk.cdx.pri ? 15 : 7); |
432 |
|
433 |
mk.cdx.var = mk.cdx.pri; /* second pass: gather variants */ |
434 |
for (f = src; ++f<eof; ) if (MET_COL == f->tag) { |
435 |
unsigned char *p = (unsigned char*)f->val; |
436 |
|
437 |
if (3 > f->len || TAB != p[1]) continue; |
438 |
switch (*p) { |
439 |
case 'S': |
440 |
case 'T': |
441 |
eRr(ERR_INVAL, "this version does not support multilevel"); |
442 |
return 0; |
443 |
} |
444 |
} |
445 |
if (mk.cdx.var > mk.cdx.pri) |
446 |
mk.cdx.typ |= BIT_VARIANTS; |
447 |
|
448 |
mk.cdx.map = mk.cdx.var; /* third pass: resolve maps */ |
449 |
mapcodeshift = 256<mk.cdx.var ? 1 : 0; |
450 |
for (f = src; ++f<eof; ) if (MET_COL == f->tag) { |
451 |
unsigned short code, codes[CDX_MAXSEQ]; |
452 |
unsigned char ccodes[CDX_MAXSEQ], *pcodes; |
453 |
unsigned char *p = (unsigned char*)f->val, *e, *t; |
454 |
unsigned short n = 0; |
455 |
Bin seq[CDX_MAXSEQ], *bin; /* current sequence */ |
456 |
Cv *cv; |
457 |
|
458 |
if (3 > f->len || 'M' != p[0] || TAB != p[1]) continue; |
459 |
e = p + f->len; |
460 |
p += 2; |
461 |
if (!(t = memchr(p, TAB, e-p))) continue; |
462 |
|
463 |
for ( ; p<t && n<CDX_MAXSEQ; p++ ) { /* get n codes for map */ |
464 |
for (*(bin = seq) = mk.cdx.bt0[*p]; bin->tab && p < t;) { |
465 |
/* slightly simpler than in cEnc, since we have full tables */ |
466 |
int *ent = (int*)mk.btb[bin->tab-1]; |
467 |
if (!ent[p[1]]) break; /* unassigned */ |
468 |
*++bin = *(Bin*)(ent + *++p); |
469 |
if (seq+CDX_MAXSEQ == bin) break; |
470 |
} |
471 |
for (; bin>seq && (!bin->cod || mk.cdx.var<=bin->cod); bin--) |
472 |
p--; |
473 |
if (mk.cdx.var>bin->cod) /* ignore map codes */ |
474 |
codes[n++] = bin->cod ? bin->cod : 1; |
475 |
} |
476 |
p = t+1; |
477 |
|
478 |
/* got n codes ... */ |
479 |
if (1 == n) /* single code ~ alias */ |
480 |
code = codes[0]; |
481 |
else { /* make entry cv[mk.cdx.map++] (even for n=0) */ |
482 |
if (mapcodeshift) { |
483 |
pcodes = (unsigned char*)codes; /* native */ |
484 |
#ifndef CPU_BIG_ENDIAN |
485 |
{ unsigned short *us = codes+n; |
486 |
while (us-- > codes) *us = (*us>>8) | (*us<<8); /* mk be */ |
487 |
} |
488 |
#endif |
489 |
} else { |
490 |
for (u=n; u--;) ccodes[u] = codes[u]; |
491 |
pcodes = ccodes; |
492 |
} |
493 |
cv = (Cv*)mk.cdx.cv + mk.cdx.map; |
494 |
cv->b.hi = n; |
495 |
if (n && 0x80000000&mk.cdx.cv[codes[0]]) /* inherit from 1st code */ |
496 |
cv->b.hi |= 0x80; |
497 |
n <<= mapcodeshift; |
498 |
if (!(~3 & n)) |
499 |
memcpy(cv->b.c, pcodes, n); |
500 |
else { |
501 |
cv->u |= mk.p - mk.b; |
502 |
memcpy(mk.p, pcodes, n); |
503 |
mk.p += n; |
504 |
} |
505 |
code = mk.cdx.map++; |
506 |
} |
507 |
for (; e > p; p = t+1) { /* map 'em all to mk.cdx.map */ |
508 |
if (!(t = memchr(p, TAB, e-p))) t = e; |
509 |
if (t > p && mapSeq(&mk, p, t-p, code, 0)) return 0; |
510 |
} |
511 |
} |
512 |
/* finish: compact */ |
513 |
mk.cdx.tab = mk.bt - mk.btb; |
514 |
mk.cdx.otp = |
515 |
mk.cdx.ovi = (((char *)&((Cdx*)0)->cv) - (char*)0) /* offsetoff(cv) */ |
516 |
+ mk.cdx.map*sizeof (unsigned); |
517 |
if (mk.cdx.var > mk.cdx.pri) |
518 |
mk.cdx.otp += mk.cdx.var * sizeof (Var); |
519 |
off = mk.cdx.otp + mk.cdx.tab * sizeof (Tab); |
520 |
for (u=0; u<mk.cdx.tab; u++) /* compact tables */ { |
521 |
int *base = (int *)(mk.btb + u), *pi = base; |
522 |
mk.tp[u].off = off; |
523 |
while (!*pi) pi++; |
524 |
LOG_DBG(LOG_DEBUG, "found code %x", *pi); |
525 |
mk.tp[u].min = pi - base; |
526 |
for (pi = base+256; !*--pi; ) ; |
527 |
mk.tp[u].max = pi - base; |
528 |
bins += 1+mk.tp[u].max-mk.tp[u].min; |
529 |
off += (1+mk.tp[u].max-mk.tp[u].min)*sizeof (Bin); |
530 |
LOG_DBG(LOG_DEBUG, "table %d %d-%d", u+1, mk.tp[u].min, mk.tp[u].max); |
531 |
} |
532 |
mk.cdx.siz = off + (mk.p - mk.b); |
533 |
eRr(LOG_INFO, |
534 |
"%d primaries %d variants %d maps %d tables %d bins, off %d size %d", |
535 |
mk.cdx.pri, mk.cdx.var-mk.cdx.pri, mk.cdx.map-mk.cdx.var, mk.cdx.tab, |
536 |
bins, off, mk.cdx.siz); |
537 |
/* adjust indirect CVs by off */ { |
538 |
unsigned i = 0, mask = 0x0c000000; |
539 |
for (; i < mk.cdx.var; i++) |
540 |
if (0x0c000000 & mk.cdx.cv[i]) |
541 |
mk.cdx.cv[i] += off; |
542 |
if (mapcodeshift) mask |= 0x02000000; /* only 1 code fits */ |
543 |
for (; i < mk.cdx.map; i++) |
544 |
if (mask & mk.cdx.cv[i]) |
545 |
mk.cdx.cv[i] += off; |
546 |
} |
547 |
cdx = (Cdx*)mAlloc(mk.cdx.siz); |
548 |
memcpy(cdx, &mk.cdx, mk.cdx.ovi); |
549 |
if (cdx->var > cdx->pri) |
550 |
memcpy((char*)cdx + cdx->ovi, mk.vib, mk.cdx.var*sizeof (Var)); |
551 |
if (cdx->tab) { |
552 |
memcpy((char*)cdx + cdx->otp, mk.tp, mk.cdx.tab*sizeof (Tab)); |
553 |
for (u=0; u<mk.cdx.tab; u++) |
554 |
memcpy((char*)cdx + mk.tp[u].off, |
555 |
mk.btb[u] + mk.tp[u].min, |
556 |
(1+mk.tp[u].max-mk.tp[u].min)*sizeof (Bin)); |
557 |
} |
558 |
memcpy((char*)cdx + off, mk.b, mk.p - mk.b); |
559 |
cdx->otp -= sizeof (Tab); /* adjust 0 based */ |
560 |
|
561 |
return cdx; |
562 |
} /* cMake */ |
563 |
|
564 |
|
565 |
|
566 |
/* |
567 |
list of open shared cdx |
568 |
*/ |
569 |
typedef struct Foo Foo; |
570 |
|
571 |
struct Foo { |
572 |
char nln; /* namelen */ |
573 |
char nam[31]; |
574 |
Foo *foo; |
575 |
int ref; |
576 |
Cdx *cdx; |
577 |
FMap fm; |
578 |
}; |
579 |
#if 0 |
580 |
static Foo lat1foo = { 7, "Latin-1", 0, 1, (Cdx*)&lat1cdx }; |
581 |
static Foo *fools = &lat1foo; |
582 |
#endif |
583 |
static Foo *fools = 0; |
584 |
|
585 |
|
586 |
|
587 |
/** open or compile collation src */ |
588 |
const Cdx *cOpen (const Fld *src) |
589 |
{ |
590 |
char *nam = 0, *p; |
591 |
int nln = 0, coldef = 0; |
592 |
Foo *foo = fools; |
593 |
Cdx *cdx = 0; |
594 |
const Fld *eof, *f; |
595 |
char fname[20]; |
596 |
FMap fm; |
597 |
|
598 |
if (src) |
599 |
for (f = src, eof = REND(src); ++f < eof; ) |
600 |
if (MET_COL == f->tag) { |
601 |
coldef = 1; |
602 |
if (2 < f->len && 'C' == f->val[0] |
603 |
&& TAB == f->val[1] && TAB != f->val[2] |
604 |
) { /* named */ |
605 |
nam = f->val+2; |
606 |
nln = f->len-2; |
607 |
if ((p = memchr(nam, TAB, nln))) nln = p-nam; |
608 |
if (nln > 15) nln = 15; |
609 |
LOG_DBG(LOG_DEBUG, "collation name '%.*s'", nln, nam); |
610 |
for (; foo; foo = foo->foo) |
611 |
if (nln == foo->nln && !memcmp(nam, foo->nam, nln)) { /* got it */ |
612 |
foo->ref++; /* ref it */ |
613 |
return foo->cdx; /* ret it */ |
614 |
} |
615 |
/* TODO: try to map -- check type */ |
616 |
memset(fname, 0, sizeof fname); |
617 |
memcpy(fname, nam, nln); |
618 |
memcpy(fname+nln, ".mcx", 5); |
619 |
memset(&fm, 0, sizeof fm); |
620 |
fm.fil = FIL_NONE; |
621 |
if (!fMOpen(&fm, fname, FIL_RD)) { |
622 |
int size = fSize(fm.fil); |
623 |
fm.lim = (size + env.psz-1)>>env.psh; |
624 |
if ( (int)fm.lim != fMap(&fm, fm.lim) |
625 |
|| memcmp(MAGIC, fm.map, 3) |
626 |
|| size != (int)((Cdx*)fm.map)->siz |
627 |
) { |
628 |
eRr(ERR_TRASH, "bad coll file '%s'", fname); |
629 |
fMClose(&fm); |
630 |
} else { |
631 |
cdx = (Cdx*)fm.map; |
632 |
eRr(LOG_VERBOSE, "mapped coll '%s' %d bytes", fname, size); |
633 |
#ifndef WIN32 |
634 |
fClose(&fm.fil); /* don't need the file open */ |
635 |
#endif |
636 |
} |
637 |
} |
638 |
} |
639 |
} |
640 |
if (!cdx && (!coldef || !(cdx = cMake(src)))) |
641 |
return 0; |
642 |
if (nam) { /* foo an mmaped or named made */ |
643 |
Foo *newfoo = mAlloc(sizeof *newfoo); |
644 |
memcpy(newfoo->nam, nam, newfoo->nln = nln); |
645 |
newfoo->foo = fools; |
646 |
newfoo->ref = 1; |
647 |
newfoo->cdx = cdx; |
648 |
if (fm.map) /* was mapped */ |
649 |
newfoo->fm = fm; |
650 |
else { /* try to write */ |
651 |
file fil = FIL_NONE; |
652 |
if (!fOpen(&fil, fname, FIL_WR|FIL_CREAT|FIL_TRUNC)) { |
653 |
memcpy(cdx->mag, MAGIC, 3); |
654 |
fWrite(&fil, cdx, cdx->siz); |
655 |
fClose(&fil); |
656 |
eRr(LOG_INFO, "saved coll '%s' %d bytes", fname, cdx->siz); |
657 |
} |
658 |
memcpy(cdx->mag, "nam", 3); /* yet we keep using our selfmade copy */ |
659 |
} |
660 |
fools = newfoo; |
661 |
} |
662 |
return cdx; |
663 |
} /* cOpen */ |
664 |
|
665 |
|
666 |
/*
	release a cdx obtained from cOpen.
	What is done depends on the first magic byte:
	- 0: anonymous in-memory cdx, freed.
	- 's': static cdx, left alone.
	- MAGIC or 'n': named cdx, shared by the list of open cdx; its
		reference count is dropped and with the last reference the cdx
		is freed ('n') or unmapped, and its list entry removed.
	A null cdx (cOpen returns 0 on failure) is ignored.
*/
void cClose (Cdx *cdx)
{
	Foo *foo, *f = 0; /* f trails foo as its predecessor in the list */
	if (!cdx)
		return;
	switch (cdx->mag[0]) {
	case 0: /* internal anonymous */
		mFree(cdx);
		/* fallthrough */
	case 's': /* internal static */
		return;
#ifdef CPU_BIG_ENDIAN
	case 'M':
#else
	case 'm': /* mapped - always named */
#endif
	case 'n': /* internal named */
		for (foo = fools; foo; foo = (f = foo)->foo)
			if (foo->cdx == cdx) {
				if (!--foo->ref) {
					if ('n' == cdx->mag[0])
						mFree(cdx);
					else
						fMClose(&foo->fm);
					if (f) /* unlink */
						f->foo = foo->foo;
					else
						fools = foo->foo;
					mFree(foo);
				}
				return;
			}
		/* panic time ? */
	}
} /* cClose */