1 |
dpavlin |
604 |
/* |
2 |
|
|
The Malete project - the Z39.2/Z39.50 database framework of OpenIsis. |
3 |
|
|
Version 0.9.x (patchlevel see file Version) |
4 |
|
|
Copyright (C) 2001-2003 by Erik Grziwotz, erik@openisis.org |
5 |
|
|
|
6 |
|
|
This library is free software; you can redistribute it and/or |
7 |
|
|
modify it under the terms of the GNU Lesser General Public |
8 |
|
|
License as published by the Free Software Foundation; either |
9 |
|
|
version 2.1 of the License, or (at your option) any later version. |
10 |
|
|
|
11 |
|
|
This library is distributed in the hope that it will be useful, |
12 |
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 |
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
14 |
|
|
See the GNU Lesser General Public License for more details. |
15 |
|
|
|
16 |
|
|
You should have received a copy of the GNU Lesser General Public |
17 |
|
|
License along with this library; if not, write to the Free Software |
18 |
|
|
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
19 |
|
|
|
20 |
|
|
see README for more information |
21 |
|
|
EOH */ |
22 |
|
|
|
23 |
|
|
/* |
24 |
|
|
$Id: utf8stat.c,v 1.2 2004/11/03 17:51:42 kripke Exp $ |
25 |
|
|
count usage of characters in UTF-8 stream |
26 |
|
|
*/ |
27 |
|
|
|
28 |
|
|
#include <unistd.h> |
29 |
|
|
#include <stdio.h> |
30 |
|
|
|
31 |
|
|
/*
	Sizes shared by the tables below and by main().
*/
enum {
	NCODES = 0x10000, /* code points 0000-FFFF, i.e. 1 to 3 byte sequences */
	CHUNK  = 0x2000,  /* max number of bytes requested per read() */
	CARRY  = 2        /* max bytes of a split sequence kept across reads */
};

/* cnt[c] is the number of times code point c was seen in the input */
static unsigned int cnt[NCODES];
/*
	input buffer: up to CARRY bytes of an incomplete sequence saved from the
	previous chunk, followed by up to CHUNK freshly read bytes
*/
static unsigned char buf[CHUNK+CARRY];

/*
	Count the usage of characters in the UTF-8 stream on stdin and print one
	line "XXXX<TAB>'char'<TAB>count" per code point that occurred.

	http://www.ietf.org/rfc/rfc2279.txt (obsoleted by RFC 3629)

	Only 1, 2 and 3 byte sequences (0000-FFFF) are decoded.
	A lead byte of a longer sequence is reported as "byte xx" and stops
	reading; the statistics gathered so far are still printed.
	Continuation bytes inside a sequence are not validated, only masked.
	Stray continuation bytes and a sequence cut off by the end of input
	are summed up in the final "whoops" count.

	argc and argv are not used.
	Returns 0, or 1 if read() failed.
*/
int main (int argc, char **argv)
{
	unsigned char *fill = buf; /* where the next read() stores its data */
	int whoops = 0;
	int status = 0;
	unsigned int code;
	ssize_t got;

	(void)argc;
	(void)argv;

	while (0 < (got = read(0, fill, CHUNK))) {
		/*
			The valid data ends 'got' bytes after the fill position, not
			after the buffer start: bytes carried over from the previous
			chunk sit in front of what was just read.
		*/
		const unsigned char *end = fill + got;
		const unsigned char *p = buf;

		fill = buf; /* nothing carried over unless a sequence is split below */

		while (p < end) {
			const unsigned char lead = *p;

			if (!(0x80 & lead)) { /* high bit not set: single 7 bit 0xxxxxxx */
				cnt[lead]++;
				p++;
				continue;
			}
			/* high bit is set */
			if (!(0x40 & lead)) { /* 2nd bit not set: dangling follower 10xxxxxx */
				whoops++;
				p++;
				continue;
			}
			/* we start 11... */
			if (!(0x20 & lead)) { /* double 5+6 110xxxxx 10xxxxxx */
				if (end - p >= 2) {
					cnt[((0x1f & lead)<<6) | (0x3f & p[1])]++;
					p += 2;
					continue;
				}
				/* have trailing lead byte */
				buf[0] = lead;
				fill = buf+1;
				break; /* reload */
			}
			/* we start 111... */
			if (!(0x10 & lead)) { /* triple 1110xxxx 10xxxxxx 10xxxxxx */
				if (end - p >= 3) {
					cnt[((0xf & lead)<<12) | ((0x3f & p[1])<<6) | (0x3f & p[2])]++;
					p += 3;
					continue;
				}
				/* save 1 or 2; copying front to back is safe as p >= buf */
				buf[0] = lead;
				fill = buf+1;
				if (end - p > 1)
					*fill++ = p[1];
				break; /* reload */
			}
			/* 1111....: sequence of 4 or more bytes, beyond the table */
			printf("byte %x\n", lead);
			goto report;
		}
	}
	if (got < 0) {
		perror("read");
		status = 1;
	}
	if (fill != buf) /* input ended in the middle of a sequence */
		whoops++;

report:
	for (code = 0; code < NCODES; code++) {
		unsigned char enc[3]; /* the code point re-encoded as UTF-8 */
		int len = 1;

		if (!cnt[code])
			continue;
		if (!(0xff80 & code)) { /* 0000-007F */
			enc[0] = (0xe0 & code) ? (unsigned char)code : '?'/*C0 controls*/;
		} else if (!(0xf800 & code)) { /* 0080-07FF */
			enc[0] = (unsigned char)(0xc0 | (code>>6));
			enc[1] = (unsigned char)(0x80 | (0x3f & code));
			len = 2;
		} else { /* 0800-FFFF */
			enc[0] = (unsigned char)(0xe0 | (code>>12));
			enc[1] = (unsigned char)(0x80 | (0x3f & (code>>6)));
			enc[2] = (unsigned char)(0x80 | (0x3f & code));
			len = 3;
		}
		printf("%04X\t'%.*s'\t%u\n", code, len, (const char *)enc, cnt[code]);
	}
	if (whoops)
		printf("whoops %d", whoops);
	return status;
}