/*
	The Malete project - the Z39.2/Z39.50 database framework of OpenIsis.
	Version 0.9.x (patchlevel see file Version)
	Copyright (C) 2001-2003 by Erik Grziwotz, erik@openisis.org

	This library is free software; you can redistribute it and/or
	modify it under the terms of the GNU Lesser General Public
	License as published by the Free Software Foundation; either
	version 2.1 of the License, or (at your option) any later version.

	This library is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	See the GNU Lesser General Public License for more details.

	You should have received a copy of the GNU Lesser General Public
	License along with this library; if not, write to the Free Software
	Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

	see README for more information
EOH */

/*
	$Id: utf8stat.c,v 1.2 2004/11/03 17:51:42 kripke Exp $
	count usage of characters in UTF-8 stream
*/

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/* number of occurrences seen so far, indexed by code point 0000-FFFF */
static unsigned cnt[1 << 16];
/*
	input buffer: up to 0x2000 bytes per read, plus room for the 2 bytes
	of an incomplete sequence carried over from the previous read
*/
static unsigned char buf[8192 + 2];

34 |
/* http://www.ietf.org/rfc/rfc2279.txt */ |
35 |
int main (int argc, char **argv) |
36 |
{ |
37 |
char *b = buf; |
38 |
int whoops=0, i; |
39 |
|
40 |
while (0 < (i = read(0, b, 0x2000))) { |
41 |
register const unsigned char *e = buf+i; |
42 |
register unsigned char *p = b = buf; |
43 |
|
44 |
while (p < e) { |
45 |
if (!(0x80 & *p)) { /* high bit not set: single 7 bit 0xxxxxxx */ |
46 |
cnt[*p++]++; |
47 |
continue; |
48 |
} |
49 |
/* high bit is set */ |
50 |
if (!(0x40 & *p)) { /* 2nd bit not set: dangling follower 10xxxxxx */ |
51 |
whoops++; |
52 |
p++; |
53 |
continue; |
54 |
} |
55 |
/* we start 11... */ |
56 |
if (!(0x20 & *p)) { /* double 5+6 110xxxxx 10xxxxxx */ |
57 |
register unsigned char c = *p; /* save */ |
58 |
if (e != ++p) { |
59 |
cnt[((0x1f & c)<<6) | (0x3f & *p++)]++; |
60 |
continue; |
61 |
} |
62 |
/* have trailing lead byte */ |
63 |
buf[0] = c; |
64 |
b = buf+1; |
65 |
break; /* reload */ |
66 |
} |
67 |
/* we start 111... */ |
68 |
if (!(0x10 & *p)) { /* triple 1110xxxx 10xxxxxx 10xxxxxx */ |
69 |
if (e > p+2) { |
70 |
cnt[((0xf & p[0])<<12) | ((0x3f & p[1])<<6) | (0x3f & p[2])]++; |
71 |
p += 3; |
72 |
continue; |
73 |
} |
74 |
/* save 1 or 2 */ |
75 |
buf[0] = *p; |
76 |
b = buf+1; |
77 |
if (e != ++p) *b++ = *p; |
78 |
break; /* reload */ |
79 |
} |
80 |
printf("byte %x\n", *p); |
81 |
goto break2; |
82 |
} |
83 |
} |
84 |
break2: |
85 |
for (i=0; i<0x10000; i++) |
86 |
if (cnt[i]) { |
87 |
int l = 1; |
88 |
if (!(0xff80 & i)) { /* 0000-007F */ |
89 |
buf[0] = 0xe0&i ? i : '?'/*C0 controls*/; |
90 |
} else if (!(0xf800 & i)) { /* 0080-07FF */ |
91 |
buf[0] = 0xc0 | (i>>6); |
92 |
buf[1] = 0x80 | (0x3f&i); |
93 |
l = 2; |
94 |
} else { /* 0800-FFFF */ |
95 |
buf[0] = 0xe0 | (i>>12); |
96 |
buf[1] = 0x80 | (0x3f&(i>>6)); |
97 |
buf[2] = 0x80 | (0x3f&i); |
98 |
l = 3; |
99 |
} |
100 |
printf("%04X\t'%.*s'\t%d\n", i, l, buf, cnt[i]); |
101 |
} |
102 |
if (whoops) |
103 |
printf("whoops %d", whoops); |
104 |
return 0; |
105 |
} |