/[webpac]/openisis/0.9.9e/uti/utf8stat.c
This is a repository of my old source code, which is no longer updated. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /openisis/0.9.9e/uti/utf8stat.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 604 - (show annotations)
Mon Dec 27 21:49:01 2004 UTC (19 years, 1 month ago) by dpavlin
File MIME type: text/plain
File size: 2846 byte(s)
import of new openisis release, 0.9.9e

1 /*
2 The Malete project - the Z39.2/Z39.50 database framework of OpenIsis.
3 Version 0.9.x (patchlevel see file Version)
4 Copyright (C) 2001-2003 by Erik Grziwotz, erik@openisis.org
5
6 This library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 This library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
14 See the GNU Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with this library; if not, write to the Free Software
18 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
20 see README for more information
21 EOH */
22
23 /*
24 $Id: utf8stat.c,v 1.2 2004/11/03 17:51:42 kripke Exp $
25 count usage of characters in UTF-8 stream
26 */
27
28 #include <unistd.h>
29 #include <stdio.h>
30
/* occurrence counter for every code point U+0000..U+FFFF */
static unsigned int cnt[0x10000];
/*
	input buffer: one read chunk of 0x2000 bytes plus room for up to
	2 bytes of an incomplete sequence carried over from the previous chunk
*/
static unsigned char buf[0x2000+2];

/* http://www.ietf.org/rfc/rfc2279.txt */
/*
	Count how often each character occurs in the UTF-8 stream on stdin.

	Only 1, 2 and 3 byte sequences (U+0000..U+FFFF) are counted.
	A lead byte of the form 1111xxxx stops the scan: the byte is reported
	as "byte <hex>" and the counts gathered so far are printed.
	Follower bytes (10xxxxxx) that show up without a lead byte, and a
	sequence cut off by the end of input, are tallied in "whoops".

	Output: one line "<hex code point> TAB '<char>' TAB <count>" per
	character seen, then the whoops count if it is non-zero.
	Always returns 0; argc and argv are not used.
*/
int main (int argc, char **argv)
{
	unsigned char *b = buf; /* where the next read() stores its data */
	ssize_t got;
	unsigned int cp;
	int whoops = 0;

	(void)argc;
	(void)argv;

	while (0 < (got = read(0, b, 0x2000))) {
		/*
			The valid data ends at b+got, not buf+got: bytes carried
			over from the previous chunk sit in front of b.
		*/
		const unsigned char *e = b + got;
		const unsigned char *p = buf;

		b = buf; /* nothing carried over unless a sequence is split below */
		while (p < e) {
			unsigned char c = *p;

			if (!(0x80 & c)) { /* high bit not set: single 7 bit 0xxxxxxx */
				cnt[c]++;
				p++;
				continue;
			}
			/* high bit is set */
			if (!(0x40 & c)) { /* 2nd bit not set: dangling follower 10xxxxxx */
				whoops++;
				p++;
				continue;
			}
			/* we start 11... */
			if (!(0x20 & c)) { /* double 5+6 110xxxxx 10xxxxxx */
				if (e - p >= 2) {
					cnt[((0x1f & c)<<6) | (0x3f & p[1])]++;
					p += 2;
					continue;
				}
				/* have trailing lead byte: keep it for the next chunk */
				buf[0] = c;
				b = buf+1;
				break; /* reload */
			}
			/* we start 111... */
			if (!(0x10 & c)) { /* triple 1110xxxx 10xxxxxx 10xxxxxx */
				if (e - p >= 3) {
					cnt[((0xf & c)<<12) | ((0x3f & p[1])<<6) | (0x3f & p[2])]++;
					p += 3;
					continue;
				}
				/* save the 1 or 2 bytes we have for the next chunk */
				buf[0] = c;
				b = buf+1;
				if (e - p == 2)
					*b++ = p[1];
				break; /* reload */
			}
			/* 1111xxxx: beyond U+FFFF, cannot be counted in cnt */
			printf("byte %x\n", (unsigned int)c);
			goto break2;
		}
	}
	if (got < 0)
		perror("read");
	if (b != buf) /* input ended in the middle of a multi-byte sequence */
		whoops++;
break2:
	for (cp = 0; cp < 0x10000; cp++) {
		unsigned char enc[3]; /* the character re-encoded as UTF-8 */
		int l = 1;

		if (!cnt[cp])
			continue;
		if (!(0xff80 & cp)) { /* 0000-007F */
			enc[0] = 0xe0&cp ? cp : '?'/*C0 controls*/;
		} else if (!(0xf800 & cp)) { /* 0080-07FF */
			enc[0] = 0xc0 | (cp>>6);
			enc[1] = 0x80 | (0x3f&cp);
			l = 2;
		} else { /* 0800-FFFF */
			enc[0] = 0xe0 | (cp>>12);
			enc[1] = 0x80 | (0x3f&(cp>>6));
			enc[2] = 0x80 | (0x3f&cp);
			l = 3;
		}
		printf("%04X\t'%.*s'\t%u\n", cp, l, (const char *)enc, cnt[cp]);
	}
	if (whoops)
		printf("whoops %d", whoops);
	return 0;
}

  ViewVC Help
Powered by ViewVC 1.1.26