/[webpac]/trunk2/openisis/unistat
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk2/openisis/unistat

Parent Directory Parent Directory | Revision Log Revision Log


Revision 337 - (show annotations)
Thu Jun 10 19:22:40 2004 UTC (15 years, 7 months ago) by dpavlin
File size: 3521 byte(s)
new trunk for webpac v2

1 #!/usr/bin/perl
2
3 print <<EOF;
4 unicode block statistics of general character categories,
5 decomposition and uppercase mappings based on
6 Blocks.txt and UnicodeData.txt
7 > http://unicode.org/Public/UNIDATA/
8
9
10 EOF
11
12
13 $dir = $ARGV[0] || '.';
14 open UDATA, $dir.'/UnicodeData.txt';
15 open BLOCKS, $dir.'/Blocks.txt';
16
17 $end = -1;
18 $gap = 0;
19 # get next block
20 sub nblock {
21 while (<BLOCKS>) {
22 last unless /^#/;
23 }
24 chomp;
25 $lend = $end;
26 ($beg,$end,$block) = /([0-9A-Z]*)..([0-9A-Z]*); (.*)/;
27 $h = {};
28 $h->{beg} = $beg;
29 $h->{end} = $end;
30 $beg = hex $beg;
31 $end = hex $end;
32 $gap += $beg - 1 - $lend if 65536 > $beg;
33 $h->{len} = $end - $beg + 1;
34 $h->{nam} = $block;
35 # print "$beg..$end($h->{len}): $block\n";
36 push @blocks, $h;
37 }
38
39 nblock;
40
41
42 # UnicodeData.txt:
43 # 0 number (hex)
44 # 1 name S
45 # 2 general category E
46 # 3 canonical combining class N
47 # 4 bidi class E
48 # 5 decomposition <type> mapping <E>S
49 # 6-8 numeric type and value E/N
50 # 9 bidi_mirrored B
51 # 10 old name S
52 # 11 comment S
53 # 12-14 upper/lower/titlecase mapping S
54 while (<UDATA>) {
55 @l = split /;/;
56 # last if '10000' eq $l[0]; # BMP only
57 $num = hex $l[0];
58 while ($num > $end) { nblock; }
59 die "$_ not in any block!" if $num < $beg;
60 # <CJK Ideograph, First>
61 if ( $l[1] =~ /, First>$/ ) {
62 $n = <UDATA>;
63 ($e) = split /;/,$n;
64 $e = hex($e);
65 # print stderr "$l[1]..Last: $num .. $e\n";
66 die "end $e of $l[1] ($num) not in block $h->{nam}" if $e > $end;
67 $h->{$l[2]} += $e - $num +1;
68 next;
69 }
70 # gen cat
71 $h->{$l[2]}++;
72 # decomp
73 if ($l[5]) {
74 ($type) = ($l[5] =~ /<(.*)>/);
75 $deco{$type||'(canonical)'}++;
76 if ( ! $type ) { # decomp Canon
77 $h->{'dC'}++;
78 }
79 }
80 # has Upper
81 if ($l[12]) {
82 $h->{'uC'}++;
83 }
84 }
85
86
87 print <<EOF;
88 *\tdecomposition mappings
89 see
90 > http://unicode.org/Public/UNIDATA/UCD.html#Character_Decomposition_Mappings
91
92 \$
93 EOF
94 for (sort keys %deco) { print join("\t",$_,$deco{$_}),"\n"; }
95 print "\$\n\n\n";
96
97
98 # 30 general categories + 2 specials
99 @cat = (
100 'Cn','Lu','Ll','Lt','Lm','Lo','Mn','Me','Mc','Nd','Nl','No','Zs','Zl','Zp',
101 'Cc','Cf','Co','Cs','Pc','Pd','Ps','Pe','Pi','Pf','Po','Sm','Sc','Sk','So'
102 );
103 @add = ( 'uC','dC' );
104
105
106 # table major categories to blocks
107 print "\n*\tmajor category/block table\n";
108 print <<EOF;
109 Categories are letter, mark, numeric, punctuation, symbol, separator and other.
110 Additional columns give number of characters which have an uppercase and
111 canonical decomposition mapping, resp.
112 Final columns give begin and end, block length and name.
113
114 \$
115 EOF
116 # headers
117 @mcat = ('L','M','N','P','S','Z','C');
118 print join("\t",
119 'Let','Mar','Num','Pun','Sym','Sep','Oth',
120 'upC', 'deC',
121 'beg','end','len','block'
122 ),"\n";
123 for $h (@blocks) {
124 $ass = 0;
125 %maj = ();
126 for (@cat) {
127 $ass += $h->{$_};
128 $tot{$_} += $h->{$_};
129 $maj{substr($_,0,1)} += $h->{$_};
130 }
131 $maj{'C'} += $h->{'Cn'} = $h->{len} - $ass; # unassigned
132 $Cn += $h->{'Cn'} if 65536 > hex($h->{beg}); # in BMP
133 for (@mcat) {
134 $tot{$_} += $maj{$_};
135 print $maj{$_},"\t";
136 }
137 for (@add) {
138 $tot{$_} += $h->{$_};
139 print $h->{$_}||'0',"\t";
140 }
141 print join("\t",$h->{beg},$h->{end},$h->{len},$h->{nam}),"\n";
142 }
143 for (@mcat,@add) { print $tot{$_},"\t"; } print "\n";
144 print "\$\n";
145 print "BMP: nonblock $gap unassigned $Cn\n\n\n";
146
147 # list blocks to categories
148 print <<EOF;
149 *\tdetailled block stats
150 see
151 > http://unicode.org/Public/UNIDATA/UCD.html#General_Category_Values
152
153 \$
154 EOF
155 for $h (@blocks) {
156 print join("\t",$h->{nam},'b'.$h->{beg},'l'.$h->{len}),"\t";
157 for (@cat) {
158 if ($h->{$_}) {
159 print $_,$h->{$_},"\t";
160 }
161 }
162 print "\n";
163 }
164 print "\$\n\n\n";

Properties

Name Value
svn:executable

  ViewVC Help
Powered by ViewVC 1.1.26