/[webpac]/trunk/openisis/unistat
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/openisis/unistat

Parent Directory Parent Directory | Revision Log Revision Log


Revision 237 - (hide annotations)
Mon Mar 8 17:43:12 2004 UTC (17 years, 4 months ago) by dpavlin
Original Path: openisis/current/unistat
File size: 3521 byte(s)
initial import of openisis 0.9.0 vendor drop

1 dpavlin 237 #!/usr/bin/perl
2    
3     print <<EOF;
4     unicode block statistics of general character categories,
5     decomposition and uppercase mappings based on
6     Blocks.txt and UnicodeData.txt
7     > http://unicode.org/Public/UNIDATA/
8    
9    
10     EOF
11    
12    
13     $dir = $ARGV[0] || '.';
14     open UDATA, $dir.'/UnicodeData.txt';
15     open BLOCKS, $dir.'/Blocks.txt';
16    
17     $end = -1;
18     $gap = 0;
19     # get next block
20     sub nblock {
21     while (<BLOCKS>) {
22     last unless /^#/;
23     }
24     chomp;
25     $lend = $end;
26     ($beg,$end,$block) = /([0-9A-Z]*)..([0-9A-Z]*); (.*)/;
27     $h = {};
28     $h->{beg} = $beg;
29     $h->{end} = $end;
30     $beg = hex $beg;
31     $end = hex $end;
32     $gap += $beg - 1 - $lend if 65536 > $beg;
33     $h->{len} = $end - $beg + 1;
34     $h->{nam} = $block;
35     # print "$beg..$end($h->{len}): $block\n";
36     push @blocks, $h;
37     }
38    
39     nblock;
40    
41    
42     # UnicodeData.txt:
43     # 0 number (hex)
44     # 1 name S
45     # 2 general category E
46     # 3 canonical combining class N
47     # 4 bidi class E
48     # 5 decomposition <type> mapping <E>S
49     # 6-8 numeric type and value E/N
50     # 9 bidi_mirrored B
51     # 10 old name S
52     # 11 comment S
53     # 12-14 upper/lower/titlecase mapping S
54     while (<UDATA>) {
55     @l = split /;/;
56     # last if '10000' eq $l[0]; # BMP only
57     $num = hex $l[0];
58     while ($num > $end) { nblock; }
59     die "$_ not in any block!" if $num < $beg;
60     # <CJK Ideograph, First>
61     if ( $l[1] =~ /, First>$/ ) {
62     $n = <UDATA>;
63     ($e) = split /;/,$n;
64     $e = hex($e);
65     # print stderr "$l[1]..Last: $num .. $e\n";
66     die "end $e of $l[1] ($num) not in block $h->{nam}" if $e > $end;
67     $h->{$l[2]} += $e - $num +1;
68     next;
69     }
70     # gen cat
71     $h->{$l[2]}++;
72     # decomp
73     if ($l[5]) {
74     ($type) = ($l[5] =~ /<(.*)>/);
75     $deco{$type||'(canonical)'}++;
76     if ( ! $type ) { # decomp Canon
77     $h->{'dC'}++;
78     }
79     }
80     # has Upper
81     if ($l[12]) {
82     $h->{'uC'}++;
83     }
84     }
85    
86    
87     print <<EOF;
88     *\tdecomposition mappings
89     see
90     > http://unicode.org/Public/UNIDATA/UCD.html#Character_Decomposition_Mappings
91    
92     \$
93     EOF
94     for (sort keys %deco) { print join("\t",$_,$deco{$_}),"\n"; }
95     print "\$\n\n\n";
96    
97    
98     # 30 general categories + 2 specials
99     @cat = (
100     'Cn','Lu','Ll','Lt','Lm','Lo','Mn','Me','Mc','Nd','Nl','No','Zs','Zl','Zp',
101     'Cc','Cf','Co','Cs','Pc','Pd','Ps','Pe','Pi','Pf','Po','Sm','Sc','Sk','So'
102     );
103     @add = ( 'uC','dC' );
104    
105    
106     # table major categories to blocks
107     print "\n*\tmajor category/block table\n";
108     print <<EOF;
109     Categories are letter, mark, numeric, punctuation, symbol, separator and other.
110     Additional columns give number of characters which have an uppercase and
111     canonical decomposition mapping, resp.
112     Final columns give begin and end, block length and name.
113    
114     \$
115     EOF
116     # headers
117     @mcat = ('L','M','N','P','S','Z','C');
118     print join("\t",
119     'Let','Mar','Num','Pun','Sym','Sep','Oth',
120     'upC', 'deC',
121     'beg','end','len','block'
122     ),"\n";
123     for $h (@blocks) {
124     $ass = 0;
125     %maj = ();
126     for (@cat) {
127     $ass += $h->{$_};
128     $tot{$_} += $h->{$_};
129     $maj{substr($_,0,1)} += $h->{$_};
130     }
131     $maj{'C'} += $h->{'Cn'} = $h->{len} - $ass; # unassigned
132     $Cn += $h->{'Cn'} if 65536 > hex($h->{beg}); # in BMP
133     for (@mcat) {
134     $tot{$_} += $maj{$_};
135     print $maj{$_},"\t";
136     }
137     for (@add) {
138     $tot{$_} += $h->{$_};
139     print $h->{$_}||'0',"\t";
140     }
141     print join("\t",$h->{beg},$h->{end},$h->{len},$h->{nam}),"\n";
142     }
143     for (@mcat,@add) { print $tot{$_},"\t"; } print "\n";
144     print "\$\n";
145     print "BMP: nonblock $gap unassigned $Cn\n\n\n";
146    
147     # list blocks to categories
148     print <<EOF;
149     *\tdetailled block stats
150     see
151     > http://unicode.org/Public/UNIDATA/UCD.html#General_Category_Values
152    
153     \$
154     EOF
155     for $h (@blocks) {
156     print join("\t",$h->{nam},'b'.$h->{beg},'l'.$h->{len}),"\t";
157     for (@cat) {
158     if ($h->{$_}) {
159     print $_,$h->{$_},"\t";
160     }
161     }
162     print "\n";
163     }
164     print "\$\n\n\n";

Properties

Name Value
svn:executable

  ViewVC Help
Powered by ViewVC 1.1.26