Revision 238 (by dpavlin, 2004/03/08 17:46:16) tagging openisis 0.9.0
#!/usr/bin/perl

print <<EOF;
unicode block statistics of general character categories,
decomposition and uppercase mappings based on
Blocks.txt and UnicodeData.txt
>	http://unicode.org/Public/UNIDATA/


EOF


$dir = $ARGV[0] || '.';
open UDATA, $dir.'/UnicodeData.txt';
open BLOCKS, $dir.'/Blocks.txt';

$end = -1;
$gap = 0;
# get next block
sub nblock {
	while (<BLOCKS>) {
		last unless /^#/;
	}
	chomp;
	$lend = $end;
	($beg,$end,$block) = /([0-9A-Z]*)..([0-9A-Z]*); (.*)/;
	$h = {};
	$h->{beg} = $beg;
	$h->{end} = $end;
	$beg = hex $beg;
	$end = hex $end;
	$gap += $beg - 1 - $lend if 65536 > $beg;
	$h->{len} = $end - $beg + 1;
	$h->{nam} = $block;
	# print "$beg..$end($h->{len}): $block\n";
	push @blocks, $h;
}

nblock;


# UnicodeData.txt:
# 0 number (hex)
# 1 name S
# 2 general category E
# 3 canonical combining class N
# 4 bidi class E
# 5 decomposition <type> mapping <E>S
# 6-8 numeric type and value E/N
# 9 bidi_mirrored B
# 10 old name S
# 11 comment S
# 12-14 upper/lower/titlecase mapping S
while (<UDATA>) {
	@l = split /;/;
	# last if '10000' eq $l[0]; # BMP only
	$num = hex $l[0];
	while ($num > $end) { nblock; }
	die "$_ not in any block!" if $num < $beg;
	# <CJK Ideograph, First>
	if ( $l[1] =~ /, First>$/ ) {
		$n = <UDATA>;
		($e) = split /;/,$n;
		$e = hex($e);
		# print stderr "$l[1]..Last: $num .. $e\n";
		die "end $e of $l[1] ($num) not in block $h->{nam}" if $e > $end;
		$h->{$l[2]} += $e - $num +1;
		next;
	}
	# gen cat
	$h->{$l[2]}++;
	# decomp
	if ($l[5]) {
		($type) = ($l[5] =~ /<(.*)>/);
		$deco{$type||'(canonical)'}++;
		if ( ! $type ) { # decomp Canon
			$h->{'dC'}++;
		}
	}
	# has Upper
	if ($l[12]) {
		$h->{'uC'}++;
	}
}


print <<EOF;
*\tdecomposition mappings
see
>	http://unicode.org/Public/UNIDATA/UCD.html#Character_Decomposition_Mappings

\$
EOF
for (sort keys %deco) { print join("\t",$_,$deco{$_}),"\n"; }
print "\$\n\n\n";


# 30 general categories + 2 specials
@cat = (
	'Cn','Lu','Ll','Lt','Lm','Lo','Mn','Me','Mc','Nd','Nl','No','Zs','Zl','Zp',
	'Cc','Cf','Co','Cs','Pc','Pd','Ps','Pe','Pi','Pf','Po','Sm','Sc','Sk','So'
);
@add = ( 'uC','dC' );


# table major categories to blocks
print "\n*\tmajor category/block table\n";
print <<EOF;
Categories are letter, mark, numeric, punctuation, symbol, separator and other.
Additional columns give number of characters which have an uppercase and
canonical decomposition mapping, resp.
Final columns give begin and end, block length and name. 

\$
EOF
# headers
@mcat = ('L','M','N','P','S','Z','C');
print join("\t",
	'Let','Mar','Num','Pun','Sym','Sep','Oth',
	'upC', 'deC',
	'beg','end','len','block'
),"\n";
for $h (@blocks) {
	$ass = 0;
	%maj = ();
	for (@cat) {
		$ass += $h->{$_};
		$tot{$_} += $h->{$_};
		$maj{substr($_,0,1)} += $h->{$_};
	}
	$maj{'C'} += $h->{'Cn'} = $h->{len} - $ass; # unassigned
	$Cn += $h->{'Cn'} if 65536 > hex($h->{beg}); # in BMP
	for (@mcat) {
		$tot{$_} += $maj{$_};
		print $maj{$_},"\t";
	}
	for (@add) {
		$tot{$_} += $h->{$_};
		print $h->{$_}||'0',"\t";
	}
	print join("\t",$h->{beg},$h->{end},$h->{len},$h->{nam}),"\n";
}
for (@mcat,@add) { print $tot{$_},"\t"; } print "\n";
print "\$\n";
print "BMP: nonblock $gap unassigned $Cn\n\n\n";

# list blocks to categories
print <<EOF;
*\tdetailled block stats
see
>	http://unicode.org/Public/UNIDATA/UCD.html#General_Category_Values

\$
EOF
for $h (@blocks) {
	print join("\t",$h->{nam},'b'.$h->{beg},'l'.$h->{len}),"\t";
	for (@cat) {
		if ($h->{$_}) {
			print $_,$h->{$_},"\t";
		}
	}
	print "\n";
}
print "\$\n\n\n";