--- links/csv2js.pl 2007/08/23 11:28:39 2 +++ links/csv2js.pl 2007/08/23 14:08:06 8 @@ -7,7 +7,7 @@ use Text::CSV::Separator qw(get_separator); use Carp qw/confess/; use LWP::Simple; -use Imager; +use Number::Bytes::Human qw/format_bytes/; use JSON::Syck; use Data::Dump qw/dump/; @@ -52,7 +52,7 @@ my $multiplier = { kb => 1024, - mb => 1024 * 1204, + mb => 1024 * 1024, gb => 1024 * 1024 * 1024, }; @@ -72,6 +72,77 @@ return shift @out; } +sub human { + my $s = shift; + + if ( $s =~ m/^\s*(\d+)\s*($multiplier_regex)\s*$/i) { + my ( $v, $m ) = ( $1, lc($2) ); + my $factor = $multiplier->{$m}; + confess "can't find multiplier $m" unless defined $factor; + my $new = format_bytes( $v * $factor, bs => 1024 ); + warn "## [$s] $v * $factor ($m) -> $new\n"; + return $new; + } + return; +} + +sub strip_prefix { + my @data = @_; + my $prefix = shift @data; + + my $p; + + foreach my $d ( @data ) { + my $chomp = length($prefix); + # find end of common string + $chomp-- while( + lc(substr( $prefix, 0, $chomp )) ne lc(substr( $d, 0, $chomp )) + && + $chomp > 0 + ); + if ( $chomp == 0 ) { + warn "no common prefix in ",dump( @_ ); + return @_; + } + + my $prefix = substr( $prefix, 0, $chomp ); + $p->{$prefix}++; + } + warn "prefixes found = ",dump($p); + my @sorted = sort { $p->{$b} <=> $p->{$a} } keys %$p; + my $strip = shift @sorted || return @_; + warn "longest prefix: '$strip' (stripped)\n"; + return map { my $v = $_; $v =~ s/^\Q$strip\E//i; $v; } @_; +} + +sub group_by { + my ( $group, $detail ) = ( shift, shift ); + my $what = shift; + confess "expected CODE as first argument!" unless ref($what) eq 'CODE'; + my @data = @_; + + my $stat; + my @details; + + foreach my $i ( 0 .. $#data ) { + my $v = $data[$i]; + my ( $by, $rest ) = $what->($v); +# warn "## group_by: $i $v -> $by\n"; + push @{ $stat->{$by} }, $i; + $details[$i] = $rest; + } + + warn "group_by '$group' stats = ",dump( $stat ); + + foreach my $g ( keys %$stat ) { + foreach my $r ( @{ $stat->{$g} } ) { + warn "## $group $g $r\n"; + $dump[$r]->{$group} = $g; + $dump[$r]->{$detail} = $details[$r]; + } + } +} + while (<$csv_fh>) { $csv_parser->parse($_); my @fields = $csv_parser->fields; @@ -102,17 +173,13 @@ my $v = clean( $sv[$j] ); - if ( $j == 0 ) { - $h->{ $l . '_short' } = $v; + if ( my $human = human( $v ) ) { + $h->{ $l . '_' . $j . '_human' } = $human; + } else { + $h->{ $l . '_' . $j } = $v; } - if ( $v =~ m/(\d+)\s*($multiplier_regex)/) { - my $new = $1 * $multiplier_regex->{$2}; - warn "## $v -> $new\n"; - $v = $new; - } - - $split_stats->{$v}->{$j}++; + $split_stats->{$v}->{pos}->{$j}++; $split_stats->{$v}->{sum}++; push @{ $split_stats->{$v}->{rec}->{$#dump + 1} }, $j; } @@ -156,7 +223,18 @@ } } -#warn "split_stats = ", dump( $split_stats ), "\n"; +warn "split_stats = ", dump( $split_stats ), "\n"; + +# +# split prefix from label_0 +# +my @all = map { $_->{label_0} || die "no label_0 for ",dump($_) } @dump; +warn "all = ",dump(@all); +my @stripped = strip_prefix( @all ); +$dump[$_]->{label_0} = $stripped[$_] foreach ( 0 .. $#stripped ); + +# group products by manufacturers +group_by( qw/manufacturer player_name/, sub { $_[0] =~ m/^(\S+)\s+(.+)/; ($1,$2) }, @stripped ); warn "dump = ", dump( @dump ), "\n";