Revision 13 (by dpavlin, 2007/08/23 17:55:05) optionally mirror_images, numeric ranges are now zero-padded so they sort ok
in exhibit
#!/usr/bin/perl

use warnings;
use strict;

use Text::CSV_XS;
use Text::CSV::Separator qw(get_separator);
use Carp qw/confess/;
use LWP::Simple;
use Number::Bytes::Human qw/format_bytes/;

use JSON::Syck;
use Data::Dump qw/dump/;

$|++;

my $csv_path = 'links.csv';
my $img_path = 'img';
my $first_line_labels = 1;
my $mirror_images = 1;
my $split_fields = {
	label => sub { return split(/,\s*/,$_[0]) },
};

my @char_list = get_separator( path => $csv_path );

my $separator;
if (@char_list) {
    if (@char_list == 1) {
        $separator = $char_list[0];
    } else {
        $separator  = $char_list[0];
    }
} else {
    die "Couldn't detect the field separator.\n";
}

warn "Separator: $separator\n";

my $csv_parser = Text::CSV_XS->new({
	sep_char => $separator,
#	binary => '1',
#	always_quote => '1'
});

open my $csv_fh, '<', $csv_path;

my @dump;

my @labels;

my $split_stats;

my $multiplier = {
	kb => 1024,
	mb => 1024 * 1024,
	gb => 1024 * 1024 * 1024,
};

my $multiplier_regex = join('|',keys %$multiplier);

sub clean {
	my @out;
	foreach my $l ( @_ ) {
		my $o = $l;
		$l =~ s/^(['"])(.*)\1/$2/;
		$l =~ s/^\s+//s;
		$l =~ s/\s+$//s;
		push @out, $l;
		warn "clean '$o' -> '$l'\n" if ( $o ne $l );
	}
	return @out if wantarray;
	return shift @out;
}

sub human {
	my $s = shift;

	if ( $s =~ m/^\s*(\d+)\s*($multiplier_regex)\s*$/i) {
		my ( $v, $m ) = ( $1, lc($2) );
		my $factor = $multiplier->{$m};
		confess "can't find multiplier $m" unless defined $factor;
		my $new = format_bytes( $v * $factor, bs => 1024 );
		warn "## [$s] $v * $factor ($m) -> $new\n";
		return $new;
	}
	return;
}

sub strip_prefix {
	my @data = @_;
	my $prefix = shift @data;

	my $p;

	foreach my $d ( @data ) {
		my $chomp = length($prefix);
		# find end of common string
		$chomp-- while(
			lc(substr( $prefix, 0, $chomp )) ne lc(substr( $d, 0, $chomp ))
			&&
			$chomp > 0
		);
		if ( $chomp == 0 ) {
			warn "no common prefix in ",dump( @_ );
			return @_;
		}

		my $prefix = substr( $prefix, 0, $chomp );
		$p->{$prefix}++;
	}
	warn "prefixes found = ",dump($p);
	my @sorted = sort { $p->{$b} <=> $p->{$a} } keys %$p;
	my $strip = shift @sorted || return @_;
	warn "longest prefix: '$strip' (stripped)\n";
	return map { my $v = $_; $v =~ s/^\Q$strip\E//i; $v; } @_;
}

sub group_by {
	my ( $group, $detail ) = ( shift, shift );
	my $what = shift;
	confess "expected CODE as first argument!" unless ref($what) eq 'CODE';
	my @data = @_;

	my $stat;
	my @details;

	foreach my $i ( 0 .. $#data ) {
		my $v = $data[$i];
		my ( $by, $rest ) = $what->($v);
#		warn "## group_by: $i $v -> $by\n";
		push @{ $stat->{$by} }, $i;
		$details[$i] = $rest;
	}

	warn "group_by '$group' stats = ",dump( $stat );

	foreach my $g ( keys %$stat ) {
		foreach my $r ( @{ $stat->{$g} } ) {
			warn "## $group $g $r\n";
			$dump[$r]->{$group} = $g;
			$dump[$r]->{$detail} = $details[$r];
		}
	}
}

sub number {
	my $v = shift;
	if ( $v =~ m/^([\d.]+),(\d+)$/) {
		my ( $i, $f ) = ( $1, $2 );
		$i =~ s/\.//g;
		my $new = ( $i . '.' . $f ) + 0;
		warn "## number $v -> $new\n";
		return $new;
	}
	return $v;
}

sub numeric_range {
	my ( $name, $groups ) = ( shift, shift );

	my $min;
	my $max;

	my @numbers;

	foreach my $i ( 0 .. $#dump ) {
		my $v = number( $dump[$i]->{$name} );
		die "element $i doesn't have $name in ",dump( $dump[$i] ) unless defined $v;

		$min ||= $v;
		$max ||= $v;

		$min = $v if $v < $min;
		$max = $v if $v > $max;

		push @numbers, $v;
	}

	sub round {
		my $v = number( shift );
		my $step = shift || 10;
		my $f = 1;
		while ( $v > $step ) {
			$f *= $step;
			$v /= $step;
		}
		my $new = int($v) * $f;
		warn "## round step: $step v: $v f: $f => $new\n";
		return $new;
	}

	my $range = $max - $min;
	my $step = $range / $groups;

	warn "## numeric_range $min - $max / $step step into $groups groups\n";

	$step = round($step);
	$min = round($min);
	$max = round($max, $step) + $step;

	my @ranges;
	my $v = $min;
	while ( $v <= $max ) {
		push @ranges, $v;
		$v += $step;
	}

	warn "## round $min - $max / $step produced ranges: ",dump( @ranges ),"\n";

	my $d = length( "$max" );

	foreach my $i ( 0 .. $#numbers ) {
		my $n = $numbers[$i];

		my $start = 0;
		foreach my $r ( @ranges ) {
			if ( $n < $r ) {
				$dump[$i]->{ $name . '_range' } =  sprintf("%0${d}d-%0${d}d", $start, $r);
				last;
			}
			$start = $r;
		}
	}
}


while (<$csv_fh>) {
    $csv_parser->parse($_);
    my @fields = $csv_parser->fields;

	if ( $first_line_labels && $. == 1 ) {
		@labels = @fields;
		next;
	}

	my $h;
	foreach my $i ( 0 .. $#fields ) {
		my $l = $labels[$i];
		die "no label for field $i '$fields[$i]'" unless $l;

		my $v = clean( $fields[$i] );
		# FIXME reject some values?

		$h->{ $l } = $v;

		if ( my $split = $split_fields->{$l} ) {
			confess "expected CODE for \$split_files->{$l}" unless ref($split) eq 'CODE';

			my @sv = $split->( $v );

#			warn "sv = ",dump( @sv );

			foreach my $j ( 0 .. $#sv ) {

				my $v = clean( $sv[$j] );

				if ( my $human = human( $v ) ) {
					$h->{ $l . '_' . $j . '_human' } = $human;
				} else {
					$h->{ $l . '_' . $j } = $v;
				}

				$split_stats->{$v}->{pos}->{$j}++;
				$split_stats->{$v}->{sum}++;
				push @{ $split_stats->{$v}->{rec}->{$#dump + 1} }, $j;
			}
		}
	}
    warn "\nRecord #$. ",dump($h),"\n";

	my $id = $h->{id};

	if ( ! defined($id) || $id eq '' ) {
		warn "## skipped: $_";
		next;
	}

	my $url = "http://www.links.hr/photo/big/$id.jpg";
	my $img_thumb_path = "$img_path/t/$id.jpg";
	my $img_orig_path = "$img_path/$id.jpg";

	if ( $mirror_images && mirror( $url, $img_orig_path ) != RC_NOT_MODIFIED ) {
		warn "$url -> $img_orig_path\n";
	}
	system('convert', '-geometry', '320x200', $img_orig_path, $img_thumb_path ) if ! -e $img_thumb_path;

	$h->{'image-url'} = $img_orig_path;
	$h->{'image-thumb-url'} = $img_thumb_path;

	push @dump, $h;
}

close $csv_fh;

foreach my $v ( keys %$split_stats ) {

	if ( $split_stats->{$v}->{sum} == 1 ) {
		delete( $split_stats->{$v} );
		next;
	}

	foreach my $i ( keys %{ $split_stats->{$v}->{rec} } ) {
		push @{ $dump[ $i ]->{feature} }, $v;
	}
}

warn "split_stats = ", dump( $split_stats ), "\n";

#
# split prefix from label_0
#
my @stripped = strip_prefix( map { $_->{label_0} } @dump );
$dump[$_]->{label_0} = $stripped[$_] foreach ( 0 .. $#stripped );

# group products by manufacturers
group_by( qw/manufacturer player_name/, sub { $_[0] =~ m/^(\S+)\s+(.+)/; ($1,$2) }, @stripped );

# create price ranges
numeric_range( 'gotovina', 5 );

warn "dump = ", dump( @dump ), "\n";

print "features: .", join(', .', keys %$split_stats), "\n";

my $js_path = $csv_path;
$js_path =~ s/\.csv/.js/gi;

open my $fh, '>', $js_path || die "can't open $js_path: $!";
print $fh JSON::Syck::Dump( { items => \@dump } );
close $fh;