#!/usr/bin/perl
use warnings;
use strict;
use Text::CSV_XS;
use Text::CSV::Separator qw(get_separator);
use Carp qw/confess/;
use LWP::Simple;
use Number::Bytes::Human qw/format_bytes/;
use JSON::Syck;
use Data::Dump qw/dump/;
$|++;
my $csv_path = 'links.csv';
my $img_path = 'img';
my $first_line_labels = 1;
my $mirror_images = 1;
my $split_fields = {
label => sub { return split(/,\s*/,$_[0]) },
};
my @char_list = get_separator( path => $csv_path );
my $separator;
if (@char_list) {
if (@char_list == 1) {
$separator = $char_list[0];
} else {
$separator = $char_list[0];
}
} else {
die "Couldn't detect the field separator.\n";
}
warn "Separator: $separator\n";
my $csv_parser = Text::CSV_XS->new({
sep_char => $separator,
# binary => '1',
# always_quote => '1'
});
open my $csv_fh, '<', $csv_path;
my @dump;
my @labels;
my $split_stats;
my $multiplier = {
kb => 1024,
mb => 1024 * 1024,
gb => 1024 * 1024 * 1024,
};
my $multiplier_regex = join('|',keys %$multiplier);
sub clean {
my @out;
foreach my $l ( @_ ) {
my $o = $l;
$l =~ s/^(['"])(.*)\1/$2/;
$l =~ s/^\s+//s;
$l =~ s/\s+$//s;
push @out, $l;
warn "clean '$o' -> '$l'\n" if ( $o ne $l );
}
return @out if wantarray;
return shift @out;
}
sub human {
my $s = shift;
if ( $s =~ m/^\s*(\d+)\s*($multiplier_regex)\s*$/i) {
my ( $v, $m ) = ( $1, lc($2) );
my $factor = $multiplier->{$m};
confess "can't find multiplier $m" unless defined $factor;
my $new = format_bytes( $v * $factor, bs => 1024 );
warn "## [$s] $v * $factor ($m) -> $new\n";
return $new;
}
return;
}
sub strip_prefix {
my @data = @_;
my $prefix = shift @data;
my $p;
foreach my $d ( @data ) {
my $chomp = length($prefix);
# find end of common string
$chomp-- while(
lc(substr( $prefix, 0, $chomp )) ne lc(substr( $d, 0, $chomp ))
&&
$chomp > 0
);
if ( $chomp == 0 ) {
warn "no common prefix in ",dump( @_ );
return @_;
}
my $prefix = substr( $prefix, 0, $chomp );
$p->{$prefix}++;
}
warn "prefixes found = ",dump($p);
my @sorted = sort { $p->{$b} <=> $p->{$a} } keys %$p;
my $strip = shift @sorted || return @_;
warn "longest prefix: '$strip' (stripped)\n";
return map { my $v = $_; $v =~ s/^\Q$strip\E//i; $v; } @_;
}
sub group_by {
my ( $group, $detail ) = ( shift, shift );
my $what = shift;
confess "expected CODE as first argument!" unless ref($what) eq 'CODE';
my @data = @_;
my $stat;
my @details;
foreach my $i ( 0 .. $#data ) {
my $v = $data[$i];
my ( $by, $rest ) = $what->($v);
# warn "## group_by: $i $v -> $by\n";
push @{ $stat->{$by} }, $i;
$details[$i] = $rest;
}
warn "group_by '$group' stats = ",dump( $stat );
foreach my $g ( keys %$stat ) {
foreach my $r ( @{ $stat->{$g} } ) {
warn "## $group $g $r\n";
$dump[$r]->{$group} = $g;
$dump[$r]->{$detail} = $details[$r];
}
}
}
sub number {
my $v = shift;
if ( $v =~ m/^([\d.]+),(\d+)$/) {
my ( $i, $f ) = ( $1, $2 );
$i =~ s/\.//g;
my $new = ( $i . '.' . $f ) + 0;
warn "## number $v -> $new\n";
return $new;
}
return $v;
}
sub numeric_range {
my ( $name, $groups ) = ( shift, shift );
my $min;
my $max;
my @numbers;
foreach my $i ( 0 .. $#dump ) {
my $v = number( $dump[$i]->{$name} );
die "element $i doesn't have $name in ",dump( $dump[$i] ) unless defined $v;
$min ||= $v;
$max ||= $v;
$min = $v if $v < $min;
$max = $v if $v > $max;
push @numbers, $v;
}
sub round {
my $v = number( shift );
my $step = shift || 10;
my $f = 1;
while ( $v > $step ) {
$f *= $step;
$v /= $step;
}
my $new = int($v) * $f;
warn "## round step: $step v: $v f: $f => $new\n";
return $new;
}
my $range = $max - $min;
my $step = $range / $groups;
warn "## numeric_range $min - $max / $step step into $groups groups\n";
$step = round($step);
$min = round($min);
$max = round($max, $step) + $step;
my @ranges;
my $v = $min;
while ( $v <= $max ) {
push @ranges, $v;
$v += $step;
}
warn "## round $min - $max / $step produced ranges: ",dump( @ranges ),"\n";
my $d = length( "$max" );
foreach my $i ( 0 .. $#numbers ) {
my $n = $numbers[$i];
my $start = 0;
foreach my $r ( @ranges ) {
if ( $n < $r ) {
$dump[$i]->{ $name . '_range' } = sprintf("%0${d}d-%0${d}d", $start, $r);
last;
}
$start = $r;
}
}
}
while (<$csv_fh>) {
$csv_parser->parse($_);
my @fields = $csv_parser->fields;
if ( $first_line_labels && $. == 1 ) {
@labels = @fields;
next;
}
my $h;
foreach my $i ( 0 .. $#fields ) {
my $l = $labels[$i];
die "no label for field $i '$fields[$i]'" unless $l;
my $v = clean( $fields[$i] );
# FIXME reject some values?
$h->{ $l } = $v;
if ( my $split = $split_fields->{$l} ) {
confess "expected CODE for \$split_files->{$l}" unless ref($split) eq 'CODE';
my @sv = $split->( $v );
# warn "sv = ",dump( @sv );
foreach my $j ( 0 .. $#sv ) {
my $v = clean( $sv[$j] );
if ( my $human = human( $v ) ) {
$h->{ $l . '_' . $j . '_human' } = $human;
} else {
$h->{ $l . '_' . $j } = $v;
}
$split_stats->{$v}->{pos}->{$j}++;
$split_stats->{$v}->{sum}++;
push @{ $split_stats->{$v}->{rec}->{$#dump + 1} }, $j;
}
}
}
warn "\nRecord #$. ",dump($h),"\n";
my $id = $h->{id};
if ( ! defined($id) || $id eq '' ) {
warn "## skipped: $_";
next;
}
my $url = "http://www.links.hr/photo/big/$id.jpg";
my $img_thumb_path = "$img_path/t/$id.jpg";
my $img_orig_path = "$img_path/$id.jpg";
if ( $mirror_images && mirror( $url, $img_orig_path ) != RC_NOT_MODIFIED ) {
warn "$url -> $img_orig_path\n";
}
system('convert', '-geometry', '320x200', $img_orig_path, $img_thumb_path ) if ! -e $img_thumb_path;
$h->{'image-url'} = $img_orig_path;
$h->{'image-thumb-url'} = $img_thumb_path;
push @dump, $h;
}
close $csv_fh;
foreach my $v ( keys %$split_stats ) {
if ( $split_stats->{$v}->{sum} == 1 ) {
delete( $split_stats->{$v} );
next;
}
foreach my $i ( keys %{ $split_stats->{$v}->{rec} } ) {
push @{ $dump[ $i ]->{feature} }, $v;
}
}
warn "split_stats = ", dump( $split_stats ), "\n";
#
# split prefix from label_0
#
my @stripped = strip_prefix( map { $_->{label_0} } @dump );
$dump[$_]->{label_0} = $stripped[$_] foreach ( 0 .. $#stripped );
# group products by manufacturers
group_by( qw/manufacturer player_name/, sub { $_[0] =~ m/^(\S+)\s+(.+)/; ($1,$2) }, @stripped );
# create price ranges
numeric_range( 'gotovina', 5 );
warn "dump = ", dump( @dump ), "\n";
print "features: .", join(', .', keys %$split_stats), "\n";
my $js_path = $csv_path;
$js_path =~ s/\.csv/.js/gi;
open my $fh, '>', $js_path || die "can't open $js_path: $!";
print $fh JSON::Syck::Dump( { items => \@dump } );
close $fh;