#!/usr/bin/perl use warnings; use strict; use Text::CSV_XS; use Text::CSV::Separator qw(get_separator); use Carp qw/confess/; use LWP::Simple; use Number::Bytes::Human qw/format_bytes/; use JSON::Syck; use Data::Dump qw/dump/; $|++; my $csv_path = 'links.csv'; my $img_path = 'img'; my $first_line_labels = 1; my $mirror_images = 1; my $split_fields = { label => sub { return split(/,\s*/,$_[0]) }, }; my @char_list = get_separator( path => $csv_path ); my $separator; if (@char_list) { if (@char_list == 1) { $separator = $char_list[0]; } else { $separator = $char_list[0]; } } else { die "Couldn't detect the field separator.\n"; } warn "Separator: $separator\n"; my $csv_parser = Text::CSV_XS->new({ sep_char => $separator, # binary => '1', # always_quote => '1' }); open my $csv_fh, '<', $csv_path; my @dump; my @labels; my $split_stats; my $multiplier = { kb => 1024, mb => 1024 * 1024, gb => 1024 * 1024 * 1024, }; my $multiplier_regex = join('|',keys %$multiplier); sub clean { my @out; foreach my $l ( @_ ) { my $o = $l; $l =~ s/^(['"])(.*)\1/$2/; $l =~ s/^\s+//s; $l =~ s/\s+$//s; push @out, $l; warn "clean '$o' -> '$l'\n" if ( $o ne $l ); } return @out if wantarray; return shift @out; } sub human { my $s = shift; if ( $s =~ m/^\s*(\d+)\s*($multiplier_regex)\s*$/i) { my ( $v, $m ) = ( $1, lc($2) ); my $factor = $multiplier->{$m}; confess "can't find multiplier $m" unless defined $factor; my $new = format_bytes( $v * $factor, bs => 1024 ); warn "## [$s] $v * $factor ($m) -> $new\n"; return $new; } return; } sub strip_prefix { my @data = @_; my $prefix = shift @data; my $p; foreach my $d ( @data ) { my $chomp = length($prefix); # find end of common string $chomp-- while( lc(substr( $prefix, 0, $chomp )) ne lc(substr( $d, 0, $chomp )) && $chomp > 0 ); if ( $chomp == 0 ) { warn "no common prefix in ",dump( @_ ); return @_; } my $prefix = substr( $prefix, 0, $chomp ); $p->{$prefix}++; } warn "prefixes found = ",dump($p); my @sorted = sort { $p->{$b} <=> $p->{$a} } keys %$p; my $strip = shift @sorted || return @_; warn "longest prefix: '$strip' (stripped)\n"; return map { my $v = $_; $v =~ s/^\Q$strip\E//i; $v; } @_; } sub group_by { my ( $group, $detail ) = ( shift, shift ); my $what = shift; confess "expected CODE as first argument!" unless ref($what) eq 'CODE'; my @data = @_; my $stat; my @details; foreach my $i ( 0 .. $#data ) { my $v = $data[$i]; my ( $by, $rest ) = $what->($v); # warn "## group_by: $i $v -> $by\n"; push @{ $stat->{$by} }, $i; $details[$i] = $rest; } warn "group_by '$group' stats = ",dump( $stat ); foreach my $g ( keys %$stat ) { foreach my $r ( @{ $stat->{$g} } ) { warn "## $group $g $r\n"; $dump[$r]->{$group} = $g; $dump[$r]->{$detail} = $details[$r]; } } } sub number { my $v = shift; if ( $v =~ m/^([\d.]+),(\d+)$/) { my ( $i, $f ) = ( $1, $2 ); $i =~ s/\.//g; my $new = ( $i . '.' . $f ) + 0; warn "## number $v -> $new\n"; return $new; } return $v; } sub numeric_range { my ( $name, $groups ) = ( shift, shift ); my $min; my $max; my @numbers; foreach my $i ( 0 .. $#dump ) { my $v = number( $dump[$i]->{$name} ); die "element $i doesn't have $name in ",dump( $dump[$i] ) unless defined $v; $min ||= $v; $max ||= $v; $min = $v if $v < $min; $max = $v if $v > $max; push @numbers, $v; } sub round { my $v = number( shift ); my $step = shift || 10; my $f = 1; while ( $v > $step ) { $f *= $step; $v /= $step; } my $new = int($v) * $f; warn "## round step: $step v: $v f: $f => $new\n"; return $new; } my $range = $max - $min; my $step = $range / $groups; warn "## numeric_range $min - $max / $step step into $groups groups\n"; $step = round($step); $min = round($min); $max = round($max, $step) + $step; my @ranges; my $v = $min; while ( $v <= $max ) { push @ranges, $v; $v += $step; } warn "## round $min - $max / $step produced ranges: ",dump( @ranges ),"\n"; my $d = length( "$max" ); foreach my $i ( 0 .. $#numbers ) { my $n = $numbers[$i]; my $start = 0; foreach my $r ( @ranges ) { if ( $n < $r ) { $dump[$i]->{ $name . '_range' } = sprintf("%0${d}d-%0${d}d", $start, $r); last; } $start = $r; } } } while (<$csv_fh>) { $csv_parser->parse($_); my @fields = $csv_parser->fields; if ( $first_line_labels && $. == 1 ) { @labels = @fields; next; } my $h; foreach my $i ( 0 .. $#fields ) { my $l = $labels[$i]; die "no label for field $i '$fields[$i]'" unless $l; my $v = clean( $fields[$i] ); # FIXME reject some values? $h->{ $l } = $v; if ( my $split = $split_fields->{$l} ) { confess "expected CODE for \$split_files->{$l}" unless ref($split) eq 'CODE'; my @sv = $split->( $v ); # warn "sv = ",dump( @sv ); foreach my $j ( 0 .. $#sv ) { my $v = clean( $sv[$j] ); if ( my $human = human( $v ) ) { $h->{ $l . '_' . $j . '_human' } = $human; } else { $h->{ $l . '_' . $j } = $v; } $split_stats->{$v}->{pos}->{$j}++; $split_stats->{$v}->{sum}++; push @{ $split_stats->{$v}->{rec}->{$#dump + 1} }, $j; } } } warn "\nRecord #$. ",dump($h),"\n"; my $id = $h->{id}; if ( ! defined($id) || $id eq '' ) { warn "## skipped: $_"; next; } my $url = "http://www.links.hr/photo/big/$id.jpg"; my $img_thumb_path = "$img_path/t/$id.jpg"; my $img_orig_path = "$img_path/$id.jpg"; if ( $mirror_images && mirror( $url, $img_orig_path ) != RC_NOT_MODIFIED ) { warn "$url -> $img_orig_path\n"; } system('convert', '-geometry', '320x200', $img_orig_path, $img_thumb_path ) if ! -e $img_thumb_path; $h->{'image-url'} = $img_orig_path; $h->{'image-thumb-url'} = $img_thumb_path; push @dump, $h; } close $csv_fh; foreach my $v ( keys %$split_stats ) { if ( $split_stats->{$v}->{sum} == 1 ) { delete( $split_stats->{$v} ); next; } foreach my $i ( keys %{ $split_stats->{$v}->{rec} } ) { push @{ $dump[ $i ]->{feature} }, $v; } } warn "split_stats = ", dump( $split_stats ), "\n"; # # split prefix from label_0 # my @stripped = strip_prefix( map { $_->{label_0} } @dump ); $dump[$_]->{label_0} = $stripped[$_] foreach ( 0 .. $#stripped ); # group products by manufacturers group_by( qw/manufacturer player_name/, sub { $_[0] =~ m/^(\S+)\s+(.+)/; ($1,$2) }, @stripped ); # create price ranges numeric_range( 'gotovina', 5 ); warn "dump = ", dump( @dump ), "\n"; print "features: .", join(', .', keys %$split_stats), "\n"; my $js_path = $csv_path; $js_path =~ s/\.csv/.js/gi; open my $fh, '>', $js_path || die "can't open $js_path: $!"; print $fh JSON::Syck::Dump( { items => \@dump } ); close $fh;