| 1 |
1 |
dpavlin |
#!/usr/bin/perl |
| 2 |
|
|
|
| 3 |
|
|
use warnings; |
| 4 |
|
|
use strict; |
| 5 |
|
|
|
| 6 |
|
|
use Text::CSV_XS; |
| 7 |
|
|
use Text::CSV::Separator qw(get_separator); |
| 8 |
|
|
use Carp qw/confess/; |
| 9 |
|
|
use LWP::Simple; |
| 10 |
5 |
dpavlin |
use Number::Bytes::Human qw/format_bytes/; |
| 11 |
1 |
dpavlin |
|
| 12 |
|
|
use JSON::Syck; |
| 13 |
|
|
use Data::Dump qw/dump/; |
| 14 |
|
|
|
| 15 |
|
|
$|++; |
| 16 |
|
|
|
| 17 |
|
|
my $csv_path = 'links.csv'; |
| 18 |
|
|
my $img_path = 'img'; |
| 19 |
|
|
my $first_line_labels = 1; |
| 20 |
13 |
dpavlin |
my $mirror_images = 1; |
| 21 |
1 |
dpavlin |
my $split_fields = { |
| 22 |
|
|
label => sub { return split(/,\s*/,$_[0]) }, |
| 23 |
|
|
}; |
| 24 |
|
|
|
| 25 |
|
|
my @char_list = get_separator( path => $csv_path ); |
| 26 |
|
|
|
| 27 |
|
|
my $separator; |
| 28 |
|
|
if (@char_list) { |
| 29 |
|
|
if (@char_list == 1) { |
| 30 |
|
|
$separator = $char_list[0]; |
| 31 |
|
|
} else { |
| 32 |
|
|
$separator = $char_list[0]; |
| 33 |
|
|
} |
| 34 |
|
|
} else { |
| 35 |
|
|
die "Couldn't detect the field separator.\n"; |
| 36 |
|
|
} |
| 37 |
|
|
|
| 38 |
|
|
warn "Separator: $separator\n"; |
| 39 |
|
|
|
| 40 |
|
|
my $csv_parser = Text::CSV_XS->new({ |
| 41 |
|
|
sep_char => $separator, |
| 42 |
|
|
# binary => '1', |
| 43 |
|
|
# always_quote => '1' |
| 44 |
|
|
}); |
| 45 |
|
|
|
| 46 |
|
|
open my $csv_fh, '<', $csv_path; |
| 47 |
|
|
|
| 48 |
|
|
my @dump; |
| 49 |
|
|
|
| 50 |
|
|
my @labels; |
| 51 |
|
|
|
| 52 |
|
|
my $split_stats; |
| 53 |
|
|
|
| 54 |
|
|
my $multiplier = { |
| 55 |
|
|
kb => 1024, |
| 56 |
5 |
dpavlin |
mb => 1024 * 1024, |
| 57 |
1 |
dpavlin |
gb => 1024 * 1024 * 1024, |
| 58 |
|
|
}; |
| 59 |
|
|
|
| 60 |
|
|
my $multiplier_regex = join('|',keys %$multiplier); |
| 61 |
|
|
|
| 62 |
|
|
sub clean { |
| 63 |
|
|
my @out; |
| 64 |
|
|
foreach my $l ( @_ ) { |
| 65 |
|
|
my $o = $l; |
| 66 |
|
|
$l =~ s/^(['"])(.*)\1/$2/; |
| 67 |
|
|
$l =~ s/^\s+//s; |
| 68 |
|
|
$l =~ s/\s+$//s; |
| 69 |
|
|
push @out, $l; |
| 70 |
|
|
warn "clean '$o' -> '$l'\n" if ( $o ne $l ); |
| 71 |
|
|
} |
| 72 |
|
|
return @out if wantarray; |
| 73 |
|
|
return shift @out; |
| 74 |
|
|
} |
| 75 |
|
|
|
| 76 |
5 |
dpavlin |
sub human { |
| 77 |
|
|
my $s = shift; |
| 78 |
|
|
|
| 79 |
7 |
dpavlin |
if ( $s =~ m/^\s*(\d+)\s*($multiplier_regex)\s*$/i) { |
| 80 |
5 |
dpavlin |
my ( $v, $m ) = ( $1, lc($2) ); |
| 81 |
|
|
my $factor = $multiplier->{$m}; |
| 82 |
|
|
confess "can't find multiplier $m" unless defined $factor; |
| 83 |
|
|
my $new = format_bytes( $v * $factor, bs => 1024 ); |
| 84 |
|
|
warn "## [$s] $v * $factor ($m) -> $new\n"; |
| 85 |
|
|
return $new; |
| 86 |
|
|
} |
| 87 |
|
|
return; |
| 88 |
|
|
} |
| 89 |
|
|
|
| 90 |
7 |
dpavlin |
sub strip_prefix { |
| 91 |
|
|
my @data = @_; |
| 92 |
|
|
my $prefix = shift @data; |
| 93 |
|
|
|
| 94 |
|
|
my $p; |
| 95 |
|
|
|
| 96 |
|
|
foreach my $d ( @data ) { |
| 97 |
|
|
my $chomp = length($prefix); |
| 98 |
|
|
# find end of common string |
| 99 |
|
|
$chomp-- while( |
| 100 |
|
|
lc(substr( $prefix, 0, $chomp )) ne lc(substr( $d, 0, $chomp )) |
| 101 |
|
|
&& |
| 102 |
|
|
$chomp > 0 |
| 103 |
|
|
); |
| 104 |
|
|
if ( $chomp == 0 ) { |
| 105 |
|
|
warn "no common prefix in ",dump( @_ ); |
| 106 |
|
|
return @_; |
| 107 |
|
|
} |
| 108 |
|
|
|
| 109 |
|
|
my $prefix = substr( $prefix, 0, $chomp ); |
| 110 |
|
|
$p->{$prefix}++; |
| 111 |
|
|
} |
| 112 |
|
|
warn "prefixes found = ",dump($p); |
| 113 |
|
|
my @sorted = sort { $p->{$b} <=> $p->{$a} } keys %$p; |
| 114 |
|
|
my $strip = shift @sorted || return @_; |
| 115 |
8 |
dpavlin |
warn "longest prefix: '$strip' (stripped)\n"; |
| 116 |
7 |
dpavlin |
return map { my $v = $_; $v =~ s/^\Q$strip\E//i; $v; } @_; |
| 117 |
|
|
} |
| 118 |
|
|
|
| 119 |
8 |
dpavlin |
sub group_by { |
| 120 |
|
|
my ( $group, $detail ) = ( shift, shift ); |
| 121 |
|
|
my $what = shift; |
| 122 |
|
|
confess "expected CODE as first argument!" unless ref($what) eq 'CODE'; |
| 123 |
|
|
my @data = @_; |
| 124 |
|
|
|
| 125 |
|
|
my $stat; |
| 126 |
|
|
my @details; |
| 127 |
|
|
|
| 128 |
|
|
foreach my $i ( 0 .. $#data ) { |
| 129 |
|
|
my $v = $data[$i]; |
| 130 |
|
|
my ( $by, $rest ) = $what->($v); |
| 131 |
|
|
# warn "## group_by: $i $v -> $by\n"; |
| 132 |
|
|
push @{ $stat->{$by} }, $i; |
| 133 |
|
|
$details[$i] = $rest; |
| 134 |
|
|
} |
| 135 |
|
|
|
| 136 |
|
|
warn "group_by '$group' stats = ",dump( $stat ); |
| 137 |
|
|
|
| 138 |
|
|
foreach my $g ( keys %$stat ) { |
| 139 |
|
|
foreach my $r ( @{ $stat->{$g} } ) { |
| 140 |
|
|
warn "## $group $g $r\n"; |
| 141 |
|
|
$dump[$r]->{$group} = $g; |
| 142 |
|
|
$dump[$r]->{$detail} = $details[$r]; |
| 143 |
|
|
} |
| 144 |
|
|
} |
| 145 |
|
|
} |
| 146 |
|
|
|
| 147 |
12 |
dpavlin |
sub number { |
| 148 |
|
|
my $v = shift; |
| 149 |
|
|
if ( $v =~ m/^([\d.]+),(\d+)$/) { |
| 150 |
|
|
my ( $i, $f ) = ( $1, $2 ); |
| 151 |
|
|
$i =~ s/\.//g; |
| 152 |
|
|
my $new = ( $i . '.' . $f ) + 0; |
| 153 |
|
|
warn "## number $v -> $new\n"; |
| 154 |
|
|
return $new; |
| 155 |
|
|
} |
| 156 |
|
|
return $v; |
| 157 |
|
|
} |
| 158 |
|
|
|
| 159 |
|
|
sub numeric_range { |
| 160 |
|
|
my ( $name, $groups ) = ( shift, shift ); |
| 161 |
|
|
|
| 162 |
|
|
my $min; |
| 163 |
|
|
my $max; |
| 164 |
|
|
|
| 165 |
|
|
my @numbers; |
| 166 |
|
|
|
| 167 |
|
|
foreach my $i ( 0 .. $#dump ) { |
| 168 |
|
|
my $v = number( $dump[$i]->{$name} ); |
| 169 |
|
|
die "element $i doesn't have $name in ",dump( $dump[$i] ) unless defined $v; |
| 170 |
|
|
|
| 171 |
|
|
$min ||= $v; |
| 172 |
|
|
$max ||= $v; |
| 173 |
|
|
|
| 174 |
|
|
$min = $v if $v < $min; |
| 175 |
|
|
$max = $v if $v > $max; |
| 176 |
|
|
|
| 177 |
|
|
push @numbers, $v; |
| 178 |
|
|
} |
| 179 |
|
|
|
| 180 |
|
|
sub round { |
| 181 |
|
|
my $v = number( shift ); |
| 182 |
|
|
my $step = shift || 10; |
| 183 |
|
|
my $f = 1; |
| 184 |
|
|
while ( $v > $step ) { |
| 185 |
|
|
$f *= $step; |
| 186 |
|
|
$v /= $step; |
| 187 |
|
|
} |
| 188 |
|
|
my $new = int($v) * $f; |
| 189 |
|
|
warn "## round step: $step v: $v f: $f => $new\n"; |
| 190 |
|
|
return $new; |
| 191 |
|
|
} |
| 192 |
|
|
|
| 193 |
|
|
my $range = $max - $min; |
| 194 |
|
|
my $step = $range / $groups; |
| 195 |
|
|
|
| 196 |
|
|
warn "## numeric_range $min - $max / $step step into $groups groups\n"; |
| 197 |
|
|
|
| 198 |
|
|
$step = round($step); |
| 199 |
|
|
$min = round($min); |
| 200 |
|
|
$max = round($max, $step) + $step; |
| 201 |
|
|
|
| 202 |
|
|
my @ranges; |
| 203 |
|
|
my $v = $min; |
| 204 |
|
|
while ( $v <= $max ) { |
| 205 |
|
|
push @ranges, $v; |
| 206 |
|
|
$v += $step; |
| 207 |
|
|
} |
| 208 |
|
|
|
| 209 |
|
|
warn "## round $min - $max / $step produced ranges: ",dump( @ranges ),"\n"; |
| 210 |
|
|
|
| 211 |
13 |
dpavlin |
my $d = length( "$max" ); |
| 212 |
12 |
dpavlin |
|
| 213 |
|
|
foreach my $i ( 0 .. $#numbers ) { |
| 214 |
|
|
my $n = $numbers[$i]; |
| 215 |
|
|
|
| 216 |
|
|
my $start = 0; |
| 217 |
|
|
foreach my $r ( @ranges ) { |
| 218 |
|
|
if ( $n < $r ) { |
| 219 |
13 |
dpavlin |
$dump[$i]->{ $name . '_range' } = sprintf("%0${d}d-%0${d}d", $start, $r); |
| 220 |
12 |
dpavlin |
last; |
| 221 |
|
|
} |
| 222 |
|
|
$start = $r; |
| 223 |
|
|
} |
| 224 |
|
|
} |
| 225 |
|
|
} |
| 226 |
|
|
|
| 227 |
|
|
|
| 228 |
1 |
dpavlin |
while (<$csv_fh>) { |
| 229 |
|
|
$csv_parser->parse($_); |
| 230 |
|
|
my @fields = $csv_parser->fields; |
| 231 |
|
|
|
| 232 |
|
|
if ( $first_line_labels && $. == 1 ) { |
| 233 |
|
|
@labels = @fields; |
| 234 |
|
|
next; |
| 235 |
|
|
} |
| 236 |
|
|
|
| 237 |
|
|
my $h; |
| 238 |
|
|
foreach my $i ( 0 .. $#fields ) { |
| 239 |
|
|
my $l = $labels[$i]; |
| 240 |
|
|
die "no label for field $i '$fields[$i]'" unless $l; |
| 241 |
|
|
|
| 242 |
|
|
my $v = clean( $fields[$i] ); |
| 243 |
|
|
# FIXME reject some values? |
| 244 |
|
|
|
| 245 |
|
|
$h->{ $l } = $v; |
| 246 |
|
|
|
| 247 |
|
|
if ( my $split = $split_fields->{$l} ) { |
| 248 |
|
|
confess "expected CODE for \$split_files->{$l}" unless ref($split) eq 'CODE'; |
| 249 |
|
|
|
| 250 |
|
|
my @sv = $split->( $v ); |
| 251 |
|
|
|
| 252 |
|
|
# warn "sv = ",dump( @sv ); |
| 253 |
|
|
|
| 254 |
|
|
foreach my $j ( 0 .. $#sv ) { |
| 255 |
|
|
|
| 256 |
|
|
my $v = clean( $sv[$j] ); |
| 257 |
|
|
|
| 258 |
5 |
dpavlin |
if ( my $human = human( $v ) ) { |
| 259 |
|
|
$h->{ $l . '_' . $j . '_human' } = $human; |
| 260 |
|
|
} else { |
| 261 |
|
|
$h->{ $l . '_' . $j } = $v; |
| 262 |
1 |
dpavlin |
} |
| 263 |
|
|
|
| 264 |
7 |
dpavlin |
$split_stats->{$v}->{pos}->{$j}++; |
| 265 |
1 |
dpavlin |
$split_stats->{$v}->{sum}++; |
| 266 |
|
|
push @{ $split_stats->{$v}->{rec}->{$#dump + 1} }, $j; |
| 267 |
|
|
} |
| 268 |
|
|
} |
| 269 |
|
|
} |
| 270 |
|
|
warn "\nRecord #$. ",dump($h),"\n"; |
| 271 |
|
|
|
| 272 |
|
|
my $id = $h->{id}; |
| 273 |
|
|
|
| 274 |
|
|
if ( ! defined($id) || $id eq '' ) { |
| 275 |
|
|
warn "## skipped: $_"; |
| 276 |
|
|
next; |
| 277 |
|
|
} |
| 278 |
|
|
|
| 279 |
|
|
my $url = "http://www.links.hr/photo/big/$id.jpg"; |
| 280 |
|
|
my $img_thumb_path = "$img_path/t/$id.jpg"; |
| 281 |
|
|
my $img_orig_path = "$img_path/$id.jpg"; |
| 282 |
|
|
|
| 283 |
13 |
dpavlin |
if ( $mirror_images && mirror( $url, $img_orig_path ) != RC_NOT_MODIFIED ) { |
| 284 |
1 |
dpavlin |
warn "$url -> $img_orig_path\n"; |
| 285 |
|
|
} |
| 286 |
2 |
dpavlin |
system('convert', '-geometry', '320x200', $img_orig_path, $img_thumb_path ) if ! -e $img_thumb_path; |
| 287 |
1 |
dpavlin |
|
| 288 |
|
|
$h->{'image-url'} = $img_orig_path; |
| 289 |
|
|
$h->{'image-thumb-url'} = $img_thumb_path; |
| 290 |
|
|
|
| 291 |
|
|
push @dump, $h; |
| 292 |
|
|
} |
| 293 |
|
|
|
| 294 |
|
|
close $csv_fh; |
| 295 |
|
|
|
| 296 |
|
|
foreach my $v ( keys %$split_stats ) { |
| 297 |
|
|
|
| 298 |
|
|
if ( $split_stats->{$v}->{sum} == 1 ) { |
| 299 |
|
|
delete( $split_stats->{$v} ); |
| 300 |
|
|
next; |
| 301 |
|
|
} |
| 302 |
|
|
|
| 303 |
|
|
foreach my $i ( keys %{ $split_stats->{$v}->{rec} } ) { |
| 304 |
|
|
push @{ $dump[ $i ]->{feature} }, $v; |
| 305 |
|
|
} |
| 306 |
|
|
} |
| 307 |
|
|
|
| 308 |
7 |
dpavlin |
warn "split_stats = ", dump( $split_stats ), "\n"; |
| 309 |
1 |
dpavlin |
|
| 310 |
8 |
dpavlin |
# |
| 311 |
|
|
# split prefix from label_0 |
| 312 |
|
|
# |
| 313 |
12 |
dpavlin |
my @stripped = strip_prefix( map { $_->{label_0} } @dump ); |
| 314 |
7 |
dpavlin |
$dump[$_]->{label_0} = $stripped[$_] foreach ( 0 .. $#stripped ); |
| 315 |
|
|
|
| 316 |
8 |
dpavlin |
# group products by manufacturers |
| 317 |
|
|
group_by( qw/manufacturer player_name/, sub { $_[0] =~ m/^(\S+)\s+(.+)/; ($1,$2) }, @stripped ); |
| 318 |
|
|
|
| 319 |
12 |
dpavlin |
# create price ranges |
| 320 |
|
|
numeric_range( 'gotovina', 5 ); |
| 321 |
|
|
|
| 322 |
1 |
dpavlin |
warn "dump = ", dump( @dump ), "\n"; |
| 323 |
|
|
|
| 324 |
|
|
print "features: .", join(', .', keys %$split_stats), "\n"; |
| 325 |
|
|
|
| 326 |
|
|
my $js_path = $csv_path; |
| 327 |
|
|
$js_path =~ s/\.csv/.js/gi; |
| 328 |
|
|
|
| 329 |
|
|
open my $fh, '>', $js_path || die "can't open $js_path: $!"; |
| 330 |
|
|
print $fh JSON::Syck::Dump( { items => \@dump } ); |
| 331 |
|
|
close $fh; |
| 332 |
|
|
|