1 |
#!/usr/bin/perl |
2 |
|
3 |
use warnings; |
4 |
use strict; |
5 |
|
6 |
use Text::CSV_XS; |
7 |
use Text::CSV::Separator qw(get_separator); |
8 |
use Carp qw/confess/; |
9 |
use LWP::Simple; |
10 |
use Number::Bytes::Human qw/format_bytes/; |
11 |
|
12 |
use JSON::Syck; |
13 |
use Data::Dump qw/dump/; |
14 |
|
15 |
$|++; |
16 |
|
17 |
my $csv_path = 'links.csv'; |
18 |
my $img_path = 'img'; |
19 |
my $first_line_labels = 1; |
20 |
my $split_fields = { |
21 |
label => sub { return split(/,\s*/,$_[0]) }, |
22 |
}; |
23 |
|
24 |
my @char_list = get_separator( path => $csv_path ); |
25 |
|
26 |
my $separator; |
27 |
if (@char_list) { |
28 |
if (@char_list == 1) { |
29 |
$separator = $char_list[0]; |
30 |
} else { |
31 |
$separator = $char_list[0]; |
32 |
} |
33 |
} else { |
34 |
die "Couldn't detect the field separator.\n"; |
35 |
} |
36 |
|
37 |
warn "Separator: $separator\n"; |
38 |
|
39 |
my $csv_parser = Text::CSV_XS->new({ |
40 |
sep_char => $separator, |
41 |
# binary => '1', |
42 |
# always_quote => '1' |
43 |
}); |
44 |
|
45 |
open my $csv_fh, '<', $csv_path; |
46 |
|
47 |
my @dump; |
48 |
|
49 |
my @labels; |
50 |
|
51 |
my $split_stats; |
52 |
|
53 |
my $multiplier = { |
54 |
kb => 1024, |
55 |
mb => 1024 * 1024, |
56 |
gb => 1024 * 1024 * 1024, |
57 |
}; |
58 |
|
59 |
my $multiplier_regex = join('|',keys %$multiplier); |
60 |
|
61 |
sub clean { |
62 |
my @out; |
63 |
foreach my $l ( @_ ) { |
64 |
my $o = $l; |
65 |
$l =~ s/^(['"])(.*)\1/$2/; |
66 |
$l =~ s/^\s+//s; |
67 |
$l =~ s/\s+$//s; |
68 |
push @out, $l; |
69 |
warn "clean '$o' -> '$l'\n" if ( $o ne $l ); |
70 |
} |
71 |
return @out if wantarray; |
72 |
return shift @out; |
73 |
} |
74 |
|
75 |
sub human { |
76 |
my $s = shift; |
77 |
|
78 |
if ( $s =~ m/^\s*(\d+)\s*($multiplier_regex)\s*$/i) { |
79 |
my ( $v, $m ) = ( $1, lc($2) ); |
80 |
my $factor = $multiplier->{$m}; |
81 |
confess "can't find multiplier $m" unless defined $factor; |
82 |
my $new = format_bytes( $v * $factor, bs => 1024 ); |
83 |
warn "## [$s] $v * $factor ($m) -> $new\n"; |
84 |
return $new; |
85 |
} |
86 |
return; |
87 |
} |
88 |
|
89 |
sub strip_prefix { |
90 |
my @data = @_; |
91 |
my $prefix = shift @data; |
92 |
|
93 |
my $p; |
94 |
|
95 |
foreach my $d ( @data ) { |
96 |
my $chomp = length($prefix); |
97 |
# find end of common string |
98 |
$chomp-- while( |
99 |
lc(substr( $prefix, 0, $chomp )) ne lc(substr( $d, 0, $chomp )) |
100 |
&& |
101 |
$chomp > 0 |
102 |
); |
103 |
if ( $chomp == 0 ) { |
104 |
warn "no common prefix in ",dump( @_ ); |
105 |
return @_; |
106 |
} |
107 |
|
108 |
my $prefix = substr( $prefix, 0, $chomp ); |
109 |
$p->{$prefix}++; |
110 |
} |
111 |
warn "prefixes found = ",dump($p); |
112 |
my @sorted = sort { $p->{$b} <=> $p->{$a} } keys %$p; |
113 |
my $strip = shift @sorted || return @_; |
114 |
warn "longest prefix: '$strip' (stripped)\n"; |
115 |
return map { my $v = $_; $v =~ s/^\Q$strip\E//i; $v; } @_; |
116 |
} |
117 |
|
118 |
sub group_by { |
119 |
my ( $group, $detail ) = ( shift, shift ); |
120 |
my $what = shift; |
121 |
confess "expected CODE as first argument!" unless ref($what) eq 'CODE'; |
122 |
my @data = @_; |
123 |
|
124 |
my $stat; |
125 |
my @details; |
126 |
|
127 |
foreach my $i ( 0 .. $#data ) { |
128 |
my $v = $data[$i]; |
129 |
my ( $by, $rest ) = $what->($v); |
130 |
# warn "## group_by: $i $v -> $by\n"; |
131 |
push @{ $stat->{$by} }, $i; |
132 |
$details[$i] = $rest; |
133 |
} |
134 |
|
135 |
warn "group_by '$group' stats = ",dump( $stat ); |
136 |
|
137 |
foreach my $g ( keys %$stat ) { |
138 |
foreach my $r ( @{ $stat->{$g} } ) { |
139 |
warn "## $group $g $r\n"; |
140 |
$dump[$r]->{$group} = $g; |
141 |
$dump[$r]->{$detail} = $details[$r]; |
142 |
} |
143 |
} |
144 |
} |
145 |
|
146 |
sub number { |
147 |
my $v = shift; |
148 |
if ( $v =~ m/^([\d.]+),(\d+)$/) { |
149 |
my ( $i, $f ) = ( $1, $2 ); |
150 |
$i =~ s/\.//g; |
151 |
my $new = ( $i . '.' . $f ) + 0; |
152 |
warn "## number $v -> $new\n"; |
153 |
return $new; |
154 |
} |
155 |
return $v; |
156 |
} |
157 |
|
158 |
sub numeric_range { |
159 |
my ( $name, $groups ) = ( shift, shift ); |
160 |
|
161 |
my $min; |
162 |
my $max; |
163 |
|
164 |
my @numbers; |
165 |
|
166 |
foreach my $i ( 0 .. $#dump ) { |
167 |
my $v = number( $dump[$i]->{$name} ); |
168 |
die "element $i doesn't have $name in ",dump( $dump[$i] ) unless defined $v; |
169 |
|
170 |
$min ||= $v; |
171 |
$max ||= $v; |
172 |
|
173 |
$min = $v if $v < $min; |
174 |
$max = $v if $v > $max; |
175 |
|
176 |
push @numbers, $v; |
177 |
} |
178 |
|
179 |
sub round { |
180 |
my $v = number( shift ); |
181 |
my $step = shift || 10; |
182 |
my $f = 1; |
183 |
while ( $v > $step ) { |
184 |
$f *= $step; |
185 |
$v /= $step; |
186 |
} |
187 |
my $new = int($v) * $f; |
188 |
warn "## round step: $step v: $v f: $f => $new\n"; |
189 |
return $new; |
190 |
} |
191 |
|
192 |
my $range = $max - $min; |
193 |
my $step = $range / $groups; |
194 |
|
195 |
warn "## numeric_range $min - $max / $step step into $groups groups\n"; |
196 |
|
197 |
$step = round($step); |
198 |
$min = round($min); |
199 |
$max = round($max, $step) + $step; |
200 |
|
201 |
my @ranges; |
202 |
my $v = $min; |
203 |
while ( $v <= $max ) { |
204 |
push @ranges, $v; |
205 |
$v += $step; |
206 |
} |
207 |
|
208 |
warn "## round $min - $max / $step produced ranges: ",dump( @ranges ),"\n"; |
209 |
|
210 |
my $usage; |
211 |
|
212 |
foreach my $i ( 0 .. $#numbers ) { |
213 |
my $n = $numbers[$i]; |
214 |
|
215 |
my $start = 0; |
216 |
foreach my $r ( @ranges ) { |
217 |
if ( $n < $r ) { |
218 |
$dump[$i]->{ $name . '_range' } = $start . '-' . $r; |
219 |
last; |
220 |
} |
221 |
$start = $r; |
222 |
} |
223 |
} |
224 |
} |
225 |
|
226 |
|
227 |
while (<$csv_fh>) { |
228 |
$csv_parser->parse($_); |
229 |
my @fields = $csv_parser->fields; |
230 |
|
231 |
if ( $first_line_labels && $. == 1 ) { |
232 |
@labels = @fields; |
233 |
next; |
234 |
} |
235 |
|
236 |
my $h; |
237 |
foreach my $i ( 0 .. $#fields ) { |
238 |
my $l = $labels[$i]; |
239 |
die "no label for field $i '$fields[$i]'" unless $l; |
240 |
|
241 |
my $v = clean( $fields[$i] ); |
242 |
# FIXME reject some values? |
243 |
|
244 |
$h->{ $l } = $v; |
245 |
|
246 |
if ( my $split = $split_fields->{$l} ) { |
247 |
confess "expected CODE for \$split_files->{$l}" unless ref($split) eq 'CODE'; |
248 |
|
249 |
my @sv = $split->( $v ); |
250 |
|
251 |
# warn "sv = ",dump( @sv ); |
252 |
|
253 |
foreach my $j ( 0 .. $#sv ) { |
254 |
|
255 |
my $v = clean( $sv[$j] ); |
256 |
|
257 |
if ( my $human = human( $v ) ) { |
258 |
$h->{ $l . '_' . $j . '_human' } = $human; |
259 |
} else { |
260 |
$h->{ $l . '_' . $j } = $v; |
261 |
} |
262 |
|
263 |
$split_stats->{$v}->{pos}->{$j}++; |
264 |
$split_stats->{$v}->{sum}++; |
265 |
push @{ $split_stats->{$v}->{rec}->{$#dump + 1} }, $j; |
266 |
} |
267 |
} |
268 |
} |
269 |
warn "\nRecord #$. ",dump($h),"\n"; |
270 |
|
271 |
my $id = $h->{id}; |
272 |
|
273 |
if ( ! defined($id) || $id eq '' ) { |
274 |
warn "## skipped: $_"; |
275 |
next; |
276 |
} |
277 |
|
278 |
my $url = "http://www.links.hr/photo/big/$id.jpg"; |
279 |
my $img_thumb_path = "$img_path/t/$id.jpg"; |
280 |
my $img_orig_path = "$img_path/$id.jpg"; |
281 |
|
282 |
if ( mirror( $url, $img_orig_path ) != RC_NOT_MODIFIED ) { |
283 |
warn "$url -> $img_orig_path\n"; |
284 |
} |
285 |
system('convert', '-geometry', '320x200', $img_orig_path, $img_thumb_path ) if ! -e $img_thumb_path; |
286 |
|
287 |
$h->{'image-url'} = $img_orig_path; |
288 |
$h->{'image-thumb-url'} = $img_thumb_path; |
289 |
|
290 |
push @dump, $h; |
291 |
} |
292 |
|
293 |
close $csv_fh; |
294 |
|
295 |
foreach my $v ( keys %$split_stats ) { |
296 |
|
297 |
if ( $split_stats->{$v}->{sum} == 1 ) { |
298 |
delete( $split_stats->{$v} ); |
299 |
next; |
300 |
} |
301 |
|
302 |
foreach my $i ( keys %{ $split_stats->{$v}->{rec} } ) { |
303 |
push @{ $dump[ $i ]->{feature} }, $v; |
304 |
} |
305 |
} |
306 |
|
307 |
warn "split_stats = ", dump( $split_stats ), "\n"; |
308 |
|
309 |
# |
310 |
# split prefix from label_0 |
311 |
# |
312 |
my @stripped = strip_prefix( map { $_->{label_0} } @dump ); |
313 |
$dump[$_]->{label_0} = $stripped[$_] foreach ( 0 .. $#stripped ); |
314 |
|
315 |
# group products by manufacturers |
316 |
group_by( qw/manufacturer player_name/, sub { $_[0] =~ m/^(\S+)\s+(.+)/; ($1,$2) }, @stripped ); |
317 |
|
318 |
# create price ranges |
319 |
numeric_range( 'gotovina', 5 ); |
320 |
|
321 |
warn "dump = ", dump( @dump ), "\n"; |
322 |
|
323 |
print "features: .", join(', .', keys %$split_stats), "\n"; |
324 |
|
325 |
my $js_path = $csv_path; |
326 |
$js_path =~ s/\.csv/.js/gi; |
327 |
|
328 |
open my $fh, '>', $js_path || die "can't open $js_path: $!"; |
329 |
print $fh JSON::Syck::Dump( { items => \@dump } ); |
330 |
close $fh; |
331 |
|