1 |
#!/usr/bin/perl |
2 |
|
3 |
use warnings; |
4 |
use strict; |
5 |
|
6 |
use Text::CSV_XS; |
7 |
use Text::CSV::Separator qw(get_separator); |
8 |
use Carp qw/confess/; |
9 |
use LWP::Simple; |
10 |
use Number::Bytes::Human qw/format_bytes/; |
11 |
|
12 |
use JSON::Syck; |
13 |
use Data::Dump qw/dump/; |
14 |
|
15 |
$|++; |
16 |
|
17 |
my $csv_path = 'links.csv'; |
18 |
my $img_path = 'img'; |
19 |
my $first_line_labels = 1; |
20 |
my $mirror_images = 1; |
21 |
my $split_fields = { |
22 |
label => sub { return split(/,\s*/,$_[0]) }, |
23 |
}; |
24 |
|
25 |
my @char_list = get_separator( path => $csv_path ); |
26 |
|
27 |
my $separator; |
28 |
if (@char_list) { |
29 |
if (@char_list == 1) { |
30 |
$separator = $char_list[0]; |
31 |
} else { |
32 |
$separator = $char_list[0]; |
33 |
} |
34 |
} else { |
35 |
die "Couldn't detect the field separator.\n"; |
36 |
} |
37 |
|
38 |
warn "Separator: $separator\n"; |
39 |
|
40 |
my $csv_parser = Text::CSV_XS->new({ |
41 |
sep_char => $separator, |
42 |
# binary => '1', |
43 |
# always_quote => '1' |
44 |
}); |
45 |
|
46 |
open my $csv_fh, '<', $csv_path; |
47 |
|
48 |
my @dump; |
49 |
|
50 |
my @labels; |
51 |
|
52 |
my $split_stats; |
53 |
|
54 |
my $multiplier = { |
55 |
kb => 1024, |
56 |
mb => 1024 * 1024, |
57 |
gb => 1024 * 1024 * 1024, |
58 |
}; |
59 |
|
60 |
my $multiplier_regex = join('|',keys %$multiplier); |
61 |
|
62 |
sub clean { |
63 |
my @out; |
64 |
foreach my $l ( @_ ) { |
65 |
my $o = $l; |
66 |
$l =~ s/^(['"])(.*)\1/$2/; |
67 |
$l =~ s/^\s+//s; |
68 |
$l =~ s/\s+$//s; |
69 |
push @out, $l; |
70 |
warn "clean '$o' -> '$l'\n" if ( $o ne $l ); |
71 |
} |
72 |
return @out if wantarray; |
73 |
return shift @out; |
74 |
} |
75 |
|
76 |
sub human { |
77 |
my $s = shift; |
78 |
|
79 |
if ( $s =~ m/^\s*(\d+)\s*($multiplier_regex)\s*$/i) { |
80 |
my ( $v, $m ) = ( $1, lc($2) ); |
81 |
my $factor = $multiplier->{$m}; |
82 |
confess "can't find multiplier $m" unless defined $factor; |
83 |
my $new = format_bytes( $v * $factor, bs => 1024 ); |
84 |
warn "## [$s] $v * $factor ($m) -> $new\n"; |
85 |
return $new; |
86 |
} |
87 |
return; |
88 |
} |
89 |
|
90 |
sub strip_prefix { |
91 |
my @data = @_; |
92 |
my $prefix = shift @data; |
93 |
|
94 |
my $p; |
95 |
|
96 |
foreach my $d ( @data ) { |
97 |
my $chomp = length($prefix); |
98 |
# find end of common string |
99 |
$chomp-- while( |
100 |
lc(substr( $prefix, 0, $chomp )) ne lc(substr( $d, 0, $chomp )) |
101 |
&& |
102 |
$chomp > 0 |
103 |
); |
104 |
if ( $chomp == 0 ) { |
105 |
warn "no common prefix in ",dump( @_ ); |
106 |
return @_; |
107 |
} |
108 |
|
109 |
my $prefix = substr( $prefix, 0, $chomp ); |
110 |
$p->{$prefix}++; |
111 |
} |
112 |
warn "prefixes found = ",dump($p); |
113 |
my @sorted = sort { $p->{$b} <=> $p->{$a} } keys %$p; |
114 |
my $strip = shift @sorted || return @_; |
115 |
warn "longest prefix: '$strip' (stripped)\n"; |
116 |
return map { my $v = $_; $v =~ s/^\Q$strip\E//i; $v; } @_; |
117 |
} |
118 |
|
119 |
sub group_by { |
120 |
my ( $group, $detail ) = ( shift, shift ); |
121 |
my $what = shift; |
122 |
confess "expected CODE as first argument!" unless ref($what) eq 'CODE'; |
123 |
my @data = @_; |
124 |
|
125 |
my $stat; |
126 |
my @details; |
127 |
|
128 |
foreach my $i ( 0 .. $#data ) { |
129 |
my $v = $data[$i]; |
130 |
my ( $by, $rest ) = $what->($v); |
131 |
# warn "## group_by: $i $v -> $by\n"; |
132 |
push @{ $stat->{$by} }, $i; |
133 |
$details[$i] = $rest; |
134 |
} |
135 |
|
136 |
warn "group_by '$group' stats = ",dump( $stat ); |
137 |
|
138 |
foreach my $g ( keys %$stat ) { |
139 |
foreach my $r ( @{ $stat->{$g} } ) { |
140 |
warn "## $group $g $r\n"; |
141 |
$dump[$r]->{$group} = $g; |
142 |
$dump[$r]->{$detail} = $details[$r]; |
143 |
} |
144 |
} |
145 |
} |
146 |
|
147 |
sub number { |
148 |
my $v = shift; |
149 |
if ( $v =~ m/^([\d.]+),(\d+)$/) { |
150 |
my ( $i, $f ) = ( $1, $2 ); |
151 |
$i =~ s/\.//g; |
152 |
my $new = ( $i . '.' . $f ) + 0; |
153 |
warn "## number $v -> $new\n"; |
154 |
return $new; |
155 |
} |
156 |
return $v; |
157 |
} |
158 |
|
159 |
sub numeric_range { |
160 |
my ( $name, $groups ) = ( shift, shift ); |
161 |
|
162 |
my $min; |
163 |
my $max; |
164 |
|
165 |
my @numbers; |
166 |
|
167 |
foreach my $i ( 0 .. $#dump ) { |
168 |
my $v = number( $dump[$i]->{$name} ); |
169 |
die "element $i doesn't have $name in ",dump( $dump[$i] ) unless defined $v; |
170 |
|
171 |
$min ||= $v; |
172 |
$max ||= $v; |
173 |
|
174 |
$min = $v if $v < $min; |
175 |
$max = $v if $v > $max; |
176 |
|
177 |
push @numbers, $v; |
178 |
} |
179 |
|
180 |
sub round { |
181 |
my $v = number( shift ); |
182 |
my $step = shift || 10; |
183 |
my $f = 1; |
184 |
while ( $v > $step ) { |
185 |
$f *= $step; |
186 |
$v /= $step; |
187 |
} |
188 |
my $new = int($v) * $f; |
189 |
warn "## round step: $step v: $v f: $f => $new\n"; |
190 |
return $new; |
191 |
} |
192 |
|
193 |
my $range = $max - $min; |
194 |
my $step = $range / $groups; |
195 |
|
196 |
warn "## numeric_range $min - $max / $step step into $groups groups\n"; |
197 |
|
198 |
$step = round($step); |
199 |
$min = round($min); |
200 |
$max = round($max, $step) + $step; |
201 |
|
202 |
my @ranges; |
203 |
my $v = $min; |
204 |
while ( $v <= $max ) { |
205 |
push @ranges, $v; |
206 |
$v += $step; |
207 |
} |
208 |
|
209 |
warn "## round $min - $max / $step produced ranges: ",dump( @ranges ),"\n"; |
210 |
|
211 |
my $d = length( "$max" ); |
212 |
|
213 |
foreach my $i ( 0 .. $#numbers ) { |
214 |
my $n = $numbers[$i]; |
215 |
|
216 |
my $start = 0; |
217 |
foreach my $r ( @ranges ) { |
218 |
if ( $n < $r ) { |
219 |
$dump[$i]->{ $name . '_range' } = sprintf("%0${d}d-%0${d}d", $start, $r); |
220 |
last; |
221 |
} |
222 |
$start = $r; |
223 |
} |
224 |
} |
225 |
} |
226 |
|
227 |
|
228 |
while (<$csv_fh>) { |
229 |
$csv_parser->parse($_); |
230 |
my @fields = $csv_parser->fields; |
231 |
|
232 |
if ( $first_line_labels && $. == 1 ) { |
233 |
@labels = @fields; |
234 |
next; |
235 |
} |
236 |
|
237 |
my $h; |
238 |
foreach my $i ( 0 .. $#fields ) { |
239 |
my $l = $labels[$i]; |
240 |
die "no label for field $i '$fields[$i]'" unless $l; |
241 |
|
242 |
my $v = clean( $fields[$i] ); |
243 |
# FIXME reject some values? |
244 |
|
245 |
$h->{ $l } = $v; |
246 |
|
247 |
if ( my $split = $split_fields->{$l} ) { |
248 |
confess "expected CODE for \$split_files->{$l}" unless ref($split) eq 'CODE'; |
249 |
|
250 |
my @sv = $split->( $v ); |
251 |
|
252 |
# warn "sv = ",dump( @sv ); |
253 |
|
254 |
foreach my $j ( 0 .. $#sv ) { |
255 |
|
256 |
my $v = clean( $sv[$j] ); |
257 |
|
258 |
if ( my $human = human( $v ) ) { |
259 |
$h->{ $l . '_' . $j . '_human' } = $human; |
260 |
} else { |
261 |
$h->{ $l . '_' . $j } = $v; |
262 |
} |
263 |
|
264 |
$split_stats->{$v}->{pos}->{$j}++; |
265 |
$split_stats->{$v}->{sum}++; |
266 |
push @{ $split_stats->{$v}->{rec}->{$#dump + 1} }, $j; |
267 |
} |
268 |
} |
269 |
} |
270 |
warn "\nRecord #$. ",dump($h),"\n"; |
271 |
|
272 |
my $id = $h->{id}; |
273 |
|
274 |
if ( ! defined($id) || $id eq '' ) { |
275 |
warn "## skipped: $_"; |
276 |
next; |
277 |
} |
278 |
|
279 |
my $url = "http://www.links.hr/photo/big/$id.jpg"; |
280 |
my $img_thumb_path = "$img_path/t/$id.jpg"; |
281 |
my $img_orig_path = "$img_path/$id.jpg"; |
282 |
|
283 |
if ( $mirror_images && mirror( $url, $img_orig_path ) != RC_NOT_MODIFIED ) { |
284 |
warn "$url -> $img_orig_path\n"; |
285 |
} |
286 |
system('convert', '-geometry', '320x200', $img_orig_path, $img_thumb_path ) if ! -e $img_thumb_path; |
287 |
|
288 |
$h->{'image-url'} = $img_orig_path; |
289 |
$h->{'image-thumb-url'} = $img_thumb_path; |
290 |
|
291 |
push @dump, $h; |
292 |
} |
293 |
|
294 |
close $csv_fh; |
295 |
|
296 |
foreach my $v ( keys %$split_stats ) { |
297 |
|
298 |
if ( $split_stats->{$v}->{sum} == 1 ) { |
299 |
delete( $split_stats->{$v} ); |
300 |
next; |
301 |
} |
302 |
|
303 |
foreach my $i ( keys %{ $split_stats->{$v}->{rec} } ) { |
304 |
push @{ $dump[ $i ]->{feature} }, $v; |
305 |
} |
306 |
} |
307 |
|
308 |
warn "split_stats = ", dump( $split_stats ), "\n"; |
309 |
|
310 |
# |
311 |
# split prefix from label_0 |
312 |
# |
313 |
my @stripped = strip_prefix( map { $_->{label_0} } @dump ); |
314 |
$dump[$_]->{label_0} = $stripped[$_] foreach ( 0 .. $#stripped ); |
315 |
|
316 |
# group products by manufacturers |
317 |
group_by( qw/manufacturer player_name/, sub { $_[0] =~ m/^(\S+)\s+(.+)/; ($1,$2) }, @stripped ); |
318 |
|
319 |
# create price ranges |
320 |
numeric_range( 'gotovina', 5 ); |
321 |
|
322 |
warn "dump = ", dump( @dump ), "\n"; |
323 |
|
324 |
print "features: .", join(', .', keys %$split_stats), "\n"; |
325 |
|
326 |
my $js_path = $csv_path; |
327 |
$js_path =~ s/\.csv/.js/gi; |
328 |
|
329 |
open my $fh, '>', $js_path || die "can't open $js_path: $!"; |
330 |
print $fh JSON::Syck::Dump( { items => \@dump } ); |
331 |
close $fh; |
332 |
|