simile/links/csv2js.pl

#!/usr/bin/perl

use warnings;
use strict;

use Text::CSV_XS;
use Text::CSV::Separator qw(get_separator);
use Carp qw/confess/;
use LWP::Simple;
use Number::Bytes::Human qw/format_bytes/;

use JSON::Syck;
use Data::Dump qw/dump/;

$|++;

my $csv_path = 'links.csv';
my $img_path = 'img';
my $first_line_labels = 1;
my $mirror_images = 1;
my $split_fields = {
        label => sub { return split(/,\s*/,$_[0]) },
};

my @char_list = get_separator( path => $csv_path );

my $separator;
if (@char_list) {
    if (@char_list == 1) {
        $separator = $char_list[0];
    } else {
        $separator  = $char_list[0];
    }
} else {
    die "Couldn't detect the field separator.\n";
}

warn "Separator: $separator\n";

my $csv_parser = Text::CSV_XS->new({
        sep_char => $separator,
#       binary => '1',
#       always_quote => '1'
});

open my $csv_fh, '<', $csv_path;

my @dump;

my @labels;

my $split_stats;

my $multiplier = {
        kb => 1024,
        mb => 1024 * 1024,
        gb => 1024 * 1024 * 1024,
};

my $multiplier_regex = join('|',keys %$multiplier);

sub clean {
        my @out;
        foreach my $l ( @_ ) {
                my $o = $l;
                $l =~ s/^(['"])(.*)\1/$2/;
                $l =~ s/^\s+//s;
                $l =~ s/\s+$//s;
                push @out, $l;
                warn "clean '$o' -> '$l'\n" if ( $o ne $l );
        }
        return @out if wantarray;
        return shift @out;
}

sub human {
        my $s = shift;

        if ( $s =~ m/^\s*(\d+)\s*($multiplier_regex)\s*$/i) {
                my ( $v, $m ) = ( $1, lc($2) );
                my $factor = $multiplier->{$m};
                confess "can't find multiplier $m" unless defined $factor;
                my $new = format_bytes( $v * $factor, bs => 1024 );
                warn "## [$s] $v * $factor ($m) -> $new\n";
                return $new;
        }
        return;
}

sub strip_prefix {
        my @data = @_;
        my $prefix = shift @data;

        my $p;

        foreach my $d ( @data ) {
                my $chomp = length($prefix);
                # find end of common string
                $chomp-- while(
                        lc(substr( $prefix, 0, $chomp )) ne lc(substr( $d, 0, $chomp ))
                        &&
                        $chomp > 0
                );
                if ( $chomp == 0 ) {
                        warn "no common prefix in ",dump( @_ );
                        return @_;
                }

                my $prefix = substr( $prefix, 0, $chomp );
                $p->{$prefix}++;
        }
        warn "prefixes found = ",dump($p);
        my @sorted = sort { $p->{$b} <=> $p->{$a} } keys %$p;
        my $strip = shift @sorted || return @_;
        warn "longest prefix: '$strip' (stripped)\n";
        return map { my $v = $_; $v =~ s/^\Q$strip\E//i; $v; } @_;
}

sub group_by {
        my ( $group, $detail ) = ( shift, shift );
        my $what = shift;
        confess "expected CODE as first argument!" unless ref($what) eq 'CODE';
        my @data = @_;

        my $stat;
        my @details;

        foreach my $i ( 0 .. $#data ) {
                my $v = $data[$i];
                my ( $by, $rest ) = $what->($v);
#               warn "## group_by: $i $v -> $by\n";
                push @{ $stat->{$by} }, $i;
                $details[$i] = $rest;
        }

        warn "group_by '$group' stats = ",dump( $stat );

        foreach my $g ( keys %$stat ) {
                foreach my $r ( @{ $stat->{$g} } ) {
                        warn "## $group $g $r\n";
                        $dump[$r]->{$group} = $g;
                        $dump[$r]->{$detail} = $details[$r];
                }
        }
}

sub number {
        my $v = shift;
        if ( $v =~ m/^([\d.]+),(\d+)$/) {
                my ( $i, $f ) = ( $1, $2 );
                $i =~ s/\.//g;
                my $new = ( $i . '.' . $f ) + 0;
                warn "## number $v -> $new\n";
                return $new;
        }
        return $v;
}

sub numeric_range {
        my ( $name, $groups ) = ( shift, shift );

        my $min;
        my $max;

        my @numbers;

        foreach my $i ( 0 .. $#dump ) {
                my $v = number( $dump[$i]->{$name} );
                die "element $i doesn't have $name in ",dump( $dump[$i] ) unless defined $v;

                $min ||= $v;
                $max ||= $v;

                $min = $v if $v < $min;
                $max = $v if $v > $max;

                push @numbers, $v;
        }

        sub round {
                my $v = number( shift );
                my $step = shift || 10;
                my $f = 1;
                while ( $v > $step ) {
                        $f *= $step;
                        $v /= $step;
                }
                my $new = int($v) * $f;
                warn "## round step: $step v: $v f: $f => $new\n";
                return $new;
        }

        my $range = $max - $min;
        my $step = $range / $groups;

        warn "## numeric_range $min - $max / $step step into $groups groups\n";

        $step = round($step);
        $min = round($min);
        $max = round($max, $step) + $step;

        my @ranges;
        my $v = $min;
        while ( $v <= $max ) {
                push @ranges, $v;
                $v += $step;
        }

        warn "## round $min - $max / $step produced ranges: ",dump( @ranges ),"\n";

        my $d = length( "$max" );

        foreach my $i ( 0 .. $#numbers ) {
                my $n = $numbers[$i];

                my $start = 0;
                foreach my $r ( @ranges ) {
                        if ( $n < $r ) {
                                $dump[$i]->{ $name . '_range' } =  sprintf("%0${d}d-%0${d}d", $start, $r);
                                last;
                        }
                        $start = $r;
                }
        }
}


while (<$csv_fh>) {
    $csv_parser->parse($_);
    my @fields = $csv_parser->fields;

        if ( $first_line_labels && $. == 1 ) {
                @labels = @fields;
                next;
        }

        my $h;
        foreach my $i ( 0 .. $#fields ) {
                my $l = $labels[$i];
                die "no label for field $i '$fields[$i]'" unless $l;

                my $v = clean( $fields[$i] );
                # FIXME reject some values?

                $h->{ $l } = $v;

                if ( my $split = $split_fields->{$l} ) {
                        confess "expected CODE for \$split_files->{$l}" unless ref($split) eq 'CODE';

                        my @sv = $split->( $v );

#                       warn "sv = ",dump( @sv );

                        foreach my $j ( 0 .. $#sv ) {

                                my $v = clean( $sv[$j] );

                                if ( my $human = human( $v ) ) {
                                        $h->{ $l . '_' . $j . '_human' } = $human;
                                } else {
                                        $h->{ $l . '_' . $j } = $v;
                                }

                                $split_stats->{$v}->{pos}->{$j}++;
                                $split_stats->{$v}->{sum}++;
                                push @{ $split_stats->{$v}->{rec}->{$#dump + 1} }, $j;
                        }
                }
        }
    warn "\nRecord #$. ",dump($h),"\n";

        my $id = $h->{id};

        if ( ! defined($id) || $id eq '' ) {
                warn "## skipped: $_";
                next;
        }

        my $url = "http://www.links.hr/photo/big/$id.jpg";
        my $img_thumb_path = "$img_path/t/$id.jpg";
        my $img_orig_path = "$img_path/$id.jpg";

        if ( $mirror_images && mirror( $url, $img_orig_path ) != RC_NOT_MODIFIED ) {
                warn "$url -> $img_orig_path\n";
        }
        system('convert', '-geometry', '320x200', $img_orig_path, $img_thumb_path ) if ! -e $img_thumb_path;

        $h->{'image-url'} = $img_orig_path;
        $h->{'image-thumb-url'} = $img_thumb_path;

        push @dump, $h;
}

close $csv_fh;

foreach my $v ( keys %$split_stats ) {

        if ( $split_stats->{$v}->{sum} == 1 ) {
                delete( $split_stats->{$v} );
                next;
        }

        foreach my $i ( keys %{ $split_stats->{$v}->{rec} } ) {
                push @{ $dump[ $i ]->{feature} }, $v;
        }
}

warn "split_stats = ", dump( $split_stats ), "\n";

#
# split prefix from label_0
#
my @stripped = strip_prefix( map { $_->{label_0} } @dump );
$dump[$_]->{label_0} = $stripped[$_] foreach ( 0 .. $#stripped );

# group products by manufacturers
group_by( qw/manufacturer player_name/, sub { $_[0] =~ m/^(\S+)\s+(.+)/; ($1,$2) }, @stripped );

# create price ranges
numeric_range( 'gotovina', 5 );

warn "dump = ", dump( @dump ), "\n";

print "features: .", join(', .', keys %$split_stats), "\n";

my $js_path = $csv_path;
$js_path =~ s/\.csv/.js/gi;

open my $fh, '>', $js_path || die "can't open $js_path: $!";
print $fh JSON::Syck::Dump( { items => \@dump } );
close $fh;

1	dpavlin	1	#!/usr/bin/perl
2
3			use warnings;
4			use strict;
5
6			use Text::CSV_XS;
7			use Text::CSV::Separator qw(get_separator);
8			use Carp qw/confess/;
9			use LWP::Simple;
10	dpavlin	5	use Number::Bytes::Human qw/format_bytes/;
11	dpavlin	1
12			use JSON::Syck;
13			use Data::Dump qw/dump/;
14
15			$\|++;
16
17			my $csv_path = 'links.csv';
18			my $img_path = 'img';
19			my $first_line_labels = 1;
20	dpavlin	13	my $mirror_images = 1;
21	dpavlin	1	my $split_fields = {
22			label => sub { return split(/,\s*/,$_[0]) },
23			};
24
25			my @char_list = get_separator( path => $csv_path );
26
27			my $separator;
28			if (@char_list) {
29			if (@char_list == 1) {
30			$separator = $char_list[0];
31			} else {
32			$separator = $char_list[0];
33			}
34			} else {
35			die "Couldn't detect the field separator.\n";
36			}
37
38			warn "Separator: $separator\n";
39
40			my $csv_parser = Text::CSV_XS->new({
41			sep_char => $separator,
42			# binary => '1',
43			# always_quote => '1'
44			});
45
46			open my $csv_fh, '<', $csv_path;
47
48			my @dump;
49
50			my @labels;
51
52			my $split_stats;
53
54			my $multiplier = {
55			kb => 1024,
56	dpavlin	5	mb => 1024 * 1024,
57	dpavlin	1	gb => 1024 * 1024 * 1024,
58			};
59
60			my $multiplier_regex = join('\|',keys %$multiplier);
61
62			sub clean {
63			my @out;
64			foreach my $l ( @_ ) {
65			my $o = $l;
66			$l =~ s/^(['"])(.*)\1/$2/;
67			$l =~ s/^\s+//s;
68			$l =~ s/\s+$//s;
69			push @out, $l;
70			warn "clean '$o' -> '$l'\n" if ( $o ne $l );
71			}
72			return @out if wantarray;
73			return shift @out;
74			}
75
76	dpavlin	5	sub human {
77			my $s = shift;
78
79	dpavlin	7	if ( $s =~ m/^\s(\d+)\s($multiplier_regex)\s*$/i) {
80	dpavlin	5	my ( $v, $m ) = ( $1, lc($2) );
81			my $factor = $multiplier->{$m};
82			confess "can't find multiplier $m" unless defined $factor;
83			my $new = format_bytes( $v * $factor, bs => 1024 );
84			warn "## [$s] $v * $factor ($m) -> $new\n";
85			return $new;
86			}
87			return;
88			}
89
90	dpavlin	7	sub strip_prefix {
91			my @data = @_;
92			my $prefix = shift @data;
93
94			my $p;
95
96			foreach my $d ( @data ) {
97			my $chomp = length($prefix);
98			# find end of common string
99			$chomp-- while(
100			lc(substr( $prefix, 0, $chomp )) ne lc(substr( $d, 0, $chomp ))
101			&&
102			$chomp > 0
103			);
104			if ( $chomp == 0 ) {
105			warn "no common prefix in ",dump( @_ );
106			return @_;
107			}
108
109			my $prefix = substr( $prefix, 0, $chomp );
110			$p->{$prefix}++;
111			}
112			warn "prefixes found = ",dump($p);
113			my @sorted = sort { $p->{$b} <=> $p->{$a} } keys %$p;
114			my $strip = shift @sorted \|\| return @_;
115	dpavlin	8	warn "longest prefix: '$strip' (stripped)\n";
116	dpavlin	7	return map { my $v = $_; $v =~ s/^\Q$strip\E//i; $v; } @_;
117			}
118
119	dpavlin	8	sub group_by {
120			my ( $group, $detail ) = ( shift, shift );
121			my $what = shift;
122			confess "expected CODE as first argument!" unless ref($what) eq 'CODE';
123			my @data = @_;
124
125			my $stat;
126			my @details;
127
128			foreach my $i ( 0 .. $#data ) {
129			my $v = $data[$i];
130			my ( $by, $rest ) = $what->($v);
131			# warn "## group_by: $i $v -> $by\n";
132			push @{ $stat->{$by} }, $i;
133			$details[$i] = $rest;
134			}
135
136			warn "group_by '$group' stats = ",dump( $stat );
137
138			foreach my $g ( keys %$stat ) {
139			foreach my $r ( @{ $stat->{$g} } ) {
140			warn "## $group $g $r\n";
141			$dump[$r]->{$group} = $g;
142			$dump[$r]->{$detail} = $details[$r];
143			}
144			}
145			}
146
147	dpavlin	12	sub number {
148			my $v = shift;
149			if ( $v =~ m/^([\d.]+),(\d+)$/) {
150			my ( $i, $f ) = ( $1, $2 );
151			$i =~ s/\.//g;
152			my $new = ( $i . '.' . $f ) + 0;
153			warn "## number $v -> $new\n";
154			return $new;
155			}
156			return $v;
157			}
158
159			sub numeric_range {
160			my ( $name, $groups ) = ( shift, shift );
161
162			my $min;
163			my $max;
164
165			my @numbers;
166
167			foreach my $i ( 0 .. $#dump ) {
168			my $v = number( $dump[$i]->{$name} );
169			die "element $i doesn't have $name in ",dump( $dump[$i] ) unless defined $v;
170
171			$min \|\|= $v;
172			$max \|\|= $v;
173
174			$min = $v if $v < $min;
175			$max = $v if $v > $max;
176
177			push @numbers, $v;
178			}
179
180			sub round {
181			my $v = number( shift );
182			my $step = shift \|\| 10;
183			my $f = 1;
184			while ( $v > $step ) {
185			$f *= $step;
186			$v /= $step;
187			}
188			my $new = int($v) * $f;
189			warn "## round step: $step v: $v f: $f => $new\n";
190			return $new;
191			}
192
193			my $range = $max - $min;
194			my $step = $range / $groups;
195
196			warn "## numeric_range $min - $max / $step step into $groups groups\n";
197
198			$step = round($step);
199			$min = round($min);
200			$max = round($max, $step) + $step;
201
202			my @ranges;
203			my $v = $min;
204			while ( $v <= $max ) {
205			push @ranges, $v;
206			$v += $step;
207			}
208
209			warn "## round $min - $max / $step produced ranges: ",dump( @ranges ),"\n";
210
211	dpavlin	13	my $d = length( "$max" );
212	dpavlin	12
213			foreach my $i ( 0 .. $#numbers ) {
214			my $n = $numbers[$i];
215
216			my $start = 0;
217			foreach my $r ( @ranges ) {
218			if ( $n < $r ) {
219	dpavlin	13	$dump[$i]->{ $name . '_range' } = sprintf("%0${d}d-%0${d}d", $start, $r);
220	dpavlin	12	last;
221			}
222			$start = $r;
223			}
224			}
225			}
226
227
228	dpavlin	1	while (<$csv_fh>) {
229			$csv_parser->parse($_);
230			my @fields = $csv_parser->fields;
231
232			if ( $first_line_labels && $. == 1 ) {
233			@labels = @fields;
234			next;
235			}
236
237			my $h;
238			foreach my $i ( 0 .. $#fields ) {
239			my $l = $labels[$i];
240			die "no label for field $i '$fields[$i]'" unless $l;
241
242			my $v = clean( $fields[$i] );
243			# FIXME reject some values?
244
245			$h->{ $l } = $v;
246
247			if ( my $split = $split_fields->{$l} ) {
248			confess "expected CODE for \$split_files->{$l}" unless ref($split) eq 'CODE';
249
250			my @sv = $split->( $v );
251
252			# warn "sv = ",dump( @sv );
253
254			foreach my $j ( 0 .. $#sv ) {
255
256			my $v = clean( $sv[$j] );
257
258	dpavlin	5	if ( my $human = human( $v ) ) {
259			$h->{ $l . '_' . $j . '_human' } = $human;
260			} else {
261			$h->{ $l . '_' . $j } = $v;
262	dpavlin	1	}
263
264	dpavlin	7	$split_stats->{$v}->{pos}->{$j}++;
265	dpavlin	1	$split_stats->{$v}->{sum}++;
266			push @{ $split_stats->{$v}->{rec}->{$#dump + 1} }, $j;
267			}
268			}
269			}
270			warn "\nRecord #$. ",dump($h),"\n";
271
272			my $id = $h->{id};
273
274			if ( ! defined($id) \|\| $id eq '' ) {
275			warn "## skipped: $_";
276			next;
277			}
278
279			my $url = "http://www.links.hr/photo/big/$id.jpg";
280			my $img_thumb_path = "$img_path/t/$id.jpg";
281			my $img_orig_path = "$img_path/$id.jpg";
282
283	dpavlin	13	if ( $mirror_images && mirror( $url, $img_orig_path ) != RC_NOT_MODIFIED ) {
284	dpavlin	1	warn "$url -> $img_orig_path\n";
285			}
286	dpavlin	2	system('convert', '-geometry', '320x200', $img_orig_path, $img_thumb_path ) if ! -e $img_thumb_path;
287	dpavlin	1
288			$h->{'image-url'} = $img_orig_path;
289			$h->{'image-thumb-url'} = $img_thumb_path;
290
291			push @dump, $h;
292			}
293
294			close $csv_fh;
295
296			foreach my $v ( keys %$split_stats ) {
297
298			if ( $split_stats->{$v}->{sum} == 1 ) {
299			delete( $split_stats->{$v} );
300			next;
301			}
302
303			foreach my $i ( keys %{ $split_stats->{$v}->{rec} } ) {
304			push @{ $dump[ $i ]->{feature} }, $v;
305			}
306			}
307
308	dpavlin	7	warn "split_stats = ", dump( $split_stats ), "\n";
309	dpavlin	1
310	dpavlin	8	#
311			# split prefix from label_0
312			#
313	dpavlin	12	my @stripped = strip_prefix( map { $_->{label_0} } @dump );
314	dpavlin	7	$dump[$_]->{label_0} = $stripped[$_] foreach ( 0 .. $#stripped );
315
316	dpavlin	8	# group products by manufacturers
317			group_by( qw/manufacturer player_name/, sub { $_[0] =~ m/^(\S+)\s+(.+)/; ($1,$2) }, @stripped );
318
319	dpavlin	12	# create price ranges
320			numeric_range( 'gotovina', 5 );
321
322	dpavlin	1	warn "dump = ", dump( @dump ), "\n";
323
324			print "features: .", join(', .', keys %$split_stats), "\n";
325
326			my $js_path = $csv_path;
327			$js_path =~ s/\.csv/.js/gi;
328
329			open my $fh, '>', $js_path \|\| die "can't open $js_path: $!";
330			print $fh JSON::Syck::Dump( { items => \@dump } );
331			close $fh;
332