/[simile]/links/csv2js.pl
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /links/csv2js.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 13 - (show annotations)
Thu Aug 23 17:55:05 2007 UTC (13 years, 2 months ago) by dpavlin
File MIME type: text/plain
File size: 6772 byte(s)
optionally mirror_images, numeric ranges are now zero-padded so they sort ok
in exhibit
1 #!/usr/bin/perl
2
3 use warnings;
4 use strict;
5
6 use Text::CSV_XS;
7 use Text::CSV::Separator qw(get_separator);
8 use Carp qw/confess/;
9 use LWP::Simple;
10 use Number::Bytes::Human qw/format_bytes/;
11
12 use JSON::Syck;
13 use Data::Dump qw/dump/;
14
15 $|++;
16
17 my $csv_path = 'links.csv';
18 my $img_path = 'img';
19 my $first_line_labels = 1;
20 my $mirror_images = 1;
21 my $split_fields = {
22 label => sub { return split(/,\s*/,$_[0]) },
23 };
24
25 my @char_list = get_separator( path => $csv_path );
26
27 my $separator;
28 if (@char_list) {
29 if (@char_list == 1) {
30 $separator = $char_list[0];
31 } else {
32 $separator = $char_list[0];
33 }
34 } else {
35 die "Couldn't detect the field separator.\n";
36 }
37
38 warn "Separator: $separator\n";
39
40 my $csv_parser = Text::CSV_XS->new({
41 sep_char => $separator,
42 # binary => '1',
43 # always_quote => '1'
44 });
45
46 open my $csv_fh, '<', $csv_path;
47
48 my @dump;
49
50 my @labels;
51
52 my $split_stats;
53
54 my $multiplier = {
55 kb => 1024,
56 mb => 1024 * 1024,
57 gb => 1024 * 1024 * 1024,
58 };
59
60 my $multiplier_regex = join('|',keys %$multiplier);
61
62 sub clean {
63 my @out;
64 foreach my $l ( @_ ) {
65 my $o = $l;
66 $l =~ s/^(['"])(.*)\1/$2/;
67 $l =~ s/^\s+//s;
68 $l =~ s/\s+$//s;
69 push @out, $l;
70 warn "clean '$o' -> '$l'\n" if ( $o ne $l );
71 }
72 return @out if wantarray;
73 return shift @out;
74 }
75
76 sub human {
77 my $s = shift;
78
79 if ( $s =~ m/^\s*(\d+)\s*($multiplier_regex)\s*$/i) {
80 my ( $v, $m ) = ( $1, lc($2) );
81 my $factor = $multiplier->{$m};
82 confess "can't find multiplier $m" unless defined $factor;
83 my $new = format_bytes( $v * $factor, bs => 1024 );
84 warn "## [$s] $v * $factor ($m) -> $new\n";
85 return $new;
86 }
87 return;
88 }
89
90 sub strip_prefix {
91 my @data = @_;
92 my $prefix = shift @data;
93
94 my $p;
95
96 foreach my $d ( @data ) {
97 my $chomp = length($prefix);
98 # find end of common string
99 $chomp-- while(
100 lc(substr( $prefix, 0, $chomp )) ne lc(substr( $d, 0, $chomp ))
101 &&
102 $chomp > 0
103 );
104 if ( $chomp == 0 ) {
105 warn "no common prefix in ",dump( @_ );
106 return @_;
107 }
108
109 my $prefix = substr( $prefix, 0, $chomp );
110 $p->{$prefix}++;
111 }
112 warn "prefixes found = ",dump($p);
113 my @sorted = sort { $p->{$b} <=> $p->{$a} } keys %$p;
114 my $strip = shift @sorted || return @_;
115 warn "longest prefix: '$strip' (stripped)\n";
116 return map { my $v = $_; $v =~ s/^\Q$strip\E//i; $v; } @_;
117 }
118
119 sub group_by {
120 my ( $group, $detail ) = ( shift, shift );
121 my $what = shift;
122 confess "expected CODE as first argument!" unless ref($what) eq 'CODE';
123 my @data = @_;
124
125 my $stat;
126 my @details;
127
128 foreach my $i ( 0 .. $#data ) {
129 my $v = $data[$i];
130 my ( $by, $rest ) = $what->($v);
131 # warn "## group_by: $i $v -> $by\n";
132 push @{ $stat->{$by} }, $i;
133 $details[$i] = $rest;
134 }
135
136 warn "group_by '$group' stats = ",dump( $stat );
137
138 foreach my $g ( keys %$stat ) {
139 foreach my $r ( @{ $stat->{$g} } ) {
140 warn "## $group $g $r\n";
141 $dump[$r]->{$group} = $g;
142 $dump[$r]->{$detail} = $details[$r];
143 }
144 }
145 }
146
147 sub number {
148 my $v = shift;
149 if ( $v =~ m/^([\d.]+),(\d+)$/) {
150 my ( $i, $f ) = ( $1, $2 );
151 $i =~ s/\.//g;
152 my $new = ( $i . '.' . $f ) + 0;
153 warn "## number $v -> $new\n";
154 return $new;
155 }
156 return $v;
157 }
158
159 sub numeric_range {
160 my ( $name, $groups ) = ( shift, shift );
161
162 my $min;
163 my $max;
164
165 my @numbers;
166
167 foreach my $i ( 0 .. $#dump ) {
168 my $v = number( $dump[$i]->{$name} );
169 die "element $i doesn't have $name in ",dump( $dump[$i] ) unless defined $v;
170
171 $min ||= $v;
172 $max ||= $v;
173
174 $min = $v if $v < $min;
175 $max = $v if $v > $max;
176
177 push @numbers, $v;
178 }
179
180 sub round {
181 my $v = number( shift );
182 my $step = shift || 10;
183 my $f = 1;
184 while ( $v > $step ) {
185 $f *= $step;
186 $v /= $step;
187 }
188 my $new = int($v) * $f;
189 warn "## round step: $step v: $v f: $f => $new\n";
190 return $new;
191 }
192
193 my $range = $max - $min;
194 my $step = $range / $groups;
195
196 warn "## numeric_range $min - $max / $step step into $groups groups\n";
197
198 $step = round($step);
199 $min = round($min);
200 $max = round($max, $step) + $step;
201
202 my @ranges;
203 my $v = $min;
204 while ( $v <= $max ) {
205 push @ranges, $v;
206 $v += $step;
207 }
208
209 warn "## round $min - $max / $step produced ranges: ",dump( @ranges ),"\n";
210
211 my $d = length( "$max" );
212
213 foreach my $i ( 0 .. $#numbers ) {
214 my $n = $numbers[$i];
215
216 my $start = 0;
217 foreach my $r ( @ranges ) {
218 if ( $n < $r ) {
219 $dump[$i]->{ $name . '_range' } = sprintf("%0${d}d-%0${d}d", $start, $r);
220 last;
221 }
222 $start = $r;
223 }
224 }
225 }
226
227
228 while (<$csv_fh>) {
229 $csv_parser->parse($_);
230 my @fields = $csv_parser->fields;
231
232 if ( $first_line_labels && $. == 1 ) {
233 @labels = @fields;
234 next;
235 }
236
237 my $h;
238 foreach my $i ( 0 .. $#fields ) {
239 my $l = $labels[$i];
240 die "no label for field $i '$fields[$i]'" unless $l;
241
242 my $v = clean( $fields[$i] );
243 # FIXME reject some values?
244
245 $h->{ $l } = $v;
246
247 if ( my $split = $split_fields->{$l} ) {
248 confess "expected CODE for \$split_files->{$l}" unless ref($split) eq 'CODE';
249
250 my @sv = $split->( $v );
251
252 # warn "sv = ",dump( @sv );
253
254 foreach my $j ( 0 .. $#sv ) {
255
256 my $v = clean( $sv[$j] );
257
258 if ( my $human = human( $v ) ) {
259 $h->{ $l . '_' . $j . '_human' } = $human;
260 } else {
261 $h->{ $l . '_' . $j } = $v;
262 }
263
264 $split_stats->{$v}->{pos}->{$j}++;
265 $split_stats->{$v}->{sum}++;
266 push @{ $split_stats->{$v}->{rec}->{$#dump + 1} }, $j;
267 }
268 }
269 }
270 warn "\nRecord #$. ",dump($h),"\n";
271
272 my $id = $h->{id};
273
274 if ( ! defined($id) || $id eq '' ) {
275 warn "## skipped: $_";
276 next;
277 }
278
279 my $url = "http://www.links.hr/photo/big/$id.jpg";
280 my $img_thumb_path = "$img_path/t/$id.jpg";
281 my $img_orig_path = "$img_path/$id.jpg";
282
283 if ( $mirror_images && mirror( $url, $img_orig_path ) != RC_NOT_MODIFIED ) {
284 warn "$url -> $img_orig_path\n";
285 }
286 system('convert', '-geometry', '320x200', $img_orig_path, $img_thumb_path ) if ! -e $img_thumb_path;
287
288 $h->{'image-url'} = $img_orig_path;
289 $h->{'image-thumb-url'} = $img_thumb_path;
290
291 push @dump, $h;
292 }
293
294 close $csv_fh;
295
296 foreach my $v ( keys %$split_stats ) {
297
298 if ( $split_stats->{$v}->{sum} == 1 ) {
299 delete( $split_stats->{$v} );
300 next;
301 }
302
303 foreach my $i ( keys %{ $split_stats->{$v}->{rec} } ) {
304 push @{ $dump[ $i ]->{feature} }, $v;
305 }
306 }
307
308 warn "split_stats = ", dump( $split_stats ), "\n";
309
310 #
311 # split prefix from label_0
312 #
313 my @stripped = strip_prefix( map { $_->{label_0} } @dump );
314 $dump[$_]->{label_0} = $stripped[$_] foreach ( 0 .. $#stripped );
315
316 # group products by manufacturers
317 group_by( qw/manufacturer player_name/, sub { $_[0] =~ m/^(\S+)\s+(.+)/; ($1,$2) }, @stripped );
318
319 # create price ranges
320 numeric_range( 'gotovina', 5 );
321
322 warn "dump = ", dump( @dump ), "\n";
323
324 print "features: .", join(', .', keys %$split_stats), "\n";
325
326 my $js_path = $csv_path;
327 $js_path =~ s/\.csv/.js/gi;
328
329 open my $fh, '>', $js_path || die "can't open $js_path: $!";
330 print $fh JSON::Syck::Dump( { items => \@dump } );
331 close $fh;
332

Properties

Name Value
svn:executable

  ViewVC Help
Powered by ViewVC 1.1.26