/[simile]/links/csv2js.pl
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /links/csv2js.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 13 - (hide annotations)
Thu Aug 23 17:55:05 2007 UTC (16 years, 7 months ago) by dpavlin
File MIME type: text/plain
File size: 6772 byte(s)
optionally mirror_images, numeric ranges are now zero-padded so they sort ok
in exhibit
1 dpavlin 1 #!/usr/bin/perl
2    
3     use warnings;
4     use strict;
5    
6     use Text::CSV_XS;
7     use Text::CSV::Separator qw(get_separator);
8     use Carp qw/confess/;
9     use LWP::Simple;
10 dpavlin 5 use Number::Bytes::Human qw/format_bytes/;
11 dpavlin 1
12     use JSON::Syck;
13     use Data::Dump qw/dump/;
14    
15     $|++;
16    
17     my $csv_path = 'links.csv';
18     my $img_path = 'img';
19     my $first_line_labels = 1;
20 dpavlin 13 my $mirror_images = 1;
21 dpavlin 1 my $split_fields = {
22     label => sub { return split(/,\s*/,$_[0]) },
23     };
24    
25     my @char_list = get_separator( path => $csv_path );
26    
27     my $separator;
28     if (@char_list) {
29     if (@char_list == 1) {
30     $separator = $char_list[0];
31     } else {
32     $separator = $char_list[0];
33     }
34     } else {
35     die "Couldn't detect the field separator.\n";
36     }
37    
38     warn "Separator: $separator\n";
39    
40     my $csv_parser = Text::CSV_XS->new({
41     sep_char => $separator,
42     # binary => '1',
43     # always_quote => '1'
44     });
45    
46     open my $csv_fh, '<', $csv_path;
47    
48     my @dump;
49    
50     my @labels;
51    
52     my $split_stats;
53    
54     my $multiplier = {
55     kb => 1024,
56 dpavlin 5 mb => 1024 * 1024,
57 dpavlin 1 gb => 1024 * 1024 * 1024,
58     };
59    
60     my $multiplier_regex = join('|',keys %$multiplier);
61    
62     sub clean {
63     my @out;
64     foreach my $l ( @_ ) {
65     my $o = $l;
66     $l =~ s/^(['"])(.*)\1/$2/;
67     $l =~ s/^\s+//s;
68     $l =~ s/\s+$//s;
69     push @out, $l;
70     warn "clean '$o' -> '$l'\n" if ( $o ne $l );
71     }
72     return @out if wantarray;
73     return shift @out;
74     }
75    
76 dpavlin 5 sub human {
77     my $s = shift;
78    
79 dpavlin 7 if ( $s =~ m/^\s*(\d+)\s*($multiplier_regex)\s*$/i) {
80 dpavlin 5 my ( $v, $m ) = ( $1, lc($2) );
81     my $factor = $multiplier->{$m};
82     confess "can't find multiplier $m" unless defined $factor;
83     my $new = format_bytes( $v * $factor, bs => 1024 );
84     warn "## [$s] $v * $factor ($m) -> $new\n";
85     return $new;
86     }
87     return;
88     }
89    
90 dpavlin 7 sub strip_prefix {
91     my @data = @_;
92     my $prefix = shift @data;
93    
94     my $p;
95    
96     foreach my $d ( @data ) {
97     my $chomp = length($prefix);
98     # find end of common string
99     $chomp-- while(
100     lc(substr( $prefix, 0, $chomp )) ne lc(substr( $d, 0, $chomp ))
101     &&
102     $chomp > 0
103     );
104     if ( $chomp == 0 ) {
105     warn "no common prefix in ",dump( @_ );
106     return @_;
107     }
108    
109     my $prefix = substr( $prefix, 0, $chomp );
110     $p->{$prefix}++;
111     }
112     warn "prefixes found = ",dump($p);
113     my @sorted = sort { $p->{$b} <=> $p->{$a} } keys %$p;
114     my $strip = shift @sorted || return @_;
115 dpavlin 8 warn "longest prefix: '$strip' (stripped)\n";
116 dpavlin 7 return map { my $v = $_; $v =~ s/^\Q$strip\E//i; $v; } @_;
117     }
118    
119 dpavlin 8 sub group_by {
120     my ( $group, $detail ) = ( shift, shift );
121     my $what = shift;
122     confess "expected CODE as first argument!" unless ref($what) eq 'CODE';
123     my @data = @_;
124    
125     my $stat;
126     my @details;
127    
128     foreach my $i ( 0 .. $#data ) {
129     my $v = $data[$i];
130     my ( $by, $rest ) = $what->($v);
131     # warn "## group_by: $i $v -> $by\n";
132     push @{ $stat->{$by} }, $i;
133     $details[$i] = $rest;
134     }
135    
136     warn "group_by '$group' stats = ",dump( $stat );
137    
138     foreach my $g ( keys %$stat ) {
139     foreach my $r ( @{ $stat->{$g} } ) {
140     warn "## $group $g $r\n";
141     $dump[$r]->{$group} = $g;
142     $dump[$r]->{$detail} = $details[$r];
143     }
144     }
145     }
146    
147 dpavlin 12 sub number {
148     my $v = shift;
149     if ( $v =~ m/^([\d.]+),(\d+)$/) {
150     my ( $i, $f ) = ( $1, $2 );
151     $i =~ s/\.//g;
152     my $new = ( $i . '.' . $f ) + 0;
153     warn "## number $v -> $new\n";
154     return $new;
155     }
156     return $v;
157     }
158    
159     sub numeric_range {
160     my ( $name, $groups ) = ( shift, shift );
161    
162     my $min;
163     my $max;
164    
165     my @numbers;
166    
167     foreach my $i ( 0 .. $#dump ) {
168     my $v = number( $dump[$i]->{$name} );
169     die "element $i doesn't have $name in ",dump( $dump[$i] ) unless defined $v;
170    
171     $min ||= $v;
172     $max ||= $v;
173    
174     $min = $v if $v < $min;
175     $max = $v if $v > $max;
176    
177     push @numbers, $v;
178     }
179    
180     sub round {
181     my $v = number( shift );
182     my $step = shift || 10;
183     my $f = 1;
184     while ( $v > $step ) {
185     $f *= $step;
186     $v /= $step;
187     }
188     my $new = int($v) * $f;
189     warn "## round step: $step v: $v f: $f => $new\n";
190     return $new;
191     }
192    
193     my $range = $max - $min;
194     my $step = $range / $groups;
195    
196     warn "## numeric_range $min - $max / $step step into $groups groups\n";
197    
198     $step = round($step);
199     $min = round($min);
200     $max = round($max, $step) + $step;
201    
202     my @ranges;
203     my $v = $min;
204     while ( $v <= $max ) {
205     push @ranges, $v;
206     $v += $step;
207     }
208    
209     warn "## round $min - $max / $step produced ranges: ",dump( @ranges ),"\n";
210    
211 dpavlin 13 my $d = length( "$max" );
212 dpavlin 12
213     foreach my $i ( 0 .. $#numbers ) {
214     my $n = $numbers[$i];
215    
216     my $start = 0;
217     foreach my $r ( @ranges ) {
218     if ( $n < $r ) {
219 dpavlin 13 $dump[$i]->{ $name . '_range' } = sprintf("%0${d}d-%0${d}d", $start, $r);
220 dpavlin 12 last;
221     }
222     $start = $r;
223     }
224     }
225     }
226    
227    
228 dpavlin 1 while (<$csv_fh>) {
229     $csv_parser->parse($_);
230     my @fields = $csv_parser->fields;
231    
232     if ( $first_line_labels && $. == 1 ) {
233     @labels = @fields;
234     next;
235     }
236    
237     my $h;
238     foreach my $i ( 0 .. $#fields ) {
239     my $l = $labels[$i];
240     die "no label for field $i '$fields[$i]'" unless $l;
241    
242     my $v = clean( $fields[$i] );
243     # FIXME reject some values?
244    
245     $h->{ $l } = $v;
246    
247     if ( my $split = $split_fields->{$l} ) {
248     confess "expected CODE for \$split_files->{$l}" unless ref($split) eq 'CODE';
249    
250     my @sv = $split->( $v );
251    
252     # warn "sv = ",dump( @sv );
253    
254     foreach my $j ( 0 .. $#sv ) {
255    
256     my $v = clean( $sv[$j] );
257    
258 dpavlin 5 if ( my $human = human( $v ) ) {
259     $h->{ $l . '_' . $j . '_human' } = $human;
260     } else {
261     $h->{ $l . '_' . $j } = $v;
262 dpavlin 1 }
263    
264 dpavlin 7 $split_stats->{$v}->{pos}->{$j}++;
265 dpavlin 1 $split_stats->{$v}->{sum}++;
266     push @{ $split_stats->{$v}->{rec}->{$#dump + 1} }, $j;
267     }
268     }
269     }
270     warn "\nRecord #$. ",dump($h),"\n";
271    
272     my $id = $h->{id};
273    
274     if ( ! defined($id) || $id eq '' ) {
275     warn "## skipped: $_";
276     next;
277     }
278    
279     my $url = "http://www.links.hr/photo/big/$id.jpg";
280     my $img_thumb_path = "$img_path/t/$id.jpg";
281     my $img_orig_path = "$img_path/$id.jpg";
282    
283 dpavlin 13 if ( $mirror_images && mirror( $url, $img_orig_path ) != RC_NOT_MODIFIED ) {
284 dpavlin 1 warn "$url -> $img_orig_path\n";
285     }
286 dpavlin 2 system('convert', '-geometry', '320x200', $img_orig_path, $img_thumb_path ) if ! -e $img_thumb_path;
287 dpavlin 1
288     $h->{'image-url'} = $img_orig_path;
289     $h->{'image-thumb-url'} = $img_thumb_path;
290    
291     push @dump, $h;
292     }
293    
294     close $csv_fh;
295    
296     foreach my $v ( keys %$split_stats ) {
297    
298     if ( $split_stats->{$v}->{sum} == 1 ) {
299     delete( $split_stats->{$v} );
300     next;
301     }
302    
303     foreach my $i ( keys %{ $split_stats->{$v}->{rec} } ) {
304     push @{ $dump[ $i ]->{feature} }, $v;
305     }
306     }
307    
308 dpavlin 7 warn "split_stats = ", dump( $split_stats ), "\n";
309 dpavlin 1
310 dpavlin 8 #
311     # split prefix from label_0
312     #
313 dpavlin 12 my @stripped = strip_prefix( map { $_->{label_0} } @dump );
314 dpavlin 7 $dump[$_]->{label_0} = $stripped[$_] foreach ( 0 .. $#stripped );
315    
316 dpavlin 8 # group products by manufacturers
317     group_by( qw/manufacturer player_name/, sub { $_[0] =~ m/^(\S+)\s+(.+)/; ($1,$2) }, @stripped );
318    
319 dpavlin 12 # create price ranges
320     numeric_range( 'gotovina', 5 );
321    
322 dpavlin 1 warn "dump = ", dump( @dump ), "\n";
323    
324     print "features: .", join(', .', keys %$split_stats), "\n";
325    
326     my $js_path = $csv_path;
327     $js_path =~ s/\.csv/.js/gi;
328    
329     open my $fh, '>', $js_path || die "can't open $js_path: $!";
330     print $fh JSON::Syck::Dump( { items => \@dump } );
331     close $fh;
332    

Properties

Name Value
svn:executable

  ViewVC Help
Powered by ViewVC 1.1.26