/[simile]/links/csv2js.pl
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /links/csv2js.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 8 - (hide annotations)
Thu Aug 23 14:08:06 2007 UTC (12 years, 10 months ago) by dpavlin
File MIME type: text/plain
File size: 5282 byte(s)
added group_by
1 dpavlin 1 #!/usr/bin/perl
2    
3     use warnings;
4     use strict;
5    
6     use Text::CSV_XS;
7     use Text::CSV::Separator qw(get_separator);
8     use Carp qw/confess/;
9     use LWP::Simple;
10 dpavlin 5 use Number::Bytes::Human qw/format_bytes/;
11 dpavlin 1
12     use JSON::Syck;
13     use Data::Dump qw/dump/;
14    
15     $|++;
16    
17     my $csv_path = 'links.csv';
18     my $img_path = 'img';
19     my $first_line_labels = 1;
20     my $split_fields = {
21     label => sub { return split(/,\s*/,$_[0]) },
22     };
23    
24     my @char_list = get_separator( path => $csv_path );
25    
26     my $separator;
27     if (@char_list) {
28     if (@char_list == 1) {
29     $separator = $char_list[0];
30     } else {
31     $separator = $char_list[0];
32     }
33     } else {
34     die "Couldn't detect the field separator.\n";
35     }
36    
37     warn "Separator: $separator\n";
38    
39     my $csv_parser = Text::CSV_XS->new({
40     sep_char => $separator,
41     # binary => '1',
42     # always_quote => '1'
43     });
44    
45     open my $csv_fh, '<', $csv_path;
46    
47     my @dump;
48    
49     my @labels;
50    
51     my $split_stats;
52    
53     my $multiplier = {
54     kb => 1024,
55 dpavlin 5 mb => 1024 * 1024,
56 dpavlin 1 gb => 1024 * 1024 * 1024,
57     };
58    
59     my $multiplier_regex = join('|',keys %$multiplier);
60    
61     sub clean {
62     my @out;
63     foreach my $l ( @_ ) {
64     my $o = $l;
65     $l =~ s/^(['"])(.*)\1/$2/;
66     $l =~ s/^\s+//s;
67     $l =~ s/\s+$//s;
68     push @out, $l;
69     warn "clean '$o' -> '$l'\n" if ( $o ne $l );
70     }
71     return @out if wantarray;
72     return shift @out;
73     }
74    
75 dpavlin 5 sub human {
76     my $s = shift;
77    
78 dpavlin 7 if ( $s =~ m/^\s*(\d+)\s*($multiplier_regex)\s*$/i) {
79 dpavlin 5 my ( $v, $m ) = ( $1, lc($2) );
80     my $factor = $multiplier->{$m};
81     confess "can't find multiplier $m" unless defined $factor;
82     my $new = format_bytes( $v * $factor, bs => 1024 );
83     warn "## [$s] $v * $factor ($m) -> $new\n";
84     return $new;
85     }
86     return;
87     }
88    
89 dpavlin 7 sub strip_prefix {
90     my @data = @_;
91     my $prefix = shift @data;
92    
93     my $p;
94    
95     foreach my $d ( @data ) {
96     my $chomp = length($prefix);
97     # find end of common string
98     $chomp-- while(
99     lc(substr( $prefix, 0, $chomp )) ne lc(substr( $d, 0, $chomp ))
100     &&
101     $chomp > 0
102     );
103     if ( $chomp == 0 ) {
104     warn "no common prefix in ",dump( @_ );
105     return @_;
106     }
107    
108     my $prefix = substr( $prefix, 0, $chomp );
109     $p->{$prefix}++;
110     }
111     warn "prefixes found = ",dump($p);
112     my @sorted = sort { $p->{$b} <=> $p->{$a} } keys %$p;
113     my $strip = shift @sorted || return @_;
114 dpavlin 8 warn "longest prefix: '$strip' (stripped)\n";
115 dpavlin 7 return map { my $v = $_; $v =~ s/^\Q$strip\E//i; $v; } @_;
116     }
117    
118 dpavlin 8 sub group_by {
119     my ( $group, $detail ) = ( shift, shift );
120     my $what = shift;
121     confess "expected CODE as first argument!" unless ref($what) eq 'CODE';
122     my @data = @_;
123    
124     my $stat;
125     my @details;
126    
127     foreach my $i ( 0 .. $#data ) {
128     my $v = $data[$i];
129     my ( $by, $rest ) = $what->($v);
130     # warn "## group_by: $i $v -> $by\n";
131     push @{ $stat->{$by} }, $i;
132     $details[$i] = $rest;
133     }
134    
135     warn "group_by '$group' stats = ",dump( $stat );
136    
137     foreach my $g ( keys %$stat ) {
138     foreach my $r ( @{ $stat->{$g} } ) {
139     warn "## $group $g $r\n";
140     $dump[$r]->{$group} = $g;
141     $dump[$r]->{$detail} = $details[$r];
142     }
143     }
144     }
145    
146 dpavlin 1 while (<$csv_fh>) {
147     $csv_parser->parse($_);
148     my @fields = $csv_parser->fields;
149    
150     if ( $first_line_labels && $. == 1 ) {
151     @labels = @fields;
152     next;
153     }
154    
155     my $h;
156     foreach my $i ( 0 .. $#fields ) {
157     my $l = $labels[$i];
158     die "no label for field $i '$fields[$i]'" unless $l;
159    
160     my $v = clean( $fields[$i] );
161     # FIXME reject some values?
162    
163     $h->{ $l } = $v;
164    
165     if ( my $split = $split_fields->{$l} ) {
166     confess "expected CODE for \$split_files->{$l}" unless ref($split) eq 'CODE';
167    
168     my @sv = $split->( $v );
169    
170     # warn "sv = ",dump( @sv );
171    
172     foreach my $j ( 0 .. $#sv ) {
173    
174     my $v = clean( $sv[$j] );
175    
176 dpavlin 5 if ( my $human = human( $v ) ) {
177     $h->{ $l . '_' . $j . '_human' } = $human;
178     } else {
179     $h->{ $l . '_' . $j } = $v;
180 dpavlin 1 }
181    
182 dpavlin 7 $split_stats->{$v}->{pos}->{$j}++;
183 dpavlin 1 $split_stats->{$v}->{sum}++;
184     push @{ $split_stats->{$v}->{rec}->{$#dump + 1} }, $j;
185     }
186     }
187     }
188     warn "\nRecord #$. ",dump($h),"\n";
189    
190     my $id = $h->{id};
191    
192     if ( ! defined($id) || $id eq '' ) {
193     warn "## skipped: $_";
194     next;
195     }
196    
197     my $url = "http://www.links.hr/photo/big/$id.jpg";
198     my $img_thumb_path = "$img_path/t/$id.jpg";
199     my $img_orig_path = "$img_path/$id.jpg";
200    
201     if ( mirror( $url, $img_orig_path ) != RC_NOT_MODIFIED ) {
202     warn "$url -> $img_orig_path\n";
203     }
204 dpavlin 2 system('convert', '-geometry', '320x200', $img_orig_path, $img_thumb_path ) if ! -e $img_thumb_path;
205 dpavlin 1
206     $h->{'image-url'} = $img_orig_path;
207     $h->{'image-thumb-url'} = $img_thumb_path;
208    
209     push @dump, $h;
210     }
211    
212     close $csv_fh;
213    
214     foreach my $v ( keys %$split_stats ) {
215    
216     if ( $split_stats->{$v}->{sum} == 1 ) {
217     delete( $split_stats->{$v} );
218     next;
219     }
220    
221     foreach my $i ( keys %{ $split_stats->{$v}->{rec} } ) {
222     push @{ $dump[ $i ]->{feature} }, $v;
223     }
224     }
225    
226 dpavlin 7 warn "split_stats = ", dump( $split_stats ), "\n";
227 dpavlin 1
228 dpavlin 8 #
229     # split prefix from label_0
230     #
231 dpavlin 7 my @all = map { $_->{label_0} || die "no label_0 for ",dump($_) } @dump;
232     warn "all = ",dump(@all);
233     my @stripped = strip_prefix( @all );
234     $dump[$_]->{label_0} = $stripped[$_] foreach ( 0 .. $#stripped );
235    
236 dpavlin 8 # group products by manufacturers
237     group_by( qw/manufacturer player_name/, sub { $_[0] =~ m/^(\S+)\s+(.+)/; ($1,$2) }, @stripped );
238    
239 dpavlin 1 warn "dump = ", dump( @dump ), "\n";
240    
241     print "features: .", join(', .', keys %$split_stats), "\n";
242    
243     my $js_path = $csv_path;
244     $js_path =~ s/\.csv/.js/gi;
245    
246     open my $fh, '>', $js_path || die "can't open $js_path: $!";
247     print $fh JSON::Syck::Dump( { items => \@dump } );
248     close $fh;
249    

Properties

Name Value
svn:executable

  ViewVC Help
Powered by ViewVC 1.1.26