/[simile]/links/csv2js.pl
This is a repository of my old source code, which is no longer updated. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /links/csv2js.pl

Parent Directory | Revision Log


Revision 1 - (show annotations)
Thu Aug 23 09:46:24 2007 UTC (13 years, 1 month ago) by dpavlin
File MIME type: text/plain
File size: 3327 byte(s)
Initial import of the www.links.hr scraper for Simile's Exhibit

1 #!/usr/bin/perl
2
3 use warnings;
4 use strict;
5
6 use Text::CSV_XS;
7 use Text::CSV::Separator qw(get_separator);
8 use Carp qw/confess/;
9 use LWP::Simple;
10 use Imager;
11
12 use JSON::Syck;
13 use Data::Dump qw/dump/;
14
# Turn on autoflush for the currently selected output handle so that output
# is not held back by buffering.
$| = 1;

# Script configuration.
my $csv_path          = 'links.csv';    # input CSV file
my $img_path          = 'img';          # prefix of the local image paths
my $first_line_labels = 1;              # true: first CSV line holds the column labels

# Per-column splitters, keyed by column label.  Each value is a code ref that
# receives the field value and returns the list of its comma-separated parts.
my $split_fields = {
    label => sub {
        my ($value) = @_;
        return split /,\s*/, $value;
    },
};
23
# Ask Text::CSV::Separator for the candidate field separators of the input
# file; without at least one candidate the file cannot be parsed.
my @char_list = get_separator( path => $csv_path );
die "Couldn't detect the field separator.\n" unless @char_list;

# The first candidate is used no matter how many were returned.
my $separator = $char_list[0];

warn "Separator: $separator\n";
38
# CSV parser configured with the detected field separator.  A failed
# construction is reported here rather than as a method call on undef later.
my $csv_parser = Text::CSV_XS->new({
    sep_char => $separator,
#   binary => '1',
#   always_quote => '1'
}) or die "can't create Text::CSV_XS parser\n";

# Open the input for reading.  The result is checked so that a missing or
# unreadable file is reported instead of being read as an empty input.
open my $csv_fh, '<', $csv_path
    or die "can't open $csv_path: $!";
46
my @dump;           # records (hash refs) collected for the output
my @labels;         # column labels taken from the first CSV line
my $split_stats;    # statistics about split field parts, filled while parsing

# Size-unit suffixes and the factor each one stands for (powers of 1024).
my $multiplier = {
    kb => 1024,
    mb => 1024 * 1024,
    gb => 1024 * 1024 * 1024,
};

# Alternation of the unit names, ready to be interpolated into a regex.
# Sorted only to make the generated pattern deterministic.
my $multiplier_regex = join('|', sort keys %$multiplier);
60
# clean(@values)
#
# Normalise raw field values: remove a single or double quote at the very
# start of the value together with the last matching quote after it, then
# trim leading and trailing whitespace.  Every value that changed is reported
# on STDERR.
#
# Works on copies, so the caller's variables are left untouched and literal
# (read-only) arguments are accepted.  Undefined values are passed through
# unchanged.
#
# Returns the cleaned list in list context, or the first cleaned value in
# scalar context.
sub clean {
    my @out;
    foreach my $raw ( @_ ) {
        if ( ! defined $raw ) {
            push @out, undef;
            next;
        }

        # Copy first: $raw is an alias of the caller's variable.
        my $value = $raw;
        $value =~ s/^(['"])(.*)\1/$2/;
        $value =~ s/^\s+//s;
        $value =~ s/\s+$//s;
        push @out, $value;
        warn "clean '$raw' -> '$value'\n" if ( $raw ne $value );
    }
    return @out if wantarray;
    return $out[0];
}
74
# Main loop: read the CSV line by line.  The first line supplies the column
# labels (when $first_line_labels is set); every other line becomes a hash ref
# keyed by those labels, gets its image mirrored and thumbnailed, and is
# appended to @dump.  Parts of the columns listed in $split_fields are
# recorded in $split_stats, keyed by part value, together with the @dump
# index of the record they came from.
while ( my $line = <$csv_fh> ) {

    my $is_label_line = $first_line_labels && $. == 1;

    if ( ! $csv_parser->parse($line) ) {
        # Without labels nothing else can be interpreted, so give up early.
        die "can't parse label line: $line" if $is_label_line;
        warn "## skipped (can't parse): $line";
        next;
    }
    my @fields = $csv_parser->fields;

    if ( $is_label_line ) {
        @labels = @fields;
        next;
    }

    my $h = {};

    # [ value, position ] pairs of this record's split parts.  They are added
    # to $split_stats only once the record is known to be kept; recording
    # them earlier would file the parts of a skipped record under the index
    # of the next kept one.
    my @pending_parts;

    foreach my $i ( 0 .. $#fields ) {
        my $label = $labels[$i];
        die "no label for field $i '$fields[$i]'"
            unless defined $label && length $label;

        my $value = clean( $fields[$i] );
        # FIXME reject some values?

        $h->{ $label } = $value;

        my $split = $split_fields->{$label} or next;
        confess "expected CODE for \$split_fields->{$label}" unless ref($split) eq 'CODE';

        my @sv = $split->( $value );

        # warn "sv = ",dump( @sv );

        foreach my $j ( 0 .. $#sv ) {

            my $part = clean( $sv[$j] );

            # The first part doubles as the short form of the whole field.
            $h->{ $label . '_short' } = $part if $j == 0;

            # Replace a size such as "512 mb" by the plain number.  The unit
            # match is case-sensitive and the keys of $multiplier are
            # lower-case.
            if ( $part =~ m/(\d+)\s*($multiplier_regex)/ ) {
                my $new = $1 * $multiplier->{$2};
                warn "## $part -> $new\n";
                $part = $new;
            }

            push @pending_parts, [ $part, $j ];
        }
    }
    warn "\nRecord #$. ",dump($h),"\n";

    my $id = $h->{id};

    if ( ! defined($id) || $id eq '' ) {
        warn "## skipped: $line";
        next;
    }

    # The record is kept and will be stored at this index of @dump.
    my $rec_index = scalar @dump;
    foreach my $pending ( @pending_parts ) {
        my ( $part, $j ) = @$pending;
        $split_stats->{$part}->{$j}++;
        $split_stats->{$part}->{sum}++;
        push @{ $split_stats->{$part}->{rec}->{$rec_index} }, $j;
    }

    my $url = "http://www.links.hr/photo/big/$id.jpg";
    my $img_thumb_path = "$img_path/t/$id.jpg";
    my $img_orig_path = "$img_path/$id.jpg";

    my $rc = mirror( $url, $img_orig_path );
    if ( $rc != RC_NOT_MODIFIED ) {
        warn "$url -> $img_orig_path\n";
    }

    # (Re)create the thumbnail when a fresh image was downloaded or when no
    # thumbnail exists yet -- but only if there is an original to scale.
    if ( -e $img_orig_path && ( is_success($rc) || ! -e $img_thumb_path ) ) {
        system('convert', '-geometry', '320x200', $img_orig_path, $img_thumb_path ) == 0
            or warn "## convert failed for $img_orig_path (status $?)\n";
    }

    $h->{'image-url'} = $img_orig_path;
    $h->{'image-thumb-url'} = $img_thumb_path;

    push @dump, $h;
}
144
close $csv_fh;

# Post-process the collected statistics: a value that was counted only once
# is dropped; every other value is appended to the 'feature' list of each
# record index it was recorded for.
for my $value ( keys %$split_stats ) {
    my $stat = $split_stats->{$value};

    if ( $stat->{sum} == 1 ) {
        delete $split_stats->{$value};
        next;
    }

    push @{ $dump[$_]->{feature} }, $value for keys %{ $stat->{rec} };
}

#warn "split_stats = ", dump( $split_stats ), "\n";

warn 'dump = ', dump( @dump ), "\n";

# List the surviving feature values, each prefixed with a dot.
print 'features: .' . join( ', .', keys %$split_stats ) . "\n";
164
# Derive the output name from the input name (links.csv -> links.js).  Only a
# trailing ".csv" is replaced; any other name gets ".js" appended, so the
# output path can never be the same as the input path.
my $js_path = $csv_path;
$js_path .= '.js' unless $js_path =~ s/\.csv$/.js/i;

# Low-precedence "or" ties die to the result of open; with "||" it would be
# tied to the (always true) file name and a failed open would go unnoticed.
open my $fh, '>', $js_path or die "can't open $js_path: $!";
print {$fh} JSON::Syck::Dump( { items => \@dump } )
    or die "can't write $js_path: $!";
# Buffered write errors may only surface when the handle is closed.
close $fh or die "can't close $js_path: $!";
171

Properties

Name Value
svn:executable

  ViewVC Help
Powered by ViewVC 1.1.26