Line # Revision Author
1 178 dpavlin #!/usr/bin/perl
2 53 dpavlin
3 185 dpavlin use warnings;
4 use strict;
5
6 178 dpavlin #
7 # emulate GROUP BY availabe in relational database for shell pipes
8 #
9 185 dpavlin # Dobrica Pavlinusic <dpavlin@rot13.org>, http://www.rot13.org/~dpavlin/
10 #
11 184 dpavlin # SYNTAX: groupby.pl [-d delimiter_char] [-f](sum|count|min|max):]\d+,...
12 181 dpavlin #
13 184 dpavlin # by default, delimiter is space and -f flag is optional -- similar to cut(1)
14 185 dpavlin # you can also use matematical operations between columns
15 184 dpavlin #
16 178 dpavlin # - show memory usage of processes grouped by process name
17 185 dpavlin # ps axv | awk '{ print $10,($6+$7+$8) * 1024 }' | groupby.pl sum:2,1,count:1 | sort -k1 -nr | sum.pl -h
18 181 dpavlin #
19 183 dpavlin # - same example with added minimum and maximum usage
20 185 dpavlin # ps axv | awk '{ print $10,($6+$7+$8) * 1024 }' | groupby.pl sum:2,1,count:1,min:2,max:2 | sort -rn | align | head -10
21 183 dpavlin #
22 181 dpavlin # - stats of pppd packages traffic
23 # sudo grep pppd /var/log/debug | egrep '(rcvd|sent)' | ./groupby.pl 6-8 | sort -rn
24 #
25 185 dpavlin # - you can also use do calculations of fields in-line
26 # ps axv | ./groupby.pl 'sum:(($6+$7+$8)*1024),10,count:10,min:($6+$7+$8),max:($6+$7+$8)' | sort -k1 -nr | head | align | sum.pl -h
27 187 dpavlin #
28 # - OpenVZ examples:
29 # vzps -E axv --no-headers | ./groupby.pl 'sum:($7+$8+$9*1024),count:1,1' | sort -rn | ./sum.pl -h
30 #
31 # - example how to join some other data (convert OpenVZ VEID to hostname)
32 # vzps -E axv --no-headers | ./groupby.pl 'sum:($7+$8+$9*1024),1,count:1' --join 'sudo vzlist -H -o veid,hostname' --on 2 | sort -rn | align | ./sum.pl -h
33 178 dpavlin
34 use Data::Dump qw/dump/;
35 182 dpavlin use Getopt::Long;
36 185 dpavlin use Carp qw/confess/;
37 187 dpavlin use IPC::System::Simple qw/capture/;
38 57 dpavlin
39 178 dpavlin my $debug = 0;
40 182 dpavlin my $delimiter;
41 184 dpavlin my $cols;
42 187 dpavlin my $join;
43 my $join_on = 1;
44 178 dpavlin
45 182 dpavlin GetOptions(
46 184 dpavlin 'd|delimiter=s' => \$delimiter,
47 'f|fields=s' => \$cols,
48 187 dpavlin 'j|join=s' => \$join,
49 'on|join-on=i' => \$join_on,
50 'debug+' => \$debug,
51 182 dpavlin );
52
53 if ( defined $delimiter ) {
54 $delimiter = "\Q$delimiter\E";
55 } else {
56 $delimiter = '\s+';
57 }
58
59 184 dpavlin $cols ||= shift @ARGV;
60 die "usage: $0 [1|sum:3,1,count:1,max:1,min:1]\n" unless $cols;
61 180 dpavlin
62 181 dpavlin # rewrite dummy 1- to full form
63 if ( $cols !~ m/,/ ) {
64 $cols = "count:$cols,show:$cols";
65 $cols .= '-' unless $cols =~ m/-/;
66 }
67
68 178 dpavlin my $group_by;
69 my @col_op;
70 my $sum_cols; # check count of summed columns
71 my @sum_cols;
72 my @show_cols;
73 183 dpavlin my $extremes;
74 178 dpavlin foreach my $col ( split(/,/,$cols) ) {
75 if ( $col =~ m/^count:(.+)$/ ) {
76 # FIXME check if key is valid
77 185 dpavlin confess "count:$1 can be used only once in $cols\n" if $group_by;
78 178 dpavlin push @col_op, { count => $1 };
79 $group_by = $1;
80 185 dpavlin } elsif ( $col =~ m/^sum:(.+)$/i ) {
81 178 dpavlin push @col_op, { sum => $1 };
82 push @sum_cols, $1;
83 $sum_cols->{$1}++;
84 warn "column $1 sum created more than once in $cols\n" if $sum_cols->{$1} > 1;
85 185 dpavlin } elsif ( $col =~ m/^(min|max):(.+)$/ ) {
86 push @col_op, { 'extreme' => $1 };
87 $extremes->{ $1 }->{col} = $2;
88 178 dpavlin } else {
89 $col =~ s/^show://i;
90 push @col_op, { show => $col };
91 push @show_cols, $col;
92 }
93 }
94
95 185 dpavlin my $col;
96 178 dpavlin
97 185 dpavlin warn "## $cols col_op = ",dump( @col_op )," debug level: $debug\n" if $debug > 1;
98 warn "# group by $group_by show ", dump( @show_cols ), " sum ",dump( @sum_cols ),$/ if $debug;
99 178 dpavlin
100 185 dpavlin # convert column range to arrray of column values
101 sub col_data_range {
102 my ( $range, $want_data, $line ) = @_;
103 warn "## col_data_range",dump( $range, $want_data, $line ),$/ if $debug > 1;
104 my @cols;
105 if ($range =~ m/^(\d+)-$/) {
106 push @cols,$_ foreach ( $1 .. $#$line );
107 } elsif ($range =~ m/^-(\d+)$/) {
108 push @cols,$_ foreach ( 0 .. $1 );
109 } elsif ($range =~ m/^(\d+)-(\d+)$/) {
110 push @cols,$_ foreach ( $1 .. $2 );
111 } elsif ($range =~ m/^(\d+)$/) {
112 push @cols,$1;
113 } elsif ( $range =~ m/^\((.+[+\-\/\*].+)\)$/ ) {
114 my $code = $1;
115 if ( $want_data ) {
116 sub col_val {
117 my ($line,$col) = @_;
118 my $val = $line->[$col];
119 return 0 unless $val =~ m/^\d+$/;
120 confess "can't find column $col in ",dump( $line ) unless defined $val;
121 return $val;
122 }
123 $code =~ s/\$(\d+)/col_val($line,$1)/ge;
124 my $val = eval $code;
125 confess "eval $code from $range returned $@\n" if $@;
126 warn "# $range => $code = $val\n" if $debug;
127 # fake new column
128 push @$line, $val;
129 push @cols, $#$line;
130 180 dpavlin } else {
131 185 dpavlin push @cols, $code;
132 180 dpavlin }
133 185 dpavlin } else {
134 confess "can't parse range $range\n";
135 }
136
137 if ( $want_data ) {
138 warn "## $range -> cols ", dump( @cols ), " line " . dump( $line ) . " => " . dump( @$line[ @cols ] ), $/ if $debug > 1;
139 return @$line[ @cols ] if wantarray;
140 return join(' ', @$line[ @cols ]);
141 } else {
142 warn "## $range -> cols ", dump( @cols ) if $debug > 1;
143 180 dpavlin return @cols;
144 }
145 185 dpavlin }
146 180 dpavlin
147 185 dpavlin while(<STDIN>) {
148 chomp;
149 s/^\s+//;
150 my $line = [ '', split(/$delimiter/,$_) ];
151
152 178 dpavlin # implement count for columns
153 57 dpavlin
154 178 dpavlin my $key;
155 if ($group_by =~ m/,/) {
156 foreach my $c (split(/,/,$group_by)) {
157 185 dpavlin $key .= join('!', col_data_range($c, 1, $line));
158 178 dpavlin }
159 53 dpavlin } else {
160 185 dpavlin $key = join('!', col_data_range($group_by, 1, $line));
161 53 dpavlin }
162 178 dpavlin
163 185 dpavlin $col->{count}->{$key}++;
164 warn "## key:$key count:$col->{count}->{$key}\n" if $debug > 1;
165 178 dpavlin
166 # implement sum for columns
167 185 dpavlin foreach my $c ( @sum_cols ) {
168 $col->{sum}->{$key}->{$c} += $_ foreach col_data_range( $c, 1, $line );
169 }
170 178 dpavlin
171 # save columns to show
172 185 dpavlin $col->{show}->{$key}->{$_} = col_data_range( $_, 1, $line ) foreach @show_cols;
173 183 dpavlin
174 # max/min
175 foreach my $op ( keys %$extremes ) {
176 185 dpavlin my $col = $extremes->{$op}->{col};
177 my $val = col_data_range( $col, 1, $line );
178 $extremes->{$op}->{val}->{$key} = $val if ! defined $extremes->{$op}->{val}->{$key};
179 next unless defined $val; #$val =~ m/^\d+$/;
180 my $code = '$val ' . ( $op eq 'min' ? '<' : '>' ) . ' $extremes->{$op}->{val}->{$key}';
181 warn "# $code val = ",dump( $val ) if $debug;
182 $extremes->{$op}->{val}->{$key} = $val if eval $code;
183 confess "$@\t$code\n" if $@;
184 183 dpavlin }
185 53 dpavlin }
186
187 178 dpavlin if ( $debug ) {
188 185 dpavlin warn "# col = ",dump( $col ),$/;
189 warn "# extremes = ",dump( $extremes );
190 178 dpavlin }
191
192 187 dpavlin my $join_data;
193 warn "# join on $join_on from $join\n" if $debug;
194 map {
195 s/^\s+//;
196 s/\s+$//;
197 my @v = split(/\s+/,$_,2);
198 $join_data->{ $v[0] } = $v[1];
199 } capture( $join ) if $join;
200
201 warn "# $join join_data = ",dump( $join_data ) if $debug;
202
203 185 dpavlin foreach my $c (sort keys %{$col->{count}}) {
204 178 dpavlin my @out;
205 foreach my $op ( @col_op ) {
206 185 dpavlin if ( my $val = $op->{sum} ) {
207 push @out, $col->{sum}->{$c}->{$val};
208 178 dpavlin } elsif ( $op->{count} ) {
209 185 dpavlin push @out, $col->{count}->{$c};
210 } elsif ( $val = $op->{show} ) {
211 187 dpavlin my $data = $col->{show}->{$c}->{$val};
212 if ( $join_data && $#out == ( $join_on - 2 ) ) {
213 push @out, $join_data->{ $data } || $data;
214 } else {
215 push @out, $data;
216 }
217 183 dpavlin } elsif ( my $op = $op->{extreme} ) {
218 push @out, $extremes->{$op}->{val}->{$c};
219 178 dpavlin }
220 185 dpavlin warn "## op ",dump( $op ), " out ",dump( $out[$#out] ),$/ if $debug > 1;
221 178 dpavlin }
222 print join(' ',@out),$/;
223 53 dpavlin }