Revision 187 (by dpavlin, 2008/05/22 12:54:22) added back-tick like syntax for --join --on which enables to
replace some data with more verbose one with OpenVZ examples
#!/usr/bin/perl

use warnings;
use strict;

#
# emulate GROUP BY availabe in relational database for shell pipes
#
# Dobrica Pavlinusic <dpavlin@rot13.org>, http://www.rot13.org/~dpavlin/
#
# SYNTAX: groupby.pl [-d delimiter_char] [-f](sum|count|min|max):]\d+,...
#
# by default, delimiter is space and -f flag is optional -- similar to cut(1)
# you can also use matematical operations between columns
#
# - show memory usage of processes grouped by process name
# ps axv | awk '{ print $10,($6+$7+$8) * 1024 }' | groupby.pl sum:2,1,count:1 | sort -k1 -nr | sum.pl -h
#
# - same example with added minimum and maximum usage
# ps axv | awk '{ print $10,($6+$7+$8) * 1024 }' | groupby.pl sum:2,1,count:1,min:2,max:2 | sort -rn | align | head -10
#
# - stats of pppd packages traffic
# sudo grep pppd /var/log/debug | egrep '(rcvd|sent)' | ./groupby.pl 6-8 | sort -rn
#
# - you can also use do calculations of fields in-line
# ps axv | ./groupby.pl 'sum:(($6+$7+$8)*1024),10,count:10,min:($6+$7+$8),max:($6+$7+$8)' | sort -k1 -nr | head | align | sum.pl -h
#
# - OpenVZ examples:
# vzps -E axv --no-headers | ./groupby.pl 'sum:($7+$8+$9*1024),count:1,1' | sort -rn | ./sum.pl -h
#
# - example how to join some other data (convert OpenVZ VEID to hostname)
# vzps -E axv --no-headers | ./groupby.pl 'sum:($7+$8+$9*1024),1,count:1' --join 'sudo vzlist -H -o veid,hostname' --on 2 | sort -rn | align | ./sum.pl -h

use Data::Dump qw/dump/;
use Getopt::Long;
use Carp qw/confess/;
use IPC::System::Simple qw/capture/;

my $debug = 0;
my $delimiter;
my $cols;
my $join;
my $join_on = 1;

GetOptions(
	'd|delimiter=s'	=> \$delimiter,
	'f|fields=s'	=> \$cols,
	'j|join=s'	=> \$join,
	'on|join-on=i'	=> \$join_on,
	'debug+'	=> \$debug,
);

if ( defined $delimiter ) {
	$delimiter = "\Q$delimiter\E";
} else {
	$delimiter = '\s+';
}

$cols ||= shift @ARGV;
die "usage: $0 [1|sum:3,1,count:1,max:1,min:1]\n" unless $cols;

# rewrite dummy 1- to full form
if ( $cols !~ m/,/ ) {
	$cols = "count:$cols,show:$cols";
	$cols .= '-' unless $cols =~ m/-/;
}

my $group_by;
my @col_op;
my $sum_cols;	# check count of summed columns
my @sum_cols;
my @show_cols;
my $extremes;
foreach my $col ( split(/,/,$cols) ) {
	if ( $col =~ m/^count:(.+)$/ ) {
		# FIXME check if key is valid
		confess "count:$1 can be used only once in $cols\n" if $group_by;
		push @col_op, { count => $1 };
		$group_by = $1;
	} elsif ( $col =~ m/^sum:(.+)$/i ) {
		push @col_op, { sum => $1 };
		push @sum_cols, $1;
		$sum_cols->{$1}++;
		warn "column $1 sum created more than once in $cols\n" if $sum_cols->{$1} > 1;
	} elsif ( $col =~ m/^(min|max):(.+)$/ ) {
		push @col_op, { 'extreme' => $1 };
		$extremes->{ $1 }->{col} = $2;
	} else {
		$col =~ s/^show://i;
		push @col_op, { show => $col };
		push @show_cols, $col;
	}
}

my $col;

warn "## $cols col_op = ",dump( @col_op )," debug level: $debug\n" if $debug > 1;
warn "# group by $group_by show ", dump( @show_cols ), " sum ",dump( @sum_cols ),$/ if $debug;

# convert column range to arrray of column values
sub col_data_range {
	my ( $range, $want_data, $line ) = @_;
	warn "## col_data_range",dump( $range, $want_data, $line ),$/ if $debug > 1;
	my @cols;
	if ($range =~ m/^(\d+)-$/) {
		push @cols,$_ foreach ( $1 .. $#$line );
	} elsif ($range =~ m/^-(\d+)$/) {
		push @cols,$_ foreach ( 0 .. $1 );
	} elsif ($range =~ m/^(\d+)-(\d+)$/) {
		push @cols,$_ foreach ( $1 .. $2 );
	} elsif ($range =~ m/^(\d+)$/) {
		push @cols,$1;
	} elsif ( $range =~ m/^\((.+[+\-\/\*].+)\)$/ ) {
		my $code = $1;
		if ( $want_data ) {
			sub col_val {
				my ($line,$col) = @_;
				my $val = $line->[$col];
				return 0 unless $val =~ m/^\d+$/;
				confess "can't find column $col in ",dump( $line ) unless defined $val;
				return $val;
			}
			$code =~ s/\$(\d+)/col_val($line,$1)/ge;
			my $val = eval $code;
			confess "eval $code from $range returned $@\n" if $@;
			warn "# $range => $code = $val\n" if $debug;
			# fake new column
			push @$line, $val;
			push @cols, $#$line;
		} else {
			push @cols, $code;
		}
	} else {
		confess "can't parse range $range\n";
	}

	if ( $want_data ) {
		warn "## $range -> cols ", dump( @cols ), " line " . dump( $line ) . " => " . dump( @$line[ @cols ] ), $/ if $debug > 1;
		return @$line[ @cols ] if wantarray;
		return join(' ', @$line[ @cols ]);
	} else {
		warn "## $range -> cols ", dump( @cols ) if $debug > 1;
		return @cols;
	}
}

while(<STDIN>) {
	chomp;
	s/^\s+//;
	my $line = [ '', split(/$delimiter/,$_) ];

	# implement count for columns

	my $key;
	if ($group_by =~ m/,/) {
		foreach my $c (split(/,/,$group_by)) {
			$key .= join('!', col_data_range($c, 1, $line));
		}
	} else {
		$key = join('!', col_data_range($group_by, 1, $line));
	}

	$col->{count}->{$key}++;
	warn "## key:$key count:$col->{count}->{$key}\n" if $debug > 1;

	# implement sum for columns
	foreach my $c ( @sum_cols ) {
		$col->{sum}->{$key}->{$c} += $_ foreach col_data_range( $c, 1, $line );
	}

	# save columns to show
	$col->{show}->{$key}->{$_} = col_data_range( $_, 1, $line ) foreach @show_cols;

	# max/min
	foreach my $op ( keys %$extremes ) {
		my $col = $extremes->{$op}->{col};
		my $val = col_data_range( $col, 1, $line );
		$extremes->{$op}->{val}->{$key} = $val if ! defined $extremes->{$op}->{val}->{$key};
		next unless defined $val; #$val =~ m/^\d+$/;
		my $code = '$val ' . ( $op eq 'min' ? '<' : '>' ) . ' $extremes->{$op}->{val}->{$key}';
		warn "# $code val = ",dump( $val ) if $debug;
		$extremes->{$op}->{val}->{$key} = $val if eval $code;
		confess "$@\t$code\n" if $@;
	}
}

if ( $debug ) {
	warn "# col = ",dump( $col ),$/;
	warn "# extremes = ",dump( $extremes );
}

my $join_data;
warn "# join on $join_on from $join\n" if $debug;
map {
	s/^\s+//;
	s/\s+$//;
	my @v = split(/\s+/,$_,2);
	$join_data->{ $v[0] } = $v[1];
} capture( $join ) if $join;

warn "# $join join_data = ",dump( $join_data ) if $debug;

foreach my $c (sort keys %{$col->{count}}) {
	my @out;
	foreach my $op ( @col_op ) {
		if ( my $val = $op->{sum} ) {
			push @out, $col->{sum}->{$c}->{$val};
		} elsif ( $op->{count} ) {
			push @out, $col->{count}->{$c};
		} elsif ( $val = $op->{show} ) {
			my $data = $col->{show}->{$c}->{$val};
			if ( $join_data && $#out == ( $join_on - 2 ) ) {
				push @out, $join_data->{ $data } || $data;
			} else {
				push @out, $data;
			}
		} elsif ( my $op = $op->{extreme} ) {
			push @out, $extremes->{$op}->{val}->{$c};
		}
		warn "## op ",dump( $op ), " out ",dump( $out[$#out] ),$/ if $debug > 1;
	}
	print join(' ',@out),$/;
}