# This program is copyright 2008-2011 Percona Ireland Ltd.
# Feedback and improvements are welcome.
#
# THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, version 2; OR the Perl Artistic License.  On UNIX and similar
# systems, you can issue `man perlgpl' or `man perlartistic' to read these
# licenses.
#
# You should have received a copy of the GNU General Public License along with
# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
# Place, Suite 330, Boston, MA  02111-1307  USA.
# ###########################################################################
# EventAggregator package
# ###########################################################################
{
# Package: EventAggregator
# EventAggregator aggregates event values and calculates basic statistics.
package EventAggregator;

use strict;
use warnings FATAL => 'all';
use English qw(-no_match_vars);
use constant PTDEBUG => $ENV{PTDEBUG} || 0;

use List::Util qw(min max);
use Data::Dumper;
$Data::Dumper::Indent    = 1;
$Data::Dumper::Sortkeys  = 1;
$Data::Dumper::Quotekeys = 0;

# ###########################################################################
# Set up some constants for bucketing values.  It is impossible to keep all
# values seen in memory, but putting them into logarithmically scaled buckets
# and just incrementing the bucket each time works, although it is imprecise.
# See http://code.google.com/p/maatkit/wiki/EventAggregatorInternals.
# ###########################################################################
use constant BUCK_SIZE   => 1.05;
use constant BASE_LOG    => log(BUCK_SIZE);
use constant BASE_OFFSET => abs(1 - log(0.000001) / BASE_LOG); # 284.1617969
use constant NUM_BUCK    => 1000;
use constant MIN_BUCK    => .000001;

# Used in buckets_of() to map buckets of log10 to log1.05 buckets.
my @buck_vals = map { bucket_value($_); } (0..NUM_BUCK-1);

# Sub: new
#
# Parameters:
#   %args - Arguments
#
# Required Arguments:
#   groupby - Attribute to group/aggregate classes by.
#   worst   - Attribute which defines the worst event in a class.
#             Samples of the worst attribute are saved.
#
# Optional Arguments:
#   attributes        - Hashref of attributes to aggregate.  Keys are
#                       attribute names used in the EventAggregator object
#                       and values are attribute names used to get values
#                       from events.  Multiple attrib names in the arrayref
#                       specify alternate attrib names in the event.
#                       Example: Fruit => ['apple', 'orange'].  An attrib
#                       called "Fruit" will be created using the event's
#                       "apple" value or, if that attrib doesn't exist, its
#                       "orange" value.  If this option isn't specified,
#                       then all attributes are auto-detected and aggregated.
#   ignore_attributes - Arrayref of auto-detected attributes to ignore.
#                       This does not apply to the attributes specified
#                       with the optional attributes option above.
#   unroll_limit      - If this many events have been processed and some
#                       handlers haven't been generated yet (due to lack
#                       of sample data), unroll the loop anyway.
#                       (default 1000)
#   attrib_limit      - Sanity limit for attribute values.  If the value
#                       exceeds the limit, use the last-seen value for this
#                       class; if none, then 0.
#   type_for          - Hashref of attribute=>type pairs.  See $type in
#                       make_handler() for the list of types.
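#
# Example:
#   A minimal construction sketch.  The attribute names below are only
#   illustrative; pt-query-digest typically groups events by a query
#   fingerprint and uses Query_time as the worst-defining attribute:
#
#     my $ea = new EventAggregator(
#        groupby => 'fingerprint',
#        worst   => 'Query_time',
#     );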
#
# Returns:
#   EventAggregator object
sub new {
   my ( $class, %args ) = @_;
   foreach my $arg ( qw(groupby worst) ) {
      die "I need a $arg argument" unless $args{$arg};
   }
   my $attributes = $args{attributes} || {};
   my $self = {
      groupby        => $args{groupby},
      detect_attribs => scalar keys %$attributes == 0 ? 1 : 0,
      all_attribs    => [ keys %$attributes ],
      ignore_attribs => {
         map  { $_ => $args{attributes}->{$_} }
         grep { $_ ne $args{groupby} }
         @{$args{ignore_attributes}}
      },
      attributes     => {
         map  { $_ => $args{attributes}->{$_} }
         grep { $_ ne $args{groupby} }
         keys %$attributes
      },
      alt_attribs    => {
         map  { $_ => make_alt_attrib(@{$args{attributes}->{$_}}) }
         grep { $_ ne $args{groupby} }
         keys %$attributes
      },
      worst          => $args{worst},
      unroll_limit   => $ENV{PT_QUERY_DIGEST_CHECK_ATTRIB_LIMIT} || 1000,
      attrib_limit   => $args{attrib_limit},
      result_classes => {},
      result_globals => {},
      result_samples => {},
      class_metrics  => {},
      global_metrics => {},
      n_events       => 0,
      unrolled_loops => undef,
      type_for       => { %{$args{type_for} || { Query_time => 'num' }} },
   };
   return bless $self, $class;
}

# Delete all collected data, but don't delete things like the generated
# subroutines.  Resetting aggregated data is an interesting little exercise.
# The generated functions that do aggregation have private namespaces with
# references to some of the data.  Thus, they will not necessarily do as
# expected if the stored data is simply wiped out.  Instead, it needs to be
# zeroed out without replacing the actual objects.
sub reset_aggregated_data {
   my ( $self ) = @_;
   foreach my $class ( values %{$self->{result_classes}} ) {
      foreach my $attrib ( values %$class ) {
         delete @{$attrib}{keys %$attrib};
      }
   }
   foreach my $class ( values %{$self->{result_globals}} ) {
      delete @{$class}{keys %$class};
   }
   delete @{$self->{result_samples}}{keys %{$self->{result_samples}}};
   $self->{n_events} = 0;
}

# Aggregate an event hashref's properties.  Code is built on the fly to do
# this, based on the values being passed in.  After code is built for every
# attribute (or unroll_limit events are seen and we decide to give up), the
# little bits of code get unrolled into a whole subroutine to handle events.
# For that reason, you can't re-use an instance.
sub aggregate {
   my ( $self, $event ) = @_;

   my $group_by = $event->{$self->{groupby}};
   return unless defined $group_by;

   $self->{n_events}++;
   PTDEBUG && _d('Event', $self->{n_events});

   # Run only unrolled loops if available.
   return $self->{unrolled_loops}->($self, $event, $group_by)
      if $self->{unrolled_loops};

   # For the first unroll_limit events, auto-detect new attribs and
   # run attrib handlers.
   if ( $self->{n_events} <= $self->{unroll_limit} ) {

      $self->add_new_attributes($event) if $self->{detect_attribs};

      ATTRIB:
      foreach my $attrib ( keys %{$self->{attributes}} ) {

         # Attrib auto-detection can add a lot of attributes which some
         # events may or may not have.  Aggregating a nonexistent attrib is
         # wasteful, so we check that the attrib or one of its alternates
         # exists.  If one does, then we leave the attrib alone because the
         # handler sub will also check alternates.
         if ( !exists $event->{$attrib} ) {
            PTDEBUG && _d("attrib doesn't exist in event:", $attrib);
            my $alt_attrib = $self->{alt_attribs}->{$attrib}->($event);
            PTDEBUG && _d('alt attrib:', $alt_attrib);
            next ATTRIB unless $alt_attrib;
         }

         # The value of the attribute ( $group_by ) may be an arrayref.
         GROUPBY:
         foreach my $val ( ref $group_by ?
                              @$group_by : ($group_by) ) {
            my $class_attrib  = $self->{result_classes}->{$val}->{$attrib} ||= {};
            my $global_attrib = $self->{result_globals}->{$attrib} ||= {};
            my $samples       = $self->{result_samples};
            my $handler       = $self->{handlers}->{ $attrib };

            if ( !$handler ) {
               $handler = $self->make_handler(
                  event      => $event,
                  attribute  => $attrib,
                  alternates => $self->{attributes}->{$attrib},
                  worst      => $self->{worst} eq $attrib,
               );
               $self->{handlers}->{$attrib} = $handler;
            }
            next GROUPBY unless $handler;
            $samples->{$val} ||= $event; # Initialize to the first event.
            $handler->($event, $class_attrib, $global_attrib, $samples, $group_by);
         }
      }
   }
   else {
      # After unroll_limit events, unroll the loops.
      $self->_make_unrolled_loops($event);
      # Run unrolled loops here once.  Next time, they'll be run
      # before this if-else.
      $self->{unrolled_loops}->($self, $event, $group_by);
   }

   return;
}

sub _make_unrolled_loops {
   my ( $self, $event ) = @_;

   my $group_by = $event->{$self->{groupby}};

   # All attributes have handlers, so let's combine them into one faster sub.
   # Start by getting direct handles to the location of each data store and
   # thing that would otherwise be looked up via hash keys.
   my @attrs   = grep { $self->{handlers}->{$_} } keys %{$self->{attributes}};
   my $globs   = $self->{result_globals}; # Global stats for each attrib.
   my $samples = $self->{result_samples};

   # Now the tricky part -- must make sure only the desired variables from
   # the outer scope are re-used, and any variables that should have their
   # own scope are declared within the subroutine.
   my @lines = (
      'my ( $self, $event, $group_by ) = @_;',
      'my ($val, $class, $global, $idx);',
      (ref $group_by ? ('foreach my $group_by ( @$group_by ) {') : ()),
      # Create and get each attribute's storage
      'my $temp = $self->{result_classes}->{ $group_by } ||= { map { $_ => { } } @attrs };',
      '$samples->{$group_by} ||= $event;', # Always start with the first.
   );
   foreach my $i ( 0 .. $#attrs ) {
      # Access through array indexes, it's faster than hash lookups
      push @lines, (
         '$class = $temp->{\'' . $attrs[$i] . '\'};',
         '$global = $globs->{\'' . $attrs[$i] . '\'};',
         $self->{unrolled_for}->{$attrs[$i]},
      );
   }
   if ( ref $group_by ) {
      push @lines, '}'; # Close the loop opened above
   }
   @lines = map { s/^/ /gm; $_ } @lines; # Indent for debugging
   unshift @lines, 'sub {';
   push @lines, '}';

   # Make the subroutine.
   my $code = join("\n", @lines);
   PTDEBUG && _d('Unrolled subroutine:', @lines);
   my $sub = eval $code;
   die $EVAL_ERROR if $EVAL_ERROR;
   $self->{unrolled_loops} = $sub;

   return;
}

# Return the aggregated results.
sub results {
   my ( $self ) = @_;
   return {
      classes => $self->{result_classes},
      globals => $self->{result_globals},
      samples => $self->{result_samples},
   };
}

sub set_results {
   my ( $self, $results ) = @_;
   $self->{result_classes} = $results->{classes};
   $self->{result_globals} = $results->{globals};
   $self->{result_samples} = $results->{samples};
   return;
}

sub stats {
   my ( $self ) = @_;
   return {
      classes => $self->{class_metrics},
      globals => $self->{global_metrics},
   };
}

# Return the attributes that this object is tracking, and their data types,
# as a hashref of name => type.
sub attributes {
   my ( $self ) = @_;
   return $self->{type_for};
}

sub set_attribute_types {
   my ( $self, $attrib_types ) = @_;
   $self->{type_for} = $attrib_types;
   return;
}

# Returns the type of the attribute (as decided by the aggregation process,
# which inspects the values).
sub type_for {
   my ( $self, $attrib ) = @_;
   return $self->{type_for}->{$attrib};
}

# Sub: make_handler
#   Make an attribute handler subroutine for aggregate().
#   Each attribute needs a handler to keep track of the min and max values,
#   worst sample, etc.  Handlers differ depending on the type of attribute
#   (num, bool or string).
#
# Parameters:
#   %args - Arguments
#
# Required Arguments:
#   event     - Event hashref
#   attribute - Attribute name
#
# Optional Arguments:
#   alternates - Arrayref of alternate names for the attribute
#   worst      - Keep a sample of the attribute's worst value (default no)
#
# Returns:
#   A subroutine that can aggregate the attribute.
sub make_handler {
   my ( $self, %args ) = @_;
   my @required_args = qw(event attribute);
   foreach my $arg ( @required_args ) {
      die "I need a $arg argument" unless $args{$arg};
   }
   my ($event, $attrib) = @args{@required_args};

   my $val;
   eval { $val = $self->_get_value(%args); };
   if ( $EVAL_ERROR ) {
      PTDEBUG && _d("Cannot make", $attrib, "handler:", $EVAL_ERROR);
      return;
   }
   return unless defined $val; # can't determine type if it's undef

   # Ripped off from Regexp::Common::number and modified.
   my $float_re = qr{[+-]?(?:(?=\d|[.])\d+(?:[.])\d{0,})(?:E[+-]?\d+)?}i;
   my $type = $self->type_for($attrib)           ? $self->type_for($attrib)
            : $attrib =~ m/_crc$/                ? 'string'
            : $val    =~ m/^(?:\d+|$float_re)$/o ? 'num'
            : $val    =~ m/^(?:Yes|No)$/         ? 'bool'
            :                                      'string';
   PTDEBUG && _d('Type for', $attrib, 'is', $type, '(sample:', $val, ')');
   $self->{type_for}->{$attrib} = $type;

   # ########################################################################
   # Begin creating the handler subroutine by writing lines of code.
   # ########################################################################
   my @lines;

   # Some attrib types don't need us to track sum, unq or all--and some do.
   my %track = (
      sum => $type =~ m/num|bool/    ? 1 : 0, # sum of values
      unq => $type =~ m/bool|string/ ? 1 : 0, # count of unique values seen
      all => $type eq 'num'          ? 1 : 0, # all values in bucketed list
   );

   # First, do any transformations to the value if needed.  Right now,
   # it's just bool type attribs that need to be transformed.
   my $trf = ($type eq 'bool') ? q{(($val || '') eq 'Yes') ? 1 : 0} : undef;
   if ( $trf ) {
      push @lines, q{$val = } . $trf . ';';
   }

   # Handle broken Query_time like 123.124345.8382 (issue 234).
   if ( $attrib eq 'Query_time' ) {
      push @lines, (
         '$val =~ s/^(\d+(?:\.\d+)?).*/$1/;',
         '$event->{\''.$attrib.'\'} = $val;',
      );
   }

   # Make sure the value is constrained to legal limits.  If it's out of
   # bounds, just use the last-seen value for it.
   if ( $type eq 'num' && $self->{attrib_limit} ) {
      push @lines, (
         "if ( \$val > $self->{attrib_limit} ) {",
         ' $val = $class->{last} ||= 0;',
         '}',
         '$class->{last} = $val;',
      );
   }

   # Update values for this attrib in the class and global stores.  We write
   # code for each store.  The placeholder word PLACE is replaced with either
   # $class or $global at the end of the loop.
   my $lt = $type eq 'num' ? '<' : 'lt';
   my $gt = $type eq 'num' ? '>' : 'gt';
   foreach my $place ( qw($class $global) ) {
      my @tmp; # hold lines until PLACE placeholder is replaced

      # Track count of any and all values seen for this attribute.
      # This is mostly used for class->Query_time->cnt which represents
      # the number of queries in the class because all queries (should)
      # have a Query_time attribute.
      push @tmp, '++PLACE->{cnt};'; # count of all values seen

      # CRC attribs are bucketed in 1k buckets by % 1_000.  We must
      # convert the val early so min and max don't show, e.g. 996791064
      # whereas unq will contain 64 (996791064 % 1_000).
      if ( $attrib =~ m/_crc$/ ) {
         push @tmp, '$val = $val % 1_000;';
      }

      # Track min, max and sum of values.
      # Min and max for strings are mostly used for timestamps; min ts is
      # earliest and max ts is latest.
      push @tmp, (
         'PLACE->{min} = $val if !defined PLACE->{min} || $val '
            . $lt . ' PLACE->{min};',
      );
      push @tmp, (
         'PLACE->{max} = $val if !defined PLACE->{max} || $val '
            . $gt . ' PLACE->{max};',
      );
      if ( $track{sum} ) {
         push @tmp, 'PLACE->{sum} += $val;';
      }

      # Save all values in a bucketed list.  See bucket_idx() below.
      if ( $track{all} ) {
         push @tmp, (
            'exists PLACE->{all} or PLACE->{all} = {};',
            '++PLACE->{all}->{ EventAggregator::bucket_idx($val) };',
         );
      }

      # Replace PLACE with current variable, $class or $global.
      push @lines, map { s/PLACE/$place/g; $_ } @tmp;
   }

   # We only save unique and worst values for the class, not globally.
   if ( $track{unq} ) {
      push @lines, '++$class->{unq}->{$val};';
   }
   if ( $args{worst} ) {
      my $op = $type eq 'num' ? '>=' : 'ge';
      push @lines, (
         'if ( $val ' . $op . ' ($class->{max} || 0) ) {',
         ' $samples->{$group_by} = $event;',
         '}',
      );
   }

   # Make the core code.  This part is saved for later, as part of an
   # "unrolled" subroutine.
   my @unrolled = (
      # Get $val from primary attrib name.
      "\$val = \$event->{'$attrib'};",

      # Get $val from alternate attrib names.
      ( map  { "\$val = \$event->{'$_'} unless defined \$val;" }
        grep { $_ ne $attrib } @{$args{alternates}} ),

      # Execute the code lines, if $val is defined.
      'defined $val && do {',
         @lines,
      '};',
   );
   $self->{unrolled_for}->{$attrib} = join("\n", @unrolled);

   # Finally, make a complete subroutine by wrapping the core code inside
   # a "sub { ... }" template.
   my @code = (
      'sub {',
         # Get args and define all variables.
         'my ( $event, $class, $global, $samples, $group_by ) = @_;',
         'my ($val, $idx);',
         # Core code from above.
         $self->{unrolled_for}->{$attrib},
         'return;',
      '}',
   );
   $self->{code_for}->{$attrib} = join("\n", @code);
   PTDEBUG && _d($attrib, 'handler code:', $self->{code_for}->{$attrib});
   my $sub = eval $self->{code_for}->{$attrib};
   if ( $EVAL_ERROR ) {
      die "Failed to compile $attrib handler code: $EVAL_ERROR";
   }

   return $sub;
}

# Sub: bucket_idx
#   Return the bucket number for the given value.  Bucket numbers are
#   zero-indexed, so although there are 1,000 buckets (NUM_BUCK), 999 is
#   the greatest idx.
#
#   Notice that this sub is not a class method, so either call it
#   from inside this module like bucket_idx() or outside this module
#   like EventAggregator::bucket_idx().
#
#   The bucketed list works this way: each range of values from MIN_BUCK in
#   increments of BUCK_SIZE (that is 5%) we consider a bucket.  We keep
#   NUM_BUCK buckets.  The upper end of the range is more than 1.5e15 so it
#   should be big enough for almost anything.  The buckets are accessed by
#   log base BUCK_SIZE, so floor(log(N)/log(BUCK_SIZE)).  The smallest
#   bucket's index is -284.  We shift all values up 284 so we have values
#   from 0 to 999 that can be used as array indexes.  A value that falls
#   into a bucket simply increments the array entry.  We do NOT use
#   POSIX::floor() because it is too expensive.
#
#   This eliminates the need to keep and sort all values to calculate median,
#   standard deviation, 95th percentile, etc.  So memory usage is bounded by
#   the number of distinct aggregated values, not the number of events.
#
#   TODO: could export this by default to avoid having to specify package::.
#
# Parameters:
#   $val - Numeric value to bucketize
#
# Returns:
#   Bucket number (0 to NUM_BUCK-1) for the value
sub bucket_idx {
   my ( $val ) = @_;
   return 0 if $val < MIN_BUCK;
   my $idx = int(BASE_OFFSET + log($val)/BASE_LOG);
   return $idx > (NUM_BUCK-1) ?
      (NUM_BUCK-1) : $idx;
}

# Sub: bucket_value
#   Return the value corresponding to the given bucket.  The value of each
#   bucket is the first value that it covers.  So the value of bucket 1 is
#   0.000001000 because it covers [0.000001000, 0.000001050).
#
#   Notice that this sub is not a class method, so either call it
#   from inside this module like bucket_value() or outside this module
#   like EventAggregator::bucket_value().
#
#   TODO: could export this by default to avoid having to specify package::.
#
# Parameters:
#   $bucket - Bucket number (0 to NUM_BUCK-1)
#
# Returns:
#   Numeric value corresponding to the bucket
sub bucket_value {
   my ( $bucket ) = @_;
   return 0 if $bucket == 0;
   die "Invalid bucket: $bucket" if $bucket < 0 || $bucket > (NUM_BUCK-1);
   # $bucket - 1 because buckets are shifted up by 1 to handle zero values.
   return (BUCK_SIZE**($bucket-1)) * MIN_BUCK;
}

# Map the 1,000 base 1.05 buckets to 8 base 10 buckets.  Returns an array
# of 1,000 buckets, the value of each represents its index in an 8 bucket
# base 10 array.  For example: base 10 bucket 0 represents vals (0, 0.000010),
# and base 1.05 buckets 0..47 represent vals (0, 0.000010401).  So the first
# 48 elements of the returned array will have 0 as their values.
# TODO: right now it's hardcoded to buckets of 10, in the future maybe not.
{
   my @buck_tens;
   sub buckets_of {
      return @buck_tens if @buck_tens;

      # To make a more precise map, we first set the starting values for
      # each of the 8 base 10 buckets.
      my $start_bucket  = 0;
      my @base10_starts = (0);
      map { push @base10_starts, (10**$_)*MIN_BUCK } (1..7);

      # Then find the base 1.05 buckets that correspond to each
      # base 10 bucket.  The last value in each bucket's range belongs
      # to the next bucket, so $next_bucket-1 represents the real last
      # base 1.05 bucket in which the base 10 bucket's range falls.
      for my $base10_bucket ( 0..($#base10_starts-1) ) {
         my $next_bucket = bucket_idx( $base10_starts[$base10_bucket+1] );
         PTDEBUG && _d('Base 10 bucket', $base10_bucket, 'maps to',
            'base 1.05 buckets', $start_bucket, '..', $next_bucket-1);
         for my $base1_05_bucket ($start_bucket..($next_bucket-1)) {
            $buck_tens[$base1_05_bucket] = $base10_bucket;
         }
         $start_bucket = $next_bucket;
      }

      # Map all remaining base 1.05 buckets to base 10 bucket 7 which
      # is for vals > 10.
      map { $buck_tens[$_] = 7 } ($start_bucket..(NUM_BUCK-1));

      return @buck_tens;
   }
}

# Calculate 95%, stddev and median for numeric attributes in the
# global and classes stores that have all values (1k buckets).
# Save the metrics in global_metrics and class_metrics.
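#
# For example, a minimal sketch of reading the computed metrics (assuming
# $ea is an EventAggregator that has already aggregated events, and
# $fingerprint is one of its groupby values; both names are illustrative):
#
#   $ea->calculate_statistical_metrics();
#   my $stats = $ea->stats();
#   my $qt    = $stats->{classes}->{$fingerprint}->{Query_time};
#   # $qt->{median}, $qt->{pct_95} and $qt->{stddev} are now available.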
sub calculate_statistical_metrics {
   my ( $self, %args ) = @_;
   my $classes        = $self->{result_classes};
   my $globals        = $self->{result_globals};
   my $class_metrics  = $self->{class_metrics};
   my $global_metrics = $self->{global_metrics};
   PTDEBUG && _d('Calculating statistical_metrics');
   foreach my $attrib ( keys %$globals ) {
      if ( exists $globals->{$attrib}->{all} ) {
         $global_metrics->{$attrib} = $self->_calc_metrics(
            $globals->{$attrib}->{all},
            $globals->{$attrib},
         );
      }

      foreach my $class ( keys %$classes ) {
         if ( exists $classes->{$class}->{$attrib}->{all} ) {
            $class_metrics->{$class}->{$attrib} = $self->_calc_metrics(
               $classes->{$class}->{$attrib}->{all},
               $classes->{$class}->{$attrib}
            );
         }
      }
   }

   return;
}

# Given a hashref of vals, returns a hashref with the following
# statistical metrics:
#
#    pct_95 => top bucket value in the 95th percentile
#    cutoff => how many values fall into the 95th percentile
#    stddev => of all values
#    median => of all values
#
# The vals hashref represents the buckets as per the above (see the comments
# at the top of this file).  $args should contain cnt, min and max properties.
sub _calc_metrics {
   my ( $self, $vals, $args ) = @_;
   my $statistical_metrics = {
      pct_95 => 0,
      stddev => 0,
      median => 0,
      cutoff => undef,
   };

   # These cases might happen when there is nothing to get from the event;
   # for example, processlist sniffing doesn't gather Rows_examined, so $args
   # won't have {cnt} or other properties.
   return $statistical_metrics
      unless defined $vals && %$vals && $args->{cnt};

   # Return accurate metrics for some cases.
   my $n_vals = $args->{cnt};
   if ( $n_vals == 1 || $args->{max} == $args->{min} ) {
      my $v      = $args->{max} || 0;
      my $bucket = int(6 + ( log($v > 0 ? $v : MIN_BUCK) / log(10)));
      $bucket    = $bucket > 7 ? 7 : $bucket < 0 ? 0 : $bucket;
      return {
         pct_95 => $v,
         stddev => 0,
         median => $v,
         cutoff => $n_vals,
      };
   }
   elsif ( $n_vals == 2 ) {
      foreach my $v ( $args->{min}, $args->{max} ) {
         my $bucket = int(6 + ( log($v && $v > 0 ? $v : MIN_BUCK) / log(10)));
         $bucket = $bucket > 7 ? 7 : $bucket < 0 ? 0 : $bucket;
      }
      my $v    = $args->{max} || 0;
      my $mean = (($args->{min} || 0) + $v) / 2;
      return {
         pct_95 => $v,
         stddev => sqrt((($v - $mean) ** 2) * 2),
         median => $mean,
         cutoff => $n_vals,
      };
   }

   # Determine cutoff point for 95% if there are at least 10 vals.  Cutoff
   # serves also for the number of vals left in the 95%.  E.g. with 50 vals
   # the cutoff is 47 which means there are 47 vals: 0..46.  $cutoff is NOT
   # an array index.
   my $cutoff = $n_vals >= 10 ? int ( $n_vals * 0.95 ) : $n_vals;
   $statistical_metrics->{cutoff} = $cutoff;

   # Calculate the standard deviation and median of all values.
   my $total_left = $n_vals;
   my $top_vals   = $n_vals - $cutoff; # vals > 95th
   my $sum_excl   = 0;
   my $sum        = 0;
   my $sumsq      = 0;
   my $mid        = int($n_vals / 2);
   my $median     = 0;
   my $prev       = NUM_BUCK-1; # Used for getting median when $cutoff is odd
   my $bucket_95  = 0; # top bucket in 95th

   PTDEBUG && _d('total vals:', $total_left, 'top vals:', $top_vals,
      'mid:', $mid);

   # In ancient times we kept an array of 1k buckets for each numeric
   # attrib.  Each such array cost 32_300 bytes of memory (that's not
   # a typo; yes, it was verified).  But measurements showed that only
   # 1% of the buckets were used on average, meaning 99% of 32_300 was
   # wasted.  Now we store only the used buckets in a hashref which we
   # map to a 1k bucket array for processing, so we don't have to tinker
   # with the delicate code below.
   # http://code.google.com/p/maatkit/issues/detail?id=866
   my @buckets = map { 0 } (0..NUM_BUCK-1);
   map { $buckets[$_] = $vals->{$_} } keys %$vals;
   $vals = \@buckets;  # repoint vals from given hashref to our array

   BUCKET:
   for my $bucket ( reverse 0..(NUM_BUCK-1) ) {
      my $val = $vals->[$bucket];
      next BUCKET unless $val;

      $total_left -= $val;
      $sum_excl   += $val;
      $bucket_95   = $bucket if !$bucket_95 && $sum_excl > $top_vals;

      if ( !$median && $total_left <= $mid ) {
         $median = (($cutoff % 2) || ($val > 1)) ? $buck_vals[$bucket]
                 : ($buck_vals[$bucket] + $buck_vals[$prev]) / 2;
      }

      $sum   += $val * $buck_vals[$bucket];
      $sumsq += $val * ($buck_vals[$bucket]**2);
      $prev   = $bucket;
   }

   my $var      = $sumsq/$n_vals - ( ($sum/$n_vals) ** 2 );
   my $stddev   = $var > 0 ? sqrt($var) : 0;
   my $maxstdev = (($args->{max} || 0) - ($args->{min} || 0)) / 2;
   $stddev      = $stddev > $maxstdev ? $maxstdev : $stddev;

   PTDEBUG && _d('sum:', $sum, 'sumsq:', $sumsq, 'stddev:', $stddev,
      'median:', $median, 'prev bucket:', $prev,
      'total left:', $total_left, 'sum excl', $sum_excl,
      'bucket 95:', $bucket_95, $buck_vals[$bucket_95]);

   $statistical_metrics->{stddev} = $stddev;
   $statistical_metrics->{pct_95} = $buck_vals[$bucket_95];
   $statistical_metrics->{median} = $median;

   return $statistical_metrics;
}

# Return a hashref of the metrics for some attribute, pre-digested.
# %args is:
#   attrib => the attribute to report on
#   where  => the value of the fingerprint for the attrib
sub metrics {
   my ( $self, %args ) = @_;
   foreach my $arg ( qw(attrib where) ) {
      die "I need a $arg argument" unless defined $args{$arg};
   }
   my $attrib = $args{attrib};
   my $where  = $args{where};

   my $stats      = $self->results();
   my $metrics    = $self->stats();
   my $store      = $stats->{classes}->{$where}->{$attrib};
   my $global_cnt = $stats->{globals}->{$attrib}->{cnt};

   return {
      cnt    => $store->{cnt},
      pct    => $global_cnt && $store->{cnt} ? $store->{cnt} / $global_cnt : 0,
      sum    => $store->{sum},
      min    => $store->{min},
      max    => $store->{max},
      avg    => $store->{sum} && $store->{cnt} ? $store->{sum} / $store->{cnt} : 0,
      median => $metrics->{classes}->{$where}->{$attrib}->{median} || 0,
      pct_95 => $metrics->{classes}->{$where}->{$attrib}->{pct_95} || 0,
      stddev => $metrics->{classes}->{$where}->{$attrib}->{stddev} || 0,
   };
}

# Find the top N or top % event keys, in sorted order, optionally including
# outliers (ol_...) that are notable for some reason.  %args looks like this:
#
#   attrib      order-by attribute (usually Query_time)
#   orderby     order-by aggregate expression (should be numeric, usually sum)
#   total       include events whose summed attribs are <= this number...
#   count       ...or this many events, whichever is less...
#   ol_attrib   ...or events where the 95th percentile of this attribute...
#   ol_limit    ...is greater than this value, AND...
#   ol_freq     ...the event occurred at least this many times.
#
# The return value is two arrayrefs.  The first is a list of arrayrefs of the
# chosen (top) events.  Each arrayref is the event key and an explanation of
# why it was included (top|outlier).  The second is a list of the non-top
# event keys.
sub top_events {
   my ( $self, %args ) = @_;
   my $classes = $self->{result_classes};
   my @sorted = reverse sort { # Sorted list of $groupby values
      ($classes->{$a}->{$args{attrib}}->{$args{orderby}}
         == $classes->{$b}->{$args{attrib}}->{$args{orderby}}) ?
         $a cmp $b
       : $classes->{$a}->{$args{attrib}}->{$args{orderby}}
            <=> $classes->{$b}->{$args{attrib}}->{$args{orderby}}
      }
      grep { # Defensive programming
         defined $classes->{$_}->{$args{attrib}}->{$args{orderby}}
      }
      keys %$classes;

   my @chosen;  # top events
   my @other;   # other events (< top)
   my ($total, $count) = (0, 0);
   foreach my $groupby ( @sorted ) {
      # Events that fall into the top criterion for some reason
      if ( (!$args{total} || $total < $args{total} )
           && ( !$args{count} || $count < $args{count} ) ) {
         push @chosen, [$groupby, 'top', $count+1];
      }

      # Events that are notable outliers
      elsif ( $args{ol_attrib} && (!$args{ol_freq}
           || $classes->{$groupby}->{$args{ol_attrib}}->{cnt} >= $args{ol_freq}) ) {
         my $stats = $self->{class_metrics}->{$groupby}->{$args{ol_attrib}};
         if ( ($stats->{pct_95} || 0) >= $args{ol_limit} ) {
            push @chosen, [$groupby, 'outlier', $count+1];
         }
         else {
            push @other, [$groupby, 'misc', $count+1];
         }
      }

      # Events not in the top criterion
      else {
         push @other, [$groupby, 'misc', $count+1];
      }

      $total += $classes->{$groupby}->{$args{attrib}}->{$args{orderby}};
      $count++;
   }

   return \@chosen, \@other;
}

# Adds all new attributes in $event to $self->{attributes}.
sub add_new_attributes {
   my ( $self, $event ) = @_;
   return unless $event;

   map {
      my $attrib = $_;
      $self->{attributes}->{$attrib}  = [$attrib];
      $self->{alt_attribs}->{$attrib} = make_alt_attrib($attrib);
      push @{$self->{all_attribs}}, $attrib;
      PTDEBUG && _d('Added new attribute:', $attrib);
   }
   grep {
      $_ ne $self->{groupby}
      && !exists $self->{attributes}->{$_}
      && !exists $self->{ignore_attribs}->{$_}
   }
   keys %$event;

   return;
}

# Returns an arrayref of all the attributes that were either given
# explicitly to new() or that were auto-detected.
sub get_attributes {
   my ( $self ) = @_;
   return $self->{all_attribs};
}

sub events_processed {
   my ( $self ) = @_;
   return $self->{n_events};
}

sub make_alt_attrib {
   my ( @attribs ) = @_;

   my $attrib = shift @attribs;    # Primary attribute.
   return sub {} unless @attribs;  # No alternates.

   my @lines;
   push @lines, 'sub { my ( $event ) = @_; my $alt_attrib;';
   push @lines, map {
         "\$alt_attrib = '$_' if !defined \$alt_attrib "
       . "&& exists \$event->{'$_'};"
      } @attribs;
   push @lines, 'return $alt_attrib; }';
   PTDEBUG && _d('alt attrib sub for', $attrib, ':', @lines);
   my $sub = eval join("\n", @lines);
   die if $EVAL_ERROR;
   return $sub;
}

# Merge/add the given list of EventAggregator objects.
# Returns a new EventAggregator obj.
sub merge {
   my ( @ea_objs ) = @_;
   PTDEBUG && _d('Merging', scalar @ea_objs, 'ea');
   return unless scalar @ea_objs;

   # If the ea objs don't all have the same groupby and worst, then merging
   # them will produce a nonsensical result (maybe not if worst differs,
   # but certainly if groupby differs).  And while checking this...
   my $ea1   = shift @ea_objs;
   my $r1    = $ea1->results;
   my $worst = $ea1->{worst}; # for merging, finding worst sample

   # ...get all attributes and their types to properly initialize the
   # returned ea obj.
   my %attrib_types = %{ $ea1->attributes() };
   foreach my $ea ( @ea_objs ) {
      die "EventAggregator objects have different groupby: "
         . "$ea1->{groupby} and $ea->{groupby}"
         unless $ea1->{groupby} eq $ea->{groupby};
      die "EventAggregator objects have different worst: "
         . "$ea1->{worst} and $ea->{worst}"
         unless $ea1->{worst} eq $ea->{worst};

      my $attrib_types = $ea->attributes();
      map {
         $attrib_types{$_} = $attrib_types->{$_}
            unless exists $attrib_types{$_};
      } keys %$attrib_types;
   }

   # First, deep copy the first ea obj.
   # Do not shallow copy; do a deep copy so the returned ea is truly its own
   # obj and does not point to data structs in one of the given ea.
   my $r_merged = {
      classes => {},
      globals => _deep_copy_attribs($r1->{globals}),
      samples => {},
   };
   map {
      $r_merged->{classes}->{$_}
         = _deep_copy_attribs($r1->{classes}->{$_});
      @{$r_merged->{samples}->{$_}}{keys %{$r1->{samples}->{$_}}}
         = values %{$r1->{samples}->{$_}};
   } keys %{$r1->{classes}};

   # Then, merge/add the other eas.  r1* is the eventual return val.
   # r2* is the current ea being merged/added into r1*.
   for my $i ( 0..$#ea_objs ) {
      PTDEBUG && _d('Merging ea obj', ($i + 1));
      my $r2 = $ea_objs[$i]->results;

      # Descend into each class (e.g. unique query/fingerprint), each
      # attribute (e.g. Query_time, etc.), and then each attribute
      # value (e.g. min, max, etc.).  If either a class or attrib is
      # missing in one of the results, deep copy the extant class/attrib;
      # if both exist, add/merge the results.
      eval {
         CLASS:
         foreach my $class ( keys %{$r2->{classes}} ) {
            my $r1_class = $r_merged->{classes}->{$class};
            my $r2_class = $r2->{classes}->{$class};

            if ( $r1_class && $r2_class ) {
               # Class exists in both results.  Add/merge all their attributes.
               CLASS_ATTRIB:
               foreach my $attrib ( keys %$r2_class ) {
                  PTDEBUG && _d('merge', $attrib);
                  if ( $r1_class->{$attrib} && $r2_class->{$attrib} ) {
                     _add_attrib_vals($r1_class->{$attrib}, $r2_class->{$attrib});
                  }
                  elsif ( !$r1_class->{$attrib} ) {
                     PTDEBUG && _d('copy', $attrib);
                     $r1_class->{$attrib} =
                        _deep_copy_attrib_vals($r2_class->{$attrib})
                  }
               }
            }
            elsif ( !$r1_class ) {
               # Class is missing in r1; deep copy it from r2.
               PTDEBUG && _d('copy class');
               $r_merged->{classes}->{$class} = _deep_copy_attribs($r2_class);
            }

            # Update the worst sample if either the r2 sample is worse than
            # the r1 sample or there's no such sample in r1.
            my $new_worst_sample;
            if ( $r_merged->{samples}->{$class} && $r2->{samples}->{$class} ) {
               if (   $r2->{samples}->{$class}->{$worst}
                    > $r_merged->{samples}->{$class}->{$worst} ) {
                  $new_worst_sample = $r2->{samples}->{$class}
               }
            }
            elsif ( !$r_merged->{samples}->{$class} ) {
               $new_worst_sample = $r2->{samples}->{$class};
            }
            # Events don't have references to other data structs
            # so we don't have to worry about doing a deep copy.
            if ( $new_worst_sample ) {
               PTDEBUG && _d('New worst sample:', $worst, '=',
                  $new_worst_sample->{$worst}, 'item:', substr($class, 0, 100));
               my %new_sample;
               @new_sample{keys %$new_worst_sample} = values %$new_worst_sample;
               $r_merged->{samples}->{$class} = \%new_sample;
            }
         }
      };
      if ( $EVAL_ERROR ) {
         warn "Error merging class/sample: $EVAL_ERROR";
      }

      # Same as above but for the global attribs/vals.
      eval {
         GLOBAL_ATTRIB:
         PTDEBUG && _d('Merging global attributes');
         foreach my $attrib ( keys %{$r2->{globals}} ) {
            my $r1_global = $r_merged->{globals}->{$attrib};
            my $r2_global = $r2->{globals}->{$attrib};

            if ( $r1_global && $r2_global ) {
               # Global attrib exists in both results.  Add/merge all its values.
               PTDEBUG && _d('merge', $attrib);
               _add_attrib_vals($r1_global, $r2_global);
            }
            elsif ( !$r1_global ) {
               # Global attrib is missing in r1; deep copy it from r2 global.
               PTDEBUG && _d('copy', $attrib);
               $r_merged->{globals}->{$attrib} = _deep_copy_attrib_vals($r2_global);
            }
         }
      };
      if ( $EVAL_ERROR ) {
         warn "Error merging globals: $EVAL_ERROR";
      }
   }

   # Create a new EventAggregator obj, initialize it with the summed results,
   # and return it.
   my $ea_merged = new EventAggregator(
      groupby    => $ea1->{groupby},
      worst      => $ea1->{worst},
      attributes => { map { $_ => [$_] } keys %attrib_types },
   );
   $ea_merged->set_results($r_merged);
   $ea_merged->set_attribute_types(\%attrib_types);
   return $ea_merged;
}

# Adds/merges vals2 attrib values into vals1.
sub _add_attrib_vals {
   my ( $vals1, $vals2 ) = @_;

   # Assuming both sets of values are for the same attribute (that's the
   # caller's responsibility), each should have the same values (min, max,
   # unq, etc.)
   foreach my $val ( keys %$vals1 ) {
      my $val1 = $vals1->{$val};
      my $val2 = $vals2->{$val};

      if ( (!ref $val1) && (!ref $val2) ) {
         # min, max, cnt, sum should never be undef.
         die "undefined $val value" unless defined $val1 && defined $val2;

         # The value is a scalar.  Compare numerically only if the attrib is
         # numeric; only numeric attribs have a "sum".
         my $is_num = exists $vals1->{sum} ? 1 : 0;
         if ( $val eq 'max' ) {
            if ( $is_num ) {
               $vals1->{$val} = $val1 > $val2  ? $val1 : $val2;
            }
            else {
               $vals1->{$val} = $val1 gt $val2 ? $val1 : $val2;
            }
         }
         elsif ( $val eq 'min' ) {
            if ( $is_num ) {
               $vals1->{$val} = $val1 < $val2  ? $val1 : $val2;
            }
            else {
               $vals1->{$val} = $val1 lt $val2 ? $val1 : $val2;
            }
         }
         else {
            $vals1->{$val} += $val2;
         }
      }
      elsif ( (ref $val1 eq 'ARRAY') && (ref $val2 eq 'ARRAY') ) {
         # Value is an arrayref, so it should be 1k buckets.
         # Should never be empty.
         die "Empty $val arrayref" unless @$val1 && @$val2;

         my $n_buckets = (scalar @$val1) - 1;
         for my $i ( 0..$n_buckets ) {
            $vals1->{$val}->[$i] += $val2->[$i];
         }
      }
      elsif ( (ref $val1 eq 'HASH') && (ref $val2 eq 'HASH') ) {
         # Value is a hashref, probably for unq string occurrences.
         # Should never be empty.
         die "Empty $val hashref" unless %$val1 and %$val2;

         map { $vals1->{$val}->{$_} += $val2->{$_} } keys %$val2;
      }
      else {
         # This shouldn't happen.
         PTDEBUG && _d('vals1:', Dumper($vals1));
         PTDEBUG && _d('vals2:', Dumper($vals2));
         die "$val type mismatch";
      }
   }

   return;
}

# These _deep_copy_* subs only go 1 level deep because, so far,
# no ea data struct has a ref any deeper.
sub _deep_copy_attribs {
   my ( $attribs ) = @_;
   my $copy = {};
   foreach my $attrib ( keys %$attribs ) {
      $copy->{$attrib} = _deep_copy_attrib_vals($attribs->{$attrib});
   }
   return $copy;
}

sub _deep_copy_attrib_vals {
   my ( $vals ) = @_;
   my $copy;
   if ( ref $vals eq 'HASH' ) {
      $copy = {};
      foreach my $val ( keys %$vals ) {
         if ( my $ref_type = ref $val ) {
            if ( $ref_type eq 'ARRAY' ) {
               my $n_elems = (scalar @$val) - 1;
               $copy->{$val} = [ map { undef } ( 0..$n_elems ) ];
               for my $i ( 0..$n_elems ) {
                  $copy->{$val}->[$i] = $vals->{$val}->[$i];
               }
            }
            elsif ( $ref_type eq 'HASH' ) {
               $copy->{$val} = {};
               map { $copy->{$val}->{$_} += $vals->{$val}->{$_} }
                  keys %{$vals->{$val}}
            }
            else {
               die "I don't know how to deep copy a $ref_type reference";
            }
         }
         else {
            $copy->{$val} = $vals->{$val};
         }
      }
   }
   else {
      $copy = $vals;
   }
   return $copy;
}

# Sub: _get_value
#   Get the value of the attribute (or one of its alternates) from the event.
#   Undef is a valid value.  If neither the attrib nor any of its alternates
#   exists in the event, then this sub dies.
#
# Parameters:
#   %args - Arguments
#
# Required Arguments:
#   event     - Event hashref
#   attribute - Attribute name
#
# Optional Arguments:
#   alternates - Arrayref of alternate attribute names
#
# Returns:
#   Value of attribute in the event, possibly undef
sub _get_value {
   my ( $self, %args ) = @_;
   my ($event, $attrib, $alts) = @args{qw(event attribute alternates)};
   return unless $event && $attrib;

   my $value;
   if ( exists $event->{$attrib} ) {
      $value = $event->{$attrib};
   }
   elsif ( $alts ) {
      my $found_value = 0;
      foreach my $alt_attrib( @$alts ) {
         if ( exists $event->{$alt_attrib} ) {
            $value       = $event->{$alt_attrib};
            $found_value = 1;
            last;
         }
      }
      die "Event does not have attribute $attrib or any of its alternates"
         unless $found_value;
   }
   else {
      die "Event does not have attribute $attrib and there are no alternates";
   }

   return $value;
}

sub _d {
   my ($package, undef, $line) = caller 0;
   @_ = map { (my $temp = $_) =~ s/\n/\n# /g; $temp; }
        map { defined $_ ? $_ : 'undef' }
        @_;
   print STDERR "# $package:$line $PID ", join(' ', @_), "\n";
}

1;
}
# ###########################################################################
# End EventAggregator package
# ###########################################################################
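# ###########################################################################
# A minimal, end-to-end usage sketch of the EventAggregator package above.
# The event hashrefs and attribute names are illustrative only:
#
#   my $ea = new EventAggregator(
#      groupby => 'fingerprint',
#      worst   => 'Query_time',
#   );
#   foreach my $event ( @events ) {       # hashrefs parsed from a log
#      $ea->aggregate($event);
#   }
#   $ea->calculate_statistical_metrics();
#   my ($top, $other) = $ea->top_events(
#      attrib  => 'Query_time',
#      orderby => 'sum',
#      count   => 10,
#   );
#   my $metrics = $ea->metrics(
#      attrib => 'Query_time',
#      where  => $top->[0]->[0],          # first chosen groupby value
#   );
# ###########################################################################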