/*
 * Copyright (C) 2012 The Guava Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */
014
015package com.google.common.math;
016
017import static com.google.common.base.Preconditions.checkState;
018import static com.google.common.math.DoubleUtils.ensureNonNegative;
019import static com.google.common.primitives.Doubles.isFinite;
020import static java.lang.Double.NaN;
021import static java.lang.Double.isNaN;
022
023import com.google.common.annotations.Beta;
024import com.google.common.annotations.GwtIncompatible;
025import java.util.Iterator;
026
027/**
028 * A mutable object which accumulates double values and tracks some basic statistics over all the
029 * values added so far. The values may be added singly or in groups. This class is not thread safe.
030 *
031 * @author Pete Gillin
032 * @author Kevin Bourrillion
033 * @since 20.0
034 */
035@Beta
036@GwtIncompatible
037@ElementTypesAreNonnullByDefault
038public final class StatsAccumulator {
039
040  // These fields must satisfy the requirements of Stats' constructor as well as those of the stat
041  // methods of this class.
042  private long count = 0;
043  private double mean = 0.0; // any finite value will do, we only use it to multiply by zero for sum
044  private double sumOfSquaresOfDeltas = 0.0;
045  private double min = NaN; // any value will do
046  private double max = NaN; // any value will do
047
048  /** Adds the given value to the dataset. */
049  public void add(double value) {
050    if (count == 0) {
051      count = 1;
052      mean = value;
053      min = value;
054      max = value;
055      if (!isFinite(value)) {
056        sumOfSquaresOfDeltas = NaN;
057      }
058    } else {
059      count++;
060      if (isFinite(value) && isFinite(mean)) {
061        // Art of Computer Programming vol. 2, Knuth, 4.2.2, (15) and (16)
062        double delta = value - mean;
063        mean += delta / count;
064        sumOfSquaresOfDeltas += delta * (value - mean);
065      } else {
066        mean = calculateNewMeanNonFinite(mean, value);
067        sumOfSquaresOfDeltas = NaN;
068      }
069      min = Math.min(min, value);
070      max = Math.max(max, value);
071    }
072  }
073
074  /**
075   * Adds the given values to the dataset.
076   *
077   * @param values a series of values, which will be converted to {@code double} values (this may
078   *     cause loss of precision)
079   */
080  public void addAll(Iterable<? extends Number> values) {
081    for (Number value : values) {
082      add(value.doubleValue());
083    }
084  }
085
086  /**
087   * Adds the given values to the dataset.
088   *
089   * @param values a series of values, which will be converted to {@code double} values (this may
090   *     cause loss of precision)
091   */
092  public void addAll(Iterator<? extends Number> values) {
093    while (values.hasNext()) {
094      add(values.next().doubleValue());
095    }
096  }
097
098  /**
099   * Adds the given values to the dataset.
100   *
101   * @param values a series of values
102   */
103  public void addAll(double... values) {
104    for (double value : values) {
105      add(value);
106    }
107  }
108
109  /**
110   * Adds the given values to the dataset.
111   *
112   * @param values a series of values
113   */
114  public void addAll(int... values) {
115    for (int value : values) {
116      add(value);
117    }
118  }
119
120  /**
121   * Adds the given values to the dataset.
122   *
123   * @param values a series of values, which will be converted to {@code double} values (this may
124   *     cause loss of precision for longs of magnitude over 2^53 (slightly over 9e15))
125   */
126  public void addAll(long... values) {
127    for (long value : values) {
128      add(value);
129    }
130  }
131
132  /**
133   * Adds the given statistics to the dataset, as if the individual values used to compute the
134   * statistics had been added directly.
135   */
136  public void addAll(Stats values) {
137    if (values.count() == 0) {
138      return;
139    }
140    merge(values.count(), values.mean(), values.sumOfSquaresOfDeltas(), values.min(), values.max());
141  }
142
143  /**
144   * Adds the given statistics to the dataset, as if the individual values used to compute the
145   * statistics had been added directly.
146   *
147   * @since 28.2
148   */
149  public void addAll(StatsAccumulator values) {
150    if (values.count() == 0) {
151      return;
152    }
153    merge(values.count(), values.mean(), values.sumOfSquaresOfDeltas(), values.min(), values.max());
154  }
155
156  private void merge(
157      long otherCount,
158      double otherMean,
159      double otherSumOfSquaresOfDeltas,
160      double otherMin,
161      double otherMax) {
162    if (count == 0) {
163      count = otherCount;
164      mean = otherMean;
165      sumOfSquaresOfDeltas = otherSumOfSquaresOfDeltas;
166      min = otherMin;
167      max = otherMax;
168    } else {
169      count += otherCount;
170      if (isFinite(mean) && isFinite(otherMean)) {
171        // This is a generalized version of the calculation in add(double) above.
172        double delta = otherMean - mean;
173        mean += delta * otherCount / count;
174        sumOfSquaresOfDeltas += otherSumOfSquaresOfDeltas + delta * (otherMean - mean) * otherCount;
175      } else {
176        mean = calculateNewMeanNonFinite(mean, otherMean);
177        sumOfSquaresOfDeltas = NaN;
178      }
179      min = Math.min(min, otherMin);
180      max = Math.max(max, otherMax);
181    }
182  }
183
184  /** Returns an immutable snapshot of the current statistics. */
185  public Stats snapshot() {
186    return new Stats(count, mean, sumOfSquaresOfDeltas, min, max);
187  }
188
189  /** Returns the number of values. */
190  public long count() {
191    return count;
192  }
193
194  /**
195   * Returns the <a href="http://en.wikipedia.org/wiki/Arithmetic_mean">arithmetic mean</a> of the
196   * values. The count must be non-zero.
197   *
198   * <p>If these values are a sample drawn from a population, this is also an unbiased estimator of
199   * the arithmetic mean of the population.
200   *
201   * <h3>Non-finite values</h3>
202   *
203   * <p>If the dataset contains {@link Double#NaN} then the result is {@link Double#NaN}. If it
204   * contains both {@link Double#POSITIVE_INFINITY} and {@link Double#NEGATIVE_INFINITY} then the
205   * result is {@link Double#NaN}. If it contains {@link Double#POSITIVE_INFINITY} and finite values
206   * only or {@link Double#POSITIVE_INFINITY} only, the result is {@link Double#POSITIVE_INFINITY}.
207   * If it contains {@link Double#NEGATIVE_INFINITY} and finite values only or {@link
208   * Double#NEGATIVE_INFINITY} only, the result is {@link Double#NEGATIVE_INFINITY}.
209   *
210   * @throws IllegalStateException if the dataset is empty
211   */
212  public double mean() {
213    checkState(count != 0);
214    return mean;
215  }
216
217  /**
218   * Returns the sum of the values.
219   *
220   * <h3>Non-finite values</h3>
221   *
222   * <p>If the dataset contains {@link Double#NaN} then the result is {@link Double#NaN}. If it
223   * contains both {@link Double#POSITIVE_INFINITY} and {@link Double#NEGATIVE_INFINITY} then the
224   * result is {@link Double#NaN}. If it contains {@link Double#POSITIVE_INFINITY} and finite values
225   * only or {@link Double#POSITIVE_INFINITY} only, the result is {@link Double#POSITIVE_INFINITY}.
226   * If it contains {@link Double#NEGATIVE_INFINITY} and finite values only or {@link
227   * Double#NEGATIVE_INFINITY} only, the result is {@link Double#NEGATIVE_INFINITY}.
228   */
229  public final double sum() {
230    return mean * count;
231  }
232
233  /**
234   * Returns the <a href="http://en.wikipedia.org/wiki/Variance#Population_variance">population
235   * variance</a> of the values. The count must be non-zero.
236   *
237   * <p>This is guaranteed to return zero if the dataset contains only exactly one finite value. It
238   * is not guaranteed to return zero when the dataset consists of the same value multiple times,
239   * due to numerical errors. However, it is guaranteed never to return a negative result.
240   *
241   * <h3>Non-finite values</h3>
242   *
243   * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link
244   * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
245   *
246   * @throws IllegalStateException if the dataset is empty
247   */
248  public final double populationVariance() {
249    checkState(count != 0);
250    if (isNaN(sumOfSquaresOfDeltas)) {
251      return NaN;
252    }
253    if (count == 1) {
254      return 0.0;
255    }
256    return ensureNonNegative(sumOfSquaresOfDeltas) / count;
257  }
258
259  /**
260   * Returns the <a
261   * href="http://en.wikipedia.org/wiki/Standard_deviation#Definition_of_population_values">
262   * population standard deviation</a> of the values. The count must be non-zero.
263   *
264   * <p>This is guaranteed to return zero if the dataset contains only exactly one finite value. It
265   * is not guaranteed to return zero when the dataset consists of the same value multiple times,
266   * due to numerical errors. However, it is guaranteed never to return a negative result.
267   *
268   * <h3>Non-finite values</h3>
269   *
270   * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link
271   * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
272   *
273   * @throws IllegalStateException if the dataset is empty
274   */
275  public final double populationStandardDeviation() {
276    return Math.sqrt(populationVariance());
277  }
278
279  /**
280   * Returns the <a href="http://en.wikipedia.org/wiki/Variance#Sample_variance">unbiased sample
281   * variance</a> of the values. If this dataset is a sample drawn from a population, this is an
282   * unbiased estimator of the population variance of the population. The count must be greater than
283   * one.
284   *
285   * <p>This is not guaranteed to return zero when the dataset consists of the same value multiple
286   * times, due to numerical errors. However, it is guaranteed never to return a negative result.
287   *
288   * <h3>Non-finite values</h3>
289   *
290   * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link
291   * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
292   *
293   * @throws IllegalStateException if the dataset is empty or contains a single value
294   */
295  public final double sampleVariance() {
296    checkState(count > 1);
297    if (isNaN(sumOfSquaresOfDeltas)) {
298      return NaN;
299    }
300    return ensureNonNegative(sumOfSquaresOfDeltas) / (count - 1);
301  }
302
303  /**
304   * Returns the <a
305   * href="http://en.wikipedia.org/wiki/Standard_deviation#Corrected_sample_standard_deviation">
306   * corrected sample standard deviation</a> of the values. If this dataset is a sample drawn from a
307   * population, this is an estimator of the population standard deviation of the population which
308   * is less biased than {@link #populationStandardDeviation()} (the unbiased estimator depends on
309   * the distribution). The count must be greater than one.
310   *
311   * <p>This is not guaranteed to return zero when the dataset consists of the same value multiple
312   * times, due to numerical errors. However, it is guaranteed never to return a negative result.
313   *
314   * <h3>Non-finite values</h3>
315   *
316   * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link
317   * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
318   *
319   * @throws IllegalStateException if the dataset is empty or contains a single value
320   */
321  public final double sampleStandardDeviation() {
322    return Math.sqrt(sampleVariance());
323  }
324
325  /**
326   * Returns the lowest value in the dataset. The count must be non-zero.
327   *
328   * <h3>Non-finite values</h3>
329   *
330   * <p>If the dataset contains {@link Double#NaN} then the result is {@link Double#NaN}. If it
331   * contains {@link Double#NEGATIVE_INFINITY} and not {@link Double#NaN} then the result is {@link
332   * Double#NEGATIVE_INFINITY}. If it contains {@link Double#POSITIVE_INFINITY} and finite values
333   * only then the result is the lowest finite value. If it contains {@link
334   * Double#POSITIVE_INFINITY} only then the result is {@link Double#POSITIVE_INFINITY}.
335   *
336   * @throws IllegalStateException if the dataset is empty
337   */
338  public double min() {
339    checkState(count != 0);
340    return min;
341  }
342
343  /**
344   * Returns the highest value in the dataset. The count must be non-zero.
345   *
346   * <h3>Non-finite values</h3>
347   *
348   * <p>If the dataset contains {@link Double#NaN} then the result is {@link Double#NaN}. If it
349   * contains {@link Double#POSITIVE_INFINITY} and not {@link Double#NaN} then the result is {@link
350   * Double#POSITIVE_INFINITY}. If it contains {@link Double#NEGATIVE_INFINITY} and finite values
351   * only then the result is the highest finite value. If it contains {@link
352   * Double#NEGATIVE_INFINITY} only then the result is {@link Double#NEGATIVE_INFINITY}.
353   *
354   * @throws IllegalStateException if the dataset is empty
355   */
356  public double max() {
357    checkState(count != 0);
358    return max;
359  }
360
361  double sumOfSquaresOfDeltas() {
362    return sumOfSquaresOfDeltas;
363  }
364
365  /**
366   * Calculates the new value for the accumulated mean when a value is added, in the case where at
367   * least one of the previous mean and the value is non-finite.
368   */
369  static double calculateNewMeanNonFinite(double previousMean, double value) {
370    /*
371     * Desired behaviour is to match the results of applying the naive mean formula. In particular,
372     * the update formula can subtract infinities in cases where the naive formula would add them.
373     *
374     * Consequently:
375     * 1. If the previous mean is finite and the new value is non-finite then the new mean is that
376     *    value (whether it is NaN or infinity).
377     * 2. If the new value is finite and the previous mean is non-finite then the mean is unchanged
378     *    (whether it is NaN or infinity).
379     * 3. If both the previous mean and the new value are non-finite and...
380     * 3a. ...either or both is NaN (so mean != value) then the new mean is NaN.
381     * 3b. ...they are both the same infinities (so mean == value) then the mean is unchanged.
382     * 3c. ...they are different infinities (so mean != value) then the new mean is NaN.
383     */
384    if (isFinite(previousMean)) {
385      // This is case 1.
386      return value;
387    } else if (isFinite(value) || previousMean == value) {
388      // This is case 2. or 3b.
389      return previousMean;
390    } else {
391      // This is case 3a. or 3c.
392      return NaN;
393    }
394  }
395}