001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.math3.stat.inference; 019 020import java.math.BigDecimal; 021import java.util.Arrays; 022import java.util.Iterator; 023 024import org.apache.commons.math3.distribution.RealDistribution; 025import org.apache.commons.math3.exception.InsufficientDataException; 026import org.apache.commons.math3.exception.MathArithmeticException; 027import org.apache.commons.math3.exception.NullArgumentException; 028import org.apache.commons.math3.exception.NumberIsTooLargeException; 029import org.apache.commons.math3.exception.OutOfRangeException; 030import org.apache.commons.math3.exception.TooManyIterationsException; 031import org.apache.commons.math3.exception.util.LocalizedFormats; 032import org.apache.commons.math3.fraction.BigFraction; 033import org.apache.commons.math3.fraction.BigFractionField; 034import org.apache.commons.math3.fraction.FractionConversionException; 035import org.apache.commons.math3.linear.Array2DRowFieldMatrix; 036import org.apache.commons.math3.linear.FieldMatrix; 037import org.apache.commons.math3.linear.MatrixUtils; 038import org.apache.commons.math3.linear.RealMatrix; 039import org.apache.commons.math3.random.RandomGenerator; 040import org.apache.commons.math3.random.Well19937c; 041import org.apache.commons.math3.util.CombinatoricsUtils; 042import org.apache.commons.math3.util.FastMath; 043import org.apache.commons.math3.util.MathArrays; 044 045import static org.apache.commons.math3.util.MathUtils.PI_SQUARED; 046import static org.apache.commons.math3.util.FastMath.PI; 047 048/** 049 * Implementation of the <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> 050 * Kolmogorov-Smirnov (K-S) test</a> for equality of continuous distributions. 051 * <p> 052 * The K-S test uses a statistic based on the maximum deviation of the empirical distribution of 053 * sample data points from the distribution expected under the null hypothesis. For one-sample tests 054 * evaluating the null hypothesis that a set of sample data points follow a given distribution, the 055 * test statistic is \(D_n=\sup_x |F_n(x)-F(x)|\), where \(F\) is the expected distribution and 056 * \(F_n\) is the empirical distribution of the \(n\) sample data points. The distribution of 057 * \(D_n\) is estimated using a method based on [1] with certain quick decisions for extreme values 058 * given in [2]. 059 * </p> 060 * <p> 061 * Two-sample tests are also supported, evaluating the null hypothesis that the two samples 062 * {@code x} and {@code y} come from the same underlying distribution. In this case, the test 063 * statistic is \(D_{n,m}=\sup_t | F_n(t)-F_m(t)|\) where \(n\) is the length of {@code x}, \(m\) is 064 * the length of {@code y}, \(F_n\) is the empirical distribution that puts mass \(1/n\) at each of 065 * the values in {@code x} and \(F_m\) is the empirical distribution of the {@code y} values. The 066 * default 2-sample test method, {@link #kolmogorovSmirnovTest(double[], double[])} works as 067 * follows: 068 * <ul> 069 * <li>For very small samples (where the product of the sample sizes is less than 070 * {@value #SMALL_SAMPLE_PRODUCT}), the exact distribution is used to compute the p-value for the 071 * 2-sample test.</li> 072 * <li>For mid-size samples (product of sample sizes greater than or equal to 073 * {@value #SMALL_SAMPLE_PRODUCT} but less than {@value #LARGE_SAMPLE_PRODUCT}), Monte Carlo 074 * simulation is used to compute the p-value. The simulation randomly generates partitions of \(m + 075 * n\) into an \(m\)-set and an \(n\)-set and reports the proportion that give \(D\) values 076 * exceeding the observed value.</li> 077 * <li>When the product of the sample sizes exceeds {@value #LARGE_SAMPLE_PRODUCT}, the asymptotic 078 * distribution of \(D_{n,m}\) is used. See {@link #approximateP(double, int, int)} for details on 079 * the approximation.</li> 080 * </ul> 081 * </p> 082 * <p> 083 * In the two-sample case, \(D_{n,m}\) has a discrete distribution. This makes the p-value 084 * associated with the null hypothesis \(H_0 : D_{n,m} \ge d \) differ from \(H_0 : D_{n,m} > d \) 085 * by the mass of the observed value \(d\). To distinguish these, the two-sample tests use a boolean 086 * {@code strict} parameter. This parameter is ignored for large samples. 087 * </p> 088 * <p> 089 * The methods used by the 2-sample default implementation are also exposed directly: 090 * <ul> 091 * <li>{@link #exactP(double, int, int, boolean)} computes exact 2-sample p-values</li> 092 * <li>{@link #monteCarloP(double, int, int, boolean, int)} computes 2-sample p-values by Monte 093 * Carlo simulation</li> 094 * <li>{@link #approximateP(double, int, int)} uses the asymptotic distribution The {@code boolean} 095 * arguments in the first two methods allow the probability used to estimate the p-value to be 096 * expressed using strict or non-strict inequality. See 097 * {@link #kolmogorovSmirnovTest(double[], double[], boolean)}.</li> 098 * </ul> 099 * </p> 100 * <p> 101 * References: 102 * <ul> 103 * <li>[1] <a href="http://www.jstatsoft.org/v08/i18/"> Evaluating Kolmogorov's Distribution</a> by 104 * George Marsaglia, Wai Wan Tsang, and Jingbo Wang</li> 105 * <li>[2] <a href="http://www.jstatsoft.org/v39/i11/"> Computing the Two-Sided Kolmogorov-Smirnov 106 * Distribution</a> by Richard Simard and Pierre L'Ecuyer</li> 107 * </ul> 108 * <br/> 109 * Note that [1] contains an error in computing h, refer to <a 110 * href="https://issues.apache.org/jira/browse/MATH-437">MATH-437</a> for details. 111 * </p> 112 * 113 * @since 3.3 114 */ 115public class KolmogorovSmirnovTest { 116 117 /** 118 * Bound on the number of partial sums in {@link #ksSum(double, double, int)} 119 */ 120 protected static final int MAXIMUM_PARTIAL_SUM_COUNT = 100000; 121 122 /** Convergence criterion for {@link #ksSum(double, double, int)} */ 123 protected static final double KS_SUM_CAUCHY_CRITERION = 1E-20; 124 125 /** Convergence criterion for the sums in #pelzGood(double, double, int)} */ 126 protected static final double PG_SUM_RELATIVE_ERROR = 1.0e-10; 127 128 /** When product of sample sizes is less than this value, 2-sample K-S test is exact */ 129 protected static final int SMALL_SAMPLE_PRODUCT = 200; 130 131 /** 132 * When product of sample sizes exceeds this value, 2-sample K-S test uses asymptotic 133 * distribution for strict inequality p-value. 134 */ 135 protected static final int LARGE_SAMPLE_PRODUCT = 10000; 136 137 /** Default number of iterations used by {@link #monteCarloP(double, int, int, boolean, int)} */ 138 protected static final int MONTE_CARLO_ITERATIONS = 1000000; 139 140 /** Random data generator used by {@link #monteCarloP(double, int, int, boolean, int)} */ 141 private final RandomGenerator rng; 142 143 /** 144 * Construct a KolmogorovSmirnovTest instance with a default random data generator. 145 */ 146 public KolmogorovSmirnovTest() { 147 rng = new Well19937c(); 148 } 149 150 /** 151 * Construct a KolmogorovSmirnovTest with the provided random data generator. 152 * 153 * @param rng random data generator used by {@link #monteCarloP(double, int, int, boolean, int)} 154 */ 155 public KolmogorovSmirnovTest(RandomGenerator rng) { 156 this.rng = rng; 157 } 158 159 /** 160 * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a 161 * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a> 162 * evaluating the null hypothesis that {@code data} conforms to {@code distribution}. If 163 * {@code exact} is true, the distribution used to compute the p-value is computed using 164 * extended precision. See {@link #cdfExact(double, int)}. 165 * 166 * @param distribution reference distribution 167 * @param data sample being being evaluated 168 * @param exact whether or not to force exact computation of the p-value 169 * @return the p-value associated with the null hypothesis that {@code data} is a sample from 170 * {@code distribution} 171 * @throws InsufficientDataException if {@code data} does not have length at least 2 172 * @throws NullArgumentException if {@code data} is null 173 */ 174 public double kolmogorovSmirnovTest(RealDistribution distribution, double[] data, boolean exact) { 175 return 1d - cdf(kolmogorovSmirnovStatistic(distribution, data), data.length, exact); 176 } 177 178 /** 179 * Computes the one-sample Kolmogorov-Smirnov test statistic, \(D_n=\sup_x |F_n(x)-F(x)|\) where 180 * \(F\) is the distribution (cdf) function associated with {@code distribution}, \(n\) is the 181 * length of {@code data} and \(F_n\) is the empirical distribution that puts mass \(1/n\) at 182 * each of the values in {@code data}. 183 * 184 * @param distribution reference distribution 185 * @param data sample being evaluated 186 * @return Kolmogorov-Smirnov statistic \(D_n\) 187 * @throws InsufficientDataException if {@code data} does not have length at least 2 188 * @throws NullArgumentException if {@code data} is null 189 */ 190 public double kolmogorovSmirnovStatistic(RealDistribution distribution, double[] data) { 191 checkArray(data); 192 final int n = data.length; 193 final double nd = n; 194 final double[] dataCopy = new double[n]; 195 System.arraycopy(data, 0, dataCopy, 0, n); 196 Arrays.sort(dataCopy); 197 double d = 0d; 198 for (int i = 1; i <= n; i++) { 199 final double yi = distribution.cumulativeProbability(dataCopy[i - 1]); 200 final double currD = FastMath.max(yi - (i - 1) / nd, i / nd - yi); 201 if (currD > d) { 202 d = currD; 203 } 204 } 205 return d; 206 } 207 208 /** 209 * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a two-sample <a 210 * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a> 211 * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same 212 * probability distribution. Specifically, what is returned is an estimate of the probability 213 * that the {@link #kolmogorovSmirnovStatistic(double[], double[])} associated with a randomly 214 * selected partition of the combined sample into subsamples of sizes {@code x.length} and 215 * {@code y.length} will strictly exceed (if {@code strict} is {@code true}) or be at least as 216 * large as {@code strict = false}) as {@code kolmogorovSmirnovStatistic(x, y)}. 217 * <ul> 218 * <li>For very small samples (where the product of the sample sizes is less than 219 * {@value #SMALL_SAMPLE_PRODUCT}), the exact distribution is used to compute the p-value. This 220 * is accomplished by enumerating all partitions of the combined sample into two subsamples of 221 * the respective sample sizes, computing \(D_{n,m}\) for each partition and returning the 222 * proportion of partitions that give \(D\) values exceeding the observed value.</li> 223 * <li>For mid-size samples (product of sample sizes greater than or equal to 224 * {@value #SMALL_SAMPLE_PRODUCT} but less than {@value #LARGE_SAMPLE_PRODUCT}), Monte Carlo 225 * simulation is used to compute the p-value. The simulation randomly generates partitions and 226 * reports the proportion that give \(D\) values exceeding the observed value.</li> 227 * <li>When the product of the sample sizes exceeds {@value #LARGE_SAMPLE_PRODUCT}, the 228 * asymptotic distribution of \(D_{n,m}\) is used. See {@link #approximateP(double, int, int)} 229 * for details on the approximation.</li> 230 * </ul> 231 * 232 * @param x first sample dataset 233 * @param y second sample dataset 234 * @param strict whether or not the probability to compute is expressed as a strict inequality 235 * (ignored for large samples) 236 * @return p-value associated with the null hypothesis that {@code x} and {@code y} represent 237 * samples from the same distribution 238 * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at 239 * least 2 240 * @throws NullArgumentException if either {@code x} or {@code y} is null 241 */ 242 public double kolmogorovSmirnovTest(double[] x, double[] y, boolean strict) { 243 final long lengthProduct = (long) x.length * y.length; 244 if (lengthProduct < SMALL_SAMPLE_PRODUCT) { 245 return exactP(kolmogorovSmirnovStatistic(x, y), x.length, y.length, strict); 246 } 247 if (lengthProduct < LARGE_SAMPLE_PRODUCT) { 248 return monteCarloP(kolmogorovSmirnovStatistic(x, y), x.length, y.length, strict, MONTE_CARLO_ITERATIONS); 249 } 250 return approximateP(kolmogorovSmirnovStatistic(x, y), x.length, y.length); 251 } 252 253 /** 254 * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a two-sample <a 255 * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a> 256 * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same 257 * probability distribution. Assumes the strict form of the inequality used to compute the 258 * p-value. See {@link #kolmogorovSmirnovTest(RealDistribution, double[], boolean)}. 259 * 260 * @param x first sample dataset 261 * @param y second sample dataset 262 * @return p-value associated with the null hypothesis that {@code x} and {@code y} represent 263 * samples from the same distribution 264 * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at 265 * least 2 266 * @throws NullArgumentException if either {@code x} or {@code y} is null 267 */ 268 public double kolmogorovSmirnovTest(double[] x, double[] y) { 269 return kolmogorovSmirnovTest(x, y, true); 270 } 271 272 /** 273 * Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\) 274 * where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the 275 * empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\) 276 * is the empirical distribution of the {@code y} values. 277 * 278 * @param x first sample 279 * @param y second sample 280 * @return test statistic \(D_{n,m}\) used to evaluate the null hypothesis that {@code x} and 281 * {@code y} represent samples from the same underlying distribution 282 * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at 283 * least 2 284 * @throws NullArgumentException if either {@code x} or {@code y} is null 285 */ 286 public double kolmogorovSmirnovStatistic(double[] x, double[] y) { 287 checkArray(x); 288 checkArray(y); 289 // Copy and sort the sample arrays 290 final double[] sx = MathArrays.copyOf(x); 291 final double[] sy = MathArrays.copyOf(y); 292 Arrays.sort(sx); 293 Arrays.sort(sy); 294 final int n = sx.length; 295 final int m = sy.length; 296 297 // Find the max difference between cdf_x and cdf_y 298 double supD = 0d; 299 // First walk x points 300 for (int i = 0; i < n; i++) { 301 final double cdf_x = (i + 1d) / n; 302 final int yIndex = Arrays.binarySearch(sy, sx[i]); 303 final double cdf_y = yIndex >= 0 ? (yIndex + 1d) / m : (-yIndex - 1d) / m; 304 final double curD = FastMath.abs(cdf_x - cdf_y); 305 if (curD > supD) { 306 supD = curD; 307 } 308 } 309 // Now look at y 310 for (int i = 0; i < m; i++) { 311 final double cdf_y = (i + 1d) / m; 312 final int xIndex = Arrays.binarySearch(sx, sy[i]); 313 final double cdf_x = xIndex >= 0 ? (xIndex + 1d) / n : (-xIndex - 1d) / n; 314 final double curD = FastMath.abs(cdf_x - cdf_y); 315 if (curD > supD) { 316 supD = curD; 317 } 318 } 319 return supD; 320 } 321 322 /** 323 * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a 324 * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a> 325 * evaluating the null hypothesis that {@code data} conforms to {@code distribution}. 326 * 327 * @param distribution reference distribution 328 * @param data sample being being evaluated 329 * @return the p-value associated with the null hypothesis that {@code data} is a sample from 330 * {@code distribution} 331 * @throws InsufficientDataException if {@code data} does not have length at least 2 332 * @throws NullArgumentException if {@code data} is null 333 */ 334 public double kolmogorovSmirnovTest(RealDistribution distribution, double[] data) { 335 return kolmogorovSmirnovTest(distribution, data, false); 336 } 337 338 /** 339 * Performs a <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov 340 * test</a> evaluating the null hypothesis that {@code data} conforms to {@code distribution}. 341 * 342 * @param distribution reference distribution 343 * @param data sample being being evaluated 344 * @param alpha significance level of the test 345 * @return true iff the null hypothesis that {@code data} is a sample from {@code distribution} 346 * can be rejected with confidence 1 - {@code alpha} 347 * @throws InsufficientDataException if {@code data} does not have length at least 2 348 * @throws NullArgumentException if {@code data} is null 349 */ 350 public boolean kolmogorovSmirnovTest(RealDistribution distribution, double[] data, double alpha) { 351 if ((alpha <= 0) || (alpha > 0.5)) { 352 throw new OutOfRangeException(LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL, alpha, 0, 0.5); 353 } 354 return kolmogorovSmirnovTest(distribution, data) < alpha; 355 } 356 357 /** 358 * Calculates \(P(D_n < d)\) using the method described in [1] with quick decisions for extreme 359 * values given in [2] (see above). The result is not exact as with 360 * {@link #cdfExact(double, int)} because calculations are based on 361 * {@code double} rather than {@link org.apache.commons.math3.fraction.BigFraction}. 362 * 363 * @param d statistic 364 * @param n sample size 365 * @return \(P(D_n < d)\) 366 * @throws MathArithmeticException if algorithm fails to convert {@code h} to a 367 * {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k 368 * - h) / m\) for integer {@code k, m} and \(0 \le h < 1\) 369 */ 370 public double cdf(double d, int n) 371 throws MathArithmeticException { 372 return cdf(d, n, false); 373 } 374 375 /** 376 * Calculates {@code P(D_n < d)}. The result is exact in the sense that BigFraction/BigReal is 377 * used everywhere at the expense of very slow execution time. Almost never choose this in real 378 * applications unless you are very sure; this is almost solely for verification purposes. 379 * Normally, you would choose {@link #cdf(double, int)}. See the class 380 * javadoc for definitions and algorithm description. 381 * 382 * @param d statistic 383 * @param n sample size 384 * @return \(P(D_n < d)\) 385 * @throws MathArithmeticException if the algorithm fails to convert {@code h} to a 386 * {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k 387 * - h) / m\) for integer {@code k, m} and \(0 \le h < 1\) 388 */ 389 public double cdfExact(double d, int n) 390 throws MathArithmeticException { 391 return cdf(d, n, true); 392 } 393 394 /** 395 * Calculates {@code P(D_n < d)} using method described in [1] with quick decisions for extreme 396 * values given in [2] (see above). 397 * 398 * @param d statistic 399 * @param n sample size 400 * @param exact whether the probability should be calculated exact using 401 * {@link org.apache.commons.math3.fraction.BigFraction} everywhere at the expense of 402 * very slow execution time, or if {@code double} should be used convenient places to 403 * gain speed. Almost never choose {@code true} in real applications unless you are very 404 * sure; {@code true} is almost solely for verification purposes. 405 * @return \(P(D_n < d)\) 406 * @throws MathArithmeticException if algorithm fails to convert {@code h} to a 407 * {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k 408 * - h) / m\) for integer {@code k, m} and \(0 \le h < 1\). 409 */ 410 public double cdf(double d, int n, boolean exact) 411 throws MathArithmeticException { 412 413 final double ninv = 1 / ((double) n); 414 final double ninvhalf = 0.5 * ninv; 415 416 if (d <= ninvhalf) { 417 return 0; 418 } else if (ninvhalf < d && d <= ninv) { 419 double res = 1; 420 final double f = 2 * d - ninv; 421 // n! f^n = n*f * (n-1)*f * ... * 1*x 422 for (int i = 1; i <= n; ++i) { 423 res *= i * f; 424 } 425 return res; 426 } else if (1 - ninv <= d && d < 1) { 427 return 1 - 2 * Math.pow(1 - d, n); 428 } else if (1 <= d) { 429 return 1; 430 } 431 if (exact) { 432 return exactK(d,n); 433 } 434 if (n <= 140) { 435 return roundedK(d, n); 436 } 437 return pelzGood(d, n); 438 } 439 440 /** 441 * Calculates the exact value of {@code P(D_n < d)} using the method described in [1] (reference 442 * in class javadoc above) and {@link org.apache.commons.math3.fraction.BigFraction} (see 443 * above). 444 * 445 * @param d statistic 446 * @param n sample size 447 * @return the two-sided probability of \(P(D_n < d)\) 448 * @throws MathArithmeticException if algorithm fails to convert {@code h} to a 449 * {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k 450 * - h) / m\) for integer {@code k, m} and \(0 \le h < 1\). 451 */ 452 private double exactK(double d, int n) 453 throws MathArithmeticException { 454 455 final int k = (int) Math.ceil(n * d); 456 457 final FieldMatrix<BigFraction> H = this.createExactH(d, n); 458 final FieldMatrix<BigFraction> Hpower = H.power(n); 459 460 BigFraction pFrac = Hpower.getEntry(k - 1, k - 1); 461 462 for (int i = 1; i <= n; ++i) { 463 pFrac = pFrac.multiply(i).divide(n); 464 } 465 466 /* 467 * BigFraction.doubleValue converts numerator to double and the denominator to double and 468 * divides afterwards. That gives NaN quite easy. This does not (scale is the number of 469 * digits): 470 */ 471 return pFrac.bigDecimalValue(20, BigDecimal.ROUND_HALF_UP).doubleValue(); 472 } 473 474 /** 475 * Calculates {@code P(D_n < d)} using method described in [1] and doubles (see above). 476 * 477 * @param d statistic 478 * @param n sample size 479 * @return \(P(D_n < d)\) 480 */ 481 private double roundedK(double d, int n) { 482 483 final int k = (int) Math.ceil(n * d); 484 final RealMatrix H = this.createRoundedH(d, n); 485 final RealMatrix Hpower = H.power(n); 486 487 double pFrac = Hpower.getEntry(k - 1, k - 1); 488 for (int i = 1; i <= n; ++i) { 489 pFrac *= (double) i / (double) n; 490 } 491 492 return pFrac; 493 } 494 495 /** 496 * Computes the Pelz-Good approximation for \(P(D_n < d)\) as described in [2] in the class javadoc. 497 * 498 * @param d value of d-statistic (x in [2]) 499 * @param n sample size 500 * @return \(P(D_n < d)\) 501 * @since 3.4 502 */ 503 public double pelzGood(double d, int n) { 504 505 // Change the variable since approximation is for the distribution evaluated at d / sqrt(n) 506 final double sqrtN = FastMath.sqrt(n); 507 final double z = d * sqrtN; 508 final double z2 = d * d * n; 509 final double z4 = z2 * z2; 510 final double z6 = z4 * z2; 511 final double z8 = z4 * z4; 512 513 // Eventual return value 514 double ret = 0; 515 516 // Compute K_0(z) 517 double sum = 0; 518 double increment = 0; 519 double kTerm = 0; 520 double z2Term = PI_SQUARED / (8 * z2); 521 int k = 1; 522 for (; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) { 523 kTerm = 2 * k - 1; 524 increment = FastMath.exp(-z2Term * kTerm * kTerm); 525 sum += increment; 526 if (increment <= PG_SUM_RELATIVE_ERROR * sum) { 527 break; 528 } 529 } 530 if (k == MAXIMUM_PARTIAL_SUM_COUNT) { 531 throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT); 532 } 533 ret = sum * FastMath.sqrt(2 * FastMath.PI) / z; 534 535 // K_1(z) 536 // Sum is -inf to inf, but k term is always (k + 1/2) ^ 2, so really have 537 // twice the sum from k = 0 to inf (k = -1 is same as 0, -2 same as 1, ...) 538 final double twoZ2 = 2 * z2; 539 sum = 0; 540 kTerm = 0; 541 double kTerm2 = 0; 542 for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) { 543 kTerm = k + 0.5; 544 kTerm2 = kTerm * kTerm; 545 increment = (PI_SQUARED * kTerm2 - z2) * FastMath.exp(-PI_SQUARED * kTerm2 / twoZ2); 546 sum += increment; 547 if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum)) { 548 break; 549 } 550 } 551 if (k == MAXIMUM_PARTIAL_SUM_COUNT) { 552 throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT); 553 } 554 final double sqrtHalfPi = FastMath.sqrt(PI / 2); 555 // Instead of doubling sum, divide by 3 instead of 6 556 ret += sum * sqrtHalfPi / (3 * z4 * sqrtN); 557 558 // K_2(z) 559 // Same drill as K_1, but with two doubly infinite sums, all k terms are even powers. 560 final double z4Term = 2 * z4; 561 final double z6Term = 6 * z6; 562 z2Term = 5 * z2; 563 final double pi4 = PI_SQUARED * PI_SQUARED; 564 sum = 0; 565 kTerm = 0; 566 kTerm2 = 0; 567 for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) { 568 kTerm = k + 0.5; 569 kTerm2 = kTerm * kTerm; 570 increment = (z6Term + z4Term + PI_SQUARED * (z4Term - z2Term) * kTerm2 + 571 pi4 * (1 - twoZ2) * kTerm2 * kTerm2) * FastMath.exp(-PI_SQUARED * kTerm2 / twoZ2); 572 sum += increment; 573 if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum)) { 574 break; 575 } 576 } 577 if (k == MAXIMUM_PARTIAL_SUM_COUNT) { 578 throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT); 579 } 580 double sum2 = 0; 581 kTerm2 = 0; 582 for (k = 1; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) { 583 kTerm2 = k * k; 584 increment = PI_SQUARED * kTerm2 * FastMath.exp(-PI_SQUARED * kTerm2 / twoZ2); 585 sum2 += increment; 586 if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum2)) { 587 break; 588 } 589 } 590 if (k == MAXIMUM_PARTIAL_SUM_COUNT) { 591 throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT); 592 } 593 // Again, adjust coefficients instead of doubling sum, sum2 594 ret += (sqrtHalfPi / n) * (sum / (36 * z2 * z2 * z2 * z) - sum2 / (18 * z2 * z)); 595 596 // K_3(z) One more time with feeling - two doubly infinite sums, all k powers even. 597 // Multiply coefficient denominators by 2, so omit doubling sums. 598 final double pi6 = pi4 * PI_SQUARED; 599 sum = 0; 600 double kTerm4 = 0; 601 double kTerm6 = 0; 602 for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) { 603 kTerm = k + 0.5; 604 kTerm2 = kTerm * kTerm; 605 kTerm4 = kTerm2 * kTerm2; 606 kTerm6 = kTerm4 * kTerm2; 607 increment = (pi6 * kTerm6 * (5 - 30 * z2) + pi4 * kTerm4 * (-60 * z2 + 212 * z4) + 608 PI_SQUARED * kTerm2 * (135 * z4 - 96 * z6) - 30 * z6 - 90 * z8) * 609 FastMath.exp(-PI_SQUARED * kTerm2 / twoZ2); 610 sum += increment; 611 if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum)) { 612 break; 613 } 614 } 615 if (k == MAXIMUM_PARTIAL_SUM_COUNT) { 616 throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT); 617 } 618 sum2 = 0; 619 for (k = 1; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) { 620 kTerm2 = k * k; 621 kTerm4 = kTerm2 * kTerm2; 622 increment = (-pi4 * kTerm4 + 3 * PI_SQUARED * kTerm2 * z2) * 623 FastMath.exp(-PI_SQUARED * kTerm2 / twoZ2); 624 sum2 += increment; 625 if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum2)) { 626 break; 627 } 628 } 629 if (k == MAXIMUM_PARTIAL_SUM_COUNT) { 630 throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT); 631 } 632 return ret + (sqrtHalfPi / (sqrtN * n)) * (sum / (3240 * z6 * z4) + 633 + sum2 / (108 * z6)); 634 635 } 636 637 /*** 638 * Creates {@code H} of size {@code m x m} as described in [1] (see above). 639 * 640 * @param d statistic 641 * @param n sample size 642 * @return H matrix 643 * @throws NumberIsTooLargeException if fractional part is greater than 1 644 * @throws FractionConversionException if algorithm fails to convert {@code h} to a 645 * {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k 646 * - h) / m\) for integer {@code k, m} and \(0 <= h < 1\). 647 */ 648 private FieldMatrix<BigFraction> createExactH(double d, int n) 649 throws NumberIsTooLargeException, FractionConversionException { 650 651 final int k = (int) Math.ceil(n * d); 652 final int m = 2 * k - 1; 653 final double hDouble = k - n * d; 654 if (hDouble >= 1) { 655 throw new NumberIsTooLargeException(hDouble, 1.0, false); 656 } 657 BigFraction h = null; 658 try { 659 h = new BigFraction(hDouble, 1.0e-20, 10000); 660 } catch (final FractionConversionException e1) { 661 try { 662 h = new BigFraction(hDouble, 1.0e-10, 10000); 663 } catch (final FractionConversionException e2) { 664 h = new BigFraction(hDouble, 1.0e-5, 10000); 665 } 666 } 667 final BigFraction[][] Hdata = new BigFraction[m][m]; 668 669 /* 670 * Start by filling everything with either 0 or 1. 671 */ 672 for (int i = 0; i < m; ++i) { 673 for (int j = 0; j < m; ++j) { 674 if (i - j + 1 < 0) { 675 Hdata[i][j] = BigFraction.ZERO; 676 } else { 677 Hdata[i][j] = BigFraction.ONE; 678 } 679 } 680 } 681 682 /* 683 * Setting up power-array to avoid calculating the same value twice: hPowers[0] = h^1 ... 684 * hPowers[m-1] = h^m 685 */ 686 final BigFraction[] hPowers = new BigFraction[m]; 687 hPowers[0] = h; 688 for (int i = 1; i < m; ++i) { 689 hPowers[i] = h.multiply(hPowers[i - 1]); 690 } 691 692 /* 693 * First column and last row has special values (each other reversed). 694 */ 695 for (int i = 0; i < m; ++i) { 696 Hdata[i][0] = Hdata[i][0].subtract(hPowers[i]); 697 Hdata[m - 1][i] = Hdata[m - 1][i].subtract(hPowers[m - i - 1]); 698 } 699 700 /* 701 * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix should be (1 - 2*h^m + 702 * (2h - 1)^m )/m!" Since 0 <= h < 1, then if h > 1/2 is sufficient to check: 703 */ 704 if (h.compareTo(BigFraction.ONE_HALF) == 1) { 705 Hdata[m - 1][0] = Hdata[m - 1][0].add(h.multiply(2).subtract(1).pow(m)); 706 } 707 708 /* 709 * Aside from the first column and last row, the (i, j)-th element is 1/(i - j + 1)! if i - 710 * j + 1 >= 0, else 0. 1's and 0's are already put, so only division with (i - j + 1)! is 711 * needed in the elements that have 1's. There is no need to calculate (i - j + 1)! and then 712 * divide - small steps avoid overflows. Note that i - j + 1 > 0 <=> i + 1 > j instead of 713 * j'ing all the way to m. Also note that it is started at g = 2 because dividing by 1 isn't 714 * really necessary. 715 */ 716 for (int i = 0; i < m; ++i) { 717 for (int j = 0; j < i + 1; ++j) { 718 if (i - j + 1 > 0) { 719 for (int g = 2; g <= i - j + 1; ++g) { 720 Hdata[i][j] = Hdata[i][j].divide(g); 721 } 722 } 723 } 724 } 725 return new Array2DRowFieldMatrix<BigFraction>(BigFractionField.getInstance(), Hdata); 726 } 727 728 /*** 729 * Creates {@code H} of size {@code m x m} as described in [1] (see above) 730 * using double-precision. 731 * 732 * @param d statistic 733 * @param n sample size 734 * @return H matrix 735 * @throws NumberIsTooLargeException if fractional part is greater than 1 736 */ 737 private RealMatrix createRoundedH(double d, int n) 738 throws NumberIsTooLargeException { 739 740 final int k = (int) Math.ceil(n * d); 741 final int m = 2 * k - 1; 742 final double h = k - n * d; 743 if (h >= 1) { 744 throw new NumberIsTooLargeException(h, 1.0, false); 745 } 746 final double[][] Hdata = new double[m][m]; 747 748 /* 749 * Start by filling everything with either 0 or 1. 750 */ 751 for (int i = 0; i < m; ++i) { 752 for (int j = 0; j < m; ++j) { 753 if (i - j + 1 < 0) { 754 Hdata[i][j] = 0; 755 } else { 756 Hdata[i][j] = 1; 757 } 758 } 759 } 760 761 /* 762 * Setting up power-array to avoid calculating the same value twice: hPowers[0] = h^1 ... 763 * hPowers[m-1] = h^m 764 */ 765 final double[] hPowers = new double[m]; 766 hPowers[0] = h; 767 for (int i = 1; i < m; ++i) { 768 hPowers[i] = h * hPowers[i - 1]; 769 } 770 771 /* 772 * First column and last row has special values (each other reversed). 773 */ 774 for (int i = 0; i < m; ++i) { 775 Hdata[i][0] = Hdata[i][0] - hPowers[i]; 776 Hdata[m - 1][i] -= hPowers[m - i - 1]; 777 } 778 779 /* 780 * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix should be (1 - 2*h^m + 781 * (2h - 1)^m )/m!" Since 0 <= h < 1, then if h > 1/2 is sufficient to check: 782 */ 783 if (Double.compare(h, 0.5) > 0) { 784 Hdata[m - 1][0] += FastMath.pow(2 * h - 1, m); 785 } 786 787 /* 788 * Aside from the first column and last row, the (i, j)-th element is 1/(i - j + 1)! if i - 789 * j + 1 >= 0, else 0. 1's and 0's are already put, so only division with (i - j + 1)! is 790 * needed in the elements that have 1's. There is no need to calculate (i - j + 1)! and then 791 * divide - small steps avoid overflows. Note that i - j + 1 > 0 <=> i + 1 > j instead of 792 * j'ing all the way to m. Also note that it is started at g = 2 because dividing by 1 isn't 793 * really necessary. 794 */ 795 for (int i = 0; i < m; ++i) { 796 for (int j = 0; j < i + 1; ++j) { 797 if (i - j + 1 > 0) { 798 for (int g = 2; g <= i - j + 1; ++g) { 799 Hdata[i][j] /= g; 800 } 801 } 802 } 803 } 804 return MatrixUtils.createRealMatrix(Hdata); 805 } 806 807 /** 808 * Verifies that {@code array} has length at least 2. 809 * 810 * @param array array to test 811 * @throws NullArgumentException if array is null 812 * @throws InsufficientDataException if array is too short 813 */ 814 private void checkArray(double[] array) { 815 if (array == null) { 816 throw new NullArgumentException(LocalizedFormats.NULL_NOT_ALLOWED); 817 } 818 if (array.length < 2) { 819 throw new InsufficientDataException(LocalizedFormats.INSUFFICIENT_OBSERVED_POINTS_IN_SAMPLE, array.length, 820 2); 821 } 822 } 823 824 /** 825 * Computes \( 1 + 2 \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2} \) stopping when successive partial 826 * sums are within {@code tolerance} of one another, or when {@code maxIterations} partial sums 827 * have been computed. If the sum does not converge before {@code maxIterations} iterations a 828 * {@link TooManyIterationsException} is thrown. 829 * 830 * @param t argument 831 * @param tolerance Cauchy criterion for partial sums 832 * @param maxIterations maximum number of partial sums to compute 833 * @return Kolmogorov sum evaluated at t 834 * @throws TooManyIterationsException if the series does not converge 835 */ 836 public double ksSum(double t, double tolerance, int maxIterations) { 837 // TODO: for small t (say less than 1), the alternative expansion in part 3 of [1] 838 // from class javadoc should be used. 839 final double x = -2 * t * t; 840 int sign = -1; 841 long i = 1; 842 double partialSum = 0.5d; 843 double delta = 1; 844 while (delta > tolerance && i < maxIterations) { 845 delta = FastMath.exp(x * i * i); 846 partialSum += sign * delta; 847 sign *= -1; 848 i++; 849 } 850 if (i == maxIterations) { 851 throw new TooManyIterationsException(maxIterations); 852 } 853 return partialSum * 2; 854 } 855 856 /** 857 * Computes \(P(D_{n,m} > d)\) if {@code strict} is {@code true}; otherwise \(P(D_{n,m} \ge 858 * d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic. See 859 * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\). 860 * <p> 861 * The returned probability is exact, obtained by enumerating all partitions of {@code m + n} 862 * into {@code m} and {@code n} sets, computing \(D_{n,m}\) for each partition and counting the 863 * number of partitions that yield \(D_{n,m}\) values exceeding (resp. greater than or equal to) 864 * {@code d}. 865 * </p> 866 * <p> 867 * <strong>USAGE NOTE</strong>: Since this method enumerates all combinations in \({m+n} \choose 868 * {n}\), it is very slow if called for large {@code m, n}. For this reason, 869 * {@link #kolmogorovSmirnovTest(double[], double[])} uses this only for {@code m * n < } 870 * {@value #SMALL_SAMPLE_PRODUCT}. 871 * </p> 872 * 873 * @param d D-statistic value 874 * @param n first sample size 875 * @param m second sample size 876 * @param strict whether or not the probability to compute is expressed as a strict inequality 877 * @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\) 878 * greater than (resp. greater than or equal to) {@code d} 879 */ 880 public double exactP(double d, int n, int m, boolean strict) { 881 Iterator<int[]> combinationsIterator = CombinatoricsUtils.combinationsIterator(n + m, n); 882 long tail = 0; 883 final double[] nSet = new double[n]; 884 final double[] mSet = new double[m]; 885 while (combinationsIterator.hasNext()) { 886 // Generate an n-set 887 final int[] nSetI = combinationsIterator.next(); 888 // Copy the n-set to nSet and its complement to mSet 889 int j = 0; 890 int k = 0; 891 for (int i = 0; i < n + m; i++) { 892 if (j < n && nSetI[j] == i) { 893 nSet[j++] = i; 894 } else { 895 mSet[k++] = i; 896 } 897 } 898 final double curD = kolmogorovSmirnovStatistic(nSet, mSet); 899 if (curD > d) { 900 tail++; 901 } else if (curD == d && !strict) { 902 tail++; 903 } 904 } 905 return (double) tail / (double) CombinatoricsUtils.binomialCoefficient(n + m, n); 906 } 907 908 /** 909 * Uses the Kolmogorov-Smirnov distribution to approximate \(P(D_{n,m} > d)\) where \(D_{n,m}\) 910 * is the 2-sample Kolmogorov-Smirnov statistic. See 911 * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\). 912 * <p> 913 * Specifically, what is returned is \(1 - k(d \sqrt{mn / (m + n)})\) where \(k(t) = 1 + 2 914 * \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2}\). See {@link #ksSum(double, double, int)} for 915 * details on how convergence of the sum is determined. This implementation passes {@code ksSum} 916 * {@value #KS_SUM_CAUCHY_CRITERION} as {@code tolerance} and 917 * {@value #MAXIMUM_PARTIAL_SUM_COUNT} as {@code maxIterations}. 918 * </p> 919 * 920 * @param d D-statistic value 921 * @param n first sample size 922 * @param m second sample size 923 * @return approximate probability that a randomly selected m-n partition of m + n generates 924 * \(D_{n,m}\) greater than {@code d} 925 */ 926 public double approximateP(double d, int n, int m) { 927 final double dm = m; 928 final double dn = n; 929 return 1 - ksSum(d * FastMath.sqrt((dm * dn) / (dm + dn)), KS_SUM_CAUCHY_CRITERION, MAXIMUM_PARTIAL_SUM_COUNT); 930 } 931 932 /** 933 * Uses Monte Carlo simulation to approximate \(P(D_{n,m} > d)\) where \(D_{n,m}\) is the 934 * 2-sample Kolmogorov-Smirnov statistic. See 935 * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\). 936 * <p> 937 * The simulation generates {@code iterations} random partitions of {@code m + n} into an 938 * {@code n} set and an {@code m} set, computing \(D_{n,m}\) for each partition and returning 939 * the proportion of values that are greater than {@code d}, or greater than or equal to 940 * {@code d} if {@code strict} is {@code false}. 941 * </p> 942 * 943 * @param d D-statistic value 944 * @param n first sample size 945 * @param m second sample size 946 * @param iterations number of random partitions to generate 947 * @param strict whether or not the probability to compute is expressed as a strict inequality 948 * @return proportion of randomly generated m-n partitions of m + n that result in \(D_{n,m}\) 949 * greater than (resp. greater than or equal to) {@code d} 950 */ 951 public double monteCarloP(double d, int n, int m, boolean strict, int iterations) { 952 final int[] nPlusMSet = MathArrays.natural(m + n); 953 final double[] nSet = new double[n]; 954 final double[] mSet = new double[m]; 955 int tail = 0; 956 for (int i = 0; i < iterations; i++) { 957 copyPartition(nSet, mSet, nPlusMSet, n, m); 958 final double curD = kolmogorovSmirnovStatistic(nSet, mSet); 959 if (curD > d) { 960 tail++; 961 } else if (curD == d && !strict) { 962 tail++; 963 } 964 MathArrays.shuffle(nPlusMSet, rng); 965 Arrays.sort(nPlusMSet, 0, n); 966 } 967 return (double) tail / iterations; 968 } 969 970 /** 971 * Copies the first {@code n} elements of {@code nSetI} into {@code nSet} and its complement 972 * relative to {@code m + n} into {@code mSet}. For example, if {@code m = 3}, {@code n = 3} and 973 * {@code nSetI = [1,4,5,2,3,0]} then after this method returns, we will have 974 * {@code nSet = [1,4,5], mSet = [0,2,3]}. 975 * <p> 976 * <strong>Precondition:</strong> The first {@code n} elements of {@code nSetI} must be sorted 977 * in ascending order. 978 * </p> 979 * 980 * @param nSet array to fill with the first {@code n} elements of {@code nSetI} 981 * @param mSet array to fill with the {@code m} complementary elements of {@code nSet} relative 982 * to {@code m + n} 983 * @param nSetI array whose first {@code n} elements specify the members of {@code nSet} 984 * @param n number of elements in the first output array 985 * @param m number of elements in the second output array 986 */ 987 private void copyPartition(double[] nSet, double[] mSet, int[] nSetI, int n, int m) { 988 int j = 0; 989 int k = 0; 990 for (int i = 0; i < n + m; i++) { 991 if (j < n && nSetI[j] == i) { 992 nSet[j++] = i; 993 } else { 994 mSet[k++] = i; 995 } 996 } 997 } 998}