PGAP-1.2.1/Statistics/LineFit.pm @ 0:83e62a1aeeeb (draft)
uploaded by dereeper, Thu, 24 Jun 2021 13:51:52 +0000

package Statistics::LineFit;
use strict;
use Carp qw(carp);
BEGIN {
    use Exporter ();
    use vars qw ($AUTHOR $VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
    $AUTHOR = 'Richard Anderson <cpan(AT)richardanderson(DOT)org>';
    @EXPORT = @EXPORT_OK = qw();
    %EXPORT_TAGS = ();
    @ISA = qw(Exporter);
    $VERSION = 0.06;
}

sub new {
#
# Purpose: Create a new Statistics::LineFit object
#
    my ($caller, $validate, $hush) = @_;
    my $self = { doneRegress => 0,
                 gotData     => 0,
                 hush        => defined $hush ? $hush : 0,
                 validate    => defined $validate ? $validate : 0,
               };
    bless $self, ref($caller) || $caller;
    return $self;
}

sub coefficients {
#
# Purpose: Return the slope and intercept from least squares line fit
#
    my $self = shift;
    unless (defined $self->{intercept} and defined $self->{slope}) {
        $self->regress() or return;
    }
    return ($self->{intercept}, $self->{slope});
}

sub computeSums {
#
# Purpose: Compute sum of x, y, x**2, y**2 and x*y (private method)
#
    my $self = shift;
    my ($sumX, $sumY, $sumXX, $sumYY, $sumXY) = (0, 0, 0, 0, 0);
    if (defined $self->{weight}) {
        for (my $i = 0; $i < $self->{numXY}; ++$i) {
            $sumX += $self->{weight}[$i] * $self->{x}[$i];
            $sumY += $self->{weight}[$i] * $self->{y}[$i];
            $sumXX += $self->{weight}[$i] * $self->{x}[$i] ** 2;
            $sumYY += $self->{weight}[$i] * $self->{y}[$i] ** 2;
            $sumXY += $self->{weight}[$i] * $self->{x}[$i]
                * $self->{y}[$i];
        }
    } else {
        for (my $i = 0; $i < $self->{numXY}; ++$i) {
            $sumX += $self->{x}[$i];
            $sumY += $self->{y}[$i];
            $sumXX += $self->{x}[$i] ** 2;
            $sumYY += $self->{y}[$i] ** 2;
            $sumXY += $self->{x}[$i] * $self->{y}[$i];
        }
    }
    return ($sumX, $sumY, $sumXX, $sumYY, $sumXY);
}

sub durbinWatson {
#
# Purpose: Return the Durbin-Watson statistic
#
    my $self = shift;
    unless (defined $self->{durbinWatson}) {
        $self->regress() or return;
        my $sumErrDiff = 0;
        my $errorTMinus1 = $self->{y}[0] - ($self->{intercept} + $self->{slope}
            * $self->{x}[0]);
        for (my $i = 1; $i < $self->{numXY}; ++$i) {
            my $error = $self->{y}[$i] - ($self->{intercept} + $self->{slope}
                * $self->{x}[$i]);
            $sumErrDiff += ($error - $errorTMinus1) ** 2;
            $errorTMinus1 = $error;
        }
        $self->{durbinWatson} = $self->sumSqErrors() > 0 ?
            $sumErrDiff / $self->sumSqErrors() : 0;
    }
    return $self->{durbinWatson};
}

sub meanSqError {
#
# Purpose: Return the mean squared error
#
    my $self = shift;
    unless (defined $self->{meanSqError}) {
        $self->regress() or return;
        $self->{meanSqError} = $self->sumSqErrors() / $self->{numXY};
    }
    return $self->{meanSqError};
}

sub predictedYs {
#
# Purpose: Return the predicted y values
#
    my $self = shift;
    unless (defined $self->{predictedYs}) {
        $self->regress() or return;
        $self->{predictedYs} = [];
        for (my $i = 0; $i < $self->{numXY}; ++$i) {
            $self->{predictedYs}[$i] = $self->{intercept}
                + $self->{slope} * $self->{x}[$i];
        }
    }
    return @{$self->{predictedYs}};
}

sub regress {
#
# Purpose: Do weighted or unweighted least squares 2-D line fit (if needed)
#
# Description:
# The equations below apply to both the weighted and unweighted fit: the
# weights are normalized in setWeights(), so the sum of the weights is
# equal to numXY.
#
    my $self = shift;
    return $self->{regressOK} if $self->{doneRegress};
    unless ($self->{gotData}) {
        carp "No valid data input - can't do regression" unless $self->{hush};
        return 0;
    }
    my ($sumX, $sumY, $sumYY, $sumXY);
    ($sumX, $sumY, $self->{sumXX}, $sumYY, $sumXY) = $self->computeSums();
    $self->{sumSqDevX} = $self->{sumXX} - $sumX ** 2 / $self->{numXY};
    if ($self->{sumSqDevX} != 0) {
        $self->{sumSqDevY} = $sumYY - $sumY ** 2 / $self->{numXY};
        $self->{sumSqDevXY} = $sumXY - $sumX * $sumY / $self->{numXY};
        $self->{slope} = $self->{sumSqDevXY} / $self->{sumSqDevX};
        $self->{intercept} = ($sumY - $self->{slope} * $sumX) / $self->{numXY};
        $self->{regressOK} = 1;
    } else {
        carp "Can't fit line when x values are all equal" unless $self->{hush};
        $self->{sumXX} = $self->{sumSqDevX} = undef;
        $self->{regressOK} = 0;
    }
    $self->{doneRegress} = 1;
    return $self->{regressOK};
}

sub residuals {
#
# Purpose: Return the observed Y values minus the predicted Y values
#
    my $self = shift;
    unless (defined $self->{residuals}) {
        $self->regress() or return;
        $self->{residuals} = [];
        for (my $i = 0; $i < $self->{numXY}; ++$i) {
            $self->{residuals}[$i] = $self->{y}[$i] - ($self->{intercept}
                + $self->{slope} * $self->{x}[$i]);
        }
    }
    return @{$self->{residuals}};
}

sub rSquared {
#
# Purpose: Return the square of the correlation coefficient (R squared)
#
    my $self = shift;
    unless (defined $self->{rSquared}) {
        $self->regress() or return;
        my $denom = $self->{sumSqDevX} * $self->{sumSqDevY};
        $self->{rSquared} = $denom != 0 ? $self->{sumSqDevXY} ** 2 / $denom : 1;
    }
    return $self->{rSquared};
}

sub setData {
#
# Purpose: Initialize (x,y) values and optional weights
#
    my ($self, $x, $y, $weights) = @_;
    $self->{doneRegress} = 0;
    $self->{x} = $self->{y} = $self->{numXY} = $self->{weight}
        = $self->{intercept} = $self->{slope} = $self->{rSquared}
        = $self->{sigma} = $self->{durbinWatson} = $self->{meanSqError}
        = $self->{sumSqErrors} = $self->{tStatInt} = $self->{tStatSlope}
        = $self->{predictedYs} = $self->{residuals} = $self->{sumXX}
        = $self->{sumSqDevX} = $self->{sumSqDevY} = $self->{sumSqDevXY}
        = undef;
    if (@$x < 2) {
        carp "Must input more than one data point!" unless $self->{hush};
        return 0;
    }
    $self->{numXY} = @$x;
    if (ref $x->[0]) {
        $self->setWeights($y) or return 0;
        $self->{x} = [ ];
        $self->{y} = [ ];
        foreach my $xy (@$x) {
            push @{$self->{x}}, $xy->[0];
            push @{$self->{y}}, $xy->[1];
        }
    } else {
        if (@$x != @$y) {
            carp "Length of x and y arrays must be equal!" unless $self->{hush};
            return 0;
        }
        $self->setWeights($weights) or return 0;
        $self->{x} = [ @$x ];
        $self->{y} = [ @$y ];
    }
    if ($self->{validate}) {
        unless ($self->validData()) {
            $self->{x} = $self->{y} = $self->{weight} = $self->{numXY} = undef;
            return 0;
        }
    }
    $self->{gotData} = 1;
    return 1;
}

sub setWeights {
#
# Purpose: Normalize and initialize line fit weighting factors (private method)
#
    my ($self, $weights) = @_;
    return 1 unless defined $weights;
    if (@$weights != $self->{numXY}) {
        carp "Length of weight array must equal length of data array!"
            unless $self->{hush};
        return 0;
    }
    if ($self->{validate}) { $self->validWeights($weights) or return 0 }
    my $sumW = my $numNonZero = 0;
    foreach my $weight (@$weights) {
        if ($weight < 0) {
            carp "Weights must be non-negative numbers!" unless $self->{hush};
            return 0;
        }
        $sumW += $weight;
        if ($weight != 0) { ++$numNonZero }
    }
    if ($numNonZero < 2) {
        carp "At least two weights must be nonzero!" unless $self->{hush};
        return 0;
    }
    my $factor = @$weights / $sumW;
    foreach my $weight (@$weights) { $weight *= $factor }
    $self->{weight} = [ @$weights ];
    return 1;
}

sub sigma {
#
# Purpose: Return the estimated homoscedastic standard deviation of the
# error term
#
    my $self = shift;
    unless (defined $self->{sigma}) {
        $self->regress() or return;
        $self->{sigma} = $self->{numXY} > 2 ?
            sqrt($self->sumSqErrors() / ($self->{numXY} - 2)) : 0;
    }
    return $self->{sigma};
}

sub sumSqErrors {
#
# Purpose: Return the sum of the squared errors (private method)
#
    my $self = shift;
    unless (defined $self->{sumSqErrors}) {
        $self->regress() or return;
        $self->{sumSqErrors} = $self->{sumSqDevY} - $self->{sumSqDevX}
            * $self->{slope} ** 2;
        if ($self->{sumSqErrors} < 0) { $self->{sumSqErrors} = 0 }
    }
    return $self->{sumSqErrors};
}

sub tStatistics {
#
# Purpose: Return the T statistics
#
    my $self = shift;
    unless (defined $self->{tStatInt} and defined $self->{tStatSlope}) {
        $self->regress() or return;
        my $biasEstimateInt = $self->sigma() * sqrt($self->{sumXX}
            / ($self->{sumSqDevX} * $self->{numXY}));
        $self->{tStatInt} = $biasEstimateInt != 0 ?
            $self->{intercept} / $biasEstimateInt : 0;
        my $biasEstimateSlope = $self->sigma() / sqrt($self->{sumSqDevX});
        $self->{tStatSlope} = $biasEstimateSlope != 0 ?
            $self->{slope} / $biasEstimateSlope : 0;
    }
    return ($self->{tStatInt}, $self->{tStatSlope});
}

sub validData {
#
# Purpose: Verify that the input x-y data are numeric (private method)
#
    my $self = shift;
    for (my $i = 0; $i < $self->{numXY}; ++$i) {
        if (not defined $self->{x}[$i]) {
            carp "Input x[$i] is not defined" unless $self->{hush};
            return 0;
        }
        if ($self->{x}[$i] !~
            /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/)
        {
            carp "Input x[$i] is not a number: $self->{x}[$i]"
                unless $self->{hush};
            return 0;
        }
        if (not defined $self->{y}[$i]) {
            carp "Input y[$i] is not defined" unless $self->{hush};
            return 0;
        }
        if ($self->{y}[$i] !~
            /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/)
        {
            carp "Input y[$i] is not a number: $self->{y}[$i]"
                unless $self->{hush};
            return 0;
        }
    }
    return 1;
}

sub validWeights {
#
# Purpose: Verify that the input weights are numeric (private method)
#
    my ($self, $weights) = @_;
    for (my $i = 0; $i < @$weights; ++$i) {
        if (not defined $weights->[$i]) {
            carp "Input weights[$i] is not defined" unless $self->{hush};
            return 0;
        }
        if ($weights->[$i]
            !~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/)
        {
            carp "Input weights[$i] is not a number: $weights->[$i]"
                unless $self->{hush};
            return 0;
        }
    }
    return 1;
}

sub varianceOfEstimates {
#
# Purpose: Return the variances in the estimates of the intercept and slope
#
    my $self = shift;
    unless (defined $self->{intercept} and defined $self->{slope}) {
        $self->regress() or return;
    }
    my @predictedYs = $self->predictedYs();
    my ($s, $sx, $sxx) = (0, 0, 0);
    if (defined $self->{weight}) {
        for (my $i = 0; $i < $self->{numXY}; ++$i) {
            my $variance = ($predictedYs[$i] - $self->{y}[$i]) ** 2;
            next if 0 == $variance;
            $s += 1.0 / $variance;
            $sx += $self->{weight}[$i] * $self->{x}[$i] / $variance;
            $sxx += $self->{weight}[$i] * $self->{x}[$i] ** 2 / $variance;
        }
    } else {
        for (my $i = 0; $i < $self->{numXY}; ++$i) {
            my $variance = ($predictedYs[$i] - $self->{y}[$i]) ** 2;
            next if 0 == $variance;
            $s += 1.0 / $variance;
            $sx += $self->{x}[$i] / $variance;
            $sxx += $self->{x}[$i] ** 2 / $variance;
        }
    }
    my $denominator = ($s * $sxx - $sx ** 2);
    if (0 == $denominator) {
        return;
    } else {
        return ($sxx / $denominator, $s / $denominator);
    }
}

1;

__END__

=head1 NAME

Statistics::LineFit - Least squares line fit, weighted or unweighted

=head1 SYNOPSIS

 use Statistics::LineFit;
 $lineFit = Statistics::LineFit->new();
 $lineFit->setData(\@xValues, \@yValues) or die "Invalid data";
 ($intercept, $slope) = $lineFit->coefficients();
 defined $intercept or die "Can't fit line if x values are all equal";
 $rSquared = $lineFit->rSquared();
 $meanSquaredError = $lineFit->meanSqError();
 $durbinWatson = $lineFit->durbinWatson();
 $sigma = $lineFit->sigma();
 ($tStatIntercept, $tStatSlope) = $lineFit->tStatistics();
 @predictedYs = $lineFit->predictedYs();
 @residuals = $lineFit->residuals();
 ($varianceIntercept, $varianceSlope) = $lineFit->varianceOfEstimates();

=head1 DESCRIPTION

The Statistics::LineFit module does weighted or unweighted least-squares
line fitting to two-dimensional data (y = a + b * x). (This is also called
linear regression.) In addition to the slope and y-intercept, the module
can return the square of the correlation coefficient (R squared), the
Durbin-Watson statistic, the mean squared error, sigma, the t statistics,
the variance of the estimates of the slope and y-intercept,
the predicted y values and the residuals of the y values. (See the METHODS
section for a description of these statistics.)

The module accepts input data in separate x and y arrays or a single
2-D array (an array of arrayrefs). The optional weights are input in a
separate array. The module can optionally verify that the input data and
weights are valid numbers. If weights are input, the line fit minimizes
the weighted sum of the squared errors and the following statistics are
weighted: the correlation coefficient, the Durbin-Watson statistic, the
mean squared error, sigma and the t statistics.

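For example, the 2-D calling form with optional weights looks like this
(the data values here are illustrative only):

 use Statistics::LineFit;
 $lineFit = Statistics::LineFit->new();
 @xy = ([1, 1.0], [2, 2.1], [3, 2.9], [4, 4.2]);  # each arrayref is [x, y]
 @weights = (1, 1, 1, 0.5);                       # de-emphasize the last point
 $lineFit->setData(\@xy, \@weights) or die "Invalid data";
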
The module is state-oriented and caches its results. Once you call the
setData() method, you can call the other methods in any order or call a
method several times without invoking redundant calculations. After calling
setData(), you can modify the input data or weights without affecting the
module's results.

The decision whether to use weighting should be based on a priori
knowledge of the data or on supplemental data. If the data are sparse or
contain non-random noise, weighting can degrade the solution. Weighting is
a good option if some points are suspect or less relevant (e.g., older
terms in a time series, or points that are known to have more noise).

=head1 ALGORITHM

The least-squares line is the line that minimizes the sum of the squares
of the y residuals:

 Minimize SUM((y[i] - (a + b * x[i])) ** 2)

Setting the partial derivatives with respect to a and b to zero yields a
solution that can be expressed in terms of the means, variances and
covariances of x and y:

 b = SUM((x[i] - meanX) * (y[i] - meanY)) / SUM((x[i] - meanX) ** 2)

 a = meanY - b * meanX

Note that a and b are undefined if all the x values are the same.

If you use weights, each term in the above sums is multiplied by the
value of the weight for that index. The program normalizes the weights
(after copying the input values) so that the sum of the weights equals
the number of points. This minimizes the differences between the weighted
and unweighted equations.

Statistics::LineFit uses equations that are mathematically equivalent to
the above equations and computationally more efficient. The module runs
in O(N) (linear time).

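As a cross-check, the single-pass form used by computeSums() and regress()
can be written in terms of running sums over the N points (with any weights
already normalized to sum to N):

 b = (sumXY - sumX * sumY / N) / (sumXX - sumX ** 2 / N)

 a = (sumY - b * sumX) / N
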
=head1 LIMITATIONS

The regression fails if the input x values are all equal or the only unequal
x values have zero weights. This is an inherent limit to fitting a line of
the form y = a + b * x. In this case, the module issues an error message
and methods that return statistical values will return undefined values.
You can also use the return value of the regress() method to check the
status of the regression.

As the sum of the squared deviations of the x values approaches zero,
the module's results become sensitive to the precision of floating point
operations on the host system.

If the x values are not all the same and the apparent "best fit" line is
vertical, the module will fit a horizontal line. For example, an input of
(1, 1), (1, 7), (2, 3), (2, 5) returns a slope of zero, an intercept of 4
and an R squared of zero. This is correct behavior because this line is the
best least-squares fit to the data for the given parameterization
(y = a + b * x).

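The vertical-line example above can be verified directly:

 @x = (1, 1, 2, 2);
 @y = (1, 7, 3, 5);
 $lineFit->setData(\@x, \@y) or die "Invalid data";
 ($intercept, $slope) = $lineFit->coefficients();  # yields (4, 0)
 $rSquared = $lineFit->rSquared();                 # yields 0
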
On a 32-bit system the results are accurate to about 11 significant digits,
depending on the input data. Many of the installation tests will fail
on a system with word lengths of 16 bits or fewer. (You might want to
upgrade your old 80286 IBM PC.)

=head1 EXAMPLES

=head2 Alternate calling sequence:

 use Statistics::LineFit;
 $lineFit = Statistics::LineFit->new();
 $lineFit->setData(\@x, \@y) or die "Invalid regression data\n";
 if (defined $lineFit->rSquared()
     and $lineFit->rSquared() > $threshold)
 {
     ($intercept, $slope) = $lineFit->coefficients();
     print "Slope: $slope Y-intercept: $intercept\n";
 }

=head2 Multiple calls with same object, validate input, suppress error messages:

 use Statistics::LineFit;
 $lineFit = Statistics::LineFit->new(1, 1);
 while (1) {
     @xy = read2Dxy();  # User-supplied subroutine
     $lineFit->setData(\@xy);
     ($intercept, $slope) = $lineFit->coefficients();
     if (defined $intercept) {
         print "Slope: $slope Y-intercept: $intercept\n";
     }
 }

=head1 METHODS

The module is state-oriented and caches its results. Once you call the
setData() method, you can call the other methods in any order or call
a method several times without invoking redundant calculations.

The regression fails if the x values are all the same. In this case,
the module issues an error message and methods that return statistical
values will return undefined values. You can also use the return value
of the regress() method to check the status of the regression.

=head2 new() - create a new Statistics::LineFit object

 $lineFit = Statistics::LineFit->new();
 $lineFit = Statistics::LineFit->new($validate);
 $lineFit = Statistics::LineFit->new($validate, $hush);

 $validate = 1 -> Verify input data is numeric (slower execution)
           = 0 -> Don't verify input data (default, faster execution)
 $hush     = 1 -> Suppress error messages
           = 0 -> Enable error messages (default)

=head2 coefficients() - Return the slope and y intercept

 ($intercept, $slope) = $lineFit->coefficients();

The returned list is undefined if the regression fails.

=head2 durbinWatson() - Return the Durbin-Watson statistic

 $durbinWatson = $lineFit->durbinWatson();

The Durbin-Watson test is a test for first-order autocorrelation in
the residuals of a time series regression. The Durbin-Watson statistic
has a range of 0 to 4; a value of 2 indicates there is no
autocorrelation.

The return value is undefined if the regression fails. If weights are
input, the return value is the weighted Durbin-Watson statistic.

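In terms of the fit errors e[i] = y[i] - (a + b * x[i]), the statistic is
computed as (the numerator sum starts at the second point):

 durbinWatson = SUM((e[i] - e[i-1]) ** 2) / SUM(e[i] ** 2)
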
=head2 meanSqError() - Return the mean squared error

 $meanSquaredError = $lineFit->meanSqError();

The return value is undefined if the regression fails. If weights are
input, the return value is the weighted mean squared error.

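The value is the sum of the squared errors divided by the number of points:

 meanSqError = SUM((y[i] - (a + b * x[i])) ** 2) / N
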
=head2 predictedYs() - Return the predicted y values

 @predictedYs = $lineFit->predictedYs();

The returned list is undefined if the regression fails.

=head2 regress() - Do the least squares line fit (if not already done)

 $lineFit->regress() or die "Regression failed";

You don't need to call this method because it is invoked by the other
methods as needed. After you call setData(), you can call regress()
at any time to get the status of the regression for the current data.

=head2 residuals() - Return input y values minus predicted y values

 @residuals = $lineFit->residuals();

The returned list is undefined if the regression fails.

=head2 rSquared() - Return the square of the correlation coefficient

 $rSquared = $lineFit->rSquared();

R squared, also called the square of the Pearson product-moment correlation
coefficient, is a measure of goodness-of-fit. It is the fraction of the
variation in Y that can be attributed to the variation in X. A perfect fit
will have an R squared of 1; fitting a line to the vertices of a
regular polygon will yield an R squared of zero. Graphical displays of data
with an R squared of less than about 0.1 do not show a visible linear trend.

The return value is undefined if the regression fails. If weights are
input, the return value is the weighted correlation coefficient.

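In terms of the (weighted) sums of squared deviations from the means that
regress() computes, the value is:

 rSquared = sumSqDevXY ** 2 / (sumSqDevX * sumSqDevY)
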
=head2 setData() - Initialize (x,y) values and optional weights

 $lineFit->setData(\@x, \@y) or die "Invalid regression data";
 $lineFit->setData(\@x, \@y, \@weights) or die "Invalid regression data";
 $lineFit->setData(\@xy) or die "Invalid regression data";
 $lineFit->setData(\@xy, \@weights) or die "Invalid regression data";

@xy is an array of arrayrefs; x values are $xy[$i][0], y values are
$xy[$i][1]. (The module does not access any indices greater than $xy[$i][1],
so the arrayrefs can point to arrays that are longer than two elements.)
The method distinguishes the first two calling signatures from the last
two by examining the first argument.

The optional weights array must be the same length as the data array(s).
The weights must be non-negative numbers; at least two of the weights
must be nonzero. Only the relative size of the weights is significant:
the program normalizes the weights (after copying the input values) so
that the sum of the weights equals the number of points. If you want to
do multiple line fits using the same weights, the weights must be passed
to each call to setData().

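Because only the relative sizes matter, these two calls (with hypothetical
three-point arrays @x and @y) produce identical fits:

 $lineFit->setData(\@x, \@y, [1, 1, 2]);
 $lineFit->setData(\@x, \@y, [0.5, 0.5, 1]);  # same fit after normalization
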
The method will return zero if the array lengths don't match, there are
fewer than two data points, any weights are negative or fewer than two of
the weights are nonzero. If the new() method was called with validate = 1,
the method will also verify that the data and weights are valid numbers.
Once you successfully call setData(), the next call to any method other than
new() or setData() invokes the regression. You can modify the input data
or weights after calling setData() without affecting the module's results.

=head2 sigma() - Return the standard error of the estimate

 $sigma = $lineFit->sigma();

Sigma, also known as the standard error of the estimate, is an estimate
of the homoscedastic standard deviation of the error term.

The return value is undefined if the regression fails. If weights are
input, the return value is the weighted standard error.

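For N input points, the value is computed as follows (zero is returned
when N is 2 or less):

 sigma = sqrt(sumSqErrors / (N - 2))
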
=head2 tStatistics() - Return the t statistics

 ($tStatIntercept, $tStatSlope) = $lineFit->tStatistics();

The t statistic, also called the t ratio or Wald statistic, is used to
accept or reject a hypothesis using a table of cutoff values computed from
the t distribution. The t statistic suggests that the estimated value is
reasonable when the t statistic is close to zero, too small when it is
large and positive, and too large when it is large and negative.

The returned list is undefined if the regression fails. If weights
are input, the returned values are the weighted t statistics.

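In terms of the quantities computed by regress() and sigma(), the module
evaluates:

 tStatIntercept = intercept / (sigma * sqrt(sumXX / (N * sumSqDevX)))

 tStatSlope = slope / (sigma / sqrt(sumSqDevX))
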
=head2 varianceOfEstimates() - Return variances of estimates of intercept, slope

 ($varianceIntercept, $varianceSlope) = $lineFit->varianceOfEstimates();

Assuming the data are noisy or inaccurate, the intercept and slope returned
by the coefficients() method are only estimates of the true intercept and
slope. The varianceOfEstimates() method returns the variances of the
estimates of the intercept and slope, respectively. See Numerical Recipes
in C, section 15.2 (Fitting Data to a Straight Line), equation 15.2.9.

The returned list is undefined if the regression fails. If weights
are input, the returned values are the weighted variances.

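For the unweighted case, the module accumulates the following sums, where
var[i] is the squared residual at point i (points with a residual of zero
are skipped), and then evaluates:

 S   = SUM(1 / var[i])
 Sx  = SUM(x[i] / var[i])
 Sxx = SUM(x[i] ** 2 / var[i])

 varianceIntercept = Sxx / (S * Sxx - Sx ** 2)
 varianceSlope     = S / (S * Sxx - Sx ** 2)

If weights are input, the x terms in Sx and Sxx are multiplied by the
normalized weights.
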
=head1 SEE ALSO

Mendenhall, W., and Sincich, T.L., 2003, A Second Course in Statistics:
Regression Analysis, 6th ed., Prentice Hall.
Press, W. H., Flannery, B. P., Teukolsky, S. A., Vetterling, W. T., 1992,
Numerical Recipes in C: The Art of Scientific Computing, 2nd ed.,
Cambridge University Press.
The man page for perl(1).
The CPAN modules Statistics::OLS, Statistics::GaussHelmert and
Statistics::Regression.

Statistics::LineFit is simpler to use than Statistics::GaussHelmert or
Statistics::Regression. Statistics::LineFit was inspired by and borrows some
ideas from the venerable Statistics::OLS module.

The significant differences between Statistics::LineFit and
Statistics::OLS (version 0.07) are:

=over 4

=item B<Statistics::LineFit is more robust.>

Statistics::OLS returns incorrect results for certain input datasets.
Statistics::OLS does not deep copy its input arrays, which can lead
to subtle bugs. The Statistics::OLS installation test has only one
test and does not verify that the regression returns correct results.
In contrast, Statistics::LineFit has over 200 installation tests that use
various datasets/calling sequences to verify the accuracy of the
regression to within 1.0e-10.

=item B<Statistics::LineFit is faster.>

For a sequence of calls to new(), setData(\@x, \@y) and regress(),
Statistics::LineFit is faster than Statistics::OLS by factors of 2.0, 1.6
and 2.4 for array lengths of 5, 100 and 10000, respectively.

=item B<Statistics::LineFit can do weighted or unweighted regression.>

Statistics::OLS lacks this option.

=item B<Statistics::LineFit has a better interface.>

Once you call the Statistics::LineFit::setData() method, you can call the
other methods in any order and call methods multiple times without invoking
redundant calculations. Statistics::LineFit lets you enable or disable
data verification or error messages.

=item B<Statistics::LineFit has better code and documentation.>

The code in Statistics::LineFit is more readable, more object oriented and
more compliant with Perl coding standards than the code in Statistics::OLS.
The documentation for Statistics::LineFit is more detailed and complete.

=back

=head1 AUTHOR

Richard Anderson, cpan(AT)richardanderson(DOT)org,
http://www.richardanderson.org

=head1 LICENSE

This program is free software; you can redistribute it and/or modify it under
the same terms as Perl itself.

The full text of the license can be found in the LICENSE file included in
the distribution and available in the CPAN listing for Statistics::LineFit
(see www.cpan.org or search.cpan.org).

=head1 DISCLAIMER

To the maximum extent permitted by applicable law, the author of this
module disclaims all warranties, either express or implied, including
but not limited to implied warranties of merchantability and fitness for
a particular purpose, with regard to the software and the accompanying
documentation.

=cut