package Statistics::LineFit;
use strict;
use Carp qw(carp);
BEGIN {
        use Exporter ();
        use vars qw ($AUTHOR $VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
        $AUTHOR      = 'Richard Anderson <cpan(AT)richardanderson(DOT)org>';
        @EXPORT      = @EXPORT_OK = qw();
        %EXPORT_TAGS = ();
        @ISA         = qw(Exporter);
        $VERSION     = 0.06;
}

sub new {
#
# Purpose: Create a new Statistics::LineFit object
#
    my ($caller, $validate, $hush) = @_;
    my $self = { doneRegress  => 0,
                 gotData      => 0,
                 hush         => defined $hush ? $hush : 0,
                 validate     => defined $validate ? $validate : 0,
               };
    bless $self, ref($caller) || $caller;
    return $self;
}

sub coefficients {
#
# Purpose: Return the slope and intercept from least squares line fit
#
    my $self = shift;
    unless (defined $self->{intercept} and defined $self->{slope}) {
        $self->regress() or return;
    }
    return ($self->{intercept}, $self->{slope});
}

sub computeSums {
#
# Purpose: Compute sum of x, y, x**2, y**2 and x*y (private method)
#
    my $self = shift;
    my ($sumX, $sumY, $sumXX, $sumYY, $sumXY) = (0, 0, 0, 0, 0);
    if (defined $self->{weight}) {
        for (my $i = 0; $i < $self->{numXY}; ++$i) {
            $sumX += $self->{weight}[$i] * $self->{x}[$i];
            $sumY += $self->{weight}[$i] * $self->{y}[$i];
            $sumXX += $self->{weight}[$i] * $self->{x}[$i] ** 2;
            $sumYY += $self->{weight}[$i] * $self->{y}[$i] ** 2;
            $sumXY += $self->{weight}[$i] * $self->{x}[$i]
                * $self->{y}[$i];
        }
    } else {
        for (my $i = 0; $i < $self->{numXY}; ++$i) {
            $sumX += $self->{x}[$i];
            $sumY += $self->{y}[$i];
            $sumXX += $self->{x}[$i] ** 2;
            $sumYY += $self->{y}[$i] ** 2;
            $sumXY += $self->{x}[$i] * $self->{y}[$i];
        }
    }
    return ($sumX, $sumY, $sumXX, $sumYY, $sumXY);
}

sub durbinWatson {
#
# Purpose: Return the Durbin-Watson statistic
#
    my $self = shift;
    unless (defined $self->{durbinWatson}) {
        $self->regress() or return;
        my $sumErrDiff = 0;
        my $errorTMinus1 = $self->{y}[0] - ($self->{intercept} + $self->{slope}
            * $self->{x}[0]);
        for (my $i = 1; $i < $self->{numXY}; ++$i) {
            my $error = $self->{y}[$i] - ($self->{intercept} + $self->{slope}
                * $self->{x}[$i]);
            $sumErrDiff += ($error - $errorTMinus1) ** 2;
            $errorTMinus1 = $error;
        }
        $self->{durbinWatson} = $self->sumSqErrors() > 0 ?
            $sumErrDiff / $self->sumSqErrors() : 0;
    }
    return $self->{durbinWatson};
}

sub meanSqError {
#
# Purpose: Return the mean squared error
#
    my $self = shift;
    unless (defined $self->{meanSqError}) {
        $self->regress() or return;
        $self->{meanSqError} = $self->sumSqErrors() / $self->{numXY};
    }
    return $self->{meanSqError};
}

sub predictedYs {
#
# Purpose: Return the predicted y values
#
    my $self = shift;
    unless (defined $self->{predictedYs}) {
        $self->regress() or return;
        $self->{predictedYs} = [];
        for (my $i = 0; $i < $self->{numXY}; ++$i) {
            $self->{predictedYs}[$i] = $self->{intercept}
                + $self->{slope} * $self->{x}[$i];
        }
    }
    return @{$self->{predictedYs}};
}

sub regress {
#
# Purpose: Do weighted or unweighted least squares 2-D line fit (if needed)
#
# Description:
# The equations below apply to both the weighted and unweighted fit: the
# weights are normalized in setWeights(), so the sum of the weights is
# equal to numXY.
#
    my $self = shift;
    return $self->{regressOK} if $self->{doneRegress};
    unless ($self->{gotData}) {
        carp "No valid data input - can't do regression" unless $self->{hush};
        return 0;
    }
    my ($sumX, $sumY, $sumYY, $sumXY);
    ($sumX, $sumY, $self->{sumXX}, $sumYY, $sumXY) = $self->computeSums();
    $self->{sumSqDevX} = $self->{sumXX} - $sumX ** 2 / $self->{numXY};
    if ($self->{sumSqDevX} != 0) {
        $self->{sumSqDevY} = $sumYY - $sumY ** 2 / $self->{numXY};
        $self->{sumSqDevXY} = $sumXY - $sumX * $sumY / $self->{numXY};
        $self->{slope} = $self->{sumSqDevXY} / $self->{sumSqDevX};
        $self->{intercept} = ($sumY - $self->{slope} * $sumX) / $self->{numXY};
        $self->{regressOK} = 1;
    } else {
        carp "Can't fit line when x values are all equal" unless $self->{hush};
        $self->{sumXX} = $self->{sumSqDevX} = undef;
        $self->{regressOK} = 0;
    }
    $self->{doneRegress} = 1;
    return $self->{regressOK};
}

sub residuals {
#
# Purpose: Return the observed y values minus the predicted y values
#
    my $self = shift;
    unless (defined $self->{residuals}) {
        $self->regress() or return;
        $self->{residuals} = [];
        for (my $i = 0; $i < $self->{numXY}; ++$i) {
            $self->{residuals}[$i] = $self->{y}[$i] - ($self->{intercept}
                + $self->{slope} * $self->{x}[$i]);
        }
    }
    return @{$self->{residuals}};
}

sub rSquared {
#
# Purpose: Return the square of the correlation coefficient (R squared)
#
    my $self = shift;
    unless (defined $self->{rSquared}) {
        $self->regress() or return;
        my $denom = $self->{sumSqDevX} * $self->{sumSqDevY};
        $self->{rSquared} = $denom != 0 ? $self->{sumSqDevXY} ** 2 / $denom : 1;
    }
    return $self->{rSquared};
}

sub setData {
#
# Purpose: Initialize (x,y) values and optional weights
#
    my ($self, $x, $y, $weights) = @_;
    $self->{doneRegress} = 0;
    $self->{x} = $self->{y} = $self->{numXY} = $self->{weight}
        = $self->{intercept} = $self->{slope} = $self->{rSquared}
        = $self->{sigma} = $self->{durbinWatson} = $self->{meanSqError}
        = $self->{sumSqErrors} = $self->{tStatInt} = $self->{tStatSlope}
        = $self->{predictedYs} = $self->{residuals} = $self->{sumXX}
        = $self->{sumSqDevX} = $self->{sumSqDevY} = $self->{sumSqDevXY}
        = undef;
    if (@$x < 2) {
        carp "Must input more than one data point!" unless $self->{hush};
        return 0;
    }
    $self->{numXY} = @$x;
    if (ref $x->[0]) {
        $self->setWeights($y) or return 0;
        $self->{x} = [ ];
        $self->{y} = [ ];
        foreach my $xy (@$x) {
            push @{$self->{x}}, $xy->[0];
            push @{$self->{y}}, $xy->[1];
        }
    } else {
        if (@$x != @$y) {
            carp "Length of x and y arrays must be equal!" unless $self->{hush};
            return 0;
        }
        $self->setWeights($weights) or return 0;
        $self->{x} = [ @$x ];
        $self->{y} = [ @$y ];
    }
    if ($self->{validate}) {
        unless ($self->validData()) {
            $self->{x} = $self->{y} = $self->{weight} = $self->{numXY} = undef;
            return 0;
        }
    }
    $self->{gotData} = 1;
    return 1;
}

sub setWeights {
#
# Purpose: Normalize and initialize line fit weighting factors (private method)
#
    my ($self, $weights) = @_;
    return 1 unless defined $weights;
    if (@$weights != $self->{numXY}) {
        carp "Length of weight array must equal length of data array!"
            unless $self->{hush};
        return 0;
    }
    if ($self->{validate}) { $self->validWeights($weights) or return 0 }
    my $sumW = my $numNonZero = 0;
    foreach my $weight (@$weights) {
        if ($weight < 0) {
            carp "Weights must be non-negative numbers!" unless $self->{hush};
            return 0;
        }
        $sumW += $weight;
        if ($weight != 0) { ++$numNonZero }
    }
    if ($numNonZero < 2) {
        carp "At least two weights must be nonzero!" unless $self->{hush};
        return 0;
    }
    my $factor = @$weights / $sumW;
    foreach my $weight (@$weights) { $weight *= $factor }
    $self->{weight} = [ @$weights ];
    return 1;
}

sub sigma {
#
# Purpose: Return the estimated homoscedastic standard deviation of the
#          error term
#
    my $self = shift;
    unless (defined $self->{sigma}) {
        $self->regress() or return;
        $self->{sigma} = $self->{numXY} > 2 ?
            sqrt($self->sumSqErrors() / ($self->{numXY} - 2)) : 0;
    }
    return $self->{sigma};
}

sub sumSqErrors {
#
# Purpose: Return the sum of the squared errors (private method)
#
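# Uses the identity SSE = sumSqDevY - slope**2 * sumSqDevX; the result is
# clamped at zero to guard against floating point roundoff.
#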
    my $self = shift;
    unless (defined $self->{sumSqErrors}) {
        $self->regress() or return;
        $self->{sumSqErrors} = $self->{sumSqDevY} - $self->{sumSqDevX}
            * $self->{slope} ** 2;
        if ($self->{sumSqErrors} < 0) { $self->{sumSqErrors} = 0 }
    }
    return $self->{sumSqErrors};
}

sub tStatistics {
#
# Purpose: Return the T statistics
#
    my $self = shift;
    unless (defined $self->{tStatInt} and defined $self->{tStatSlope}) {
        $self->regress() or return;
        my $biasEstimateInt = $self->sigma() * sqrt($self->{sumXX}
            / ($self->{sumSqDevX} * $self->{numXY}));
        $self->{tStatInt} = $biasEstimateInt != 0 ?
            $self->{intercept} / $biasEstimateInt : 0;
        my $biasEstimateSlope = $self->sigma() / sqrt($self->{sumSqDevX});
        $self->{tStatSlope} = $biasEstimateSlope != 0 ?
            $self->{slope} / $biasEstimateSlope : 0;
    }
    return ($self->{tStatInt}, $self->{tStatSlope});
}

sub validData {
#
# Purpose: Verify that the input x-y data are numeric (private method)
#
    my $self = shift;
    for (my $i = 0; $i < $self->{numXY}; ++$i) {
        if (not defined $self->{x}[$i]) {
            carp "Input x[$i] is not defined" unless $self->{hush};
            return 0;
        }
        if ($self->{x}[$i] !~
            /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/)
        {
            carp "Input x[$i] is not a number: $self->{x}[$i]"
                unless $self->{hush};
            return 0;
        }
        if (not defined $self->{y}[$i]) {
            carp "Input y[$i] is not defined" unless $self->{hush};
            return 0;
        }
        if ($self->{y}[$i] !~
            /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/)
        {
            carp "Input y[$i] is not a number: $self->{y}[$i]"
                unless $self->{hush};
            return 0;
        }
    }
    return 1;
}

sub validWeights {
#
# Purpose: Verify that the input weights are numeric (private method)
#
    my ($self, $weights) = @_;
    for (my $i = 0; $i < @$weights; ++$i) {
        if (not defined $weights->[$i]) {
            carp "Input weights[$i] is not defined" unless $self->{hush};
            return 0;
        }
        if ($weights->[$i]
            !~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/)
        {
            carp "Input weights[$i] is not a number: $weights->[$i]"
                unless $self->{hush};
            return 0;
        }
    }
    return 1;
}

sub varianceOfEstimates {
#
# Purpose: Return the variances in the estimates of the intercept and slope
#
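# Description:
# Computes S = sum(1/e[i]**2), Sx = sum(x[i]/e[i]**2) and
# Sxx = sum(x[i]**2/e[i]**2) from the residuals e[i], then returns
# (Sxx/D, S/D), where D = S * Sxx - Sx**2 (see Numerical Recipes in C,
# equation 15.2.9).  For weighted fits, the weights are folded into Sx
# and Sxx.  Terms with e[i] == 0 are skipped.
#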
    my $self = shift;
    unless (defined $self->{intercept} and defined $self->{slope}) {
        $self->regress() or return;
    }
    my @predictedYs = $self->predictedYs();
    my ($s, $sx, $sxx) = (0, 0, 0);
    if (defined $self->{weight}) {
        for (my $i = 0; $i < $self->{numXY}; ++$i) {
            my $variance = ($predictedYs[$i] - $self->{y}[$i]) ** 2;
            next if 0 == $variance;
            $s += 1.0 / $variance;
            $sx += $self->{weight}[$i] * $self->{x}[$i] / $variance;
            $sxx += $self->{weight}[$i] * $self->{x}[$i] ** 2 / $variance;
        }
    } else {
        for (my $i = 0; $i < $self->{numXY}; ++$i) {
            my $variance = ($predictedYs[$i] - $self->{y}[$i]) ** 2;
            next if 0 == $variance;
            $s += 1.0 / $variance;
            $sx += $self->{x}[$i] / $variance;
            $sxx += $self->{x}[$i] ** 2 / $variance;
        }
    }
    my $denominator = ($s * $sxx - $sx ** 2);
    if (0 == $denominator) {
        return;
    } else {
        return ($sxx / $denominator, $s / $denominator);
    }
}

1;

__END__

=head1 NAME

Statistics::LineFit - Least squares line fit, weighted or unweighted

=head1 SYNOPSIS

 use Statistics::LineFit;
 $lineFit = Statistics::LineFit->new();
 $lineFit->setData(\@xValues, \@yValues) or die "Invalid data";
 ($intercept, $slope) = $lineFit->coefficients();
 defined $intercept or die "Can't fit line if x values are all equal";
 $rSquared = $lineFit->rSquared();
 $meanSquaredError = $lineFit->meanSqError();
 $durbinWatson = $lineFit->durbinWatson();
 $sigma = $lineFit->sigma();
 ($tStatIntercept, $tStatSlope) = $lineFit->tStatistics();
 @predictedYs = $lineFit->predictedYs();
 @residuals = $lineFit->residuals();
 ($varianceIntercept, $varianceSlope) = $lineFit->varianceOfEstimates();

=head1 DESCRIPTION

The Statistics::LineFit module does weighted or unweighted least-squares
line fitting to two-dimensional data (y = a + b * x).  (This is also called
linear regression.)  In addition to the slope and y-intercept, the module
can return the square of the correlation coefficient (R squared), the
Durbin-Watson statistic, the mean squared error, sigma, the t statistics,
the variance of the estimates of the slope and y-intercept,
the predicted y values and the residuals of the y values.  (See the METHODS
section for a description of these statistics.)

The module accepts input data in separate x and y arrays or a single
2-D array (an array of arrayrefs).  The optional weights are input in a
separate array.  The module can optionally verify that the input data and
weights are valid numbers.  If weights are input, the line fit minimizes
the weighted sum of the squared errors and the following statistics are
weighted: the correlation coefficient, the Durbin-Watson statistic, the
mean squared error, sigma and the t statistics.

The module is state-oriented and caches its results.  Once you call the
setData() method, you can call the other methods in any order or call a
method several times without invoking redundant calculations.  After calling
setData(), you can modify the input data or weights without affecting the
module's results.

The decision whether to use weighting can be based on a priori knowledge
of the data or on supplemental data.  If the data are sparse or contain
non-random noise, weighting can degrade the solution.  Weighting is a good
option if some points are suspect or less relevant (e.g., older terms in a
time series, points that are known to have more noise).

=head1 ALGORITHM

The least-squares line is the line that minimizes the sum of the squares
of the y residuals:

 Minimize SUM((y[i] - (a + b * x[i])) ** 2)

Setting the partial derivatives with respect to a and b to zero yields a
solution that can be expressed in terms of the means, variances and
covariances of x and y:

 b = SUM((x[i] - meanX) * (y[i] - meanY)) / SUM((x[i] - meanX) ** 2)

 a = meanY - b * meanX

Note that a and b are undefined if all the x values are the same.

If you use weights, each term in the above sums is multiplied by the
value of the weight for that index.  The program normalizes the weights
(after copying the input values) so that the sum of the weights equals
the number of points.  This minimizes the differences between the weighted
and unweighted equations.

Statistics::LineFit uses equations that are mathematically equivalent to
the above equations and computationally more efficient.  The module runs
in O(N) (linear time).
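
For illustration, here is a minimal, self-contained sketch of the closed-form
solution on hypothetical data (an unweighted fit, independent of the module):

 use strict;
 my @x = (1, 2, 3, 4);
 my @y = (1.1, 1.9, 3.2, 3.8);
 my $n = @x;
 my ($sumX, $sumY, $sumXX, $sumXY) = (0, 0, 0, 0);
 for my $i (0 .. $n - 1) {
     $sumX  += $x[$i];
     $sumY  += $y[$i];
     $sumXX += $x[$i] ** 2;
     $sumXY += $x[$i] * $y[$i];
 }
 # The same quantities regress() uses internally:
 my $sumSqDevX  = $sumXX - $sumX ** 2 / $n;      # SUM((x[i] - meanX) ** 2)
 my $sumSqDevXY = $sumXY - $sumX * $sumY / $n;   # SUM((x - meanX) * (y - meanY))
 my $slope = $sumSqDevXY / $sumSqDevX;           # b
 my $intercept = ($sumY - $slope * $sumX) / $n;  # a = meanY - b * meanX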

=head1 LIMITATIONS

The regression fails if the input x values are all equal or the only unequal
x values have zero weights.  This is an inherent limit to fitting a line of
the form y = a + b * x.  In this case, the module issues an error message
and methods that return statistical values will return undefined values.
You can also use the return value of the regress() method to check the
status of the regression.

As the sum of the squared deviations of the x values approaches zero,
the module's results become sensitive to the precision of floating point
operations on the host system.

If the x values are not all the same and the apparent "best fit" line is
vertical, the module will fit a horizontal line.  For example, an input of
(1, 1), (1, 7), (2, 3), (2, 5) returns a slope of zero, an intercept of 4
and an R squared of zero.  This is correct behavior because this line is the
best least-squares fit to the data for the given parameterization
(y = a + b * x).
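
A quick check of this behavior, using the module as documented above:

 use Statistics::LineFit;
 $lineFit = Statistics::LineFit->new();
 $lineFit->setData([1, 1, 2, 2], [1, 7, 3, 5]) or die "Invalid data";
 ($intercept, $slope) = $lineFit->coefficients();
 # Prints "Intercept: 4  Slope: 0  R squared: 0"
 print "Intercept: $intercept  Slope: $slope  R squared: ",
     $lineFit->rSquared(), "\n";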

On a 32-bit system the results are accurate to about 11 significant digits,
depending on the input data.  Many of the installation tests will fail
on a system with word lengths of 16 bits or fewer.  (You might want to
upgrade your old 80286 IBM PC.)

=head1 EXAMPLES

=head2 Alternate calling sequence:

 use Statistics::LineFit;
 $lineFit = Statistics::LineFit->new();
 $lineFit->setData(\@x, \@y) or die "Invalid regression data\n";
 if (defined $lineFit->rSquared()
     and $lineFit->rSquared() > $threshold)
 {
     ($intercept, $slope) = $lineFit->coefficients();
     print "Slope: $slope  Y-intercept: $intercept\n";
 }

=head2 Multiple calls with same object, validate input, suppress error messages:

 use Statistics::LineFit;
 $lineFit = Statistics::LineFit->new(1, 1);
 while (1) {
     @xy = read2Dxy();  # User-supplied subroutine
     $lineFit->setData(\@xy);
     ($intercept, $slope) = $lineFit->coefficients();
     if (defined $intercept) {
         print "Slope: $slope  Y-intercept: $intercept\n";
     }
 }

=head1 METHODS

The module is state-oriented and caches its results.  Once you call the
setData() method, you can call the other methods in any order or call
a method several times without invoking redundant calculations.

The regression fails if the x values are all the same.  In this case,
the module issues an error message and methods that return statistical
values will return undefined values.  You can also use the return value
of the regress() method to check the status of the regression.

=head2 new() - create a new Statistics::LineFit object

 $lineFit = Statistics::LineFit->new();
 $lineFit = Statistics::LineFit->new($validate);
 $lineFit = Statistics::LineFit->new($validate, $hush);

 $validate = 1 -> Verify input data is numeric (slower execution)
             0 -> Don't verify input data (default, faster execution)
 $hush = 1 -> Suppress error messages
       = 0 -> Enable error messages (default)

=head2 coefficients() - Return the slope and y intercept

 ($intercept, $slope) = $lineFit->coefficients();

The returned list is undefined if the regression fails.

=head2 durbinWatson() - Return the Durbin-Watson statistic

 $durbinWatson = $lineFit->durbinWatson();

The Durbin-Watson test is a test for first-order autocorrelation in
the residuals of a time series regression.  The Durbin-Watson statistic
has a range of 0 to 4; a value of 2 indicates there is no
autocorrelation.
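
In terms of the residuals e[i] = y[i] - (a + b * x[i]), the module computes

 DW = SUM((e[i] - e[i-1]) ** 2, i = 1 .. n-1) / SUM(e[i] ** 2)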

The return value is undefined if the regression fails.  If weights are
input, the return value is the weighted Durbin-Watson statistic.

=head2 meanSqError() - Return the mean squared error

 $meanSquaredError = $lineFit->meanSqError();

The return value is undefined if the regression fails.  If weights are
input, the return value is the weighted mean squared error.

=head2 predictedYs() - Return the predicted y values

 @predictedYs = $lineFit->predictedYs();

The returned list is undefined if the regression fails.

=head2 regress() - Do the least squares line fit (if not already done)

 $lineFit->regress() or die "Regression failed";

You don't need to call this method because it is invoked by the other
methods as needed.  After you call setData(), you can call regress()
at any time to get the status of the regression for the current data.

=head2 residuals() - Return input y values minus predicted y values

 @residuals = $lineFit->residuals();

The returned list is undefined if the regression fails.

=head2 rSquared() - Return the square of the correlation coefficient

 $rSquared = $lineFit->rSquared();

R squared, also called the square of the Pearson product-moment correlation
coefficient, is a measure of goodness-of-fit.  It is the fraction of the
variation in Y that can be attributed to the variation in X.  A perfect fit
will have an R squared of 1; fitting a line to the vertices of a
regular polygon will yield an R squared of zero.  Graphical displays of data
with an R squared of less than about 0.1 do not show a visible linear trend.
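
In the notation of the ALGORITHM section, the module computes

 rSquared = SUM((x[i] - meanX) * (y[i] - meanY)) ** 2
            / (SUM((x[i] - meanX) ** 2) * SUM((y[i] - meanY) ** 2))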

The return value is undefined if the regression fails.  If weights are
input, the return value is the weighted correlation coefficient.

=head2 setData() - Initialize (x,y) values and optional weights

 $lineFit->setData(\@x, \@y) or die "Invalid regression data";
 $lineFit->setData(\@x, \@y, \@weights) or die "Invalid regression data";
 $lineFit->setData(\@xy) or die "Invalid regression data";
 $lineFit->setData(\@xy, \@weights) or die "Invalid regression data";

@xy is an array of arrayrefs; x values are $xy[$i][0], y values are
$xy[$i][1].  (The module does not access any indices greater than $xy[$i][1],
so the arrayrefs can point to arrays that are longer than two elements.)
The method distinguishes the two-array calling signatures from the 2-D
array calling signatures by examining the first argument.

The optional weights array must be the same length as the data array(s).
The weights must be non-negative numbers; at least two of the weights
must be nonzero.  Only the relative size of the weights is significant:
the program normalizes the weights (after copying the input values) so
that the sum of the weights equals the number of points.  If you want to
do multiple line fits using the same weights, the weights must be passed
to each call to setData().

The method will return zero if the array lengths don't match, there are
fewer than two data points, any weights are negative or fewer than two of
the weights are nonzero.  If the new() method was called with validate = 1,
the method will also verify that the data and weights are valid numbers.
Once you successfully call setData(), the next call to any method other than
new() or setData() invokes the regression.  You can modify the input data
or weights after calling setData() without affecting the module's results.
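
For example, using the 2-D array calling sequence with weights (the data
values here are hypothetical):

 @xy = ([1, 1.2], [2, 1.9], [3, 3.1], [4, 4.2]);
 @weights = (1, 1, 1, 2);
 $lineFit->setData(\@xy, \@weights) or die "Invalid regression data";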

=head2 sigma() - Return the standard error of the estimate

 $sigma = $lineFit->sigma();

Sigma is an estimate of the homoscedastic standard deviation of the
error.  Sigma is also known as the standard error of the estimate.
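
For n data points, the module computes

 sigma = sqrt(sumSqErrors / (n - 2))

where sumSqErrors is the sum of the squared residuals (weighted, if weights
were input); sigma is reported as zero when n <= 2.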

The return value is undefined if the regression fails.  If weights are
input, the return value is the weighted standard error.

=head2 tStatistics() - Return the t statistics

 ($tStatIntercept, $tStatSlope) = $lineFit->tStatistics();

The t statistic, also called the t ratio or Wald statistic, is used to
accept or reject a hypothesis using a table of cutoff values computed from
the t distribution.  The t statistic suggests that the estimated value is
reasonable when the t statistic is close to zero, too small when it is
large and positive, and too large when it is large and negative.
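
In the unweighted case, the module computes the t statistics from sigma and
the sums of the x values:

 tStatIntercept = intercept / (sigma * sqrt(sumXX / (n * sumSqDevX)))
 tStatSlope     = slope / (sigma / sqrt(sumSqDevX))

where sumXX = SUM(x[i] ** 2) and sumSqDevX = SUM((x[i] - meanX) ** 2); zero
is returned if the denominator is zero.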

The returned list is undefined if the regression fails.  If weights
are input, the returned values are the weighted t statistics.

=head2 varianceOfEstimates() - Return variances of estimates of intercept, slope

 ($varianceIntercept, $varianceSlope) = $lineFit->varianceOfEstimates();

Assuming the data are noisy or inaccurate, the intercept and slope returned
by the coefficients() method are only estimates of the true intercept and
slope.  The varianceOfEstimates() method returns the variances of the
estimates of the intercept and slope, respectively.  See Numerical Recipes
in C, section 15.2 (Fitting Data to a Straight Line), equation 15.2.9.
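
In the unweighted case, the returned values are

 varianceIntercept = Sxx / (S * Sxx - Sx ** 2)
 varianceSlope     = S / (S * Sxx - Sx ** 2)

where S = SUM(1 / e[i] ** 2), Sx = SUM(x[i] / e[i] ** 2) and
Sxx = SUM(x[i] ** 2 / e[i] ** 2) are computed from the residuals e[i]
(terms with e[i] = 0 are skipped).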

The returned list is undefined if the regression fails.  If weights
are input, the returned values are the weighted variances.

=head1 SEE ALSO

 Mendenhall, W., and Sincich, T.L., 2003, A Second Course in Statistics:
   Regression Analysis, 6th ed., Prentice Hall.
 Press, W. H., Flannery, B. P., Teukolsky, S. A., Vetterling, W. T., 1992,
   Numerical Recipes in C : The Art of Scientific Computing, 2nd ed.,
   Cambridge University Press.
 The man page for perl(1).
 The CPAN modules Statistics::OLS, Statistics::GaussHelmert and
   Statistics::Regression.

Statistics::LineFit is simpler to use than Statistics::GaussHelmert or
Statistics::Regression.  Statistics::LineFit was inspired by and borrows some
ideas from the venerable Statistics::OLS module.

The significant differences between Statistics::LineFit and Statistics::OLS
(version 0.07) are:

=over 4

=item B<Statistics::LineFit is more robust.>

Statistics::OLS returns incorrect results for certain input datasets.
Statistics::OLS does not deep copy its input arrays, which can lead
to subtle bugs.  The Statistics::OLS installation test has only one
test and does not verify that the regression returns correct results.
In contrast, Statistics::LineFit has over 200 installation tests that use
various datasets/calling sequences to verify the accuracy of the
regression to within 1.0e-10.

=item B<Statistics::LineFit is faster.>

For a sequence of calls to new(), setData(\@x, \@y) and regress(),
Statistics::LineFit is faster than Statistics::OLS by factors of 2.0, 1.6
and 2.4 for array lengths of 5, 100 and 10000, respectively.

=item B<Statistics::LineFit can do weighted or unweighted regression.>

Statistics::OLS lacks this option.

=item B<Statistics::LineFit has a better interface.>

Once you call the Statistics::LineFit::setData() method, you can call the
other methods in any order and call methods multiple times without invoking
redundant calculations.  Statistics::LineFit lets you enable or disable
data verification or error messages.

=item B<Statistics::LineFit has better code and documentation.>

The code in Statistics::LineFit is more readable, more object oriented and
more compliant with Perl coding standards than the code in Statistics::OLS.
The documentation for Statistics::LineFit is more detailed and complete.

=back

=head1 AUTHOR

Richard Anderson, cpan(AT)richardanderson(DOT)org,
http://www.richardanderson.org

=head1 LICENSE

This program is free software; you can redistribute it and/or modify it under
the same terms as Perl itself.

The full text of the license can be found in the LICENSE file included in
the distribution and available in the CPAN listing for Statistics::LineFit
(see www.cpan.org or search.cpan.org).

=head1 DISCLAIMER

To the maximum extent permitted by applicable law, the author of this
module disclaims all warranties, either express or implied, including
but not limited to implied warranties of merchantability and fitness for
a particular purpose, with regard to the software and the accompanying
documentation.

=cut