Mercurial > repos > devteam > pearson_correlation
changeset 0:5ebbb889236a draft
Imported from capsule None
author | devteam |
---|---|
date | Mon, 28 Jul 2014 11:55:57 -0400 |
parents | |
children | 302dcb834898 |
files | correlation.pl correlation.xml |
diffstat | 2 files changed, 99 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/correlation.pl Mon Jul 28 11:55:57 2014 -0400 @@ -0,0 +1,84 @@ +#!/usr/bin/perl + +########################################################################### +# Purpose: To calculate the correlation of two sets of scores in one file. +# Usage: correlation.pl infile.bed output.txt column1 column2 +# (column start from 1) +# Written by: Yi Zhang (June, 2005) +########################################################################### +if (!$ARGV[0] || !$ARGV[1] || !defined($ARGV[2]) || !defined($ARGV[3]) ) { + print STDERR "Usage: correlation.pl infile.bed output.txt column1 column2\n"; + print STDERR " (column start from 1)\n"; + exit; +} +my $file = $ARGV[0]; +my $out = $ARGV[1]; + +die "<font color=\"yellow\">The input columns contain numerical values: $ARGV[2], $ARGV[3]</font>.\n" if ($ARGV[2] =~ /[a-zA-Z]+/ || $ARGV[3] =~ /[a-zA-Z]+/); + +my $col1 = $ARGV[2] - 1; +my $col2 = $ARGV[3] - 1; + +my ($f, $o); +my (@a, @b); + +my $n_t = 0; +open($f, $file) or die "Could't open $file, $!\n"; +while(<$f>) { + chomp; + my @t = split(/\t/); + if ($n_t == 0) { + $n_t = scalar(@t) - 1; + die "<font color=\"yellow\">The input column number exceeds the size of the file: $col1, $col2, $n_t</font>\n" if ( $col1 > $n_t || $col2 > $n_t ); + } + die "<font color=\"yellow\">The columns you have selected contain non numeric characters:$t[$col1] and $t[$col2] \n</font>" if ($t[$col1] =~ /[a-zA-Z]+/ || $t[$col2] =~ /[a-zA-Z]+/); + push(@a, $t[$col1]); + push(@b, $t[$col2]); +} +close($f); + +my $result = correlation(\@a, \@b); + +open($o, ">$out") or die "Couldn't open $out, $!\n"; +$col1 = $col1 + 1; +$col2 = $col2 + 1; +print $o "The correlation of column $col1 and $col2 is $result\n"; +close($o); +print "The correlation of column $col1 and $col2 is $result\n"; + +sub correlation { + my ($array1ref, $array2ref) = @_; + my ($sum1, $sum2); + my ($sum1_squared, $sum2_squared); + foreach (@$array1ref) { $sum1 += $_; $sum1_squared += $_**2; } + foreach (@$array2ref) { $sum2 += $_; $sum2_squared += $_**2; } + my $numerator = (@$array1ref**2) * covariance($array1ref, $array2ref); + my $denominator = sqrt(((@$array1ref * $sum1_squared) - ($sum1**2)) * + ((@$array1ref * $sum2_squared) - ($sum2**2))); + my $r; + if ($denominator == 0) { + print STDERR "The denominator is 0.\n"; + exit 0; + } else { + $r = $numerator / $denominator; + } + return $r; +} + +sub covariance { + my ($array1ref, $array2ref) = @_; + my ($i, $result); + for ($i = 0; $i < @$array1ref; $i++) { + $result += $array1ref->[$i] * $array2ref->[$i]; + } + $result /= @$array1ref; + $result -= mean($array1ref) * mean($array2ref); +} + +sub mean { + my ($arrayref) = @_; + my $result; + foreach (@$arrayref) { $result += $_; } + return $result/@$arrayref; +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/correlation.xml Mon Jul 28 11:55:57 2014 -0400 @@ -0,0 +1,15 @@ +<tool id="Pearson_and_apos_Correlation1" name="Pearson and apos Correlation" version="1.0.0"> + <description>between any two numeric columns</description> + <command interpreter="perl">correlation.pl $input $out_file1 $col1 $col2</command> + <inputs> +<!-- <display>on column $col1 and column $col2 of $input</display> --> + <param name="col1" size="3" type="text" value="5" label="Correlate data in column"/> + <param name="col2" size="3" type="text" value="6" label="with data in column"/> + <param format="txt" name="input" type="data" label="in Query"/> + </inputs> + <outputs> + <data format="txt" name="out_file1" /> + </outputs> + <help>Computes Pearsons correlation coefficient between any two numerical columns. Column numbers start at 1. +</help> +</tool>